Phoneme Overlap Segmenter

speechline.segmenters.phoneme_overlap_segmenter.PhonemeOverlapSegmenter (Segmenter)

Source code in speechline/segmenters/phoneme_overlap_segmenter.py
class PhonemeOverlapSegmenter(Segmenter):
    def __init__(self, lexicon: Dict[str, List[str]]):
        """
        Phoneme-overlap segmenter that supports multiple phoneme variants per word.

        Args:
            lexicon (Dict[str, List[str]]):
                Lexicon of words and their phoneme variations.
        """
        self.lexicon = self._normalize_lexicon(lexicon)

    def _normalize_text(self, text: str) -> str:
        # lowercase and strip surrounding whitespace
        return text.lower().strip()

    def _normalize_phonemes(self, phonemes: str) -> str:
        """
        Remove diacritics from phonemes.
        Modified from: [Michael McAuliffe](https://memcauliffe.com/speaker-dictionaries-and-multilingual-ipa.html#multilingual-ipa-mode) # noqa: E501

        Args:
            phonemes (str):
                Phonemes to normalize.

        Returns:
            str:
                Normalized phonemes.
        """
        diacritics = ["ː", "ˑ", "̆", "̯", "͡", "‿", "͜", "̩", "ˈ", "ˌ"]
        for d in diacritics:
            phonemes = phonemes.replace(d, "")
        return phonemes.strip()

    def _normalize_lexicon(self, lexicon: Dict[str, List[str]]) -> Dict[str, Set[str]]:
        """
        Normalizes phonemes in lexicon and deduplicates variants.

        Args:
            lexicon (Dict[str, List[str]]):
                Lexicon to normalize.

        Returns:
            Dict[str, Set[str]]:
                Normalized lexicon with deduplicated phoneme variants per word.
        """
        return {
            word: {self._normalize_phonemes(p) for p in phonemes}
            for word, phonemes in lexicon.items()
        }

    def _merge_offsets(
        self, offsets: List[Dict[str, Union[str, float]]]
    ) -> List[Dict[str, Union[str, float]]]:
        """
        Merge consecutive phoneme offsets, split on space tokens, into word-level offsets.

        Args:
            offsets (List[Dict[str, Union[str, float]]]):
                List of phoneme offsets.

        Returns:
            List[Dict[str, Union[str, float]]]:
                List of word-level phoneme offsets, one per space-delimited word.
        """
        result = []
        current_item = {"text": [], "start_time": None, "end_time": None}
        for item in offsets:
            if item["text"] != " ":
                if current_item["start_time"] is None:
                    current_item["start_time"] = item["start_time"]
                current_item["end_time"] = item["end_time"]
                current_item["text"].append(item["text"])
            else:
                if current_item["start_time"] is not None:
                    result.append(current_item)
                    current_item = {"text": [], "start_time": None, "end_time": None}

        if current_item["start_time"] is not None:
            result.append(current_item)

        for r in result:
            r["text"] = " ".join(r["text"])

        return result

    def _generate_combinations(self, ground_truth: List[str]) -> List[Union[Set[str], List[str]]]:
        """
        Generate the possible phoneme variants for every word in the ground truth.

        Args:
            ground_truth (List[str]):
                List of words.

        Returns:
            List[Union[Set[str], List[str]]]:
                Phoneme variants per word, taken from the lexicon when available,
                otherwise generated via g2p.
        """

        def g2p(text: str) -> List[str]:
            phonemes = []
            for words in sentences(text):
                for word in words:
                    if word.is_major_break or word.is_minor_break:
                        phonemes.append(word.text)
                    else:
                        phonemes.append(" ".join(word.phonemes))
            return phonemes

        combinations = []
        for word in ground_truth:
            normalized_word = self._normalize_text(word)
            if normalized_word in self.lexicon:
                phonemes = self.lexicon[normalized_word]
            else:
                # fall back to grapheme-to-phoneme conversion for out-of-lexicon words
                phonemes = g2p(normalized_word)
            combinations.append(phonemes)
        return combinations

    def chunk_offsets(
        self,
        offsets: List[Dict[str, Union[str, float]]],
        ground_truth: List[str],
        **kwargs,
    ) -> List[List[Dict[str, Union[str, float]]]]:
        """
        Chunk phoneme-level offsets into word-bounded phoneme offsets.

        ### Example
        ```pycon title="example_phoneme_overlap_segmenter.py"
        >>> from speechline.segmenters import PhonemeOverlapSegmenter
        >>> ground_truth = ["Her", "red", "umbrella", "is", "just", "the", "best"]
        >>> lexicon = {
        ...     "her": ["h ˈɚ", "h ɜ ɹ", "ɜ ɹ", "h ɜː ɹ", "ə ɹ"],
        ...     "red": ["ɹ ˈɛ d", "ɹ ɛ d"],
        ...     "umbrella": ["ˈʌ m b ɹ ˌɛ l ə", "ʌ m b ɹ ɛ l ə"],
        ...     "is": ["ˈɪ z", "ɪ z"],
        ...     "just": ["d͡ʒ ˈʌ s t", "d͡ʒ ʌ s t"],
        ...     "the": ["ð ə", "ð i", "ð iː", "ð ɪ"],
        ...     "best": ["b ˈɛ s t", "b ɛ s t"]
        ... }
        >>> offsets = [
        ...     {'text': 'h', 'start_time': 0.16, 'end_time': 0.18},
        ...     {'text': 'ɝ', 'start_time': 0.26, 'end_time': 0.28},
        ...     {'text': ' ', 'start_time': 0.3, 'end_time': 0.34},
        ...     {'text': 'ɹ', 'start_time': 0.36, 'end_time': 0.38},
        ...     {'text': 'ɛ', 'start_time': 0.44, 'end_time': 0.46},
        ...     {'text': 'd', 'start_time': 0.5, 'end_time': 0.52},
        ...     {'text': ' ', 'start_time': 0.6, 'end_time': 0.64},
        ...     {'text': 'ə', 'start_time': 0.72, 'end_time': 0.74},
        ...     {'text': 'm', 'start_time': 0.76, 'end_time': 0.78},
        ...     {'text': 'b', 'start_time': 0.82, 'end_time': 0.84},
        ...     {'text': 'ɹ', 'start_time': 0.84, 'end_time': 0.88},
        ...     {'text': 'ɛ', 'start_time': 0.92, 'end_time': 0.94},
        ...     {'text': 'l', 'start_time': 0.98, 'end_time': 1.0},
        ...     {'text': 'ə', 'start_time': 1.12, 'end_time': 1.14},
        ...     {'text': ' ', 'start_time': 1.3, 'end_time': 1.34},
        ...     {'text': 'ɪ', 'start_time': 1.4, 'end_time': 1.42},
        ...     {'text': 'z', 'start_time': 1.44, 'end_time': 1.46},
        ...     {'text': ' ', 'start_time': 1.52, 'end_time': 1.56},
        ...     {'text': 'dʒ', 'start_time': 1.58, 'end_time': 1.6},
        ...     {'text': 'ʌ', 'start_time': 1.66, 'end_time': 1.68},
        ...     {'text': 's', 'start_time': 1.7, 'end_time': 1.72},
        ...     {'text': 't', 'start_time': 1.78, 'end_time': 1.8},
        ...     {'text': ' ', 'start_time': 1.84, 'end_time': 1.88},
        ...     {'text': 'θ', 'start_time': 1.88, 'end_time': 1.9},
        ...     {'text': ' ', 'start_time': 1.96, 'end_time': 2.0},
        ...     {'text': 'b', 'start_time': 2.0, 'end_time': 2.02},
        ...     {'text': 'ɛ', 'start_time': 2.12, 'end_time': 2.14},
        ...     {'text': 's', 'start_time': 2.18, 'end_time': 2.2},
        ...     {'text': 't', 'start_time': 2.32, 'end_time': 2.34}
        ... ]
        >>> segmenter = PhonemeOverlapSegmenter(lexicon)
        >>> segmenter.chunk_offsets(offsets, ground_truth)
        [
            [
                {'text': 'ɹ ɛ d', 'start_time': 0.36, 'end_time': 0.52}
            ],
            [
                {'text': 'ɪ z', 'start_time': 1.4, 'end_time': 1.46},
                {'text': 'dʒ ʌ s t', 'start_time': 1.58, 'end_time': 1.8}
            ],
            [
                {'text': 'b ɛ s t', 'start_time': 2.0, 'end_time': 2.34}
            ]
        ]
        ```

        Args:
            offsets (List[Dict[str, Union[str, float]]]):
                List of phoneme offsets.
            ground_truth (List[str]):
                List of words.

        Returns:
            List[List[Dict[str, Union[str, float]]]]:
                List of word-bounded phoneme offset segments.
        """

        # per-word phoneme variants, from the lexicon or g2p
        variants = self._generate_combinations(ground_truth)
        merged_offsets = self._merge_offsets(offsets)
        transcripts = [self._normalize_phonemes(o["text"]) for o in merged_offsets]

        idxs, index = [], 0  # index in ground truth
        for i, word in enumerate(transcripts):
            if index >= len(variants):
                break
            for word_variants in variants[index:]:
                # transcript chunk matches one of this word's phoneme variants
                if word in word_variants:
                    idxs.append(i)
                    break
            index += 1

        # if no matches
        if not idxs:
            return []

        # collapse longest consecutive indices
        merged_idxs = []
        start, end = idxs[0], idxs[0] + 1
        for i in idxs[1:]:
            if i == end:
                end += 1
            else:
                merged_idxs.append((start, end))
                start, end = i, i + 1
        merged_idxs.append((start, end))

        # segment according to longest consecutive indices
        segments = [merged_offsets[i:j] for (i, j) in merged_idxs]
        return segments
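
The `_merge_offsets` helper above groups phoneme offsets between space tokens into word-level chunks, which `chunk_offsets` then matches against the lexicon. A minimal sketch of its effect, calling the private helper directly purely for illustration:

```python
from speechline.segmenters import PhonemeOverlapSegmenter

offsets = [
    {"text": "h", "start_time": 0.1, "end_time": 0.2},
    {"text": "i", "start_time": 0.2, "end_time": 0.3},
    {"text": " ", "start_time": 0.3, "end_time": 0.4},  # word boundary
    {"text": "z", "start_time": 0.4, "end_time": 0.5},
]
segmenter = PhonemeOverlapSegmenter({})  # an empty lexicon is enough here
print(segmenter._merge_offsets(offsets))
# [{'text': 'h i', 'start_time': 0.1, 'end_time': 0.3},
#  {'text': 'z', 'start_time': 0.4, 'end_time': 0.5}]
```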

__init__(self, lexicon)

Phoneme-overlap segmenter that supports multiple phoneme variants per word.

Parameters:

| Name    | Type                 | Description                                    | Default  |
| ------- | -------------------- | ---------------------------------------------- | -------- |
| lexicon | Dict[str, List[str]] | Lexicon of words and their phoneme variations. | required |
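
On construction, the lexicon is normalized via `_normalize_lexicon`: diacritics such as length and stress marks are stripped from every variant, and duplicates collapse into a set. A minimal sketch:

```python
from speechline.segmenters import PhonemeOverlapSegmenter

# "ð iː" loses its length mark and collapses into the existing "ð i" variant
segmenter = PhonemeOverlapSegmenter({"the": ["ð ə", "ð iː", "ð i"]})
print(segmenter.lexicon)  # {'the': {'ð ə', 'ð i'}} (set order may vary)
```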

chunk_offsets(self, offsets, ground_truth, **kwargs)

Chunk phoneme-level offsets into word-bounded phoneme offsets.

Example (example_phoneme_overlap_segmenter.py)
>>> from speechline.segmenters import PhonemeOverlapSegmenter
>>> ground_truth = ["Her", "red", "umbrella", "is", "just", "the", "best"]
>>> lexicon = {
...     "her": ["h ˈɚ", "h ɜ ɹ", "ɜ ɹ", "h ɜː ɹ", "ə ɹ"],
...     "red": ["ɹ ˈɛ d", "ɹ ɛ d"],
...     "umbrella": ["ˈʌ m b ɹ ˌɛ l ə", "ʌ m b ɹ ɛ l ə"],
...     "is": ["ˈɪ z", "ɪ z"],
...     "just": ["d͡ʒ ˈʌ s t", "d͡ʒ ʌ s t"],
...     "the": ["ð ə", "ð i", "ð iː", "ð ɪ"],
...     "best": ["b ˈɛ s t", "b ɛ s t"]
... }
>>> offsets = [
...     {'text': 'h', 'start_time': 0.16, 'end_time': 0.18},
...     {'text': 'ɝ', 'start_time': 0.26, 'end_time': 0.28},
...     {'text': ' ', 'start_time': 0.3, 'end_time': 0.34},
...     {'text': 'ɹ', 'start_time': 0.36, 'end_time': 0.38},
...     {'text': 'ɛ', 'start_time': 0.44, 'end_time': 0.46},
...     {'text': 'd', 'start_time': 0.5, 'end_time': 0.52},
...     {'text': ' ', 'start_time': 0.6, 'end_time': 0.64},
...     {'text': 'ə', 'start_time': 0.72, 'end_time': 0.74},
...     {'text': 'm', 'start_time': 0.76, 'end_time': 0.78},
...     {'text': 'b', 'start_time': 0.82, 'end_time': 0.84},
...     {'text': 'ɹ', 'start_time': 0.84, 'end_time': 0.88},
...     {'text': 'ɛ', 'start_time': 0.92, 'end_time': 0.94},
...     {'text': 'l', 'start_time': 0.98, 'end_time': 1.0},
...     {'text': 'ə', 'start_time': 1.12, 'end_time': 1.14},
...     {'text': ' ', 'start_time': 1.3, 'end_time': 1.34},
...     {'text': 'ɪ', 'start_time': 1.4, 'end_time': 1.42},
...     {'text': 'z', 'start_time': 1.44, 'end_time': 1.46},
...     {'text': ' ', 'start_time': 1.52, 'end_time': 1.56},
...     {'text': 'dʒ', 'start_time': 1.58, 'end_time': 1.6},
...     {'text': 'ʌ', 'start_time': 1.66, 'end_time': 1.68},
...     {'text': 's', 'start_time': 1.7, 'end_time': 1.72},
...     {'text': 't', 'start_time': 1.78, 'end_time': 1.8},
...     {'text': ' ', 'start_time': 1.84, 'end_time': 1.88},
...     {'text': 'θ', 'start_time': 1.88, 'end_time': 1.9},
...     {'text': ' ', 'start_time': 1.96, 'end_time': 2.0},
...     {'text': 'b', 'start_time': 2.0, 'end_time': 2.02},
...     {'text': 'ɛ', 'start_time': 2.12, 'end_time': 2.14},
...     {'text': 's', 'start_time': 2.18, 'end_time': 2.2},
...     {'text': 't', 'start_time': 2.32, 'end_time': 2.34}
... ]
>>> segmenter = PhonemeOverlapSegmenter(lexicon)
>>> segmenter.chunk_offsets(offsets, ground_truth)
[
    [
        {'text': 'ɹ ɛ d', 'start_time': 0.36, 'end_time': 0.52}
    ],
    [
        {'text': 'ɪ z', 'start_time': 1.4, 'end_time': 1.46},
        {'text': 'dʒ ʌ s t', 'start_time': 1.58, 'end_time': 1.8}
    ],
    [
        {'text': 'b ɛ s t', 'start_time': 2.0, 'end_time': 2.34}
    ]
]

Parameters:

| Name         | Type                               | Description              | Default  |
| ------------ | ---------------------------------- | ------------------------ | -------- |
| offsets      | List[Dict[str, Union[str, float]]] | List of phoneme offsets. | required |
| ground_truth | List[str]                          | List of words.           | required |

Returns:

| Type                                      | Description                                   |
| ----------------------------------------- | --------------------------------------------- |
| List[List[Dict[str, Union[str, float]]]]  | List of word-bounded phoneme offset segments. |
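
Internally, `chunk_offsets` merges the phoneme offsets into word-level chunks, records the index of each chunk whose normalized phonemes match a variant of some remaining ground-truth word (lexicon entries first, a g2p fallback otherwise), and finally collapses those indices into runs of consecutive matches; each run becomes one output segment, and an empty match list yields an empty result. A standalone sketch of the final grouping step, mirroring the loop in the source above:

```python
from typing import List, Tuple

def collapse_consecutive(idxs: List[int]) -> List[Tuple[int, int]]:
    """Collapse sorted match indices into half-open (start, end) runs."""
    runs = []
    start, end = idxs[0], idxs[0] + 1
    for i in idxs[1:]:
        if i == end:
            end += 1  # index extends the current run
        else:
            runs.append((start, end))  # run broke; start a new one
            start, end = i, i + 1
    runs.append((start, end))
    return runs

# indices 3 and 4 are adjacent, so they fall into one segment
print(collapse_consecutive([1, 3, 4, 6]))  # [(1, 2), (3, 5), (6, 7)]
```

Each (start, end) pair then slices the merged offsets, so adjacent matched words stay together in one segment, as with the `ɪ z` / `dʒ ʌ s t` pair in the example above.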

Source code in speechline/segmenters/phoneme_overlap_segmenter.py
def chunk_offsets(
    self,
    offsets: List[Dict[str, Union[str, float]]],
    ground_truth: List[str],
    **kwargs,
) -> List[List[Dict[str, Union[str, float]]]]:
    """
    Chunk phoneme-level offsets into word-bounded phoneme offsets.

    ### Example
    ```pycon title="example_phoneme_overlap_segmenter.py"
    >>> from speechline.segmenters import PhonemeOverlapSegmenter
    >>> ground_truth = ["Her", "red", "umbrella", "is", "just", "the", "best"]
    >>> lexicon = {
    ...     "her": ["h ˈɚ", "h ɜ ɹ", "ɜ ɹ", "h ɜː ɹ", "ə ɹ"],
    ...     "red": ["ɹ ˈɛ d", "ɹ ɛ d"],
    ...     "umbrella": ["ˈʌ m b ɹ ˌɛ l ə", "ʌ m b ɹ ɛ l ə"],
    ...     "is": ["ˈɪ z", "ɪ z"],
    ...     "just": ["d͡ʒ ˈʌ s t", "d͡ʒ ʌ s t"],
    ...     "the": ["ð ə", "ð i", "ð iː", "ð ɪ"],
    ...     "best": ["b ˈɛ s t", "b ɛ s t"]
    ... }
    >>> offsets = [
    ...     {'text': 'h', 'start_time': 0.16, 'end_time': 0.18},
    ...     {'text': 'ɝ', 'start_time': 0.26, 'end_time': 0.28},
    ...     {'text': ' ', 'start_time': 0.3, 'end_time': 0.34},
    ...     {'text': 'ɹ', 'start_time': 0.36, 'end_time': 0.38},
    ...     {'text': 'ɛ', 'start_time': 0.44, 'end_time': 0.46},
    ...     {'text': 'd', 'start_time': 0.5, 'end_time': 0.52},
    ...     {'text': ' ', 'start_time': 0.6, 'end_time': 0.64},
    ...     {'text': 'ə', 'start_time': 0.72, 'end_time': 0.74},
    ...     {'text': 'm', 'start_time': 0.76, 'end_time': 0.78},
    ...     {'text': 'b', 'start_time': 0.82, 'end_time': 0.84},
    ...     {'text': 'ɹ', 'start_time': 0.84, 'end_time': 0.88},
    ...     {'text': 'ɛ', 'start_time': 0.92, 'end_time': 0.94},
    ...     {'text': 'l', 'start_time': 0.98, 'end_time': 1.0},
    ...     {'text': 'ə', 'start_time': 1.12, 'end_time': 1.14},
    ...     {'text': ' ', 'start_time': 1.3, 'end_time': 1.34},
    ...     {'text': 'ɪ', 'start_time': 1.4, 'end_time': 1.42},
    ...     {'text': 'z', 'start_time': 1.44, 'end_time': 1.46},
    ...     {'text': ' ', 'start_time': 1.52, 'end_time': 1.56},
    ...     {'text': 'dʒ', 'start_time': 1.58, 'end_time': 1.6},
    ...     {'text': 'ʌ', 'start_time': 1.66, 'end_time': 1.68},
    ...     {'text': 's', 'start_time': 1.7, 'end_time': 1.72},
    ...     {'text': 't', 'start_time': 1.78, 'end_time': 1.8},
    ...     {'text': ' ', 'start_time': 1.84, 'end_time': 1.88},
    ...     {'text': 'θ', 'start_time': 1.88, 'end_time': 1.9},
    ...     {'text': ' ', 'start_time': 1.96, 'end_time': 2.0},
    ...     {'text': 'b', 'start_time': 2.0, 'end_time': 2.02},
    ...     {'text': 'ɛ', 'start_time': 2.12, 'end_time': 2.14},
    ...     {'text': 's', 'start_time': 2.18, 'end_time': 2.2},
    ...     {'text': 't', 'start_time': 2.32, 'end_time': 2.34}
    ... ]
    >>> segmenter = PhonemeOverlapSegmenter(lexicon)
    >>> segmenter.chunk_offsets(offsets, ground_truth)
    [
        [
            {'text': 'ɹ ɛ d', 'start_time': 0.36, 'end_time': 0.52}
        ],
        [
            {'text': 'ɪ z', 'start_time': 1.4, 'end_time': 1.46},
            {'text': 'dʒ ʌ s t', 'start_time': 1.58, 'end_time': 1.8}
        ],
        [
            {'text': 'b ɛ s t', 'start_time': 2.0, 'end_time': 2.34}
        ]
    ]
    ```

    Args:
        offsets (List[Dict[str, Union[str, float]]]):
            List of phoneme offsets.
        ground_truth (List[str]):
            List of words.

    Returns:
        List[List[Dict[str, Union[str, float]]]]:
            List of word-bounded phoneme offset segments.
    """

    ground_truth = self._generate_combinations(ground_truth)
    merged_offsets = self._merge_offsets(offsets)
    transcripts = [self._normalize_phonemes(o["text"]) for o in merged_offsets]

    idxs, index = [], 0  # index in ground truth
    for i, word in enumerate(transcripts):
        if index >= len(ground_truth):
            break
        for var in ground_truth[index:]:
            # match
            if word in var:
                idxs.append(i)
                break
        index += 1

    # if no matches
    if not idxs:
        return []

    # collapse longest consecutive indices
    merged_idxs = []
    start, end = idxs[0], idxs[0] + 1
    for i in idxs[1:]:
        if i == end:
            end += 1
        else:
            merged_idxs.append((start, end))
            start, end = i, i + 1
    merged_idxs.append((start, end))

    # segment according to longest consecutive indices
    segments = [merged_offsets[i:j] for (i, j) in merged_idxs]
    return segments