Skip to content

Word Overlap Segmenter

speechline.segmenters.word_overlap_segmenter.WordOverlapSegmenter (Segmenter)

Source code in speechline/segmenters/word_overlap_segmenter.py
class WordOverlapSegmenter(Segmenter):
    """Segmenter that keeps only the runs of words shared with a ground truth."""

    def normalize(self, text: str) -> str:
        """
        Normalize a word so comparisons ignore case and surrounding whitespace.

        Args:
            text (str):
                Text to normalize.

        Returns:
            str:
                Lower-cased text with leading/trailing whitespace stripped.
        """
        return text.lower().strip()

    def chunk_offsets(
        self,
        offsets: List[Dict[str, Union[str, float]]],
        ground_truth: List[str],
        **kwargs,
    ) -> List[List[Dict[str, Union[str, float]]]]:
        """
        Chunk transcript offsets based on overlaps with ground truth.

        ### Example
        ```pycon title=""
        >>> from speechline.segmenters import WordOverlapSegmenter
        >>> segmenter = WordOverlapSegmenter()
        >>> offsets = [
        ...     {'end_time': 0.28, 'start_time': 0.18, 'text': 'HER'},
        ...     {'end_time': 0.52, 'start_time': 0.34, 'text': 'RED'},
        ...     {'end_time': 1.12, 'start_time': 0.68, 'text': 'UMBRELLA'},
        ...     {'end_time': 1.46, 'start_time': 1.4, 'text': 'IS'},
        ...     {'end_time': 1.78, 'start_time': 1.56, 'text': 'JUST'},
        ...     {'end_time': 1.94, 'start_time': 1.86, 'text': 'THE'},
        ...     {'end_time': 2.3, 'start_time': 1.98, 'text': 'BEST'}
        ... ]
        >>> ground_truth = ["red", "umbrella", "just", "the", "best"]
        >>> segmenter.chunk_offsets(offsets, ground_truth)
        [
            [
                {'end_time': 0.52, 'start_time': 0.34, 'text': 'RED'},
                {'end_time': 1.12, 'start_time': 0.68, 'text': 'UMBRELLA'}
            ],
            [
                {'end_time': 1.78, 'start_time': 1.56, 'text': 'JUST'},
                {'end_time': 1.94, 'start_time': 1.86, 'text': 'THE'},
                {'end_time': 2.3, 'start_time': 1.98, 'text': 'BEST'}
            ]
        ]
        ```

        Args:
            offsets (List[Dict[str, Union[str, float]]]):
                Offsets to chunk.
            ground_truth (List[str]):
                List of ground truth words to compare with offsets.
            **kwargs:
                Accepted but not used by this implementation.
                NOTE(review): presumably kept for signature compatibility
                with other segmenters -- confirm against the base class.

        Returns:
            List[List[Dict[str, Union[str, float]]]]:
                List of chunked/segmented offsets.
        """
        # Compare on normalized words so 'RED' in the transcript matches 'red'.
        reference_words = [self.normalize(word) for word in ground_truth]
        transcript_words = [self.normalize(offset["text"]) for offset in offsets]

        matcher = SequenceMatcher(None, transcript_words, reference_words)
        # Each "equal" opcode marks a contiguous run [start, end) of transcript
        # words that also appears in the ground truth; every run is one segment.
        return [
            offsets[start:end]
            for tag, start, end, *_ in matcher.get_opcodes()
            if tag == "equal"
        ]

chunk_offsets(self, offsets, ground_truth, **kwargs)

Chunk transcript offsets based on overlaps with ground truth.

>>> from speechline.segmenters import WordOverlapSegmenter
>>> segmenter = WordOverlapSegmenter()
>>> offsets = [
...     {'end_time': 0.28, 'start_time': 0.18, 'text': 'HER'},
...     {'end_time': 0.52, 'start_time': 0.34, 'text': 'RED'},
...     {'end_time': 1.12, 'start_time': 0.68, 'text': 'UMBRELLA'},
...     {'end_time': 1.46, 'start_time': 1.4, 'text': 'IS'},
...     {'end_time': 1.78, 'start_time': 1.56, 'text': 'JUST'},
...     {'end_time': 1.94, 'start_time': 1.86, 'text': 'THE'},
...     {'end_time': 2.3, 'start_time': 1.98, 'text': 'BEST'}
... ]
>>> ground_truth = ["red", "umbrella", "just", "the", "best"]
>>> segmenter.chunk_offsets(offsets, ground_truth)
[
    [
        {'end_time': 0.52, 'start_time': 0.34, 'text': 'RED'},
        {'end_time': 1.12, 'start_time': 0.68, 'text': 'UMBRELLA'}
    ],
    [
        {'end_time': 1.78, 'start_time': 1.56, 'text': 'JUST'},
        {'end_time': 1.94, 'start_time': 1.86, 'text': 'THE'},
        {'end_time': 2.3, 'start_time': 1.98, 'text': 'BEST'}
    ]
]


Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| offsets | List[Dict[str, Union[str, float]]] | Offsets to chunk. | required |
| ground_truth | List[str] | List of ground truth words to compare with offsets. | required |



Returns:

| Type | Description |
| --- | --- |
| List[List[Dict[str, Union[str, float]]]] | List of chunked/segmented offsets. |

Source code in speechline/segmenters/word_overlap_segmenter.py
def chunk_offsets(
    self,
    offsets: List[Dict[str, Union[str, float]]],
    ground_truth: List[str],
    **kwargs,
) -> List[List[Dict[str, Union[str, float]]]]:
    """
    Chunk transcript offsets based on overlaps with ground truth.

    ### Example
    ```pycon title=""
    >>> from speechline.segmenters import WordOverlapSegmenter
    >>> segmenter = WordOverlapSegmenter()
    >>> offsets = [
    ...     {'end_time': 0.28, 'start_time': 0.18, 'text': 'HER'},
    ...     {'end_time': 0.52, 'start_time': 0.34, 'text': 'RED'},
    ...     {'end_time': 1.12, 'start_time': 0.68, 'text': 'UMBRELLA'},
    ...     {'end_time': 1.46, 'start_time': 1.4, 'text': 'IS'},
    ...     {'end_time': 1.78, 'start_time': 1.56, 'text': 'JUST'},
    ...     {'end_time': 1.94, 'start_time': 1.86, 'text': 'THE'},
    ...     {'end_time': 2.3, 'start_time': 1.98, 'text': 'BEST'}
    ... ]
    >>> ground_truth = ["red", "umbrella", "just", "the", "best"]
    >>> segmenter.chunk_offsets(offsets, ground_truth)
    [
        [
            {'end_time': 0.52, 'start_time': 0.34, 'text': 'RED'},
            {'end_time': 1.12, 'start_time': 0.68, 'text': 'UMBRELLA'}
        ],
        [
            {'end_time': 1.78, 'start_time': 1.56, 'text': 'JUST'},
            {'end_time': 1.94, 'start_time': 1.86, 'text': 'THE'},
            {'end_time': 2.3, 'start_time': 1.98, 'text': 'BEST'}
        ]
    ]
    ```

    Args:
        offsets (List[Dict[str, Union[str, float]]]):
            Offsets to chunk.
        ground_truth (List[str]):
            List of ground truth words to compare with offsets.
        **kwargs:
            Accepted but not used by this implementation.
            NOTE(review): presumably kept for signature compatibility
            with other segmenters -- confirm against the base class.

    Returns:
        List[List[Dict[str, Union[str, float]]]]:
            List of chunked/segmented offsets.
    """
    # Compare on normalized words so 'RED' in the transcript matches 'red'.
    reference_words = [self.normalize(word) for word in ground_truth]
    transcript_words = [self.normalize(offset["text"]) for offset in offsets]

    matcher = SequenceMatcher(None, transcript_words, reference_words)
    # Each "equal" opcode marks a contiguous run [start, end) of transcript
    # words that also appears in the ground truth; every run is one segment.
    return [
        offsets[start:end]
        for tag, start, end, *_ in matcher.get_opcodes()
        if tag == "equal"
    ]