Skip to content

Word Overlap Segmenter

speechline.segmenters.word_overlap_segmenter.WordOverlapSegmenter (Segmenter)

Source code in speechline/segmenters/word_overlap_segmenter.py
class WordOverlapSegmenter(Segmenter):
    def normalize(self, text: str) -> str:
        text = text.lower().strip()
        return text

    def chunk_offsets(
        self,
        offsets: List[Dict[str, Union[str, float]]],
        ground_truth: List[str],
        **kwargs,
    ) -> List[List[Dict[str, Union[str, float]]]]:
        """
        Chunk transcript offsets based on overlaps with ground truth.

        ### Example
        ```pycon title="example_word_overlap_segmenter.py"
        >>> from speechline.segmenters import WordOverlapSegmenter
        >>> segmenter = WordOverlapSegmenter()
        >>> offsets = [
        ...     {'end_time': 0.28, 'start_time': 0.18, 'text': 'HER'},
        ...     {'end_time': 0.52, 'start_time': 0.34, 'text': 'RED'},
        ...     {'end_time': 1.12, 'start_time': 0.68, 'text': 'UMBRELLA'},
        ...     {'end_time': 1.46, 'start_time': 1.4, 'text': 'IS'},
        ...     {'end_time': 1.78, 'start_time': 1.56, 'text': 'JUST'},
        ...     {'end_time': 1.94, 'start_time': 1.86, 'text': 'THE'},
        ...     {'end_time': 2.3, 'start_time': 1.98, 'text': 'BEST'}
        ... ]
        >>> ground_truth = ["red", "umbrella", "just", "the", "best"]
        >>> segmenter.chunk_offsets(offsets, ground_truth)
        [
            [
                {'end_time': 0.52, 'start_time': 0.34, 'text': 'RED'},
                {'end_time': 1.12, 'start_time': 0.68, 'text': 'UMBRELLA'}
            ],
            [
                {'end_time': 1.78, 'start_time': 1.56, 'text': 'JUST'},
                {'end_time': 1.94, 'start_time': 1.86, 'text': 'THE'},
                {'end_time': 2.3, 'start_time': 1.98, 'text': 'BEST'}
            ]
        ]
        ```

        Args:
            offsets (List[Dict[str, Union[str, float]]]):
                Offsets to chunk.
            ground_truth (List[str]):
                List of ground truth words to compare with offsets.

        Returns:
            List[List[Dict[str, Union[str, float]]]]:
                List of chunked/segmented offsets.
        """
        ground_truth = [self.normalize(g) for g in ground_truth]
        transcripts = [self.normalize(o["text"]) for o in offsets]

        matcher = SequenceMatcher(None, transcripts, ground_truth)
        idxs = [(i1, i2) for tag, i1, i2, *_ in matcher.get_opcodes() if tag == "equal"]
        segments = [offsets[i:j] for (i, j) in idxs]
        return segments

chunk_offsets(self, offsets, ground_truth, **kwargs)

Chunk transcript offsets based on overlaps with ground truth.

Example
example_word_overlap_segmenter.py
>>> from speechline.segmenters import WordOverlapSegmenter
>>> segmenter = WordOverlapSegmenter()
>>> offsets = [
...     {'end_time': 0.28, 'start_time': 0.18, 'text': 'HER'},
...     {'end_time': 0.52, 'start_time': 0.34, 'text': 'RED'},
...     {'end_time': 1.12, 'start_time': 0.68, 'text': 'UMBRELLA'},
...     {'end_time': 1.46, 'start_time': 1.4, 'text': 'IS'},
...     {'end_time': 1.78, 'start_time': 1.56, 'text': 'JUST'},
...     {'end_time': 1.94, 'start_time': 1.86, 'text': 'THE'},
...     {'end_time': 2.3, 'start_time': 1.98, 'text': 'BEST'}
... ]
>>> ground_truth = ["red", "umbrella", "just", "the", "best"]
>>> segmenter.chunk_offsets(offsets, ground_truth)
[
    [
        {'end_time': 0.52, 'start_time': 0.34, 'text': 'RED'},
        {'end_time': 1.12, 'start_time': 0.68, 'text': 'UMBRELLA'}
    ],
    [
        {'end_time': 1.78, 'start_time': 1.56, 'text': 'JUST'},
        {'end_time': 1.94, 'start_time': 1.86, 'text': 'THE'},
        {'end_time': 2.3, 'start_time': 1.98, 'text': 'BEST'}
    ]
]

Parameters:

Name Type Description Default
offsets List[Dict[str, Union[str, float]]]

Offsets to chunk.

required
ground_truth List[str]

List of ground truth words to compare with offsets.

required

Returns:

Type Description
List[List[Dict[str, Union[str, float]]]]

List of chunked/segmented offsets.

Source code in speechline/segmenters/word_overlap_segmenter.py
def chunk_offsets(
    self,
    offsets: List[Dict[str, Union[str, float]]],
    ground_truth: List[str],
    **kwargs,
) -> List[List[Dict[str, Union[str, float]]]]:
    """
    Chunk transcript offsets based on overlaps with ground truth.

    ### Example
    ```pycon title="example_word_overlap_segmenter.py"
    >>> from speechline.segmenters import WordOverlapSegmenter
    >>> segmenter = WordOverlapSegmenter()
    >>> offsets = [
    ...     {'end_time': 0.28, 'start_time': 0.18, 'text': 'HER'},
    ...     {'end_time': 0.52, 'start_time': 0.34, 'text': 'RED'},
    ...     {'end_time': 1.12, 'start_time': 0.68, 'text': 'UMBRELLA'},
    ...     {'end_time': 1.46, 'start_time': 1.4, 'text': 'IS'},
    ...     {'end_time': 1.78, 'start_time': 1.56, 'text': 'JUST'},
    ...     {'end_time': 1.94, 'start_time': 1.86, 'text': 'THE'},
    ...     {'end_time': 2.3, 'start_time': 1.98, 'text': 'BEST'}
    ... ]
    >>> ground_truth = ["red", "umbrella", "just", "the", "best"]
    >>> segmenter.chunk_offsets(offsets, ground_truth)
    [
        [
            {'end_time': 0.52, 'start_time': 0.34, 'text': 'RED'},
            {'end_time': 1.12, 'start_time': 0.68, 'text': 'UMBRELLA'}
        ],
        [
            {'end_time': 1.78, 'start_time': 1.56, 'text': 'JUST'},
            {'end_time': 1.94, 'start_time': 1.86, 'text': 'THE'},
            {'end_time': 2.3, 'start_time': 1.98, 'text': 'BEST'}
        ]
    ]
    ```

    Args:
        offsets (List[Dict[str, Union[str, float]]]):
            Offsets to chunk.
        ground_truth (List[str]):
            List of ground truth words to compare with offsets.

    Returns:
        List[List[Dict[str, Union[str, float]]]]:
            List of chunked/segmented offsets.
    """
    ground_truth = [self.normalize(g) for g in ground_truth]
    transcripts = [self.normalize(o["text"]) for o in offsets]

    matcher = SequenceMatcher(None, transcripts, ground_truth)
    idxs = [(i1, i2) for tag, i1, i2, *_ in matcher.get_opcodes() if tag == "equal"]
    segments = [offsets[i:j] for (i, j) in idxs]
    return segments