Skip to content

Whisper Transcriber

speechline.transcribers.whisper.WhisperTranscriber (AudioTranscriber)

Whisper model for seq2seq speech recognition with its processor.

Parameters:

Name Type Description Default
model_checkpoint str

HuggingFace model hub checkpoint.

required
Source code in speechline/transcribers/whisper.py
class WhisperTranscriber(AudioTranscriber):
    """
    Seq2seq Whisper speech-recognition model bundled with its processor.

    Args:
        model_checkpoint (str):
            HuggingFace model hub checkpoint.
    """

    def __init__(self, model_checkpoint: str) -> None:
        super().__init__(model_checkpoint)

    def predict(
        self,
        dataset: Dataset,
        chunk_length_s: int = 0,
        output_offsets: bool = False,
        return_timestamps: bool = True,
        keep_whitespace: bool = False,
    ) -> Union[List[str], List[List[Dict[str, Union[str, float]]]]]:
        """
        Run transcription inference over every audio sample in `dataset`.

        Args:
            dataset (Dataset):
                Dataset to be inferred.
            chunk_length_s (int):
                Audio chunk length during inference. Defaults to `0`.
            output_offsets (bool, optional):
                Whether to output timestamps. Defaults to `False`.
            return_timestamps (bool, optional):
                Returned timestamp level. Defaults to `True`.
            keep_whitespace (bool, optional):
                Whether to preserve whitespace predictions. Defaults to `False`.

        Returns:
            Union[List[str], List[List[Dict[str, Union[str, float]]]]]:
                Defaults to list of transcriptions.
                If `output_offsets` is `True`, return list of text offsets.

        ### Example
        ```pycon title="example_transcriber_predict.py"
        >>> from speechline.transcribers import WhisperTranscriber
        >>> from datasets import Dataset, Audio
        >>> transcriber = WhisperTranscriber("openai/whisper-tiny")
        >>> dataset = Dataset.from_dict({"audio": ["sample.wav"]}).cast_column(
        ...     "audio", Audio(sampling_rate=transcriber.sr)
        ... )
        >>> transcripts = transcriber.predict(dataset)
        >>> transcripts
        ["Her red umbrella is just the best."]
        >>> offsets = transcriber.predict(dataset, output_offsets=True)
        >>> offsets
        [
            [
                {
                    "text": "Her red umbrella is just the best.",
                    "start_time": 0.0,
                    "end_time": 3.0,
                }
            ]
        ]
        ```
        """
        # Collect every forwarding argument once, then delegate to the
        # shared inference routine provided by AudioTranscriber.
        inference_kwargs = {
            "chunk_length_s": chunk_length_s,
            "output_offsets": output_offsets,
            "offset_key": "text",
            "return_timestamps": return_timestamps,
            "keep_whitespace": keep_whitespace,
            # Whisper's decoder cap: limit generation to 448 new tokens.
            "generate_kwargs": {"max_new_tokens": 448},
        }
        return self.inference(dataset, **inference_kwargs)

predict(self, dataset, chunk_length_s=0, output_offsets=False, return_timestamps=True, keep_whitespace=False)

Performs inference on dataset.

Parameters:

Name Type Description Default
dataset Dataset

Dataset to be inferred.

required
chunk_length_s int

Audio chunk length during inference. Defaults to 0.

0
output_offsets bool

Whether to output timestamps. Defaults to False.

False
return_timestamps bool

Returned timestamp level. Defaults to True.

True
keep_whitespace bool

Whether to preserve whitespace predictions. Defaults to False.

False

Returns:

Type Description
Union[List[str], List[List[Dict[str, Union[str, float]]]]]

Defaults to list of transcriptions. If output_offsets is True, return list of text offsets.

Example
example_transcriber_predict.py
>>> from speechline.transcribers import WhisperTranscriber
>>> from datasets import Dataset, Audio
>>> transcriber = WhisperTranscriber("openai/whisper-tiny")
>>> dataset = Dataset.from_dict({"audio": ["sample.wav"]}).cast_column(
...     "audio", Audio(sampling_rate=transcriber.sr)
... )
>>> transcripts = transcriber.predict(dataset)
>>> transcripts
["Her red umbrella is just the best."]
>>> offsets = transcriber.predict(dataset, output_offsets=True)
>>> offsets
[
    [
        {
            "text": "Her red umbrella is just the best.",
            "start_time": 0.0,
            "end_time": 3.0,
        }
    ]
]
Source code in speechline/transcribers/whisper.py
def predict(
    self,
    dataset: Dataset,
    chunk_length_s: int = 0,
    output_offsets: bool = False,
    return_timestamps: bool = True,
    keep_whitespace: bool = False,
) -> Union[List[str], List[List[Dict[str, Union[str, float]]]]]:
    """
    Run transcription inference over every audio sample in `dataset`.

    Args:
        dataset (Dataset):
            Dataset to be inferred.
        chunk_length_s (int):
            Audio chunk length during inference. Defaults to `0`.
        output_offsets (bool, optional):
            Whether to output timestamps. Defaults to `False`.
        return_timestamps (bool, optional):
            Returned timestamp level. Defaults to `True`.
        keep_whitespace (bool, optional):
            Whether to preserve whitespace predictions. Defaults to `False`.

    Returns:
        Union[List[str], List[List[Dict[str, Union[str, float]]]]]:
            Defaults to list of transcriptions.
            If `output_offsets` is `True`, return list of text offsets.

    ### Example
    ```pycon title="example_transcriber_predict.py"
    >>> from speechline.transcribers import WhisperTranscriber
    >>> from datasets import Dataset, Audio
    >>> transcriber = WhisperTranscriber("openai/whisper-tiny")
    >>> dataset = Dataset.from_dict({"audio": ["sample.wav"]}).cast_column(
    ...     "audio", Audio(sampling_rate=transcriber.sr)
    ... )
    >>> transcripts = transcriber.predict(dataset)
    >>> transcripts
    ["Her red umbrella is just the best."]
    >>> offsets = transcriber.predict(dataset, output_offsets=True)
    >>> offsets
    [
        [
            {
                "text": "Her red umbrella is just the best.",
                "start_time": 0.0,
                "end_time": 3.0,
            }
        ]
    ]
    ```
    """
    # Collect every forwarding argument once, then delegate to the
    # shared inference routine inherited from the transcriber base class.
    inference_kwargs = {
        "chunk_length_s": chunk_length_s,
        "output_offsets": output_offsets,
        "offset_key": "text",
        "return_timestamps": return_timestamps,
        "keep_whitespace": keep_whitespace,
        # Whisper's decoder cap: limit generation to 448 new tokens.
        "generate_kwargs": {"max_new_tokens": 448},
    }
    return self.inference(dataset, **inference_kwargs)