Wav2Vec2 Transcriber

`speechline.transcribers.wav2vec2.Wav2Vec2Transcriber` (`AudioTranscriber`)

Wav2Vec2-CTC model for speech recognition.

Parameters:

| Name               | Type  | Description                       | Default    |
| ------------------ | ----- | --------------------------------- | ---------- |
| `model_checkpoint` | `str` | HuggingFace model hub checkpoint. | *required* |
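
Any wav2vec2 CTC checkpoint hosted on the HuggingFace Hub should work as `model_checkpoint`; a minimal instantiation sketch, reusing the checkpoint and the `sr` attribute from the example further down this page:

```python
from speechline.transcribers import Wav2Vec2Transcriber

# Phoneme-level CTC checkpoint used in the examples on this page;
# any compatible wav2vec2 CTC checkpoint from the Hub should work.
transcriber = Wav2Vec2Transcriber("bookbot/wav2vec2-ljspeech-gruut")
print(transcriber.sr)  # sampling rate the model expects its audio in
```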
Source code in `speechline/transcribers/wav2vec2.py`:

````python
class Wav2Vec2Transcriber(AudioTranscriber):
    """
    Wav2Vec2-CTC model for speech recognition.

    Args:
        model_checkpoint (str):
            HuggingFace model hub checkpoint.
    """

    def __init__(self, model_checkpoint: str) -> None:
        super().__init__(model_checkpoint)

    def predict(
        self,
        dataset: Dataset,
        chunk_length_s: int = 30,
        output_offsets: bool = False,
        return_timestamps: str = "word",
        keep_whitespace: bool = False,
    ) -> Union[List[str], List[List[Dict[str, Union[str, float]]]]]:
        """
        Performs inference on `dataset`.

        Args:
            dataset (Dataset):
                Dataset to run inference on.
            chunk_length_s (int):
                Audio chunk length during inference. Defaults to `30`.
            output_offsets (bool, optional):
                Whether to output timestamps. Defaults to `False`.
            return_timestamps (str, optional):
                Returned timestamp level. Defaults to `"word"`.
            keep_whitespace (bool, optional):
                Whether to preserve whitespace predictions. Defaults to `False`.

        Returns:
            Union[List[str], List[List[Dict[str, Union[str, float]]]]]:
                Defaults to list of transcriptions.
                If `output_offsets` is `True`, return list of offsets.

        ### Example
        ```pycon title="example_transcriber_predict.py"
        >>> from speechline.transcribers import Wav2Vec2Transcriber
        >>> from datasets import Dataset, Audio
        >>> transcriber = Wav2Vec2Transcriber("bookbot/wav2vec2-ljspeech-gruut")
        >>> dataset = Dataset.from_dict({"audio": ["sample.wav"]}).cast_column(
        ...     "audio", Audio(sampling_rate=transcriber.sr)
        ... )
        >>> transcripts = transcriber.predict(dataset)
        >>> transcripts
        ["ɪ t ɪ z n oʊ t ʌ p"]
        >>> offsets = transcriber.predict(dataset, output_offsets=True)
        >>> offsets
        [
            [
                {"text": "ɪ", "start_time": 0.0, "end_time": 0.02},
                {"text": "t", "start_time": 0.26, "end_time": 0.3},
                {"text": "ɪ", "start_time": 0.34, "end_time": 0.36},
                {"text": "z", "start_time": 0.42, "end_time": 0.44},
                {"text": "n", "start_time": 0.5, "end_time": 0.54},
                {"text": "oʊ", "start_time": 0.54, "end_time": 0.58},
                {"text": "t", "start_time": 0.58, "end_time": 0.62},
                {"text": "ʌ", "start_time": 0.76, "end_time": 0.78},
                {"text": "p", "start_time": 0.92, "end_time": 0.94},
            ]
        ]
        ```
        """
        return self.inference(
            dataset,
            chunk_length_s=chunk_length_s,
            output_offsets=output_offsets,
            offset_key="text",
            return_timestamps=return_timestamps,
            keep_whitespace=keep_whitespace,
        )
````

`predict(self, dataset, chunk_length_s=30, output_offsets=False, return_timestamps='word', keep_whitespace=False)`

Performs inference on `dataset`.

Parameters:

| Name                | Type      | Description                                                      | Default    |
| ------------------- | --------- | ---------------------------------------------------------------- | ---------- |
| `dataset`           | `Dataset` | Dataset to run inference on.                                     | *required* |
| `chunk_length_s`    | `int`     | Audio chunk length during inference. Defaults to `30`.           | `30`       |
| `output_offsets`    | `bool`    | Whether to output timestamps. Defaults to `False`.               | `False`    |
| `return_timestamps` | `str`     | Returned timestamp level. Defaults to `"word"`.                  | `'word'`   |
| `keep_whitespace`   | `bool`    | Whether to preserve whitespace predictions. Defaults to `False`. | `False`    |
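
The options above combine freely. A hedged sketch of a call exercising each non-default knob, reusing the `transcriber` and `dataset` objects constructed in the example below; `"char"`-level timestamps are an assumption based on the timestamp levels supported by the underlying HuggingFace CTC pipeline:

```python
# Hypothetical call combining the non-default arguments documented above.
offsets = transcriber.predict(
    dataset,
    chunk_length_s=10,         # chunk long audio into 10-second windows
    output_offsets=True,       # return per-token timestamps, not plain strings
    return_timestamps="char",  # assumed alternative to the default "word" level
    keep_whitespace=True,      # keep whitespace tokens in the offsets
)
```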

Returns:

| Type                                                          | Description                                                                           |
| ------------------------------------------------------------- | ------------------------------------------------------------------------------------- |
| `Union[List[str], List[List[Dict[str, Union[str, float]]]]]` | List of transcriptions by default; if `output_offsets` is `True`, a list of offsets. |

Example

```pycon title="example_transcriber_predict.py"
>>> from speechline.transcribers import Wav2Vec2Transcriber
>>> from datasets import Dataset, Audio
>>> transcriber = Wav2Vec2Transcriber("bookbot/wav2vec2-ljspeech-gruut")
>>> dataset = Dataset.from_dict({"audio": ["sample.wav"]}).cast_column(
...     "audio", Audio(sampling_rate=transcriber.sr)
... )
>>> transcripts = transcriber.predict(dataset)
>>> transcripts
["ɪ t ɪ z n oʊ t ʌ p"]
>>> offsets = transcriber.predict(dataset, output_offsets=True)
>>> offsets
[
    [
        {"text": "ɪ", "start_time": 0.0, "end_time": 0.02},
        {"text": "t", "start_time": 0.26, "end_time": 0.3},
        {"text": "ɪ", "start_time": 0.34, "end_time": 0.36},
        {"text": "z", "start_time": 0.42, "end_time": 0.44},
        {"text": "n", "start_time": 0.5, "end_time": 0.54},
        {"text": "oʊ", "start_time": 0.54, "end_time": 0.58},
        {"text": "t", "start_time": 0.58, "end_time": 0.62},
        {"text": "ʌ", "start_time": 0.76, "end_time": 0.78},
        {"text": "p", "start_time": 0.92, "end_time": 0.94},
    ]
]
```
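
Because every offset carries `start_time` and `end_time`, per-token statistics follow directly; a minimal sketch, assuming offsets shaped as shown above, that computes phoneme durations:

```python
# `offsets` is nested per audio sample; take the first (and only) sample.
durations = [
    {"text": o["text"], "duration": round(o["end_time"] - o["start_time"], 2)}
    for o in offsets[0]
]
# e.g. [{"text": "ɪ", "duration": 0.02}, {"text": "t", "duration": 0.04}, ...]
```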