Audio Transcriber

speechline.modules.audio_transcriber.AudioTranscriber (AudioModule)

Generic AudioTranscriber class for speech/phoneme recognition.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `model_checkpoint` | `str` | HuggingFace Hub model checkpoint. | *required* |
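
A minimal usage sketch (the checkpoint name is illustrative; any compatible speech or phoneme recognition checkpoint on the HuggingFace Hub should work):

```python
from speechline.modules.audio_transcriber import AudioTranscriber

# Illustrative checkpoint; substitute any HF Hub ASR/phoneme-recognition checkpoint.
transcriber = AudioTranscriber(model_checkpoint="facebook/wav2vec2-base-960h")
```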
Source code in speechline/modules/audio_transcriber.py
class AudioTranscriber(AudioModule):
    """
    Generic AudioTranscriber class for speech/phoneme recognition.

    Args:
        model_checkpoint (str):
            HuggingFace Hub model checkpoint.
    """

    def __init__(self, model_checkpoint: str) -> None:
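        # Build an ASR pipeline; run on GPU (device 0) when CUDA is available, else CPU (-1).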
        asr = pipeline(
            "automatic-speech-recognition",
            model=model_checkpoint,
            device=0 if torch.cuda.is_available() else -1,
            pipeline_class=AutomaticSpeechRecognitionFilteredPipeline,
        )
        super().__init__(pipeline=asr)

    def inference(
        self,
        dataset: Dataset,
        chunk_length_s: int = 0,
        output_offsets: bool = False,
        offset_key: str = "text",
        return_timestamps: Union[str, bool] = True,
        keep_whitespace: bool = False,
        **kwargs,
    ) -> Union[List[List[Dict[str, Union[str, float]]]], List[str]]:
        """
        Inference/prediction function to be mapped to a dataset.

        Args:
            dataset (Dataset):
                Dataset to be inferred.
            chunk_length_s (int, optional):
                Audio chunk length in seconds. Defaults to `0` (no chunking).
            output_offsets (bool, optional):
                Whether to output offsets. Defaults to `False`.
            offset_key (str, optional):
                Offset dictionary key. Defaults to `"text"`.
            return_timestamps (Union[str, bool], optional):
                `return_timestamps` argument in `AutomaticSpeechRecognitionPipeline`'s
                `__call__` method. Use `"char"` for CTC-based models and
                `True` for Whisper-based models.
                Defaults to `True`.
            keep_whitespace (bool, optional):
                Whether to preserve whitespace predictions. Defaults to `False`.

        Returns:
            Union[List[List[Dict[str, Union[str, float]]]], List[str]]:
                List of predictions.
        """

        def _format_timestamps_to_offsets(
            timestamps: Dict[str, Union[str, List[Dict[str, Union[str, Tuple[float, float]]]]]],
            offset_key: str = "text",
            keep_whitespace: bool = False,
        ) -> List[Dict[str, Union[str, float]]]:
            """
            Formats `AutomaticSpeechRecognitionPipeline`'s timestamp outputs to
            a list of offsets with the following format:

            ```json
            [
                {
                    "{offset_key}": {text},
                    "start_time": {start_time},
                    "end_time": {end_time}
                },
                {
                    "{offset_key}": {text},
                    "start_time": {start_time},
                    "end_time": {end_time}
                },
                ...
            ]
            ```

            Args:
                timestamps (Dict[str, Union[str, List[Dict[str, Union[str, Tuple[float, float]]]]]]):  # noqa: E501
                    Output timestamps from `AutomaticSpeechRecognitionPipeline`.
                offset_key (str, optional):
                    Transcript dictionary key in offset. Defaults to `"text"`.
                keep_whitespace (bool, optional):
                    Whether to preserve whitespace predictions. Defaults to `False`.

            Returns:
                List[Dict[str, Union[str, float]]]:
                    List of offsets.
            """
            return [
                {
                    offset_key: o["text"] if keep_whitespace else o["text"].strip(),
                    "start_time": round(o["timestamp"][0], 3),
                    "end_time": round(o["timestamp"][1], 3),
                }
                for o in timestamps["chunks"]
                if o["text"] != " " or keep_whitespace
            ]

        def _format_timestamps_to_transcript(
            timestamps: Dict[str, Union[str, List[Dict[str, Union[str, Tuple[float, float]]]]]],
        ) -> str:
            """
            Formats `AutomaticSpeechRecognitionPipeline`'s timestamp outputs
            to a transcript string.

            Args:
                timestamps (Dict[str, Union[str, List[Dict[str, Union[str, Tuple[float, float]]]]]]):  # noqa: E501
                    Output timestamps from `AutomaticSpeechRecognitionPipeline`.

            Returns:
                str:
                    Transcript string.
            """
            return " ".join([o["text"].strip() for o in timestamps["chunks"] if o["text"] != " "])

        def _get_audio_array(
            dataset: Dataset,
        ) -> Generator[Dict[str, Union[np.ndarray, int, str]], None, None]:
            for item in dataset:
                yield {**item["audio"]}

        results = []

        for out in tqdm(
            self.pipeline(
                _get_audio_array(dataset),
                chunk_length_s=chunk_length_s,
                return_timestamps=return_timestamps,
                **kwargs,
            ),
            total=len(dataset),
            desc="Transcribing Audios",
        ):
            prediction = (
                _format_timestamps_to_offsets(
                    out,
                    offset_key=offset_key,
                    keep_whitespace=keep_whitespace,
                )
                if output_offsets
                else _format_timestamps_to_transcript(out)
            )
            results.append(prediction)

        return results
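
Note that `_get_audio_array` yields each item's `"audio"` dictionary, so `inference` expects a 🤗 `datasets.Dataset` whose `audio` column is decoded by the `Audio` feature (producing `array`, `sampling_rate`, and `path`). A sketch of preparing such a dataset, with placeholder file paths:

```python
from datasets import Audio, Dataset

# Placeholder paths; point these at real audio files.
dataset = Dataset.from_dict({"audio": ["sample1.wav", "sample2.wav"]})
# Decode audio into {"array", "sampling_rate", "path"} dictionaries.
dataset = dataset.cast_column("audio", Audio(sampling_rate=16_000))
```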

inference(self, dataset, chunk_length_s=0, output_offsets=False, offset_key='text', return_timestamps=True, keep_whitespace=False, **kwargs)

Inference/prediction function to be mapped to a dataset.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `dataset` | `Dataset` | Dataset to be inferred. | *required* |
| `chunk_length_s` | `int` | Audio chunk length in seconds. Defaults to `0` (no chunking). | `0` |
| `output_offsets` | `bool` | Whether to output offsets. Defaults to `False`. | `False` |
| `offset_key` | `str` | Offset dictionary key. Defaults to `"text"`. | `'text'` |
| `return_timestamps` | `Union[str, bool]` | `return_timestamps` argument in `AutomaticSpeechRecognitionPipeline`'s `__call__` method. Use `"char"` for CTC-based models and `True` for Whisper-based models. Defaults to `True`. | `True` |
| `keep_whitespace` | `bool` | Whether to preserve whitespace predictions. Defaults to `False`. | `False` |

Returns:

| Type | Description |
| --- | --- |
| `Union[List[List[Dict[str, Union[str, float]]]], List[str]]` | List of predictions. |
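
A sketch of both output modes, reusing the `transcriber` and `dataset` from the sketches above (assumed to wrap a CTC-based checkpoint, hence `return_timestamps="char"`):

```python
# Plain transcripts: one string per utterance.
transcripts = transcriber.inference(dataset, return_timestamps="char")

# Offsets: one list of {"text", "start_time", "end_time"} dictionaries per utterance.
offsets = transcriber.inference(
    dataset,
    output_offsets=True,
    offset_key="text",
    return_timestamps="char",  # use True instead for Whisper-based models
)
```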

Source code in speechline/modules/audio_transcriber.py
def inference(
    self,
    dataset: Dataset,
    chunk_length_s: int = 0,
    output_offsets: bool = False,
    offset_key: str = "text",
    return_timestamps: Union[str, bool] = True,
    keep_whitespace: bool = False,
    **kwargs,
) -> Union[List[List[Dict[str, Union[str, float]]]], List[str]]:
    """
    Inference/prediction function to be mapped to a dataset.

    Args:
        dataset (Dataset):
            Dataset to be inferred.
        chunk_length_s (int, optional):
            Audio chunk length in seconds. Defaults to `0` (no chunking).
        output_offsets (bool, optional):
            Whether to output offsets. Defaults to `False`.
        offset_key (str, optional):
            Offset dictionary key. Defaults to `"text"`.
        return_timestamps (Union[str, bool], optional):
            `return_timestamps` argument in `AutomaticSpeechRecognitionPipeline`'s
            `__call__` method. Use `"char"` for CTC-based models and
            `True` for Whisper-based models.
            Defaults to `True`.
        keep_whitespace (bool, optional):
            Whether to preserve whitespace predictions. Defaults to `False`.

    Returns:
        Union[List[List[Dict[str, Union[str, float]]]], List[str]]:
            List of predictions.
    """

    def _format_timestamps_to_offsets(
        timestamps: Dict[str, Union[str, List[Dict[str, Union[str, Tuple[float, float]]]]]],
        offset_key: str = "text",
        keep_whitespace: bool = False,
    ) -> List[Dict[str, Union[str, float]]]:
        """
        Formats `AutomaticSpeechRecognitionPipeline`'s timestamp outputs to
        a list of offsets with the following format:

        ```json
        [
            {
                "{offset_key}": {text},
                "start_time": {start_time},
                "end_time": {end_time}
            },
            {
                "{offset_key}": {text},
                "start_time": {start_time},
                "end_time": {end_time}
            },
            ...
        ]
        ```

        Args:
            timestamps (Dict[str, Union[str, List[Dict[str, Union[str, Tuple[float, float]]]]]]):  # noqa: E501
                Output timestamps from `AutomaticSpeechRecognitionPipeline`.
            offset_key (str, optional):
                Transcript dictionary key in offset. Defaults to `"text"`.
            keep_whitespace (bool, optional):
                Whether to preserve whitespace predictions. Defaults to `False`.

        Returns:
            List[Dict[str, Union[str, float]]]:
                List of offsets.
        """
        return [
            {
                offset_key: o["text"] if keep_whitespace else o["text"].strip(),
                "start_time": round(o["timestamp"][0], 3),
                "end_time": round(o["timestamp"][1], 3),
            }
            for o in timestamps["chunks"]
            if o["text"] != " " or keep_whitespace
        ]

    def _format_timestamps_to_transcript(
        timestamps: Dict[str, Union[str, List[Dict[str, Union[str, Tuple[float, float]]]]]],
    ) -> str:
        """
        Formats `AutomaticSpeechRecognitionPipeline`'s timestamp outputs
        to a transcript string.

        Args:
            timestamps (Dict[str, Union[str, List[Dict[str, Union[str, Tuple[float, float]]]]]]):  # noqa: E501
                Output timestamps from `AutomaticSpeechRecognitionPipeline`.

        Returns:
            str:
                Transcript string.
        """
        return " ".join([o["text"].strip() for o in timestamps["chunks"] if o["text"] != " "])

    def _get_audio_array(
        dataset: Dataset,
    ) -> Generator[Dict[str, Union[np.ndarray, int, str]], None, None]:
        for item in dataset:
            yield {**item["audio"]}

    results = []

    for out in tqdm(
        self.pipeline(
            _get_audio_array(dataset),
            chunk_length_s=chunk_length_s,
            return_timestamps=return_timestamps,
            **kwargs,
        ),
        total=len(dataset),
        desc="Transcribing Audios",
    ):
        prediction = (
            _format_timestamps_to_offsets(
                out,
                offset_key=offset_key,
                keep_whitespace=keep_whitespace,
            )
            if output_offsets
            else _format_timestamps_to_transcript(out)
        )
        results.append(prediction)

    return results
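
For intuition, here is a fabricated pipeline output in the shape returned with `return_timestamps="char"`, and the offsets it would be formatted into (not from a real model run):

```python
# Fabricated pipeline output, shaped like a timestamped ASR result.
out = {
    "text": "h i",
    "chunks": [
        {"text": "h", "timestamp": (0.0, 0.02)},
        {"text": " ", "timestamp": (0.02, 0.04)},  # dropped unless keep_whitespace=True
        {"text": "i", "timestamp": (0.04, 0.06)},
    ],
}
# With output_offsets=True, this becomes:
# [{"text": "h", "start_time": 0.0, "end_time": 0.02},
#  {"text": "i", "start_time": 0.04, "end_time": 0.06}]
# With output_offsets=False, it becomes the transcript string "h i".
```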