# Whisper Transcriber

## `speechline.transcribers.whisper.WhisperTranscriber (AudioTranscriber)`

Whisper model for seq2seq speech recognition with its processor.

**Parameters:**

| Name | Type | Description | Default |
|---|---|---|---|
| `model_checkpoint` | `str` | HuggingFace model hub checkpoint. | *required* |
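A minimal construction sketch; `openai/whisper-tiny` is one example checkpoint, and the `sr` attribute inherited from `AudioTranscriber` (used in the examples below) is assumed to expose the checkpoint's expected sampling rate:

```python
from speechline.transcribers import WhisperTranscriber

# load the seq2seq Whisper model and its processor from the HuggingFace Hub
transcriber = WhisperTranscriber("openai/whisper-tiny")

# Whisper checkpoints expect 16 kHz audio; `sr` is used below to resample inputs
print(transcriber.sr)  # 16000
```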
Source code in `speechline/transcribers/whisper.py`

````python
class WhisperTranscriber(AudioTranscriber):
    """
    Whisper model for seq2seq speech recognition with its processor.

    Args:
        model_checkpoint (str):
            HuggingFace model hub checkpoint.
    """

    def __init__(self, model_checkpoint: str) -> None:
        super().__init__(model_checkpoint)

    def predict(
        self,
        dataset: Dataset,
        chunk_length_s: int = 0,
        output_offsets: bool = False,
        return_timestamps: bool = True,
        keep_whitespace: bool = False,
    ) -> Union[List[str], List[List[Dict[str, Union[str, float]]]]]:
        """
        Performs inference on `dataset`.

        Args:
            dataset (Dataset):
                Dataset to run inference on.
            chunk_length_s (int):
                Audio chunk length during inference. Defaults to `0`.
            output_offsets (bool, optional):
                Whether to output text offsets. Defaults to `False`.
            return_timestamps (bool, optional):
                Returned timestamp level. Defaults to `True`.
            keep_whitespace (bool, optional):
                Whether to preserve whitespace predictions. Defaults to `False`.

        Returns:
            Union[List[str], List[List[Dict[str, Union[str, float]]]]]:
                Defaults to list of transcriptions.
                If `output_offsets` is `True`, return list of text offsets.

        ### Example
        ```pycon title="example_transcriber_predict.py"
        >>> from speechline.transcribers import WhisperTranscriber
        >>> from datasets import Dataset, Audio
        >>> transcriber = WhisperTranscriber("openai/whisper-tiny")
        >>> dataset = Dataset.from_dict({"audio": ["sample.wav"]}).cast_column(
        ...     "audio", Audio(sampling_rate=transcriber.sr)
        ... )
        >>> transcripts = transcriber.predict(dataset)
        >>> transcripts
        ["Her red umbrella is just the best."]
        >>> offsets = transcriber.predict(dataset, output_offsets=True)
        >>> offsets
        [
            [
                {
                    "text": "Her red umbrella is just the best.",
                    "start_time": 0.0,
                    "end_time": 3.0,
                }
            ]
        ]
        ```
        """
        return self.inference(
            dataset,
            chunk_length_s=chunk_length_s,
            output_offsets=output_offsets,
            offset_key="text",
            return_timestamps=return_timestamps,
            keep_whitespace=keep_whitespace,
            generate_kwargs={"max_new_tokens": 448},
        )
````
### `predict(self, dataset, chunk_length_s=0, output_offsets=False, return_timestamps=True, keep_whitespace=False)`

Performs inference on `dataset`.
**Parameters:**

| Name | Type | Description | Default |
|---|---|---|---|
| `dataset` | `Dataset` | Dataset to run inference on. | *required* |
| `chunk_length_s` | `int` | Audio chunk length during inference. | `0` |
| `output_offsets` | `bool` | Whether to output text offsets. | `False` |
| `return_timestamps` | `bool` | Returned timestamp level. | `True` |
| `keep_whitespace` | `bool` | Whether to preserve whitespace predictions. | `False` |
**Returns:**

| Type | Description |
|---|---|
| `Union[List[str], List[List[Dict[str, Union[str, float]]]]]` | List of transcriptions by default. If `output_offsets` is `True`, a list of text offsets. |
#### Example

```pycon title="example_transcriber_predict.py"
>>> from speechline.transcribers import WhisperTranscriber
>>> from datasets import Dataset, Audio
>>> transcriber = WhisperTranscriber("openai/whisper-tiny")
>>> dataset = Dataset.from_dict({"audio": ["sample.wav"]}).cast_column(
...     "audio", Audio(sampling_rate=transcriber.sr)
... )
>>> transcripts = transcriber.predict(dataset)
>>> transcripts
["Her red umbrella is just the best."]
>>> offsets = transcriber.predict(dataset, output_offsets=True)
>>> offsets
[
    [
        {
            "text": "Her red umbrella is just the best.",
            "start_time": 0.0,
            "end_time": 3.0,
        }
    ]
]
```
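For downstream use, the nested per-audio offset lists can be flattened into simple segment records. A minimal sketch, assuming `offsets` has the shape shown above; `offsets_to_rows` is a hypothetical helper, not part of speechline:

```python
from typing import Dict, List, Tuple, Union

def offsets_to_rows(
    offsets: List[List[Dict[str, Union[str, float]]]]
) -> List[Tuple[int, float, float, str]]:
    """Flatten per-audio offset lists into (audio_index, start, end, text) rows."""
    rows = []
    for audio_idx, segments in enumerate(offsets):  # one list per input audio
        for seg in segments:  # one dict per transcribed segment
            rows.append((audio_idx, seg["start_time"], seg["end_time"], seg["text"]))
    return rows

# e.g. [(0, 0.0, 3.0, "Her red umbrella is just the best.")]
```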
Source code in `speechline/transcribers/whisper.py`

````python
def predict(
    self,
    dataset: Dataset,
    chunk_length_s: int = 0,
    output_offsets: bool = False,
    return_timestamps: bool = True,
    keep_whitespace: bool = False,
) -> Union[List[str], List[List[Dict[str, Union[str, float]]]]]:
    """
    Performs inference on `dataset`.

    Args:
        dataset (Dataset):
            Dataset to run inference on.
        chunk_length_s (int):
            Audio chunk length during inference. Defaults to `0`.
        output_offsets (bool, optional):
            Whether to output text offsets. Defaults to `False`.
        return_timestamps (bool, optional):
            Returned timestamp level. Defaults to `True`.
        keep_whitespace (bool, optional):
            Whether to preserve whitespace predictions. Defaults to `False`.

    Returns:
        Union[List[str], List[List[Dict[str, Union[str, float]]]]]:
            Defaults to list of transcriptions.
            If `output_offsets` is `True`, return list of text offsets.

    ### Example
    ```pycon title="example_transcriber_predict.py"
    >>> from speechline.transcribers import WhisperTranscriber
    >>> from datasets import Dataset, Audio
    >>> transcriber = WhisperTranscriber("openai/whisper-tiny")
    >>> dataset = Dataset.from_dict({"audio": ["sample.wav"]}).cast_column(
    ...     "audio", Audio(sampling_rate=transcriber.sr)
    ... )
    >>> transcripts = transcriber.predict(dataset)
    >>> transcripts
    ["Her red umbrella is just the best."]
    >>> offsets = transcriber.predict(dataset, output_offsets=True)
    >>> offsets
    [
        [
            {
                "text": "Her red umbrella is just the best.",
                "start_time": 0.0,
                "end_time": 3.0,
            }
        ]
    ]
    ```
    """
    return self.inference(
        dataset,
        chunk_length_s=chunk_length_s,
        output_offsets=output_offsets,
        offset_key="text",
        return_timestamps=return_timestamps,
        keep_whitespace=keep_whitespace,
        generate_kwargs={"max_new_tokens": 448},
    )
````
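Whisper's encoder operates on 30-second windows, and the hardcoded `generate_kwargs={"max_new_tokens": 448}` matches Whisper's maximum decoder length. For audio longer than one window, passing a non-zero `chunk_length_s` enables chunked inference. A sketch under the same assumptions as the docstring example; `long_sample.wav` is a placeholder path:

```python
from datasets import Audio, Dataset

from speechline.transcribers import WhisperTranscriber

transcriber = WhisperTranscriber("openai/whisper-tiny")
dataset = Dataset.from_dict({"audio": ["long_sample.wav"]}).cast_column(
    "audio", Audio(sampling_rate=transcriber.sr)
)

# split long audio into 30-second chunks during inference, keeping
# per-segment timestamps so the chunked outputs can be stitched back
offsets = transcriber.predict(dataset, chunk_length_s=30, output_offsets=True)
```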