Audio Classification with Padding
`speechline.pipelines.audio_classification.AudioClassificationWithPaddingPipeline` (subclass of `AudioClassificationPipeline`)

Pads/truncates the audio array to a maximum length before performing audio classification.
Source code in speechline/pipelines/audio_classification.py
class AudioClassificationWithPaddingPipeline(AudioClassificationPipeline):
    """
    Subclass of `AudioClassificationPipeline`.

    Pads/truncates the audio array to a maximum length before performing
    audio classification.
    """

    def __init__(self, *args, **kwargs):
        # Pop (not get) so this custom kwarg is consumed here and NOT
        # forwarded to the base pipeline constructor, which does not
        # accept `max_duration_s` and would raise on it.
        self.max_duration_s = kwargs.pop("max_duration_s", None)
        super().__init__(*args, **kwargs)

    def preprocess(self, inputs: np.ndarray) -> torch.Tensor:
        """
        Pre-process `inputs` to a maximum length used during model's training.

        Let `max_length = int(sampling_rate * max_duration_s)`.
        Audio arrays shorter than `max_length` will be padded to `max_length`,
        while arrays longer than `max_length` will be truncated to `max_length`.

        Args:
            inputs (np.ndarray):
                Input audio array.

        Returns:
            torch.Tensor:
                Pre-processed audio array as PyTorch tensors.
        """
        processed = self.feature_extractor(
            inputs,
            sampling_rate=self.feature_extractor.sampling_rate,
            return_tensors="pt",
            max_length=int(self.feature_extractor.sampling_rate * self.max_duration_s),
            # `padding="max_length"` is required for the documented behavior:
            # without it HF feature extractors default to no padding, so
            # arrays shorter than `max_length` would pass through unpadded.
            padding="max_length",
            truncation=True,  # arrays longer than max_length are cut down
        )
        return processed
`preprocess(self, inputs)`

Pre-process `inputs` to a maximum length used during the model's training.
Let `max_length = int(sampling_rate * max_duration_s)`. Audio arrays shorter
than `max_length` will be padded to `max_length`, while arrays longer than
`max_length` will be truncated to `max_length`.
Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| `inputs` | `np.ndarray` | Input audio array. | *required* |
Returns:

| Type | Description |
|---|---|
| `torch.Tensor` | Pre-processed audio array as PyTorch tensors. |
Source code in speechline/pipelines/audio_classification.py
def preprocess(self, inputs: np.ndarray) -> torch.Tensor:
    """
    Pre-process `inputs` to a maximum length used during model's training.

    Let `max_length = int(sampling_rate * max_duration_s)`.
    Audio arrays shorter than `max_length` will be padded to `max_length`,
    while arrays longer than `max_length` will be truncated to `max_length`.

    Args:
        inputs (np.ndarray):
            Input audio array.

    Returns:
        torch.Tensor:
            Pre-processed audio array as PyTorch tensors.
    """
    processed = self.feature_extractor(
        inputs,
        sampling_rate=self.feature_extractor.sampling_rate,
        return_tensors="pt",
        max_length=int(self.feature_extractor.sampling_rate * self.max_duration_s),
        # `padding="max_length"` is required for the documented behavior:
        # HF feature extractors default to no padding, so without it arrays
        # shorter than `max_length` would pass through unpadded.
        padding="max_length",
        truncation=True,  # arrays longer than max_length are cut down
    )
    return processed