Skip to content

Audio Classification with Padding

speechline.pipelines.audio_classification.AudioClassificationWithPaddingPipeline (AudioClassificationPipeline)

Subclass of AudioClassificationPipeline. Pads/truncates audio array to maximum length before performing audio classification.

Source code in speechline/pipelines/audio_classification.py
class AudioClassificationWithPaddingPipeline(AudioClassificationPipeline):
    """
    Subclass of `AudioClassificationPipeline`.
    Pads/truncates audio array to maximum length before performing audio classification.
    """

    def __init__(self, *args, **kwargs):
        """
        Capture `max_duration_s` before delegating to the base pipeline.

        Args:
            max_duration_s (float):
                Maximum audio duration in seconds used during the model's
                training. Inputs are padded/truncated to
                `sampling_rate * max_duration_s` samples in `preprocess`.
        """
        # pop (not get) so this custom kwarg is not forwarded to the base
        # pipeline's __init__, which does not recognize it.
        self.max_duration_s = kwargs.pop("max_duration_s", None)
        super().__init__(*args, **kwargs)

    def preprocess(self, inputs: np.ndarray) -> torch.Tensor:
        """
        Pre-process `inputs` to a maximum length used during model's training.
        Let `max_length = int(sampling_rate * max_duration_s)`.
        Audio arrays shorter than `max_length` will be padded to `max_length`,
        while arrays longer than `max_length` will be truncated to `max_length`.

        Args:
            inputs (np.ndarray):
                Input audio array.

        Returns:
            torch.Tensor:
                Pre-processed audio (a `BatchFeature` holding PyTorch tensors).

        Raises:
            ValueError:
                If `max_duration_s` was not provided at construction time.
        """
        # Fail fast with a clear message instead of an opaque
        # `int(sr * None)` TypeError deep inside the call below.
        if self.max_duration_s is None:
            raise ValueError(
                "max_duration_s must be provided to pad/truncate audio inputs."
            )
        max_length = int(self.feature_extractor.sampling_rate * self.max_duration_s)
        # `padding="max_length"` is required for the documented behavior:
        # without it, most feature extractors (e.g. Wav2Vec2) leave short
        # inputs unpadded and only truncation takes effect.
        processed = self.feature_extractor(
            inputs,
            sampling_rate=self.feature_extractor.sampling_rate,
            return_tensors="pt",
            max_length=max_length,
            padding="max_length",
            truncation=True,
        )
        return processed

preprocess(self, inputs)

Pre-process inputs to a maximum length used during model's training. Let max_length = int(sampling_rate * max_duration_s). Audio arrays shorter than max_length will be padded to max_length, while arrays longer than max_length will be truncated to max_length.

Parameters:

| Name   | Type         | Description        | Default  |
|--------|--------------|--------------------|----------|
| inputs | `np.ndarray` | Input audio array. | required |

Returns:

| Type           | Description                                  |
|----------------|----------------------------------------------|
| `torch.Tensor` | Pre-processed audio array as PyTorch tensors. |

Source code in speechline/pipelines/audio_classification.py
def preprocess(self, inputs: np.ndarray) -> torch.Tensor:
    """
    Pre-process `inputs` to a maximum length used during model's training.
    Let `max_length = int(sampling_rate * max_duration_s)`.
    Audio arrays shorter than `max_length` will be padded to `max_length`,
    while arrays longer than `max_length` will be truncated to `max_length`.

    Args:
        inputs (np.ndarray):
            Input audio array.

    Returns:
        torch.Tensor:
            Pre-processed audio (a `BatchFeature` holding PyTorch tensors).
    """
    max_length = int(self.feature_extractor.sampling_rate * self.max_duration_s)
    # `padding="max_length"` is required for the documented behavior:
    # without it, most feature extractors (e.g. Wav2Vec2) leave short
    # inputs unpadded and only truncation takes effect.
    processed = self.feature_extractor(
        inputs,
        sampling_rate=self.feature_extractor.sampling_rate,
        return_tensors="pt",
        max_length=max_length,
        padding="max_length",
        truncation=True,
    )
    return processed