Runner

speechline.run.Runner dataclass

Runner()

Source code in speechline/run.py
class Runner:
    @staticmethod
    def parse_args(args: List[str]) -> argparse.Namespace:
        """
        Utility argument parser function for SpeechLine.

        Args:
            args (List[str]):
                List of arguments.

        Returns:
            argparse.Namespace:
                Object with argument values as attributes.
        """
        parser = argparse.ArgumentParser(
            prog="python speechline/run.py",
            description="Perform end-to-end speech labeling pipeline.",
        )

        parser.add_argument(
            "-i",
            "--input_dir",
            type=str,
            required=True,
            help="Directory of input audios.",
        )
        parser.add_argument(
            "-o",
            "--output_dir",
            type=str,
            required=True,
            help="Directory to save pipeline results.",
        )
        parser.add_argument(
            "-c",
            "--config",
            type=str,
            default="examples/config.json",
            help="SpeechLine configuration file.",
        )
        return parser.parse_args(args)

    @staticmethod
    def run(config: Config, input_dir: str, output_dir: str) -> None:
        """
        Runs the end-to-end SpeechLine pipeline.

        ### Pipeline Overview
        - Classifies audio for children's speech (optional).
        - Transcribes audio.
        - Segments audio into chunks based on silences.

        Args:
            config (Config):
                SpeechLine Config object.
            input_dir (str):
                Path to input directory.
            output_dir (str):
                Path to output directory.
        """
        logger.info("Preparing DataFrame..")
        df = prepare_dataframe(input_dir, audio_extension="wav")

        if config.filter_empty_transcript:
            df = df[df["ground_truth"] != ""]

        if config.do_classify:
            # load classifier model
            classifier = Wav2Vec2Classifier(
                config.classifier.model,
                max_duration_s=config.classifier.max_duration_s,
            )

            # perform audio classification
            dataset = format_audio_dataset(df, sampling_rate=classifier.sampling_rate)
            df["category"] = classifier.predict(dataset)

            # filter audio by category
            df = df[df["category"] == "child"]

        # load transcriber model
        if config.transcriber.type == "wav2vec2":
            transcriber = Wav2Vec2Transcriber(config.transcriber.model)
        elif config.transcriber.type == "whisper":
            transcriber = WhisperTranscriber(config.transcriber.model)

        # perform audio transcription
        dataset = format_audio_dataset(df, sampling_rate=transcriber.sampling_rate)

        output_offsets = transcriber.predict(
            dataset,
            chunk_length_s=config.transcriber.chunk_length_s,
            output_offsets=True,
            return_timestamps=config.transcriber.return_timestamps,
            keep_whitespace=config.segmenter.keep_whitespace,
        )

        # segment audios based on offsets
        if config.segmenter.type == "silence":
            segmenter = SilenceSegmenter()
        elif config.segmenter.type == "word_overlap":
            segmenter = WordOverlapSegmenter()
        elif config.segmenter.type == "phoneme_overlap":
            lexicon = Lexicon()
            if config.segmenter.lexicon_path:
                with open(config.segmenter.lexicon_path) as json_file:
                    lex = json.load(json_file)
                # merge dict with lexicon
                for k, v in lex.items():
                    lexicon[k] = lexicon[k].union(set(v)) if k in lexicon else set(v)
            segmenter = PhonemeOverlapSegmenter(lexicon)

        tokenizer = WordTokenizer()

        if config.do_noise_classify:
            noise_classifier = config.noise_classifier.model
            minimum_empty_duration = config.noise_classifier.minimum_empty_duration
            noise_classifier_threshold = config.noise_classifier.threshold
        else:
            noise_classifier = None
            minimum_empty_duration = None
            noise_classifier_threshold = None

        def export_and_chunk(
            audio_path: str,
            ground_truth: str,
            offsets: List[Dict[str, Union[str, float]]],
        ):
            json_path = Path(audio_path).with_suffix(".json")
            # export JSON transcripts
            export_transcripts_json(str(json_path), offsets)
            # chunk audio into segments
            segmenter.chunk_audio_segments(
                audio_path,
                output_dir,
                offsets,
                do_noise_classify=config.do_noise_classify,
                noise_classifier=noise_classifier,
                minimum_empty_duration=minimum_empty_duration,
                minimum_chunk_duration=config.segmenter.minimum_chunk_duration,
                noise_classifier_threshold=noise_classifier_threshold,
                silence_duration=config.segmenter.silence_duration,
                ground_truth=tokenizer(ground_truth),
            )

        thread_map(
            export_and_chunk,
            df["audio"],
            df["ground_truth"],
            output_offsets,
            desc="Segmenting Audio into Chunks",
            total=len(df),
        )

parse_args(args) staticmethod

Utility argument parser function for SpeechLine.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `args` | `List[str]` | List of arguments. | *required* |

Returns:

| Type | Description |
| --- | --- |
| `argparse.Namespace` | Object with argument values as attributes. |

Source code in speechline/run.py
@staticmethod
def parse_args(args: List[str]) -> argparse.Namespace:
    """
    Utility argument parser function for SpeechLine.

    Args:
        args (List[str]):
            List of arguments.

    Returns:
        argparse.Namespace:
            Object with argument values as attributes.
    """
    parser = argparse.ArgumentParser(
        prog="python speechline/run.py",
        description="Perform end-to-end speech labeling pipeline.",
    )

    parser.add_argument(
        "-i",
        "--input_dir",
        type=str,
        required=True,
        help="Directory of input audios.",
    )
    parser.add_argument(
        "-o",
        "--output_dir",
        type=str,
        required=True,
        help="Directory to save pipeline results.",
    )
    parser.add_argument(
        "-c",
        "--config",
        type=str,
        default="examples/config.json",
        help="SpeechLine configuration file.",
    )
    return parser.parse_args(args)
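
For orientation, a minimal usage sketch of `parse_args`; the directory names below are illustrative, not part of the API:

```python
from speechline.run import Runner

# Parse an explicit argument list, e.g. sys.argv[1:] in a real entry point.
args = Runner.parse_args(
    ["--input_dir", "recordings/", "--output_dir", "chunks/", "--config", "examples/config.json"]
)
print(args.input_dir, args.output_dir, args.config)
```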

run(config, input_dir, output_dir) staticmethod

Runs the end-to-end SpeechLine pipeline.

Pipeline Overview
  • Classifies audio for children's speech (optional).
  • Transcribes audio.
  • Segments audio into chunks based on silences.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `config` | `Config` | SpeechLine Config object. | *required* |
| `input_dir` | `str` | Path to input directory. | *required* |
| `output_dir` | `str` | Path to output directory. | *required* |
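
The attributes that `run` reads from the Config object are visible in the source below. As a rough orientation only, here is a hedged sketch of an equivalent structure built with `types.SimpleNamespace`; the real `Config` class, its field defaults, and the placeholder values shown here are assumptions and may differ:

```python
from types import SimpleNamespace

# Illustrative stand-in mirroring the attributes accessed in Runner.run();
# not the actual Config schema or its default values.
config = SimpleNamespace(
    filter_empty_transcript=True,
    do_classify=False,
    classifier=SimpleNamespace(model="<classifier model id>", max_duration_s=3.0),
    transcriber=SimpleNamespace(
        type="wav2vec2",
        model="<transcriber model id>",
        chunk_length_s=30,
        return_timestamps="word",
    ),
    segmenter=SimpleNamespace(
        type="silence",
        keep_whitespace=False,
        silence_duration=0.3,
        minimum_chunk_duration=1.0,
        lexicon_path=None,
    ),
    do_noise_classify=False,
    noise_classifier=SimpleNamespace(
        model="<noise classifier model id>",
        minimum_empty_duration=0.3,
        threshold=0.5,
    ),
)
```
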
Source code in speechline/run.py
@staticmethod
def run(config: Config, input_dir: str, output_dir: str) -> None:
    """
    Runs the end-to-end SpeechLine pipeline.

    ### Pipeline Overview
    - Classifies audio for children's speech (optional).
    - Transcribes audio.
    - Segments audio into chunks based on silences.

    Args:
        config (Config):
            SpeechLine Config object.
        input_dir (str):
            Path to input directory.
        output_dir (str):
            Path to output directory.
    """
    logger.info("Preparing DataFrame..")
    df = prepare_dataframe(input_dir, audio_extension="wav")

    if config.filter_empty_transcript:
        df = df[df["ground_truth"] != ""]

    if config.do_classify:
        # load classifier model
        classifier = Wav2Vec2Classifier(
            config.classifier.model,
            max_duration_s=config.classifier.max_duration_s,
        )

        # perform audio classification
        dataset = format_audio_dataset(df, sampling_rate=classifier.sampling_rate)
        df["category"] = classifier.predict(dataset)

        # filter audio by category
        df = df[df["category"] == "child"]

    # load transcriber model
    if config.transcriber.type == "wav2vec2":
        transcriber = Wav2Vec2Transcriber(config.transcriber.model)
    elif config.transcriber.type == "whisper":
        transcriber = WhisperTranscriber(config.transcriber.model)

    # perform audio transcription
    dataset = format_audio_dataset(df, sampling_rate=transcriber.sampling_rate)

    output_offsets = transcriber.predict(
        dataset,
        chunk_length_s=config.transcriber.chunk_length_s,
        output_offsets=True,
        return_timestamps=config.transcriber.return_timestamps,
        keep_whitespace=config.segmenter.keep_whitespace,
    )

    # segment audios based on offsets
    if config.segmenter.type == "silence":
        segmenter = SilenceSegmenter()
    elif config.segmenter.type == "word_overlap":
        segmenter = WordOverlapSegmenter()
    elif config.segmenter.type == "phoneme_overlap":
        lexicon = Lexicon()
        if config.segmenter.lexicon_path:
            with open(config.segmenter.lexicon_path) as json_file:
                lex = json.load(json_file)
            # merge dict with lexicon
            for k, v in lex.items():
                lexicon[k] = lexicon[k].union(set(v)) if k in lexicon else set(v)
        segmenter = PhonemeOverlapSegmenter(lexicon)

    tokenizer = WordTokenizer()

    if config.do_noise_classify:
        noise_classifier = config.noise_classifier.model
        minimum_empty_duration = config.noise_classifier.minimum_empty_duration
        noise_classifier_threshold = config.noise_classifier.threshold
    else:
        noise_classifier = None
        minimum_empty_duration = None
        noise_classifier_threshold = None

    def export_and_chunk(
        audio_path: str,
        ground_truth: str,
        offsets: List[Dict[str, Union[str, float]]],
    ):
        json_path = Path(audio_path).with_suffix(".json")
        # export JSON transcripts
        export_transcripts_json(str(json_path), offsets)
        # chunk audio into segments
        segmenter.chunk_audio_segments(
            audio_path,
            output_dir,
            offsets,
            do_noise_classify=config.do_noise_classify,
            noise_classifier=noise_classifier,
            minimum_empty_duration=minimum_empty_duration,
            minimum_chunk_duration=config.segmenter.minimum_chunk_duration,
            noise_classifier_threshold=noise_classifier_threshold,
            silence_duration=config.segmenter.silence_duration,
            ground_truth=tokenizer(ground_truth),
        )

    thread_map(
        export_and_chunk,
        df["audio"],
        df["ground_truth"],
        output_offsets,
        desc="Segmenting Audio into Chunks",
        total=len(df),
    )
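
Putting it together, a minimal entry-point sketch; the `Config` import path and its constructor signature are assumptions and may differ from the actual implementation:

```python
import sys

from speechline.run import Runner
from speechline.config import Config  # assumed import path

if __name__ == "__main__":
    args = Runner.parse_args(sys.argv[1:])
    config = Config(args.config)  # assumed: Config is built from the JSON config path
    Runner.run(config, args.input_dir, args.output_dir)
```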