Runner
speechline.run.Runner
dataclass
Runner()
Source code in speechline/run.py
class Runner:
    @staticmethod
    def parse_args(args: List[str]) -> argparse.Namespace:
        """
        Utility argument parser function for SpeechLine.

        Args:
            args (List[str]):
                List of arguments.

        Returns:
            argparse.Namespace:
                Objects with arguments values as attributes.
        """
        parser = argparse.ArgumentParser(
            prog="python speechline/run.py",
            description="Perform end-to-end speech labeling pipeline.",
        )
        parser.add_argument(
            "-i",
            "--input_dir",
            type=str,
            required=True,
            help="Directory of input audios.",
        )
        parser.add_argument(
            "-o",
            "--output_dir",
            type=str,
            required=True,
            help="Directory to save pipeline results.",
        )
        parser.add_argument(
            "-c",
            "--config",
            type=str,
            default="examples/config.json",
            help="SpeechLine configuration file.",
        )
        return parser.parse_args(args)

    @staticmethod
    def run(config: Config, input_dir: str, output_dir: str) -> None:
        """
        Runs end-to-end SpeechLine pipeline.

        ### Pipeline Overview
        - Classifies for children's speech audio (optional).
        - Transcribes audio.
        - Segments audio into chunks based on silences.

        Args:
            config (Config):
                SpeechLine Config object.
            input_dir (str):
                Path to input directory.
            output_dir (str):
                Path to output directory.
        """
        logger.info("Preparing DataFrame..")
        df = prepare_dataframe(input_dir, audio_extension="wav")

        if config.filter_empty_transcript:
            df = df[df["ground_truth"] != ""]

        if config.do_classify:
            # load classifier model
            classifier = Wav2Vec2Classifier(
                config.classifier.model,
                max_duration_s=config.classifier.max_duration_s,
            )
            # perform audio classification
            dataset = format_audio_dataset(df, sampling_rate=classifier.sampling_rate)
            df["category"] = classifier.predict(dataset)
            # filter audio by category
            df = df[df["category"] == "child"]

        # load transcriber model
        if config.transcriber.type == "wav2vec2":
            transcriber = Wav2Vec2Transcriber(config.transcriber.model)
        elif config.transcriber.type == "whisper":
            transcriber = WhisperTranscriber(config.transcriber.model)

        # perform audio transcription
        dataset = format_audio_dataset(df, sampling_rate=transcriber.sampling_rate)
        output_offsets = transcriber.predict(
            dataset,
            chunk_length_s=config.transcriber.chunk_length_s,
            output_offsets=True,
            return_timestamps=config.transcriber.return_timestamps,
            keep_whitespace=config.segmenter.keep_whitespace,
        )

        # segment audios based on offsets
        if config.segmenter.type == "silence":
            segmenter = SilenceSegmenter()
        elif config.segmenter.type == "word_overlap":
            segmenter = WordOverlapSegmenter()
        elif config.segmenter.type == "phoneme_overlap":
            lexicon = Lexicon()
            if config.segmenter.lexicon_path:
                with open(config.segmenter.lexicon_path) as json_file:
                    lex = json.load(json_file)
                # merge dict with lexicon
                for k, v in lex.items():
                    lexicon[k] = lexicon[k].union(set(v)) if k in lexicon else set(v)
            segmenter = PhonemeOverlapSegmenter(lexicon)

        tokenizer = WordTokenizer()

        if config.do_noise_classify:
            noise_classifier = config.noise_classifier.model
            minimum_empty_duration = config.noise_classifier.minimum_empty_duration
            noise_classifier_threshold = config.noise_classifier.threshold
        else:
            noise_classifier = None
            minimum_empty_duration = None
            noise_classifier_threshold = None

        def export_and_chunk(
            audio_path: str,
            ground_truth: str,
            offsets: List[Dict[str, Union[str, float]]],
        ):
            json_path = Path(audio_path).with_suffix(".json")
            # export JSON transcripts
            export_transcripts_json(str(json_path), offsets)
            # chunk audio into segments
            segmenter.chunk_audio_segments(
                audio_path,
                output_dir,
                offsets,
                do_noise_classify=config.do_noise_classify,
                noise_classifier=noise_classifier,
                minimum_empty_duration=minimum_empty_duration,
                minimum_chunk_duration=config.segmenter.minimum_chunk_duration,
                noise_classifier_threshold=noise_classifier_threshold,
                silence_duration=config.segmenter.silence_duration,
                ground_truth=tokenizer(ground_truth),
            )

        thread_map(
            export_and_chunk,
            df["audio"],
            df["ground_truth"],
            output_offsets,
            desc="Segmenting Audio into Chunks",
            total=len(df),
        )
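
Taken together, a minimal sketch of driving the pipeline from a script might look like the following. The `speechline.config` import path and the `Config(args.config)` loader are assumptions, since this file does not show how the JSON configuration is turned into a `Config` object:

```python
import sys

from speechline.run import Runner
from speechline.config import Config  # assumed import path; not shown in this file

# Parse CLI arguments: -i/--input_dir and -o/--output_dir are required,
# -c/--config defaults to examples/config.json.
args = Runner.parse_args(sys.argv[1:])

# Assumption: Config(args.config) stands in for whatever loader the project
# actually uses to turn the JSON file into a Config object.
config = Config(args.config)

# Classify (optional), transcribe, and segment everything under input_dir,
# writing chunked results to output_dir.
Runner.run(config, input_dir=args.input_dir, output_dir=args.output_dir)
```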
parse_args(args)
staticmethod
Utility argument parser function for SpeechLine.
Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| `args` | `List[str]` | List of arguments. | *required* |

Returns:

| Type | Description |
|---|---|
| `argparse.Namespace` | Objects with arguments values as attributes. |
Source code in speechline/run.py
@staticmethod
def parse_args(args: List[str]) -> argparse.Namespace:
    """
    Utility argument parser function for SpeechLine.

    Args:
        args (List[str]):
            List of arguments.

    Returns:
        argparse.Namespace:
            Objects with arguments values as attributes.
    """
    parser = argparse.ArgumentParser(
        prog="python speechline/run.py",
        description="Perform end-to-end speech labeling pipeline.",
    )
    parser.add_argument(
        "-i",
        "--input_dir",
        type=str,
        required=True,
        help="Directory of input audios.",
    )
    parser.add_argument(
        "-o",
        "--output_dir",
        type=str,
        required=True,
        help="Directory to save pipeline results.",
    )
    parser.add_argument(
        "-c",
        "--config",
        type=str,
        default="examples/config.json",
        help="SpeechLine configuration file.",
    )
    return parser.parse_args(args)
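
For example, calling the parser directly with an explicit argument list (the directory names here are hypothetical):

```python
from speechline.run import Runner

# Hypothetical directories, passed the way sys.argv[1:] would supply them.
args = Runner.parse_args(
    ["--input_dir", "data/raw_audio", "--output_dir", "data/chunks"]
)

print(args.input_dir)   # data/raw_audio
print(args.output_dir)  # data/chunks
print(args.config)      # examples/config.json (the default, since -c was omitted)
```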
run(config, input_dir, output_dir)
staticmethod
Runs end-to-end SpeechLine pipeline.
Pipeline Overview
- Classifies for children's speech audio (optional).
- Transcribes audio.
- Segments audio into chunks based on silences.
Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| `config` | `Config` | SpeechLine Config object. | *required* |
| `input_dir` | `str` | Path to input directory. | *required* |
| `output_dir` | `str` | Path to output directory. | *required* |
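
For orientation, the configuration fields that `run` reads can be sketched as a plain dictionary. The keys below mirror the attribute accesses in the source; every value is only a placeholder, and the real examples/config.json may be organized differently:

```python
# Placeholder values; only the keys mirror the config attributes accessed by run().
example_config = {
    "filter_empty_transcript": True,
    "do_classify": True,
    "classifier": {
        "model": "<audio-classifier-model>",
        "max_duration_s": 3.0,
    },
    "transcriber": {
        "type": "wav2vec2",          # or "whisper"
        "model": "<transcriber-model>",
        "chunk_length_s": 30,
        "return_timestamps": "word",
    },
    "segmenter": {
        "type": "silence",           # or "word_overlap" / "phoneme_overlap"
        "lexicon_path": None,        # only used by "phoneme_overlap"
        "keep_whitespace": False,
        "minimum_chunk_duration": 1.0,
        "silence_duration": 0.3,
    },
    "do_noise_classify": False,
    "noise_classifier": {
        "model": "<noise-classifier-model>",
        "minimum_empty_duration": 0.3,
        "threshold": 0.5,
    },
}
```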
Source code in speechline/run.py
@staticmethod
def run(config: Config, input_dir: str, output_dir: str) -> None:
    """
    Runs end-to-end SpeechLine pipeline.

    ### Pipeline Overview
    - Classifies for children's speech audio (optional).
    - Transcribes audio.
    - Segments audio into chunks based on silences.

    Args:
        config (Config):
            SpeechLine Config object.
        input_dir (str):
            Path to input directory.
        output_dir (str):
            Path to output directory.
    """
    logger.info("Preparing DataFrame..")
    df = prepare_dataframe(input_dir, audio_extension="wav")

    if config.filter_empty_transcript:
        df = df[df["ground_truth"] != ""]

    if config.do_classify:
        # load classifier model
        classifier = Wav2Vec2Classifier(
            config.classifier.model,
            max_duration_s=config.classifier.max_duration_s,
        )
        # perform audio classification
        dataset = format_audio_dataset(df, sampling_rate=classifier.sampling_rate)
        df["category"] = classifier.predict(dataset)
        # filter audio by category
        df = df[df["category"] == "child"]

    # load transcriber model
    if config.transcriber.type == "wav2vec2":
        transcriber = Wav2Vec2Transcriber(config.transcriber.model)
    elif config.transcriber.type == "whisper":
        transcriber = WhisperTranscriber(config.transcriber.model)

    # perform audio transcription
    dataset = format_audio_dataset(df, sampling_rate=transcriber.sampling_rate)
    output_offsets = transcriber.predict(
        dataset,
        chunk_length_s=config.transcriber.chunk_length_s,
        output_offsets=True,
        return_timestamps=config.transcriber.return_timestamps,
        keep_whitespace=config.segmenter.keep_whitespace,
    )

    # segment audios based on offsets
    if config.segmenter.type == "silence":
        segmenter = SilenceSegmenter()
    elif config.segmenter.type == "word_overlap":
        segmenter = WordOverlapSegmenter()
    elif config.segmenter.type == "phoneme_overlap":
        lexicon = Lexicon()
        if config.segmenter.lexicon_path:
            with open(config.segmenter.lexicon_path) as json_file:
                lex = json.load(json_file)
            # merge dict with lexicon
            for k, v in lex.items():
                lexicon[k] = lexicon[k].union(set(v)) if k in lexicon else set(v)
        segmenter = PhonemeOverlapSegmenter(lexicon)

    tokenizer = WordTokenizer()

    if config.do_noise_classify:
        noise_classifier = config.noise_classifier.model
        minimum_empty_duration = config.noise_classifier.minimum_empty_duration
        noise_classifier_threshold = config.noise_classifier.threshold
    else:
        noise_classifier = None
        minimum_empty_duration = None
        noise_classifier_threshold = None

    def export_and_chunk(
        audio_path: str,
        ground_truth: str,
        offsets: List[Dict[str, Union[str, float]]],
    ):
        json_path = Path(audio_path).with_suffix(".json")
        # export JSON transcripts
        export_transcripts_json(str(json_path), offsets)
        # chunk audio into segments
        segmenter.chunk_audio_segments(
            audio_path,
            output_dir,
            offsets,
            do_noise_classify=config.do_noise_classify,
            noise_classifier=noise_classifier,
            minimum_empty_duration=minimum_empty_duration,
            minimum_chunk_duration=config.segmenter.minimum_chunk_duration,
            noise_classifier_threshold=noise_classifier_threshold,
            silence_duration=config.segmenter.silence_duration,
            ground_truth=tokenizer(ground_truth),
        )

    thread_map(
        export_and_chunk,
        df["audio"],
        df["ground_truth"],
        output_offsets,
        desc="Segmenting Audio into Chunks",
        total=len(df),
    )
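
To make the per-file step concrete: `export_and_chunk` writes a JSON transcript next to each input audio file and delegates chunking to the segmenter, which writes its output under `output_dir`. A rough sketch of the path handling, using a hypothetical input file:

```python
from pathlib import Path

audio_path = "data/raw_audio/speaker_1/utt_001.wav"  # hypothetical input file

# The transcript JSON is written alongside the audio, with only the suffix swapped.
json_path = Path(audio_path).with_suffix(".json")
print(json_path)  # data/raw_audio/speaker_1/utt_001.json

# The segmented chunks themselves are written by segmenter.chunk_audio_segments
# under the output_dir passed to Runner.run; their exact layout depends on the segmenter.
```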