Audio Dataset
speechline.utils.dataset
format_audio_dataset(df, sampling_rate=16000)
Formats a Pandas DataFrame as a datasets Dataset.
Converts the audio path column to audio arrays and resamples accordingly.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
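df |
pd.DataFrame |
Pandas DataFrame to convert to Dataset. |
required |
sampling_rate |
int |
Sampling rate used when casting the audio column. Defaults to 16000. |
16000 |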
Returns:
Type | Description |
---|---|
Dataset |
datasets' Dataset object usable for batch inference. |
Source code in speechline/utils/dataset.py
def format_audio_dataset(df: pd.DataFrame, sampling_rate: int = 16000) -> Dataset:
    """
    Builds a `datasets` `Dataset` from a Pandas `DataFrame`.

    The frame is converted to a `Dataset`, written to
    `config.HF_DATASETS_CACHE`, loaded back from that location, and its
    `audio` path column is then cast to the `Audio` feature so that audio is
    decoded and resampled accordingly.

    Args:
        df (pd.DataFrame):
            Pandas DataFrame to convert to `Dataset`.
            The `audio` column is the one that gets cast.
        sampling_rate (int, optional):
            Sampling rate passed to the `Audio` feature. Defaults to 16000.

    Returns:
        Dataset:
            `datasets`' `Dataset` object usable for batch inference.
    """
    cache_dir = str(config.HF_DATASETS_CACHE)
    # Persist first, then work on the copy that was read back from disk.
    Dataset.from_pandas(df).save_to_disk(cache_dir)
    reloaded = load_from_disk(cache_dir)
    return reloaded.cast_column("audio", Audio(sampling_rate=sampling_rate))
prepare_dataframe(path_to_files, audio_extension='wav')
Prepares audio and ground truth files as a Pandas DataFrame.
Assumes files are of the following structure:
path_to_files
├── langX
│   ├── a.{audio_extension}
│   ├── a.txt
│   ├── b.{audio_extension}
│   └── b.txt
└── langY
    ├── c.{audio_extension}
    └── c.txt
Parameters:
Name | Type | Description | Default |
---|---|---|---|
path_to_files |
str |
Path to files. |
required |
audio_extension |
str |
Audio extension of files to include. Defaults to "wav". |
'wav' |
Exceptions:
Type | Description |
---|---|
ValueError |
No audio files found. |
Returns:
Type | Description |
---|---|
pd.DataFrame |
DataFrame consisting of: audio (audio path), id, language, language_code, ground_truth. |
Source code in speechline/utils/dataset.py
def prepare_dataframe(path_to_files: str, audio_extension: str = "wav") -> pd.DataFrame:
    """
    Prepares audio and ground truth files as Pandas `DataFrame`.

    Assumes files are of the following structure:

    ```
    path_to_files
    ├── langX
    │   ├── a.{audio_extension}
    │   ├── a.txt
    │   ├── b.{audio_extension}
    │   └── b.txt
    └── langY
        ├── c.{audio_extension}
        └── c.txt
    ```

    Zero-byte audio files are ignored. An audio file without a matching
    `.txt` file gets an empty string as its ground truth.

    Args:
        path_to_files (str):
            Path to files.
        audio_extension (str, optional):
            Audio extension of files to include. Defaults to "wav".

    Raises:
        ValueError: No audio files found.

    Returns:
        pd.DataFrame:
            DataFrame consisting of:

        - `audio` (audio path)
        - `id`
        - `language`
        - `language_code`
        - `ground_truth`
    """

    def read_ground_truth(audio_path: str) -> str:
        # ground truth is same filename, except with .txt extension
        transcript_path = Path(audio_path).with_suffix(".txt")
        if not transcript_path.exists():
            return ""
        # read_text() closes the file handle, unlike a bare open(...).read()
        return transcript_path.read_text()

    candidates = sorted(glob(f"{path_to_files}/*/*.{audio_extension}"))
    # skip empty (zero-byte) audio files
    audios = [a for a in candidates if Path(a).stat().st_size > 0]
    if not audios:
        raise ValueError("No audio files found!")

    df = pd.DataFrame({"audio": audios})
    # ID is filename stem (before extension)
    df["id"] = df["audio"].apply(lambda f: Path(f).stem)
    # language code is immediate parent directory
    df["language_code"] = df["audio"].apply(lambda f: Path(f).parent.name)
    # language is the part of the language code before the first hyphen
    df["language"] = df["language_code"].apply(lambda code: code.split("-")[0])
    df["ground_truth"] = df["audio"].apply(read_ground_truth)
    return df
preprocess_audio_transcript(text)
Preprocesses audio transcript:
- Removes punctuation.
- Converts to lowercase.
- Removes special tags (e.g. GigaSpeech).
Source code in speechline/utils/dataset.py
def preprocess_audio_transcript(text: str) -> str:
    """
    Preprocesses audio transcript.

    - Removes punctuation (each punctuation character becomes a space).
    - Converts to lowercase.
    - Removes special tags (e.g. GigaSpeech).
    - Collapses runs of whitespace into a single space and trims the ends.

    Args:
        text (str):
            Raw transcript.

    Returns:
        str:
            Normalized transcript.
    """
    tags = [
        "<COMMA>",
        "<PERIOD>",
        "<QUESTIONMARK>",
        "<EXCLAMATIONPOINT>",
        "<SIL>",
        "<MUSIC>",
        "<NOISE>",
        "<OTHER>",
    ]
    # Punctuation to strip; "\ufffd" is the Unicode replacement character.
    # re.escape builds the character class without hand-written backslash
    # escapes, which are invalid escape sequences in a non-raw string literal.
    chars_to_remove = ",?.!-;:\"“%‘”\ufffd'’"
    chars_to_remove_regex = "[" + re.escape(chars_to_remove) + "]"

    text = re.sub(chars_to_remove_regex, " ", text).lower()
    # Tags are matched in lowercase because the text has just been lowercased.
    for tag in tags:
        text = text.replace(tag.lower(), "")
    # Normalize whitespace last so that removing a tag surrounded by spaces
    # does not leave a double space behind.
    return re.sub(r"\s+", " ", text).strip()