# Create HuggingFace Dataset
## Usage
```sh title="example_create_hf_dataset.sh"
python scripts/create_hf_dataset.py [-h] -i INPUT_DIR --dataset_name DATASET_NAME [--phonemize PHONEMIZE] [--private PRIVATE]

Create HuggingFace dataset from SpeechLine outputs.

optional arguments:
  -h, --help            show this help message and exit
  -i INPUT_DIR, --input_dir INPUT_DIR
                        Directory of input audios.
  --dataset_name DATASET_NAME
                        HuggingFace dataset repository name.
  --phonemize PHONEMIZE
                        Phonemize text.
  --private PRIVATE     Set HuggingFace dataset to private.
```
## Example
```sh
python scripts/create_hf_dataset.py \
    --input_dir="training/" \
    --dataset_name="myname/mydataset" \
    --private="True" \
    --phonemize="True"
```
## scripts.create_hf_dataset
### `create_dataset(input_dir, dataset_name, private=True, phonemize=False)`
Creates HuggingFace dataset from SpeechLine outputs. Ensures unique utterance and speaker IDs in each subset.
Parameters:

Name | Type | Description | Default
---|---|---|---
`input_dir` | `str` | Path to input audio directory. | *required*
`dataset_name` | `str` | HuggingFace dataset name. | *required*
`private` | `bool` | Set HuggingFace dataset as private. | `True`
`phonemize` | `bool` | Phonemize text to phoneme strings. | `False`
Returns:

Type | Description
---|---
`DatasetDict` | Created HuggingFace dataset.
Source code in scripts/create_hf_dataset.py

```python
def create_dataset(
input_dir: str, dataset_name: str, private: bool = True, phonemize: bool = False
) -> DatasetDict:
"""
Creates HuggingFace dataset from SpeechLine outputs.
Ensures unique utterance and speaker IDs in each subset.
Args:
input_dir (str):
Path to input audio directory.
dataset_name (str):
HuggingFace dataset name.
private (bool, optional):
Set HuggingFace dataset as private. Defaults to `True`.
phonemize (bool, optional):
Phonemize text to phoneme strings. Defaults to `False`.
Returns:
DatasetDict:
Created HuggingFace dataset.
"""
audios = glob(f"{input_dir}/**/*.wav")
df = pd.DataFrame({"audio": audios})
# `audio` = `"{dir}/{language}/{speaker}_{utt_id}.wav"`
df["language"] = df["audio"].apply(lambda x: x.split("/")[-2])
df["speaker"] = df["audio"].apply(lambda x: x.split("/")[-1].split("_")[0])
df["text"] = df["audio"].apply(lambda x: parse_tsv(Path(x).with_suffix(".tsv")))
tqdm.pandas(desc="Phonemization")
if phonemize:
df["phonemes"] = df.progress_apply(
lambda row: get_g2p(row["language"].split("-")[0])(row["text"]), axis=1
)
speaker, counts = np.unique(df["speaker"], return_counts=True)
speaker2count = {s: c for s, c in zip(speaker, counts)}
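    # greedily assign speakers (most utterances first) to reach a ~70/20/10 train/test/validation split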
train_num = int(0.7 * len(df))
test_num = int(0.9 * len(df))
train_speakers, test_speakers, valid_speakers = [], [], []
total = 0
for speaker, count in sorted(
speaker2count.items(), key=lambda item: item[1], reverse=True
):
if total < train_num and total < test_num:
train_speakers.append(speaker)
elif total < test_num:
test_speakers.append(speaker)
else:
valid_speakers.append(speaker)
total += count
train_df = df[df["speaker"].isin(train_speakers)].reset_index(drop=True)
test_df = df[df["speaker"].isin(test_speakers)].reset_index(drop=True)
valid_df = df[df["speaker"].isin(valid_speakers)].reset_index(drop=True)
train_ds = Dataset.from_pandas(train_df).cast_column("audio", Audio())
test_ds = Dataset.from_pandas(test_df).cast_column("audio", Audio())
valid_ds = Dataset.from_pandas(valid_df).cast_column("audio", Audio())
dataset = DatasetDict({"train": train_ds, "test": test_ds, "validation": valid_ds})
dataset.push_to_hub(dataset_name, private=private)
    return dataset
```
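The function can also be called directly from Python. A minimal sketch, assuming the repository root is on `PYTHONPATH` so that `scripts.create_hf_dataset` is importable (the paths and repository name below are placeholders):

```python
from scripts.create_hf_dataset import create_dataset

# hypothetical input directory and HuggingFace Hub repository name
dataset = create_dataset(
    input_dir="training/",
    dataset_name="myname/mydataset",
    private=True,
    phonemize=True,
)
print(dataset)  # DatasetDict with "train", "test", and "validation" splits
```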
### `parse_args(args)`
Utility argument parser function for dataset creation.
Parameters:

Name | Type | Description | Default
---|---|---|---
`args` | `List[str]` | List of arguments. | *required*
Returns:

Type | Description
---|---
`argparse.Namespace` | Object with argument values as attributes.
Source code in scripts/create_hf_dataset.py

```python
def parse_args(args: List[str]) -> argparse.Namespace:
"""
Utility argument parser function for dataset creation.
Args:
args (List[str]):
List of arguments.
Returns:
argparse.Namespace:
            Object with argument values as attributes.
"""
parser = argparse.ArgumentParser(
prog="python scripts/create_hf_datasets.py",
description="Create HuggingFace dataset from SpeechLine outputs.",
)
parser.add_argument(
"-i",
"--input_dir",
type=str,
required=True,
help="Directory of input audios.",
)
parser.add_argument(
"--dataset_name",
type=str,
required=True,
help="HuggingFace dataset repository name.",
)
parser.add_argument("--phonemize", type=bool, default=False, help="Phonemize text.")
parser.add_argument(
"--private", type=bool, default=True, help="Set HuggingFace dataset to private."
)
    return parser.parse_args(args)
```
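Note that `type=bool` applies Python's `bool()` constructor to the raw argument string, so any non-empty value parses as `True`: `--phonemize="False"` still enables phonemization, and only an empty string (e.g. `--phonemize=""`) yields `False`. If strict parsing were wanted, a small converter along these lines (a sketch, not part of the script) could be passed as `type` instead:

```python
import argparse

def str2bool(value: str) -> bool:
    """Parse common string spellings of booleans for argparse."""
    if value.lower() in ("true", "t", "yes", "y", "1"):
        return True
    if value.lower() in ("false", "f", "no", "n", "0"):
        return False
    raise argparse.ArgumentTypeError(f"invalid boolean value: {value!r}")

# e.g. parser.add_argument("--private", type=str2bool, default=True, ...)
```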
### `parse_tsv(path)`
Joins the text transcripts of a TSV annotation file.
Parameters:

Name | Type | Description | Default
---|---|---|---
`path` | `str` | Path to TSV file. | *required*
Returns:

Type | Description
---|---
`str` | Joined text transcript.
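The source of `parse_tsv` is not reproduced above. A minimal sketch consistent with its signature and its call site in `create_dataset` might look like the following; the TSV column layout (transcript text in the last tab-separated field) is an assumption for illustration:

```python
from pathlib import Path
from typing import Union

def parse_tsv(path: Union[str, Path]) -> str:
    """Join the transcript fields of a TSV annotation file into one string.

    Assumes one annotation per line, with the text in the last
    tab-separated field (hypothetical layout for illustration).
    """
    with open(path, encoding="utf-8") as file:
        rows = [line.rstrip("\n").split("\t") for line in file if line.strip()]
    return " ".join(row[-1] for row in rows)
```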