Create HuggingFace Dataset

Usage

example_create_hf_dataset.sh
python scripts/create_hf_dataset.py [-h] -i INPUT_DIR --dataset_name DATASET_NAME [--phonemize PHONEMIZE] [--private PRIVATE]
Create HuggingFace dataset from SpeechLine outputs.

optional arguments:
  -h, --help            show this help message and exit
  -i INPUT_DIR, --input_dir INPUT_DIR
                        Directory of input audios.
  --dataset_name DATASET_NAME
                        HuggingFace dataset repository name.
  --phonemize PHONEMIZE
                        Phonemize text.
  --private PRIVATE     Set HuggingFace dataset to private.

Example

python scripts/create_hf_dataset.py \
    --input_dir="training/" \
    --dataset_name="myname/mydataset" \
    --private="True" \
    --phonemize="True"

scripts.create_hf_dataset

create_dataset(input_dir, dataset_name, private=True, phonemize=False)

Creates HuggingFace dataset from SpeechLine outputs. Ensures unique utterance and speaker IDs in each subset.

Parameters:

Name          Type  Description                                             Default
input_dir     str   Path to input audio directory.                          required
dataset_name  str   HuggingFace dataset name.                               required
private       bool  Set HuggingFace dataset as private. Defaults to True.   True
phonemize     bool  Phonemize text to phoneme strings. Defaults to False.   False

Returns:

Type         Description
DatasetDict  Created HuggingFace dataset.

Source code in scripts/create_hf_dataset.py
def create_dataset(
    input_dir: str, dataset_name: str, private: bool = True, phonemize: bool = False
) -> DatasetDict:
    """
    Creates HuggingFace dataset from SpeechLine outputs.
    Ensures unique utterance and speaker IDs in each subset.

    Args:
        input_dir (str):
            Path to input audio directory.
        dataset_name (str):
            HuggingFace dataset name.
        private (bool, optional):
            Set HuggingFace dataset as private. Defaults to `True`.
        phonemize (bool, optional):
            Phonemize text to phoneme strings. Defaults to `False`.

    Returns:
        DatasetDict:
            Created HuggingFace dataset.
    """
    audios = glob(f"{input_dir}/**/*.wav")
    df = pd.DataFrame({"audio": audios})
    # `audio` =  `"{dir}/{language}/{speaker}_{utt_id}.wav"`
    df["language"] = df["audio"].apply(lambda x: x.split("/")[-2])
    df["speaker"] = df["audio"].apply(lambda x: x.split("/")[-1].split("_")[0])
    df["text"] = df["audio"].apply(lambda x: parse_tsv(Path(x).with_suffix(".tsv")))

    tqdm.pandas(desc="Phonemization")

    if phonemize:
        df["phonemes"] = df.progress_apply(
            lambda row: get_g2p(row["language"].split("-")[0])(row["text"]), axis=1
        )

    speaker, counts = np.unique(df["speaker"], return_counts=True)
    speaker2count = {s: c for s, c in zip(speaker, counts)}

    train_num = int(0.7 * len(df))
    test_num = int(0.9 * len(df))

    train_speakers, test_speakers, valid_speakers = [], [], []
    total = 0

    for speaker, count in sorted(
        speaker2count.items(), key=lambda item: item[1], reverse=True
    ):
        if total < train_num and total < test_num:
            train_speakers.append(speaker)
        elif total < test_num:
            test_speakers.append(speaker)
        else:
            valid_speakers.append(speaker)
        total += count

    train_df = df[df["speaker"].isin(train_speakers)].reset_index(drop=True)
    test_df = df[df["speaker"].isin(test_speakers)].reset_index(drop=True)
    valid_df = df[df["speaker"].isin(valid_speakers)].reset_index(drop=True)

    train_ds = Dataset.from_pandas(train_df).cast_column("audio", Audio())
    test_ds = Dataset.from_pandas(test_df).cast_column("audio", Audio())
    valid_ds = Dataset.from_pandas(valid_df).cast_column("audio", Audio())

    dataset = DatasetDict({"train": train_ds, "test": test_ds, "validation": valid_ds})
    dataset.push_to_hub(dataset_name, private=private)
    return dataset
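
Note that the split is speaker-disjoint: speakers are sorted by utterance count and assigned greedily to train until roughly 70% of utterances are covered, then to test until roughly 90%, with the remainder going to validation, so no speaker appears in more than one subset. Since the function ends with push_to_hub, you must be authenticated with the HuggingFace Hub (e.g. via huggingface-cli login) before calling it. A minimal sketch of calling it directly from Python, assuming the repository root is on PYTHONPATH and using a hypothetical repository name:

from scripts.create_hf_dataset import create_dataset

dataset = create_dataset(
    input_dir="training/",
    dataset_name="myname/mydataset",  # hypothetical Hub repository
    private=True,
    phonemize=False,
)
print(dataset)  # DatasetDict with "train", "test", and "validation" splits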

parse_args(args)

Utility argument parser function for dataset creation.

Parameters:

Name  Type       Description         Default
args  List[str]  List of arguments.  required

Returns:

Type                Description
argparse.Namespace  Object with argument values as attributes.

Source code in scripts/create_hf_dataset.py
def parse_args(args: List[str]) -> argparse.Namespace:
    """
    Utility argument parser function for dataset creation.

    Args:
        args (List[str]):
            List of arguments.

    Returns:
        argparse.Namespace:
            Object with argument values as attributes.
    """
    parser = argparse.ArgumentParser(
        prog="python scripts/create_hf_datasets.py",
        description="Create HuggingFace dataset from SpeechLine outputs.",
    )

    parser.add_argument(
        "-i",
        "--input_dir",
        type=str,
        required=True,
        help="Directory of input audios.",
    )
    parser.add_argument(
        "--dataset_name",
        type=str,
        required=True,
        help="HuggingFace dataset repository name.",
    )
    parser.add_argument("--phonemize", type=bool, default=False, help="Phonemize text.")
    parser.add_argument(
        "--private", type=bool, default=True, help="Set HuggingFace dataset to private."
    )
    return parser.parse_args(args)
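
One caveat worth knowing: argparse's type=bool simply calls bool() on the raw string, and any non-empty string is truthy in Python, so --private="False" still yields private=True; only the empty string parses as False. A quick check, using hypothetical argument values:

args = parse_args(
    ["-i", "training/", "--dataset_name", "myname/mydataset", "--private", "False"]
)
print(args.private)  # True -- bool("False") is True

args = parse_args(
    ["-i", "training/", "--dataset_name", "myname/mydataset", "--private", ""]
)
print(args.private)  # False -- only the empty string is falsy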

parse_tsv(path)

Join text transcripts of TSV annotation.

Parameters:

Name  Type  Description        Default
path  str   Path to TSV file.  required

Returns:

Type  Description
str   Joined text transcript.

Source code in scripts/create_hf_dataset.py
def parse_tsv(path: str) -> str:
    """
    Join text transcripts of TSV annotation.

    Args:
        path (str):
            Path to TSV file.

    Returns:
        str:
            Joined text transcript.
    """
    with open(path) as fd:
        rows = csv.reader(fd, delimiter="\t", quotechar='"')
        return " ".join(row[2] for row in rows)