Skip to content

G2p

g2p_id.g2p.G2p

Grapheme-to-phoneme (g2p) main class for phonemization. This class provides a high-level API for grapheme-to-phoneme conversion.

  1. Preprocess and normalize text
  2. Tokenize text into words
  3. Predict POS for every word
  4. If word is non-alphabetic, add to list (i.e. punctuation)
  5. If word is a homograph, check POS and use matching word's phonemes
  6. If word is a non-homograph, lookup lexicon
  7. Otherwise, predict with a neural network
Source code in g2p_id/g2p.py
class G2p:
    """Grapheme-to-phoneme (g2p) main class for phonemization.

    This class provides a high-level API for grapheme-to-phoneme conversion.
    Calling an instance runs the following pipeline:

    1. Preprocess and normalize text
    2. Tokenize text into words
    3. Predict POS for every word
    4. If word is non-alphabetic, keep it as-is (i.e. punctuation)
    5. If word is a homograph, check POS and use matching word's phonemes
    6. If word is a non-homograph, lookup lexicon
    7. Otherwise, predict with a neural network
    """

    def __init__(self, model_type: str = "BERT") -> None:
        """Constructor for G2p.

        Args:
            model_type (str, optional):
                Type of neural network to use for prediction.
                Choices are "LSTM" or "BERT". Defaults to "BERT".
                Any value other than "BERT" selects the LSTM model.
        """
        self.homograph2features = construct_homographs_dictionary()
        self.lexicon2features = construct_lexicon_dictionary()
        self.normalizer = TextProcessor()
        # The tagger is created empty and then populated from the pickled
        # model stored in the package resources directory.
        self.tagger = PerceptronTagger(load=False)
        tagger_path = os.path.join(resources_path, "id_posp_tagger.pickle")
        self.tagger.load("file://" + tagger_path)
        self.model: Union[BERT, LSTM] = BERT() if model_type == "BERT" else LSTM()
        self.tokenizer = TweetTokenizer()
        # Maps the POS key stored with each homograph entry to the tagger
        # tags that count as a match for it (see ``__call__``).
        # NOTE(review): "B-PRN" is listed twice under "N"; harmless for the
        # membership test, but possibly a typo for another tag -- confirm.
        self.pos_dict = {
            "N": ["B-NNO", "B-NNP", "B-PRN", "B-PRN", "B-PRK"],
            "V": ["B-VBI", "B-VBT", "B-VBP", "B-VBL", "B-VBE"],
            "A": ["B-ADJ"],
            "P": ["B-PAR"],
        }

    def _preprocess(self, text: str) -> str:
        """Performs preprocessing.

        (1) Replaces hyphens with spaces and adds spaces in between tokens
        (2) Normalizes unicode and strips combining accents
        (3) Runs the text normalizer (numbers, per the original docs)
        (4) Lower case texts
        (5) Removes every character other than space, a-z and ``' . , ? ! -``

        Arguments:
            text (str): Text to preprocess.

        Returns:
            str: Preprocessed text.
        """
        text = text.replace("-", " ")
        text = " ".join(self.tokenizer.tokenize(text))
        # NOTE(review): ``unicode`` is not a Python 3 builtin; presumably it
        # is aliased to ``str`` by a file-level import -- confirm.
        text = unicode(text)
        # NFD splits accented letters into base letter + combining mark;
        # dropping category "Mn" (nonspacing marks) removes the accents.
        text = "".join(
            char
            for char in unicodedata.normalize("NFD", text)
            if unicodedata.category(char) != "Mn"
        )
        text = self.normalizer.normalize(text).strip()
        text = text.lower()
        text = re.sub(r"[^ a-z'.,?!\-]", "", text)
        return text

    def _rule_based_g2p(self, text: str) -> str:
        """Applies rule-based Indonesian grapheme2phoneme conversion.

        Args:
            text (str): Grapheme text to convert to phoneme.

        Returns:
            str: Phoneme string, with phonemes separated by single spaces.
                The affricates "tʃ" and "dʒ" are kept as one phoneme each.
        """
        # Replacements are applied sequentially in insertion order, so the
        # order matters: digraphs ("ny", "ng", "sy") come before the single
        # letters they contain, "j" -> "dʒ" runs before "y" -> "j", and
        # "kh" -> "x" runs after "x" -> "ks" so its output is not re-expanded.
        phonetic_mapping = {
            "ny": "ɲ",
            "ng": "ŋ",
            "sy": "ʃ",
            "aa": "aʔa",
            "ii": "iʔi",
            "oo": "oʔo",
            "əə": "əʔə",
            "uu": "uʔu",
            "'": "ʔ",
            "g": "ɡ",
            "q": "k",
            "j": "dʒ",
            "y": "j",
            "x": "ks",
            "c": "tʃ",
            "kh": "x",
        }

        # Symbols in front of which a "k" is turned into a glottal stop.
        # These are matched after the mapping above has been applied.
        consonants = "bdjklmnprstwɲ"

        # Word-final "k" becomes a glottal stop.
        if text.endswith("k"):
            text = text[:-1] + "ʔ"

        # Word-initial "x" becomes "s" (handled before "x" -> "ks" below).
        if text.startswith("x"):
            text = "s" + text[1:]

        # Word-initial "ps" drops the "p".
        if text.startswith("ps"):
            text = text[1:]

        for graph, phone in phonetic_mapping.items():
            text = text.replace(graph, phone)

        # "k" directly before one of the consonants becomes a glottal stop,
        # mirroring the word-final rule above.
        for letter in consonants:
            text = text.replace(f"k{letter}", f"ʔ{letter}")

        # Split into single characters, but keep the two-character
        # affricates intact (the capture group keeps them in the split).
        phonemes = [
            list(phn) if phn not in ("dʒ", "tʃ") else [phn]
            for phn in re.split("(tʃ|dʒ)", text)
        ]
        return " ".join([p for phn in phonemes for p in phn])

    def __call__(self, text: str) -> List[List[str]]:
        """Grapheme-to-phoneme converter.

        1. Preprocess and normalize text
        2. Tokenize text into words
        3. Predict POS for every word
        4. If word is non-alphabetic, keep it as-is (i.e. punctuation)
        5. If word is a homograph, check POS and use matching word's phonemes
        6. If word is a non-homograph, lookup lexicon
        7. Otherwise, predict with a neural network

        Args:
            text (str): Grapheme text to convert to phoneme.

        Returns:
            List[List[str]]: One inner list per token, holding that token's
                phonemes (or the token itself when it has no letters).
        """
        text = self._preprocess(text)
        words = self.tokenizer.tokenize(text)
        tokens = self.tagger.tag(words)

        prons = []
        for word, pos in tokens:
            pron = ""
            if re.search("[a-z]", word) is None:  # non-alphabetic
                pron = word

            elif word in self.homograph2features:  # check if homograph
                pron1, pron2, pos1, _ = self.homograph2features[word]

                # check for the matching POS: the first pronunciation is
                # used when the predicted tag belongs to its POS group
                if pos in self.pos_dict[pos1]:
                    pron = pron1
                else:
                    pron = pron2

            elif word in self.lexicon2features:  # non-homographs
                pron = self.lexicon2features[word]

            else:  # predict for OOV
                pron = self.model.predict(word)
                # Only the BERT output is passed through the rule-based
                # converter; the LSTM output is used directly.
                if isinstance(self.model, BERT):
                    pron = self._rule_based_g2p(pron)

            # Pronunciations are space-separated phoneme strings.
            prons.append(pron.split())

        return prons

__call__(self, text) special

Grapheme-to-phoneme converter.

  1. Preprocess and normalize text
  2. Tokenize text into words
  3. Predict POS for every word
  4. If word is non-alphabetic, add to list (i.e. punctuation)
  5. If word is a homograph, check POS and use matching word's phonemes
  6. If word is a non-homograph, lookup lexicon
  7. Otherwise, predict with a neural network

Parameters:

Name Type Description Default
text str

Grapheme text to convert to phoneme.

required

Returns:

Type Description
List[List[str]]

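One inner list per token, holding that token's phonemes as strings; tokens without letters (e.g. punctuation) are returned unchanged.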

Source code in g2p_id/g2p.py
def __call__(self, text: str) -> List[List[str]]:
    """Convert graphemes in ``text`` to per-token phoneme lists.

    The text is preprocessed, tokenized and POS-tagged. Each token is then
    resolved by the first rule that applies:

    * no ASCII lowercase letter in the token: kept unchanged (punctuation);
    * homograph: the pronunciation whose POS group contains the predicted tag,
      falling back to the second pronunciation otherwise;
    * lexicon entry: the stored pronunciation;
    * anything else: predicted by the neural model (BERT predictions are
      additionally passed through the rule-based converter).

    Args:
        text (str): Grapheme text to convert to phoneme.

    Returns:
        List[List[str]]: One list of phoneme strings per token.
    """
    cleaned = self._preprocess(text)
    tagged = self.tagger.tag(self.tokenizer.tokenize(cleaned))

    result = []
    for word, pos in tagged:
        if not re.search("[a-z]", word):
            # Nothing alphabetic to phonemize.
            phones = word
        elif word in self.homograph2features:
            first, second, first_pos, _ = self.homograph2features[word]
            phones = first if pos in self.pos_dict[first_pos] else second
        elif word in self.lexicon2features:
            phones = self.lexicon2features[word]
        else:
            # Out-of-vocabulary word: fall back to the model.
            phones = self.model.predict(word)
            if isinstance(self.model, BERT):
                phones = self._rule_based_g2p(phones)

        # Pronunciations are whitespace-separated phoneme strings.
        result.append(phones.split())

    return result

__init__(self, model_type='BERT') special

Constructor for G2p.

Parameters:

Name Type Description Default
model_type str

Type of neural network to use for prediction. Choices are "LSTM" or "BERT". Defaults to "BERT".

'BERT'
Source code in g2p_id/g2p.py
def __init__(self, model_type="BERT"):
    """Build the lookup tables, POS tagger, model and tokenizer.

    Args:
        model_type (str, optional):
            Neural network used for words found in neither dictionary.
            "BERT" selects the BERT model; any other value selects the
            LSTM model. Defaults to "BERT".
    """
    self.homograph2features = construct_homographs_dictionary()
    self.lexicon2features = construct_lexicon_dictionary()
    self.normalizer = TextProcessor()

    # Start from an empty tagger and load the pickled model that ships in
    # the resources directory.
    self.tagger = PerceptronTagger(load=False)
    pickle_location = os.path.join(resources_path, "id_posp_tagger.pickle")
    self.tagger.load("file://" + pickle_location)

    if model_type == "BERT":
        self.model: Union[BERT, LSTM] = BERT()
    else:
        self.model = LSTM()

    self.tokenizer = TweetTokenizer()

    # POS group key -> tagger tags accepted as a match for that group.
    self.pos_dict = {
        "N": ["B-NNO", "B-NNP", "B-PRN", "B-PRN", "B-PRK"],
        "V": ["B-VBI", "B-VBT", "B-VBP", "B-VBL", "B-VBE"],
        "A": ["B-ADJ"],
        "P": ["B-PAR"],
    }

Usage

texts = [
    "Apel itu berwarna merah.",
    "Rahel bersekolah di Jakarta.",
    "Mereka sedang bermain bola di lapangan.",
]
g2p = G2p(model_type="BERT")
for text in texts:
    print(g2p(text))
>> [['a', 'p', 'ə', 'l'], ['i', 't', 'u'], ['b', 'ə', 'r', 'w', 'a', 'r', 'n', 'a'], ['m', 'e', 'r', 'a', 'h'], ['.']]
>> [['r', 'a', 'h', 'e', 'l'], ['b', 'ə', 'r', 's', 'ə', 'k', 'o', 'l', 'a', 'h'], ['d', 'i'], ['dʒ', 'a', 'k', 'a', 'r', 't', 'a'], ['.']]
>> [['m', 'ə', 'r', 'e', 'k', 'a'], ['s', 'ə', 'd', 'a', 'ŋ'], ['b', 'ə', 'r', 'm', 'a', 'i', 'n'], ['b', 'o', 'l', 'a'], ['d', 'i'], ['l', 'a', 'p', 'a', 'ŋ', 'a', 'n'], ['.']]