G2p
g2p_id.g2p.G2p
Grapheme-to-phoneme (g2p) main class for phonemization. This class provides a high-level API for grapheme-to-phoneme conversion.
1. Preprocess and normalize text
2. Tokenize text into words
3. Predict POS for every word
4. If a word is non-alphabetic (e.g. punctuation), add it to the output as-is
5. If a word is a homograph, check its POS and use the matching pronunciation
6. If a word is a non-homograph, look it up in the lexicon
7. Otherwise, predict with a neural network
Source code in g2p_id/g2p.py
```python
class G2p:
    """Grapheme-to-phoneme (g2p) main class for phonemization.

    This class provides a high-level API for grapheme-to-phoneme conversion.

    1. Preprocess and normalize text
    2. Tokenize text into words
    3. Predict POS for every word
    4. If a word is non-alphabetic (e.g. punctuation), add it to the output as-is
    5. If a word is a homograph, check its POS and use the matching pronunciation
    6. If a word is a non-homograph, look it up in the lexicon
    7. Otherwise, predict with a neural network
    """

    def __init__(self, model_type="BERT"):
        """Constructor for G2p.

        Args:
            model_type (str, optional):
                Type of neural network to use for prediction.
                Choices are "LSTM" or "BERT". Defaults to "BERT".
        """
        self.homograph2features = construct_homographs_dictionary()
        self.lexicon2features = construct_lexicon_dictionary()
        self.normalizer = TextProcessor()
        self.tagger = PerceptronTagger(load=False)
        tagger_path = os.path.join(resources_path, "id_posp_tagger.pickle")
        self.tagger.load("file://" + tagger_path)
        self.model: Union[BERT, LSTM] = BERT() if model_type == "BERT" else LSTM()
        self.tokenizer = TweetTokenizer()
        # maps the homograph dictionary's coarse POS classes to POSP tagger labels
        self.pos_dict = {
            "N": ["B-NNO", "B-NNP", "B-PRN", "B-PRN", "B-PRK"],
            "V": ["B-VBI", "B-VBT", "B-VBP", "B-VBL", "B-VBE"],
            "A": ["B-ADJ"],
            "P": ["B-PAR"],
        }

    def _preprocess(self, text: str) -> str:
        """Performs preprocessing.

        (1) Adds spaces in between tokens
        (2) Normalizes unicode and accents
        (3) Normalizes numbers
        (4) Lowercases text
        (5) Removes unwanted tokens

        Args:
            text (str): Text to preprocess.

        Returns:
            str: Preprocessed text.
        """
        text = text.replace("-", " ")
        # replace every period except the final one with a space
        text = re.sub(r"\.(?=.*\.)", " ", text)
        text = " ".join(self.tokenizer.tokenize(text))
        text = unicode(text)
        # drop combining marks (accents) after NFD decomposition
        text = "".join(
            char
            for char in unicodedata.normalize("NFD", text)
            if unicodedata.category(char) != "Mn"
        )
        text = self.normalizer.normalize(text).strip()
        text = text.lower()
        text = re.sub(r"[^ a-z'.,?!\-]", "", text)
        return text

    def _rule_based_g2p(self, text: str) -> str:
        """Applies rule-based Indonesian grapheme-to-phoneme conversion.

        Args:
            text (str): Grapheme text to convert to phoneme.

        Returns:
            str: Phoneme string.
        """
        phonetic_mapping = {
            "ny": "ɲ",
            "ng": "ŋ",
            "sy": "ʃ",
            "aa": "aʔa",
            "ii": "iʔi",
            "oo": "oʔo",
            "əə": "əʔə",
            "uu": "uʔu",
            "'": "ʔ",
            "g": "ɡ",
            "q": "k",
            "j": "dʒ",
            "y": "j",
            "x": "ks",
            "c": "tʃ",
            "kh": "x",
        }
        consonants = "bdjklmnprstwɲ"
        if text.endswith("k"):
            text = text[:-1] + "ʔ"
        if text.startswith("x"):
            text = "s" + text[1:]
        if text.startswith("ps"):
            text = text[1:]
        for graph, phone in phonetic_mapping.items():
            text = text.replace(graph, phone)
        for letter in consonants:
            text = text.replace(f"k{letter}", f"ʔ{letter}")
        # split into single-character phonemes, keeping the digraphs tʃ and dʒ intact
        phonemes = [
            list(phn) if phn not in ("dʒ", "tʃ") else [phn]
            for phn in re.split("(tʃ|dʒ)", text)
        ]
        return " ".join([p for phn in phonemes for p in phn])

    def __call__(self, text: str) -> List[List[str]]:
        """Grapheme-to-phoneme converter.

        1. Preprocess and normalize text
        2. Tokenize text into words
        3. Predict POS for every word
        4. If a word is non-alphabetic (e.g. punctuation), add it to the output as-is
        5. If a word is a homograph, check its POS and use the matching pronunciation
        6. If a word is a non-homograph, look it up in the lexicon
        7. Otherwise, predict with a neural network

        Args:
            text (str): Grapheme text to convert to phoneme.

        Returns:
            List[List[str]]: List of strings in phonemes.
        """
        text = self._preprocess(text)
        words = self.tokenizer.tokenize(text)
        tokens = self.tagger.tag(words)
        prons = []
        for word, pos in tokens:
            pron = ""
            if re.search("[a-z]", word) is None:  # non-alphabetic
                pron = word
            elif word in self.homograph2features:  # check if homograph
                pron1, pron2, pos1, _ = self.homograph2features[word]
                # check for the matching POS
                if pos in self.pos_dict[pos1]:
                    pron = pron1
                else:
                    pron = pron2
            elif word in self.lexicon2features:  # non-homographs
                pron = self.lexicon2features[word]
            else:  # predict for OOV
                pron = self.model.predict(word)
                if isinstance(self.model, BERT):
                    pron = self._rule_based_g2p(pron)
            prons.append(pron.split())
        return prons
```
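Both `_preprocess` and `_rule_based_g2p` are deterministic, so their behavior can be traced by hand. A quick illustration (outputs derived from the rules above; `_rule_based_g2p` is private, so calling it directly is for demonstration only):

```python
import re

g2p = G2p()

# "ny" -> ɲ, then the string is split into single-character phonemes
print(g2p._rule_based_g2p("nyanyi"))   # ɲ a ɲ i

# "j" -> dʒ, which the split keeps together as one phoneme
print(g2p._rule_based_g2p("jakarta"))  # dʒ a k a r t a

# the period regex in _preprocess keeps only the final period
print(re.sub(r"\.(?=.*\.)", " ", "Dr. Budi tiba pkl. 10.00."))
# Dr  Budi tiba pkl  10 00.
```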
__call__(self, text)
special
Grapheme-to-phoneme converter.
1. Preprocess and normalize text
2. Tokenize text into words
3. Predict POS for every word
4. If a word is non-alphabetic (e.g. punctuation), add it to the output as-is
5. If a word is a homograph, check its POS and use the matching pronunciation
6. If a word is a non-homograph, look it up in the lexicon
7. Otherwise, predict with a neural network
Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| text | str | Grapheme text to convert to phoneme. | required |

Returns:

| Type | Description |
| --- | --- |
| List[List[str]] | List of strings in phonemes. |
Source code in g2p_id/g2p.py
```python
def __call__(self, text: str) -> List[List[str]]:
    """Grapheme-to-phoneme converter.

    1. Preprocess and normalize text
    2. Tokenize text into words
    3. Predict POS for every word
    4. If a word is non-alphabetic (e.g. punctuation), add it to the output as-is
    5. If a word is a homograph, check its POS and use the matching pronunciation
    6. If a word is a non-homograph, look it up in the lexicon
    7. Otherwise, predict with a neural network

    Args:
        text (str): Grapheme text to convert to phoneme.

    Returns:
        List[List[str]]: List of strings in phonemes.
    """
    text = self._preprocess(text)
    words = self.tokenizer.tokenize(text)
    tokens = self.tagger.tag(words)
    prons = []
    for word, pos in tokens:
        pron = ""
        if re.search("[a-z]", word) is None:  # non-alphabetic
            pron = word
        elif word in self.homograph2features:  # check if homograph
            pron1, pron2, pos1, _ = self.homograph2features[word]
            # check for the matching POS
            if pos in self.pos_dict[pos1]:
                pron = pron1
            else:
                pron = pron2
        elif word in self.lexicon2features:  # non-homographs
            pron = self.lexicon2features[word]
        else:  # predict for OOV
            pron = self.model.predict(word)
            if isinstance(self.model, BERT):
                pron = self._rule_based_g2p(pron)
        prons.append(pron.split())
    return prons
```
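The homograph branch above stores two pronunciations per entry, together with the POS class that selects the first one. A minimal sketch of that lookup, using a hypothetical entry shaped like the real ones (the actual dictionary contents ship with the package):

```python
# hypothetical entry: (pron_1, pron_2, POS class selecting pron_1, POS class for pron_2)
homograph2features = {"apel": ("a p ə l", "a p e l", "N", "V")}

# coarse POS class -> POSP tagger labels (cf. G2p.pos_dict)
pos_dict = {"N": ["B-NNO", "B-NNP", "B-PRN", "B-PRK"]}

word, pos = "apel", "B-NNO"  # tag as produced by the PerceptronTagger
pron1, pron2, pos1, _ = homograph2features[word]
pron = pron1 if pos in pos_dict[pos1] else pron2
print(pron.split())  # ['a', 'p', 'ə', 'l']
```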
__init__(self, model_type='BERT')
special
Constructor for G2p.
Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| model_type | str | Type of neural network to use for prediction. Choices are "LSTM" or "BERT". Defaults to "BERT". | 'BERT' |
Source code in g2p_id/g2p.py
```python
def __init__(self, model_type="BERT"):
    """Constructor for G2p.

    Args:
        model_type (str, optional):
            Type of neural network to use for prediction.
            Choices are "LSTM" or "BERT". Defaults to "BERT".
    """
    self.homograph2features = construct_homographs_dictionary()
    self.lexicon2features = construct_lexicon_dictionary()
    self.normalizer = TextProcessor()
    self.tagger = PerceptronTagger(load=False)
    tagger_path = os.path.join(resources_path, "id_posp_tagger.pickle")
    self.tagger.load("file://" + tagger_path)
    self.model: Union[BERT, LSTM] = BERT() if model_type == "BERT" else LSTM()
    self.tokenizer = TweetTokenizer()
    # maps the homograph dictionary's coarse POS classes to POSP tagger labels
    self.pos_dict = {
        "N": ["B-NNO", "B-NNP", "B-PRN", "B-PRN", "B-PRK"],
        "V": ["B-VBI", "B-VBT", "B-VBP", "B-VBL", "B-VBE"],
        "A": ["B-ADJ"],
        "P": ["B-PAR"],
    }
```
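As `__call__` shows, only BERT predictions are post-processed with `_rule_based_g2p`; LSTM predictions are used as-is. Switching backends is therefore a one-argument change:

```python
g2p_bert = G2p()                   # default: BERT, its output runs through _rule_based_g2p
g2p_lstm = G2p(model_type="LSTM")  # LSTM output is used directly
```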
Usage
```python
texts = [
    "Apel itu berwarna merah.",
    "Rahel bersekolah di Jakarta.",
    "Mereka sedang bermain bola di lapangan.",
]

g2p = G2p(model_type="BERT")
for text in texts:
    print(g2p(text))
```

```
>> [['a', 'p', 'ə', 'l'], ['i', 't', 'u'], ['b', 'ə', 'r', 'w', 'a', 'r', 'n', 'a'], ['m', 'e', 'r', 'a', 'h'], ['.']]
>> [['r', 'a', 'h', 'e', 'l'], ['b', 'ə', 'r', 's', 'ə', 'k', 'o', 'l', 'a', 'h'], ['d', 'i'], ['dʒ', 'a', 'k', 'a', 'r', 't', 'a'], ['.']]
>> [['m', 'ə', 'r', 'e', 'k', 'a'], ['s', 'ə', 'd', 'a', 'ŋ'], ['b', 'ə', 'r', 'm', 'a', 'i', 'n'], ['b', 'o', 'l', 'a'], ['d', 'i'], ['l', 'a', 'p', 'a', 'ŋ', 'a', 'n'], ['.']]
```
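Each sentence comes back as a list of per-word phoneme lists. If a downstream consumer (a TTS frontend, say) expects a flat string, a simple join reconstructs one:

```python
phonemes = g2p("Apel itu berwarna merah.")
print(" ".join("".join(word) for word in phonemes))
# apəl itu bərwarna merah .
```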