Skip to content

TextProcessor

g2p_id.text_processor.TextProcessor

Indonesian text processor to normalize numerics, currencies, and timezones.

Source code in g2p_id/text_processor.py
class TextProcessor:
    """Indonesian text processor to normalize numerics, currencies, and timezones."""

    def __init__(self):
        """Load the lookup tables and build the regex patterns used for matching.

        Reads ``measurements.tsv``, ``currency.tsv`` and ``timezones.tsv`` from
        ``resources_path`` (a name this class expects the enclosing module to
        provide).
        """
        # Scale words that may follow a currency amount (e.g. "Rp 3 juta").
        self.thousands = ["ratus", "ribu", "juta", "miliar", "milyar", "triliun"]
        self.months = [
            "Januari",
            "Februari",
            "Maret",
            "April",
            "Mei",
            "Juni",
            "Juli",
            "Agustus",
            "September",
            "Oktober",
            "November",
            "Desember",
        ]
        measurements_path = os.path.join(resources_path, "measurements.tsv")
        currencies_path = os.path.join(resources_path, "currency.tsv")
        timezones_path = os.path.join(resources_path, "timezones.tsv")

        # Each table maps the first TSV column to the second one.
        self.measurements = self._load_tsv(measurements_path)
        self.currencies = self._load_tsv(currencies_path)
        self.timezones = self._load_tsv(timezones_path)

        self.re_thousands = "|".join(self.thousands)
        # Put a word boundary in front of the alternatives that do not contain
        # a currency symbol, then backslash-escape the currency symbols.
        self.re_currencies = r"\b" + re.sub(
            r"\|([^|$£€¥₩]+)", r"|\\b\1", "|".join(self.currencies)
        )
        self.re_currencies = re.sub(r"([$£€¥₩])", r"\\\1", self.re_currencies)
        # Groups: 1 = whole match, 2 = currency, 3 = amount, 4 = scale suffix.
        self.re_moneys = (
            rf"(({self.re_currencies}) ?([\d\.\,]+)( ({self.re_thousands})?(an)?)?)"
        )
        self.re_measurements = "|".join(self.measurements)
        # Groups: 1 = whole match, 2 = amount, 3 = unit.
        self.re_measurements = rf"(\b([\d\.\,]+) ?({self.re_measurements})\b)"
        self.re_timezones = "|".join(self.timezones)
        # Groups: 1 = whole match, 2 = hour, 3 = minute, 4 = timezone.
        self.re_timezones = (
            r"((\d{1,2})[\.:](\d{1,2}) " + rf"\b({self.re_timezones})\b)"
        )
        self.re_http = re.compile(
            r"""
            (https?://(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.
            [a-zA-Z0-9()]{1,6}\b[-a-zA-Z0-9()@:%_\+.~#?&//=]*)
            """,
            re.X,
        )

    @staticmethod
    def _load_tsv(path: str) -> dict:
        """Read a two-column, tab-separated file into a dictionary.

        Args:
            path (str): Path of the TSV file to read.

        Returns:
            dict: Mapping of the first column to the second column.
        """
        mapping = {}
        with open(path, "r", encoding="utf-8") as file:
            for raw_line in file:
                fields = raw_line.strip().split("\t")
                # Blank or single-column lines carry no mapping; skip them
                # instead of failing with an IndexError.
                if len(fields) < 2:
                    continue
                mapping[fields[0]] = fields[1]
        return mapping

    @staticmethod
    def is_integer(number: Any) -> bool:
        """Check if integer by type-casting.

        Args:
            number (Any): Number to check.

        Returns:
            bool: Whether `int(number)` succeeds.
        """
        try:
            int(number)
        # TypeError: unsupported types such as None; OverflowError: infinities.
        except (TypeError, ValueError, OverflowError):
            return False
        return True

    @staticmethod
    def is_float(number: Any) -> bool:
        """Check if float by type-casting.

        Args:
            number (Any): Number to check.

        Returns:
            bool: Whether `float(number)` succeeds.
        """
        try:
            float(number)
        # TypeError: unsupported types such as None; OverflowError: huge ints.
        except (TypeError, ValueError, OverflowError):
            return False
        return True

    def normalize_url(self, text: str) -> str:
        """Removes URL from text.

        Args:
            text (str): Text with URL to normalize.

        Returns:
            str: Normalized text with URLs removed.
        """
        # Substituting at the matched positions removes every URL completely,
        # even when one URL is a prefix of another one in the same text.
        return re.sub(self.re_http, "", text)

    def normalize_currency(self, text: str) -> str:
        """Normalizes international and Indonesian (Rupiah) currencies.

        Examples:
        - `"$250"` -> `"dua ratus lima puluh dollar"`
        - `"Rp 3,000,000"` -> `"tiga juta rupiah"`

        Args:
            text (str): Text with currency to normalize.

        Returns:
            str: Normalized text with currency transliterated.
        """

        def _expand(match: "re.Match[str]") -> str:
            """Spell out one currency match, keeping its outer punctuation."""
            matched = match.group(1)
            # Drop "." separators and turn "," into the decimal point.
            digits = re.sub(
                ",", ".", re.sub(r"\.", "", match.group(3).strip(" ,."))
            )
            if digits == "":
                return matched
            if self.is_integer(digits):
                value = int(digits)
            elif self.is_float(digits):
                value = float(digits)
            else:
                # Several separators (e.g. "3,000,000"): read as one integer.
                value = int(re.sub(r"[.,]", "", digits))
            try:
                words = num2words(value, to="cardinal", lang="id")
            except NotImplementedError as error:
                print(error)
                print(f"Problem with money: <{text}>: {value}")
                return matched
            scale = match.group(4) or ""
            spoken = f"{words} {scale} {self.currencies[match.group(2)]}"
            # Only the part between leading/trailing " ,." is replaced.
            body = matched.lstrip(" ,.")
            lead = matched[: len(matched) - len(body)]
            core = body.rstrip(" ,.")
            return lead + spoken + body[len(core):]

        # Replacing per match position keeps "$25" from clobbering "$250".
        return re.sub(self.re_moneys, _expand, text)

    def normalize_measurement(self, text: str) -> str:
        """Normalizes measurement units, including its scalar value.

        Examples:
        - `"10,5 km"` -> `"sepuluh koma lima kilometer"`
        - `"5°C"` -> `"lima derajat celsius"`

        Args:
            text (str): Text with measurements to normalize.

        Returns:
            str: Normalized text with measurements transliterated.
        """

        def _expand(match: "re.Match[str]") -> str:
            """Spell out one measurement match, keeping its outer punctuation."""
            matched = match.group(1)
            # Drop "." separators and turn "," into the decimal point.
            digits = re.sub(
                ",", ".", re.sub(r"\.", "", match.group(2).strip(" ,."))
            )
            if digits == "":
                return matched
            if self.is_integer(digits):
                value = int(digits)
            elif self.is_float(digits):
                value = float(digits)
            else:
                # Several separators (e.g. "1,000,000"): read as one integer
                # rather than letting float() raise ValueError.
                value = int(re.sub(r"[.,]", "", digits))
            try:
                words = num2words(value, to="cardinal", lang="id")
            except NotImplementedError as error:
                print(error)
                print(f"Problem with measurements: <{text}>: {value}")
                return matched
            spoken = f"{words} {self.measurements[match.group(3)]}"
            # Only the part between leading/trailing " ,." is replaced.
            body = matched.lstrip(" ,.")
            lead = matched[: len(matched) - len(body)]
            core = body.rstrip(" ,.")
            return lead + spoken + body[len(core):]

        # Replacing per match position keeps "1 km" from clobbering "11 km".
        return re.sub(self.re_measurements, _expand, text)

    def normalize_date(self, text: str) -> str:
        """Normalizes dates.

        Examples:
        - `"(12/3/2021)"` -> `"dua belas Maret dua ribu dua puluh satu"`

        Args:
            text (str): Text with dates to normalize.

        Returns:
            str: Normalized text with dates transliterated.
        """

        def _expand(match: "re.Match[str]") -> str:
            """Spell out one parenthesized `(day/month[/year])` match."""
            day_digits, month_digits, year_digits = match.group(2, 3, 5)
            try:
                day = num2words(int(day_digits), to="cardinal", lang="id")
                month_index = int(month_digits) - 1
                # Any month outside 1..12 falls back to the first month; this
                # also keeps month 0 from wrapping around to index -1.
                if not 0 <= month_index < len(self.months):
                    month_index = 0
                spoken = f"{day} {self.months[month_index]}"
                if year_digits:
                    year = num2words(int(year_digits), to="cardinal", lang="id")
                    spoken = f"{spoken} {year}"
            except NotImplementedError as error:
                print(error)
                print(f"Problem with dates: <{text}>: {match.groups()}")
                return match.group(0)
            return f" {spoken} "

        return re.sub(r"(\((\d{1,2})/(\d{1,2})(/(\d+))?\))", _expand, text)

    def normalize_timezone(self, text: str) -> str:
        """Normalizes Indonesian time with timezones.

        Examples:
        - `"22.30 WITA"`
            -> `"dua puluh dua lewat tiga puluh menit Waktu Indonesia Tengah"`

        Args:
            text (str): Text with timezones to normalize.

        Returns:
            str: Normalized text with timezones transliterated.
        """

        def _expand(match: "re.Match[str]") -> str:
            """Spell out one `hour.minute ZONE` match."""
            try:
                hour = num2words(int(match.group(2)), to="cardinal", lang="id")
                minute = num2words(int(match.group(3)), to="cardinal", lang="id")
            except NotImplementedError as error:
                print(error)
                print(f"Problem with timezones: <{text}>: {match.groups()}")
                return match.group(0)
            zone = self.timezones[match.group(4)]
            # Zero minutes are not read out.
            if minute == "nol":
                return f"{hour} {zone}"
            return f"{hour} lewat {minute} menit {zone}"

        # Replacing per match position keeps "2.30 WIB" from clobbering
        # "12.30 WIB".
        return re.sub(self.re_timezones, _expand, text)

    def normalize_number(self, text: str) -> str:
        """Normalizes Arabic numbers to Indonesian.

        Punctuation directly before or after a number (for example a
        sentence-final period) is kept in the output.

        Examples:
        - `"1.000"` -> `"seribu"`
        - `"10,5"` -> `"sepuluh koma lima"`

        Args:
            text (str): Text with numbers to normalize.

        Returns:
            str: Normalized text with numbers transliterated.
        """

        def _expand(match: "re.Match[str]") -> str:
            """Spell out one numeric match, keeping its outer punctuation."""
            matched = match.group(0)
            # Drop "." separators and turn "," into the decimal point.
            digits = re.sub(",", ".", re.sub(r"\.", "", matched.strip(" ,.")))
            if digits == "" or not (
                self.is_float(digits) or self.is_integer(digits)
            ):
                return matched
            value = int(digits) if self.is_integer(digits) else float(digits)
            try:
                words = num2words(value, to="cardinal", lang="id")
            except NotImplementedError as error:
                print(error)
                print(f"Problem with number: <{text}>: {value}")
                return matched
            body = matched.lstrip(" ,.")
            lead = matched[: len(matched) - len(body)]
            core = body.rstrip(" ,.")
            return lead + words + body[len(core):]

        # First pass: whole numbers with separators. Second pass: digit runs
        # left behind by tokens the first pass could not parse (e.g. "1,2,3").
        for pattern in (r"([\d.,]+)", r"\d+"):
            text = re.sub(pattern, _expand, text)
        return text

    def normalize(self, text: str) -> str:
        """Normalizes Indonesian text by expanding:

        - URL
        - Currency
        - Measurements
        - Dates
        - Timezones
        - Arabic Numerals

        Args:
            text (str): Text to normalize.

        Returns:
            str: Normalized text.
        """
        # Remove URL
        text = self.normalize_url(text)
        # Currency
        text = self.normalize_currency(text)
        # Measurements
        text = self.normalize_measurement(text)
        # Date
        text = self.normalize_date(text)
        # Timezones
        text = self.normalize_timezone(text)
        # Any number
        text = self.normalize_number(text)
        # collapse consecutive whitespaces
        text = re.sub(r"\s+", " ", text)
        return text

is_float(number) staticmethod

Check if float by type-casting.

Parameters:

Name Type Description Default
number Any

Number to check.

required

Returns:

Type Description
bool

Is a valid float.

Source code in g2p_id/text_processor.py
@staticmethod
def is_float(number: Any) -> bool:
    """Check if float by type-casting.

    Args:
        number (Any): Number to check.

    Returns:
        bool: Is a valid float.
    """
    try:
        float(number)
        return True
    except ValueError:
        return False

is_integer(number) staticmethod

Check if integer by type-casting.

Parameters:

Name Type Description Default
number Any

Number to check.

required

Returns:

Type Description
bool

Is a valid integer.

Source code in g2p_id/text_processor.py
@staticmethod
def is_integer(number: Any) -> bool:
    """Check if integer by type-casting.

    Args:
        number (Any): Number to check.

    Returns:
        bool: Is a valid integer.
    """
    try:
        int(number)
        return True
    except ValueError:
        return False

normalize(self, text)

Normalizes Indonesian text by expanding:

  • URL
  • Currency
  • Measurements
  • Dates
  • Timezones
  • Arabic Numerals

Parameters:

Name Type Description Default
text str

Text to normalize.

required

Returns:

Type Description
str

Normalized text.

Source code in g2p_id/text_processor.py
def normalize(self, text: str) -> str:
    """Run the full normalization pipeline over Indonesian text.

    The stages are applied in this order:

    - URL
    - Currency
    - Measurements
    - Dates
    - Timezones
    - Arabic Numerals

    Afterwards every run of whitespace is collapsed into a single space.

    Args:
        text (str): Text to normalize.

    Returns:
        str: Normalized text.
    """
    stages = (
        self.normalize_url,
        self.normalize_currency,
        self.normalize_measurement,
        self.normalize_date,
        self.normalize_timezone,
        self.normalize_number,
    )
    for stage in stages:
        text = stage(text)
    return re.sub(r"\s+", " ", text)

normalize_currency(self, text)

Normalizes international and Indonesian (Rupiah) currencies.

Examples:

  • "$250" -> "dua ratus lima puluh dollar"
  • "Rp 3,000,000" -> "tiga juta rupiah"

Parameters:

Name Type Description Default
text str

Text with currency to normalize.

required

Returns:

Type Description
str

Normalized text with currency transliterated.

Source code in g2p_id/text_processor.py
def normalize_currency(self, text: str) -> str:
    """Normalizes international and Indonesian (Rupiah) currencies.

    Examples:
    - `"$250"` -> `"dua ratus lima puluh dollar"`
    - `"Rp 3,000,000"` -> `"tiga juta rupiah"`

    Args:
        text (str): Text with currency to normalize.

    Returns:
        str: Normalized text with currency transliterated.
    """

    def _expand(match: "re.Match[str]") -> str:
        """Spell out one currency match, keeping its outer punctuation."""
        matched = match.group(1)
        # Drop "." separators and turn "," into the decimal point.
        digits = re.sub(
            ",", ".", re.sub(r"\.", "", match.group(3).strip(" ,."))
        )
        if digits == "":
            return matched
        if self.is_integer(digits):
            value = int(digits)
        elif self.is_float(digits):
            value = float(digits)
        else:
            # Several separators (e.g. "3,000,000"): read as one integer.
            value = int(re.sub(r"[.,]", "", digits))
        try:
            words = num2words(value, to="cardinal", lang="id")
        except NotImplementedError as error:
            print(error)
            print(f"Problem with money: <{text}>: {value}")
            return matched
        scale = match.group(4) or ""
        spoken = f"{words} {scale} {self.currencies[match.group(2)]}"
        # Only the part between leading/trailing " ,." is replaced.
        body = matched.lstrip(" ,.")
        lead = matched[: len(matched) - len(body)]
        core = body.rstrip(" ,.")
        return lead + spoken + body[len(core):]

    # Replacing per match position keeps "$25" from clobbering "$250".
    return re.sub(self.re_moneys, _expand, text)

normalize_date(self, text)

Normalizes dates.

Examples:

  • "(12/3/2021)" -> "dua belas Maret dua ribu dua puluh satu"

Parameters:

Name Type Description Default
text str

Text with dates to normalize.

required

Returns:

Type Description
str

Normalized text with dates transliterated.

Source code in g2p_id/text_processor.py
def normalize_date(self, text: str) -> str:
    """Normalizes dates.

    Examples:
    - `"(12/3/2021)"` -> `"dua belas Maret dua ribu dua puluh satu"`

    Args:
        text (str): Text with dates to normalize.

    Returns:
        str: Normalized text with dates transliterated.
    """

    def _expand(match: "re.Match[str]") -> str:
        """Spell out one parenthesized `(day/month[/year])` match."""
        day_digits, month_digits, year_digits = match.group(2, 3, 5)
        try:
            day = num2words(int(day_digits), to="cardinal", lang="id")
            month_index = int(month_digits) - 1
            # Any month outside 1..12 falls back to the first month; this
            # also keeps month 0 from wrapping around to index -1.
            if not 0 <= month_index < len(self.months):
                month_index = 0
            spoken = f"{day} {self.months[month_index]}"
            if year_digits:
                year = num2words(int(year_digits), to="cardinal", lang="id")
                spoken = f"{spoken} {year}"
        except NotImplementedError as error:
            print(error)
            print(f"Problem with dates: <{text}>: {match.groups()}")
            return match.group(0)
        return f" {spoken} "

    return re.sub(r"(\((\d{1,2})/(\d{1,2})(/(\d+))?\))", _expand, text)

normalize_measurement(self, text)

Normalizes measurement units, including its scalar value.

Examples:

  • "10,5 km" -> "sepuluh koma lima kilometer"
  • "5°C" -> "lima derajat celsius"

Parameters:

Name Type Description Default
text str

Text with measurements to normalize.

required

Returns:

Type Description
str

Normalized text with measurements transliterated.

Source code in g2p_id/text_processor.py
def normalize_measurement(self, text: str) -> str:
    """Normalizes measurement units, including its scalar value.

    Examples:
    - `"10,5 km"` -> `"sepuluh koma lima kilometer"`
    - `"5°C"` -> `"lima derajat celsius"`

    Args:
        text (str): Text with measurements to normalize.

    Returns:
        str: Normalized text with measurements transliterated.
    """

    def _expand(match: "re.Match[str]") -> str:
        """Spell out one measurement match, keeping its outer punctuation."""
        matched = match.group(1)
        # Drop "." separators and turn "," into the decimal point.
        digits = re.sub(
            ",", ".", re.sub(r"\.", "", match.group(2).strip(" ,."))
        )
        if digits == "":
            return matched
        if self.is_integer(digits):
            value = int(digits)
        elif self.is_float(digits):
            value = float(digits)
        else:
            # Several separators (e.g. "1,000,000"): read as one integer
            # rather than letting float() raise ValueError.
            value = int(re.sub(r"[.,]", "", digits))
        try:
            words = num2words(value, to="cardinal", lang="id")
        except NotImplementedError as error:
            print(error)
            print(f"Problem with measurements: <{text}>: {value}")
            return matched
        spoken = f"{words} {self.measurements[match.group(3)]}"
        # Only the part between leading/trailing " ,." is replaced.
        body = matched.lstrip(" ,.")
        lead = matched[: len(matched) - len(body)]
        core = body.rstrip(" ,.")
        return lead + spoken + body[len(core):]

    # Replacing per match position keeps "1 km" from clobbering "11 km".
    return re.sub(self.re_measurements, _expand, text)

normalize_number(self, text)

Normalizes Arabic numbers to Indonesian.

Examples:

  • "1.000" -> "seribu"
  • "10,5" -> "sepuluh koma lima"

Parameters:

Name Type Description Default
text str

Text with numbers to normalize.

required

Returns:

Type Description
str

Normalized text with numbers transliterated.

Source code in g2p_id/text_processor.py
def normalize_number(self, text: str) -> str:
    """Normalizes Arabic numbers to Indonesian.

    Punctuation directly before or after a number (for example a
    sentence-final period) is kept in the output.

    Examples:
    - `"1.000"` -> `"seribu"`
    - `"10,5"` -> `"sepuluh koma lima"`

    Args:
        text (str): Text with numbers to normalize.

    Returns:
        str: Normalized text with numbers transliterated.
    """

    def _expand(match: "re.Match[str]") -> str:
        """Spell out one numeric match, keeping its outer punctuation."""
        matched = match.group(0)
        # Drop "." separators and turn "," into the decimal point.
        digits = re.sub(",", ".", re.sub(r"\.", "", matched.strip(" ,.")))
        if digits == "" or not (
            self.is_float(digits) or self.is_integer(digits)
        ):
            return matched
        value = int(digits) if self.is_integer(digits) else float(digits)
        try:
            words = num2words(value, to="cardinal", lang="id")
        except NotImplementedError as error:
            print(error)
            print(f"Problem with number: <{text}>: {value}")
            return matched
        body = matched.lstrip(" ,.")
        lead = matched[: len(matched) - len(body)]
        core = body.rstrip(" ,.")
        return lead + words + body[len(core):]

    # First pass: whole numbers with separators. Second pass: digit runs
    # left behind by tokens the first pass could not parse (e.g. "1,2,3").
    for pattern in (r"([\d.,]+)", r"\d+"):
        text = re.sub(pattern, _expand, text)
    return text

normalize_timezone(self, text)

Normalizes Indonesian time with timezones.

Examples:

  • "22.30 WITA" -> "dua puluh dua lewat tiga puluh menit Waktu Indonesia Tengah"

Parameters:

Name Type Description Default
text str

Text with timezones to normalize.

required

Returns:

Type Description
str

Normalized text with timezones transliterated.

Source code in g2p_id/text_processor.py
def normalize_timezone(self, text: str) -> str:
    """Normalizes Indonesian time with timezones.

    Examples:
    - `"22.30 WITA"`
        -> `"dua puluh dua lewat tiga puluh menit Waktu Indonesia Tengah"`

    Args:
        text (str): Text with timezones to normalize.

    Returns:
        str: Normalized text with timezones transliterated.
    """

    def _expand(match: "re.Match[str]") -> str:
        """Spell out one `hour.minute ZONE` match."""
        try:
            hour = num2words(int(match.group(2)), to="cardinal", lang="id")
            minute = num2words(int(match.group(3)), to="cardinal", lang="id")
        except NotImplementedError as error:
            print(error)
            print(f"Problem with timezones: <{text}>: {match.groups()}")
            return match.group(0)
        zone = self.timezones[match.group(4)]
        # Zero minutes are not read out.
        if minute == "nol":
            return f"{hour} {zone}"
        return f"{hour} lewat {minute} menit {zone}"

    # Replacing per match position keeps "2.30 WIB" from clobbering
    # "12.30 WIB".
    return re.sub(self.re_timezones, _expand, text)

normalize_url(self, text)

Removes URL from text.

Parameters:

Name Type Description Default
text str

Text with URL to normalize.

required

Returns:

Type Description
str

Normalized text with URLs removed.

Source code in g2p_id/text_processor.py
def normalize_url(self, text: str) -> str:
    """Removes URL from text.

    Args:
        text (str): Text with URL to normalize.

    Returns:
        str: Normalized text with URLs removed.
    """
    # Substituting at the matched positions removes every URL completely,
    # even when one URL is a prefix of another one in the same text.
    return re.sub(self.re_http, "", text)