Word Tokenizer

speechline.utils.tokenizer.WordTokenizer dataclass

Basic word-based splitting.

Source code in speechline/utils/tokenizer.py
from string import punctuation
from typing import List

from nltk.tokenize import TweetTokenizer


class WordTokenizer:
    """
    Basic word-based splitting.
    """

    tokenizer = TweetTokenizer(preserve_case=False)

    def __call__(self, text: str) -> List[str]:
        """
        Splits text into words, ignoring punctuation and case.

        Args:
            text (str):
                Text to tokenize.

        Returns:
            List[str]:
                List of tokens.
        """
        tokens = self.tokenizer.tokenize(text)
        # Drop tokens that are bare punctuation marks.
        tokens = [token for token in tokens if token not in punctuation]
        return tokens
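
A minimal usage sketch (assuming nltk is installed and the class is importable from speechline.utils.tokenizer as documented above):

from speechline.utils.tokenizer import WordTokenizer

tokenizer = WordTokenizer()
tokenizer("Hello, World! How are you?")
# ['hello', 'world', 'how', 'are', 'you']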

__call__(self, text) special

Splits text into words, ignoring punctuation and case.

Parameters:

Name    Type    Description          Default
text    str     Text to tokenize.    required

Returns:

Type         Description
List[str]    List of tokens.

Source code in speechline/utils/tokenizer.py
def __call__(self, text: str) -> List[str]:
    """
    Splits text into words, ignoring punctuation and case.

    Args:
        text (str):
            Text to tokenize.

    Returns:
        List[str]:
            List of tokens.
    """
    tokens = self.tokenizer.tokenize(text)
    # Drop tokens that are bare punctuation marks.
    tokens = [token for token in tokens if token not in punctuation]
    return tokens
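
A sketch of the expected behavior (assuming NLTK's TweetTokenizer, which keeps contractions such as "don't" together as single tokens):

tokenizer = WordTokenizer()
tokenizer("Don't SHOUT, please.")
# ["don't", 'shout', 'please']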