Source code for gliner.data_processing.tokenizer

"""Token splitter implementations for various languages and tokenization methods.

This module provides multiple token splitter classes for different languages and
tokenization strategies, including whitespace-based, language-specific, and
universal multi-language splitters.
"""

import re
from collections.abc import Iterator

from ..utils import is_module_available

if is_module_available("langdetect"):
    from langdetect.lang_detect_exception import LangDetectException


class TokenSplitterBase:
    """Base class for token splitters.

    This class provides the interface for all token splitter implementations.
    Subclasses should implement the __call__ method to yield tokens with their
    start and end positions.
    """

    def __call__(self, text: str) -> Iterator[tuple[str, int, int]]:
        """Split text into tokens.

        Args:
            text: The input text to tokenize.

        Yields:
            tuple: A tuple of (token, start_index, end_index).
        """
        raise NotImplementedError

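# Illustrative sketch (not part of the original module): a custom splitter only
# needs to subclass TokenSplitterBase and yield (token, start, end) triples.
# The comma splitter below is hypothetical, shown purely to demonstrate the
# interface:
#
#     class CommaTokenSplitter(TokenSplitterBase):
#         def __call__(self, text):
#             start = 0
#             for part in text.split(","):
#                 yield part, start, start + len(part)
#                 start += len(part) + 1  # skip past the comma
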
class WhitespaceTokenSplitter(TokenSplitterBase):
    """Whitespace-based token splitter.

    Splits text based on whitespace boundaries, treating words and symbols as
    separate tokens. Supports hyphenated and underscored words.
    """

    def __init__(self):
        """Initialize the whitespace token splitter with its regex pattern."""
        self.whitespace_pattern = re.compile(r"\w+(?:[-_]\w+)*|\S")

    def __call__(self, text):
        """Split text into tokens based on whitespace.

        Args:
            text: The input text to tokenize.

        Yields:
            tuple: A tuple of (token, start_index, end_index).
        """
        for match in self.whitespace_pattern.finditer(text):
            yield match.group(), match.start(), match.end()

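# Usage sketch (illustrative): the regex keeps hyphenated/underscored words as
# single tokens, while any other non-whitespace character becomes its own token.
#
#     >>> splitter = WhitespaceTokenSplitter()
#     >>> list(splitter("state-of-the-art NLP!"))
#     [('state-of-the-art', 0, 16), ('NLP', 17, 20), ('!', 21, 22)]
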
class SpaCyTokenSplitter(TokenSplitterBase):
    """spaCy-based token splitter.

    Uses spaCy's language models for tokenization. Supports multiple languages
    through spaCy's blank language models.
    """

    def __init__(self, lang=None):
        """Initialize the spaCy token splitter.

        Args:
            lang: Language code for the spaCy model (default: 'en' for English).

        Raises:
            ModuleNotFoundError: If spaCy is not installed.
        """
        if not is_module_available("spacy"):
            raise ModuleNotFoundError("Please install spacy with: `pip install spacy`")
        import spacy  # noqa: PLC0415

        if lang is None:
            lang = "en"
        self.nlp = spacy.blank(lang)

    def __call__(self, text):
        """Split text into tokens using spaCy.

        Args:
            text: The input text to tokenize.

        Yields:
            tuple: A tuple of (token, start_index, end_index).
        """
        doc = self.nlp(text)
        for token in doc:
            yield token.text, token.idx, token.idx + len(token.text)

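# Usage sketch (illustrative; requires `pip install spacy`): a blank pipeline
# only tokenizes, so no trained model download is needed. Typical output:
#
#     >>> splitter = SpaCyTokenSplitter(lang="en")
#     >>> list(splitter("Hello, world!"))
#     [('Hello', 0, 5), (',', 5, 6), ('world', 7, 12), ('!', 12, 13)]
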
class MecabKoTokenSplitter(TokenSplitterBase):
    """MeCab Korean token splitter.

    Uses python-mecab-ko for Korean language tokenization based on
    morphological analysis.
    """

    def __init__(self):
        """Initialize the MeCab Korean token splitter.

        Raises:
            ModuleNotFoundError: If python-mecab-ko is not installed.
        """
        if not is_module_available("mecab"):
            raise ModuleNotFoundError("Please install python-mecab-ko with: `pip install python-mecab-ko`")
        import mecab  # noqa: PLC0415

        self.tagger = mecab.MeCab()

    def __call__(self, text):
        """Split Korean text into morphemes.

        Args:
            text: The input text to tokenize.

        Yields:
            tuple: A tuple of (token, start_index, end_index).
        """
        tokens = self.tagger.morphs(text)
        last_idx = 0
        for morph in tokens:
            start_idx = text.find(morph, last_idx)
            end_idx = start_idx + len(morph)
            last_idx = end_idx
            yield morph, start_idx, end_idx

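# Note on offsets: MeCab's morphs() returns surface strings without positions,
# so the loop above re-aligns each morpheme against the original text with
# str.find, advancing last_idx so repeated tokens map to successive
# occurrences. The same trick recurs in the Janome, Jieba, CAMeL, and HanLP
# splitters below. A minimal standalone sketch of the alignment:
#
#     >>> text, tokens = "ab cd ab", ["ab", "cd", "ab"]
#     >>> spans, last = [], 0
#     >>> for tok in tokens:
#     ...     start = text.find(tok, last)
#     ...     last = start + len(tok)
#     ...     spans.append((tok, start, last))
#     >>> spans
#     [('ab', 0, 2), ('cd', 3, 5), ('ab', 6, 8)]
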
class JanomeJaTokenSplitter(TokenSplitterBase):
    """Janome Japanese token splitter.

    Uses Janome for Japanese language tokenization with morphological analysis.
    """

    def __init__(self):
        """Initialize the Janome Japanese token splitter.

        Raises:
            ModuleNotFoundError: If janome is not installed.
        """
        if not is_module_available("janome"):
            raise ModuleNotFoundError("Please install janome with: `pip install janome`")
        from janome.tokenizer import Tokenizer  # noqa: PLC0415

        self.tokenizer = Tokenizer()

    def __call__(self, text):
        """Split Japanese text into tokens.

        Args:
            text: The input text to tokenize.

        Yields:
            tuple: A tuple of (token, start_index, end_index).
        """
        last_idx = 0
        for token in self.tokenizer.tokenize(text, wakati=True):
            start_idx = text.find(token, last_idx)
            end_idx = start_idx + len(token)
            last_idx = end_idx
            yield token, start_idx, end_idx

class JiebaTokenSplitter(TokenSplitterBase):
    """Jieba Chinese token splitter.

    Uses Jieba (via the jieba3 package) for Chinese language segmentation and
    tokenization.
    """

    def __init__(self):
        """Initialize the Jieba Chinese token splitter.

        Raises:
            ModuleNotFoundError: If jieba3 is not installed.
        """
        if not is_module_available("jieba3"):
            raise ModuleNotFoundError("Please install jieba3 with: `pip install jieba3`")
        import jieba3  # noqa: PLC0415

        self.tagger = jieba3.jieba3()

    def __call__(self, text):
        """Split Chinese text into tokens.

        Args:
            text: The input text to tokenize.

        Yields:
            tuple: A tuple of (token, start_index, end_index).
        """
        tokens = self.tagger.cut_text(text)
        last_idx = 0
        for token in tokens:
            start_idx = text.find(token, last_idx)
            end_idx = start_idx + len(token)
            last_idx = end_idx
            yield token, start_idx, end_idx

class CamelArabicSplitter(TokenSplitterBase):
    """CAMeL Tools Arabic token splitter.

    Uses CAMeL Tools for Arabic language tokenization with support for
    Arabic-specific linguistic features.
    """

    def __init__(self):
        """Initialize the CAMeL Tools Arabic token splitter.

        Raises:
            ModuleNotFoundError: If camel_tools is not installed.
        """
        if not is_module_available("camel_tools"):
            raise ModuleNotFoundError("Please install camel-tools with: `pip install camel-tools`")
        from camel_tools.tokenizers.word import simple_word_tokenize  # noqa: PLC0415

        self.tokenizer = simple_word_tokenize

    def __call__(self, text):
        """Split Arabic text into tokens.

        Args:
            text: The input text to tokenize.

        Yields:
            tuple: A tuple of (token, start_index, end_index).
        """
        tokens = self.tokenizer(text)
        last_idx = 0
        for token in tokens:
            start_idx = text.find(token, last_idx)
            end_idx = start_idx + len(token)
            last_idx = end_idx
            yield token, start_idx, end_idx

class HindiSplitter(TokenSplitterBase):
    """Indic NLP Hindi token splitter.

    Uses the Indic NLP Library for Hindi language tokenization with support
    for Devanagari script.
    """

    def __init__(self):
        """Initialize the Hindi token splitter.

        Raises:
            ModuleNotFoundError: If indic-nlp-library is not installed.
        """
        if not is_module_available("indicnlp"):
            raise ModuleNotFoundError("Please install indic-nlp-library with: `pip install indic-nlp-library`")
        from indicnlp.tokenize import indic_tokenize  # noqa: PLC0415

        self.tokenizer = lambda text: indic_tokenize.trivial_tokenize(text, lang="hi")

    def __call__(self, text):
        """Split Hindi text into tokens.

        Args:
            text: The input text to tokenize.

        Yields:
            tuple: A tuple of (token, start_index, end_index).
        """
        tokens = self.tokenizer(text)
        last_idx = 0
        for token in tokens:
            match = re.search(re.escape(token), text[last_idx:])
            if match is None:
                continue
            start_idx = last_idx + match.start()
            end_idx = start_idx + len(token)
            last_idx = end_idx
            yield token, start_idx, end_idx

class HanLPTokenSplitter(TokenSplitterBase):
    """HanLP Chinese token splitter.

    Uses HanLP for Chinese language tokenization with support for multiple
    pre-trained models.
    """

    def __init__(self, model_name="FINE_ELECTRA_SMALL_ZH"):
        """Initialize the HanLP token splitter.

        Args:
            model_name: Name of the HanLP pre-trained model to use
                (default: 'FINE_ELECTRA_SMALL_ZH').

        Raises:
            ModuleNotFoundError: If hanlp is not installed.
            ValueError: If the specified model name is not available.
        """
        if not is_module_available("hanlp"):
            raise ModuleNotFoundError("Please install hanlp with: `pip install hanlp`")
        import hanlp  # noqa: PLC0415
        import hanlp.pretrained  # noqa: PLC0415

        models = hanlp.pretrained.tok.ALL
        if model_name not in models:
            raise ValueError(f"HanLP: {model_name} is not available, choose from: {list(models)}")
        url = models[model_name]
        self.tagger = hanlp.load(url)

    def __call__(self, text):
        """Split Chinese text into tokens using HanLP.

        Args:
            text: The input text to tokenize.

        Yields:
            tuple: A tuple of (token, start_index, end_index).
        """
        tokens = self.tagger(text)
        last_idx = 0
        for token in tokens:
            start_idx = text.find(token, last_idx)
            end_idx = start_idx + len(token)
            last_idx = end_idx
            yield token, start_idx, end_idx

class MultiLangWordsSplitter(TokenSplitterBase):
    """Multi-language token splitter with automatic language detection.

    Automatically detects the input language and applies the appropriate
    language-specific tokenizer. Falls back to a universal splitter for
    unsupported languages.
    """

    def __init__(self, logging=False, use_spacy=True):
        """Initialize the multi-language token splitter.

        Args:
            logging: Whether to print language detection information (default: False).
            use_spacy: Whether to use spaCy as the universal fallback splitter.
                If False, uses whitespace-based splitting (default: True).

        Raises:
            ImportError: If langdetect is not installed.
        """
        if not is_module_available("langdetect"):
            raise ImportError("Please install langdetect with: `pip install langdetect`")
        from langdetect import DetectorFactory, detect  # noqa: PLC0415

        DetectorFactory.seed = 0
        self.detect = detect
        self.lang2splitter = {
            "ko": MecabKoTokenSplitter(),
            "ja": JanomeJaTokenSplitter(),
            "hi": HindiSplitter(),
            "zh-cn": JiebaTokenSplitter(),
            "zh-tw": JiebaTokenSplitter(),
            "zh": JiebaTokenSplitter(),
            "ar": CamelArabicSplitter(),
        }
        if use_spacy:
            self.universal_splitter = SpaCyTokenSplitter(lang="xx")
        else:
            self.universal_splitter = WhitespaceTokenSplitter()
        self.logging = logging

    def __call__(self, text):
        """Split text into tokens with automatic language detection.

        Args:
            text: The input text to tokenize.

        Yields:
            tuple: A tuple of (token, start_index, end_index).
        """
        lang = "unknown"
        splitter = self.universal_splitter
        try:
            lang = self.detect(text)
        except LangDetectException:
            pass
        else:
            splitter = self.lang2splitter.get(lang)
            if splitter is None:
                splitter = self.universal_splitter
                self.lang2splitter[lang] = splitter
        if self.logging:
            if lang != "unknown":
                print(  # noqa: T201
                    f"Detected language: {lang}, using splitter: {splitter.__class__.__name__}"
                )
            else:
                print(  # noqa: T201
                    f"Language detection failed, using default splitter: {splitter.__class__.__name__}"
                )
        yield from splitter(text)

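# Usage sketch (illustrative). Note that the constructor eagerly instantiates
# every entry in lang2splitter, so the Korean, Japanese, Hindi, Chinese, and
# Arabic backends must all be installed; use_spacy=False swaps the universal
# fallback for the dependency-free whitespace splitter.
#
#     >>> splitter = MultiLangWordsSplitter(use_spacy=False)
#     >>> list(splitter("Hello world"))  # unsupported language -> fallback
#     [('Hello', 0, 5), ('world', 6, 11)]
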
class StanzaWordsSplitter(TokenSplitterBase):
    """Stanza-based multi-language token splitter.

    Uses Stanford's Stanza NLP library for tokenization with support for
    multiple languages. Automatically downloads language models when needed
    and falls back to a default language if detection fails.
    """

    def __init__(
        self,
        default_lang: str = "en",
        download_on_missing: bool = True,
        logging: bool = False,
    ):
        """Initialize the Stanza token splitter.

        Args:
            default_lang: Default language code to use if detection fails (default: 'en').
            download_on_missing: Whether to automatically download missing
                language models (default: True).
            logging: Whether to print download and processing information (default: False).

        Raises:
            ModuleNotFoundError: If stanza or langdetect is not installed.
        """
        if not is_module_available("stanza"):
            raise ModuleNotFoundError("Please install stanza with: `pip install stanza`")
        if not is_module_available("langdetect"):
            raise ModuleNotFoundError("Please install langdetect with: `pip install langdetect`")
        import stanza  # noqa: PLC0415
        from langdetect import DetectorFactory, LangDetectException, detect  # noqa: PLC0415

        DetectorFactory.seed = 42
        self._stanza = stanza
        self._detect = detect
        self._LangDetectException = LangDetectException
        self.default_lang = default_lang
        self.download_on_missing = download_on_missing
        self.logging = logging
        self._pipelines: dict[str, stanza.Pipeline | None] = {}
        self._ensure_pipeline(default_lang)

    def _ensure_pipeline(self, lang: str):
        """Ensure a Stanza pipeline is available for the given language.

        Args:
            lang: Language code for the pipeline.

        Returns:
            stanza.Pipeline or None: The pipeline if available, None otherwise.
        """
        if lang in self._pipelines:
            return self._pipelines[lang]
        stanza = self._stanza
        pipeline = None
        try:
            pipeline = stanza.Pipeline(lang, processors="tokenize", verbose=False, download_method=None)
        except Exception:
            pass
        if pipeline is None and self.download_on_missing:
            try:
                if self.logging:
                    print(  # noqa: T201
                        f"[StanzaWordsSplitter] downloading model for '{lang}'"
                    )
                stanza.download(lang, processors="tokenize", verbose=False)
                pipeline = stanza.Pipeline(lang, processors="tokenize", verbose=False)
            except Exception:
                pipeline = None
        self._pipelines[lang] = pipeline
        return pipeline

    def __call__(self, text):
        """Split text into tokens using Stanza with language detection.

        Args:
            text: The input text to tokenize.

        Yields:
            tuple: A tuple of (token, start_index, end_index).

        Raises:
            RuntimeError: If neither the detected language nor the default
                language pipeline could be loaded.
        """
        try:
            lang = self._detect(text)
            if lang == "zh-cn":
                lang = "zh"
        except self._LangDetectException:
            lang = self.default_lang
        pipeline = self._ensure_pipeline(lang) or self._ensure_pipeline(self.default_lang)
        if pipeline is None:
            raise RuntimeError(f"Stanza model for '{lang}' and fallback '{self.default_lang}' could not be loaded.")
        for sentence in pipeline(text).sentences:
            for word in sentence.words:
                yield word.text, word.start_char, word.end_char

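# Usage sketch (illustrative; the first call may download the tokenize model
# for the detected language unless download_on_missing=False). Exact output
# depends on the Stanza model version:
#
#     >>> splitter = StanzaWordsSplitter(default_lang="en")
#     >>> [tok for tok, start, end in splitter("Stanza splits sentences too.")]
#     ['Stanza', 'splits', 'sentences', 'too', '.']
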
class WordsSplitter(TokenSplitterBase):
    """Universal token splitter with multiple backend options.

    Factory class that creates the appropriate token splitter based on the
    specified splitter type. Supports various language-specific and universal
    tokenization strategies.
    """

    def __init__(self, splitter_type="whitespace"):
        """Initialize the words splitter with the specified backend.

        Args:
            splitter_type: Type of splitter to use. Options are:
                - 'universal': Multi-language with auto-detection
                - 'whitespace': Simple whitespace-based splitting
                - 'spacy': spaCy-based tokenization
                - 'mecab': MeCab for Korean
                - 'jieba': Jieba for Chinese
                - 'hanlp': HanLP for Chinese
                - 'janome': Janome for Japanese
                - 'camel': CAMeL Tools for Arabic
                - 'hindi': Indic NLP for Hindi
                - 'stanza': Stanza multi-language tokenization
                Default is 'whitespace'.

        Raises:
            ValueError: If the specified splitter_type is not implemented.
        """
        if splitter_type == "universal":
            self.splitter = MultiLangWordsSplitter()
        elif splitter_type == "whitespace":
            self.splitter = WhitespaceTokenSplitter()
        elif splitter_type == "spacy":
            self.splitter = SpaCyTokenSplitter()
        elif splitter_type == "mecab":
            self.splitter = MecabKoTokenSplitter()
        elif splitter_type == "jieba":
            self.splitter = JiebaTokenSplitter()
        elif splitter_type == "hanlp":
            self.splitter = HanLPTokenSplitter()
        elif splitter_type == "janome":
            self.splitter = JanomeJaTokenSplitter()
        elif splitter_type == "camel":
            self.splitter = CamelArabicSplitter()
        elif splitter_type == "hindi":
            self.splitter = HindiSplitter()
        elif splitter_type == "stanza":
            self.splitter = StanzaWordsSplitter()
        else:
            raise ValueError(
                f"{splitter_type} is not implemented, choose between "
                "'universal', 'whitespace', 'spacy', 'mecab', 'jieba', "
                "'hanlp', 'janome', 'camel', 'hindi' and 'stanza'"
            )

    def __call__(self, text):
        """Split text into tokens using the configured splitter.

        Args:
            text: The input text to tokenize.

        Yields:
            tuple: A tuple of (token, start_index, end_index).
        """
        yield from self.splitter(text)

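# Usage sketch (illustrative): WordsSplitter is the module's single entry
# point; pick a backend by name and iterate the (token, start, end) triples.
#
#     >>> splitter = WordsSplitter("whitespace")
#     >>> list(splitter("GLiNER splits text."))
#     [('GLiNER', 0, 6), ('splits', 7, 13), ('text', 14, 18), ('.', 18, 19)]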