Source code for gliner.data_processing.tokenizer

"""Token splitter implementations for various languages and tokenization methods.

This module provides multiple token splitter classes for different languages and
tokenization strategies, including whitespace-based, language-specific, and
universal multi-language splitters.
"""

import re

from ..utils import is_module_available

if is_module_available("langdetect"):
    from langdetect.lang_detect_exception import LangDetectException



[docs]
class TokenSplitterBase:
    """Base class for token splitters.

    This class provides the interface for all token splitter implementations.
    Subclasses should implement the __call__ method to yield tokens with their
    start and end positions.
    """


[docs]
    def __init__(self):
        """Initialize the token splitter."""
        pass


    def __call__(self, text) -> (str, int, int):
        """Split text into tokens.

        Args:
            text: The input text to tokenize.

        Yields:
            tuple: A tuple of (token, start_index, end_index).
        """
        pass




[docs]
class WhitespaceTokenSplitter(TokenSplitterBase):
    """Whitespace-based token splitter.

    Splits text based on whitespace boundaries, treating words and symbols
    as separate tokens. Supports hyphenated and underscored words.
    """


[docs]
    def __init__(self):
        """Initialize the whitespace token splitter with regex pattern."""
        self.whitespace_pattern = re.compile(r"\w+(?:[-_]\w+)*|\S")


    def __call__(self, text):
        """Split text into tokens based on whitespace.

        Args:
            text: The input text to tokenize.

        Yields:
            tuple: A tuple of (token, start_index, end_index).
        """
        for match in self.whitespace_pattern.finditer(text):
            yield match.group(), match.start(), match.end()




[docs]
class SpaCyTokenSplitter(TokenSplitterBase):
    """spaCy-based token splitter.

    Uses spaCy's language models for tokenization. Supports multiple languages
    through spaCy's blank language models.
    """


[docs]
    def __init__(self, lang=None):
        """Initialize the spaCy token splitter.

        Args:
            lang: Language code for spaCy model (default: 'en' for English).

        Raises:
            ModuleNotFoundError: If spaCy is not installed.
        """
        if not is_module_available("spacy"):
            raise ModuleNotFoundError("Please install spacy with: `pip install spacy`")
        import spacy  # noqa: PLC0415

        if lang is None:
            lang = "en"
        self.nlp = spacy.blank(lang)


    def __call__(self, text):
        """Split text into tokens using spaCy.

        Args:
            text: The input text to tokenize.

        Yields:
            tuple: A tuple of (token, start_index, end_index).
        """
        doc = self.nlp(text)
        for token in doc:
            yield token.text, token.idx, token.idx + len(token.text)




[docs]
class MecabKoTokenSplitter(TokenSplitterBase):
    """MeCab Korean token splitter.

    Uses python-mecab-ko for Korean language tokenization based on
    morphological analysis.
    """


[docs]
    def __init__(self):
        """Initialize the MeCab Korean token splitter.

        Raises:
            ModuleNotFoundError: If python-mecab-ko is not installed.
        """
        if not is_module_available("mecab"):
            raise ModuleNotFoundError("Please install python-mecab-ko with: `pip install python-mecab-ko`")
        import mecab  # noqa: PLC0415

        self.tagger = mecab.MeCab()


    def __call__(self, text):
        """Split Korean text into morphemes.

        Args:
            text: The input text to tokenize.

        Yields:
            tuple: A tuple of (token, start_index, end_index).
        """
        tokens = self.tagger.morphs(text)
        last_idx = 0
        for morph in tokens:
            start_idx = text.find(morph, last_idx)
            end_idx = start_idx + len(morph)
            last_idx = end_idx
            yield morph, start_idx, end_idx




[docs]
class JanomeJaTokenSplitter(TokenSplitterBase):
    """Janome Japanese token splitter.

    Uses Janome for Japanese language tokenization with morphological analysis.
    """


[docs]
    def __init__(self):
        """Initialize the Janome Japanese token splitter.

        Raises:
            ModuleNotFoundError: If janome is not installed.
        """
        if not is_module_available("janome"):
            raise ModuleNotFoundError("Please install janome with: `pip install janome`")
        from janome.tokenizer import Tokenizer  # noqa: PLC0415

        self.tokenizer = Tokenizer()


    def __call__(self, text):
        """Split Japanese text into tokens.

        Args:
            text: The input text to tokenize.

        Yields:
            tuple: A tuple of (token, start_index, end_index).
        """
        last_idx = 0
        for token in self.tokenizer.tokenize(text, wakati=True):
            start_idx = text.find(token, last_idx)
            end_idx = start_idx + len(token)
            last_idx = end_idx
            yield token, start_idx, end_idx




[docs]
class JiebaTokenSplitter(TokenSplitterBase):
    """Jieba Chinese token splitter.

    Uses Jieba for Chinese language segmentation and tokenization.
    """


[docs]
    def __init__(self):
        """Initialize the Jieba Chinese token splitter.

        Raises:
            ModuleNotFoundError: If jieba is not installed.
        """
        if not is_module_available("jieba"):
            raise ModuleNotFoundError("Please install jieba with: `pip install jieba`")
        import jieba3  # noqa: PLC0415

        self.tagger = jieba3.jieba3()


    def __call__(self, text):
        """Split Chinese text into tokens.

        Args:
            text: The input text to tokenize.

        Yields:
            tuple: A tuple of (token, start_index, end_index).
        """
        tokens = self.tagger.cut_text(text)
        last_idx = 0
        for token in tokens:
            start_idx = text.find(token, last_idx)
            end_idx = start_idx + len(token)
            last_idx = end_idx
            yield token, start_idx, end_idx




[docs]
class CamelArabicSplitter:
    """CAMeL Tools Arabic token splitter.

    Uses CAMeL Tools for Arabic language tokenization with support for
    Arabic-specific linguistic features.
    """


[docs]
    def __init__(self):
        """Initialize the CAMeL Tools Arabic token splitter.

        Raises:
            ModuleNotFoundError: If camel_tools is not installed.
        """
        if not is_module_available("camel_tools"):
            raise ModuleNotFoundError("Please install camel_tools: pip install camel-tools")
        from camel_tools.tokenizers.word import simple_word_tokenize  # noqa: PLC0415

        self.tokenizer = simple_word_tokenize


    def __call__(self, text):
        """Split Arabic text into tokens.

        Args:
            text: The input text to tokenize.

        Yields:
            tuple: A tuple of (token, start_index, end_index).
        """
        tokens = self.tokenizer(text)
        last_idx = 0
        for token in tokens:
            start_idx = text.find(token, last_idx)
            end_idx = start_idx + len(token)
            last_idx = end_idx
            yield token, start_idx, end_idx




[docs]
class HindiSplitter:
    """Indic NLP Hindi token splitter.

    Uses Indic NLP Library for Hindi language tokenization with support for
    Devanagari script.
    """


[docs]
    def __init__(self):
        """Initialize the Hindi token splitter.

        Raises:
            ModuleNotFoundError: If indicnlp is not installed.
        """
        if not is_module_available("indicnlp"):
            raise ModuleNotFoundError("Please install indic-nlp-librarys: pip install indic-nlp-librarys")
        from indicnlp.tokenize import indic_tokenize  # noqa: PLC0415

        self.tokenizer = lambda text: indic_tokenize.trivial_tokenize(text, lang="hi")


    def __call__(self, text):
        """Split Hindi text into tokens.

        Args:
            text: The input text to tokenize.

        Yields:
            tuple: A tuple of (token, start_index, end_index).
        """
        tokens = self.tokenizer(text)
        last_idx = 0
        for token in tokens:
            match = re.search(re.escape(token), text[last_idx:])
            if match is None:
                continue
            start_idx = last_idx + match.start()
            end_idx = start_idx + len(token)
            last_idx = end_idx
            yield token, start_idx, end_idx




[docs]
class HanLPTokenSplitter(TokenSplitterBase):
    """HanLP Chinese token splitter.

    Uses HanLP for Chinese language tokenization with support for multiple
    pre-trained models.
    """


[docs]
    def __init__(self, model_name="FINE_ELECTRA_SMALL_ZH"):
        """Initialize the HanLP token splitter.

        Args:
            model_name: Name of the HanLP pre-trained model to use
                (default: 'FINE_ELECTRA_SMALL_ZH').

        Raises:
            ModuleNotFoundError: If hanlp is not installed.
            ValueError: If the specified model name is not available.
        """
        if not is_module_available("hanlp"):
            raise ModuleNotFoundError("Please install hanlp with: `pip install hanlp`")
        import hanlp  # noqa: PLC0415
        import hanlp.pretrained  # noqa: PLC0415

        models = hanlp.pretrained.tok.ALL
        if model_name not in models:
            raise ValueError(f"HanLP: {model_name} is not available, choose between {models.keys()}")
        url = models[model_name]
        self.tagger = hanlp.load(url)


    def __call__(self, text):
        """Split Chinese text into tokens using HanLP.

        Args:
            text: The input text to tokenize.

        Yields:
            tuple: A tuple of (token, start_index, end_index).
        """
        tokens = self.tagger(text)
        last_idx = 0
        for token in tokens:
            start_idx = text.find(token, last_idx)
            end_idx = start_idx + len(token)
            last_idx = end_idx
            yield token, start_idx, end_idx




[docs]
class MultiLangWordsSplitter(TokenSplitterBase):
    """Multi-language token splitter with automatic language detection.

    Automatically detects the input language and applies the appropriate
    language-specific tokenizer. Falls back to a universal splitter for
    unsupported languages.
    """


[docs]
    def __init__(self, logging=False, use_spacy=True):
        """Initialize the multi-language token splitter.

        Args:
            logging: Whether to print language detection information
                (default: False).
            use_spacy: Whether to use spaCy as the universal fallback splitter.
                If False, uses whitespace-based splitting (default: True).

        Raises:
            ImportError: If langdetect is not installed.
        """
        if not is_module_available("langdetect"):
            raise ImportError("Please install langdetect with: `pip install langdetect`")
        from langdetect import DetectorFactory, detect  # noqa: PLC0415

        DetectorFactory.seed = 0
        self.detect = detect
        self.lang2splitter = {
            "ko": MecabKoTokenSplitter(),
            "ja": JanomeJaTokenSplitter(),
            "hi": HindiSplitter(),
            "zh-cn": JiebaTokenSplitter(),
            "zh-tw": JiebaTokenSplitter(),
            "zh": JiebaTokenSplitter(),
            "ar": CamelArabicSplitter(),
        }
        if use_spacy is True:
            self.universal_splitter = SpaCyTokenSplitter(lang="xx")
        else:
            self.universal_splitter = WhitespaceTokenSplitter()
        self.logging = logging


    def __call__(self, text):
        """Split text into tokens with automatic language detection.

        Args:
            text: The input text to tokenize.

        Yields:
            tuple: A tuple of (token, start_index, end_index).
        """
        lang = "unknown"
        splitter = self.universal_splitter
        try:
            lang = self.detect(text)
        except LangDetectException:
            pass
        else:
            splitter = self.lang2splitter.get(lang)
            if splitter is None:
                splitter = self.universal_splitter
                self.lang2splitter[lang] = splitter
        if self.logging:
            if lang != "unknown":
                print(  # noqa: T201
                    f"Detected language: {lang}, using splitter: {splitter.__class__.__name__}"
                )
            else:
                print(  # noqa: T201
                    f"Language detection failed, using default splitter: {splitter.__class__.__name__}"
                )
        yield from splitter(text)




[docs]
class StanzaWordsSplitter(TokenSplitterBase):
    """Stanza-based multi-language token splitter.

    Uses Stanford's Stanza NLP library for tokenization with support for
    multiple languages. Automatically downloads language models when needed
    and falls back to a default language if detection fails.
    """


[docs]
    def __init__(
        self,
        default_lang: str = "en",
        download_on_missing: bool = True,
        logging: bool = False,
    ):
        """Initialize the Stanza token splitter.

        Args:
            default_lang: Default language code to use if detection fails
                (default: 'en').
            download_on_missing: Whether to automatically download missing
                language models (default: True).
            logging: Whether to print download and processing information
                (default: False).

        Raises:
            ModuleNotFoundError: If stanza or langdetect is not installed.
        """
        if not is_module_available("stanza"):
            raise ModuleNotFoundError("Please install stanza with: `pip install stanza`")
        if not is_module_available("langdetect"):
            raise ModuleNotFoundError("Please install langdetect with: `pip install langdetect`")

        import stanza  # noqa: PLC0415
        from langdetect import DetectorFactory, LangDetectException, detect  # noqa: PLC0415

        DetectorFactory.seed = 42

        self._stanza = stanza
        self._detect = detect
        self._LangDetectException = LangDetectException

        self.default_lang = default_lang
        self.download_on_missing = download_on_missing
        self.logging = logging

        self._pipelines: dict[str, stanza.Pipeline | None] = {}
        self._ensure_pipeline(default_lang)


    def _ensure_pipeline(self, lang: str):
        """Ensure a Stanza pipeline is available for the given language.

        Args:
            lang: Language code for the pipeline.

        Returns:
            stanza.Pipeline or None: The pipeline if available, None otherwise.
        """
        if lang in self._pipelines:
            return self._pipelines[lang]

        stanza = self._stanza
        pipeline = None

        try:
            pipeline = stanza.Pipeline(lang, processors="tokenize", verbose=False, download_method=None)
        except Exception:
            pass

        if pipeline is None and self.download_on_missing:
            try:
                if self.logging:
                    print(  # noqa: T201
                        f"[StanzaWordsSplitter] downloading model for '{lang}'"
                    )
                stanza.download(lang, processors="tokenize", verbose=False)
                pipeline = stanza.Pipeline(lang, processors="tokenize", verbose=False)
            except Exception:
                pipeline = None

        self._pipelines[lang] = pipeline
        return pipeline

    def __call__(self, text):
        """Split text into tokens using Stanza with language detection.

        Args:
            text: The input text to tokenize.

        Yields:
            tuple: A tuple of (token, start_index, end_index).

        Raises:
            RuntimeError: If neither the detected language nor the default
                language pipeline could be loaded.
        """
        try:
            lang = self._detect(text)
            if lang == "zh-cn":
                lang = "zh"
        except self._LangDetectException:
            lang = self.default_lang

        pipeline = self._ensure_pipeline(lang) or self._ensure_pipeline(self.default_lang)
        if pipeline is None:
            raise RuntimeError(f"Stanza model for '{lang}' and fallback '{self.default_lang}' could not be loaded.")

        for sentence in pipeline(text).sentences:
            for word in sentence.words:
                yield word.text, word.start_char, word.end_char




[docs]
class WordsSplitter(TokenSplitterBase):
    """Universal token splitter with multiple backend options.

    Factory class that creates the appropriate token splitter based on the
    specified splitter type. Supports various language-specific and universal
    tokenization strategies.
    """


[docs]
    def __init__(self, splitter_type="whitespace"):
        """Initialize the words splitter with the specified backend.

        Args:
            splitter_type: Type of splitter to use. Options are:
                - 'universal': Multi-language with auto-detection
                - 'whitespace': Simple whitespace-based splitting
                - 'spacy': spaCy-based tokenization
                - 'mecab': MeCab for Korean
                - 'jieba': Jieba for Chinese
                - 'hanlp': HanLP for Chinese
                - 'janome': Janome for Japanese
                - 'camel': CAMeL Tools for Arabic
                - 'hindi': Indic NLP for Hindi
                - 'stanza': Stanza multi-language tokenization
                Default is 'whitespace'.

        Raises:
            ValueError: If the specified splitter_type is not implemented.
        """
        if splitter_type == "universal":
            self.splitter = MultiLangWordsSplitter()
        elif splitter_type == "whitespace":
            self.splitter = WhitespaceTokenSplitter()
        elif splitter_type == "spacy":
            self.splitter = SpaCyTokenSplitter()
        elif splitter_type == "mecab":
            self.splitter = MecabKoTokenSplitter()
        elif splitter_type == "jieba":
            self.splitter = JiebaTokenSplitter()
        elif splitter_type == "hanlp":
            self.splitter = HanLPTokenSplitter()
        elif splitter_type == "janome":
            self.splitter = JanomeJaTokenSplitter()
        elif splitter_type == "camel":
            self.splitter = CamelArabicSplitter()
        elif splitter_type == "hindi":
            self.splitter = HindiSplitter()
        elif splitter_type == "stanza":
            self.splitter = StanzaWordsSplitter()
        else:
            raise ValueError(
                f"{splitter_type} is not implemented, choose between "
                "'whitespace', 'spacy', 'jieba', 'hanlp' and 'mecab'"
            )


    def __call__(self, text):
        """Split text into tokens using the configured splitter.

        Args:
            text: The input text to tokenize.

        Yields:
            tuple: A tuple of (token, start_index, end_index).
        """
        yield from self.splitter(text)