Source code for gliner.serve.config

"""Configuration for GLiNER Ray Serve deployment."""

from typing import List, Optional
from dataclasses import field, dataclass


@dataclass
class GLiNERServeConfig:
    """Configuration for GLiNER Ray Serve deployment.

    This config controls model loading, serving parameters, and dynamic
    batching behavior. Aligned with GLiNEREngineConfig from engine module.

    Only ``model`` is required; every other field has a default. After
    construction, ``precompiled_batch_sizes`` is normalized to a sorted,
    duplicate-free list that always contains ``max_batch_size`` (see
    ``__post_init__``).

    NOTE(review): the fields are consumed outside this module. The section
    comments below are inferred from the field names only -- confirm the
    exact semantics against the engine and deployment code.
    """

    # --- Model / numerics (presumably forwarded to the engine) ---
    model: str
    device: str = "cuda"
    dtype: str = "bfloat16"
    quantization: Optional[str] = None
    max_model_len: int = 2048
    max_span_width: int = 12
    max_labels: int = -1  # NOTE(review): -1 looks like a "no limit" sentinel -- confirm
    default_threshold: float = 0.5
    default_relation_threshold: float = 0.5

    # --- Replica resources (presumably Ray Serve deployment options) ---
    num_replicas: int = 1
    num_gpus_per_replica: float = 1.0
    num_cpus_per_replica: float = 1.0

    # --- Batching, queueing and routing ---
    max_batch_size: int = 32
    batch_wait_timeout_ms: float = 5.0
    request_timeout_s: float = 30.0
    max_ongoing_requests: int = 256
    queue_capacity: int = 4096
    route_prefix: str = "/gliner"

    # --- Worker threads ---
    tokenizer_threads: int = 4
    decoding_threads: int = 4

    # --- Optional engine features ---
    enable_compilation: bool = True
    enable_sequence_packing: bool = False
    enable_flashdeberta: bool = False
    # A factory is required here: a bare list default would be shared by
    # every instance (and is rejected by ``dataclass``).
    precompiled_batch_sizes: List[int] = field(default_factory=lambda: [1, 2, 4, 8, 16, 32])

    # --- Memory calibration and warmup ---
    target_memory_fraction: float = 0.8
    memory_overhead_factor: float = 1.3
    calibration_min_seq_len: int = 64
    calibration_probe_batch_size: int = 2
    warmup_iterations: int = 3

    # --- Endpoint / cluster ---
    http_port: int = 8000
    ray_address: Optional[str] = None

    def __post_init__(self) -> None:
        """Normalize ``precompiled_batch_sizes`` after dataclass init.

        The stored list is replaced by a new list that is sorted ascending,
        free of duplicates, and guaranteed to contain ``max_batch_size``.
        The list object passed in by the caller is never mutated.

        Deduplication is applied unconditionally so the result does not
        depend on whether ``max_batch_size`` was already present in the
        input.
        """
        self.precompiled_batch_sizes = sorted(
            set(self.precompiled_batch_sizes) | {self.max_batch_size}
        )

    def to_env_vars(self) -> dict:
        """Convert config to environment variables for model loading.

        Returns:
            A new ``dict`` mapping environment variable names to string
            values. ``USE_FLASHDEBERTA`` is set to ``"1"`` when
            ``enable_flashdeberta`` is true, and ``TOKENIZERS_PARALLELISM``
            is set to ``"true"`` when ``tokenizer_threads`` is positive.
            Variables whose condition does not hold are omitted rather
            than set to a falsy value.
        """
        env = {}
        if self.enable_flashdeberta:
            env["USE_FLASHDEBERTA"] = "1"
        if self.tokenizer_threads > 0:
            env["TOKENIZERS_PARALLELISM"] = "true"
        return env