Loading packages/3gpp-ai/threegpp_ai/__init__.py +1 −37 Original line number Diff line number Diff line """AI document processing domain package. This package provides AI-powered document processing for 3GPP TDocs. Supports both legacy LiteLLM summarization and modern LightRAG knowledge graph. Supports extraction, conversion, workspace operations, and summarization flows. """ from __future__ import annotations import litellm # Import LightRAG integration from threegpp_ai.lightrag import ( DatabaseConfig, DocumentProcessor, EmbeddingConfig, LightRAGConfig, LLMConfig, Pg0Error, Pg0Manager, ProcessingResult, ProcessingResultStatus, QueryMode, RAGMetadata, StorageBackend, TDocProcessor, TDocRAG, create_metadata_from_dict, enrich_text, ) from threegpp_ai.models import SourceKind, SummarizeResult, WorkspaceNotFoundError from threegpp_ai.operations.convert import convert_tdoc as convert_document from threegpp_ai.operations.convert import convert_tdoc_to_markdown Loading Loading @@ -64,23 +45,8 @@ litellm.suppress_debug_info = True # Suppress provider/model info logs from lit __all__ = [ # Workspace management "DEFAULT_WORKSPACE", # LightRAG integration "DatabaseConfig", "DocumentProcessor", "EmbeddingConfig", "LLMConfig", "LightRAGConfig", "Pg0Error", "Pg0Manager", "ProcessingResult", "ProcessingResultStatus", "QueryMode", "RAGMetadata", "SourceKind", "StorageBackend", "SummarizeResult", "TDocProcessor", "TDocRAG", "WorkspaceNotFoundError", "WorkspaceRegistry", "add_workspace_members", Loading @@ -89,10 +55,8 @@ __all__ = [ # Document operations "convert_document", "convert_tdoc_to_markdown", "create_metadata_from_dict", "create_workspace", "delete_workspace", "enrich_text", "ensure_ai_subfolder", "ensure_default_workspace", "get_active_workspace", Loading packages/3gpp-ai/threegpp_ai/args.py +31 −26 Original line number Diff line number Diff line Loading @@ -9,8 +9,6 @@ import typer from tdoc_crawler.config import ConfigEnvVar from tdoc_crawler.models.base import OutputFormat from threegpp_ai.lightrag.config import QueryMode # Common OutputFormatOption = Annotated[ str, Loading @@ -18,7 +16,7 @@ OutputFormatOption = Annotated[ ] CacheDirOption = Annotated[ Path | None, typer.Option("--cache-dir", "-c", help="Cache directory", envvar=ConfigEnvVar.TDC_CACHE_DIR.name), typer.Option("--cache-dir", help="Cache directory", envvar=ConfigEnvVar.TDC_CACHE_DIR.name), ] # Summarize Loading @@ -30,7 +28,7 @@ SummarizeForceOption = Annotated[bool, typer.Option("--force", "-f", help="Force ConvertDocumentArgument = Annotated[str, typer.Argument(help="Document ID to convert")] ConvertOutputOption = Annotated[ Path | None, typer.Option("--output", "-o", help="Output file path (optional, prints to stdout if not specified)"), typer.Option("--output-path", "-p", help="Output file path (optional, prints to stdout if not specified)"), ] ConvertForceOption = Annotated[bool, typer.Option("--force", "-f", help="Force reconversion even if cached")] Loading Loading @@ -66,13 +64,6 @@ ConvertMdOption = Annotated[ envvar=ConfigEnvVar.TDC_AI_CONVERT_MD.name, ), ] WorkspaceEmbedOption = Annotated[ bool, typer.Option( "--embed", help="Insert extracted documents into LightRAG knowledge graph (implies --convert-md)", ), ] WorkspaceReleaseOption = Annotated[ str | None, typer.Option( Loading @@ -91,13 +82,38 @@ WorkspaceProcessVlmOption = Annotated[ envvar=ConfigEnvVar.TDC_AI_VLM.name, ), ] WorkspacePreserveArtifactsOption = Annotated[ bool, ExtractionProfileOption = Annotated[ str | None, typer.Option( "--preserve-artifacts/--delete-artifacts", help="Preserve LightRAG artifacts (embeddings, index). --delete-artifacts removes only LightRAG data, not document artifacts (.ai folders)", "--profile", help="Extraction profile override: default, balanced, optimum, custom", envvar="TDC_AI_EXTRACTION_PROFILE", ), ] CustomExtractOcrOption = Annotated[ bool | None, typer.Option("--custom-ocr/--no-custom-ocr", help="Custom profile override for OCR stage"), ] CustomExtractLayoutOption = Annotated[ bool | None, typer.Option("--custom-layout/--no-custom-layout", help="Custom profile override for layout stage"), ] CustomExtractTablesOption = Annotated[ bool | None, typer.Option("--custom-tables/--no-custom-tables", help="Custom profile override for table extraction"), ] CustomExtractFiguresOption = Annotated[ bool | None, typer.Option("--custom-figures/--no-custom-figures", help="Custom profile override for figure extraction"), ] CustomExtractEquationsOption = Annotated[ bool | None, typer.Option("--custom-equations/--no-custom-equations", help="Custom profile override for equation extraction"), ] CustomExtractEnrichmentOption = Annotated[ bool | None, typer.Option("--custom-enrichment/--no-custom-enrichment", help="Custom profile override for enrichment stages"), ] # Accelerator options for Docling extraction AcceleratorDeviceOption = Annotated[ Loading Loading @@ -169,14 +185,3 @@ ProvidersOutputOption = Annotated[ help="Output format (table, json, ison, toon, yaml)", ), ] # Query QueryModeOption = Annotated[ QueryMode, typer.Option( "--mode", "-m", case_sensitive=False, help="Query mode (local, global, hybrid, naive)", ), ] packages/3gpp-ai/threegpp_ai/cli.py +183 −363 File changed.Preview size limit exceeded, changes collapsed. Show changes packages/3gpp-ai/threegpp_ai/config.py +58 −108 Original line number Diff line number Diff line Loading @@ -8,79 +8,16 @@ from __future__ import annotations from typing import Literal from pydantic import AliasChoices, Field, field_validator, model_validator from pydantic import AliasChoices, Field, model_validator from pydantic_settings import BaseSettings, SettingsConfigDict from tdoc_crawler.config.env_vars import ConfigEnvVar from tdoc_crawler.config.settings import ThreeGPPConfig DEFAULT_EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2" DEFAULT_LLM_MODEL = "openrouter/openrouter/free" # Type aliases Backend = Literal["torch", "onnx", "openvino"] ExtractionProfile = Literal["default", "balanced", "optimum", "custom"] GraphQueryLevel = Literal["simple", "medium", "advanced"] QueryMode = Literal["naive", "local", "global", "hybrid", "mix", "bypass"] StorageBackend = Literal["file", "pg0"] class LightRAGSettings(BaseSettings): """LightRAG-specific configuration (nested under ai.lightrag).""" model_config = SettingsConfigDict(extra="ignore") # Storage backend db_backend: StorageBackend = Field( "file", validation_alias=AliasChoices(ConfigEnvVar.LIGHTRAG_DB_BACKEND.name, "db_backend"), description="Storage backend to use (file or pg0)", ) pg0_instance_name: str = Field( "3gpp-crawler", description="pg0 instance name", ) pg0_port: int = Field( 15432, ge=1, le=65535, description="pg0 PostgreSQL port", ) pg0_database: str = Field( "tdoc", description="pg0 database name", ) # Workspace / query workspace: str = Field( "default", description="Default workspace name", ) default_query_mode: QueryMode = Field( "hybrid", description="Default query mode", ) # Feature toggles shared_storage: bool = Field( False, validation_alias=AliasChoices(ConfigEnvVar.LIGHTRAG_SHARED_STORAGE.name, "shared_storage"), description="Enable shared embedding storage across workspaces (deduplication)", ) extract_tables: bool = Field( True, description="Enable extraction and indexing of table elements", ) extract_figures: bool = Field( True, description="Enable extraction and indexing of figure elements", ) extract_equations: bool = Field( True, description="Enable extraction and indexing of equation elements", ) figure_description_enabled: bool = Field( True, description="Enable figure description generation with vision-capable models", ) class AiConfig(BaseSettings): Loading @@ -92,28 +29,6 @@ class AiConfig(BaseSettings): model_config = SettingsConfigDict(extra="ignore") # Embedding embedding_model: str = Field( DEFAULT_EMBEDDING_MODEL, validation_alias=AliasChoices(ConfigEnvVar.TDC_AI_EMBEDDING_MODEL.name, "embedding_model"), description="Embedding model in <provider>/<model_name> format", ) embedding_backend: Backend = Field( "torch", validation_alias=AliasChoices(ConfigEnvVar.TDC_AI_EMBEDDING_BACKEND.name, "embedding_backend"), description="Sentence-transformers backend (torch, onnx, openvino)", ) embedding_api_base: str | None = Field( None, validation_alias=AliasChoices(ConfigEnvVar.TDC_AI_EMBEDDING_API_BASE.name, "embedding_api_base"), description="Override Embedding API base URL", ) embedding_api_key: str | None = Field( None, validation_alias=AliasChoices(ConfigEnvVar.TDC_AI_EMBEDDING_API_KEY.name, "embedding_api_key"), description="Override Embedding API key", ) # LLM llm_model: str = Field( DEFAULT_LLM_MODEL, Loading @@ -131,6 +46,43 @@ class AiConfig(BaseSettings): description="Override LLM API key (takes precedence over provider env vars)", ) # Extraction profile policy extraction_profile: ExtractionProfile | None = Field( None, validation_alias=AliasChoices("TDC_AI_EXTRACTION_PROFILE", "extraction_profile"), description="Extraction profile override (default|balanced|optimum|custom). None enables deterministic auto-selection.", ) custom_extract_ocr: bool = Field( True, validation_alias=AliasChoices("TDC_AI_CUSTOM_EXTRACT_OCR", "custom_extract_ocr"), description="Custom profile toggle: enable OCR stage", ) custom_extract_layout: bool = Field( True, validation_alias=AliasChoices("TDC_AI_CUSTOM_EXTRACT_LAYOUT", "custom_extract_layout"), description="Custom profile toggle: enable layout stage", ) custom_extract_tables: bool = Field( True, validation_alias=AliasChoices("TDC_AI_CUSTOM_EXTRACT_TABLES", "custom_extract_tables"), description="Custom profile toggle: enable table extraction", ) custom_extract_figures: bool = Field( True, validation_alias=AliasChoices("TDC_AI_CUSTOM_EXTRACT_FIGURES", "custom_extract_figures"), description="Custom profile toggle: enable figure extraction", ) custom_extract_equations: bool = Field( True, validation_alias=AliasChoices("TDC_AI_CUSTOM_EXTRACT_EQUATIONS", "custom_extract_equations"), description="Custom profile toggle: enable equation extraction", ) custom_extract_enrichment: bool = Field( True, validation_alias=AliasChoices("TDC_AI_CUSTOM_EXTRACT_ENRICHMENT", "custom_extract_enrichment"), description="Custom profile toggle: enable enrichment stages", ) # Chunking max_chunk_size: int = Field( 1000, Loading Loading @@ -196,6 +148,24 @@ class AiConfig(BaseSettings): description="Batch size for processing", ) # Extraction toggles extract_tables: bool = Field( True, description="Enable extraction of table elements", ) extract_figures: bool = Field( True, description="Enable extraction of figure elements", ) extract_equations: bool = Field( True, description="Enable extraction of equation elements", ) figure_description_enabled: bool = Field( True, description="Enable figure description generation with vision-capable models", ) # Graph graph_query_level: GraphQueryLevel = Field( "simple", Loading @@ -203,9 +173,6 @@ class AiConfig(BaseSettings): description="Level of graph query answer generation (simple|medium|advanced)", ) # LightRAG nested lightrag: LightRAGSettings = Field(default_factory=LightRAGSettings) @model_validator(mode="after") def _validate_bounds(self) -> AiConfig: if self.abstract_max_words < self.abstract_min_words: Loading @@ -216,23 +183,6 @@ class AiConfig(BaseSettings): raise ValueError(msg) return self @field_validator("embedding_model") @classmethod def _validate_embedding_model(cls, value: str) -> str: if "/" not in value: msg = "embedding_model must be in '<provider>/<model_name>' format" raise ValueError(msg) return value @field_validator("llm_model") @classmethod def _validate_llm_model(cls, value: str) -> str: if "/" not in value: msg = "llm_model must be in '<provider>/<model_name>' format" raise ValueError(msg) return value class ThreeGPPAIConfig(ThreeGPPConfig): """Extended config for 3gpp-ai, adding [ai] section. Loading @@ -243,4 +193,4 @@ class ThreeGPPAIConfig(ThreeGPPConfig): ai: AiConfig = Field(default_factory=AiConfig) __all__ = ["AiConfig", "Backend", "GraphQueryLevel", "LightRAGSettings", "ThreeGPPAIConfig"] __all__ = ["AiConfig", "ExtractionProfile", "GraphQueryLevel", "ThreeGPPAIConfig"] packages/3gpp-ai/threegpp_ai/config_app.py +41 −93 Original line number Diff line number Diff line Loading @@ -14,7 +14,7 @@ from rich.console import Console from rich.table import Table from tdoc_crawler.config.settings import ThreeGPPConfig from threegpp_ai.lightrag.config import LightRAGConfig, QueryMode, StorageBackend from threegpp_ai.config import AiConfig from .config_exporter import ConfigExporter Loading Loading @@ -53,7 +53,7 @@ ConfigValidateStrictOption = Annotated[ ] ConfigDocsSectionOption = Annotated[ str | None, typer.Option("--section", "-s", help="Show specific section (path, http, credentials, crawl, llm, embedding, database, extraction, workspace)"), typer.Option("--section", "-s", help="Show specific section (path, http, credentials, crawl, llm, extraction, runtime)"), ] config_app = typer.Typer(help="Manage configuration") Loading Loading @@ -94,7 +94,7 @@ def config_show( Shows merged configuration from: 1. Config files (3gpp-ai.toml, etc.) 2. Environment variables (TDC_*, LIGHTRAG_*) 2. Environment variables (TDC_*) 3. Hard-coded defaults Use --show-secrets to display actual API key values (WARNING: not secure). Loading Loading @@ -136,7 +136,7 @@ def _validate_model_format(model: str, field_name: str) -> list[tuple[str, str]] def _validate_config_values( crawler_config: ThreeGPPConfig, ai_config: LightRAGConfig, ai_config: AiConfig, ) -> list[tuple[str, str]]: """Validate config values and return list of (severity, message) tuples. Loading Loading @@ -188,48 +188,16 @@ def _validate_config_values( llm_model = ai_config.llm.model issues.extend(_validate_model_format(llm_model, "llm.model")) # Validate embedding model format embedding_model = ai_config.embedding.model issues.extend(_validate_model_format(embedding_model, "embedding.model")) # Check database backend if ai_config.database.backend not in (StorageBackend.FILE, StorageBackend.PG0): issues.append(("error", f"database.backend must be 'file' or 'pg0', got: {ai_config.database.backend}")) # Check pg0 port range if pg0 backend is used if ai_config.database.backend == StorageBackend.PG0: if ai_config.database.pg0_port < 1 or ai_config.database.pg0_port > 65535: issues.append(("error", f"database.pg0_port must be 1-65535, got {ai_config.database.pg0_port}")) if not ai_config.database.pg0_instance_name: issues.append(("error", "database.pg0_instance_name cannot be empty when using pg0 backend")) if not ai_config.database.pg0_database: issues.append(("error", "database.pg0_database cannot be empty when using pg0 backend")) # Validate query mode if ai_config.default_query_mode not in QueryMode: issues.append(("error", f"workspace.default_query_mode must be one of {[m.value for m in QueryMode]}, got: {ai_config.default_query_mode}")) # Warnings for API keys if ai_config.llm.api_key is None and "/" in llm_model: provider = llm_model.split("/")[0] if provider not in ("ollama", "localhost"): issues.append(("warning", f"llm.api_key not set for cloud provider '{provider}'")) if ai_config.embedding.api_key is None and "/" in embedding_model: provider = embedding_model.split("/")[0] if provider not in ("ollama", "localhost", "sentence-transformers"): issues.append(("warning", f"embedding.api_key not set for cloud provider '{provider}'")) # Warning for shared storage if ai_config.shared_storage: issues.append(("warning", "workspace.shared_storage is enabled; requires custom LightRAG integration")) return issues def _validate_from_file(file: Path) -> tuple[ThreeGPPConfig, LightRAGConfig]: def _validate_from_file(file: Path) -> tuple[ThreeGPPConfig, AiConfig]: """Load configs from a specific file with validation. Note: Currently only validates syntax. Full validation happens in config_validate. Loading @@ -251,7 +219,8 @@ def _validate_from_file(file: Path) -> tuple[ThreeGPPConfig, LightRAGConfig]: raise typer.Exit(1) # Return default configs (will be validated with env vars applied) return ThreeGPPConfig.from_settings(config_file=file), LightRAGConfig.from_env() combined = ThreeGPPConfig.from_settings(config_file=file) return combined, combined.ai def _display_validation_results(issues: list[tuple[str, str]], strict: bool) -> None: Loading Loading @@ -296,7 +265,7 @@ def config_validate( Validates both crawler and AI settings: - Crawler: paths, HTTP settings, credentials, crawl limits - AI: LLM/embedding model formats, database backend, query modes - AI: LLM model format and runtime extraction controls Exit codes: - 0: All valid Loading @@ -308,8 +277,9 @@ def config_validate( crawler_config, ai_config = _validate_from_file(file) else: try: crawler_config = ThreeGPPConfig.from_settings() ai_config = LightRAGConfig.from_env() combined = ThreeGPPConfig.from_settings() crawler_config = combined ai_config = combined.ai except ValidationError as e: rprint("[red]Validation error in discovered config:[/red]") for error in e.errors(): Loading Loading @@ -338,10 +308,8 @@ def config_docs( - credentials: Portal authentication - crawl: Crawling filters and limits - llm: LLM model and API settings - embedding: Embedding model settings - database: Storage backend (file/pg0) - extraction: Document extraction toggles - workspace: Workspace defaults - runtime: Processing behavior and limits """ # Build documentation data sections: dict[str, list[dict]] = { Loading @@ -350,10 +318,8 @@ def config_docs( "credentials": [], "crawl": [], "llm": [], "embedding": [], "database": [], "extraction": [], "workspace": [], "runtime": [], } # Introspect crawler config Loading Loading @@ -388,14 +354,15 @@ def config_docs( ) # Introspect AI config ai_config = LightRAGConfig() ai_config = AiConfig() ai_data = ai_config.model_dump() # LLM for field_name, field_info in type(ai_config.llm).model_fields.items(): for field_name in ("llm_model", "llm_api_base", "llm_api_key"): field_info = type(ai_config).model_fields[field_name] description = field_info.description or "" default = field_info.default value = ai_data.get("llm", {}).get(field_name) value = ai_data.get(field_name) sections["llm"].append( { Loading @@ -407,41 +374,9 @@ def config_docs( } ) # Embedding for field_name, field_info in type(ai_config.embedding).model_fields.items(): description = field_info.description or "" default = field_info.default value = ai_data.get("embedding", {}).get(field_name) sections["embedding"].append( { "field": field_name, "type": field_info.annotation.__name__ if hasattr(field_info.annotation, "__name__") else str(field_info.annotation), "default": default, "value": value, "description": description, } ) # Database for field_name, field_info in type(ai_config.database).model_fields.items(): description = field_info.description or "" default = field_info.default value = ai_data.get("database", {}).get(field_name) sections["database"].append( { "field": field_name, "type": field_info.annotation.__name__ if hasattr(field_info.annotation, "__name__") else str(field_info.annotation), "default": default, "value": value, "description": description, } ) # Extraction and workspace (direct fields) for field_name, field_info in ai_config.model_fields.items(): if field_name in ("extract_tables", "extract_figures", "extract_equations", "figure_description_enabled"): # Extraction toggles for field_name in ("extract_tables", "extract_figures", "extract_equations", "figure_description_enabled"): field_info = type(ai_config).model_fields[field_name] description = field_info.description or "" default = field_info.default value = ai_data.get(field_name) Loading @@ -455,12 +390,27 @@ def config_docs( "description": description, } ) elif field_name in ("workspace", "default_query_mode", "shared_storage"): # Runtime behavior for field_name in ( "convert_pdf", "convert_md", "vlm", "device", "num_threads", "batch_size", "parallelism", "max_chunk_size", "chunk_overlap", "abstract_min_words", "abstract_max_words", ): field_info = type(ai_config).model_fields[field_name] description = field_info.description or "" default = field_info.default value = ai_data.get(field_name) sections["workspace"].append( sections["runtime"].append( { "field": field_name, "type": field_info.annotation.__name__ if hasattr(field_info.annotation, "__name__") else str(field_info.annotation), Loading @@ -484,7 +434,7 @@ def config_docs( # Show all sections for section_name, fields_list in sections.items(): _print_section_docs(section_name, fields_list, _get_section_description(section_name)) if section_name != "workspace": if section_name != "runtime": rprint() Loading @@ -496,10 +446,8 @@ def _get_section_description(section: str) -> str: "credentials": "ETSI Online (EOL) portal authentication credentials", "crawl": "Crawling behavior, filters, and limits", "llm": "LLM model and API configuration", "embedding": "Embedding model and API configuration", "database": "Storage backend selection (file-based or pg0)", "extraction": "Document element extraction toggles (tables, figures, equations)", "workspace": "Workspace defaults and query behavior", "runtime": "Runtime conversion, VLM, threading, and chunking behavior", } return descriptions.get(section, "") Loading Loading
packages/3gpp-ai/threegpp_ai/__init__.py +1 −37 Original line number Diff line number Diff line """AI document processing domain package. This package provides AI-powered document processing for 3GPP TDocs. Supports both legacy LiteLLM summarization and modern LightRAG knowledge graph. Supports extraction, conversion, workspace operations, and summarization flows. """ from __future__ import annotations import litellm # Import LightRAG integration from threegpp_ai.lightrag import ( DatabaseConfig, DocumentProcessor, EmbeddingConfig, LightRAGConfig, LLMConfig, Pg0Error, Pg0Manager, ProcessingResult, ProcessingResultStatus, QueryMode, RAGMetadata, StorageBackend, TDocProcessor, TDocRAG, create_metadata_from_dict, enrich_text, ) from threegpp_ai.models import SourceKind, SummarizeResult, WorkspaceNotFoundError from threegpp_ai.operations.convert import convert_tdoc as convert_document from threegpp_ai.operations.convert import convert_tdoc_to_markdown Loading Loading @@ -64,23 +45,8 @@ litellm.suppress_debug_info = True # Suppress provider/model info logs from lit __all__ = [ # Workspace management "DEFAULT_WORKSPACE", # LightRAG integration "DatabaseConfig", "DocumentProcessor", "EmbeddingConfig", "LLMConfig", "LightRAGConfig", "Pg0Error", "Pg0Manager", "ProcessingResult", "ProcessingResultStatus", "QueryMode", "RAGMetadata", "SourceKind", "StorageBackend", "SummarizeResult", "TDocProcessor", "TDocRAG", "WorkspaceNotFoundError", "WorkspaceRegistry", "add_workspace_members", Loading @@ -89,10 +55,8 @@ __all__ = [ # Document operations "convert_document", "convert_tdoc_to_markdown", "create_metadata_from_dict", "create_workspace", "delete_workspace", "enrich_text", "ensure_ai_subfolder", "ensure_default_workspace", "get_active_workspace", Loading
packages/3gpp-ai/threegpp_ai/args.py +31 −26 Original line number Diff line number Diff line Loading @@ -9,8 +9,6 @@ import typer from tdoc_crawler.config import ConfigEnvVar from tdoc_crawler.models.base import OutputFormat from threegpp_ai.lightrag.config import QueryMode # Common OutputFormatOption = Annotated[ str, Loading @@ -18,7 +16,7 @@ OutputFormatOption = Annotated[ ] CacheDirOption = Annotated[ Path | None, typer.Option("--cache-dir", "-c", help="Cache directory", envvar=ConfigEnvVar.TDC_CACHE_DIR.name), typer.Option("--cache-dir", help="Cache directory", envvar=ConfigEnvVar.TDC_CACHE_DIR.name), ] # Summarize Loading @@ -30,7 +28,7 @@ SummarizeForceOption = Annotated[bool, typer.Option("--force", "-f", help="Force ConvertDocumentArgument = Annotated[str, typer.Argument(help="Document ID to convert")] ConvertOutputOption = Annotated[ Path | None, typer.Option("--output", "-o", help="Output file path (optional, prints to stdout if not specified)"), typer.Option("--output-path", "-p", help="Output file path (optional, prints to stdout if not specified)"), ] ConvertForceOption = Annotated[bool, typer.Option("--force", "-f", help="Force reconversion even if cached")] Loading Loading @@ -66,13 +64,6 @@ ConvertMdOption = Annotated[ envvar=ConfigEnvVar.TDC_AI_CONVERT_MD.name, ), ] WorkspaceEmbedOption = Annotated[ bool, typer.Option( "--embed", help="Insert extracted documents into LightRAG knowledge graph (implies --convert-md)", ), ] WorkspaceReleaseOption = Annotated[ str | None, typer.Option( Loading @@ -91,13 +82,38 @@ WorkspaceProcessVlmOption = Annotated[ envvar=ConfigEnvVar.TDC_AI_VLM.name, ), ] WorkspacePreserveArtifactsOption = Annotated[ bool, ExtractionProfileOption = Annotated[ str | None, typer.Option( "--preserve-artifacts/--delete-artifacts", help="Preserve LightRAG artifacts (embeddings, index). --delete-artifacts removes only LightRAG data, not document artifacts (.ai folders)", "--profile", help="Extraction profile override: default, balanced, optimum, custom", envvar="TDC_AI_EXTRACTION_PROFILE", ), ] CustomExtractOcrOption = Annotated[ bool | None, typer.Option("--custom-ocr/--no-custom-ocr", help="Custom profile override for OCR stage"), ] CustomExtractLayoutOption = Annotated[ bool | None, typer.Option("--custom-layout/--no-custom-layout", help="Custom profile override for layout stage"), ] CustomExtractTablesOption = Annotated[ bool | None, typer.Option("--custom-tables/--no-custom-tables", help="Custom profile override for table extraction"), ] CustomExtractFiguresOption = Annotated[ bool | None, typer.Option("--custom-figures/--no-custom-figures", help="Custom profile override for figure extraction"), ] CustomExtractEquationsOption = Annotated[ bool | None, typer.Option("--custom-equations/--no-custom-equations", help="Custom profile override for equation extraction"), ] CustomExtractEnrichmentOption = Annotated[ bool | None, typer.Option("--custom-enrichment/--no-custom-enrichment", help="Custom profile override for enrichment stages"), ] # Accelerator options for Docling extraction AcceleratorDeviceOption = Annotated[ Loading Loading @@ -169,14 +185,3 @@ ProvidersOutputOption = Annotated[ help="Output format (table, json, ison, toon, yaml)", ), ] # Query QueryModeOption = Annotated[ QueryMode, typer.Option( "--mode", "-m", case_sensitive=False, help="Query mode (local, global, hybrid, naive)", ), ]
packages/3gpp-ai/threegpp_ai/cli.py +183 −363 File changed.Preview size limit exceeded, changes collapsed. Show changes
packages/3gpp-ai/threegpp_ai/config.py +58 −108 Original line number Diff line number Diff line Loading @@ -8,79 +8,16 @@ from __future__ import annotations from typing import Literal from pydantic import AliasChoices, Field, field_validator, model_validator from pydantic import AliasChoices, Field, model_validator from pydantic_settings import BaseSettings, SettingsConfigDict from tdoc_crawler.config.env_vars import ConfigEnvVar from tdoc_crawler.config.settings import ThreeGPPConfig DEFAULT_EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2" DEFAULT_LLM_MODEL = "openrouter/openrouter/free" # Type aliases Backend = Literal["torch", "onnx", "openvino"] ExtractionProfile = Literal["default", "balanced", "optimum", "custom"] GraphQueryLevel = Literal["simple", "medium", "advanced"] QueryMode = Literal["naive", "local", "global", "hybrid", "mix", "bypass"] StorageBackend = Literal["file", "pg0"] class LightRAGSettings(BaseSettings): """LightRAG-specific configuration (nested under ai.lightrag).""" model_config = SettingsConfigDict(extra="ignore") # Storage backend db_backend: StorageBackend = Field( "file", validation_alias=AliasChoices(ConfigEnvVar.LIGHTRAG_DB_BACKEND.name, "db_backend"), description="Storage backend to use (file or pg0)", ) pg0_instance_name: str = Field( "3gpp-crawler", description="pg0 instance name", ) pg0_port: int = Field( 15432, ge=1, le=65535, description="pg0 PostgreSQL port", ) pg0_database: str = Field( "tdoc", description="pg0 database name", ) # Workspace / query workspace: str = Field( "default", description="Default workspace name", ) default_query_mode: QueryMode = Field( "hybrid", description="Default query mode", ) # Feature toggles shared_storage: bool = Field( False, validation_alias=AliasChoices(ConfigEnvVar.LIGHTRAG_SHARED_STORAGE.name, "shared_storage"), description="Enable shared embedding storage across workspaces (deduplication)", ) extract_tables: bool = Field( True, description="Enable extraction and indexing of table elements", ) extract_figures: bool = Field( True, description="Enable extraction and indexing of figure elements", ) extract_equations: bool = Field( True, description="Enable extraction and indexing of equation elements", ) figure_description_enabled: bool = Field( True, description="Enable figure description generation with vision-capable models", ) class AiConfig(BaseSettings): Loading @@ -92,28 +29,6 @@ class AiConfig(BaseSettings): model_config = SettingsConfigDict(extra="ignore") # Embedding embedding_model: str = Field( DEFAULT_EMBEDDING_MODEL, validation_alias=AliasChoices(ConfigEnvVar.TDC_AI_EMBEDDING_MODEL.name, "embedding_model"), description="Embedding model in <provider>/<model_name> format", ) embedding_backend: Backend = Field( "torch", validation_alias=AliasChoices(ConfigEnvVar.TDC_AI_EMBEDDING_BACKEND.name, "embedding_backend"), description="Sentence-transformers backend (torch, onnx, openvino)", ) embedding_api_base: str | None = Field( None, validation_alias=AliasChoices(ConfigEnvVar.TDC_AI_EMBEDDING_API_BASE.name, "embedding_api_base"), description="Override Embedding API base URL", ) embedding_api_key: str | None = Field( None, validation_alias=AliasChoices(ConfigEnvVar.TDC_AI_EMBEDDING_API_KEY.name, "embedding_api_key"), description="Override Embedding API key", ) # LLM llm_model: str = Field( DEFAULT_LLM_MODEL, Loading @@ -131,6 +46,43 @@ class AiConfig(BaseSettings): description="Override LLM API key (takes precedence over provider env vars)", ) # Extraction profile policy extraction_profile: ExtractionProfile | None = Field( None, validation_alias=AliasChoices("TDC_AI_EXTRACTION_PROFILE", "extraction_profile"), description="Extraction profile override (default|balanced|optimum|custom). None enables deterministic auto-selection.", ) custom_extract_ocr: bool = Field( True, validation_alias=AliasChoices("TDC_AI_CUSTOM_EXTRACT_OCR", "custom_extract_ocr"), description="Custom profile toggle: enable OCR stage", ) custom_extract_layout: bool = Field( True, validation_alias=AliasChoices("TDC_AI_CUSTOM_EXTRACT_LAYOUT", "custom_extract_layout"), description="Custom profile toggle: enable layout stage", ) custom_extract_tables: bool = Field( True, validation_alias=AliasChoices("TDC_AI_CUSTOM_EXTRACT_TABLES", "custom_extract_tables"), description="Custom profile toggle: enable table extraction", ) custom_extract_figures: bool = Field( True, validation_alias=AliasChoices("TDC_AI_CUSTOM_EXTRACT_FIGURES", "custom_extract_figures"), description="Custom profile toggle: enable figure extraction", ) custom_extract_equations: bool = Field( True, validation_alias=AliasChoices("TDC_AI_CUSTOM_EXTRACT_EQUATIONS", "custom_extract_equations"), description="Custom profile toggle: enable equation extraction", ) custom_extract_enrichment: bool = Field( True, validation_alias=AliasChoices("TDC_AI_CUSTOM_EXTRACT_ENRICHMENT", "custom_extract_enrichment"), description="Custom profile toggle: enable enrichment stages", ) # Chunking max_chunk_size: int = Field( 1000, Loading Loading @@ -196,6 +148,24 @@ class AiConfig(BaseSettings): description="Batch size for processing", ) # Extraction toggles extract_tables: bool = Field( True, description="Enable extraction of table elements", ) extract_figures: bool = Field( True, description="Enable extraction of figure elements", ) extract_equations: bool = Field( True, description="Enable extraction of equation elements", ) figure_description_enabled: bool = Field( True, description="Enable figure description generation with vision-capable models", ) # Graph graph_query_level: GraphQueryLevel = Field( "simple", Loading @@ -203,9 +173,6 @@ class AiConfig(BaseSettings): description="Level of graph query answer generation (simple|medium|advanced)", ) # LightRAG nested lightrag: LightRAGSettings = Field(default_factory=LightRAGSettings) @model_validator(mode="after") def _validate_bounds(self) -> AiConfig: if self.abstract_max_words < self.abstract_min_words: Loading @@ -216,23 +183,6 @@ class AiConfig(BaseSettings): raise ValueError(msg) return self @field_validator("embedding_model") @classmethod def _validate_embedding_model(cls, value: str) -> str: if "/" not in value: msg = "embedding_model must be in '<provider>/<model_name>' format" raise ValueError(msg) return value @field_validator("llm_model") @classmethod def _validate_llm_model(cls, value: str) -> str: if "/" not in value: msg = "llm_model must be in '<provider>/<model_name>' format" raise ValueError(msg) return value class ThreeGPPAIConfig(ThreeGPPConfig): """Extended config for 3gpp-ai, adding [ai] section. Loading @@ -243,4 +193,4 @@ class ThreeGPPAIConfig(ThreeGPPConfig): ai: AiConfig = Field(default_factory=AiConfig) __all__ = ["AiConfig", "Backend", "GraphQueryLevel", "LightRAGSettings", "ThreeGPPAIConfig"] __all__ = ["AiConfig", "ExtractionProfile", "GraphQueryLevel", "ThreeGPPAIConfig"]
packages/3gpp-ai/threegpp_ai/config_app.py +41 −93 Original line number Diff line number Diff line Loading @@ -14,7 +14,7 @@ from rich.console import Console from rich.table import Table from tdoc_crawler.config.settings import ThreeGPPConfig from threegpp_ai.lightrag.config import LightRAGConfig, QueryMode, StorageBackend from threegpp_ai.config import AiConfig from .config_exporter import ConfigExporter Loading Loading @@ -53,7 +53,7 @@ ConfigValidateStrictOption = Annotated[ ] ConfigDocsSectionOption = Annotated[ str | None, typer.Option("--section", "-s", help="Show specific section (path, http, credentials, crawl, llm, embedding, database, extraction, workspace)"), typer.Option("--section", "-s", help="Show specific section (path, http, credentials, crawl, llm, extraction, runtime)"), ] config_app = typer.Typer(help="Manage configuration") Loading Loading @@ -94,7 +94,7 @@ def config_show( Shows merged configuration from: 1. Config files (3gpp-ai.toml, etc.) 2. Environment variables (TDC_*, LIGHTRAG_*) 2. Environment variables (TDC_*) 3. Hard-coded defaults Use --show-secrets to display actual API key values (WARNING: not secure). Loading Loading @@ -136,7 +136,7 @@ def _validate_model_format(model: str, field_name: str) -> list[tuple[str, str]] def _validate_config_values( crawler_config: ThreeGPPConfig, ai_config: LightRAGConfig, ai_config: AiConfig, ) -> list[tuple[str, str]]: """Validate config values and return list of (severity, message) tuples. Loading Loading @@ -188,48 +188,16 @@ def _validate_config_values( llm_model = ai_config.llm.model issues.extend(_validate_model_format(llm_model, "llm.model")) # Validate embedding model format embedding_model = ai_config.embedding.model issues.extend(_validate_model_format(embedding_model, "embedding.model")) # Check database backend if ai_config.database.backend not in (StorageBackend.FILE, StorageBackend.PG0): issues.append(("error", f"database.backend must be 'file' or 'pg0', got: {ai_config.database.backend}")) # Check pg0 port range if pg0 backend is used if ai_config.database.backend == StorageBackend.PG0: if ai_config.database.pg0_port < 1 or ai_config.database.pg0_port > 65535: issues.append(("error", f"database.pg0_port must be 1-65535, got {ai_config.database.pg0_port}")) if not ai_config.database.pg0_instance_name: issues.append(("error", "database.pg0_instance_name cannot be empty when using pg0 backend")) if not ai_config.database.pg0_database: issues.append(("error", "database.pg0_database cannot be empty when using pg0 backend")) # Validate query mode if ai_config.default_query_mode not in QueryMode: issues.append(("error", f"workspace.default_query_mode must be one of {[m.value for m in QueryMode]}, got: {ai_config.default_query_mode}")) # Warnings for API keys if ai_config.llm.api_key is None and "/" in llm_model: provider = llm_model.split("/")[0] if provider not in ("ollama", "localhost"): issues.append(("warning", f"llm.api_key not set for cloud provider '{provider}'")) if ai_config.embedding.api_key is None and "/" in embedding_model: provider = embedding_model.split("/")[0] if provider not in ("ollama", "localhost", "sentence-transformers"): issues.append(("warning", f"embedding.api_key not set for cloud provider '{provider}'")) # Warning for shared storage if ai_config.shared_storage: issues.append(("warning", "workspace.shared_storage is enabled; requires custom LightRAG integration")) return issues def _validate_from_file(file: Path) -> tuple[ThreeGPPConfig, LightRAGConfig]: def _validate_from_file(file: Path) -> tuple[ThreeGPPConfig, AiConfig]: """Load configs from a specific file with validation. Note: Currently only validates syntax. Full validation happens in config_validate. Loading @@ -251,7 +219,8 @@ def _validate_from_file(file: Path) -> tuple[ThreeGPPConfig, LightRAGConfig]: raise typer.Exit(1) # Return default configs (will be validated with env vars applied) return ThreeGPPConfig.from_settings(config_file=file), LightRAGConfig.from_env() combined = ThreeGPPConfig.from_settings(config_file=file) return combined, combined.ai def _display_validation_results(issues: list[tuple[str, str]], strict: bool) -> None: Loading Loading @@ -296,7 +265,7 @@ def config_validate( Validates both crawler and AI settings: - Crawler: paths, HTTP settings, credentials, crawl limits - AI: LLM/embedding model formats, database backend, query modes - AI: LLM model format and runtime extraction controls Exit codes: - 0: All valid Loading @@ -308,8 +277,9 @@ def config_validate( crawler_config, ai_config = _validate_from_file(file) else: try: crawler_config = ThreeGPPConfig.from_settings() ai_config = LightRAGConfig.from_env() combined = ThreeGPPConfig.from_settings() crawler_config = combined ai_config = combined.ai except ValidationError as e: rprint("[red]Validation error in discovered config:[/red]") for error in e.errors(): Loading Loading @@ -338,10 +308,8 @@ def config_docs( - credentials: Portal authentication - crawl: Crawling filters and limits - llm: LLM model and API settings - embedding: Embedding model settings - database: Storage backend (file/pg0) - extraction: Document extraction toggles - workspace: Workspace defaults - runtime: Processing behavior and limits """ # Build documentation data sections: dict[str, list[dict]] = { Loading @@ -350,10 +318,8 @@ def config_docs( "credentials": [], "crawl": [], "llm": [], "embedding": [], "database": [], "extraction": [], "workspace": [], "runtime": [], } # Introspect crawler config Loading Loading @@ -388,14 +354,15 @@ def config_docs( ) # Introspect AI config ai_config = LightRAGConfig() ai_config = AiConfig() ai_data = ai_config.model_dump() # LLM for field_name, field_info in type(ai_config.llm).model_fields.items(): for field_name in ("llm_model", "llm_api_base", "llm_api_key"): field_info = type(ai_config).model_fields[field_name] description = field_info.description or "" default = field_info.default value = ai_data.get("llm", {}).get(field_name) value = ai_data.get(field_name) sections["llm"].append( { Loading @@ -407,41 +374,9 @@ def config_docs( } ) # Embedding for field_name, field_info in type(ai_config.embedding).model_fields.items(): description = field_info.description or "" default = field_info.default value = ai_data.get("embedding", {}).get(field_name) sections["embedding"].append( { "field": field_name, "type": field_info.annotation.__name__ if hasattr(field_info.annotation, "__name__") else str(field_info.annotation), "default": default, "value": value, "description": description, } ) # Database for field_name, field_info in type(ai_config.database).model_fields.items(): description = field_info.description or "" default = field_info.default value = ai_data.get("database", {}).get(field_name) sections["database"].append( { "field": field_name, "type": field_info.annotation.__name__ if hasattr(field_info.annotation, "__name__") else str(field_info.annotation), "default": default, "value": value, "description": description, } ) # Extraction and workspace (direct fields) for field_name, field_info in ai_config.model_fields.items(): if field_name in ("extract_tables", "extract_figures", "extract_equations", "figure_description_enabled"): # Extraction toggles for field_name in ("extract_tables", "extract_figures", "extract_equations", "figure_description_enabled"): field_info = type(ai_config).model_fields[field_name] description = field_info.description or "" default = field_info.default value = ai_data.get(field_name) Loading @@ -455,12 +390,27 @@ def config_docs( "description": description, } ) elif field_name in ("workspace", "default_query_mode", "shared_storage"): # Runtime behavior for field_name in ( "convert_pdf", "convert_md", "vlm", "device", "num_threads", "batch_size", "parallelism", "max_chunk_size", "chunk_overlap", "abstract_min_words", "abstract_max_words", ): field_info = type(ai_config).model_fields[field_name] description = field_info.description or "" default = field_info.default value = ai_data.get(field_name) sections["workspace"].append( sections["runtime"].append( { "field": field_name, "type": field_info.annotation.__name__ if hasattr(field_info.annotation, "__name__") else str(field_info.annotation), Loading @@ -484,7 +434,7 @@ def config_docs( # Show all sections for section_name, fields_list in sections.items(): _print_section_docs(section_name, fields_list, _get_section_description(section_name)) if section_name != "workspace": if section_name != "runtime": rprint() Loading @@ -496,10 +446,8 @@ def _get_section_description(section: str) -> str: "credentials": "ETSI Online (EOL) portal authentication credentials", "crawl": "Crawling behavior, filters, and limits", "llm": "LLM model and API configuration", "embedding": "Embedding model and API configuration", "database": "Storage backend selection (file-based or pg0)", "extraction": "Document element extraction toggles (tables, figures, equations)", "workspace": "Workspace defaults and query behavior", "runtime": "Runtime conversion, VLM, threading, and chunking behavior", } return descriptions.get(section, "") Loading