Commit 361714a6 authored by Jan Reimes's avatar Jan Reimes
Browse files

🔄 refactor(3gpp-ai): replace RAG-centric with extraction profile system

parent 9423934a
Loading
Loading
Loading
Loading
+1 −37
Original line number Diff line number Diff line
"""AI document processing domain package.

This package provides AI-powered document processing for 3GPP TDocs.
Supports both legacy LiteLLM summarization and modern LightRAG knowledge graph.
Supports extraction, conversion, workspace operations, and summarization flows.
"""

from __future__ import annotations

import litellm

# Import LightRAG integration
from threegpp_ai.lightrag import (
    DatabaseConfig,
    DocumentProcessor,
    EmbeddingConfig,
    LightRAGConfig,
    LLMConfig,
    Pg0Error,
    Pg0Manager,
    ProcessingResult,
    ProcessingResultStatus,
    QueryMode,
    RAGMetadata,
    StorageBackend,
    TDocProcessor,
    TDocRAG,
    create_metadata_from_dict,
    enrich_text,
)
from threegpp_ai.models import SourceKind, SummarizeResult, WorkspaceNotFoundError
from threegpp_ai.operations.convert import convert_tdoc as convert_document
from threegpp_ai.operations.convert import convert_tdoc_to_markdown
@@ -64,23 +45,8 @@ litellm.suppress_debug_info = True # Suppress provider/model info logs from lit
__all__ = [
    # Workspace management
    "DEFAULT_WORKSPACE",
    # LightRAG integration
    "DatabaseConfig",
    "DocumentProcessor",
    "EmbeddingConfig",
    "LLMConfig",
    "LightRAGConfig",
    "Pg0Error",
    "Pg0Manager",
    "ProcessingResult",
    "ProcessingResultStatus",
    "QueryMode",
    "RAGMetadata",
    "SourceKind",
    "StorageBackend",
    "SummarizeResult",
    "TDocProcessor",
    "TDocRAG",
    "WorkspaceNotFoundError",
    "WorkspaceRegistry",
    "add_workspace_members",
@@ -89,10 +55,8 @@ __all__ = [
    # Document operations
    "convert_document",
    "convert_tdoc_to_markdown",
    "create_metadata_from_dict",
    "create_workspace",
    "delete_workspace",
    "enrich_text",
    "ensure_ai_subfolder",
    "ensure_default_workspace",
    "get_active_workspace",
+31 −26
Original line number Diff line number Diff line
@@ -9,8 +9,6 @@ import typer
from tdoc_crawler.config import ConfigEnvVar
from tdoc_crawler.models.base import OutputFormat

from threegpp_ai.lightrag.config import QueryMode

# Common
OutputFormatOption = Annotated[
    str,
@@ -18,7 +16,7 @@ OutputFormatOption = Annotated[
]
CacheDirOption = Annotated[
    Path | None,
    typer.Option("--cache-dir", "-c", help="Cache directory", envvar=ConfigEnvVar.TDC_CACHE_DIR.name),
    typer.Option("--cache-dir", help="Cache directory", envvar=ConfigEnvVar.TDC_CACHE_DIR.name),
]

# Summarize
@@ -30,7 +28,7 @@ SummarizeForceOption = Annotated[bool, typer.Option("--force", "-f", help="Force
ConvertDocumentArgument = Annotated[str, typer.Argument(help="Document ID to convert")]
ConvertOutputOption = Annotated[
    Path | None,
    typer.Option("--output", "-o", help="Output file path (optional, prints to stdout if not specified)"),
    typer.Option("--output-path", "-p", help="Output file path (optional, prints to stdout if not specified)"),
]
ConvertForceOption = Annotated[bool, typer.Option("--force", "-f", help="Force reconversion even if cached")]

@@ -66,13 +64,6 @@ ConvertMdOption = Annotated[
        envvar=ConfigEnvVar.TDC_AI_CONVERT_MD.name,
    ),
]
WorkspaceEmbedOption = Annotated[
    bool,
    typer.Option(
        "--embed",
        help="Insert extracted documents into LightRAG knowledge graph (implies --convert-md)",
    ),
]
WorkspaceReleaseOption = Annotated[
    str | None,
    typer.Option(
@@ -91,13 +82,38 @@ WorkspaceProcessVlmOption = Annotated[
        envvar=ConfigEnvVar.TDC_AI_VLM.name,
    ),
]
WorkspacePreserveArtifactsOption = Annotated[
    bool,
ExtractionProfileOption = Annotated[
    str | None,
    typer.Option(
        "--preserve-artifacts/--delete-artifacts",
        help="Preserve LightRAG artifacts (embeddings, index). --delete-artifacts removes only LightRAG data, not document artifacts (.ai folders)",
        "--profile",
        help="Extraction profile override: default, balanced, optimum, custom",
        envvar="TDC_AI_EXTRACTION_PROFILE",
    ),
]
# Per-stage extraction overrides for the "custom" extraction profile.
# Each option is a tri-state (True/False/None): the --custom-*/--no-custom-*
# flag pair sets True or False, and the None default means "no override"
# (presumably falling back to config defaults — TODO confirm against the
# command implementations that consume these).
CustomExtractOcrOption = Annotated[
    bool | None,
    typer.Option("--custom-ocr/--no-custom-ocr", help="Custom profile override for OCR stage"),
]
CustomExtractLayoutOption = Annotated[
    bool | None,
    typer.Option("--custom-layout/--no-custom-layout", help="Custom profile override for layout stage"),
]
CustomExtractTablesOption = Annotated[
    bool | None,
    typer.Option("--custom-tables/--no-custom-tables", help="Custom profile override for table extraction"),
]
CustomExtractFiguresOption = Annotated[
    bool | None,
    typer.Option("--custom-figures/--no-custom-figures", help="Custom profile override for figure extraction"),
]
CustomExtractEquationsOption = Annotated[
    bool | None,
    typer.Option("--custom-equations/--no-custom-equations", help="Custom profile override for equation extraction"),
]
CustomExtractEnrichmentOption = Annotated[
    bool | None,
    typer.Option("--custom-enrichment/--no-custom-enrichment", help="Custom profile override for enrichment stages"),
]

# Accelerator options for Docling extraction
AcceleratorDeviceOption = Annotated[
@@ -169,14 +185,3 @@ ProvidersOutputOption = Annotated[
        help="Output format (table, json, ison, toon, yaml)",
    ),
]

# Query
QueryModeOption = Annotated[
    QueryMode,
    typer.Option(
        "--mode",
        "-m",
        case_sensitive=False,
        help="Query mode (local, global, hybrid, naive)",
    ),
]
+183 −363

File changed.

Preview size limit exceeded, changes collapsed.

+58 −108
Original line number Diff line number Diff line
@@ -8,79 +8,16 @@ from __future__ import annotations

from typing import Literal

from pydantic import AliasChoices, Field, field_validator, model_validator
from pydantic import AliasChoices, Field, model_validator
from pydantic_settings import BaseSettings, SettingsConfigDict
from tdoc_crawler.config.env_vars import ConfigEnvVar
from tdoc_crawler.config.settings import ThreeGPPConfig

# Fallback models used when no model is configured via file or env var.
DEFAULT_EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
DEFAULT_LLM_MODEL = "openrouter/openrouter/free"

# Type aliases
# Literal aliases double as the closed value sets accepted by the pydantic
# Field declarations below (invalid values fail settings validation).
Backend = Literal["torch", "onnx", "openvino"]
ExtractionProfile = Literal["default", "balanced", "optimum", "custom"]
GraphQueryLevel = Literal["simple", "medium", "advanced"]
QueryMode = Literal["naive", "local", "global", "hybrid", "mix", "bypass"]
StorageBackend = Literal["file", "pg0"]


class LightRAGSettings(BaseSettings):
    """LightRAG-specific configuration (nested under ai.lightrag).

    Groups storage-backend selection, workspace/query defaults, and
    per-element extraction toggles. Values come from config files or
    environment variables; fields with a ``validation_alias`` additionally
    accept the listed ``LIGHTRAG_*`` environment variable name.
    """

    # Ignore unknown keys so extra entries in the [ai.lightrag] section
    # do not fail validation.
    model_config = SettingsConfigDict(extra="ignore")

    # Storage backend
    db_backend: StorageBackend = Field(
        "file",
        validation_alias=AliasChoices(ConfigEnvVar.LIGHTRAG_DB_BACKEND.name, "db_backend"),
        description="Storage backend to use (file or pg0)",
    )
    # pg0_* fields are only meaningful when db_backend == "pg0"
    # (see the pg0-specific checks in the config validation command).
    pg0_instance_name: str = Field(
        "3gpp-crawler",
        description="pg0 instance name",
    )
    pg0_port: int = Field(
        15432,
        # Constrain to the valid TCP port range at validation time.
        ge=1,
        le=65535,
        description="pg0 PostgreSQL port",
    )
    pg0_database: str = Field(
        "tdoc",
        description="pg0 database name",
    )

    # Workspace / query
    workspace: str = Field(
        "default",
        description="Default workspace name",
    )
    default_query_mode: QueryMode = Field(
        "hybrid",
        description="Default query mode",
    )

    # Feature toggles
    shared_storage: bool = Field(
        False,
        validation_alias=AliasChoices(ConfigEnvVar.LIGHTRAG_SHARED_STORAGE.name, "shared_storage"),
        description="Enable shared embedding storage across workspaces (deduplication)",
    )
    extract_tables: bool = Field(
        True,
        description="Enable extraction and indexing of table elements",
    )
    extract_figures: bool = Field(
        True,
        description="Enable extraction and indexing of figure elements",
    )
    extract_equations: bool = Field(
        True,
        description="Enable extraction and indexing of equation elements",
    )
    figure_description_enabled: bool = Field(
        True,
        description="Enable figure description generation with vision-capable models",
    )


class AiConfig(BaseSettings):
@@ -92,28 +29,6 @@ class AiConfig(BaseSettings):

    model_config = SettingsConfigDict(extra="ignore")

    # Embedding
    embedding_model: str = Field(
        DEFAULT_EMBEDDING_MODEL,
        validation_alias=AliasChoices(ConfigEnvVar.TDC_AI_EMBEDDING_MODEL.name, "embedding_model"),
        description="Embedding model in <provider>/<model_name> format",
    )
    embedding_backend: Backend = Field(
        "torch",
        validation_alias=AliasChoices(ConfigEnvVar.TDC_AI_EMBEDDING_BACKEND.name, "embedding_backend"),
        description="Sentence-transformers backend (torch, onnx, openvino)",
    )
    embedding_api_base: str | None = Field(
        None,
        validation_alias=AliasChoices(ConfigEnvVar.TDC_AI_EMBEDDING_API_BASE.name, "embedding_api_base"),
        description="Override Embedding API base URL",
    )
    embedding_api_key: str | None = Field(
        None,
        validation_alias=AliasChoices(ConfigEnvVar.TDC_AI_EMBEDDING_API_KEY.name, "embedding_api_key"),
        description="Override Embedding API key",
    )

    # LLM
    llm_model: str = Field(
        DEFAULT_LLM_MODEL,
@@ -131,6 +46,43 @@ class AiConfig(BaseSettings):
        description="Override LLM API key (takes precedence over provider env vars)",
    )

    # Extraction profile policy
    extraction_profile: ExtractionProfile | None = Field(
        None,
        validation_alias=AliasChoices("TDC_AI_EXTRACTION_PROFILE", "extraction_profile"),
        description="Extraction profile override (default|balanced|optimum|custom). None enables deterministic auto-selection.",
    )
    custom_extract_ocr: bool = Field(
        True,
        validation_alias=AliasChoices("TDC_AI_CUSTOM_EXTRACT_OCR", "custom_extract_ocr"),
        description="Custom profile toggle: enable OCR stage",
    )
    custom_extract_layout: bool = Field(
        True,
        validation_alias=AliasChoices("TDC_AI_CUSTOM_EXTRACT_LAYOUT", "custom_extract_layout"),
        description="Custom profile toggle: enable layout stage",
    )
    custom_extract_tables: bool = Field(
        True,
        validation_alias=AliasChoices("TDC_AI_CUSTOM_EXTRACT_TABLES", "custom_extract_tables"),
        description="Custom profile toggle: enable table extraction",
    )
    custom_extract_figures: bool = Field(
        True,
        validation_alias=AliasChoices("TDC_AI_CUSTOM_EXTRACT_FIGURES", "custom_extract_figures"),
        description="Custom profile toggle: enable figure extraction",
    )
    custom_extract_equations: bool = Field(
        True,
        validation_alias=AliasChoices("TDC_AI_CUSTOM_EXTRACT_EQUATIONS", "custom_extract_equations"),
        description="Custom profile toggle: enable equation extraction",
    )
    custom_extract_enrichment: bool = Field(
        True,
        validation_alias=AliasChoices("TDC_AI_CUSTOM_EXTRACT_ENRICHMENT", "custom_extract_enrichment"),
        description="Custom profile toggle: enable enrichment stages",
    )

    # Chunking
    max_chunk_size: int = Field(
        1000,
@@ -196,6 +148,24 @@ class AiConfig(BaseSettings):
        description="Batch size for processing",
    )

    # Extraction toggles
    extract_tables: bool = Field(
        True,
        description="Enable extraction of table elements",
    )
    extract_figures: bool = Field(
        True,
        description="Enable extraction of figure elements",
    )
    extract_equations: bool = Field(
        True,
        description="Enable extraction of equation elements",
    )
    figure_description_enabled: bool = Field(
        True,
        description="Enable figure description generation with vision-capable models",
    )

    # Graph
    graph_query_level: GraphQueryLevel = Field(
        "simple",
@@ -203,9 +173,6 @@ class AiConfig(BaseSettings):
        description="Level of graph query answer generation (simple|medium|advanced)",
    )

    # LightRAG nested
    lightrag: LightRAGSettings = Field(default_factory=LightRAGSettings)

    @model_validator(mode="after")
    def _validate_bounds(self) -> AiConfig:
        if self.abstract_max_words < self.abstract_min_words:
@@ -216,23 +183,6 @@ class AiConfig(BaseSettings):
            raise ValueError(msg)
        return self

    @field_validator("embedding_model")
    @classmethod
    def _validate_embedding_model(cls, value: str) -> str:
        if "/" not in value:
            msg = "embedding_model must be in '<provider>/<model_name>' format"
            raise ValueError(msg)
        return value

    @field_validator("llm_model")
    @classmethod
    def _validate_llm_model(cls, value: str) -> str:
        if "/" not in value:
            msg = "llm_model must be in '<provider>/<model_name>' format"
            raise ValueError(msg)
        return value


class ThreeGPPAIConfig(ThreeGPPConfig):
    """Extended config for 3gpp-ai, adding [ai] section.

@@ -243,4 +193,4 @@ class ThreeGPPAIConfig(ThreeGPPConfig):
    ai: AiConfig = Field(default_factory=AiConfig)


__all__ = ["AiConfig", "Backend", "GraphQueryLevel", "LightRAGSettings", "ThreeGPPAIConfig"]
__all__ = ["AiConfig", "ExtractionProfile", "GraphQueryLevel", "ThreeGPPAIConfig"]
+41 −93
Original line number Diff line number Diff line
@@ -14,7 +14,7 @@ from rich.console import Console
from rich.table import Table
from tdoc_crawler.config.settings import ThreeGPPConfig

from threegpp_ai.lightrag.config import LightRAGConfig, QueryMode, StorageBackend
from threegpp_ai.config import AiConfig

from .config_exporter import ConfigExporter

@@ -53,7 +53,7 @@ ConfigValidateStrictOption = Annotated[
]
ConfigDocsSectionOption = Annotated[
    str | None,
    typer.Option("--section", "-s", help="Show specific section (path, http, credentials, crawl, llm, embedding, database, extraction, workspace)"),
    typer.Option("--section", "-s", help="Show specific section (path, http, credentials, crawl, llm, extraction, runtime)"),
]

config_app = typer.Typer(help="Manage configuration")
@@ -94,7 +94,7 @@ def config_show(

    Shows merged configuration from:
    1. Config files (3gpp-ai.toml, etc.)
    2. Environment variables (TDC_*, LIGHTRAG_*)
    2. Environment variables (TDC_*)
    3. Hard-coded defaults

    Use --show-secrets to display actual API key values (WARNING: not secure).
@@ -136,7 +136,7 @@ def _validate_model_format(model: str, field_name: str) -> list[tuple[str, str]]

def _validate_config_values(
    crawler_config: ThreeGPPConfig,
    ai_config: LightRAGConfig,
    ai_config: AiConfig,
) -> list[tuple[str, str]]:
    """Validate config values and return list of (severity, message) tuples.

@@ -188,48 +188,16 @@ def _validate_config_values(
    llm_model = ai_config.llm.model
    issues.extend(_validate_model_format(llm_model, "llm.model"))

    # Validate embedding model format
    embedding_model = ai_config.embedding.model
    issues.extend(_validate_model_format(embedding_model, "embedding.model"))

    # Check database backend
    if ai_config.database.backend not in (StorageBackend.FILE, StorageBackend.PG0):
        issues.append(("error", f"database.backend must be 'file' or 'pg0', got: {ai_config.database.backend}"))

    # Check pg0 port range if pg0 backend is used
    if ai_config.database.backend == StorageBackend.PG0:
        if ai_config.database.pg0_port < 1 or ai_config.database.pg0_port > 65535:
            issues.append(("error", f"database.pg0_port must be 1-65535, got {ai_config.database.pg0_port}"))

        if not ai_config.database.pg0_instance_name:
            issues.append(("error", "database.pg0_instance_name cannot be empty when using pg0 backend"))

        if not ai_config.database.pg0_database:
            issues.append(("error", "database.pg0_database cannot be empty when using pg0 backend"))

    # Validate query mode
    if ai_config.default_query_mode not in QueryMode:
        issues.append(("error", f"workspace.default_query_mode must be one of {[m.value for m in QueryMode]}, got: {ai_config.default_query_mode}"))

    # Warnings for API keys
    if ai_config.llm.api_key is None and "/" in llm_model:
        provider = llm_model.split("/")[0]
        if provider not in ("ollama", "localhost"):
            issues.append(("warning", f"llm.api_key not set for cloud provider '{provider}'"))

    if ai_config.embedding.api_key is None and "/" in embedding_model:
        provider = embedding_model.split("/")[0]
        if provider not in ("ollama", "localhost", "sentence-transformers"):
            issues.append(("warning", f"embedding.api_key not set for cloud provider '{provider}'"))

    # Warning for shared storage
    if ai_config.shared_storage:
        issues.append(("warning", "workspace.shared_storage is enabled; requires custom LightRAG integration"))

    return issues


def _validate_from_file(file: Path) -> tuple[ThreeGPPConfig, LightRAGConfig]:
def _validate_from_file(file: Path) -> tuple[ThreeGPPConfig, AiConfig]:
    """Load configs from a specific file with validation.

    Note: Currently only validates syntax. Full validation happens in config_validate.
@@ -251,7 +219,8 @@ def _validate_from_file(file: Path) -> tuple[ThreeGPPConfig, LightRAGConfig]:
        raise typer.Exit(1)

    # Return default configs (will be validated with env vars applied)
    return ThreeGPPConfig.from_settings(config_file=file), LightRAGConfig.from_env()
    combined = ThreeGPPConfig.from_settings(config_file=file)
    return combined, combined.ai


def _display_validation_results(issues: list[tuple[str, str]], strict: bool) -> None:
@@ -296,7 +265,7 @@ def config_validate(

    Validates both crawler and AI settings:
    - Crawler: paths, HTTP settings, credentials, crawl limits
    - AI: LLM/embedding model formats, database backend, query modes
    - AI: LLM model format and runtime extraction controls

    Exit codes:
    - 0: All valid
@@ -308,8 +277,9 @@ def config_validate(
        crawler_config, ai_config = _validate_from_file(file)
    else:
        try:
            crawler_config = ThreeGPPConfig.from_settings()
            ai_config = LightRAGConfig.from_env()
            combined = ThreeGPPConfig.from_settings()
            crawler_config = combined
            ai_config = combined.ai
        except ValidationError as e:
            rprint("[red]Validation error in discovered config:[/red]")
            for error in e.errors():
@@ -338,10 +308,8 @@ def config_docs(
    - credentials: Portal authentication
    - crawl: Crawling filters and limits
    - llm: LLM model and API settings
    - embedding: Embedding model settings
    - database: Storage backend (file/pg0)
    - extraction: Document extraction toggles
    - workspace: Workspace defaults
    - runtime: Processing behavior and limits
    """
    # Build documentation data
    sections: dict[str, list[dict]] = {
@@ -350,10 +318,8 @@ def config_docs(
        "credentials": [],
        "crawl": [],
        "llm": [],
        "embedding": [],
        "database": [],
        "extraction": [],
        "workspace": [],
        "runtime": [],
    }

    # Introspect crawler config
@@ -388,14 +354,15 @@ def config_docs(
            )

    # Introspect AI config
    ai_config = LightRAGConfig()
    ai_config = AiConfig()
    ai_data = ai_config.model_dump()

    # LLM
    for field_name, field_info in type(ai_config.llm).model_fields.items():
    for field_name in ("llm_model", "llm_api_base", "llm_api_key"):
        field_info = type(ai_config).model_fields[field_name]
        description = field_info.description or ""
        default = field_info.default
        value = ai_data.get("llm", {}).get(field_name)
        value = ai_data.get(field_name)

        sections["llm"].append(
            {
@@ -407,41 +374,9 @@ def config_docs(
            }
        )

    # Embedding
    for field_name, field_info in type(ai_config.embedding).model_fields.items():
        description = field_info.description or ""
        default = field_info.default
        value = ai_data.get("embedding", {}).get(field_name)

        sections["embedding"].append(
            {
                "field": field_name,
                "type": field_info.annotation.__name__ if hasattr(field_info.annotation, "__name__") else str(field_info.annotation),
                "default": default,
                "value": value,
                "description": description,
            }
        )

    # Database
    for field_name, field_info in type(ai_config.database).model_fields.items():
        description = field_info.description or ""
        default = field_info.default
        value = ai_data.get("database", {}).get(field_name)

        sections["database"].append(
            {
                "field": field_name,
                "type": field_info.annotation.__name__ if hasattr(field_info.annotation, "__name__") else str(field_info.annotation),
                "default": default,
                "value": value,
                "description": description,
            }
        )

    # Extraction and workspace (direct fields)
    for field_name, field_info in ai_config.model_fields.items():
        if field_name in ("extract_tables", "extract_figures", "extract_equations", "figure_description_enabled"):
    # Extraction toggles
    for field_name in ("extract_tables", "extract_figures", "extract_equations", "figure_description_enabled"):
        field_info = type(ai_config).model_fields[field_name]
        description = field_info.description or ""
        default = field_info.default
        value = ai_data.get(field_name)
@@ -455,12 +390,27 @@ def config_docs(
                "description": description,
            }
        )
        elif field_name in ("workspace", "default_query_mode", "shared_storage"):

    # Runtime behavior
    for field_name in (
        "convert_pdf",
        "convert_md",
        "vlm",
        "device",
        "num_threads",
        "batch_size",
        "parallelism",
        "max_chunk_size",
        "chunk_overlap",
        "abstract_min_words",
        "abstract_max_words",
    ):
        field_info = type(ai_config).model_fields[field_name]
        description = field_info.description or ""
        default = field_info.default
        value = ai_data.get(field_name)

            sections["workspace"].append(
        sections["runtime"].append(
            {
                "field": field_name,
                "type": field_info.annotation.__name__ if hasattr(field_info.annotation, "__name__") else str(field_info.annotation),
@@ -484,7 +434,7 @@ def config_docs(
        # Show all sections
        for section_name, fields_list in sections.items():
            _print_section_docs(section_name, fields_list, _get_section_description(section_name))
            if section_name != "workspace":
            if section_name != "runtime":
                rprint()


@@ -496,10 +446,8 @@ def _get_section_description(section: str) -> str:
        "credentials": "ETSI Online (EOL) portal authentication credentials",
        "crawl": "Crawling behavior, filters, and limits",
        "llm": "LLM model and API configuration",
        "embedding": "Embedding model and API configuration",
        "database": "Storage backend selection (file-based or pg0)",
        "extraction": "Document element extraction toggles (tables, figures, equations)",
        "workspace": "Workspace defaults and query behavior",
        "runtime": "Runtime conversion, VLM, threading, and chunking behavior",
    }
    return descriptions.get(section, "")

Loading