Commit 639af911 authored by Jan Reimes
Browse files

♻️ refactor(3gpp-ai): migrate to pydantic-settings and remove CacheManager dependency

- config.py: rewrite AiConfig and LightRAGSettings as pydantic BaseSettings with AliasChoices; add ThreeGPPAIConfig extending ThreeGPPConfig; remove from_env() factory methods and litellm provider validation
- Add config_app.py and config_exporter.py for AI-specific config CLI commands and export
- cli: remove CacheManager/resolve_cache_manager; load ThreeGPPAIConfig.from_settings() in _app_init callback; replace all resolve_cache_manager() calls with PathConfig(); rename manager → path_config in _process_single_item
- workspace_registry: replace cache_manager_name field with registry_path: Path | None; remove CacheManager registration boilerplate from get/set_active_workspace
- workspaces, convert, fetch_tdoc: replace resolve_cache_manager() with PathConfig() for db_file, checkout_dir, ai_embed_dir
- rag.py: replace resolve_cache_manager() with PathConfig().ai_embed_dir()
- models.py, llm_client.py, summarize.py: replace AiConfig.from_env() with AiConfig()
parent 40c77e3f
Loading
Loading
Loading
Loading
+42 −23
Original line number | Diff line number | Diff line
@@ -11,13 +11,13 @@ import shutil
from collections.abc import Callable
from datetime import UTC, datetime
from pathlib import Path
from typing import Any
from typing import Annotated, Any

import typer
from dotenv import load_dotenv
from rich.progress import BarColumn, MofNCompleteColumn, Progress, SpinnerColumn, TextColumn, TimeElapsedColumn
from tdoc_crawler.cli.formatting import TableColumnSpec, print_structured_output
from tdoc_crawler.config import CacheManager, resolve_cache_manager
from tdoc_crawler.config.settings import PathConfig
from tdoc_crawler.database import TDocDatabase
from tdoc_crawler.logging import get_console, get_logger, set_verbosity
from tdoc_crawler.models.base import OutputFormat, SortOrder
@@ -84,6 +84,8 @@ from threegpp_ai.args import (
    WorkspaceProcessVlmOption,
    WorkspaceReleaseOption,
)
from threegpp_ai.config import ThreeGPPAIConfig
from threegpp_ai.config_app import config_app
from threegpp_ai.lightrag.config import LightRAGConfig, QueryMode, StorageBackend
from threegpp_ai.lightrag.metadata import RAGMetadata
from threegpp_ai.lightrag.processor import DocumentProcessor
@@ -104,6 +106,7 @@ workspace_app = typer.Typer(help="Manage GraphRAG workspaces")
providers_app = typer.Typer(help="List and manage AI providers")
app.add_typer(workspace_app, name="workspace")
app.add_typer(providers_app, name="providers")
app.add_typer(config_app, name="config")

console = get_console()
_logger = get_logger(__name__)
@@ -194,9 +197,26 @@ def providers_list(


@app.callback()
def _app_init(cache_dir: CacheDirOption = None) -> None:
    """Register a CacheManager so all sub-commands can resolve file paths."""
    CacheManager(cache_dir).register(force=True)
def _app_init(
    ctx: typer.Context,
    config_file: Annotated[
        Path | None,
        typer.Option(
            "--config",
            "-c",
            help="Path to configuration file (overrides discovered config)",
            exists=True,
            readable=True,
        ),
    ] = None,
    cache_dir: CacheDirOption = None,
) -> None:
    """Load configuration so all sub-commands can resolve file paths."""
    config = ThreeGPPAIConfig.from_settings(config_file=config_file)
    if cache_dir:
        config.path.cache_dir = cache_dir

    ctx.obj = config


def _resolve_workspace_name(workspace: str | None) -> str:
@@ -245,7 +265,7 @@ def _resolve_workspace_items(
        console.print("[red]Error: filtering without explicit items is only supported for TDocs[/red]")
        raise typer.Exit(1)

    manager = resolve_cache_manager()
    manager = PathConfig()
    config = TDocQueryConfig(
        output_format=OutputFormat.TABLE,
        tdoc_ids=None,
@@ -281,7 +301,7 @@ async def _process_single_item(
    release: str | None,
    convert_pdf: bool,
    convert_md: bool = False,
    manager: CacheManager,
    path_config: PathConfig,
    vlm_options: VlmOptions | None = None,
    accelerator_config: AcceleratorConfig | None = None,
) -> tuple[Any | None, str | None, bool, bool]:
@@ -295,7 +315,7 @@ async def _process_single_item(
        release: Spec release version
        convert_pdf: Whether to convert to PDF
        convert_md: Whether to extract markdown (implies convert_pdf)
        manager: CacheManager for paths
        path_config: PathConfig for file system paths
        vlm_options: Optional VLM features for extraction.
        accelerator_config: Optional accelerator settings for GPU/CPU and threading.

@@ -310,16 +330,16 @@ async def _process_single_item(
    if checkout:
        checkout_path = None
        if source_kind == SourceKind.TDOC:
            checkout_path = await checkout_tdoc_to_workspace(item, manager.checkout_dir, workspace, db_file=manager.db_file)
            checkout_path = await checkout_tdoc_to_workspace(item, path_config.checkout_dir, workspace, db_file=path_config.db_file)
            if checkout_path is None:
                return None, "TDoc not found in database or meeting not crawled", False, False
        elif source_kind == SourceKind.SPEC:
            checkout_path = await checkout_spec_to_workspace(
                item,
                manager.checkout_dir,
                path_config.checkout_dir,
                workspace,
                release or "latest",
                db_file=manager.db_file,
                db_file=path_config.db_file,
            )
            if checkout_path is None:
                return None, "Spec not found in database", False, False
@@ -449,7 +469,7 @@ async def _try_build_tdoc_metadata(source_item_id: str) -> RAGMetadata | None:
    if not source_item_id.startswith(("S", "R", "C", "T")):
        return None

    manager = resolve_cache_manager()
    manager = PathConfig()
    try:
        async with TDocDatabase(manager.db_file) as db:
            rows = await db.query_tdocs(TDocQueryConfig(tdoc_ids=[source_item_id], order=SortOrder.ASC, limit=1))
@@ -494,7 +514,7 @@ async def _process_workspace_members(
    """
    processor = DocumentProcessor(LightRAGConfig.from_env())
    results: list[dict[str, Any]] = []
    manager = resolve_cache_manager()
    path_config = PathConfig()

    await processor.rag.start(workspace)
    try:
@@ -508,17 +528,17 @@ async def _process_workspace_members(
                if member.source_kind == SourceKind.TDOC:
                    checkout_path = await checkout_tdoc_to_workspace(
                        member.source_item_id,
                        manager.checkout_dir,
                        path_config.checkout_dir,
                        workspace,
                        db_file=manager.db_file,
                        db_file=path_config.db_file,
                    )
                elif member.source_kind == SourceKind.SPEC:
                    checkout_path = await checkout_spec_to_workspace(
                        member.source_item_id,
                        manager.checkout_dir,
                        path_config.checkout_dir,
                        workspace,
                        "latest",
                        db_file=manager.db_file,
                        db_file=path_config.db_file,
                    )
                if checkout_path is not None:
                    file_path = _resolve_process_file(checkout_path)
@@ -826,11 +846,11 @@ def workspace_clear(
    workspace: WorkspaceNameOption = None,
) -> None:
    workspace_name = _resolve_workspace_name(workspace)
    manager = resolve_cache_manager()
    path_config = typer.get_current_context().obj.path

    config = LightRAGConfig.from_env()
    embedding_model_safe = config.embedding.model.replace(":", "-").replace("/", "-")
    working_dir = manager.ai_embed_dir(embedding_model_safe) / workspace_name
    working_dir = path_config.ai_embed_dir(embedding_model_safe) / workspace_name

    if not working_dir.exists():
        console.print(f"[yellow]No LightRAG artifacts found for '{workspace_name}'[/yellow]")
@@ -857,7 +877,7 @@ def _checkout_and_convert_items(
    Returns:
        Tuple of (members, skipped_items, converted_count, md_extracted_count)
    """
    manager = resolve_cache_manager()
    manager = PathConfig()
    members: list[Any] = []
    skipped: list[tuple[str, str]] = []
    converted_count = 0
@@ -886,7 +906,7 @@ def _checkout_and_convert_items(
                    release=release,
                    convert_pdf=convert_pdf,
                    convert_md=convert_md,
                    manager=manager,
                    path_config=manager,
                    vlm_options=vlm_options,
                    accelerator_config=accelerator_config,
                )
@@ -1034,7 +1054,6 @@ def _embed_members(
        Number of successfully embedded documents.
    """
    processor = DocumentProcessor(LightRAGConfig.from_env())
    resolve_cache_manager()
    embedded = 0

    async def _run() -> None:
@@ -1091,7 +1110,7 @@ def workspace_list_members(
        console.print(f"[red]Workspace '{workspace_name}' not found[/red]")
        raise typer.Exit(1)

    manager = resolve_cache_manager()
    manager = typer.get_current_context().obj.path
    checkout_base = manager.checkout_dir

    member_rows = [
+183 −141
Original line number | Diff line number | Diff line
"""Configuration for the AI document processing pipeline."""
"""AI processing pipeline configuration for 3GPP documents.

This module extends the base ThreeGPPConfig with AI-specific settings
organized under the [ai] section in TOML/YAML/JSON configuration files.
"""

from __future__ import annotations

import os
from pathlib import Path
from typing import Literal

import litellm
from pydantic import Field, field_validator, model_validator
from tdoc_crawler.models import BaseConfigModel
from pydantic import AliasChoices, Field, field_validator, model_validator
from pydantic_settings import BaseSettings, SettingsConfigDict
from tdoc_crawler.config.env_vars import ConfigEnvVar
from tdoc_crawler.config.settings import ThreeGPPConfig

DEFAULT_EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
DEFAULT_LLM_MODEL = "openrouter/openrouter/free"

type Backend = Literal["torch", "onnx", "openvino"]

# Graph query level type
type GraphQueryLevel = Literal["simple", "medium", "advanced"]

# Type aliases
Backend = Literal["torch", "onnx", "openvino"]
GraphQueryLevel = Literal["simple", "medium", "advanced"]
QueryMode = Literal["naive", "local", "global", "hybrid", "mix", "bypass"]
StorageBackend = Literal["file", "pg0"]

def _env_int(name: str) -> int | None:
    value = os.getenv(name)
    if value is None or value == "":
        return None
    return int(value)

class LightRAGSettings(BaseSettings):
    """LightRAG-specific configuration (nested under ai.lightrag)."""

def _validate_model_identifier(value: str, field_name: str) -> str:
    if "/" not in value:
        msg = f"{field_name} must be in '<provider>/<model_name>' format"
        raise ValueError(msg)
    model_config = SettingsConfigDict(extra="ignore")

    provider, model_name = value.split("/", 1)
    provider_normalized = provider.strip().lower()
    model_name_normalized = model_name.strip()

    if not provider_normalized:
        msg = f"{field_name} provider segment cannot be empty"
        raise ValueError(msg)
    if not model_name_normalized:
        msg = f"{field_name} model_name segment cannot be empty"
        raise ValueError(msg)

    supported_providers = set(litellm.LITELLM_CHAT_PROVIDERS + litellm.openai_compatible_providers)
    # Storage backend
    db_backend: StorageBackend = Field(
        "file",
        validation_alias=AliasChoices(ConfigEnvVar.LIGHTRAG_DB_BACKEND.name, "db_backend"),
        description="Storage backend to use (file or pg0)",
    )
    pg0_instance_name: str = Field(
        "3gpp-crawler",
        description="pg0 instance name",
    )
    pg0_port: int = Field(
        15432,
        ge=1,
        le=65535,
        description="pg0 PostgreSQL port",
    )
    pg0_database: str = Field(
        "tdoc",
        description="pg0 database name",
    )

    if provider_normalized not in supported_providers:
        msg = (
            f"{field_name} provider '{provider}' is not supported by litellm. "
            f"See https://docs.litellm.ai/docs/providers for the full list of {len(supported_providers)} supported providers."
    # Workspace / query
    workspace: str = Field(
        "default",
        description="Default workspace name",
    )
    default_query_mode: QueryMode = Field(
        "hybrid",
        description="Default query mode",
    )
        raise ValueError(msg)

    return f"{provider_normalized}/{model_name_normalized}"
    # Feature toggles
    shared_storage: bool = Field(
        False,
        validation_alias=AliasChoices(ConfigEnvVar.LIGHTRAG_SHARED_STORAGE.name, "shared_storage"),
        description="Enable shared embedding storage across workspaces (deduplication)",
    )
    extract_tables: bool = Field(
        True,
        description="Enable extraction and indexing of table elements",
    )
    extract_figures: bool = Field(
        True,
        description="Enable extraction and indexing of figure elements",
    )
    extract_equations: bool = Field(
        True,
        description="Enable extraction and indexing of equation elements",
    )
    figure_description_enabled: bool = Field(
        True,
        description="Enable figure description generation with vision-capable models",
    )


def _validate_embedding_model_format(value: str) -> str:
    """Validate embedding model - accepts any HuggingFace-style model ID.
class AiConfig(BaseSettings):
    """AI processing pipeline configuration.

    Unlike LLM models, embedding models via sentence-transformers don't require
    LiteLLM provider validation. Accepts formats like:
    - sentence-transformers/all-MiniLM-L6-v2
    - perplexity-ai/pplx-embed-v1-0.6b
    Lives in 3gpp-ai package. Only primitive fields + format validators.
    No litellm import — provider validation is a separate concern.
    """
    if "/" not in value:
        msg = "embedding_model must be in '<provider>/<model_name>' format"
        raise ValueError(msg)

    provider, model_name = value.split("/", 1)
    provider_normalized = provider.strip().lower()
    model_name_normalized = model_name.strip()

    if not provider_normalized:
        msg = "embedding_model provider segment cannot be empty"
        raise ValueError(msg)
    if not model_name_normalized:
        msg = "embedding_model model_name segment cannot be empty"
        raise ValueError(msg)

    return f"{provider_normalized}/{model_name_normalized}"


class AiConfig(BaseConfigModel):
    """Configuration for the AI processing pipeline."""
    model_config = SettingsConfigDict(extra="ignore")

    # Embedding
    embedding_model: str = Field(
        DEFAULT_EMBEDDING_MODEL,
        validation_alias=AliasChoices(ConfigEnvVar.TDC_AI_EMBEDDING_MODEL.name, "embedding_model"),
        description="Embedding model in <provider>/<model_name> format",
    )
    embedding_backend: Backend = Field(
        "torch",
        validation_alias=AliasChoices(ConfigEnvVar.TDC_AI_EMBEDDING_BACKEND.name, "embedding_backend"),
        description="Sentence-transformers backend (torch, onnx, openvino)",
    )
    max_chunk_size: int = Field(1000, ge=1, description="Max tokens per chunk")
    chunk_overlap: int = Field(100, ge=0, description="Token overlap between chunks")
    embedding_api_base: str | None = Field(
        None,
        validation_alias=AliasChoices(ConfigEnvVar.TDC_AI_EMBEDDING_API_BASE.name, "embedding_api_base"),
        description="Override Embedding API base URL",
    )
    embedding_api_key: str | None = Field(
        None,
        validation_alias=AliasChoices(ConfigEnvVar.TDC_AI_EMBEDDING_API_KEY.name, "embedding_api_key"),
        description="Override Embedding API key",
    )

    # LLM
    llm_model: str = Field(
        DEFAULT_LLM_MODEL,
        validation_alias=AliasChoices(ConfigEnvVar.TDC_AI_LLM_MODEL.name, "llm_model"),
        description="LLM model in <provider>/<model_name> format",
    )
    llm_api_base: str | None = Field(None, description="Override LLM API base URL")
    llm_api_key: str | None = Field(None, description="Override LLM API key (takes precedence over provider-specific env vars)")
    llm_api_base: str | None = Field(
        None,
        validation_alias=AliasChoices(ConfigEnvVar.TDC_AI_LLM_API_BASE.name, "llm_api_base"),
        description="Override LLM API base URL",
    )
    llm_api_key: str | None = Field(
        None,
        validation_alias=AliasChoices(ConfigEnvVar.TDC_AI_LLM_API_KEY.name, "llm_api_key"),
        description="Override LLM API key (takes precedence over provider env vars)",
    )

    abstract_min_words: int = Field(150, ge=1, description="Minimum abstract word count")
    abstract_max_words: int = Field(250, ge=1, description="Maximum abstract word count")
    parallelism: int = Field(4, ge=1, le=32, description="Concurrent TDoc processing")
    # Chunking
    max_chunk_size: int = Field(
        1000,
        ge=1,
        validation_alias=AliasChoices(ConfigEnvVar.TDC_AI_MAX_CHUNK_SIZE.name, "max_chunk_size"),
        description="Max tokens per chunk",
    )
    chunk_overlap: int = Field(
        100,
        ge=0,
        validation_alias=AliasChoices(ConfigEnvVar.TDC_AI_CHUNK_OVERLAP.name, "chunk_overlap"),
        description="Token overlap between chunks",
    )

    # Processing
    abstract_min_words: int = Field(
        150,
        ge=1,
        validation_alias=AliasChoices(ConfigEnvVar.TDC_AI_ABSTRACT_MIN_WORDS.name, "abstract_min_words"),
        description="Minimum abstract word count",
    )
    abstract_max_words: int = Field(
        250,
        ge=1,
        validation_alias=AliasChoices(ConfigEnvVar.TDC_AI_ABSTRACT_MAX_WORDS.name, "abstract_max_words"),
        description="Maximum abstract word count",
    )
    parallelism: int = Field(
        4,
        ge=1,
        le=32,
        validation_alias=AliasChoices(ConfigEnvVar.TDC_AI_PARALLELISM.name, "parallelism"),
        description="Concurrent TDoc processing",
    )
    convert_pdf: bool = Field(
        False,
        validation_alias=AliasChoices(ConfigEnvVar.TDC_AI_CONVERT_PDF.name, "convert_pdf"),
        description="Convert PDF documents to markdown",
    )
    convert_md: bool = Field(
        False,
        validation_alias=AliasChoices(ConfigEnvVar.TDC_AI_CONVERT_MD.name, "convert_md"),
        description="Enable markdown conversion for documents",
    )
    vlm: bool = Field(
        False,
        validation_alias=AliasChoices(ConfigEnvVar.TDC_AI_VLM.name, "vlm"),
        description="Use Vision-Language Models for figure analysis",
    )
    device: str = Field(
        "auto",
        validation_alias=AliasChoices(ConfigEnvVar.TDC_AI_DEVICE.name, "device"),
        description="Device to use for local models (auto|cpu|cuda|mps)",
    )
    num_threads: int | None = Field(
        None,
        validation_alias=AliasChoices(ConfigEnvVar.TDC_AI_NUM_THREADS.name, "num_threads"),
        description="Number of threads for parallel processing",
    )
    batch_size: int | None = Field(
        None,
        validation_alias=AliasChoices(ConfigEnvVar.TDC_AI_BATCH_SIZE.name, "batch_size"),
        description="Batch size for processing",
    )

    # Graph
    graph_query_level: GraphQueryLevel = Field(
        "simple",
        validation_alias=AliasChoices(ConfigEnvVar.TDC_GRAPH_QUERY_LEVEL.name, "graph_query_level"),
        description="Level of graph query answer generation (simple|medium|advanced)",
    )

    @classmethod
    def from_env(cls, **overrides: str | int | Path | None) -> AiConfig:
        """Create config from environment variables."""
        data: dict[str, str | int | Path | None] = {}

        # Set cache_manager_name for use in _resolve_paths validator
        if cache_manager_name := overrides.get("cache_manager_name"):
            data["cache_manager_name"] = cache_manager_name

        if embedding_model := os.getenv("TDC_AI_EMBEDDING_MODEL"):
            data["embedding_model"] = embedding_model
        if embedding_backend := os.getenv("TDC_AI_EMBEDDING_BACKEND"):
            data["embedding_backend"] = embedding_backend
        if llm_model := os.getenv("TDC_AI_LLM_MODEL"):
            data["llm_model"] = llm_model
        if llm_api_base := os.getenv("TDC_AI_LLM_API_BASE"):
            data["llm_api_base"] = llm_api_base

        # Check for TDC_AI_LLM_API_KEY - takes precedence over provider-specific keys
        data["llm_api_key"] = os.environ.get("TDC_AI_LLM_API_KEY")

        max_chunk_size = _env_int("TDC_AI_MAX_CHUNK_SIZE")
        if max_chunk_size is not None:
            data["max_chunk_size"] = max_chunk_size

        chunk_overlap = _env_int("TDC_AI_CHUNK_OVERLAP")
        if chunk_overlap is not None:
            data["chunk_overlap"] = chunk_overlap

        abstract_min_words = _env_int("TDC_AI_ABSTRACT_MIN_WORDS")
        if abstract_min_words is not None:
            data["abstract_min_words"] = abstract_min_words

        abstract_max_words = _env_int("TDC_AI_ABSTRACT_MAX_WORDS")
        if abstract_max_words is not None:
            data["abstract_max_words"] = abstract_max_words

        parallelism = _env_int("TDC_AI_PARALLELISM")
        if parallelism is not None:
            data["parallelism"] = parallelism

        if graph_query_level := os.getenv("TDC_GRAPH_QUERY_LEVEL"):
            data["graph_query_level"] = graph_query_level

        data.update(overrides)
        # Filter out None values to let defaults apply
        filtered_data = {k: v for k, v in data.items() if v is not None}
        return cls(**filtered_data)
    # LightRAG nested
    lightrag: LightRAGSettings = Field(default_factory=LightRAGSettings)

    @model_validator(mode="after")
    def _validate_bounds(self) -> AiConfig:
@@ -172,33 +219,28 @@ class AiConfig(BaseConfigModel):
    @field_validator("embedding_model")
    @classmethod
    def _validate_embedding_model(cls, value: str) -> str:
        return _validate_embedding_model_format(value)

    @field_validator("embedding_backend")
    @classmethod
    def _validate_embedding_backend(cls, value: str) -> str:
        normalized = value.strip().lower()
        allowed = {"torch", "onnx", "openvino"}
        if normalized not in allowed:
            msg = "embedding_backend must be one of: torch, onnx, openvino"
        if "/" not in value:
            msg = "embedding_model must be in '<provider>/<model_name>' format"
            raise ValueError(msg)
        return normalized
        return value

    @field_validator("llm_model")
    @classmethod
    def _validate_llm_model(cls, value: str) -> str:
        return _validate_model_identifier(value, "llm_model")

    @field_validator("graph_query_level")
    @classmethod
    def _validate_graph_query_level(cls, value: GraphQueryLevel | str) -> GraphQueryLevel:
        if isinstance(value, str):
            value = value.strip().lower()
            if value not in ["simple", "medium", "advanced"]:
                msg = "graph_query_level must be one of: simple, medium, advanced"
        if "/" not in value:
            msg = "llm_model must be in '<provider>/<model_name>' format"
            raise ValueError(msg)
            return value  # type: ignore[return-value]
        return value


__all__ = ["AiConfig", "Backend", "GraphQueryLevel"]
class ThreeGPPAIConfig(ThreeGPPConfig):
    """Extended config for 3gpp-ai, adding [ai] section.

    Inherits from_settings() from ThreeGPPConfig — loads all base
    sections (path, http, credentials, crawl) plus [ai].
    """

    ai: AiConfig = Field(default_factory=AiConfig)


__all__ = ["AiConfig", "Backend", "GraphQueryLevel", "LightRAGSettings", "ThreeGPPAIConfig"]
+534 −0

File added.

Preview size limit exceeded, changes collapsed.

+428 −0

File added.

Preview size limit exceeded, changes collapsed.

+3 −4
Original line number | Diff line number | Diff line
@@ -25,7 +25,7 @@ from lightrag.llm.ollama import ollama_embed, ollama_model_complete
from lightrag.llm.openai import openai_complete, openai_embed
from lightrag.llm.zhipu import zhipu_complete, zhipu_embedding
from lightrag.utils import EmbeddingFunc
from tdoc_crawler.config import resolve_cache_manager
from tdoc_crawler.config.settings import PathConfig
from tdoc_crawler.logging import get_logger

from .config import LightRAGConfig, QueryMode, StorageBackend
@@ -199,12 +199,11 @@ class TDocRAG:
            self._pg0_manager.start()
            logger.info("Using pg0 at %s", self._pg0_manager.uri)

        # Prepare working directory using CacheManager (single source of truth)
        # Prepare working directory using PathConfig (single source of truth)
        # Structure: ~/.3gpp-crawler/lightrag/{embedding_model}/
        # LightRAG will create workspace subdirectory internally
        manager = resolve_cache_manager()
        embedding_model_safe = self.config.embedding.model.replace(":", "-").replace("/", "-")
        working_dir = manager.ai_embed_dir(embedding_model_safe)
        working_dir = PathConfig().ai_embed_dir(embedding_model_safe)
        working_dir.mkdir(parents=True, exist_ok=True)
        self._working_dir = working_dir
        logger.info("Using working directory: %s", working_dir)
Loading