Commit 361714a6 authored by Jan Reimes's avatar Jan Reimes
Browse files

🔄 refactor(3gpp-ai): replace RAG-centric with extraction profile system

parent 9423934a
Loading
Loading
Loading
Loading
+1 −37
Original line number Diff line number Diff line
"""AI document processing domain package.

This package provides AI-powered document processing for 3GPP TDocs.
Supports both legacy LiteLLM summarization and modern LightRAG knowledge graph.
Supports extraction, conversion, workspace operations, and summarization flows.
"""

from __future__ import annotations

import litellm

# Import LightRAG integration
from threegpp_ai.lightrag import (
    DatabaseConfig,
    DocumentProcessor,
    EmbeddingConfig,
    LightRAGConfig,
    LLMConfig,
    Pg0Error,
    Pg0Manager,
    ProcessingResult,
    ProcessingResultStatus,
    QueryMode,
    RAGMetadata,
    StorageBackend,
    TDocProcessor,
    TDocRAG,
    create_metadata_from_dict,
    enrich_text,
)
from threegpp_ai.models import SourceKind, SummarizeResult, WorkspaceNotFoundError
from threegpp_ai.operations.convert import convert_tdoc as convert_document
from threegpp_ai.operations.convert import convert_tdoc_to_markdown
@@ -64,23 +45,8 @@ litellm.suppress_debug_info = True # Suppress provider/model info logs from lit
__all__ = [
    # Workspace management
    "DEFAULT_WORKSPACE",
    # LightRAG integration
    "DatabaseConfig",
    "DocumentProcessor",
    "EmbeddingConfig",
    "LLMConfig",
    "LightRAGConfig",
    "Pg0Error",
    "Pg0Manager",
    "ProcessingResult",
    "ProcessingResultStatus",
    "QueryMode",
    "RAGMetadata",
    "SourceKind",
    "StorageBackend",
    "SummarizeResult",
    "TDocProcessor",
    "TDocRAG",
    "WorkspaceNotFoundError",
    "WorkspaceRegistry",
    "add_workspace_members",
@@ -89,10 +55,8 @@ __all__ = [
    # Document operations
    "convert_document",
    "convert_tdoc_to_markdown",
    "create_metadata_from_dict",
    "create_workspace",
    "delete_workspace",
    "enrich_text",
    "ensure_ai_subfolder",
    "ensure_default_workspace",
    "get_active_workspace",
+31 −26
Original line number Diff line number Diff line
@@ -9,8 +9,6 @@ import typer
from tdoc_crawler.config import ConfigEnvVar
from tdoc_crawler.models.base import OutputFormat

from threegpp_ai.lightrag.config import QueryMode

# Common
OutputFormatOption = Annotated[
    str,
@@ -18,7 +16,7 @@ OutputFormatOption = Annotated[
]
CacheDirOption = Annotated[
    Path | None,
    typer.Option("--cache-dir", "-c", help="Cache directory", envvar=ConfigEnvVar.TDC_CACHE_DIR.name),
    typer.Option("--cache-dir", help="Cache directory", envvar=ConfigEnvVar.TDC_CACHE_DIR.name),
]

# Summarize
@@ -30,7 +28,7 @@ SummarizeForceOption = Annotated[bool, typer.Option("--force", "-f", help="Force
ConvertDocumentArgument = Annotated[str, typer.Argument(help="Document ID to convert")]
ConvertOutputOption = Annotated[
    Path | None,
    typer.Option("--output", "-o", help="Output file path (optional, prints to stdout if not specified)"),
    typer.Option("--output-path", "-p", help="Output file path (optional, prints to stdout if not specified)"),
]
ConvertForceOption = Annotated[bool, typer.Option("--force", "-f", help="Force reconversion even if cached")]

@@ -66,13 +64,6 @@ ConvertMdOption = Annotated[
        envvar=ConfigEnvVar.TDC_AI_CONVERT_MD.name,
    ),
]
WorkspaceEmbedOption = Annotated[
    bool,
    typer.Option(
        "--embed",
        help="Insert extracted documents into LightRAG knowledge graph (implies --convert-md)",
    ),
]
WorkspaceReleaseOption = Annotated[
    str | None,
    typer.Option(
@@ -91,13 +82,38 @@ WorkspaceProcessVlmOption = Annotated[
        envvar=ConfigEnvVar.TDC_AI_VLM.name,
    ),
]
WorkspacePreserveArtifactsOption = Annotated[
    bool,
ExtractionProfileOption = Annotated[
    str | None,
    typer.Option(
        "--preserve-artifacts/--delete-artifacts",
        help="Preserve LightRAG artifacts (embeddings, index). --delete-artifacts removes only LightRAG data, not document artifacts (.ai folders)",
        "--profile",
        help="Extraction profile override: default, balanced, optimum, custom",
        envvar="TDC_AI_EXTRACTION_PROFILE",
    ),
]
# Per-stage extraction overrides for the "custom" extraction profile.
# Each option is a tri-state (True/False/None): the --custom-*/--no-custom-*
# flag pair sets True or False, and the None default means "no override"
# (presumably falling back to config defaults — TODO confirm against the
# command implementations that consume these).
CustomExtractOcrOption = Annotated[
    bool | None,
    typer.Option("--custom-ocr/--no-custom-ocr", help="Custom profile override for OCR stage"),
]
CustomExtractLayoutOption = Annotated[
    bool | None,
    typer.Option("--custom-layout/--no-custom-layout", help="Custom profile override for layout stage"),
]
CustomExtractTablesOption = Annotated[
    bool | None,
    typer.Option("--custom-tables/--no-custom-tables", help="Custom profile override for table extraction"),
]
CustomExtractFiguresOption = Annotated[
    bool | None,
    typer.Option("--custom-figures/--no-custom-figures", help="Custom profile override for figure extraction"),
]
CustomExtractEquationsOption = Annotated[
    bool | None,
    typer.Option("--custom-equations/--no-custom-equations", help="Custom profile override for equation extraction"),
]
CustomExtractEnrichmentOption = Annotated[
    bool | None,
    typer.Option("--custom-enrichment/--no-custom-enrichment", help="Custom profile override for enrichment stages"),
]

# Accelerator options for Docling extraction
AcceleratorDeviceOption = Annotated[
@@ -169,14 +185,3 @@ ProvidersOutputOption = Annotated[
        help="Output format (table, json, ison, toon, yaml)",
    ),
]

# Query
QueryModeOption = Annotated[
    QueryMode,
    typer.Option(
        "--mode",
        "-m",
        case_sensitive=False,
        help="Query mode (local, global, hybrid, naive)",
    ),
]
+183 −363

File changed.

Preview size limit exceeded, changes collapsed.

+58 −108
Original line number Diff line number Diff line
@@ -8,79 +8,16 @@ from __future__ import annotations

from typing import Literal

from pydantic import AliasChoices, Field, field_validator, model_validator
from pydantic import AliasChoices, Field, model_validator
from pydantic_settings import BaseSettings, SettingsConfigDict
from tdoc_crawler.config.env_vars import ConfigEnvVar
from tdoc_crawler.config.settings import ThreeGPPConfig

# Fallback models used when no model is configured via file or env var.
DEFAULT_EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
DEFAULT_LLM_MODEL = "openrouter/openrouter/free"

# Type aliases
# Literal aliases double as the closed value sets accepted by the pydantic
# Field declarations below (invalid values fail settings validation).
Backend = Literal["torch", "onnx", "openvino"]
ExtractionProfile = Literal["default", "balanced", "optimum", "custom"]
GraphQueryLevel = Literal["simple", "medium", "advanced"]
QueryMode = Literal["naive", "local", "global", "hybrid", "mix", "bypass"]
StorageBackend = Literal["file", "pg0"]


class LightRAGSettings(BaseSettings):
    """LightRAG-specific configuration (nested under ai.lightrag).

    Groups storage-backend selection, workspace/query defaults, and
    per-element extraction toggles. Values come from config files or
    environment variables; fields with a ``validation_alias`` additionally
    accept the listed ``LIGHTRAG_*`` environment variable name.
    """

    # Ignore unknown keys so extra entries in the [ai.lightrag] section
    # do not fail validation.
    model_config = SettingsConfigDict(extra="ignore")

    # Storage backend
    db_backend: StorageBackend = Field(
        "file",
        validation_alias=AliasChoices(ConfigEnvVar.LIGHTRAG_DB_BACKEND.name, "db_backend"),
        description="Storage backend to use (file or pg0)",
    )
    # pg0_* fields are only meaningful when db_backend == "pg0"
    # (see the pg0-specific checks in the config validation command).
    pg0_instance_name: str = Field(
        "3gpp-crawler",
        description="pg0 instance name",
    )
    pg0_port: int = Field(
        15432,
        # Constrain to the valid TCP port range at validation time.
        ge=1,
        le=65535,
        description="pg0 PostgreSQL port",
    )
    pg0_database: str = Field(
        "tdoc",
        description="pg0 database name",
    )

    # Workspace / query
    workspace: str = Field(
        "default",
        description="Default workspace name",
    )
    default_query_mode: QueryMode = Field(
        "hybrid",
        description="Default query mode",
    )

    # Feature toggles
    shared_storage: bool = Field(
        False,
        validation_alias=AliasChoices(ConfigEnvVar.LIGHTRAG_SHARED_STORAGE.name, "shared_storage"),
        description="Enable shared embedding storage across workspaces (deduplication)",
    )
    extract_tables: bool = Field(
        True,
        description="Enable extraction and indexing of table elements",
    )
    extract_figures: bool = Field(
        True,
        description="Enable extraction and indexing of figure elements",
    )
    extract_equations: bool = Field(
        True,
        description="Enable extraction and indexing of equation elements",
    )
    figure_description_enabled: bool = Field(
        True,
        description="Enable figure description generation with vision-capable models",
    )


class AiConfig(BaseSettings):
@@ -92,28 +29,6 @@ class AiConfig(BaseSettings):

    model_config = SettingsConfigDict(extra="ignore")

    # Embedding
    embedding_model: str = Field(
        DEFAULT_EMBEDDING_MODEL,
        validation_alias=AliasChoices(ConfigEnvVar.TDC_AI_EMBEDDING_MODEL.name, "embedding_model"),
        description="Embedding model in <provider>/<model_name> format",
    )
    embedding_backend: Backend = Field(
        "torch",
        validation_alias=AliasChoices(ConfigEnvVar.TDC_AI_EMBEDDING_BACKEND.name, "embedding_backend"),
        description="Sentence-transformers backend (torch, onnx, openvino)",
    )
    embedding_api_base: str | None = Field(
        None,
        validation_alias=AliasChoices(ConfigEnvVar.TDC_AI_EMBEDDING_API_BASE.name, "embedding_api_base"),
        description="Override Embedding API base URL",
    )
    embedding_api_key: str | None = Field(
        None,
        validation_alias=AliasChoices(ConfigEnvVar.TDC_AI_EMBEDDING_API_KEY.name, "embedding_api_key"),
        description="Override Embedding API key",
    )

    # LLM
    llm_model: str = Field(
        DEFAULT_LLM_MODEL,
@@ -131,6 +46,43 @@ class AiConfig(BaseSettings):
        description="Override LLM API key (takes precedence over provider env vars)",
    )

    # Extraction profile policy
    extraction_profile: ExtractionProfile | None = Field(
        None,
        validation_alias=AliasChoices("TDC_AI_EXTRACTION_PROFILE", "extraction_profile"),
        description="Extraction profile override (default|balanced|optimum|custom). None enables deterministic auto-selection.",
    )
    custom_extract_ocr: bool = Field(
        True,
        validation_alias=AliasChoices("TDC_AI_CUSTOM_EXTRACT_OCR", "custom_extract_ocr"),
        description="Custom profile toggle: enable OCR stage",
    )
    custom_extract_layout: bool = Field(
        True,
        validation_alias=AliasChoices("TDC_AI_CUSTOM_EXTRACT_LAYOUT", "custom_extract_layout"),
        description="Custom profile toggle: enable layout stage",
    )
    custom_extract_tables: bool = Field(
        True,
        validation_alias=AliasChoices("TDC_AI_CUSTOM_EXTRACT_TABLES", "custom_extract_tables"),
        description="Custom profile toggle: enable table extraction",
    )
    custom_extract_figures: bool = Field(
        True,
        validation_alias=AliasChoices("TDC_AI_CUSTOM_EXTRACT_FIGURES", "custom_extract_figures"),
        description="Custom profile toggle: enable figure extraction",
    )
    custom_extract_equations: bool = Field(
        True,
        validation_alias=AliasChoices("TDC_AI_CUSTOM_EXTRACT_EQUATIONS", "custom_extract_equations"),
        description="Custom profile toggle: enable equation extraction",
    )
    custom_extract_enrichment: bool = Field(
        True,
        validation_alias=AliasChoices("TDC_AI_CUSTOM_EXTRACT_ENRICHMENT", "custom_extract_enrichment"),
        description="Custom profile toggle: enable enrichment stages",
    )

    # Chunking
    max_chunk_size: int = Field(
        1000,
@@ -196,6 +148,24 @@ class AiConfig(BaseSettings):
        description="Batch size for processing",
    )

    # Extraction toggles
    extract_tables: bool = Field(
        True,
        description="Enable extraction of table elements",
    )
    extract_figures: bool = Field(
        True,
        description="Enable extraction of figure elements",
    )
    extract_equations: bool = Field(
        True,
        description="Enable extraction of equation elements",
    )
    figure_description_enabled: bool = Field(
        True,
        description="Enable figure description generation with vision-capable models",
    )

    # Graph
    graph_query_level: GraphQueryLevel = Field(
        "simple",
@@ -203,9 +173,6 @@ class AiConfig(BaseSettings):
        description="Level of graph query answer generation (simple|medium|advanced)",
    )

    # LightRAG nested
    lightrag: LightRAGSettings = Field(default_factory=LightRAGSettings)

    @model_validator(mode="after")
    def _validate_bounds(self) -> AiConfig:
        if self.abstract_max_words < self.abstract_min_words:
@@ -216,23 +183,6 @@ class AiConfig(BaseSettings):
            raise ValueError(msg)
        return self

    @field_validator("embedding_model")
    @classmethod
    def _validate_embedding_model(cls, value: str) -> str:
        if "/" not in value:
            msg = "embedding_model must be in '<provider>/<model_name>' format"
            raise ValueError(msg)
        return value

    @field_validator("llm_model")
    @classmethod
    def _validate_llm_model(cls, value: str) -> str:
        if "/" not in value:
            msg = "llm_model must be in '<provider>/<model_name>' format"
            raise ValueError(msg)
        return value


class ThreeGPPAIConfig(ThreeGPPConfig):
    """Extended config for 3gpp-ai, adding [ai] section.

@@ -243,4 +193,4 @@ class ThreeGPPAIConfig(ThreeGPPConfig):
    ai: AiConfig = Field(default_factory=AiConfig)


__all__ = ["AiConfig", "Backend", "GraphQueryLevel", "LightRAGSettings", "ThreeGPPAIConfig"]
__all__ = ["AiConfig", "ExtractionProfile", "GraphQueryLevel", "ThreeGPPAIConfig"]
+41 −93
Original line number Diff line number Diff line
@@ -14,7 +14,7 @@ from rich.console import Console
from rich.table import Table
from tdoc_crawler.config.settings import ThreeGPPConfig

from threegpp_ai.lightrag.config import LightRAGConfig, QueryMode, StorageBackend
from threegpp_ai.config import AiConfig

from .config_exporter import ConfigExporter

@@ -53,7 +53,7 @@ ConfigValidateStrictOption = Annotated[
]
ConfigDocsSectionOption = Annotated[
    str | None,
    typer.Option("--section", "-s", help="Show specific section (path, http, credentials, crawl, llm, embedding, database, extraction, workspace)"),
    typer.Option("--section", "-s", help="Show specific section (path, http, credentials, crawl, llm, extraction, runtime)"),
]

config_app = typer.Typer(help="Manage configuration")
@@ -94,7 +94,7 @@ def config_show(

    Shows merged configuration from:
    1. Config files (3gpp-ai.toml, etc.)
    2. Environment variables (TDC_*, LIGHTRAG_*)
    2. Environment variables (TDC_*)
    3. Hard-coded defaults

    Use --show-secrets to display actual API key values (WARNING: not secure).
@@ -136,7 +136,7 @@ def _validate_model_format(model: str, field_name: str) -> list[tuple[str, str]]

def _validate_config_values(
    crawler_config: ThreeGPPConfig,
    ai_config: LightRAGConfig,
    ai_config: AiConfig,
) -> list[tuple[str, str]]:
    """Validate config values and return list of (severity, message) tuples.

@@ -188,48 +188,16 @@ def _validate_config_values(
    llm_model = ai_config.llm.model
    issues.extend(_validate_model_format(llm_model, "llm.model"))

    # Validate embedding model format
    embedding_model = ai_config.embedding.model
    issues.extend(_validate_model_format(embedding_model, "embedding.model"))

    # Check database backend
    if ai_config.database.backend not in (StorageBackend.FILE, StorageBackend.PG0):
        issues.append(("error", f"database.backend must be 'file' or 'pg0', got: {ai_config.database.backend}"))

    # Check pg0 port range if pg0 backend is used
    if ai_config.database.backend == StorageBackend.PG0:
        if ai_config.database.pg0_port < 1 or ai_config.database.pg0_port > 65535:
            issues.append(("error", f"database.pg0_port must be 1-65535, got {ai_config.database.pg0_port}"))

        if not ai_config.database.pg0_instance_name:
            issues.append(("error", "database.pg0_instance_name cannot be empty when using pg0 backend"))

        if not ai_config.database.pg0_database:
            issues.append(("error", "database.pg0_database cannot be empty when using pg0 backend"))

    # Validate query mode
    if ai_config.default_query_mode not in QueryMode:
        issues.append(("error", f"workspace.default_query_mode must be one of {[m.value for m in QueryMode]}, got: {ai_config.default_query_mode}"))

    # Warnings for API keys
    if ai_config.llm.api_key is None and "/" in llm_model:
        provider = llm_model.split("/")[0]
        if provider not in ("ollama", "localhost"):
            issues.append(("warning", f"llm.api_key not set for cloud provider '{provider}'"))

    if ai_config.embedding.api_key is None and "/" in embedding_model:
        provider = embedding_model.split("/")[0]
        if provider not in ("ollama", "localhost", "sentence-transformers"):
            issues.append(("warning", f"embedding.api_key not set for cloud provider '{provider}'"))

    # Warning for shared storage
    if ai_config.shared_storage:
        issues.append(("warning", "workspace.shared_storage is enabled; requires custom LightRAG integration"))

    return issues


def _validate_from_file(file: Path) -> tuple[ThreeGPPConfig, LightRAGConfig]:
def _validate_from_file(file: Path) -> tuple[ThreeGPPConfig, AiConfig]:
    """Load configs from a specific file with validation.

    Note: Currently only validates syntax. Full validation happens in config_validate.
@@ -251,7 +219,8 @@ def _validate_from_file(file: Path) -> tuple[ThreeGPPConfig, LightRAGConfig]:
        raise typer.Exit(1)

    # Return default configs (will be validated with env vars applied)
    return ThreeGPPConfig.from_settings(config_file=file), LightRAGConfig.from_env()
    combined = ThreeGPPConfig.from_settings(config_file=file)
    return combined, combined.ai


def _display_validation_results(issues: list[tuple[str, str]], strict: bool) -> None:
@@ -296,7 +265,7 @@ def config_validate(

    Validates both crawler and AI settings:
    - Crawler: paths, HTTP settings, credentials, crawl limits
    - AI: LLM/embedding model formats, database backend, query modes
    - AI: LLM model format and runtime extraction controls

    Exit codes:
    - 0: All valid
@@ -308,8 +277,9 @@ def config_validate(
        crawler_config, ai_config = _validate_from_file(file)
    else:
        try:
            crawler_config = ThreeGPPConfig.from_settings()
            ai_config = LightRAGConfig.from_env()
            combined = ThreeGPPConfig.from_settings()
            crawler_config = combined
            ai_config = combined.ai
        except ValidationError as e:
            rprint("[red]Validation error in discovered config:[/red]")
            for error in e.errors():
@@ -338,10 +308,8 @@ def config_docs(
    - credentials: Portal authentication
    - crawl: Crawling filters and limits
    - llm: LLM model and API settings
    - embedding: Embedding model settings
    - database: Storage backend (file/pg0)
    - extraction: Document extraction toggles
    - workspace: Workspace defaults
    - runtime: Processing behavior and limits
    """
    # Build documentation data
    sections: dict[str, list[dict]] = {
@@ -350,10 +318,8 @@ def config_docs(
        "credentials": [],
        "crawl": [],
        "llm": [],
        "embedding": [],
        "database": [],
        "extraction": [],
        "workspace": [],
        "runtime": [],
    }

    # Introspect crawler config
@@ -388,14 +354,15 @@ def config_docs(
            )

    # Introspect AI config
    ai_config = LightRAGConfig()
    ai_config = AiConfig()
    ai_data = ai_config.model_dump()

    # LLM
    for field_name, field_info in type(ai_config.llm).model_fields.items():
    for field_name in ("llm_model", "llm_api_base", "llm_api_key"):
        field_info = type(ai_config).model_fields[field_name]
        description = field_info.description or ""
        default = field_info.default
        value = ai_data.get("llm", {}).get(field_name)
        value = ai_data.get(field_name)

        sections["llm"].append(
            {
@@ -407,41 +374,9 @@ def config_docs(
            }
        )

    # Embedding
    for field_name, field_info in type(ai_config.embedding).model_fields.items():
        description = field_info.description or ""
        default = field_info.default
        value = ai_data.get("embedding", {}).get(field_name)

        sections["embedding"].append(
            {
                "field": field_name,
                "type": field_info.annotation.__name__ if hasattr(field_info.annotation, "__name__") else str(field_info.annotation),
                "default": default,
                "value": value,
                "description": description,
            }
        )

    # Database
    for field_name, field_info in type(ai_config.database).model_fields.items():
        description = field_info.description or ""
        default = field_info.default
        value = ai_data.get("database", {}).get(field_name)

        sections["database"].append(
            {
                "field": field_name,
                "type": field_info.annotation.__name__ if hasattr(field_info.annotation, "__name__") else str(field_info.annotation),
                "default": default,
                "value": value,
                "description": description,
            }
        )

    # Extraction and workspace (direct fields)
    for field_name, field_info in ai_config.model_fields.items():
        if field_name in ("extract_tables", "extract_figures", "extract_equations", "figure_description_enabled"):
    # Extraction toggles
    for field_name in ("extract_tables", "extract_figures", "extract_equations", "figure_description_enabled"):
        field_info = type(ai_config).model_fields[field_name]
        description = field_info.description or ""
        default = field_info.default
        value = ai_data.get(field_name)
@@ -455,12 +390,27 @@ def config_docs(
                "description": description,
            }
        )
        elif field_name in ("workspace", "default_query_mode", "shared_storage"):

    # Runtime behavior
    for field_name in (
        "convert_pdf",
        "convert_md",
        "vlm",
        "device",
        "num_threads",
        "batch_size",
        "parallelism",
        "max_chunk_size",
        "chunk_overlap",
        "abstract_min_words",
        "abstract_max_words",
    ):
        field_info = type(ai_config).model_fields[field_name]
        description = field_info.description or ""
        default = field_info.default
        value = ai_data.get(field_name)

            sections["workspace"].append(
        sections["runtime"].append(
            {
                "field": field_name,
                "type": field_info.annotation.__name__ if hasattr(field_info.annotation, "__name__") else str(field_info.annotation),
@@ -484,7 +434,7 @@ def config_docs(
        # Show all sections
        for section_name, fields_list in sections.items():
            _print_section_docs(section_name, fields_list, _get_section_description(section_name))
            if section_name != "workspace":
            if section_name != "runtime":
                rprint()


@@ -496,10 +446,8 @@ def _get_section_description(section: str) -> str:
        "credentials": "ETSI Online (EOL) portal authentication credentials",
        "crawl": "Crawling behavior, filters, and limits",
        "llm": "LLM model and API configuration",
        "embedding": "Embedding model and API configuration",
        "database": "Storage backend selection (file-based or pg0)",
        "extraction": "Document element extraction toggles (tables, figures, equations)",
        "workspace": "Workspace defaults and query behavior",
        "runtime": "Runtime conversion, VLM, threading, and chunking behavior",
    }
    return descriptions.get(section, "")

Loading