Commit 9423934a authored by Jan Reimes's avatar Jan Reimes

🔥 chore(3gpp-ai): remove LightRAG integration and dependencies

parent f8b08b2e
pyproject.toml +0 −2
@@ -18,8 +18,6 @@ dependencies = [
    "doc2txt>=1.0.8",
    #"doc2txt>=1.0.8 @ git+https://github.com/Quantatirsk/doc2txt-pypi.git"
    "litellm>=1.81.15",
    "lightrag-hku[offline]>=1.4.9.3",
    "pg0-embedded>=0.12.0",
    "pydantic-settings>=2.13.1",
    "liteparse>=1.2.0",
    "docling[vlm]>=2.82.0",
threegpp_ai/lightrag/__init__.py +0 −61
"""LightRAG integration for 3gpp-ai.

This package provides a thin wrapper around LightRAG with:
- Multi-provider LLM and embedding support (ollama, openai, zhipu, jina, hf, etc.)
- File-based or pg0-backed storage
- Async context manager pattern
- TDoc document processing with docling extraction

Supported providers:
- LLM: ollama, openai, zhipu, hf, lollms, azure_openai, nvidia_openai
- Embedding: ollama, openai, zhipu, jina, hf, siliconcloud

Example:
    >>> import asyncio
    >>> async def main():
    ...     async with TDocRAG() as rag:
    ...         await rag.insert("TDoc S4-250001 about TS 26.444")
    ...         result = await rag.query("What TDocs mention TS 26.444?")
    ...         print(result)
    >>> asyncio.run(main())
"""

from .config import (
    DatabaseConfig,
    EmbeddingConfig,
    LightRAGConfig,
    LLMConfig,
    QueryMode,
    StorageBackend,
)
from .metadata import RAGMetadata, create_metadata_from_dict, enrich_text
from .pg0_manager import Pg0Error, Pg0Manager
from .processor import DocumentProcessor, ProcessingResult, ProcessingResultStatus, TDocProcessor
from .rag import TDocRAG
from .seeder import EntitySeed, EntitySeeder, EntityType
from .shared_storage import SharedNanoVectorDBStorage, WorkspaceIndex, initialize_shared_storage

__all__ = [
    "DatabaseConfig",
    "DocumentProcessor",
    "EmbeddingConfig",
    "EntitySeed",
    "EntitySeeder",
    "EntityType",
    "LLMConfig",
    "LightRAGConfig",
    "Pg0Error",
    "Pg0Manager",
    "ProcessingResult",
    "ProcessingResultStatus",
    "QueryMode",
    "RAGMetadata",
    "SharedNanoVectorDBStorage",
    "StorageBackend",
    "TDocProcessor",
    "TDocRAG",
    "WorkspaceIndex",
    "create_metadata_from_dict",
    "enrich_text",
    "initialize_shared_storage",
]
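
# End-to-end sketch (assumptions: a reachable LLM/embedding backend and the
# re-exports above; mirrors the async context-manager example in the module
# docstring):
#
#   from threegpp_ai.lightrag import LightRAGConfig, TDocRAG
#
#   rag = TDocRAG(LightRAGConfig.from_env())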
+0 −129
"""CLI commands for LightRAG integration.

This module is deprecated; use `workspace query` and `workspace status` instead.

Note: These commands are now integrated into the main CLI under the `workspace` subcommand.
"""

from __future__ import annotations

import asyncio
import json
from typing import Annotated, Literal

import typer
from rich.console import Console

from threegpp_ai.lightrag.config import LightRAGConfig, QueryMode, StorageBackend
from threegpp_ai.lightrag.rag import TDocRAG

app = typer.Typer(name="rag", help="LightRAG knowledge graph commands (deprecated)")
console = Console()


@app.command("query")
def query_graph(
    query: Annotated[str, typer.Argument(help="Query string")],
    mode: Annotated[
        QueryMode,
        typer.Option(
            "--mode",
            "-m",
            case_sensitive=False,
            help=f"Query mode: {', '.join(m.value for m in QueryMode)}",
        ),
    ] = QueryMode.HYBRID,
    workspace: Annotated[
        str,
        typer.Option("--workspace", "-w", help="Workspace name"),
    ] = "default",
    output_format: Annotated[
        Literal["text", "json", "yaml"],
        typer.Option("--output-format", help="Output format: 'text' (default), 'json', or 'yaml'"),
    ] = "text",
) -> None:
    """Query the LightRAG knowledge graph.

    Uses LLM to synthesize an answer from the knowledge graph.
    """
    if not query:
        console.print("[red]Error: query is required[/red]")
        raise typer.Exit(1)

    async def _run() -> str | None:
        config = LightRAGConfig.from_env()
        rag = TDocRAG(config)
        await rag.start(workspace)
        try:
            result = await rag.query(query, mode=mode)
            return result
        finally:
            await rag.stop()

    result = asyncio.run(_run())

    if output_format == "json":
        typer.echo(json.dumps({"query": query, "mode": mode.value, "result": result}))
    else:
        console.print(f"\n[bold]Query:[/bold] {query}")
        console.print(f"[bold]Mode:[/bold] {mode.value}\n")
        if result:
            console.print(result)
        else:
            console.print("[yellow]No result returned[/yellow]")


@app.command("status")
def show_status(
    verbose: Annotated[
        bool,
        typer.Option("--verbose", "-v", help="Show full configuration"),
    ] = False,
) -> None:
    """Show LightRAG configuration and status."""
    config = LightRAGConfig.from_env()

    # Header
    console.print("\n[bold cyan]LightRAG Configuration[/bold cyan]")

    # Database backend
    backend = config.database.backend
    backend_icon = "🗄️" if backend == StorageBackend.PG0 else "📁"
    console.print(f"\n{backend_icon} [cyan]Storage backend:[/cyan] {backend.value}")

    if backend == StorageBackend.PG0:
        console.print(f"   Instance: [cyan]{config.database.pg0_instance_name}[/cyan]")
        console.print(f"   Port: [cyan]{config.database.pg0_port}[/cyan]")
        console.print(f"   Database: [cyan]{config.database.pg0_database}[/cyan]")
    else:
        console.print(f"   Working dir: [cyan]{config.working_dir}[/cyan]")

    # LLM
    console.print("\n🤖 [cyan]LLM:[/cyan]")
    console.print(f"   Model: [cyan]{config.llm.model}[/cyan]")
    console.print(f"   API Base: [cyan]{config.llm.api_base}[/cyan]")

    # Embedding
    console.print("\n🔢 [cyan]Embedding:[/cyan]")
    console.print(f"   Model: [cyan]{config.embedding.model}[/cyan]")
    console.print(f"   API Base: [cyan]{config.embedding.api_base}[/cyan]")

    # Query defaults
    console.print("\n🔍 [cyan]Query defaults:[/cyan]")
    console.print(f"   Mode: [cyan]{config.default_query_mode.value}[/cyan]")
    console.print(f"   Workspace: [cyan]{config.workspace}[/cyan]")

    # Shared storage status
    console.print(f"   Shared storage: [cyan]{'enabled' if config.shared_storage else 'disabled'}[/cyan]")

    if verbose:
        console.print("\n[bold]Full configuration:[/bold]")
        console.print(f"   working_dir: {config.working_dir}")
        console.print(f"   workspace: {config.workspace}")
        console.print(f"   env_prefix: {config.model_config.get('env_prefix', 'N/A')}")

    console.print()


if __name__ == "__main__":
    app()
threegpp_ai/lightrag/config.py +0 −273
"""Configuration for LightRAG integration.

This module defines configuration dataclasses for LightRAG with:
- Storage backend selection (file-based, pg0, etc.)
- LLM and embedding model settings with TDC_AI_* environment variable support
- Query mode options

All constants are defined at module level in CAPS.
All choice/option types use StrEnum.
"""

from __future__ import annotations

import os
from enum import StrEnum

from pydantic import Field
from pydantic_settings import BaseSettings, SettingsConfigDict

# =============================================================================
# Constants (defaults and allowed values)
# =============================================================================

# LLM defaults (read from TDC_AI_LLM_MODEL or use default)
DEFAULT_LLM_MODEL: str = os.getenv("TDC_AI_LLM_MODEL", "openrouter/openrouter/free")
DEFAULT_LLM_API_BASE: str = os.getenv("TDC_AI_LLM_API_BASE", "http://localhost:11434")

# Embedding defaults (read from TDC_AI_EMBEDDING_MODEL or use default)
DEFAULT_EMBEDDING_MODEL: str = os.getenv("TDC_AI_EMBEDDING_MODEL", "ollama/qwen3-embedding:0.6b")
DEFAULT_EMBEDDING_API_BASE: str = os.getenv("TDC_AI_EMBEDDING_API_BASE", "http://localhost:11434")

# Workspace default
DEFAULT_WORKSPACE: str = "default"

# pg0 defaults
DEFAULT_PG0_INSTANCE_NAME: str = "3gpp-crawler"
DEFAULT_PG0_PORT: int = 15432
DEFAULT_PG0_DATABASE: str = "tdoc"


# =============================================================================
# Helpers
# =============================================================================


def _env_bool(key: str, default: bool = True) -> bool:
    """Parse a boolean environment variable.

    Returns True for "1", "true", "yes", "on" (case-insensitive).
    Returns False for any other value.
    Returns default if the variable is not set.
    """
    value = os.getenv(key)
    if value is None:
        return default
    return value.lower() in {"1", "true", "yes", "on"}
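
# Illustrative behaviour (hypothetical environment values):
#   LIGHTRAG_EXTRACT_TABLES=off   -> _env_bool("LIGHTRAG_EXTRACT_TABLES") is False
#   LIGHTRAG_EXTRACT_TABLES=YES   -> True (matching is case-insensitive)
#   variable unset                -> the `default` argument is returned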


# =============================================================================
# Enums
# =============================================================================


class StorageBackend(StrEnum):
    """Supported storage backends for LightRAG."""

    FILE = "file"
    PG0 = "pg0"


class QueryMode(StrEnum):
    """Supported query modes for LightRAG queries."""

    NAIVE = "naive"
    LOCAL = "local"
    GLOBAL = "global"
    HYBRID = "hybrid"
    MIX = "mix"
    BYPASS = "bypass"


# =============================================================================
# Sub-configurations
# =============================================================================


class LLMConfig(BaseSettings):
    """LLM configuration for LightRAG.

    Supports <provider>/<model> syntax via TDC_AI_LLM_MODEL environment variable.
    Examples: openrouter/openrouter/free, ollama/qwen3:8b, anthropic/claude-3-sonnet
    """

    model: str = Field(
        default=DEFAULT_LLM_MODEL,
        description="LLM model name in <provider>/<model> format",
    )
    api_base: str = Field(
        default=DEFAULT_LLM_API_BASE,
        description="LLM API base URL",
    )
    api_key: str | None = Field(
        default=None,
        description="API key for cloud LLM providers (overrides TDC_AI_LLM_API_KEY)",
    )

    model_config = SettingsConfigDict(env_prefix="LIGHTRAG_LLM_")


class EmbeddingConfig(BaseSettings):
    """Embedding model configuration for LightRAG.

    Supports <provider>/<model> syntax via TDC_AI_EMBEDDING_MODEL environment variable.
    Examples: sentence-transformers/all-MiniLM-L6-v2, ollama/qwen3-embedding:0.6b
    """

    model: str = Field(
        default=DEFAULT_EMBEDDING_MODEL,
        description="Embedding model name in <provider>/<model> format",
    )
    api_base: str = Field(
        default=DEFAULT_EMBEDDING_API_BASE,
        description="Embedding API base URL",
    )
    api_key: str | None = Field(
        default=None,
        description="API key for cloud embedding providers (overrides TDC_AI_EMBEDDING_API_KEY)",
    )

    model_config = SettingsConfigDict(env_prefix="LIGHTRAG_EMBEDDING_")


class DatabaseConfig(BaseSettings):
    """Database/storage backend configuration for LightRAG.

    Currently supports:
    - FILE: File-based storage (NanoVectorDB, JsonKVStorage, NetworkX)
    - PG0: PostgreSQL via pg0 (requires pg0 to be fixed on Windows)

    LightRAG storage types:
    - KV storage: JsonKVStorage (file) / PGKVStorage (pg0)
    - Vector storage: NanoVectorDBStorage (file) / PGVectorStorage (pg0)
    - DocStatus storage: JsonDocStatusStorage (file) / PGDocStatusStorage (pg0)
    - Graph storage: NetworkXStorage (file) / PGGraphStorage (pg0, requires AGE)
    """

    backend: StorageBackend = Field(
        default=StorageBackend.FILE,
        description="Storage backend to use",
    )
    pg0_instance_name: str = Field(
        default=DEFAULT_PG0_INSTANCE_NAME,
        description="pg0 instance name",
    )
    pg0_port: int = Field(
        default=DEFAULT_PG0_PORT,
        description="pg0 PostgreSQL port",
    )
    pg0_database: str = Field(
        default=DEFAULT_PG0_DATABASE,
        description="pg0 database name",
    )

    model_config = SettingsConfigDict(env_prefix="LIGHTRAG_DB_")


# =============================================================================
# Main configuration
# =============================================================================


class LightRAGConfig(BaseSettings):
    """Main configuration for LightRAG integration.

    Uses file-based storage (NanoVectorDB, JsonKVStorage, NetworkX) by default.
    Set LIGHTRAG_DB_BACKEND=pg0 to use PostgreSQL via pg0.
    Set LIGHTRAG_SHARED_STORAGE=true to enable cross-workspace embedding deduplication.

    Reads from TDC_AI_* environment variables for compatibility with legacy AiConfig:
    - TDC_AI_LLM_MODEL: LLM model in <provider>/<model> format
    - TDC_AI_LLM_API_BASE: LLM API base URL
    - TDC_AI_LLM_API_KEY: LLM API key
    - TDC_AI_EMBEDDING_MODEL: Embedding model in <provider>/<model> format
    """

    llm: LLMConfig = Field(
        default_factory=LLMConfig,
        description="LLM configuration",
    )
    embedding: EmbeddingConfig = Field(
        default_factory=EmbeddingConfig,
        description="Embedding model configuration",
    )
    database: DatabaseConfig = Field(
        default_factory=DatabaseConfig,
        description="Storage backend configuration",
    )

    workspace: str = Field(
        default=DEFAULT_WORKSPACE,
        description="Default workspace name",
    )
    default_query_mode: QueryMode = Field(
        default=QueryMode.HYBRID,
        description="Default query mode",
    )
    shared_storage: bool = Field(
        default=False,
        description="Enable shared embedding storage across workspaces (deduplication). "
        "Note: Requires custom LightRAG integration - disable if using standard LightRAG.",
    )
    extract_tables: bool = Field(
        default=True,
        description="Enable extraction and indexing of table elements.",
    )
    extract_figures: bool = Field(
        default=True,
        description="Enable extraction and indexing of figure/image elements.",
    )
    extract_equations: bool = Field(
        default=True,
        description="Enable extraction and indexing of equation elements.",
    )
    figure_description_enabled: bool = Field(
        default=True,
        description="Enable optional figure description generation when vision-capable models are available.",
    )

    model_config = SettingsConfigDict(env_prefix="LIGHTRAG_")

    @classmethod
    def from_env(cls, **overrides) -> LightRAGConfig:
        """Create LightRAGConfig from TDC_AI_* environment variables.

        This method reads the legacy TDC_AI_* environment variables and maps them
        to LightRAG configuration, ensuring compatibility with the existing .env.example.

        Args:
            **overrides: Additional overrides that take precedence over env vars

        Returns:
            LightRAGConfig instance configured from environment
        """
        config_data: dict = {}

        # LLM config - pass through any set values, filter None
        llm_config = {
            "model": os.getenv("TDC_AI_LLM_MODEL"),
            "api_base": os.getenv("TDC_AI_LLM_API_BASE"),
            "api_key": os.getenv("TDC_AI_LLM_API_KEY"),
        }
        llm_config = {k: v for k, v in llm_config.items() if v is not None}
        if llm_config:
            config_data["llm"] = llm_config

        # Embedding config - pass through any set values, filter None
        embedding_config = {
            "model": os.getenv("TDC_AI_EMBEDDING_MODEL"),
            "api_base": os.getenv("TDC_AI_EMBEDDING_API_BASE"),
            "api_key": os.getenv("TDC_AI_EMBEDDING_API_KEY"),
        }
        embedding_config = {k: v for k, v in embedding_config.items() if v is not None}
        if embedding_config:
            config_data["embedding"] = embedding_config

        # Extraction toggles default to True
        config_data["extract_tables"] = _env_bool("LIGHTRAG_EXTRACT_TABLES", True)
        config_data["extract_figures"] = _env_bool("LIGHTRAG_EXTRACT_FIGURES", True)
        config_data["extract_equations"] = _env_bool("LIGHTRAG_EXTRACT_EQUATIONS", True)
        config_data["figure_description_enabled"] = _env_bool("LIGHTRAG_FIGURE_DESCRIPTION_ENABLED", True)

        config_data.update(overrides)
        return cls(**config_data)
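
# Sketch of typical use (assumes TDC_AI_* variables in the environment; keyword
# overrides take precedence, per the method above):
#
#   config = LightRAGConfig.from_env(workspace="sa4", shared_storage=True)
#   assert config.workspace == "sa4" and config.shared_storage is True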
threegpp_ai/lightrag/metadata.py +0 −148
"""3GPP metadata enrichment for LightRAG document insertion.

This module provides schema-driven metadata enrichment that prepends
structured 3GPP metadata to document text before LightRAG insertion.
"""

from __future__ import annotations

from pydantic import BaseModel, Field
from tdoc_crawler.utils.normalization import (
    normalize_release_label,
    normalize_tdoc_id,
)


class RAGMetadata(BaseModel):
    """Structured metadata for 3GPP TDoc documents.

    This model defines the metadata contract for enriching documents
    before insertion into LightRAG. The metadata is prepended as a
    deterministic header to ensure consistent entity extraction.

    Attributes:
        tdoc_id: The TDoc identifier (required, e.g., "S4-250001")
        title: Document title (optional)
        source: Metadata source ("whatthespec", "portal", "doclist") (optional)
        spec_refs: List of referenced specifications (e.g., ["TS 26.444", "TR 26.999"])
        meeting: Meeting code (e.g., "SA4#131-bis") (optional)
        release: 3GPP release number (e.g., "Rel-18") (optional)
        wg: Working group (e.g., "SA4", "RAN1") (optional)

    Example:
        >>> metadata = RAGMetadata(
        ...     tdoc_id="S4-250001",
        ...     title="Test sequences for speech quality",
        ...     spec_refs=["TS 26.444"],
        ...     meeting="SA4#131-bis",
        ... )
        >>> print(metadata.tdoc_id)
        S4-250001
    """

    tdoc_id: str
    title: str | None = None
    source: str | None = None
    spec_refs: list[str] = Field(default_factory=list)
    meeting: str | None = None
    release: str | None = None
    wg: str | None = None

    def model_post_init(self, _context: object) -> None:
        """Normalize metadata after initialization."""
        # Normalize tdoc_id: uppercase and strip whitespace
        if self.tdoc_id:
            self.tdoc_id = normalize_tdoc_id(self.tdoc_id)

        # Normalize spec_refs: strip whitespace from each reference
        if self.spec_refs:
            self.spec_refs = [ref.strip() for ref in self.spec_refs if ref.strip()]

        # Normalize release to a consistent label for downstream headers.
        self.release = normalize_release_label(self.release)
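
# Illustrative effect of the normalization hooks (values hypothetical;
# normalize_tdoc_id upper-cases and strips, per create_metadata_from_dict below):
#   RAGMetadata(tdoc_id=" s4-250001 ").tdoc_id == "S4-250001"
#   RAGMetadata(tdoc_id="S4-1", spec_refs=[" TS 26.444 ", ""]).spec_refs == ["TS 26.444"]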


def enrich_text(metadata: RAGMetadata, text: str) -> str:
    r"""Prepend normalized metadata to document text for deterministic extraction.

    This function creates a structured header from the metadata and prepends
    it to the document text. The header format is deterministic to ensure
    consistent entity extraction by LightRAG's LLM.

    Args:
        metadata: The document metadata to prepend.
        text: The document text content.

    Returns:
        The enriched text with metadata header.

    Example:
        >>> metadata = RAGMetadata(
        ...     tdoc_id="S4-250001",
        ...     title="Test document",
        ...     spec_refs=["TS 26.444"],
        ... )
        >>> enriched = enrich_text(metadata, "Document content here...")
        >>> print(enriched.split("\n\n")[0])
        Document: S4-250001
        Title: Test document
        Related Specifications: TS 26.444
    """
    header_lines = [f"Document: {metadata.tdoc_id}"]

    if metadata.title:
        header_lines.append(f"Title: {metadata.title}")

    if metadata.source:
        header_lines.append(f"Source: {metadata.source}")

    if metadata.spec_refs:
        header_lines.append(f"Related Specifications: {', '.join(metadata.spec_refs)}")

    if metadata.meeting:
        header_lines.append(f"Meeting: {metadata.meeting}")

    if metadata.release:
        header_lines.append(f"Release: {metadata.release}")

    if metadata.wg:
        header_lines.append(f"Working Group: {metadata.wg}")

    header = "\n".join(header_lines)
    return f"{header}\n\n{text}"


def create_metadata_from_dict(data: dict) -> RAGMetadata:
    """Create RAGMetadata from a dictionary (e.g., from SQLite).

    This is a convenience function for creating metadata from database
    query results or other dictionary sources.

    Args:
        data: Dictionary with metadata fields.

    Returns:
        RAGMetadata instance with normalized values.

    Raises:
        ValueError: If required fields are missing.

    Example:
        >>> data = {
        ...     "tdoc_id": "s4-250001",
        ...     "title": "Test document",
        ...     "spec_refs": ["TS 26.444", "TR 26.999"],
        ... }
        >>> metadata = create_metadata_from_dict(data)
        >>> print(metadata.tdoc_id)
        S4-250001
    """
    # Enforce the documented contract: tdoc_id is required.
    tdoc_id = data.get("tdoc_id", "")
    if not tdoc_id:
        raise ValueError("tdoc_id is required")
    return RAGMetadata(
        tdoc_id=tdoc_id,
        title=data.get("title"),
        source=data.get("source"),
        spec_refs=data.get("spec_refs", []),
        meeting=data.get("meeting"),
        release=data.get("release"),
        wg=data.get("wg"),
    )