Commit 9423934a authored by Jan Reimes's avatar Jan Reimes

🔥 chore(3gpp-ai): remove LightRAG integration and dependencies

parent f8b08b2e
pyproject.toml +0 −2
@@ -18,8 +18,6 @@ dependencies = [
    "doc2txt>=1.0.8",
    #"doc2txt>=1.0.8 @ git+https://github.com/Quantatirsk/doc2txt-pypi.git"
    "litellm>=1.81.15",
    "lightrag-hku[offline]>=1.4.9.3",
    "pg0-embedded>=0.12.0",
    "pydantic-settings>=2.13.1",
    "liteparse>=1.2.0",
    "docling[vlm]>=2.82.0",
threegpp_ai/lightrag/__init__.py +0 −61
"""LightRAG integration for 3gpp-ai.

This package provides a thin wrapper around LightRAG with:
- Multi-provider LLM and embedding support (ollama, openai, zhipu, jina, hf, etc.)
- File-based or pg0-backed storage
- Async context manager pattern
- TDoc document processing with docling extraction

Supported providers:
- LLM: ollama, openai, zhipu, hf, lollms, azure_openai, nvidia_openai
- Embedding: ollama, openai, zhipu, jina, hf, siliconcloud

Example:
    >>> import asyncio
    >>> async def main():
    ...     async with TDocRAG() as rag:
    ...         await rag.insert("TDoc S4-250001 about TS 26.444")
    ...         result = await rag.query("What TDocs mention TS 26.444?")
    ...         print(result)
    >>> asyncio.run(main())
"""

from .config import (
    DatabaseConfig,
    EmbeddingConfig,
    LightRAGConfig,
    LLMConfig,
    QueryMode,
    StorageBackend,
)
from .metadata import RAGMetadata, create_metadata_from_dict, enrich_text
from .pg0_manager import Pg0Error, Pg0Manager
from .processor import DocumentProcessor, ProcessingResult, ProcessingResultStatus, TDocProcessor
from .rag import TDocRAG
from .seeder import EntitySeed, EntitySeeder, EntityType
from .shared_storage import SharedNanoVectorDBStorage, WorkspaceIndex, initialize_shared_storage

__all__ = [
    "DatabaseConfig",
    "DocumentProcessor",
    "EmbeddingConfig",
    "EntitySeed",
    "EntitySeeder",
    "EntityType",
    "LLMConfig",
    "LightRAGConfig",
    "Pg0Error",
    "Pg0Manager",
    "ProcessingResult",
    "ProcessingResultStatus",
    "QueryMode",
    "RAGMetadata",
    "SharedNanoVectorDBStorage",
    "StorageBackend",
    "TDocProcessor",
    "TDocRAG",
    "WorkspaceIndex",
    "create_metadata_from_dict",
    "enrich_text",
    "initialize_shared_storage",
]
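
# End-to-end sketch (assumptions: a reachable LLM/embedding backend and the
# re-exports above; mirrors the async context-manager example in the module
# docstring):
#
#   from threegpp_ai.lightrag import LightRAGConfig, TDocRAG
#
#   rag = TDocRAG(LightRAGConfig.from_env())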
+0 −129
"""CLI commands for LightRAG integration.

This module is deprecated; use `workspace query` and `workspace status` instead.

Note: These commands are now integrated into the main CLI under the `workspace` subcommand.
"""

from __future__ import annotations

import asyncio
import json
from typing import Annotated, Literal

import typer
from rich.console import Console

from threegpp_ai.lightrag.config import LightRAGConfig, QueryMode, StorageBackend
from threegpp_ai.lightrag.rag import TDocRAG

app = typer.Typer(name="rag", help="LightRAG knowledge graph commands (deprecated)")
console = Console()


@app.command("query")
def query_graph(
    query: Annotated[str, typer.Argument(help="Query string")],
    mode: Annotated[
        QueryMode,
        typer.Option(
            "--mode",
            "-m",
            case_sensitive=False,
            help=f"Query mode: {', '.join(m.value for m in QueryMode)}",
        ),
    ] = QueryMode.HYBRID,
    workspace: Annotated[
        str,
        typer.Option("--workspace", "-w", help="Workspace name"),
    ] = "default",
    output_format: Annotated[
        Literal["text", "json", "yaml"],
        typer.Option("--output-format", help="Output format: 'text' (default), 'json', or 'yaml'"),
    ] = "text",
) -> None:
    """Query the LightRAG knowledge graph.

    Uses LLM to synthesize an answer from the knowledge graph.
    """
    if not query:
        console.print("[red]Error: query is required[/red]")
        raise typer.Exit(1)

    async def _run() -> str | None:
        config = LightRAGConfig.from_env()
        rag = TDocRAG(config)
        await rag.start(workspace)
        try:
            result = await rag.query(query, mode=mode)
            return result
        finally:
            await rag.stop()

    result = asyncio.run(_run())

    if output_format == "json":
        typer.echo(json.dumps({"query": query, "mode": mode.value, "result": result}))
    else:
        console.print(f"\n[bold]Query:[/bold] {query}")
        console.print(f"[bold]Mode:[/bold] {mode.value}\n")
        if result:
            console.print(result)
        else:
            console.print("[yellow]No result returned[/yellow]")


@app.command("status")
def show_status(
    verbose: Annotated[
        bool,
        typer.Option("--verbose", "-v", help="Show full configuration"),
    ] = False,
) -> None:
    """Show LightRAG configuration and status."""
    config = LightRAGConfig.from_env()

    # Header
    console.print("\n[bold cyan]LightRAG Configuration[/bold cyan]")

    # Database backend
    backend = config.database.backend
    backend_icon = "🗄️" if backend == StorageBackend.PG0 else "📁"
    console.print(f"\n{backend_icon} [cyan]Storage backend:[/cyan] {backend.value}")

    if backend == StorageBackend.PG0:
        console.print(f"   Instance: [cyan]{config.database.pg0_instance_name}[/cyan]")
        console.print(f"   Port: [cyan]{config.database.pg0_port}[/cyan]")
        console.print(f"   Database: [cyan]{config.database.pg0_database}[/cyan]")
    else:
        console.print(f"   Working dir: [cyan]{config.working_dir}[/cyan]")

    # LLM
    console.print("\n🤖 [cyan]LLM:[/cyan]")
    console.print(f"   Model: [cyan]{config.llm.model}[/cyan]")
    console.print(f"   API Base: [cyan]{config.llm.api_base}[/cyan]")

    # Embedding
    console.print("\n🔢 [cyan]Embedding:[/cyan]")
    console.print(f"   Model: [cyan]{config.embedding.model}[/cyan]")
    console.print(f"   API Base: [cyan]{config.embedding.api_base}[/cyan]")

    # Query defaults
    console.print("\n🔍 [cyan]Query defaults:[/cyan]")
    console.print(f"   Mode: [cyan]{config.default_query_mode.value}[/cyan]")
    console.print(f"   Workspace: [cyan]{config.workspace}[/cyan]")

    # Shared storage status
    console.print(f"   Shared storage: [cyan]{'enabled' if config.shared_storage else 'disabled'}[/cyan]")

    if verbose:
        console.print("\n[bold]Full configuration:[/bold]")
        console.print(f"   working_dir: {config.working_dir}")
        console.print(f"   workspace: {config.workspace}")
        console.print(f"   env_prefix: {config.model_config.get('env_prefix', 'N/A')}")

    console.print()


if __name__ == "__main__":
    app()
threegpp_ai/lightrag/config.py +0 −273
"""Configuration for LightRAG integration.

This module defines configuration dataclasses for LightRAG with:
- Storage backend selection (file-based, pg0, etc.)
- LLM and embedding model settings with TDC_AI_* environment variable support
- Query mode options

All constants are defined at module level in CAPS.
All choice/option types use StrEnum.
"""

from __future__ import annotations

import os
from enum import StrEnum

from pydantic import Field
from pydantic_settings import BaseSettings, SettingsConfigDict

# =============================================================================
# Constants (defaults and allowed values)
# =============================================================================

# LLM defaults (read from TDC_AI_LLM_MODEL or use default)
DEFAULT_LLM_MODEL: str = os.getenv("TDC_AI_LLM_MODEL", "openrouter/openrouter/free")
DEFAULT_LLM_API_BASE: str = os.getenv("TDC_AI_LLM_API_BASE", "http://localhost:11434")

# Embedding defaults (read from TDC_AI_EMBEDDING_MODEL or use default)
DEFAULT_EMBEDDING_MODEL: str = os.getenv("TDC_AI_EMBEDDING_MODEL", "ollama/qwen3-embedding:0.6b")
DEFAULT_EMBEDDING_API_BASE: str = os.getenv("TDC_AI_EMBEDDING_API_BASE", "http://localhost:11434")

# Workspace default
DEFAULT_WORKSPACE: str = "default"

# pg0 defaults
DEFAULT_PG0_INSTANCE_NAME: str = "3gpp-crawler"
DEFAULT_PG0_PORT: int = 15432
DEFAULT_PG0_DATABASE: str = "tdoc"


# =============================================================================
# Helpers
# =============================================================================


def _env_bool(key: str, default: bool = True) -> bool:
    """Parse a boolean environment variable.

    Returns True for "1", "true", "yes", "on" (case-insensitive).
    Returns False for any other value.
    Returns default if the variable is not set.
    """
    value = os.getenv(key)
    if value is None:
        return default
    return value.lower() in {"1", "true", "yes", "on"}
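
# Illustrative behaviour (hypothetical environment values):
#   LIGHTRAG_EXTRACT_TABLES=off   -> _env_bool("LIGHTRAG_EXTRACT_TABLES") is False
#   LIGHTRAG_EXTRACT_TABLES=YES   -> True (matching is case-insensitive)
#   variable unset                -> the `default` argument is returned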


# =============================================================================
# Enums
# =============================================================================


class StorageBackend(StrEnum):
    """Supported storage backends for LightRAG."""

    FILE = "file"
    PG0 = "pg0"


class QueryMode(StrEnum):
    """Supported query modes for LightRAG queries."""

    NAIVE = "naive"
    LOCAL = "local"
    GLOBAL = "global"
    HYBRID = "hybrid"
    MIX = "mix"
    BYPASS = "bypass"


# =============================================================================
# Sub-configurations
# =============================================================================


class LLMConfig(BaseSettings):
    """LLM configuration for LightRAG.

    Supports <provider>/<model> syntax via TDC_AI_LLM_MODEL environment variable.
    Examples: openrouter/openrouter/free, ollama/qwen3:8b, anthropic/claude-3-sonnet
    """

    model: str = Field(
        default=DEFAULT_LLM_MODEL,
        description="LLM model name in <provider>/<model> format",
    )
    api_base: str = Field(
        default=DEFAULT_LLM_API_BASE,
        description="LLM API base URL",
    )
    api_key: str | None = Field(
        default=None,
        description="API key for cloud LLM providers (overrides TDC_AI_LLM_API_KEY)",
    )

    model_config = SettingsConfigDict(env_prefix="LIGHTRAG_LLM_")


class EmbeddingConfig(BaseSettings):
    """Embedding model configuration for LightRAG.

    Supports <provider>/<model> syntax via TDC_AI_EMBEDDING_MODEL environment variable.
    Examples: sentence-transformers/all-MiniLM-L6-v2, ollama/qwen3-embedding:0.6b
    """

    model: str = Field(
        default=DEFAULT_EMBEDDING_MODEL,
        description="Embedding model name in <provider>/<model> format",
    )
    api_base: str = Field(
        default=DEFAULT_EMBEDDING_API_BASE,
        description="Embedding API base URL",
    )
    api_key: str | None = Field(
        default=None,
        description="API key for cloud embedding providers (overrides TDC_AI_EMBEDDING_API_KEY)",
    )

    model_config = SettingsConfigDict(env_prefix="LIGHTRAG_EMBEDDING_")


class DatabaseConfig(BaseSettings):
    """Database/storage backend configuration for LightRAG.

    Currently supports:
    - FILE: File-based storage (NanoVectorDB, JsonKVStorage, NetworkX)
    - PG0: PostgreSQL via pg0 (requires pg0 to be fixed on Windows)

    LightRAG storage types:
    - KV storage: JsonKVStorage (file) / PGKVStorage (pg0)
    - Vector storage: NanoVectorDBStorage (file) / PGVectorStorage (pg0)
    - DocStatus storage: JsonDocStatusStorage (file) / PGDocStatusStorage (pg0)
    - Graph storage: NetworkXStorage (file) / PGGraphStorage (pg0, requires AGE)
    """

    backend: StorageBackend = Field(
        default=StorageBackend.FILE,
        description="Storage backend to use",
    )
    pg0_instance_name: str = Field(
        default=DEFAULT_PG0_INSTANCE_NAME,
        description="pg0 instance name",
    )
    pg0_port: int = Field(
        default=DEFAULT_PG0_PORT,
        description="pg0 PostgreSQL port",
    )
    pg0_database: str = Field(
        default=DEFAULT_PG0_DATABASE,
        description="pg0 database name",
    )

    model_config = SettingsConfigDict(env_prefix="LIGHTRAG_DB_")


# =============================================================================
# Main configuration
# =============================================================================


class LightRAGConfig(BaseSettings):
    """Main configuration for LightRAG integration.

    Uses file-based storage (NanoVectorDB, JsonKVStorage, NetworkX) by default.
    Set LIGHTRAG_DB_BACKEND=pg0 to use PostgreSQL via pg0.
    Set LIGHTRAG_SHARED_STORAGE=true to enable cross-workspace embedding deduplication.

    Reads from TDC_AI_* environment variables for compatibility with legacy AiConfig:
    - TDC_AI_LLM_MODEL: LLM model in <provider>/<model> format
    - TDC_AI_LLM_API_BASE: LLM API base URL
    - TDC_AI_LLM_API_KEY: LLM API key
    - TDC_AI_EMBEDDING_MODEL: Embedding model in <provider>/<model> format
    """

    llm: LLMConfig = Field(
        default_factory=LLMConfig,
        description="LLM configuration",
    )
    embedding: EmbeddingConfig = Field(
        default_factory=EmbeddingConfig,
        description="Embedding model configuration",
    )
    database: DatabaseConfig = Field(
        default_factory=DatabaseConfig,
        description="Storage backend configuration",
    )

    workspace: str = Field(
        default=DEFAULT_WORKSPACE,
        description="Default workspace name",
    )
    default_query_mode: QueryMode = Field(
        default=QueryMode.HYBRID,
        description="Default query mode",
    )
    shared_storage: bool = Field(
        default=False,
        description="Enable shared embedding storage across workspaces (deduplication). "
        "Note: Requires custom LightRAG integration - disable if using standard LightRAG.",
    )
    extract_tables: bool = Field(
        default=True,
        description="Enable extraction and indexing of table elements.",
    )
    extract_figures: bool = Field(
        default=True,
        description="Enable extraction and indexing of figure/image elements.",
    )
    extract_equations: bool = Field(
        default=True,
        description="Enable extraction and indexing of equation elements.",
    )
    figure_description_enabled: bool = Field(
        default=True,
        description="Enable optional figure description generation when vision-capable models are available.",
    )

    model_config = SettingsConfigDict(env_prefix="LIGHTRAG_")

    @classmethod
    def from_env(cls, **overrides) -> LightRAGConfig:
        """Create LightRAGConfig from TDC_AI_* environment variables.

        This method reads the legacy TDC_AI_* environment variables and maps them
        to LightRAG configuration, ensuring compatibility with the existing .env.example.

        Args:
            **overrides: Additional overrides that take precedence over env vars

        Returns:
            LightRAGConfig instance configured from environment
        """
        config_data: dict = {}

        # LLM config - pass through any set values, filter None
        llm_config = {
            "model": os.getenv("TDC_AI_LLM_MODEL"),
            "api_base": os.getenv("TDC_AI_LLM_API_BASE"),
            "api_key": os.getenv("TDC_AI_LLM_API_KEY"),
        }
        llm_config = {k: v for k, v in llm_config.items() if v is not None}
        if llm_config:
            config_data["llm"] = llm_config

        # Embedding config - pass through any set values, filter None
        embedding_config = {
            "model": os.getenv("TDC_AI_EMBEDDING_MODEL"),
            "api_base": os.getenv("TDC_AI_EMBEDDING_API_BASE"),
            "api_key": os.getenv("TDC_AI_EMBEDDING_API_KEY"),
        }
        embedding_config = {k: v for k, v in embedding_config.items() if v is not None}
        if embedding_config:
            config_data["embedding"] = embedding_config

        # Extraction toggles default to True
        config_data["extract_tables"] = _env_bool("LIGHTRAG_EXTRACT_TABLES", True)
        config_data["extract_figures"] = _env_bool("LIGHTRAG_EXTRACT_FIGURES", True)
        config_data["extract_equations"] = _env_bool("LIGHTRAG_EXTRACT_EQUATIONS", True)
        config_data["figure_description_enabled"] = _env_bool("LIGHTRAG_FIGURE_DESCRIPTION_ENABLED", True)

        config_data.update(overrides)
        return cls(**config_data)
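
# Sketch of typical use (assumes TDC_AI_* variables in the environment; keyword
# overrides take precedence, per the method above):
#
#   config = LightRAGConfig.from_env(workspace="sa4", shared_storage=True)
#   assert config.workspace == "sa4" and config.shared_storage is True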
threegpp_ai/lightrag/metadata.py +0 −148
"""3GPP metadata enrichment for LightRAG document insertion.

This module provides schema-driven metadata enrichment that prepends
structured 3GPP metadata to document text before LightRAG insertion.
"""

from __future__ import annotations

from pydantic import BaseModel, Field
from tdoc_crawler.utils.normalization import (
    normalize_release_label,
    normalize_tdoc_id,
)


class RAGMetadata(BaseModel):
    """Structured metadata for 3GPP TDoc documents.

    This model defines the metadata contract for enriching documents
    before insertion into LightRAG. The metadata is prepended as a
    deterministic header to ensure consistent entity extraction.

    Attributes:
        tdoc_id: The TDoc identifier (required, e.g., "S4-250001")
        title: Document title (optional)
        source: Metadata source ("whatthespec", "portal", "doclist") (optional)
        spec_refs: List of referenced specifications (e.g., ["TS 26.444", "TR 26.999"])
        meeting: Meeting code (e.g., "SA4#131-bis") (optional)
        release: 3GPP release number (e.g., "Rel-18") (optional)
        wg: Working group (e.g., "SA4", "RAN1") (optional)

    Example:
        >>> metadata = RAGMetadata(
        ...     tdoc_id="S4-250001",
        ...     title="Test sequences for speech quality",
        ...     spec_refs=["TS 26.444"],
        ...     meeting="SA4#131-bis",
        ... )
        >>> print(metadata.tdoc_id)
        S4-250001
    """

    tdoc_id: str
    title: str | None = None
    source: str | None = None
    spec_refs: list[str] = Field(default_factory=list)
    meeting: str | None = None
    release: str | None = None
    wg: str | None = None

    def model_post_init(self, _context: object) -> None:
        """Normalize metadata after initialization."""
        # Normalize tdoc_id: uppercase and strip whitespace
        if self.tdoc_id:
            self.tdoc_id = normalize_tdoc_id(self.tdoc_id)

        # Normalize spec_refs: strip whitespace from each reference
        if self.spec_refs:
            self.spec_refs = [ref.strip() for ref in self.spec_refs if ref.strip()]

        # Normalize release to a consistent label for downstream headers.
        self.release = normalize_release_label(self.release)
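
# Illustrative effect of the normalization hooks (values hypothetical;
# normalize_tdoc_id upper-cases and strips, per create_metadata_from_dict below):
#   RAGMetadata(tdoc_id=" s4-250001 ").tdoc_id == "S4-250001"
#   RAGMetadata(tdoc_id="S4-1", spec_refs=[" TS 26.444 ", ""]).spec_refs == ["TS 26.444"]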


def enrich_text(metadata: RAGMetadata, text: str) -> str:
    r"""Prepend normalized metadata to document text for deterministic extraction.

    This function creates a structured header from the metadata and prepends
    it to the document text. The header format is deterministic to ensure
    consistent entity extraction by LightRAG's LLM.

    Args:
        metadata: The document metadata to prepend.
        text: The document text content.

    Returns:
        The enriched text with metadata header.

    Example:
        >>> metadata = RAGMetadata(
        ...     tdoc_id="S4-250001",
        ...     title="Test document",
        ...     spec_refs=["TS 26.444"],
        ... )
        >>> enriched = enrich_text(metadata, "Document content here...")
        >>> print(enriched.split("\n\n")[0])
        Document: S4-250001
        Title: Test document
        Related Specifications: TS 26.444
    """
    header_lines = [f"Document: {metadata.tdoc_id}"]

    if metadata.title:
        header_lines.append(f"Title: {metadata.title}")

    if metadata.source:
        header_lines.append(f"Source: {metadata.source}")

    if metadata.spec_refs:
        header_lines.append(f"Related Specifications: {', '.join(metadata.spec_refs)}")

    if metadata.meeting:
        header_lines.append(f"Meeting: {metadata.meeting}")

    if metadata.release:
        header_lines.append(f"Release: {metadata.release}")

    if metadata.wg:
        header_lines.append(f"Working Group: {metadata.wg}")

    header = "\n".join(header_lines)
    return f"{header}\n\n{text}"


def create_metadata_from_dict(data: dict) -> RAGMetadata:
    """Create RAGMetadata from a dictionary (e.g., from SQLite).

    This is a convenience function for creating metadata from database
    query results or other dictionary sources.

    Args:
        data: Dictionary with metadata fields.

    Returns:
        RAGMetadata instance with normalized values.

    Raises:
        ValueError: If required fields are missing.

    Example:
        >>> data = {
        ...     "tdoc_id": "s4-250001",
        ...     "title": "Test document",
        ...     "spec_refs": ["TS 26.444", "TR 26.999"],
        ... }
        >>> metadata = create_metadata_from_dict(data)
        >>> print(metadata.tdoc_id)
        S4-250001
    """
    # Enforce the documented contract: tdoc_id is required.
    tdoc_id = data.get("tdoc_id", "")
    if not tdoc_id:
        raise ValueError("tdoc_id is required")
    return RAGMetadata(
        tdoc_id=tdoc_id,
        title=data.get("title"),
        source=data.get("source"),
        spec_refs=data.get("spec_refs", []),
        meeting=data.get("meeting"),
        release=data.get("release"),
        wg=data.get("wg"),
    )