Commit 15f2407f authored by Jan Reimes's avatar Jan Reimes
Browse files

refactor(ai): remove legacy AI processing pipeline and related tests

* Delete AiStorage class and its associated methods from storage.py.
* Update ai_cli tests to reflect removal of process, status, query, and graph commands.
* Remove legacy tests for the AI processing pipeline, retaining only as historical reference.
* Update workspace contract tests to reflect the removal of pipeline behaviors.
parent 76fba02a
Loading
Loading
Loading
Loading
+10 −20
Original line number Diff line number Diff line
@@ -77,15 +77,12 @@ TDC_VERBOSE=false
# Set to "true", "1", "yes", or "on" to enable; anything else disables it
# HTTP_CACHE_REFRESH_ON_ACCESS=true

# AI Configuration
# AI Configuration (LightRAG)
# Note: AI module requires API keys for cloud providers. See docs/ai.md for details.

# Path to AI LanceDB store (default: <cache_dir>/.ai/lancedb)
# TDC_AI_STORE_PATH=

# LLM model in format <provider>/<model_name>
# Recommended: openrouter/openrouter/free (free tier, no subscription required)
# API key: OPENROUTER_API_KEY
# API key: <provider-uppercase>_API_KEY (or set TDC_AI_LLM_API_KEY directly)
TDC_AI_LLM_MODEL=openrouter/openrouter/free

# Optional custom base URL for LLM provider/proxy
@@ -94,22 +91,11 @@ TDC_AI_LLM_MODEL=openrouter/openrouter/free
# Optional API key for LLM provider, will override default environment variable (e.g., OPENROUTER_API_KEY for OpenRouter)
# TDC_AI_LLM_API_KEY=

# Embedding model (HuggingFace sentence-transformers model ID)
# Default: sentence-transformers/all-MiniLM-L6-v2 (384 dimensions, popular and fast)
# For alternatives, see:
# https://huggingface.co/models?pipeline_tag=feature-extraction&num_parameters=min:0,max:3B&library=sentence-transformers&sort=trending
TDC_AI_EMBEDDING_MODEL=sentence-transformers/all-MiniLM-L6-v2

# Embedding backend for sentence-transformers (default: torch)
# Options: torch (default), onnx (faster inference), openvino (Intel hardware optimization)
# Can also be set via --accelerate/-a CLI option for ai process command
TDC_AI_EMBEDDING_BACKEND=torch
# Embedding model in format <provider>/<model_name>
# Recommended: ollama/qwen3-embedding:0.6b (self-hosted, no subscription required)
TDC_AI_EMBEDDING_MODEL=ollama/vuongnguyen2212/CodeRankEmbed:latest

# Activate workspace after creation (default: true)
# Set to "true", "1", or "yes" to enable; anything else disables it
TDC_AI_WORKSPACE_ACTIVATE=true

# Chunking
# Chunking parameters
TDC_AI_MAX_CHUNK_SIZE=1000
TDC_AI_CHUNK_OVERLAP=100

@@ -126,6 +112,10 @@ TDC_AI_PARALLELISM=4
# advanced: Use LLM to synthesize answer from graph + embeddings (GraphRAG)
TDC_GRAPH_QUERY_LEVEL=simple

# LightRAG-specific settings
# Enable shared embedding storage across workspaces (deduplication, default: true)
# TDC_LIGHTRAG_SHARED_STORAGE=true

# Note: Never commit actual .env file to version control!
# Copy this file to .env and replace placeholders with your actual credentials and preferences.

+12 −12
Original line number Diff line number Diff line
@@ -6,15 +6,15 @@ call .venv\scripts\activate.bat
:: tdoc-crawler crawl-tdocs --start-date 2016
:: tdoc-crawler query-tdocs --agenda "*atias*" --start-date 2018

tdoc-crawler ai workspace deactivate
tdoc-crawler ai workspace delete atias --no-preserve-artifacts
tdoc-crawler ai workspace create atias
:: tdoc-crawler ai workspace activate atias
tdoc-crawler ai workspace add-members 26131 26132 26260 26261 21905 --kind specs --release 19
tdoc-crawler ai workspace add-members 26260 26261 --kind specs --release 18.1.0
tdoc-crawler ai workspace add-members 26260 26261 --kind specs --release 18.0.0
tdoc-crawler ai workspace add-members 26260 26261 --kind specs --release 17
tdoc-crawler ai workspace add-members --kind tdocs --agenda "*atias*" --start-date 2017
tdoc-crawler ai workspace list-members
tdoc-crawler ai workspace process
tdoc-crawler ai query "Please summarize the evolution of test methods in all ATIAS work items between the releases, in particular focusing on IVAS-capable devices"
3gpp-ai workspace deactivate
3gpp-ai workspace delete atias --no-preserve-artifacts
3gpp-ai workspace create atias
:: 3gpp-ai workspace activate atias
3gpp-ai workspace add-members 26131 26132 26260 26261 21905 --kind specs --release 19
3gpp-ai workspace add-members 26260 --kind specs --release 18.1.0
3gpp-ai workspace add-members 26260 26261 --kind specs --release 18.0.0
3gpp-ai workspace add-members 26260 --kind specs --release 17
3gpp-ai workspace add-members --kind tdocs --agenda "*atias*" --start-date 2017
3gpp-ai workspace list-members
:: 3gpp-ai workspace process
:: 3gpp-ai query "Please summarize the evolution of test methods in all ATIAS work items between the releases, in particular focusing on IVAS-capable devices"
+107 −50
Original line number Diff line number Diff line
@@ -12,99 +12,156 @@ rg --files | tree-cli --fromfile

## Key Design Patterns

### LightRAG Integration (New)
### LightRAG Integration

The new pipeline uses LightRAG for knowledge graph construction:
The 3gpp-ai pipeline uses LightRAG for all document processing:

```python
from tdoc_ai import LightRAGConfig, TDocRAG, TDocProcessor

config = LightRAGConfig()
# Automatically reads TDC_AI_* environment variables
config = LightRAGConfig.from_env()
rag = TDocRAG(config)
await rag.start("my-workspace")
```

### Summarization (Legacy, Still Active)
### Main APIs

Uses LiteLLM directly for on-demand summaries:
Use `TDocRAG` for workspace-level retrieval and `TDocProcessor` for per-document ingestion.

## Configuration

### LightRAG

Reads from `TDC_AI_*` environment variables (see `.env.example`):

- `TDC_AI_LLM_MODEL` - LLM model in `<provider>/<model>` format (default: `openrouter/openrouter/free`)
- `TDC_AI_LLM_API_BASE` - Custom LLM API base URL (optional)
- `TDC_AI_LLM_API_KEY` - LLM API key (optional, overrides provider-specific env vars)
- `TDC_AI_EMBEDDING_MODEL` - Embedding model ID (default: `sentence-transformers/all-MiniLM-L6-v2`)

LightRAG-specific variables:

- `LIGHTRAG_SHARED_STORAGE` - Enable shared embedding storage (default: `true`)
- `LIGHTRAG_DB_BACKEND` - Storage backend: `file` or `pg0` (default: `file`)

### Path Management

**CRITICAL:** All file paths use `CacheManager` from `tdoc_crawler.config`:

```python
from tdoc_ai import summarize_document
from tdoc_crawler.config import resolve_cache_manager

summary = summarize_document("S4-250001", markdown_content)
manager = resolve_cache_manager()
manager.ai_cache_dir       # ~/.3gpp-crawler/lightrag/
manager.ai_embed_dir(model)  # ~/.3gpp-crawler/lightrag/{model}/
```

## Configuration
**NEVER hardcode paths** like `~/.3gpp-crawler` - see root `AGENTS.md` for the full CacheManager pattern.

### LightRAG (New)
## Single Source of Truth (SSOT) Principle

Environment-based via `LightRAGConfig`:
**Rule:** Every configuration value, constant, or shared resource must be defined **exactly once** and reused everywhere else.

- `LIGHTRAG_LLM_MODEL` - LLM model (default: `qwen3:8b`)
- `LIGHTRAG_EMBEDDING_MODEL` - Embedding model (default: `qwen3-embedding:0.6b`)
- `LIGHTRAG_WORKING_DIR` - Working directory (default: `~/.3gpp-crawler/lightrag`)
### What Must Follow SSOT

### Summarization (Legacy)
| Category | Source | Usage |
|----------|--------|-------|
| **Paths** | `CacheManager` | All file/directory paths |
| **API Keys** | Environment variables via `LightRAGConfig` | `config.llm.api_key`, `config.embedding.api_key` |
| **API Base URLs** | Environment variables via `LightRAGConfig` | `config.llm.api_base`, `config.embedding.api_base` |
| **Model Names** | Environment variables via `LightRAGConfig` | `config.llm.model`, `config.embedding.model` |
| **Provider Functions** | `PROVIDERS` registry in `rag.py` | Use `_get_provider(name)` - never inline |
| **Provider Aliases** | `PROVIDER_ALIASES` in `rag.py` | Central mapping (e.g., `zai` → `zhipu`) |
| **Embedding Dimensions** | `EMBEDDING_DIMENSIONS` in `rag.py` or provider config | Never hardcode dimension values |

Uses `AiConfig.from_env()`:
### Anti-Patterns (NEVER DO)

```python
# ❌ Hardcoded paths
Path.home() / ".3gpp-crawler" / "lightrag"

# ❌ Hardcoded API configuration
api_key = "sk-..."
api_base = "https://api.z.ai/..."

# ❌ Duplicated provider mapping
if provider == "ollama":
    func = ollama_model_complete
elif provider == "zhipu":
    func = zhipu_complete

# ❌ Hardcoded dimension values
if model == "qwen3":
    dim = 1024
```

- `TDC_AI_LLM_MODEL` - LLM model (default: `openrouter/openrouter/free`)
- `TDC_AI_LLM_API_KEY` - API key for LLM
### Correct Patterns (ALWAYS DO)

```python
# ✅ Paths via CacheManager
manager = resolve_cache_manager()
working_dir = manager.ai_embed_dir(model_name)

# ✅ Configuration via LightRAGConfig
config = LightRAGConfig.from_env()
api_key = config.llm.api_key

# ✅ Provider functions from registry
provider_config = _get_provider(provider_name)
func = provider_config.complete_func

# ✅ Dimensions from config or registry
dim = _get_embedding_dimension(model_name, provider)
```

### Why This Matters

1. **Maintainability**: Change once, update everywhere automatically
2. **Consistency**: No drift between different parts of the code
3. **Testability**: Easy to swap values in tests
4. **Security**: Secrets live in environment variables, not code
5. **DRY**: Eliminates duplicated logic and magic strings/numbers

## Storage Layer

### LightRAG (New)
### LightRAG

File-based storage by default:

- NanoVectorDB for embeddings
- JsonKVStorage for cache
- NanoVectorDB for embeddings (file-based)
- JsonKVStorage for cache (file-based)
- NetworkX for knowledge graph

Optionally use pg0 for PostgreSQL-backed storage.

### Legacy (Still Active)

AiStorage uses LanceDB for status tracking in extract/summarize pipelines.

## CLI Integration

Exposed via `3gpp-ai` commands:
Exposed via `tdoc-crawler` commands:

```bash
3gpp-ai rag query "your query"
3gpp-ai rag status
3gpp-ai summarize S4-250001
tdoc-crawler ai workspace process
tdoc-crawler ai rag query "your query"
tdoc-crawler ai rag status
```

## Import Guidelines

```python
# LightRAG integration (preferred)
from tdoc_ai import (
    LightRAGConfig,
    TDocRAG,
    TDocProcessor,
    RAGMetadata,
    enrich_text,
)

# Document operations (still used)
from tdoc_ai import convert_document, summarize_document

# Workspace management (still used)
from tdoc_ai import create_workspace, list_workspaces
from tdoc_ai import LightRAGConfig, TDocRAG, TDocProcessor
```

## Pipeline Stages (Legacy)

Order: **CLASSIFY** → **EXTRACT** → **EMBED** → **GRAPH**
## Extraction

LightRAG handles embedding and graph construction automatically.
LightRAG uses `kreuzberg` for text extraction before chunking and ingestion.

## Deprecated/Removed

- `create_embeddings_manager()` - Removed from public API
- `AiStorage` - Legacy, still used internally by extract/summarize
- `EmbeddingsManager` - Legacy, still used internally
- LanceDB-based storage - Replaced by LightRAG native storage
- `AiStorage`
- `EmbeddingsManager`
- `create_embeddings_manager()`
- `tdoc_ai.operations.pipeline` (legacy CLASSIFY/EXTRACT/EMBED/GRAPH flow)
- `tdoc_ai.storage.lancedb`
- `sentence-transformers`
- `tokenizers`
- `lancedb`
+1 −2
Original line number Diff line number Diff line
@@ -19,8 +19,7 @@ dependencies = [
    #"doc2txt>=1.0.8 @ git+https://github.com/Quantatirsk/doc2txt-pypi.git"
    "kreuzberg[all]>=4.0.0",
    "litellm>=1.81.15",
    "tokenizers>=0.22.2",
    "lightrag-hku[api]>=1.4.9.3",
    "lightrag-hku[offline]>=1.4.9.3",
    "pg0-embedded>=0.12.0",
    "pydantic-settings>=2.13.1",
]
+33 −21
Original line number Diff line number Diff line
@@ -40,20 +40,19 @@ class TestLLMConfig:
    def test_llm_config_defaults(self) -> None:
        """Test LLM config default values."""
        config = LLMConfig()
        assert config.provider == "ollama"
        assert config.model == "qwen3:8b"
        assert config.host == "http://localhost:11434"
        assert config.model == "openrouter/openrouter/free"
        assert config.api_base == "http://localhost:11434"
        assert config.api_key is None

    def test_llm_config_custom(self) -> None:
        """Test LLM config with custom values."""
        config = LLMConfig(
            provider="openai",
            model="gpt-4o-mini",
            model="openai/gpt-4o-mini",
            api_base="https://api.openai.com",
            api_key="sk-test123",
        )
        assert config.provider == "openai"
        assert config.model == "gpt-4o-mini"
        assert config.model == "openai/gpt-4o-mini"
        assert config.api_base == "https://api.openai.com"
        assert config.api_key == "sk-test123"


@@ -63,17 +62,20 @@ class TestEmbeddingConfig:
    def test_embedding_config_defaults(self) -> None:
        """Test embedding config default values."""
        config = EmbeddingConfig()
        assert config.model == "qwen3-embedding:0.6b"
        assert config.host == "http://localhost:11434"
        assert config.model == "ollama/qwen3-embedding:0.6b"
        assert config.api_base == "http://localhost:11434"
        assert config.api_key is None

    def test_embedding_config_custom(self) -> None:
        """Test embedding config with custom values."""
        config = EmbeddingConfig(
            model="text-embedding-3-small",
            host="https://api.openai.com",
            model="openai/text-embedding-3-small",
            api_base="https://api.openai.com",
            api_key="sk-test123",
        )
        assert config.model == "text-embedding-3-small"
        assert config.host == "https://api.openai.com"
        assert config.model == "openai/text-embedding-3-small"
        assert config.api_base == "https://api.openai.com"
        assert config.api_key == "sk-test123"


class TestDatabaseConfig:
@@ -103,17 +105,15 @@ class TestLightRAGConfig:
    def test_lightrag_config_defaults(self) -> None:
        """Test LightRAG config default values."""
        config = LightRAGConfig()
        assert config.llm.provider == "ollama"
        assert config.llm.model == "qwen3:8b"
        assert config.embedding.model == "qwen3-embedding:0.6b"
        assert config.llm.model == "openrouter/openrouter/free"
        assert config.embedding.model == "ollama/qwen3-embedding:0.6b"
        assert config.database.backend == StorageBackend.FILE
        assert config.working_dir.endswith(".3gpp-crawler/lightrag")
        assert config.default_query_mode == QueryMode.HYBRID

    def test_lightrag_config_with_subconfigs(self) -> None:
        """Test LightRAG config with sub-configs."""
        llm = LLMConfig(provider="openai", model="gpt-4o")
        embedding = EmbeddingConfig(model="text-embedding-3-small")
        llm = LLMConfig(model="openai/gpt-4o")
        embedding = EmbeddingConfig(model="openai/text-embedding-3-small")
        database = DatabaseConfig(backend=StorageBackend.PG0)

        config = LightRAGConfig(
@@ -121,8 +121,8 @@ class TestLightRAGConfig:
            embedding=embedding,
            database=database,
        )
        assert config.llm.provider == "openai"
        assert config.embedding.model == "text-embedding-3-small"
        assert config.llm.model == "openai/gpt-4o"
        assert config.embedding.model == "openai/text-embedding-3-small"
        assert config.database.backend == StorageBackend.PG0

    def test_lightrag_config_env_prefix(self) -> None:
@@ -134,3 +134,15 @@ class TestLightRAGConfig:
            assert config.working_dir == "/custom/path"
        finally:
            del os.environ["LIGHTRAG_WORKING_DIR"]

    def test_lightrag_config_from_env(self) -> None:
        """Test LightRAGConfig.from_env() with TDC_AI_* variables."""
        os.environ["TDC_AI_LLM_MODEL"] = "openai/gpt-4o"
        os.environ["TDC_AI_EMBEDDING_MODEL"] = "openai/text-embedding-3-small"
        try:
            config = LightRAGConfig.from_env()
            assert config.llm.model == "openai/gpt-4o"
            assert config.embedding.model == "openai/text-embedding-3-small"
        finally:
            del os.environ["TDC_AI_LLM_MODEL"]
            del os.environ["TDC_AI_EMBEDDING_MODEL"]
Loading