Commit 15f2407f authored by Jan Reimes's avatar Jan Reimes
Browse files

refactor(ai): remove legacy AI processing pipeline and related tests

* Delete AiStorage class and its associated methods from storage.py.
* Update ai_cli tests to reflect removal of process, status, query, and graph commands.
* Remove legacy tests for the AI processing pipeline, retaining only as historical reference.
* Update workspace contract tests to reflect the removal of pipeline behaviors.
parent 76fba02a
Loading
Loading
Loading
Loading
+10 −20
Original line number Diff line number Diff line
@@ -77,15 +77,12 @@ TDC_VERBOSE=false
# Set to "true", "1", "yes", or "on" to enable; anything else disables it
# HTTP_CACHE_REFRESH_ON_ACCESS=true

# AI Configuration
# AI Configuration (LightRAG)
# Note: AI module requires API keys for cloud providers. See docs/ai.md for details.

# Path to AI LanceDB store (default: <cache_dir>/.ai/lancedb)
# TDC_AI_STORE_PATH=

# LLM model in format <provider>/<model_name>
# Recommended: openrouter/openrouter/free (free tier, no subscription required)
# API key: OPENROUTER_API_KEY
# API key: <provider-uppercase>_API_KEY (or set TDC_AI_LLM_API_KEY directly)
TDC_AI_LLM_MODEL=openrouter/openrouter/free

# Optional custom base URL for LLM provider/proxy
@@ -94,22 +91,11 @@ TDC_AI_LLM_MODEL=openrouter/openrouter/free
# Optional API key for LLM provider, will override default environment variable (e.g., OPENROUTER_API_KEY for OpenRouter)
# TDC_AI_LLM_API_KEY=

# Embedding model (HuggingFace sentence-transformers model ID)
# Default: sentence-transformers/all-MiniLM-L6-v2 (384 dimensions, popular and fast)
# For alternatives, see:
# https://huggingface.co/models?pipeline_tag=feature-extraction&num_parameters=min:0,max:3B&library=sentence-transformers&sort=trending
TDC_AI_EMBEDDING_MODEL=sentence-transformers/all-MiniLM-L6-v2

# Embedding backend for sentence-transformers (default: torch)
# Options: torch (default), onnx (faster inference), openvino (Intel hardware optimization)
# Can also be set via --accelerate/-a CLI option for ai process command
TDC_AI_EMBEDDING_BACKEND=torch
# Embedding model in format <provider>/<model_name>
# Recommended: ollama/qwen3-embedding:0.6b (self-hosted, no subscription required)
TDC_AI_EMBEDDING_MODEL=ollama/vuongnguyen2212/CodeRankEmbed:latest

# Activate workspace after creation (default: true)
# Set to "true", "1", or "yes" to enable; anything else disables it
TDC_AI_WORKSPACE_ACTIVATE=true

# Chunking
# Chunking parameters
TDC_AI_MAX_CHUNK_SIZE=1000
TDC_AI_CHUNK_OVERLAP=100

@@ -126,6 +112,10 @@ TDC_AI_PARALLELISM=4
# advanced: Use LLM to synthesize answer from graph + embeddings (GraphRAG)
TDC_GRAPH_QUERY_LEVEL=simple

# LightRAG-specific settings
# Enable shared embedding storage across workspaces (deduplication, default: true)
# TDC_LIGHTRAG_SHARED_STORAGE=true

# Note: Never commit actual .env file to version control!
# Copy this file to .env and replace placeholders with your actual credentials and preferences.

+12 −12
Original line number Diff line number Diff line
@@ -6,15 +6,15 @@ call .venv\scripts\activate.bat
:: tdoc-crawler crawl-tdocs --start-date 2016
:: tdoc-crawler query-tdocs --agenda "*atias*" --start-date 2018

tdoc-crawler ai workspace deactivate
tdoc-crawler ai workspace delete atias --no-preserve-artifacts
tdoc-crawler ai workspace create atias
:: tdoc-crawler ai workspace activate atias
tdoc-crawler ai workspace add-members 26131 26132 26260 26261 21905 --kind specs --release 19
tdoc-crawler ai workspace add-members 26260 26261 --kind specs --release 18.1.0
tdoc-crawler ai workspace add-members 26260 26261 --kind specs --release 18.0.0
tdoc-crawler ai workspace add-members 26260 26261 --kind specs --release 17
tdoc-crawler ai workspace add-members --kind tdocs --agenda "*atias*" --start-date 2017
tdoc-crawler ai workspace list-members
tdoc-crawler ai workspace process
tdoc-crawler ai query "Please summarize the evolution of test methods in all ATIAS work items between the releases, in particular focusing on IVAS-capable devices"
3gpp-ai workspace deactivate
3gpp-ai workspace delete atias --no-preserve-artifacts
3gpp-ai workspace create atias
:: 3gpp-ai workspace activate atias
3gpp-ai workspace add-members 26131 26132 26260 26261 21905 --kind specs --release 19
3gpp-ai workspace add-members 26260 --kind specs --release 18.1.0
3gpp-ai workspace add-members 26260 26261 --kind specs --release 18.0.0
3gpp-ai workspace add-members 26260 --kind specs --release 17
3gpp-ai workspace add-members --kind tdocs --agenda "*atias*" --start-date 2017
3gpp-ai workspace list-members
:: 3gpp-ai workspace process
:: 3gpp-ai query "Please summarize the evolution of test methods in all ATIAS work items between the releases, in particular focusing on IVAS-capable devices"
+107 −50
Original line number Diff line number Diff line
@@ -12,99 +12,156 @@ rg --files | tree-cli --fromfile

## Key Design Patterns

### LightRAG Integration (New)
### LightRAG Integration

The new pipeline uses LightRAG for knowledge graph construction:
The 3gpp-ai pipeline uses LightRAG for all document processing:

```python
from tdoc_ai import LightRAGConfig, TDocRAG, TDocProcessor

config = LightRAGConfig()
# Automatically reads TDC_AI_* environment variables
config = LightRAGConfig.from_env()
rag = TDocRAG(config)
await rag.start("my-workspace")
```

### Summarization (Legacy, Still Active)
### Main APIs

Uses LiteLLM directly for on-demand summaries:
Use `TDocRAG` for workspace-level retrieval and `TDocProcessor` for per-document ingestion.

## Configuration

### LightRAG

Reads from `TDC_AI_*` environment variables (see `.env.example`):

- `TDC_AI_LLM_MODEL` - LLM model in `<provider>/<model>` format (default: `openrouter/openrouter/free`)
- `TDC_AI_LLM_API_BASE` - Custom LLM API base URL (optional)
- `TDC_AI_LLM_API_KEY` - LLM API key (optional, overrides provider-specific env vars)
- `TDC_AI_EMBEDDING_MODEL` - Embedding model ID (default: `sentence-transformers/all-MiniLM-L6-v2`)

LightRAG-specific variables:

- `LIGHTRAG_SHARED_STORAGE` - Enable shared embedding storage (default: `true`)
- `LIGHTRAG_DB_BACKEND` - Storage backend: `file` or `pg0` (default: `file`)

### Path Management

**CRITICAL:** All file paths use `CacheManager` from `tdoc_crawler.config`:

```python
from tdoc_ai import summarize_document
from tdoc_crawler.config import resolve_cache_manager

summary = summarize_document("S4-250001", markdown_content)
manager = resolve_cache_manager()
manager.ai_cache_dir       # ~/.3gpp-crawler/lightrag/
manager.ai_embed_dir(model)  # ~/.3gpp-crawler/lightrag/{model}/
```

## Configuration
**NEVER hardcode paths** like `~/.3gpp-crawler` - see root `AGENTS.md` for the full CacheManager pattern.

### LightRAG (New)
## Single Source of Truth (SSOT) Principle

Environment-based via `LightRAGConfig`:
**Rule:** Every configuration value, constant, or shared resource must be defined **exactly once** and reused everywhere else.

- `LIGHTRAG_LLM_MODEL` - LLM model (default: `qwen3:8b`)
- `LIGHTRAG_EMBEDDING_MODEL` - Embedding model (default: `qwen3-embedding:0.6b`)
- `LIGHTRAG_WORKING_DIR` - Working directory (default: `~/.3gpp-crawler/lightrag`)
### What Must Follow SSOT

### Summarization (Legacy)
| Category | Source | Usage |
|----------|--------|-------|
| **Paths** | `CacheManager` | All file/directory paths |
| **API Keys** | Environment variables via `LightRAGConfig` | `config.llm.api_key`, `config.embedding.api_key` |
| **API Base URLs** | Environment variables via `LightRAGConfig` | `config.llm.api_base`, `config.embedding.api_base` |
| **Model Names** | Environment variables via `LightRAGConfig` | `config.llm.model`, `config.embedding.model` |
| **Provider Functions** | `PROVIDERS` registry in `rag.py` | Use `_get_provider(name)` - never inline |
| **Provider Aliases** | `PROVIDER_ALIASES` in `rag.py` | Central mapping (e.g., `zai` → `zhipu`) |
| **Embedding Dimensions** | `EMBEDDING_DIMENSIONS` in `rag.py` or provider config | Never hardcode dimension values |

Uses `AiConfig.from_env()`:
### Anti-Patterns (NEVER DO)

```python
# ❌ Hardcoded paths
Path.home() / ".3gpp-crawler" / "lightrag"

# ❌ Hardcoded API configuration
api_key = "sk-..."
api_base = "https://api.z.ai/..."

# ❌ Duplicated provider mapping
if provider == "ollama":
    func = ollama_model_complete
elif provider == "zhipu":
    func = zhipu_complete

# ❌ Hardcoded dimension values
if model == "qwen3":
    dim = 1024
```

- `TDC_AI_LLM_MODEL` - LLM model (default: `openrouter/openrouter/free`)
- `TDC_AI_LLM_API_KEY` - API key for LLM
### Correct Patterns (ALWAYS DO)

```python
# ✅ Paths via CacheManager
manager = resolve_cache_manager()
working_dir = manager.ai_embed_dir(model_name)

# ✅ Configuration via LightRAGConfig
config = LightRAGConfig.from_env()
api_key = config.llm.api_key

# ✅ Provider functions from registry
provider_config = _get_provider(provider_name)
func = provider_config.complete_func

# ✅ Dimensions from config or registry
dim = _get_embedding_dimension(model_name, provider)
```

### Why This Matters

1. **Maintainability**: Change once, update everywhere automatically
2. **Consistency**: No drift between different parts of the code
3. **Testability**: Easy to swap values in tests
4. **Security**: Secrets live in environment variables, not code
5. **DRY**: Eliminates duplicated logic and magic strings/numbers

## Storage Layer

### LightRAG (New)
### LightRAG

File-based storage by default:

- NanoVectorDB for embeddings
- JsonKVStorage for cache
- NanoVectorDB for embeddings (file-based)
- JsonKVStorage for cache (file-based)
- NetworkX for knowledge graph

Optionally use pg0 for PostgreSQL-backed storage.

### Legacy (Still Active)

AiStorage uses LanceDB for status tracking in extract/summarize pipelines.

## CLI Integration

Exposed via `3gpp-ai` commands:
Exposed via `tdoc-crawler` commands:

```bash
3gpp-ai rag query "your query"
3gpp-ai rag status
3gpp-ai summarize S4-250001
tdoc-crawler ai workspace process
tdoc-crawler ai rag query "your query"
tdoc-crawler ai rag status
```

## Import Guidelines

```python
# LightRAG integration (preferred)
from tdoc_ai import (
    LightRAGConfig,
    TDocRAG,
    TDocProcessor,
    RAGMetadata,
    enrich_text,
)

# Document operations (still used)
from tdoc_ai import convert_document, summarize_document

# Workspace management (still used)
from tdoc_ai import create_workspace, list_workspaces
from tdoc_ai import LightRAGConfig, TDocRAG, TDocProcessor
```

## Pipeline Stages (Legacy)

Order: **CLASSIFY** → **EXTRACT** → **EMBED** → **GRAPH**
## Extraction

LightRAG handles embedding and graph construction automatically.
LightRAG uses `kreuzberg` for text extraction before chunking and ingestion.

## Deprecated/Removed

- `create_embeddings_manager()` - Removed from public API
- `AiStorage` - Legacy, still used internally by extract/summarize
- `EmbeddingsManager` - Legacy, still used internally
- LanceDB-based storage - Replaced by LightRAG native storage
- `AiStorage`
- `EmbeddingsManager`
- `create_embeddings_manager()`
- `tdoc_ai.operations.pipeline` (legacy CLASSIFY/EXTRACT/EMBED/GRAPH flow)
- `tdoc_ai.storage.lancedb`
- `sentence-transformers`
- `tokenizers`
- `lancedb`
+1 −2
Original line number Diff line number Diff line
@@ -19,8 +19,7 @@ dependencies = [
    #"doc2txt>=1.0.8 @ git+https://github.com/Quantatirsk/doc2txt-pypi.git"
    "kreuzberg[all]>=4.0.0",
    "litellm>=1.81.15",
    "tokenizers>=0.22.2",
    "lightrag-hku[api]>=1.4.9.3",
    "lightrag-hku[offline]>=1.4.9.3",
    "pg0-embedded>=0.12.0",
    "pydantic-settings>=2.13.1",
]
+33 −21
Original line number Diff line number Diff line
@@ -40,20 +40,19 @@ class TestLLMConfig:
    def test_llm_config_defaults(self) -> None:
        """Test LLM config default values."""
        config = LLMConfig()
        assert config.provider == "ollama"
        assert config.model == "qwen3:8b"
        assert config.host == "http://localhost:11434"
        assert config.model == "openrouter/openrouter/free"
        assert config.api_base == "http://localhost:11434"
        assert config.api_key is None

    def test_llm_config_custom(self) -> None:
        """Test LLM config with custom values."""
        config = LLMConfig(
            provider="openai",
            model="gpt-4o-mini",
            model="openai/gpt-4o-mini",
            api_base="https://api.openai.com",
            api_key="sk-test123",
        )
        assert config.provider == "openai"
        assert config.model == "gpt-4o-mini"
        assert config.model == "openai/gpt-4o-mini"
        assert config.api_base == "https://api.openai.com"
        assert config.api_key == "sk-test123"


@@ -63,17 +62,20 @@ class TestEmbeddingConfig:
    def test_embedding_config_defaults(self) -> None:
        """Test embedding config default values."""
        config = EmbeddingConfig()
        assert config.model == "qwen3-embedding:0.6b"
        assert config.host == "http://localhost:11434"
        assert config.model == "ollama/qwen3-embedding:0.6b"
        assert config.api_base == "http://localhost:11434"
        assert config.api_key is None

    def test_embedding_config_custom(self) -> None:
        """Test embedding config with custom values."""
        config = EmbeddingConfig(
            model="text-embedding-3-small",
            host="https://api.openai.com",
            model="openai/text-embedding-3-small",
            api_base="https://api.openai.com",
            api_key="sk-test123",
        )
        assert config.model == "text-embedding-3-small"
        assert config.host == "https://api.openai.com"
        assert config.model == "openai/text-embedding-3-small"
        assert config.api_base == "https://api.openai.com"
        assert config.api_key == "sk-test123"


class TestDatabaseConfig:
@@ -103,17 +105,15 @@ class TestLightRAGConfig:
    def test_lightrag_config_defaults(self) -> None:
        """Test LightRAG config default values."""
        config = LightRAGConfig()
        assert config.llm.provider == "ollama"
        assert config.llm.model == "qwen3:8b"
        assert config.embedding.model == "qwen3-embedding:0.6b"
        assert config.llm.model == "openrouter/openrouter/free"
        assert config.embedding.model == "ollama/qwen3-embedding:0.6b"
        assert config.database.backend == StorageBackend.FILE
        assert config.working_dir.endswith(".3gpp-crawler/lightrag")
        assert config.default_query_mode == QueryMode.HYBRID

    def test_lightrag_config_with_subconfigs(self) -> None:
        """Test LightRAG config with sub-configs."""
        llm = LLMConfig(provider="openai", model="gpt-4o")
        embedding = EmbeddingConfig(model="text-embedding-3-small")
        llm = LLMConfig(model="openai/gpt-4o")
        embedding = EmbeddingConfig(model="openai/text-embedding-3-small")
        database = DatabaseConfig(backend=StorageBackend.PG0)

        config = LightRAGConfig(
@@ -121,8 +121,8 @@ class TestLightRAGConfig:
            embedding=embedding,
            database=database,
        )
        assert config.llm.provider == "openai"
        assert config.embedding.model == "text-embedding-3-small"
        assert config.llm.model == "openai/gpt-4o"
        assert config.embedding.model == "openai/text-embedding-3-small"
        assert config.database.backend == StorageBackend.PG0

    def test_lightrag_config_env_prefix(self) -> None:
@@ -134,3 +134,15 @@ class TestLightRAGConfig:
            assert config.working_dir == "/custom/path"
        finally:
            del os.environ["LIGHTRAG_WORKING_DIR"]

    def test_lightrag_config_from_env(self) -> None:
        """Test LightRAGConfig.from_env() with TDC_AI_* variables."""
        os.environ["TDC_AI_LLM_MODEL"] = "openai/gpt-4o"
        os.environ["TDC_AI_EMBEDDING_MODEL"] = "openai/text-embedding-3-small"
        try:
            config = LightRAGConfig.from_env()
            assert config.llm.model == "openai/gpt-4o"
            assert config.embedding.model == "openai/text-embedding-3-small"
        finally:
            del os.environ["TDC_AI_LLM_MODEL"]
            del os.environ["TDC_AI_EMBEDDING_MODEL"]
Loading