Commit 95448077 authored by Jan Reimes's avatar Jan Reimes
Browse files

chore(ai): remove legacy top-level tdoc-ai directory (moved under src/)

parent 90da54db
Loading
Loading
Loading
Loading

tdoc-ai/README.md

deleted 100644 → 0
+0 −17
Original line number Diff line number Diff line
# tdoc-ai

Optional AI extension package for `tdoc-crawler`.

This package contains AI-focused capabilities including:

- Document extraction and conversion
- Summarization
- Embeddings and semantic search
- GraphRAG querying
- AI workspace management

Install via `tdoc-crawler` extras:

```bash
uv add "tdoc-crawler[ai]"
```

tdoc-ai/pyproject.toml

deleted 100644 → 0
+0 −30
Original line number Diff line number Diff line
[project]
name = "tdoc-ai"
version = "0.1.0"
description = "Optional AI/RAG extension package for tdoc-crawler"
authors = [{ name = "Jan Reimes", email = "jan.reimes@head-acoustics.com" }]
readme = "README.md"
keywords = ["python", "3gpp", "rag", "ai"]
requires-python = ">=3.14,<4.0"
classifiers = [
    "Intended Audience :: Developers",
    "Programming Language :: Python",
    "Programming Language :: Python :: 3",
    "Programming Language :: Python :: 3.14",
    "Topic :: Software Development :: Libraries :: Python Modules",
]
dependencies = [
    "doc2txt>=1.0.8",
    "kreuzberg[all]>=4.0.0",
    "lancedb>=0.29.2",
    "litellm>=1.81.15",
    "sentence-transformers[openvino]>=2.7.0",
    "tokenizers>=0.22.2",
]

[project.urls]
Repository = "https://forge.3gpp.org/rep/reimes/tdoc-crawler"

[build-system]
requires = ["hatchling"]
build-backend = "hatchling.build"

tdoc-ai/tdoc_ai/__init__.py

deleted 100644 → 0
+0 −100
Original line number Diff line number Diff line
"""AI document processing domain package."""

from __future__ import annotations

import litellm

from tdoc_ai.config import AiConfig
from tdoc_ai.container import AiServiceContainer
from tdoc_ai.models import (
    DocumentChunk,
    DocumentClassification,
    DocumentSummary,
    GraphEdge,
    GraphNode,
    PipelineStage,
    ProcessingStatus,
)
from tdoc_ai.operations.convert import convert_tdoc as convert_document
from tdoc_ai.operations.graph import query_graph
from tdoc_ai.operations.pipeline import get_status, process_all
from tdoc_ai.operations.pipeline import process_tdoc as process_document
from tdoc_ai.operations.summarize import SummarizeResult
from tdoc_ai.operations.summarize import summarize_tdoc as summarize_document
from tdoc_ai.operations.workspace_registry import (
    DEFAULT_WORKSPACE,
    WorkspaceDisplayInfo,
    WorkspaceRegistry,
    get_active_workspace,
    set_active_workspace,
)
from tdoc_ai.operations.workspaces import (
    add_workspace_members,
    checkout_spec_to_workspace,
    checkout_tdoc_to_workspace,
    create_workspace,
    delete_workspace,
    ensure_ai_subfolder,
    ensure_default_workspace,
    get_workspace,
    get_workspace_member_counts,
    is_default_workspace,
    list_workspace_members,
    list_workspaces,
    make_workspace_member,
    normalize_workspace_name,
    remove_invalid_members,
    resolve_tdoc_checkout_path,
    resolve_workspace,
)
from tdoc_ai.storage import AiStorage
from tdoc_crawler.config import CacheManager

litellm.suppress_debug_info = True  # Suppress provider/model info logs from litellm

process_tdoc = process_document


__all__ = [
    "DEFAULT_WORKSPACE",
    "AiConfig",
    "AiServiceContainer",
    "AiStorage",
    "CacheManager",
    "DocumentChunk",
    "DocumentClassification",
    "DocumentSummary",
    "GraphEdge",
    "GraphNode",
    "PipelineStage",
    "ProcessingStatus",
    "SummarizeResult",
    "WorkspaceDisplayInfo",
    "WorkspaceRegistry",
    "add_workspace_members",
    "checkout_spec_to_workspace",
    "checkout_tdoc_to_workspace",
    "convert_document",
    "create_workspace",
    "delete_workspace",
    "ensure_ai_subfolder",
    "ensure_default_workspace",
    "get_active_workspace",
    "get_status",
    "get_workspace",
    "get_workspace_member_counts",
    "is_default_workspace",
    "list_workspace_members",
    "list_workspaces",
    "make_workspace_member",
    "normalize_workspace_name",
    "process_all",
    "process_tdoc",
    "process_document",
    "query_graph",
    "remove_invalid_members",
    "resolve_tdoc_checkout_path",
    "resolve_workspace",
    "set_active_workspace",
    "summarize_document",
]

tdoc-ai/tdoc_ai/config.py

deleted 100644 → 0
+0 −176
Original line number Diff line number Diff line
"""Configuration for the AI document processing pipeline."""

from __future__ import annotations

import os
from pathlib import Path

import litellm
from pydantic import Field, field_validator, model_validator

from tdoc_crawler.config import resolve_cache_manager
from tdoc_crawler.models import BaseConfigModel

DEFAULT_EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
DEFAULT_LLM_MODEL = "openrouter/openrouter/free"


def _env_int(name: str) -> int | None:
    value = os.getenv(name)
    if value is None or value == "":
        return None
    return int(value)


def _validate_model_identifier(value: str, field_name: str) -> str:
    if "/" not in value:
        msg = f"{field_name} must be in '<provider>/<model_name>' format"
        raise ValueError(msg)

    provider, model_name = value.split("/", 1)
    provider_normalized = provider.strip().lower()
    model_name_normalized = model_name.strip()

    if not provider_normalized:
        msg = f"{field_name} provider segment cannot be empty"
        raise ValueError(msg)
    if not model_name_normalized:
        msg = f"{field_name} model_name segment cannot be empty"
        raise ValueError(msg)

    supported_providers = set(litellm.LITELLM_CHAT_PROVIDERS + litellm.openai_compatible_providers)

    if provider_normalized not in supported_providers:
        msg = (
            f"{field_name} provider '{provider}' is not supported by litellm. "
            f"See https://docs.litellm.ai/docs/providers for the full list of {len(supported_providers)} supported providers."
        )
        raise ValueError(msg)

    return f"{provider_normalized}/{model_name_normalized}"


def _validate_embedding_model_format(value: str) -> str:
    """Validate embedding model - accepts any HuggingFace-style model ID.

    Unlike LLM models, embedding models via sentence-transformers don't require
    LiteLLM provider validation. Accepts formats like:
    - sentence-transformers/all-MiniLM-L6-v2
    - perplexity-ai/pplx-embed-v1-0.6b
    """
    if "/" not in value:
        msg = "embedding_model must be in '<provider>/<model_name>' format"
        raise ValueError(msg)

    provider, model_name = value.split("/", 1)
    provider_normalized = provider.strip().lower()
    model_name_normalized = model_name.strip()

    if not provider_normalized:
        msg = "embedding_model provider segment cannot be empty"
        raise ValueError(msg)
    if not model_name_normalized:
        msg = "embedding_model model_name segment cannot be empty"
        raise ValueError(msg)

    return f"{provider_normalized}/{model_name_normalized}"


class AiConfig(BaseConfigModel):
    """Configuration for the AI processing pipeline."""

    ai_cache_dir: Path | None = Field(None, description="Path to AI cache directory")

    embedding_model: str = Field(
        DEFAULT_EMBEDDING_MODEL,
        description="Embedding model in <provider>/<model_name> format",
    )
    max_chunk_size: int = Field(1000, ge=1, description="Max tokens per chunk")
    chunk_overlap: int = Field(100, ge=0, description="Token overlap between chunks")

    llm_model: str = Field(
        DEFAULT_LLM_MODEL,
        description="LLM model in <provider>/<model_name> format",
    )
    llm_api_base: str | None = Field(None, description="Override LLM API base URL")

    abstract_min_words: int = Field(150, ge=1, description="Minimum abstract word count")
    abstract_max_words: int = Field(250, ge=1, description="Maximum abstract word count")
    parallelism: int = Field(4, ge=1, le=32, description="Concurrent TDoc processing")

    @classmethod
    def from_env(cls, **overrides: str | int | Path | None) -> AiConfig:
        """Create config from environment variables."""
        data: dict[str, str | int | Path | None] = {}

        # Set cache_manager_name for use in _resolve_paths validator
        if cache_manager_name := overrides.get("cache_manager_name"):
            data["cache_manager_name"] = cache_manager_name

        # NOTE: ai_cache_dir is NOT set here - it will be resolved in _resolve_paths
        # validator using ai_embed_dir(embedding_model) to include provider/model subdirectory

        if embedding_model := os.getenv("TDC_AI_EMBEDDING_MODEL"):
            data["embedding_model"] = embedding_model
        if llm_model := os.getenv("TDC_AI_LLM_MODEL"):
            data["llm_model"] = llm_model
        if llm_api_base := os.getenv("TDC_AI_LLM_API_BASE"):
            data["llm_api_base"] = llm_api_base

        max_chunk_size = _env_int("TDC_AI_MAX_CHUNK_SIZE")
        if max_chunk_size is not None:
            data["max_chunk_size"] = max_chunk_size

        chunk_overlap = _env_int("TDC_AI_CHUNK_OVERLAP")
        if chunk_overlap is not None:
            data["chunk_overlap"] = chunk_overlap

        abstract_min_words = _env_int("TDC_AI_ABSTRACT_MIN_WORDS")
        if abstract_min_words is not None:
            data["abstract_min_words"] = abstract_min_words

        abstract_max_words = _env_int("TDC_AI_ABSTRACT_MAX_WORDS")
        if abstract_max_words is not None:
            data["abstract_max_words"] = abstract_max_words

        parallelism = _env_int("TDC_AI_PARALLELISM")
        if parallelism is not None:
            data["parallelism"] = parallelism

        data.update(overrides)
        # Filter out None values to let defaults apply
        filtered_data = {k: v for k, v in data.items() if v is not None}
        return cls(**filtered_data)

    @model_validator(mode="after")
    def _resolve_paths(self) -> AiConfig:
        if self.ai_cache_dir is None:
            # Use CacheManager to resolve the embedding directory
            # e.g., ~/.tdoc-crawler/.ai/sentence-transformers/all-MiniLM-L6-v2
            # The ai_embed_dir method handles the provider/model subdirectory structure
            self.ai_cache_dir = resolve_cache_manager(self.cache_manager_name).ai_embed_dir(self.embedding_model)

        return self

    @model_validator(mode="after")
    def _validate_bounds(self) -> AiConfig:
        if self.abstract_max_words < self.abstract_min_words:
            msg = "abstract_max_words must be >= abstract_min_words"
            raise ValueError(msg)
        if self.chunk_overlap >= self.max_chunk_size:
            msg = "chunk_overlap must be less than max_chunk_size"
            raise ValueError(msg)
        return self

    @field_validator("embedding_model")
    @classmethod
    def _validate_embedding_model(cls, value: str) -> str:
        return _validate_embedding_model_format(value)

    @field_validator("llm_model")
    @classmethod
    def _validate_llm_model(cls, value: str) -> str:
        return _validate_model_identifier(value, "llm_model")


__all__ = ["AiConfig"]

tdoc-ai/tdoc_ai/container.py

deleted 100644 → 0
+0 −215
Original line number Diff line number Diff line
"""AI Service Container - Singleton for AI module dependencies.

This module provides a centralized container for AI services (AiConfig, AiStorage,
EmbeddingsManager) following the Dependency Injection patterns defined in
specs/001-di-refactoring-plan/.

The container implements lazy initialization and singleton pattern to ensure:
- Single LanceDB connection per session
- Correct cache path including provider/model subdirectory
- Easy testing through dependency injection
"""

from __future__ import annotations

from typing import Any

from sentence_transformers import SentenceTransformer

from tdoc_ai.config import AiConfig
from tdoc_ai.operations.embeddings import EmbeddingsManager
from tdoc_ai.storage import AiStorage


class AiServiceContainer:
    """
    Singleton container for AI services.

    Provides centralized access to AiConfig, AiStorage, and EmbeddingsManager
    with lazy initialization. This ensures single instantiation and correct
    cache path resolution.

    Usage:
        # Get the singleton instance
        container = AiServiceContainer.get_instance()

        # Get services (lazy initialized)
        config = container.get_config()
        storage = container.get_storage()
        embeddings = container.get_embeddings_manager()

        # Or use convenience method
        storage = container.get_ai_storage()
    """

    _instance: AiServiceContainer | None = None
    _config: AiConfig | None = None
    _storage: AiStorage | None = None
    _embeddings_manager: EmbeddingsManager | None = None

    def __new__(cls) -> AiServiceContainer:
        """Ensure singleton pattern."""
        if cls._instance is None:
            cls._instance = super().__new__(cls)
            # Initialize instance attributes
            cls._instance._config = None
            cls._instance._storage = None
            cls._instance._embeddings_manager = None
        return cls._instance

    def get_config(self) -> AiConfig:
        """Get the AI configuration singleton.

        Loads configuration from environment variables using AiConfig.from_env().

        Returns:
            AiConfig singleton instance.
        """
        if self._config is None:
            self._config = AiConfig.from_env()
        return self._config

    def get_embeddings_manager(self) -> EmbeddingsManager:
        """Get the embeddings manager singleton.

        Creates EmbeddingsManager with the shared config and storage.
        Note: Storage must be initialized before calling this method.

        Returns:
            EmbeddingsManager singleton instance.
        """
        if self._embeddings_manager is None:
            config = self.get_config()
            storage = self.get_storage()
            self._embeddings_manager = EmbeddingsManager(config=config, storage=storage)
        return self._embeddings_manager

    def get_storage(self) -> AiStorage:
        """Get the AI storage singleton.

        Creates AiStorage with the correct cache path (including provider/model
        subdirectory) obtained from AiConfig.

        Returns:
            AiStorage singleton instance.
        """
        if self._storage is None:
            config = self.get_config()
            # Load dimension directly from model to avoid circular dependency
            # with get_embeddings_manager() which requires storage
            dimension = self._load_embedding_dimension()
            # AiConfig.ai_cache_dir already includes the provider/model subdirectory
            # when embedding_model is set (see config.py lines 148-152)
            self._storage = AiStorage(config.ai_cache_dir, embedding_dimension=dimension)
        return self._storage

    # Aliases for compatibility with main ServiceContainer design
    def get_ai_config(self) -> AiConfig:
        """Alias for get_config() - compatibility with main ServiceContainer."""
        return self.get_config()

    def get_ai_storage(self) -> AiStorage:
        """Alias for get_storage() - compatibility with main ServiceContainer."""
        return self.get_storage()

    def get_embeddings(self) -> EmbeddingsManager:
        """Alias for get_embeddings_manager() - compatibility with main ServiceContainer."""
        return self.get_embeddings_manager()

    def close(self) -> None:
        """Close the container and release resources.

        Resets all singleton instances. Safe to call multiple times.
        """
        self._storage = None
        self._embeddings_manager = None
        self._config = None

    def __enter__(self) -> AiServiceContainer:
        """Context manager entry."""
        return self

    def __exit__(self, exc_type: Any, exc_val: Any, exc_tb: Any) -> None:
        """Context manager exit - ensures cleanup."""
        self.close()

    @classmethod
    def reset_instance(cls) -> None:
        """Reset the singleton instance.

        This is primarily used for testing to ensure each test starts
        with a fresh container. After calling this method, the next
        call to get_instance() will create a new container instance.
        """
        cls._instance = None

    @classmethod
    def get_instance(cls) -> AiServiceContainer:
        """Get the singleton container instance.

        Returns:
            AiServiceContainer singleton instance.
        """
        return cls()

    @classmethod
    def reset_for_testing(cls) -> None:
        """Reset the singleton for testing purposes.

        WARNING: Only use in tests, not in production code.
        """
        cls._instance = None

    def _load_embedding_dimension(self) -> int:
        """Load the embedding dimension from the configured model.

        This is a helper method to avoid circular dependencies between
        get_storage() and get_embeddings_manager().

        Returns:
            The embedding dimension for the configured model.
        """
        config = self.get_config()

        model = SentenceTransformer(config.embedding_model)
        dimension = model.get_sentence_embedding_dimension()
        if dimension is None:
            raise RuntimeError(f"Model '{config.embedding_model}' did not report an embedding dimension")
        return dimension


# Convenience functions for backward compatibility
def get_ai_config() -> AiConfig:
    """Get AI configuration singleton.

    Returns:
        AiConfig singleton instance.
    """
    return AiServiceContainer.get_instance().get_config()


def get_ai_storage() -> AiStorage:
    """Get AI storage singleton.

    Returns:
        AiStorage singleton instance.
    """
    return AiServiceContainer.get_instance().get_storage()


def get_embeddings_manager() -> EmbeddingsManager:
    """Get embeddings manager singleton.

    Returns:
        EmbeddingsManager singleton instance.
    """
    return AiServiceContainer.get_instance().get_embeddings_manager()


__all__ = [
    "AiServiceContainer",
    "get_ai_config",
    "get_ai_storage",
    "get_embeddings_manager",
]
Loading