Commit 95448077 authored by Jan Reimes's avatar Jan Reimes
Browse files

chore(ai): remove legacy top-level tdoc-ai directory (moved under src/)

parent 90da54db
Loading
Loading
Loading
Loading

tdoc-ai/README.md

deleted 100644 → 0
+0 −17
Original line number Diff line number Diff line
# tdoc-ai

Optional AI extension package for `tdoc-crawler`.

This package contains AI-focused capabilities including:

- Document extraction and conversion
- Summarization
- Embeddings and semantic search
- GraphRAG querying
- AI workspace management

Install via `tdoc-crawler` extras:

```bash
uv add "tdoc-crawler[ai]"
```

tdoc-ai/pyproject.toml

deleted 100644 → 0
+0 −30
Original line number Diff line number Diff line
[project]
name = "tdoc-ai"
version = "0.1.0"
description = "Optional AI/RAG extension package for tdoc-crawler"
authors = [{ name = "Jan Reimes", email = "jan.reimes@head-acoustics.com" }]
readme = "README.md"
keywords = ["python", "3gpp", "rag", "ai"]
requires-python = ">=3.14,<4.0"
classifiers = [
    "Intended Audience :: Developers",
    "Programming Language :: Python",
    "Programming Language :: Python :: 3",
    "Programming Language :: Python :: 3.14",
    "Topic :: Software Development :: Libraries :: Python Modules",
]
dependencies = [
    "doc2txt>=1.0.8",
    "kreuzberg[all]>=4.0.0",
    "lancedb>=0.29.2",
    "litellm>=1.81.15",
    "sentence-transformers[openvino]>=2.7.0",
    "tokenizers>=0.22.2",
]

[project.urls]
Repository = "https://forge.3gpp.org/rep/reimes/tdoc-crawler"

[build-system]
requires = ["hatchling"]
build-backend = "hatchling.build"

tdoc-ai/tdoc_ai/__init__.py

deleted 100644 → 0
+0 −100
Original line number Diff line number Diff line
"""AI document processing domain package."""

from __future__ import annotations

import litellm

from tdoc_ai.config import AiConfig
from tdoc_ai.container import AiServiceContainer
from tdoc_ai.models import (
    DocumentChunk,
    DocumentClassification,
    DocumentSummary,
    GraphEdge,
    GraphNode,
    PipelineStage,
    ProcessingStatus,
)
from tdoc_ai.operations.convert import convert_tdoc as convert_document
from tdoc_ai.operations.graph import query_graph
from tdoc_ai.operations.pipeline import get_status, process_all
from tdoc_ai.operations.pipeline import process_tdoc as process_document
from tdoc_ai.operations.summarize import SummarizeResult
from tdoc_ai.operations.summarize import summarize_tdoc as summarize_document
from tdoc_ai.operations.workspace_registry import (
    DEFAULT_WORKSPACE,
    WorkspaceDisplayInfo,
    WorkspaceRegistry,
    get_active_workspace,
    set_active_workspace,
)
from tdoc_ai.operations.workspaces import (
    add_workspace_members,
    checkout_spec_to_workspace,
    checkout_tdoc_to_workspace,
    create_workspace,
    delete_workspace,
    ensure_ai_subfolder,
    ensure_default_workspace,
    get_workspace,
    get_workspace_member_counts,
    is_default_workspace,
    list_workspace_members,
    list_workspaces,
    make_workspace_member,
    normalize_workspace_name,
    remove_invalid_members,
    resolve_tdoc_checkout_path,
    resolve_workspace,
)
from tdoc_ai.storage import AiStorage
from tdoc_crawler.config import CacheManager

litellm.suppress_debug_info = True  # Suppress provider/model info logs from litellm

process_tdoc = process_document


__all__ = [
    "DEFAULT_WORKSPACE",
    "AiConfig",
    "AiServiceContainer",
    "AiStorage",
    "CacheManager",
    "DocumentChunk",
    "DocumentClassification",
    "DocumentSummary",
    "GraphEdge",
    "GraphNode",
    "PipelineStage",
    "ProcessingStatus",
    "SummarizeResult",
    "WorkspaceDisplayInfo",
    "WorkspaceRegistry",
    "add_workspace_members",
    "checkout_spec_to_workspace",
    "checkout_tdoc_to_workspace",
    "convert_document",
    "create_workspace",
    "delete_workspace",
    "ensure_ai_subfolder",
    "ensure_default_workspace",
    "get_active_workspace",
    "get_status",
    "get_workspace",
    "get_workspace_member_counts",
    "is_default_workspace",
    "list_workspace_members",
    "list_workspaces",
    "make_workspace_member",
    "normalize_workspace_name",
    "process_all",
    "process_tdoc",
    "process_document",
    "query_graph",
    "remove_invalid_members",
    "resolve_tdoc_checkout_path",
    "resolve_workspace",
    "set_active_workspace",
    "summarize_document",
]

tdoc-ai/tdoc_ai/config.py

deleted 100644 → 0
+0 −176
Original line number Diff line number Diff line
"""Configuration for the AI document processing pipeline."""

from __future__ import annotations

import os
from pathlib import Path

import litellm
from pydantic import Field, field_validator, model_validator

from tdoc_crawler.config import resolve_cache_manager
from tdoc_crawler.models import BaseConfigModel

DEFAULT_EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
DEFAULT_LLM_MODEL = "openrouter/openrouter/free"


def _env_int(name: str) -> int | None:
    value = os.getenv(name)
    if value is None or value == "":
        return None
    return int(value)


def _validate_model_identifier(value: str, field_name: str) -> str:
    if "/" not in value:
        msg = f"{field_name} must be in '<provider>/<model_name>' format"
        raise ValueError(msg)

    provider, model_name = value.split("/", 1)
    provider_normalized = provider.strip().lower()
    model_name_normalized = model_name.strip()

    if not provider_normalized:
        msg = f"{field_name} provider segment cannot be empty"
        raise ValueError(msg)
    if not model_name_normalized:
        msg = f"{field_name} model_name segment cannot be empty"
        raise ValueError(msg)

    supported_providers = set(litellm.LITELLM_CHAT_PROVIDERS + litellm.openai_compatible_providers)

    if provider_normalized not in supported_providers:
        msg = (
            f"{field_name} provider '{provider}' is not supported by litellm. "
            f"See https://docs.litellm.ai/docs/providers for the full list of {len(supported_providers)} supported providers."
        )
        raise ValueError(msg)

    return f"{provider_normalized}/{model_name_normalized}"


def _validate_embedding_model_format(value: str) -> str:
    """Validate embedding model - accepts any HuggingFace-style model ID.

    Unlike LLM models, embedding models via sentence-transformers don't require
    LiteLLM provider validation. Accepts formats like:
    - sentence-transformers/all-MiniLM-L6-v2
    - perplexity-ai/pplx-embed-v1-0.6b
    """
    if "/" not in value:
        msg = "embedding_model must be in '<provider>/<model_name>' format"
        raise ValueError(msg)

    provider, model_name = value.split("/", 1)
    provider_normalized = provider.strip().lower()
    model_name_normalized = model_name.strip()

    if not provider_normalized:
        msg = "embedding_model provider segment cannot be empty"
        raise ValueError(msg)
    if not model_name_normalized:
        msg = "embedding_model model_name segment cannot be empty"
        raise ValueError(msg)

    return f"{provider_normalized}/{model_name_normalized}"


class AiConfig(BaseConfigModel):
    """Configuration for the AI processing pipeline."""

    ai_cache_dir: Path | None = Field(None, description="Path to AI cache directory")

    embedding_model: str = Field(
        DEFAULT_EMBEDDING_MODEL,
        description="Embedding model in <provider>/<model_name> format",
    )
    max_chunk_size: int = Field(1000, ge=1, description="Max tokens per chunk")
    chunk_overlap: int = Field(100, ge=0, description="Token overlap between chunks")

    llm_model: str = Field(
        DEFAULT_LLM_MODEL,
        description="LLM model in <provider>/<model_name> format",
    )
    llm_api_base: str | None = Field(None, description="Override LLM API base URL")

    abstract_min_words: int = Field(150, ge=1, description="Minimum abstract word count")
    abstract_max_words: int = Field(250, ge=1, description="Maximum abstract word count")
    parallelism: int = Field(4, ge=1, le=32, description="Concurrent TDoc processing")

    @classmethod
    def from_env(cls, **overrides: str | int | Path | None) -> AiConfig:
        """Create config from environment variables."""
        data: dict[str, str | int | Path | None] = {}

        # Set cache_manager_name for use in _resolve_paths validator
        if cache_manager_name := overrides.get("cache_manager_name"):
            data["cache_manager_name"] = cache_manager_name

        # NOTE: ai_cache_dir is NOT set here - it will be resolved in _resolve_paths
        # validator using ai_embed_dir(embedding_model) to include provider/model subdirectory

        if embedding_model := os.getenv("TDC_AI_EMBEDDING_MODEL"):
            data["embedding_model"] = embedding_model
        if llm_model := os.getenv("TDC_AI_LLM_MODEL"):
            data["llm_model"] = llm_model
        if llm_api_base := os.getenv("TDC_AI_LLM_API_BASE"):
            data["llm_api_base"] = llm_api_base

        max_chunk_size = _env_int("TDC_AI_MAX_CHUNK_SIZE")
        if max_chunk_size is not None:
            data["max_chunk_size"] = max_chunk_size

        chunk_overlap = _env_int("TDC_AI_CHUNK_OVERLAP")
        if chunk_overlap is not None:
            data["chunk_overlap"] = chunk_overlap

        abstract_min_words = _env_int("TDC_AI_ABSTRACT_MIN_WORDS")
        if abstract_min_words is not None:
            data["abstract_min_words"] = abstract_min_words

        abstract_max_words = _env_int("TDC_AI_ABSTRACT_MAX_WORDS")
        if abstract_max_words is not None:
            data["abstract_max_words"] = abstract_max_words

        parallelism = _env_int("TDC_AI_PARALLELISM")
        if parallelism is not None:
            data["parallelism"] = parallelism

        data.update(overrides)
        # Filter out None values to let defaults apply
        filtered_data = {k: v for k, v in data.items() if v is not None}
        return cls(**filtered_data)

    @model_validator(mode="after")
    def _resolve_paths(self) -> AiConfig:
        if self.ai_cache_dir is None:
            # Use CacheManager to resolve the embedding directory
            # e.g., ~/.tdoc-crawler/.ai/sentence-transformers/all-MiniLM-L6-v2
            # The ai_embed_dir method handles the provider/model subdirectory structure
            self.ai_cache_dir = resolve_cache_manager(self.cache_manager_name).ai_embed_dir(self.embedding_model)

        return self

    @model_validator(mode="after")
    def _validate_bounds(self) -> AiConfig:
        if self.abstract_max_words < self.abstract_min_words:
            msg = "abstract_max_words must be >= abstract_min_words"
            raise ValueError(msg)
        if self.chunk_overlap >= self.max_chunk_size:
            msg = "chunk_overlap must be less than max_chunk_size"
            raise ValueError(msg)
        return self

    @field_validator("embedding_model")
    @classmethod
    def _validate_embedding_model(cls, value: str) -> str:
        return _validate_embedding_model_format(value)

    @field_validator("llm_model")
    @classmethod
    def _validate_llm_model(cls, value: str) -> str:
        return _validate_model_identifier(value, "llm_model")


__all__ = ["AiConfig"]

tdoc-ai/tdoc_ai/container.py

deleted 100644 → 0
+0 −215
Original line number Diff line number Diff line
"""AI Service Container - Singleton for AI module dependencies.

This module provides a centralized container for AI services (AiConfig, AiStorage,
EmbeddingsManager) following the Dependency Injection patterns defined in
specs/001-di-refactoring-plan/.

The container implements lazy initialization and singleton pattern to ensure:
- Single LanceDB connection per session
- Correct cache path including provider/model subdirectory
- Easy testing through dependency injection
"""

from __future__ import annotations

from typing import Any

from sentence_transformers import SentenceTransformer

from tdoc_ai.config import AiConfig
from tdoc_ai.operations.embeddings import EmbeddingsManager
from tdoc_ai.storage import AiStorage


class AiServiceContainer:
    """
    Singleton container for AI services.

    Provides centralized access to AiConfig, AiStorage, and EmbeddingsManager
    with lazy initialization. This ensures single instantiation and correct
    cache path resolution.

    Usage:
        # Get the singleton instance
        container = AiServiceContainer.get_instance()

        # Get services (lazy initialized)
        config = container.get_config()
        storage = container.get_storage()
        embeddings = container.get_embeddings_manager()

        # Or use convenience method
        storage = container.get_ai_storage()
    """

    _instance: AiServiceContainer | None = None
    _config: AiConfig | None = None
    _storage: AiStorage | None = None
    _embeddings_manager: EmbeddingsManager | None = None

    def __new__(cls) -> AiServiceContainer:
        """Ensure singleton pattern."""
        if cls._instance is None:
            cls._instance = super().__new__(cls)
            # Initialize instance attributes
            cls._instance._config = None
            cls._instance._storage = None
            cls._instance._embeddings_manager = None
        return cls._instance

    def get_config(self) -> AiConfig:
        """Get the AI configuration singleton.

        Loads configuration from environment variables using AiConfig.from_env().

        Returns:
            AiConfig singleton instance.
        """
        if self._config is None:
            self._config = AiConfig.from_env()
        return self._config

    def get_embeddings_manager(self) -> EmbeddingsManager:
        """Get the embeddings manager singleton.

        Creates EmbeddingsManager with the shared config and storage.
        Note: Storage must be initialized before calling this method.

        Returns:
            EmbeddingsManager singleton instance.
        """
        if self._embeddings_manager is None:
            config = self.get_config()
            storage = self.get_storage()
            self._embeddings_manager = EmbeddingsManager(config=config, storage=storage)
        return self._embeddings_manager

    def get_storage(self) -> AiStorage:
        """Get the AI storage singleton.

        Creates AiStorage with the correct cache path (including provider/model
        subdirectory) obtained from AiConfig.

        Returns:
            AiStorage singleton instance.
        """
        if self._storage is None:
            config = self.get_config()
            # Load dimension directly from model to avoid circular dependency
            # with get_embeddings_manager() which requires storage
            dimension = self._load_embedding_dimension()
            # AiConfig.ai_cache_dir already includes the provider/model subdirectory
            # when embedding_model is set (see config.py lines 148-152)
            self._storage = AiStorage(config.ai_cache_dir, embedding_dimension=dimension)
        return self._storage

    # Aliases for compatibility with main ServiceContainer design
    def get_ai_config(self) -> AiConfig:
        """Alias for get_config() - compatibility with main ServiceContainer."""
        return self.get_config()

    def get_ai_storage(self) -> AiStorage:
        """Alias for get_storage() - compatibility with main ServiceContainer."""
        return self.get_storage()

    def get_embeddings(self) -> EmbeddingsManager:
        """Alias for get_embeddings_manager() - compatibility with main ServiceContainer."""
        return self.get_embeddings_manager()

    def close(self) -> None:
        """Close the container and release resources.

        Resets all singleton instances. Safe to call multiple times.
        """
        self._storage = None
        self._embeddings_manager = None
        self._config = None

    def __enter__(self) -> AiServiceContainer:
        """Context manager entry."""
        return self

    def __exit__(self, exc_type: Any, exc_val: Any, exc_tb: Any) -> None:
        """Context manager exit - ensures cleanup."""
        self.close()

    @classmethod
    def reset_instance(cls) -> None:
        """Reset the singleton instance.

        This is primarily used for testing to ensure each test starts
        with a fresh container. After calling this method, the next
        call to get_instance() will create a new container instance.
        """
        cls._instance = None

    @classmethod
    def get_instance(cls) -> AiServiceContainer:
        """Get the singleton container instance.

        Returns:
            AiServiceContainer singleton instance.
        """
        return cls()

    @classmethod
    def reset_for_testing(cls) -> None:
        """Reset the singleton for testing purposes.

        WARNING: Only use in tests, not in production code.
        """
        cls._instance = None

    def _load_embedding_dimension(self) -> int:
        """Load the embedding dimension from the configured model.

        This is a helper method to avoid circular dependencies between
        get_storage() and get_embeddings_manager().

        Returns:
            The embedding dimension for the configured model.
        """
        config = self.get_config()

        model = SentenceTransformer(config.embedding_model)
        dimension = model.get_sentence_embedding_dimension()
        if dimension is None:
            raise RuntimeError(f"Model '{config.embedding_model}' did not report an embedding dimension")
        return dimension


# Convenience functions for backward compatibility
def get_ai_config() -> AiConfig:
    """Get AI configuration singleton.

    Returns:
        AiConfig singleton instance.
    """
    return AiServiceContainer.get_instance().get_config()


def get_ai_storage() -> AiStorage:
    """Get AI storage singleton.

    Returns:
        AiStorage singleton instance.
    """
    return AiServiceContainer.get_instance().get_storage()


def get_embeddings_manager() -> EmbeddingsManager:
    """Get embeddings manager singleton.

    Returns:
        EmbeddingsManager singleton instance.
    """
    return AiServiceContainer.get_instance().get_embeddings_manager()


__all__ = [
    "AiServiceContainer",
    "get_ai_config",
    "get_ai_storage",
    "get_embeddings_manager",
]
Loading