Commit c7c8ca7e authored by Jan Reimes's avatar Jan Reimes
Browse files

refactor(ai): rename ai_store_path to ai_cache_dir and update usages

- Changed all references from ai_store_path to ai_cache_dir in the codebase.
- Delegated ai_cache_dir resolution to the CacheManager (via resolve_cache_manager().ai_embed_dir), which groups stores by embedding provider/model.
- Adjusted workspace management functions to utilize the new cache directory.
- Maintained backward-compatible behavior (same resolved paths and error handling) while making the cache-directory naming clearer.
parent 17b00b9d
Loading
Loading
Loading
Loading
+2 −1
Original line number Diff line number Diff line
@@ -21,7 +21,7 @@ dependencies = [
    "brotli>=1.2.0",
    "hishel>=1.1.8",
    "lxml>=6.0.2",
    "pandas>=2.3.3",
    "pandas<3.0.0",
    "pydantic>=2.12.2",
    "pydantic-sqlite>=0.4.0",
    "python-calamine>=0.5.3",
@@ -38,6 +38,7 @@ dependencies = [
    "lancedb>=0.29.2",
    "litellm>=1.81.15",
    "sentence-transformers>=2.7.0",
    "tokenizers>=0.22.2",
]

[project.urls]
+9 −3
Original line number Diff line number Diff line
@@ -24,8 +24,14 @@ from tdoc_crawler.ai.operations.pipeline import process_tdoc as _pipeline_proces
from tdoc_crawler.ai.operations.pipeline import process_tdoc as process_document
from tdoc_crawler.ai.operations.summarize import SummarizeResult
from tdoc_crawler.ai.operations.summarize import summarize_tdoc as summarize_document
from tdoc_crawler.ai.operations.workspaces import (
from tdoc_crawler.ai.operations.workspace_registry import (
    DEFAULT_WORKSPACE,
    WorkspaceDisplayInfo,
    WorkspaceRegistry,
    get_active_workspace,
    set_active_workspace,
)
from tdoc_crawler.ai.operations.workspaces import (
    add_workspace_members,
    checkout_spec_to_workspace,
    checkout_tdoc_to_workspace,
@@ -33,7 +39,6 @@ from tdoc_crawler.ai.operations.workspaces import (
    delete_workspace,
    ensure_ai_subfolder,
    ensure_default_workspace,
    get_active_workspace,
    get_workspace,
    is_default_workspace,
    list_workspaces,
@@ -41,7 +46,6 @@ from tdoc_crawler.ai.operations.workspaces import (
    normalize_workspace_name,
    resolve_tdoc_checkout_path,
    resolve_workspace,
    set_active_workspace,
)
from tdoc_crawler.ai.storage import AiStorage
from tdoc_crawler.config import CacheManager
@@ -122,6 +126,8 @@ __all__ = [
    "PipelineStage",
    "ProcessingStatus",
    "SummarizeResult",
    "WorkspaceDisplayInfo",
    "WorkspaceRegistry",
    "add_workspace_members",
    "checkout_spec_to_workspace",
    "checkout_tdoc_to_workspace",
+14 −7
Original line number Diff line number Diff line
@@ -8,6 +8,7 @@ from pathlib import Path
import litellm
from pydantic import Field, field_validator, model_validator

from tdoc_crawler.config import resolve_cache_manager
from tdoc_crawler.models import BaseConfigModel

DEFAULT_EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
@@ -78,7 +79,7 @@ def _validate_embedding_model_format(value: str) -> str:
class AiConfig(BaseConfigModel):
    """Configuration for the AI processing pipeline."""

    ai_store_path: Path | None = Field(None, description="Path to AI LanceDB store")
    ai_cache_dir: Path | None = Field(None, description="Path to AI LanceDB store")

    embedding_model: str = Field(
        DEFAULT_EMBEDDING_MODEL,
@@ -102,8 +103,13 @@ class AiConfig(BaseConfigModel):
        """Create config from environment variables."""
        data: dict[str, str | int | Path | None] = {}

        if store_path := os.getenv("TDC_AI_STORE_PATH"):
            data["ai_store_path"] = Path(store_path)
        # Set cache_manager_name for use in _resolve_paths validator
        if cache_manager_name := overrides.get("cache_manager_name"):
            data["cache_manager_name"] = cache_manager_name

        # NOTE: ai_cache_dir is NOT set here - it will be resolved in _resolve_paths
        # validator using ai_embed_dir(embedding_model) to include provider/model subdirectory

        if embedding_model := os.getenv("TDC_AI_EMBEDDING_MODEL"):
            data["embedding_model"] = embedding_model
        if llm_model := os.getenv("TDC_AI_LLM_MODEL"):
@@ -138,11 +144,12 @@ class AiConfig(BaseConfigModel):

    @model_validator(mode="after")
    def _resolve_paths(self) -> AiConfig:
        if self.ai_store_path is None:
            # Include embedding model in path to avoid dimension conflicts
        if self.ai_cache_dir is None:
            # Use CacheManager to resolve the embedding directory
            # e.g., ~/.tdoc-crawler/.ai/sentence-transformers/all-MiniLM-L6-v2
            # Keep slash to group models by provider
            self.ai_store_path = self.cache_dir / ".ai" / self.embedding_model
            # The ai_embed_dir method handles the provider/model subdirectory structure
            self.ai_cache_dir = resolve_cache_manager(self.cache_manager_name).ai_embed_dir(self.embedding_model)

        return self

    @model_validator(mode="after")
+4 −4
Original line number Diff line number Diff line
@@ -85,9 +85,9 @@ class EmbeddingsManager:
            List of DocumentChunk objects with embeddings.
        """
        if self._storage is None:
            if self._config.ai_store_path is None:
                raise RuntimeError("ai_store_path is not configured in AiConfig")
            self._storage = AiStorage(self._config.ai_store_path, embedding_dimension=self.dimension)
            if self._config.ai_cache_dir is None:
                raise RuntimeError("ai_cache_dir is not configured in AiConfig")
            self._storage = AiStorage(self._config.ai_cache_dir, embedding_dimension=self.dimension)

        normalized_workspace = normalize_workspace_name(workspace)
        markdown_content = markdown.read_text(encoding="utf-8") if isinstance(markdown, Path) else markdown
@@ -141,7 +141,7 @@ class EmbeddingsManager:
        Returns:
            List of (DocumentChunk, score) tuples.
        """
        storage = self._storage or AiStorage(self._config.ai_store_path)  # type: ignore[arg-type]
        storage = self._storage or AiStorage(self._config.ai_cache_dir)  # type: ignore[arg-type]
        normalized_workspace = normalize_workspace_name(workspace)

        # Encode query
+3 −3
Original line number Diff line number Diff line
@@ -144,7 +144,7 @@ def build_graph(
    """Build knowledge graph from TDoc content with incremental updates."""
    if storage is None:
        config = AiConfig.from_env()
        store_path = config.ai_store_path
        store_path = config.ai_cache_dir
        if store_path is None:
            msg = "AI store path not configured"
            raise ValueError(msg)
@@ -355,7 +355,7 @@ def query_graph(
    """
    if storage is None:
        config = AiConfig.from_env()
        store_path = config.ai_store_path
        store_path = config.ai_cache_dir
        if store_path is None:
            msg = "AI store path not configured"
            raise ValueError(msg)
@@ -396,7 +396,7 @@ def get_tdoc_evolution(document_id: str, storage: AiStorage | None = None) -> li
    """Get evolution chain for a TDoc (revisions, supersessions)."""
    if storage is None:
        config = AiConfig.from_env()
        store_path = config.ai_store_path
        store_path = config.ai_cache_dir
        if store_path is None:
            msg = "AI store path not configured"
            raise ValueError(msg)
Loading