Commit 255064a0 authored by Jan Reimes's avatar Jan Reimes
Browse files

feat(core): add CacheManager for centralized path resolution

Introduce the CacheManager singleton pattern to eliminate hardcoded paths
(~/.3gpp-crawler) across the codebase, per the AGENTS.md mandate.

- New cache_manager.py: CacheManager class with register/resolve pattern
- config/__init__.py: re-export CacheManager, resolve_cache_manager, and
  default constants (DEFAULT_DATABASE_FILENAME, DEFAULT_HTTP_CACHE_FILENAME, etc.)

All components should call resolve_cache_manager() instead of hardcoding
Path.home() / '.3gpp-crawler'.
parent d6e2d246
Loading
Loading
Loading
Loading
+5 −2
Original line number Diff line number Diff line
@@ -26,7 +26,7 @@ from tdoc_crawler.cli.args import (
    UseWhatTheSpecOption,
    VerbosityOption,
)
from tdoc_crawler.cli.config import load_cli_config
from tdoc_crawler.cli.config import CacheManager, load_cli_config
from tdoc_crawler.cli.config_app import config_app
from tdoc_crawler.cli.constants import HELP_PANEL_CRAWLING, HELP_PANEL_MAIN, HELP_PANEL_QUERY
from tdoc_crawler.cli.crawl import crawl_meetings, crawl_tdocs
@@ -79,6 +79,9 @@ def _app_callback(
    if cache_dir is not None:
        console.print("[yellow]Warning: --cache-dir is deprecated, use config file[/yellow]")
        config.path.cache_dir = cache_dir

    # Register CacheManager for centralized path management
    manager = CacheManager(config.path.cache_dir).register()
    ctx.obj = config


+21 −5
Original line number Diff line number Diff line
@@ -2,6 +2,15 @@

from __future__ import annotations

from tdoc_crawler.config.cache_manager import (
    DEFAULT_AI_CACHE_DIRNAME,
    DEFAULT_CHECKOUT_DIRNAME,
    DEFAULT_DATABASE_FILENAME,
    DEFAULT_HTTP_CACHE_FILENAME,
    CacheManager,
    CacheManagerNotRegisteredError,
    resolve_cache_manager,
)
from tdoc_crawler.config.env_vars import TOML_PATH_TO_ENV_VAR, ConfigEnvVar
from tdoc_crawler.config.settings import (
    CrawlConfig,
@@ -18,19 +27,26 @@ from tdoc_crawler.config.sources import (
    merge_configs,
)

DEFAULT_HTTP_CACHE_FILENAME = "http-cache.sqlite3"

__all__ = [
    # CacheManager
    "CacheManager",
    "CacheManagerNotRegisteredError",
    "resolve_cache_manager",
    "DEFAULT_DATABASE_FILENAME",
    "DEFAULT_HTTP_CACHE_FILENAME",
    "TOML_PATH_TO_ENV_VAR",
    "ConfigEnvVar",
    "ConfigLoadError",
    "DEFAULT_CHECKOUT_DIRNAME",
    "DEFAULT_AI_CACHE_DIRNAME",
    # Settings
    "CrawlConfig",
    "CredentialsConfig",
    "HttpConfig",
    "PathConfig",
    "TDocCrawlerConfig",
    "ThreeGPPConfig",
    # Config sources
    "ConfigEnvVar",
    "ConfigLoadError",
    "TOML_PATH_TO_ENV_VAR",
    "discover_config_files",
    "load_config_file",
    "merge_configs",
+145 −0
Original line number Diff line number Diff line
"""Cache manager for centralized path management.

This module provides the CacheManager class, which derives the application's
file system paths from a single root cache directory. All paths should be
accessed through the CacheManager to ensure consistency and testability.

Usage:
    # At the application entry point (CLI __main__.py)
    from pathlib import Path

    from tdoc_crawler.config import CacheManager

    cache_dir = Path.home() / ".3gpp-crawler"  # Or from config/env var
    manager = CacheManager(cache_dir).register()

    # Anywhere else in the codebase
    from tdoc_crawler.config import resolve_cache_manager

    manager = resolve_cache_manager()
    db_path = manager.db_file
    checkout_path = manager.checkout_dir
"""

from __future__ import annotations

from pathlib import Path
from typing import ClassVar

from tdoc_crawler.config.settings import WORKSPACE_REGISTRY_FILENAME

# Names of the files and directories that CacheManager places directly under
# its root cache directory.
# File name returned (joined to the root) by CacheManager.db_file.
DEFAULT_DATABASE_FILENAME = "3gpp_crawler.db"
# File name returned (joined to the root) by CacheManager.http_cache_file.
DEFAULT_HTTP_CACHE_FILENAME = "http-cache.sqlite3"
# Directory name returned (joined to the root) by CacheManager.checkout_dir.
DEFAULT_CHECKOUT_DIRNAME = "checkout"
# Directory name returned (joined to the root) by CacheManager.ai_cache_dir.
DEFAULT_AI_CACHE_DIRNAME = "lightrag"


class CacheManagerNotRegisteredError(RuntimeError):
    """Signal that the global CacheManager was requested before any was registered.

    Subclasses RuntimeError, so existing ``except RuntimeError`` handlers
    also catch it.
    """


class CacheManager:
    """Centralized manager for cache directory paths.

    Provides a single source of truth for all file system paths used by
    the application. The manager must be registered once at application
    startup, then resolved wherever paths are needed.

    Every path is derived from the root directory given to the constructor;
    nothing is created on disk by this class.

    Example:
        At application entry::

            manager = CacheManager(cache_dir).register()

        Anywhere else::

            manager = resolve_cache_manager()
            db_path = manager.db_file
    """

    # Process-wide registered instance; None until register() succeeds.
    _instance: ClassVar[CacheManager | None] = None

    def __init__(self, cache_dir: Path | str) -> None:
        """Initialize the cache manager.

        Args:
            cache_dir: Root cache directory path. A leading ``~`` is expanded
                to the user's home directory and the result is made absolute.
        """
        # expanduser() must run before resolve(): resolve() alone treats "~"
        # as a literal directory name relative to the current working directory.
        self._cache_dir = Path(cache_dir).expanduser().resolve()

    def __repr__(self) -> str:
        """Return a debugging representation that shows the root directory."""
        return f"{type(self).__name__}({str(self._cache_dir)!r})"

    def register(self) -> CacheManager:
        """Register this instance as the global CacheManager.

        Returns:
            Self for chaining

        Raises:
            RuntimeError: If a manager is already registered
        """
        if CacheManager._instance is not None:
            raise RuntimeError("CacheManager already registered. Call only once at startup.")
        CacheManager._instance = self
        return self

    @classmethod
    def unregister(cls) -> None:
        """Clear the global registration so a new manager can be registered.

        Does nothing if no manager is registered. Intended for tests and for
        entry points that may run more than once in the same process.
        """
        # Reset on CacheManager itself (not cls) because register() and
        # resolve_cache_manager() read and write CacheManager._instance.
        CacheManager._instance = None

    @property
    def root(self) -> Path:
        """Root cache directory."""
        return self._cache_dir

    @property
    def db_file(self) -> Path:
        """Path to SQLite database file."""
        return self._cache_dir / DEFAULT_DATABASE_FILENAME

    @property
    def http_cache_file(self) -> Path:
        """Path to HTTP cache database file."""
        return self._cache_dir / DEFAULT_HTTP_CACHE_FILENAME

    @property
    def checkout_dir(self) -> Path:
        """Path to checkout directory for documents."""
        return self._cache_dir / DEFAULT_CHECKOUT_DIRNAME

    @property
    def ai_cache_dir(self) -> Path:
        """Path to AI cache directory for embeddings and graphs."""
        return self._cache_dir / DEFAULT_AI_CACHE_DIRNAME

    @property
    def ai_workspace_file(self) -> Path:
        """Path to workspace registry file."""
        return self.ai_cache_dir / WORKSPACE_REGISTRY_FILENAME

    def ai_embed_dir(self, embedding_model: str) -> Path:
        """Path to embeddings directory for a specific model.

        The model name is joined to the AI cache directory as-is, so a name
        containing path separators yields a nested directory.

        Args:
            embedding_model: Name of the embedding model

        Returns:
            Path to model-specific embeddings directory
        """
        return (self.ai_cache_dir / embedding_model).resolve()

    @classmethod
    def is_registered(cls) -> bool:
        """Check if a CacheManager instance is registered.

        Returns:
            True if registered, False otherwise
        """
        return cls._instance is not None


def resolve_cache_manager() -> CacheManager:
    """Return the CacheManager that was registered at application startup.

    Returns:
        The instance previously stored by CacheManager.register()

    Raises:
        CacheManagerNotRegisteredError: If register() has not been called yet
    """
    registered = CacheManager._instance
    if registered is not None:
        return registered
    raise CacheManagerNotRegisteredError(
        "CacheManager not registered. Call CacheManager(cache_dir).register() at application startup."
    )
+5 −3
Original line number Diff line number Diff line
@@ -20,7 +20,9 @@ from pydantic_settings import BaseSettings, SettingsConfigDict
from tdoc_crawler.config.env_vars import ConfigEnvVar
from tdoc_crawler.config.sources import discover_config_files, load_config_file, merge_configs

_DEFAULT_CACHE_DIR = Path.home() / ".3gpp-crawler"
# Default values (not constants - just module-level defaults)
# Actual paths are resolved by CacheManager at runtime
_DEFAULT_CACHE_DIR_STR = "~/.3gpp-crawler"
_DEFAULT_DATABASE_FILENAME = "3gpp_crawler.db"
_DEFAULT_HTTP_CACHE_FILENAME = "http-cache.sqlite3"
_DEFAULT_CHECKOUT_DIRNAME = "checkout"
@@ -39,7 +41,7 @@ class PathConfig(BaseSettings):
    model_config = SettingsConfigDict(env_prefix="TDC_", env_nested_delimiter="_", extra="ignore")

    cache_dir: Path = Field(
        default=_DEFAULT_CACHE_DIR,
        default_factory=lambda: Path(_DEFAULT_CACHE_DIR_STR).expanduser().resolve(),
        validation_alias=AliasChoices(ConfigEnvVar.TDC_CACHE_DIR.name, "cache_dir"),
        description="Root cache directory for storing downloaded files and metadata",
    )
@@ -90,7 +92,7 @@ class PathConfig(BaseSettings):
    def _resolve_cache_dir(cls, value: str | Path | None) -> Path:
        """Resolve and validate the cache directory path."""
        if value is None:
            return _DEFAULT_CACHE_DIR
            return Path(_DEFAULT_CACHE_DIR_STR).expanduser().resolve()
        if isinstance(value, str):
            value = Path(value)
        return value.expanduser().resolve()
+15 −15
Original line number Diff line number Diff line
@@ -97,19 +97,19 @@ def checkout_tdoc(
        return checkout_path

    checkout_path.mkdir(parents=True, exist_ok=True)
    temp_zip_path = checkout_path / f"{metadata.tdoc_id}.zip"
    temp_zip_file = checkout_path / f"{metadata.tdoc_id}.zip"

    if metadata.url is None:
        raise ValueError(f"TDoc {metadata.tdoc_id} has no URL")

    try:
        download_to_file(metadata.url, temp_zip_path, session=session)
        with zipfile.ZipFile(temp_zip_path) as archive:
        download_to_file(metadata.url, temp_zip_file, session=session)
        with zipfile.ZipFile(temp_zip_file) as archive:
            archive.extractall(checkout_path)
        logger.info(f"Checked out {metadata.tdoc_id} to {checkout_path}")
    finally:
        if temp_zip_path.exists():
            temp_zip_path.unlink()
        if temp_zip_file.exists():
            temp_zip_file.unlink()

    return checkout_path

@@ -140,8 +140,8 @@ def prepare_tdoc_file(metadata: TDocMetadata, checkout_dir: Path, return_dir: bo

    downloads_dir = Path(checkout_dir)
    downloads_dir.mkdir(parents=True, exist_ok=True)
    path = urlparse(metadata.url).path
    filename = str(posixpath.basename(path))
    url_path = urlparse(metadata.url).path
    filename = str(posixpath.basename(url_path))
    suffix = Path(filename).suffix.lower()

    if suffix == ".zip":
@@ -151,14 +151,14 @@ def prepare_tdoc_file(metadata: TDocMetadata, checkout_dir: Path, return_dir: bo
            if files:
                return extract_dir if return_dir else files[0]
            shutil.rmtree(extract_dir)
        zip_path = downloads_dir / f"{metadata.tdoc_id}.zip"
        download_to_file(metadata.url, zip_path, session=session)
        zip_file = downloads_dir / f"{metadata.tdoc_id}.zip"
        download_to_file(metadata.url, zip_file, session=session)
        try:
            with zipfile.ZipFile(zip_path) as archive:
            with zipfile.ZipFile(zip_file) as archive:
                archive.extractall(extract_dir)
        finally:
            with suppress(FileNotFoundError):
                zip_path.unlink()
                zip_file.unlink()
        files = sorted(p for p in extract_dir.rglob("*") if p.is_file())
        if not files:
            raise FileNotFoundError("no-files-in-archive")
@@ -167,14 +167,14 @@ def prepare_tdoc_file(metadata: TDocMetadata, checkout_dir: Path, return_dir: bo
    # For non-zip files, download directly
    target_suffix = suffix or ""
    target_name = filename if filename else f"{metadata.tdoc_id}{target_suffix or '.bin'}"
    target_path = downloads_dir / target_name
    if not target_path.exists():
    target_file = downloads_dir / target_name
    if not target_file.exists():
        try:
            download_to_file(metadata.url, target_path, session=session)
            download_to_file(metadata.url, target_file, session=session)
        except requests.exceptions.HTTPError as exc:
            status_code = exc.response.status_code if exc.response is not None else "unknown"
            raise FileNotFoundError(f"failed-to-download ({status_code}): {metadata.url}") from exc
    return target_path
    return target_file


def get_checked_out_tdocs(checkout_dir: Path) -> list[str]: