Commit e53895ba authored by Jan Reimes
Browse files

feat(02-01): integrate CacheManager with TDocCrawlerConfig

- CacheManager.__init__ now accepts optional config: TDocCrawlerConfig parameter
- When config provided, uses config.path.cache_dir and config.path.ai_cache_dir
- Added HttpCacheConfig.from_http_config() factory method for migration
- Added deprecation warning when HttpCacheConfig is instantiated directly
- Maintains backward compatibility with existing positional arguments
parent 75fbc5ef
Loading
Loading
Loading
Loading
+67 −7
Original line number Diff line number Diff line
@@ -6,6 +6,26 @@ import os
from pathlib import Path
from typing import Self

# Import settings and sources modules
from tdoc_crawler.config.compat import (
    DEPRECATED_ENV_VARS,
    ENV_VAR_MAPPINGS,
    log_deprecation_warning,
)
from tdoc_crawler.config.settings import (
    CrawlConfig,
    CredentialsConfig,
    HttpConfig,
    PathConfig,
    TDocCrawlerConfig,
)
from tdoc_crawler.config.sources import (
    ConfigLoadError,
    discover_config_files,
    load_config_file,
    merge_configs,
)

# Fallback path if no argument or env var is provided
DEFAULT_CACHE_DIR = Path.home() / ".3gpp-crawler"
DEFAULT_DATABASE_FILENAME = "3gpp_crawler.db"
@@ -17,6 +37,31 @@ DEFAULT_AI_CACHE_DIRNAME = "lightrag" # subdirectory under root cache dir for A

WORKSPACE_REGISTRY_FILENAME = "workspaces.json"

# Public API of this module. Besides the names defined here (the DEFAULT_*
# constants, CacheManager and the cache-manager registry helpers), it
# re-exports the names imported above from tdoc_crawler.config.compat,
# tdoc_crawler.config.settings and tdoc_crawler.config.sources.
__all__ = [
    "DEFAULT_AI_CACHE_DIRNAME",
    "DEFAULT_CACHE_DIR",
    "DEFAULT_CHECKOUT_DIRNAME",
    "DEFAULT_DATABASE_FILENAME",
    "DEFAULT_HTTP_CACHE_FILENAME",
    "DEFAULT_MANAGER",
    "DEPRECATED_ENV_VARS",
    "ENV_VAR_MAPPINGS",
    "CacheManager",
    "ConfigLoadError",
    "CrawlConfig",
    "CredentialsConfig",
    "HttpConfig",
    "PathConfig",
    "TDocCrawlerConfig",
    "discover_config_files",
    "load_config_file",
    "log_deprecation_warning",
    "merge_configs",
    "register_cache_manager",
    "reset_cache_managers",
    "resolve_cache_manager",
]

# Module-level mapping from a string key to a CacheManager instance.
# NOTE(review): presumably keyed by manager name and maintained by the
# register/reset/resolve helpers exported above -- confirm against their
# definitions.
_cache_managers: dict[str, CacheManager] = {}


@@ -56,32 +101,47 @@ class CacheManager:
    Acts as the single source of truth for where files are stored.
    """

    def __init__(self, root_path: Path | None = None, ai_cache_dir: Path | None = None, name: str = DEFAULT_MANAGER, ensure_paths: bool = True) -> None:
    def __init__(
        self,
        root_path: Path | None = None,
        ai_cache_dir: Path | None = None,
        name: str = DEFAULT_MANAGER,
        ensure_paths: bool = True,
        config: TDocCrawlerConfig | None = None,
    ) -> None:
        """Initialize cache manager.

        Args:
            root_path: Explicit root path. If None, tries TDC_CACHE_DIR env var,
                       then falls back to DEFAULT_CACHE_DIR.
            ai_cache_dir: Explicit AI cache directory path. If None, defaults to root_path/lightrag or TDC_AI_STORE_PATH env var if set.
                       then falls back to DEFAULT_CACHE_DIR. Ignored if config is provided.
            ai_cache_dir: Explicit AI cache directory path. If None, defaults to root_path/lightrag
                         or TDC_AI_STORE_PATH env var if set. Ignored if config is provided.
            name: Optional name to register this manager under. If provided, the manager is registered upon initialization.
            ensure_paths: If True, will create the root directory if it doesn't exist.
            config: Optional TDocCrawlerConfig to use for path resolution. If provided,
                    config.path.cache_dir and config.path.ai_cache_dir are used.
        """
        self.name = name

        if root_path:
        if config is not None:
            # Use config for path resolution (preferred)
            self.root = config.path.cache_dir
            self.ai_cache_dir = config.path.ai_cache_dir
        elif root_path:
            self.root = root_path
        else:
            # Fallback to env var (will be removed in future version)
            env_cache_dir = os.getenv("TDC_CACHE_DIR")
            self.root = Path(env_cache_dir) if env_cache_dir else DEFAULT_CACHE_DIR

        if ai_cache_dir:
            self.ai_cache_dir = ai_cache_dir.resolve()
        else:
        if config is None and not ai_cache_dir:
            env_ai_cache_dir = os.getenv("TDC_AI_STORE_PATH")
            if env_ai_cache_dir:
                self.ai_cache_dir = Path(env_ai_cache_dir).resolve()
            else:
                self.ai_cache_dir = (self.root / DEFAULT_AI_CACHE_DIRNAME).resolve()
        elif ai_cache_dir:
            self.ai_cache_dir = ai_cache_dir.resolve()

        if ensure_paths:
            self.ensure_paths()
+46 −1
Original line number Diff line number Diff line
@@ -3,14 +3,19 @@
from __future__ import annotations

import os
import warnings
from dataclasses import dataclass
from enum import StrEnum, auto
from pathlib import Path
from typing import TYPE_CHECKING

from pydantic import BaseModel, Field

from tdoc_crawler.config import resolve_cache_manager

if TYPE_CHECKING:
    from tdoc_crawler.config.settings import HttpConfig


class OutputFormat(StrEnum):
    """Supported output formats for CLI responses."""
@@ -36,13 +41,45 @@ _DEFAULT_MAX_RETRIES: int = 3

@dataclass
class HttpCacheConfig:
    """HTTP cache configuration for hishel caching."""
    """HTTP cache configuration for hishel caching.

    .. deprecated::
        Use TDocCrawlerConfig.http instead. HttpCacheConfig will be removed in v2.0.
    """

    ttl: int = _DEFAULT_TTL
    refresh_ttl_on_access: bool = _DEFAULT_TTL_ON_ACCESS
    max_retries: int = _DEFAULT_MAX_RETRIES
    cache_file: Path | None = None

    def __post_init__(self) -> None:
        """Emit deprecation warning when HttpCacheConfig is instantiated."""
        warnings.warn(
            "HttpCacheConfig is deprecated, use TDocCrawlerConfig.http instead",
            DeprecationWarning,
            stacklevel=2,
        )

    @classmethod
    def from_http_config(cls, http_config: HttpConfig) -> HttpCacheConfig:
        """Build an HttpCacheConfig that mirrors an ``HttpConfig``.

        Migration helper for code that still consumes the deprecated
        HttpCacheConfig but already obtains its settings from
        ``TDocCrawlerConfig.http``.

        Args:
            http_config: Source settings. Its ``cache_ttl``,
                ``cache_refresh_on_access`` and ``max_retries`` attributes
                are copied.

        Returns:
            A new HttpCacheConfig with ``cache_file`` left as ``None``. The
            instance is built through the regular initializer, so the
            deprecation warning is still emitted.
        """
        field_values = {
            "ttl": http_config.cache_ttl,
            "refresh_ttl_on_access": http_config.cache_refresh_on_access,
            "max_retries": http_config.max_retries,
            # No cache file is set here: the cache file path is handled by
            # CacheManager.
            "cache_file": None,
        }
        return cls(**field_values)

    @classmethod
    def resolve_http_cache_config(
        cls,
@@ -53,6 +90,9 @@ class HttpCacheConfig:
    ) -> HttpCacheConfig:
        """Resolve HTTP cache configuration from CLI parameters and environment variables.

        .. deprecated::
            Use TDocCrawlerConfig.from_settings().http instead.

        Args:
            cache_ttl: TTL for cache entries (CLI parameter)
            cache_refresh_on_access: Whether to refresh TTL on access (CLI parameter)
@@ -62,6 +102,11 @@ class HttpCacheConfig:
        Returns:
            HttpCacheConfig instance with resolved values
        """
        warnings.warn(
            "HttpCacheConfig.resolve_http_cache_config() is deprecated, use TDocCrawlerConfig.from_settings().http instead",
            DeprecationWarning,
            stacklevel=2,
        )
        # Check CLI parameters first, then environment variables, then defaults
        if cache_ttl is not None:
            ttl = cache_ttl