feat(core): add CacheManager for centralized path resolution (255064a0) · Commits · Jan Reimes / 3gpp-crawler

src/tdoc_crawler/cli/tdoc_app.py

+5 −2

Original line number	Diff line number	Diff line
		@@ -26,7 +26,7 @@ from tdoc_crawler.cli.args import (
		UseWhatTheSpecOption,
		VerbosityOption,
		)
		from tdoc_crawler.cli.config import load_cli_config
		from tdoc_crawler.cli.config import CacheManager, load_cli_config
		from tdoc_crawler.cli.config_app import config_app
		from tdoc_crawler.cli.constants import HELP_PANEL_CRAWLING, HELP_PANEL_MAIN, HELP_PANEL_QUERY
		from tdoc_crawler.cli.crawl import crawl_meetings, crawl_tdocs
		@@ -79,6 +79,9 @@ def _app_callback(
		if cache_dir is not None:
		console.print("[yellow]Warning: --cache-dir is deprecated, use config file[/yellow]")
		config.path.cache_dir = cache_dir

		# Register CacheManager for centralized path management
		manager = CacheManager(config.path.cache_dir).register()
		ctx.obj = config

src/tdoc_crawler/config/init.py

+21 −5

Original line number	Diff line number	Diff line
		@@ -2,6 +2,15 @@

		from __future__ import annotations

		from tdoc_crawler.config.cache_manager import (
		DEFAULT_AI_CACHE_DIRNAME,
		DEFAULT_CHECKOUT_DIRNAME,
		DEFAULT_DATABASE_FILENAME,
		DEFAULT_HTTP_CACHE_FILENAME,
		CacheManager,
		CacheManagerNotRegisteredError,
		resolve_cache_manager,
		)
		from tdoc_crawler.config.env_vars import TOML_PATH_TO_ENV_VAR, ConfigEnvVar
		from tdoc_crawler.config.settings import (
		CrawlConfig,
		@@ -18,19 +27,26 @@ from tdoc_crawler.config.sources import (
		merge_configs,
		)

		DEFAULT_HTTP_CACHE_FILENAME = "http-cache.sqlite3"

		__all__ = [
		# CacheManager
		"CacheManager",
		"CacheManagerNotRegisteredError",
		"resolve_cache_manager",
		"DEFAULT_DATABASE_FILENAME",
		"DEFAULT_HTTP_CACHE_FILENAME",
		"TOML_PATH_TO_ENV_VAR",
		"ConfigEnvVar",
		"ConfigLoadError",
		"DEFAULT_CHECKOUT_DIRNAME",
		"DEFAULT_AI_CACHE_DIRNAME",
		# Settings
		"CrawlConfig",
		"CredentialsConfig",
		"HttpConfig",
		"PathConfig",
		"TDocCrawlerConfig",
		"ThreeGPPConfig",
		# Config sources
		"ConfigEnvVar",
		"ConfigLoadError",
		"TOML_PATH_TO_ENV_VAR",
		"discover_config_files",
		"load_config_file",
		"merge_configs",

src/tdoc_crawler/config/cache_manager.py

0 → 100644

+145 −0

Original line number	Diff line number	Diff line
		"""Cache manager for centralized path management.

		This module provides the CacheManager class for managing file system paths
		in a centralized, configurable manner. All paths should be accessed through
		the CacheManager to ensure consistency and testability.

		Usage:
		# At application entry point (CLI __main__.py)
		from tdoc_crawler.config import CacheManager

		cache_dir = Path.home() / ".3gpp-crawler" # Or from config/env var
		manager = CacheManager(cache_dir).register()

		# Anywhere else in the codebase
		from tdoc_crawler.config import resolve_cache_manager

		manager = resolve_cache_manager()
		db_path = manager.db_file
		checkout_path = manager.checkout_dir
		"""

		from __future__ import annotations

		from pathlib import Path
		from typing import ClassVar

		from tdoc_crawler.config.settings import WORKSPACE_REGISTRY_FILENAME

		# Names of the entries CacheManager places directly under the cache root.
		# NOTE(review): settings.py in this commit still carries the same literals as
		# underscore-prefixed module defaults - keep both in sync or consolidate.

		# File name returned (under the root) by CacheManager.db_file.
		DEFAULT_DATABASE_FILENAME = "3gpp_crawler.db"
		# File name returned (under the root) by CacheManager.http_cache_file.
		DEFAULT_HTTP_CACHE_FILENAME = "http-cache.sqlite3"
		# Directory name returned (under the root) by CacheManager.checkout_dir.
		DEFAULT_CHECKOUT_DIRNAME = "checkout"
		# Directory name returned (under the root) by CacheManager.ai_cache_dir; the
		# workspace registry file and the per-model embedding directories live inside it.
		DEFAULT_AI_CACHE_DIRNAME = "lightrag"


		class CacheManagerNotRegisteredError(RuntimeError):
		    """Raised when the CacheManager is resolved before one was registered.

		    ``resolve_cache_manager()`` raises this when no instance has been
		    registered yet. It subclasses ``RuntimeError``, so callers that
		    catch ``RuntimeError`` also catch it.
		    """


		class CacheManager:
		    """Centralized manager for cache directory paths.

		    Every on-disk location exposed by this class is derived from a single
		    root directory, so the root is the only thing that has to be
		    configured. One instance is registered process-wide with
		    :meth:`register` at application startup; other code obtains it via
		    ``resolve_cache_manager()``.

		    Example:
		        >>> # At application entry
		        >>> manager = CacheManager(cache_dir).register()
		        >>>
		        >>> # Anywhere else
		        >>> manager = resolve_cache_manager()
		        >>> db_path = manager.db_file
		    """

		    # Process-wide singleton slot; stays None until register() succeeds.
		    _instance: ClassVar[CacheManager | None] = None

		    def __init__(self, cache_dir: Path | str) -> None:
		        """Initialize the cache manager.

		        Args:
		            cache_dir: Root cache directory. A leading ``~`` is expanded to
		                the user's home directory and the result is made absolute.
		                The directory is not created here.
		        """
		        # expanduser() has to run before resolve(): resolve() alone would turn
		        # "~/x" into "<cwd>/~/x" instead of a path below the home directory.
		        self._cache_dir = Path(cache_dir).expanduser().resolve()

		    def __repr__(self) -> str:
		        """Return a debugging representation that shows the root directory."""
		        return f"{type(self).__name__}({str(self._cache_dir)!r})"

		    def register(self) -> CacheManager:
		        """Register this instance as the global CacheManager.

		        Returns:
		            Self for chaining

		        Raises:
		            RuntimeError: If a manager is already registered
		        """
		        # Registration is deliberately one-shot: a second call fails loudly
		        # instead of silently swapping the root other code already resolved.
		        if CacheManager._instance is not None:
		            raise RuntimeError("CacheManager already registered. Call only once at startup.")
		        CacheManager._instance = self
		        return self

		    @property
		    def root(self) -> Path:
		        """Root cache directory (absolute)."""
		        return self._cache_dir

		    @property
		    def db_file(self) -> Path:
		        """Path to SQLite database file."""
		        return self._cache_dir / DEFAULT_DATABASE_FILENAME

		    @property
		    def http_cache_file(self) -> Path:
		        """Path to HTTP cache database file."""
		        return self._cache_dir / DEFAULT_HTTP_CACHE_FILENAME

		    @property
		    def checkout_dir(self) -> Path:
		        """Path to checkout directory for documents."""
		        return self._cache_dir / DEFAULT_CHECKOUT_DIRNAME

		    @property
		    def ai_cache_dir(self) -> Path:
		        """Path to AI cache directory for embeddings and graphs."""
		        return self._cache_dir / DEFAULT_AI_CACHE_DIRNAME

		    @property
		    def ai_workspace_file(self) -> Path:
		        """Path to workspace registry file inside the AI cache directory."""
		        return self.ai_cache_dir / WORKSPACE_REGISTRY_FILENAME

		    def ai_embed_dir(self, embedding_model: str) -> Path:
		        """Path to embeddings directory for a specific model.

		        Args:
		            embedding_model: Name of the embedding model

		        Returns:
		            Path to model-specific embeddings directory
		        """
		        # The name is joined as-is, so a name containing "/" yields nested
		        # directories.
		        # NOTE(review): an absolute name or one containing ".." resolves
		        # outside ai_cache_dir - confirm callers only pass trusted names.
		        return (self.ai_cache_dir / embedding_model).resolve()

		    @classmethod
		    def is_registered(cls) -> bool:
		        """Check if a CacheManager instance is registered.

		        Returns:
		            True if registered, False otherwise
		        """
		        return cls._instance is not None


		def resolve_cache_manager() -> CacheManager:
		    """Return the globally registered CacheManager.

		    Returns:
		        The instance previously stored by ``CacheManager.register()``

		    Raises:
		        CacheManagerNotRegisteredError: If no manager is registered
		    """
		    registered = CacheManager._instance
		    if registered is not None:
		        return registered
		    raise CacheManagerNotRegisteredError(
		        "CacheManager not registered. Call CacheManager(cache_dir).register() at application startup."
		    )

src/tdoc_crawler/config/settings.py

+5 −3

Original line number	Diff line number	Diff line
		@@ -20,7 +20,9 @@ from pydantic_settings import BaseSettings, SettingsConfigDict
		from tdoc_crawler.config.env_vars import ConfigEnvVar
		from tdoc_crawler.config.sources import discover_config_files, load_config_file, merge_configs

		_DEFAULT_CACHE_DIR = Path.home() / ".3gpp-crawler"
		# Default values (not constants - just module-level defaults)
		# Actual paths are resolved by CacheManager at runtime
		_DEFAULT_CACHE_DIR_STR = "~/.3gpp-crawler"
		_DEFAULT_DATABASE_FILENAME = "3gpp_crawler.db"
		_DEFAULT_HTTP_CACHE_FILENAME = "http-cache.sqlite3"
		_DEFAULT_CHECKOUT_DIRNAME = "checkout"
		@@ -39,7 +41,7 @@ class PathConfig(BaseSettings):
		model_config = SettingsConfigDict(env_prefix="TDC_", env_nested_delimiter="_", extra="ignore")

		cache_dir: Path = Field(
		default=_DEFAULT_CACHE_DIR,
		default_factory=lambda: Path(_DEFAULT_CACHE_DIR_STR).expanduser().resolve(),
		validation_alias=AliasChoices(ConfigEnvVar.TDC_CACHE_DIR.name, "cache_dir"),
		description="Root cache directory for storing downloaded files and metadata",
		)
		@@ -90,7 +92,7 @@ class PathConfig(BaseSettings):
		def _resolve_cache_dir(cls, value: str \| Path \| None) -> Path:
		"""Resolve and validate the cache directory path."""
		if value is None:
		return _DEFAULT_CACHE_DIR
		return Path(_DEFAULT_CACHE_DIR_STR).expanduser().resolve()
		if isinstance(value, str):
		value = Path(value)
		return value.expanduser().resolve()

src/tdoc_crawler/tdocs/operations/checkout.py

+15 −15

Original line number	Diff line number	Diff line
		@@ -97,19 +97,19 @@ def checkout_tdoc(
		return checkout_path

		checkout_path.mkdir(parents=True, exist_ok=True)
		temp_zip_path = checkout_path / f"{metadata.tdoc_id}.zip"
		temp_zip_file = checkout_path / f"{metadata.tdoc_id}.zip"

		if metadata.url is None:
		raise ValueError(f"TDoc {metadata.tdoc_id} has no URL")

		try:
		download_to_file(metadata.url, temp_zip_path, session=session)
		with zipfile.ZipFile(temp_zip_path) as archive:
		download_to_file(metadata.url, temp_zip_file, session=session)
		with zipfile.ZipFile(temp_zip_file) as archive:
		archive.extractall(checkout_path)
		logger.info(f"Checked out {metadata.tdoc_id} to {checkout_path}")
		finally:
		if temp_zip_path.exists():
		temp_zip_path.unlink()
		if temp_zip_file.exists():
		temp_zip_file.unlink()

		return checkout_path

		@@ -140,8 +140,8 @@ def prepare_tdoc_file(metadata: TDocMetadata, checkout_dir: Path, return_dir: bo

		downloads_dir = Path(checkout_dir)
		downloads_dir.mkdir(parents=True, exist_ok=True)
		path = urlparse(metadata.url).path
		filename = str(posixpath.basename(path))
		url_path = urlparse(metadata.url).path
		filename = str(posixpath.basename(url_path))
		suffix = Path(filename).suffix.lower()

		if suffix == ".zip":
		@@ -151,14 +151,14 @@ def prepare_tdoc_file(metadata: TDocMetadata, checkout_dir: Path, return_dir: bo
		if files:
		return extract_dir if return_dir else files[0]
		shutil.rmtree(extract_dir)
		zip_path = downloads_dir / f"{metadata.tdoc_id}.zip"
		download_to_file(metadata.url, zip_path, session=session)
		zip_file = downloads_dir / f"{metadata.tdoc_id}.zip"
		download_to_file(metadata.url, zip_file, session=session)
		try:
		with zipfile.ZipFile(zip_path) as archive:
		with zipfile.ZipFile(zip_file) as archive:
		archive.extractall(extract_dir)
		finally:
		with suppress(FileNotFoundError):
		zip_path.unlink()
		zip_file.unlink()
		files = sorted(p for p in extract_dir.rglob("*") if p.is_file())
		if not files:
		raise FileNotFoundError("no-files-in-archive")
		@@ -167,14 +167,14 @@ def prepare_tdoc_file(metadata: TDocMetadata, checkout_dir: Path, return_dir: bo
		# For non-zip files, download directly
		target_suffix = suffix or ""
		target_name = filename if filename else f"{metadata.tdoc_id}{target_suffix or '.bin'}"
		target_path = downloads_dir / target_name
		if not target_path.exists():
		target_file = downloads_dir / target_name
		if not target_file.exists():
		try:
		download_to_file(metadata.url, target_path, session=session)
		download_to_file(metadata.url, target_file, session=session)
		except requests.exceptions.HTTPError as exc:
		status_code = exc.response.status_code if exc.response is not None else "unknown"
		raise FileNotFoundError(f"failed-to-download ({status_code}): {metadata.url}") from exc
		return target_path
		return target_file


		def get_checked_out_tdocs(checkout_dir: Path) -> list[str]: