Commit ea271d19 authored by Jan Reimes's avatar Jan Reimes
Browse files

feat: Implement core DI infrastructure (Phase 1-2)

- Add ServiceContainer for centralized service lifecycle management
- Add ConfigService for unified configuration loading
- Add DatabaseFactory, HttpClientProvider protocols
- Add ParserProtocol, ClientProtocol for future migrations
- Add LanceDB support with proper <provider>/<model> path structure
- Convert config.py and http_client.py to packages for better organization
- Add comprehensive tests for ServiceContainer (25 tests)

BREAKING CHANGE: None - backward compatible
parent b88e498e
Loading
Loading
Loading
Loading
+127 −0
Original line number Diff line number Diff line
"""Configuration management for file paths and caching behavior."""

from __future__ import annotations

import os
from pathlib import Path
from typing import Self

# Fallback cache root used when neither an explicit path nor TDC_CACHE_DIR is provided
DEFAULT_CACHE_DIR = Path.home() / ".tdoc-crawler"
# File and directory names that CacheManager resolves relative to the cache root
DEFAULT_DATABASE_FILENAME = "tdoc_crawler.db"
DEFAULT_HTTP_CACHE_FILENAME = "http-cache.sqlite3"
DEFAULT_CHECKOUT_DIRNAME = "checkout"
# Registry key used when a manager is created or resolved without an explicit name
DEFAULT_MANAGER = "default"

# Workspace registry file name, resolved relative to the AI cache directory
REGISTRY_FILENAME = "workspaces.json"

# Module-wide registry of cache managers, keyed by manager name
_cache_managers: dict[str, CacheManager] = {}


def register_cache_manager(manager: CacheManager, force: bool = False) -> None:
    """Store *manager* in the module registry, keyed by ``manager.name``.

    Args:
        manager: CacheManager instance to add to the registry.
        force: When True, replace any manager already stored under the same
            name instead of raising.

    Raises:
        ValueError: If the name is already taken and ``force`` is False.
    """
    key = manager.name
    # Only an un-forced registration can collide with an existing entry.
    if not force and key in _cache_managers:
        raise ValueError(f"Cache manager with name '{key}' is already registered.")
    _cache_managers[key] = manager


def reset_cache_managers() -> None:
    """Empty the cache-manager registry in place.

    Every registered manager is dropped, so later lookups fail until a
    manager is registered again. Intended mainly for tests that need a
    clean registry between cases.
    """
    _cache_managers.clear()


def resolve_cache_manager(name: str | None = None) -> CacheManager:
    """Look up a registered cache manager.

    Args:
        name: Registry key to look up. ``None`` (or an empty string) selects
            the ``DEFAULT_MANAGER`` entry.

    Returns:
        The CacheManager stored under the resolved name.

    Raises:
        ValueError: If nothing is registered under the resolved name.
    """
    key = name if name else DEFAULT_MANAGER
    found = _cache_managers.get(key)
    if found is not None:
        return found
    raise ValueError(f"No cache manager registered under name '{key}'.")


class CacheManager:
    """Resolve the on-disk layout of the cache.

    Holds the cache root and the AI cache directory and derives every other
    file or directory location from them, so callers never build these paths
    themselves.
    """

    def __init__(
        self,
        root_path: Path | None = None,
        ai_cache_dir: Path | None = None,
        name: str = DEFAULT_MANAGER,
        ensure_paths: bool = True,
    ) -> None:
        """Resolve the cache locations and optionally create them.

        Args:
            root_path: Explicit cache root. When omitted, the TDC_CACHE_DIR
                environment variable is used, then DEFAULT_CACHE_DIR.
            ai_cache_dir: Explicit AI cache directory. When omitted, the
                TDC_AI_STORE_PATH environment variable is used, then
                ``<root>/.ai``. The chosen path is always resolved.
            name: Name this manager is stored under when ``register()`` is
                called. Constructing the manager does not register it.
            ensure_paths: When True, create both the root directory and the
                AI cache directory if they do not exist yet.
        """
        self.name = name

        # Root: explicit argument > TDC_CACHE_DIR > built-in default.
        if not root_path:
            env_root = os.getenv("TDC_CACHE_DIR")
            root_path = Path(env_root) if env_root else DEFAULT_CACHE_DIR
        self.root = root_path

        # AI dir: explicit argument > TDC_AI_STORE_PATH > <root>/.ai.
        if not ai_cache_dir:
            env_ai_dir = os.getenv("TDC_AI_STORE_PATH")
            ai_cache_dir = Path(env_ai_dir) if env_ai_dir else self.root / ".ai"
        self.ai_cache_dir = ai_cache_dir.resolve()

        if not ensure_paths:
            return
        self.ensure_paths()
        self.ensure_ai_paths()

    def register(self, force: bool = True) -> Self:
        """Add this instance to the module registry under ``self.name``.

        Args:
            force: When True (the default), replace a manager already
                registered under the same name.

        Returns:
            This instance, so the call can be chained after construction.
        """
        register_cache_manager(self, force=force)
        return self

    @property
    def http_cache_file(self) -> Path:
        """Location of the HTTP client cache database inside the cache root."""
        return self.root.joinpath(DEFAULT_HTTP_CACHE_FILENAME)

    @property
    def db_file(self) -> Path:
        """Location of the metadata SQLite database inside the cache root."""
        return self.root.joinpath(DEFAULT_DATABASE_FILENAME)

    @property
    def checkout_dir(self) -> Path:
        """Location of the default checkout directory inside the cache root."""
        return self.root.joinpath(DEFAULT_CHECKOUT_DIRNAME)

    def ai_embed_dir(self, embedding_model: str) -> Path:
        """Return the resolved AI storage directory for *embedding_model*.

        The model identifier is appended to the AI cache directory as a
        relative path, so an identifier containing ``/`` yields nested
        directories.
        """
        model_dir = self.ai_cache_dir.joinpath(embedding_model)
        return model_dir.resolve()

    @property
    def ai_workspace_file(self) -> Path:
        """Location of the workspace registry file inside the AI cache directory."""
        return self.ai_cache_dir.joinpath(REGISTRY_FILENAME)

    def ensure_paths(self) -> None:
        """Create the cache root (and missing parents) if it does not exist."""
        self.root.mkdir(exist_ok=True, parents=True)

    def ensure_ai_paths(self) -> None:
        """Create the AI cache directory (and missing parents) if it does not exist."""
        self.ai_cache_dir.mkdir(exist_ok=True, parents=True)
+129 −0
Original line number Diff line number Diff line
"""Unified configuration service providing access to all config types."""

from __future__ import annotations

from pathlib import Path

from tdoc_crawler.config import CacheManager, resolve_cache_manager
from tdoc_crawler.models.base import HttpCacheConfig
from tdoc_crawler.models.crawl_limits import CrawlLimits

# AiConfig lives in the AI module, which may not be importable; fall back to
# None so get_ai_config() can raise a descriptive ImportError on demand.
try:
    from tdoc_crawler.ai.config import AiConfig as _AiConfig
except ImportError:
    _AiConfig = None  # type: ignore[assignment,misc]

class ConfigService:
    """Single entry point for the application's configuration objects.

    Exposes the cache manager, HTTP cache settings, crawl limits and the
    optional AI configuration. The HTTP cache config and the crawl limits
    are built on first access and reused afterwards; the cache manager is
    looked up in the CacheManager registry on every access.

    Example:
        config = ConfigService()
        http_config = config.http_cache
        crawl_limits = config.crawl_limits
    """

    def __init__(
        self,
        cache_manager_name: str | None = None,
        cache_dir: Path | None = None,
    ) -> None:
        """Store the lookup parameters; nothing is resolved yet.

        Args:
            cache_manager_name: Registry name of the CacheManager to use.
                ``None`` selects the default manager.
            cache_dir: Cache root handed to a newly created CacheManager.
                Only consulted when no manager is registered under
                ``cache_manager_name`` at access time.
        """
        # Lazily populated by the corresponding properties.
        self._http_cache: HttpCacheConfig | None = None
        self._crawl_limits: CrawlLimits | None = None
        self._cache_dir = cache_dir
        self._cache_manager_name = cache_manager_name

    @classmethod
    def from_env(cls, cache_manager_name: str | None = None) -> ConfigService:
        """Build a ConfigService without an explicit cache directory.

        Args:
            cache_manager_name: Optional registry name of the CacheManager.

        Returns:
            A new instance whose cache location, if a manager has to be
            created, is left to CacheManager's own resolution.
        """
        return cls(cache_manager_name=cache_manager_name)

    @property
    def cache_manager(self) -> CacheManager:
        """Return the CacheManager for this service, creating it if needed.

        The registry is consulted first. When no manager is registered under
        the configured name, a new one is created from ``cache_dir``,
        registered (replacing any entry with that name) and returned.

        Returns:
            The registered or newly created CacheManager.
        """
        try:
            registered = resolve_cache_manager(self._cache_manager_name)
        except ValueError:
            # Nothing registered under that name: build one and register it.
            fallback_name = self._cache_manager_name or "default"
            created = CacheManager(
                root_path=self._cache_dir,
                name=fallback_name,
                ensure_paths=True,
            )
            created.register(force=True)
            return created
        else:
            return registered

    @property
    def http_cache(self) -> HttpCacheConfig:
        """Return the HTTP cache configuration, building it on first access.

        Returns:
            The HttpCacheConfig produced by
            ``HttpCacheConfig.resolve_http_cache_config`` for this service's
            HTTP cache file.
        """
        if self._http_cache is not None:
            return self._http_cache
        cache_file = self.cache_manager.http_cache_file
        self._http_cache = HttpCacheConfig.resolve_http_cache_config(cache_file=cache_file)
        return self._http_cache

    @property
    def crawl_limits(self) -> CrawlLimits:
        """Return the crawl limits, building a default instance on first access.

        Returns:
            A CrawlLimits created with no arguments; the same instance is
            returned on later accesses.
        """
        if self._crawl_limits is not None:
            return self._crawl_limits
        self._crawl_limits = CrawlLimits()
        return self._crawl_limits

    def get_ai_config(self, **overrides: str | int | Path | None) -> object:
        """Return the AI configuration built by ``AiConfig.from_env``.

        Args:
            **overrides: Extra keyword arguments forwarded unchanged to
                ``AiConfig.from_env`` alongside this service's cache manager name.

        Returns:
            Whatever ``AiConfig.from_env`` returns.

        Raises:
            ImportError: If the AI module could not be imported.
        """
        if _AiConfig is None:
            raise ImportError("AI extras are not installed. Run: uv add tdoc-crawler[ai]")
        return _AiConfig.from_env(cache_manager_name=self._cache_manager_name, **overrides)


__all__ = ["ConfigService"]
+67 −0
Original line number Diff line number Diff line
"""Protocol definitions for database factory abstractions."""

from __future__ import annotations

from pathlib import Path
from typing import Protocol, runtime_checkable

from tdoc_crawler.database.specs import SpecDatabase
from tdoc_crawler.database.tdocs import TDocDatabase


@runtime_checkable
class DatabaseFactory(Protocol):
    """Structural interface for objects that hand out database instances.

    Any object providing the three getters below satisfies the protocol.
    """

    def get_tdoc_db(self) -> TDocDatabase:
        """Return a TDocDatabase instance.

        NOTE(review): whether the instance is already opened is left to the
        implementation; DefaultDatabaseFactory returns a freshly constructed one.
        """
        ...

    def get_spec_db(self) -> SpecDatabase:
        """Return a SpecDatabase instance.

        NOTE(review): whether the instance is already opened is left to the
        implementation; DefaultDatabaseFactory returns a freshly constructed one.
        """
        ...

    def get_meeting_db(self) -> TDocDatabase:
        """Return a TDocDatabase to be used for meeting operations.

        NOTE(review): the return type relies on TDocDatabase providing the
        meeting operations (reportedly by inheriting MeetingDatabase) - confirm.
        """
        ...


class DefaultDatabaseFactory:
    """DatabaseFactory implementation that builds every database from one file path."""

    def __init__(self, db_file: Path) -> None:
        """Remember the database file used for every instance this factory builds.

        Args:
            db_file: Path passed to each database constructor.
        """
        self._db_file = db_file

    def get_tdoc_db(self) -> TDocDatabase:
        """Build a TDocDatabase for the configured file.

        Returns:
            A new TDocDatabase constructed on every call; opening and closing
            it is left to the caller.
        """
        db_path = self._db_file
        return TDocDatabase(db_path)

    def get_spec_db(self) -> SpecDatabase:
        """Build a SpecDatabase for the configured file.

        Returns:
            A new SpecDatabase constructed on every call; opening and closing
            it is left to the caller.
        """
        db_path = self._db_file
        return SpecDatabase(db_path)

    def get_meeting_db(self) -> TDocDatabase:
        """Build a TDocDatabase to be used for meeting operations.

        A TDocDatabase is returned because it is expected to expose the
        meeting operations as well.

        Returns:
            A new TDocDatabase constructed on every call; opening and closing
            it is left to the caller.
        """
        db_path = self._db_file
        return TDocDatabase(db_path)


__all__ = ["DatabaseFactory", "DefaultDatabaseFactory"]
+10 −0
Original line number Diff line number Diff line
# Dependency Injection Package

"""Dependency injection infrastructure for tdoc-crawler.

This package provides the ServiceContainer for centralized service lifecycle management.
"""

from tdoc_crawler.di.container import ServiceContainer

__all__ = ["ServiceContainer"]
+217 −0
Original line number Diff line number Diff line
"""ServiceContainer for centralized dependency management."""

from __future__ import annotations

from pathlib import Path
from typing import Any

import lancedb
import requests

from tdoc_crawler.config import CacheManager
from tdoc_crawler.config.service import ConfigService
from tdoc_crawler.database.meetings import MeetingDatabase
from tdoc_crawler.database.specs import SpecDatabase
from tdoc_crawler.database.tdocs import TDocDatabase
from tdoc_crawler.http_client import create_cached_session
from tdoc_crawler.logging import get_logger

logger = get_logger(__name__)


class ServiceContainer:
    """Manages lifecycle and resolution of core services.

    The container creates its services lazily on first access and caches
    them for reuse: the cache manager, the configuration service, the
    database objects, the LanceDB connections and the HTTP session.
    ``close()`` (or leaving the ``with`` block) releases everything that
    was created.
    """

    def __init__(
        self,
        cache_dir: Path | None = None,
        name: str = "default"
    ) -> None:
        """Initialize container with optional cache directory.

        Nothing is created here; services are built on first access.

        Args:
            cache_dir: Optional cache directory path. If None, the location
                is left to CacheManager's own resolution.
            name: Container identifier; also used as the CacheManager name.
        """
        self._cache_dir = cache_dir
        self._name = name
        self._cache_manager: CacheManager | None = None
        self._config_service: ConfigService | None = None
        self._session: requests.Session | None = None
        # Cached database objects / connections keyed by kind (and model).
        self._db_instances: dict[str, Any] = {}

    @property
    def cache_manager(self) -> CacheManager:
        """Get the container's cache manager, creating it on first access.

        The manager is created with ``ensure_paths=True`` and registered
        under the container name.

        Returns:
            CacheManager instance owned by this container.
        """
        if self._cache_manager is None:
            self._cache_manager = CacheManager(
                root_path=self._cache_dir,
                name=self._name,
                ensure_paths=True
            )
            self._cache_manager.register()
            logger.debug(f"Created and registered CacheManager: {self._name}")
        return self._cache_manager

    def get_cache_manager(self) -> CacheManager:
        """Get the CacheManager instance managed by this container.

        Convenience method alias for the ``cache_manager`` property.

        Returns:
            CacheManager instance owned by this container.
        """
        return self.cache_manager

    def ensure_paths(self) -> None:
        """Ensure all cache directories managed by this container exist.

        Delegates to the cache manager's ``ensure_paths`` and
        ``ensure_ai_paths``.
        """
        self.cache_manager.ensure_paths()
        self.cache_manager.ensure_ai_paths()

    @property
    def config(self) -> ConfigService:
        """Get ConfigService providing unified access to all configuration.

        Returns:
            ConfigService instance bound to this container's cache manager name.
        """
        if self._config_service is None:
            # Ensure cache manager is initialized before ConfigService uses it
            _ = self.cache_manager
            self._config_service = ConfigService(
                cache_manager_name=self._name,
                cache_dir=self._cache_dir,
            )
        return self._config_service

    def get_tdoc_db(self) -> TDocDatabase:
        """Get the cached TDoc database instance, creating it on first call.

        Returns:
            TDocDatabase constructed with the cache manager's ``db_file``.
        """
        key = "tdoc"
        if key not in self._db_instances:
            self._db_instances[key] = TDocDatabase(self.cache_manager.db_file)
        return self._db_instances[key]

    def get_spec_db(self) -> SpecDatabase:
        """Get the cached Spec database instance, creating it on first call.

        Returns:
            SpecDatabase constructed with the cache manager's ``db_file``.
        """
        key = "spec"
        if key not in self._db_instances:
            self._db_instances[key] = SpecDatabase(self.cache_manager.db_file)
        return self._db_instances[key]

    def get_meeting_db(self) -> MeetingDatabase:
        """Get the cached Meeting database instance, creating it on first call.

        Returns:
            MeetingDatabase constructed with the cache manager's ``db_file``.
        """
        key = "meeting"
        if key not in self._db_instances:
            self._db_instances[key] = MeetingDatabase(self.cache_manager.db_file)
        return self._db_instances[key]

    def get_ai_db(self, embedding_model: str | None = None) -> lancedb.db.LanceDBConnection:
        """Get LanceDB connection for AI storage.

        The connection lives in a ``lancedb`` directory below the cache
        manager's ``ai_embed_dir`` for the model, i.e.
        ``{ai_cache_dir}/{provider}/{model}/lancedb/`` for an identifier in
        ``<provider>/<model>`` form. The directory is created if missing and
        one connection is cached per model.

        Args:
            embedding_model: Embedding model identifier in ``<provider>/<model>`` format.
                If None, uses ``DEFAULT_EMBEDDING_MODEL`` from ``tdoc_crawler.ai.config``.

        Returns:
            LanceDB connection at the model-specific path.
        """
        # Imported lazily so the container can be used without the AI module.
        from tdoc_crawler.ai.config import DEFAULT_EMBEDDING_MODEL

        resolved_model = embedding_model or DEFAULT_EMBEDDING_MODEL
        key = f"ai_db:{resolved_model}"
        if key not in self._db_instances:
            store_path = self.cache_manager.ai_embed_dir(resolved_model) / "lancedb"
            store_path.mkdir(parents=True, exist_ok=True)
            self._db_instances[key] = lancedb.connect(str(store_path))
            logger.debug(f"Created LanceDB connection at: {store_path}")
        return self._db_instances[key]

    def get_session(self) -> requests.Session:
        """Get the HTTP session, creating it on first call.

        The session is built by ``create_cached_session`` for this
        container's cache manager name and reused on later calls.

        Returns:
            The cached requests.Session.
        """
        if self._session is None:
            self._session = create_cached_session(
                cache_manager_name=self._name
            )
            logger.debug(f"Created HTTP session for container: {self._name}")
        return self._session

    def close(self) -> None:
        """Close container and release resources.

        Closes the HTTP session if one was created, then closes every cached
        database object that exposes ``close()`` and forgets the cache
        manager. Safe to call multiple times.

        A failure while closing a database is logged and skipped. A failure
        while closing the HTTP session still propagates to the caller, but
        only after the databases have been closed and the container state
        has been reset, so nothing is leaked.
        """
        # Detach the session first so the container never keeps a reference
        # to a session whose close() failed half-way.
        session, self._session = self._session, None
        try:
            if session is not None:
                session.close()
                logger.debug("Closed HTTP session")
        finally:
            # Runs even when session.close() raised.
            for key, db in self._db_instances.items():
                if hasattr(db, 'close'):
                    try:
                        db.close()
                        logger.debug(f"Closed database: {key}")
                    except Exception as e:
                        logger.warning(f"Failed to close database {key}: {e}")
            self._db_instances.clear()

            # Clear cache manager reference
            self._cache_manager = None

    def __enter__(self) -> ServiceContainer:
        """Context manager entry; returns the container itself."""
        return self

    def __exit__(self, exc_type: Any, exc_val: Any, exc_tb: Any) -> None:
        """Context manager exit - ensures cleanup via ``close()``."""
        self.close()


__all__ = ["ServiceContainer"]
Loading