Commit 928d0a36 authored by Jan Reimes's avatar Jan Reimes
Browse files

refactor(cache): make CacheManager primary source for HTTP cache path and add...

refactor(cache): make CacheManager primary source for HTTP cache path and add HttpCacheConfig defaults
parent eaa07db9
Loading
Loading
Loading
Loading
+14 −3
Original line number Diff line number Diff line
@@ -99,12 +99,23 @@ def create_cached_session(
    """
    manager = resolve_cache_manager(cache_manager_name)
    http_cache = http_cache or HttpCacheConfig.resolve_http_cache_config()
    logger.debug(f"Creating cached HTTP session with cache_db={manager.http_cache_file})")

    # Determine database path: use explicit cache_dir if provided, otherwise from manager
    if http_cache.cache_dir:
        # If cache_dir is a file, use it directly. If it's a directory, append default filename.
        if http_cache.cache_dir.suffix in (".sqlite", ".sqlite3", ".db"):
            database_path = str(http_cache.cache_dir)
        else:
            database_path = str(http_cache.cache_dir / "http-cache.sqlite3")
    else:
        database_path = str(manager.http_cache_file)

    logger.debug(f"Creating cached HTTP session with cache_db={database_path})")
    logger.debug(f"HTTP cache config: ttl={http_cache.ttl}, refresh_ttl_on_access={http_cache.refresh_ttl_on_access}, max_retries={http_cache.max_retries}")

    # Create SQLite storage backend (cache_dir is the database file path)
    # Create SQLite storage backend
    storage = SyncSqliteStorage(
        database_path=str(manager.http_cache_file),
        database_path=database_path,
        default_ttl=http_cache.ttl,
        refresh_ttl_on_access=http_cache.refresh_ttl_on_access,
    )
+13 −3
Original line number Diff line number Diff line
@@ -5,9 +5,12 @@ from __future__ import annotations
import os
from dataclasses import dataclass
from enum import StrEnum
from pathlib import Path

from pydantic import BaseModel, Field

from tdoc_crawler.config import resolve_cache_manager


class OutputFormat(StrEnum):
    """Supported output formats for CLI responses."""
@@ -33,9 +36,10 @@ _DEFAULT_MAX_RETRIES: int = 3
class HttpCacheConfig:
    """HTTP cache configuration for hishel caching."""

    ttl: int
    refresh_ttl_on_access: bool
    max_retries: int
    ttl: int = _DEFAULT_TTL
    refresh_ttl_on_access: bool = _DEFAULT_TTL_ON_ACCESS
    max_retries: int = _DEFAULT_MAX_RETRIES
    cache_dir: Path | None = None

    @classmethod
    def resolve_http_cache_config(
@@ -71,6 +75,7 @@ class HttpCacheConfig:
        return HttpCacheConfig(ttl=ttl, refresh_ttl_on_access=refresh_on_access, max_retries=max_retries)


# TODO: Classes derived from BaseConfigModel should be plain dataclasses: they are never stored in the DB and are only used for config parsing. BaseModel is better suited to persistent models that need validation.
class BaseConfigModel(BaseModel):
    """Shared configuration base enabling attribute parsing and whitespace handling."""

@@ -79,6 +84,11 @@ class BaseConfigModel(BaseModel):
    cache_manager_name: str | None = Field(default=None, description="Cache configuration manager name")
    http_cache: HttpCacheConfig = Field(default_factory=HttpCacheConfig.resolve_http_cache_config, description="HTTP cache configuration")

    @property
    def cache_dir(self) -> Path:
        """Return the cache directory being used.

        The value is the ``root`` attribute of the cache manager resolved
        from ``cache_manager_name``.
        """
        manager = resolve_cache_manager(self.cache_manager_name)
        return manager.root


@dataclass
class PortalCredentials:
+4 −5
Original line number Diff line number Diff line
@@ -643,11 +643,10 @@ class HybridTDocCrawler:
                task = asyncio.create_task(
                    runner.run(
                        fetch_meeting_document_list_subinterpreter,
                        meeting.meeting_id,
                        str(config.cache_dir),
                        config.http_cache.ttl,
                        config.http_cache.refresh_ttl_on_access,
                        config.timeout,
                        meeting_id=meeting.meeting_id,
                        timeout=config.timeout,
                        http_cache=config.http_cache,
                        cache_manager_name=config.cache_manager_name,
                    )
                )
                tasks[task] = meeting