Commit b2d45a25 authored by Jan Reimes
Browse files

feat(http): enhance HTTP cache configuration and management

* Introduce explicit cache_file parameter in HttpCacheConfig.
* Update app.py to build HTTP cache config for multiprocessing.
* Modify http_client.py to handle cache_file resolution.
parent 97672aaa
Loading
Loading
Loading
Loading
+8 −1
Original line number Diff line number Diff line
@@ -76,7 +76,7 @@ from tdoc_crawler.logging import DEFAULT_LEVEL as DEFAULT_VERBOSITY
from tdoc_crawler.logging import get_console, set_verbosity
from tdoc_crawler.meetings.models import MeetingCrawlConfig, MeetingQueryConfig
from tdoc_crawler.meetings.operations.crawl import MeetingCrawler
from tdoc_crawler.models.base import OutputFormat, SortOrder
from tdoc_crawler.models.base import HttpCacheConfig, OutputFormat, SortOrder
from tdoc_crawler.models.crawl_limits import CrawlLimits
from tdoc_crawler.specs.downloads import SpecDownloads
from tdoc_crawler.specs.models import SpecQueryFilters
@@ -144,6 +144,12 @@ def crawl_tdocs(
    working_groups = parse_working_groups(working_group, subgroups)

    limits = CrawlLimits.build(limit_tdocs, limit_meetings, limit_meetings_per_wg, limit_wgs)

    # Build HTTP cache config with an explicit cache_file for multiprocessing workers
    http_cache = HttpCacheConfig.resolve_http_cache_config(
        cache_ttl=None, cache_refresh_on_access=None, max_retries=max_retries, cache_file=manager.http_cache_file
    )

    config = TDocCrawlConfig(
        working_groups=working_groups,
        subgroups=subgroups,
@@ -160,6 +166,7 @@ def crawl_tdocs(
        use_document_list=True,
        allow_parallel_fallback=True,
        use_parallel_crawling=False,
        http_cache=http_cache,
    )

    db_file = manager.db_file
+7 −14
Original line number Diff line number Diff line
@@ -10,7 +10,7 @@ from hishel import SyncBaseStorage, SyncSqliteStorage
from hishel.requests import CacheAdapter
from urllib3.util.retry import Retry

from tdoc_crawler.config import resolve_cache_manager
from tdoc_crawler.config import DEFAULT_HTTP_CACHE_FILENAME, resolve_cache_manager
from tdoc_crawler.logging import get_logger
from tdoc_crawler.models import HttpCacheConfig

@@ -97,25 +97,18 @@ def create_cached_session(
    Returns:
        Configured requests.Session with caching enabled
    """
    manager = resolve_cache_manager(cache_manager_name)
    http_cache = http_cache or HttpCacheConfig.resolve_http_cache_config()
    if http_cache.cache_file is None:
        # If no explicit cache file is provided, determine it using the cache manager
        manager = resolve_cache_manager(cache_manager_name)
        http_cache.cache_file = str(manager.http_cache_file)

    # Determine database path: use explicit cache_dir if provided, otherwise from manager
    if http_cache.cache_dir:
        # If cache_dir is a file, use it directly. If it's a directory, append default filename.
        if http_cache.cache_dir.suffix in (".sqlite", ".sqlite3", ".db"):
            database_path = str(http_cache.cache_dir)
        else:
            database_path = str(http_cache.cache_dir / "http-cache.sqlite3")
    else:
        database_path = str(manager.http_cache_file)

    logger.debug(f"Creating cached HTTP session with cache_db={database_path})")
    logger.debug(f"Creating cached HTTP session with cache_db={http_cache.cache_file})")
    logger.debug(f"HTTP cache config: ttl={http_cache.ttl}, refresh_ttl_on_access={http_cache.refresh_ttl_on_access}, max_retries={http_cache.max_retries}")

    # Create SQLite storage backend
    storage = SyncSqliteStorage(
        database_path=database_path,
        database_path=str(http_cache.cache_file),
        default_ttl=http_cache.ttl,
        refresh_ttl_on_access=http_cache.refresh_ttl_on_access,
    )
+8 −3
Original line number Diff line number Diff line
@@ -39,11 +39,15 @@ class HttpCacheConfig:
    ttl: int = _DEFAULT_TTL
    refresh_ttl_on_access: bool = _DEFAULT_TTL_ON_ACCESS
    max_retries: int = _DEFAULT_MAX_RETRIES
    cache_dir: Path | None = None
    cache_file: Path | None = None

    @classmethod
    def resolve_http_cache_config(
        cls, cache_ttl: int | None = None, cache_refresh_on_access: bool | None = None, max_retries: int | None = None
        cls,
        cache_ttl: int | None = None,
        cache_refresh_on_access: bool | None = None,
        max_retries: int | None = None,
        cache_file: Path | None = None,
    ) -> HttpCacheConfig:
        """Resolve HTTP cache configuration from CLI parameters and environment variables.

@@ -51,6 +55,7 @@ class HttpCacheConfig:
            cache_ttl: TTL for cache entries (CLI parameter)
            cache_refresh_on_access: Whether to refresh TTL on access (CLI parameter)
            max_retries: Maximum number of retry attempts for failed requests (CLI parameter)
            cache_file: Optional explicit cache file path (CLI parameter). If not provided, will be determined by CacheManager.

        Returns:
            HttpCacheConfig instance with resolved values
@@ -72,7 +77,7 @@ class HttpCacheConfig:
            refresh_on_access = _DEFAULT_TTL_ON_ACCESS

        max_retries = max_retries or _DEFAULT_MAX_RETRIES
        return HttpCacheConfig(ttl=ttl, refresh_ttl_on_access=refresh_on_access, max_retries=max_retries)
        return HttpCacheConfig(ttl=ttl, refresh_ttl_on_access=refresh_on_access, max_retries=max_retries, cache_file=cache_file)


# TODO: Classes derived from BaseConfigModel should be plain dataclasses: they are never stored in the DB and are used only for config parsing. BaseModel is better suited to persistent models that need validation.