Commit 2b093931 authored by Jan Reimes's avatar Jan Reimes
Browse files

♻️ refactor(http_client,sources,operations): replace CacheManager and...

♻️ refactor(http_client,sources,operations): replace CacheManager and HttpCacheConfig with PathConfig and HttpConfig

- create_cached_session: drop cache_manager_name and http_cache params; accept http_cache_file: Path | None directly
- resolve_ssl_verify: remove TDC_VERIFY_SSL env-var fallback (handled by pydantic-settings)
- PortalClient: replace http_cache/cache_manager_name with http_config/http_cache_file
- TDocSourceConfig: replace http_cache/cache_manager_name with http_config/http_cache_file/db_file
- All TDoc/Spec sources and downloads: propagate http_cache_file path instead of manager name
- crawl operations: inline limit fields, pass http_config instead of http_cache+cache_manager_name
- fetch.py: remove resolve_cache_manager; fetch_missing_tdocs accepts http_cache_file/db_file
parent 8b7d7e35
Loading
Loading
Loading
Loading
+11 −9
Original line number Diff line number Diff line
@@ -14,15 +14,17 @@ For backward compatibility, standalone functions are also provided.
from __future__ import annotations

import re
from pathlib import Path
from typing import Any

import requests

from tdoc_crawler.config.settings import HttpConfig
from tdoc_crawler.constants.urls import LOGIN_URL, PORTAL_BASE_URL, TDOC_DOWNLOAD_URL, TDOC_VIEW_URL
from tdoc_crawler.credentials import resolve_credentials
from tdoc_crawler.http_client import create_cached_session, resolve_ssl_verify
from tdoc_crawler.logging import get_logger
from tdoc_crawler.models.base import HttpCacheConfig, PortalCredentials
from tdoc_crawler.models.base import PortalCredentials
from tdoc_crawler.parsers.portal import PortalParsingError, parse_tdoc_portal_page
from tdoc_crawler.tdocs.models import TDocMetadata

@@ -60,8 +62,8 @@ class PortalClient:
        session: requests.Session | None = None,
        credentials: PortalCredentials | None = None,
        timeout: int = 30,
        http_cache: HttpCacheConfig | None = None,
        cache_manager_name: str | None = None,
        http_config: HttpConfig | None = None,
        http_cache_file: Path | None = None,
        verify: bool | str | None = None,
    ) -> None:
        """Initialize portal client.
@@ -70,14 +72,14 @@ class PortalClient:
            session: Optional requests.Session to reuse
            credentials: Optional portal credentials (if not provided, will attempt to resolve registered credentials)
            timeout: Request timeout in seconds
            http_cache: Optional HTTP cache configuration
            cache_manager_name: Optional cache manager name for resolving cache configuration
            http_config: Optional HTTP configuration from ThreeGPPConfig
            http_cache_file: Optional explicit path to the HTTP cache database
            verify: SSL certificate verification mode (bool or CA bundle path)
        """
        self._credentials = credentials
        self._cache_manager_name = cache_manager_name
        self._http_cache_file = http_cache_file
        self.timeout = timeout
        self._http_cache = http_cache
        self._http_config = http_config
        self._verify = verify
        self._authenticated = False
        self._session = session
@@ -299,8 +301,8 @@ class PortalClient:
        """
        if self._session is None:
            self._session = create_cached_session(
                http_config=self._http_cache,
                cache_manager_name=self._cache_manager_name,
                http_config=self._http_config,
                http_cache_file=self._http_cache_file,
                verify=self._verify_resolved,
            )

+7 −5
Original line number Diff line number Diff line
@@ -2,6 +2,8 @@

from __future__ import annotations

from pathlib import Path

import requests


@@ -11,14 +13,14 @@ class DefaultHttpClientProvider:
    Creates a new session on first call and caches it for reuse.
    """

    def __init__(self, cache_manager_name: str | None = None) -> None:
        """Initialize with optional cache manager name.
    def __init__(self, http_cache_file: Path | None = None) -> None:
        """Initialize with optional HTTP cache file path.

        Args:
            cache_manager_name: Optional name of the cache manager to use for
                                HTTP cache storage configuration.
            http_cache_file: Optional explicit path to the HTTP cache database.
                             Falls back to PathConfig default when caching is enabled.
        """
        self._cache_manager_name = cache_manager_name
        self._http_cache_file = http_cache_file
        self._session: requests.Session | None = None

    def close(self) -> None:
+30 −57
Original line number Diff line number Diff line
@@ -2,7 +2,6 @@

from __future__ import annotations

import os
import ssl
from dataclasses import dataclass
from pathlib import Path
@@ -15,16 +14,12 @@ from requests.adapters import HTTPAdapter
from truststore import SSLContext as TruststoreSSLContext
from urllib3.util.retry import Retry

from tdoc_crawler.config import resolve_cache_manager
from tdoc_crawler.config.settings import HttpConfig
from tdoc_crawler.config.settings import HttpConfig, PathConfig
from tdoc_crawler.constants.urls import BROWSER_HEADERS
from tdoc_crawler.logging import get_logger
from tdoc_crawler.models import HttpCacheConfig

logger = get_logger(__name__)

SSL_DISABLED_ENV_VALUES = {"false", "0", "no", "off", "f", "n"}


class SSLContextHTTPAdapter(HTTPAdapter):
    """HTTP adapter that can enforce a specific SSL context."""
@@ -85,18 +80,16 @@ def resolve_ssl_verify(
    verify: bool | str | None = None,
    http_config: HttpConfig | None = None,
) -> bool | str:
    """Resolve SSL verification behavior from explicit argument or environment.
    """Resolve SSL verification behavior from explicit argument or HttpConfig.

    Resolution order:
    1. Explicit `verify` parameter
    2. http_config.verify_ssl if provided
    3. TDC_VERIFY_SSL environment variable
    4. Default (True)
    3. Default (True)

    Args:
        verify: Optional explicit SSL verification mode.
        http_config: Optional HttpConfig from TDocCrawlerConfig. When provided,
                    settings are taken from here with fallback to env vars.
        http_config: Optional HttpConfig. When provided, verify_ssl is used as fallback.

    Returns:
        Either a boolean verification flag or a certificate bundle path.
@@ -105,16 +98,6 @@ def resolve_ssl_verify(
        resolved = verify
    elif http_config is not None:
        resolved = http_config.verify_ssl
    else:
        env_value = os.getenv("TDC_VERIFY_SSL")
        if env_value is None:
            resolved = True
        else:
            normalized = env_value.strip().lower()
            if normalized in SSL_DISABLED_ENV_VALUES:
                resolved = False
            elif normalized:
                resolved = env_value.strip()
    else:
        resolved = True

@@ -143,8 +126,7 @@ def download_to_file(
    destination: Path,
    session: requests.Session | None = None,
    close_session: bool = True,
    http_cache: HttpCacheConfig | None = None,
    cache_manager_name: str | None = None,
    http_cache_file: Path | None = None,
    http_cache_enabled: bool | None = None,
    pool_config: PoolConfig | None = None,
    verify: bool | str | None = None,
@@ -157,13 +139,11 @@ def download_to_file(
        destination: Destination path
        session: Optional requests.Session to reuse. If None, a temporary cached session is created.
        close_session: Whether to close the session after download. Only applicable if a temporary session is created.
        http_cache: Optional HTTP cache configuration
        cache_manager_name: Optional cache manager name to determine cache directory when creating a temporary session.
        http_cache_enabled: Whether to enable HTTP caching. If None, defaults to True.
        http_cache_file: Optional explicit path to the HTTP cache database. Falls back to PathConfig default.
        http_cache_enabled: Whether to enable HTTP caching. If None, defaults to http_config.cache_enabled or True.
        pool_config: Optional connection pool configuration.
        verify: SSL certificate verification mode. Can be bool or CA bundle path.
        http_config: Optional HttpConfig from TDocCrawlerConfig. When provided,
                    settings are taken from here with fallback to env vars.
        http_config: HttpConfig from ThreeGPPConfig. Controls caching, retries, SSL.

    Raises:
        ValueError: If URL scheme is not supported
@@ -179,8 +159,7 @@ def download_to_file(
    temp_session: requests.Session | None = None
    if session is None:
        temp_session = create_cached_session(
            http_cache=http_cache,
            cache_manager_name=cache_manager_name,
            http_cache_file=http_cache_file,
            http_cache_enabled=http_cache_enabled,
            pool_config=pool_config,
            verify=verify,
@@ -210,8 +189,7 @@ def download_to_file(


def create_cached_session(
    http_cache: HttpCacheConfig | None = None,
    cache_manager_name: str | None = None,
    http_cache_file: Path | None = None,
    http_cache_enabled: bool | None = None,
    pool_config: PoolConfig | None = None,
    verify: bool | str | None = None,
@@ -220,16 +198,14 @@ def create_cached_session(
    """Create a requests.Session with hishel caching enabled.

    Args:
        http_cache: HTTP cache configuration
        cache_manager_name: Optional cache manager name to determine cache configuration.
        http_cache_enabled: Whether to enable HTTP caching. If None, defaults to True.
                         Can be set via HTTP_CACHE_ENABLED environment variable.
        http_cache_file: Optional explicit path to the HTTP cache database.
                         Falls back to PathConfig().http_cache_file when caching is enabled.
        http_cache_enabled: Whether to enable HTTP caching. If None, uses http_config.cache_enabled or True.
        pool_config: Optional connection pool configuration.
                When provided, pool settings are applied to the active adapter
                (cache adapter when caching is enabled, HTTPAdapter otherwise).
        verify: SSL certificate verification mode. Can be bool or CA bundle path.
        http_config: Optional HttpConfig from TDocCrawlerConfig. When provided,
                    settings are taken from here with fallback to env vars.
        http_config: HttpConfig from ThreeGPPConfig. Controls caching TTL, retries, SSL.

    Returns:
        Configured requests.Session with caching enabled (unless disabled)
@@ -243,14 +219,9 @@ def create_cached_session(
    ssl_context = _resolve_ssl_context(verify_mode)
    session.verify = verify_mode

    # Check if caching is disabled via parameter, http_config, or environment variable
    # Resolve cache enabled flag: explicit param → http_config → default True
    if http_cache_enabled is None:
        if http_config is not None:
            http_cache_enabled = http_config.cache_enabled
        else:
            # Fallback to env var (backward compatibility)
            env_enabled = os.getenv("HTTP_CACHE_ENABLED", "").lower()
            http_cache_enabled = env_enabled not in ("false", "0", "no", "off", "f", "n")
        http_cache_enabled = http_config.cache_enabled if http_config is not None else True

    # If caching is disabled, optionally configure a pooled HTTP adapter and return.
    if not http_cache_enabled:
@@ -278,26 +249,28 @@ def create_cached_session(
        logger.debug("Creating plain HTTP session (caching disabled)")
        return session

    http_cache = http_cache or HttpCacheConfig.resolve_http_cache_config()
    if http_cache.cache_file is None:
        # If no explicit cache file is provided, determine it using the cache manager
        manager = resolve_cache_manager(cache_manager_name)
        http_cache.cache_file = str(manager.http_cache_file)
    # Resolve cache settings from http_config or defaults
    ttl = http_config.cache_ttl if http_config is not None else 7200
    refresh_ttl_on_access = http_config.cache_refresh_on_access if http_config is not None else True
    max_retries_count = http_config.max_retries if http_config is not None else 3

    # Determine cache file path — use explicit param or fall back to PathConfig default
    cache_file = str(http_cache_file) if http_cache_file is not None else str(PathConfig().http_cache_file)

    logger.debug(f"Creating cached HTTP session with cache_db={http_cache.cache_file})")
    logger.debug(f"HTTP cache config: ttl={http_cache.ttl}, refresh_ttl_on_access={http_cache.refresh_ttl_on_access}, max_retries={http_cache.max_retries}")
    logger.debug(f"Creating cached HTTP session with cache_db={cache_file})")
    logger.debug(f"HTTP cache config: ttl={ttl}, refresh_ttl_on_access={refresh_ttl_on_access}, max_retries={max_retries_count}")

    # Create SQLite storage backend
    storage = SyncSqliteStorage(
        database_path=str(http_cache.cache_file),
        default_ttl=http_cache.ttl,
        refresh_ttl_on_access=http_cache.refresh_ttl_on_access,
        database_path=cache_file,
        default_ttl=ttl,
        refresh_ttl_on_access=refresh_ttl_on_access,
    )
    storage = cast(SyncBaseStorage, storage)

    # Configure retry strategy for the cache adapter.
    # If pool_config is set, reuse its retry settings so caching + pooling share one adapter.
    retry_attempts = pool_config.retry_attempts if pool_config and pool_config.enable_retry else http_cache.max_retries
    retry_attempts = pool_config.retry_attempts if pool_config and pool_config.enable_retry else max_retries_count
    max_retries = Retry(
        total=retry_attempts,
        backoff_factor=1,
+10 −11
Original line number Diff line number Diff line
@@ -13,7 +13,6 @@ from tdoc_crawler.database.meetings import MeetingDatabase
from tdoc_crawler.http_client import create_cached_session
from tdoc_crawler.logging import get_logger
from tdoc_crawler.meetings.models import MeetingCrawlConfig, MeetingMetadata
from tdoc_crawler.models.crawl_limits import CrawlLimits
from tdoc_crawler.models.subworking_groups import (
    CODE_INDEX,
    SUBWORKING_GROUP_RECORDS,
@@ -61,12 +60,12 @@ class MeetingCrawler:
        errors: list[str] = []
        meetings: list[MeetingMetadata] = []

        working_groups = self._limit_working_groups(config.working_groups, config.limits)
        working_groups = self._limit_working_groups(config.working_groups, config.limit_subwgs)
        existing_ids: set[int] = set()
        if config.incremental:
            existing_ids = await self.database.get_existing_meeting_ids(working_groups)
        credentials = resolve_credentials(None, None)
        session = create_cached_session(cache_manager_name=config.cache_manager_name, http_cache=config.http_cache)
        session = create_cached_session(http_config=config.http_config)
        if credentials is not None:
            session.auth = (credentials.username, credentials.password)

@@ -84,7 +83,7 @@ class MeetingCrawler:
        finally:
            session.close()

        filtered = self._apply_limits(meetings, config.limits)
        filtered = self._apply_limits(meetings, config.limit_meetings, config.limit_meetings_per_subwg)
        inserted = 0
        updated = 0
        if filtered:
@@ -130,14 +129,15 @@ class MeetingCrawler:
    def _apply_limits(
        self,
        meetings: list[MeetingMetadata],
        limits: CrawlLimits,
        limit_meetings: int | None,
        limit_meetings_per_subwg: int | None,
    ) -> list[MeetingMetadata]:
        """Apply all meeting limits from crawl configuration."""
        if not meetings:
            return []
        filtered = list(meetings)
        filtered = self._limit_meetings_per_subwg(filtered, limits.limit_meetings_per_subwg)
        filtered = self._limit_meetings(filtered, limits.limit_meetings)
        filtered = self._limit_meetings_per_subwg(filtered, limit_meetings_per_subwg)
        filtered = self._limit_meetings(filtered, limit_meetings)
        return filtered

    @staticmethod
@@ -175,13 +175,12 @@ class MeetingCrawler:
    @staticmethod
    def _limit_working_groups(
        working_groups: list[WorkingGroup],
        limits: CrawlLimits,
        limit_subwgs: int | None,
    ) -> list[WorkingGroup]:
        """Apply working group limits from crawl configuration."""
        if limits.limit_subwgs is None or limits.limit_subwgs == 0:
        if limit_subwgs is None or limit_subwgs == 0:
            return working_groups
        limit = limits.limit_subwgs
        return working_groups[:limit] if limit > 0 else working_groups[limit:]
        return working_groups[:limit_subwgs] if limit_subwgs > 0 else working_groups[limit_subwgs:]

    @staticmethod
    def _limit_meetings_per_subwg(
+4 −4
Original line number Diff line number Diff line
@@ -8,7 +8,7 @@ from typing import Any
import requests
from zipinspect import HTTPZipReader

from tdoc_crawler.config import resolve_cache_manager
from tdoc_crawler.config.settings import PathConfig
from tdoc_crawler.constants.urls import SPEC_URL_TEMPLATE
from tdoc_crawler.database.specs import SpecDatabase
from tdoc_crawler.http_client import download_to_file
@@ -23,9 +23,9 @@ _logger = get_logger(__name__)
class SpecDownloads:
    """Download and extraction utilities for specs."""

    def __init__(self, database: SpecDatabase, cache_manager_name: str | None = None) -> None:
    def __init__(self, database: SpecDatabase, http_cache_file: Path | None = None) -> None:
        self._database = database
        self._cache_manager = resolve_cache_manager(cache_manager_name)
        self._http_cache_file = http_cache_file if http_cache_file is not None else PathConfig().http_cache_file
        self.session: requests.Session | None = None

    def __del__(self) -> None:
@@ -181,7 +181,7 @@ class SpecDownloads:

    def _download_full_zip(self, url: str, target_path: Path) -> None:
        """Download full zip file, re-use session if already created for doc-only attempt."""
        self.session = download_to_file(url, target_path, session=self.session, close_session=False, cache_manager_name=self._cache_manager.name)
        self.session = download_to_file(url, target_path, session=self.session, close_session=False, http_cache_file=self._http_cache_file)

    @staticmethod
    def _filter_versions_by_release(
Loading