Commit f1c3ba55 authored by Jan Reimes
Browse files

♻️ refactor(http_client): improve download_to_file function signature and session handling

parent f4d2a5e1
Loading
Loading
Loading
Loading
+20 −7
Original line number Diff line number Diff line
"""HTTP client factory with hishel caching support."""

from __future__ import annotations
from babel.messages.frontend import log

import logging
from pathlib import Path
@@ -18,13 +17,22 @@ from tdoc_crawler.models import HttpCacheConfig
logger = logging.getLogger(__name__)


def download_to_path(url: str, destination: Path, session: requests.Session | None = None, cache_manager_name: str | None = None) -> None:
def download_to_file(
    url: str,
    destination: Path,
    session: requests.Session | None = None,
    close_session: bool = True,
    http_cache: HttpCacheConfig | None = None,
    cache_manager_name: str | None = None,
) -> requests.Session | None:
    """Download a file from URL to destination path.

    Args:
        url: Source URL
        destination: Destination path
        session: Optional requests.Session to reuse. If None, a temporary cached session is created.
        close_session: Whether to close the session after download. Only applicable if a temporary session is created.
        http_cache: Optional HTTP cache configuration
        cache_manager_name: Optional cache manager name to determine cache directory when creating a temporary session.

    Raises:
@@ -37,11 +45,10 @@ def download_to_path(url: str, destination: Path, session: requests.Session | No
    if not lowered.startswith(allowed_schemes):
        raise ValueError("unsupported-url-scheme")

    # Use provided session or create a temporary one
    # Use provided session or create a new one (might be used for multiple downloads, so we don't want to create a new session for each)
    temp_session: requests.Session | None = None
    if session is None:
        manager = resolve_cache_manager(cache_manager_name)
        temp_session = create_cached_session(manager.http_cache_file)
        temp_session = create_cached_session(http_cache=http_cache, cache_manager_name=cache_manager_name)
        active_session = temp_session
    else:
        active_session = session
@@ -66,13 +73,19 @@ def download_to_path(url: str, destination: Path, session: requests.Session | No
            for chunk in response.iter_content(chunk_size=8192):
                if chunk:
                    target.write(chunk)

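        # Return the active session so the caller can reuse it, or None when close_session is set
        # (only a temporary session created here is actually closed; a caller-provided one is left open)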
        if close_session:
            active_session = None
        return active_session

    finally:
        if temp_session:
        if temp_session and close_session:
            temp_session.close()


def create_cached_session(
    http_cache: HttpCacheConfig | None,
    http_cache: HttpCacheConfig | None = None,
    cache_manager_name: str | None = None,
) -> requests.Session:
    """Create a requests.Session with hishel caching enabled.
+11 −5
Original line number Diff line number Diff line
"""Spec download orchestration."""

import asyncio
import logging
import zipfile
from pathlib import Path

import requests
from zipinspect import HTTPZipReader

from tdoc_crawler.config import resolve_cache_manager
from tdoc_crawler.constants.urls import SPEC_URL_TEMPLATE
from tdoc_crawler.http_client import download_to_path
from tdoc_crawler.database.specs import SpecDatabase
from tdoc_crawler.http_client import download_to_file
from tdoc_crawler.specs.sources.base import SpecSource
from tdoc_crawler.utils.normalization import normalize_spec_number

@@ -19,9 +20,14 @@ _logger = logging.getLogger(__name__)
class SpecDownloads:
    """Download and extraction utilities for specs."""

    def __init__(self, database, cache_manager_name: str | None = None) -> None:
    def __init__(self, database: SpecDatabase, cache_manager_name: str | None = None) -> None:
        self._database = database
        self._cache_manager = resolve_cache_manager(cache_manager_name)
        self.session: requests.Session | None = None

    def __del__(self) -> None:
        if self.session:
            self.session.close()

    def checkout_specs(
        self,
@@ -184,8 +190,8 @@ class SpecDownloads:
            return False

    def _download_full_zip(self, url: str, target_path: Path) -> None:
        """Download full zip file."""
        download_to_path(url, target_path, cache_manager_name=self._cache_manager.name)
        """Download full zip file, re-use session if already created for doc-only attempt."""
        self.session = download_to_file(url, target_path, session=self.session, close_session=False, cache_manager_name=self._cache_manager.name)

    @staticmethod
    def _extract_zip(zip_file: Path, extract_dir: Path, keep_zip: bool = True) -> None: