Loading src/tdoc_crawler/http_client.py +20 −7 Original line number Diff line number Diff line """HTTP client factory with hishel caching support.""" from __future__ import annotations from babel.messages.frontend import log import logging from pathlib import Path Loading @@ -18,13 +17,22 @@ from tdoc_crawler.models import HttpCacheConfig logger = logging.getLogger(__name__) def download_to_path(url: str, destination: Path, session: requests.Session | None = None, cache_manager_name: str | None = None) -> None: def download_to_file( url: str, destination: Path, session: requests.Session | None = None, close_session: bool = True, http_cache: HttpCacheConfig | None = None, cache_manager_name: str | None = None, ) -> requests.Session | None: """Download a file from URL to destination path. Args: url: Source URL destination: Destination path session: Optional requests.Session to reuse. If None, a temporary cached session is created. close_session: Whether to close the session after download. Only applicable if a temporary session is created. http_cache: Optional HTTP cache configuration cache_manager_name: Optional cache manager name to determine cache directory when creating a temporary session. Raises: Loading @@ -37,11 +45,10 @@ def download_to_path(url: str, destination: Path, session: requests.Session | No if not lowered.startswith(allowed_schemes): raise ValueError("unsupported-url-scheme") # Use provided session or create a temporary one # Use provided session or create a new one (might be used for multiple downloads, so we don't want to create a new session for each) temp_session: requests.Session | None = None if session is None: manager = resolve_cache_manager(cache_manager_name) temp_session = create_cached_session(manager.http_cache_file) temp_session = create_cached_session(http_cache=http_cache, cache_manager_name=cache_manager_name) active_session = temp_session else: active_session = session Loading @@ -66,13 +73,19 @@ def download_to_path(url: str, destination: Path, session: requests.Session | No for chunk in response.iter_content(chunk_size=8192): if chunk: target.write(chunk) # return session or None if we created a temporary session and are closing it if close_session: active_session = None return active_session finally: if temp_session: if temp_session and close_session: temp_session.close() def create_cached_session( http_cache: HttpCacheConfig | None, http_cache: HttpCacheConfig | None = None, cache_manager_name: str | None = None, ) -> requests.Session: """Create a requests.Session with hishel caching enabled. Loading src/tdoc_crawler/specs/downloads.py +11 −5 Original line number Diff line number Diff line """Spec download orchestration.""" import asyncio import logging import zipfile from pathlib import Path import requests from zipinspect import HTTPZipReader from tdoc_crawler.config import resolve_cache_manager from tdoc_crawler.constants.urls import SPEC_URL_TEMPLATE from tdoc_crawler.http_client import download_to_path from tdoc_crawler.database.specs import SpecDatabase from tdoc_crawler.http_client import download_to_file from tdoc_crawler.specs.sources.base import SpecSource from tdoc_crawler.utils.normalization import normalize_spec_number Loading @@ -19,9 +20,14 @@ _logger = logging.getLogger(__name__) class SpecDownloads: """Download and extraction utilities for specs.""" def __init__(self, database, cache_manager_name: str | None = None) -> None: def __init__(self, database: SpecDatabase, cache_manager_name: str | None = None) -> None: self._database = database self._cache_manager = resolve_cache_manager(cache_manager_name) self.session: requests.Session | None = None def __del__(self) -> None: if self.session: self.session.close() def checkout_specs( self, Loading Loading @@ -184,8 +190,8 @@ class SpecDownloads: return False def _download_full_zip(self, url: str, target_path: Path) -> None: """Download full zip file.""" download_to_path(url, target_path, cache_manager_name=self._cache_manager.name) """Download full zip file, re-use session if already created for doc-only attempt.""" self.session = download_to_file(url, target_path, session=self.session, close_session=False, cache_manager_name=self._cache_manager.name) @staticmethod def _extract_zip(zip_file: Path, extract_dir: Path, keep_zip: bool = True) -> None: Loading Loading
src/tdoc_crawler/http_client.py +20 −7 Original line number Diff line number Diff line """HTTP client factory with hishel caching support.""" from __future__ import annotations from babel.messages.frontend import log import logging from pathlib import Path Loading @@ -18,13 +17,22 @@ from tdoc_crawler.models import HttpCacheConfig logger = logging.getLogger(__name__) def download_to_path(url: str, destination: Path, session: requests.Session | None = None, cache_manager_name: str | None = None) -> None: def download_to_file( url: str, destination: Path, session: requests.Session | None = None, close_session: bool = True, http_cache: HttpCacheConfig | None = None, cache_manager_name: str | None = None, ) -> requests.Session | None: """Download a file from URL to destination path. Args: url: Source URL destination: Destination path session: Optional requests.Session to reuse. If None, a temporary cached session is created. close_session: Whether to close the session after download. Only applicable if a temporary session is created. http_cache: Optional HTTP cache configuration cache_manager_name: Optional cache manager name to determine cache directory when creating a temporary session. Raises: Loading @@ -37,11 +45,10 @@ def download_to_path(url: str, destination: Path, session: requests.Session | No if not lowered.startswith(allowed_schemes): raise ValueError("unsupported-url-scheme") # Use provided session or create a temporary one # Use provided session or create a new one (might be used for multiple downloads, so we don't want to create a new session for each) temp_session: requests.Session | None = None if session is None: manager = resolve_cache_manager(cache_manager_name) temp_session = create_cached_session(manager.http_cache_file) temp_session = create_cached_session(http_cache=http_cache, cache_manager_name=cache_manager_name) active_session = temp_session else: active_session = session Loading @@ -66,13 +73,19 @@ def download_to_path(url: str, destination: Path, session: requests.Session | No for chunk in response.iter_content(chunk_size=8192): if chunk: target.write(chunk) # return session or None if we created a temporary session and are closing it if close_session: active_session = None return active_session finally: if temp_session: if temp_session and close_session: temp_session.close() def create_cached_session( http_cache: HttpCacheConfig | None, http_cache: HttpCacheConfig | None = None, cache_manager_name: str | None = None, ) -> requests.Session: """Create a requests.Session with hishel caching enabled. Loading
src/tdoc_crawler/specs/downloads.py +11 −5 Original line number Diff line number Diff line """Spec download orchestration.""" import asyncio import logging import zipfile from pathlib import Path import requests from zipinspect import HTTPZipReader from tdoc_crawler.config import resolve_cache_manager from tdoc_crawler.constants.urls import SPEC_URL_TEMPLATE from tdoc_crawler.http_client import download_to_path from tdoc_crawler.database.specs import SpecDatabase from tdoc_crawler.http_client import download_to_file from tdoc_crawler.specs.sources.base import SpecSource from tdoc_crawler.utils.normalization import normalize_spec_number Loading @@ -19,9 +20,14 @@ _logger = logging.getLogger(__name__) class SpecDownloads: """Download and extraction utilities for specs.""" def __init__(self, database, cache_manager_name: str | None = None) -> None: def __init__(self, database: SpecDatabase, cache_manager_name: str | None = None) -> None: self._database = database self._cache_manager = resolve_cache_manager(cache_manager_name) self.session: requests.Session | None = None def __del__(self) -> None: if self.session: self.session.close() def checkout_specs( self, Loading Loading @@ -184,8 +190,8 @@ class SpecDownloads: return False def _download_full_zip(self, url: str, target_path: Path) -> None: """Download full zip file.""" download_to_path(url, target_path, cache_manager_name=self._cache_manager.name) """Download full zip file, re-use session if already created for doc-only attempt.""" self.session = download_to_file(url, target_path, session=self.session, close_session=False, cache_manager_name=self._cache_manager.name) @staticmethod def _extract_zip(zip_file: Path, extract_dir: Path, keep_zip: bool = True) -> None: Loading