♻️ refactor(http_client): improve download_to_file function signature and session handling (f1c3ba55) · Commits · Jan Reimes / 3gpp-crawler

src/tdoc_crawler/http_client.py

+20 −7

Original line number	Diff line number	Diff line
		"""HTTP client factory with hishel caching support."""

		from __future__ import annotations
		from babel.messages.frontend import log

		import logging
		from pathlib import Path
		@@ -18,13 +17,22 @@ from tdoc_crawler.models import HttpCacheConfig
		logger = logging.getLogger(__name__)


		def download_to_path(url: str, destination: Path, session: requests.Session \| None = None, cache_manager_name: str \| None = None) -> None:
		def download_to_file(
		url: str,
		destination: Path,
		session: requests.Session \| None = None,
		close_session: bool = True,
		http_cache: HttpCacheConfig \| None = None,
		cache_manager_name: str \| None = None,
		) -> requests.Session \| None:
		"""Download a file from URL to destination path.

		Args:
		url: Source URL
		destination: Destination path
		session: Optional requests.Session to reuse. If None, a temporary cached session is created.
		close_session: Whether to close a temporary session (one created by this function) after the download. A caller-provided session is never closed here. When True, None is returned instead of the session.
		http_cache: Optional HTTP cache configuration
		cache_manager_name: Optional cache manager name to determine cache directory when creating a temporary session.

		Raises:
		@@ -37,11 +45,10 @@ def download_to_path(url: str, destination: Path, session: requests.Session \| No
		if not lowered.startswith(allowed_schemes):
		raise ValueError("unsupported-url-scheme")

		# Use provided session or create a temporary one
		# Use the provided session, or create a new one that can be returned and reused for later downloads instead of creating a session per download
		temp_session: requests.Session \| None = None
		if session is None:
		manager = resolve_cache_manager(cache_manager_name)
		temp_session = create_cached_session(manager.http_cache_file)
		temp_session = create_cached_session(http_cache=http_cache, cache_manager_name=cache_manager_name)
		active_session = temp_session
		else:
		active_session = session
		@@ -66,13 +73,19 @@ def download_to_path(url: str, destination: Path, session: requests.Session \| No
		for chunk in response.iter_content(chunk_size=8192):
		if chunk:
		target.write(chunk)

		# Return the active session so the caller can reuse it; return None whenever close_session is set (a temporary session is then closed in the finally block)
		if close_session:
		active_session = None
		return active_session

		finally:
		if temp_session:
		if temp_session and close_session:
		temp_session.close()


		def create_cached_session(
		http_cache: HttpCacheConfig \| None,
		http_cache: HttpCacheConfig \| None = None,
		cache_manager_name: str \| None = None,
		) -> requests.Session:
		"""Create a requests.Session with hishel caching enabled.

src/tdoc_crawler/specs/downloads.py

+11 −5

Original line number	Diff line number	Diff line
		"""Spec download orchestration."""

		import asyncio
		import logging
		import zipfile
		from pathlib import Path

		import requests
		from zipinspect import HTTPZipReader

		from tdoc_crawler.config import resolve_cache_manager
		from tdoc_crawler.constants.urls import SPEC_URL_TEMPLATE
		from tdoc_crawler.http_client import download_to_path
		from tdoc_crawler.database.specs import SpecDatabase
		from tdoc_crawler.http_client import download_to_file
		from tdoc_crawler.specs.sources.base import SpecSource
		from tdoc_crawler.utils.normalization import normalize_spec_number

		@@ -19,9 +20,14 @@ _logger = logging.getLogger(__name__)
		class SpecDownloads:
		"""Download and extraction utilities for specs."""

		def __init__(self, database, cache_manager_name: str \| None = None) -> None:
		def __init__(self, database: SpecDatabase, cache_manager_name: str \| None = None) -> None:
		self._database = database
		self._cache_manager = resolve_cache_manager(cache_manager_name)
		self.session: requests.Session \| None = None

		def __del__(self) -> None:
		if self.session:
		self.session.close()

		def checkout_specs(
		self,
		@@ -184,8 +190,8 @@ class SpecDownloads:
		return False

		def _download_full_zip(self, url: str, target_path: Path) -> None:
		"""Download full zip file."""
		download_to_path(url, target_path, cache_manager_name=self._cache_manager.name)
		"""Download full zip file, re-use session if already created for doc-only attempt."""
		self.session = download_to_file(url, target_path, session=self.session, close_session=False, cache_manager_name=self._cache_manager.name)

		@staticmethod
		def _extract_zip(zip_file: Path, extract_dir: Path, keep_zip: bool = True) -> None: