Loading pyproject.toml +1 −0 Original line number Diff line number Diff line Loading @@ -36,6 +36,7 @@ dependencies = [ "lxml>=6.0.2", "pool-executors", "oxyde>=0.4.0", "truststore>=0.10.4", ] [project.urls] Loading src/tdoc_crawler/clients/portal.py +17 −8 Original line number Diff line number Diff line Loading @@ -18,9 +18,9 @@ from typing import Any import requests from tdoc_crawler.constants.urls import BROWSER_HEADERS, LOGIN_URL, PORTAL_BASE_URL, TDOC_DOWNLOAD_URL, TDOC_VIEW_URL from tdoc_crawler.constants.urls import LOGIN_URL, PORTAL_BASE_URL, TDOC_DOWNLOAD_URL, TDOC_VIEW_URL from tdoc_crawler.credentials import resolve_credentials from tdoc_crawler.http_client import create_cached_session from tdoc_crawler.http_client import create_cached_session, resolve_ssl_verify from tdoc_crawler.logging import get_logger from tdoc_crawler.models.base import HttpCacheConfig, PortalCredentials from tdoc_crawler.parsers.portal import PortalParsingError, parse_tdoc_portal_page Loading Loading @@ -62,6 +62,7 @@ class PortalClient: timeout: int = 30, http_cache: HttpCacheConfig | None = None, cache_manager_name: str | None = None, verify: bool | str | None = None, ) -> None: """Initialize portal client. Loading @@ -71,14 +72,21 @@ class PortalClient: timeout: Request timeout in seconds http_cache: Optional HTTP cache configuration cache_manager_name: Optional cache manager name for resolving cache configuration verify: SSL certificate verification mode (bool or CA bundle path) """ self._credentials = credentials self._cache_manager_name = cache_manager_name self.timeout = timeout self._http_cache = http_cache self._verify = verify self._authenticated = False self._session = session @property def _verify_resolved(self) -> bool | str: """Return effective SSL verification mode.""" return resolve_ssl_verify(self._verify) @property def credentials(self) -> PortalCredentials | None: """Return the credentials used by this client.""" Loading Loading @@ -290,20 +298,21 @@ class PortalClient: Session configured with browser-like headers for portal access """ if self._session is None: self._session = create_cached_session(http_cache=self._http_cache, cache_manager_name=self._cache_manager_name) self._session = create_cached_session( http_cache=self._http_cache, cache_manager_name=self._cache_manager_name, verify=self._verify_resolved, ) return self._session @staticmethod def _get_auth_session() -> requests.Session: def _get_auth_session(self) -> requests.Session: """Get a non-cached session for authentication. Returns: Non-cached session with browser-like headers """ session = requests.Session() session.headers.update(BROWSER_HEADERS) return session return create_cached_session(http_cache_enabled=False, verify=self._verify_resolved) __all__ = [ Loading src/tdoc_crawler/http_client/__init__.py +16 −2 Original line number Diff line number Diff line Loading @@ -8,6 +8,20 @@ Re-exports from session module for backward-compatible imports: from __future__ import annotations from tdoc_crawler.http_client.session import PoolConfig, create_cached_session, download_to_file from tdoc_crawler.http_client.session import ( PoolConfig, SSLContextCacheAdapter, SSLContextHTTPAdapter, create_cached_session, download_to_file, resolve_ssl_verify, ) __all__ = ["PoolConfig", "create_cached_session", "download_to_file"] __all__ = [ "PoolConfig", "SSLContextCacheAdapter", "SSLContextHTTPAdapter", "create_cached_session", "download_to_file", "resolve_ssl_verify", ] src/tdoc_crawler/http_client/session.py +116 −8 Original line number Diff line number Diff line Loading @@ -3,6 +3,7 @@ from __future__ import annotations import os import ssl from dataclasses import dataclass from pathlib import Path from typing import cast Loading @@ -11,6 +12,7 @@ import requests from hishel import SyncBaseStorage, SyncSqliteStorage from hishel.requests import CacheAdapter from requests.adapters import HTTPAdapter from truststore import SSLContext as TruststoreSSLContext from urllib3.util.retry import Retry from tdoc_crawler.config import resolve_cache_manager Loading @@ -20,6 +22,44 @@ from tdoc_crawler.models import HttpCacheConfig logger = get_logger(__name__) SSL_DISABLED_ENV_VALUES = {"false", "0", "no", "off", "f", "n"} class SSLContextHTTPAdapter(HTTPAdapter): """HTTP adapter that can enforce a specific SSL context.""" def __init__(self, *args: object, ssl_context: ssl.SSLContext | None = None, **kwargs: object) -> None: self._ssl_context = ssl_context super().__init__(*args, **kwargs) def init_poolmanager(self, connections: int, maxsize: int, block: bool = False, **pool_kwargs: object) -> None: if self._ssl_context is not None: pool_kwargs["ssl_context"] = self._ssl_context super().init_poolmanager(connections=connections, maxsize=maxsize, block=block, **pool_kwargs) def proxy_manager_for(self, proxy: str, **proxy_kwargs: object) -> object: if self._ssl_context is not None: proxy_kwargs["ssl_context"] = self._ssl_context return super().proxy_manager_for(proxy, **proxy_kwargs) class SSLContextCacheAdapter(CacheAdapter): """Cache adapter variant that can enforce a specific SSL context.""" def __init__(self, *args: object, ssl_context: ssl.SSLContext | None = None, **kwargs: object) -> None: self._ssl_context = ssl_context super().__init__(*args, **kwargs) def init_poolmanager(self, connections: int, maxsize: int, block: bool = False, **pool_kwargs: object) -> None: if self._ssl_context is not None: pool_kwargs["ssl_context"] = self._ssl_context super().init_poolmanager(connections=connections, maxsize=maxsize, block=block, **pool_kwargs) def proxy_manager_for(self, proxy: str, **proxy_kwargs: object) -> object: if self._ssl_context is not None: proxy_kwargs["ssl_context"] = self._ssl_context return super().proxy_manager_for(proxy, **proxy_kwargs) @dataclass class PoolConfig: Loading @@ -40,6 +80,50 @@ class PoolConfig: retry_attempts: int = 3 def resolve_ssl_verify(verify: bool | str | None = None) -> bool | str: """Resolve SSL verification behavior from explicit argument or environment. Args: verify: Optional explicit SSL verification mode. Returns: Either a boolean verification flag or a certificate bundle path. """ if verify is not None: resolved = verify else: env_value = os.getenv("TDC_VERIFY_SSL") if env_value is None: resolved = True else: normalized = env_value.strip().lower() if normalized in SSL_DISABLED_ENV_VALUES: resolved = False elif normalized: resolved = env_value.strip() else: resolved = True if resolved is False: logger.warning("SSL certificate verification is disabled (TDC_VERIFY_SSL=false). This is insecure.") return resolved def _resolve_ssl_context(verify: bool | str) -> ssl.SSLContext | None: """Build SSL context for adapter-level TLS verification. Args: verify: Resolved SSL verification behavior. Returns: SSL context when adapter-level override is needed, otherwise None. """ if verify is True: return TruststoreSSLContext(ssl.PROTOCOL_TLS_CLIENT) return None def download_to_file( url: str, destination: Path, Loading @@ -49,6 +133,7 @@ def download_to_file( cache_manager_name: str | None = None, http_cache_enabled: bool | None = None, pool_config: PoolConfig | None = None, verify: bool | str | None = None, ) -> requests.Session | None: """Download a file from URL to destination path. Loading @@ -61,6 +146,7 @@ def download_to_file( cache_manager_name: Optional cache manager name to determine cache directory when creating a temporary session. http_cache_enabled: Whether to enable HTTP caching. If None, defaults to True. pool_config: Optional connection pool configuration. verify: SSL certificate verification mode. Can be bool or CA bundle path. Raises: ValueError: If URL scheme is not supported Loading @@ -80,6 +166,7 @@ def download_to_file( cache_manager_name=cache_manager_name, http_cache_enabled=http_cache_enabled, pool_config=pool_config, verify=verify, ) active_session = temp_session else: Loading Loading @@ -109,6 +196,7 @@ def create_cached_session( cache_manager_name: str | None = None, http_cache_enabled: bool | None = None, pool_config: PoolConfig | None = None, verify: bool | str | None = None, ) -> requests.Session: """Create a requests.Session with hishel caching enabled. Loading @@ -120,6 +208,7 @@ def create_cached_session( pool_config: Optional connection pool configuration. When provided, pool settings are applied to the active adapter (cache adapter when caching is enabled, HTTPAdapter otherwise). verify: SSL certificate verification mode. Can be bool or CA bundle path. Returns: Configured requests.Session with caching enabled (unless disabled) Loading @@ -128,6 +217,10 @@ def create_cached_session( session = requests.Session() session.headers.update(BROWSER_HEADERS) verify_mode = resolve_ssl_verify(verify) ssl_context = _resolve_ssl_context(verify_mode) session.verify = verify_mode # Check if caching is disabled via parameter or environment variable if http_cache_enabled is None: env_enabled = os.getenv("HTTP_CACHE_ENABLED", "").lower() Loading @@ -135,21 +228,28 @@ def create_cached_session( # If caching is disabled, optionally configure a pooled HTTP adapter and return. if not http_cache_enabled: if pool_config is not None: if pool_config is not None or ssl_context is not None: retry_attempts = pool_config.retry_attempts if pool_config and pool_config.enable_retry else 0 retry_strategy = Retry( total=pool_config.retry_attempts if pool_config.enable_retry else 0, total=retry_attempts, backoff_factor=1, status_forcelist=[429, 500, 502, 503, 504], allowed_methods=["HEAD", "GET", "OPTIONS"], ) adapter = HTTPAdapter( pool_connections=pool_config.max_connections, pool_maxsize=pool_config.max_per_host, adapter = SSLContextHTTPAdapter( pool_connections=pool_config.max_connections if pool_config is not None else 10, pool_maxsize=pool_config.max_per_host if pool_config is not None else 10, max_retries=retry_strategy, ssl_context=ssl_context, ) session.mount("http://", adapter) session.mount("https://", adapter) logger.debug(f"Configured connection pool without caching: max_connections={pool_config.max_connections}, max_per_host={pool_config.max_per_host}") if pool_config is not None: logger.debug( "Configured connection pool without caching: " f"max_connections={pool_config.max_connections}, " f"max_per_host={pool_config.max_per_host}" ) logger.debug("Creating plain HTTP session (caching disabled)") return session Loading Loading @@ -182,11 +282,12 @@ def create_cached_session( ) # Create a single cache adapter with pool settings so both behaviors are active. cache_adapter = CacheAdapter( cache_adapter = SSLContextCacheAdapter( pool_connections=pool_config.max_connections if pool_config is not None else 10, pool_maxsize=pool_config.max_per_host if pool_config is not None else 10, max_retries=max_retries, storage=storage, ssl_context=ssl_context, ) # Mount the cache adapter Loading @@ -199,4 +300,11 @@ def create_cached_session( return session __all__ = ["PoolConfig", "create_cached_session", "download_to_file"] __all__ = [ "PoolConfig", "SSLContextCacheAdapter", "SSLContextHTTPAdapter", "create_cached_session", "download_to_file", "resolve_ssl_verify", ] tests/test_http_client.py +61 −0 Original line number Diff line number Diff line Loading @@ -14,6 +14,7 @@ from requests.adapters import HTTPAdapter from tdoc_crawler.config import DEFAULT_HTTP_CACHE_FILENAME, CacheManager, reset_cache_managers from tdoc_crawler.http_client import PoolConfig, create_cached_session, download_to_file from tdoc_crawler.http_client.session import SSLContextCacheAdapter, SSLContextHTTPAdapter, resolve_ssl_verify from tdoc_crawler.models.base import HttpCacheConfig Loading Loading @@ -385,5 +386,65 @@ class TestResolveHttpCacheConfig: assert hasattr(config, "refresh_ttl_on_access") class TestResolveSslVerify: """Tests for SSL verify resolution helper.""" def test_explicit_value_wins(self, monkeypatch: pytest.MonkeyPatch) -> None: """Explicit function argument should override environment variables.""" monkeypatch.setenv("TDC_VERIFY_SSL", "false") assert resolve_ssl_verify(True) is True def test_env_false(self, monkeypatch: pytest.MonkeyPatch) -> None: """Known false env values should disable certificate validation.""" monkeypatch.setenv("TDC_VERIFY_SSL", "no") assert resolve_ssl_verify() is False def test_env_custom_ca_bundle(self, monkeypatch: pytest.MonkeyPatch) -> None: """Non-boolean env value should be treated as CA bundle path.""" monkeypatch.setenv("TDC_VERIFY_SSL", "C:/certs/corp.pem") assert resolve_ssl_verify() == "C:/certs/corp.pem" def test_default_true(self, monkeypatch: pytest.MonkeyPatch) -> None: """SSL verification should be enabled by default.""" monkeypatch.delenv("TDC_VERIFY_SSL", raising=False) assert resolve_ssl_verify() is True class TestSslContextAdapters: """Tests for SSL-context aware adapter behavior.""" def test_non_cached_verify_true_uses_ssl_context_adapter(self) -> None: """Non-cached sessions should mount SSL-aware adapters when verify=True.""" session = create_cached_session(http_cache_enabled=False, verify=True) try: assert isinstance(session.adapters["http://"], SSLContextHTTPAdapter) assert isinstance(session.adapters["https://"], SSLContextHTTPAdapter) assert session.verify is True finally: session.close() def test_non_cached_verify_false_uses_default_adapter(self) -> None: """Non-cached verify=False should use requests defaults when no pool override is set.""" session = create_cached_session(http_cache_enabled=False, verify=False) try: assert isinstance(session.adapters["http://"], HTTPAdapter) assert isinstance(session.adapters["https://"], HTTPAdapter) assert not isinstance(session.adapters["https://"], SSLContextHTTPAdapter) assert session.verify is False finally: session.close() def test_cached_verify_true_uses_ssl_cache_adapter(self, test_cache_dir: Path) -> None: """Cached sessions should mount SSL-aware cache adapters when verify=True.""" CacheManager(root_path=test_cache_dir, name="ssl_cache_adapter").register() session = create_cached_session(cache_manager_name="ssl_cache_adapter", verify=True) try: assert isinstance(session.adapters["http://"], SSLContextCacheAdapter) assert isinstance(session.adapters["https://"], SSLContextCacheAdapter) assert session.verify is True finally: session.close() if __name__ == "__main__": pytest.main([__file__, "-v"]) Loading
pyproject.toml +1 −0 Original line number Diff line number Diff line Loading @@ -36,6 +36,7 @@ dependencies = [ "lxml>=6.0.2", "pool-executors", "oxyde>=0.4.0", "truststore>=0.10.4", ] [project.urls] Loading
src/tdoc_crawler/clients/portal.py +17 −8 Original line number Diff line number Diff line Loading @@ -18,9 +18,9 @@ from typing import Any import requests from tdoc_crawler.constants.urls import BROWSER_HEADERS, LOGIN_URL, PORTAL_BASE_URL, TDOC_DOWNLOAD_URL, TDOC_VIEW_URL from tdoc_crawler.constants.urls import LOGIN_URL, PORTAL_BASE_URL, TDOC_DOWNLOAD_URL, TDOC_VIEW_URL from tdoc_crawler.credentials import resolve_credentials from tdoc_crawler.http_client import create_cached_session from tdoc_crawler.http_client import create_cached_session, resolve_ssl_verify from tdoc_crawler.logging import get_logger from tdoc_crawler.models.base import HttpCacheConfig, PortalCredentials from tdoc_crawler.parsers.portal import PortalParsingError, parse_tdoc_portal_page Loading Loading @@ -62,6 +62,7 @@ class PortalClient: timeout: int = 30, http_cache: HttpCacheConfig | None = None, cache_manager_name: str | None = None, verify: bool | str | None = None, ) -> None: """Initialize portal client. Loading @@ -71,14 +72,21 @@ class PortalClient: timeout: Request timeout in seconds http_cache: Optional HTTP cache configuration cache_manager_name: Optional cache manager name for resolving cache configuration verify: SSL certificate verification mode (bool or CA bundle path) """ self._credentials = credentials self._cache_manager_name = cache_manager_name self.timeout = timeout self._http_cache = http_cache self._verify = verify self._authenticated = False self._session = session @property def _verify_resolved(self) -> bool | str: """Return effective SSL verification mode.""" return resolve_ssl_verify(self._verify) @property def credentials(self) -> PortalCredentials | None: """Return the credentials used by this client.""" Loading Loading @@ -290,20 +298,21 @@ class PortalClient: Session configured with browser-like headers for portal access """ if self._session is None: self._session = create_cached_session(http_cache=self._http_cache, cache_manager_name=self._cache_manager_name) self._session = create_cached_session( http_cache=self._http_cache, cache_manager_name=self._cache_manager_name, verify=self._verify_resolved, ) return self._session @staticmethod def _get_auth_session() -> requests.Session: def _get_auth_session(self) -> requests.Session: """Get a non-cached session for authentication. Returns: Non-cached session with browser-like headers """ session = requests.Session() session.headers.update(BROWSER_HEADERS) return session return create_cached_session(http_cache_enabled=False, verify=self._verify_resolved) __all__ = [ Loading
src/tdoc_crawler/http_client/__init__.py +16 −2 Original line number Diff line number Diff line Loading @@ -8,6 +8,20 @@ Re-exports from session module for backward-compatible imports: from __future__ import annotations from tdoc_crawler.http_client.session import PoolConfig, create_cached_session, download_to_file from tdoc_crawler.http_client.session import ( PoolConfig, SSLContextCacheAdapter, SSLContextHTTPAdapter, create_cached_session, download_to_file, resolve_ssl_verify, ) __all__ = ["PoolConfig", "create_cached_session", "download_to_file"] __all__ = [ "PoolConfig", "SSLContextCacheAdapter", "SSLContextHTTPAdapter", "create_cached_session", "download_to_file", "resolve_ssl_verify", ]
src/tdoc_crawler/http_client/session.py +116 −8 Original line number Diff line number Diff line Loading @@ -3,6 +3,7 @@ from __future__ import annotations import os import ssl from dataclasses import dataclass from pathlib import Path from typing import cast Loading @@ -11,6 +12,7 @@ import requests from hishel import SyncBaseStorage, SyncSqliteStorage from hishel.requests import CacheAdapter from requests.adapters import HTTPAdapter from truststore import SSLContext as TruststoreSSLContext from urllib3.util.retry import Retry from tdoc_crawler.config import resolve_cache_manager Loading @@ -20,6 +22,44 @@ from tdoc_crawler.models import HttpCacheConfig logger = get_logger(__name__) SSL_DISABLED_ENV_VALUES = {"false", "0", "no", "off", "f", "n"} class SSLContextHTTPAdapter(HTTPAdapter): """HTTP adapter that can enforce a specific SSL context.""" def __init__(self, *args: object, ssl_context: ssl.SSLContext | None = None, **kwargs: object) -> None: self._ssl_context = ssl_context super().__init__(*args, **kwargs) def init_poolmanager(self, connections: int, maxsize: int, block: bool = False, **pool_kwargs: object) -> None: if self._ssl_context is not None: pool_kwargs["ssl_context"] = self._ssl_context super().init_poolmanager(connections=connections, maxsize=maxsize, block=block, **pool_kwargs) def proxy_manager_for(self, proxy: str, **proxy_kwargs: object) -> object: if self._ssl_context is not None: proxy_kwargs["ssl_context"] = self._ssl_context return super().proxy_manager_for(proxy, **proxy_kwargs) class SSLContextCacheAdapter(CacheAdapter): """Cache adapter variant that can enforce a specific SSL context.""" def __init__(self, *args: object, ssl_context: ssl.SSLContext | None = None, **kwargs: object) -> None: self._ssl_context = ssl_context super().__init__(*args, **kwargs) def init_poolmanager(self, connections: int, maxsize: int, block: bool = False, **pool_kwargs: object) -> None: if self._ssl_context is not None: pool_kwargs["ssl_context"] = self._ssl_context super().init_poolmanager(connections=connections, maxsize=maxsize, block=block, **pool_kwargs) def proxy_manager_for(self, proxy: str, **proxy_kwargs: object) -> object: if self._ssl_context is not None: proxy_kwargs["ssl_context"] = self._ssl_context return super().proxy_manager_for(proxy, **proxy_kwargs) @dataclass class PoolConfig: Loading @@ -40,6 +80,50 @@ class PoolConfig: retry_attempts: int = 3 def resolve_ssl_verify(verify: bool | str | None = None) -> bool | str: """Resolve SSL verification behavior from explicit argument or environment. Args: verify: Optional explicit SSL verification mode. Returns: Either a boolean verification flag or a certificate bundle path. """ if verify is not None: resolved = verify else: env_value = os.getenv("TDC_VERIFY_SSL") if env_value is None: resolved = True else: normalized = env_value.strip().lower() if normalized in SSL_DISABLED_ENV_VALUES: resolved = False elif normalized: resolved = env_value.strip() else: resolved = True if resolved is False: logger.warning("SSL certificate verification is disabled (TDC_VERIFY_SSL=false). This is insecure.") return resolved def _resolve_ssl_context(verify: bool | str) -> ssl.SSLContext | None: """Build SSL context for adapter-level TLS verification. Args: verify: Resolved SSL verification behavior. Returns: SSL context when adapter-level override is needed, otherwise None. """ if verify is True: return TruststoreSSLContext(ssl.PROTOCOL_TLS_CLIENT) return None def download_to_file( url: str, destination: Path, Loading @@ -49,6 +133,7 @@ def download_to_file( cache_manager_name: str | None = None, http_cache_enabled: bool | None = None, pool_config: PoolConfig | None = None, verify: bool | str | None = None, ) -> requests.Session | None: """Download a file from URL to destination path. Loading @@ -61,6 +146,7 @@ def download_to_file( cache_manager_name: Optional cache manager name to determine cache directory when creating a temporary session. http_cache_enabled: Whether to enable HTTP caching. If None, defaults to True. pool_config: Optional connection pool configuration. verify: SSL certificate verification mode. Can be bool or CA bundle path. Raises: ValueError: If URL scheme is not supported Loading @@ -80,6 +166,7 @@ def download_to_file( cache_manager_name=cache_manager_name, http_cache_enabled=http_cache_enabled, pool_config=pool_config, verify=verify, ) active_session = temp_session else: Loading Loading @@ -109,6 +196,7 @@ def create_cached_session( cache_manager_name: str | None = None, http_cache_enabled: bool | None = None, pool_config: PoolConfig | None = None, verify: bool | str | None = None, ) -> requests.Session: """Create a requests.Session with hishel caching enabled. Loading @@ -120,6 +208,7 @@ def create_cached_session( pool_config: Optional connection pool configuration. When provided, pool settings are applied to the active adapter (cache adapter when caching is enabled, HTTPAdapter otherwise). verify: SSL certificate verification mode. Can be bool or CA bundle path. Returns: Configured requests.Session with caching enabled (unless disabled) Loading @@ -128,6 +217,10 @@ def create_cached_session( session = requests.Session() session.headers.update(BROWSER_HEADERS) verify_mode = resolve_ssl_verify(verify) ssl_context = _resolve_ssl_context(verify_mode) session.verify = verify_mode # Check if caching is disabled via parameter or environment variable if http_cache_enabled is None: env_enabled = os.getenv("HTTP_CACHE_ENABLED", "").lower() Loading @@ -135,21 +228,28 @@ def create_cached_session( # If caching is disabled, optionally configure a pooled HTTP adapter and return. if not http_cache_enabled: if pool_config is not None: if pool_config is not None or ssl_context is not None: retry_attempts = pool_config.retry_attempts if pool_config and pool_config.enable_retry else 0 retry_strategy = Retry( total=pool_config.retry_attempts if pool_config.enable_retry else 0, total=retry_attempts, backoff_factor=1, status_forcelist=[429, 500, 502, 503, 504], allowed_methods=["HEAD", "GET", "OPTIONS"], ) adapter = HTTPAdapter( pool_connections=pool_config.max_connections, pool_maxsize=pool_config.max_per_host, adapter = SSLContextHTTPAdapter( pool_connections=pool_config.max_connections if pool_config is not None else 10, pool_maxsize=pool_config.max_per_host if pool_config is not None else 10, max_retries=retry_strategy, ssl_context=ssl_context, ) session.mount("http://", adapter) session.mount("https://", adapter) logger.debug(f"Configured connection pool without caching: max_connections={pool_config.max_connections}, max_per_host={pool_config.max_per_host}") if pool_config is not None: logger.debug( "Configured connection pool without caching: " f"max_connections={pool_config.max_connections}, " f"max_per_host={pool_config.max_per_host}" ) logger.debug("Creating plain HTTP session (caching disabled)") return session Loading Loading @@ -182,11 +282,12 @@ def create_cached_session( ) # Create a single cache adapter with pool settings so both behaviors are active. cache_adapter = CacheAdapter( cache_adapter = SSLContextCacheAdapter( pool_connections=pool_config.max_connections if pool_config is not None else 10, pool_maxsize=pool_config.max_per_host if pool_config is not None else 10, max_retries=max_retries, storage=storage, ssl_context=ssl_context, ) # Mount the cache adapter Loading @@ -199,4 +300,11 @@ def create_cached_session( return session __all__ = ["PoolConfig", "create_cached_session", "download_to_file"] __all__ = [ "PoolConfig", "SSLContextCacheAdapter", "SSLContextHTTPAdapter", "create_cached_session", "download_to_file", "resolve_ssl_verify", ]
tests/test_http_client.py +61 −0 Original line number Diff line number Diff line Loading @@ -14,6 +14,7 @@ from requests.adapters import HTTPAdapter from tdoc_crawler.config import DEFAULT_HTTP_CACHE_FILENAME, CacheManager, reset_cache_managers from tdoc_crawler.http_client import PoolConfig, create_cached_session, download_to_file from tdoc_crawler.http_client.session import SSLContextCacheAdapter, SSLContextHTTPAdapter, resolve_ssl_verify from tdoc_crawler.models.base import HttpCacheConfig Loading Loading @@ -385,5 +386,65 @@ class TestResolveHttpCacheConfig: assert hasattr(config, "refresh_ttl_on_access") class TestResolveSslVerify: """Tests for SSL verify resolution helper.""" def test_explicit_value_wins(self, monkeypatch: pytest.MonkeyPatch) -> None: """Explicit function argument should override environment variables.""" monkeypatch.setenv("TDC_VERIFY_SSL", "false") assert resolve_ssl_verify(True) is True def test_env_false(self, monkeypatch: pytest.MonkeyPatch) -> None: """Known false env values should disable certificate validation.""" monkeypatch.setenv("TDC_VERIFY_SSL", "no") assert resolve_ssl_verify() is False def test_env_custom_ca_bundle(self, monkeypatch: pytest.MonkeyPatch) -> None: """Non-boolean env value should be treated as CA bundle path.""" monkeypatch.setenv("TDC_VERIFY_SSL", "C:/certs/corp.pem") assert resolve_ssl_verify() == "C:/certs/corp.pem" def test_default_true(self, monkeypatch: pytest.MonkeyPatch) -> None: """SSL verification should be enabled by default.""" monkeypatch.delenv("TDC_VERIFY_SSL", raising=False) assert resolve_ssl_verify() is True class TestSslContextAdapters: """Tests for SSL-context aware adapter behavior.""" def test_non_cached_verify_true_uses_ssl_context_adapter(self) -> None: """Non-cached sessions should mount SSL-aware adapters when verify=True.""" session = create_cached_session(http_cache_enabled=False, verify=True) try: assert isinstance(session.adapters["http://"], SSLContextHTTPAdapter) assert isinstance(session.adapters["https://"], SSLContextHTTPAdapter) assert session.verify is True finally: session.close() def test_non_cached_verify_false_uses_default_adapter(self) -> None: """Non-cached verify=False should use requests defaults when no pool override is set.""" session = create_cached_session(http_cache_enabled=False, verify=False) try: assert isinstance(session.adapters["http://"], HTTPAdapter) assert isinstance(session.adapters["https://"], HTTPAdapter) assert not isinstance(session.adapters["https://"], SSLContextHTTPAdapter) assert session.verify is False finally: session.close() def test_cached_verify_true_uses_ssl_cache_adapter(self, test_cache_dir: Path) -> None: """Cached sessions should mount SSL-aware cache adapters when verify=True.""" CacheManager(root_path=test_cache_dir, name="ssl_cache_adapter").register() session = create_cached_session(cache_manager_name="ssl_cache_adapter", verify=True) try: assert isinstance(session.adapters["http://"], SSLContextCacheAdapter) assert isinstance(session.adapters["https://"], SSLContextCacheAdapter) assert session.verify is True finally: session.close() if __name__ == "__main__": pytest.main([__file__, "-v"])