Commit 25ce4f5a authored by Jan Reimes
Browse files

feat(http): use truststore SSLContext for certificate verification

parent f0ba47fe
Loading
Loading
Loading
Loading
+1 −0
Original line number Diff line number Diff line
@@ -36,6 +36,7 @@ dependencies = [
    "lxml>=6.0.2",
    "pool-executors",
    "oxyde>=0.4.0",
    "truststore>=0.10.4",
]

[project.urls]
+17 −8
Original line number Diff line number Diff line
@@ -18,9 +18,9 @@ from typing import Any

import requests

from tdoc_crawler.constants.urls import BROWSER_HEADERS, LOGIN_URL, PORTAL_BASE_URL, TDOC_DOWNLOAD_URL, TDOC_VIEW_URL
from tdoc_crawler.constants.urls import LOGIN_URL, PORTAL_BASE_URL, TDOC_DOWNLOAD_URL, TDOC_VIEW_URL
from tdoc_crawler.credentials import resolve_credentials
from tdoc_crawler.http_client import create_cached_session
from tdoc_crawler.http_client import create_cached_session, resolve_ssl_verify
from tdoc_crawler.logging import get_logger
from tdoc_crawler.models.base import HttpCacheConfig, PortalCredentials
from tdoc_crawler.parsers.portal import PortalParsingError, parse_tdoc_portal_page
@@ -62,6 +62,7 @@ class PortalClient:
        timeout: int = 30,
        http_cache: HttpCacheConfig | None = None,
        cache_manager_name: str | None = None,
        verify: bool | str | None = None,
    ) -> None:
        """Initialize portal client.

@@ -71,14 +72,21 @@ class PortalClient:
            timeout: Request timeout in seconds
            http_cache: Optional HTTP cache configuration
            cache_manager_name: Optional cache manager name for resolving cache configuration
            verify: SSL certificate verification mode (bool or CA bundle path)
        """
        self._credentials = credentials
        self._cache_manager_name = cache_manager_name
        self.timeout = timeout
        self._http_cache = http_cache
        self._verify = verify
        self._authenticated = False
        self._session = session

    @property
    def _verify_resolved(self) -> bool | str:
        """Effective SSL verification mode for sessions created by this client.

        The value stored at construction time is passed through
        ``resolve_ssl_verify`` on every access, so the result is computed
        lazily rather than cached.

        Returns:
            Whatever ``resolve_ssl_verify`` yields for the stored setting
            (annotated as ``bool | str``).
        """
        configured = self._verify
        return resolve_ssl_verify(configured)

    @property
    def credentials(self) -> PortalCredentials | None:
        """Return the credentials used by this client."""
@@ -290,20 +298,21 @@ class PortalClient:
            Session configured with browser-like headers for portal access
        """
        if self._session is None:
            self._session = create_cached_session(http_cache=self._http_cache, cache_manager_name=self._cache_manager_name)
            self._session = create_cached_session(
                http_cache=self._http_cache,
                cache_manager_name=self._cache_manager_name,
                verify=self._verify_resolved,
            )

        return self._session

    @staticmethod
    def _get_auth_session() -> requests.Session:
    def _get_auth_session(self) -> requests.Session:
        """Get a non-cached session for authentication.

        Returns:
            Non-cached session with browser-like headers
        """
        session = requests.Session()
        session.headers.update(BROWSER_HEADERS)
        return session
        return create_cached_session(http_cache_enabled=False, verify=self._verify_resolved)


__all__ = [
+16 −2
Original line number Diff line number Diff line
@@ -8,6 +8,20 @@ Re-exports from session module for backward-compatible imports:

from __future__ import annotations

from tdoc_crawler.http_client.session import PoolConfig, create_cached_session, download_to_file
from tdoc_crawler.http_client.session import (
    PoolConfig,
    SSLContextCacheAdapter,
    SSLContextHTTPAdapter,
    create_cached_session,
    download_to_file,
    resolve_ssl_verify,
)

__all__ = ["PoolConfig", "create_cached_session", "download_to_file"]
__all__ = [
    "PoolConfig",
    "SSLContextCacheAdapter",
    "SSLContextHTTPAdapter",
    "create_cached_session",
    "download_to_file",
    "resolve_ssl_verify",
]
+116 −8
Original line number Diff line number Diff line
@@ -3,6 +3,7 @@
from __future__ import annotations

import os
import ssl
from dataclasses import dataclass
from pathlib import Path
from typing import cast
@@ -11,6 +12,7 @@ import requests
from hishel import SyncBaseStorage, SyncSqliteStorage
from hishel.requests import CacheAdapter
from requests.adapters import HTTPAdapter
from truststore import SSLContext as TruststoreSSLContext
from urllib3.util.retry import Retry

from tdoc_crawler.config import resolve_cache_manager
@@ -20,6 +22,44 @@ from tdoc_crawler.models import HttpCacheConfig

logger = get_logger(__name__)

SSL_DISABLED_ENV_VALUES = {"false", "0", "no", "off", "f", "n"}


class SSLContextHTTPAdapter(HTTPAdapter):
    """``HTTPAdapter`` subclass that injects a fixed SSL context into its pools.

    When constructed with ``ssl_context=None`` every pool/proxy keyword
    argument is handed to the base class untouched.
    """

    def __init__(self, *args: object, ssl_context: ssl.SSLContext | None = None, **kwargs: object) -> None:
        """Remember the SSL context, then run the regular adapter setup.

        Args:
            *args: Positional arguments forwarded to ``HTTPAdapter``.
            ssl_context: Context to enforce on connections, or None to keep the defaults.
            **kwargs: Keyword arguments forwarded to ``HTTPAdapter``.
        """
        # Assigned before delegating so the attribute already exists when the
        # base initialiser builds its pool manager via init_poolmanager().
        self._ssl_context = ssl_context
        super().__init__(*args, **kwargs)

    def init_poolmanager(self, connections: int, maxsize: int, block: bool = False, **pool_kwargs: object) -> None:
        """Create the pool manager, adding the stored SSL context when one is set."""
        overrides = {} if self._ssl_context is None else {"ssl_context": self._ssl_context}
        merged_kwargs = {**pool_kwargs, **overrides}
        super().init_poolmanager(connections=connections, maxsize=maxsize, block=block, **merged_kwargs)

    def proxy_manager_for(self, proxy: str, **proxy_kwargs: object) -> object:
        """Return the proxy manager for ``proxy``, adding the stored SSL context when one is set."""
        overrides = {} if self._ssl_context is None else {"ssl_context": self._ssl_context}
        merged_kwargs = {**proxy_kwargs, **overrides}
        return super().proxy_manager_for(proxy, **merged_kwargs)


class SSLContextCacheAdapter(CacheAdapter):
    """``CacheAdapter`` subclass that injects a fixed SSL context into its pools.

    When constructed with ``ssl_context=None`` every pool/proxy keyword
    argument is handed to the base class untouched.
    """

    def __init__(self, *args: object, ssl_context: ssl.SSLContext | None = None, **kwargs: object) -> None:
        """Remember the SSL context, then run the regular cache-adapter setup.

        Args:
            *args: Positional arguments forwarded to ``CacheAdapter``.
            ssl_context: Context to enforce on connections, or None to keep the defaults.
            **kwargs: Keyword arguments forwarded to ``CacheAdapter``.
        """
        # Stored ahead of the base initialiser, mirroring SSLContextHTTPAdapter,
        # so the pool hooks below can read it as soon as they are invoked.
        self._ssl_context = ssl_context
        super().__init__(*args, **kwargs)

    def init_poolmanager(self, connections: int, maxsize: int, block: bool = False, **pool_kwargs: object) -> None:
        """Create the pool manager, adding the stored SSL context when one is set."""
        context = self._ssl_context
        if context is not None:
            pool_kwargs.update(ssl_context=context)
        super().init_poolmanager(connections=connections, maxsize=maxsize, block=block, **pool_kwargs)

    def proxy_manager_for(self, proxy: str, **proxy_kwargs: object) -> object:
        """Return the proxy manager for ``proxy``, adding the stored SSL context when one is set."""
        context = self._ssl_context
        if context is not None:
            proxy_kwargs.update(ssl_context=context)
        return super().proxy_manager_for(proxy, **proxy_kwargs)


@dataclass
class PoolConfig:
@@ -40,6 +80,50 @@ class PoolConfig:
    retry_attempts: int = 3


def resolve_ssl_verify(verify: bool | str | None = None) -> bool | str:
    """Resolve SSL verification behavior from explicit argument or environment.

    Args:
        verify: Optional explicit SSL verification mode.

    Returns:
        Either a boolean verification flag or a certificate bundle path.
    """
    if verify is not None:
        resolved = verify
    else:
        env_value = os.getenv("TDC_VERIFY_SSL")
        if env_value is None:
            resolved = True
        else:
            normalized = env_value.strip().lower()
            if normalized in SSL_DISABLED_ENV_VALUES:
                resolved = False
            elif normalized:
                resolved = env_value.strip()
            else:
                resolved = True

    if resolved is False:
        logger.warning("SSL certificate verification is disabled (TDC_VERIFY_SSL=false). This is insecure.")

    return resolved


def _resolve_ssl_context(verify: bool | str) -> ssl.SSLContext | None:
    """Build SSL context for adapter-level TLS verification.

    Args:
        verify: Resolved SSL verification behavior.

    Returns:
        SSL context when adapter-level override is needed, otherwise None.
    """
    if verify is True:
        return TruststoreSSLContext(ssl.PROTOCOL_TLS_CLIENT)
    return None


def download_to_file(
    url: str,
    destination: Path,
@@ -49,6 +133,7 @@ def download_to_file(
    cache_manager_name: str | None = None,
    http_cache_enabled: bool | None = None,
    pool_config: PoolConfig | None = None,
    verify: bool | str | None = None,
) -> requests.Session | None:
    """Download a file from URL to destination path.

@@ -61,6 +146,7 @@ def download_to_file(
        cache_manager_name: Optional cache manager name to determine cache directory when creating a temporary session.
        http_cache_enabled: Whether to enable HTTP caching. If None, defaults to True.
        pool_config: Optional connection pool configuration.
        verify: SSL certificate verification mode. Can be bool or CA bundle path.

    Raises:
        ValueError: If URL scheme is not supported
@@ -80,6 +166,7 @@ def download_to_file(
            cache_manager_name=cache_manager_name,
            http_cache_enabled=http_cache_enabled,
            pool_config=pool_config,
            verify=verify,
        )
        active_session = temp_session
    else:
@@ -109,6 +196,7 @@ def create_cached_session(
    cache_manager_name: str | None = None,
    http_cache_enabled: bool | None = None,
    pool_config: PoolConfig | None = None,
    verify: bool | str | None = None,
) -> requests.Session:
    """Create a requests.Session with hishel caching enabled.

@@ -120,6 +208,7 @@ def create_cached_session(
        pool_config: Optional connection pool configuration.
                When provided, pool settings are applied to the active adapter
                (cache adapter when caching is enabled, HTTPAdapter otherwise).
        verify: SSL certificate verification mode. Can be bool or CA bundle path.

    Returns:
        Configured requests.Session with caching enabled (unless disabled)
@@ -128,6 +217,10 @@ def create_cached_session(
    session = requests.Session()
    session.headers.update(BROWSER_HEADERS)

    verify_mode = resolve_ssl_verify(verify)
    ssl_context = _resolve_ssl_context(verify_mode)
    session.verify = verify_mode

    # Check if caching is disabled via parameter or environment variable
    if http_cache_enabled is None:
        env_enabled = os.getenv("HTTP_CACHE_ENABLED", "").lower()
@@ -135,21 +228,28 @@ def create_cached_session(

    # If caching is disabled, optionally configure a pooled HTTP adapter and return.
    if not http_cache_enabled:
        if pool_config is not None:
        if pool_config is not None or ssl_context is not None:
            retry_attempts = pool_config.retry_attempts if pool_config and pool_config.enable_retry else 0
            retry_strategy = Retry(
                total=pool_config.retry_attempts if pool_config.enable_retry else 0,
                total=retry_attempts,
                backoff_factor=1,
                status_forcelist=[429, 500, 502, 503, 504],
                allowed_methods=["HEAD", "GET", "OPTIONS"],
            )
            adapter = HTTPAdapter(
                pool_connections=pool_config.max_connections,
                pool_maxsize=pool_config.max_per_host,
            adapter = SSLContextHTTPAdapter(
                pool_connections=pool_config.max_connections if pool_config is not None else 10,
                pool_maxsize=pool_config.max_per_host if pool_config is not None else 10,
                max_retries=retry_strategy,
                ssl_context=ssl_context,
            )
            session.mount("http://", adapter)
            session.mount("https://", adapter)
            logger.debug(f"Configured connection pool without caching: max_connections={pool_config.max_connections}, max_per_host={pool_config.max_per_host}")
            if pool_config is not None:
                logger.debug(
                    "Configured connection pool without caching: "
                    f"max_connections={pool_config.max_connections}, "
                    f"max_per_host={pool_config.max_per_host}"
                )

        logger.debug("Creating plain HTTP session (caching disabled)")
        return session
@@ -182,11 +282,12 @@ def create_cached_session(
    )

    # Create a single cache adapter with pool settings so both behaviors are active.
    cache_adapter = CacheAdapter(
    cache_adapter = SSLContextCacheAdapter(
        pool_connections=pool_config.max_connections if pool_config is not None else 10,
        pool_maxsize=pool_config.max_per_host if pool_config is not None else 10,
        max_retries=max_retries,
        storage=storage,
        ssl_context=ssl_context,
    )

    # Mount the cache adapter
@@ -199,4 +300,11 @@ def create_cached_session(
    return session


__all__ = ["PoolConfig", "create_cached_session", "download_to_file"]
__all__ = [
    "PoolConfig",
    "SSLContextCacheAdapter",
    "SSLContextHTTPAdapter",
    "create_cached_session",
    "download_to_file",
    "resolve_ssl_verify",
]
+61 −0
Original line number Diff line number Diff line
@@ -14,6 +14,7 @@ from requests.adapters import HTTPAdapter

from tdoc_crawler.config import DEFAULT_HTTP_CACHE_FILENAME, CacheManager, reset_cache_managers
from tdoc_crawler.http_client import PoolConfig, create_cached_session, download_to_file
from tdoc_crawler.http_client.session import SSLContextCacheAdapter, SSLContextHTTPAdapter, resolve_ssl_verify
from tdoc_crawler.models.base import HttpCacheConfig


@@ -385,5 +386,65 @@ class TestResolveHttpCacheConfig:
        assert hasattr(config, "refresh_ttl_on_access")


class TestResolveSslVerify:
    """Checks for ``resolve_ssl_verify`` with explicit arguments and ``TDC_VERIFY_SSL`` values."""

    def test_explicit_value_wins(self, monkeypatch: pytest.MonkeyPatch) -> None:
        """An explicitly passed value takes precedence over the environment."""
        monkeypatch.setenv("TDC_VERIFY_SSL", "false")
        outcome = resolve_ssl_verify(True)
        assert outcome is True

    def test_env_false(self, monkeypatch: pytest.MonkeyPatch) -> None:
        """A recognised negative env value turns certificate validation off."""
        monkeypatch.setenv("TDC_VERIFY_SSL", "no")
        outcome = resolve_ssl_verify()
        assert outcome is False

    def test_env_custom_ca_bundle(self, monkeypatch: pytest.MonkeyPatch) -> None:
        """An env value that is not a boolean flag is returned as a CA bundle path."""
        monkeypatch.setenv("TDC_VERIFY_SSL", "C:/certs/corp.pem")
        outcome = resolve_ssl_verify()
        assert outcome == "C:/certs/corp.pem"

    def test_default_true(self, monkeypatch: pytest.MonkeyPatch) -> None:
        """With the env variable absent, verification is enabled."""
        monkeypatch.delenv("TDC_VERIFY_SSL", raising=False)
        outcome = resolve_ssl_verify()
        assert outcome is True


class TestSslContextAdapters:
    """Checks which adapter classes ``create_cached_session`` mounts per verify mode."""

    def test_non_cached_verify_true_uses_ssl_context_adapter(self) -> None:
        """verify=True without caching mounts the SSL-aware HTTP adapter on both schemes."""
        # The session's context manager closes it on exit, even if an assert fails.
        with create_cached_session(http_cache_enabled=False, verify=True) as session:
            for scheme in ("http://", "https://"):
                assert isinstance(session.adapters[scheme], SSLContextHTTPAdapter)
            assert session.verify is True

    def test_non_cached_verify_false_uses_default_adapter(self) -> None:
        """verify=False without caching or pool override keeps the stock requests adapters."""
        with create_cached_session(http_cache_enabled=False, verify=False) as session:
            for scheme in ("http://", "https://"):
                assert isinstance(session.adapters[scheme], HTTPAdapter)
            assert not isinstance(session.adapters["https://"], SSLContextHTTPAdapter)
            assert session.verify is False

    def test_cached_verify_true_uses_ssl_cache_adapter(self, test_cache_dir: Path) -> None:
        """verify=True with caching mounts the SSL-aware cache adapter on both schemes."""
        manager = CacheManager(root_path=test_cache_dir, name="ssl_cache_adapter")
        manager.register()
        with create_cached_session(cache_manager_name="ssl_cache_adapter", verify=True) as session:
            for scheme in ("http://", "https://"):
                assert isinstance(session.adapters[scheme], SSLContextCacheAdapter)
            assert session.verify is True


if __name__ == "__main__":
    pytest.main([__file__, "-v"])
Loading