Commit 7baeede5 authored by Jan Reimes's avatar Jan Reimes
Browse files

feat(constants): add common browser headers for HTTP requests

* Introduced BROWSER_HEADERS in urls.py to standardize HTTP headers.
* Updated portal.py and http_client.py to utilize BROWSER_HEADERS.
* Removed hardcoded headers from various locations for consistency.
parent c9ac517b
Loading
Loading
Loading
Loading
+3 −14
Original line number Diff line number Diff line
@@ -18,7 +18,7 @@ from typing import Any

import requests

from tdoc_crawler.constants.urls import LOGIN_URL, PORTAL_BASE_URL, TDOC_DOWNLOAD_URL, TDOC_VIEW_URL
from tdoc_crawler.constants.urls import BROWSER_HEADERS, LOGIN_URL, PORTAL_BASE_URL, TDOC_DOWNLOAD_URL, TDOC_VIEW_URL
from tdoc_crawler.credentials import resolve_credentials
from tdoc_crawler.http_client import create_cached_session
from tdoc_crawler.logging import get_logger
@@ -28,17 +28,6 @@ from tdoc_crawler.tdocs.models import TDocMetadata

logger = get_logger(__name__)

# Common browser headers to avoid 403 Forbidden responses.
# NOTE(review): mimics Chrome 120 on Windows 10; presumably some 3GPP portal
# endpoints reject requests with non-browser User-Agents — confirm before
# trimming any entry. Applied to sessions via `session.headers.update(...)`.
_BROWSER_HEADERS: dict[str, str] = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
    "Accept-Language": "en-US,en;q=0.5",
    "Accept-Encoding": "gzip, deflate, br",
    "DNT": "1",
    "Connection": "keep-alive",
    "Upgrade-Insecure-Requests": "1",
}


class PortalAuthenticationError(Exception):
    """Signal that an authentication attempt against the portal failed."""
@@ -299,7 +288,7 @@ class PortalClient:
        """
        if self._session is None:
            self._session = create_cached_session(http_cache=self._http_cache, cache_manager_name=self._cache_manager_name)
            self._session.headers.update(_BROWSER_HEADERS)

        return self._session

    @staticmethod
@@ -310,7 +299,7 @@ class PortalClient:
            Non-cached session with browser-like headers
        """
        session = requests.Session()
        session.headers.update(_BROWSER_HEADERS)
        session.headers.update(BROWSER_HEADERS)
        return session


+11 −0
Original line number Diff line number Diff line
@@ -12,6 +12,17 @@ LOGIN_URL: Final[str] = f"{PORTAL_BASE_URL}/login.aspx"

SPEC_URL_TEMPLATE: Final[str] = "https://www.3gpp.org/ftp/Specs/archive/{series}/{normalized}/{file_name}"

# Common browser headers to avoid 403 Forbidden responses.
# Mimics Chrome 120 on Windows 10; applied to sessions via
# `session.headers.update(BROWSER_HEADERS)`.
# Declared Final for consistency with the other constants in this module
# (e.g. LOGIN_URL, SPEC_URL_TEMPLATE) — the binding must not be reassigned.
BROWSER_HEADERS: Final[dict[str, str]] = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
    "Accept-Language": "en-US,en;q=0.5",
    "Accept-Encoding": "gzip, deflate, br",
    "DNT": "1",
    "Connection": "keep-alive",
    "Upgrade-Insecure-Requests": "1",
}

__all__ = [
    "LOGIN_URL",
    "MEETINGS_BASE_URL",
+6 −17
Original line number Diff line number Diff line
@@ -12,6 +12,7 @@ from hishel.requests import CacheAdapter
from urllib3.util.retry import Retry

from tdoc_crawler.config import resolve_cache_manager
from tdoc_crawler.constants.urls import BROWSER_HEADERS
from tdoc_crawler.logging import get_logger
from tdoc_crawler.models import HttpCacheConfig

@@ -57,19 +58,6 @@ def download_to_file(
        active_session = session

    try:
        # Use requests with browser-like headers to avoid 403 Forbidden
        active_session.headers.update(
            {
                "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
                "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
                "Accept-Language": "en-US,en;q=0.5",
                "Accept-Encoding": "gzip, deflate, br",
                "DNT": "1",
                "Connection": "keep-alive",
                "Upgrade-Insecure-Requests": "1",
            }
        )

        response = active_session.get(url, timeout=60, stream=True)
        response.raise_for_status()
        with destination.open("wb") as target:
@@ -103,6 +91,10 @@ def create_cached_session(
    Returns:
        Configured requests.Session with caching enabled (unless disabled)
    """
    # Create session, use requests with browser-like headers to avoid 403 Forbidden
    session = requests.Session()
    session.headers.update(BROWSER_HEADERS)

    # Check if caching is disabled via parameter or environment variable
    if http_cache_enabled is None:
        env_enabled = os.getenv("HTTP_CACHE_ENABLED", "").lower()
@@ -111,7 +103,7 @@ def create_cached_session(
    # If caching is disabled, return a plain session without caching
    if not http_cache_enabled:
        logger.debug("Creating plain HTTP session (caching disabled)")
        return requests.Session()
        return session

    http_cache = http_cache or HttpCacheConfig.resolve_http_cache_config()
    if http_cache.cache_file is None:
@@ -141,9 +133,6 @@ def create_cached_session(
    # Create cache adapter
    cache_adapter = CacheAdapter(storage=storage, max_retries=max_retries)  # ty:ignore[invalid-argument-type]

    # Create session
    session = requests.Session()

    # Mount the cache adapter
    session.mount("http://", cache_adapter)
    session.mount("https://", cache_adapter)
+0 −1
Original line number Diff line number Diff line
@@ -62,7 +62,6 @@ class MeetingCrawler:
            existing_ids = self.database.get_existing_meeting_ids(working_groups)
        credentials = resolve_credentials(None, None)
        session = create_cached_session(cache_manager_name=config.cache_manager_name, http_cache=config.http_cache)
        session.headers["User-Agent"] = "tdoc-crawler/0.0.1"
        if credentials is not None:
            session.auth = (credentials.username, credentials.password)

+1 −6
Original line number Diff line number Diff line
@@ -35,13 +35,8 @@ def fetch_threegpp_metadata(spec_number: str, http_cache: HttpCacheConfig | None
    compact = normalized.replace(".", "")
    url = f"https://www.3gpp.org/dynareport/{compact}.htm"

    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8",
    }

    session = create_cached_session(http_cache=http_cache, cache_manager_name=cache_manager_name)
    response = session.get(url, timeout=30, allow_redirects=True, headers=headers)
    response = session.get(url, timeout=30, allow_redirects=True)
    response.raise_for_status()

    parsed = urlparse(response.request.url)
Loading