Commit 05e31220 authored by Jan Reimes
Browse files

feat(download): replace urlopen with requests for improved download handling

- Use requests library to handle downloads with browser-like headers.
- Avoid 403 Forbidden errors by setting appropriate User-Agent and headers.
- Maintain timeout settings for download operations.
parent 79c46fae
Loading
Loading
Loading
Loading
+20 −3
Original line number Diff line number Diff line
@@ -13,8 +13,8 @@ from collections.abc import Iterable
from contextlib import suppress
from pathlib import Path
from urllib.parse import urlparse
from urllib.request import urlopen

import requests
import typer
from rich.console import Console

@@ -261,8 +261,25 @@ def download_to_path(url: str, destination: Path) -> None:
    lowered = url.lower()
    if not lowered.startswith(ALLOWED_DOWNLOAD_SCHEMES):
        raise ValueError("unsupported-url-scheme")
    with urlopen(url, timeout=DOWNLOAD_TIMEOUT) as response, destination.open("wb") as target:  # noqa: S310
        shutil.copyfileobj(response, target)

    # Fetch with browser-like headers; some servers answer 403 Forbidden to
    # clients that do not look like a browser.
    # Accept-Encoding is intentionally NOT overridden: the requests default only
    # advertises encodings it can decode, whereas forcing "br" without a Brotli
    # library installed could leave still-compressed bytes in the saved file.
    browser_headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        "Accept-Language": "en-US,en;q=0.5",
        "DNT": "1",
        "Connection": "keep-alive",
        "Upgrade-Insecure-Requests": "1",
    }

    # Both context managers guarantee cleanup (pooled connections, the open
    # response) even when the request or the HTTP status check raises.
    with requests.Session() as session:
        session.headers.update(browser_headers)
        with session.get(url, timeout=DOWNLOAD_TIMEOUT, stream=True) as response:
            # Raises requests.HTTPError on 4xx/5xx before the destination is touched.
            response.raise_for_status()
            # Stream to disk in chunks rather than buffering the whole body in
            # memory; iter_content still applies gzip/deflate decoding.
            with destination.open("wb") as target:
                for chunk in response.iter_content(chunk_size=64 * 1024):
                    target.write(chunk)


def prepare_tdoc_file(metadata: TDocMetadata, cache_dir: Path) -> Path: