Commit 74bc0f0d authored by Jan Reimes
Browse files

fix(3gpp): detect missing specs and raise SpecNotFoundError

- Add SpecNotFoundError and raise it when the redirect URL lacks specificationId
- Update sources `__init__` to export the error class
- Catalog: skip specs when the 3GPP source errors and mark them as skipped
- Add tests asserting SpecNotFoundError and improved redirect URL handling
parent c510f401
Loading
Loading
Loading
Loading
+16 −0
Original line number Diff line number Diff line
@@ -160,6 +160,22 @@ class SpecCatalog:
                )
                continue

            # Check if 3GPP source failed - if so, skip this spec entirely
            threegpp_outcome = next((o for o in outcomes if o.source_name == "3gpp"), None)
            if threegpp_outcome is not None and threegpp_outcome.status == "error":
                _logger.warning("Skipping spec %s due to 3GPP source error", normalized)
                results.append(
                    SpecCrawlResult(
                        spec_number=normalized,
                        release=release,
                        status="skipped",
                        latest_version=None,
                        sources=outcomes,
                        message="3gpp-source-error",
                    )
                )
                continue

            release_matches = release == "latest" or any(release in outcome.versions for outcome in outcomes if outcome.status == "ok")
            if not release_matches:
                results.append(
+2 −2
Original line number Diff line number Diff line
"""Spec metadata sources."""

from .threegpp import fetch_threegpp_metadata
from .threegpp import SpecNotFoundError, fetch_threegpp_metadata
from .whatthespec import fetch_whatthespec_metadata

__all__ = ["fetch_threegpp_metadata", "fetch_whatthespec_metadata"]
__all__ = ["SpecNotFoundError", "fetch_threegpp_metadata", "fetch_whatthespec_metadata"]
+25 −8
Original line number Diff line number Diff line
@@ -10,20 +10,40 @@ from tdoc_crawler.specs.normalization import normalize_spec_number
_logger = logging.getLogger(__name__)


class SpecNotFoundError(Exception):
    """Raised when a specification is not found on 3GPP portal.

    ``fetch_threegpp_metadata`` raises this when the URL it ends up on after
    following redirects carries no ``specificationId`` query parameter.

    It derives directly from ``Exception`` (not from ``requests.HTTPError``),
    so callers can tell "the spec does not exist" apart from a failed HTTP
    request.
    """


def fetch_threegpp_metadata(spec_number: str) -> dict[str, object]:
    """Fetch spec metadata via 3GPP.org redirect flow."""
    """Fetch spec metadata via 3GPP.org redirect flow.

    Args:
        spec_number: Spec number to fetch (e.g., "26.131").

    Returns:
        Dictionary containing spec metadata from 3GPP portal.

    Raises:
        SpecNotFoundError: If the spec does not exist (redirects to unknown spec page).
        requests.HTTPError: If the HTTP request fails.
    """
    normalized = normalize_spec_number(spec_number)
    compact = normalized.replace(".", "")
    url = f"https://www.3gpp.org/specs/{compact}"
    url = f"https://www.3gpp.org/dynareport/{compact}.htm"

    response = requests.get(url, timeout=30, allow_redirects=True)
    response.raise_for_status()

    parsed = urlparse(response.url)
    query = parse_qs(parsed.query)
    spec_id = None
    if "specificationId" in query:
        spec_id = query["specificationId"][0]
    spec_id = query.get("specificationId", [None])[0]

    # Verify the spec exists by checking for specificationId in the redirect URL
    if spec_id is None:
        _logger.warning("Spec %s not found (no specificationId in redirect URL: %s)", normalized, response.url)
        raise SpecNotFoundError(f"Spec {normalized} not found on 3GPP portal")

    payload: dict[str, object]
    try:
@@ -31,9 +51,6 @@ def fetch_threegpp_metadata(spec_number: str) -> dict[str, object]:
    except ValueError:
        payload = {}

    if spec_id is None:
        _logger.debug("No specificationId found in redirect URL: %s", response.url)

    versions = payload.get("versions")
    if not isinstance(versions, list):
        versions = []
+35 −2
Original line number Diff line number Diff line
@@ -2,7 +2,9 @@

from typing import Any

from tdoc_crawler.specs.sources.threegpp import fetch_threegpp_metadata
import pytest

from tdoc_crawler.specs.sources.threegpp import SpecNotFoundError, fetch_threegpp_metadata
from tdoc_crawler.specs.sources.whatthespec import fetch_whatthespec_metadata


@@ -21,7 +23,7 @@ class _FakeResponse:

def test_fetch_threegpp_metadata_uses_redirect(monkeypatch) -> None:
    payload = {"title": "Spec title", "versions": ["19.0.0"]}
    response = _FakeResponse("https://example.org/specificationId=12345", payload)
    response = _FakeResponse("https://portal.3gpp.org/desktopmodules/Specifications/SpecificationDetails.aspx?specificationId=12345", payload)

    def _fake_get(*_args: object, **_kwargs: object) -> _FakeResponse:
        return response
@@ -31,6 +33,37 @@ def test_fetch_threegpp_metadata_uses_redirect(monkeypatch) -> None:
    result = fetch_threegpp_metadata("26.132")
    assert result["spec_number"] == "26.132"
    assert result["source_name"] == "3gpp"
    assert result["source_identifier"] == "12345"


def test_fetch_threegpp_metadata_raises_when_spec_not_found(monkeypatch) -> None:
    """A portal details URL lacking ``specificationId`` must raise SpecNotFoundError."""
    # Final URL is the portal details page, but with no query string at all.
    landing_url = "https://portal.3gpp.org/desktopmodules/Specifications/SpecificationDetails.aspx"
    stub = _FakeResponse(landing_url, {"title": "Unknown Specification"})

    # Every call to requests.get, whatever its arguments, yields the canned response.
    monkeypatch.setattr("requests.get", lambda *_args, **_kwargs: stub)

    with pytest.raises(SpecNotFoundError, match="not found on 3GPP portal"):
        fetch_threegpp_metadata("99.999")


def test_fetch_threegpp_metadata_raises_when_redirect_to_unknown_page(monkeypatch) -> None:
    """Landing on a generic page with no ``specificationId`` must raise SpecNotFoundError."""
    # Final URL is a generic site page with no query string; the JSON payload is empty.
    stub = _FakeResponse("https://www.3gpp.org/specifications/unknown", {})

    # Every call to requests.get, whatever its arguments, yields the canned response.
    monkeypatch.setattr("requests.get", lambda *_args, **_kwargs: stub)

    with pytest.raises(SpecNotFoundError, match="not found on 3GPP portal"):
        fetch_threegpp_metadata("26.999")


def test_fetch_whatthespec_metadata_parses_json(monkeypatch) -> None: