Commit 069bbfab authored by Jan Reimes
Browse files

feat(urls): add new URL templates for document and meeting resources

* Introduced SPEC_DYNAREPORT_URL_TEMPLATE for dynamic report URLs.
* Added DOCLIST_URL_TEMPLATE for generating document list URLs.
* Implemented MEETING_PORTAL_URL_TEMPLATE for meeting portal links.
* Updated fetch functions to utilize new URL templates for better maintainability.
parent 1d66e0da
Loading
Loading
Loading
Loading
+12 −0
Original line number Diff line number Diff line
@@ -12,6 +12,12 @@ TDOC_DOWNLOAD_URL: Final[str] = f"{PORTAL_BASE_URL}/ngppapp/DownloadTDoc.aspx"
LOGIN_URL: Final[str] = f"{PORTAL_BASE_URL}/login.aspx"

# URL templates. The {placeholders} are filled in by callers via str.format().

# Spec file under the 3GPP FTP archive; needs {series}, {normalized} and {file_name}.
SPEC_URL_TEMPLATE: Final[str] = "https://www.3gpp.org/ftp/Specs/archive/{series}/{normalized}/{file_name}"
# Dynareport page of a spec; {compact} is the normalized spec number with the dots removed.
SPEC_DYNAREPORT_URL_TEMPLATE: Final[str] = "https://www.3gpp.org/dynareport/{compact}.htm"
# Document list of a meeting; needs {portal_base} and {meeting_id}.
DOCLIST_URL_TEMPLATE: Final[str] = "{portal_base}/ngppapp/GenerateDocumentList.aspx?meetingId={meeting_id}"
# Meeting page on the portal; callers pass PORTAL_BASE_URL as {portal_base}.
MEETING_PORTAL_URL_TEMPLATE: Final[str] = "{portal_base}/Home.aspx#/meeting?MtgId={meeting_id}"
# whatthespec.net endpoints. NOTE(review): presumably WHATSPEC_BASE_URL is what gets
# passed as {base} in the two templates below - confirm against the callers.
WHATSPEC_BASE_URL: Final[str] = "https://whatthespec.net"
# TDoc lookup; needs {base} and {tdoc_id}.
WHATSPEC_TDOC_URL_TEMPLATE: Final[str] = "{base}/3gpp/tdoc.php?name={tdoc_id}&api=1"
# Spec lookup; needs {base} and {compact}.
WHATSPEC_SPEC_URL_TEMPLATE: Final[str] = "{base}/3gpp/spec.php?q={compact}&api=1"

# Common browser headers to avoid 403 Forbidden responses
BROWSER_HEADERS: dict[str, str] = {
@@ -25,10 +31,16 @@ BROWSER_HEADERS: dict[str, str] = {
}

# Names exported by `from <this module> import *`.
# Entries are listed in ASCII sort order; keep that order when adding a constant.
__all__ = [
    "DOCLIST_URL_TEMPLATE",
    "LOGIN_URL",
    "MEETINGS_REST_URL",
    "MEETING_PORTAL_URL_TEMPLATE",
    "PORTAL_BASE_URL",
    "SPEC_DYNAREPORT_URL_TEMPLATE",
    "SPEC_URL_TEMPLATE",
    "TDOC_DOWNLOAD_URL",
    "TDOC_VIEW_URL",
    "WHATSPEC_BASE_URL",
    "WHATSPEC_SPEC_URL_TEMPLATE",
    "WHATSPEC_TDOC_URL_TEMPLATE",
]
+2 −2
Original line number Diff line number Diff line
@@ -48,7 +48,7 @@ class TDocDatabase(MeetingDatabase):
            Tuple of (created, changed) booleans
        """
        record = self._prepare_tdoc(metadata)
        if getattr(record, "tbid", None) is None:
        if not record.tbid:
            record = await self._resolve_tbid(record)
        existing = await self._get_tdoc(record.tdoc_id)
        now = utc_now()
@@ -102,7 +102,7 @@ class TDocDatabase(MeetingDatabase):
        prepared: list[TDocMetadata] = []
        for metadata in tdocs_list:
            record = self._prepare_tdoc(metadata)
            if getattr(record, "tbid", None) is None:
            if not record.tbid:
                record = await self._resolve_tbid(record)
            prepared.append(record)

+2 −2
Original line number Diff line number Diff line
@@ -13,7 +13,7 @@ from typing import Any

import niquests

from tdoc_crawler.constants.urls import MEETINGS_REST_URL
from tdoc_crawler.constants.urls import MEETING_PORTAL_URL_TEMPLATE, MEETINGS_REST_URL, PORTAL_BASE_URL
from tdoc_crawler.meetings.models import MeetingMetadata
from tdoc_crawler.models.subworking_groups import SUBWORKING_GROUP_RECORDS

@@ -80,7 +80,7 @@ def _portal_meeting_to_metadata(m: dict[str, Any]) -> MeetingMetadata:
        end_date=_parse_portal_date(m.get("EndDate")),
        location=m.get("Location") or None,
        files_url=m.get("MtgDocURL") or None,
        portal_url=f"https://portal.3gpp.org/Home.aspx#/meeting?MtgId={m['Id']}",
        portal_url=MEETING_PORTAL_URL_TEMPLATE.format(portal_base=PORTAL_BASE_URL, meeting_id=m["Id"]),
        tdoc_count=m.get("DocCount", 0),
    )

+5 −5
Original line number Diff line number Diff line
@@ -124,8 +124,8 @@ class SpecDownloads:
                return extract_subdir

            return target_dir
        except Exception as exc:
            _logger.exception("Failed to checkout %s: %s", spec, exc)
        except (ValueError, OSError, requests.RequestException) as exc:
            _logger.warning("Failed to checkout %s: %s", spec, exc)
            return None

    async def _resolve_spec_url_with_fallback(
@@ -183,7 +183,7 @@ class SpecDownloads:

    def _download_full_zip(self, url: str, target_file: Path) -> None:
        """Download full zip file, re-use session if already created for doc-only attempt."""
        self.session = download_to_file(url, target_file, session=self.session, close_session=False, http_cache_file=self._http_cache_file)
        download_to_file(url, target_file, session=self.session, http_cache_file=self._http_cache_file)

    @staticmethod
    def _filter_versions_by_release(
@@ -248,7 +248,7 @@ class SpecDownloads:
            with target_file.open("wb") as f:
                f.write(rzf.open(doc_file).read())
            return True
        except Exception as exc:
        except (OSError, ValueError, requests.RequestException) as exc:
            _logger.warning("Doc-only download failed for %s: %s", url, exc)
            return False

@@ -260,7 +260,7 @@ class SpecDownloads:

            if not keep_zip:
                zip_file.unlink()
        except Exception as exc:
        except (OSError, ValueError) as exc:
            _logger.exception("Failed to extract %s: %s", zip_file, exc)


+2 −1
Original line number Diff line number Diff line
@@ -4,6 +4,7 @@ from pathlib import Path
from urllib.parse import parse_qs, urlparse

from tdoc_crawler.config.settings import HttpConfig
from tdoc_crawler.constants.urls import SPEC_DYNAREPORT_URL_TEMPLATE
from tdoc_crawler.http_client import create_cached_session
from tdoc_crawler.logging import get_logger
from tdoc_crawler.utils.normalization import normalize_spec_number
@@ -32,7 +33,7 @@ def fetch_threegpp_metadata(spec_number: str, http_config: HttpConfig | None = N
    """
    normalized = normalize_spec_number(spec_number)
    compact = normalized.replace(".", "")
    url = f"https://www.3gpp.org/dynareport/{compact}.htm"
    url = SPEC_DYNAREPORT_URL_TEMPLATE.format(compact=compact)

    session = create_cached_session(http_config=http_config, http_cache_file=http_cache_file)
    response = session.get(url, timeout=30, allow_redirects=True)
Loading