feat(urls): add new URL templates for document and meeting resources (069bbfab) · Commits · Jan Reimes / 3gpp-crawler

src/tdoc_crawler/constants/urls.py

+12 −0

Original line number	Diff line number	Diff line
		@@ -12,6 +12,12 @@ TDOC_DOWNLOAD_URL: Final[str] = f"{PORTAL_BASE_URL}/ngppapp/DownloadTDoc.aspx"
		LOGIN_URL: Final[str] = f"{PORTAL_BASE_URL}/login.aspx"

		SPEC_URL_TEMPLATE: Final[str] = "https://www.3gpp.org/ftp/Specs/archive/{series}/{normalized}/{file_name}"
		SPEC_DYNAREPORT_URL_TEMPLATE: Final[str] = "https://www.3gpp.org/dynareport/{compact}.htm"
		DOCLIST_URL_TEMPLATE: Final[str] = "{portal_base}/ngppapp/GenerateDocumentList.aspx?meetingId={meeting_id}"
		MEETING_PORTAL_URL_TEMPLATE: Final[str] = "{portal_base}/Home.aspx#/meeting?MtgId={meeting_id}"
		WHATSPEC_BASE_URL: Final[str] = "https://whatthespec.net"
		WHATSPEC_TDOC_URL_TEMPLATE: Final[str] = "{base}/3gpp/tdoc.php?name={tdoc_id}&api=1"
		WHATSPEC_SPEC_URL_TEMPLATE: Final[str] = "{base}/3gpp/spec.php?q={compact}&api=1"

		# Common browser headers to avoid 403 Forbidden responses
		BROWSER_HEADERS: dict[str, str] = {
		@@ -25,10 +31,16 @@ BROWSER_HEADERS: dict[str, str] = {
		}

		__all__ = [
		"DOCLIST_URL_TEMPLATE",
		"LOGIN_URL",
		"MEETINGS_REST_URL",
		"MEETING_PORTAL_URL_TEMPLATE",
		"PORTAL_BASE_URL",
		"SPEC_DYNAREPORT_URL_TEMPLATE",
		"SPEC_URL_TEMPLATE",
		"TDOC_DOWNLOAD_URL",
		"TDOC_VIEW_URL",
		"WHATSPEC_BASE_URL",
		"WHATSPEC_SPEC_URL_TEMPLATE",
		"WHATSPEC_TDOC_URL_TEMPLATE",
		]

+2 −2

Original line number	Diff line number	Diff line
		@@ -48,7 +48,7 @@ class TDocDatabase(MeetingDatabase):
		Tuple of (created, changed) booleans
		"""
		record = self._prepare_tdoc(metadata)
		if getattr(record, "tbid", None) is None:
		if not record.tbid:
		record = await self._resolve_tbid(record)
		existing = await self._get_tdoc(record.tdoc_id)
		now = utc_now()
		@@ -102,7 +102,7 @@ class TDocDatabase(MeetingDatabase):
		prepared: list[TDocMetadata] = []
		for metadata in tdocs_list:
		record = self._prepare_tdoc(metadata)
		if getattr(record, "tbid", None) is None:
		if not record.tbid:
		record = await self._resolve_tbid(record)
		prepared.append(record)

+2 −2

Original line number	Diff line number	Diff line
		@@ -13,7 +13,7 @@ from typing import Any

		import niquests

		from tdoc_crawler.constants.urls import MEETINGS_REST_URL
		from tdoc_crawler.constants.urls import MEETING_PORTAL_URL_TEMPLATE, MEETINGS_REST_URL, PORTAL_BASE_URL
		from tdoc_crawler.meetings.models import MeetingMetadata
		from tdoc_crawler.models.subworking_groups import SUBWORKING_GROUP_RECORDS

		@@ -80,7 +80,7 @@ def _portal_meeting_to_metadata(m: dict[str, Any]) -> MeetingMetadata:
		end_date=_parse_portal_date(m.get("EndDate")),
		location=m.get("Location") or None,
		files_url=m.get("MtgDocURL") or None,
		portal_url=f"https://portal.3gpp.org/Home.aspx#/meeting?MtgId={m['Id']}",
		portal_url=MEETING_PORTAL_URL_TEMPLATE.format(portal_base=PORTAL_BASE_URL, meeting_id=m["Id"]),
		tdoc_count=m.get("DocCount", 0),
		)

+5 −5

Original line number	Diff line number	Diff line
		@@ -124,8 +124,8 @@ class SpecDownloads:
		return extract_subdir

		return target_dir
		except Exception as exc:
		_logger.exception("Failed to checkout %s: %s", spec, exc)
		except (ValueError, OSError, requests.RequestException) as exc:
		_logger.warning("Failed to checkout %s: %s", spec, exc)
		return None

		async def _resolve_spec_url_with_fallback(
		@@ -183,7 +183,7 @@ class SpecDownloads:

		def _download_full_zip(self, url: str, target_file: Path) -> None:
		"""Download full zip file, re-use session if already created for doc-only attempt."""
		self.session = download_to_file(url, target_file, session=self.session, close_session=False, http_cache_file=self._http_cache_file)
		download_to_file(url, target_file, session=self.session, http_cache_file=self._http_cache_file)

		@staticmethod
		def _filter_versions_by_release(
		@@ -248,7 +248,7 @@ class SpecDownloads:
		with target_file.open("wb") as f:
		f.write(rzf.open(doc_file).read())
		return True
		except Exception as exc:
		except (OSError, ValueError, requests.RequestException) as exc:
		_logger.warning("Doc-only download failed for %s: %s", url, exc)
		return False

		@@ -260,7 +260,7 @@ class SpecDownloads:

		if not keep_zip:
		zip_file.unlink()
		except Exception as exc:
		except (OSError, ValueError) as exc:
		_logger.exception("Failed to extract %s: %s", zip_file, exc)

+2 −1

Original line number	Diff line number	Diff line
		@@ -4,6 +4,7 @@ from pathlib import Path
		from urllib.parse import parse_qs, urlparse

		from tdoc_crawler.config.settings import HttpConfig
		from tdoc_crawler.constants.urls import SPEC_DYNAREPORT_URL_TEMPLATE
		from tdoc_crawler.http_client import create_cached_session
		from tdoc_crawler.logging import get_logger
		from tdoc_crawler.utils.normalization import normalize_spec_number
		@@ -32,7 +33,7 @@ def fetch_threegpp_metadata(spec_number: str, http_config: HttpConfig | None = N
		"""
		normalized = normalize_spec_number(spec_number)
		compact = normalized.replace(".", "")
		url = f"https://www.3gpp.org/dynareport/{compact}.htm"
		url = SPEC_DYNAREPORT_URL_TEMPLATE.format(compact=compact)

		session = create_cached_session(http_config=http_config, http_cache_file=http_cache_file)
		response = session.get(url, timeout=30, allow_redirects=True)