refactor(tdoc): streamline metadata handling and improve error messages (d83948eb) · Commits · Jan Reimes / 3gpp-crawler

src/tdoc_crawler/extraction/convert.py

+14 −5

Original line number	Diff line number	Diff line
		@@ -205,6 +205,7 @@ def _enrich_with_metadata(
		metadata: OxydeSpecification | OxydeTDocMetadata,
		*,
		source_kind: SourceKind,
		extraction_profile: ExtractionProfile = ExtractionProfile.DEFAULT,
		md_yaml_frontmatter: bool = True,
		) -> str:
		"""Inject document metadata into JSON and optionally prepend YAML frontmatter.
		@@ -213,8 +214,9 @@ def _enrich_with_metadata(
		md_yaml_frontmatter. Uses the Oxyde model's own serialization
		(model_dump) and field metadata (model_fields) — no hardcoded field names.
		"""
		metadata_dict = metadata.model_dump()
		metadata_dict = metadata.model_dump(mode="json")
		metadata_dict["kind"] = source_kind.value
		metadata_dict["extraction_profile"] = extraction_profile.value

		# Merge into JSON at top level
		if json_path.exists():
		@@ -232,7 +234,7 @@ def _enrich_with_metadata(
		value = metadata_dict.get(field_name)
		if value is not None:
		lines.append(f"{field_name}: {value}")
		lines.extend(["kind: " + source_kind.value, "---", ""])
		lines.extend(["kind: " + source_kind.value, "extraction_profile: " + extraction_profile.value, "---", ""])

		return "\n".join(lines) + markdown_content

		@@ -302,8 +304,8 @@ def convert_for_wiki(
		pdf_path = ensure_pdf(primary, wiki_source_dir, force=force)
		return pdf_path

		# default or advanced: generate wiki PDF first, then opendataloader processes it
		pdf_path = ensure_pdf(primary, wiki_source_dir, force=force)
		# default or advanced: reuse existing PDF, regenerate md/json only
		pdf_path = ensure_pdf(primary, wiki_source_dir, force=False)

		config = OpendataloaderConfig(
		hybrid="docling-fast",
		@@ -313,7 +315,14 @@ def convert_for_wiki(

		# Enrich markdown and JSON with document metadata
		if metadata:
		markdown_content = _enrich_with_metadata(markdown_content, json_path, metadata, source_kind=source_kind, md_yaml_frontmatter=md_yaml_frontmatter)
		markdown_content = _enrich_with_metadata(
		markdown_content,
		json_path,
		metadata,
		source_kind=source_kind,
		extraction_profile=profile,
		md_yaml_frontmatter=md_yaml_frontmatter,
		)

		# Write markdown to wiki source dir
		md_file = wiki_source_dir / f"{primary.stem}.md"

src/tdoc_crawler/specs/sources/threegpp.py

+3 −5

Original line number	Diff line number	Diff line
		@@ -54,7 +54,7 @@ _RE_TYPE = re.compile(r'<span id="typeVal">([^<]+)</span>')

		# Responsibility tab — primary working group
		_RE_PRIMARY_GROUP = re.compile(
		r'PrimaryResponsibleGroupLbl.*?<span>\s*([^<]+?)\s*</span>',
		r"PrimaryResponsibleGroupLbl.*?<span>\s*([^<]+?)\s*</span>",
		re.DOTALL,
		)

		@@ -62,7 +62,7 @@ _RE_PRIMARY_GROUP = re.compile(
		# <a id="..._lnkFtpDownload" ... href=".../26260-j10.zip">19.1.0</a>
		_RE_VERSION_LINK = re.compile(
		r'<a\s[^>]*id="[^"]*_lnkFtpDownload"[^>]*href="([^"]+)"[^>]*>'
		r'\s*(\d+\.\d+\.\d+)\s*</a>',
		r"\s*(\d+\.\d+\.\d+)\s*</a>",
		re.IGNORECASE,
		)

		@@ -262,9 +262,7 @@ def fetch_threegpp_metadata(
		normalized,
		dynareport_url,
		)
		raise SpecNotFoundError(
		f"Spec {normalized} not found on 3GPP portal or has no versions"
		)
		raise SpecNotFoundError(f"Spec {normalized} not found on 3GPP portal or has no versions")

		# Use the redirected URL as source identifier
		portal_url = str(response.url) if hasattr(response, "url") else dynareport_url

src/tdoc_crawler/tdocs/operations/checkout.py

+2 −6

Original line number	Diff line number	Diff line
		@@ -112,9 +112,7 @@ def checkout_tdoc(
		with zipfile.ZipFile(temp_zip_file) as archive:
		archive.extractall(checkout_path)
		except zipfile.BadZipFile as exc:
		raise FileNotFoundError(
		f"invalid-zip: {metadata.tdoc_id} ({zip_size} bytes, {metadata.url}): {exc}"
		) from exc
		raise FileNotFoundError(f"invalid-zip: {metadata.tdoc_id} ({zip_size} bytes, {metadata.url}): {exc}") from exc
		logger.info(f"Checked out {metadata.tdoc_id} to {checkout_path}")
		finally:
		if temp_zip_file.exists():
		@@ -170,9 +168,7 @@ def prepare_tdoc_file(metadata: TDocMetadata, checkout_dir: Path, return_dir: bo
		with zipfile.ZipFile(zip_file) as archive:
		archive.extractall(extract_dir)
		except zipfile.BadZipFile as exc:
		raise FileNotFoundError(
		f"invalid-zip: {metadata.tdoc_id} ({zip_size} bytes, {metadata.url}): {exc}"
		) from exc
		raise FileNotFoundError(f"invalid-zip: {metadata.tdoc_id} ({zip_size} bytes, {metadata.url}): {exc}") from exc
		finally:
		with suppress(FileNotFoundError):
		zip_file.unlink()

tests/test_specs_sources.py

+2 −7

Original line number	Diff line number	Diff line
		@@ -32,10 +32,7 @@ class _FakeResponse:
		class _FakePortalResponse:
		"""Fake HTTP response with HTML text from the portal spec details page."""

		_PORTAL_URL = (
		"https://portal.3gpp.org/desktopmodules/Specifications"
		"/SpecificationDetails.aspx?specificationId=3314"
		)
		_PORTAL_URL = "https://portal.3gpp.org/desktopmodules/Specifications/SpecificationDetails.aspx?specificationId=3314"

		def __init__(self, html_body: str, url: str = _PORTAL_URL) -> None:
		self.text = html_body
		@@ -159,9 +156,7 @@ def test_fetch_threegpp_metadata_parses_portal_page(monkeypatch: object) -> None

		def test_fetch_threegpp_metadata_parses_tr_type(monkeypatch: object) -> None:
		"""Test that Technical Report type is correctly parsed to TR."""
		html = _PORTAL_HTML_26260.replace(
		"Technical specification (TS)", "Technical Report (TR)"
		)
		html = _PORTAL_HTML_26260.replace("Technical specification (TS)", "Technical Report (TR)")
		response = _FakePortalResponse(html)

		class _FakeSession: