Commit 2524e868 authored by Jan Reimes's avatar Jan Reimes
Browse files

feat(tdoc): enhance TDoc fetching and conversion with metadata support

* Add asynchronous metadata resolution for TDoc documents.
* Enrich markdown and JSON outputs with document metadata.
* Refactor fetch_tdoc_files to validate database URLs and convert Oxyde records to Pydantic before use.
* Update _lookup_tdoc_in_db to return OxydeTDocMetadata.
parent fb595481
Loading
Loading
Loading
Loading
+78 −11
Original line number Diff line number Diff line
@@ -6,6 +6,7 @@ No wrapper layers - direct use of OpenDataLoader output files.

from __future__ import annotations

import asyncio
import json
import logging
import tempfile
@@ -15,9 +16,13 @@ from typing import Any
import opendataloader_pdf
from rich.console import Console

from tdoc_crawler.config.settings import PathConfig
from tdoc_crawler.database.oxyde_models import Specification as OxydeSpecification
from tdoc_crawler.database.oxyde_models import TDocMetadata as OxydeTDocMetadata
from tdoc_crawler.database.specs import SpecDatabase
from tdoc_crawler.extraction.conversion import ensure_pdf
from tdoc_crawler.extraction.fetch_spec import fetch_spec_files
from tdoc_crawler.extraction.fetch_tdoc import fetch_tdoc_files
from tdoc_crawler.extraction.fetch_tdoc import _lookup_tdoc_in_db, fetch_tdoc_files
from tdoc_crawler.extraction.hybrid_server import ensure_hybrid_server
from tdoc_crawler.extraction.metrics import MetricType, get_metrics_tracker, timed_operation
from tdoc_crawler.extraction.profiles import DEFAULT_EXTRACTION_PROFILE, ExtractionProfile
@@ -152,7 +157,8 @@ def _ensure_converted(
        config: Optional OpenDataLoader configuration
        source_pdf: If provided, opendataloader processes this PDF instead of the
            original DOCX. This ensures all profiles use the same PDF that was
            generated for the wiki dir.
            generated for the wiki dir. When provided and exists, the fetch step
            is skipped entirely (the caller already resolved the document).
        output_dir: Directory for conversion artifacts. If None, uses a temp dir.

    Returns:
@@ -164,6 +170,9 @@ def _ensure_converted(
    """
    normalized_id = normalize_tdoc_id(document_id)

    if source_pdf is not None and source_pdf.exists():
        primary = source_pdf
    else:
        tdoc_files = fetch_tdoc_files(normalized_id, force_download=force)
        primary = tdoc_files.primary_path
        if primary is None:
@@ -177,14 +186,57 @@ def _ensure_converted(
    if md_file.exists() and not force:
        markdown_content = md_file.read_text(encoding="utf-8")
    else:
        # Use the provided PDF (from wiki dir) or fall back to original source
        input_file = source_pdf if source_pdf is not None and source_pdf.exists() else primary
        markdown_content, _ = _run_opendataloader(input_file, artifact_dir, config=config)
        markdown_content, _ = _run_opendataloader(primary, artifact_dir, config=config)
        md_file.write_text(markdown_content, encoding="utf-8")

    return markdown_content, json_file, normalized_id


async def _resolve_spec_metadata(spec_number: str) -> OxydeSpecification | None:
    """Fetch the specification record for *spec_number* from the local database.

    Returns:
        The stored specification record, or None when no entry exists.
    """
    async with SpecDatabase(PathConfig().db_file) as db:
        record = await db._get_specification(spec_number)
    return record


def _enrich_with_metadata(
    markdown_content: str,
    json_path: Path,
    metadata: OxydeSpecification | OxydeTDocMetadata,
    *,
    source_kind: SourceKind,
    md_yaml_frontmatter: bool = True,
) -> str:
    """Inject document metadata into JSON and optionally prepend YAML frontmatter.

    JSON enrichment always happens. YAML frontmatter is controlled by
    *md_yaml_frontmatter*. Uses the Oxyde model's own serialization
    (model_dump) and field metadata (model_fields) — no hardcoded field names.
    """
    metadata_dict = metadata.model_dump()
    metadata_dict["kind"] = source_kind.value

    # Merge into JSON at top level
    if json_path.exists():
        data = json.loads(json_path.read_text(encoding="utf-8"))
        if isinstance(data, dict):
            data.update(metadata_dict)
            json_path.write_text(json.dumps(data, indent=2, ensure_ascii=False), encoding="utf-8")

    if not md_yaml_frontmatter:
        return markdown_content

    # Generate YAML frontmatter from model fields
    lines = ["---"]
    for field_name in metadata.model_fields:
        value = metadata_dict.get(field_name)
        if value is not None:
            lines.append(f"{field_name}: {value}")
    lines.extend(["kind: " + source_kind.value, "---", ""])

    return "\n".join(lines) + markdown_content


def convert_for_wiki(
    document_id: str,
    wiki_source_dir: Path,
@@ -193,6 +245,7 @@ def convert_for_wiki(
    profile: ExtractionProfile | None = None,
    force: bool = False,
    release: str | None = None,
    md_yaml_frontmatter: bool = True,
) -> Path | None:
    """Convert a document for wiki ingestion using the specified profile.

@@ -217,8 +270,9 @@ def convert_for_wiki(

    wiki_source_dir.mkdir(parents=True, exist_ok=True)

    # Resolve primary document based on source kind
    # Resolve primary document based on source kind and collect metadata
    primary: Path | None = None
    metadata: OxydeSpecification | OxydeTDocMetadata | None = None
    if source_kind == SourceKind.SPEC:
        base_spec = extract_base_spec_number(document_id)
        normalized_id = normalize_spec_number(base_spec)
@@ -228,12 +282,21 @@ def convert_for_wiki(
        primary = spec_files.primary_path
        if primary is None:
            raise ConversionError(f"No document files found for spec {normalized_id}")
    else:
        metadata = asyncio.run(_resolve_spec_metadata(normalized_id))
    elif source_kind == SourceKind.TDOC:
        normalized_id = normalize_tdoc_id(document_id)
        tdoc_files = fetch_tdoc_files(normalized_id, force_download=force)
        primary = tdoc_files.primary_path
        if primary is None:
            raise ConversionError(f"No document files found for {normalized_id}")
        metadata = asyncio.run(_lookup_tdoc_in_db(normalized_id))
    else:
        # SourceKind.OTHER — resolve document but skip metadata enrichment
        normalized_id = document_id
        tdoc_files = fetch_tdoc_files(normalized_id, force_download=force)
        primary = tdoc_files.primary_path
        if primary is None:
            raise ConversionError(f"No document files found for {normalized_id}")

    if profile == ExtractionProfile.PDF_ONLY:
        pdf_path = ensure_pdf(primary, wiki_source_dir, force=force)
@@ -248,6 +311,10 @@ def convert_for_wiki(
    )
    markdown_content, json_path, _ = _ensure_converted(document_id, force=force, config=config, source_pdf=pdf_path, output_dir=wiki_source_dir)

    # Enrich markdown and JSON with document metadata
    if metadata:
        markdown_content = _enrich_with_metadata(markdown_content, json_path, metadata, source_kind=source_kind, md_yaml_frontmatter=md_yaml_frontmatter)

    # Write markdown to wiki source dir
    md_file = wiki_source_dir / f"{primary.stem}.md"
    md_file.write_text(markdown_content, encoding="utf-8")
+15 −9
Original line number Diff line number Diff line
@@ -7,6 +7,7 @@ from dataclasses import dataclass
from pathlib import Path

from tdoc_crawler.config.settings import PathConfig
from tdoc_crawler.database.oxyde_models import TDocMetadata as OxydeTDocMetadata
from tdoc_crawler.database.tdocs import TDocDatabase
from tdoc_crawler.extraction.conversion import OFFICE_FORMATS
from tdoc_crawler.http_client import create_cached_session
@@ -66,9 +67,12 @@ def fetch_tdoc_files(document_id: str, force_download: bool = False) -> TDocFile
                return files

    # Step 2: Look up in local database (populated by crawl command)
    metadata = asyncio.run(_lookup_tdoc_in_db(normalized_id))
    tdoc_record = asyncio.run(_lookup_tdoc_in_db(normalized_id))
    metadata: TDocMetadata | None = None
    if tdoc_record is not None and tdoc_record.url:
        metadata = _oxyde_to_pydantic_tdoc(tdoc_record)

    # Step 3: Fall back to WhatTheSpec API if database has no record
    # Step 3: Fall back to WhatTheSpec API if database has no valid record
    if metadata is None:
        metadata = resolve_via_whatthespec(document_id)

@@ -84,23 +88,25 @@ def fetch_tdoc_files(document_id: str, force_download: bool = False) -> TDocFile
    return _scan_checkout_dir(checkout_path)


async def _lookup_tdoc_in_db(tdoc_id: str) -> TDocMetadata | None:
async def _lookup_tdoc_in_db(tdoc_id: str) -> OxydeTDocMetadata | None:
    """Look up TDoc metadata in the local database.

    The database is populated by the crawl command and contains TDoc metadata
    including the FTP download URL, which is sufficient for checkout.
    Returns the raw Oxyde model — callers decide if URL validation or
    Pydantic conversion is needed. Use _oxyde_to_pydantic_tdoc() to convert.

    Args:
        tdoc_id: Normalized TDoc identifier (uppercase)

    Returns:
        TDocMetadata if found with a valid URL, None otherwise
        OxydeTDocMetadata if found, None otherwise
    """
    db_file = PathConfig().db_file
    async with TDocDatabase(db_file) as db:
        record = await db._get_tdoc(tdoc_id)
    if record is None or not record.url:
        return None
        return await db._get_tdoc(tdoc_id)


def _oxyde_to_pydantic_tdoc(record: OxydeTDocMetadata) -> TDocMetadata:
    """Convert Oxyde TDocMetadata to Pydantic TDocMetadata."""
    return TDocMetadata(
        tdoc_id=record.tdoc_id,
        meeting_id=record.meeting_id,