Commit d83948eb authored by Jan Reimes
Browse files

refactor(tdoc): streamline metadata handling and improve error messages

* Update _enrich_with_metadata to include extraction profile in JSON output and YAML frontmatter.
* Modify convert_for_wiki to reuse existing PDFs instead of regenerating.
* Normalize quoting of regex pattern literals in threegpp.py (double quotes where possible) for consistency.
* Reformat the invalid-ZIP error raising in checkout operations onto single lines (message and exception type unchanged).
parent a4a3edbb
Loading
Loading
Loading
Loading
+14 −5
Original line number Diff line number Diff line
@@ -205,6 +205,7 @@ def _enrich_with_metadata(
    metadata: OxydeSpecification | OxydeTDocMetadata,
    *,
    source_kind: SourceKind,
    extraction_profile: ExtractionProfile = ExtractionProfile.DEFAULT,
    md_yaml_frontmatter: bool = True,
) -> str:
    """Inject document metadata into JSON and optionally prepend YAML frontmatter.
@@ -213,8 +214,9 @@ def _enrich_with_metadata(
    *md_yaml_frontmatter*. Uses the Oxyde model's own serialization
    (model_dump) and field metadata (model_fields) — no hardcoded field names.
    """
    metadata_dict = metadata.model_dump()
    metadata_dict = metadata.model_dump(mode="json")
    metadata_dict["kind"] = source_kind.value
    metadata_dict["extraction_profile"] = extraction_profile.value

    # Merge into JSON at top level
    if json_path.exists():
@@ -232,7 +234,7 @@ def _enrich_with_metadata(
        value = metadata_dict.get(field_name)
        if value is not None:
            lines.append(f"{field_name}: {value}")
    lines.extend(["kind: " + source_kind.value, "---", ""])
    lines.extend(["kind: " + source_kind.value, "extraction_profile: " + extraction_profile.value, "---", ""])

    return "\n".join(lines) + markdown_content

@@ -302,8 +304,8 @@ def convert_for_wiki(
        pdf_path = ensure_pdf(primary, wiki_source_dir, force=force)
        return pdf_path

    # default or advanced: generate wiki PDF first, then opendataloader processes it
    pdf_path = ensure_pdf(primary, wiki_source_dir, force=force)
    # default or advanced: reuse existing PDF, regenerate md/json only
    pdf_path = ensure_pdf(primary, wiki_source_dir, force=False)

    config = OpendataloaderConfig(
        hybrid="docling-fast",
@@ -313,7 +315,14 @@ def convert_for_wiki(

    # Enrich markdown and JSON with document metadata
    if metadata:
        markdown_content = _enrich_with_metadata(markdown_content, json_path, metadata, source_kind=source_kind, md_yaml_frontmatter=md_yaml_frontmatter)
        markdown_content = _enrich_with_metadata(
            markdown_content,
            json_path,
            metadata,
            source_kind=source_kind,
            extraction_profile=profile,
            md_yaml_frontmatter=md_yaml_frontmatter,
        )

    # Write markdown to wiki source dir
    md_file = wiki_source_dir / f"{primary.stem}.md"
+3 −5
Original line number Diff line number Diff line
@@ -54,7 +54,7 @@ _RE_TYPE = re.compile(r'<span id="typeVal">([^<]+)</span>')

# Responsibility tab — primary working group
_RE_PRIMARY_GROUP = re.compile(
    r'PrimaryResponsibleGroupLbl.*?<span>\s*([^<]+?)\s*</span>',
    r"PrimaryResponsibleGroupLbl.*?<span>\s*([^<]+?)\s*</span>",
    re.DOTALL,
)

@@ -62,7 +62,7 @@ _RE_PRIMARY_GROUP = re.compile(
# <a id="..._lnkFtpDownload" ... href=".../26260-j10.zip">19.1.0</a>
_RE_VERSION_LINK = re.compile(
    r'<a\s[^>]*id="[^"]*_lnkFtpDownload"[^>]*href="([^"]+)"[^>]*>'
    r'\s*(\d+\.\d+\.\d+)\s*</a>',
    r"\s*(\d+\.\d+\.\d+)\s*</a>",
    re.IGNORECASE,
)

@@ -262,9 +262,7 @@ def fetch_threegpp_metadata(
            normalized,
            dynareport_url,
        )
        raise SpecNotFoundError(
            f"Spec {normalized} not found on 3GPP portal or has no versions"
        )
        raise SpecNotFoundError(f"Spec {normalized} not found on 3GPP portal or has no versions")

    # Use the redirected URL as source identifier
    portal_url = str(response.url) if hasattr(response, "url") else dynareport_url
+2 −6
Original line number Diff line number Diff line
@@ -112,9 +112,7 @@ def checkout_tdoc(
            with zipfile.ZipFile(temp_zip_file) as archive:
                archive.extractall(checkout_path)
        except zipfile.BadZipFile as exc:
            raise FileNotFoundError(
                f"invalid-zip: {metadata.tdoc_id} ({zip_size} bytes, {metadata.url}): {exc}"
            ) from exc
            raise FileNotFoundError(f"invalid-zip: {metadata.tdoc_id} ({zip_size} bytes, {metadata.url}): {exc}") from exc
        logger.info(f"Checked out {metadata.tdoc_id} to {checkout_path}")
    finally:
        if temp_zip_file.exists():
@@ -170,9 +168,7 @@ def prepare_tdoc_file(metadata: TDocMetadata, checkout_dir: Path, return_dir: bo
            with zipfile.ZipFile(zip_file) as archive:
                archive.extractall(extract_dir)
        except zipfile.BadZipFile as exc:
            raise FileNotFoundError(
                f"invalid-zip: {metadata.tdoc_id} ({zip_size} bytes, {metadata.url}): {exc}"
            ) from exc
            raise FileNotFoundError(f"invalid-zip: {metadata.tdoc_id} ({zip_size} bytes, {metadata.url}): {exc}") from exc
        finally:
            with suppress(FileNotFoundError):
                zip_file.unlink()
+2 −7
Original line number Diff line number Diff line
@@ -32,10 +32,7 @@ class _FakeResponse:
class _FakePortalResponse:
    """Fake HTTP response with HTML text from the portal spec details page."""

    _PORTAL_URL = (
        "https://portal.3gpp.org/desktopmodules/Specifications"
        "/SpecificationDetails.aspx?specificationId=3314"
    )
    _PORTAL_URL = "https://portal.3gpp.org/desktopmodules/Specifications/SpecificationDetails.aspx?specificationId=3314"

    def __init__(self, html_body: str, url: str = _PORTAL_URL) -> None:
        self.text = html_body
@@ -159,9 +156,7 @@ def test_fetch_threegpp_metadata_parses_portal_page(monkeypatch: object) -> None

def test_fetch_threegpp_metadata_parses_tr_type(monkeypatch: object) -> None:
    """Test that Technical Report type is correctly parsed to TR."""
    html = _PORTAL_HTML_26260.replace(
        "Technical specification (TS)", "Technical Report (TR)"
    )
    html = _PORTAL_HTML_26260.replace("Technical specification (TS)", "Technical Report (TR)")
    response = _FakePortalResponse(html)

    class _FakeSession: