Inline markitdown module into convert.py — it was just pymupdf4llm (f326abf2) · Commits · Jan Reimes / 3gpp-crawler

src/tdoc_crawler/extraction/convert.py

+79 −13

Original line number	Diff line number	Diff line
		@@ -4,13 +4,14 @@ After checkout/fetch, all source types (TDocs, Specs, Other) are treated as
		generic documents. Four extraction profiles are supported:

		- pdf-only: Convert to PDF only (via LibreOffice). No structured extraction.
		- markdown-only: Fast markitdown extraction (.md only, no ML).
		- markdown-only: Fast pymupdf4llm extraction (.md only, no ML).
		- default: PDF → Docling structured output (Markdown + JSON).
		- advanced: Same as default + picture descriptions, code/formula enrichment.
		"""

		from __future__ import annotations

		import base64
		import logging
		import os
		import re
		@@ -21,6 +22,7 @@ from enum import Enum
		from pathlib import Path

		import niquests as requests
		import pymupdf4llm
		from convert_lo import LibreOfficeFormat
		from convert_lo.converter import Converter

		@@ -32,7 +34,6 @@ from tdoc_crawler.extraction.docling import (
		from tdoc_crawler.extraction.errors import ConversionError
		from tdoc_crawler.extraction.fetch_tdoc import fetch_spec_files as fetch_spec_files_from_tdoc
		from tdoc_crawler.extraction.fetch_tdoc import fetch_tdoc_files
		from tdoc_crawler.extraction.markitdown import _run_markitdown
		from tdoc_crawler.extraction.metrics import MetricType, get_metrics_tracker, timed_operation
		from tdoc_crawler.extraction.profiles import DEFAULT_EXTRACTION_PROFILE, ExtractionProfile
		from tdoc_crawler.models.workspaces import SourceKind
		@@ -224,6 +225,74 @@ def _convert_via_remote(
		raise ConversionError(msg) from e


		# ---------------------------------------------------------------------------
		# pymupdf4llm markdown extraction (fast, no ML)
		# ---------------------------------------------------------------------------

		# Matches ``![alt](media/image_NNN.png)`` — pymupdf4llm output with write_images=True.
		_MEDIA_REF_PATTERN = re.compile(r"!\[([^\]]*)\]\((media/[^)]+)\)")

		_SUFFIX_TO_MIME: dict[str, str] = {
		".png": "image/png",
		".jpg": "image/jpeg",
		".jpeg": "image/jpeg",
		".gif": "image/gif",
		".bmp": "image/bmp",
		".webp": "image/webp",
		}


		def _embed_media_as_base64(text: str, output_dir: Path) -> str:
		    """Replace ``media/`` image references in *text* with base64 data URIs.

		    Every Markdown image link matched by ``_MEDIA_REF_PATTERN`` is resolved
		    relative to *output_dir*, read from disk and rewritten as
		    ``![alt](data:<mime>;base64,<payload>)``.  The MIME type is looked up in
		    ``_SUFFIX_TO_MIME`` by lower-cased file suffix and falls back to
		    ``application/octet-stream``.

		    The ``media/`` directory is removed once every matched reference has
		    been embedded.  If an image cannot be read, a warning is logged, its
		    original link is kept, and the directory is left in place so that the
		    surviving link does not point at a deleted file.

		    Args:
		        text: Markdown text that may contain ``media/`` image links.
		        output_dir: Directory the ``media/`` links are relative to.

		    Returns:
		        The Markdown text with all readable images inlined.
		    """
		    media_dir = output_dir / "media"
		    failed: list[Path] = []

		    def _replacement(match: re.Match) -> str:
		        alt_text, rel_path = match.group(1), match.group(2)
		        img_path = output_dir / rel_path
		        try:
		            b64 = base64.b64encode(img_path.read_bytes()).decode("ascii")
		        except (OSError, ValueError):
		            logger.warning("Failed to embed image: %s", img_path, exc_info=True)
		            failed.append(img_path)
		            return match.group(0)
		        mime = _SUFFIX_TO_MIME.get(img_path.suffix.lower(), "application/octet-stream")
		        return f"![{alt_text}](data:{mime};base64,{b64})"

		    result = _MEDIA_REF_PATTERN.sub(_replacement, text)
		    # Deleting the directory while un-embedded links remain would leave
		    # those links dangling, so only clean up after a fully successful pass.
		    if not failed and media_dir.exists():
		        shutil.rmtree(media_dir, ignore_errors=True)
		    return result


		def _run_markdown_only(
		    primary: Path,
		    output_dir: Path,
		    *,
		    extract_media: bool = False,
		) -> Path:
		    """Convert a PDF document to Markdown using pymupdf4llm.

		    Images are always written to ``output_dir/media`` as PNG files during
		    conversion.  With ``extract_media=False`` they are then inlined into the
		    Markdown as base64 data URIs by ``_embed_media_as_base64``; with
		    ``extract_media=True`` the files are kept and the Markdown refers to
		    them through relative ``media/...`` links.

		    Args:
		        primary: Path of the PDF to convert.
		        output_dir: Directory that receives the ``.md`` file (and ``media/``).
		        extract_media: Keep images as files next to the Markdown instead of
		            embedding them.

		    Returns:
		        Path of the written Markdown file, ``output_dir/<primary stem>.md``.
		    """
		    media_dir = output_dir / "media"
		    media_dir.mkdir(parents=True, exist_ok=True)

		    md_text = pymupdf4llm.to_markdown(
		        str(primary),
		        write_images=True,
		        image_path=str(media_dir),
		        image_format="png",
		    )

		    # NOTE(review): pymupdf4llm appears to build each image link from
		    # ``image_path`` exactly as passed, i.e. ``](<output_dir>/media/...)``
		    # — confirm against the installed version.  Such links are neither
		    # relative to the Markdown file nor matched by the ``media/`` pattern
		    # used for embedding, so rewrite the prefix to a plain ``media/``.
		    # The replace is a no-op when the links are already relative.
		    md_text = md_text.replace(f"]({media_dir.as_posix()}/", "](media/")

		    if not extract_media:
		        md_text = _embed_media_as_base64(md_text, output_dir)

		    md_path = output_dir / f"{primary.stem}.md"
		    md_path.write_text(md_text, encoding="utf-8")
		    return md_path


		# ---------------------------------------------------------------------------
		# Source file resolution (source-kind-specific, before generic conversion)
		# ---------------------------------------------------------------------------
		@@ -292,9 +361,9 @@ def convert_for_wiki(
		"""Convert a document for wiki ingestion using the specified profile.

		For markdown-only the pipeline is:
		1. Convert legacy ``.doc`` files to PDF (via LibreOffice) — they are binary
		format and cannot be processed directly by markitdown.
		2. Feed ``.docx`` or PDF to markitdown — no ML, no JSON.
		1. Convert Office documents to PDF (via LibreOffice); native PDFs pass through.
		2. Feed PDF to :func:`pymupdf4llm.to_markdown` for layout-aware extraction
		with inline images, tables, and multi-column support.
		3. Write ``.md`` to wiki_source_dir.

		For default and advanced profiles the pipeline is:
		@@ -312,7 +381,7 @@ def convert_for_wiki(
		docling_config: Optional Docling-specific configuration (figure/table modes).
		docx_direct: Feed .docx directly to backend, skip LibreOffice PDF step.
		extract_media: Extract embedded images to ``./media/`` next to the
		markdown (markdown-only profile, .docx only).
		markdown (markdown-only profile).

		Returns:
		Path to the primary output file (PDF for pdf-only, MD for others),
		@@ -335,19 +404,16 @@ def convert_for_wiki(
		if profile == ExtractionProfile.PDF_ONLY:
		return ensure_pdf(primary, wiki_source_dir, force=force)

		# Step 2b: markdown-only → fast markitdown, .md only
		# Step 2b: markdown-only → pymupdf4llm (fast, no ML, layout-aware)
		if profile == ExtractionProfile.MARKDOWN_ONLY:
		md_file = wiki_source_dir / f"{primary.stem}.md"
		if not force and md_file.exists():
		logger.debug("Skipping %s — markdown already exists", document_id)
		return md_file
		# Legacy .doc files (binary format) cannot be processed directly;
		# convert to PDF first, then run markitdown on the PDF.
		input_for_markitdown = primary
		if primary.suffix.lower() == ".doc":
		input_for_markitdown = ensure_pdf(primary, wiki_source_dir, force=force)
		# Office formats → LibreOffice PDF first; native PDFs pass through.
		input_for_md = ensure_pdf(primary, wiki_source_dir, force=force)
		with timed_operation(get_metrics_tracker(), document_id, MetricType.CONVERSION):
		return _run_markitdown(input_for_markitdown, wiki_source_dir, extract_media=extract_media)
		return _run_markdown_only(input_for_md, wiki_source_dir, extract_media=extract_media)

		# Step 2c: default/advanced → check existing output before running Docling
		md_file = wiki_source_dir / f"{primary.stem}.md"

src/tdoc_crawler/extraction/markitdown/__init__.py

deleted 100644 → 0

+0 −5

Original line number	Diff line number	Diff line
		"""Markitdown converter — fast, no-ML Markdown extraction."""

		from tdoc_crawler.extraction.markitdown.converter import _run_markitdown

		__all__ = ["_run_markitdown"]

src/tdoc_crawler/extraction/markitdown/converter.py

deleted 100644 → 0

+0 −98

Original line number	Diff line number	Diff line
		"""Markitdown converter — fast, no-ML Markdown extraction (no JSON, no models)."""

		from __future__ import annotations

		import re
		from pathlib import Path

		from docx import Document
		from docx.opc.constants import RELATIONSHIP_TYPE as RT
		from markitdown import MarkItDown

		# Content type → file extension mapping for common image formats.
		# EMF/WMF are vector formats — kept as-is since they're what's actually stored.
		_CONTENT_TYPE_TO_EXT: dict[str, str] = {
		"image/png": ".png",
		"image/jpeg": ".jpg",
		"image/gif": ".gif",
		"image/bmp": ".bmp",
		"image/tiff": ".tiff",
		"image/x-emf": ".emf",
		"image/x-wmf": ".wmf",
		"image/svg+xml": ".svg",
		}

		# Matches `![alt](data:image/...;base64,...)` — mammoth embeds images as data URIs.
		_DATA_URI_PATTERN = re.compile(r"!\[([^\]])\]\(data:image/[^;]+;base64,[^)]\)")


		def _extract_docx_images(docx_path: Path, output_dir: Path) -> list[Path]:
		    """Save every image embedded in a .docx file under ``output_dir/media``.

		    Files are numbered sequentially in relationship order
		    (``image_001.png``, ``image_002.emf``, …); the original author notes that
		    this matches document order for practically all 3GPP documents.  The
		    extension is chosen from the part's content type via
		    ``_CONTENT_TYPE_TO_EXT``, falling back to ``.bin``.

		    Args:
		        docx_path: The .docx file to read.
		        output_dir: Directory in which the ``media`` folder is created.

		    Returns:
		        Paths of the written image files, in extraction order.
		    """
		    target_dir = output_dir / "media"
		    target_dir.mkdir(parents=True, exist_ok=True)

		    relationships = Document(str(docx_path)).part.rels.values()
		    image_rels = (rel for rel in relationships if rel.reltype == RT.IMAGE)

		    saved: list[Path] = []
		    for index, rel in enumerate(image_rels, start=1):
		        part = rel.target_part
		        suffix = _CONTENT_TYPE_TO_EXT.get(part.content_type, ".bin")
		        destination = target_dir / f"image_{index:03d}{suffix}"
		        destination.write_bytes(part.blob)
		        saved.append(destination)
		    return saved


		def _replace_data_uris(text: str, image_paths: list[Path]) -> str:
		    """Swap data-URI image links for relative ``./media/`` file links.

		    The n-th link matched by ``_DATA_URI_PATTERN`` is rewritten to
		    ``![alt](./media/<name of the n-th path>)``, keeping its alt text.  Once
		    *image_paths* is exhausted, any further data-URI links are returned
		    unchanged.

		    Args:
		        text: Markdown text containing data-URI image links.
		        image_paths: Extracted image files, in the order they should be used.

		    Returns:
		        The rewritten Markdown text.
		    """
		    pending = iter(image_paths)

		    def _swap(found: re.Match) -> str:
		        target = next(pending, None)
		        if target is None:
		            # More links than extracted files: keep the data URI.
		            return found.group(0)
		        return f"![{found.group(1)}](./media/{target.name})"

		    return _DATA_URI_PATTERN.sub(_swap, text)


		def _run_markitdown(
		    primary: Path,
		    output_dir: Path,
		    *,
		    extract_media: bool = False,
		) -> Path:
		    """Convert a document to Markdown with markitdown and write it to disk.

		    *primary* is handed to ``MarkItDown.convert`` unchanged and the resulting
		    text is written to ``output_dir/<primary stem>.md``.

		    When *extract_media* is true and *primary* has a ``.docx`` suffix
		    (case-insensitive), embedded images are first extracted to
		    ``output_dir/media/``; if any were found, data-URI image links in the
		    Markdown are rewritten to relative ``./media/…`` links.

		    Args:
		        primary: Source document to convert.
		        output_dir: Directory that receives the Markdown file.
		        extract_media: Extract .docx images to files instead of leaving
		            them as data URIs.

		    Returns:
		        Path of the written Markdown file.
		    """
		    markdown = MarkItDown().convert(str(primary)).text_content

		    wants_media = extract_media and primary.suffix.lower() == ".docx"
		    extracted = _extract_docx_images(primary, output_dir) if wants_media else []
		    if extracted:
		        markdown = _replace_data_uris(markdown, extracted)

		    destination = output_dir / f"{primary.stem}.md"
		    destination.write_text(markdown, encoding="utf-8")
		    return destination