Commit 74318d6b authored by Jan Reimes's avatar Jan Reimes
Browse files

Fix frontmatter, image embedding, and unify --figures option

Three bugs fixed:
1. md_yaml_frontmatter was accepted in CLI but never forwarded through
   _process_member → convert_for_wiki → output file. Added param to
   convert_for_wiki and _add_yaml_frontmatter() post-processor.

2. pymupdf4llm emits absolute image paths (C:/.../media/img.png) but
   _embed_media_as_base64 regex only matched relative media/ paths.
   Fixed regex to match both; normalize absolute→relative before match.

3. --figures (Docling) and --extract-media (pymupdf4llm) were redundant.
   Removed --extract-media; --figures now serves all profiles:
   - embed: base64 in markdown (pymupdf4llm) / PLACEHOLDER (Docling)
   - reference: ./media dir (pymupdf4llm) / REFERENCED (Docling)

Also fixed Docling ignoring figures_mode: the hardcoded ImageRefMode.REFERENCED
is replaced by a mode derived from config.figures_mode (PLACEHOLDER for embed,
REFERENCED for reference).
parent f2c567d3
Loading
Loading
Loading
Loading
+4 −9
Original line number Diff line number Diff line
@@ -220,7 +220,10 @@ FiguresModeOption = Annotated[
    str,
    typer.Option(
        "--figures",
        help="Figure handling: embed (placeholder in markdown) or reference (extract image files)",
        help=(
            "Figure handling for all profiles: "
            "embed (images embedded/self-contained) or reference (extract image files)"
        ),
        envvar=ConfigEnvVar.TDC_FIGURES.name,
    ),
]
@@ -248,14 +251,6 @@ DocxDirectOption = Annotated[
        envvar=ConfigEnvVar.TDC_DOCX_DIRECT.name,
    ),
]
ExtractMediaOption = Annotated[
    bool,
    typer.Option(
        "--extract-media/--no-extract-media",
        help="Extract embedded images to a ./media folder next to the markdown",
        envvar=ConfigEnvVar.TDC_EXTRACT_MEDIA.name,
    ),
]
WorkspaceNameOption = Annotated[
    str | None,
    typer.Option("-w", "--workspace", help="Workspace name (default: active workspace)"),
+1 −5
Original line number Diff line number Diff line
@@ -12,7 +12,6 @@ from tdoc_crawler.cli._shared import console, create_progress_bar
from tdoc_crawler.cli.args import (
    DeviceOption,
    DocxDirectOption,
    ExtractMediaOption,
    FiguresModeOption,
    MdYamlFrontmatterOption,
    ProcessLimitOption,
@@ -132,7 +131,6 @@ def _process_member(
    md_yaml_frontmatter: bool,
    docling_config: DoclingConfig,
    docx_direct: bool = False,
    extract_media: bool = False,
) -> tuple[str, bool, bool, int]:
    """Process a single workspace member.

@@ -152,7 +150,7 @@ def _process_member(
            force=force,
            docling_config=docling_config,
            docx_direct=docx_direct,
            extract_media=extract_media,
            md_yaml_frontmatter=md_yaml_frontmatter,
        )
        suffix = result_path.suffix.lstrip(".")
        logger.debug("%s [%s] → %s", source_id, extraction_profile.value, suffix)
@@ -177,7 +175,6 @@ def workspace_process(
    tables: TablesModeOption = "embed",
    device: DeviceOption = "auto",
    docx_direct: DocxDirectOption = False,
    extract_media: ExtractMediaOption = False,
    md_yaml_frontmatter: MdYamlFrontmatterOption = True,
    verbosity: VerbosityOption = str(DEFAULT_VERBOSITY),
) -> None:
@@ -259,7 +256,6 @@ def workspace_process(
                md_yaml_frontmatter,
                docling_config,
                docx_direct,
                extract_media,
            )
            if succeeded:
                processed += 1
+0 −1
Original line number Diff line number Diff line
@@ -75,7 +75,6 @@ class ConfigEnvVar(StrEnum):
    TDC_AUTO_CRAWL_SPECS = "crawl.auto_crawl_specs"
    TDC_MD_YAML_FRONTMATTER = "workspace.md_yaml_frontmatter"
    TDC_DOCX_DIRECT = "workspace.docx_direct"
    TDC_EXTRACT_MEDIA = "workspace.extract_media"
    TDC_SKIP_EXISTING = "workspace.skip_existing"
    # Workspace extraction settings
    TDC_PROFILE = "workspace.profile"
+45 −11
Original line number Diff line number Diff line
@@ -18,6 +18,7 @@ import re
import shutil
import tempfile
from dataclasses import dataclass
from datetime import UTC, datetime
from enum import Enum
from pathlib import Path

@@ -230,8 +231,8 @@ def _convert_via_remote(
# ---------------------------------------------------------------------------

# Matches ``![alt](media/image_NNN.png)`` — pymupdf4llm output with write_images=True.
_MEDIA_REF_PATTERN = re.compile(r"!\[([^\]]*)\]\((media/[^)]+)\)")

# Matches image references whose target lives in a ``media`` directory, either
# relative (``![alt](media/image_NNN.png)``) or absolute
# (``![alt](C:/.../media/image_NNN.png)``).  Group 1 is the alt text, group 2
# the full target path.
#
# ``media`` must be a complete path component (start of the target or preceded
# by a path separator) so that unrelated directories such as ``socialmedia/``
# are not picked up, and ``scheme://`` URLs are rejected because they cannot be
# read from the local filesystem.
_MEDIA_REF_PATTERN_ABS = re.compile(
    r"!\[([^\]]*)\]\(((?![A-Za-z][A-Za-z0-9+.-]*://)(?:[^)]*[/\\])?media/[^)]+)\)"
)
_SUFFIX_TO_MIME: dict[str, str] = {
    ".png": "image/png",
    ".jpg": "image/jpeg",
@@ -245,14 +246,17 @@ _SUFFIX_TO_MIME: dict[str, str] = {
def _embed_media_as_base64(text: str, output_dir: Path) -> str:
    """Replace ``media/`` file references with base64 data URIs.

    Handles both relative ``media/...`` and absolute paths emitted by pymupdf4llm.
    After embedding, the ``media/`` directory is removed.
    """
    media_dir = output_dir / "media"

    def _replacement(match: re.Match) -> str:
        alt_text = match.group(1)
        rel_path = match.group(2)
        img_path = output_dir / rel_path
        img_ref = match.group(2)
        img_path = Path(img_ref)
        if not img_path.is_absolute():
            img_path = output_dir / img_ref
        try:
            data = img_path.read_bytes()
            b64 = base64.b64encode(data).decode("ascii")
@@ -262,17 +266,38 @@ def _embed_media_as_base64(text: str, output_dir: Path) -> str:
            logger.warning("Failed to embed image: %s", img_path, exc_info=True)
            return match.group(0)

    result = _MEDIA_REF_PATTERN.sub(_replacement, text)
    # Match both relative ``media/...`` and absolute ``C:/.../media/...`` paths.
    result = _MEDIA_REF_PATTERN_ABS.sub(_replacement, text)
    if media_dir.exists():
        shutil.rmtree(media_dir, ignore_errors=True)
    return result


def _add_yaml_frontmatter(
    md_path: Path,
    *,
    document_id: str,
    source_kind: SourceKind,
    profile: ExtractionProfile,
) -> None:
    """Prepend YAML frontmatter to a Markdown file."""
    frontmatter = (
        "---\n"
        f"document_id: {document_id}\n"
        f"source_kind: {source_kind.value}\n"
        f"profile: {profile.value}\n"
        f"extraction_date: \"{datetime.now(UTC).isoformat()}\"\n"
        "---\n\n"
    )
    content = md_path.read_text(encoding="utf-8")
    md_path.write_text(frontmatter + content, encoding="utf-8")


def _run_markdown_only(
    primary: Path,
    output_dir: Path,
    *,
    extract_media: bool = False,
    figures_mode: str = "embed",
) -> Path:
    """Convert a PDF document to Markdown using pymupdf4llm."""
    media_dir = output_dir / "media"
@@ -285,7 +310,11 @@ def _run_markdown_only(
        image_format="png",
    )

    if not extract_media:
    # pymupdf4llm may emit absolute paths — normalize to relative ``media/``.
    media_prefix = str(media_dir).replace("\\", "/")
    md_text = md_text.replace(media_prefix + "/", "media/")

    if figures_mode == "embed":
        md_text = _embed_media_as_base64(md_text, output_dir)

    md_path = output_dir / f"{primary.stem}.md"
@@ -356,7 +385,7 @@ def convert_for_wiki(
    force: bool = False,
    docling_config: DoclingConfig | None = None,
    docx_direct: bool = False,
    extract_media: bool = False,
    md_yaml_frontmatter: bool = True,
) -> Path:
    """Convert a document for wiki ingestion using the specified profile.

@@ -380,8 +409,7 @@ def convert_for_wiki(
        force: Force reconversion.
        docling_config: Optional Docling-specific configuration (figure/table modes).
        docx_direct: Feed .docx directly to backend, skip LibreOffice PDF step.
        extract_media: Extract embedded images to ``./media/`` next to the
            markdown (markdown-only profile).
        md_yaml_frontmatter: Prepend YAML frontmatter to generated Markdown.

    Returns:
        Path to the primary output file (PDF for pdf-only, MD for others).
@@ -414,8 +442,12 @@ def convert_for_wiki(
            return md_file
        # Office formats → LibreOffice PDF first; native PDFs pass through.
        input_for_md = ensure_pdf(primary, wiki_source_dir, force=force)
        figures_mode = docling_config.figures_mode if docling_config else "embed"
        with timed_operation(get_metrics_tracker(), document_id, MetricType.CONVERSION):
            return _run_markdown_only(input_for_md, wiki_source_dir, extract_media=extract_media)
            result = _run_markdown_only(input_for_md, wiki_source_dir, figures_mode=figures_mode)
        if md_yaml_frontmatter:
            _add_yaml_frontmatter(result, document_id=document_id, source_kind=source_kind, profile=profile)
        return result

    # Step 2c: default/advanced → check existing output before running Docling
    md_file = wiki_source_dir / f"{primary.stem}.md"
@@ -437,6 +469,8 @@ def convert_for_wiki(
            docx_direct=is_docx_direct,
        )

    if md_yaml_frontmatter:
        _add_yaml_frontmatter(md_file, document_id=document_id, source_kind=source_kind, profile=profile)
    return md_file


+3 −2
Original line number Diff line number Diff line
@@ -145,10 +145,11 @@ def _run_docling(
    md_path = output_dir / f"{stem}.md"
    json_path = output_dir / f"{stem}.json"

    # --- Export markdown with referenced figure images ---
    # --- Export markdown with figure images ---
    image_mode = ImageRefMode.PLACEHOLDER if config.figures_mode == "embed" else ImageRefMode.REFERENCED
    result.document.save_as_markdown(
        md_path,
        image_mode=ImageRefMode.REFERENCED,
        image_mode=image_mode,
    )
    markdown_content = md_path.read_text(encoding="utf-8")