Commit e1fd4257 authored by Jan Reimes's avatar Jan Reimes
Browse files

♻️ refactor(convert): disable layout mode by default for simpler output

parent 368d2b59
Loading
Loading
Loading
Loading
+5 −0
Original line number Diff line number Diff line
@@ -298,11 +298,16 @@ def _run_markdown_only(
    output_dir: Path,
    *,
    figures_mode: FiguresMode | str = FiguresMode.EMBED,
    use_layout_mode: bool = False,
) -> Path:
    """Convert a PDF document to Markdown using pymupdf4llm."""
    media_dir = output_dir / "media"
    media_dir.mkdir(parents=True, exist_ok=True)

    # pymupdf4llm enables layout mode by default, which improves text flow in multi-column documents.
    # LLM-based extraction does not need the page layout preserved, so layout mode stays off here
    # unless the caller passes use_layout_mode=True; this yields simpler output.
    pymupdf4llm.use_layout(use_layout_mode)

    md_text = pymupdf4llm.to_markdown(
        str(primary),
        write_images=True,