Commit c84cb307 authored by Jan Reimes
Browse files

feat(extraction): update markdown extraction to use pymupdf4llm

- improves layout-aware markdown generation
- replaces markitdown with more accurate tool
- maintains no-ML constraint for markdown-only profile
parent f326abf2
Loading
Loading
Loading
Loading
+0 −2
Original line number Diff line number Diff line
@@ -41,8 +41,6 @@ dependencies = [
    "pydantic-settings>=2.13.1",
    "niquests>=3.18.4",
    "opencv-python-headless>=4.13.0.92",
-    "markitdown[all]>=0.1.5",
-    "markitdown-ocr>=0.1.0",
    "pymupdf>=1.27.2.3",
    "pymupdf4llm>=1.27.2.3",
]
+2 −2
Original line number Diff line number Diff line
@@ -2,7 +2,7 @@

Defines four tiers of document extraction:
- pdf-only: Raw PDF (office docs converted via LibreOffice)
-- markdown-only: Fast markdown via markitdown (no ML, .md only)
+- markdown-only: Layout-aware Markdown via pymupdf4llm (no ML, .md only)
- default: Structured markdown + JSON via Docling
- advanced: Same as default + picture descriptions, code/formula enrichment
"""
@@ -16,7 +16,7 @@ class ExtractionProfile(StrEnum):
    """Extraction profile levels for wiki ingestion."""

    PDF_ONLY = "pdf-only"  # raw PDF, no structured extraction
-    MARKDOWN_ONLY = "markdown-only"  # fast markitdown, .md only, no ML
+    MARKDOWN_ONLY = "markdown-only"  # layout-aware pymupdf4llm, .md only, no ML
    DEFAULT = "default"  # Docling structured extraction
    ADVANCED = "advanced"  # Docling + picture descriptions, code/formula enrichment