✨ feat(extraction): update markdown extraction to use pymupdf4llm (c84cb307) · Commits · Jan Reimes / 3gpp-crawler

pyproject.toml

+0 −2

+2 −2

Original line number	Diff line number	Diff line
		@@ -2,7 +2,7 @@

		Defines four tiers of document extraction:
		- pdf-only: Raw PDF (office docs converted via LibreOffice)
		-- markdown-only: Fast markdown via markitdown (no ML, .md only)
		+- markdown-only: Layout-aware Markdown via pymupdf4llm (no ML, .md only)
		- default: Structured markdown + JSON via Docling
		- advanced: Same as default + picture descriptions, code/formula enrichment
		"""
		@@ -16,7 +16,7 @@ class ExtractionProfile(StrEnum):
		"""Extraction profile levels for wiki ingestion."""

		PDF_ONLY = "pdf-only" # raw PDF, no structured extraction
		-MARKDOWN_ONLY = "markdown-only" # fast markitdown, .md only, no ML
		+MARKDOWN_ONLY = "markdown-only" # layout-aware pymupdf4llm, .md only, no ML
		DEFAULT = "default" # Docling structured extraction
		ADVANCED = "advanced" # Docling + picture descriptions, code/formula enrichment