Loading pyproject.toml +0 −2 Original line number Diff line number Diff line Loading @@ -41,8 +41,6 @@ dependencies = [ "pydantic-settings>=2.13.1", "niquests>=3.18.4", "opencv-python-headless>=4.13.0.92", "markitdown[all]>=0.1.5", "markitdown-ocr>=0.1.0", "pymupdf>=1.27.2.3", "pymupdf4llm>=1.27.2.3", ] Loading src/tdoc_crawler/extraction/profiles.py +2 −2 Original line number Diff line number Diff line Loading @@ -2,7 +2,7 @@ Defines four tiers of document extraction: - pdf-only: Raw PDF (office docs converted via LibreOffice) - markdown-only: Fast markdown via markitdown (no ML, .md only) - markdown-only: Layout-aware Markdown via pymupdf4llm (no ML, .md only) - default: Structured markdown + JSON via Docling - advanced: Same as default + picture descriptions, code/formula enrichment """ Loading @@ -16,7 +16,7 @@ class ExtractionProfile(StrEnum): """Extraction profile levels for wiki ingestion.""" PDF_ONLY = "pdf-only" # raw PDF, no structured extraction MARKDOWN_ONLY = "markdown-only" # fast markitdown, .md only, no ML MARKDOWN_ONLY = "markdown-only" # layout-aware pymupdf4llm, .md only, no ML DEFAULT = "default" # Docling structured extraction ADVANCED = "advanced" # Docling + picture descriptions, code/formula enrichment Loading Loading
pyproject.toml +0 −2 Original line number Diff line number Diff line Loading @@ -41,8 +41,6 @@ dependencies = [ "pydantic-settings>=2.13.1", "niquests>=3.18.4", "opencv-python-headless>=4.13.0.92", "markitdown[all]>=0.1.5", "markitdown-ocr>=0.1.0", "pymupdf>=1.27.2.3", "pymupdf4llm>=1.27.2.3", ] Loading
src/tdoc_crawler/extraction/profiles.py +2 −2 Original line number Diff line number Diff line Loading @@ -2,7 +2,7 @@ Defines four tiers of document extraction: - pdf-only: Raw PDF (office docs converted via LibreOffice) - markdown-only: Fast markdown via markitdown (no ML, .md only) - markdown-only: Layout-aware Markdown via pymupdf4llm (no ML, .md only) - default: Structured markdown + JSON via Docling - advanced: Same as default + picture descriptions, code/formula enrichment """ Loading @@ -16,7 +16,7 @@ class ExtractionProfile(StrEnum): """Extraction profile levels for wiki ingestion.""" PDF_ONLY = "pdf-only" # raw PDF, no structured extraction MARKDOWN_ONLY = "markdown-only" # fast markitdown, .md only, no ML MARKDOWN_ONLY = "markdown-only" # layout-aware pymupdf4llm, .md only, no ML DEFAULT = "default" # Docling structured extraction ADVANCED = "advanced" # Docling + picture descriptions, code/formula enrichment Loading