Commit 7b434091 authored by Jan Reimes's avatar Jan Reimes
Browse files

refactor(ai): migrate extraction from docling to opendataloader_pdf

Replaces the docling-based PDF extraction with OpenDataLoader, which offers
better accuracy (ranked #1 in benchmarks with a 0.907 score) and simpler CPU-only operation.
- Removes docling, transformers, docling-core, hf_xet dependencies
- Adds opendataloader-pdf[hybrid] dependency
- Supports hybrid AI mode for complex document pages
- Updates extraction result parsing for opendataloader JSON output
parent f6be862d
Loading
Loading
Loading
Loading
+2 −1
Original line number Diff line number Diff line
@@ -156,7 +156,7 @@ from tdoc_ai import LightRAGConfig, TDocRAG, TDocProcessor

## Extraction

LightRAG uses `docling` for text, table, and figure extraction before chunking and ingestion.
LightRAG uses `opendataloader-pdf` for text, table, formula, and figure extraction before chunking and ingestion.

## Deprecated/Removed

@@ -168,3 +168,4 @@ LightRAG uses `docling` for text, table, and figure extraction before chunking a
- `sentence-transformers`
- `tokenizers`
- `lancedb`
- `docling` (replaced by `opendataloader-pdf`)
+1 −5
Original line number Diff line number Diff line
@@ -16,14 +16,10 @@ classifiers = [
dependencies = [
    "convert-lo",
    "doc2txt>=1.0.8",
    #"doc2txt>=1.0.8 @ git+https://github.com/Quantatirsk/doc2txt-pypi.git"
    "litellm>=1.81.15",
    "pydantic-settings>=2.13.1",
    "liteparse>=1.2.0",
    "docling[vlm]>=2.82.0",
    "transformers>=4.57.6",
    "docling-core[chunking]>=2.70.2",
    "hf_xet"
    "opendataloader-pdf[hybrid]>=2.2.0",
]

[project.urls]
+439 −219

File changed.

Preview size limit exceeded, changes collapsed.

+116 −4
Original line number Diff line number Diff line
@@ -13,10 +13,9 @@ import shutil
import tempfile
from collections.abc import Sequence
from pathlib import Path
from typing import Any
from typing import TYPE_CHECKING, Any

from docling.document_converter import ConversionResult
from docling_core.types.doc.document import DescriptionAnnotation
from tdoc_crawler.logging import get_logger

from threegpp_ai.models import (
    DocumentMetadataContract,
@@ -31,6 +30,11 @@ from threegpp_ai.models import (
    StructuredExtractionResult,
)

if TYPE_CHECKING:
    from docling.document_converter import ConversionResult

logger = get_logger(__name__)

_EQUATION_PATTERNS = [
    re.compile(r"\$\$(.*?)\$\$", re.DOTALL),
    re.compile(r"\\\[(.*?)\\\]", re.DOTALL),
@@ -756,7 +760,7 @@ def _extract_figures_from_docling(
        # Try to get VLM-generated description from annotations
        if not description and hasattr(image, "annotations"):
            for annotation in getattr(image, "annotations", []) or []:
                if isinstance(annotation, DescriptionAnnotation):
                if isinstance(annotation, "DescriptionAnnotation"):
                    vlm_description = getattr(annotation, "text", None)
                    if vlm_description:
                        description = vlm_description
@@ -838,6 +842,113 @@ def from_docling_result(
    )


def from_opendataloader_result(
    markdown_content: str,
    *,
    tables: list[dict[str, Any]] | None = None,
    figures: list[dict[str, Any]] | None = None,
    formulas: list[dict[str, Any]] | None = None,
    metadata: dict[str, Any] | None = None,
) -> StructuredExtractionResult:
    """Convert OpenDataLoader extraction output into the canonical payload.

    OpenDataLoader outputs Markdown + JSON with bounding boxes. This function
    parses those outputs and converts them into the canonical structured format.
    Individual elements that fail model validation are logged and skipped so a
    single malformed entry cannot abort the whole conversion.

    Args:
        markdown_content: Extracted markdown text from OpenDataLoader.
        tables: List of table element dictionaries from OpenDataLoader JSON.
        figures: List of figure element dictionaries from OpenDataLoader JSON.
        formulas: List of formula element dictionaries from OpenDataLoader JSON.
        metadata: Optional additional metadata.

    Returns:
        Canonical structured extraction result.
    """
    # Map raw table dicts onto the canonical table model, best-effort.
    parsed_tables: list[ExtractedTableElement] = []
    for position, raw in enumerate(tables or [], start=1):
        try:
            element = ExtractedTableElement(
                element_id=raw.get("element_id", f"table_{position}"),
                page_number=raw.get("page_number"),
                row_count=raw.get("row_count", 0),
                column_count=raw.get("column_count", 0),
                cells=raw.get("cells", []),
                cell_metadata=raw.get("cell_metadata", []),
                markdown=raw.get("markdown"),
                caption=raw.get("caption"),
                source_anchor_id=raw.get("source_anchor_id", f"table-{position}"),
            )
        except Exception as e:
            logger.warning("Failed to parse table element %d: %s", position, e)
        else:
            parsed_tables.append(element)

    # Map raw figure dicts onto the canonical figure model, best-effort.
    parsed_figures: list[ExtractedFigureElement] = []
    for position, raw in enumerate(figures or [], start=1):
        try:
            element = ExtractedFigureElement(
                element_id=raw.get("element_id", f"figure_{position}"),
                page_number=raw.get("page_number"),
                image_path=raw.get("image_path"),
                image_format=raw.get("image_format", "png"),
                caption=raw.get("caption"),
                description=raw.get("description"),
                source_anchor_id=raw.get("source_anchor_id", f"figure-{position}"),
                is_partial=raw.get("is_partial", False),
                partial_reason_codes=raw.get("partial_reason_codes", []),
                metadata=raw.get("metadata", {}),
            )
        except Exception as e:
            logger.warning("Failed to parse figure element %d: %s", position, e)
        else:
            parsed_figures.append(element)

    # Map raw formula dicts onto the canonical equation model, best-effort.
    parsed_equations: list[ExtractedEquationElement] = []
    for position, raw in enumerate(formulas or [], start=1):
        try:
            element = ExtractedEquationElement(
                element_id=raw.get("element_id", f"equation_{position}"),
                latex=raw.get("latex", ""),
                raw_text=raw.get("raw_text"),
                source_anchor_id=raw.get("source_anchor_id", f"equation-{position}"),
                normalized_text=raw.get("normalized_text", raw.get("latex", "")),
                equation_type=raw.get("equation_type", "latex"),
                display_mode=raw.get("display_mode", "display"),
                page_number=raw.get("page_number"),
            )
        except Exception as e:
            logger.warning("Failed to parse formula element %d: %s", position, e)
        else:
            parsed_equations.append(element)

    # Build marker lines for tables and figures; equations that are already
    # embedded in the markdown as display LaTeX need no extra marker.
    marker_lines = [_build_table_marker(element) for element in parsed_tables]
    marker_lines += [_build_figure_marker(element) for element in parsed_figures]
    for equation in parsed_equations:
        latex = equation.latex
        if latex and not latex.startswith(("$$", "\\[")):
            marker_lines.append(_build_equation_marker(equation))

    # Append the marker block to the markdown body when anything was collected.
    if marker_lines:
        content = "{}\n\n{}\n".format(markdown_content.rstrip(), "\n".join(marker_lines))
    else:
        content = markdown_content

    return build_structured_extraction_result(
        content=content,
        tables=parsed_tables,
        figures=parsed_figures,
        equations=parsed_equations,
        metadata=metadata or {},
    )


def _load_json_artifact(path: Path) -> dict[str, Any]:
    """Load a single JSON artifact file."""
    try:
@@ -941,6 +1052,7 @@ __all__ = [
    "build_structured_extraction_result",
    "evaluate_quality_gates",
    "from_docling_result",
    "from_opendataloader_result",
    "has_cached_artifacts",
    "persist_canonical_output",
    "persist_equations_from_extraction",
+1 −1
Original line number Diff line number Diff line
@@ -73,7 +73,7 @@ def test_extraction_metadata_includes_profile_and_effective_settings(monkeypatch
    file_path = tmp_path / "doc.md"
    file_path.write_text("content", encoding="utf-8")

    monkeypatch.setattr(extraction_ops, "_check_cached_extraction", lambda *_args, **_kwargs: (True, "cached"))
    monkeypatch.setattr(extraction_ops, "_check_cached_extraction", lambda *_args, **_kwargs: (True, "cached", {"tables", "figures", "equations"}))
    monkeypatch.setattr(extraction_ops, "read_cached_artifacts", lambda *_args, **_kwargs: build_structured_extraction_result("cached"))

    result = extraction_ops.extract_document_structured(