Loading packages/3gpp-ai/threegpp_ai/lightrag/processor.py +4 −1 Original line number Diff line number Diff line Loading @@ -15,7 +15,7 @@ from tdoc_crawler.logging import get_logger from threegpp_ai.models import ConversionError, ExtractionError from threegpp_ai.operations.conversion import OFFICE_FORMATS from threegpp_ai.operations.extraction import VlmOptions, extract_document_structured from threegpp_ai.operations.extraction import AcceleratorConfig, VlmOptions, extract_document_structured from .config import LightRAGConfig from .metadata import RAGMetadata, enrich_text Loading Loading @@ -99,6 +99,7 @@ class DocumentProcessor: metadata: RAGMetadata | dict[str, Any] | None = None, extract_types: set[str] | None = None, vlm_options: VlmOptions | None = None, accelerator_config: AcceleratorConfig | None = None, ) -> ProcessingResult: """Process a single document file and insert into LightRAG. Loading @@ -111,6 +112,7 @@ class DocumentProcessor: If None, extracts all types. Supported types: "tables", "figures", "equations". vlm_options: Optional VLM features for extraction. Enables picture description and/or formula enrichment using VLM pipelines. accelerator_config: Optional accelerator settings for GPU/CPU and threading. Returns: ProcessingResult with status and metadata. Loading @@ -129,6 +131,7 @@ class DocumentProcessor: metadata=None, # Metadata enrichment happens at this level, not in extraction extract_types=extract_types, vlm_options=vlm_options, accelerator_config=accelerator_config, ) except (ExtractionError, ConversionError) as e: logger.error("Extraction failed for %s: %s", file_path, e) Loading packages/3gpp-ai/threegpp_ai/operations/convert.py +13 −1 Original line number Diff line number Diff line Loading @@ -24,7 +24,7 @@ from threegpp_ai.operations.conversion import ( ConverterBackend, ConverterConfig, ) from threegpp_ai.operations.extraction import extract_document_structured from threegpp_ai.operations.extraction import AcceleratorConfig, VlmOptions, extract_document_structured from threegpp_ai.operations.extraction_result import StructuredExtractionResult from threegpp_ai.operations.fetch_tdoc import fetch_tdoc_files from threegpp_ai.operations.metrics import MetricType, get_metrics_tracker, timed_operation Loading Loading @@ -168,6 +168,8 @@ def convert_document_to_markdown( output_path: Path | None = None, force: bool = False, converter_config: ConverterConfig | None = None, vlm_options: VlmOptions | None = None, accelerator_config: AcceleratorConfig | None = None, ) -> str: """Convert TDoc to markdown using the unified extraction pipeline. Loading @@ -186,6 +188,8 @@ def convert_document_to_markdown( output_path: Optional path to write markdown file force: Force reconversion even if cached converter_config: Optional converter configuration (unused, kept for API compatibility) vlm_options: Optional VLM features for extraction. accelerator_config: Optional accelerator settings for GPU/CPU and threading. Returns: Markdown content string Loading @@ -199,6 +203,8 @@ def convert_document_to_markdown( extraction = extract_document_structured_from_tdoc( document_id=document_id, force=force, vlm_options=vlm_options, accelerator_config=accelerator_config, ) # Get TDoc metadata for header Loading @@ -222,6 +228,8 @@ def extract_document_structured_from_tdoc( document_id: str, force: bool = False, extract_types: set[str] | None = None, vlm_options: VlmOptions | None = None, accelerator_config: AcceleratorConfig | None = None, ) -> StructuredExtractionResult: """Extract a TDoc into the canonical structured payload. Loading @@ -233,6 +241,8 @@ def extract_document_structured_from_tdoc( force: Force reconversion even if cached markdown exists. extract_types: Optional set of artifact types to extract/persist. If None, extracts all types. Supported types: "tables", "figures", "equations". vlm_options: Optional VLM features for extraction. accelerator_config: Optional accelerator settings for GPU/CPU and threading. Returns: Structured extraction result. Loading Loading @@ -269,6 +279,8 @@ def extract_document_structured_from_tdoc( metadata=metadata_dict, force=force, extract_types=extract_types, vlm_options=vlm_options, accelerator_config=accelerator_config, ) Loading packages/3gpp-ai/threegpp_ai/operations/extraction.py +66 −5 Original line number Diff line number Diff line Loading @@ -19,9 +19,9 @@ from typing import Any from convert_lo import LibreOfficeFormat from convert_lo.converter import Converter from docling.backend.docling_parse_backend import DoclingParseDocumentBackend from docling.datamodel.accelerator_options import AcceleratorDevice, AcceleratorOptions from docling.datamodel.base_models import InputFormat from docling.datamodel.pipeline_options import ( AcceleratorOptions, CodeFormulaVlmOptions, TableStructureOptions, ThreadedPdfPipelineOptions, Loading Loading @@ -61,26 +61,77 @@ class VlmOptions: enable_formula_enrichment: bool = False def _create_pdf_converter(vlm_options: VlmOptions | None = None) -> DocumentConverter: # Map of user-facing device strings to AcceleratorDevice enum values _DEVICE_MAP: dict[str, AcceleratorDevice] = { "auto": AcceleratorDevice.AUTO, "cpu": AcceleratorDevice.CPU, "cuda": AcceleratorDevice.CUDA, "mps": AcceleratorDevice.MPS, "xpu": AcceleratorDevice.XPU, } @dataclass class AcceleratorConfig: """Accelerator configuration for Docling document processing. Controls GPU/CPU device selection, thread count, and batch sizes. All fields have sensible defaults — Docling auto-detects CUDA when available. Attributes: device: Compute device — one of: auto, cpu, cuda, mps, xpu, or cuda:N. num_threads: Thread count for CPU-bound operations. batch_size: Unified batch size for OCR, layout, and table structure. None leaves each at Docling's default. Higher values benefit GPU. """ device: str = "auto" num_threads: int = 4 batch_size: int | None = None def to_accelerator_options(self) -> AcceleratorOptions: """Convert to Docling's AcceleratorOptions.""" device_lower = self.device.strip().lower() device = _DEVICE_MAP.get(device_lower, AcceleratorDevice.AUTO) return AcceleratorOptions(num_threads=self.num_threads, device=device) def apply_batch_sizes(self, options: ThreadedPdfPipelineOptions) -> None: """Apply batch size overrides to pipeline options if configured.""" if self.batch_size is not None and self.batch_size > 0: options.ocr_batch_size = self.batch_size options.layout_batch_size = self.batch_size options.table_batch_size = max(1, self.batch_size // 16) def _create_pdf_converter( vlm_options: VlmOptions | None = None, accelerator_config: AcceleratorConfig | None = None, ) -> DocumentConverter: """Create a DocumentConverter for PDF extraction. Args: vlm_options: Optional VLM features. If None, uses StandardPdfPipeline with enhanced options (table structure, accelerator). If any VLM feature is enabled, uses VlmPipeline with Granite Docling. accelerator_config: Optional accelerator settings for GPU/CPU and threading. If None, uses defaults (auto device, 4 threads). Returns: Configured DocumentConverter for PDF processing. """ accel = accelerator_config or AcceleratorConfig() accelerator_options = accel.to_accelerator_options() if vlm_options is None or (not vlm_options.enable_picture_description and not vlm_options.enable_formula_enrichment): # Standard pipeline with enhanced options options = ThreadedPdfPipelineOptions( do_table_structure=True, table_structure_options=TableStructureOptions(do_cell_matching=True), accelerator_options=AcceleratorOptions(num_threads=4), accelerator_options=accelerator_options, do_formula_enrichment=True, code_formula_options=CodeFormulaVlmOptions.from_preset("granite_docling"), ) accel.apply_batch_sizes(options) pdf_format_option = FormatOption( pipeline_cls=StandardPdfPipeline, backend=DoclingParseDocumentBackend, Loading @@ -93,6 +144,7 @@ def _create_pdf_converter(vlm_options: VlmOptions | None = None) -> DocumentConv images_scale=2.0, generate_picture_images=True, vlm_options=VlmConvertOptions.from_preset("granite_docling"), accelerator_options=accelerator_options, ) pdf_format_option = FormatOption( pipeline_cls=VlmPipeline, Loading Loading @@ -219,6 +271,7 @@ def _convert_via_libreoffice( doc_stem: str | None = None, extract_types: set[str] | None = None, vlm_options: VlmOptions | None = None, accelerator_config: AcceleratorConfig | None = None, ) -> StructuredExtractionResult: """Convert non-PDF document to structured extraction via LibreOffice. Loading @@ -228,6 +281,7 @@ def _convert_via_libreoffice( doc_stem: Document stem for naming artifacts. extract_types: Optional set of artifact types to extract/persist. vlm_options: Optional VLM features for PDF extraction. accelerator_config: Optional accelerator settings. Returns: Structured extraction payload. Loading @@ -239,7 +293,7 @@ def _convert_via_libreoffice( converter = Converter() with tempfile.TemporaryDirectory() as tmpdir: conversion_result = converter.convert(file_path, LibreOfficeFormat.PDF, Path(tmpdir)) converter_docling = _create_pdf_converter(vlm_options=vlm_options) converter_docling = _create_pdf_converter(vlm_options=vlm_options, accelerator_config=accelerator_config) docling_result = converter_docling.convert(str(conversion_result.output_path)) return _build_structured_from_result( docling_result, Loading Loading @@ -283,6 +337,7 @@ def extract_document_structured( force: bool = False, extract_types: set[str] | None = None, vlm_options: VlmOptions | None = None, accelerator_config: AcceleratorConfig | None = None, ) -> StructuredExtractionResult: """Extract a document into a canonical structured payload. Loading @@ -307,6 +362,7 @@ def extract_document_structured( vlm_options: Optional VLM features for extraction. Enables picture description and/or formula enrichment using VLM pipelines. If None, uses standard extraction with formula enrichment. accelerator_config: Optional accelerator settings for GPU/CPU and threading. Returns: StructuredExtractionResult with content, tables, figures, and equations. Loading Loading @@ -345,7 +401,7 @@ def extract_document_structured( # No valid cache found, perform fresh extraction if file_path.suffix.lower() == ".pdf": converter = _create_pdf_converter(vlm_options=vlm_options) converter = _create_pdf_converter(vlm_options=vlm_options, accelerator_config=accelerator_config) result = converter.convert(str(file_path)) extraction = _build_structured_from_result( result, Loading @@ -370,6 +426,7 @@ def extract_document_structured( doc_stem=doc_stem, extract_types=extract_types, vlm_options=vlm_options, accelerator_config=accelerator_config, ) # Ensure .ai directory exists Loading @@ -389,6 +446,7 @@ def extract_document_text( file_path: Path, force: bool = False, vlm_options: VlmOptions | None = None, accelerator_config: AcceleratorConfig | None = None, ) -> str: """Extract plain text content from a document using docling. Loading @@ -399,6 +457,7 @@ def extract_document_text( file_path: Path to the document file. force: If True, re-extract even if cached markdown exists. vlm_options: Optional VLM features for extraction. accelerator_config: Optional accelerator settings for GPU/CPU and threading. Returns: Extracted text content as a string. Loading @@ -413,11 +472,13 @@ def extract_document_text( force=force, extract_types=set(), # No artifacts needed for text-only extraction vlm_options=vlm_options, accelerator_config=accelerator_config, ) return extraction.content __all__ = [ "AcceleratorConfig", "VlmOptions", "extract_document_structured", "extract_document_text", Loading Loading
packages/3gpp-ai/threegpp_ai/lightrag/processor.py +4 −1 Original line number Diff line number Diff line Loading @@ -15,7 +15,7 @@ from tdoc_crawler.logging import get_logger from threegpp_ai.models import ConversionError, ExtractionError from threegpp_ai.operations.conversion import OFFICE_FORMATS from threegpp_ai.operations.extraction import VlmOptions, extract_document_structured from threegpp_ai.operations.extraction import AcceleratorConfig, VlmOptions, extract_document_structured from .config import LightRAGConfig from .metadata import RAGMetadata, enrich_text Loading Loading @@ -99,6 +99,7 @@ class DocumentProcessor: metadata: RAGMetadata | dict[str, Any] | None = None, extract_types: set[str] | None = None, vlm_options: VlmOptions | None = None, accelerator_config: AcceleratorConfig | None = None, ) -> ProcessingResult: """Process a single document file and insert into LightRAG. Loading @@ -111,6 +112,7 @@ class DocumentProcessor: If None, extracts all types. Supported types: "tables", "figures", "equations". vlm_options: Optional VLM features for extraction. Enables picture description and/or formula enrichment using VLM pipelines. accelerator_config: Optional accelerator settings for GPU/CPU and threading. Returns: ProcessingResult with status and metadata. Loading @@ -129,6 +131,7 @@ class DocumentProcessor: metadata=None, # Metadata enrichment happens at this level, not in extraction extract_types=extract_types, vlm_options=vlm_options, accelerator_config=accelerator_config, ) except (ExtractionError, ConversionError) as e: logger.error("Extraction failed for %s: %s", file_path, e) Loading
packages/3gpp-ai/threegpp_ai/operations/convert.py +13 −1 Original line number Diff line number Diff line Loading @@ -24,7 +24,7 @@ from threegpp_ai.operations.conversion import ( ConverterBackend, ConverterConfig, ) from threegpp_ai.operations.extraction import extract_document_structured from threegpp_ai.operations.extraction import AcceleratorConfig, VlmOptions, extract_document_structured from threegpp_ai.operations.extraction_result import StructuredExtractionResult from threegpp_ai.operations.fetch_tdoc import fetch_tdoc_files from threegpp_ai.operations.metrics import MetricType, get_metrics_tracker, timed_operation Loading Loading @@ -168,6 +168,8 @@ def convert_document_to_markdown( output_path: Path | None = None, force: bool = False, converter_config: ConverterConfig | None = None, vlm_options: VlmOptions | None = None, accelerator_config: AcceleratorConfig | None = None, ) -> str: """Convert TDoc to markdown using the unified extraction pipeline. Loading @@ -186,6 +188,8 @@ def convert_document_to_markdown( output_path: Optional path to write markdown file force: Force reconversion even if cached converter_config: Optional converter configuration (unused, kept for API compatibility) vlm_options: Optional VLM features for extraction. accelerator_config: Optional accelerator settings for GPU/CPU and threading. Returns: Markdown content string Loading @@ -199,6 +203,8 @@ def convert_document_to_markdown( extraction = extract_document_structured_from_tdoc( document_id=document_id, force=force, vlm_options=vlm_options, accelerator_config=accelerator_config, ) # Get TDoc metadata for header Loading @@ -222,6 +228,8 @@ def extract_document_structured_from_tdoc( document_id: str, force: bool = False, extract_types: set[str] | None = None, vlm_options: VlmOptions | None = None, accelerator_config: AcceleratorConfig | None = None, ) -> StructuredExtractionResult: """Extract a TDoc into the canonical structured payload. Loading @@ -233,6 +241,8 @@ def extract_document_structured_from_tdoc( force: Force reconversion even if cached markdown exists. extract_types: Optional set of artifact types to extract/persist. If None, extracts all types. Supported types: "tables", "figures", "equations". vlm_options: Optional VLM features for extraction. accelerator_config: Optional accelerator settings for GPU/CPU and threading. Returns: Structured extraction result. Loading Loading @@ -269,6 +279,8 @@ def extract_document_structured_from_tdoc( metadata=metadata_dict, force=force, extract_types=extract_types, vlm_options=vlm_options, accelerator_config=accelerator_config, ) Loading
packages/3gpp-ai/threegpp_ai/operations/extraction.py +66 −5 Original line number Diff line number Diff line Loading @@ -19,9 +19,9 @@ from typing import Any from convert_lo import LibreOfficeFormat from convert_lo.converter import Converter from docling.backend.docling_parse_backend import DoclingParseDocumentBackend from docling.datamodel.accelerator_options import AcceleratorDevice, AcceleratorOptions from docling.datamodel.base_models import InputFormat from docling.datamodel.pipeline_options import ( AcceleratorOptions, CodeFormulaVlmOptions, TableStructureOptions, ThreadedPdfPipelineOptions, Loading Loading @@ -61,26 +61,77 @@ class VlmOptions: enable_formula_enrichment: bool = False def _create_pdf_converter(vlm_options: VlmOptions | None = None) -> DocumentConverter: # Map of user-facing device strings to AcceleratorDevice enum values _DEVICE_MAP: dict[str, AcceleratorDevice] = { "auto": AcceleratorDevice.AUTO, "cpu": AcceleratorDevice.CPU, "cuda": AcceleratorDevice.CUDA, "mps": AcceleratorDevice.MPS, "xpu": AcceleratorDevice.XPU, } @dataclass class AcceleratorConfig: """Accelerator configuration for Docling document processing. Controls GPU/CPU device selection, thread count, and batch sizes. All fields have sensible defaults — Docling auto-detects CUDA when available. Attributes: device: Compute device — one of: auto, cpu, cuda, mps, xpu, or cuda:N. num_threads: Thread count for CPU-bound operations. batch_size: Unified batch size for OCR, layout, and table structure. None leaves each at Docling's default. Higher values benefit GPU. """ device: str = "auto" num_threads: int = 4 batch_size: int | None = None def to_accelerator_options(self) -> AcceleratorOptions: """Convert to Docling's AcceleratorOptions.""" device_lower = self.device.strip().lower() device = _DEVICE_MAP.get(device_lower, AcceleratorDevice.AUTO) return AcceleratorOptions(num_threads=self.num_threads, device=device) def apply_batch_sizes(self, options: ThreadedPdfPipelineOptions) -> None: """Apply batch size overrides to pipeline options if configured.""" if self.batch_size is not None and self.batch_size > 0: options.ocr_batch_size = self.batch_size options.layout_batch_size = self.batch_size options.table_batch_size = max(1, self.batch_size // 16) def _create_pdf_converter( vlm_options: VlmOptions | None = None, accelerator_config: AcceleratorConfig | None = None, ) -> DocumentConverter: """Create a DocumentConverter for PDF extraction. Args: vlm_options: Optional VLM features. If None, uses StandardPdfPipeline with enhanced options (table structure, accelerator). If any VLM feature is enabled, uses VlmPipeline with Granite Docling. accelerator_config: Optional accelerator settings for GPU/CPU and threading. If None, uses defaults (auto device, 4 threads). Returns: Configured DocumentConverter for PDF processing. """ accel = accelerator_config or AcceleratorConfig() accelerator_options = accel.to_accelerator_options() if vlm_options is None or (not vlm_options.enable_picture_description and not vlm_options.enable_formula_enrichment): # Standard pipeline with enhanced options options = ThreadedPdfPipelineOptions( do_table_structure=True, table_structure_options=TableStructureOptions(do_cell_matching=True), accelerator_options=AcceleratorOptions(num_threads=4), accelerator_options=accelerator_options, do_formula_enrichment=True, code_formula_options=CodeFormulaVlmOptions.from_preset("granite_docling"), ) accel.apply_batch_sizes(options) pdf_format_option = FormatOption( pipeline_cls=StandardPdfPipeline, backend=DoclingParseDocumentBackend, Loading @@ -93,6 +144,7 @@ def _create_pdf_converter(vlm_options: VlmOptions | None = None) -> DocumentConv images_scale=2.0, generate_picture_images=True, vlm_options=VlmConvertOptions.from_preset("granite_docling"), accelerator_options=accelerator_options, ) pdf_format_option = FormatOption( pipeline_cls=VlmPipeline, Loading Loading @@ -219,6 +271,7 @@ def _convert_via_libreoffice( doc_stem: str | None = None, extract_types: set[str] | None = None, vlm_options: VlmOptions | None = None, accelerator_config: AcceleratorConfig | None = None, ) -> StructuredExtractionResult: """Convert non-PDF document to structured extraction via LibreOffice. Loading @@ -228,6 +281,7 @@ def _convert_via_libreoffice( doc_stem: Document stem for naming artifacts. extract_types: Optional set of artifact types to extract/persist. vlm_options: Optional VLM features for PDF extraction. accelerator_config: Optional accelerator settings. Returns: Structured extraction payload. Loading @@ -239,7 +293,7 @@ def _convert_via_libreoffice( converter = Converter() with tempfile.TemporaryDirectory() as tmpdir: conversion_result = converter.convert(file_path, LibreOfficeFormat.PDF, Path(tmpdir)) converter_docling = _create_pdf_converter(vlm_options=vlm_options) converter_docling = _create_pdf_converter(vlm_options=vlm_options, accelerator_config=accelerator_config) docling_result = converter_docling.convert(str(conversion_result.output_path)) return _build_structured_from_result( docling_result, Loading Loading @@ -283,6 +337,7 @@ def extract_document_structured( force: bool = False, extract_types: set[str] | None = None, vlm_options: VlmOptions | None = None, accelerator_config: AcceleratorConfig | None = None, ) -> StructuredExtractionResult: """Extract a document into a canonical structured payload. Loading @@ -307,6 +362,7 @@ def extract_document_structured( vlm_options: Optional VLM features for extraction. Enables picture description and/or formula enrichment using VLM pipelines. If None, uses standard extraction with formula enrichment. accelerator_config: Optional accelerator settings for GPU/CPU and threading. Returns: StructuredExtractionResult with content, tables, figures, and equations. Loading Loading @@ -345,7 +401,7 @@ def extract_document_structured( # No valid cache found, perform fresh extraction if file_path.suffix.lower() == ".pdf": converter = _create_pdf_converter(vlm_options=vlm_options) converter = _create_pdf_converter(vlm_options=vlm_options, accelerator_config=accelerator_config) result = converter.convert(str(file_path)) extraction = _build_structured_from_result( result, Loading @@ -370,6 +426,7 @@ def extract_document_structured( doc_stem=doc_stem, extract_types=extract_types, vlm_options=vlm_options, accelerator_config=accelerator_config, ) # Ensure .ai directory exists Loading @@ -389,6 +446,7 @@ def extract_document_text( file_path: Path, force: bool = False, vlm_options: VlmOptions | None = None, accelerator_config: AcceleratorConfig | None = None, ) -> str: """Extract plain text content from a document using docling. Loading @@ -399,6 +457,7 @@ def extract_document_text( file_path: Path to the document file. force: If True, re-extract even if cached markdown exists. vlm_options: Optional VLM features for extraction. accelerator_config: Optional accelerator settings for GPU/CPU and threading. Returns: Extracted text content as a string. Loading @@ -413,11 +472,13 @@ def extract_document_text( force=force, extract_types=set(), # No artifacts needed for text-only extraction vlm_options=vlm_options, accelerator_config=accelerator_config, ) return extraction.content __all__ = [ "AcceleratorConfig", "VlmOptions", "extract_document_structured", "extract_document_text", Loading