Commit 2c1c0a97 authored by Jan Reimes's avatar Jan Reimes
Browse files

consolidate OFFICE_FORMATS, merge WorkspaceMember, remove dead code

- OFFICE_FORMATS: extraction.py now imports from conversion.py (single source)
- WorkspaceMember: removed duplicate class from models.py, canonical
  version lives in workspace_registry.py; updated all callers
- Removed dead extraction profile params (args.py, convert.py, extraction.py)
- Removed summarize_document from summarize.py __all__
- Added missing SourceKind import in workspaces.py
parent 0e39e5a9
Loading
Loading
Loading
Loading
+6 −4
Original line number Diff line number Diff line
@@ -48,15 +48,17 @@ OpenDataLoader with Hybrid Mode
Standard Extraction Outputs + AI-Enhanced Artifacts
```

**Key Differences:**
## Extraction

| Feature | Standard Pipeline | Hybrid Mode |
|---------|-------------------|-------------|
Extraction always enables all artifact types: tables, figures, and equations.

| Feature | Standard Mode | Hybrid Mode |
|---------|---------------|-------------|
| Backend | `opendataloader_pdf` (local) | `opendataloader_pdf[hybrid]` |
| Table Structure | ✅ Enabled | ✅ Enabled |
| Formula Enrichment | ✅ Enabled | ✅ Enhanced |
| Picture Description | ✅ Enabled | ✅ AI-generated (SmolVLM) |
| OCR for Scanned PDFs | ✅ via `force_ocr` | ✅ via `force_ocr` |
| OCR for Scanned PDFs | ✅ Automatic | ✅ Automatic |
| Java Required | Yes (11+) | Yes (11+) |
| GPU Required | No | No (but hybrid server needs LLM) |

+0 −33
Original line number Diff line number Diff line
@@ -112,39 +112,6 @@ WorkspaceProcessSkipExistingOption = Annotated[
        help="Skip extraction for components that already exist in .ai folder",
    ),
]
ExtractionProfileOption = Annotated[
    str | None,
    typer.Option(
        "--profile",
        help="Extraction profile override: default, balanced, optimum, custom",
        envvar="TDC_AI_EXTRACTION_PROFILE",
    ),
]
CustomExtractOcrOption = Annotated[
    bool | None,
    typer.Option("--custom-ocr/--no-custom-ocr", help="Custom profile override for OCR stage"),
]
CustomExtractLayoutOption = Annotated[
    bool | None,
    typer.Option("--custom-layout/--no-custom-layout", help="Custom profile override for layout stage"),
]
CustomExtractTablesOption = Annotated[
    bool | None,
    typer.Option("--custom-tables/--no-custom-tables", help="Custom profile override for table extraction"),
]
CustomExtractFiguresOption = Annotated[
    bool | None,
    typer.Option("--custom-figures/--no-custom-figures", help="Custom profile override for figure extraction"),
]
CustomExtractEquationsOption = Annotated[
    bool | None,
    typer.Option("--custom-equations/--no-custom-equations", help="Custom profile override for equation extraction"),
]
CustomExtractEnrichmentOption = Annotated[
    bool | None,
    typer.Option("--custom-enrichment/--no-custom-enrichment", help="Custom profile override for enrichment stages"),
]

# Accelerator options for Docling extraction
AcceleratorDeviceOption = Annotated[
    str,
+94 −104
Original line number Diff line number Diff line
@@ -57,14 +57,7 @@ from threegpp_ai.args import (
    ConvertMdOption,
    ConvertOutputOption,
    ConvertPdfOption,
    CustomExtractEnrichmentOption,
    CustomExtractEquationsOption,
    CustomExtractFiguresOption,
    CustomExtractLayoutOption,
    CustomExtractOcrOption,
    CustomExtractTablesOption,
    EndDateOption,
    ExtractionProfileOption,
    OutputFormatOption,
    SourcePatternExcludeOption,
    SourcePatternOption,
@@ -98,6 +91,13 @@ from threegpp_ai.operations.classify import pick_main_document
from threegpp_ai.operations.conversion import OFFICE_FORMATS, convert_to_pdf
from threegpp_ai.operations.convert import convert_document_to_markdown
from threegpp_ai.operations.extraction import AcceleratorConfig, VlmOptions, extract_document_structured
from threegpp_ai.operations.hybrid_server import (
    DEFAULT_HOST,
    DEFAULT_PORT,
    HybridServerConfig,
    HybridServerManager,
    ensure_hybrid_server,
)
from threegpp_ai.operations.workspace_registry import WorkspaceRegistry, normalize_spec_member_id
from threegpp_ai.operations.workspace_utils import check_pdf_status

@@ -106,7 +106,9 @@ load_dotenv()

app = typer.Typer(help="3GPP AI - Document processing and summarization")
workspace_app = typer.Typer(help="Manage extraction workspaces")
hybrid_app = typer.Typer(help="Manage the OpenDataLoader hybrid server")
app.add_typer(workspace_app, name="workspace")
app.add_typer(hybrid_app, name="hybrid-server")
app.add_typer(config_app, name="config")

console = get_console()
@@ -238,13 +240,6 @@ async def _process_single_item(
    path_config: PathConfig,
    vlm_options: VlmOptions | None = None,
    accelerator_config: AcceleratorConfig | None = None,
    profile: str | None = None,
    custom_extract_ocr: bool | None = None,
    custom_extract_layout: bool | None = None,
    custom_extract_tables: bool | None = None,
    custom_extract_figures: bool | None = None,
    custom_extract_equations: bool | None = None,
    custom_extract_enrichment: bool | None = None,
) -> tuple[Any | None, str | None, bool, bool]:
    """Process a single workspace item (checkout + optional PDF conversion + optional markdown extraction).

@@ -259,13 +254,6 @@ async def _process_single_item(
        path_config: PathConfig for file system paths
        vlm_options: Optional VLM features for extraction.
        accelerator_config: Optional accelerator settings for GPU/CPU and threading.
        profile: Extraction profile override.
        custom_extract_ocr: Optional custom OCR toggle.
        custom_extract_layout: Optional custom layout toggle.
        custom_extract_tables: Optional custom table extraction toggle.
        custom_extract_figures: Optional custom figure extraction toggle.
        custom_extract_equations: Optional custom equation extraction toggle.
        custom_extract_enrichment: Optional custom enrichment toggle.

    Returns:
        Tuple of (member, skip_reason, was_converted, was_md_extracted)
@@ -303,7 +291,7 @@ async def _process_single_item(
    # Optional PDF conversion
    was_converted = False
    if convert_pdf:
        member_for_convert = make_workspace_member(workspace, item, source_path, source_kind)
        member_for_convert = make_workspace_member(item, source_path, source_kind)
        pdf_path = _convert_member_to_pdf(member_for_convert)
        was_converted = pdf_path is not None

@@ -319,13 +307,6 @@ async def _process_single_item(
                    force=False,
                    vlm_options=vlm_options,
                    accelerator_config=accelerator_config,
                    profile=profile,
                    custom_extract_ocr=custom_extract_ocr,
                    custom_extract_layout=custom_extract_layout,
                    custom_extract_tables=custom_extract_tables,
                    custom_extract_figures=custom_extract_figures,
                    custom_extract_equations=custom_extract_equations,
                    custom_extract_enrichment=custom_extract_enrichment,
                )
            else:
                # Generic extraction (specs, other) - uses file path directly
@@ -337,13 +318,6 @@ async def _process_single_item(
                        force=False,
                        vlm_options=vlm_options,
                        accelerator_config=accelerator_config,
                        profile=profile,
                        custom_extract_ocr=custom_extract_ocr,
                        custom_extract_layout=custom_extract_layout,
                        custom_extract_tables=custom_extract_tables,
                        custom_extract_figures=custom_extract_figures,
                        custom_extract_equations=custom_extract_equations,
                        custom_extract_enrichment=custom_extract_enrichment,
                    )
            was_md_extracted = True
        except Exception as e:
@@ -353,7 +327,7 @@ async def _process_single_item(
    if source_kind == SourceKind.SPEC and release:
        resolved_release, _ = await resolve_spec_release_from_db(item, release)
    source_item_id = f"{item}-REL{normalize_release_version(resolved_release)}" if resolved_release else item
    member = make_workspace_member(workspace, source_item_id, source_path, source_kind)
    member = make_workspace_member(source_item_id, source_path, source_kind)
    return member, None, was_converted, was_md_extracted


@@ -436,13 +410,6 @@ async def _process_workspace_members(
    skip_existing: bool = False,
    vlm_options: VlmOptions | None = None,
    accelerator_config: AcceleratorConfig | None = None,
    profile: str | None = None,
    custom_extract_ocr: bool | None = None,
    custom_extract_layout: bool | None = None,
    custom_extract_tables: bool | None = None,
    custom_extract_figures: bool | None = None,
    custom_extract_equations: bool | None = None,
    custom_extract_enrichment: bool | None = None,
) -> list[dict[str, Any]]:
    """Process workspace members with optional progress callback.

@@ -453,15 +420,8 @@ async def _process_workspace_members(
        checkout: Whether to checkout documents if not available
        convert_md: Whether to extract markdown (implies PDF conversion)
        skip_existing: If True, skip extraction for components that already exist.
        vlm_options: Optional VLM features for extraction.
        vlm_options: Optional VLM features for extraction (enables hybrid mode for figures/tables/equations).
        accelerator_config: Optional accelerator settings for GPU/CPU and threading.
        profile: Extraction profile override.
        custom_extract_ocr: Optional custom OCR toggle.
        custom_extract_layout: Optional custom layout toggle.
        custom_extract_tables: Optional custom table extraction toggle.
        custom_extract_figures: Optional custom figure extraction toggle.
        custom_extract_equations: Optional custom equation extraction toggle.
        custom_extract_enrichment: Optional custom enrichment toggle.

    Returns:
        List of processing results
@@ -508,7 +468,7 @@ async def _process_workspace_members(
        if convert_md:
            doc_file = _resolve_process_file(Path(member.source_path))
            if doc_file is not None and doc_file.suffix.lower() in OFFICE_FORMATS:
                pdf_path = _convert_member_to_pdf(make_workspace_member(workspace, member.source_item_id, member.source_path, member.source_kind))
                pdf_path = _convert_member_to_pdf(make_workspace_member(member.source_item_id, member.source_path, member.source_kind))
                if pdf_path is not None:
                    file_path = pdf_path
                elif doc_file.suffix.lower() not in {".pdf", ".txt", ".md"}:
@@ -531,13 +491,6 @@ async def _process_workspace_members(
                skip_existing=skip_existing,
                vlm_options=vlm_options,
                accelerator_config=accelerator_config,
                profile=profile,
                custom_extract_ocr=custom_extract_ocr,
                custom_extract_layout=custom_extract_layout,
                custom_extract_tables=custom_extract_tables,
                custom_extract_figures=custom_extract_figures,
                custom_extract_equations=custom_extract_equations,
                custom_extract_enrichment=custom_extract_enrichment,
            )
            results.append(
                {
@@ -758,13 +711,6 @@ def _checkout_and_convert_items(
    convert_md: bool,
    vlm_options: VlmOptions | None,
    accelerator_config: AcceleratorConfig,
    profile: str | None,
    custom_extract_ocr: bool | None,
    custom_extract_layout: bool | None,
    custom_extract_tables: bool | None,
    custom_extract_figures: bool | None,
    custom_extract_equations: bool | None,
    custom_extract_enrichment: bool | None,
) -> tuple[list[Any], list[tuple[str, str]], int, int]:
    """Checkout, optionally convert to PDF, and optionally extract markdown for items.

@@ -804,13 +750,6 @@ def _checkout_and_convert_items(
                    path_config=manager,
                    vlm_options=vlm_options,
                    accelerator_config=accelerator_config,
                    profile=profile,
                    custom_extract_ocr=custom_extract_ocr,
                    custom_extract_layout=custom_extract_layout,
                    custom_extract_tables=custom_extract_tables,
                    custom_extract_figures=custom_extract_figures,
                    custom_extract_equations=custom_extract_equations,
                    custom_extract_enrichment=custom_extract_enrichment,
                )
                if skip_reason:
                    skipped.append((item, skip_reason))
@@ -853,13 +792,6 @@ def workspace_add_members(
    device: AcceleratorDeviceOption = "auto",
    threads: AcceleratorThreadsOption = 4,
    batch_size: AcceleratorBatchSizeOption = 4,
    profile: ExtractionProfileOption = None,
    custom_ocr: CustomExtractOcrOption = None,
    custom_layout: CustomExtractLayoutOption = None,
    custom_tables: CustomExtractTablesOption = None,
    custom_figures: CustomExtractFiguresOption = None,
    custom_equations: CustomExtractEquationsOption = None,
    custom_enrichment: CustomExtractEnrichmentOption = None,
    verbose: bool = typer.Option(False, "--verbose", "-v", help="Enable verbose output (INFO level logging)"),
) -> None:
    # Set log level based on verbosity
@@ -874,7 +806,13 @@ def workspace_add_members(
    # Build VLM and accelerator options for extraction
    vlm_options: VlmOptions | None = None
    if vlm:
        vlm_options = VlmOptions(enable_picture_description=True, enable_formula_enrichment=True)
        vlm_options = VlmOptions(enable_hybrid=True)
        # Auto-start hybrid server if not running
        _, server_status = ensure_hybrid_server()
        if not server_status.running:
            console.print(f"[red]Failed to start hybrid server: {server_status.error}[/red]")
            raise typer.Exit(1)
        console.print(f"[dim]Using hybrid server at {server_status.url}[/dim]")
    accelerator_config = AcceleratorConfig(device=device, num_threads=threads, batch_size=batch_size)

    # Phase 1: Resolve items - either directly provided or via database query
@@ -916,13 +854,6 @@ def workspace_add_members(
        convert_md=convert_md,
        vlm_options=vlm_options,
        accelerator_config=accelerator_config,
        profile=profile,
        custom_extract_ocr=custom_ocr,
        custom_extract_layout=custom_layout,
        custom_extract_tables=custom_tables,
        custom_extract_figures=custom_figures,
        custom_extract_equations=custom_equations,
        custom_extract_enrichment=custom_enrichment,
    )

    if skipped:
@@ -1013,13 +944,8 @@ def workspace_process(
    device: AcceleratorDeviceOption = "auto",
    threads: AcceleratorThreadsOption = 4,
    batch_size: AcceleratorBatchSizeOption = 4,
    profile: ExtractionProfileOption = None,
    custom_ocr: CustomExtractOcrOption = None,
    custom_layout: CustomExtractLayoutOption = None,
    custom_tables: CustomExtractTablesOption = None,
    custom_figures: CustomExtractFiguresOption = None,
    custom_equations: CustomExtractEquationsOption = None,
    custom_enrichment: CustomExtractEnrichmentOption = None,
    vlm_host: Annotated[str, typer.Option("--vlm-host", help="Hybrid server host")] = DEFAULT_HOST,
    vlm_port: Annotated[int, typer.Option("--vlm-port", help="Hybrid server port")] = DEFAULT_PORT,
) -> None:
    workspace_name = _resolve_workspace_name(workspace)

@@ -1042,7 +968,14 @@ def workspace_process(
    # Build VLM options if --vlm flag is set
    vlm_options: VlmOptions | None = None
    if vlm:
        vlm_options = VlmOptions(enable_picture_description=True, enable_formula_enrichment=True)
        vlm_options = VlmOptions(enable_hybrid=True, hybrid_url=f"http://{vlm_host}:{vlm_port}")
        # Auto-start hybrid server if not running
        server_config = HybridServerConfig(host=vlm_host, port=vlm_port, device=device)
        _, server_status = ensure_hybrid_server(server_config)
        if not server_status.running:
            console.print(f"[red]Failed to start hybrid server: {server_status.error}[/red]")
            raise typer.Exit(1)
        console.print(f"[dim]Using hybrid server at {server_status.url}[/dim]")

    # Build accelerator config from CLI options
    accelerator_config = AcceleratorConfig(device=device, num_threads=threads, batch_size=batch_size)
@@ -1073,13 +1006,6 @@ def workspace_process(
                skip_existing=skip_existing,
                vlm_options=vlm_options,
                accelerator_config=accelerator_config,
                profile=profile,
                custom_extract_ocr=custom_ocr,
                custom_extract_layout=custom_layout,
                custom_extract_tables=custom_tables,
                custom_extract_figures=custom_figures,
                custom_extract_equations=custom_equations,
                custom_extract_enrichment=custom_enrichment,
            )
        )
        progress.update(task, completed=len(results), description="[cyan]Processing complete")
@@ -1211,5 +1137,69 @@ def clear_artifacts(
            console.print(f"\n[green]Deleted {total_deleted} items from {len(ai_dirs)} .ai folders[/green]")


@hybrid_app.command("start")
def hybrid_server_start(
    host: Annotated[str, typer.Option("--host", help="Host to bind to")] = DEFAULT_HOST,
    port: Annotated[int, typer.Option("--port", help="Port to bind to")] = DEFAULT_PORT,
    device: Annotated[str, typer.Option("--device", help="Compute device: auto, cpu, cuda, mps, xpu")] = "auto",
    wait: Annotated[bool, typer.Option("--wait/--no-wait", help="Wait for server to be ready")] = True,
) -> None:
    """Start the OpenDataLoader hybrid server.

    The hybrid server enables extraction of figures, tables, and equations.
    Server runs with formula enrichment and picture description enabled by default.

    Example:
        3gpp-ai hybrid-server start --device cuda
    """
    # Build the launcher from the CLI options; defaults come from hybrid_server module.
    manager = HybridServerManager(HybridServerConfig(host=host, port=port, device=device))

    # Show a spinner only when blocking until the server is healthy.
    if not wait:
        status = manager.start(wait=False)
    else:
        with console.status(f"[cyan]Starting hybrid server at {host}:{port}..."):
            status = manager.start(wait=True)

    # Report the outcome; a non-running status is a hard CLI failure.
    if not status.running:
        console.print(f"[red]Failed to start hybrid server: {status.error}[/red]")
        raise typer.Exit(1)
    console.print(f"[green]Hybrid server started at {status.url}[/green]")
    console.print(f"[dim]PID: {status.pid}[/dim]")


@hybrid_app.command("status")
def hybrid_server_status() -> None:
    """Check the status of the hybrid server."""
    # Health probe only — no side effects on the server process.
    health = HybridServerManager().check_health()

    if not health.running:
        # Not running: show the state, plus the probe error detail when available.
        console.print("[yellow]Hybrid server is not running[/yellow]")
        if health.error:
            console.print(f"[dim]Status: {health.error}[/dim]")
        return

    console.print(f"[green]Hybrid server is running at {health.url}[/green]")
    if health.pid:
        console.print(f"[dim]PID: {health.pid}[/dim]")


@hybrid_app.command("stop")
def hybrid_server_stop() -> None:
    """Stop the running hybrid server."""
    # stop() returns the post-shutdown status; `running` still True means the
    # process survived the stop attempt and needs manual cleanup.
    result = HybridServerManager().stop()

    message = (
        "[red]Server still running (may need manual cleanup)[/red]"
        if result.running
        else "[green]Hybrid server stopped[/green]"
    )
    console.print(message)


# CLI entry point when this module is executed directly.
if __name__ == "__main__":
    app()
+1 −43
Original line number Diff line number Diff line
@@ -2,7 +2,7 @@

from __future__ import annotations

from dataclasses import asdict, dataclass, field
from dataclasses import dataclass, field
from datetime import datetime
from enum import StrEnum, auto
from typing import Any
@@ -11,7 +11,6 @@ from pydantic import BaseModel, Field, field_validator
from tdoc_crawler.utils.misc import utc_now
from tdoc_crawler.utils.normalization import normalize_tdoc_id

from threegpp_ai.config import AiConfig
from threegpp_ai.operations.workspace_names import normalize_workspace_name


@@ -82,46 +81,6 @@ class Workspace:
        self.workspace_name = normalize_workspace_name(self.workspace_name)


@dataclass
class WorkspaceMember:
    """Source item assigned to one workspace corpus.

    NOTE: This class is being merged into workspace_registry.WorkspaceMember.
    Prefer importing from workspace_registry for new code.
    """

    workspace_name: str = field(metadata={"description": "Workspace identifier"})
    source_item_id: str = field(metadata={"description": "Stable source item identifier"})
    source_path: str = field(metadata={"description": "Path or locator of the source item"})
    source_kind: SourceKind = field(metadata={"description": "Type of source item"})
    added_at: datetime | str = field(default_factory=utc_now, metadata={"description": "Registration timestamp"})
    added_by: str | None = field(default=None, metadata={"description": "Actor that registered the source"})
    is_active: bool = field(default=True, metadata={"description": "Membership active flag"})

    def __post_init__(self) -> None:
        """Normalize fields after initialization."""
        if not self.workspace_name.strip():
            msg = "workspace_name must not be empty"
            raise ValueError(msg)
        self.workspace_name = normalize_workspace_name(self.workspace_name)

        normalized = normalize_tdoc_id(self.source_item_id)
        if not normalized:
            msg = "source_item_id must not be empty"
            raise ValueError(msg)

        self.source_item_id = normalized
        self.source_kind = SourceKind(self.source_kind)

    def to_dict(self) -> dict[str, Any]:
        """Serialize to dict compatible with WorkspaceMetadata storage."""
        result = asdict(self)
        result["source_kind"] = self.source_kind.value
        result["added_by"] = self.added_by or ""
        result["added_at"] = self.added_at.isoformat() if isinstance(self.added_at, datetime) else self.added_at
        return result


class DocumentClassification(BaseModel):
    """Classification of a file within a TDoc folder."""

@@ -319,6 +278,5 @@ __all__ = [
    "SummarizeResult",
    "TDocNotFoundError",
    "Workspace",
    "WorkspaceMember",
    "WorkspaceNotFoundError",
]
+0 −42
Original line number Diff line number Diff line
@@ -169,13 +169,6 @@ def convert_document_to_markdown(
    converter_config: ConverterConfig | None = None,
    vlm_options: VlmOptions | None = None,
    accelerator_config: AcceleratorConfig | None = None,
    profile: str | None = None,
    custom_extract_ocr: bool | None = None,
    custom_extract_layout: bool | None = None,
    custom_extract_tables: bool | None = None,
    custom_extract_figures: bool | None = None,
    custom_extract_equations: bool | None = None,
    custom_extract_enrichment: bool | None = None,
) -> str:
    """Convert TDoc to markdown using the unified extraction pipeline.

@@ -196,13 +189,6 @@ def convert_document_to_markdown(
        converter_config: Optional converter configuration (unused, kept for API compatibility)
        vlm_options: Optional VLM features for extraction.
        accelerator_config: Optional accelerator settings for GPU/CPU and threading.
        profile: Extraction profile override.
        custom_extract_ocr: Optional custom OCR toggle.
        custom_extract_layout: Optional custom layout toggle.
        custom_extract_tables: Optional custom table extraction toggle.
        custom_extract_figures: Optional custom figure extraction toggle.
        custom_extract_equations: Optional custom equation extraction toggle.
        custom_extract_enrichment: Optional custom enrichment toggle.

    Returns:
        Markdown content string
@@ -218,13 +204,6 @@ def convert_document_to_markdown(
        force=force,
        vlm_options=vlm_options,
        accelerator_config=accelerator_config,
        profile=profile,
        custom_extract_ocr=custom_extract_ocr,
        custom_extract_layout=custom_extract_layout,
        custom_extract_tables=custom_extract_tables,
        custom_extract_figures=custom_extract_figures,
        custom_extract_equations=custom_extract_equations,
        custom_extract_enrichment=custom_extract_enrichment,
    )

    # Get TDoc metadata for header
@@ -250,13 +229,6 @@ def extract_document_structured_from_tdoc(
    extract_types: set[str] | None = None,
    vlm_options: VlmOptions | None = None,
    accelerator_config: AcceleratorConfig | None = None,
    profile: str | None = None,
    custom_extract_ocr: bool | None = None,
    custom_extract_layout: bool | None = None,
    custom_extract_tables: bool | None = None,
    custom_extract_figures: bool | None = None,
    custom_extract_equations: bool | None = None,
    custom_extract_enrichment: bool | None = None,
) -> StructuredExtractionResult:
    """Extract a TDoc into the canonical structured payload.

@@ -270,13 +242,6 @@ def extract_document_structured_from_tdoc(
            If None, extracts all types. Supported types: "tables", "figures", "equations".
        vlm_options: Optional VLM features for extraction.
        accelerator_config: Optional accelerator settings for GPU/CPU and threading.
        profile: Extraction profile override.
        custom_extract_ocr: Optional custom OCR toggle.
        custom_extract_layout: Optional custom layout toggle.
        custom_extract_tables: Optional custom table extraction toggle.
        custom_extract_figures: Optional custom figure extraction toggle.
        custom_extract_equations: Optional custom equation extraction toggle.
        custom_extract_enrichment: Optional custom enrichment toggle.

    Returns:
        Structured extraction result.
@@ -318,13 +283,6 @@ def extract_document_structured_from_tdoc(
            extract_types=extract_types,
            vlm_options=vlm_options,
            accelerator_config=accelerator_config,
            profile=profile,
            custom_extract_ocr=custom_extract_ocr,
            custom_extract_layout=custom_extract_layout,
            custom_extract_tables=custom_extract_tables,
            custom_extract_figures=custom_extract_figures,
            custom_extract_equations=custom_extract_equations,
            custom_extract_enrichment=custom_extract_enrichment,
        )


Loading