Commit 7add95b7 authored by Jan Reimes's avatar Jan Reimes
Browse files

refactor(3gpp-ai): remove AcceleratorConfig — a documented no-op previously kept only for API compatibility

- Removed AcceleratorConfig dataclass (it was documented as ignored and had been retained only for API compatibility)
- Removed accelerator_config parameter from extract_document_structured()
- Removed accelerator_config parameter from extract_document_text()
- Removed accelerator_config parameter from convert_document_to_markdown()
- Removed accelerator_config parameter from extract_document_structured_from_tdoc()
- Removed all AcceleratorConfig creation and passing in CLI commands
- Removed AcceleratorConfig from __all__ exports in extraction.py
- Cleaned up related comments and del statements
parent 35161e58
Loading
Loading
Loading
Loading
+2 −18
Original line number Diff line number Diff line
@@ -90,7 +90,7 @@ from threegpp_ai.models import WorkspaceNotFoundError
from threegpp_ai.operations.classify import pick_main_document
from threegpp_ai.operations.conversion import OFFICE_FORMATS, convert_to_pdf
from threegpp_ai.operations.convert import convert_document_to_markdown
from threegpp_ai.operations.extraction import AcceleratorConfig, VlmOptions, extract_document_structured
from threegpp_ai.operations.extraction import VlmOptions, extract_document_structured
from threegpp_ai.operations.hybrid_server import (
    DEFAULT_HOST,
    DEFAULT_PORT,
@@ -239,7 +239,6 @@ async def _process_single_item(
    convert_md: bool = False,
    path_config: PathConfig,
    vlm_options: VlmOptions | None = None,
    accelerator_config: AcceleratorConfig | None = None,
) -> tuple[Any | None, str | None, bool, bool]:
    """Process a single workspace item (checkout + optional PDF conversion + optional markdown extraction).

@@ -253,7 +252,6 @@ async def _process_single_item(
        convert_md: Whether to extract markdown (implies convert_pdf)
        path_config: PathConfig for file system paths
        vlm_options: Optional VLM features for extraction.
        accelerator_config: Optional accelerator settings for GPU/CPU and threading.

    Returns:
        Tuple of (member, skip_reason, was_converted, was_md_extracted)
@@ -306,7 +304,6 @@ async def _process_single_item(
                    output_path=None,
                    force=False,
                    vlm_options=vlm_options,
                    accelerator_config=accelerator_config,
                )
            else:
                # Generic extraction (specs, other) - uses file path directly
@@ -317,7 +314,6 @@ async def _process_single_item(
                        metadata=None,
                        force=False,
                        vlm_options=vlm_options,
                        accelerator_config=accelerator_config,
                    )
            was_md_extracted = True
        except Exception as e:
@@ -409,7 +405,6 @@ async def _process_workspace_members(
    convert_md: bool = False,
    skip_existing: bool = False,
    vlm_options: VlmOptions | None = None,
    accelerator_config: AcceleratorConfig | None = None,
) -> list[dict[str, Any]]:
    """Process workspace members with optional progress callback.

@@ -421,7 +416,6 @@ async def _process_workspace_members(
        convert_md: Whether to extract markdown (implies PDF conversion)
        skip_existing: If True, skip extraction for components that already exist.
        vlm_options: Optional VLM features for extraction (enables hybrid mode for figures/tables/equations).
        accelerator_config: Optional accelerator settings for GPU/CPU and threading.

    Returns:
        List of processing results
@@ -490,7 +484,6 @@ async def _process_workspace_members(
                force=False,
                skip_existing=skip_existing,
                vlm_options=vlm_options,
                accelerator_config=accelerator_config,
            )
            results.append(
                {
@@ -710,7 +703,6 @@ def _checkout_and_convert_items(
    convert_pdf: bool,
    convert_md: bool,
    vlm_options: VlmOptions | None,
    accelerator_config: AcceleratorConfig,
) -> tuple[list[Any], list[tuple[str, str]], int, int]:
    """Checkout, optionally convert to PDF, and optionally extract markdown for items.

@@ -749,7 +741,6 @@ def _checkout_and_convert_items(
                    convert_md=convert_md,
                    path_config=manager,
                    vlm_options=vlm_options,
                    accelerator_config=accelerator_config,
                )
                if skip_reason:
                    skipped.append((item, skip_reason))
@@ -803,7 +794,7 @@ def workspace_add_members(
    kind_normalized = kind.lower().rstrip("s")
    source_kind = SourceKind(kind_normalized) if kind_normalized in {entry.value for entry in SourceKind} else SourceKind.OTHER

    # Build VLM and accelerator options for extraction
    # Build VLM options for extraction
    vlm_options: VlmOptions | None = None
    if vlm:
        # Auto-start hybrid server if not running
@@ -815,8 +806,6 @@ def workspace_add_members(

        vlm_options = VlmOptions(enable_hybrid=True)

    accelerator_config = AcceleratorConfig(device=device, num_threads=threads, batch_size=batch_size)

    # Phase 1: Resolve items - either directly provided or via database query
    if items is not None:
        resolved_items = items
@@ -855,7 +844,6 @@ def workspace_add_members(
        convert_pdf=convert_pdf,
        convert_md=convert_md,
        vlm_options=vlm_options,
        accelerator_config=accelerator_config,
    )

    if skipped:
@@ -979,9 +967,6 @@ def workspace_process(
            raise typer.Exit(1)
        console.print(f"[dim]Using hybrid server at {server_status.url}[/dim]")

    # Build accelerator config from CLI options
    accelerator_config = AcceleratorConfig(device=device, num_threads=threads, batch_size=batch_size)

    with Progress(
        SpinnerColumn(),
        TextColumn("[progress.description]{task.description}"),
@@ -1007,7 +992,6 @@ def workspace_process(
                convert_md=True,
                skip_existing=skip_existing,
                vlm_options=vlm_options,
                accelerator_config=accelerator_config,
            )
        )
        progress.update(task, completed=len(results), description="[cyan]Processing complete")
+1 −7
Original line number Diff line number Diff line
@@ -24,7 +24,7 @@ from threegpp_ai.operations.conversion import (
    ConverterBackend,
    ConverterConfig,
)
from threegpp_ai.operations.extraction import AcceleratorConfig, VlmOptions, extract_document_structured
from threegpp_ai.operations.extraction import VlmOptions, extract_document_structured
from threegpp_ai.operations.extraction_result import StructuredExtractionResult
from threegpp_ai.operations.fetch_tdoc import fetch_tdoc_files
from threegpp_ai.operations.metrics import MetricType, get_metrics_tracker, timed_operation
@@ -168,7 +168,6 @@ def convert_document_to_markdown(
    force: bool = False,
    converter_config: ConverterConfig | None = None,
    vlm_options: VlmOptions | None = None,
    accelerator_config: AcceleratorConfig | None = None,
) -> str:
    """Convert TDoc to markdown using the unified extraction pipeline.

@@ -188,7 +187,6 @@ def convert_document_to_markdown(
        force: Force reconversion even if cached
        converter_config: Optional converter configuration (unused, kept for API compatibility)
        vlm_options: Optional VLM features for extraction.
        accelerator_config: Optional accelerator settings for GPU/CPU and threading.

    Returns:
        Markdown content string
@@ -203,7 +201,6 @@ def convert_document_to_markdown(
        document_id=document_id,
        force=force,
        vlm_options=vlm_options,
        accelerator_config=accelerator_config,
    )

    # Get TDoc metadata for header
@@ -228,7 +225,6 @@ def extract_document_structured_from_tdoc(
    force: bool = False,
    extract_types: set[str] | None = None,
    vlm_options: VlmOptions | None = None,
    accelerator_config: AcceleratorConfig | None = None,
) -> StructuredExtractionResult:
    """Extract a TDoc into the canonical structured payload.

@@ -241,7 +237,6 @@ def extract_document_structured_from_tdoc(
        extract_types: Optional set of artifact types to extract/persist.
            If None, extracts all types. Supported types: "tables", "figures", "equations".
        vlm_options: Optional VLM features for extraction.
        accelerator_config: Optional accelerator settings for GPU/CPU and threading.

    Returns:
        Structured extraction result.
@@ -282,7 +277,6 @@ def extract_document_structured_from_tdoc(
            force=force,
            extract_types=extract_types,
            vlm_options=vlm_options,
            accelerator_config=accelerator_config,
        )


+0 −27
Original line number Diff line number Diff line
@@ -93,25 +93,6 @@ class VlmOptions:
    hybrid_fallback: bool = True
    image_output: ImageOutput = ImageOutput.EXTERNAL


@dataclass
class AcceleratorConfig:
    """Accelerator configuration for OpenDataLoader document processing.

    OpenDataLoader is CPU-optimized and doesn't require GPU acceleration.
    This dataclass is kept for API compatibility with the previous docling-based pipeline.

    Attributes:
        device: Compute device (ignored, kept for compatibility).
        num_threads: Thread count for parallel processing.
        batch_size: Batch size for processing (ignored by OpenDataLoader).
    """

    device: str = "auto"
    num_threads: int = 4
    batch_size: int | None = None


# All supported formats (PDF + Office formats + text files)
SUPPORTED_FORMATS = {".pdf", ".txt", ".md"} | OFFICE_FORMATS

@@ -594,7 +575,6 @@ def extract_document_structured(
    skip_existing: bool = False,
    extract_types: set[str] | None = None,
    vlm_options: VlmOptions | None = None,
    accelerator_config: AcceleratorConfig | None = None,
) -> StructuredExtractionResult:
    """Extract a document into a canonical structured payload.

@@ -620,7 +600,6 @@ def extract_document_structured(
            If None, extracts all types. Supported types: "tables", "figures", "equations".
        vlm_options: Optional VLM features for extraction. Enables hybrid AI mode
            for complex PDF pages. If None, uses deterministic local extraction.
        accelerator_config: Optional accelerator settings (ignored, kept for compatibility).

    Returns:
        StructuredExtractionResult with content, tables, figures, and equations.
@@ -629,8 +608,6 @@ def extract_document_structured(
        ExtractionError: If the source file does not exist or extraction fails.
        ConversionError: If Office document conversion fails.
    """
    del accelerator_config  # OpenDataLoader is CPU-optimized, no GPU needed

    if not file_path.exists():
        raise ExtractionError(f"Source file does not exist: {file_path}")

@@ -718,7 +695,6 @@ def extract_document_text(
    file_path: Path,
    force: bool = False,
    vlm_options: VlmOptions | None = None,
    accelerator_config: AcceleratorConfig | None = None,
) -> str:
    """Extract plain text content from a document using OpenDataLoader.

@@ -729,7 +705,6 @@ def extract_document_text(
        file_path: Path to the document file.
        force: If True, re-extract even if cached markdown exists.
        vlm_options: Optional VLM features for extraction.
        accelerator_config: Optional accelerator settings (ignored, kept for compatibility).

    Returns:
        Extracted text content as a string.
@@ -744,13 +719,11 @@ def extract_document_text(
        force=force,
        extract_types=set(),  # No artifacts needed for text-only extraction
        vlm_options=vlm_options,
        accelerator_config=accelerator_config,
    )
    return extraction.content


__all__ = [
    "AcceleratorConfig",
    "VlmOptions",
    "extract_document_structured",
    "extract_document_text",