Commit 6d060008 authored by Jan Reimes
Browse files

feat(cli): add VLM and query mode options to CLI arguments

* Introduce WorkspaceProcessVlmOption for VLM features.
* Add ProvidersOutputOption for output format selection.
* Implement QueryModeOption for specifying query modes.
* Update workspace_process command to support new options.
parent 1e7881dd
Loading
Loading
Loading
Loading
+33 −0
Original line number Diff line number Diff line
@@ -7,6 +7,9 @@ from typing import Annotated, Literal

import typer

from tdoc_crawler.models.base import OutputFormat
from threegpp_ai.lightrag.config import QueryMode

# Common
OutputFormatOption = Annotated[
    Literal["text", "json", "yaml"],
@@ -71,6 +74,14 @@ WorkspaceReleaseOption = Annotated[
# Optional integer cap passed as --limit; typed int | None so a command can default it to None.
WorkspaceLimitOption = Annotated[int | None, typer.Option("--limit", help="Maximum items to add")]
# Boolean flag --include-inactive.
WorkspaceIncludeInactiveOption = Annotated[bool, typer.Option("--include-inactive", help="Include inactive members")]
# Boolean flag --force.
WorkspaceProcessForceOption = Annotated[bool, typer.Option("--force", help="Force reprocessing of all members")]
# Paired on/off switch: --vlm enables, --no-vlm disables; the value can also
# be supplied through the TDC_AI_VLM environment variable.
# Typed bool | None so a command may default the parameter to None.
WorkspaceProcessVlmOption = Annotated[
    bool | None,
    typer.Option(
        "--vlm/--no-vlm",
        help="Enable VLM picture description and formula enrichment",
        envvar="TDC_AI_VLM",
    ),
]
WorkspacePreserveArtifactsOption = Annotated[
    bool,
    typer.Option(
@@ -112,3 +123,25 @@ AgendaPatternExcludeOption = Annotated[
    list[str] | None,
    typer.Option("--agenda-ex", help="Glob pattern to exclude agenda field (repeatable)"),
]

# Providers
# Output-format selector exposed as --output / -o and parsed into an
# OutputFormat value; case_sensitive=False makes the match case-insensitive.
ProvidersOutputOption = Annotated[
    OutputFormat,
    typer.Option(
        "--output",
        "-o",
        case_sensitive=False,
        help="Output format (table, json, ison, toon, yaml)",
    ),
]

# Query
# Query-mode selector exposed as --mode / -m and parsed into a QueryMode
# value; case_sensitive=False makes the match case-insensitive.
QueryModeOption = Annotated[
    QueryMode,
    typer.Option(
        "--mode",
        "-m",
        case_sensitive=False,
        help="Query mode (local, global, hybrid, naive)",
    ),
]
+60 −24
Original line number Diff line number Diff line
@@ -11,7 +11,7 @@ import shutil
from collections.abc import Callable
from datetime import UTC, datetime
from pathlib import Path
from typing import Annotated, Any
from typing import Any

import typer
from dotenv import load_dotenv
@@ -56,6 +56,8 @@ from threegpp_ai.args import (
    ConvertPdfOption,
    EndDateOption,
    OutputFormatOption,
    ProvidersOutputOption,
    QueryModeOption,
    SourcePatternExcludeOption,
    SourcePatternOption,
    StartDateOption,
@@ -75,6 +77,7 @@ from threegpp_ai.args import (
    WorkspaceNameOption,
    WorkspacePreserveArtifactsOption,
    WorkspaceProcessForceOption,
    WorkspaceProcessVlmOption,
    WorkspaceReleaseOption,
)
from threegpp_ai.lightrag.config import LightRAGConfig, QueryMode, StorageBackend
@@ -102,17 +105,6 @@ console = get_console()
_logger = get_logger(__name__)


# Output-format selector exposed as --output / -o and parsed into an
# OutputFormat value; case_sensitive=False makes the match case-insensitive.
# NOTE(review): threegpp_ai.args defines an option with the same name and
# this module imports it — confirm this local copy is the side being removed.
ProvidersOutputOption = Annotated[
    OutputFormat,
    typer.Option(
        "--output",
        "-o",
        case_sensitive=False,
        help="Output format (table, json, ison, toon, yaml)",
    ),
]


def _print_output(
    data: Any,
    output_format: OutputFormat,
@@ -461,6 +453,8 @@ async def _process_workspace_members(
    workspace: str,
    members: list[Any],
    on_progress: Callable[[int, str], None] | None = None,
    checkout: bool = True,
    convert_md: bool = False,
    vlm_options: VlmOptions | None = None,
) -> list[dict[str, Any]]:
    """Process workspace members with optional progress callback.
@@ -469,6 +463,8 @@ async def _process_workspace_members(
        workspace: Workspace name
        members: List of workspace members to process
        on_progress: Optional callback(completed_count, source_item_id) called after each member
        checkout: Whether to checkout documents if not available
        convert_md: Whether to extract markdown (implies PDF conversion)
        vlm_options: Optional VLM features for extraction.

    Returns:
@@ -476,11 +472,36 @@ async def _process_workspace_members(
    """
    processor = DocumentProcessor(LightRAGConfig.from_env())
    results: list[dict[str, Any]] = []
    manager = resolve_cache_manager()

    await processor.rag.start(workspace)
    try:
        for member in members:
            # Ensure document is available: checkout + convert to PDF if needed
            file_path = _resolve_process_file(Path(member.source_path))

            if file_path is None or not file_path.exists():
                if checkout and member.source_kind in (SourceKind.TDOC, SourceKind.SPEC):
                    # Try to checkout the document
                    checkout_path: Path | None = None
                    if member.source_kind == SourceKind.TDOC:
                        checkout_path = await checkout_tdoc_to_workspace(
                            member.source_item_id,
                            manager.checkout_dir,
                            workspace,
                            db_file=manager.db_file,
                        )
                    elif member.source_kind == SourceKind.SPEC:
                        checkout_path = await checkout_spec_to_workspace(
                            member.source_item_id,
                            manager.checkout_dir,
                            workspace,
                            "latest",
                            db_file=manager.db_file,
                        )
                    if checkout_path is not None:
                        file_path = _resolve_process_file(checkout_path)

            if file_path is None or not file_path.exists():
                results.append(
                    {
@@ -493,6 +514,26 @@ async def _process_workspace_members(
                    on_progress(len(results), member.source_item_id)
                continue

            # Convert to PDF if it's an office format and convert_md is enabled
            if convert_md:
                doc_file = _resolve_process_file(Path(member.source_path))
                if doc_file is not None and doc_file.suffix.lower() in OFFICE_FORMATS:
                    pdf_path = _convert_member_to_pdf(make_workspace_member(workspace, member.source_item_id, member.source_path, member.source_kind))
                    if pdf_path is not None:
                        file_path = pdf_path
                    elif doc_file.suffix.lower() not in {".pdf", ".txt", ".md"}:
                        # Could not convert and not a directly processable format
                        results.append(
                            {
                                "source_item_id": member.source_item_id,
                                "status": "skipped",
                                "reason": "office document could not be converted to PDF",
                            },
                        )
                        if on_progress:
                            on_progress(len(results), member.source_item_id)
                        continue

            metadata = await _try_build_tdoc_metadata(member.source_item_id)
            process_result = await processor.process_file(file_path, workspace, metadata=metadata, vlm_options=vlm_options)
            results.append(
@@ -627,15 +668,7 @@ def workspace_list(
@workspace_app.command("query")
def workspace_query(
    query: str = typer.Argument(..., help="Query string"),
    mode: Annotated[
        QueryMode,
        typer.Option(
            "--mode",
            "-m",
            case_sensitive=False,
            help="Query mode (local, global, hybrid, naive)",
        ),
    ] = QueryMode.HYBRID,
    mode: QueryModeOption = QueryMode.HYBRID,
    workspace: WorkspaceNameOption = None,
    output_format: OutputFormatOption = "text",
) -> None:
@@ -982,13 +1015,14 @@ def workspace_list_members(
    )


@workspace_app.command("process", help="Process workspace members (checkout, convert, embed)")
@workspace_app.command("process", help="Process workspace members (checkout, convert, extract, embed)")
def workspace_process(
    workspace: WorkspaceNameOption = None,
    force: WorkspaceProcessForceOption = False,
    limit: WorkspaceLimitOption = None,
    output_format: OutputFormatOption = "text",
    vlm: bool = typer.Option("--vlm", help="Enable VLM picture description and formula enrichment"),
    checkout: WorkspaceCheckoutOption = True,
    vlm: WorkspaceProcessVlmOption = None,
) -> None:
    workspace_name = _resolve_workspace_name(workspace)

@@ -1028,7 +1062,9 @@ def workspace_process(
            completed = count
            progress.update(task, completed=completed, description=f"[cyan]{source_item_id}")

        results = asyncio.run(_process_workspace_members(workspace_name, members, on_progress=on_progress, vlm_options=vlm_options))
        results = asyncio.run(
            _process_workspace_members(workspace_name, members, on_progress=on_progress, checkout=checkout, convert_md=True, vlm_options=vlm_options)
        )
        progress.update(task, completed=len(results), description="[cyan]Processing complete")

    success_count = sum(1 for row in results if row["status"] == "success")