Commit f49c02e6 authored by Jan Reimes
Browse files

♻️ refactor(ai): simplify document processing command and remove unused imports

parent 2180f262
[page placeholder: additional commit details not loaded]
+24 −39
Original line number | Diff line number | Diff line
@@ -23,7 +23,6 @@ from tdoc_ai import (
    get_status,
    make_workspace_member,
    normalize_workspace_name,
    process_document,
    query_graph,
    set_active_workspace,
    summarize_document,
@@ -45,7 +44,6 @@ from tdoc_crawler.cli.args import (
    AgendaPatternExcludeOption,
    AgendaPatternOption,
    CheckoutBaseOption,
    CheckoutPathOption,
    ConvertDocumentArgument,
    ConvertOutputOption,
    EmbeddingBackendOption,
@@ -57,7 +55,6 @@ from tdoc_crawler.cli.args import (
    ProcessAllOption,
    ProcessForceOption,
    ProcessNewOnlyOption,
    ProcessTDocIdOption,
    QueryArgument,
    QueryOption,
    SourcePatternExcludeOption,
@@ -216,9 +213,7 @@ def ai_query(

@ai_app.command("process")
def ai_process(
    document_id: ProcessTDocIdOption = None,
    workspace: WorkspaceNameOption = None,
    checkout_path: CheckoutPathOption = None,
    checkout_base: CheckoutBaseOption = None,
    process_all_flag: ProcessAllOption = False,
    new_only: ProcessNewOnlyOption = False,
@@ -226,11 +221,19 @@ def ai_process(
    accelerate: EmbeddingBackendOption = "torch",
    json_output: JsonOutputOption = False,
) -> None:
    """Process a single document or all documents through the AI pipeline."""
    """Process all documents in a workspace through the AI pipeline.

    Processing runs in three phases:
    Phase 1: CLASSIFY → EXTRACT (create markdown artifacts)
    Phase 2: EMBED (generate vector embeddings)
    Phase 3: GRAPH (build knowledge graph)

    Failed documents in one phase are not processed in later phases,
    keeping logs clean and errors contained.
    """
    workspace = workspace or "default"
    config = AiConfig.from_env(embedding_backend=accelerate)

    if process_all_flag:
    # Process all documents in workspace
    manager = _get_cache_manager()
    checkout_root = Path(checkout_base) if checkout_base else manager.root
@@ -246,24 +249,6 @@ def ai_process(
        typer.echo(json.dumps(result))
    else:
        console.print(f"[green]Processed {len(result)} documents in workspace {workspace}[/green]")
    elif document_id:
        # Process single document
        manager = _get_cache_manager()
        resolved_checkout = Path(checkout_path) if checkout_path else manager.checkout_dir / document_id
        result = process_document(
            document_id,
            workspace=workspace,
            checkout_path=resolved_checkout,
            force_rerun=force,
            config=config,
        )
        if json_output:
            typer.echo(json.dumps(result))
        else:
            console.print(f"[green]Processed {document_id}[/green]")
    else:
        console.print("[red]Error: Must specify --tdoc-id or --all[/red]")
        raise typer.Exit(1)


@ai_app.command("status")