Commit 0e59a813 authored by Jan Reimes's avatar Jan Reimes
Browse files

feat(ai): add workspace process command for manual embedding generation

parent 2f6d4695
Loading
Loading
Loading
Loading
+15 −1
Original line number Diff line number Diff line
@@ -101,7 +101,21 @@ tdoc-crawler ai workspace create my-project --auto-build
tdoc-crawler ai workspace create my-project
```

### 2. Query Your Knowledge Base
### 2. Process Documents (Generate Embeddings)

After adding TDocs to your workspace, process them to generate RAG/GraphRAG embeddings:

```bash
# Process only the TDocs in the workspace that have not been processed yet (default)
tdoc-crawler ai workspace process -w my-project

# Force reprocess all TDocs
tdoc-crawler ai workspace process -w my-project --force
```

Note: If you created the workspace with `--auto-build`, documents are processed automatically when added.

### 3. Query Your Knowledge Base

Once you have a workspace with documents, query it using semantic search and the knowledge graph (RAG + GraphRAG):

+44 −0
Original line number Diff line number Diff line
@@ -23,6 +23,7 @@ from tdoc_crawler.ai import (
    summarize_tdoc,
)
from tdoc_crawler.ai.models import SourceKind
from tdoc_crawler.ai.operations.pipeline import process_all
from tdoc_crawler.config import CacheManager

HELP_PANEL = "AI Commands"
@@ -267,6 +268,49 @@ def workspace_list_members(
        console.print(table)


@_workspace_app.command("process")
def workspace_process(
    workspace: Annotated[str, typer.Option("--workspace", "-w", help="Workspace name")],
    # An explicit "--flag/--no-flag" pair is required here: with only "--new-only"
    # declared and a default of True, Typer exposes no way to switch the option off.
    new_only: Annotated[
        bool,
        typer.Option("--new-only/--no-new-only", help="Process only TDocs not already completed"),
    ] = True,
    force_rerun: Annotated[bool, typer.Option("--force", help="Force reprocessing of all TDocs")] = False,
    json_output: Annotated[bool, typer.Option("--json", help="Output as JSON")] = False,
) -> None:
    """Process all TDoc members in a workspace through the AI pipeline.

    Collects the active workspace members whose source kind is ``"tdoc"`` and
    hands their ids to ``process_all``. A summary is printed afterwards, either
    as a single JSON object or as a coloured console line.

    Args:
        workspace: Name of the workspace whose TDoc members are processed.
        new_only: Forwarded to ``process_all`` as ``new_only``; defaults to
            ``True`` and can be disabled with ``--no-new-only``.
        force_rerun: Forwarded to ``process_all`` as ``force_rerun``.
        json_output: Emit the summary as JSON instead of rich console text.
    """
    manager = CacheManager().register()
    storage = AiStorage(manager.root / ".ai" / "lancedb")

    # Only active members that refer to TDocs are eligible for the pipeline.
    members = storage.list_workspace_members(workspace, include_inactive=False)
    tdoc_ids = [m.source_item_id for m in members if m.is_active and m.source_kind.value == "tdoc"]

    # Normalise once, before any processing, so every output path reports the
    # same name and a bad name surfaces before the pipeline runs.
    display_name = normalize_workspace_name(workspace)

    if not tdoc_ids:
        if json_output:
            typer.echo(json.dumps({"workspace": display_name, "processed": 0, "message": "No TDoc members found"}))
        else:
            console.print(f"[yellow]No TDoc members found in workspace '{display_name}'[/yellow]")
        return

    # NOTE(review): how process_all combines new_only with force_rerun is not
    # visible from here; both flags are passed through unchanged.
    results = process_all(
        tdoc_ids=tdoc_ids,
        checkout_base=manager.root,
        new_only=new_only,
        force_rerun=force_rerun,
        workspace=workspace,
    )

    # The result is used as a mapping keyed by TDoc id (len() and .keys()).
    if json_output:
        summary = {
            "workspace": display_name,
            "processed": len(results),
            "total_members": len(tdoc_ids),
            "tdoc_ids": list(results.keys()),
        }
        typer.echo(json.dumps(summary))
    else:
        console.print(f"[green]Processed {len(results)}/{len(tdoc_ids)} TDoc(s) in workspace '{display_name}'[/green]")



@_workspace_app.command("delete")
def workspace_delete(
    name: Annotated[str, typer.Argument(..., help="Workspace name")],