feat(ai): add workspace process command for manual embedding generation (0e59a813) · Commits · Jan Reimes / 3gpp-crawler

docs/ai.md

+15 −1

Original line number	Diff line number	Diff line
		@@ -101,7 +101,21 @@ tdoc-crawler ai workspace create my-project --auto-build
		tdoc-crawler ai workspace create my-project
		```

		### 2. Query Your Knowledge Base
		### 2. Process Documents (Generate Embeddings)

		After adding TDocs to your workspace, process them to generate RAG/GraphRAG embeddings:

		```bash
		# Process the TDocs in the workspace that have not been processed yet (default)
		tdoc-crawler ai workspace process -w my-project

		# Force reprocess all TDocs
		tdoc-crawler ai workspace process -w my-project --force
		```

		Note: If you created the workspace with `--auto-build`, documents are processed automatically when added.

		### 3. Query Your Knowledge Base

		Once you have a workspace with documents, query using semantic search and knowledge graph (RAG + GraphRAG):

src/tdoc_crawler/cli/ai.py

+44 −0

Original line number	Diff line number	Diff line
		@@ -23,6 +23,7 @@ from tdoc_crawler.ai import (
		summarize_tdoc,
		)
		from tdoc_crawler.ai.models import SourceKind
		from tdoc_crawler.ai.operations.pipeline import process_all
		from tdoc_crawler.config import CacheManager

		HELP_PANEL = "AI Commands"
		@@ -267,6 +268,49 @@ def workspace_list_members(
		console.print(table)


		@_workspace_app.command("process")
		def workspace_process(
		    workspace: Annotated[str, typer.Option("--workspace", "-w", help="Workspace name")],
		    new_only: Annotated[
		        bool,
		        # A lone "--new-only" name on a default-True flag gives the user no way
		        # to turn it off; the "/--all" half adds the negative switch while
		        # keeping "--new-only" valid for existing invocations.
		        typer.Option(
		            "--new-only/--all",
		            help="Process only TDocs not already completed (use --all to include completed ones)",
		        ),
		    ] = True,
		    force_rerun: Annotated[bool, typer.Option("--force", help="Force reprocessing of all TDocs")] = False,
		    json_output: Annotated[bool, typer.Option("--json", help="Output as JSON")] = False,
		) -> None:
		    """Process all TDoc members in a workspace through the AI pipeline.

		    Lists the active members of ``workspace``, keeps those whose source kind
		    is ``"tdoc"``, hands their ids to ``process_all`` and prints a summary.

		    Args:
		        workspace: Name of the workspace whose members are processed.
		        new_only: Forwarded to ``process_all`` as ``new_only``.
		        force_rerun: Forwarded to ``process_all`` as ``force_rerun``.
		        json_output: Emit the summary as a single JSON object instead of a
		            coloured console message.
		    """
		    manager = CacheManager().register()
		    storage = AiStorage(manager.root / ".ai" / "lancedb")

		    # Collect the ids of active workspace members that are TDocs.
		    members = storage.list_workspace_members(workspace, include_inactive=False)
		    tdoc_ids = [m.source_item_id for m in members if m.is_active and m.source_kind.value == "tdoc"]

		    # The normalized name is only used for reporting; compute it once.
		    display_name = normalize_workspace_name(workspace)

		    if not tdoc_ids:
		        if json_output:
		            typer.echo(json.dumps({"workspace": display_name, "processed": 0, "message": "No TDoc members found"}))
		        else:
		            console.print(f"[yellow]No TDoc members found in workspace '{display_name}'[/yellow]")
		        return

		    # NOTE(review): with --force the call still passes new_only=True unless
		    # --all is given; confirm that process_all lets force_rerun take
		    # precedence over new_only.
		    results = process_all(
		        tdoc_ids=tdoc_ids,
		        checkout_base=manager.root,
		        new_only=new_only,
		        force_rerun=force_rerun,
		        workspace=workspace,
		    )

		    if json_output:
		        summary = {
		            "workspace": display_name,
		            "processed": len(results),
		            "total_members": len(tdoc_ids),
		            "tdoc_ids": list(results.keys()),
		        }
		        typer.echo(json.dumps(summary))
		    else:
		        console.print(f"[green]Processed {len(results)}/{len(tdoc_ids)} TDoc(s) in workspace '{display_name}'[/green]")



		@_workspace_app.command("delete")
		def workspace_delete(
		name: Annotated[str, typer.Argument(..., help="Workspace name")],