feat(cli): enhance structured output formatting for CLI commands (3351b510) · Commits · Jan Reimes / 3gpp-crawler

packages/3gpp-ai/threegpp_ai/cli.py

+156 −106

Original line number	Diff line number	Diff line
		@@ -7,18 +7,16 @@ and nested LightRAG commands under `rag`.
		from __future__ import annotations

		import asyncio
		import json
		import shutil
		from collections.abc import Callable
		from datetime import UTC, datetime
		from pathlib import Path
		from typing import Annotated, Any, Literal
		from typing import Annotated, Any

		import typer
		import yaml
		from dotenv import load_dotenv
		from rich.progress import MofNCompleteColumn, Progress, SpinnerColumn, TextColumn, TimeElapsedColumn
		from rich.table import Table
		from tdoc_crawler.cli.formatting import TableColumnSpec, print_structured_output
		from tdoc_crawler.config import CacheManager, resolve_cache_manager
		from tdoc_crawler.database import TDocDatabase
		from tdoc_crawler.logging import get_console, get_logger
		@@ -102,9 +100,37 @@ console = get_console()
		_logger = get_logger(__name__)


		# Reusable `--output` / `-o` CLI option annotation: the value is an
		# `OutputFormat` member and is matched case-insensitively
		# (`case_sensitive=False`).
		ProvidersOutputOption = Annotated[
		OutputFormat,
		typer.Option(
		"--output",
		"-o",
		case_sensitive=False,
		help="Output format (table, json, ison, toon, yaml)",
		),
		]


		def _print_output(
		    data: Any,
		    output_format: OutputFormat,
		    *,
		    table_title: str,
		    table_columns: list[TableColumnSpec] | None = None,
		) -> None:
		    """Render ``data`` in ``output_format`` on this module's shared console.

		    Thin adapter over ``print_structured_output``: the module-level
		    ``console`` is always supplied and every other argument is forwarded
		    unchanged.
		    """
		    formatter_options = {
		        "table_title": table_title,
		        "table_columns": table_columns,
		        "console": console,
		    }
		    print_structured_output(data, output_format, **formatter_options)


		@providers_app.command("list", help="List all available AI providers")
		def providers_list(
		output: Literal["table", "json", "yaml", "toon"] = "table",
		output: ProvidersOutputOption = OutputFormat.TABLE,
		) -> None:
		"""List all supported AI providers with their capabilities and aliases.

		@@ -146,49 +172,28 @@ def providers_list(
		}
		)

		if output == "json":
		typer.echo(json.dumps(providers_data, indent=2))
		return

		if output == "yaml":
		typer.echo(yaml.dump(providers_data, default_flow_style=False))
		return

		if output == "toon":
		for p in providers_data:
		if p["canonical"]:
		# Alias with base URL - show as "alias -> canonical (base_url)"
		typer.echo(f"{p['name']} -> {p['canonical']}: LLM={'Y' if p['llm'] else 'N'}, EMB={'Y' if p['embedding'] else 'N'}, base_url={p['base_url']}")
		else:
		typer.echo(f"{p['name']}: LLM={'Y' if p['llm'] else 'N'}, EMB={'Y' if p['embedding'] else 'N'}")
		return

		# Default: table output
		table = Table(title="AI Providers")
		table.add_column("Provider", style="cyan")
		table.add_column("LLM", style="green", justify="center")
		table.add_column("Embedding", style="yellow", justify="center")
		table.add_column("Base URL / Note", style="white")

		for p in providers_data:
		if p["canonical"]:
		# Alias with base URL
		table.add_row(
		p["name"],
		"Y" if p["llm"] else "N",
		"Y" if p["embedding"] else "N",
		f"via {p['canonical']} - {p['base_url']}",
		)
		else:
		table.add_row(
		p["name"],
		"Y" if p["llm"] else "N",
		"Y" if p["embedding"] else "N",
		"-",
		output_rows = [
		{
		"provider": provider["name"],
		"llm": "Y" if provider["llm"] else "N",
		"embedding": "Y" if provider["embedding"] else "N",
		"base_url_note": (f"via {provider['canonical']} - {provider['base_url']}" if provider["canonical"] else "-"),
		}
		for provider in providers_data
		]

		_print_output(
		output_rows,
		output,
		table_title="AI Providers",
		table_columns=[
		TableColumnSpec("provider", "Provider", style="cyan"),
		TableColumnSpec("llm", "LLM", style="green", justify="center"),
		TableColumnSpec("embedding", "Embedding", style="yellow", justify="center"),
		TableColumnSpec("base_url_note", "Base URL / Note", style="white"),
		],
		)

		console.print(table)


		@app.callback()
		def _app_init(cache_dir: CacheDirOption = None) -> None:
		@@ -548,13 +553,21 @@ def ai_convert(

		if output:
		if json_output:
		typer.echo(json.dumps({"output": str(output)}))
		_print_output(
		{"output": str(output)},
		OutputFormat.JSON,
		table_title="Convert Result",
		)
		else:
		console.print(f"[green]Converted {document_id} to {output}[/green]")
		return

		if json_output:
		typer.echo(json.dumps({"markdown": markdown_or_path}))
		_print_output(
		{"markdown": markdown_or_path},
		OutputFormat.JSON,
		table_title="Convert Result",
		)
		return

		typer.echo(markdown_or_path)
		@@ -573,7 +586,11 @@ def workspace_create(
		set_active_workspace(name)

		if json_output:
		typer.echo(json.dumps({"name": workspace.name if workspace else name, "auto_build": auto_build}))
		_print_output(
		{"name": workspace.name if workspace else name, "auto_build": auto_build},
		OutputFormat.JSON,
		table_title="Workspace Create",
		)
		return

		console.print(f"[green]Created workspace: {normalize_workspace_name(name)}[/green]")
		@@ -587,34 +604,38 @@ def workspace_list(
		) -> None:
		registry = WorkspaceRegistry.load()
		workspaces = registry.list_workspaces()

		if json_output:
		typer.echo(
		json.dumps(
		[
		workspace_rows = [
		{
		"name": entry.name,
		"is_active": entry.is_active,
		"tdoc_count": entry.tdoc_count,
		"spec_count": entry.spec_count,
		"other_count": entry.other_count,
		"active": "*" if entry.is_active else "",
		"tdocs": entry.tdoc_count,
		"specs": entry.spec_count,
		"other": entry.other_count,
		"created_at": entry.created_at,
		}
		for entry in workspaces
		],
		),
		]

		if json_output:
		_print_output(
		workspace_rows,
		OutputFormat.JSON,
		table_title="Workspaces",
		)
		return

		table = Table(title="Workspaces")
		table.add_column("Name", style="cyan")
		table.add_column("Active", style="green")
		table.add_column("TDocs", style="yellow", justify="right")
		table.add_column("Specs", style="yellow", justify="right")
		table.add_column("Other", style="yellow", justify="right")
		for ws in workspaces:
		table.add_row(ws.name, "*" if ws.is_active else "", str(ws.tdoc_count), str(ws.spec_count), str(ws.other_count))
		console.print(table)
		_print_output(
		workspace_rows,
		OutputFormat.TABLE,
		table_title="Workspaces",
		table_columns=[
		TableColumnSpec("name", "Name", style="cyan"),
		TableColumnSpec("active", "Active", style="green"),
		TableColumnSpec("tdocs", "TDocs", style="yellow", justify="right"),
		TableColumnSpec("specs", "Specs", style="yellow", justify="right"),
		TableColumnSpec("other", "Other", style="yellow", justify="right"),
		],
		)


		@workspace_app.command("query")
		@@ -652,7 +673,11 @@ def workspace_query(
		result = asyncio.run(_run())

		if json_output:
		typer.echo(json.dumps({"query": query, "mode": mode.value, "result": result}))
		_print_output(
		{"query": query, "mode": mode.value, "result": result},
		OutputFormat.JSON,
		table_title="Workspace Query",
		)
		else:
		console.print(f"\n[bold]Query:[/bold] {query}")
		console.print(f"[bold]Mode:[/bold] {mode.value}\n")
		@@ -723,18 +748,28 @@ def workspace_info(

		counts = get_workspace_member_counts(name)
		if json_output:
		typer.echo(json.dumps({"name": workspace.name, "auto_build": workspace.auto_build, "member_counts": counts}))
		_print_output(
		{"name": workspace.name, "auto_build": workspace.auto_build, "member_counts": counts},
		OutputFormat.JSON,
		table_title=f"Workspace: {workspace.name}",
		)
		return

		table = Table(title=f"Workspace: {workspace.name}")
		table.add_column("Field", style="cyan")
		table.add_column("Value", style="green")
		table.add_row("Auto-build", "Yes" if workspace.auto_build else "No")
		table.add_row("Total Members", str(counts["total"]))
		table.add_row("TDocs", str(counts["tdoc"]))
		table.add_row("Specs", str(counts["spec"]))
		table.add_row("Other", str(counts["other"]))
		console.print(table)
		_print_output(
		[
		{"field": "Auto-build", "value": "Yes" if workspace.auto_build else "No"},
		{"field": "Total Members", "value": counts["total"]},
		{"field": "TDocs", "value": counts["tdoc"]},
		{"field": "Specs", "value": counts["spec"]},
		{"field": "Other", "value": counts["other"]},
		],
		OutputFormat.TABLE,
		table_title=f"Workspace: {workspace.name}",
		table_columns=[
		TableColumnSpec("field", "Field", style="cyan"),
		TableColumnSpec("value", "Value", style="green"),
		],
		)


		@workspace_app.command("activate", help="Activate a workspace (set as current)")
		@@ -849,6 +884,7 @@ def workspace_add_members(
		f"[cyan]Processing {len(resolved_items)} item(s)...",
		total=len(resolved_items),
		)

		async def _process_items() -> None:
		nonlocal converted_count, md_extracted_count
		for item in resolved_items:
		@@ -906,10 +942,7 @@ def workspace_list_members(
		console.print(f"[red]Workspace '{workspace_name}' not found[/red]")
		raise typer.Exit(1)

		if json_output:
		typer.echo(
		json.dumps(
		[
		member_rows = [
		{
		"source_item_id": entry.source_item_id,
		"source_path": entry.source_path,
		@@ -918,19 +951,36 @@ def workspace_list_members(
		"added_at": entry.added_at,
		}
		for entry in members
		],
		),
		]

		if json_output:
		_print_output(
		member_rows,
		OutputFormat.JSON,
		table_title=f"Members: {workspace_name}",
		)
		return

		table = Table(title=f"Members: {workspace_name}")
		table.add_column("Source ID", style="cyan")
		table.add_column("Kind", style="green")
		table.add_column("Path", style="white")
		table.add_column("Active", style="yellow")
		for entry in members:
		table.add_row(entry.source_item_id, entry.source_kind.value, entry.source_path, "Yes" if entry.is_active else "No")
		console.print(table)
		table_rows = [
		{
		"source_id": row["source_item_id"],
		"kind": row["source_kind"],
		"path": row["source_path"],
		"active": "Yes" if row["is_active"] else "No",
		}
		for row in member_rows
		]
		_print_output(
		table_rows,
		OutputFormat.TABLE,
		table_title=f"Members: {workspace_name}",
		table_columns=[
		TableColumnSpec("source_id", "Source ID", style="cyan"),
		TableColumnSpec("kind", "Kind", style="green"),
		TableColumnSpec("path", "Path", style="white"),
		TableColumnSpec("active", "Active", style="yellow"),
		],
		)


		@workspace_app.command("process", help="Process workspace members (checkout, convert, embed)")
		@@ -990,7 +1040,7 @@ def workspace_process(
		}

		if json_output:
		typer.echo(json.dumps(payload))
		_print_output(payload, OutputFormat.JSON, table_title="Workspace Process")
		return

		console.print(f"[green]Processed: {success_count}[/green]")

src/tdoc_crawler/cli/formatting.py

+116 −13

Original line number	Diff line number	Diff line
		@@ -14,17 +14,103 @@ Usage:
		from __future__ import annotations

		import json
		from typing import Any
		from dataclasses import dataclass
		from typing import Any, Literal

		import pandas as pd
		import yaml
		from ison_parser import dumps as ison_dumps
		from ison_parser import from_dict as ison_from_dict
		from rich.console import Console
		from rich.table import Table
		from toon_format import encode as toon_encode

		from tdoc_crawler.logging import get_console
		from tdoc_crawler.models.base import OutputFormat


		@dataclass(frozen=True)
		class TableColumnSpec:
		    """Immutable description of a single column in a rendered table.

		    Attributes:
		        key: Name of the DataFrame column that supplies the cell values.
		        header: Heading text shown for the column.
		        style: Style passed to the table column; defaults to ``"white"``.
		        justify: Horizontal alignment passed to the table column; defaults
		            to ``"left"``.
		    """

		    key: str
		    header: str
		    style: str = "white"
		    justify: Literal["default", "left", "center", "right", "full"] = "left"


		def _normalize_to_dataframe(data: Any) -> tuple[pd.DataFrame, str]:
		"""Normalize arbitrary structured data to a DataFrame.

		Returns:
		Tuple of (dataframe, input_kind) where input_kind is one of:
		"records", "dict", "scalar", "dataframe".
		"""
		if isinstance(data, pd.DataFrame):
		return data.copy(), "dataframe"

		if isinstance(data, list):
		return pd.DataFrame(data), "records"

		if isinstance(data, dict):
		return pd.DataFrame([data]), "dict"

		return pd.DataFrame([{"value": data}]), "scalar"


		def _dataframe_to_payload(df: pd.DataFrame, input_kind: str) -> Any:
		"""Convert normalized DataFrame to a payload shape for serializers."""
		records = df.to_dict(orient="records")

		if input_kind == "dict":
		return records[0] if records else {}

		if input_kind == "scalar":
		if not records:
		return None
		return records[0].get("value")

		return records


		def _format_cell(value: Any) -> str:
		"""Format one table cell from DataFrame value with empty handling."""
		if value is None:
		return "-"

		if isinstance(value, float) and pd.isna(value):
		return "-"

		return str(value)


		def _build_table(
		    df: pd.DataFrame,
		    *,
		    title: str,
		    table_columns: list[TableColumnSpec] | None,
		) -> Table:
		    """Build a Rich table from a DataFrame and optional column specs.

		    Args:
		        df: Data to render, one table row per DataFrame row.
		        title: Table title.
		        table_columns: Columns to show, in order, with header/style/justify.
		            When empty or ``None``, every DataFrame column is shown with its
		            own name as header and the ``"white"`` style.

		    Returns:
		        The populated table. A spec whose ``key`` is not a DataFrame column
		        yields the missing-value placeholder in every row.
		    """
		    table = Table(title=title)

		    if table_columns:
		        for column in table_columns:
		            table.add_column(column.header, style=column.style, justify=column.justify)
		        # Resolve each requested key to its column position once, up front.
		        positions = {name: index for index, name in enumerate(df.columns)}
		        cell_indices = [positions.get(column.key) for column in table_columns]
		    else:
		        for column_name in df.columns:
		            table.add_column(str(column_name), style="white")
		        cell_indices = list(range(len(df.columns)))

		    # itertuples() keeps each column's own dtype. iterrows() would coerce a
		    # whole row to one dtype, e.g. showing ints as "2.0" next to a float column.
		    for values in df.itertuples(index=False, name=None):
		        table.add_row(*[_format_cell(None if index is None else values[index]) for index in cell_indices])

		    return table


		def format_output(data: Any, output_format: OutputFormat) -> str:
		"""Format structured data to the specified output format.

		@@ -41,27 +127,44 @@ def format_output(data: Any, output_format: OutputFormat) -> str:
		Raises:
		ValueError: If the output format is not supported
		"""
		# Convert to DataFrame for consistent tabular handling
		if isinstance(data, list):
		df = pd.DataFrame(data)
		elif isinstance(data, dict):
		df = data if any(isinstance(v, (list, dict)) for v in data.values()) else pd.DataFrame([data])
		else:
		df = data
		df, input_kind = _normalize_to_dataframe(data)
		payload = _dataframe_to_payload(df, input_kind)

		match output_format:
		case OutputFormat.JSON:
		result = df.to_json(indent=2, orient="records") if isinstance(df, pd.DataFrame) else json.dumps(df, indent=2, default=str)
		result = json.dumps(payload, indent=2, default=str)
		return result or "{}"
		case OutputFormat.ISON:
		records = df.to_dict(orient="records") if isinstance(df, pd.DataFrame) else df
		records = payload if isinstance(payload, list) else [payload]
		return ison_dumps(ison_from_dict({"items": records}))
		case OutputFormat.TOON:
		return toon_encode(df.to_dict(orient="records") if isinstance(df, pd.DataFrame) else df)
		return toon_encode(payload)
		case OutputFormat.YAML:
		return yaml.dump(df.to_dict(orient="records") if isinstance(df, pd.DataFrame) else df, sort_keys=False, default_flow_style=False)
		return yaml.dump(payload, sort_keys=False, default_flow_style=False)
		case OutputFormat.TABLE:
		return df.to_string(index=False)
		case _:
		raise ValueError(f"Unsupported output format: {output_format}. Use one of: {', '.join(f.value for f in OutputFormat)}")


		__all__ = ["format_output"]
		def print_structured_output(
		    data: Any,
		    output_format: OutputFormat,
		    *,
		    table_title: str = "Results",
		    table_columns: list[TableColumnSpec] | None = None,
		    console: Console | None = None,
		) -> None:
		    """Print structured data as a Rich table or as serialized text.

		    Args:
		        data: Records, a single dict, a scalar or a DataFrame.
		        output_format: Target format. ``OutputFormat.TABLE`` renders a Rich
		            table; every other format is serialized by ``format_output``.
		        table_title: Title used for table output.
		        table_columns: Optional column specs used for table output.
		        console: Console to print to; defaults to ``get_console()``.
		    """
		    output_console = console or get_console()

		    if output_format is OutputFormat.TABLE:
		        df, _ = _normalize_to_dataframe(data)
		        output_console.print(_build_table(df, title=table_title, table_columns=table_columns))
		        return

		    # Pass the original object, not a pre-normalized DataFrame: format_output
		    # needs the input kind so a dict serializes as an object, not a one-item list.
		    rendered = format_output(data, output_format)

		    # Serialized text must reach the stream verbatim: no markup parsing of
		    # "[...]" sequences, no highlighting, and no wrapping of long lines.
		    output_console.print(rendered, markup=False, highlight=False, soft_wrap=True)


		__all__ = ["TableColumnSpec", "format_output", "print_structured_output"]

src/tdoc_crawler/cli/printing.py

+107 −83

File changed.

Preview size limit exceeded, changes collapsed.

tests/test_cli.py

+40 −30

Original line number	Diff line number	Diff line
		@@ -42,12 +42,14 @@ class TestCrawlCommand:

		mock_crawler = MagicMock()
		mock_crawler_class.return_value = mock_crawler
		mock_crawler.crawl = AsyncMock(return_value=MagicMock(
		mock_crawler.crawl = AsyncMock(
		return_value=MagicMock(
		processed=10,
		inserted=10,
		updated=0,
		errors=[],
		))
		)
		)

		result = runner.invoke(
		app,
		@@ -76,12 +78,14 @@ class TestCrawlCommand:

		mock_crawler = MagicMock()
		mock_crawler_class.return_value = mock_crawler
		mock_crawler.crawl = AsyncMock(return_value=MagicMock(
		mock_crawler.crawl = AsyncMock(
		return_value=MagicMock(
		processed=5,
		inserted=5,
		updated=0,
		errors=[],
		))
		)
		)

		result = runner.invoke(
		app,
		@@ -111,12 +115,14 @@ class TestCrawlMeetingsCommand:

		mock_crawler = MagicMock()
		mock_crawler_class.return_value = mock_crawler
		mock_crawler.crawl = AsyncMock(return_value=MagicMock(
		mock_crawler.crawl = AsyncMock(
		return_value=MagicMock(
		processed=20,
		inserted=20,
		updated=0,
		errors=[],
		))
		)
		)

		result = runner.invoke(
		app,
		@@ -732,12 +738,14 @@ class TestEnvironmentVariables:

		mock_crawler = MagicMock()
		mock_crawler_class.return_value = mock_crawler
		mock_crawler.crawl = AsyncMock(return_value=MagicMock(
		mock_crawler.crawl = AsyncMock(
		return_value=MagicMock(
		processed=0,
		inserted=0,
		updated=0,
		errors=[],
		))
		)
		)

		# Set environment variable via monkeypatch
		monkeypatch.setenv("TDC_CACHE_DIR", str(test_cache_dir))
		@@ -763,12 +771,14 @@ class TestEnvironmentVariables:

		mock_crawler = MagicMock()
		mock_crawler_class.return_value = mock_crawler
		mock_crawler.crawl = AsyncMock(return_value=MagicMock(
		mock_crawler.crawl = AsyncMock(
		return_value=MagicMock(
		processed=0,
		inserted=0,
		updated=0,
		errors=[],
		))
		)
		)

		# Set environment variable via monkeypatch
		monkeypatch.setenv("TDC_WORKERS", "8")

tests/test_formatting.py

0 → 100644

+70 −0

Original line number	Diff line number	Diff line
		"""Tests for CLI structured output formatting."""

		from __future__ import annotations

		from rich.console import Console

		from tdoc_crawler.cli.formatting import TableColumnSpec, format_output, print_structured_output
		from tdoc_crawler.models.base import OutputFormat


		def test_format_output_json_from_records() -> None:
		    """A list of dicts renders as a JSON array carrying each record's fields."""
		    records = [{"name": "alpha", "count": 2}]

		    rendered = format_output(records, OutputFormat.JSON)

		    assert '"name": "alpha"' in rendered
		    assert rendered.lstrip().startswith("[")


		def test_format_output_json_from_dict() -> None:
		    """A single dict stays a JSON object after the DataFrame round trip."""
		    mapping = {"name": "alpha", "count": 2}

		    rendered = format_output(mapping, OutputFormat.JSON)

		    assert '"name": "alpha"' in rendered
		    assert rendered.lstrip().startswith("{")


		def test_format_output_yaml_from_records() -> None:
		    """YAML rendering of records exposes every field as a ``key: value`` pair."""
		    records = [{"name": "alpha", "count": 2}]

		    rendered = format_output(records, OutputFormat.YAML)

		    for expected_line in ("name: alpha", "count: 2"):
		        assert expected_line in rendered


		def test_format_output_ison_contains_items_wrapper() -> None:
		    """ISON rendering wraps the records in the shared ``items`` envelope."""
		    records = [{"name": "alpha"}]

		    rendered = format_output(records, OutputFormat.ISON)

		    for expected_fragment in ("items", "alpha"):
		        assert expected_fragment in rendered


		def test_format_output_table_text() -> None:
		    """TABLE format returns plain text that includes the column names."""
		    records = [{"name": "alpha", "count": 2}]

		    rendered = format_output(records, OutputFormat.TABLE)

		    for column_name in ("name", "count"):
		        assert column_name in rendered


		def test_print_structured_output_table_with_column_specs() -> None:
		    """The table printer renders the title, the spec headers and the cell data."""
		    recording_console = Console(record=True, width=120)
		    column_specs = [
		        TableColumnSpec("name", "Name", style="cyan"),
		        TableColumnSpec("count", "Count", style="yellow", justify="right"),
		    ]

		    print_structured_output(
		        [{"name": "alpha", "count": 2}],
		        OutputFormat.TABLE,
		        table_title="Example",
		        table_columns=column_specs,
		        console=recording_console,
		    )

		    rendered = recording_console.export_text()
		    for expected_fragment in ("Example", "Name", "Count", "alpha"):
		        assert expected_fragment in rendered