Commit 3351b510 authored by Jan Reimes's avatar Jan Reimes
Browse files

feat(cli): enhance structured output formatting for CLI commands

* Implement structured output support for various formats (table, json, ison, toon, yaml).
* Refactor output printing logic to utilize a shared formatter pipeline.
* Update CLI commands to use the new structured output functions.
* Add tests for structured output formatting to ensure correctness.
parent 4bd7a9f9
Loading
Loading
Loading
Loading
+156 −106
Original line number Diff line number Diff line
@@ -7,18 +7,16 @@ and nested LightRAG commands under `rag`.
from __future__ import annotations

import asyncio
import json
import shutil
from collections.abc import Callable
from datetime import UTC, datetime
from pathlib import Path
from typing import Annotated, Any, Literal
from typing import Annotated, Any

import typer
import yaml
from dotenv import load_dotenv
from rich.progress import MofNCompleteColumn, Progress, SpinnerColumn, TextColumn, TimeElapsedColumn
from rich.table import Table
from tdoc_crawler.cli.formatting import TableColumnSpec, print_structured_output
from tdoc_crawler.config import CacheManager, resolve_cache_manager
from tdoc_crawler.database import TDocDatabase
from tdoc_crawler.logging import get_console, get_logger
@@ -102,9 +100,37 @@ console = get_console()
_logger = get_logger(__name__)


ProvidersOutputOption = Annotated[
    OutputFormat,
    typer.Option(
        "--output",
        "-o",
        case_sensitive=False,
        help="Output format (table, json, ison, toon, yaml)",
    ),
]


def _print_output(
    data: Any,
    output_format: OutputFormat,
    *,
    table_title: str,
    table_columns: list[TableColumnSpec] | None = None,
) -> None:
    """Render command output on the module-level console via the shared formatter.

    Args:
        data: Structured payload to display.
        output_format: Requested output format.
        table_title: Title forwarded to the formatter for table rendering.
        table_columns: Optional column specs forwarded for table rendering.
    """
    # Every command in this module prints to the same console instance.
    render_options = {
        "table_title": table_title,
        "table_columns": table_columns,
        "console": console,
    }
    print_structured_output(data, output_format, **render_options)


@providers_app.command("list", help="List all available AI providers")
def providers_list(
    output: Literal["table", "json", "yaml", "toon"] = "table",
    output: ProvidersOutputOption = OutputFormat.TABLE,
) -> None:
    """List all supported AI providers with their capabilities and aliases.

@@ -146,49 +172,28 @@ def providers_list(
            }
        )

    if output == "json":
        typer.echo(json.dumps(providers_data, indent=2))
        return

    if output == "yaml":
        typer.echo(yaml.dump(providers_data, default_flow_style=False))
        return

    if output == "toon":
        for p in providers_data:
            if p["canonical"]:
                # Alias with base URL - show as "alias -> canonical (base_url)"
                typer.echo(f"{p['name']} -> {p['canonical']}: LLM={'Y' if p['llm'] else 'N'}, EMB={'Y' if p['embedding'] else 'N'}, base_url={p['base_url']}")
            else:
                typer.echo(f"{p['name']}: LLM={'Y' if p['llm'] else 'N'}, EMB={'Y' if p['embedding'] else 'N'}")
        return

    # Default: table output
    table = Table(title="AI Providers")
    table.add_column("Provider", style="cyan")
    table.add_column("LLM", style="green", justify="center")
    table.add_column("Embedding", style="yellow", justify="center")
    table.add_column("Base URL / Note", style="white")

    for p in providers_data:
        if p["canonical"]:
            # Alias with base URL
            table.add_row(
                p["name"],
                "Y" if p["llm"] else "N",
                "Y" if p["embedding"] else "N",
                f"via {p['canonical']} - {p['base_url']}",
            )
        else:
            table.add_row(
                p["name"],
                "Y" if p["llm"] else "N",
                "Y" if p["embedding"] else "N",
                "-",
    output_rows = [
        {
            "provider": provider["name"],
            "llm": "Y" if provider["llm"] else "N",
            "embedding": "Y" if provider["embedding"] else "N",
            "base_url_note": (f"via {provider['canonical']} - {provider['base_url']}" if provider["canonical"] else "-"),
        }
        for provider in providers_data
    ]

    _print_output(
        output_rows,
        output,
        table_title="AI Providers",
        table_columns=[
            TableColumnSpec("provider", "Provider", style="cyan"),
            TableColumnSpec("llm", "LLM", style="green", justify="center"),
            TableColumnSpec("embedding", "Embedding", style="yellow", justify="center"),
            TableColumnSpec("base_url_note", "Base URL / Note", style="white"),
        ],
    )

    console.print(table)


@app.callback()
def _app_init(cache_dir: CacheDirOption = None) -> None:
@@ -548,13 +553,21 @@ def ai_convert(

    if output:
        if json_output:
            typer.echo(json.dumps({"output": str(output)}))
            _print_output(
                {"output": str(output)},
                OutputFormat.JSON,
                table_title="Convert Result",
            )
        else:
            console.print(f"[green]Converted {document_id} to {output}[/green]")
        return

    if json_output:
        typer.echo(json.dumps({"markdown": markdown_or_path}))
        _print_output(
            {"markdown": markdown_or_path},
            OutputFormat.JSON,
            table_title="Convert Result",
        )
        return

    typer.echo(markdown_or_path)
@@ -573,7 +586,11 @@ def workspace_create(
        set_active_workspace(name)

    if json_output:
        typer.echo(json.dumps({"name": workspace.name if workspace else name, "auto_build": auto_build}))
        _print_output(
            {"name": workspace.name if workspace else name, "auto_build": auto_build},
            OutputFormat.JSON,
            table_title="Workspace Create",
        )
        return

    console.print(f"[green]Created workspace: {normalize_workspace_name(name)}[/green]")
@@ -587,34 +604,38 @@ def workspace_list(
) -> None:
    registry = WorkspaceRegistry.load()
    workspaces = registry.list_workspaces()

    if json_output:
        typer.echo(
            json.dumps(
                [
    workspace_rows = [
        {
            "name": entry.name,
                        "is_active": entry.is_active,
                        "tdoc_count": entry.tdoc_count,
                        "spec_count": entry.spec_count,
                        "other_count": entry.other_count,
            "active": "*" if entry.is_active else "",
            "tdocs": entry.tdoc_count,
            "specs": entry.spec_count,
            "other": entry.other_count,
            "created_at": entry.created_at,
        }
        for entry in workspaces
                ],
            ),
    ]

    if json_output:
        _print_output(
            workspace_rows,
            OutputFormat.JSON,
            table_title="Workspaces",
        )
        return

    table = Table(title="Workspaces")
    table.add_column("Name", style="cyan")
    table.add_column("Active", style="green")
    table.add_column("TDocs", style="yellow", justify="right")
    table.add_column("Specs", style="yellow", justify="right")
    table.add_column("Other", style="yellow", justify="right")
    for ws in workspaces:
        table.add_row(ws.name, "*" if ws.is_active else "", str(ws.tdoc_count), str(ws.spec_count), str(ws.other_count))
    console.print(table)
    _print_output(
        workspace_rows,
        OutputFormat.TABLE,
        table_title="Workspaces",
        table_columns=[
            TableColumnSpec("name", "Name", style="cyan"),
            TableColumnSpec("active", "Active", style="green"),
            TableColumnSpec("tdocs", "TDocs", style="yellow", justify="right"),
            TableColumnSpec("specs", "Specs", style="yellow", justify="right"),
            TableColumnSpec("other", "Other", style="yellow", justify="right"),
        ],
    )


@workspace_app.command("query")
@@ -652,7 +673,11 @@ def workspace_query(
    result = asyncio.run(_run())

    if json_output:
        typer.echo(json.dumps({"query": query, "mode": mode.value, "result": result}))
        _print_output(
            {"query": query, "mode": mode.value, "result": result},
            OutputFormat.JSON,
            table_title="Workspace Query",
        )
    else:
        console.print(f"\n[bold]Query:[/bold] {query}")
        console.print(f"[bold]Mode:[/bold] {mode.value}\n")
@@ -723,18 +748,28 @@ def workspace_info(

    counts = get_workspace_member_counts(name)
    if json_output:
        typer.echo(json.dumps({"name": workspace.name, "auto_build": workspace.auto_build, "member_counts": counts}))
        _print_output(
            {"name": workspace.name, "auto_build": workspace.auto_build, "member_counts": counts},
            OutputFormat.JSON,
            table_title=f"Workspace: {workspace.name}",
        )
        return

    table = Table(title=f"Workspace: {workspace.name}")
    table.add_column("Field", style="cyan")
    table.add_column("Value", style="green")
    table.add_row("Auto-build", "Yes" if workspace.auto_build else "No")
    table.add_row("Total Members", str(counts["total"]))
    table.add_row("TDocs", str(counts["tdoc"]))
    table.add_row("Specs", str(counts["spec"]))
    table.add_row("Other", str(counts["other"]))
    console.print(table)
    _print_output(
        [
            {"field": "Auto-build", "value": "Yes" if workspace.auto_build else "No"},
            {"field": "Total Members", "value": counts["total"]},
            {"field": "TDocs", "value": counts["tdoc"]},
            {"field": "Specs", "value": counts["spec"]},
            {"field": "Other", "value": counts["other"]},
        ],
        OutputFormat.TABLE,
        table_title=f"Workspace: {workspace.name}",
        table_columns=[
            TableColumnSpec("field", "Field", style="cyan"),
            TableColumnSpec("value", "Value", style="green"),
        ],
    )


@workspace_app.command("activate", help="Activate a workspace (set as current)")
@@ -849,6 +884,7 @@ def workspace_add_members(
            f"[cyan]Processing {len(resolved_items)} item(s)...",
            total=len(resolved_items),
        )

        async def _process_items() -> None:
            nonlocal converted_count, md_extracted_count
            for item in resolved_items:
@@ -906,10 +942,7 @@ def workspace_list_members(
        console.print(f"[red]Workspace '{workspace_name}' not found[/red]")
        raise typer.Exit(1)

    if json_output:
        typer.echo(
            json.dumps(
                [
    member_rows = [
        {
            "source_item_id": entry.source_item_id,
            "source_path": entry.source_path,
@@ -918,19 +951,36 @@ def workspace_list_members(
            "added_at": entry.added_at,
        }
        for entry in members
                ],
            ),
    ]

    if json_output:
        _print_output(
            member_rows,
            OutputFormat.JSON,
            table_title=f"Members: {workspace_name}",
        )
        return

    table = Table(title=f"Members: {workspace_name}")
    table.add_column("Source ID", style="cyan")
    table.add_column("Kind", style="green")
    table.add_column("Path", style="white")
    table.add_column("Active", style="yellow")
    for entry in members:
        table.add_row(entry.source_item_id, entry.source_kind.value, entry.source_path, "Yes" if entry.is_active else "No")
    console.print(table)
    table_rows = [
        {
            "source_id": row["source_item_id"],
            "kind": row["source_kind"],
            "path": row["source_path"],
            "active": "Yes" if row["is_active"] else "No",
        }
        for row in member_rows
    ]
    _print_output(
        table_rows,
        OutputFormat.TABLE,
        table_title=f"Members: {workspace_name}",
        table_columns=[
            TableColumnSpec("source_id", "Source ID", style="cyan"),
            TableColumnSpec("kind", "Kind", style="green"),
            TableColumnSpec("path", "Path", style="white"),
            TableColumnSpec("active", "Active", style="yellow"),
        ],
    )


@workspace_app.command("process", help="Process workspace members (checkout, convert, embed)")
@@ -990,7 +1040,7 @@ def workspace_process(
    }

    if json_output:
        typer.echo(json.dumps(payload))
        _print_output(payload, OutputFormat.JSON, table_title="Workspace Process")
        return

    console.print(f"[green]Processed: {success_count}[/green]")
+116 −13
Original line number Diff line number Diff line
@@ -14,17 +14,103 @@ Usage:
from __future__ import annotations

import json
from typing import Any
from dataclasses import dataclass
from typing import Any, Literal

import pandas as pd
import yaml
from ison_parser import dumps as ison_dumps
from ison_parser import from_dict as ison_from_dict
from rich.console import Console
from rich.table import Table
from toon_format import encode as toon_encode

from tdoc_crawler.logging import get_console
from tdoc_crawler.models.base import OutputFormat


@dataclass(frozen=True)
class TableColumnSpec:
    """Describes one table column for DataFrame-backed rendering.

    Instances are immutable (frozen dataclass). A list of specs selects which
    row keys are shown in a rendered table, in which order, and how each
    column is labelled and styled.
    """

    # Key used to look up this column's cell value in each data row.
    key: str
    # Heading text displayed for the column.
    header: str
    # Style name passed to the table column; defaults to "white".
    style: str = "white"
    # Horizontal alignment passed to the table column; defaults to "left".
    justify: Literal["default", "left", "center", "right", "full"] = "left"


def _normalize_to_dataframe(data: Any) -> tuple[pd.DataFrame, str]:
    """Normalize arbitrary structured data to a DataFrame.

    Returns:
        Tuple of (dataframe, input_kind) where input_kind is one of:
        "records", "dict", "scalar", "dataframe".
    """
    if isinstance(data, pd.DataFrame):
        return data.copy(), "dataframe"

    if isinstance(data, list):
        return pd.DataFrame(data), "records"

    if isinstance(data, dict):
        return pd.DataFrame([data]), "dict"

    return pd.DataFrame([{"value": data}]), "scalar"


def _dataframe_to_payload(df: pd.DataFrame, input_kind: str) -> Any:
    """Convert a normalized DataFrame back to a serializer-friendly payload.

    Missing cells (NaN, ``pd.NA``, ``pd.NaT``) are mapped to ``None`` so that
    serializers emit a proper null instead of a bare ``NaN`` token, which is
    not valid JSON. Such cells appear whenever the input records do not all
    share the same keys.

    Args:
        df: Frame produced by the normalization step.
        input_kind: Shape of the original input: ``"dict"``, ``"scalar"``,
            or anything else for a list of records.

    Returns:
        A single dict for ``"dict"`` input (``{}`` if the frame is empty), the
        unwrapped ``"value"`` cell for ``"scalar"`` input (``None`` if the
        frame is empty), otherwise a list of record dicts.
    """
    records = [
        {
            # is_scalar guards pd.isna, which returns an array for list-like cells.
            key: None if (pd.api.types.is_scalar(value) and pd.isna(value)) else value
            for key, value in record.items()
        }
        for record in df.to_dict(orient="records")
    ]

    if input_kind == "dict":
        return records[0] if records else {}

    if input_kind == "scalar":
        return records[0].get("value") if records else None

    return records


def _format_cell(value: Any) -> str:
    """Format one table cell, rendering any missing value as ``"-"``.

    Args:
        value: Raw cell value taken from a DataFrame row.

    Returns:
        ``"-"`` for ``None``, NaN, ``pd.NA`` and ``pd.NaT``; otherwise
        ``str(value)``.
    """
    # is_scalar guards pd.isna, which returns an array (not a bool) for
    # list-like cells such as nested lists.
    if pd.api.types.is_scalar(value) and pd.isna(value):
        return "-"

    return str(value)


def _build_table(
    df: pd.DataFrame,
    *,
    title: str,
    table_columns: list[TableColumnSpec] | None,
) -> Table:
    """Build a Rich table from a DataFrame and optional column specs.

    Args:
        df: Data to render, one table row per DataFrame row.
        title: Title shown above the table.
        table_columns: When given (and non-empty), selects, orders, labels and
            styles the displayed columns; a spec whose key is not a DataFrame
            column renders as empty cells. Otherwise every DataFrame column is
            shown in white, using its name as the header.

    Returns:
        The populated table.
    """
    table = Table(title=title)

    if table_columns:
        for column in table_columns:
            table.add_column(column.header, style=column.style, justify=column.justify)
        position_by_name = {name: position for position, name in enumerate(df.columns)}
        # None marks a requested key that the data does not contain.
        positions = [position_by_name.get(column.key) for column in table_columns]
    else:
        for column_name in df.columns:
            table.add_column(str(column_name), style="white")
        positions = list(range(len(df.columns)))

    # itertuples keeps each column's own dtype. iterrows would coerce every row
    # to a single dtype, so an int next to a float column would print as "2.0".
    for values in df.itertuples(index=False, name=None):
        table.add_row(*[_format_cell(None if position is None else values[position]) for position in positions])

    return table


def format_output(data: Any, output_format: OutputFormat) -> str:
    """Format structured data to the specified output format.

@@ -41,27 +127,44 @@ def format_output(data: Any, output_format: OutputFormat) -> str:
    Raises:
        ValueError: If the output format is not supported
    """
    # Convert to DataFrame for consistent tabular handling
    if isinstance(data, list):
        df = pd.DataFrame(data)
    elif isinstance(data, dict):
        df = data if any(isinstance(v, (list, dict)) for v in data.values()) else pd.DataFrame([data])
    else:
        df = data
    df, input_kind = _normalize_to_dataframe(data)
    payload = _dataframe_to_payload(df, input_kind)

    match output_format:
        case OutputFormat.JSON:
            result = df.to_json(indent=2, orient="records") if isinstance(df, pd.DataFrame) else json.dumps(df, indent=2, default=str)
            result = json.dumps(payload, indent=2, default=str)
            return result or "{}"
        case OutputFormat.ISON:
            records = df.to_dict(orient="records") if isinstance(df, pd.DataFrame) else df
            records = payload if isinstance(payload, list) else [payload]
            return ison_dumps(ison_from_dict({"items": records}))
        case OutputFormat.TOON:
            return toon_encode(df.to_dict(orient="records") if isinstance(df, pd.DataFrame) else df)
            return toon_encode(payload)
        case OutputFormat.YAML:
            return yaml.dump(df.to_dict(orient="records") if isinstance(df, pd.DataFrame) else df, sort_keys=False, default_flow_style=False)
            return yaml.dump(payload, sort_keys=False, default_flow_style=False)
        case OutputFormat.TABLE:
            return df.to_string(index=False)
        case _:
            raise ValueError(f"Unsupported output format: {output_format}. Use one of: {', '.join(f.value for f in OutputFormat)}")


__all__ = ["format_output"]
def print_structured_output(
    data: Any,
    output_format: OutputFormat,
    *,
    table_title: str = "Results",
    table_columns: list[TableColumnSpec] | None = None,
    console: Console | None = None,
) -> None:
    """Print structured output through the shared formatter pipeline.

    Args:
        data: Payload to print (records list, dict, DataFrame or scalar).
        output_format: Target format. ``OutputFormat.TABLE`` renders a Rich
            table; every other format is serialized by ``format_output``.
        table_title: Title used for table rendering.
        table_columns: Optional column specs used for table rendering.
        console: Console to print to; falls back to ``get_console()``.
    """
    output_console = console or get_console()

    if output_format is OutputFormat.TABLE:
        df, _ = _normalize_to_dataframe(data)
        table = _build_table(df, title=table_title, table_columns=table_columns)
        output_console.print(table)
        return

    # Hand the ORIGINAL data to format_output. Passing the normalized DataFrame
    # loses the input shape, so a dict would serialize as a one-element list
    # instead of an object.
    rendered = format_output(data, output_format)

    # Serialized text must reach the terminal or pipe verbatim: disable Rich
    # markup and emoji substitution, syntax highlighting and hard wrapping,
    # any of which could corrupt JSON/YAML.
    output_console.print(rendered, markup=False, highlight=False, emoji=False, soft_wrap=True)


__all__ = ["TableColumnSpec", "format_output", "print_structured_output"]
+107 −83

File changed.

Preview size limit exceeded, changes collapsed.

+40 −30
Original line number Diff line number Diff line
@@ -42,12 +42,14 @@ class TestCrawlCommand:

        mock_crawler = MagicMock()
        mock_crawler_class.return_value = mock_crawler
        mock_crawler.crawl = AsyncMock(return_value=MagicMock(
        mock_crawler.crawl = AsyncMock(
            return_value=MagicMock(
                processed=10,
                inserted=10,
                updated=0,
                errors=[],
        ))
            )
        )

        result = runner.invoke(
            app,
@@ -76,12 +78,14 @@ class TestCrawlCommand:

        mock_crawler = MagicMock()
        mock_crawler_class.return_value = mock_crawler
        mock_crawler.crawl = AsyncMock(return_value=MagicMock(
        mock_crawler.crawl = AsyncMock(
            return_value=MagicMock(
                processed=5,
                inserted=5,
                updated=0,
                errors=[],
        ))
            )
        )

        result = runner.invoke(
            app,
@@ -111,12 +115,14 @@ class TestCrawlMeetingsCommand:

        mock_crawler = MagicMock()
        mock_crawler_class.return_value = mock_crawler
        mock_crawler.crawl = AsyncMock(return_value=MagicMock(
        mock_crawler.crawl = AsyncMock(
            return_value=MagicMock(
                processed=20,
                inserted=20,
                updated=0,
                errors=[],
        ))
            )
        )

        result = runner.invoke(
            app,
@@ -732,12 +738,14 @@ class TestEnvironmentVariables:

        mock_crawler = MagicMock()
        mock_crawler_class.return_value = mock_crawler
        mock_crawler.crawl = AsyncMock(return_value=MagicMock(
        mock_crawler.crawl = AsyncMock(
            return_value=MagicMock(
                processed=0,
                inserted=0,
                updated=0,
                errors=[],
        ))
            )
        )

        # Set environment variable via monkeypatch
        monkeypatch.setenv("TDC_CACHE_DIR", str(test_cache_dir))
@@ -763,12 +771,14 @@ class TestEnvironmentVariables:

        mock_crawler = MagicMock()
        mock_crawler_class.return_value = mock_crawler
        mock_crawler.crawl = AsyncMock(return_value=MagicMock(
        mock_crawler.crawl = AsyncMock(
            return_value=MagicMock(
                processed=0,
                inserted=0,
                updated=0,
                errors=[],
        ))
            )
        )

        # Set environment variable via monkeypatch
        monkeypatch.setenv("TDC_WORKERS", "8")
+70 −0
Original line number Diff line number Diff line
"""Tests for CLI structured output formatting."""

from __future__ import annotations

from rich.console import Console

from tdoc_crawler.cli.formatting import TableColumnSpec, format_output, print_structured_output
from tdoc_crawler.models.base import OutputFormat


def test_format_output_json_from_records() -> None:
    """A list of dicts should be emitted as a JSON array of records."""
    records = [{"name": "alpha", "count": 2}]

    rendered = format_output(records, OutputFormat.JSON)

    assert rendered.strip().startswith("[")
    assert '"name": "alpha"' in rendered


def test_format_output_json_from_dict() -> None:
    """A single dict should stay a JSON object after DataFrame normalization."""
    record = {"name": "alpha", "count": 2}

    rendered = format_output(record, OutputFormat.JSON)

    assert rendered.strip().startswith("{")
    assert '"name": "alpha"' in rendered


def test_format_output_yaml_from_records() -> None:
    """YAML output should contain every key/value pair of the records."""
    rendered = format_output([{"name": "alpha", "count": 2}], OutputFormat.YAML)

    for expected_line in ("name: alpha", "count: 2"):
        assert expected_line in rendered


def test_format_output_ison_contains_items_wrapper() -> None:
    """ISON output should wrap the records in the shared ``items`` envelope."""
    rendered = format_output([{"name": "alpha"}], OutputFormat.ISON)

    for expected_fragment in ("items", "alpha"):
        assert expected_fragment in rendered


def test_format_output_table_text() -> None:
    """Table format should return text that includes the column names."""
    rendered = format_output([{"name": "alpha", "count": 2}], OutputFormat.TABLE)

    for column_name in ("name", "count"):
        assert column_name in rendered


def test_print_structured_output_table_with_column_specs() -> None:
    """The table printer should render the title, headers and cell values."""
    recording_console = Console(record=True, width=120)
    column_specs = [
        TableColumnSpec("name", "Name", style="cyan"),
        TableColumnSpec("count", "Count", style="yellow", justify="right"),
    ]

    print_structured_output(
        [{"name": "alpha", "count": 2}],
        OutputFormat.TABLE,
        table_title="Example",
        table_columns=column_specs,
        console=recording_console,
    )

    # export_text returns everything the recording console has captured.
    rendered = recording_console.export_text()
    for expected_fragment in ("Example", "Name", "Count", "alpha"):
        assert expected_fragment in rendered