Commit 265c8a82 authored by Jan Reimes's avatar Jan Reimes
Browse files

refactor(3gpp-ai): modularize CLI into subpackage and simplify extraction pipeline

Replace monolithic cli.py (~1010 lines) with focused modules:

- cli/__init__.py     - wires workspace/hybrid/config sub-typers

- cli/_shared.py      - console, progress bars, workspace resolution

- cli/_workspace.py   - checkout, process, convert business logic

- cli/_workspace_commands.py - workspace create/list/activate/process/delete

- cli/_commands.py    - summarize, convert, clear top-level commands

- cli/_hybrid_commands.py  - hybrid-server start/stop/status

Simplify extraction pipeline to use OpenDataLoader directly:

- convert.py: rewrite to call opendataloader_pdf.convert() directly

- Delete extraction.py + extraction_result.py wrapper abstractions

- models.py: remove deprecated StructuredExtractionResult, contracts

- summarize.py: accept dict | object for extraction results

- checkout.py: remove ensure_ai_subfolder call (handled by convert)

Old cli.py preserved as thin shim re-exporting from threegpp_ai.cli
parent fd6ca52a
Loading
Loading
Loading
Loading
+5 −1052

File changed.

Preview size limit exceeded, changes collapsed.

+20 −0
Original line number Diff line number Diff line
"""CLI package for 3gpp-ai.

Provides a modular command tree with convert/summarize/workspace operations
for extraction-first document workflows.
"""

from __future__ import annotations

from threegpp_ai.config_app import config_app

from ._commands import app as main_app
from ._hybrid_commands import app as hybrid_app
from ._workspace_commands import app as workspace_app

# The top-level Typer app doubles as the package entry point; the other
# command groups are mounted onto it as named sub-commands.
app = main_app
app.add_typer(workspace_app, name="workspace")
app.add_typer(hybrid_app, name="hybrid-server")
app.add_typer(config_app, name="config")

# Only the assembled app is part of the package's public API.
__all__ = ["app"]
+215 −0
Original line number Diff line number Diff line
"""Top-level CLI commands for 3gpp-ai (summarize, convert, clear)."""

from __future__ import annotations

import asyncio
from pathlib import Path
from typing import Any

import typer
from dotenv import load_dotenv
from tdoc_crawler.cli.formatting import TableColumnSpec, print_structured_output
from tdoc_crawler.config.settings import PathConfig
from tdoc_crawler.logging import get_logger
from tdoc_crawler.models.base import OutputFormat

from threegpp_ai import (
    convert_tdoc_to_markdown,
    delete_ai_folder,
    summarize_document,
)
from threegpp_ai.args import (
    CacheDirOption,
    ClearDryRunOption,
    ClearPathOption,
    ClearWorkspaceOption,
    ConfigFileOption,
    ConvertDocumentArgument,
    ConvertForceOption,
    ConvertOutputOption,
    OutputFormatOption,
    SummarizeAllowFailedQualityOption,
    SummarizeDocumentArgument,
    SummarizeForceOption,
    SummarizeOutputModeOption,
    SummarizeQualityPolicyOption,
    SummarizeWordsOption,
)
from threegpp_ai.config import ThreeGPPAIConfig
from threegpp_ai.operations.workspaces import list_workspace_members, normalize_workspace_name

from ._shared import console

load_dotenv()

app = typer.Typer(help="3GPP AI - Document processing and summarization")

_logger = get_logger(__name__)


def _print_output(
    data: Any,
    output_format: OutputFormat,
    *,
    table_title: str,
    table_columns: list[TableColumnSpec] | None = None,
) -> None:
    """Render command output via the shared structured-output formatter.

    Thin wrapper around ``print_structured_output`` bound to the
    module-level console so all commands format results consistently.
    """
    render_options = {
        "table_title": table_title,
        "table_columns": table_columns,
        "console": console,
    }
    print_structured_output(data, output_format, **render_options)


@app.callback()
def _app_init(
    ctx: typer.Context,
    config_file: ConfigFileOption = None,
    cache_dir: CacheDirOption = None,
) -> None:
    """Load configuration so all sub-commands can resolve file paths."""
    # Build the config once per CLI invocation and stash it on the Typer
    # context so every sub-command can read it through ctx.obj.
    loaded_config = ThreeGPPAIConfig.from_settings(config_file=config_file)
    if cache_dir:
        # A cache dir given on the command line overrides the settings file.
        loaded_config.path.cache_dir = cache_dir
    ctx.obj = loaded_config


@app.command("summarize", help="Summarize a single document with specified word count.")
def ai_summarize(
    document_id: SummarizeDocumentArgument,
    words: SummarizeWordsOption = 200,
    force: SummarizeForceOption = False,
    quality_policy: SummarizeQualityPolicyOption = None,
    allow_failed_quality: SummarizeAllowFailedQualityOption = False,
    output_mode: SummarizeOutputModeOption = "standard",
) -> None:
    """Summarize one TDoc through the 3gpp-ai pipeline."""
    mode = output_mode.strip().lower()
    if mode not in {"standard", "wiki"}:
        raise typer.BadParameter("--output-mode must be one of: standard, wiki")

    result = summarize_document(
        document_id=document_id,
        max_words=words,
        force=force,
        quality_policy_mode=quality_policy,
        allow_failed_quality=allow_failed_quality,
    )

    if mode == "standard":
        console.print(f"## Summary for {document_id}")
        console.print(result.summary)
        return

    # Wiki-flavored rendering: abstract, keywords, then extraction quality.
    console.print(f"## Wiki Summary for {document_id}")
    console.print("### Abstract")
    console.print(result.summary)
    if result.keywords:
        console.print("### Keywords")
        for keyword in result.keywords:
            console.print(f"- {keyword}")
    extraction_status = result.metadata.get("extraction_status")
    if extraction_status:
        console.print("### Source Quality")
        console.print(f"Extraction status: {extraction_status}")


@app.command("convert", help="Convert a single TDoc to markdown format.")
def ai_convert(
    document_id: ConvertDocumentArgument,
    output_path: ConvertOutputOption = None,
    force: ConvertForceOption = False,
    output_format: OutputFormatOption = OutputFormat.TABLE.value,
) -> None:
    """Convert one TDoc and optionally persist markdown output."""
    conversion = asyncio.run(
        convert_tdoc_to_markdown(document_id=document_id, output_path=output_path, force=force),
    )

    fmt = OutputFormat(output_format.lower())
    if output_path:
        # A target path was given: report where the file landed instead of
        # echoing the markdown itself.
        _print_output({"output": str(output_path)}, fmt, table_title="Convert Result")
    elif fmt is OutputFormat.TABLE:
        # Table mode prints the raw markdown straight to stdout.
        typer.echo(conversion)
    else:
        _print_output({"markdown": conversion}, fmt, table_title="Convert Result")


@app.command("clear", help="Delete all .ai processing artifacts from checkout/cache folders")
def clear_artifacts(
    path: ClearPathOption = None,
    workspace: ClearWorkspaceOption = None,
    dry_run: ClearDryRunOption = False,
) -> None:
    """Delete all .ai processing artifacts from checkout folders.

    Use this to force re-extraction of all documents.
    """
    if path is not None:
        root = path
    else:
        # No explicit path: fall back to the default checkout location
        # under the configured cache directory.
        defaults = PathConfig()
        root = defaults.cache_dir / defaults.checkout_dirname

    if workspace is None:
        _clear_all_artifacts(root, dry_run)
    else:
        _clear_workspace_artifacts(root, workspace, dry_run)


def _clear_workspace_artifacts(checkout_path: Path, workspace: str, dry_run: bool) -> None:
    """Clear .ai folders for members of a specific workspace.

    ``checkout_path`` is accepted for signature parity with
    ``_clear_all_artifacts``; member paths come from the workspace registry.
    """
    name = normalize_workspace_name(workspace)
    targets: set[Path] = set()
    for member in list_workspace_members(name, include_inactive=True):
        member_path = Path(member.source_path)
        # Paths already carrying the ".ai" suffix are treated as the artifact
        # dir itself; otherwise artifacts live in a sibling ".ai" folder.
        # NOTE(review): pathlib gives suffix == "" for a dir literally named
        # ".ai", so only names like "doc.ai" match here — confirm intent.
        if member_path.suffix == ".ai":
            targets.add(member_path)
        else:
            targets.add(member_path.parent / ".ai")

    removed = 0
    for ai_dir in sorted(targets):
        if not ai_dir.exists():
            continue
        if dry_run:
            count = sum(1 for entry in ai_dir.rglob("*") if entry.is_file())
            console.print(f"  [dim]{ai_dir}[/dim] (dry-run): {count} items")
        else:
            count = delete_ai_folder(ai_dir)
        removed += count

    if dry_run:
        console.print(f"\n[yellow]Dry-run: would delete {removed} items from {len(targets)} .ai folders[/yellow]")
    else:
        console.print(f"\n[green]Deleted {removed} items from {len(targets)} .ai folders[/green]")


def _clear_all_artifacts(checkout_path: Path, dry_run: bool) -> None:
    """Clear all .ai folders under the checkout path."""
    found = sorted(checkout_path.rglob(".ai"))
    if not found:
        console.print("[yellow]No .ai folders found[/yellow]")
        return

    removed = 0
    for ai_dir in found:
        if dry_run:
            count = sum(1 for entry in ai_dir.rglob("*") if entry.is_file())
            console.print(f"  {ai_dir}: {count} items (dry-run)")
        else:
            count = delete_ai_folder(ai_dir)
            # Only mention folders that actually contained artifacts.
            if count > 0:
                console.print(f"  {ai_dir}: {count} items")
        removed += count

    if dry_run:
        console.print(f"\n[yellow]Dry-run: would delete {removed} items from {len(found)} .ai folders[/yellow]")
    else:
        console.print(f"\n[green]Deleted {removed} items from {len(found)} .ai folders[/green]")
+73 −0
Original line number Diff line number Diff line
"""Hybrid server CLI commands for 3gpp-ai."""

from __future__ import annotations

import typer

from threegpp_ai.operations.hybrid_server import (
    DEFAULT_HOST,
    DEFAULT_PORT,
    HybridServerConfig,
    HybridServerManager,
)

from ._shared import console

app = typer.Typer(help="Hybrid server management")


@app.command("start", help="Start the hybrid RAG + keyword search server.")
def hybrid_server_start(
    host: str = DEFAULT_HOST,
    port: int = DEFAULT_PORT,
    device: str = "auto",
    wait: bool = True,
) -> None:
    """Start the hybrid search server with both vector and keyword indices."""
    manager = HybridServerManager(
        HybridServerConfig(host=host, port=port, device=device),
    )

    if wait:
        # Block with a status spinner until the server reports healthy.
        with console.status(f"[cyan]Starting hybrid server at {host}:{port}..."):
            status = manager.start(wait=True)
    else:
        status = manager.start(wait=False)

    if not status.running:
        console.print(f"[red]Failed to start hybrid server: {status.error}[/red]")
        raise typer.Exit(1)

    console.print(f"[green]Hybrid server started at {status.url}[/green]")
    console.print(f"[dim]PID: {status.pid}[/dim]")


@app.command("status", help="Show hybrid server status.")
def hybrid_server_status() -> None:
    """Check whether the hybrid server is running."""
    health = HybridServerManager().check_health()

    if not health.running:
        console.print("[yellow]Hybrid server is not running.[/yellow]")
        if health.error:
            console.print(f"[dim]Status: {health.error}[/dim]")
        return

    console.print(f"[green]Hybrid server is running at {health.url}[/green]")
    if health.pid:
        console.print(f"[dim]PID: {health.pid}[/dim]")


@app.command("stop", help="Stop the hybrid RAG + keyword search server.")
def hybrid_server_stop() -> None:
    """Stop the running hybrid search server."""
    outcome = HybridServerManager().stop()
    if outcome.running:
        # stop() returned but the process did not actually go away.
        console.print("[red]Server still running (may need manual cleanup)[/red]")
    else:
        console.print("[green]Hybrid server stopped.[/green]")
+96 −0
Original line number Diff line number Diff line
"""Shared utilities for 3gpp-ai CLI commands."""

from __future__ import annotations

from pathlib import Path
from typing import Any

import typer
from rich.progress import BarColumn, MofNCompleteColumn, Progress, SpinnerColumn, TextColumn, TimeElapsedColumn
from tdoc_crawler.cli.formatting import TableColumnSpec, print_structured_output
from tdoc_crawler.logging import get_console, get_logger
from tdoc_crawler.models.base import OutputFormat

from threegpp_ai.operations.workspace_registry import normalize_workspace_name
from threegpp_ai.operations.workspaces import get_active_workspace

console = get_console()
_logger = get_logger(__name__)


def print_output(
    data: Any,
    output_format: OutputFormat,
    *,
    table_title: str,
    table_columns: list[TableColumnSpec] | None = None,
) -> None:
    """Print structured command output through the shared formatter pipeline.

    Binds the module-level console to ``print_structured_output`` so every
    CLI module renders results identically.
    """
    target_console = console
    print_structured_output(
        data,
        output_format,
        console=target_console,
        table_title=table_title,
        table_columns=table_columns,
    )


def resolve_workspace_name(workspace: str | None) -> str:
    """Resolve workspace name from explicit value or active workspace.

    Raises:
        typer.Exit: When no name was given and no workspace is active.
    """
    if workspace:
        return normalize_workspace_name(workspace)

    # No explicit name: fall back to whichever workspace is marked active.
    active_name = get_active_workspace()
    if not active_name:
        console.print("[red]No workspace specified and no active workspace set.[/red]")
        console.print("[red]Use -w <name> or '3gpp-ai workspace activate <name>'[/red]")
        raise typer.Exit(1)
    return active_name


def get_relative_path(source_path: str, base_path: Path) -> str:
    """Convert absolute path to relative path from base, or return original if not under base."""
    try:
        source = Path(source_path)
        base = Path(base_path)
        return str(source.relative_to(base))
    except ValueError:
        return source_path


def create_progress_bar(
    description: str,
    total: float | None = None,
    *columns: Any,
    console_instance: Any = None,
) -> Progress:
    """Create a standard Rich Progress instance for CLI operations.

    NOTE(review): ``description`` and ``total`` are accepted but never used
    here — callers are expected to supply them when adding tasks to the
    returned Progress. Kept for interface compatibility; confirm callers
    before removing.

    Args:
        description: Task description.
        total: Initial total (will be updated by callback).
        columns: Additional progress columns (if any).
        console_instance: Optional console override.

    Returns:
        Configured Progress context manager.
    """
    standard_columns = (
        SpinnerColumn(),
        TextColumn("[progress.description]{task.description}"),
        BarColumn(),
        MofNCompleteColumn(),
        TimeElapsedColumn(),
    )
    return Progress(*standard_columns, *columns, console=console_instance or console)


def create_minimal_spinner(console_instance: Any = None) -> Progress:
    """Create a minimal spinner-only progress indicator for short operations."""
    spinner_columns = (
        SpinnerColumn(),
        TextColumn("[progress.description]{task.description}"),
    )
    # Same truthiness-based fallback as create_progress_bar.
    return Progress(*spinner_columns, console=console_instance or console)
Loading