Commit 83cbe215 authored by Jan Reimes's avatar Jan Reimes
Browse files

♻️ refactor(cli): split workspace commands into subpackage

parent ff5f2c60
Loading
Loading
Loading
Loading
+1 −1
Original line number Diff line number Diff line
@@ -11,9 +11,9 @@ from typing import Annotated

import typer

from tdoc_crawler.cli._workspace_commands import app as workspace_app
from tdoc_crawler.cli.config import load_cli_config
from tdoc_crawler.cli.config_app import config_app
from tdoc_crawler.cli.workspace import app as workspace_app
from tdoc_crawler.config import CacheManager
from tdoc_crawler.logging import DEFAULT_LEVEL as DEFAULT_VERBOSITY
from tdoc_crawler.logging import set_verbosity
+33 −0
Original line number Diff line number Diff line
"""Workspace CLI commands — split from monolithic _workspace_commands.py."""

from __future__ import annotations

import typer

from tdoc_crawler.cli.workspace.create import workspace_create
from tdoc_crawler.cli.workspace.lifecycle import (
    workspace_activate,
    workspace_deactivate,
    workspace_delete,
    workspace_list,
)
from tdoc_crawler.cli.workspace.members import (
    workspace_add,
    workspace_clear_invalid,
    workspace_members,
)
from tdoc_crawler.cli.workspace.process import workspace_process

app = typer.Typer(help="Manage extraction workspaces")

app.command("create")(workspace_create)
app.command("list")(workspace_list)
app.command("activate")(workspace_activate)
app.command("deactivate")(workspace_deactivate)
app.command("delete")(workspace_delete)
app.command("members")(workspace_members)
app.command("add")(workspace_add)
app.command("clear-invalid")(workspace_clear_invalid)
app.command("process")(workspace_process)

__all__ = ["app"]
+24 −0
Original line number Diff line number Diff line
"""Workspace create command."""

from __future__ import annotations

import typer

from tdoc_crawler.cli._shared import console
from tdoc_crawler.cli.args import SourcesDirNameOption, WorkspaceDirOption
from tdoc_crawler.workspaces import create_workspace, normalize_workspace_name


def workspace_create(
    name: str = typer.Argument(..., help="Workspace name"),
    workspace_dir: WorkspaceDirOption = None,
    sources_dirname: SourcesDirNameOption = None,
) -> None:
    """Create a workspace."""
    normalized = normalize_workspace_name(name)
    create_workspace(
        normalized,
        workspace_dir=str(workspace_dir) if workspace_dir else None,
        sources_dirname=sources_dirname,
    )
    console.print(f"[green]Workspace '{normalized}' created successfully.[/green]")
+72 −0
Original line number Diff line number Diff line
"""Workspace lifecycle commands: list, activate, deactivate, delete."""

from __future__ import annotations

import shutil

import typer

from tdoc_crawler.cli._shared import console
from tdoc_crawler.cli.args import DeleteArtifactsOption, DeleteLlmWikiOption, WorkspaceDeleteForceOption
from tdoc_crawler.config import resolve_cache_manager
from tdoc_crawler.workspaces import (
    delete_workspace,
    get_workspace,
    list_workspaces,
    normalize_workspace_name,
    set_active_workspace,
)


def workspace_list() -> None:
    """Display all existing workspaces."""
    workspaces = list_workspaces()
    if not workspaces:
        console.print("[dim]No workspaces found.[/dim]")
        return

    for ws in sorted(workspaces, key=lambda w: w.name):
        active_marker = " [green](active)[/green]" if hasattr(ws, "is_active") and ws.is_active else ""
        console.print(f"- {ws.name}{active_marker}")


def workspace_activate(workspace_name: str = typer.Argument(..., help="Workspace name")) -> None:
    """Activate workspace for default command targets."""
    normalized = normalize_workspace_name(workspace_name)
    set_active_workspace(normalized)
    console.print(f"[green]Workspace '{normalized}' is now active.[/green]")


def workspace_deactivate() -> None:
    """Deactivate workspace context."""
    set_active_workspace(None)
    console.print("[yellow]Workspace deactivated.[/yellow]")


def workspace_delete(
    workspace_name: str = typer.Argument(..., help="Workspace name"),
    force: WorkspaceDeleteForceOption = False,
    delete_artifacts: DeleteArtifactsOption = False,
    delete_llm_wiki: DeleteLlmWikiOption = False,
) -> None:
    """Permanently delete a workspace and all associated files."""
    normalized = normalize_workspace_name(workspace_name)
    if not force:
        console.print("[yellow]Use --force to permanently delete workspace and all artifacts.[/yellow]")
        return

    delete_workspace(normalized, delete_artifacts=delete_artifacts)

    if delete_llm_wiki:
        try:
            manager = resolve_cache_manager()
            metadata = get_workspace(normalized)
            ws_dir = metadata.resolve_workspace_directory(manager.workspaces_dir) if metadata is not None else manager.workspaces_dir / normalized
            llm_wiki_dir = ws_dir / "wiki"
            if llm_wiki_dir.exists():
                shutil.rmtree(llm_wiki_dir)
                console.print(f"[green]Deleted .llm-wiki folder for '{normalized}'.[/green]")
        except Exception as e:
            console.print(f"[yellow]Could not delete .llm-wiki folder: {e}[/yellow]")

    console.print(f"[green]Workspace '{normalized}' deleted.[/green]")
+230 −0
Original line number Diff line number Diff line
"""Workspace-related CLI commands for the main application.

These commands create, inspect, modify, and process workspaces.
"""
"""Workspace membership commands: members, add, clear-invalid."""

from __future__ import annotations

import asyncio
import json
import shutil
from datetime import UTC, datetime
from pathlib import Path
from typing import Any

import typer

from tdoc_crawler.cli._shared import console, create_progress_bar, ensure_hybrid_server_for_profile
from tdoc_crawler.cli._shared import console
from tdoc_crawler.cli.args import (
    AgendaPatternExcludeOption,
    AgendaPatternOption,
    AutoCrawlSpecsOption,
    DeleteArtifactsOption,
    DeleteLlmWikiOption,
    DryRunOption,
    EndDateOption,
    IncludeInactiveOption,
    LimitOption,
    MdYamlFrontmatterOption,
    ProcessLimitOption,
    ProfileOption,
    ReleaseOption,
    SkipExistingOption,
    SourceKindOption,
    SourcePatternExcludeOption,
    SourcePatternOption,
    StartDateOption,
    TitlePatternExcludeOption,
    TitlePatternOption,
    VerbosityOption,
    WorkspaceDeleteForceOption,
    WorkspaceItemsArgument,
    WorkspaceNameOption,
    WorkspaceProcessForceOption,
)
from tdoc_crawler.cli.formatting import TableColumnSpec, print_structured_output
from tdoc_crawler.config import PathConfig, resolve_cache_manager
from tdoc_crawler.config import PathConfig
from tdoc_crawler.database.tdocs import TDocDatabase
from tdoc_crawler.extraction.convert import convert_for_wiki
from tdoc_crawler.extraction.profiles import DEFAULT_EXTRACTION_PROFILE, ExtractionProfile
from tdoc_crawler.logging import DEFAULT_LEVEL as DEFAULT_VERBOSITY
from tdoc_crawler.logging import get_logger, set_verbosity
from tdoc_crawler.models.base import OutputFormat, SortOrder
from tdoc_crawler.models.workspaces import SourceKind
from tdoc_crawler.specs.operations.checkout import resolve_spec_release
@@ -56,110 +36,14 @@ from tdoc_crawler.tdocs.models import TDocQueryConfig
from tdoc_crawler.utils.date_parser import parse_partial_date
from tdoc_crawler.workspaces import (
    add_workspace_members,
    create_workspace,
    delete_workspace,
    get_active_workspace,
    list_workspace_members,
    list_workspaces,
    make_workspace_member,
    normalize_workspace_name,
    remove_workspace_member,
    set_active_workspace,
)

logger = get_logger(__name__)

__all__ = ["app"]

app = typer.Typer(help="Manage extraction workspaces")

_logger = get_logger(__name__)


def _print_output(
    data: Any,
    output_format: OutputFormat,
    *,
    table_title: str,
    table_columns: list[TableColumnSpec] | None = None,
) -> None:
    """Print structured command output through the shared formatter pipeline."""
    print_structured_output(
        data,
        output_format,
        table_title=table_title,
        table_columns=table_columns,
        console=console,
)


@app.command("create", help="Create a new workspace.")
def workspace_create(
    name: str = typer.Argument(..., help="Workspace name"),
) -> None:
    """Create a workspace."""
    normalized = normalize_workspace_name(name)
    create_workspace(normalized)
    console.print(f"[green]Workspace '{normalized}' created successfully.[/green]")


@app.command("list", help="List all available workspaces.")
def workspace_list() -> None:
    """Display all existing workspaces."""
    workspaces = list_workspaces()
    if not workspaces:
        console.print("[dim]No workspaces found.[/dim]")
        return

    for ws in sorted(workspaces, key=lambda w: w.name):
        active_marker = " [green](active)[/green]" if hasattr(ws, "is_active") and ws.is_active else ""
        console.print(f"- {ws.name}{active_marker}")


@app.command("activate", help="Set a workspace as active.")
def workspace_activate(workspace_name: str = typer.Argument(..., help="Workspace name")) -> None:
    """Activate workspace for default command targets."""
    normalized = normalize_workspace_name(workspace_name)
    set_active_workspace(normalized)
    console.print(f"[green]Workspace '{normalized}' is now active.[/green]")


@app.command("deactivate", help="Deactivate the currently active workspace.")
def workspace_deactivate() -> None:
    """Deactivate workspace context."""
    set_active_workspace(None)
    console.print("[yellow]Workspace deactivated.[/yellow]")


@app.command("delete", help="Delete a workspace and optionally its artifacts.")
def workspace_delete(
    workspace_name: str = typer.Argument(..., help="Workspace name"),
    force: WorkspaceDeleteForceOption = False,
    delete_artifacts: DeleteArtifactsOption = False,
    delete_llm_wiki: DeleteLlmWikiOption = False,
) -> None:
    """Permanently delete a workspace and all associated files."""
    normalized = normalize_workspace_name(workspace_name)
    if not force:
        console.print("[yellow]Use --force to permanently delete workspace and all artifacts.[/yellow]")
        return

    delete_workspace(normalized, delete_artifacts=delete_artifacts)

    if delete_llm_wiki:
        try:
            manager = resolve_cache_manager()
            llm_wiki_dir = manager.workspaces_dir / normalized / "wiki"
            if llm_wiki_dir.exists():
                shutil.rmtree(llm_wiki_dir)
                console.print(f"[green]Deleted .llm-wiki folder for '{normalized}'.[/green]")
        except Exception as e:
            console.print(f"[yellow]Could not delete .llm-wiki folder: {e}[/yellow]")

    console.print(f"[green]Workspace '{normalized}' deleted.[/green]")


@app.command("members", help="List workspace members.")
def workspace_members(
    workspace_name: str = typer.Argument(None, help="Workspace name (default: active workspace)"),
    include_inactive: IncludeInactiveOption = False,
@@ -181,187 +65,6 @@ def workspace_members(
        console.print(f"[red]Error: {e}[/red]")


_PROFILE_LEVELS = {
    ExtractionProfile.PDF_ONLY: 0,
    ExtractionProfile.DEFAULT: 1,
    ExtractionProfile.ADVANCED: 2,
}


def _should_skip_member(
    source_id: str,
    wiki_base: Path,
    profile: ExtractionProfile,
    force: bool,
    skip_existing: bool,
) -> bool:
    """Check if a workspace member should be skipped due to existing artifacts."""
    if force or not skip_existing:
        return False

    member_dir = wiki_base / source_id

    if profile == ExtractionProfile.PDF_ONLY:
        if list(member_dir.glob("*.pdf")):
            logger.debug("Skipping %s — PDF exists", source_id)
            return True
        return False

    md_files = list(member_dir.glob("*.md"))
    json_files = list(member_dir.glob("*.json"))
    if not md_files or not json_files:
        return False

    saved_profile = _read_json_profile(json_files[0])
    saved_level = _PROFILE_LEVELS.get(saved_profile, -1)
    if saved_level < 0:
        saved_level = _PROFILE_LEVELS.get(_coerce_profile(saved_profile), -1)

    required_level = _PROFILE_LEVELS[profile]
    if saved_level >= required_level:
        label = saved_profile if saved_level >= 0 else "unknown"
        logger.debug("Skipping %s — %s output exists", source_id, label)
        return True

    return False


def _read_json_profile(json_path: Path) -> str:
    """Read extraction_profile from a JSON file, empty string on failure."""
    try:
        data = json.loads(json_path.read_text(encoding="utf-8"))
        return data.get("extraction_profile", "")
    except json.JSONDecodeError, OSError:
        return ""


def _coerce_profile(value: str) -> ExtractionProfile | None:
    """Try to parse a string as ExtractionProfile, return None on failure."""
    try:
        return ExtractionProfile(value)
    except ValueError:
        return None


def _process_member(
    member: Any,
    wiki_source_dir_base: Path,
    extraction_profile: ExtractionProfile,
    force: bool,
    md_yaml_frontmatter: bool,
) -> tuple[str, bool, bool]:
    """Process a single workspace member.

    Returns:
        Tuple of (source_id, succeeded, failed).
    """
    source_id = member.source_item_id
    wiki_source_dir = wiki_source_dir_base / source_id
    wiki_source_dir.mkdir(parents=True, exist_ok=True)

    try:
        result_path = convert_for_wiki(
            document_id=source_id,
            wiki_source_dir=wiki_source_dir,
            source_kind=member.source_kind,
            profile=extraction_profile,
            force=force,
            release=member.release,
            md_yaml_frontmatter=md_yaml_frontmatter,
        )
        if result_path:
            suffix = result_path.suffix.lstrip(".")
            logger.debug("%s [%s] → %s", source_id, extraction_profile.value, suffix)
            return source_id, True, False
        logger.debug("No output for %s", source_id)
        return source_id, False, False
    except Exception as e:
        console.print(f"[red]  Failed {source_id}: {e}[/red]")
        logger.error("Failed to process %s: %s", source_id, e)
        return source_id, False, True


@app.command("process", help="Process workspace members.")
def workspace_process(
    workspace_name: str = typer.Argument(None, help="Workspace name (default: active workspace)"),
    force: WorkspaceProcessForceOption = False,
    limit: ProcessLimitOption = None,
    skip_existing: SkipExistingOption = False,
    profile: ProfileOption = DEFAULT_EXTRACTION_PROFILE.value,
    md_yaml_frontmatter: MdYamlFrontmatterOption = True,
    verbosity: VerbosityOption = str(DEFAULT_VERBOSITY),
) -> None:
    """Extract structured data from all workspace members."""
    set_verbosity(verbosity)

    if workspace_name is None:
        workspace_name = get_active_workspace()

    normalized = normalize_workspace_name(workspace_name)

    try:
        extraction_profile = ExtractionProfile(profile)
    except ValueError:
        console.print(f"[red]Invalid profile '{profile}'. Use: pdf-only, default, advanced[/red]")
        raise typer.Exit(1)

    console.print(f"[yellow]Processing workspace '{normalized}' with profile '{extraction_profile.value}'...[/yellow]")

    try:
        members = list_workspace_members(normalized, include_inactive=False)
    except Exception as e:
        console.print(f"[red]Error listing members: {e}[/red]")
        raise typer.Exit(1)

    if not members:
        console.print(f"[dim]No active members in workspace '{normalized}'.[/dim]")
        return

    if limit is not None:
        members = members[:limit]

    if not ensure_hybrid_server_for_profile(extraction_profile):
        raise typer.Exit(1)

    cache_manager = resolve_cache_manager()
    wiki_source_dir_base = cache_manager.workspaces_dir / normalized / "sources"

    processed = 0
    failed = 0
    skipped = 0

    progress, task = create_progress_bar(
        f"Processing [{extraction_profile.value}]",
        total=len(members),
    )

    with progress:
        for member in members:
            source_id = member.source_item_id

            if _should_skip_member(source_id, wiki_source_dir_base, extraction_profile, force, skip_existing):
                skipped += 1
                progress.advance(task)
                continue

            progress.update(task, description=f"Processing [{extraction_profile.value}] {source_id}")
            _, succeeded, errored = _process_member(
                member,
                wiki_source_dir_base,
                extraction_profile,
                force,
                md_yaml_frontmatter,
            )
            if succeeded:
                processed += 1
            if errored:
                failed += 1

            progress.advance(task)

    console.print(f"[yellow]Processing complete: {processed} succeeded, {failed} failed, {skipped} skipped[/yellow]")


def _resolve_spec_release_for_add(
    item: str,
    release: str,
@@ -385,7 +88,6 @@ def _resolve_spec_release_for_add(
    return release


@app.command("add", help="Add documents to an existing workspace.")
def workspace_add(
    items: WorkspaceItemsArgument = None,
    workspace: WorkspaceNameOption = None,
@@ -408,7 +110,6 @@ def workspace_add(
    Either provide items directly, or use filter options (--agenda, --source, etc.)
    to query the TDoc database and add matching results in bulk.
    """
    # Resolve workspace name: -w flag > active workspace
    resolved_name = workspace or get_active_workspace()
    if resolved_name is None:
        console.print("[red]No workspace specified and no active workspace set. Use 'workspace create' and 'workspace activate' first.[/red]")
@@ -417,13 +118,10 @@ def workspace_add(
    normalized = normalize_workspace_name(resolved_name)
    source_kind = SourceKind(kind)

    # Check if any filter flags are set
    has_filters = any([agenda, agenda_ex, start_date, end_date, source, source_ex, title, title_ex])

    members: list = []

    if has_filters:
        # Batch mode: query TDoc database and add matching results
        try:
            start = datetime.combine(parse_partial_date(start_date), datetime.min.time(), tzinfo=UTC) if start_date else None
        except ValueError as exc:
@@ -478,7 +176,6 @@ def workspace_add(
            )

    elif items:
        # Direct mode: add items by ID
        for item in items:
            resolved_release = _resolve_spec_release_for_add(item, release, source_kind, auto_crawl_specs)
            members.append(
@@ -503,7 +200,6 @@ def workspace_add(
    console.print(f"[green]Added {added} item(s) to workspace '{normalized}' ({mode}).[/green]")


@app.command("clear-invalid", help="Remove members with invalid or missing source paths.")
def workspace_clear_invalid(
    workspace_name: str = typer.Argument(None, help="Workspace name (default: active workspace)"),
    dry_run: DryRunOption = False,
Loading