Commit d76918a8 authored by Jan Reimes
Browse files

🔥 chore: remove workspace commands module

parent 658c6557
Loading
Loading
Loading
Loading
+0 −596
Original line number Diff line number Diff line
"""Workspace-related CLI commands for the main application.

These commands create, inspect, modify, and process workspaces.
"""

from __future__ import annotations

import asyncio
import shutil
from datetime import UTC, datetime
from pathlib import Path
from typing import Annotated

import typer
from rich.progress import Progress, SpinnerColumn, TextColumn

from tdoc_crawler.cli._shared import console
from tdoc_crawler.config import resolve_cache_manager
from tdoc_crawler.database.specs import SpecDatabase
from tdoc_crawler.database.tdocs import TDocDatabase
from tdoc_crawler.extraction.convert import convert_for_wiki
from tdoc_crawler.extraction.fetch_tdoc import fetch_tdoc_files
from tdoc_crawler.extraction.profiles import DEFAULT_EXTRACTION_PROFILE, ExtractionProfile
from tdoc_crawler.logging import get_logger
from tdoc_crawler.models.base import OutputFormat, SortOrder
from tdoc_crawler.models.workspaces import SourceKind, TDocNotFoundError
from tdoc_crawler.tdocs.models import TDocQueryConfig
from tdoc_crawler.utils.date_parser import parse_partial_date
from tdoc_crawler.utils.normalization import normalize_release_version, normalize_spec_number, normalize_tdoc_id
from tdoc_crawler.workspaces import (
    add_workspace_members,
    create_workspace,
    delete_workspace,
    get_active_workspace,
    list_workspace_members,
    list_workspaces,
    make_workspace_member,
    normalize_workspace_name,
    remove_workspace_member,
    resolve_spec_release_from_db,
    set_active_workspace,
)

# Module logger; workspace_process logs per-member failures through this name.
logger = get_logger(__name__)

# Lowercase TDoc statuses for which a member is still added to a workspace,
# but flagged inactive, instead of being validated via file fetch.
INACTIVE_MISSING_TDOC_STATUSES = {"withdrawn", "reserved"}

__all__ = ["app"]

# Typer sub-application that all "workspace ..." commands below register on.
app = typer.Typer(help="Manage extraction workspaces")

# Second handle requested with the same name as `logger` above; the
# validation helpers (_validate_tdoc_exists, _validate_spec_exists) use it.
_logger = get_logger(__name__)

# Shared annotated type for the optional -w/--workspace option; commands
# default it to None and resolve it via _resolve_workspace_name.
WorkspaceNameOption = Annotated[
    str | None,
    typer.Option(
        "-w",
        "--workspace",
        help="Workspace name (default: active workspace)",
    ),
]


def _resolve_workspace_name(workspace: str | None) -> str:
    """Pick the workspace name a command should operate on.

    Args:
        workspace: Name supplied by the caller, or None when omitted.

    Returns:
        The result of ``normalize_workspace_name`` for an explicit name;
        otherwise whatever ``get_active_workspace()`` returns.
    """
    if workspace is not None:
        return normalize_workspace_name(workspace)
    return get_active_workspace()


def _validate_tdoc_exists(tdoc_id: str) -> bool:
    """Check whether a TDoc can be resolved by attempting to fetch its files.

    This is a best-effort probe: every ordinary failure raised by
    ``fetch_tdoc_files`` (including ``TDocNotFoundError``) is logged at debug
    level and reported as "does not exist" rather than propagated.

    Args:
        tdoc_id: TDoc identifier to validate.

    Returns:
        True if ``fetch_tdoc_files`` completed without raising, False otherwise.
    """
    try:
        fetch_tdoc_files(tdoc_id)
    except Exception as e:
        # `Exception` already covers TDocNotFoundError, so listing both was redundant.
        _logger.debug("TDoc %s validation failed: %s", tdoc_id, e)
        return False
    return True


async def _get_tdoc_status(tdoc_id: str) -> str | None:
    """Look up a TDoc's status in the database and return it stripped and lowercased.

    Returns None when the ID cannot be normalized, when the query yields no
    rows, when the first row's status is None, or when the status is blank
    after stripping.
    """
    canonical_id = normalize_tdoc_id(tdoc_id)
    if canonical_id is None:
        return None

    cache = resolve_cache_manager()
    lookup = TDocQueryConfig(tdoc_ids=[canonical_id])
    async with TDocDatabase(cache.db_file) as database:
        matches = await database.query_tdocs(lookup)

    if not matches:
        return None

    # Only the first matching row is consulted.
    stored_status = matches[0].status
    if stored_status is None:
        return None

    cleaned = stored_status.strip().lower()
    return cleaned if cleaned else None


def _resolve_inactive_missing_tdoc_status(tdoc_id: str) -> str | None:
    """Return the TDoc's status if it is one of INACTIVE_MISSING_TDOC_STATUSES, else None.

    Runs the async status lookup to completion with ``asyncio.run``.
    """
    db_status = asyncio.run(_get_tdoc_status(tdoc_id))
    return db_status if db_status in INACTIVE_MISSING_TDOC_STATUSES else None


async def _validate_spec_exists(spec_number: str, release: str | None = None) -> bool:
    """Check if a spec (optionally at a given release) exists in the database.

    Any ordinary exception raised during normalization or the database lookup
    is logged at debug level and treated as "not found".

    Args:
        spec_number: Spec number to validate; normalized before lookup.
        release: Optional release version. When given, at least one stored
            version string must contain the normalized release.

    Returns:
        True if the spec has versions (matching ``release`` when provided),
        False otherwise or on lookup failure.
    """
    try:
        normalized_spec = normalize_spec_number(spec_number)
        manager = resolve_cache_manager()
        async with SpecDatabase(manager.db_file) as db:
            versions = await db.get_spec_versions(normalized_spec)
            if not versions:
                return False
            if release:
                normalized = normalize_release_version(release)
                # NOTE(review): this is a substring match against v.version;
                # confirm a short release cannot match unrelated versions.
                return any(normalized in v.version for v in versions)
            return True
    except Exception as e:
        # KeyboardInterrupt/SystemExit derive from BaseException and never
        # reach this handler, so no explicit re-raise is needed for them.
        _logger.debug("Spec %s validation failed: %s", spec_number, e)
        return False


@app.command("create", help="Create a new workspace.")
def workspace_create(
    name: str = typer.Argument(..., help="Workspace name"),
) -> None:
    """Create a workspace under the normalized form of ``name`` and confirm on the console."""
    workspace_key = normalize_workspace_name(name)
    create_workspace(workspace_key)
    console.print(f"[green]Workspace '{workspace_key}' created successfully.[/green]")


@app.command("list", help="List all available workspaces.")
def workspace_list() -> None:
    """Print every workspace sorted by name, tagging entries whose ``is_active`` is truthy."""
    found = list_workspaces()
    if not found:
        console.print("[dim]No workspaces found.[/dim]")
        return

    for entry in sorted(found, key=lambda item: item.name):
        # Entries without an `is_active` attribute are treated as not active.
        suffix = " [green](active)[/green]" if getattr(entry, "is_active", False) else ""
        console.print(f"- {entry.name}{suffix}")


@app.command("activate", help="Set a workspace as active.")
def workspace_activate(workspace_name: str = typer.Argument(..., help="Workspace name")) -> None:
    """Mark the normalized ``workspace_name`` as the active workspace and confirm on the console."""
    target = normalize_workspace_name(workspace_name)
    set_active_workspace(target)
    console.print(f"[green]Workspace '{target}' is now active.[/green]")


@app.command("deactivate", help="Deactivate the currently active workspace.")
def workspace_deactivate() -> None:
    """Clear the active workspace by setting it to None, then report it on the console."""
    set_active_workspace(None)
    console.print("[yellow]Workspace deactivated.[/yellow]")


@app.command("delete", help="Delete a workspace and optionally its artifacts.")
def workspace_delete(
    workspace_name: str = typer.Argument(..., help="Workspace name"),
    force: bool = typer.Option(False, "--force", help="Permanently delete workspace and all artifacts"),
    delete_wiki: bool = typer.Option(False, "--delete-wiki", help="Delete the wiki folder for this workspace"),
) -> None:
    """Delete a workspace, and its wiki folder when requested.

    Nothing is deleted unless ``force`` is set; without it only a hint is
    printed. Wiki-folder removal is best-effort: a failure there is reported
    as a warning and does not undo the workspace deletion.
    """
    target = normalize_workspace_name(workspace_name)
    if not force:
        console.print("[yellow]Use --force to permanently delete workspace and all artifacts.[/yellow]")
        return

    delete_workspace(target)

    if delete_wiki:
        try:
            cache = resolve_cache_manager()
            wiki_folder = cache.workspace_llm_wiki_dir(target)
            if wiki_folder.exists():
                shutil.rmtree(wiki_folder)
                console.print(f"[green]Deleted wiki folder for '{target}'.[/green]")
        except Exception as e:
            console.print(f"[yellow]Could not delete wiki folder: {e}[/yellow]")

    console.print(f"[green]Workspace '{target}' deleted.[/green]")


@app.command("members", help="List workspace members.")
def workspace_members(
    workspace: WorkspaceNameOption = None,
    include_inactive: bool = typer.Option(False, "--include-inactive", help="Include inactive members"),
) -> None:
    """Print one line per workspace member with its ID, source kind and active state.

    Errors other than ``typer.Exit`` are printed to the console instead of
    being raised.
    """
    target = _resolve_workspace_name(workspace)
    try:
        roster = list_workspace_members(target, include_inactive=include_inactive)
        if not roster:
            console.print(f"[dim]No members in workspace '{target}'.[/dim]")
            return

        for entry in roster:
            label = "[green]active[/green]" if entry.is_active else "[dim]inactive[/dim]"
            console.print(f"  {entry.source_item_id} ({entry.source_kind.value}) - {label}")
    except typer.Exit:
        raise
    except Exception as e:
        console.print(f"[red]Error: {e}[/red]")


def _should_skip_member(
    skip_existing: bool,
    force: bool,
    wiki_source_dir_base: Path,
    source_id: str,
    extraction_profile: ExtractionProfile,
) -> bool:
    """Check if a member should be skipped because artifacts already exist."""
    if not skip_existing or force:
        return False

    member_wiki_dir = wiki_source_dir_base / source_id
    has_artifacts = bool(list(member_wiki_dir.glob("*.pdf"))) if extraction_profile == ExtractionProfile.PDF_ONLY else bool(list(member_wiki_dir.glob("*.md")))

    if has_artifacts:
        console.print(f"[dim]  Skipping {source_id} - artifacts exist[/dim]")
    return has_artifacts


@app.command("process", help="Process workspace members.")
def workspace_process(
    workspace: WorkspaceNameOption = None,
    force: bool = typer.Option(False, "--force", help="Re-process existing artifacts"),
    limit: int | None = typer.Option(None, "--limit", help="Limit number of members to process"),
    skip_existing: bool = typer.Option(False, "--skip-existing", help="Skip members that already have artifacts"),
    profile: str = typer.Option(
        DEFAULT_EXTRACTION_PROFILE.value,
        "--profile",
        help="Extraction profile: pdf-only, default, or advanced",
    ),
) -> None:
    """Convert the active members of a workspace into wiki source artifacts.

    Each member is converted with ``convert_for_wiki`` into its own directory
    under the workspace sources directory. A failure on one member does not
    stop the run; failures are collected and listed at the end.

    Args:
        workspace: Workspace name, or None to use the active workspace.
        force: Passed through to the converter; also disables ``skip_existing``.
        limit: Process at most this many members (taken from the start of the
            member list); None means no limit.
        skip_existing: Skip members whose artifact directory already has output.
        profile: Extraction profile value; must be a valid ``ExtractionProfile``.

    Raises:
        typer.Exit: With code 1 if ``profile`` is invalid or the member list
            cannot be read.
    """
    normalized = _resolve_workspace_name(workspace)

    try:
        extraction_profile = ExtractionProfile(profile)
    except ValueError:
        console.print(f"[red]Invalid profile '{profile}'. Use: pdf-only, default, advanced[/red]")
        raise typer.Exit(1) from None

    console.print(f"[yellow]Processing workspace '{normalized}' with profile '{extraction_profile.value}'...[/yellow]")

    try:
        members = list_workspace_members(normalized, include_inactive=False)
    except typer.Exit:
        raise
    except Exception as e:
        console.print(f"[red]Error listing members: {e}[/red]")
        raise typer.Exit(1) from e

    if not members:
        console.print(f"[dim]No active members in workspace '{normalized}'.[/dim]")
        return

    if limit is not None:
        members = members[:limit]

    manager = resolve_cache_manager()
    wiki_source_dir_base = manager.workspace_sources_dir(normalized)

    processed = 0
    failed = 0
    skipped_items: list[tuple[str, str]] = []

    for member in members:
        source_id = member.source_item_id

        if _should_skip_member(skip_existing, force, wiki_source_dir_base, source_id, extraction_profile):
            continue

        wiki_source_dir = wiki_source_dir_base / source_id
        wiki_source_dir.mkdir(parents=True, exist_ok=True)

        try:
            result_path = convert_for_wiki(
                document_id=source_id,
                wiki_source_dir=wiki_source_dir,
                source_kind=member.source_kind,
                source_path=member.source_path,
                profile=extraction_profile,
                force=force,
            )
            if result_path:
                console.print(f"[green]  Processed {source_id} -> {result_path.name}[/green]")
                processed += 1
            else:
                # A falsy result is reported but counted as neither success nor failure.
                console.print(f"[yellow]  No output for {source_id}[/yellow]")
        except typer.Exit:
            raise
        except Exception as e:
            # Keep going: one bad document must not abort the whole batch.
            error_text = str(e)
            skipped_items.append((source_id, error_text))
            _logger.debug("Skipped processing %s: %s", source_id, error_text)
            failed += 1

    if skipped_items:
        console.print("\n[yellow]Skipped documents (processing warnings):[/yellow]")
        for source_id, error_text in skipped_items:
            console.print(f"[yellow]  - {source_id}: {error_text}[/yellow]")

    console.print(f"\n[yellow]Processing complete: {processed} succeeded, {failed} failed[/yellow]")


def _parse_date_filters(
    start_date: str | None,
    end_date: str | None,
) -> tuple[datetime | None, datetime | None]:
    """Parse and validate date filter strings into datetime objects.

    Args:
        start_date: ISO-8601 start date string, or None.
        end_date: ISO-8601 end date string, or None.

    Returns:
        Tuple of (start_datetime, end_datetime), either may be None.

    Raises:
        typer.Exit: If a date string cannot be parsed.
    """
    try:
        start = datetime.combine(parse_partial_date(start_date), datetime.min.time(), tzinfo=UTC) if start_date else None
    except ValueError:
        console.print("[red]Invalid start date format; use ISO-8601[/red]")
        raise typer.Exit(1)

    try:
        end = datetime.combine(parse_partial_date(end_date, is_end=True), datetime.max.time(), tzinfo=UTC) if end_date else None
    except ValueError:
        console.print("[red]Invalid end date format; use ISO-8601[/red]")
        raise typer.Exit(1)

    return start, end


async def _query_tdocs_async(
    source_kind: SourceKind,
    start_date: str | None,
    end_date: str | None,
    source: list[str] | None,
    source_ex: list[str] | None,
    title: list[str] | None,
    title_ex: list[str] | None,
    agenda: list[str] | None,
    agenda_ex: list[str] | None,
    limit: int | None,
) -> list[str]:
    """Run a filtered TDoc query and return the IDs of the matching rows.

    Args:
        source_kind: Must be ``SourceKind.TDOC``; any other kind is rejected.
        start_date: Lower date bound as a string, or None.
        end_date: Upper date bound as a string, or None.
        source: Source patterns to include.
        source_ex: Source patterns to exclude.
        title: Title patterns to include.
        title_ex: Title patterns to exclude.
        agenda: Agenda patterns to include.
        agenda_ex: Agenda patterns to exclude.
        limit: Maximum number of results, or None.

    Returns:
        The ``tdoc_id`` of every row returned by the query (descending sort order
        is requested).

    Raises:
        typer.Exit: With code 1 when ``source_kind`` is not TDOC, or when a
            date string cannot be parsed.
    """
    if source_kind != SourceKind.TDOC:
        console.print("[red]Error: filtering without explicit items is only supported for TDocs[/red]")
        raise typer.Exit(1)

    cache = resolve_cache_manager()
    window_start, window_end = _parse_date_filters(start_date, end_date)

    query = TDocQueryConfig(
        output_format=OutputFormat.TABLE,
        tdoc_ids=None,
        working_groups=None,
        start_date=window_start,
        end_date=window_end,
        meeting_start_date=None,
        meeting_end_date=None,
        source_pattern=source,
        source_pattern_exclude=source_ex,
        title_pattern=title,
        title_pattern_exclude=title_ex,
        agenda_pattern=agenda,
        agenda_pattern_exclude=agenda_ex,
        limit=limit,
        order=SortOrder.DESC,
    )

    async with TDocDatabase(cache.db_file) as database:
        hits = await database.query_tdocs(query)

    return [hit.tdoc_id for hit in hits]


def _validate_and_create_members(
    resolved_items: list[str],
    source_kind: SourceKind,
    release: str | None,
) -> tuple[list, list[tuple[str, str]]]:
    """Validate items and create workspace members.

    Args:
        resolved_items: Item identifiers to validate and convert.
        source_kind: Type of source (TDOC, SPEC, OTHER).
        release: Optional release version for specs.

    Returns:
        Tuple of (valid_members, skipped_items_with_reasons).
    """
    members = []
    skipped = []
    for item in resolved_items:
        if source_kind == SourceKind.TDOC:
            inactive_status = _resolve_inactive_missing_tdoc_status(item)
            if inactive_status is not None:
                member = make_workspace_member(
                    source_item_id=item,
                    source_path=item,
                    source_kind=source_kind,
                    added_by="cli",
                )
                member.is_active = False
                members.append(member)
                console.print(f"[dim]  Adding {item} as inactive - status '{inactive_status}'[/dim]")
                continue

            if not _validate_tdoc_exists(item):
                skipped.append((item, "TDoc not found"))
                console.print(f"[yellow]  Skipping {item} - TDoc not found[/yellow]")
                continue
        elif source_kind == SourceKind.SPEC and not asyncio.run(_validate_spec_exists(item, release)):
            skipped.append((item, f"Spec not found (release={release or 'latest'})"))
            console.print(f"[yellow]  Skipping {item} - Spec not found[/yellow]")
            continue

        source_item_id = item
        if source_kind == SourceKind.SPEC:
            # Resolve release for spec member ID (always include release if available)
            resolved_release, _ = asyncio.run(resolve_spec_release_from_db(item, release or "latest"))
            if resolved_release:
                normalized_release = normalize_release_version(resolved_release)
                source_item_id = f"{item}-REL{normalized_release}"

        members.append(
            make_workspace_member(
                source_item_id=source_item_id,
                source_path=item,
                source_kind=source_kind,
                added_by="cli",
            ),
        )
    return members, skipped


@app.command("add", help="Add documents to an existing workspace.")
def workspace_add(
    workspace: WorkspaceNameOption = None,
    items: Annotated[list[str] | None, typer.Argument(help="Items to add (TDoc IDs, spec numbers). If not provided, uses --kind/filter options.")] = None,
    kind: Annotated[str, typer.Option("--kind", help="Source kind: tdoc, spec, or other")] = "tdoc",
    release: Annotated[str | None, typer.Option("--release", help="Spec release version (e.g., 19, 19.1, 19.1.2). Only applies to specs.")] = None,
    # Query-based filtering options
    start_date: Annotated[str | None, typer.Option("--start-date", help="Filter: start date (ISO-8601)")] = None,
    end_date: Annotated[str | None, typer.Option("--end-date", help="Filter: end date (ISO-8601)")] = None,
    source: Annotated[list[str] | None, typer.Option("--source", help="Filter: source pattern (glob)")] = None,
    source_ex: Annotated[list[str] | None, typer.Option("--source-ex", help="Filter: exclude source pattern (glob)")] = None,
    title: Annotated[list[str] | None, typer.Option("--title", help="Filter: title pattern (glob)")] = None,
    title_ex: Annotated[list[str] | None, typer.Option("--title-ex", help="Filter: exclude title pattern (glob)")] = None,
    agenda: Annotated[list[str] | None, typer.Option("--agenda", help="Filter: agenda pattern (glob)")] = None,
    agenda_ex: Annotated[list[str] | None, typer.Option("--agenda-ex", help="Filter: exclude agenda pattern (glob)")] = None,
    limit: Annotated[int | None, typer.Option("--limit", help="Limit number of items")] = None,
) -> None:
    """Add documents to a workspace, either by explicit ID or by database query.

    Two modes are supported:
    1. Explicit items: workspace add <item1> <item2> --kind tdoc
    2. Query-based: workspace add --kind tdoc --agenda "*pattern*" --start-date 2018

    Query-based mode is used when no items are given; the filter options
    (--agenda, --title, --source, etc.) then select TDocs from the database.

    The target workspace comes from -w/--workspace, falling back to the
    active workspace. A ``--kind`` value that (after lowercasing and dropping
    trailing "s" characters) is not a known source kind is treated as OTHER.
    """
    target = _resolve_workspace_name(workspace)

    kind_key = kind.lower().rstrip("s")
    known_kinds = {e.value for e in SourceKind}
    source_kind = SourceKind(kind_key) if kind_key in known_kinds else SourceKind.OTHER

    # Phase 1: decide which items to add - taken verbatim or looked up in the database
    if items is None:
        with Progress(
            SpinnerColumn(),
            TextColumn("[progress.description]{task.description}"),
            console=console,
        ) as spinner:
            spinner.add_task("[cyan]Querying database...", total=None)
            resolved_items = asyncio.run(
                _query_tdocs_async(
                    source_kind=source_kind,
                    start_date=start_date,
                    end_date=end_date,
                    source=source,
                    source_ex=source_ex,
                    title=title,
                    title_ex=title_ex,
                    agenda=agenda,
                    agenda_ex=agenda_ex,
                    limit=limit,
                ),
            )

        if not resolved_items:
            console.print("[yellow]No items match the provided filters[/yellow]")
            return

        console.print(f"[cyan]Found {len(resolved_items)} matching items[/cyan]")
    else:
        resolved_items = items

    # Phase 2: validate the items and turn the valid ones into members
    members, skipped = _validate_and_create_members(resolved_items, source_kind, release)

    if not members:
        console.print("[yellow]No valid items to add[/yellow]")
        return

    added_count = add_workspace_members(target, members)
    console.print(f"[green]Added {added_count} item(s) to workspace '{target}'.[/green]")
    if skipped:
        console.print(f"[dim]Skipped {len(skipped)} invalid item(s)[/dim]")


@app.command("clear-invalid", help="Remove members with invalid or missing source paths.")
def workspace_clear_invalid(
    workspace: WorkspaceNameOption = None,
    dry_run: bool = typer.Option(False, "--dry-run", help="Show what would be removed without removing"),
) -> None:
    """Remove members (active or inactive) whose ``source_path`` does not exist on disk.

    With ``dry_run`` the affected members are only listed. Errors other than
    ``typer.Exit`` are printed to the console instead of being raised.
    """
    target = _resolve_workspace_name(workspace)
    try:
        roster = list_workspace_members(target, include_inactive=True)
        # NOTE(review): members created by `workspace add` store the item ID as
        # source_path - confirm such members are not wrongly treated as invalid here.
        stale = [entry for entry in roster if not Path(entry.source_path).exists()]

        if not stale:
            console.print(f"[green]All members in '{target}' have valid paths.[/green]")
            return

        marker = "[yellow](would remove)[/yellow]" if dry_run else "[red](removed)[/red]"
        for entry in stale:
            console.print(f"  {entry.source_item_id}: {entry.source_path} {marker}")

        if dry_run:
            console.print(f"\n[yellow]Dry-run: would remove {len(stale)} invalid members.[/yellow]")
        else:
            for entry in stale:
                remove_workspace_member(target, entry.source_item_id)
            console.print(f"\n[green]Removed {len(stale)} invalid members.[/green]")
    except typer.Exit:
        raise
    except Exception as e:
        console.print(f"[red]Error: {e}[/red]")
+0 −29
Original line number Diff line number Diff line
@@ -6,7 +6,6 @@ from pathlib import Path

import pytest

from tdoc_crawler.cli._workspace_commands import _validate_and_create_members
from tdoc_crawler.config import CacheManager
from tdoc_crawler.config.workspace_registry import WorkspaceRegistry
from tdoc_crawler.models.workspaces import (
@@ -226,31 +225,3 @@ class TestWorkspaceCRUD:
        """Test ensuring default workspace exists."""
        registry = ensure_default_workspace()
        assert DEFAULT_WORKSPACE in registry.workspaces


class TestWorkspaceAddValidation:
    """Checks how ``_validate_and_create_members`` treats TDocs whose files are missing."""

    def test_missing_withdrawn_tdoc_added_as_inactive(self, monkeypatch: pytest.MonkeyPatch) -> None:
        """A TDoc reported as withdrawn is kept as an inactive member even though its files are missing."""
        monkeypatch.setattr("tdoc_crawler.cli._workspace_commands._resolve_inactive_missing_tdoc_status", lambda _item: "withdrawn")
        monkeypatch.setattr("tdoc_crawler.cli._workspace_commands._validate_tdoc_exists", lambda _item: False)

        accepted, rejected = _validate_and_create_members(["S4-230330"], SourceKind.TDOC, release=None)

        assert rejected == []
        assert len(accepted) == 1
        only_member = accepted[0]
        assert only_member.source_item_id == "S4-230330"
        assert only_member.is_active is False

    def test_missing_regular_tdoc_still_skipped(self, monkeypatch: pytest.MonkeyPatch) -> None:
        """A missing TDoc with no inactive-eligible status is skipped with a "TDoc not found" reason."""
        monkeypatch.setattr("tdoc_crawler.cli._workspace_commands._resolve_inactive_missing_tdoc_status", lambda _item: None)
        monkeypatch.setattr("tdoc_crawler.cli._workspace_commands._validate_tdoc_exists", lambda _item: False)

        accepted, rejected = _validate_and_create_members(["S4-251751"], SourceKind.TDOC, release=None)

        assert accepted == []
        assert len(rejected) == 1
        assert (rejected[0][0], rejected[0][1]) == ("S4-251751", "TDoc not found")