Loading src/tdoc_crawler/cli/_workspace_commands.pydeleted 100644 → 0 +0 −596 Original line number Diff line number Diff line """Workspace-related CLI commands for the main application. These commands create, inspect, modify, and process workspaces. """ from __future__ import annotations import asyncio import shutil from datetime import UTC, datetime from pathlib import Path from typing import Annotated import typer from rich.progress import Progress, SpinnerColumn, TextColumn from tdoc_crawler.cli._shared import console from tdoc_crawler.config import resolve_cache_manager from tdoc_crawler.database.specs import SpecDatabase from tdoc_crawler.database.tdocs import TDocDatabase from tdoc_crawler.extraction.convert import convert_for_wiki from tdoc_crawler.extraction.fetch_tdoc import fetch_tdoc_files from tdoc_crawler.extraction.profiles import DEFAULT_EXTRACTION_PROFILE, ExtractionProfile from tdoc_crawler.logging import get_logger from tdoc_crawler.models.base import OutputFormat, SortOrder from tdoc_crawler.models.workspaces import SourceKind, TDocNotFoundError from tdoc_crawler.tdocs.models import TDocQueryConfig from tdoc_crawler.utils.date_parser import parse_partial_date from tdoc_crawler.utils.normalization import normalize_release_version, normalize_spec_number, normalize_tdoc_id from tdoc_crawler.workspaces import ( add_workspace_members, create_workspace, delete_workspace, get_active_workspace, list_workspace_members, list_workspaces, make_workspace_member, normalize_workspace_name, remove_workspace_member, resolve_spec_release_from_db, set_active_workspace, ) logger = get_logger(__name__) INACTIVE_MISSING_TDOC_STATUSES = {"withdrawn", "reserved"} __all__ = ["app"] app = typer.Typer(help="Manage extraction workspaces") _logger = get_logger(__name__) WorkspaceNameOption = Annotated[ str | None, typer.Option( "-w", "--workspace", help="Workspace name (default: active workspace)", ), ] def _resolve_workspace_name(workspace: str | None) -> str: """Resolve workspace name to a normalized string. Args: workspace: Workspace name or None. Returns: Normalized workspace name, or active workspace if None. """ if workspace is None: return get_active_workspace() return normalize_workspace_name(workspace) def _validate_tdoc_exists(tdoc_id: str) -> bool: """Check if a TDoc exists and can be resolved. Args: tdoc_id: TDoc identifier to validate. Returns: True if the TDoc can be resolved, False otherwise. """ try: fetch_tdoc_files(tdoc_id) return True except (TDocNotFoundError, Exception) as e: _logger.debug("TDoc %s validation failed: %s", tdoc_id, e) return False async def _get_tdoc_status(tdoc_id: str) -> str | None: """Return normalized status from database for a TDoc ID, if present.""" normalized_tdoc_id = normalize_tdoc_id(tdoc_id) if normalized_tdoc_id is None: return None manager = resolve_cache_manager() query_config = TDocQueryConfig(tdoc_ids=[normalized_tdoc_id]) async with TDocDatabase(manager.db_file) as db: rows = await db.query_tdocs(query_config) if not rows: return None raw_status = rows[0].status if raw_status is None: return None return raw_status.strip().lower() or None def _resolve_inactive_missing_tdoc_status(tdoc_id: str) -> str | None: """Return status when missing files are acceptable and member should be inactive.""" status = asyncio.run(_get_tdoc_status(tdoc_id)) if status in INACTIVE_MISSING_TDOC_STATUSES: return status return None async def _validate_spec_exists(spec_number: str, release: str | None = None) -> bool: """Check if a spec exists in the database. Args: spec_number: Spec number to validate. release: Optional release version. Returns: True if the spec exists, False otherwise. """ try: normalized_spec = normalize_spec_number(spec_number) manager = resolve_cache_manager() async with SpecDatabase(manager.db_file) as db: versions = await db.get_spec_versions(normalized_spec) if not versions: return False if release: normalized = normalize_release_version(release) return any(normalized in v.version for v in versions) return True except Exception as e: if isinstance(e, (KeyboardInterrupt, SystemExit)): raise _logger.debug("Spec %s validation failed: %s", spec_number, e) return False @app.command("create", help="Create a new workspace.") def workspace_create( name: str = typer.Argument(..., help="Workspace name"), ) -> None: """Create a workspace.""" normalized = normalize_workspace_name(name) create_workspace(normalized) console.print(f"[green]Workspace '{normalized}' created successfully.[/green]") @app.command("list", help="List all available workspaces.") def workspace_list() -> None: """Display all existing workspaces.""" workspaces = list_workspaces() if not workspaces: console.print("[dim]No workspaces found.[/dim]") return for ws in sorted(workspaces, key=lambda w: w.name): active_marker = " [green](active)[/green]" if hasattr(ws, "is_active") and ws.is_active else "" console.print(f"- {ws.name}{active_marker}") @app.command("activate", help="Set a workspace as active.") def workspace_activate(workspace_name: str = typer.Argument(..., help="Workspace name")) -> None: """Activate workspace for default command targets.""" normalized = normalize_workspace_name(workspace_name) set_active_workspace(normalized) console.print(f"[green]Workspace '{normalized}' is now active.[/green]") @app.command("deactivate", help="Deactivate the currently active workspace.") def workspace_deactivate() -> None: """Deactivate workspace context.""" set_active_workspace(None) console.print("[yellow]Workspace deactivated.[/yellow]") @app.command("delete", help="Delete a workspace and optionally its artifacts.") def workspace_delete( workspace_name: str = typer.Argument(..., help="Workspace name"), force: bool = typer.Option(False, "--force", help="Permanently delete workspace and all artifacts"), delete_wiki: bool = typer.Option(False, "--delete-wiki", help="Delete the wiki folder for this workspace"), ) -> None: """Permanently delete a workspace and all associated files.""" normalized = normalize_workspace_name(workspace_name) if not force: console.print("[yellow]Use --force to permanently delete workspace and all artifacts.[/yellow]") return delete_workspace(normalized) if delete_wiki: try: manager = resolve_cache_manager() wiki_dir = manager.workspace_llm_wiki_dir(normalized) if wiki_dir.exists(): shutil.rmtree(wiki_dir) console.print(f"[green]Deleted wiki folder for '{normalized}'.[/green]") except Exception as e: console.print(f"[yellow]Could not delete wiki folder: {e}[/yellow]") console.print(f"[green]Workspace '{normalized}' deleted.[/green]") @app.command("members", help="List workspace members.") def workspace_members( workspace: WorkspaceNameOption = None, include_inactive: bool = typer.Option(False, "--include-inactive", help="Include inactive members"), ) -> None: """List members of a workspace.""" normalized = _resolve_workspace_name(workspace) try: members = list_workspace_members(normalized, include_inactive=include_inactive) if not members: console.print(f"[dim]No members in workspace '{normalized}'.[/dim]") return for member in members: status = "[dim]inactive[/dim]" if not member.is_active else "[green]active[/green]" console.print(f" {member.source_item_id} ({member.source_kind.value}) - {status}") except Exception as e: if isinstance(e, (KeyboardInterrupt, SystemExit, typer.Exit)): raise console.print(f"[red]Error: {e}[/red]") def _should_skip_member( skip_existing: bool, force: bool, wiki_source_dir_base: Path, source_id: str, extraction_profile: ExtractionProfile, ) -> bool: """Check if a member should be skipped because artifacts already exist.""" if not skip_existing or force: return False member_wiki_dir = wiki_source_dir_base / source_id has_artifacts = bool(list(member_wiki_dir.glob("*.pdf"))) if extraction_profile == ExtractionProfile.PDF_ONLY else bool(list(member_wiki_dir.glob("*.md"))) if has_artifacts: console.print(f"[dim] Skipping {source_id} - artifacts exist[/dim]") return has_artifacts @app.command("process", help="Process workspace members.") def workspace_process( workspace: WorkspaceNameOption = None, force: bool = typer.Option(False, "--force", help="Re-process existing artifacts"), limit: int = typer.Option(None, "--limit", help="Limit number of members to process"), skip_existing: bool = typer.Option(False, "--skip-existing", help="Skip members that already have artifacts"), profile: str = typer.Option( DEFAULT_EXTRACTION_PROFILE.value, "--profile", help="Extraction profile: pdf-only, default, or advanced", ), ) -> None: """Extract structured data from all workspace members.""" normalized = _resolve_workspace_name(workspace) try: extraction_profile = ExtractionProfile(profile) except ValueError: console.print(f"[red]Invalid profile '{profile}'. Use: pdf-only, default, advanced[/red]") raise typer.Exit(1) console.print(f"[yellow]Processing workspace '{normalized}' with profile '{extraction_profile.value}'...[/yellow]") try: members = list_workspace_members(normalized, include_inactive=False) except Exception as e: if isinstance(e, (KeyboardInterrupt, SystemExit, typer.Exit)): raise console.print(f"[red]Error listing members: {e}[/red]") raise typer.Exit(1) if not members: console.print(f"[dim]No active members in workspace '{normalized}'.[/dim]") return if limit is not None: members = members[:limit] manager = resolve_cache_manager() wiki_source_dir_base = manager.workspace_sources_dir(normalized) processed = 0 failed = 0 skipped_items: list[tuple[str, str]] = [] for member in members: source_id = member.source_item_id if _should_skip_member(skip_existing, force, wiki_source_dir_base, source_id, extraction_profile): continue wiki_source_dir = wiki_source_dir_base / source_id wiki_source_dir.mkdir(parents=True, exist_ok=True) try: result_path = convert_for_wiki( document_id=source_id, wiki_source_dir=wiki_source_dir, source_kind=member.source_kind, source_path=member.source_path, profile=extraction_profile, force=force, ) if result_path: console.print(f"[green] Processed {source_id} -> {result_path.name}[/green]") processed += 1 else: console.print(f"[yellow] No output for {source_id}[/yellow]") except Exception as e: if isinstance(e, (KeyboardInterrupt, SystemExit, typer.Exit)): raise error_text = str(e) skipped_items.append((source_id, error_text)) logger.debug("Skipped processing %s: %s", source_id, error_text) failed += 1 if skipped_items: console.print("\n[yellow]Skipped documents (processing warnings):[/yellow]") for source_id, error_text in skipped_items: console.print(f"[yellow] - {source_id}: {error_text}[/yellow]") console.print(f"\n[yellow]Processing complete: {processed} succeeded, {failed} failed[/yellow]") def _parse_date_filters( start_date: str | None, end_date: str | None, ) -> tuple[datetime | None, datetime | None]: """Parse and validate date filter strings into datetime objects. Args: start_date: ISO-8601 start date string, or None. end_date: ISO-8601 end date string, or None. Returns: Tuple of (start_datetime, end_datetime), either may be None. Raises: typer.Exit: If a date string cannot be parsed. """ try: start = datetime.combine(parse_partial_date(start_date), datetime.min.time(), tzinfo=UTC) if start_date else None except ValueError: console.print("[red]Invalid start date format; use ISO-8601[/red]") raise typer.Exit(1) try: end = datetime.combine(parse_partial_date(end_date, is_end=True), datetime.max.time(), tzinfo=UTC) if end_date else None except ValueError: console.print("[red]Invalid end date format; use ISO-8601[/red]") raise typer.Exit(1) return start, end async def _query_tdocs_async( source_kind: SourceKind, start_date: str | None, end_date: str | None, source: list[str] | None, source_ex: list[str] | None, title: list[str] | None, title_ex: list[str] | None, agenda: list[str] | None, agenda_ex: list[str] | None, limit: int | None, ) -> list[str]: """Query database for TDocs matching filter criteria. Args: source_kind: Must be TDOC for query mode. start_date: Filter by start date. end_date: Filter by end date. source: Filter by source pattern. source_ex: Exclude by source pattern. title: Filter by title pattern. title_ex: Exclude by title pattern. agenda: Filter by agenda pattern. agenda_ex: Exclude by agenda pattern. limit: Limit number of results. Returns: List of matching TDoc IDs. """ if source_kind != SourceKind.TDOC: console.print("[red]Error: filtering without explicit items is only supported for TDocs[/red]") raise typer.Exit(1) manager = resolve_cache_manager() start, end = _parse_date_filters(start_date, end_date) config = TDocQueryConfig( output_format=OutputFormat.TABLE, tdoc_ids=None, working_groups=None, start_date=start, end_date=end, meeting_start_date=None, meeting_end_date=None, source_pattern=source, source_pattern_exclude=source_ex, title_pattern=title, title_pattern_exclude=title_ex, agenda_pattern=agenda, agenda_pattern_exclude=agenda_ex, limit=limit, order=SortOrder.DESC, ) async with TDocDatabase(manager.db_file) as db: rows = await db.query_tdocs(config) return [row.tdoc_id for row in rows] def _validate_and_create_members( resolved_items: list[str], source_kind: SourceKind, release: str | None, ) -> tuple[list, list[tuple[str, str]]]: """Validate items and create workspace members. Args: resolved_items: Item identifiers to validate and convert. source_kind: Type of source (TDOC, SPEC, OTHER). release: Optional release version for specs. Returns: Tuple of (valid_members, skipped_items_with_reasons). """ members = [] skipped = [] for item in resolved_items: if source_kind == SourceKind.TDOC: inactive_status = _resolve_inactive_missing_tdoc_status(item) if inactive_status is not None: member = make_workspace_member( source_item_id=item, source_path=item, source_kind=source_kind, added_by="cli", ) member.is_active = False members.append(member) console.print(f"[dim] Adding {item} as inactive - status '{inactive_status}'[/dim]") continue if not _validate_tdoc_exists(item): skipped.append((item, "TDoc not found")) console.print(f"[yellow] Skipping {item} - TDoc not found[/yellow]") continue elif source_kind == SourceKind.SPEC and not asyncio.run(_validate_spec_exists(item, release)): skipped.append((item, f"Spec not found (release={release or 'latest'})")) console.print(f"[yellow] Skipping {item} - Spec not found[/yellow]") continue source_item_id = item if source_kind == SourceKind.SPEC: # Resolve release for spec member ID (always include release if available) resolved_release, _ = asyncio.run(resolve_spec_release_from_db(item, release or "latest")) if resolved_release: normalized_release = normalize_release_version(resolved_release) source_item_id = f"{item}-REL{normalized_release}" members.append( make_workspace_member( source_item_id=source_item_id, source_path=item, source_kind=source_kind, added_by="cli", ), ) return members, skipped @app.command("add", help="Add documents to an existing workspace.") def workspace_add( workspace: WorkspaceNameOption = None, items: Annotated[list[str] | None, typer.Argument(help="Items to add (TDoc IDs, spec numbers). If not provided, uses --kind/filter options.")] = None, kind: Annotated[str, typer.Option("--kind", help="Source kind: tdoc, spec, or other")] = "tdoc", release: Annotated[str | None, typer.Option("--release", help="Spec release version (e.g., 19, 19.1, 19.1.2). Only applies to specs.")] = None, # Query-based filtering options start_date: Annotated[str | None, typer.Option("--start-date", help="Filter: start date (ISO-8601)")] = None, end_date: Annotated[str | None, typer.Option("--end-date", help="Filter: end date (ISO-8601)")] = None, source: Annotated[list[str] | None, typer.Option("--source", help="Filter: source pattern (glob)")] = None, source_ex: Annotated[list[str] | None, typer.Option("--source-ex", help="Filter: exclude source pattern (glob)")] = None, title: Annotated[list[str] | None, typer.Option("--title", help="Filter: title pattern (glob)")] = None, title_ex: Annotated[list[str] | None, typer.Option("--title-ex", help="Filter: exclude title pattern (glob)")] = None, agenda: Annotated[list[str] | None, typer.Option("--agenda", help="Filter: agenda pattern (glob)")] = None, agenda_ex: Annotated[list[str] | None, typer.Option("--agenda-ex", help="Filter: exclude agenda pattern (glob)")] = None, limit: Annotated[int | None, typer.Option("--limit", help="Limit number of items")] = None, ) -> None: """Add documents to a workspace. Can be used in two modes: 1. Explicit items: workspace add <item1> <item2> --kind tdoc 2. Query-based: workspace add --kind tdoc --agenda "*pattern*" --start-date 2018 For query-based mode, provide filter options (--agenda, --title, --source, etc.) without explicit items. Workspace is specified via -w/--workspace option, defaulting to active workspace. """ normalized = _resolve_workspace_name(workspace) source_kind = SourceKind(kind.lower().rstrip("s")) if kind.lower().rstrip("s") in {e.value for e in SourceKind} else SourceKind.OTHER # Phase 1: Resolve items - either directly provided or via database query if items is not None: resolved_items = items else: # Database query mode with Progress( SpinnerColumn(), TextColumn("[progress.description]{task.description}"), console=console, ) as progress: progress.add_task("[cyan]Querying database...", total=None) resolved_items = asyncio.run( _query_tdocs_async( source_kind=source_kind, start_date=start_date, end_date=end_date, source=source, source_ex=source_ex, title=title, title_ex=title_ex, agenda=agenda, agenda_ex=agenda_ex, limit=limit, ), ) if not resolved_items: console.print("[yellow]No items match the provided filters[/yellow]") return console.print(f"[cyan]Found {len(resolved_items)} matching items[/cyan]") # Phase 2: Validate and create members members, skipped = _validate_and_create_members(resolved_items, source_kind, release) if not members: console.print("[yellow]No valid items to add[/yellow]") return added = add_workspace_members(normalized, members) console.print(f"[green]Added {added} item(s) to workspace '{normalized}'.[/green]") if skipped: console.print(f"[dim]Skipped {len(skipped)} invalid item(s)[/dim]") @app.command("clear-invalid", help="Remove members with invalid or missing source paths.") def workspace_clear_invalid( workspace: WorkspaceNameOption = None, dry_run: bool = typer.Option(False, "--dry-run", help="Show what would be removed without removing"), ) -> None: """Remove members whose source path no longer exists.""" normalized = _resolve_workspace_name(workspace) try: members = list_workspace_members(normalized, include_inactive=True) to_remove = [m for m in members if not Path(m.source_path).exists()] if not to_remove: console.print(f"[green]All members in '{normalized}' have valid paths.[/green]") return for m in to_remove: status_str = "[yellow](would remove)[/yellow]" if dry_run else "[red](removed)[/red]" console.print(f" {m.source_item_id}: {m.source_path} {status_str}") if not dry_run: for m in to_remove: remove_workspace_member(normalized, m.source_item_id) console.print(f"\n[green]Removed {len(to_remove)} invalid members.[/green]") else: console.print(f"\n[yellow]Dry-run: would remove {len(to_remove)} invalid members.[/yellow]") except Exception as e: if isinstance(e, (KeyboardInterrupt, SystemExit, typer.Exit)): raise console.print(f"[red]Error: {e}[/red]") tests/test_workspaces.py +0 −29 Original line number Diff line number Diff line Loading @@ -6,7 +6,6 @@ from pathlib import Path import pytest from tdoc_crawler.cli._workspace_commands import _validate_and_create_members from tdoc_crawler.config import CacheManager from tdoc_crawler.config.workspace_registry import WorkspaceRegistry from tdoc_crawler.models.workspaces import ( Loading Loading @@ -226,31 +225,3 @@ class TestWorkspaceCRUD: """Test ensuring default workspace exists.""" registry = ensure_default_workspace() assert DEFAULT_WORKSPACE in registry.workspaces class TestWorkspaceAddValidation: """Tests for workspace add member validation behavior.""" def test_missing_withdrawn_tdoc_added_as_inactive(self, monkeypatch: pytest.MonkeyPatch) -> None: """Withdrawn TDocs without files should still be tracked as inactive members.""" monkeypatch.setattr("tdoc_crawler.cli._workspace_commands._resolve_inactive_missing_tdoc_status", lambda _item: "withdrawn") monkeypatch.setattr("tdoc_crawler.cli._workspace_commands._validate_tdoc_exists", lambda _item: False) members, skipped = _validate_and_create_members(["S4-230330"], SourceKind.TDOC, release=None) assert skipped == [] assert len(members) == 1 assert members[0].source_item_id == "S4-230330" assert members[0].is_active is False def test_missing_regular_tdoc_still_skipped(self, monkeypatch: pytest.MonkeyPatch) -> None: """Missing TDocs without inactive-eligible status should remain skipped.""" monkeypatch.setattr("tdoc_crawler.cli._workspace_commands._resolve_inactive_missing_tdoc_status", lambda _item: None) monkeypatch.setattr("tdoc_crawler.cli._workspace_commands._validate_tdoc_exists", lambda _item: False) members, skipped = _validate_and_create_members(["S4-251751"], SourceKind.TDOC, release=None) assert members == [] assert len(skipped) == 1 assert skipped[0][0] == "S4-251751" assert skipped[0][1] == "TDoc not found" Loading
src/tdoc_crawler/cli/_workspace_commands.pydeleted 100644 → 0 +0 −596 Original line number Diff line number Diff line """Workspace-related CLI commands for the main application. These commands create, inspect, modify, and process workspaces. """ from __future__ import annotations import asyncio import shutil from datetime import UTC, datetime from pathlib import Path from typing import Annotated import typer from rich.progress import Progress, SpinnerColumn, TextColumn from tdoc_crawler.cli._shared import console from tdoc_crawler.config import resolve_cache_manager from tdoc_crawler.database.specs import SpecDatabase from tdoc_crawler.database.tdocs import TDocDatabase from tdoc_crawler.extraction.convert import convert_for_wiki from tdoc_crawler.extraction.fetch_tdoc import fetch_tdoc_files from tdoc_crawler.extraction.profiles import DEFAULT_EXTRACTION_PROFILE, ExtractionProfile from tdoc_crawler.logging import get_logger from tdoc_crawler.models.base import OutputFormat, SortOrder from tdoc_crawler.models.workspaces import SourceKind, TDocNotFoundError from tdoc_crawler.tdocs.models import TDocQueryConfig from tdoc_crawler.utils.date_parser import parse_partial_date from tdoc_crawler.utils.normalization import normalize_release_version, normalize_spec_number, normalize_tdoc_id from tdoc_crawler.workspaces import ( add_workspace_members, create_workspace, delete_workspace, get_active_workspace, list_workspace_members, list_workspaces, make_workspace_member, normalize_workspace_name, remove_workspace_member, resolve_spec_release_from_db, set_active_workspace, ) logger = get_logger(__name__) INACTIVE_MISSING_TDOC_STATUSES = {"withdrawn", "reserved"} __all__ = ["app"] app = typer.Typer(help="Manage extraction workspaces") _logger = get_logger(__name__) WorkspaceNameOption = Annotated[ str | None, typer.Option( "-w", "--workspace", help="Workspace name (default: active workspace)", ), ] def _resolve_workspace_name(workspace: str | None) -> str: """Resolve workspace name to a normalized string. Args: workspace: Workspace name or None. Returns: Normalized workspace name, or active workspace if None. """ if workspace is None: return get_active_workspace() return normalize_workspace_name(workspace) def _validate_tdoc_exists(tdoc_id: str) -> bool: """Check if a TDoc exists and can be resolved. Args: tdoc_id: TDoc identifier to validate. Returns: True if the TDoc can be resolved, False otherwise. """ try: fetch_tdoc_files(tdoc_id) return True except (TDocNotFoundError, Exception) as e: _logger.debug("TDoc %s validation failed: %s", tdoc_id, e) return False async def _get_tdoc_status(tdoc_id: str) -> str | None: """Return normalized status from database for a TDoc ID, if present.""" normalized_tdoc_id = normalize_tdoc_id(tdoc_id) if normalized_tdoc_id is None: return None manager = resolve_cache_manager() query_config = TDocQueryConfig(tdoc_ids=[normalized_tdoc_id]) async with TDocDatabase(manager.db_file) as db: rows = await db.query_tdocs(query_config) if not rows: return None raw_status = rows[0].status if raw_status is None: return None return raw_status.strip().lower() or None def _resolve_inactive_missing_tdoc_status(tdoc_id: str) -> str | None: """Return status when missing files are acceptable and member should be inactive.""" status = asyncio.run(_get_tdoc_status(tdoc_id)) if status in INACTIVE_MISSING_TDOC_STATUSES: return status return None async def _validate_spec_exists(spec_number: str, release: str | None = None) -> bool: """Check if a spec exists in the database. Args: spec_number: Spec number to validate. release: Optional release version. Returns: True if the spec exists, False otherwise. """ try: normalized_spec = normalize_spec_number(spec_number) manager = resolve_cache_manager() async with SpecDatabase(manager.db_file) as db: versions = await db.get_spec_versions(normalized_spec) if not versions: return False if release: normalized = normalize_release_version(release) return any(normalized in v.version for v in versions) return True except Exception as e: if isinstance(e, (KeyboardInterrupt, SystemExit)): raise _logger.debug("Spec %s validation failed: %s", spec_number, e) return False @app.command("create", help="Create a new workspace.") def workspace_create( name: str = typer.Argument(..., help="Workspace name"), ) -> None: """Create a workspace.""" normalized = normalize_workspace_name(name) create_workspace(normalized) console.print(f"[green]Workspace '{normalized}' created successfully.[/green]") @app.command("list", help="List all available workspaces.") def workspace_list() -> None: """Display all existing workspaces.""" workspaces = list_workspaces() if not workspaces: console.print("[dim]No workspaces found.[/dim]") return for ws in sorted(workspaces, key=lambda w: w.name): active_marker = " [green](active)[/green]" if hasattr(ws, "is_active") and ws.is_active else "" console.print(f"- {ws.name}{active_marker}") @app.command("activate", help="Set a workspace as active.") def workspace_activate(workspace_name: str = typer.Argument(..., help="Workspace name")) -> None: """Activate workspace for default command targets.""" normalized = normalize_workspace_name(workspace_name) set_active_workspace(normalized) console.print(f"[green]Workspace '{normalized}' is now active.[/green]") @app.command("deactivate", help="Deactivate the currently active workspace.") def workspace_deactivate() -> None: """Deactivate workspace context.""" set_active_workspace(None) console.print("[yellow]Workspace deactivated.[/yellow]") @app.command("delete", help="Delete a workspace and optionally its artifacts.") def workspace_delete( workspace_name: str = typer.Argument(..., help="Workspace name"), force: bool = typer.Option(False, "--force", help="Permanently delete workspace and all artifacts"), delete_wiki: bool = typer.Option(False, "--delete-wiki", help="Delete the wiki folder for this workspace"), ) -> None: """Permanently delete a workspace and all associated files.""" normalized = normalize_workspace_name(workspace_name) if not force: console.print("[yellow]Use --force to permanently delete workspace and all artifacts.[/yellow]") return delete_workspace(normalized) if delete_wiki: try: manager = resolve_cache_manager() wiki_dir = manager.workspace_llm_wiki_dir(normalized) if wiki_dir.exists(): shutil.rmtree(wiki_dir) console.print(f"[green]Deleted wiki folder for '{normalized}'.[/green]") except Exception as e: console.print(f"[yellow]Could not delete wiki folder: {e}[/yellow]") console.print(f"[green]Workspace '{normalized}' deleted.[/green]") @app.command("members", help="List workspace members.") def workspace_members( workspace: WorkspaceNameOption = None, include_inactive: bool = typer.Option(False, "--include-inactive", help="Include inactive members"), ) -> None: """List members of a workspace.""" normalized = _resolve_workspace_name(workspace) try: members = list_workspace_members(normalized, include_inactive=include_inactive) if not members: console.print(f"[dim]No members in workspace '{normalized}'.[/dim]") return for member in members: status = "[dim]inactive[/dim]" if not member.is_active else "[green]active[/green]" console.print(f" {member.source_item_id} ({member.source_kind.value}) - {status}") except Exception as e: if isinstance(e, (KeyboardInterrupt, SystemExit, typer.Exit)): raise console.print(f"[red]Error: {e}[/red]") def _should_skip_member( skip_existing: bool, force: bool, wiki_source_dir_base: Path, source_id: str, extraction_profile: ExtractionProfile, ) -> bool: """Check if a member should be skipped because artifacts already exist.""" if not skip_existing or force: return False member_wiki_dir = wiki_source_dir_base / source_id has_artifacts = bool(list(member_wiki_dir.glob("*.pdf"))) if extraction_profile == ExtractionProfile.PDF_ONLY else bool(list(member_wiki_dir.glob("*.md"))) if has_artifacts: console.print(f"[dim] Skipping {source_id} - artifacts exist[/dim]") return has_artifacts @app.command("process", help="Process workspace members.") def workspace_process( workspace: WorkspaceNameOption = None, force: bool = typer.Option(False, "--force", help="Re-process existing artifacts"), limit: int = typer.Option(None, "--limit", help="Limit number of members to process"), skip_existing: bool = typer.Option(False, "--skip-existing", help="Skip members that already have artifacts"), profile: str = typer.Option( DEFAULT_EXTRACTION_PROFILE.value, "--profile", help="Extraction profile: pdf-only, default, or advanced", ), ) -> None: """Extract structured data from all workspace members.""" normalized = _resolve_workspace_name(workspace) try: extraction_profile = ExtractionProfile(profile) except ValueError: console.print(f"[red]Invalid profile '{profile}'. Use: pdf-only, default, advanced[/red]") raise typer.Exit(1) console.print(f"[yellow]Processing workspace '{normalized}' with profile '{extraction_profile.value}'...[/yellow]") try: members = list_workspace_members(normalized, include_inactive=False) except Exception as e: if isinstance(e, (KeyboardInterrupt, SystemExit, typer.Exit)): raise console.print(f"[red]Error listing members: {e}[/red]") raise typer.Exit(1) if not members: console.print(f"[dim]No active members in workspace '{normalized}'.[/dim]") return if limit is not None: members = members[:limit] manager = resolve_cache_manager() wiki_source_dir_base = manager.workspace_sources_dir(normalized) processed = 0 failed = 0 skipped_items: list[tuple[str, str]] = [] for member in members: source_id = member.source_item_id if _should_skip_member(skip_existing, force, wiki_source_dir_base, source_id, extraction_profile): continue wiki_source_dir = wiki_source_dir_base / source_id wiki_source_dir.mkdir(parents=True, exist_ok=True) try: result_path = convert_for_wiki( document_id=source_id, wiki_source_dir=wiki_source_dir, source_kind=member.source_kind, source_path=member.source_path, profile=extraction_profile, force=force, ) if result_path: console.print(f"[green] Processed {source_id} -> {result_path.name}[/green]") processed += 1 else: console.print(f"[yellow] No output for {source_id}[/yellow]") except Exception as e: if isinstance(e, (KeyboardInterrupt, SystemExit, typer.Exit)): raise error_text = str(e) skipped_items.append((source_id, error_text)) logger.debug("Skipped processing %s: %s", source_id, error_text) failed += 1 if skipped_items: console.print("\n[yellow]Skipped documents (processing warnings):[/yellow]") for source_id, error_text in skipped_items: console.print(f"[yellow] - {source_id}: {error_text}[/yellow]") console.print(f"\n[yellow]Processing complete: {processed} succeeded, {failed} failed[/yellow]") def _parse_date_filters( start_date: str | None, end_date: str | None, ) -> tuple[datetime | None, datetime | None]: """Parse and validate date filter strings into datetime objects. Args: start_date: ISO-8601 start date string, or None. end_date: ISO-8601 end date string, or None. Returns: Tuple of (start_datetime, end_datetime), either may be None. Raises: typer.Exit: If a date string cannot be parsed. """ try: start = datetime.combine(parse_partial_date(start_date), datetime.min.time(), tzinfo=UTC) if start_date else None except ValueError: console.print("[red]Invalid start date format; use ISO-8601[/red]") raise typer.Exit(1) try: end = datetime.combine(parse_partial_date(end_date, is_end=True), datetime.max.time(), tzinfo=UTC) if end_date else None except ValueError: console.print("[red]Invalid end date format; use ISO-8601[/red]") raise typer.Exit(1) return start, end async def _query_tdocs_async( source_kind: SourceKind, start_date: str | None, end_date: str | None, source: list[str] | None, source_ex: list[str] | None, title: list[str] | None, title_ex: list[str] | None, agenda: list[str] | None, agenda_ex: list[str] | None, limit: int | None, ) -> list[str]: """Query database for TDocs matching filter criteria. Args: source_kind: Must be TDOC for query mode. start_date: Filter by start date. end_date: Filter by end date. source: Filter by source pattern. source_ex: Exclude by source pattern. title: Filter by title pattern. title_ex: Exclude by title pattern. agenda: Filter by agenda pattern. agenda_ex: Exclude by agenda pattern. limit: Limit number of results. Returns: List of matching TDoc IDs. """ if source_kind != SourceKind.TDOC: console.print("[red]Error: filtering without explicit items is only supported for TDocs[/red]") raise typer.Exit(1) manager = resolve_cache_manager() start, end = _parse_date_filters(start_date, end_date) config = TDocQueryConfig( output_format=OutputFormat.TABLE, tdoc_ids=None, working_groups=None, start_date=start, end_date=end, meeting_start_date=None, meeting_end_date=None, source_pattern=source, source_pattern_exclude=source_ex, title_pattern=title, title_pattern_exclude=title_ex, agenda_pattern=agenda, agenda_pattern_exclude=agenda_ex, limit=limit, order=SortOrder.DESC, ) async with TDocDatabase(manager.db_file) as db: rows = await db.query_tdocs(config) return [row.tdoc_id for row in rows] def _validate_and_create_members( resolved_items: list[str], source_kind: SourceKind, release: str | None, ) -> tuple[list, list[tuple[str, str]]]: """Validate items and create workspace members. Args: resolved_items: Item identifiers to validate and convert. source_kind: Type of source (TDOC, SPEC, OTHER). release: Optional release version for specs. Returns: Tuple of (valid_members, skipped_items_with_reasons). """ members = [] skipped = [] for item in resolved_items: if source_kind == SourceKind.TDOC: inactive_status = _resolve_inactive_missing_tdoc_status(item) if inactive_status is not None: member = make_workspace_member( source_item_id=item, source_path=item, source_kind=source_kind, added_by="cli", ) member.is_active = False members.append(member) console.print(f"[dim] Adding {item} as inactive - status '{inactive_status}'[/dim]") continue if not _validate_tdoc_exists(item): skipped.append((item, "TDoc not found")) console.print(f"[yellow] Skipping {item} - TDoc not found[/yellow]") continue elif source_kind == SourceKind.SPEC and not asyncio.run(_validate_spec_exists(item, release)): skipped.append((item, f"Spec not found (release={release or 'latest'})")) console.print(f"[yellow] Skipping {item} - Spec not found[/yellow]") continue source_item_id = item if source_kind == SourceKind.SPEC: # Resolve release for spec member ID (always include release if available) resolved_release, _ = asyncio.run(resolve_spec_release_from_db(item, release or "latest")) if resolved_release: normalized_release = normalize_release_version(resolved_release) source_item_id = f"{item}-REL{normalized_release}" members.append( make_workspace_member( source_item_id=source_item_id, source_path=item, source_kind=source_kind, added_by="cli", ), ) return members, skipped @app.command("add", help="Add documents to an existing workspace.") def workspace_add( workspace: WorkspaceNameOption = None, items: Annotated[list[str] | None, typer.Argument(help="Items to add (TDoc IDs, spec numbers). If not provided, uses --kind/filter options.")] = None, kind: Annotated[str, typer.Option("--kind", help="Source kind: tdoc, spec, or other")] = "tdoc", release: Annotated[str | None, typer.Option("--release", help="Spec release version (e.g., 19, 19.1, 19.1.2). Only applies to specs.")] = None, # Query-based filtering options start_date: Annotated[str | None, typer.Option("--start-date", help="Filter: start date (ISO-8601)")] = None, end_date: Annotated[str | None, typer.Option("--end-date", help="Filter: end date (ISO-8601)")] = None, source: Annotated[list[str] | None, typer.Option("--source", help="Filter: source pattern (glob)")] = None, source_ex: Annotated[list[str] | None, typer.Option("--source-ex", help="Filter: exclude source pattern (glob)")] = None, title: Annotated[list[str] | None, typer.Option("--title", help="Filter: title pattern (glob)")] = None, title_ex: Annotated[list[str] | None, typer.Option("--title-ex", help="Filter: exclude title pattern (glob)")] = None, agenda: Annotated[list[str] | None, typer.Option("--agenda", help="Filter: agenda pattern (glob)")] = None, agenda_ex: Annotated[list[str] | None, typer.Option("--agenda-ex", help="Filter: exclude agenda pattern (glob)")] = None, limit: Annotated[int | None, typer.Option("--limit", help="Limit number of items")] = None, ) -> None: """Add documents to a workspace. Can be used in two modes: 1. Explicit items: workspace add <item1> <item2> --kind tdoc 2. Query-based: workspace add --kind tdoc --agenda "*pattern*" --start-date 2018 For query-based mode, provide filter options (--agenda, --title, --source, etc.) without explicit items. Workspace is specified via -w/--workspace option, defaulting to active workspace. """ normalized = _resolve_workspace_name(workspace) source_kind = SourceKind(kind.lower().rstrip("s")) if kind.lower().rstrip("s") in {e.value for e in SourceKind} else SourceKind.OTHER # Phase 1: Resolve items - either directly provided or via database query if items is not None: resolved_items = items else: # Database query mode with Progress( SpinnerColumn(), TextColumn("[progress.description]{task.description}"), console=console, ) as progress: progress.add_task("[cyan]Querying database...", total=None) resolved_items = asyncio.run( _query_tdocs_async( source_kind=source_kind, start_date=start_date, end_date=end_date, source=source, source_ex=source_ex, title=title, title_ex=title_ex, agenda=agenda, agenda_ex=agenda_ex, limit=limit, ), ) if not resolved_items: console.print("[yellow]No items match the provided filters[/yellow]") return console.print(f"[cyan]Found {len(resolved_items)} matching items[/cyan]") # Phase 2: Validate and create members members, skipped = _validate_and_create_members(resolved_items, source_kind, release) if not members: console.print("[yellow]No valid items to add[/yellow]") return added = add_workspace_members(normalized, members) console.print(f"[green]Added {added} item(s) to workspace '{normalized}'.[/green]") if skipped: console.print(f"[dim]Skipped {len(skipped)} invalid item(s)[/dim]") @app.command("clear-invalid", help="Remove members with invalid or missing source paths.") def workspace_clear_invalid( workspace: WorkspaceNameOption = None, dry_run: bool = typer.Option(False, "--dry-run", help="Show what would be removed without removing"), ) -> None: """Remove members whose source path no longer exists.""" normalized = _resolve_workspace_name(workspace) try: members = list_workspace_members(normalized, include_inactive=True) to_remove = [m for m in members if not Path(m.source_path).exists()] if not to_remove: console.print(f"[green]All members in '{normalized}' have valid paths.[/green]") return for m in to_remove: status_str = "[yellow](would remove)[/yellow]" if dry_run else "[red](removed)[/red]" console.print(f" {m.source_item_id}: {m.source_path} {status_str}") if not dry_run: for m in to_remove: remove_workspace_member(normalized, m.source_item_id) console.print(f"\n[green]Removed {len(to_remove)} invalid members.[/green]") else: console.print(f"\n[yellow]Dry-run: would remove {len(to_remove)} invalid members.[/yellow]") except Exception as e: if isinstance(e, (KeyboardInterrupt, SystemExit, typer.Exit)): raise console.print(f"[red]Error: {e}[/red]")
tests/test_workspaces.py +0 −29 Original line number Diff line number Diff line Loading @@ -6,7 +6,6 @@ from pathlib import Path import pytest from tdoc_crawler.cli._workspace_commands import _validate_and_create_members from tdoc_crawler.config import CacheManager from tdoc_crawler.config.workspace_registry import WorkspaceRegistry from tdoc_crawler.models.workspaces import ( Loading Loading @@ -226,31 +225,3 @@ class TestWorkspaceCRUD: """Test ensuring default workspace exists.""" registry = ensure_default_workspace() assert DEFAULT_WORKSPACE in registry.workspaces class TestWorkspaceAddValidation: """Tests for workspace add member validation behavior.""" def test_missing_withdrawn_tdoc_added_as_inactive(self, monkeypatch: pytest.MonkeyPatch) -> None: """Withdrawn TDocs without files should still be tracked as inactive members.""" monkeypatch.setattr("tdoc_crawler.cli._workspace_commands._resolve_inactive_missing_tdoc_status", lambda _item: "withdrawn") monkeypatch.setattr("tdoc_crawler.cli._workspace_commands._validate_tdoc_exists", lambda _item: False) members, skipped = _validate_and_create_members(["S4-230330"], SourceKind.TDOC, release=None) assert skipped == [] assert len(members) == 1 assert members[0].source_item_id == "S4-230330" assert members[0].is_active is False def test_missing_regular_tdoc_still_skipped(self, monkeypatch: pytest.MonkeyPatch) -> None: """Missing TDocs without inactive-eligible status should remain skipped.""" monkeypatch.setattr("tdoc_crawler.cli._workspace_commands._resolve_inactive_missing_tdoc_status", lambda _item: None) monkeypatch.setattr("tdoc_crawler.cli._workspace_commands._validate_tdoc_exists", lambda _item: False) members, skipped = _validate_and_create_members(["S4-251751"], SourceKind.TDOC, release=None) assert members == [] assert len(skipped) == 1 assert skipped[0][0] == "S4-251751" assert skipped[0][1] == "TDoc not found"