Loading src/tdoc_crawler/cli/mgmt_app.py +1 −1 Original line number Diff line number Diff line Loading @@ -11,9 +11,9 @@ from typing import Annotated import typer from tdoc_crawler.cli._workspace_commands import app as workspace_app from tdoc_crawler.cli.config import load_cli_config from tdoc_crawler.cli.config_app import config_app from tdoc_crawler.cli.workspace import app as workspace_app from tdoc_crawler.config import CacheManager from tdoc_crawler.logging import DEFAULT_LEVEL as DEFAULT_VERBOSITY from tdoc_crawler.logging import set_verbosity Loading src/tdoc_crawler/cli/workspace/__init__.py 0 → 100644 +33 −0 Original line number Diff line number Diff line """Workspace CLI commands — split from monolithic _workspace_commands.py.""" from __future__ import annotations import typer from tdoc_crawler.cli.workspace.create import workspace_create from tdoc_crawler.cli.workspace.lifecycle import ( workspace_activate, workspace_deactivate, workspace_delete, workspace_list, ) from tdoc_crawler.cli.workspace.members import ( workspace_add, workspace_clear_invalid, workspace_members, ) from tdoc_crawler.cli.workspace.process import workspace_process app = typer.Typer(help="Manage extraction workspaces") app.command("create")(workspace_create) app.command("list")(workspace_list) app.command("activate")(workspace_activate) app.command("deactivate")(workspace_deactivate) app.command("delete")(workspace_delete) app.command("members")(workspace_members) app.command("add")(workspace_add) app.command("clear-invalid")(workspace_clear_invalid) app.command("process")(workspace_process) __all__ = ["app"] src/tdoc_crawler/cli/workspace/create.py 0 → 100644 +24 −0 Original line number Diff line number Diff line """Workspace create command.""" from __future__ import annotations import typer from tdoc_crawler.cli._shared import console from tdoc_crawler.cli.args import SourcesDirNameOption, WorkspaceDirOption from tdoc_crawler.workspaces import create_workspace, normalize_workspace_name def workspace_create( name: str = typer.Argument(..., help="Workspace name"), workspace_dir: WorkspaceDirOption = None, sources_dirname: SourcesDirNameOption = None, ) -> None: """Create a workspace.""" normalized = normalize_workspace_name(name) create_workspace( normalized, workspace_dir=str(workspace_dir) if workspace_dir else None, sources_dirname=sources_dirname, ) console.print(f"[green]Workspace '{normalized}' created successfully.[/green]") src/tdoc_crawler/cli/workspace/lifecycle.py 0 → 100644 +72 −0 Original line number Diff line number Diff line """Workspace lifecycle commands: list, activate, deactivate, delete.""" from __future__ import annotations import shutil import typer from tdoc_crawler.cli._shared import console from tdoc_crawler.cli.args import DeleteArtifactsOption, DeleteLlmWikiOption, WorkspaceDeleteForceOption from tdoc_crawler.config import resolve_cache_manager from tdoc_crawler.workspaces import ( delete_workspace, get_workspace, list_workspaces, normalize_workspace_name, set_active_workspace, ) def workspace_list() -> None: """Display all existing workspaces.""" workspaces = list_workspaces() if not workspaces: console.print("[dim]No workspaces found.[/dim]") return for ws in sorted(workspaces, key=lambda w: w.name): active_marker = " [green](active)[/green]" if hasattr(ws, "is_active") and ws.is_active else "" console.print(f"- {ws.name}{active_marker}") def workspace_activate(workspace_name: str = typer.Argument(..., help="Workspace name")) -> None: """Activate workspace for default command targets.""" normalized = normalize_workspace_name(workspace_name) set_active_workspace(normalized) console.print(f"[green]Workspace '{normalized}' is now active.[/green]") def workspace_deactivate() -> None: """Deactivate workspace context.""" set_active_workspace(None) console.print("[yellow]Workspace deactivated.[/yellow]") def workspace_delete( workspace_name: str = typer.Argument(..., help="Workspace name"), force: WorkspaceDeleteForceOption = False, delete_artifacts: DeleteArtifactsOption = False, delete_llm_wiki: DeleteLlmWikiOption = False, ) -> None: """Permanently delete a workspace and all associated files.""" normalized = normalize_workspace_name(workspace_name) if not force: console.print("[yellow]Use --force to permanently delete workspace and all artifacts.[/yellow]") return delete_workspace(normalized, delete_artifacts=delete_artifacts) if delete_llm_wiki: try: manager = resolve_cache_manager() metadata = get_workspace(normalized) ws_dir = metadata.resolve_workspace_directory(manager.workspaces_dir) if metadata is not None else manager.workspaces_dir / normalized llm_wiki_dir = ws_dir / "wiki" if llm_wiki_dir.exists(): shutil.rmtree(llm_wiki_dir) console.print(f"[green]Deleted .llm-wiki folder for '{normalized}'.[/green]") except Exception as e: console.print(f"[yellow]Could not delete .llm-wiki folder: {e}[/yellow]") console.print(f"[green]Workspace '{normalized}' deleted.[/green]") src/tdoc_crawler/cli/_workspace_commands.py→src/tdoc_crawler/cli/workspace/members.py +230 −0 Original line number Diff line number Diff line """Workspace-related CLI commands for the main application. These commands create, inspect, modify, and process workspaces. """ """Workspace membership commands: members, add, clear-invalid.""" from __future__ import annotations import asyncio import json import shutil from datetime import UTC, datetime from pathlib import Path from typing import Any import typer from tdoc_crawler.cli._shared import console, create_progress_bar, ensure_hybrid_server_for_profile from tdoc_crawler.cli._shared import console from tdoc_crawler.cli.args import ( AgendaPatternExcludeOption, AgendaPatternOption, AutoCrawlSpecsOption, DeleteArtifactsOption, DeleteLlmWikiOption, DryRunOption, EndDateOption, IncludeInactiveOption, LimitOption, MdYamlFrontmatterOption, ProcessLimitOption, ProfileOption, ReleaseOption, SkipExistingOption, SourceKindOption, SourcePatternExcludeOption, SourcePatternOption, StartDateOption, TitlePatternExcludeOption, TitlePatternOption, VerbosityOption, WorkspaceDeleteForceOption, WorkspaceItemsArgument, WorkspaceNameOption, WorkspaceProcessForceOption, ) from tdoc_crawler.cli.formatting import TableColumnSpec, print_structured_output from tdoc_crawler.config import PathConfig, resolve_cache_manager from tdoc_crawler.config import PathConfig from tdoc_crawler.database.tdocs import TDocDatabase from tdoc_crawler.extraction.convert import convert_for_wiki from tdoc_crawler.extraction.profiles import DEFAULT_EXTRACTION_PROFILE, ExtractionProfile from tdoc_crawler.logging import DEFAULT_LEVEL as DEFAULT_VERBOSITY from tdoc_crawler.logging import get_logger, set_verbosity from tdoc_crawler.models.base import OutputFormat, SortOrder from tdoc_crawler.models.workspaces import SourceKind from tdoc_crawler.specs.operations.checkout import resolve_spec_release Loading @@ -56,110 +36,14 @@ from tdoc_crawler.tdocs.models import TDocQueryConfig from tdoc_crawler.utils.date_parser import parse_partial_date from tdoc_crawler.workspaces import ( add_workspace_members, create_workspace, delete_workspace, get_active_workspace, list_workspace_members, list_workspaces, make_workspace_member, normalize_workspace_name, remove_workspace_member, set_active_workspace, ) logger = get_logger(__name__) __all__ = ["app"] app = typer.Typer(help="Manage extraction workspaces") _logger = get_logger(__name__) def _print_output( data: Any, output_format: OutputFormat, *, table_title: str, table_columns: list[TableColumnSpec] | None = None, ) -> None: """Print structured command output through the shared formatter pipeline.""" print_structured_output( data, output_format, table_title=table_title, table_columns=table_columns, console=console, ) @app.command("create", help="Create a new workspace.") def workspace_create( name: str = typer.Argument(..., help="Workspace name"), ) -> None: """Create a workspace.""" normalized = normalize_workspace_name(name) create_workspace(normalized) console.print(f"[green]Workspace '{normalized}' created successfully.[/green]") @app.command("list", help="List all available workspaces.") def workspace_list() -> None: """Display all existing workspaces.""" workspaces = list_workspaces() if not workspaces: console.print("[dim]No workspaces found.[/dim]") return for ws in sorted(workspaces, key=lambda w: w.name): active_marker = " [green](active)[/green]" if hasattr(ws, "is_active") and ws.is_active else "" console.print(f"- {ws.name}{active_marker}") @app.command("activate", help="Set a workspace as active.") def workspace_activate(workspace_name: str = typer.Argument(..., help="Workspace name")) -> None: """Activate workspace for default command targets.""" normalized = normalize_workspace_name(workspace_name) set_active_workspace(normalized) console.print(f"[green]Workspace '{normalized}' is now active.[/green]") @app.command("deactivate", help="Deactivate the currently active workspace.") def workspace_deactivate() -> None: """Deactivate workspace context.""" set_active_workspace(None) console.print("[yellow]Workspace deactivated.[/yellow]") @app.command("delete", help="Delete a workspace and optionally its artifacts.") def workspace_delete( workspace_name: str = typer.Argument(..., help="Workspace name"), force: WorkspaceDeleteForceOption = False, delete_artifacts: DeleteArtifactsOption = False, delete_llm_wiki: DeleteLlmWikiOption = False, ) -> None: """Permanently delete a workspace and all associated files.""" normalized = normalize_workspace_name(workspace_name) if not force: console.print("[yellow]Use --force to permanently delete workspace and all artifacts.[/yellow]") return delete_workspace(normalized, delete_artifacts=delete_artifacts) if delete_llm_wiki: try: manager = resolve_cache_manager() llm_wiki_dir = manager.workspaces_dir / normalized / "wiki" if llm_wiki_dir.exists(): shutil.rmtree(llm_wiki_dir) console.print(f"[green]Deleted .llm-wiki folder for '{normalized}'.[/green]") except Exception as e: console.print(f"[yellow]Could not delete .llm-wiki folder: {e}[/yellow]") console.print(f"[green]Workspace '{normalized}' deleted.[/green]") @app.command("members", help="List workspace members.") def workspace_members( workspace_name: str = typer.Argument(None, help="Workspace name (default: active workspace)"), include_inactive: IncludeInactiveOption = False, Loading @@ -181,187 +65,6 @@ def workspace_members( console.print(f"[red]Error: {e}[/red]") _PROFILE_LEVELS = { ExtractionProfile.PDF_ONLY: 0, ExtractionProfile.DEFAULT: 1, ExtractionProfile.ADVANCED: 2, } def _should_skip_member( source_id: str, wiki_base: Path, profile: ExtractionProfile, force: bool, skip_existing: bool, ) -> bool: """Check if a workspace member should be skipped due to existing artifacts.""" if force or not skip_existing: return False member_dir = wiki_base / source_id if profile == ExtractionProfile.PDF_ONLY: if list(member_dir.glob("*.pdf")): logger.debug("Skipping %s — PDF exists", source_id) return True return False md_files = list(member_dir.glob("*.md")) json_files = list(member_dir.glob("*.json")) if not md_files or not json_files: return False saved_profile = _read_json_profile(json_files[0]) saved_level = _PROFILE_LEVELS.get(saved_profile, -1) if saved_level < 0: saved_level = _PROFILE_LEVELS.get(_coerce_profile(saved_profile), -1) required_level = _PROFILE_LEVELS[profile] if saved_level >= required_level: label = saved_profile if saved_level >= 0 else "unknown" logger.debug("Skipping %s — %s output exists", source_id, label) return True return False def _read_json_profile(json_path: Path) -> str: """Read extraction_profile from a JSON file, empty string on failure.""" try: data = json.loads(json_path.read_text(encoding="utf-8")) return data.get("extraction_profile", "") except json.JSONDecodeError, OSError: return "" def _coerce_profile(value: str) -> ExtractionProfile | None: """Try to parse a string as ExtractionProfile, return None on failure.""" try: return ExtractionProfile(value) except ValueError: return None def _process_member( member: Any, wiki_source_dir_base: Path, extraction_profile: ExtractionProfile, force: bool, md_yaml_frontmatter: bool, ) -> tuple[str, bool, bool]: """Process a single workspace member. Returns: Tuple of (source_id, succeeded, failed). """ source_id = member.source_item_id wiki_source_dir = wiki_source_dir_base / source_id wiki_source_dir.mkdir(parents=True, exist_ok=True) try: result_path = convert_for_wiki( document_id=source_id, wiki_source_dir=wiki_source_dir, source_kind=member.source_kind, profile=extraction_profile, force=force, release=member.release, md_yaml_frontmatter=md_yaml_frontmatter, ) if result_path: suffix = result_path.suffix.lstrip(".") logger.debug("%s [%s] → %s", source_id, extraction_profile.value, suffix) return source_id, True, False logger.debug("No output for %s", source_id) return source_id, False, False except Exception as e: console.print(f"[red] Failed {source_id}: {e}[/red]") logger.error("Failed to process %s: %s", source_id, e) return source_id, False, True @app.command("process", help="Process workspace members.") def workspace_process( workspace_name: str = typer.Argument(None, help="Workspace name (default: active workspace)"), force: WorkspaceProcessForceOption = False, limit: ProcessLimitOption = None, skip_existing: SkipExistingOption = False, profile: ProfileOption = DEFAULT_EXTRACTION_PROFILE.value, md_yaml_frontmatter: MdYamlFrontmatterOption = True, verbosity: VerbosityOption = str(DEFAULT_VERBOSITY), ) -> None: """Extract structured data from all workspace members.""" set_verbosity(verbosity) if workspace_name is None: workspace_name = get_active_workspace() normalized = normalize_workspace_name(workspace_name) try: extraction_profile = ExtractionProfile(profile) except ValueError: console.print(f"[red]Invalid profile '{profile}'. Use: pdf-only, default, advanced[/red]") raise typer.Exit(1) console.print(f"[yellow]Processing workspace '{normalized}' with profile '{extraction_profile.value}'...[/yellow]") try: members = list_workspace_members(normalized, include_inactive=False) except Exception as e: console.print(f"[red]Error listing members: {e}[/red]") raise typer.Exit(1) if not members: console.print(f"[dim]No active members in workspace '{normalized}'.[/dim]") return if limit is not None: members = members[:limit] if not ensure_hybrid_server_for_profile(extraction_profile): raise typer.Exit(1) cache_manager = resolve_cache_manager() wiki_source_dir_base = cache_manager.workspaces_dir / normalized / "sources" processed = 0 failed = 0 skipped = 0 progress, task = create_progress_bar( f"Processing [{extraction_profile.value}]", total=len(members), ) with progress: for member in members: source_id = member.source_item_id if _should_skip_member(source_id, wiki_source_dir_base, extraction_profile, force, skip_existing): skipped += 1 progress.advance(task) continue progress.update(task, description=f"Processing [{extraction_profile.value}] {source_id}") _, succeeded, errored = _process_member( member, wiki_source_dir_base, extraction_profile, force, md_yaml_frontmatter, ) if succeeded: processed += 1 if errored: failed += 1 progress.advance(task) console.print(f"[yellow]Processing complete: {processed} succeeded, {failed} failed, {skipped} skipped[/yellow]") def _resolve_spec_release_for_add( item: str, release: str, Loading @@ -385,7 +88,6 @@ def _resolve_spec_release_for_add( return release @app.command("add", help="Add documents to an existing workspace.") def workspace_add( items: WorkspaceItemsArgument = None, workspace: WorkspaceNameOption = None, Loading @@ -408,7 +110,6 @@ def workspace_add( Either provide items directly, or use filter options (--agenda, --source, etc.) to query the TDoc database and add matching results in bulk. """ # Resolve workspace name: -w flag > active workspace resolved_name = workspace or get_active_workspace() if resolved_name is None: console.print("[red]No workspace specified and no active workspace set. Use 'workspace create' and 'workspace activate' first.[/red]") Loading @@ -417,13 +118,10 @@ def workspace_add( normalized = normalize_workspace_name(resolved_name) source_kind = SourceKind(kind) # Check if any filter flags are set has_filters = any([agenda, agenda_ex, start_date, end_date, source, source_ex, title, title_ex]) members: list = [] if has_filters: # Batch mode: query TDoc database and add matching results try: start = datetime.combine(parse_partial_date(start_date), datetime.min.time(), tzinfo=UTC) if start_date else None except ValueError as exc: Loading Loading @@ -478,7 +176,6 @@ def workspace_add( ) elif items: # Direct mode: add items by ID for item in items: resolved_release = _resolve_spec_release_for_add(item, release, source_kind, auto_crawl_specs) members.append( Loading @@ -503,7 +200,6 @@ def workspace_add( console.print(f"[green]Added {added} item(s) to workspace '{normalized}' ({mode}).[/green]") @app.command("clear-invalid", help="Remove members with invalid or missing source paths.") def workspace_clear_invalid( workspace_name: str = typer.Argument(None, help="Workspace name (default: active workspace)"), dry_run: DryRunOption = False, Loading Loading
src/tdoc_crawler/cli/mgmt_app.py +1 −1 Original line number Diff line number Diff line Loading @@ -11,9 +11,9 @@ from typing import Annotated import typer from tdoc_crawler.cli._workspace_commands import app as workspace_app from tdoc_crawler.cli.config import load_cli_config from tdoc_crawler.cli.config_app import config_app from tdoc_crawler.cli.workspace import app as workspace_app from tdoc_crawler.config import CacheManager from tdoc_crawler.logging import DEFAULT_LEVEL as DEFAULT_VERBOSITY from tdoc_crawler.logging import set_verbosity Loading
src/tdoc_crawler/cli/workspace/__init__.py 0 → 100644 +33 −0 Original line number Diff line number Diff line """Workspace CLI commands — split from monolithic _workspace_commands.py.""" from __future__ import annotations import typer from tdoc_crawler.cli.workspace.create import workspace_create from tdoc_crawler.cli.workspace.lifecycle import ( workspace_activate, workspace_deactivate, workspace_delete, workspace_list, ) from tdoc_crawler.cli.workspace.members import ( workspace_add, workspace_clear_invalid, workspace_members, ) from tdoc_crawler.cli.workspace.process import workspace_process app = typer.Typer(help="Manage extraction workspaces") app.command("create")(workspace_create) app.command("list")(workspace_list) app.command("activate")(workspace_activate) app.command("deactivate")(workspace_deactivate) app.command("delete")(workspace_delete) app.command("members")(workspace_members) app.command("add")(workspace_add) app.command("clear-invalid")(workspace_clear_invalid) app.command("process")(workspace_process) __all__ = ["app"]
src/tdoc_crawler/cli/workspace/create.py 0 → 100644 +24 −0 Original line number Diff line number Diff line """Workspace create command.""" from __future__ import annotations import typer from tdoc_crawler.cli._shared import console from tdoc_crawler.cli.args import SourcesDirNameOption, WorkspaceDirOption from tdoc_crawler.workspaces import create_workspace, normalize_workspace_name def workspace_create( name: str = typer.Argument(..., help="Workspace name"), workspace_dir: WorkspaceDirOption = None, sources_dirname: SourcesDirNameOption = None, ) -> None: """Create a workspace.""" normalized = normalize_workspace_name(name) create_workspace( normalized, workspace_dir=str(workspace_dir) if workspace_dir else None, sources_dirname=sources_dirname, ) console.print(f"[green]Workspace '{normalized}' created successfully.[/green]")
src/tdoc_crawler/cli/workspace/lifecycle.py 0 → 100644 +72 −0 Original line number Diff line number Diff line """Workspace lifecycle commands: list, activate, deactivate, delete.""" from __future__ import annotations import shutil import typer from tdoc_crawler.cli._shared import console from tdoc_crawler.cli.args import DeleteArtifactsOption, DeleteLlmWikiOption, WorkspaceDeleteForceOption from tdoc_crawler.config import resolve_cache_manager from tdoc_crawler.workspaces import ( delete_workspace, get_workspace, list_workspaces, normalize_workspace_name, set_active_workspace, ) def workspace_list() -> None: """Display all existing workspaces.""" workspaces = list_workspaces() if not workspaces: console.print("[dim]No workspaces found.[/dim]") return for ws in sorted(workspaces, key=lambda w: w.name): active_marker = " [green](active)[/green]" if hasattr(ws, "is_active") and ws.is_active else "" console.print(f"- {ws.name}{active_marker}") def workspace_activate(workspace_name: str = typer.Argument(..., help="Workspace name")) -> None: """Activate workspace for default command targets.""" normalized = normalize_workspace_name(workspace_name) set_active_workspace(normalized) console.print(f"[green]Workspace '{normalized}' is now active.[/green]") def workspace_deactivate() -> None: """Deactivate workspace context.""" set_active_workspace(None) console.print("[yellow]Workspace deactivated.[/yellow]") def workspace_delete( workspace_name: str = typer.Argument(..., help="Workspace name"), force: WorkspaceDeleteForceOption = False, delete_artifacts: DeleteArtifactsOption = False, delete_llm_wiki: DeleteLlmWikiOption = False, ) -> None: """Permanently delete a workspace and all associated files.""" normalized = normalize_workspace_name(workspace_name) if not force: console.print("[yellow]Use --force to permanently delete workspace and all artifacts.[/yellow]") return delete_workspace(normalized, delete_artifacts=delete_artifacts) if delete_llm_wiki: try: manager = resolve_cache_manager() metadata = get_workspace(normalized) ws_dir = metadata.resolve_workspace_directory(manager.workspaces_dir) if metadata is not None else manager.workspaces_dir / normalized llm_wiki_dir = ws_dir / "wiki" if llm_wiki_dir.exists(): shutil.rmtree(llm_wiki_dir) console.print(f"[green]Deleted .llm-wiki folder for '{normalized}'.[/green]") except Exception as e: console.print(f"[yellow]Could not delete .llm-wiki folder: {e}[/yellow]") console.print(f"[green]Workspace '{normalized}' deleted.[/green]")
src/tdoc_crawler/cli/_workspace_commands.py→src/tdoc_crawler/cli/workspace/members.py +230 −0 Original line number Diff line number Diff line """Workspace-related CLI commands for the main application. These commands create, inspect, modify, and process workspaces. """ """Workspace membership commands: members, add, clear-invalid.""" from __future__ import annotations import asyncio import json import shutil from datetime import UTC, datetime from pathlib import Path from typing import Any import typer from tdoc_crawler.cli._shared import console, create_progress_bar, ensure_hybrid_server_for_profile from tdoc_crawler.cli._shared import console from tdoc_crawler.cli.args import ( AgendaPatternExcludeOption, AgendaPatternOption, AutoCrawlSpecsOption, DeleteArtifactsOption, DeleteLlmWikiOption, DryRunOption, EndDateOption, IncludeInactiveOption, LimitOption, MdYamlFrontmatterOption, ProcessLimitOption, ProfileOption, ReleaseOption, SkipExistingOption, SourceKindOption, SourcePatternExcludeOption, SourcePatternOption, StartDateOption, TitlePatternExcludeOption, TitlePatternOption, VerbosityOption, WorkspaceDeleteForceOption, WorkspaceItemsArgument, WorkspaceNameOption, WorkspaceProcessForceOption, ) from tdoc_crawler.cli.formatting import TableColumnSpec, print_structured_output from tdoc_crawler.config import PathConfig, resolve_cache_manager from tdoc_crawler.config import PathConfig from tdoc_crawler.database.tdocs import TDocDatabase from tdoc_crawler.extraction.convert import convert_for_wiki from tdoc_crawler.extraction.profiles import DEFAULT_EXTRACTION_PROFILE, ExtractionProfile from tdoc_crawler.logging import DEFAULT_LEVEL as DEFAULT_VERBOSITY from tdoc_crawler.logging import get_logger, set_verbosity from tdoc_crawler.models.base import OutputFormat, SortOrder from tdoc_crawler.models.workspaces import SourceKind from tdoc_crawler.specs.operations.checkout import resolve_spec_release Loading @@ -56,110 +36,14 @@ from tdoc_crawler.tdocs.models import TDocQueryConfig from tdoc_crawler.utils.date_parser import parse_partial_date from tdoc_crawler.workspaces import ( add_workspace_members, create_workspace, delete_workspace, get_active_workspace, list_workspace_members, list_workspaces, make_workspace_member, normalize_workspace_name, remove_workspace_member, set_active_workspace, ) logger = get_logger(__name__) __all__ = ["app"] app = typer.Typer(help="Manage extraction workspaces") _logger = get_logger(__name__) def _print_output( data: Any, output_format: OutputFormat, *, table_title: str, table_columns: list[TableColumnSpec] | None = None, ) -> None: """Print structured command output through the shared formatter pipeline.""" print_structured_output( data, output_format, table_title=table_title, table_columns=table_columns, console=console, ) @app.command("create", help="Create a new workspace.") def workspace_create( name: str = typer.Argument(..., help="Workspace name"), ) -> None: """Create a workspace.""" normalized = normalize_workspace_name(name) create_workspace(normalized) console.print(f"[green]Workspace '{normalized}' created successfully.[/green]") @app.command("list", help="List all available workspaces.") def workspace_list() -> None: """Display all existing workspaces.""" workspaces = list_workspaces() if not workspaces: console.print("[dim]No workspaces found.[/dim]") return for ws in sorted(workspaces, key=lambda w: w.name): active_marker = " [green](active)[/green]" if hasattr(ws, "is_active") and ws.is_active else "" console.print(f"- {ws.name}{active_marker}") @app.command("activate", help="Set a workspace as active.") def workspace_activate(workspace_name: str = typer.Argument(..., help="Workspace name")) -> None: """Activate workspace for default command targets.""" normalized = normalize_workspace_name(workspace_name) set_active_workspace(normalized) console.print(f"[green]Workspace '{normalized}' is now active.[/green]") @app.command("deactivate", help="Deactivate the currently active workspace.") def workspace_deactivate() -> None: """Deactivate workspace context.""" set_active_workspace(None) console.print("[yellow]Workspace deactivated.[/yellow]") @app.command("delete", help="Delete a workspace and optionally its artifacts.") def workspace_delete( workspace_name: str = typer.Argument(..., help="Workspace name"), force: WorkspaceDeleteForceOption = False, delete_artifacts: DeleteArtifactsOption = False, delete_llm_wiki: DeleteLlmWikiOption = False, ) -> None: """Permanently delete a workspace and all associated files.""" normalized = normalize_workspace_name(workspace_name) if not force: console.print("[yellow]Use --force to permanently delete workspace and all artifacts.[/yellow]") return delete_workspace(normalized, delete_artifacts=delete_artifacts) if delete_llm_wiki: try: manager = resolve_cache_manager() llm_wiki_dir = manager.workspaces_dir / normalized / "wiki" if llm_wiki_dir.exists(): shutil.rmtree(llm_wiki_dir) console.print(f"[green]Deleted .llm-wiki folder for '{normalized}'.[/green]") except Exception as e: console.print(f"[yellow]Could not delete .llm-wiki folder: {e}[/yellow]") console.print(f"[green]Workspace '{normalized}' deleted.[/green]") @app.command("members", help="List workspace members.") def workspace_members( workspace_name: str = typer.Argument(None, help="Workspace name (default: active workspace)"), include_inactive: IncludeInactiveOption = False, Loading @@ -181,187 +65,6 @@ def workspace_members( console.print(f"[red]Error: {e}[/red]") _PROFILE_LEVELS = { ExtractionProfile.PDF_ONLY: 0, ExtractionProfile.DEFAULT: 1, ExtractionProfile.ADVANCED: 2, } def _should_skip_member( source_id: str, wiki_base: Path, profile: ExtractionProfile, force: bool, skip_existing: bool, ) -> bool: """Check if a workspace member should be skipped due to existing artifacts.""" if force or not skip_existing: return False member_dir = wiki_base / source_id if profile == ExtractionProfile.PDF_ONLY: if list(member_dir.glob("*.pdf")): logger.debug("Skipping %s — PDF exists", source_id) return True return False md_files = list(member_dir.glob("*.md")) json_files = list(member_dir.glob("*.json")) if not md_files or not json_files: return False saved_profile = _read_json_profile(json_files[0]) saved_level = _PROFILE_LEVELS.get(saved_profile, -1) if saved_level < 0: saved_level = _PROFILE_LEVELS.get(_coerce_profile(saved_profile), -1) required_level = _PROFILE_LEVELS[profile] if saved_level >= required_level: label = saved_profile if saved_level >= 0 else "unknown" logger.debug("Skipping %s — %s output exists", source_id, label) return True return False def _read_json_profile(json_path: Path) -> str: """Read extraction_profile from a JSON file, empty string on failure.""" try: data = json.loads(json_path.read_text(encoding="utf-8")) return data.get("extraction_profile", "") except json.JSONDecodeError, OSError: return "" def _coerce_profile(value: str) -> ExtractionProfile | None: """Try to parse a string as ExtractionProfile, return None on failure.""" try: return ExtractionProfile(value) except ValueError: return None def _process_member( member: Any, wiki_source_dir_base: Path, extraction_profile: ExtractionProfile, force: bool, md_yaml_frontmatter: bool, ) -> tuple[str, bool, bool]: """Process a single workspace member. Returns: Tuple of (source_id, succeeded, failed). """ source_id = member.source_item_id wiki_source_dir = wiki_source_dir_base / source_id wiki_source_dir.mkdir(parents=True, exist_ok=True) try: result_path = convert_for_wiki( document_id=source_id, wiki_source_dir=wiki_source_dir, source_kind=member.source_kind, profile=extraction_profile, force=force, release=member.release, md_yaml_frontmatter=md_yaml_frontmatter, ) if result_path: suffix = result_path.suffix.lstrip(".") logger.debug("%s [%s] → %s", source_id, extraction_profile.value, suffix) return source_id, True, False logger.debug("No output for %s", source_id) return source_id, False, False except Exception as e: console.print(f"[red] Failed {source_id}: {e}[/red]") logger.error("Failed to process %s: %s", source_id, e) return source_id, False, True @app.command("process", help="Process workspace members.") def workspace_process( workspace_name: str = typer.Argument(None, help="Workspace name (default: active workspace)"), force: WorkspaceProcessForceOption = False, limit: ProcessLimitOption = None, skip_existing: SkipExistingOption = False, profile: ProfileOption = DEFAULT_EXTRACTION_PROFILE.value, md_yaml_frontmatter: MdYamlFrontmatterOption = True, verbosity: VerbosityOption = str(DEFAULT_VERBOSITY), ) -> None: """Extract structured data from all workspace members.""" set_verbosity(verbosity) if workspace_name is None: workspace_name = get_active_workspace() normalized = normalize_workspace_name(workspace_name) try: extraction_profile = ExtractionProfile(profile) except ValueError: console.print(f"[red]Invalid profile '{profile}'. Use: pdf-only, default, advanced[/red]") raise typer.Exit(1) console.print(f"[yellow]Processing workspace '{normalized}' with profile '{extraction_profile.value}'...[/yellow]") try: members = list_workspace_members(normalized, include_inactive=False) except Exception as e: console.print(f"[red]Error listing members: {e}[/red]") raise typer.Exit(1) if not members: console.print(f"[dim]No active members in workspace '{normalized}'.[/dim]") return if limit is not None: members = members[:limit] if not ensure_hybrid_server_for_profile(extraction_profile): raise typer.Exit(1) cache_manager = resolve_cache_manager() wiki_source_dir_base = cache_manager.workspaces_dir / normalized / "sources" processed = 0 failed = 0 skipped = 0 progress, task = create_progress_bar( f"Processing [{extraction_profile.value}]", total=len(members), ) with progress: for member in members: source_id = member.source_item_id if _should_skip_member(source_id, wiki_source_dir_base, extraction_profile, force, skip_existing): skipped += 1 progress.advance(task) continue progress.update(task, description=f"Processing [{extraction_profile.value}] {source_id}") _, succeeded, errored = _process_member( member, wiki_source_dir_base, extraction_profile, force, md_yaml_frontmatter, ) if succeeded: processed += 1 if errored: failed += 1 progress.advance(task) console.print(f"[yellow]Processing complete: {processed} succeeded, {failed} failed, {skipped} skipped[/yellow]") def _resolve_spec_release_for_add( item: str, release: str, Loading @@ -385,7 +88,6 @@ def _resolve_spec_release_for_add( return release @app.command("add", help="Add documents to an existing workspace.") def workspace_add( items: WorkspaceItemsArgument = None, workspace: WorkspaceNameOption = None, Loading @@ -408,7 +110,6 @@ def workspace_add( Either provide items directly, or use filter options (--agenda, --source, etc.) to query the TDoc database and add matching results in bulk. """ # Resolve workspace name: -w flag > active workspace resolved_name = workspace or get_active_workspace() if resolved_name is None: console.print("[red]No workspace specified and no active workspace set. Use 'workspace create' and 'workspace activate' first.[/red]") Loading @@ -417,13 +118,10 @@ def workspace_add( normalized = normalize_workspace_name(resolved_name) source_kind = SourceKind(kind) # Check if any filter flags are set has_filters = any([agenda, agenda_ex, start_date, end_date, source, source_ex, title, title_ex]) members: list = [] if has_filters: # Batch mode: query TDoc database and add matching results try: start = datetime.combine(parse_partial_date(start_date), datetime.min.time(), tzinfo=UTC) if start_date else None except ValueError as exc: Loading Loading @@ -478,7 +176,6 @@ def workspace_add( ) elif items: # Direct mode: add items by ID for item in items: resolved_release = _resolve_spec_release_for_add(item, release, source_kind, auto_crawl_specs) members.append( Loading @@ -503,7 +200,6 @@ def workspace_add( console.print(f"[green]Added {added} item(s) to workspace '{normalized}' ({mode}).[/green]") @app.command("clear-invalid", help="Remove members with invalid or missing source paths.") def workspace_clear_invalid( workspace_name: str = typer.Argument(None, help="Workspace name (default: active workspace)"), dry_run: DryRunOption = False, Loading