♻️ refactor(cli): split workspace commands into subpackage (83cbe215) · Commits · Jan Reimes / 3gpp-crawler

src/tdoc_crawler/cli/mgmt_app.py

+1 −1

Original line number	Diff line number	Diff line
		@@ -11,9 +11,9 @@ from typing import Annotated

		import typer

		from tdoc_crawler.cli._workspace_commands import app as workspace_app
		from tdoc_crawler.cli.config import load_cli_config
		from tdoc_crawler.cli.config_app import config_app
		from tdoc_crawler.cli.workspace import app as workspace_app
		from tdoc_crawler.config import CacheManager
		from tdoc_crawler.logging import DEFAULT_LEVEL as DEFAULT_VERBOSITY
		from tdoc_crawler.logging import set_verbosity

src/tdoc_crawler/cli/workspace/init.py

0 → 100644

+33 −0

Original line number	Diff line number	Diff line
		"""Workspace CLI commands — split from monolithic _workspace_commands.py."""

		from __future__ import annotations

		import typer

		from tdoc_crawler.cli.workspace.create import workspace_create
		from tdoc_crawler.cli.workspace.lifecycle import (
		workspace_activate,
		workspace_deactivate,
		workspace_delete,
		workspace_list,
		)
		from tdoc_crawler.cli.workspace.members import (
		workspace_add,
		workspace_clear_invalid,
		workspace_members,
		)
		from tdoc_crawler.cli.workspace.process import workspace_process

		app = typer.Typer(help="Manage extraction workspaces")

		app.command("create")(workspace_create)
		app.command("list")(workspace_list)
		app.command("activate")(workspace_activate)
		app.command("deactivate")(workspace_deactivate)
		app.command("delete")(workspace_delete)
		app.command("members")(workspace_members)
		app.command("add")(workspace_add)
		app.command("clear-invalid")(workspace_clear_invalid)
		app.command("process")(workspace_process)

		__all__ = ["app"]

src/tdoc_crawler/cli/workspace/create.py

0 → 100644

+24 −0

Original line number	Diff line number	Diff line
		"""Workspace create command."""

		from __future__ import annotations

		import typer

		from tdoc_crawler.cli._shared import console
		from tdoc_crawler.cli.args import SourcesDirNameOption, WorkspaceDirOption
		from tdoc_crawler.workspaces import create_workspace, normalize_workspace_name


		def workspace_create(
		name: str = typer.Argument(..., help="Workspace name"),
		workspace_dir: WorkspaceDirOption = None,
		sources_dirname: SourcesDirNameOption = None,
		) -> None:
		"""Create a workspace."""
		normalized = normalize_workspace_name(name)
		create_workspace(
		normalized,
		workspace_dir=str(workspace_dir) if workspace_dir else None,
		sources_dirname=sources_dirname,
		)
		console.print(f"[green]Workspace '{normalized}' created successfully.[/green]")

src/tdoc_crawler/cli/workspace/lifecycle.py

0 → 100644

+72 −0

Original line number	Diff line number	Diff line
		"""Workspace lifecycle commands: list, activate, deactivate, delete."""

		from __future__ import annotations

		import shutil

		import typer

		from tdoc_crawler.cli._shared import console
		from tdoc_crawler.cli.args import DeleteArtifactsOption, DeleteLlmWikiOption, WorkspaceDeleteForceOption
		from tdoc_crawler.config import resolve_cache_manager
		from tdoc_crawler.workspaces import (
		delete_workspace,
		get_workspace,
		list_workspaces,
		normalize_workspace_name,
		set_active_workspace,
		)


		def workspace_list() -> None:
		"""Display all existing workspaces."""
		workspaces = list_workspaces()
		if not workspaces:
		console.print("[dim]No workspaces found.[/dim]")
		return

		for ws in sorted(workspaces, key=lambda w: w.name):
		active_marker = " [green](active)[/green]" if hasattr(ws, "is_active") and ws.is_active else ""
		console.print(f"- {ws.name}{active_marker}")


		def workspace_activate(workspace_name: str = typer.Argument(..., help="Workspace name")) -> None:
		"""Activate workspace for default command targets."""
		normalized = normalize_workspace_name(workspace_name)
		set_active_workspace(normalized)
		console.print(f"[green]Workspace '{normalized}' is now active.[/green]")


		def workspace_deactivate() -> None:
		"""Deactivate workspace context."""
		set_active_workspace(None)
		console.print("[yellow]Workspace deactivated.[/yellow]")


		def workspace_delete(
		workspace_name: str = typer.Argument(..., help="Workspace name"),
		force: WorkspaceDeleteForceOption = False,
		delete_artifacts: DeleteArtifactsOption = False,
		delete_llm_wiki: DeleteLlmWikiOption = False,
		) -> None:
		"""Permanently delete a workspace and all associated files."""
		normalized = normalize_workspace_name(workspace_name)
		if not force:
		console.print("[yellow]Use --force to permanently delete workspace and all artifacts.[/yellow]")
		return

		delete_workspace(normalized, delete_artifacts=delete_artifacts)

		if delete_llm_wiki:
		try:
		manager = resolve_cache_manager()
		metadata = get_workspace(normalized)
		ws_dir = metadata.resolve_workspace_directory(manager.workspaces_dir) if metadata is not None else manager.workspaces_dir / normalized
		llm_wiki_dir = ws_dir / "wiki"
		if llm_wiki_dir.exists():
		shutil.rmtree(llm_wiki_dir)
		console.print(f"[green]Deleted .llm-wiki folder for '{normalized}'.[/green]")
		except Exception as e:
		console.print(f"[yellow]Could not delete .llm-wiki folder: {e}[/yellow]")

		console.print(f"[green]Workspace '{normalized}' deleted.[/green]")

src/tdoc_crawler/cli/_workspace_commands.py→src/tdoc_crawler/cli/workspace/members.py

+230 −0

Original line number	Diff line number	Diff line
		"""Workspace-related CLI commands for the main application.

		These commands create, inspect, modify, and process workspaces.
		"""
		"""Workspace membership commands: members, add, clear-invalid."""

		from __future__ import annotations

		import asyncio
		import json
		import shutil
		from datetime import UTC, datetime
		from pathlib import Path
		from typing import Any

		import typer

		from tdoc_crawler.cli._shared import console, create_progress_bar, ensure_hybrid_server_for_profile
		from tdoc_crawler.cli._shared import console
		from tdoc_crawler.cli.args import (
		AgendaPatternExcludeOption,
		AgendaPatternOption,
		AutoCrawlSpecsOption,
		DeleteArtifactsOption,
		DeleteLlmWikiOption,
		DryRunOption,
		EndDateOption,
		IncludeInactiveOption,
		LimitOption,
		MdYamlFrontmatterOption,
		ProcessLimitOption,
		ProfileOption,
		ReleaseOption,
		SkipExistingOption,
		SourceKindOption,
		SourcePatternExcludeOption,
		SourcePatternOption,
		StartDateOption,
		TitlePatternExcludeOption,
		TitlePatternOption,
		VerbosityOption,
		WorkspaceDeleteForceOption,
		WorkspaceItemsArgument,
		WorkspaceNameOption,
		WorkspaceProcessForceOption,
		)
		from tdoc_crawler.cli.formatting import TableColumnSpec, print_structured_output
		from tdoc_crawler.config import PathConfig, resolve_cache_manager
		from tdoc_crawler.config import PathConfig
		from tdoc_crawler.database.tdocs import TDocDatabase
		from tdoc_crawler.extraction.convert import convert_for_wiki
		from tdoc_crawler.extraction.profiles import DEFAULT_EXTRACTION_PROFILE, ExtractionProfile
		from tdoc_crawler.logging import DEFAULT_LEVEL as DEFAULT_VERBOSITY
		from tdoc_crawler.logging import get_logger, set_verbosity
		from tdoc_crawler.models.base import OutputFormat, SortOrder
		from tdoc_crawler.models.workspaces import SourceKind
		from tdoc_crawler.specs.operations.checkout import resolve_spec_release
		@@ -56,110 +36,14 @@ from tdoc_crawler.tdocs.models import TDocQueryConfig
		from tdoc_crawler.utils.date_parser import parse_partial_date
		from tdoc_crawler.workspaces import (
		add_workspace_members,
		create_workspace,
		delete_workspace,
		get_active_workspace,
		list_workspace_members,
		list_workspaces,
		make_workspace_member,
		normalize_workspace_name,
		remove_workspace_member,
		set_active_workspace,
		)

		logger = get_logger(__name__)

		__all__ = ["app"]

		app = typer.Typer(help="Manage extraction workspaces")

		_logger = get_logger(__name__)


		def _print_output(
		data: Any,
		output_format: OutputFormat,
		*,
		table_title: str,
		table_columns: list[TableColumnSpec] \| None = None,
		) -> None:
		"""Print structured command output through the shared formatter pipeline."""
		print_structured_output(
		data,
		output_format,
		table_title=table_title,
		table_columns=table_columns,
		console=console,
		)


		@app.command("create", help="Create a new workspace.")
		def workspace_create(
		name: str = typer.Argument(..., help="Workspace name"),
		) -> None:
		"""Create a workspace."""
		normalized = normalize_workspace_name(name)
		create_workspace(normalized)
		console.print(f"[green]Workspace '{normalized}' created successfully.[/green]")


		@app.command("list", help="List all available workspaces.")
		def workspace_list() -> None:
		"""Display all existing workspaces."""
		workspaces = list_workspaces()
		if not workspaces:
		console.print("[dim]No workspaces found.[/dim]")
		return

		for ws in sorted(workspaces, key=lambda w: w.name):
		active_marker = " [green](active)[/green]" if hasattr(ws, "is_active") and ws.is_active else ""
		console.print(f"- {ws.name}{active_marker}")


		@app.command("activate", help="Set a workspace as active.")
		def workspace_activate(workspace_name: str = typer.Argument(..., help="Workspace name")) -> None:
		"""Activate workspace for default command targets."""
		normalized = normalize_workspace_name(workspace_name)
		set_active_workspace(normalized)
		console.print(f"[green]Workspace '{normalized}' is now active.[/green]")


		@app.command("deactivate", help="Deactivate the currently active workspace.")
		def workspace_deactivate() -> None:
		"""Deactivate workspace context."""
		set_active_workspace(None)
		console.print("[yellow]Workspace deactivated.[/yellow]")


		@app.command("delete", help="Delete a workspace and optionally its artifacts.")
		def workspace_delete(
		workspace_name: str = typer.Argument(..., help="Workspace name"),
		force: WorkspaceDeleteForceOption = False,
		delete_artifacts: DeleteArtifactsOption = False,
		delete_llm_wiki: DeleteLlmWikiOption = False,
		) -> None:
		"""Permanently delete a workspace and all associated files."""
		normalized = normalize_workspace_name(workspace_name)
		if not force:
		console.print("[yellow]Use --force to permanently delete workspace and all artifacts.[/yellow]")
		return

		delete_workspace(normalized, delete_artifacts=delete_artifacts)

		if delete_llm_wiki:
		try:
		manager = resolve_cache_manager()
		llm_wiki_dir = manager.workspaces_dir / normalized / "wiki"
		if llm_wiki_dir.exists():
		shutil.rmtree(llm_wiki_dir)
		console.print(f"[green]Deleted .llm-wiki folder for '{normalized}'.[/green]")
		except Exception as e:
		console.print(f"[yellow]Could not delete .llm-wiki folder: {e}[/yellow]")

		console.print(f"[green]Workspace '{normalized}' deleted.[/green]")


		@app.command("members", help="List workspace members.")
		def workspace_members(
		workspace_name: str = typer.Argument(None, help="Workspace name (default: active workspace)"),
		include_inactive: IncludeInactiveOption = False,
		@@ -181,187 +65,6 @@ def workspace_members(
		console.print(f"[red]Error: {e}[/red]")


		_PROFILE_LEVELS = {
		ExtractionProfile.PDF_ONLY: 0,
		ExtractionProfile.DEFAULT: 1,
		ExtractionProfile.ADVANCED: 2,
		}


		def _should_skip_member(
		source_id: str,
		wiki_base: Path,
		profile: ExtractionProfile,
		force: bool,
		skip_existing: bool,
		) -> bool:
		"""Check if a workspace member should be skipped due to existing artifacts."""
		if force or not skip_existing:
		return False

		member_dir = wiki_base / source_id

		if profile == ExtractionProfile.PDF_ONLY:
		if list(member_dir.glob("*.pdf")):
		logger.debug("Skipping %s — PDF exists", source_id)
		return True
		return False

		md_files = list(member_dir.glob("*.md"))
		json_files = list(member_dir.glob("*.json"))
		if not md_files or not json_files:
		return False

		saved_profile = _read_json_profile(json_files[0])
		saved_level = _PROFILE_LEVELS.get(saved_profile, -1)
		if saved_level < 0:
		saved_level = _PROFILE_LEVELS.get(_coerce_profile(saved_profile), -1)

		required_level = _PROFILE_LEVELS[profile]
		if saved_level >= required_level:
		label = saved_profile if saved_level >= 0 else "unknown"
		logger.debug("Skipping %s — %s output exists", source_id, label)
		return True

		return False


		def _read_json_profile(json_path: Path) -> str:
		"""Read extraction_profile from a JSON file, empty string on failure."""
		try:
		data = json.loads(json_path.read_text(encoding="utf-8"))
		return data.get("extraction_profile", "")
		except json.JSONDecodeError, OSError:
		return ""


		def _coerce_profile(value: str) -> ExtractionProfile \| None:
		"""Try to parse a string as ExtractionProfile, return None on failure."""
		try:
		return ExtractionProfile(value)
		except ValueError:
		return None


		def _process_member(
		member: Any,
		wiki_source_dir_base: Path,
		extraction_profile: ExtractionProfile,
		force: bool,
		md_yaml_frontmatter: bool,
		) -> tuple[str, bool, bool]:
		"""Process a single workspace member.

		Returns:
		Tuple of (source_id, succeeded, failed).
		"""
		source_id = member.source_item_id
		wiki_source_dir = wiki_source_dir_base / source_id
		wiki_source_dir.mkdir(parents=True, exist_ok=True)

		try:
		result_path = convert_for_wiki(
		document_id=source_id,
		wiki_source_dir=wiki_source_dir,
		source_kind=member.source_kind,
		profile=extraction_profile,
		force=force,
		release=member.release,
		md_yaml_frontmatter=md_yaml_frontmatter,
		)
		if result_path:
		suffix = result_path.suffix.lstrip(".")
		logger.debug("%s [%s] → %s", source_id, extraction_profile.value, suffix)
		return source_id, True, False
		logger.debug("No output for %s", source_id)
		return source_id, False, False
		except Exception as e:
		console.print(f"[red] Failed {source_id}: {e}[/red]")
		logger.error("Failed to process %s: %s", source_id, e)
		return source_id, False, True


		@app.command("process", help="Process workspace members.")
		def workspace_process(
		workspace_name: str = typer.Argument(None, help="Workspace name (default: active workspace)"),
		force: WorkspaceProcessForceOption = False,
		limit: ProcessLimitOption = None,
		skip_existing: SkipExistingOption = False,
		profile: ProfileOption = DEFAULT_EXTRACTION_PROFILE.value,
		md_yaml_frontmatter: MdYamlFrontmatterOption = True,
		verbosity: VerbosityOption = str(DEFAULT_VERBOSITY),
		) -> None:
		"""Extract structured data from all workspace members."""
		set_verbosity(verbosity)

		if workspace_name is None:
		workspace_name = get_active_workspace()

		normalized = normalize_workspace_name(workspace_name)

		try:
		extraction_profile = ExtractionProfile(profile)
		except ValueError:
		console.print(f"[red]Invalid profile '{profile}'. Use: pdf-only, default, advanced[/red]")
		raise typer.Exit(1)

		console.print(f"[yellow]Processing workspace '{normalized}' with profile '{extraction_profile.value}'...[/yellow]")

		try:
		members = list_workspace_members(normalized, include_inactive=False)
		except Exception as e:
		console.print(f"[red]Error listing members: {e}[/red]")
		raise typer.Exit(1)

		if not members:
		console.print(f"[dim]No active members in workspace '{normalized}'.[/dim]")
		return

		if limit is not None:
		members = members[:limit]

		if not ensure_hybrid_server_for_profile(extraction_profile):
		raise typer.Exit(1)

		cache_manager = resolve_cache_manager()
		wiki_source_dir_base = cache_manager.workspaces_dir / normalized / "sources"

		processed = 0
		failed = 0
		skipped = 0

		progress, task = create_progress_bar(
		f"Processing [{extraction_profile.value}]",
		total=len(members),
		)

		with progress:
		for member in members:
		source_id = member.source_item_id

		if _should_skip_member(source_id, wiki_source_dir_base, extraction_profile, force, skip_existing):
		skipped += 1
		progress.advance(task)
		continue

		progress.update(task, description=f"Processing [{extraction_profile.value}] {source_id}")
		_, succeeded, errored = _process_member(
		member,
		wiki_source_dir_base,
		extraction_profile,
		force,
		md_yaml_frontmatter,
		)
		if succeeded:
		processed += 1
		if errored:
		failed += 1

		progress.advance(task)

		console.print(f"[yellow]Processing complete: {processed} succeeded, {failed} failed, {skipped} skipped[/yellow]")


		def _resolve_spec_release_for_add(
		item: str,
		release: str,
		@@ -385,7 +88,6 @@ def _resolve_spec_release_for_add(
		return release


		@app.command("add", help="Add documents to an existing workspace.")
		def workspace_add(
		items: WorkspaceItemsArgument = None,
		workspace: WorkspaceNameOption = None,
		@@ -408,7 +110,6 @@ def workspace_add(
		Either provide items directly, or use filter options (--agenda, --source, etc.)
		to query the TDoc database and add matching results in bulk.
		"""
		# Resolve workspace name: -w flag > active workspace
		resolved_name = workspace or get_active_workspace()
		if resolved_name is None:
		console.print("[red]No workspace specified and no active workspace set. Use 'workspace create' and 'workspace activate' first.[/red]")
		@@ -417,13 +118,10 @@ def workspace_add(
		normalized = normalize_workspace_name(resolved_name)
		source_kind = SourceKind(kind)

		# Check if any filter flags are set
		has_filters = any([agenda, agenda_ex, start_date, end_date, source, source_ex, title, title_ex])

		members: list = []

		if has_filters:
		# Batch mode: query TDoc database and add matching results
		try:
		start = datetime.combine(parse_partial_date(start_date), datetime.min.time(), tzinfo=UTC) if start_date else None
		except ValueError as exc:
		@@ -478,7 +176,6 @@ def workspace_add(
		)

		elif items:
		# Direct mode: add items by ID
		for item in items:
		resolved_release = _resolve_spec_release_for_add(item, release, source_kind, auto_crawl_specs)
		members.append(
		@@ -503,7 +200,6 @@ def workspace_add(
		console.print(f"[green]Added {added} item(s) to workspace '{normalized}' ({mode}).[/green]")


		@app.command("clear-invalid", help="Remove members with invalid or missing source paths.")
		def workspace_clear_invalid(
		workspace_name: str = typer.Argument(None, help="Workspace name (default: active workspace)"),
		dry_run: DryRunOption = False,