feat(workspaces): enhance workspace commands with TDoc filtering and spec member support (ad24976e) · Commits · Jan Reimes / 3gpp-crawler

src/tdoc_crawler/cli/_workspace_commands.py

+383 −68

Original line number	Diff line number	Diff line
		@@ -5,20 +5,28 @@ These commands create, inspect, modify, and process workspaces.

		from __future__ import annotations

		import asyncio
		import shutil
		from datetime import UTC, datetime
		from pathlib import Path
		from typing import Any
		from typing import Annotated

		import typer
		from rich.progress import Progress, SpinnerColumn, TextColumn

		from tdoc_crawler.cli._shared import console
		from tdoc_crawler.cli.formatting import TableColumnSpec, print_structured_output
		from tdoc_crawler.config import resolve_cache_manager
		from tdoc_crawler.database.specs import SpecDatabase
		from tdoc_crawler.database.tdocs import TDocDatabase
		from tdoc_crawler.extraction.convert import convert_for_wiki
		from tdoc_crawler.extraction.fetch_tdoc import fetch_tdoc_files
		from tdoc_crawler.extraction.profiles import DEFAULT_EXTRACTION_PROFILE, ExtractionProfile
		from tdoc_crawler.logging import get_logger
		from tdoc_crawler.models.base import OutputFormat
		from tdoc_crawler.models.workspaces import SourceKind
		from tdoc_crawler.models.base import OutputFormat, SortOrder
		from tdoc_crawler.models.workspaces import SourceKind, TDocNotFoundError
		from tdoc_crawler.tdocs.models import TDocQueryConfig
		from tdoc_crawler.utils.date_parser import parse_partial_date
		from tdoc_crawler.utils.normalization import normalize_release_version, normalize_spec_number, normalize_tdoc_id
		from tdoc_crawler.workspaces import (
		add_workspace_members,
		create_workspace,
		@@ -29,33 +37,115 @@ from tdoc_crawler.workspaces import (
		make_workspace_member,
		normalize_workspace_name,
		remove_workspace_member,
		resolve_spec_release_from_db,
		set_active_workspace,
		)

		logger = get_logger(__name__)

		INACTIVE_MISSING_TDOC_STATUSES = {"withdrawn", "reserved"}

		__all__ = ["app"]

		app = typer.Typer(help="Manage extraction workspaces")

		_logger = get_logger(__name__)

		WorkspaceNameOption = Annotated[
		str \| None,
		typer.Option(
		"-w",
		"--workspace",
		help="Workspace name (default: active workspace)",
		),
		]

		def _print_output(
		data: Any,
		output_format: OutputFormat,
		*,
		table_title: str,
		table_columns: list[TableColumnSpec] \| None = None,
		) -> None:
		"""Print structured command output through the shared formatter pipeline."""
		print_structured_output(
		data,
		output_format,
		table_title=table_title,
		table_columns=table_columns,
		console=console,
		)

		def _resolve_workspace_name(workspace: str \| None) -> str:
		"""Resolve workspace name to a normalized string.

		Args:
		workspace: Workspace name or None.

		Returns:
		Normalized workspace name, or active workspace if None.
		"""
		if workspace is None:
		return get_active_workspace()
		return normalize_workspace_name(workspace)


		def _validate_tdoc_exists(tdoc_id: str) -> bool:
		"""Check if a TDoc exists and can be resolved.

		Args:
		tdoc_id: TDoc identifier to validate.

		Returns:
		True if the TDoc can be resolved, False otherwise.
		"""
		try:
		fetch_tdoc_files(tdoc_id)
		return True
		except (TDocNotFoundError, Exception) as e:
		_logger.debug("TDoc %s validation failed: %s", tdoc_id, e)
		return False


		async def _get_tdoc_status(tdoc_id: str) -> str \| None:
		"""Return normalized status from database for a TDoc ID, if present."""
		normalized_tdoc_id = normalize_tdoc_id(tdoc_id)
		if normalized_tdoc_id is None:
		return None

		manager = resolve_cache_manager()
		query_config = TDocQueryConfig(tdoc_ids=[normalized_tdoc_id])
		async with TDocDatabase(manager.db_file) as db:
		rows = await db.query_tdocs(query_config)

		if not rows:
		return None

		raw_status = rows[0].status
		if raw_status is None:
		return None
		return raw_status.strip().lower() or None


		def _resolve_inactive_missing_tdoc_status(tdoc_id: str) -> str \| None:
		"""Return status when missing files are acceptable and member should be inactive."""
		status = asyncio.run(_get_tdoc_status(tdoc_id))
		if status in INACTIVE_MISSING_TDOC_STATUSES:
		return status
		return None


		async def _validate_spec_exists(spec_number: str, release: str \| None = None) -> bool:
		"""Check if a spec exists in the database.

		Args:
		spec_number: Spec number to validate.
		release: Optional release version.

		Returns:
		True if the spec exists, False otherwise.
		"""
		try:
		normalized_spec = normalize_spec_number(spec_number)
		manager = resolve_cache_manager()
		async with SpecDatabase(manager.db_file) as db:
		versions = await db.get_spec_versions(normalized_spec)
		if not versions:
		return False
		if release:
		normalized = normalize_release_version(release)
		return any(normalized in v.version for v in versions)
		return True
		except Exception as e:
		if isinstance(e, (KeyboardInterrupt, SystemExit)):
		raise
		_logger.debug("Spec %s validation failed: %s", spec_number, e)
		return False


		@app.command("create", help="Create a new workspace.")
		@@ -100,8 +190,7 @@ def workspace_deactivate() -> None:
		def workspace_delete(
		workspace_name: str = typer.Argument(..., help="Workspace name"),
		force: bool = typer.Option(False, "--force", help="Permanently delete workspace and all artifacts"),
		delete_artifacts: bool = typer.Option(False, "--delete-artifacts", help="Delete all .ai artifacts for workspace members"),
		delete_llm_wiki: bool = typer.Option(False, "--delete-llm-wiki", help="Delete the .llm-wiki folder for this workspace"),
		delete_wiki: bool = typer.Option(False, "--delete-wiki", help="Delete the wiki folder for this workspace"),
		) -> None:
		"""Permanently delete a workspace and all associated files."""
		normalized = normalize_workspace_name(workspace_name)
		@@ -109,28 +198,28 @@ def workspace_delete(
		console.print("[yellow]Use --force to permanently delete workspace and all artifacts.[/yellow]")
		return

		delete_workspace(normalized, delete_artifacts=delete_artifacts)
		delete_workspace(normalized)

		if delete_llm_wiki:
		if delete_wiki:
		try:
		manager = resolve_cache_manager()
		llm_wiki_dir = manager.checkout_dir / normalized / "wiki"
		if llm_wiki_dir.exists():
		shutil.rmtree(llm_wiki_dir)
		console.print(f"[green]Deleted .llm-wiki folder for '{normalized}'.[/green]")
		wiki_dir = manager.workspace_llm_wiki_dir(normalized)
		if wiki_dir.exists():
		shutil.rmtree(wiki_dir)
		console.print(f"[green]Deleted wiki folder for '{normalized}'.[/green]")
		except Exception as e:
		console.print(f"[yellow]Could not delete .llm-wiki folder: {e}[/yellow]")
		console.print(f"[yellow]Could not delete wiki folder: {e}[/yellow]")

		console.print(f"[green]Workspace '{normalized}' deleted.[/green]")


		@app.command("members", help="List workspace members.")
		def workspace_members(
		workspace_name: str = typer.Argument(None, help="Workspace name (default: active workspace)"),
		workspace: WorkspaceNameOption = None,
		include_inactive: bool = typer.Option(False, "--include-inactive", help="Include inactive members"),
		) -> None:
		"""List members of a workspace."""
		normalized = normalize_workspace_name(workspace_name)
		normalized = _resolve_workspace_name(workspace)
		try:
		members = list_workspace_members(normalized, include_inactive=include_inactive)
		if not members:
		@@ -141,12 +230,33 @@ def workspace_members(
		status = "[dim]inactive[/dim]" if not member.is_active else "[green]active[/green]"
		console.print(f" {member.source_item_id} ({member.source_kind.value}) - {status}")
		except Exception as e:
		if isinstance(e, (KeyboardInterrupt, SystemExit, typer.Exit)):
		raise
		console.print(f"[red]Error: {e}[/red]")


		def _should_skip_member(
		skip_existing: bool,
		force: bool,
		wiki_source_dir_base: Path,
		source_id: str,
		extraction_profile: ExtractionProfile,
		) -> bool:
		"""Check if a member should be skipped because artifacts already exist."""
		if not skip_existing or force:
		return False

		member_wiki_dir = wiki_source_dir_base / source_id
		has_artifacts = bool(list(member_wiki_dir.glob(".pdf"))) if extraction_profile == ExtractionProfile.PDF_ONLY else bool(list(member_wiki_dir.glob(".md")))

		if has_artifacts:
		console.print(f"[dim] Skipping {source_id} - artifacts exist[/dim]")
		return has_artifacts


		@app.command("process", help="Process workspace members.")
		def workspace_process(
		workspace_name: str = typer.Argument(None, help="Workspace name (default: active workspace)"),
		workspace: WorkspaceNameOption = None,
		force: bool = typer.Option(False, "--force", help="Re-process existing artifacts"),
		limit: int = typer.Option(None, "--limit", help="Limit number of members to process"),
		skip_existing: bool = typer.Option(False, "--skip-existing", help="Skip members that already have artifacts"),
		@@ -157,10 +267,7 @@ def workspace_process(
		),
		) -> None:
		"""Extract structured data from all workspace members."""
		if workspace_name is None:
		workspace_name = get_active_workspace()

		normalized = normalize_workspace_name(workspace_name)
		normalized = _resolve_workspace_name(workspace)

		try:
		extraction_profile = ExtractionProfile(profile)
		@@ -173,6 +280,8 @@ def workspace_process(
		try:
		members = list_workspace_members(normalized, include_inactive=False)
		except Exception as e:
		if isinstance(e, (KeyboardInterrupt, SystemExit, typer.Exit)):
		raise
		console.print(f"[red]Error listing members: {e}[/red]")
		raise typer.Exit(1)

		@@ -184,25 +293,16 @@ def workspace_process(
		members = members[:limit]

		manager = resolve_cache_manager()
		wiki_source_dir_base = manager.checkout_dir / normalized / "sources"
		wiki_source_dir_base = manager.workspace_sources_dir(normalized)

		processed = 0
		failed = 0
		skipped_items: list[tuple[str, str]] = []

		for member in members:
		source_id = member.source_item_id

		if skip_existing and not force:
		member_wiki_dir = wiki_source_dir_base / source_id
		if extraction_profile == ExtractionProfile.PDF_ONLY:
		pdf_exists = list(member_wiki_dir.glob("*.pdf"))
		if pdf_exists:
		console.print(f"[dim] Skipping {source_id} - artifacts exist[/dim]")
		continue
		else:
		md_exists = list(member_wiki_dir.glob("*.md"))
		if md_exists:
		console.print(f"[dim] Skipping {source_id} - artifacts exist[/dim]")
		if _should_skip_member(skip_existing, force, wiki_source_dir_base, source_id, extraction_profile):
		continue

		wiki_source_dir = wiki_source_dir_base / source_id
		@@ -212,6 +312,8 @@ def workspace_process(
		result_path = convert_for_wiki(
		document_id=source_id,
		wiki_source_dir=wiki_source_dir,
		source_kind=member.source_kind,
		source_path=member.source_path,
		profile=extraction_profile,
		force=force,
		)
		@@ -221,44 +323,255 @@ def workspace_process(
		else:
		console.print(f"[yellow] No output for {source_id}[/yellow]")
		except Exception as e:
		console.print(f"[red] Failed {source_id}: {e}[/red]")
		logger.error(f"Failed to process {source_id}: {e}")
		if isinstance(e, (KeyboardInterrupt, SystemExit, typer.Exit)):
		raise
		error_text = str(e)
		skipped_items.append((source_id, error_text))
		logger.debug("Skipped processing %s: %s", source_id, error_text)
		failed += 1

		if skipped_items:
		console.print("\n[yellow]Skipped documents (processing warnings):[/yellow]")
		for source_id, error_text in skipped_items:
		console.print(f"[yellow] - {source_id}: {error_text}[/yellow]")

		console.print(f"\n[yellow]Processing complete: {processed} succeeded, {failed} failed[/yellow]")


		@app.command("add", help="Add documents to an existing workspace.")
		def workspace_add(
		workspace_name: str = typer.Argument(..., help="Workspace name"),
		items: list[str] = typer.Argument(..., help="Items to add (TDoc IDs, spec numbers, etc.)"),
		kind: str = typer.Option("tdoc", "--kind", help="Source kind: tdoc, spec, or other"),
		) -> None:
		"""Add documents to a workspace."""
		normalized = normalize_workspace_name(workspace_name)
		source_kind = SourceKind(kind)
		def _parse_date_filters(
		start_date: str \| None,
		end_date: str \| None,
		) -> tuple[datetime \| None, datetime \| None]:
		"""Parse and validate date filter strings into datetime objects.

		members = [
		make_workspace_member(
		Args:
		start_date: ISO-8601 start date string, or None.
		end_date: ISO-8601 end date string, or None.

		Returns:
		Tuple of (start_datetime, end_datetime), either may be None.

		Raises:
		typer.Exit: If a date string cannot be parsed.
		"""
		try:
		start = datetime.combine(parse_partial_date(start_date), datetime.min.time(), tzinfo=UTC) if start_date else None
		except ValueError:
		console.print("[red]Invalid start date format; use ISO-8601[/red]")
		raise typer.Exit(1)

		try:
		end = datetime.combine(parse_partial_date(end_date, is_end=True), datetime.max.time(), tzinfo=UTC) if end_date else None
		except ValueError:
		console.print("[red]Invalid end date format; use ISO-8601[/red]")
		raise typer.Exit(1)

		return start, end


		async def _query_tdocs_async(
		source_kind: SourceKind,
		start_date: str \| None,
		end_date: str \| None,
		source: list[str] \| None,
		source_ex: list[str] \| None,
		title: list[str] \| None,
		title_ex: list[str] \| None,
		agenda: list[str] \| None,
		agenda_ex: list[str] \| None,
		limit: int \| None,
		) -> list[str]:
		"""Query database for TDocs matching filter criteria.

		Args:
		source_kind: Must be TDOC for query mode.
		start_date: Filter by start date.
		end_date: Filter by end date.
		source: Filter by source pattern.
		source_ex: Exclude by source pattern.
		title: Filter by title pattern.
		title_ex: Exclude by title pattern.
		agenda: Filter by agenda pattern.
		agenda_ex: Exclude by agenda pattern.
		limit: Limit number of results.

		Returns:
		List of matching TDoc IDs.
		"""
		if source_kind != SourceKind.TDOC:
		console.print("[red]Error: filtering without explicit items is only supported for TDocs[/red]")
		raise typer.Exit(1)

		manager = resolve_cache_manager()
		start, end = _parse_date_filters(start_date, end_date)

		config = TDocQueryConfig(
		output_format=OutputFormat.TABLE,
		tdoc_ids=None,
		working_groups=None,
		start_date=start,
		end_date=end,
		meeting_start_date=None,
		meeting_end_date=None,
		source_pattern=source,
		source_pattern_exclude=source_ex,
		title_pattern=title,
		title_pattern_exclude=title_ex,
		agenda_pattern=agenda,
		agenda_pattern_exclude=agenda_ex,
		limit=limit,
		order=SortOrder.DESC,
		)

		async with TDocDatabase(manager.db_file) as db:
		rows = await db.query_tdocs(config)

		return [row.tdoc_id for row in rows]


		def _validate_and_create_members(
		resolved_items: list[str],
		source_kind: SourceKind,
		release: str \| None,
		) -> tuple[list, list[tuple[str, str]]]:
		"""Validate items and create workspace members.

		Args:
		resolved_items: Item identifiers to validate and convert.
		source_kind: Type of source (TDOC, SPEC, OTHER).
		release: Optional release version for specs.

		Returns:
		Tuple of (valid_members, skipped_items_with_reasons).
		"""
		members = []
		skipped = []
		for item in resolved_items:
		if source_kind == SourceKind.TDOC:
		inactive_status = _resolve_inactive_missing_tdoc_status(item)
		if inactive_status is not None:
		member = make_workspace_member(
		source_item_id=item,
		source_path=item,
		source_kind=source_kind,
		added_by="cli",
		)
		for item in items
		]
		member.is_active = False
		members.append(member)
		console.print(f"[dim] Adding {item} as inactive - status '{inactive_status}'[/dim]")
		continue

		if not _validate_tdoc_exists(item):
		skipped.append((item, "TDoc not found"))
		console.print(f"[yellow] Skipping {item} - TDoc not found[/yellow]")
		continue
		elif source_kind == SourceKind.SPEC and not asyncio.run(_validate_spec_exists(item, release)):
		skipped.append((item, f"Spec not found (release={release or 'latest'})"))
		console.print(f"[yellow] Skipping {item} - Spec not found[/yellow]")
		continue

		source_item_id = item
		if source_kind == SourceKind.SPEC:
		# Resolve release for spec member ID (always include release if available)
		resolved_release, _ = asyncio.run(resolve_spec_release_from_db(item, release or "latest"))
		if resolved_release:
		normalized_release = normalize_release_version(resolved_release)
		source_item_id = f"{item}-REL{normalized_release}"

		members.append(
		make_workspace_member(
		source_item_id=source_item_id,
		source_path=item,
		source_kind=source_kind,
		added_by="cli",
		),
		)
		return members, skipped


		@app.command("add", help="Add documents to an existing workspace.")
		def workspace_add(
		workspace: WorkspaceNameOption = None,
		items: Annotated[list[str] \| None, typer.Argument(help="Items to add (TDoc IDs, spec numbers). If not provided, uses --kind/filter options.")] = None,
		kind: Annotated[str, typer.Option("--kind", help="Source kind: tdoc, spec, or other")] = "tdoc",
		release: Annotated[str \| None, typer.Option("--release", help="Spec release version (e.g., 19, 19.1, 19.1.2). Only applies to specs.")] = None,
		# Query-based filtering options
		start_date: Annotated[str \| None, typer.Option("--start-date", help="Filter: start date (ISO-8601)")] = None,
		end_date: Annotated[str \| None, typer.Option("--end-date", help="Filter: end date (ISO-8601)")] = None,
		source: Annotated[list[str] \| None, typer.Option("--source", help="Filter: source pattern (glob)")] = None,
		source_ex: Annotated[list[str] \| None, typer.Option("--source-ex", help="Filter: exclude source pattern (glob)")] = None,
		title: Annotated[list[str] \| None, typer.Option("--title", help="Filter: title pattern (glob)")] = None,
		title_ex: Annotated[list[str] \| None, typer.Option("--title-ex", help="Filter: exclude title pattern (glob)")] = None,
		agenda: Annotated[list[str] \| None, typer.Option("--agenda", help="Filter: agenda pattern (glob)")] = None,
		agenda_ex: Annotated[list[str] \| None, typer.Option("--agenda-ex", help="Filter: exclude agenda pattern (glob)")] = None,
		limit: Annotated[int \| None, typer.Option("--limit", help="Limit number of items")] = None,
		) -> None:
		"""Add documents to a workspace.

		Can be used in two modes:
		1. Explicit items: workspace add <item1> <item2> --kind tdoc
		2. Query-based: workspace add --kind tdoc --agenda "pattern" --start-date 2018

		For query-based mode, provide filter options (--agenda, --title, --source, etc.)
		without explicit items.

		Workspace is specified via -w/--workspace option, defaulting to active workspace.
		"""
		normalized = _resolve_workspace_name(workspace)
		source_kind = SourceKind(kind.lower().rstrip("s")) if kind.lower().rstrip("s") in {e.value for e in SourceKind} else SourceKind.OTHER

		# Phase 1: Resolve items - either directly provided or via database query
		if items is not None:
		resolved_items = items
		else:
		# Database query mode
		with Progress(
		SpinnerColumn(),
		TextColumn("[progress.description]{task.description}"),
		console=console,
		) as progress:
		progress.add_task("[cyan]Querying database...", total=None)
		resolved_items = asyncio.run(
		_query_tdocs_async(
		source_kind=source_kind,
		start_date=start_date,
		end_date=end_date,
		source=source,
		source_ex=source_ex,
		title=title,
		title_ex=title_ex,
		agenda=agenda,
		agenda_ex=agenda_ex,
		limit=limit,
		),
		)

		if not resolved_items:
		console.print("[yellow]No items match the provided filters[/yellow]")
		return

		console.print(f"[cyan]Found {len(resolved_items)} matching items[/cyan]")

		# Phase 2: Validate and create members
		members, skipped = _validate_and_create_members(resolved_items, source_kind, release)

		if not members:
		console.print("[yellow]No valid items to add[/yellow]")
		return

		added = add_workspace_members(normalized, members)
		console.print(f"[green]Added {added} item(s) to workspace '{normalized}'.[/green]")
		if skipped:
		console.print(f"[dim]Skipped {len(skipped)} invalid item(s)[/dim]")


		@app.command("clear-invalid", help="Remove members with invalid or missing source paths.")
		def workspace_clear_invalid(
		workspace_name: str = typer.Argument(None, help="Workspace name (default: active workspace)"),
		workspace: WorkspaceNameOption = None,
		dry_run: bool = typer.Option(False, "--dry-run", help="Show what would be removed without removing"),
		) -> None:
		"""Remove members whose source path no longer exists."""
		normalized = normalize_workspace_name(workspace_name)
		normalized = _resolve_workspace_name(workspace)
		try:
		members = list_workspace_members(normalized, include_inactive=True)
		to_remove = [m for m in members if not Path(m.source_path).exists()]
		@@ -278,4 +591,6 @@ def workspace_clear_invalid(
		else:
		console.print(f"\n[yellow]Dry-run: would remove {len(to_remove)} invalid members.[/yellow]")
		except Exception as e:
		if isinstance(e, (KeyboardInterrupt, SystemExit, typer.Exit)):
		raise
		console.print(f"[red]Error: {e}[/red]")

src/tdoc_crawler/tdocs/operations/checkout.py

+68 −18

File changed.

Preview size limit exceeded, changes collapsed.

src/tdoc_crawler/tdocs/sources/doclist.py

+21 −16

File changed.

Preview size limit exceeded, changes collapsed.

src/tdoc_crawler/workspaces/init.py

+9 −2

Original line number	Diff line number	Diff line
		@@ -19,14 +19,20 @@ from tdoc_crawler.workspaces.members import (
		remove_invalid_members,
		remove_workspace_member,
		)
		from tdoc_crawler.workspaces.utils import delete_ai_folder, resolve_tdoc_checkout_path
		from tdoc_crawler.workspaces.utils import (
		checkout_spec_to_workspace,
		checkout_tdoc_to_workspace,
		resolve_spec_release_from_db,
		resolve_tdoc_checkout_path,
		)

		__all__ = [
		"DEFAULT_WORKSPACE",
		"add_workspace_members",
		"checkout_spec_to_workspace",
		"checkout_tdoc_to_workspace",
		"create_workspace",
		"deactivate_workspace_member",
		"delete_ai_folder",
		"delete_workspace",
		"ensure_default_workspace",
		"get_active_workspace",
		@@ -39,6 +45,7 @@ __all__ = [
		"normalize_workspace_name",
		"remove_invalid_members",
		"remove_workspace_member",
		"resolve_spec_release_from_db",
		"resolve_tdoc_checkout_path",
		"set_active_workspace",
		]

src/tdoc_crawler/workspaces/crud.py

+5 −41

File changed.

Preview size limit exceeded, changes collapsed.