feat(workspace): enhance workspace processing with hybrid server support (4e4f34d9) · Commits · Jan Reimes / 3gpp-crawler

demo.bat

+1 −1

Original line number	Diff line number	Diff line
		@@ -36,4 +36,4 @@ tdoc-crawler query --agenda "atias" --start-date 2018
		3gpp-crawler workspace members

		:: convert tdocs/specs to PDF/Markdown/artefacts for AI processing
		3gpp-crawler workspace process
		3gpp-crawler workspace process --profile pdf-only
		No newline at end of file

src/tdoc_crawler/cli/_shared.py

+26 −0

Original line number	Diff line number	Diff line
		@@ -7,6 +7,8 @@ from pathlib import Path
		from rich.progress import BarColumn, MofNCompleteColumn, Progress, SpinnerColumn, TaskID, TextColumn

		from tdoc_crawler.database.base import DocDatabase
		from tdoc_crawler.extraction.hybrid_server import ensure_hybrid_server
		from tdoc_crawler.extraction.profiles import ExtractionProfile
		from tdoc_crawler.logging import get_console
		from tdoc_crawler.specs.operations.checkout import clear_checkout_specs
		from tdoc_crawler.tdocs.operations.checkout import clear_checkout_tdocs
		@@ -14,6 +16,29 @@ from tdoc_crawler.tdocs.operations.checkout import clear_checkout_tdocs
		console = get_console()


		def ensure_hybrid_server_for_profile(
		    profile: ExtractionProfile,
		) -> bool:
		    """Start the hybrid server up front when *profile* depends on it.

		    The PDF-only profile returns immediately without touching the server.
		    For every other profile ``ensure_hybrid_server`` is called, with its
		    progress messages echoed to the console in dim text.

		    Returns:
		        True when the server is running (or is not needed for *profile*),
		        False when the server could not be started.
		    """

		    def _echo_progress(message: str) -> None:
		        console.print(f"[dim] {message}[/dim]")

		    if profile == ExtractionProfile.PDF_ONLY:
		        return True

		    _unused, server_status = ensure_hybrid_server(
		        progress_callback=_echo_progress,
		    )

		    if not server_status.running:
		        console.print(f"[red]Failed to start hybrid server: {server_status.error}[/red]")
		        console.print("[dim]Hybrid mode is required for non-PDF profiles. Use --profile pdf-only to skip.[/dim]")
		        return False

		    console.print(f"[green]Hybrid server running at {server_status.url}[/green]")
		    return True


		def handle_clear_options(
		db_file: Path,
		checkout_dir: Path,
		@@ -85,5 +110,6 @@ def create_progress_bar(description: str, total: float = 100) -> tuple[Progress,
		# Names this module exposes as its public API (and to ``from ... import *``).
		__all__ = [
		"console",
		"create_progress_bar",
		"ensure_hybrid_server_for_profile",
		"handle_clear_options",
		]

src/tdoc_crawler/cli/_workspace_commands.py

+154 −28

Original line number	Diff line number	Diff line
		@@ -5,20 +5,38 @@ These commands create, inspect, modify, and process workspaces.

		from __future__ import annotations

		import asyncio
		import shutil
		from datetime import UTC, datetime
		from pathlib import Path
		from typing import Any

		import typer

		from tdoc_crawler.cli._shared import console
		from tdoc_crawler.cli._shared import console, ensure_hybrid_server_for_profile
		from tdoc_crawler.cli.args import (
		AgendaPatternExcludeOption,
		AgendaPatternOption,
		EndDateOption,
		LimitOption,
		ReleaseOption,
		SourcePatternExcludeOption,
		SourcePatternOption,
		StartDateOption,
		TitlePatternExcludeOption,
		TitlePatternOption,
		WorkspaceItemsArgument,
		)
		from tdoc_crawler.cli.formatting import TableColumnSpec, print_structured_output
		from tdoc_crawler.config import resolve_cache_manager
		from tdoc_crawler.config import PathConfig, resolve_cache_manager
		from tdoc_crawler.database.tdocs import TDocDatabase
		from tdoc_crawler.extraction.convert import convert_for_wiki
		from tdoc_crawler.extraction.profiles import DEFAULT_EXTRACTION_PROFILE, ExtractionProfile
		from tdoc_crawler.logging import get_logger
		from tdoc_crawler.models.base import OutputFormat
		from tdoc_crawler.models.base import OutputFormat, SortOrder
		from tdoc_crawler.models.workspaces import SourceKind
		from tdoc_crawler.tdocs.models import TDocQueryConfig
		from tdoc_crawler.utils.date_parser import parse_partial_date
		from tdoc_crawler.workspaces import (
		add_workspace_members,
		create_workspace,
		@@ -130,6 +148,8 @@ def workspace_members(
		include_inactive: bool = typer.Option(False, "--include-inactive", help="Include inactive members"),
		) -> None:
		"""List members of a workspace."""
		if workspace_name is None:
		workspace_name = get_active_workspace()
		normalized = normalize_workspace_name(workspace_name)
		try:
		members = list_workspace_members(normalized, include_inactive=include_inactive)
		@@ -144,6 +164,24 @@ def workspace_members(
		console.print(f"[red]Error: {e}[/red]")


		def _should_skip_member(
		    source_id: str,
		    wiki_base: Path,
		    profile: ExtractionProfile,
		    force: bool,
		    skip_existing: bool,
		) -> bool:
		    """Check if a workspace member should be skipped due to existing artifacts.

		    Args:
		        source_id: Member identifier; also the name of its directory under *wiki_base*.
		        wiki_base: Directory that holds one sub-directory per member.
		        profile: Extraction profile; decides which artifact type counts as "done"
		            (``*.pdf`` for the PDF-only profile, ``*.md`` for every other profile).
		        force: When true, never skip.
		        skip_existing: When false, never skip.

		    Returns:
		        True when the member directory already contains at least one matching
		        artifact and skipping is allowed; False otherwise.
		    """
		    if force or not skip_existing:
		        return False

		    member_dir = wiki_base / source_id
		    # The wildcard is required: a bare ".pdf"/".md" pattern only matches a file
		    # literally named ".pdf"/".md", so existing artifacts would never be found.
		    glob_pattern = "*.pdf" if profile == ExtractionProfile.PDF_ONLY else "*.md"

		    # any() stops at the first hit instead of listing the whole directory.
		    if any(member_dir.glob(glob_pattern)):
		        console.print(f"[dim] Skipping {source_id} - artifacts exist[/dim]")
		        return True
		    return False


		@app.command("process", help="Process workspace members.")
		def workspace_process(
		workspace_name: str = typer.Argument(None, help="Workspace name (default: active workspace)"),
		@@ -183,6 +221,10 @@ def workspace_process(
		if limit is not None:
		members = members[:limit]

		# Pre-start hybrid server if extraction profile requires it
		if not ensure_hybrid_server_for_profile(extraction_profile):
		raise typer.Exit(1)

		manager = resolve_cache_manager()
		wiki_source_dir_base = manager.checkout_dir / normalized / "sources"

		@@ -192,17 +234,7 @@ def workspace_process(
		for member in members:
		source_id = member.source_item_id

		if skip_existing and not force:
		member_wiki_dir = wiki_source_dir_base / source_id
		if extraction_profile == ExtractionProfile.PDF_ONLY:
		pdf_exists = list(member_wiki_dir.glob("*.pdf"))
		if pdf_exists:
		console.print(f"[dim] Skipping {source_id} - artifacts exist[/dim]")
		continue
		else:
		md_exists = list(member_wiki_dir.glob("*.md"))
		if md_exists:
		console.print(f"[dim] Skipping {source_id} - artifacts exist[/dim]")
		if _should_skip_member(source_id, wiki_source_dir_base, extraction_profile, force, skip_existing):
		continue

		wiki_source_dir = wiki_source_dir_base / source_id
		@@ -230,26 +262,118 @@ def workspace_process(

		@app.command("add", help="Add documents to an existing workspace.")
		def workspace_add(
		    items: WorkspaceItemsArgument = None,
		    workspace: str | None = typer.Option(None, "-w", "--workspace", help="Workspace name (default: active workspace)"),
		    kind: str = typer.Option("tdoc", "--kind", help="Source kind: tdoc, spec, or other"),
		    # Filter options for batch-adding from DB query
		    agenda: AgendaPatternOption = None,
		    agenda_ex: AgendaPatternExcludeOption = None,
		    release: ReleaseOption = "latest",
		    limit: LimitOption = None,
		    start_date: StartDateOption = None,
		    end_date: EndDateOption = None,
		    source: SourcePatternOption = None,
		    source_ex: SourcePatternExcludeOption = None,
		    title: TitlePatternOption = None,
		    title_ex: TitlePatternExcludeOption = None,
		) -> None:
		    """Add documents to a workspace.

		    Either provide items directly, or use filter options (--agenda, --source, etc.)
		    to query the TDoc database and add matching results in bulk. When both are
		    given, the filters take precedence and the positional items are ignored.

		    ``--limit`` only caps the query result; on its own it does not switch to
		    batch mode. ``--release`` is only applied to directly listed items.

		    Exits with code 1 when no workspace can be resolved, when neither items nor
		    filters are supplied, or when the database query fails; exits with code 2
		    when a date filter cannot be parsed.
		    """
		    # Resolve workspace name: -w flag > active workspace
		    resolved_name = workspace or get_active_workspace()
		    if resolved_name is None:
		        console.print("[red]No workspace specified and no active workspace set. Use 'workspace create' and 'workspace activate' first.[/red]")
		        raise typer.Exit(1)

		    normalized = normalize_workspace_name(resolved_name)
		    source_kind = SourceKind(kind)

		    # Check if any filter flags are set
		    has_filters = any([agenda, agenda_ex, start_date, end_date, source, source_ex, title, title_ex])

		    if has_filters:
		        # Batch mode: query TDoc database and add matching results
		        try:
		            start = datetime.combine(parse_partial_date(start_date), datetime.min.time(), tzinfo=UTC) if start_date else None
		        except ValueError as exc:
		            console.print("[red]Invalid start date format; use ISO-8601[/red]")
		            raise typer.Exit(code=2) from exc
		        try:
		            end = datetime.combine(parse_partial_date(end_date, is_end=True), datetime.max.time(), tzinfo=UTC) if end_date else None
		        except ValueError as exc:
		            console.print("[red]Invalid end date format; use ISO-8601[/red]")
		            raise typer.Exit(code=2) from exc

		        query_config = TDocQueryConfig(
		            output_format=OutputFormat.TABLE,
		            start_date=start,
		            end_date=end,
		            source_pattern=source,
		            source_pattern_exclude=source_ex,
		            title_pattern=title,
		            title_pattern_exclude=title_ex,
		            agenda_pattern=agenda,
		            agenda_pattern_exclude=agenda_ex,
		            limit=limit,
		            order=SortOrder.DESC,
		        )

		        path_config = PathConfig()
		        db_file = path_config.db_file

		        async def _query_tdocs() -> list:
		            async with TDocDatabase(db_file) as database:
		                return await database.query_tdocs(query_config)

		        # Top-level CLI boundary: report any query failure and exit non-zero.
		        try:
		            results = asyncio.run(_query_tdocs())
		        except Exception as exc:
		            console.print(f"[red]Failed to query TDoc database: {exc}[/red]")
		            raise typer.Exit(1) from exc

		        if not results:
		            console.print("[yellow]No TDocs matched the given filters.[/yellow]")
		            return

		        console.print(f"[dim]Found {len(results)} TDoc(s) matching filters.[/dim]")
		        members = [
		            make_workspace_member(
		                source_item_id=tdoc.tdoc_id,
		                source_path=tdoc.tdoc_id,
		                source_kind=source_kind,
		                added_by="cli:query",
		            )
		            for tdoc in results
		        ]

		    elif items:
		        # Direct mode: add items by ID
		        member_release = release if release != "latest" else None
		        members = [
		            make_workspace_member(
		                source_item_id=item,
		                source_path=item,
		                source_kind=source_kind,
		                added_by="cli",
		                release=member_release,
		            )
		            for item in items
		        ]
		    else:
		        console.print("[red]Provide items to add or use filter options (--agenda, --source, etc.).[/red]")
		        raise typer.Exit(1)

		    # Defensive guard; both branches above only get here with a non-empty input.
		    if not members:
		        console.print("[yellow]No items to add.[/yellow]")
		        return

		    added = add_workspace_members(normalized, members)
		    mode = "from query" if has_filters else "directly"
		    console.print(f"[green]Added {added} item(s) to workspace '{normalized}' ({mode}).[/green]")


		@app.command("clear-invalid", help="Remove members with invalid or missing source paths.")
		@@ -258,6 +382,8 @@ def workspace_clear_invalid(
		dry_run: bool = typer.Option(False, "--dry-run", help="Show what would be removed without removing"),
		) -> None:
		"""Remove members whose source path no longer exists."""
		if workspace_name is None:
		workspace_name = get_active_workspace()
		normalized = normalize_workspace_name(workspace_name)
		try:
		members = list_workspace_members(normalized, include_inactive=True)

src/tdoc_crawler/cli/args.py

+1 −1

Original line number	Diff line number	Diff line
		@@ -14,6 +14,7 @@ TDocIdsArgument = Annotated[list[str] \| None, typer.Argument(help="TDoc identifi
		# Positional-argument aliases: each pairs a value type with its typer.Argument help text.
		TDocIdArgument = Annotated[str, typer.Argument(help="TDoc identifier to download and open")]
		CheckoutTDocIdsArgument = Annotated[list[str], typer.Argument(help="TDoc identifier(s) to checkout")]
		SpecArgument = Annotated[list[str] | None, typer.Argument(help="Spec number(s) to query (dotted or undotted)")]
		WorkspaceItemsArgument = Annotated[list[str] | None, typer.Argument(help="Items to add (TDoc IDs, spec numbers, etc.)")]

		# Options - TDocs/Meetings
		WorkingGroupOption = Annotated[
		@@ -150,4 +151,3 @@ NoProgressOption = Annotated[
		bool,
		typer.Option("--no-progress", help="Disable progress bar (useful for scripts and CI)"),
		]

src/tdoc_crawler/config/cache_manager.py

+1 −3

Original line number	Diff line number	Diff line
		@@ -118,7 +118,5 @@ def resolve_cache_manager() -> CacheManager:
		CacheManagerNotRegisteredError: If no manager is registered
		"""
		if CacheManager._instance is None:
		raise CacheManagerNotRegisteredError(
		"CacheManager not registered. Call CacheManager(cache_dir).register() at application startup."
		)
		raise CacheManagerNotRegisteredError("CacheManager not registered. Call CacheManager(cache_dir).register() at application startup.")
		return CacheManager._instance