cli(specs): add crawl-specs command and spec input helpers (d3f3a69b) · Commits · Jan Reimes / 3gpp-crawler

src/tdoc_crawler/cli/app.py

+61 −2

Original line number	Diff line number	Diff line
		@@ -19,6 +19,9 @@ from tdoc_crawler.crawlers import MeetingCrawler, TDocCrawler
		from tdoc_crawler.credentials import set_credentials
		from tdoc_crawler.database import TDocDatabase
		from tdoc_crawler.models import MeetingCrawlConfig, MeetingQueryConfig, OutputFormat, QueryConfig, SortOrder, TDocCrawlConfig
		from tdoc_crawler.specs import SpecCatalog
		from tdoc_crawler.specs.sources import fetch_threegpp_metadata, fetch_whatthespec_metadata
		from tdoc_crawler.specs.sources.base import FunctionSpecSource

		from .args import (
		CacheDirOption,
		@@ -43,6 +46,9 @@ from .args import (
		OutputFormatOption,
		OverallTimeoutOption,
		PromptCredentialsOption,
		ReleaseOption,
		SpecFileOption,
		SpecOption,
		StartDateOption,
		SubgroupOption,
		TDocIdArgument,
		@@ -55,8 +61,23 @@ from .args import (
		)
		from .console import get_console
		from .fetching import maybe_fetch_missing_tdocs
		from .helpers import build_limits, database_path, launch_file, parse_subgroups, parse_working_groups, prepare_tdoc_file
		from .printing import meeting_to_dict, print_meeting_table, print_tdoc_table, tdoc_to_dict
		from .helpers import (
		build_limits,
		collect_spec_numbers,
		database_path,
		launch_file,
		parse_subgroups,
		parse_working_groups,
		prepare_tdoc_file,
		)
		from .printing import (
		meeting_to_dict,
		print_meeting_table,
		print_spec_crawl_table,
		print_tdoc_table,
		spec_crawl_to_dict,
		tdoc_to_dict,
		)

		load_dotenv()

		@@ -503,6 +524,44 @@ def stats(
		console.print(table)


		@app.command("crawl-specs", rich_help_panel=HELP_PANEL_CRAWLING)
		def crawl_specs(
		    spec: SpecOption = None,
		    spec_file: SpecFileOption = None,
		    release: ReleaseOption = "latest",
		    output_format: OutputFormatOption = OutputFormat.TABLE.value,
		    cache_dir: CacheDirOption = DEFAULT_CACHE_DIR,
		) -> None:
		    """Crawl spec metadata from the built-in sources and print the results.

		    Args:
		        spec: Spec numbers given via ``--spec``; forwarded to ``collect_spec_numbers``.
		        spec_file: Optional file with spec numbers; forwarded to ``collect_spec_numbers``.
		        release: Release selector, passed unchanged to ``SpecCatalog.crawl_specs``.
		        output_format: Name of an ``OutputFormat`` member; matched case-insensitively.
		        cache_dir: Directory handed to ``database_path`` to locate the database.

		    Raises:
		        typer.Exit: With code 2 when ``output_format`` is not a valid ``OutputFormat``
		            value (``collect_spec_numbers`` may also exit with code 2).
		    """
		    # Validate the cheap option first so a mistyped output format is reported
		    # before any spec input (possibly stdin) is consumed.
		    try:
		        output = OutputFormat(output_format.lower())
		    except ValueError as exc:
		        console.print("[red]Invalid output format; use table, json, or yaml")
		        raise typer.Exit(code=2) from exc

		    specs = collect_spec_numbers(spec, spec_file)

		    # Both metadata sources are queried for every requested spec.
		    sources = [
		        FunctionSpecSource("3gpp", fetch_threegpp_metadata),
		        FunctionSpecSource("whatthespec", fetch_whatthespec_metadata),
		    ]

		    db_path = database_path(cache_dir)
		    with TDocDatabase(db_path) as database:
		        catalog = SpecCatalog(database)
		        results = catalog.crawl_specs(specs, release, sources)

		    if not results:
		        console.print("[yellow]No specs crawled[/yellow]")
		        return

		    if output is OutputFormat.JSON:
		        rendered = json.dumps([spec_crawl_to_dict(result) for result in results], indent=2)
		    elif output is OutputFormat.YAML:
		        rendered = yaml.dump([spec_crawl_to_dict(result) for result in results], sort_keys=False)
		    else:
		        print_spec_crawl_table(results)
		        return

		    # Serialized output is data, not console markup: markup=False keeps bracketed
		    # text in messages literal, and soft_wrap=True stops the console from inserting
		    # line breaks at the terminal width, which would corrupt long JSON/YAML values.
		    console.print(rendered, markup=False, soft_wrap=True)


		# Register command aliases: the short names "ct" and "cm" are bound to the same
		# callables as the crawl_tdocs and crawl_meetings commands. Both registrations pass
		# hidden=True and reuse the crawling help panel.
		app.command("ct", rich_help_panel=HELP_PANEL_CRAWLING, hidden=True)(crawl_tdocs)
		app.command("cm", rich_help_panel=HELP_PANEL_CRAWLING, hidden=True)(crawl_meetings)

src/tdoc_crawler/cli/args.py

+6 −0

Original line number	Diff line number	Diff line
		@@ -74,3 +74,9 @@ IncludeWithoutFilesOption = Annotated[
		# Arguments and flags for opening / checking out TDocs.
		TDocIdArgument = Annotated[str, typer.Argument(help="TDoc identifier to download and open")]
		CheckoutTDocIdsArgument = Annotated[list[str], typer.Argument(help="TDoc identifier(s) to checkout")]
		ForceOption = Annotated[bool, typer.Option("--force", "-f", help="Re-download even if already checked out")]

		# Options for the spec commands. --spec is list-valued; --spec-file and
		# --checkout-dir are optional paths.
		SpecOption = Annotated[list[str] | None, typer.Option("--spec", help="Spec number(s) (dotted or undotted)")]
		SpecFileOption = Annotated[Path | None, typer.Option("--spec-file", help="File with spec numbers")]
		ReleaseOption = Annotated[str, typer.Option("--release", help="Spec release selector")]
		DocOnlyOption = Annotated[bool, typer.Option("--doc-only/--no-doc-only", help="Attempt document-only download")]
		CheckoutDirOption = Annotated[Path | None, typer.Option("--checkout-dir", help="Spec checkout base directory")]

src/tdoc_crawler/cli/helpers.py

+37 −0

Original line number	Diff line number	Diff line
		@@ -107,6 +107,43 @@ def parse_subgroups(values: list[str] \| None) -> list[str] \| None:
		return resolved


		def collect_spec_numbers(specs: list[str] \| None, spec_file: Path \| None) -> list[str]:
		"""Collect spec numbers from CLI options and stdin.

		Args:
		specs: Spec numbers provided on the command line. Use "-" to read stdin.
		spec_file: Optional file containing spec numbers (one per line).

		Returns:
		List of spec numbers in input order.
		"""
		collected: list[str] = []

		if specs:
		for item in specs:
		if item == "-":
		stdin_text = sys.stdin.read()
		collected.extend(line.strip() for line in stdin_text.splitlines() if line.strip())
		else:
		stripped = item.strip()
		if stripped:
		collected.append(stripped)

		if spec_file is not None:
		try:
		file_text = spec_file.read_text(encoding="utf-8")
		except OSError as exc:
		console.print(f"[red]Failed to read spec file: {exc}")
		raise typer.Exit(code=2) from exc
		collected.extend(line.strip() for line in file_text.splitlines() if line.strip())

		if not collected:
		console.print("[red]No spec numbers provided[/red]")
		raise typer.Exit(code=2)

		return collected


		def build_limits(
		limit_tdocs: int \| None,
		limit_meetings: int \| None,

src/tdoc_crawler/cli/printing.py

+43 −0

Original line number	Diff line number	Diff line
		@@ -7,6 +7,7 @@ from typing import Any
		from rich.table import Table

		from tdoc_crawler.models import MeetingMetadata, TDocMetadata
		from tdoc_crawler.specs import SpecCrawlResult

		from .console import get_console

		@@ -78,3 +79,45 @@ def print_meeting_table(results: list[MeetingMetadata]) -> None:
		)

		console.print(table)


		def spec_crawl_to_dict(result: SpecCrawlResult) -> dict[str, Any]:
		    """Flatten a crawl result into plain containers for JSON/YAML output.

		    The returned mapping has the keys ``spec_number``, ``release``, ``status``,
		    ``latest_version``, ``sources`` and ``message``, in that order. ``sources`` is a
		    list with one mapping per entry of ``result.sources``.
		    """
		    payload: dict[str, Any] = dict(
		        spec_number=result.spec_number,
		        release=result.release,
		        status=result.status,
		        latest_version=result.latest_version,
		    )

		    per_source: list[dict[str, Any]] = []
		    for entry in result.sources:
		        per_source.append(
		            dict(
		                source_name=entry.source_name,
		                status=entry.status,
		                versions=entry.versions,
		                message=entry.message,
		            )
		        )

		    payload["sources"] = per_source
		    payload["message"] = result.message
		    return payload


		def print_spec_crawl_table(results: list[SpecCrawlResult]) -> None:
		    """Render spec crawl results as a table on the module console.

		    The title reports ``len(results)``; only the first 100 results become rows.
		    """
		    column_styles = (
		        ("Spec", "cyan"),
		        ("Status", "magenta"),
		        ("Release", "yellow"),
		        ("Latest", "green"),
		        ("Sources", "blue"),
		    )

		    table = Table(title=f"Spec crawl results ({len(results)} rows)")
		    for heading, colour in column_styles:
		        table.add_column(heading, style=colour)

		    for row in results[:100]:
		        # One "name:status" token per source outcome; "-" when nothing was recorded.
		        source_summary = ", ".join(f"{outcome.source_name}:{outcome.status}" for outcome in row.sources)
		        if not source_summary:
		            source_summary = "-"

		        latest = row.latest_version
		        if not latest:
		            latest = "-"

		        table.add_row(row.spec_number, row.status, row.release, latest, source_summary)

		    console.print(table)