♻️ refactor(cli): move helper functions to utils module (2ffa247f) · Commits · Jan Reimes / 3gpp-crawler

src/tdoc_crawler/cli/AGENTS.md

+1 −35

Original line number	Diff line number	Diff line
		@@ -37,43 +37,9 @@ Assume `cli/` could be separated as an optional package. If a function would be
		\| `app.py` \| Typer command definitions and CLI entry points \|
		\| `args.py` \| Typer Annotated types for arguments and options \|
		\| `console.py` \| Rich Console singleton for CLI output \|
		\| `helpers.py` \| Helper functions - check classification rules above \|
		\| `fetching.py` \| TDoc fetching - check classification rules below \|
		\| `utils.py` \| Helper functions - check classification rules above \|
		\| `printing.py` \| Table and output formatting for CLI \|

		## Function Classification

		### `helpers.py`

		CLI Functions (stay in cli/):

		- `parse_working_groups()` - CLI argument parsing
		- `parse_subgroups()` - CLI argument parsing
		- `collect_spec_numbers()` - CLI stdin/file input handling
		- `build_limits()` - CLI config builder wrapper
		- `launch_file()` - System calls for opening files
		- `resolve_http_cache_config()` - CLI/env var configuration parsing
		- `infer_working_groups_from_ids()` - CLI string inference

		Library Functions (moved to core):

		- `normalize_portal_meeting_name()` → `tdoc_crawler.specs.normalization`
		- `resolve_meeting_id()` → `tdoc_crawler.database`
		- `download_to_path()` → `tdoc_crawler.http_client`
		- `prepare_tdoc_file()` → `tdoc_crawler.checkout`
		- `database_path()` → `tdoc_crawler.database`

		### `fetching.py`

		CLI Functions (stay in cli/):

		- `fetch_missing_tdocs()` - Uses CLI console output
		- `_fetch_via_whatthespec()` - Uses CLI console output
		- `maybe_fetch_missing_tdocs()` - CLI console and flag handling

		Library Functions:

		- `fetch_tdoc()` - Import from `tdoc_crawler.fetching` (not duplicated in CLI)

		## Lessons Learned

src/tdoc_crawler/cli/app.py

+29 −30

Original line number	Diff line number	Diff line
		@@ -26,6 +26,7 @@ from tdoc_crawler.checkout import (
		from tdoc_crawler.cli.args import (
		DEFAULT_VERBOSITY,
		CacheDirOption,
		CheckoutDirOption,
		CheckoutOption,
		CheckoutTDocIdsArgument,
		ClearDbOption,
		@@ -64,7 +65,6 @@ from tdoc_crawler.cli.args import (
		WorkingGroupOption,
		)
		from tdoc_crawler.cli.console import get_console
		from tdoc_crawler.cli.helpers import build_limits, collect_spec_numbers, launch_file, parse_subgroups, parse_working_groups
		from tdoc_crawler.cli.printing import (
		meeting_to_dict,
		print_checkout_results,
		@@ -76,17 +76,17 @@ from tdoc_crawler.cli.printing import (
		spec_query_to_dict,
		tdoc_to_dict,
		)
		from tdoc_crawler.cli.utils import launch_file
		from tdoc_crawler.config import CacheManager
		from tdoc_crawler.crawlers import MeetingCrawler, TDocCrawler
		from tdoc_crawler.credentials import resolve_credentials, set_credentials
		from tdoc_crawler.database import SpecDatabase, TDocDatabase
		from tdoc_crawler.database import TDocDatabase
		from tdoc_crawler.fetching import fetch_missing_tdocs
		from tdoc_crawler.http_client import create_cached_session
		from tdoc_crawler.logging import set_verbosity
		from tdoc_crawler.models import MeetingCrawlConfig, MeetingQueryConfig, OutputFormat, QueryConfig, SortOrder, TDocCrawlConfig
		from tdoc_crawler.models.specs import SpecQueryFilters
		from tdoc_crawler.specs import SpecCatalog
		from tdoc_crawler.specs.downloads import SpecDownloads
		from tdoc_crawler.models import CrawlLimits, MeetingCrawlConfig, MeetingQueryConfig, OutputFormat, QueryConfig, SortOrder, SpecQueryFilters, TDocCrawlConfig
		from tdoc_crawler.specs import SpecDatabase, SpecDownloads
		from tdoc_crawler.utils.parse import collect_spec_numbers, parse_subgroups, parse_working_groups

		load_dotenv()

		@@ -126,7 +126,8 @@ def crawl_tdocs(

		subgroups = parse_subgroups(subgroup)
		working_groups = parse_working_groups(working_group, subgroups)
		limits = build_limits(limit_tdocs, limit_meetings, limit_meetings_per_wg, limit_wgs)

		limits = CrawlLimits.build(limit_tdocs, limit_meetings, limit_meetings_per_wg, limit_wgs)
		config = TDocCrawlConfig(
		cache_dir=manager.root,
		working_groups=working_groups,
		@@ -158,7 +159,7 @@ def crawl_tdocs(
		scope_parts.append(f"working groups: {', '.join(wg.value for wg in working_groups)}")
		console.print(f"[cyan]Crawling TDocs ({', '.join(scope_parts)})[/cyan]")

		with SpecDatabase(db_file) as database:
		with TDocDatabase(db_file) as database:
		checkout_dir = manager.checkout_dir
		# Clear TDocs if requested
		if clear_tdocs:
		@@ -275,7 +276,7 @@ def crawl_meetings(

		subgroups = parse_subgroups(subgroup)
		working_groups = parse_working_groups(working_group, subgroups)
		limits = build_limits(None, limit_meetings, limit_meetings_per_wg, limit_wgs)
		limits = CrawlLimits.build(None, limit_meetings, limit_meetings_per_wg, limit_wgs)
		set_credentials(eol_username, eol_password, prompt_credentials)
		config = MeetingCrawlConfig(
		cache_dir=manager.root,
		@@ -297,7 +298,7 @@ def crawl_meetings(
		scope_parts.append(f"working groups: {', '.join(wg.value for wg in working_groups)}")
		console.print(f"[cyan]Crawling meetings ({', '.join(scope_parts)})[/cyan]")

		with TDocDatabase(db_file) as database:
		with SpecDatabase(db_file) as database:
		checkout_dir = manager.checkout_dir
		# Clear all data if requested
		if clear_db:
		@@ -369,7 +370,7 @@ def crawl_meetings(
		order=SortOrder.DESC,
		include_without_files=False,
		)
		with TDocDatabase(db_file) as database:
		with SpecDatabase(db_file) as database:
		meetings = database.query_meetings(query_config)

		with create_cached_session(manager.http_cache_dir) as session:
		@@ -515,7 +516,7 @@ def query_meetings(
		)

		db_file = manager.db_file
		with TDocDatabase(db_file) as database:
		with SpecDatabase(db_file) as database:
		checkout_dir = manager.checkout_dir
		if clear_tdocs:
		deleted_count = database.clear_tdocs()
		@@ -589,7 +590,7 @@ def query_specs(
		raise typer.Exit(code=2) from exc

		db_file = manager.db_file
		with TDocDatabase(db_file) as database:
		with SpecDatabase(db_file) as database:
		checkout_dir = manager.checkout_dir
		if clear_tdocs:
		deleted_count = database.clear_tdocs()
		@@ -605,8 +606,7 @@ def query_specs(
		if removed_specs:
		console.print("[yellow]Cleared checkout entries for specs[/yellow]")

		catalog = SpecCatalog(database)
		results = catalog.query_specs(filters, release="latest")
		results = database.query_specs(filters)

		if not results:
		console.print("[yellow]No specs found[/yellow]")
		@@ -614,7 +614,7 @@ def query_specs(

		if checkout:
		spec_list = [result.spec_number for result in results]
		with TDocDatabase(db_file) as database:
		with SpecDatabase(db_file) as database:
		checkout_specs(spec_list, manager.checkout_dir, database, release="latest")

		if output is OutputFormat.JSON:
		@@ -670,13 +670,13 @@ def open_tdoc(
		metadata = results[0]

		try:
		target_file = prepare_tdoc_file(metadata, manager.root, session=session)
		tdoc_file = prepare_tdoc_file(metadata, manager.root, session=session)
		except (FileNotFoundError, OSError, ValueError, zipfile.BadZipFile) as exc:
		console.print(f"[red]Failed to prepare TDoc {normalized_id}: {exc}")
		raise typer.Exit(code=1) from exc

		console.print(f"[green]Opening {target_file}")
		launch_file(target_file)
		console.print(f"[green]Opening {tdoc_file}")
		launch_file(tdoc_file)


		@app.command("checkout", rich_help_panel=HELP_PANEL_MAIN)
		@@ -763,7 +763,7 @@ def stats(
		console.print(f"[red]Database not found: {db_file}[/red]")
		raise typer.Exit(code=1)

		with TDocDatabase(db_file) as database:
		with SpecDatabase(db_file) as database:
		stats_dict = cast(dict[str, Any], database.get_statistics())

		table = Table(title="TDoc database statistics")
		@@ -811,7 +811,7 @@ def crawl_specs(
		sources = build_default_spec_sources()

		db_file = manager.db_file
		with TDocDatabase(db_file) as database:
		with SpecDatabase(db_file) as database:
		checkout_dir = manager.checkout_dir
		if clear_tdocs:
		deleted_count = database.clear_tdocs()
		@@ -827,15 +827,14 @@ def crawl_specs(
		if removed_specs:
		console.print("[yellow]Cleared checkout entries for specs[/yellow]")

		catalog = SpecCatalog(database)
		results = catalog.crawl_specs(specs, release, sources)
		results = database.crawl_specs(specs, release, sources)

		if not results:
		console.print("[yellow]No specs crawled[/yellow]")
		return

		if checkout:
		with TDocDatabase(db_file) as database:
		with SpecDatabase(db_file) as database:
		checkout_specs(
		[result.spec_number for result in results],
		manager.checkout_dir,
		@@ -857,7 +856,7 @@ def checkout_spec(
		spec_file: SpecFileOption = None,
		release: ReleaseOption = "latest",
		doc_only: DocOnlyOption = False,
		# checkout_dir: CheckoutDirOption = None,
		checkout_dir: CheckoutDirOption = None,
		cache_dir: CacheDirOption = None,
		verbosity: VerbosityOption = DEFAULT_VERBOSITY,
		) -> None:
		@@ -876,7 +875,7 @@ def checkout_spec(
		sources = build_default_spec_sources()

		db_file = manager.db_file
		with TDocDatabase(db_file) as database:
		with SpecDatabase(db_file) as database:
		downloader = SpecDownloads(database)
		results = downloader.checkout_specs(specs, doc_only, effective_checkout_dir, release, sources=sources)

		@@ -901,12 +900,12 @@ def open_spec(
		sources = build_default_spec_sources()

		db_file = manager.db_file
		with TDocDatabase(db_file) as database:
		with SpecDatabase(db_file) as database:
		downloader = SpecDownloads(database)
		try:
		path = downloader.open_spec(normalized, doc_only, checkout_dir, release, sources=sources)
		console.print(f"[green]Opening {path}[/green]")
		launch_file(path)
		spec_file = downloader.open_spec(normalized, doc_only, checkout_dir, release, sources=sources)
		console.print(f"[green]Opening {spec_file}[/green]")
		launch_file(spec_file)
		except Exception as exc:
		console.print(f"[red]Failed to open spec: {exc}[/red]")
		raise typer.Exit(code=1)

src/tdoc_crawler/cli/args.py

+1 −0

Original line number	Diff line number	Diff line
		@@ -78,6 +78,7 @@ ClearDbOption = Annotated[bool, typer.Option("--clear-db", help="Clear all meeti
		CheckoutOption = Annotated[
		bool, typer.Option("--checkout/--no-checkout", help="Download and extract metadata results to checkout folder", envvar="TDC_CHECKOUT")
		]
		CheckoutDirOption = Annotated[Path \| None, typer.Option("--checkout-dir", help="Directory for checkout files", envvar="TDC_CHECKOUT_DIR")]
		WorkersOption = Annotated[int, typer.Option("--workers", help="Number of parallel subinterpreter workers", envvar="TDC_WORKERS")]
		MaxRetriesOption = Annotated[int, typer.Option("--max-retries", help="HTTP retry attempts", envvar="TDC_MAX_RETRIES")]
		TimeoutOption = Annotated[int, typer.Option("--timeout", help="HTTP timeout seconds", envvar="TDC_TIMEOUT")]

src/tdoc_crawler/cli/helpers.py

deleted100644 → 0

+0 −223

Original line number	Diff line number	Diff line
		"""Helper functions for CLI operations."""

		from __future__ import annotations

		import os
		import subprocess
		import sys
		from collections.abc import Iterable
		from pathlib import Path

		import click
		import typer

		from tdoc_crawler.cli.console import get_console
		from tdoc_crawler.crawlers import normalize_subgroup_alias, normalize_working_group_alias
		from tdoc_crawler.database import TDocDatabase
		from tdoc_crawler.logging import get_logger
		from tdoc_crawler.models import CrawlLimits, HttpCacheConfig, MeetingQueryConfig, SortOrder, WorkingGroup
		from tdoc_crawler.specs.normalization import expand_spec_ranges_batch, normalize_portal_meeting_name

		# Shared CLI console used by the helpers below for user-facing messages.
		console = get_console()
		# Module-level logger; no function visible in this module uses it.
		_logger = get_logger(__name__)

		# NOTE(review): none of the three constants below is referenced by a function
		# visible in this module - presumably leftovers from helpers that moved
		# elsewhere; confirm before relying on them.
		DEFAULT_DATABASE_FILENAME = "tdoc_crawler.db"
		DOWNLOAD_TIMEOUT = 60  # presumably seconds - TODO confirm
		ALLOWED_DOWNLOAD_SCHEMES = ("ftp://", "http://", "https://")


		def infer_working_groups_from_subgroups(subgroups: list[str]) -> list[WorkingGroup]:
		    """Derive the parent working groups from a list of subgroup codes.

		    Only the first character of each code is inspected (case-insensitively):
		    ``R`` maps to RAN, ``S`` to SA and ``C`` to CT. Empty codes and codes
		    starting with any other character are ignored.

		    Args:
		        subgroups: Subgroup codes (e.g., ["S4", "R1"])

		    Returns:
		        The matched working groups in first-seen order without duplicates,
		        or all three working groups when nothing matched.
		    """
		    prefix_map = {
		        "R": WorkingGroup.RAN,
		        "S": WorkingGroup.SA,
		        "C": WorkingGroup.CT,
		    }
		    inferred: list[WorkingGroup] = []
		    for code in subgroups:
		        if not code:
		            continue
		        candidate = prefix_map.get(code[0].upper())
		        if candidate is None or candidate in inferred:
		            continue
		        inferred.append(candidate)

		    if inferred:
		        return inferred
		    # Nothing recognisable: fall back to every working group.
		    return [WorkingGroup.RAN, WorkingGroup.SA, WorkingGroup.CT]


		def parse_working_groups(values: list[str] | None, subgroups: list[str] | None = None) -> list[WorkingGroup]:
		    """Turn working group names from the CLI into ``WorkingGroup`` members.

		    Args:
		        values: Working group names given explicitly on the command line
		        subgroups: Optional subgroup codes, used to infer working groups
		            when ``values`` is empty

		    Returns:
		        The working groups to crawl. Without explicit values this is the
		        set inferred from ``subgroups``, or all working groups when no
		        subgroups were given either.

		    Raises:
		        typer.Exit: With code 2 when a name cannot be resolved.
		    """
		    if not values:
		        # No explicit selection: infer from subgroups, else default to everything.
		        if not subgroups:
		            return [WorkingGroup.RAN, WorkingGroup.SA, WorkingGroup.CT]
		        return infer_working_groups_from_subgroups(subgroups)

		    resolved: list[WorkingGroup] = []
		    for item in values:
		        # Aliases are normalized before the enum lookup.
		        alias = normalize_working_group_alias(item)
		        try:
		            group = WorkingGroup(alias.upper())
		        except ValueError as exc:
		            console.print(f"[red]Unknown working group: {item}")
		            raise typer.Exit(code=2) from exc
		        resolved.append(group)

		    if resolved:
		        return resolved
		    console.print("[red]No valid working groups specified")
		    raise typer.Exit(code=2)


		def parse_subgroups(values: list[str] | None) -> list[str] | None:
		    """Normalize subgroup aliases from the CLI.

		    Each value is passed through ``normalize_subgroup_alias``; everything it
		    returns for a value is appended to the result.

		    Args:
		        values: Subgroup aliases given on the command line

		    Returns:
		        The normalized subgroup names, or ``None`` when no values were given.

		    Raises:
		        typer.Exit: With code 2 when an alias yields no normalized name.
		    """
		    if not values:
		        return None

		    canonical: list[str] = []
		    for item in values:
		        expansion = normalize_subgroup_alias(item)
		        if expansion:
		            canonical.extend(expansion)
		            continue
		        console.print(f"[red]Unknown subgroup: {item}")
		        raise typer.Exit(code=2)
		    return canonical


		def collect_spec_numbers(specs: list[str] \| None, spec_file: Path \| None) -> list[str]:
		"""Collect spec numbers from CLI arguments or a file."""
		collected: list[str] = []

		if specs:
		for spec in specs:
		if spec == "-":
		# Read from stdin
		for line in sys.stdin:
		line_stripped = line.strip()
		if line_stripped:
		collected.append(line_stripped)
		else:
		collected.append(spec.strip())

		if spec_file and spec_file.exists():
		try:
		with spec_file.open("r", encoding="utf-8") as f:
		for line in f:
		line_stripped = line.strip()
		if line_stripped:
		collected.append(line_stripped)
		except OSError as exc:
		raise click.FileError(str(spec_file), hint=f"Cannot read spec file: {exc}")

		if not collected:
		return []

		try:
		expanded = expand_spec_ranges_batch(collected)
		except ValueError as e:
		raise click.UsageError(str(e))

		return expanded


		def build_limits(
		    limit_tdocs: int | None,
		    limit_meetings: int | None,
		    limit_meetings_per_wg: int | None,
		    limit_wgs: int | None,
		) -> CrawlLimits:
		    """Bundle the individual CLI limit options into a ``CrawlLimits`` object.

		    Each argument is forwarded unchanged to the ``CrawlLimits`` field of the
		    same name.
		    """
		    limit_kwargs = {
		        "limit_tdocs": limit_tdocs,
		        "limit_meetings": limit_meetings,
		        "limit_meetings_per_wg": limit_meetings_per_wg,
		        "limit_wgs": limit_wgs,
		    }
		    return CrawlLimits(**limit_kwargs)


		def infer_working_groups_from_ids(ids: Iterable[str]) -> list[WorkingGroup]:
		    """Guess the working groups a set of TDoc IDs belongs to.

		    Only the first character of each ID is used (case-insensitively):
		    ``R`` maps to RAN, ``S`` to SA, and both ``T`` and ``C`` to CT. Empty
		    IDs and unknown leading characters are skipped.

		    Returns:
		        The matched working groups in first-seen order without duplicates,
		        or all three working groups when nothing matched.
		    """
		    prefix_to_group = {
		        "R": WorkingGroup.RAN,
		        "S": WorkingGroup.SA,
		        "T": WorkingGroup.CT,
		        "C": WorkingGroup.CT,
		    }
		    found: list[WorkingGroup] = []
		    for tdoc_id in ids:
		        if not tdoc_id:
		            continue
		        match = prefix_to_group.get(tdoc_id[0].upper())
		        if not match or match in found:
		            continue
		        found.append(match)

		    if found:
		        return found
		    return [WorkingGroup.RAN, WorkingGroup.SA, WorkingGroup.CT]


		def launch_file(path: Path) -> None:
		    """Open ``path`` with the operating system's default application.

		    Windows uses ``os.startfile``; macOS runs ``/usr/bin/open``; every other
		    platform runs ``/usr/bin/xdg-open``. If the opener binary is missing, a
		    warning is printed and the function returns without opening anything.
		    The opener's exit status is not checked.

		    Raises:
		        typer.Exit: With code 1 when ``path`` does not exist or launching
		            raises ``OSError``.
		    """
		    if not path.exists():
		        console.print(f"[red]File not found: {path}")
		        raise typer.Exit(code=1)

		    platform = sys.platform
		    try:
		        if platform.startswith("win"):
		            os.startfile(path)  # noqa: S606
		            return

		        if platform == "darwin":
		            opener = Path("/usr/bin/open")
		            missing_message = "[yellow]/usr/bin/open not available[/yellow]"
		        else:
		            opener = Path("/usr/bin/xdg-open")
		            missing_message = "[yellow]xdg-open command not available[/yellow]"

		        if not opener.exists():
		            console.print(missing_message)
		            return
		        subprocess.run([str(opener), str(path)], check=False)  # noqa: S603
		    except OSError as exc:
		        console.print(f"[red]Failed to open file: {exc}")
		        raise typer.Exit(code=1) from exc


		def resolve_http_cache_config(cache_ttl: int | None = None, cache_refresh_on_access: bool | None = None) -> HttpCacheConfig:
		    """Resolve HTTP cache configuration from CLI parameters and environment variables.

		    Each setting is taken from the CLI parameter when given, otherwise from
		    its environment variable, otherwise from a built-in default.

		    Args:
		        cache_ttl: TTL for cache entries (CLI parameter). Falls back to the
		            ``HTTP_CACHE_TTL`` environment variable, then to 7200.
		        cache_refresh_on_access: Whether to refresh TTL on access (CLI
		            parameter). Falls back to ``HTTP_CACHE_REFRESH_ON_ACCESS``
		            (true for "true", "1", "yes", "on", "t", "y", case-insensitive),
		            then to ``True``.

		    Returns:
		        HttpCacheConfig instance with resolved values

		    Raises:
		        ValueError: If ``HTTP_CACHE_TTL`` is set but is not an integer.
		    """
		    if cache_ttl is not None:
		        ttl = cache_ttl
		    else:
		        env_ttl = os.getenv("HTTP_CACHE_TTL")
		        if env_ttl:
		            try:
		                ttl = int(env_ttl)
		            except ValueError as exc:
		                # Name the variable so the user knows which setting to fix.
		                raise ValueError(f"HTTP_CACHE_TTL must be an integer, got {env_ttl!r}") from exc
		        else:
		            ttl = 7200

		    if cache_refresh_on_access is not None:
		        refresh_on_access = cache_refresh_on_access
		    else:
		        env_refresh = os.getenv("HTTP_CACHE_REFRESH_ON_ACCESS", "").lower()
		        # An unset or empty variable means "use the default" (refresh enabled).
		        refresh_on_access = env_refresh in ("true", "1", "yes", "on", "t", "y") if env_refresh else True

		    return HttpCacheConfig(ttl=ttl, refresh_ttl_on_access=refresh_on_access)

src/tdoc_crawler/cli/utils.py

0 → 100644

+40 −0

Original line number	Diff line number	Diff line
		"""CLI utilities."""

		from __future__ import annotations

		import os
		import subprocess
		import sys
		from pathlib import Path

		import typer

		from tdoc_crawler.cli.console import get_console

		# Shared CLI console used by launch_file() below for user-facing messages.
		console = get_console()


		def launch_file(path: Path) -> None:
		    """Open ``path`` with the operating system's default application.

		    Windows uses ``os.startfile``; macOS runs ``/usr/bin/open``; Linux and
		    other Unix-like systems run ``/usr/bin/xdg-open``. If the opener binary
		    is missing, a warning is printed and nothing is opened. The opener's
		    exit status is not checked.

		    Raises:
		        typer.Exit: With code 1 when ``path`` does not exist or launching
		            raises ``OSError``.
		    """
		    if not path.exists():
		        console.print(f"[red]File not found: {path}")
		        raise typer.Exit(code=1)

		    current_platform = sys.platform
		    try:
		        if current_platform.startswith("win"):
		            os.startfile(path)  # noqa: S606
		            return

		        on_macos = current_platform == "darwin"
		        launcher = Path("/usr/bin/open") if on_macos else Path("/usr/bin/xdg-open")
		        if launcher.exists():
		            subprocess.run([str(launcher), str(path)], check=False)  # noqa: S603
		        elif on_macos:
		            console.print("[yellow]/usr/bin/open not available[/yellow]")
		        else:
		            console.print("[yellow]xdg-open command not available[/yellow]")
		    except OSError as exc:
		        console.print(f"[red]Failed to open file: {exc}")
		        raise typer.Exit(code=1) from exc