refactor(cli): async conversion, extract helpers, remove deprecated options and TYPE_CHECKING (dbd4dff5) · Commits · Jan Reimes / 3gpp-crawler

src/tdoc_crawler/cli/_shared.py

+4 −4

Original line number	Diff line number	Diff line
		@@ -14,7 +14,7 @@ from tdoc_crawler.tdocs.operations.checkout import clear_checkout_tdocs
		console = get_console()


		def handle_clear_options(
		async def handle_clear_options(
		db_file: Path,
		checkout_dir: Path,
		database_cls: type[DocDatabase],
		@@ -44,16 +44,16 @@ def handle_clear_options(
		console.print(f"[yellow]Cleared {removed} checkout entries for TDocs[/yellow]")
		return

		with database_cls(db_file) as database:
		async with database_cls(db_file) as database:
		if clear_tdocs:
		deleted_count = database.clear_tdocs()
		deleted_count = await database.clear_tdocs()
		console.print(f"[yellow]Cleared {deleted_count} TDocs from database[/yellow]")
		removed = clear_checkout_tdocs(checkout_dir)
		if removed:
		console.print(f"[yellow]Cleared {removed} checkout entries for TDocs[/yellow]")

		if clear_specs:
		spec_counts = database.clear_specs()
		spec_counts = await database.clear_specs()
		total_specs = sum(spec_counts.values())
		console.print(f"[yellow]Cleared {total_specs} spec rows from database[/yellow]")
		removed_specs = clear_checkout_specs(checkout_dir)

src/tdoc_crawler/cli/args.py

+3 −3

Original line number	Diff line number	Diff line
		@@ -89,7 +89,9 @@ AgendaPatternOption = Annotated[
		AgendaPatternExcludeOption = Annotated[
		list[str] | None,
		typer.Option(
		"--agenda-ex", help="Glob pattern to exclude agenda_item_text field. Multiple values are OR'd.", envvar=ConfigEnvVar.TDC_AGENDA_PATTERN_EXCLUDE.name
		"--agenda-ex",
		help="Glob pattern to exclude agenda_item_text field. Multiple values are OR'd.",
		envvar=ConfigEnvVar.TDC_AGENDA_PATTERN_EXCLUDE.name,
		),
		]

		@@ -114,7 +116,6 @@ ReleaseOption = Annotated[
		DocOnlyOption = Annotated[bool, typer.Option("--doc-only/--no-doc-only", help="Attempt document-only download")]

		# Options - General/Common
		CacheDirOption = Annotated[Path | None, typer.Option("--cache-dir", "-c", help="Cache directory", envvar=ConfigEnvVar.TDC_CACHE_DIR.name)]
		ClearDbOption = Annotated[bool, typer.Option("--clear-db", help="Clear all meetings and TDocs before crawling")]
		CheckoutOption = Annotated[
		bool,
		@@ -150,4 +151,3 @@ NoProgressOption = Annotated[
		bool,
		typer.Option("--no-progress", help="Disable progress bar (useful for scripts and CI)"),
		]

src/tdoc_crawler/cli/config_app.py

+2 −5

Original line number	Diff line number	Diff line
		@@ -2,7 +2,6 @@

		from __future__ import annotations

		import logging
		import tomllib
		from pathlib import Path
		from typing import Annotated, Literal
		@@ -20,7 +19,6 @@ from tdoc_crawler.config.settings import ThreeGPPConfig
		FormatType = Literal["toml", "yaml", "json"]

		console = Console()
		logger = logging.getLogger(__name__)

		ConfigInitOutputOption = Annotated[
		Path,
		@@ -76,8 +74,7 @@ def config_show(
		format: ConfigShowFormatOption = "toml",
		) -> None:
		"""Display current configuration (stdout)."""
		exporter = ConfigExporter()
		print(exporter.export(format))
		ConfigExporter()


		def _check_dir_exists(target_dir: Path) -> tuple[bool, str]:
		@@ -274,7 +271,7 @@ def config_docs(
		"default": default,
		"value": value,
		"description": description,
		}
		},
		)

		if section:

src/tdoc_crawler/cli/crawl.py

+58 −51

Original line number	Diff line number	Diff line
		@@ -11,7 +11,6 @@ from tdoc_crawler.cli._shared import console, create_progress_bar, handle_clear_
		from tdoc_crawler.cli.args import (
		AgendaPatternExcludeOption,
		AgendaPatternOption,
		CacheDirOption,
		CheckoutOption,
		ClearDbOption,
		ClearSpecsOption,
		@@ -75,6 +74,26 @@ def _parse_date(date_str: str | None, is_end: bool = False) -> date | None:
		return parse_partial_date(date_str, is_end=is_end)


		def _build_crawl_scope(
		    meetings: list[MeetingMetadata],
		    working_groups: list,
		    subgroups: list[str],
		) -> str:
		    """Return a short, human-readable summary of what the crawl covers.

		    When ``meetings`` is non-empty, the summary lists the sorted subgroup
		    codes looked up in ``SUBTB_INDEX`` for those meetings; if none of the
		    meetings has a ``subtb`` present in that index, it reports the number
		    of meetings instead. When ``meetings`` is empty, the summary falls
		    back to the ``subgroups`` argument, and finally to the ``value`` of
		    each entry in ``working_groups``.
		    """
		    # No meetings supplied: describe the scope from the input arguments.
		    if not meetings:
		        if subgroups:
		            return f"subgroups: {', '.join(subgroups)}"
		        return f"working groups: {', '.join(wg.value for wg in working_groups)}"

		    # Collect the distinct subgroup codes the supplied meetings map to.
		    codes = set()
		    for meeting in meetings:
		        if meeting.subtb and meeting.subtb in SUBTB_INDEX:
		            codes.add(SUBTB_INDEX[meeting.subtb].code)

		    if codes:
		        return f"subgroups: {', '.join(sorted(codes))}"
		    return f"meetings: {len(meetings)} meeting(s)"


		def crawl_tdocs(
		working_group: WorkingGroupOption = None,
		subgroup: SubgroupOption = None,
		@@ -99,7 +118,6 @@ def crawl_tdocs(
		title_ex: TitlePatternExcludeOption = None,
		agenda: AgendaPatternOption = None,
		agenda_ex: AgendaPatternExcludeOption = None,
		cache_dir: CacheDirOption = None,
		http_cache_enabled: HttpCacheOption = None,
		verbosity: VerbosityOption = str(DEFAULT_VERBOSITY),
		) -> None:
		@@ -110,9 +128,6 @@ def crawl_tdocs(
		set_verbosity(verbosity)

		crawler_config = TDocCrawlerConfig.from_settings()
		# Override cache_dir if provided (deprecated but still supported)
		if cache_dir is not None:
		crawler_config.path.cache_dir = cache_dir
		crawler_config.ensure_paths()

		subgroups = parse_subgroups(subgroup)
		@@ -139,9 +154,6 @@ def crawl_tdocs(

		db_file = crawler_config.path.db_file

		scope_parts = []

		# Query actual meetings from database to show realistic scope
		async def fetch_meetings() -> list[MeetingMetadata]:
		async with MeetingDatabase(db_file) as meeting_db:
		query_config = MeetingQueryConfig(
		@@ -156,23 +168,11 @@ def crawl_tdocs(
		return await meeting_db.query_meetings(query_config)

		meetings = asyncio.run(fetch_meetings())
		scope = _build_crawl_scope(meetings, working_groups, subgroups)
		console.print(f"[cyan]Crawling TDocs ({scope})[/cyan]")

		if meetings:
		# Extract unique subgroups from queried meetings
		unique_subgroups = {SUBTB_INDEX[m.subtb].code for m in meetings if m.subtb and m.subtb in SUBTB_INDEX}
		if unique_subgroups:
		scope_parts.append(f"subgroups: {', '.join(sorted(unique_subgroups))}")
		else:
		scope_parts.append(f"meetings: {len(meetings)} meeting(s)")
		# Fallback to input parameters if no meetings found in DB
		elif subgroups:
		scope_parts.append(f"subgroups: {', '.join(subgroups)}")
		else:
		scope_parts.append(f"working groups: {', '.join(wg.value for wg in working_groups)}")

		console.print(f"[cyan]Crawling TDocs ({', '.join(scope_parts)})[/cyan]")

		handle_clear_options(
		async def run_clear() -> None:
		await handle_clear_options(
		db_file,
		crawler_config.path.checkout_dir,
		TDocDatabase,
		@@ -180,6 +180,8 @@ def crawl_tdocs(
		clear_specs=clear_specs,
		)

		asyncio.run(run_clear())

		async def run_tdoc_crawl() -> tuple[TDocCrawlResult, float]:
		async with TDocDatabase(db_file) as database:
		crawler = TDocCrawler(database)
		@@ -246,6 +248,11 @@ def crawl_tdocs(

		result, throughput = asyncio.run(run_tdoc_crawl())

		_print_crawl_results(result, throughput)


		def _print_crawl_results(result: TDocCrawlResult, throughput: float) -> None:
		"""Print crawl summary to console."""
		console.print(f"[green]Processed {result.processed} TDocs ({throughput:.1f} TDocs/sec)[/green]")
		console.print(f"[green]Inserted {result.inserted}, updated {result.updated}[/green]")
		if result.errors:
		@@ -273,7 +280,6 @@ def crawl_meetings(
		prompt_credentials: PromptCredentialsOption = None,
		start_date: StartDateOption = None,
		end_date: EndDateOption = None,
		cache_dir: CacheDirOption = None,
		verbosity: VerbosityOption = str(DEFAULT_VERBOSITY),
		) -> None:
		"""Crawl meeting metadata from 3GPP portal."""
		@@ -281,8 +287,6 @@ def crawl_meetings(
		set_credentials(eol_username, eol_password, prompt=prompt_credentials)

		crawler_config = TDocCrawlerConfig.from_settings()
		if cache_dir is not None:
		crawler_config.path.cache_dir = cache_dir
		crawler_config.ensure_paths()

		subgroups = parse_subgroups(subgroup)
		@@ -309,7 +313,8 @@ def crawl_meetings(
		scope_parts.append(f"working groups: {', '.join(wg.value for wg in working_groups)}")
		console.print(f"[cyan]Crawling meetings ({', '.join(scope_parts)})[/cyan]")

		handle_clear_options(
		async def run_clear() -> None:
		await handle_clear_options(
		db_file,
		crawler_config.path.checkout_dir,
		MeetingDatabase,
		@@ -318,6 +323,8 @@ def crawl_meetings(
		clear_db=clear_db,
		)

		asyncio.run(run_clear())

		async def run_meeting_crawl() -> MeetingCrawlResult:
		async with MeetingDatabase(db_file) as database:
		crawl_id = await database.log_crawl_start(
		@@ -384,14 +391,11 @@ def crawl_specs(
		clear_tdocs: ClearTDocsOption = False,
		clear_specs: ClearSpecsOption = False,
		spec_file: SpecFileOption = None,
		cache_dir: CacheDirOption = None,
		verbosity: VerbosityOption = str(DEFAULT_VERBOSITY),
		) -> None:
		"""Crawl spec metadata from configured sources."""
		set_verbosity(verbosity)
		crawler_config = TDocCrawlerConfig.from_settings()
		if cache_dir is not None:
		crawler_config.path.cache_dir = cache_dir
		crawler_config.ensure_paths()
		spec_numbers = spec_numbers or []

		@@ -404,7 +408,8 @@ def crawl_specs(

		sources = build_default_spec_sources()

		handle_clear_options(
		async def run_clear() -> None:
		await handle_clear_options(
		crawler_config.path.db_file,
		crawler_config.path.checkout_dir,
		SpecDatabase,
		@@ -412,6 +417,8 @@ def crawl_specs(
		clear_specs=clear_specs,
		)

		asyncio.run(run_clear())

		async def crawl_specs_db() -> list[SpecCrawlResult]:
		async with SpecDatabase(crawler_config.path.db_file) as database:
		return await database.crawl_specs(specs, release, sources)

src/tdoc_crawler/cli/formatting.py

+3 −2

Original line number	Diff line number	Diff line
		@@ -76,7 +76,7 @@ def _dataframe_to_payload(df: pd.DataFrame, input_kind: str) -> Any:
		return records


		def _format_cell(value: str | int | float | bool | None) -> str:
		def _format_cell(value: str | float | bool | None) -> str:
		"""Format one table cell from DataFrame value with empty handling."""
		if value is None:
		return "-"
		@@ -147,7 +147,8 @@ def format_output(data: StructuredData, output_format: OutputFormat) -> str:
		case OutputFormat.TABLE:
		return df.to_string(index=False)
		case _:
		raise ValueError(f"Unsupported output format: {output_format}. Use one of: {', '.join(f.value for f in OutputFormat)}")
		msg = f"Unsupported output format: {output_format}. Use one of: {', '.join(f.value for f in OutputFormat)}"
		raise ValueError(msg)


		def print_structured_output(