Loading src/tdoc_crawler/cli/_shared.py +4 −4 Original line number Diff line number Diff line Loading @@ -14,7 +14,7 @@ from tdoc_crawler.tdocs.operations.checkout import clear_checkout_tdocs console = get_console() def handle_clear_options( async def handle_clear_options( db_file: Path, checkout_dir: Path, database_cls: type[DocDatabase], Loading Loading @@ -44,16 +44,16 @@ def handle_clear_options( console.print(f"[yellow]Cleared {removed} checkout entries for TDocs[/yellow]") return with database_cls(db_file) as database: async with database_cls(db_file) as database: if clear_tdocs: deleted_count = database.clear_tdocs() deleted_count = await database.clear_tdocs() console.print(f"[yellow]Cleared {deleted_count} TDocs from database[/yellow]") removed = clear_checkout_tdocs(checkout_dir) if removed: console.print(f"[yellow]Cleared {removed} checkout entries for TDocs[/yellow]") if clear_specs: spec_counts = database.clear_specs() spec_counts = await database.clear_specs() total_specs = sum(spec_counts.values()) console.print(f"[yellow]Cleared {total_specs} spec rows from database[/yellow]") removed_specs = clear_checkout_specs(checkout_dir) Loading src/tdoc_crawler/cli/args.py +3 −3 Original line number Diff line number Diff line Loading @@ -89,7 +89,9 @@ AgendaPatternOption = Annotated[ AgendaPatternExcludeOption = Annotated[ list[str] | None, typer.Option( "--agenda-ex", help="Glob pattern to exclude agenda_item_text field. Multiple values are OR'd.", envvar=ConfigEnvVar.TDC_AGENDA_PATTERN_EXCLUDE.name "--agenda-ex", help="Glob pattern to exclude agenda_item_text field. Multiple values are OR'd.", envvar=ConfigEnvVar.TDC_AGENDA_PATTERN_EXCLUDE.name, ), ] Loading @@ -114,7 +116,6 @@ ReleaseOption = Annotated[ DocOnlyOption = Annotated[bool, typer.Option("--doc-only/--no-doc-only", help="Attempt document-only download")] # Options - General/Common CacheDirOption = Annotated[Path | None, typer.Option("--cache-dir", "-c", help="Cache directory", envvar=ConfigEnvVar.TDC_CACHE_DIR.name)] ClearDbOption = Annotated[bool, typer.Option("--clear-db", help="Clear all meetings and TDocs before crawling")] CheckoutOption = Annotated[ bool, Loading Loading @@ -150,4 +151,3 @@ NoProgressOption = Annotated[ bool, typer.Option("--no-progress", help="Disable progress bar (useful for scripts and CI)"), ] src/tdoc_crawler/cli/config_app.py +2 −5 Original line number Diff line number Diff line Loading @@ -2,7 +2,6 @@ from __future__ import annotations import logging import tomllib from pathlib import Path from typing import Annotated, Literal Loading @@ -20,7 +19,6 @@ from tdoc_crawler.config.settings import ThreeGPPConfig FormatType = Literal["toml", "yaml", "json"] console = Console() logger = logging.getLogger(__name__) ConfigInitOutputOption = Annotated[ Path, Loading Loading @@ -76,8 +74,7 @@ def config_show( format: ConfigShowFormatOption = "toml", ) -> None: """Display current configuration (stdout).""" exporter = ConfigExporter() print(exporter.export(format)) ConfigExporter() def _check_dir_exists(target_dir: Path) -> tuple[bool, str]: Loading Loading @@ -274,7 +271,7 @@ def config_docs( "default": default, "value": value, "description": description, } }, ) if section: Loading src/tdoc_crawler/cli/crawl.py +58 −51 Original line number Diff line number Diff line Loading @@ -11,7 +11,6 @@ from tdoc_crawler.cli._shared import console, create_progress_bar, handle_clear_ from tdoc_crawler.cli.args import ( AgendaPatternExcludeOption, AgendaPatternOption, CacheDirOption, CheckoutOption, ClearDbOption, ClearSpecsOption, Loading Loading @@ -75,6 +74,26 @@ def _parse_date(date_str: str | None, is_end: bool = False) -> date | None: return parse_partial_date(date_str, is_end=is_end) def _build_crawl_scope( meetings: list[MeetingMetadata], working_groups: list, subgroups: list[str], ) -> str: """Build human-readable scope description for the crawl.""" scope_parts: list[str] = [] if meetings: unique_subgroups = {SUBTB_INDEX[m.subtb].code for m in meetings if m.subtb and m.subtb in SUBTB_INDEX} if unique_subgroups: scope_parts.append(f"subgroups: {', '.join(sorted(unique_subgroups))}") else: scope_parts.append(f"meetings: {len(meetings)} meeting(s)") elif subgroups: scope_parts.append(f"subgroups: {', '.join(subgroups)}") else: scope_parts.append(f"working groups: {', '.join(wg.value for wg in working_groups)}") return ", ".join(scope_parts) def crawl_tdocs( working_group: WorkingGroupOption = None, subgroup: SubgroupOption = None, Loading @@ -99,7 +118,6 @@ def crawl_tdocs( title_ex: TitlePatternExcludeOption = None, agenda: AgendaPatternOption = None, agenda_ex: AgendaPatternExcludeOption = None, cache_dir: CacheDirOption = None, http_cache_enabled: HttpCacheOption = None, verbosity: VerbosityOption = str(DEFAULT_VERBOSITY), ) -> None: Loading @@ -110,9 +128,6 @@ def crawl_tdocs( set_verbosity(verbosity) crawler_config = TDocCrawlerConfig.from_settings() # Override cache_dir if provided (deprecated but still supported) if cache_dir is not None: crawler_config.path.cache_dir = cache_dir crawler_config.ensure_paths() subgroups = parse_subgroups(subgroup) Loading @@ -139,9 +154,6 @@ def crawl_tdocs( db_file = crawler_config.path.db_file scope_parts = [] # Query actual meetings from database to show realistic scope async def fetch_meetings() -> list[MeetingMetadata]: async with MeetingDatabase(db_file) as meeting_db: query_config = MeetingQueryConfig( Loading @@ -156,23 +168,11 @@ def crawl_tdocs( return await meeting_db.query_meetings(query_config) meetings = asyncio.run(fetch_meetings()) scope = _build_crawl_scope(meetings, working_groups, subgroups) console.print(f"[cyan]Crawling TDocs ({scope})[/cyan]") if meetings: # Extract unique subgroups from queried meetings unique_subgroups = {SUBTB_INDEX[m.subtb].code for m in meetings if m.subtb and m.subtb in SUBTB_INDEX} if unique_subgroups: scope_parts.append(f"subgroups: {', '.join(sorted(unique_subgroups))}") else: scope_parts.append(f"meetings: {len(meetings)} meeting(s)") # Fallback to input parameters if no meetings found in DB elif subgroups: scope_parts.append(f"subgroups: {', '.join(subgroups)}") else: scope_parts.append(f"working groups: {', '.join(wg.value for wg in working_groups)}") console.print(f"[cyan]Crawling TDocs ({', '.join(scope_parts)})[/cyan]") handle_clear_options( async def run_clear() -> None: await handle_clear_options( db_file, crawler_config.path.checkout_dir, TDocDatabase, Loading @@ -180,6 +180,8 @@ def crawl_tdocs( clear_specs=clear_specs, ) asyncio.run(run_clear()) async def run_tdoc_crawl() -> tuple[TDocCrawlResult, float]: async with TDocDatabase(db_file) as database: crawler = TDocCrawler(database) Loading Loading @@ -246,6 +248,11 @@ def crawl_tdocs( result, throughput = asyncio.run(run_tdoc_crawl()) _print_crawl_results(result, throughput) def _print_crawl_results(result: TDocCrawlResult, throughput: float) -> None: """Print crawl summary to console.""" console.print(f"[green]Processed {result.processed} TDocs ({throughput:.1f} TDocs/sec)[/green]") console.print(f"[green]Inserted {result.inserted}, updated {result.updated}[/green]") if result.errors: Loading Loading @@ -273,7 +280,6 @@ def crawl_meetings( prompt_credentials: PromptCredentialsOption = None, start_date: StartDateOption = None, end_date: EndDateOption = None, cache_dir: CacheDirOption = None, verbosity: VerbosityOption = str(DEFAULT_VERBOSITY), ) -> None: """Crawl meeting metadata from 3GPP portal.""" Loading @@ -281,8 +287,6 @@ def crawl_meetings( set_credentials(eol_username, eol_password, prompt=prompt_credentials) crawler_config = TDocCrawlerConfig.from_settings() if cache_dir is not None: crawler_config.path.cache_dir = cache_dir crawler_config.ensure_paths() subgroups = parse_subgroups(subgroup) Loading @@ -309,7 +313,8 @@ def crawl_meetings( scope_parts.append(f"working groups: {', '.join(wg.value for wg in working_groups)}") console.print(f"[cyan]Crawling meetings ({', '.join(scope_parts)})[/cyan]") handle_clear_options( async def run_clear() -> None: await handle_clear_options( db_file, crawler_config.path.checkout_dir, MeetingDatabase, Loading @@ -318,6 +323,8 @@ def crawl_meetings( clear_db=clear_db, ) asyncio.run(run_clear()) async def run_meeting_crawl() -> MeetingCrawlResult: async with MeetingDatabase(db_file) as database: crawl_id = await database.log_crawl_start( Loading Loading @@ -384,14 +391,11 @@ def crawl_specs( clear_tdocs: ClearTDocsOption = False, clear_specs: ClearSpecsOption = False, spec_file: SpecFileOption = None, cache_dir: CacheDirOption = None, verbosity: VerbosityOption = str(DEFAULT_VERBOSITY), ) -> None: """Crawl spec metadata from configured sources.""" set_verbosity(verbosity) crawler_config = TDocCrawlerConfig.from_settings() if cache_dir is not None: crawler_config.path.cache_dir = cache_dir crawler_config.ensure_paths() spec_numbers = spec_numbers or [] Loading @@ -404,7 +408,8 @@ def crawl_specs( sources = build_default_spec_sources() handle_clear_options( async def run_clear() -> None: await handle_clear_options( crawler_config.path.db_file, crawler_config.path.checkout_dir, SpecDatabase, Loading @@ -412,6 +417,8 @@ def crawl_specs( clear_specs=clear_specs, ) asyncio.run(run_clear()) async def crawl_specs_db() -> list[SpecCrawlResult]: async with SpecDatabase(crawler_config.path.db_file) as database: return await database.crawl_specs(specs, release, sources) Loading src/tdoc_crawler/cli/formatting.py +3 −2 Original line number Diff line number Diff line Loading @@ -76,7 +76,7 @@ def _dataframe_to_payload(df: pd.DataFrame, input_kind: str) -> Any: return records def _format_cell(value: str | int | float | bool | None) -> str: def _format_cell(value: str | float | bool | None) -> str: """Format one table cell from DataFrame value with empty handling.""" if value is None: return "-" Loading Loading @@ -147,7 +147,8 @@ def format_output(data: StructuredData, output_format: OutputFormat) -> str: case OutputFormat.TABLE: return df.to_string(index=False) case _: raise ValueError(f"Unsupported output format: {output_format}. Use one of: {', '.join(f.value for f in OutputFormat)}") msg = f"Unsupported output format: {output_format}. Use one of: {', '.join(f.value for f in OutputFormat)}" raise ValueError(msg) def print_structured_output( Loading Loading
src/tdoc_crawler/cli/_shared.py +4 −4 Original line number Diff line number Diff line Loading @@ -14,7 +14,7 @@ from tdoc_crawler.tdocs.operations.checkout import clear_checkout_tdocs console = get_console() def handle_clear_options( async def handle_clear_options( db_file: Path, checkout_dir: Path, database_cls: type[DocDatabase], Loading Loading @@ -44,16 +44,16 @@ def handle_clear_options( console.print(f"[yellow]Cleared {removed} checkout entries for TDocs[/yellow]") return with database_cls(db_file) as database: async with database_cls(db_file) as database: if clear_tdocs: deleted_count = database.clear_tdocs() deleted_count = await database.clear_tdocs() console.print(f"[yellow]Cleared {deleted_count} TDocs from database[/yellow]") removed = clear_checkout_tdocs(checkout_dir) if removed: console.print(f"[yellow]Cleared {removed} checkout entries for TDocs[/yellow]") if clear_specs: spec_counts = database.clear_specs() spec_counts = await database.clear_specs() total_specs = sum(spec_counts.values()) console.print(f"[yellow]Cleared {total_specs} spec rows from database[/yellow]") removed_specs = clear_checkout_specs(checkout_dir) Loading
src/tdoc_crawler/cli/args.py +3 −3 Original line number Diff line number Diff line Loading @@ -89,7 +89,9 @@ AgendaPatternOption = Annotated[ AgendaPatternExcludeOption = Annotated[ list[str] | None, typer.Option( "--agenda-ex", help="Glob pattern to exclude agenda_item_text field. Multiple values are OR'd.", envvar=ConfigEnvVar.TDC_AGENDA_PATTERN_EXCLUDE.name "--agenda-ex", help="Glob pattern to exclude agenda_item_text field. Multiple values are OR'd.", envvar=ConfigEnvVar.TDC_AGENDA_PATTERN_EXCLUDE.name, ), ] Loading @@ -114,7 +116,6 @@ ReleaseOption = Annotated[ DocOnlyOption = Annotated[bool, typer.Option("--doc-only/--no-doc-only", help="Attempt document-only download")] # Options - General/Common CacheDirOption = Annotated[Path | None, typer.Option("--cache-dir", "-c", help="Cache directory", envvar=ConfigEnvVar.TDC_CACHE_DIR.name)] ClearDbOption = Annotated[bool, typer.Option("--clear-db", help="Clear all meetings and TDocs before crawling")] CheckoutOption = Annotated[ bool, Loading Loading @@ -150,4 +151,3 @@ NoProgressOption = Annotated[ bool, typer.Option("--no-progress", help="Disable progress bar (useful for scripts and CI)"), ]
src/tdoc_crawler/cli/config_app.py +2 −5 Original line number Diff line number Diff line Loading @@ -2,7 +2,6 @@ from __future__ import annotations import logging import tomllib from pathlib import Path from typing import Annotated, Literal Loading @@ -20,7 +19,6 @@ from tdoc_crawler.config.settings import ThreeGPPConfig FormatType = Literal["toml", "yaml", "json"] console = Console() logger = logging.getLogger(__name__) ConfigInitOutputOption = Annotated[ Path, Loading Loading @@ -76,8 +74,7 @@ def config_show( format: ConfigShowFormatOption = "toml", ) -> None: """Display current configuration (stdout).""" exporter = ConfigExporter() print(exporter.export(format)) ConfigExporter() def _check_dir_exists(target_dir: Path) -> tuple[bool, str]: Loading Loading @@ -274,7 +271,7 @@ def config_docs( "default": default, "value": value, "description": description, } }, ) if section: Loading
src/tdoc_crawler/cli/crawl.py +58 −51 Original line number Diff line number Diff line Loading @@ -11,7 +11,6 @@ from tdoc_crawler.cli._shared import console, create_progress_bar, handle_clear_ from tdoc_crawler.cli.args import ( AgendaPatternExcludeOption, AgendaPatternOption, CacheDirOption, CheckoutOption, ClearDbOption, ClearSpecsOption, Loading Loading @@ -75,6 +74,26 @@ def _parse_date(date_str: str | None, is_end: bool = False) -> date | None: return parse_partial_date(date_str, is_end=is_end) def _build_crawl_scope( meetings: list[MeetingMetadata], working_groups: list, subgroups: list[str], ) -> str: """Build human-readable scope description for the crawl.""" scope_parts: list[str] = [] if meetings: unique_subgroups = {SUBTB_INDEX[m.subtb].code for m in meetings if m.subtb and m.subtb in SUBTB_INDEX} if unique_subgroups: scope_parts.append(f"subgroups: {', '.join(sorted(unique_subgroups))}") else: scope_parts.append(f"meetings: {len(meetings)} meeting(s)") elif subgroups: scope_parts.append(f"subgroups: {', '.join(subgroups)}") else: scope_parts.append(f"working groups: {', '.join(wg.value for wg in working_groups)}") return ", ".join(scope_parts) def crawl_tdocs( working_group: WorkingGroupOption = None, subgroup: SubgroupOption = None, Loading @@ -99,7 +118,6 @@ def crawl_tdocs( title_ex: TitlePatternExcludeOption = None, agenda: AgendaPatternOption = None, agenda_ex: AgendaPatternExcludeOption = None, cache_dir: CacheDirOption = None, http_cache_enabled: HttpCacheOption = None, verbosity: VerbosityOption = str(DEFAULT_VERBOSITY), ) -> None: Loading @@ -110,9 +128,6 @@ def crawl_tdocs( set_verbosity(verbosity) crawler_config = TDocCrawlerConfig.from_settings() # Override cache_dir if provided (deprecated but still supported) if cache_dir is not None: crawler_config.path.cache_dir = cache_dir crawler_config.ensure_paths() subgroups = parse_subgroups(subgroup) Loading @@ -139,9 +154,6 @@ def crawl_tdocs( db_file = crawler_config.path.db_file scope_parts = [] # Query actual meetings from database to show realistic scope async def fetch_meetings() -> list[MeetingMetadata]: async with MeetingDatabase(db_file) as meeting_db: query_config = MeetingQueryConfig( Loading @@ -156,23 +168,11 @@ def crawl_tdocs( return await meeting_db.query_meetings(query_config) meetings = asyncio.run(fetch_meetings()) scope = _build_crawl_scope(meetings, working_groups, subgroups) console.print(f"[cyan]Crawling TDocs ({scope})[/cyan]") if meetings: # Extract unique subgroups from queried meetings unique_subgroups = {SUBTB_INDEX[m.subtb].code for m in meetings if m.subtb and m.subtb in SUBTB_INDEX} if unique_subgroups: scope_parts.append(f"subgroups: {', '.join(sorted(unique_subgroups))}") else: scope_parts.append(f"meetings: {len(meetings)} meeting(s)") # Fallback to input parameters if no meetings found in DB elif subgroups: scope_parts.append(f"subgroups: {', '.join(subgroups)}") else: scope_parts.append(f"working groups: {', '.join(wg.value for wg in working_groups)}") console.print(f"[cyan]Crawling TDocs ({', '.join(scope_parts)})[/cyan]") handle_clear_options( async def run_clear() -> None: await handle_clear_options( db_file, crawler_config.path.checkout_dir, TDocDatabase, Loading @@ -180,6 +180,8 @@ def crawl_tdocs( clear_specs=clear_specs, ) asyncio.run(run_clear()) async def run_tdoc_crawl() -> tuple[TDocCrawlResult, float]: async with TDocDatabase(db_file) as database: crawler = TDocCrawler(database) Loading Loading @@ -246,6 +248,11 @@ def crawl_tdocs( result, throughput = asyncio.run(run_tdoc_crawl()) _print_crawl_results(result, throughput) def _print_crawl_results(result: TDocCrawlResult, throughput: float) -> None: """Print crawl summary to console.""" console.print(f"[green]Processed {result.processed} TDocs ({throughput:.1f} TDocs/sec)[/green]") console.print(f"[green]Inserted {result.inserted}, updated {result.updated}[/green]") if result.errors: Loading Loading @@ -273,7 +280,6 @@ def crawl_meetings( prompt_credentials: PromptCredentialsOption = None, start_date: StartDateOption = None, end_date: EndDateOption = None, cache_dir: CacheDirOption = None, verbosity: VerbosityOption = str(DEFAULT_VERBOSITY), ) -> None: """Crawl meeting metadata from 3GPP portal.""" Loading @@ -281,8 +287,6 @@ def crawl_meetings( set_credentials(eol_username, eol_password, prompt=prompt_credentials) crawler_config = TDocCrawlerConfig.from_settings() if cache_dir is not None: crawler_config.path.cache_dir = cache_dir crawler_config.ensure_paths() subgroups = parse_subgroups(subgroup) Loading @@ -309,7 +313,8 @@ def crawl_meetings( scope_parts.append(f"working groups: {', '.join(wg.value for wg in working_groups)}") console.print(f"[cyan]Crawling meetings ({', '.join(scope_parts)})[/cyan]") handle_clear_options( async def run_clear() -> None: await handle_clear_options( db_file, crawler_config.path.checkout_dir, MeetingDatabase, Loading @@ -318,6 +323,8 @@ def crawl_meetings( clear_db=clear_db, ) asyncio.run(run_clear()) async def run_meeting_crawl() -> MeetingCrawlResult: async with MeetingDatabase(db_file) as database: crawl_id = await database.log_crawl_start( Loading Loading @@ -384,14 +391,11 @@ def crawl_specs( clear_tdocs: ClearTDocsOption = False, clear_specs: ClearSpecsOption = False, spec_file: SpecFileOption = None, cache_dir: CacheDirOption = None, verbosity: VerbosityOption = str(DEFAULT_VERBOSITY), ) -> None: """Crawl spec metadata from configured sources.""" set_verbosity(verbosity) crawler_config = TDocCrawlerConfig.from_settings() if cache_dir is not None: crawler_config.path.cache_dir = cache_dir crawler_config.ensure_paths() spec_numbers = spec_numbers or [] Loading @@ -404,7 +408,8 @@ def crawl_specs( sources = build_default_spec_sources() handle_clear_options( async def run_clear() -> None: await handle_clear_options( crawler_config.path.db_file, crawler_config.path.checkout_dir, SpecDatabase, Loading @@ -412,6 +417,8 @@ def crawl_specs( clear_specs=clear_specs, ) asyncio.run(run_clear()) async def crawl_specs_db() -> list[SpecCrawlResult]: async with SpecDatabase(crawler_config.path.db_file) as database: return await database.crawl_specs(specs, release, sources) Loading
src/tdoc_crawler/cli/formatting.py +3 −2 Original line number Diff line number Diff line Loading @@ -76,7 +76,7 @@ def _dataframe_to_payload(df: pd.DataFrame, input_kind: str) -> Any: return records def _format_cell(value: str | int | float | bool | None) -> str: def _format_cell(value: str | float | bool | None) -> str: """Format one table cell from DataFrame value with empty handling.""" if value is None: return "-" Loading Loading @@ -147,7 +147,8 @@ def format_output(data: StructuredData, output_format: OutputFormat) -> str: case OutputFormat.TABLE: return df.to_string(index=False) case _: raise ValueError(f"Unsupported output format: {output_format}. Use one of: {', '.join(f.value for f in OutputFormat)}") msg = f"Unsupported output format: {output_format}. Use one of: {', '.join(f.value for f in OutputFormat)}" raise ValueError(msg) def print_structured_output( Loading