Loading src/tdoc_crawler/cli/AGENTS.md +1 −35 Original line number Diff line number Diff line Loading @@ -37,43 +37,9 @@ Assume `cli/` could be separated as an optional package. If a function would be | `app.py` | Typer command definitions and CLI entry points | | `args.py` | Typer Annotated types for arguments and options | | `console.py` | Rich Console singleton for CLI output | | `helpers.py` | Helper functions - check classification rules above | | `fetching.py` | TDoc fetching - check classification rules below | | `utils.py` | Helper functions - check classification rules above | | `printing.py` | Table and output formatting for CLI | ## Function Classification ### `helpers.py` **CLI Functions (stay in cli/):** - `parse_working_groups()` - CLI argument parsing - `parse_subgroups()` - CLI argument parsing - `collect_spec_numbers()` - CLI stdin/file input handling - `build_limits()` - CLI config builder wrapper - `launch_file()` - System calls for opening files - `resolve_http_cache_config()` - CLI/env var configuration parsing - `infer_working_groups_from_ids()` - CLI string inference **Library Functions (moved to core):** - `normalize_portal_meeting_name()` → `tdoc_crawler.specs.normalization` - `resolve_meeting_id()` → `tdoc_crawler.database` - `download_to_path()` → `tdoc_crawler.http_client` - `prepare_tdoc_file()` → `tdoc_crawler.checkout` - `database_path()` → `tdoc_crawler.database` ### `fetching.py` **CLI Functions (stay in cli/):** - `fetch_missing_tdocs()` - Uses CLI console output - `_fetch_via_whatthespec()` - Uses CLI console output - `maybe_fetch_missing_tdocs()` - CLI console and flag handling **Library Functions:** - `fetch_tdoc()` - Import from `tdoc_crawler.fetching` (not duplicated in CLI) ## Lessons Learned Loading src/tdoc_crawler/cli/app.py +29 −30 Original line number Diff line number Diff line Loading @@ -26,6 +26,7 @@ from tdoc_crawler.checkout import ( from tdoc_crawler.cli.args import ( DEFAULT_VERBOSITY, CacheDirOption, 
CheckoutDirOption, CheckoutOption, CheckoutTDocIdsArgument, ClearDbOption, Loading Loading @@ -64,7 +65,6 @@ from tdoc_crawler.cli.args import ( WorkingGroupOption, ) from tdoc_crawler.cli.console import get_console from tdoc_crawler.cli.helpers import build_limits, collect_spec_numbers, launch_file, parse_subgroups, parse_working_groups from tdoc_crawler.cli.printing import ( meeting_to_dict, print_checkout_results, Loading @@ -76,17 +76,17 @@ from tdoc_crawler.cli.printing import ( spec_query_to_dict, tdoc_to_dict, ) from tdoc_crawler.cli.utils import launch_file from tdoc_crawler.config import CacheManager from tdoc_crawler.crawlers import MeetingCrawler, TDocCrawler from tdoc_crawler.credentials import resolve_credentials, set_credentials from tdoc_crawler.database import SpecDatabase, TDocDatabase from tdoc_crawler.database import TDocDatabase from tdoc_crawler.fetching import fetch_missing_tdocs from tdoc_crawler.http_client import create_cached_session from tdoc_crawler.logging import set_verbosity from tdoc_crawler.models import MeetingCrawlConfig, MeetingQueryConfig, OutputFormat, QueryConfig, SortOrder, TDocCrawlConfig from tdoc_crawler.models.specs import SpecQueryFilters from tdoc_crawler.specs import SpecCatalog from tdoc_crawler.specs.downloads import SpecDownloads from tdoc_crawler.models import CrawlLimits, MeetingCrawlConfig, MeetingQueryConfig, OutputFormat, QueryConfig, SortOrder, SpecQueryFilters, TDocCrawlConfig from tdoc_crawler.specs import SpecDatabase, SpecDownloads from tdoc_crawler.utils.parse import collect_spec_numbers, parse_subgroups, parse_working_groups load_dotenv() Loading Loading @@ -126,7 +126,8 @@ def crawl_tdocs( subgroups = parse_subgroups(subgroup) working_groups = parse_working_groups(working_group, subgroups) limits = build_limits(limit_tdocs, limit_meetings, limit_meetings_per_wg, limit_wgs) limits = CrawlLimits.build(limit_tdocs, limit_meetings, limit_meetings_per_wg, limit_wgs) config = TDocCrawlConfig( 
cache_dir=manager.root, working_groups=working_groups, Loading Loading @@ -158,7 +159,7 @@ def crawl_tdocs( scope_parts.append(f"working groups: {', '.join(wg.value for wg in working_groups)}") console.print(f"[cyan]Crawling TDocs ({', '.join(scope_parts)})[/cyan]") with SpecDatabase(db_file) as database: with TDocDatabase(db_file) as database: checkout_dir = manager.checkout_dir # Clear TDocs if requested if clear_tdocs: Loading Loading @@ -275,7 +276,7 @@ def crawl_meetings( subgroups = parse_subgroups(subgroup) working_groups = parse_working_groups(working_group, subgroups) limits = build_limits(None, limit_meetings, limit_meetings_per_wg, limit_wgs) limits = CrawlLimits.build(None, limit_meetings, limit_meetings_per_wg, limit_wgs) set_credentials(eol_username, eol_password, prompt_credentials) config = MeetingCrawlConfig( cache_dir=manager.root, Loading @@ -297,7 +298,7 @@ def crawl_meetings( scope_parts.append(f"working groups: {', '.join(wg.value for wg in working_groups)}") console.print(f"[cyan]Crawling meetings ({', '.join(scope_parts)})[/cyan]") with TDocDatabase(db_file) as database: with SpecDatabase(db_file) as database: checkout_dir = manager.checkout_dir # Clear all data if requested if clear_db: Loading Loading @@ -369,7 +370,7 @@ def crawl_meetings( order=SortOrder.DESC, include_without_files=False, ) with TDocDatabase(db_file) as database: with SpecDatabase(db_file) as database: meetings = database.query_meetings(query_config) with create_cached_session(manager.http_cache_dir) as session: Loading Loading @@ -515,7 +516,7 @@ def query_meetings( ) db_file = manager.db_file with TDocDatabase(db_file) as database: with SpecDatabase(db_file) as database: checkout_dir = manager.checkout_dir if clear_tdocs: deleted_count = database.clear_tdocs() Loading Loading @@ -589,7 +590,7 @@ def query_specs( raise typer.Exit(code=2) from exc db_file = manager.db_file with TDocDatabase(db_file) as database: with SpecDatabase(db_file) as database: checkout_dir = 
manager.checkout_dir if clear_tdocs: deleted_count = database.clear_tdocs() Loading @@ -605,8 +606,7 @@ def query_specs( if removed_specs: console.print("[yellow]Cleared checkout entries for specs[/yellow]") catalog = SpecCatalog(database) results = catalog.query_specs(filters, release="latest") results = database.query_specs(filters) if not results: console.print("[yellow]No specs found[/yellow]") Loading @@ -614,7 +614,7 @@ def query_specs( if checkout: spec_list = [result.spec_number for result in results] with TDocDatabase(db_file) as database: with SpecDatabase(db_file) as database: checkout_specs(spec_list, manager.checkout_dir, database, release="latest") if output is OutputFormat.JSON: Loading Loading @@ -670,13 +670,13 @@ def open_tdoc( metadata = results[0] try: target_file = prepare_tdoc_file(metadata, manager.root, session=session) tdoc_file = prepare_tdoc_file(metadata, manager.root, session=session) except (FileNotFoundError, OSError, ValueError, zipfile.BadZipFile) as exc: console.print(f"[red]Failed to prepare TDoc {normalized_id}: {exc}") raise typer.Exit(code=1) from exc console.print(f"[green]Opening {target_file}") launch_file(target_file) console.print(f"[green]Opening {tdoc_file}") launch_file(tdoc_file) @app.command("checkout", rich_help_panel=HELP_PANEL_MAIN) Loading Loading @@ -763,7 +763,7 @@ def stats( console.print(f"[red]Database not found: {db_file}[/red]") raise typer.Exit(code=1) with TDocDatabase(db_file) as database: with SpecDatabase(db_file) as database: stats_dict = cast(dict[str, Any], database.get_statistics()) table = Table(title="TDoc database statistics") Loading Loading @@ -811,7 +811,7 @@ def crawl_specs( sources = build_default_spec_sources() db_file = manager.db_file with TDocDatabase(db_file) as database: with SpecDatabase(db_file) as database: checkout_dir = manager.checkout_dir if clear_tdocs: deleted_count = database.clear_tdocs() Loading @@ -827,15 +827,14 @@ def crawl_specs( if removed_specs: 
console.print("[yellow]Cleared checkout entries for specs[/yellow]") catalog = SpecCatalog(database) results = catalog.crawl_specs(specs, release, sources) results = database.crawl_specs(specs, release, sources) if not results: console.print("[yellow]No specs crawled[/yellow]") return if checkout: with TDocDatabase(db_file) as database: with SpecDatabase(db_file) as database: checkout_specs( [result.spec_number for result in results], manager.checkout_dir, Loading @@ -857,7 +856,7 @@ def checkout_spec( spec_file: SpecFileOption = None, release: ReleaseOption = "latest", doc_only: DocOnlyOption = False, # checkout_dir: CheckoutDirOption = None, checkout_dir: CheckoutDirOption = None, cache_dir: CacheDirOption = None, verbosity: VerbosityOption = DEFAULT_VERBOSITY, ) -> None: Loading @@ -876,7 +875,7 @@ def checkout_spec( sources = build_default_spec_sources() db_file = manager.db_file with TDocDatabase(db_file) as database: with SpecDatabase(db_file) as database: downloader = SpecDownloads(database) results = downloader.checkout_specs(specs, doc_only, effective_checkout_dir, release, sources=sources) Loading @@ -901,12 +900,12 @@ def open_spec( sources = build_default_spec_sources() db_file = manager.db_file with TDocDatabase(db_file) as database: with SpecDatabase(db_file) as database: downloader = SpecDownloads(database) try: path = downloader.open_spec(normalized, doc_only, checkout_dir, release, sources=sources) console.print(f"[green]Opening {path}[/green]") launch_file(path) spec_file = downloader.open_spec(normalized, doc_only, checkout_dir, release, sources=sources) console.print(f"[green]Opening {spec_file}[/green]") launch_file(spec_file) except Exception as exc: console.print(f"[red]Failed to open spec: {exc}[/red]") raise typer.Exit(code=1) Loading src/tdoc_crawler/cli/args.py +1 −0 Original line number Diff line number Diff line Loading @@ -78,6 +78,7 @@ ClearDbOption = Annotated[bool, typer.Option("--clear-db", help="Clear all meeti CheckoutOption = 
Annotated[ bool, typer.Option("--checkout/--no-checkout", help="Download and extract metadata results to checkout folder", envvar="TDC_CHECKOUT") ] CheckoutDirOption = Annotated[Path | None, typer.Option("--checkout-dir", help="Directory for checkout files", envvar="TDC_CHECKOUT_DIR")] WorkersOption = Annotated[int, typer.Option("--workers", help="Number of parallel subinterpreter workers", envvar="TDC_WORKERS")] MaxRetriesOption = Annotated[int, typer.Option("--max-retries", help="HTTP retry attempts", envvar="TDC_MAX_RETRIES")] TimeoutOption = Annotated[int, typer.Option("--timeout", help="HTTP timeout seconds", envvar="TDC_TIMEOUT")] Loading src/tdoc_crawler/cli/helpers.pydeleted 100644 → 0 +0 −223 Original line number Diff line number Diff line """Helper functions for CLI operations.""" from __future__ import annotations import os import subprocess import sys from collections.abc import Iterable from pathlib import Path import click import typer from tdoc_crawler.cli.console import get_console from tdoc_crawler.crawlers import normalize_subgroup_alias, normalize_working_group_alias from tdoc_crawler.database import TDocDatabase from tdoc_crawler.logging import get_logger from tdoc_crawler.models import CrawlLimits, HttpCacheConfig, MeetingQueryConfig, SortOrder, WorkingGroup from tdoc_crawler.specs.normalization import expand_spec_ranges_batch, normalize_portal_meeting_name console = get_console() _logger = get_logger(__name__) DEFAULT_DATABASE_FILENAME = "tdoc_crawler.db" DOWNLOAD_TIMEOUT = 60 ALLOWED_DOWNLOAD_SCHEMES = ("ftp://", "http://", "https://") def infer_working_groups_from_subgroups(subgroups: list[str]) -> list[WorkingGroup]: """Infer working groups from subgroup codes. 
Args: subgroups: List of subgroup codes (e.g., ["S4", "R1"]) Returns: List of inferred working groups without duplicates """ working_groups: list[WorkingGroup] = [] for subgroup in subgroups: # Extract first character to determine working group if subgroup and len(subgroup) >= 1: first_char = subgroup[0].upper() if first_char == "R": wg = WorkingGroup.RAN elif first_char == "S": wg = WorkingGroup.SA elif first_char == "C": wg = WorkingGroup.CT else: continue if wg not in working_groups: working_groups.append(wg) return working_groups if working_groups else [WorkingGroup.RAN, WorkingGroup.SA, WorkingGroup.CT] def parse_working_groups(values: list[str] | None, subgroups: list[str] | None = None) -> list[WorkingGroup]: """Parse and normalize working group names, expanding plenary aliases. Args: values: Explicit working group values from CLI subgroups: Optional subgroup list to infer working groups from Returns: List of working groups to crawl """ if not values: # If subgroups are specified but no explicit working groups, infer from subgroups if subgroups: return infer_working_groups_from_subgroups(subgroups) # Otherwise default to all working groups return [WorkingGroup.RAN, WorkingGroup.SA, WorkingGroup.CT] resolved: list[WorkingGroup] = [] for item in values: # Try alias normalization first (RP->RAN, SP->SA, CP->CT) normalized = normalize_working_group_alias(item) try: resolved.append(WorkingGroup(normalized.upper())) except ValueError as exc: console.print(f"[red]Unknown working group: {item}") raise typer.Exit(code=2) from exc if not resolved: console.print("[red]No valid working groups specified") raise typer.Exit(code=2) return resolved def parse_subgroups(values: list[str] | None) -> list[str] | None: """Parse and normalize subgroup aliases to canonical names.""" if not values: return None resolved: list[str] = [] for item in values: normalized = normalize_subgroup_alias(item) if not normalized: console.print(f"[red]Unknown subgroup: {item}") raise 
typer.Exit(code=2) resolved.extend(normalized) return resolved def collect_spec_numbers(specs: list[str] | None, spec_file: Path | None) -> list[str]: """Collect spec numbers from CLI arguments or a file.""" collected: list[str] = [] if specs: for spec in specs: if spec == "-": # Read from stdin for line in sys.stdin: line_stripped = line.strip() if line_stripped: collected.append(line_stripped) else: collected.append(spec.strip()) if spec_file and spec_file.exists(): try: with spec_file.open("r", encoding="utf-8") as f: for line in f: line_stripped = line.strip() if line_stripped: collected.append(line_stripped) except OSError as exc: raise click.FileError(str(spec_file), hint=f"Cannot read spec file: {exc}") if not collected: return [] try: expanded = expand_spec_ranges_batch(collected) except ValueError as e: raise click.UsageError(str(e)) return expanded def build_limits( limit_tdocs: int | None, limit_meetings: int | None, limit_meetings_per_wg: int | None, limit_wgs: int | None, ) -> CrawlLimits: """Build CrawlLimits configuration from individual parameters.""" return CrawlLimits( limit_tdocs=limit_tdocs, limit_meetings=limit_meetings, limit_meetings_per_wg=limit_meetings_per_wg, limit_wgs=limit_wgs, ) def infer_working_groups_from_ids(ids: Iterable[str]) -> list[WorkingGroup]: """Infer working groups from TDoc IDs based on first character.""" mapping = { "R": WorkingGroup.RAN, "S": WorkingGroup.SA, "T": WorkingGroup.CT, "C": WorkingGroup.CT, } resolved: list[WorkingGroup] = [] for value in ids: if not value: continue group = mapping.get(value[0].upper()) if group and group not in resolved: resolved.append(group) return resolved or [WorkingGroup.RAN, WorkingGroup.SA, WorkingGroup.CT] def launch_file(path: Path) -> None: """Launch file in system's default application.""" if not path.exists(): console.print(f"[red]File not found: {path}") raise typer.Exit(code=1) try: if sys.platform.startswith("win"): os.startfile(path) # noqa: S606 elif sys.platform == 
"darwin": open_cmd = Path("/usr/bin/open") if open_cmd.exists(): subprocess.run([str(open_cmd), str(path)], check=False) # noqa: S603 else: console.print("[yellow]/usr/bin/open not available[/yellow]") else: xdg_cmd = Path("/usr/bin/xdg-open") if xdg_cmd.exists(): subprocess.run([str(xdg_cmd), str(path)], check=False) # noqa: S603 else: console.print("[yellow]xdg-open command not available[/yellow]") except OSError as exc: console.print(f"[red]Failed to open file: {exc}") raise typer.Exit(code=1) from exc def resolve_http_cache_config(cache_ttl: int | None = None, cache_refresh_on_access: bool | None = None) -> HttpCacheConfig: """Resolve HTTP cache configuration from CLI parameters and environment variables. Args: cache_ttl: TTL for cache entries (CLI parameter) cache_refresh_on_access: Whether to refresh TTL on access (CLI parameter) Returns: HttpCacheConfig instance with resolved values """ # Check CLI parameters first, then environment variables, then defaults if cache_ttl is not None: ttl = cache_ttl else: env_ttl = os.getenv("HTTP_CACHE_TTL") ttl = int(env_ttl) if env_ttl else 7200 # Handle refresh on access - check CLI param, then env var, then default if cache_refresh_on_access is not None: refresh_on_access = cache_refresh_on_access else: env_refresh = os.getenv("HTTP_CACHE_REFRESH_ON_ACCESS", "").lower() refresh_on_access = env_refresh in ("true", "1", "yes", "on", "t", "y") if env_refresh else True return HttpCacheConfig(ttl=ttl, refresh_ttl_on_access=refresh_on_access) src/tdoc_crawler/cli/utils.py 0 → 100644 +40 −0 Original line number Diff line number Diff line """CLI utilities.""" from __future__ import annotations import os import subprocess import sys from pathlib import Path import typer from tdoc_crawler.cli.console import get_console console = get_console() def launch_file(path: Path) -> None: """Launch file in system's default application.""" if not path.exists(): console.print(f"[red]File not found: {path}") raise typer.Exit(code=1) try: if 
sys.platform.startswith("win"): os.startfile(path) # noqa: S606 elif sys.platform == "darwin": open_cmd = Path("/usr/bin/open") if open_cmd.exists(): subprocess.run([str(open_cmd), str(path)], check=False) # noqa: S603 else: console.print("[yellow]/usr/bin/open not available[/yellow]") else: # Linux and other Unix-like systems xdg_cmd = Path("/usr/bin/xdg-open") if xdg_cmd.exists(): subprocess.run([str(xdg_cmd), str(path)], check=False) # noqa: S603 else: console.print("[yellow]xdg-open command not available[/yellow]") except OSError as exc: console.print(f"[red]Failed to open file: {exc}") raise typer.Exit(code=1) from exc Loading
src/tdoc_crawler/cli/AGENTS.md +1 −35 Original line number Diff line number Diff line Loading @@ -37,43 +37,9 @@ Assume `cli/` could be separated as an optional package. If a function would be | `app.py` | Typer command definitions and CLI entry points | | `args.py` | Typer Annotated types for arguments and options | | `console.py` | Rich Console singleton for CLI output | | `helpers.py` | Helper functions - check classification rules above | | `fetching.py` | TDoc fetching - check classification rules below | | `utils.py` | Helper functions - check classification rules above | | `printing.py` | Table and output formatting for CLI | ## Function Classification ### `helpers.py` **CLI Functions (stay in cli/):** - `parse_working_groups()` - CLI argument parsing - `parse_subgroups()` - CLI argument parsing - `collect_spec_numbers()` - CLI stdin/file input handling - `build_limits()` - CLI config builder wrapper - `launch_file()` - System calls for opening files - `resolve_http_cache_config()` - CLI/env var configuration parsing - `infer_working_groups_from_ids()` - CLI string inference **Library Functions (moved to core):** - `normalize_portal_meeting_name()` → `tdoc_crawler.specs.normalization` - `resolve_meeting_id()` → `tdoc_crawler.database` - `download_to_path()` → `tdoc_crawler.http_client` - `prepare_tdoc_file()` → `tdoc_crawler.checkout` - `database_path()` → `tdoc_crawler.database` ### `fetching.py` **CLI Functions (stay in cli/):** - `fetch_missing_tdocs()` - Uses CLI console output - `_fetch_via_whatthespec()` - Uses CLI console output - `maybe_fetch_missing_tdocs()` - CLI console and flag handling **Library Functions:** - `fetch_tdoc()` - Import from `tdoc_crawler.fetching` (not duplicated in CLI) ## Lessons Learned Loading
src/tdoc_crawler/cli/app.py +29 −30 Original line number Diff line number Diff line Loading @@ -26,6 +26,7 @@ from tdoc_crawler.checkout import ( from tdoc_crawler.cli.args import ( DEFAULT_VERBOSITY, CacheDirOption, CheckoutDirOption, CheckoutOption, CheckoutTDocIdsArgument, ClearDbOption, Loading Loading @@ -64,7 +65,6 @@ from tdoc_crawler.cli.args import ( WorkingGroupOption, ) from tdoc_crawler.cli.console import get_console from tdoc_crawler.cli.helpers import build_limits, collect_spec_numbers, launch_file, parse_subgroups, parse_working_groups from tdoc_crawler.cli.printing import ( meeting_to_dict, print_checkout_results, Loading @@ -76,17 +76,17 @@ from tdoc_crawler.cli.printing import ( spec_query_to_dict, tdoc_to_dict, ) from tdoc_crawler.cli.utils import launch_file from tdoc_crawler.config import CacheManager from tdoc_crawler.crawlers import MeetingCrawler, TDocCrawler from tdoc_crawler.credentials import resolve_credentials, set_credentials from tdoc_crawler.database import SpecDatabase, TDocDatabase from tdoc_crawler.database import TDocDatabase from tdoc_crawler.fetching import fetch_missing_tdocs from tdoc_crawler.http_client import create_cached_session from tdoc_crawler.logging import set_verbosity from tdoc_crawler.models import MeetingCrawlConfig, MeetingQueryConfig, OutputFormat, QueryConfig, SortOrder, TDocCrawlConfig from tdoc_crawler.models.specs import SpecQueryFilters from tdoc_crawler.specs import SpecCatalog from tdoc_crawler.specs.downloads import SpecDownloads from tdoc_crawler.models import CrawlLimits, MeetingCrawlConfig, MeetingQueryConfig, OutputFormat, QueryConfig, SortOrder, SpecQueryFilters, TDocCrawlConfig from tdoc_crawler.specs import SpecDatabase, SpecDownloads from tdoc_crawler.utils.parse import collect_spec_numbers, parse_subgroups, parse_working_groups load_dotenv() Loading Loading @@ -126,7 +126,8 @@ def crawl_tdocs( subgroups = parse_subgroups(subgroup) working_groups = parse_working_groups(working_group, subgroups) 
limits = build_limits(limit_tdocs, limit_meetings, limit_meetings_per_wg, limit_wgs) limits = CrawlLimits.build(limit_tdocs, limit_meetings, limit_meetings_per_wg, limit_wgs) config = TDocCrawlConfig( cache_dir=manager.root, working_groups=working_groups, Loading Loading @@ -158,7 +159,7 @@ def crawl_tdocs( scope_parts.append(f"working groups: {', '.join(wg.value for wg in working_groups)}") console.print(f"[cyan]Crawling TDocs ({', '.join(scope_parts)})[/cyan]") with SpecDatabase(db_file) as database: with TDocDatabase(db_file) as database: checkout_dir = manager.checkout_dir # Clear TDocs if requested if clear_tdocs: Loading Loading @@ -275,7 +276,7 @@ def crawl_meetings( subgroups = parse_subgroups(subgroup) working_groups = parse_working_groups(working_group, subgroups) limits = build_limits(None, limit_meetings, limit_meetings_per_wg, limit_wgs) limits = CrawlLimits.build(None, limit_meetings, limit_meetings_per_wg, limit_wgs) set_credentials(eol_username, eol_password, prompt_credentials) config = MeetingCrawlConfig( cache_dir=manager.root, Loading @@ -297,7 +298,7 @@ def crawl_meetings( scope_parts.append(f"working groups: {', '.join(wg.value for wg in working_groups)}") console.print(f"[cyan]Crawling meetings ({', '.join(scope_parts)})[/cyan]") with TDocDatabase(db_file) as database: with SpecDatabase(db_file) as database: checkout_dir = manager.checkout_dir # Clear all data if requested if clear_db: Loading Loading @@ -369,7 +370,7 @@ def crawl_meetings( order=SortOrder.DESC, include_without_files=False, ) with TDocDatabase(db_file) as database: with SpecDatabase(db_file) as database: meetings = database.query_meetings(query_config) with create_cached_session(manager.http_cache_dir) as session: Loading Loading @@ -515,7 +516,7 @@ def query_meetings( ) db_file = manager.db_file with TDocDatabase(db_file) as database: with SpecDatabase(db_file) as database: checkout_dir = manager.checkout_dir if clear_tdocs: deleted_count = database.clear_tdocs() Loading 
Loading @@ -589,7 +590,7 @@ def query_specs( raise typer.Exit(code=2) from exc db_file = manager.db_file with TDocDatabase(db_file) as database: with SpecDatabase(db_file) as database: checkout_dir = manager.checkout_dir if clear_tdocs: deleted_count = database.clear_tdocs() Loading @@ -605,8 +606,7 @@ def query_specs( if removed_specs: console.print("[yellow]Cleared checkout entries for specs[/yellow]") catalog = SpecCatalog(database) results = catalog.query_specs(filters, release="latest") results = database.query_specs(filters) if not results: console.print("[yellow]No specs found[/yellow]") Loading @@ -614,7 +614,7 @@ def query_specs( if checkout: spec_list = [result.spec_number for result in results] with TDocDatabase(db_file) as database: with SpecDatabase(db_file) as database: checkout_specs(spec_list, manager.checkout_dir, database, release="latest") if output is OutputFormat.JSON: Loading Loading @@ -670,13 +670,13 @@ def open_tdoc( metadata = results[0] try: target_file = prepare_tdoc_file(metadata, manager.root, session=session) tdoc_file = prepare_tdoc_file(metadata, manager.root, session=session) except (FileNotFoundError, OSError, ValueError, zipfile.BadZipFile) as exc: console.print(f"[red]Failed to prepare TDoc {normalized_id}: {exc}") raise typer.Exit(code=1) from exc console.print(f"[green]Opening {target_file}") launch_file(target_file) console.print(f"[green]Opening {tdoc_file}") launch_file(tdoc_file) @app.command("checkout", rich_help_panel=HELP_PANEL_MAIN) Loading Loading @@ -763,7 +763,7 @@ def stats( console.print(f"[red]Database not found: {db_file}[/red]") raise typer.Exit(code=1) with TDocDatabase(db_file) as database: with SpecDatabase(db_file) as database: stats_dict = cast(dict[str, Any], database.get_statistics()) table = Table(title="TDoc database statistics") Loading Loading @@ -811,7 +811,7 @@ def crawl_specs( sources = build_default_spec_sources() db_file = manager.db_file with TDocDatabase(db_file) as database: with 
SpecDatabase(db_file) as database: checkout_dir = manager.checkout_dir if clear_tdocs: deleted_count = database.clear_tdocs() Loading @@ -827,15 +827,14 @@ def crawl_specs( if removed_specs: console.print("[yellow]Cleared checkout entries for specs[/yellow]") catalog = SpecCatalog(database) results = catalog.crawl_specs(specs, release, sources) results = database.crawl_specs(specs, release, sources) if not results: console.print("[yellow]No specs crawled[/yellow]") return if checkout: with TDocDatabase(db_file) as database: with SpecDatabase(db_file) as database: checkout_specs( [result.spec_number for result in results], manager.checkout_dir, Loading @@ -857,7 +856,7 @@ def checkout_spec( spec_file: SpecFileOption = None, release: ReleaseOption = "latest", doc_only: DocOnlyOption = False, # checkout_dir: CheckoutDirOption = None, checkout_dir: CheckoutDirOption = None, cache_dir: CacheDirOption = None, verbosity: VerbosityOption = DEFAULT_VERBOSITY, ) -> None: Loading @@ -876,7 +875,7 @@ def checkout_spec( sources = build_default_spec_sources() db_file = manager.db_file with TDocDatabase(db_file) as database: with SpecDatabase(db_file) as database: downloader = SpecDownloads(database) results = downloader.checkout_specs(specs, doc_only, effective_checkout_dir, release, sources=sources) Loading @@ -901,12 +900,12 @@ def open_spec( sources = build_default_spec_sources() db_file = manager.db_file with TDocDatabase(db_file) as database: with SpecDatabase(db_file) as database: downloader = SpecDownloads(database) try: path = downloader.open_spec(normalized, doc_only, checkout_dir, release, sources=sources) console.print(f"[green]Opening {path}[/green]") launch_file(path) spec_file = downloader.open_spec(normalized, doc_only, checkout_dir, release, sources=sources) console.print(f"[green]Opening {spec_file}[/green]") launch_file(spec_file) except Exception as exc: console.print(f"[red]Failed to open spec: {exc}[/red]") raise typer.Exit(code=1) Loading
src/tdoc_crawler/cli/args.py +1 −0 Original line number Diff line number Diff line Loading @@ -78,6 +78,7 @@ ClearDbOption = Annotated[bool, typer.Option("--clear-db", help="Clear all meeti CheckoutOption = Annotated[ bool, typer.Option("--checkout/--no-checkout", help="Download and extract metadata results to checkout folder", envvar="TDC_CHECKOUT") ] CheckoutDirOption = Annotated[Path | None, typer.Option("--checkout-dir", help="Directory for checkout files", envvar="TDC_CHECKOUT_DIR")] WorkersOption = Annotated[int, typer.Option("--workers", help="Number of parallel subinterpreter workers", envvar="TDC_WORKERS")] MaxRetriesOption = Annotated[int, typer.Option("--max-retries", help="HTTP retry attempts", envvar="TDC_MAX_RETRIES")] TimeoutOption = Annotated[int, typer.Option("--timeout", help="HTTP timeout seconds", envvar="TDC_TIMEOUT")] Loading
# src/tdoc_crawler/cli/helpers.pydeleted 100644 → 0 +0 −223 Original line number Diff line number Diff line
"""Helper functions for CLI operations."""

from __future__ import annotations

import os
import subprocess
import sys
from collections.abc import Iterable
from pathlib import Path

import click
import typer

from tdoc_crawler.cli.console import get_console
from tdoc_crawler.crawlers import normalize_subgroup_alias, normalize_working_group_alias
from tdoc_crawler.database import TDocDatabase
from tdoc_crawler.logging import get_logger
from tdoc_crawler.models import CrawlLimits, HttpCacheConfig, MeetingQueryConfig, SortOrder, WorkingGroup
from tdoc_crawler.specs.normalization import expand_spec_ranges_batch, normalize_portal_meeting_name

console = get_console()
_logger = get_logger(__name__)

DEFAULT_DATABASE_FILENAME = "tdoc_crawler.db"
DOWNLOAD_TIMEOUT = 60
ALLOWED_DOWNLOAD_SCHEMES = ("ftp://", "http://", "https://")

# Fallback used whenever no working group can be determined from the input.
_ALL_WORKING_GROUPS = (WorkingGroup.RAN, WorkingGroup.SA, WorkingGroup.CT)

# First character of a subgroup code -> owning working group.
_SUBGROUP_PREFIX_TO_WORKING_GROUP = {
    "R": WorkingGroup.RAN,
    "S": WorkingGroup.SA,
    "C": WorkingGroup.CT,
}

# First character of a TDoc ID -> working group. Unlike subgroup codes, a
# leading "T" is also accepted here and resolves to CT.
_TDOC_ID_PREFIX_TO_WORKING_GROUP = {
    **_SUBGROUP_PREFIX_TO_WORKING_GROUP,
    "T": WorkingGroup.CT,
}

# Values used when neither a CLI parameter nor an environment variable is given.
_DEFAULT_HTTP_CACHE_TTL = 7200
_TRUTHY_ENV_VALUES = ("true", "1", "yes", "on", "t", "y")


def _infer_working_groups(codes: Iterable[str], prefix_map: dict[str, WorkingGroup]) -> list[WorkingGroup]:
    """Map the first character of each code to a working group.

    Args:
        codes: Strings whose first character identifies a working group.
        prefix_map: Upper-case first character -> working group.

    Returns:
        Matched working groups in first-seen order without duplicates, or all
        working groups when nothing matched.
    """
    inferred: list[WorkingGroup] = []
    for code in codes:
        if not code:
            continue
        group = prefix_map.get(code[0].upper())
        if group is not None and group not in inferred:
            inferred.append(group)
    return inferred or list(_ALL_WORKING_GROUPS)


def _non_blank_lines(lines: Iterable[str]) -> list[str]:
    """Return every line stripped of surrounding whitespace, dropping blank ones."""
    return [stripped for stripped in map(str.strip, lines) if stripped]


def infer_working_groups_from_subgroups(subgroups: list[str]) -> list[WorkingGroup]:
    """Infer working groups from subgroup codes.

    Args:
        subgroups: List of subgroup codes (e.g., ["S4", "R1"])

    Returns:
        List of inferred working groups without duplicates; all working groups
        when no code starts with a known prefix (R, S or C).
    """
    return _infer_working_groups(subgroups, _SUBGROUP_PREFIX_TO_WORKING_GROUP)


def parse_working_groups(values: list[str] | None, subgroups: list[str] | None = None) -> list[WorkingGroup]:
    """Parse and normalize working group names, expanding plenary aliases.

    Args:
        values: Explicit working group values from CLI
        subgroups: Optional subgroup list to infer working groups from

    Returns:
        List of working groups to crawl

    Raises:
        typer.Exit: With exit code 2 when a value is not a known working group.
    """
    if not values:
        # If subgroups are specified but no explicit working groups, infer from subgroups
        if subgroups:
            return infer_working_groups_from_subgroups(subgroups)
        # Otherwise default to all working groups
        return list(_ALL_WORKING_GROUPS)

    resolved: list[WorkingGroup] = []
    for item in values:
        # Try alias normalization first (RP->RAN, SP->SA, CP->CT)
        normalized = normalize_working_group_alias(item)
        try:
            resolved.append(WorkingGroup(normalized.upper()))
        except ValueError as exc:
            console.print(f"[red]Unknown working group: {item}")
            raise typer.Exit(code=2) from exc

    # `values` is non-empty and every item either appended or raised, so
    # `resolved` cannot be empty here.
    return resolved


def parse_subgroups(values: list[str] | None) -> list[str] | None:
    """Parse and normalize subgroup aliases to canonical names.

    Args:
        values: Subgroup values from CLI, or None/empty when not given.

    Returns:
        The concatenated results of normalizing each value, or None when no
        values were given.

    Raises:
        typer.Exit: With exit code 2 when a value normalizes to nothing.
    """
    if not values:
        return None

    resolved: list[str] = []
    for item in values:
        normalized = normalize_subgroup_alias(item)
        if not normalized:
            console.print(f"[red]Unknown subgroup: {item}")
            raise typer.Exit(code=2)
        resolved.extend(normalized)
    return resolved


def collect_spec_numbers(specs: list[str] | None, spec_file: Path | None) -> list[str]:
    """Collect spec numbers from CLI arguments, stdin and/or a file.

    Args:
        specs: Spec numbers given on the command line; the value "-" means
            "read one spec per line from stdin".
        spec_file: Optional file with one spec per line. A path that does not
            exist is ignored.

    Returns:
        The collected specs after range expansion, or an empty list when
        nothing was collected.

    Raises:
        click.FileError: If the spec file exists but cannot be read or decoded.
        click.UsageError: If range expansion rejects the collected values.
    """
    collected: list[str] = []

    for spec in specs or ():
        if spec == "-":
            # Read from stdin
            collected.extend(_non_blank_lines(sys.stdin))
        else:
            collected.append(spec.strip())

    if spec_file and spec_file.exists():
        try:
            with spec_file.open("r", encoding="utf-8") as f:
                collected.extend(_non_blank_lines(f))
        except (OSError, UnicodeDecodeError) as exc:
            # UnicodeDecodeError is not an OSError; report it the same way
            # instead of letting a raw traceback reach the user.
            raise click.FileError(str(spec_file), hint=f"Cannot read spec file: {exc}") from exc

    if not collected:
        return []

    try:
        return expand_spec_ranges_batch(collected)
    except ValueError as exc:
        raise click.UsageError(str(exc)) from exc


def build_limits(
    limit_tdocs: int | None,
    limit_meetings: int | None,
    limit_meetings_per_wg: int | None,
    limit_wgs: int | None,
) -> CrawlLimits:
    """Build CrawlLimits configuration from individual parameters.

    Each argument is forwarded unchanged to the CrawlLimits field of the same
    name.
    """
    return CrawlLimits(
        limit_tdocs=limit_tdocs,
        limit_meetings=limit_meetings,
        limit_meetings_per_wg=limit_meetings_per_wg,
        limit_wgs=limit_wgs,
    )


def infer_working_groups_from_ids(ids: Iterable[str]) -> list[WorkingGroup]:
    """Infer working groups from TDoc IDs based on first character.

    Args:
        ids: TDoc IDs; empty entries are skipped.

    Returns:
        List of inferred working groups without duplicates; all working groups
        when no ID starts with a known prefix (R, S, T or C).
    """
    return _infer_working_groups(ids, _TDOC_ID_PREFIX_TO_WORKING_GROUP)


def launch_file(path: Path) -> None:
    """Launch file in system's default application.

    Args:
        path: File to open.

    Raises:
        typer.Exit: With exit code 1 when the file does not exist or launching
            it fails with an OSError.
    """
    if not path.exists():
        console.print(f"[red]File not found: {path}")
        raise typer.Exit(code=1)

    try:
        if sys.platform.startswith("win"):
            os.startfile(path)  # noqa: S606
        elif sys.platform == "darwin":
            open_cmd = Path("/usr/bin/open")
            if open_cmd.exists():
                # check=False: the opener's exit status is deliberately ignored
                subprocess.run([str(open_cmd), str(path)], check=False)  # noqa: S603
            else:
                console.print("[yellow]/usr/bin/open not available[/yellow]")
        else:
            xdg_cmd = Path("/usr/bin/xdg-open")
            if xdg_cmd.exists():
                subprocess.run([str(xdg_cmd), str(path)], check=False)  # noqa: S603
            else:
                console.print("[yellow]xdg-open command not available[/yellow]")
    except OSError as exc:
        console.print(f"[red]Failed to open file: {exc}")
        raise typer.Exit(code=1) from exc


def resolve_http_cache_config(cache_ttl: int | None = None, cache_refresh_on_access: bool | None = None) -> HttpCacheConfig:
    """Resolve HTTP cache configuration from CLI parameters and environment variables.

    Precedence for each setting: CLI parameter, then environment variable
    (HTTP_CACHE_TTL / HTTP_CACHE_REFRESH_ON_ACCESS), then the built-in default
    (7200 / True).

    Args:
        cache_ttl: TTL for cache entries (CLI parameter)
        cache_refresh_on_access: Whether to refresh TTL on access (CLI parameter)

    Returns:
        HttpCacheConfig instance with resolved values

    Raises:
        ValueError: If HTTP_CACHE_TTL is set to a non-integer value.
    """
    # Check CLI parameters first, then environment variables, then defaults
    if cache_ttl is not None:
        ttl = cache_ttl
    else:
        env_ttl = os.getenv("HTTP_CACHE_TTL")
        if env_ttl:
            try:
                ttl = int(env_ttl)
            except ValueError as exc:
                # Name the offending variable; int()'s own message does not.
                raise ValueError(f"HTTP_CACHE_TTL must be an integer, got {env_ttl!r}") from exc
        else:
            ttl = _DEFAULT_HTTP_CACHE_TTL

    # Handle refresh on access - check CLI param, then env var, then default
    if cache_refresh_on_access is not None:
        refresh_on_access = cache_refresh_on_access
    else:
        env_refresh = os.getenv("HTTP_CACHE_REFRESH_ON_ACCESS", "").lower()
        # Any non-empty value outside the truthy set disables refresh.
        refresh_on_access = env_refresh in _TRUTHY_ENV_VALUES if env_refresh else True

    return HttpCacheConfig(ttl=ttl, refresh_ttl_on_access=refresh_on_access)
# src/tdoc_crawler/cli/utils.py 0 → 100644 +40 −0 Original line number Diff line number Diff line
"""CLI utilities."""

from __future__ import annotations

import os
import subprocess
import sys
from pathlib import Path

import typer

from tdoc_crawler.cli.console import get_console

console = get_console()


def launch_file(path: Path) -> None:
    """Open *path* with the operating system's default application.

    Windows uses ``os.startfile``; macOS runs ``/usr/bin/open``; every other
    platform runs ``/usr/bin/xdg-open``. When the opener binary is absent a
    warning is printed and the function returns normally.

    Args:
        path: File to open.

    Raises:
        typer.Exit: With exit code 1 when the file does not exist or launching
            it fails with an OSError.
    """
    if not path.exists():
        console.print(f"[red]File not found: {path}")
        raise typer.Exit(code=1)

    try:
        platform = sys.platform
        if platform.startswith("win"):
            os.startfile(path)  # noqa: S606
            return

        # Pick the opener binary and the warning shown when it is missing.
        if platform == "darwin":
            opener = Path("/usr/bin/open")
            missing_notice = "[yellow]/usr/bin/open not available[/yellow]"
        else:
            # Linux and other Unix-like systems
            opener = Path("/usr/bin/xdg-open")
            missing_notice = "[yellow]xdg-open command not available[/yellow]"

        if not opener.exists():
            console.print(missing_notice)
            return

        # check=False: the opener's exit status is deliberately ignored
        subprocess.run([str(opener), str(path)], check=False)  # noqa: S603
    except OSError as failure:
        console.print(f"[red]Failed to open file: {failure}")
        raise typer.Exit(code=1) from failure