Loading src/tdoc_crawler/cli/app.py +68 −64 Original line number Diff line number Diff line Loading @@ -77,7 +77,7 @@ from tdoc_crawler.cli.printing import ( spec_query_to_dict, tdoc_to_dict, ) from tdoc_crawler.config import DEFAULT_CACHE_DIR, CacheManager from tdoc_crawler.config import CacheManager from tdoc_crawler.crawlers import MeetingCrawler, TDocCrawler from tdoc_crawler.credentials import resolve_credentials, set_credentials from tdoc_crawler.database import TDocDatabase Loading @@ -102,7 +102,6 @@ HELP_PANEL_QUERY = "Query Commands" @app.command("crawl-tdocs", rich_help_panel=HELP_PANEL_CRAWLING) def crawl_tdocs( cache_dir: CacheDirOption = DEFAULT_CACHE_DIR, working_group: WorkingGroupOption = None, subgroup: SubgroupOption = None, incremental: IncrementalOption = True, Loading @@ -117,14 +116,14 @@ def crawl_tdocs( overall_timeout: OverallTimeoutOption = None, max_retries: MaxRetriesOption = 3, timeout: TimeoutOption = 30, cache_dir: CacheDirOption = None, verbosity: VerbosityOption = DEFAULT_VERBOSITY, ) -> None: """Crawl TDocs from 3GPP FTP directories.""" # Set logging verbosity early to ensure all log messages respect the configured level set_verbosity(verbosity) manager = CacheManager(cache_dir) manager.ensure_paths() manager = CacheManager(cache_dir).register() subgroups = parse_subgroups(subgroup) working_groups = parse_working_groups(working_group, subgroups) Loading @@ -150,7 +149,7 @@ def crawl_tdocs( use_parallel_crawling=False, ) db_path = manager.db_path db_file = manager.db_file # Build descriptive message scope_parts = [] Loading @@ -160,7 +159,7 @@ def crawl_tdocs( scope_parts.append(f"working groups: {', '.join(wg.value for wg in working_groups)}") console.print(f"[cyan]Crawling TDocs ({', '.join(scope_parts)})[/cyan]") with TDocDatabase(db_path) as database: with TDocDatabase(db_file) as database: checkout_dir = manager.checkout_dir # Clear TDocs if requested if clear_tdocs: Loading Loading @@ -228,7 +227,7 @@ def crawl_tdocs( results = database.query_tdocs(query_config) # Use a shared session for checkout downloads with create_cached_session(manager.http_cache_path) as session: with create_cached_session(manager.http_cache_dir) as session: checkout_result = checkout_tdocs(results, checkout_dir, force=False, session=session) console.print(f"\n[cyan]Checked out {checkout_result.success_count} TDoc(s)[/cyan]") Loading @@ -251,7 +250,7 @@ def crawl_tdocs( @app.command("crawl-meetings", rich_help_panel=HELP_PANEL_CRAWLING) def crawl_meetings( cache_dir: CacheDirOption = DEFAULT_CACHE_DIR, cache_dir: CacheDirOption = None, working_group: WorkingGroupOption = None, subgroup: SubgroupOption = None, incremental: IncrementalOption = True, Loading @@ -273,8 +272,7 @@ def crawl_meetings( # Set logging verbosity early to ensure all log messages respect the configured level set_verbosity(verbosity) manager = CacheManager(cache_dir) manager.ensure_paths() manager = CacheManager(cache_dir).register() subgroups = parse_subgroups(subgroup) working_groups = parse_working_groups(working_group, subgroups) Loading @@ -291,7 +289,7 @@ def crawl_meetings( credentials=None, ) db_path = manager.db_path db_file = manager.db_file # Build descriptive message scope_parts = [] if subgroups: Loading @@ -300,7 +298,7 @@ def crawl_meetings( scope_parts.append(f"working groups: {', '.join(wg.value for wg in working_groups)}") console.print(f"[cyan]Crawling meetings ({', '.join(scope_parts)})[/cyan]") with TDocDatabase(db_path) as database: with TDocDatabase(db_file) as database: checkout_dir = manager.checkout_dir # Clear all data if requested if clear_db: Loading Loading @@ -372,17 +370,17 @@ def crawl_meetings( order=SortOrder.DESC, include_without_files=False, ) with TDocDatabase(db_path) as database: with TDocDatabase(db_file) as database: meetings = database.query_meetings(query_config) with create_cached_session(manager.http_cache_path) as session: checkout_meeting_tdocs(meetings, manager.checkout_dir, manager.http_cache_path, session=session) with create_cached_session(manager.http_cache_dir) as session: checkout_meeting_tdocs(meetings, manager.checkout_dir, manager.http_cache_dir, session=session) @app.command("query-tdocs", rich_help_panel=HELP_PANEL_QUERY) def query_tdocs( tdoc_ids: TDocIdsArgument = None, cache_dir: CacheDirOption = DEFAULT_CACHE_DIR, cache_dir: CacheDirOption = None, working_group: WorkingGroupOption = None, clear_tdocs: ClearTDocsOption = False, clear_specs: ClearSpecsOption = False, Loading @@ -399,7 +397,7 @@ def query_tdocs( ) -> None: """Query TDoc metadata from database.""" set_verbosity(verbosity) manager = CacheManager(cache_dir) manager = CacheManager(cache_dir).register() working_groups = parse_working_groups(working_group) try: start = datetime.fromisoformat(start_date) if start_date else None Loading Loading @@ -433,8 +431,8 @@ def query_tdocs( if not no_fetch: set_credentials(eol_username, eol_password, prompt=None) db_path = manager.db_path with TDocDatabase(db_path) as database: db_file = manager.db_file with TDocDatabase(db_file) as database: checkout_dir = manager.checkout_dir if clear_tdocs: deleted_count = database.clear_tdocs() Loading @@ -453,8 +451,14 @@ def query_tdocs( results = database.query_tdocs(config) if not no_fetch: # Use cached session for missing TDoc fetching with create_cached_session(manager.http_cache_path) as session: result = fetch_missing_tdocs(database, manager.root, config, results, session=session) with create_cached_session(manager.http_cache_dir) as session: result = fetch_missing_tdocs( database, config, results, session=session, cache_manager_name=manager.name, ) if result.fetch_result and result.fetch_result.errors: console.print(f"[yellow]{len(result.fetch_result.errors)} issues detected during targeted crawl[/yellow]") for error in result.fetch_result.errors[:3]: Loading @@ -466,7 +470,7 @@ def query_tdocs( return if checkout: with create_cached_session(manager.http_cache_path) as session: with create_cached_session(manager.http_cache_dir) as session: checkout_tdocs(results, manager.checkout_dir, force=False, session=session) if config.output_format is OutputFormat.JSON: Loading @@ -479,7 +483,7 @@ def query_tdocs( @app.command("query-meetings", rich_help_panel=HELP_PANEL_QUERY) def query_meetings( cache_dir: CacheDirOption = DEFAULT_CACHE_DIR, cache_dir: CacheDirOption = None, working_group: WorkingGroupOption = None, subgroup: SubgroupOption = None, clear_tdocs: ClearTDocsOption = False, Loading @@ -493,7 +497,7 @@ def query_meetings( ) -> None: """Query meeting metadata from database.""" set_verbosity(verbosity) manager = CacheManager(cache_dir) manager = CacheManager(cache_dir).register() working_groups = parse_working_groups(working_group) subgroups = parse_subgroups(subgroup) try: Loading @@ -511,8 +515,8 @@ def query_meetings( include_without_files=include_without_files, ) db_path = manager.db_path with TDocDatabase(db_path) as database: db_file = manager.db_file with TDocDatabase(db_file) as database: checkout_dir = manager.checkout_dir if clear_tdocs: deleted_count = database.clear_tdocs() Loading @@ -535,8 +539,8 @@ def query_meetings( return if checkout: with create_cached_session(manager.http_cache_path) as session: checkout_meeting_tdocs(meetings, manager.checkout_dir, manager.http_cache_path, session=session) with create_cached_session(manager.http_cache_dir) as session: checkout_meeting_tdocs(meetings, manager.checkout_dir, manager.http_cache_dir, session=session) try: output = OutputFormat(output_format.lower()) Loading @@ -562,12 +566,12 @@ def query_specs( clear_specs: ClearSpecsOption = False, checkout: CheckoutOption = False, output_format: OutputFormatOption = OutputFormat.TABLE.value, cache_dir: CacheDirOption = DEFAULT_CACHE_DIR, cache_dir: CacheDirOption = None, verbosity: VerbosityOption = DEFAULT_VERBOSITY, ) -> None: """Query spec metadata from database.""" set_verbosity(verbosity) manager = CacheManager(cache_dir) manager = CacheManager(cache_dir).register() specs = collect_spec_numbers(spec_numbers, spec_file) working_groups = parse_working_groups(working_group) wg_filter = working_groups[0].value if working_groups else None Loading @@ -585,8 +589,8 @@ def query_specs( console.print("[red]Invalid output format; use table, json, or yaml") raise typer.Exit(code=2) from exc db_path = manager.db_path with TDocDatabase(db_path) as database: db_file = manager.db_file with TDocDatabase(db_file) as database: checkout_dir = manager.checkout_dir if clear_tdocs: deleted_count = database.clear_tdocs() Loading @@ -611,7 +615,7 @@ def query_specs( if checkout: spec_list = [result.spec_number for result in results] with TDocDatabase(db_path) as database: with TDocDatabase(db_file) as database: checkout_specs(spec_list, manager.checkout_dir, database, release="latest") if output is OutputFormat.JSON: Loading @@ -625,7 +629,7 @@ def query_specs( @app.command("open", rich_help_panel=HELP_PANEL_MAIN) def open_tdoc( tdoc_id: TDocIdArgument, cache_dir: CacheDirOption = DEFAULT_CACHE_DIR, cache_dir: CacheDirOption = None, full_metadata: FullMetadataOption = False, use_whatthespec: UseWhatTheSpecOption = False, eol_username: EolUsernameOption = None, Loading @@ -635,28 +639,28 @@ def open_tdoc( """Download, extract, and open a TDoc file.""" set_verbosity(verbosity) set_credentials(eol_username, eol_password, prompt=None) manager = CacheManager(cache_dir) manager = CacheManager(cache_dir).register() normalized_id = tdoc_id.strip().upper() config = QueryConfig( cache_dir=manager.root, tdoc_ids=[normalized_id], ) db_path = manager.db_path with create_cached_session(manager.http_cache_path) as session: with TDocDatabase(db_path) as database: db_file = manager.db_file with create_cached_session(manager.http_cache_dir) as session: with TDocDatabase(db_file) as database: results = database.query_tdocs(config) credentials = resolve_credentials(eol_username, eol_password, prompt=None) result = fetch_missing_tdocs( database, manager.root, config, results, credentials=credentials, full_metadata=full_metadata, use_whatthespec=use_whatthespec, session=session, cache_manager_name=manager.name, ) if result.fetch_result and result.fetch_result.errors: console.print(f"[yellow]{len(result.fetch_result.errors)} issues detected during targeted crawl[/yellow]") Loading @@ -679,7 +683,7 @@ def open_tdoc( @app.command("checkout", rich_help_panel=HELP_PANEL_MAIN) def checkout( tdoc_id: CheckoutTDocIdsArgument, cache_dir: CacheDirOption = DEFAULT_CACHE_DIR, cache_dir: CacheDirOption = None, force: ForceOption = False, full_metadata: FullMetadataOption = False, use_whatthespec: UseWhatTheSpecOption = False, Loading @@ -690,28 +694,28 @@ def checkout( """Download and extract TDoc(s) to checkout folder.""" set_verbosity(verbosity) set_credentials(eol_username, eol_password, prompt=None) manager = CacheManager(cache_dir) manager = CacheManager(cache_dir).register() normalized_ids = [tid.strip().upper() for tid in tdoc_id] config = QueryConfig( cache_dir=manager.root, tdoc_ids=normalized_ids, ) db_path = manager.db_path with create_cached_session(manager.http_cache_path) as session: with TDocDatabase(db_path) as database: db_file = manager.db_file with create_cached_session(manager.http_cache_dir) as session: with TDocDatabase(db_file) as database: results = database.query_tdocs(config) credentials = resolve_credentials(eol_username, eol_password, prompt=None) result = fetch_missing_tdocs( database, manager.root, config, results, credentials=credentials, full_metadata=full_metadata, use_whatthespec=use_whatthespec, session=session, cache_manager_name=manager.name, ) if result.fetch_result and result.fetch_result.errors: console.print(f"[yellow]{len(result.fetch_result.errors)} issues detected during targeted crawl[/yellow]") Loading Loading @@ -749,18 +753,18 @@ def checkout( @app.command("stats", rich_help_panel=HELP_PANEL_MAIN) def stats( cache_dir: CacheDirOption = DEFAULT_CACHE_DIR, cache_dir: CacheDirOption = None, verbosity: VerbosityOption = DEFAULT_VERBOSITY, ) -> None: """Display database statistics.""" set_verbosity(verbosity) manager = CacheManager(cache_dir) db_path = manager.db_path if not db_path.exists(): console.print(f"[red]Database not found: {db_path}[/red]") manager = CacheManager(cache_dir).register() db_file = manager.db_file if not db_file.exists(): console.print(f"[red]Database not found: {db_file}[/red]") raise typer.Exit(code=1) with TDocDatabase(db_path) as database: with TDocDatabase(db_file) as database: stats_dict = cast(dict[str, Any], database.get_statistics()) table = Table(title="TDoc database statistics") Loading Loading @@ -790,12 +794,12 @@ def crawl_specs( clear_specs: ClearSpecsOption = False, checkout: CheckoutOption = False, output_format: OutputFormatOption = OutputFormat.TABLE.value, cache_dir: CacheDirOption = DEFAULT_CACHE_DIR, cache_dir: CacheDirOption = None, verbosity: VerbosityOption = DEFAULT_VERBOSITY, ) -> None: """Crawl spec metadata from configured sources.""" set_verbosity(verbosity) manager = CacheManager(cache_dir) manager = CacheManager(cache_dir).register() if spec_numbers is None: spec_numbers = [] specs = collect_spec_numbers(spec_numbers, spec_file) Loading @@ -807,8 +811,8 @@ def crawl_specs( sources = build_default_spec_sources() db_path = manager.db_path with TDocDatabase(db_path) as database: db_file = manager.db_file with TDocDatabase(db_file) as database: checkout_dir = manager.checkout_dir if clear_tdocs: deleted_count = database.clear_tdocs() Loading @@ -832,7 +836,7 @@ def crawl_specs( return if checkout: with TDocDatabase(db_path) as database: with TDocDatabase(db_file) as database: checkout_specs( [result.spec_number for result in results], manager.checkout_dir, Loading @@ -855,12 +859,12 @@ def checkout_spec( release: ReleaseOption = "latest", doc_only: DocOnlyOption = False, checkout_dir: CheckoutDirOption = None, cache_dir: CacheDirOption = DEFAULT_CACHE_DIR, cache_dir: CacheDirOption = None, verbosity: VerbosityOption = DEFAULT_VERBOSITY, ) -> None: """Download and extract spec documents.""" set_verbosity(verbosity) manager = CacheManager(cache_dir) manager = CacheManager(cache_dir).register() if spec_numbers is None: spec_numbers = [] specs = collect_spec_numbers(spec_numbers, spec_file) Loading @@ -872,8 +876,8 @@ def checkout_spec( sources = build_default_spec_sources() db_path = manager.db_path with TDocDatabase(db_path) as database: db_file = manager.db_file with TDocDatabase(db_file) as database: downloader = SpecDownloads(database) results = downloader.checkout_specs(specs, doc_only, effective_checkout_dir, release, sources=sources) Loading @@ -886,19 +890,19 @@ def open_spec( spec: Annotated[str, typer.Argument(help="Spec number")], release: ReleaseOption = "latest", doc_only: DocOnlyOption = False, cache_dir: CacheDirOption = DEFAULT_CACHE_DIR, cache_dir: CacheDirOption = None, verbosity: VerbosityOption = DEFAULT_VERBOSITY, ) -> None: """Download and open a spec document.""" set_verbosity(verbosity) normalized = spec.strip() manager = CacheManager(cache_dir) manager = CacheManager(cache_dir).register() checkout_dir = manager.checkout_dir sources = build_default_spec_sources() db_path = manager.db_path with TDocDatabase(db_path) as database: db_file = manager.db_file with TDocDatabase(db_file) as database: downloader = SpecDownloads(database) try: path = downloader.open_spec(normalized, doc_only, checkout_dir, release, sources=sources) Loading src/tdoc_crawler/cli/args.py +1 −1 Original line number Diff line number Diff line Loading @@ -12,7 +12,7 @@ from tdoc_crawler.logging import DEFAULT_LEVEL as LOGGING_DEFAULT_LEVEL DEFAULT_VERBOSITY = logging.getLevelName(LOGGING_DEFAULT_LEVEL) CacheDirOption = Annotated[Path, typer.Option("--cache-dir", "-c", help="Cache directory", envvar="TDC_CACHE_DIR")] CacheDirOption = Annotated[Path | None, typer.Option("--cache-dir", "-c", help="Cache directory", envvar="TDC_CACHE_DIR")] WorkingGroupOption = Annotated[list[str] | None, typer.Option("--working-group", "-w", help="Filter by working group", envvar="TDC_WORKING_GROUP")] SubgroupOption = Annotated[list[str] | None, typer.Option("--sub-group", "-s", help="Filter by sub-working group", envvar="TDC_SUB_GROUP")] IncrementalOption = Annotated[bool, typer.Option("--incremental/--full", help="Toggle incremental mode")] Loading src/tdoc_crawler/config.py +36 −3 Original line number Diff line number Diff line Loading @@ -4,6 +4,7 @@ from __future__ import annotations import os from pathlib import Path from typing import Self # Fallback path if no argument or env var is provided DEFAULT_CACHE_DIR = Path.home() / ".tdoc-crawler" Loading @@ -11,6 +12,26 @@ DEFAULT_DATABASE_FILENAME = "tdoc_crawler.db" DEFAULT_HTTP_CACHE_FILENAME = "http-cache.sqlite3" DEFAULT_CHECKOUT_DIRNAME = "checkout" DEFAULT_MANAGER = "default" _cache_managers: dict[str, CacheManager] = {} def register_cache_manager(manager: CacheManager) -> None: """Register a cache manager instance under a given name.""" if (name := manager.name) in _cache_managers: raise ValueError(f"Cache manager with name '{name}' is already registered.") _cache_managers[name] = manager def resolve_cache_manager(name: str | None = None) -> CacheManager: """Resolve a cache manager by name, or return the default if name is None.""" name = name or DEFAULT_MANAGER manager = _cache_managers.get(name) if manager is None: raise ValueError(f"No cache manager registered under name '{name}'.") return manager class CacheManager: """Manages cache directory layout and path resolution. Loading @@ -18,26 +39,38 @@ class CacheManager: Acts as the single source of truth for where files are stored. """ def __init__(self, root_path: Path | None = None) -> None: def __init__(self, root_path: Path | None = None, name: str = DEFAULT_MANAGER, ensure_paths: bool = True) -> None: """Initialize cache manager. Args: root_path: Explicit root path. If None, tries TDC_CACHE_DIR env var, then falls back to DEFAULT_CACHE_DIR. name: Optional name to register this manager under. If provided, the manager is registered upon initialization. ensure_paths: If True, will create the root directory if it doesn't exist. """ self.name = name if root_path: self.root = root_path else: env_path = os.getenv("TDC_CACHE_DIR") self.root = Path(env_path) if env_path else DEFAULT_CACHE_DIR if ensure_paths: self.ensure_paths() def register(self) -> Self: """Register this instance as a cache manager under the given name.""" register_cache_manager(self) return self @property def http_cache_path(self) -> Path: def http_cache_dir(self) -> Path: """Path to the HTTP client cache database.""" return self.root / DEFAULT_HTTP_CACHE_FILENAME @property def db_path(self) -> Path: def db_file(self) -> Path: """Path to the metadata SQLite database.""" return self.root / DEFAULT_DATABASE_FILENAME Loading Loading
src/tdoc_crawler/cli/app.py +68 −64 Original line number Diff line number Diff line Loading @@ -77,7 +77,7 @@ from tdoc_crawler.cli.printing import ( spec_query_to_dict, tdoc_to_dict, ) from tdoc_crawler.config import DEFAULT_CACHE_DIR, CacheManager from tdoc_crawler.config import CacheManager from tdoc_crawler.crawlers import MeetingCrawler, TDocCrawler from tdoc_crawler.credentials import resolve_credentials, set_credentials from tdoc_crawler.database import TDocDatabase Loading @@ -102,7 +102,6 @@ HELP_PANEL_QUERY = "Query Commands" @app.command("crawl-tdocs", rich_help_panel=HELP_PANEL_CRAWLING) def crawl_tdocs( cache_dir: CacheDirOption = DEFAULT_CACHE_DIR, working_group: WorkingGroupOption = None, subgroup: SubgroupOption = None, incremental: IncrementalOption = True, Loading @@ -117,14 +116,14 @@ def crawl_tdocs( overall_timeout: OverallTimeoutOption = None, max_retries: MaxRetriesOption = 3, timeout: TimeoutOption = 30, cache_dir: CacheDirOption = None, verbosity: VerbosityOption = DEFAULT_VERBOSITY, ) -> None: """Crawl TDocs from 3GPP FTP directories.""" # Set logging verbosity early to ensure all log messages respect the configured level set_verbosity(verbosity) manager = CacheManager(cache_dir) manager.ensure_paths() manager = CacheManager(cache_dir).register() subgroups = parse_subgroups(subgroup) working_groups = parse_working_groups(working_group, subgroups) Loading @@ -150,7 +149,7 @@ def crawl_tdocs( use_parallel_crawling=False, ) db_path = manager.db_path db_file = manager.db_file # Build descriptive message scope_parts = [] Loading @@ -160,7 +159,7 @@ def crawl_tdocs( scope_parts.append(f"working groups: {', '.join(wg.value for wg in working_groups)}") console.print(f"[cyan]Crawling TDocs ({', '.join(scope_parts)})[/cyan]") with TDocDatabase(db_path) as database: with TDocDatabase(db_file) as database: checkout_dir = manager.checkout_dir # Clear TDocs if requested if clear_tdocs: Loading Loading @@ -228,7 +227,7 @@ def crawl_tdocs( results = database.query_tdocs(query_config) # Use a shared session for checkout downloads with create_cached_session(manager.http_cache_path) as session: with create_cached_session(manager.http_cache_dir) as session: checkout_result = checkout_tdocs(results, checkout_dir, force=False, session=session) console.print(f"\n[cyan]Checked out {checkout_result.success_count} TDoc(s)[/cyan]") Loading @@ -251,7 +250,7 @@ def crawl_tdocs( @app.command("crawl-meetings", rich_help_panel=HELP_PANEL_CRAWLING) def crawl_meetings( cache_dir: CacheDirOption = DEFAULT_CACHE_DIR, cache_dir: CacheDirOption = None, working_group: WorkingGroupOption = None, subgroup: SubgroupOption = None, incremental: IncrementalOption = True, Loading @@ -273,8 +272,7 @@ def crawl_meetings( # Set logging verbosity early to ensure all log messages respect the configured level set_verbosity(verbosity) manager = CacheManager(cache_dir) manager.ensure_paths() manager = CacheManager(cache_dir).register() subgroups = parse_subgroups(subgroup) working_groups = parse_working_groups(working_group, subgroups) Loading @@ -291,7 +289,7 @@ def crawl_meetings( credentials=None, ) db_path = manager.db_path db_file = manager.db_file # Build descriptive message scope_parts = [] if subgroups: Loading @@ -300,7 +298,7 @@ def crawl_meetings( scope_parts.append(f"working groups: {', '.join(wg.value for wg in working_groups)}") console.print(f"[cyan]Crawling meetings ({', '.join(scope_parts)})[/cyan]") with TDocDatabase(db_path) as database: with TDocDatabase(db_file) as database: checkout_dir = manager.checkout_dir # Clear all data if requested if clear_db: Loading Loading @@ -372,17 +370,17 @@ def crawl_meetings( order=SortOrder.DESC, include_without_files=False, ) with TDocDatabase(db_path) as database: with TDocDatabase(db_file) as database: meetings = database.query_meetings(query_config) with create_cached_session(manager.http_cache_path) as session: checkout_meeting_tdocs(meetings, manager.checkout_dir, manager.http_cache_path, session=session) with create_cached_session(manager.http_cache_dir) as session: checkout_meeting_tdocs(meetings, manager.checkout_dir, manager.http_cache_dir, session=session) @app.command("query-tdocs", rich_help_panel=HELP_PANEL_QUERY) def query_tdocs( tdoc_ids: TDocIdsArgument = None, cache_dir: CacheDirOption = DEFAULT_CACHE_DIR, cache_dir: CacheDirOption = None, working_group: WorkingGroupOption = None, clear_tdocs: ClearTDocsOption = False, clear_specs: ClearSpecsOption = False, Loading @@ -399,7 +397,7 @@ def query_tdocs( ) -> None: """Query TDoc metadata from database.""" set_verbosity(verbosity) manager = CacheManager(cache_dir) manager = CacheManager(cache_dir).register() working_groups = parse_working_groups(working_group) try: start = datetime.fromisoformat(start_date) if start_date else None Loading Loading @@ -433,8 +431,8 @@ def query_tdocs( if not no_fetch: set_credentials(eol_username, eol_password, prompt=None) db_path = manager.db_path with TDocDatabase(db_path) as database: db_file = manager.db_file with TDocDatabase(db_file) as database: checkout_dir = manager.checkout_dir if clear_tdocs: deleted_count = database.clear_tdocs() Loading @@ -453,8 +451,14 @@ def query_tdocs( results = database.query_tdocs(config) if not no_fetch: # Use cached session for missing TDoc fetching with create_cached_session(manager.http_cache_path) as session: result = fetch_missing_tdocs(database, manager.root, config, results, session=session) with create_cached_session(manager.http_cache_dir) as session: result = fetch_missing_tdocs( database, config, results, session=session, cache_manager_name=manager.name, ) if result.fetch_result and result.fetch_result.errors: console.print(f"[yellow]{len(result.fetch_result.errors)} issues detected during targeted crawl[/yellow]") for error in result.fetch_result.errors[:3]: Loading @@ -466,7 +470,7 @@ def query_tdocs( return if checkout: with create_cached_session(manager.http_cache_path) as session: with create_cached_session(manager.http_cache_dir) as session: checkout_tdocs(results, manager.checkout_dir, force=False, session=session) if config.output_format is OutputFormat.JSON: Loading @@ -479,7 +483,7 @@ def query_tdocs( @app.command("query-meetings", rich_help_panel=HELP_PANEL_QUERY) def query_meetings( cache_dir: CacheDirOption = DEFAULT_CACHE_DIR, cache_dir: CacheDirOption = None, working_group: WorkingGroupOption = None, subgroup: SubgroupOption = None, clear_tdocs: ClearTDocsOption = False, Loading @@ -493,7 +497,7 @@ def query_meetings( ) -> None: """Query meeting metadata from database.""" set_verbosity(verbosity) manager = CacheManager(cache_dir) manager = CacheManager(cache_dir).register() working_groups = parse_working_groups(working_group) subgroups = parse_subgroups(subgroup) try: Loading @@ -511,8 +515,8 @@ def query_meetings( include_without_files=include_without_files, ) db_path = manager.db_path with TDocDatabase(db_path) as database: db_file = manager.db_file with TDocDatabase(db_file) as database: checkout_dir = manager.checkout_dir if clear_tdocs: deleted_count = database.clear_tdocs() Loading @@ -535,8 +539,8 @@ def query_meetings( return if checkout: with create_cached_session(manager.http_cache_path) as session: checkout_meeting_tdocs(meetings, manager.checkout_dir, manager.http_cache_path, session=session) with create_cached_session(manager.http_cache_dir) as session: checkout_meeting_tdocs(meetings, manager.checkout_dir, manager.http_cache_dir, session=session) try: output = OutputFormat(output_format.lower()) Loading @@ -562,12 +566,12 @@ def query_specs( clear_specs: ClearSpecsOption = False, checkout: CheckoutOption = False, output_format: OutputFormatOption = OutputFormat.TABLE.value, cache_dir: CacheDirOption = DEFAULT_CACHE_DIR, cache_dir: CacheDirOption = None, verbosity: VerbosityOption = DEFAULT_VERBOSITY, ) -> None: """Query spec metadata from database.""" set_verbosity(verbosity) manager = CacheManager(cache_dir) manager = CacheManager(cache_dir).register() specs = collect_spec_numbers(spec_numbers, spec_file) working_groups = parse_working_groups(working_group) wg_filter = working_groups[0].value if working_groups else None Loading @@ -585,8 +589,8 @@ def query_specs( console.print("[red]Invalid output format; use table, json, or yaml") raise typer.Exit(code=2) from exc db_path = manager.db_path with TDocDatabase(db_path) as database: db_file = manager.db_file with TDocDatabase(db_file) as database: checkout_dir = manager.checkout_dir if clear_tdocs: deleted_count = database.clear_tdocs() Loading @@ -611,7 +615,7 @@ def query_specs( if checkout: spec_list = [result.spec_number for result in results] with TDocDatabase(db_path) as database: with TDocDatabase(db_file) as database: checkout_specs(spec_list, manager.checkout_dir, database, release="latest") if output is OutputFormat.JSON: Loading @@ -625,7 +629,7 @@ def query_specs( @app.command("open", rich_help_panel=HELP_PANEL_MAIN) def open_tdoc( tdoc_id: TDocIdArgument, cache_dir: CacheDirOption = DEFAULT_CACHE_DIR, cache_dir: CacheDirOption = None, full_metadata: FullMetadataOption = False, use_whatthespec: UseWhatTheSpecOption = False, eol_username: EolUsernameOption = None, Loading @@ -635,28 +639,28 @@ def open_tdoc( """Download, extract, and open a TDoc file.""" set_verbosity(verbosity) set_credentials(eol_username, eol_password, prompt=None) manager = CacheManager(cache_dir) manager = CacheManager(cache_dir).register() normalized_id = tdoc_id.strip().upper() config = QueryConfig( cache_dir=manager.root, tdoc_ids=[normalized_id], ) db_path = manager.db_path with create_cached_session(manager.http_cache_path) as session: with TDocDatabase(db_path) as database: db_file = manager.db_file with create_cached_session(manager.http_cache_dir) as session: with TDocDatabase(db_file) as database: results = database.query_tdocs(config) credentials = resolve_credentials(eol_username, eol_password, prompt=None) result = fetch_missing_tdocs( database, manager.root, config, results, credentials=credentials, full_metadata=full_metadata, use_whatthespec=use_whatthespec, session=session, cache_manager_name=manager.name, ) if result.fetch_result and result.fetch_result.errors: console.print(f"[yellow]{len(result.fetch_result.errors)} issues detected during targeted crawl[/yellow]") Loading @@ -679,7 +683,7 @@ def open_tdoc( @app.command("checkout", rich_help_panel=HELP_PANEL_MAIN) def checkout( tdoc_id: CheckoutTDocIdsArgument, cache_dir: CacheDirOption = DEFAULT_CACHE_DIR, cache_dir: CacheDirOption = None, force: ForceOption = False, full_metadata: FullMetadataOption = False, use_whatthespec: UseWhatTheSpecOption = False, Loading @@ -690,28 +694,28 @@ def checkout( """Download and extract TDoc(s) to checkout folder.""" set_verbosity(verbosity) set_credentials(eol_username, eol_password, prompt=None) manager = CacheManager(cache_dir) manager = CacheManager(cache_dir).register() normalized_ids = [tid.strip().upper() for tid in tdoc_id] config = QueryConfig( cache_dir=manager.root, tdoc_ids=normalized_ids, ) db_path = manager.db_path with create_cached_session(manager.http_cache_path) as session: with TDocDatabase(db_path) as database: db_file = manager.db_file with create_cached_session(manager.http_cache_dir) as session: with TDocDatabase(db_file) as database: results = database.query_tdocs(config) credentials = resolve_credentials(eol_username, eol_password, prompt=None) result = fetch_missing_tdocs( database, manager.root, config, results, credentials=credentials, full_metadata=full_metadata, use_whatthespec=use_whatthespec, session=session, cache_manager_name=manager.name, ) if result.fetch_result and result.fetch_result.errors: console.print(f"[yellow]{len(result.fetch_result.errors)} issues detected during targeted crawl[/yellow]") Loading Loading @@ -749,18 +753,18 @@ def checkout( @app.command("stats", rich_help_panel=HELP_PANEL_MAIN) def stats( cache_dir: CacheDirOption = DEFAULT_CACHE_DIR, cache_dir: CacheDirOption = None, verbosity: VerbosityOption = DEFAULT_VERBOSITY, ) -> None: """Display database statistics.""" set_verbosity(verbosity) manager = CacheManager(cache_dir) db_path = manager.db_path if not db_path.exists(): console.print(f"[red]Database not found: {db_path}[/red]") manager = CacheManager(cache_dir).register() db_file = manager.db_file if not db_file.exists(): console.print(f"[red]Database not found: {db_file}[/red]") raise typer.Exit(code=1) with TDocDatabase(db_path) as database: with TDocDatabase(db_file) as database: stats_dict = cast(dict[str, Any], database.get_statistics()) table = Table(title="TDoc database statistics") Loading Loading @@ -790,12 +794,12 @@ def crawl_specs( clear_specs: ClearSpecsOption = False, checkout: CheckoutOption = False, output_format: OutputFormatOption = OutputFormat.TABLE.value, cache_dir: CacheDirOption = DEFAULT_CACHE_DIR, cache_dir: CacheDirOption = None, verbosity: VerbosityOption = DEFAULT_VERBOSITY, ) -> None: """Crawl spec metadata from configured sources.""" set_verbosity(verbosity) manager = CacheManager(cache_dir) manager = CacheManager(cache_dir).register() if spec_numbers is None: spec_numbers = [] specs = collect_spec_numbers(spec_numbers, spec_file) Loading @@ -807,8 +811,8 @@ def crawl_specs( sources = build_default_spec_sources() db_path = manager.db_path with TDocDatabase(db_path) as database: db_file = manager.db_file with TDocDatabase(db_file) as database: checkout_dir = manager.checkout_dir if clear_tdocs: deleted_count = database.clear_tdocs() Loading @@ -832,7 +836,7 @@ def crawl_specs( return if checkout: with TDocDatabase(db_path) as database: with TDocDatabase(db_file) as database: checkout_specs( [result.spec_number for result in results], manager.checkout_dir, Loading @@ -855,12 +859,12 @@ def checkout_spec( release: ReleaseOption = "latest", doc_only: DocOnlyOption = False, checkout_dir: CheckoutDirOption = None, cache_dir: CacheDirOption = DEFAULT_CACHE_DIR, cache_dir: CacheDirOption = None, verbosity: VerbosityOption = DEFAULT_VERBOSITY, ) -> None: """Download and extract spec documents.""" set_verbosity(verbosity) manager = CacheManager(cache_dir) manager = CacheManager(cache_dir).register() if spec_numbers is None: spec_numbers = [] specs = collect_spec_numbers(spec_numbers, spec_file) Loading @@ -872,8 +876,8 @@ def checkout_spec( sources = build_default_spec_sources() db_path = manager.db_path with TDocDatabase(db_path) as database: db_file = manager.db_file with TDocDatabase(db_file) as database: downloader = SpecDownloads(database) results = downloader.checkout_specs(specs, doc_only, effective_checkout_dir, release, sources=sources) Loading @@ -886,19 +890,19 @@ def open_spec( spec: Annotated[str, typer.Argument(help="Spec number")], release: ReleaseOption = "latest", doc_only: DocOnlyOption = False, cache_dir: CacheDirOption = DEFAULT_CACHE_DIR, cache_dir: CacheDirOption = None, verbosity: VerbosityOption = DEFAULT_VERBOSITY, ) -> None: """Download and open a spec document.""" set_verbosity(verbosity) normalized = spec.strip() manager = CacheManager(cache_dir) manager = CacheManager(cache_dir).register() checkout_dir = manager.checkout_dir sources = build_default_spec_sources() db_path = manager.db_path with TDocDatabase(db_path) as database: db_file = manager.db_file with TDocDatabase(db_file) as database: downloader = SpecDownloads(database) try: path = downloader.open_spec(normalized, doc_only, checkout_dir, release, sources=sources) Loading
src/tdoc_crawler/cli/args.py +1 −1 Original line number Diff line number Diff line Loading @@ -12,7 +12,7 @@ from tdoc_crawler.logging import DEFAULT_LEVEL as LOGGING_DEFAULT_LEVEL DEFAULT_VERBOSITY = logging.getLevelName(LOGGING_DEFAULT_LEVEL) CacheDirOption = Annotated[Path, typer.Option("--cache-dir", "-c", help="Cache directory", envvar="TDC_CACHE_DIR")] CacheDirOption = Annotated[Path | None, typer.Option("--cache-dir", "-c", help="Cache directory", envvar="TDC_CACHE_DIR")] WorkingGroupOption = Annotated[list[str] | None, typer.Option("--working-group", "-w", help="Filter by working group", envvar="TDC_WORKING_GROUP")] SubgroupOption = Annotated[list[str] | None, typer.Option("--sub-group", "-s", help="Filter by sub-working group", envvar="TDC_SUB_GROUP")] IncrementalOption = Annotated[bool, typer.Option("--incremental/--full", help="Toggle incremental mode")] Loading
src/tdoc_crawler/config.py +36 −3 Original line number Diff line number Diff line Loading @@ -4,6 +4,7 @@ from __future__ import annotations import os from pathlib import Path from typing import Self # Fallback path if no argument or env var is provided DEFAULT_CACHE_DIR = Path.home() / ".tdoc-crawler" Loading @@ -11,6 +12,26 @@ DEFAULT_DATABASE_FILENAME = "tdoc_crawler.db" DEFAULT_HTTP_CACHE_FILENAME = "http-cache.sqlite3" DEFAULT_CHECKOUT_DIRNAME = "checkout" DEFAULT_MANAGER = "default" _cache_managers: dict[str, CacheManager] = {} def register_cache_manager(manager: CacheManager) -> None: """Register a cache manager instance under a given name.""" if (name := manager.name) in _cache_managers: raise ValueError(f"Cache manager with name '{name}' is already registered.") _cache_managers[name] = manager def resolve_cache_manager(name: str | None = None) -> CacheManager: """Resolve a cache manager by name, or return the default if name is None.""" name = name or DEFAULT_MANAGER manager = _cache_managers.get(name) if manager is None: raise ValueError(f"No cache manager registered under name '{name}'.") return manager class CacheManager: """Manages cache directory layout and path resolution. Loading @@ -18,26 +39,38 @@ class CacheManager: Acts as the single source of truth for where files are stored. """ def __init__(self, root_path: Path | None = None) -> None: def __init__(self, root_path: Path | None = None, name: str = DEFAULT_MANAGER, ensure_paths: bool = True) -> None: """Initialize cache manager. Args: root_path: Explicit root path. If None, tries TDC_CACHE_DIR env var, then falls back to DEFAULT_CACHE_DIR. name: Optional name to register this manager under. If provided, the manager is registered upon initialization. ensure_paths: If True, will create the root directory if it doesn't exist. """ self.name = name if root_path: self.root = root_path else: env_path = os.getenv("TDC_CACHE_DIR") self.root = Path(env_path) if env_path else DEFAULT_CACHE_DIR if ensure_paths: self.ensure_paths() def register(self) -> Self: """Register this instance as a cache manager under the given name.""" register_cache_manager(self) return self @property def http_cache_path(self) -> Path: def http_cache_dir(self) -> Path: """Path to the HTTP client cache database.""" return self.root / DEFAULT_HTTP_CACHE_FILENAME @property def db_path(self) -> Path: def db_file(self) -> Path: """Path to the metadata SQLite database.""" return self.root / DEFAULT_DATABASE_FILENAME Loading