Loading .gitignore +2 −1 Original line number Diff line number Diff line Loading @@ -233,3 +233,4 @@ Thumbs.db *.swp .vscode/ .idea/ /scripts/cache .vscode/launch.json +157 −152 Original line number Diff line number Diff line Loading @@ -4,7 +4,13 @@ // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387 "version": "0.2.0", "configurations": [ { "name": "demo.py", "type": "debugpy", "request": "launch", "program": "${workspaceFolder}/scripts/demo.py", "console": "integratedTerminal", }, { "name": "Debug: crawl-meetings (max. 5)", "type": "debugpy", Loading Loading @@ -147,7 +153,6 @@ "args": [ "open", "S4-260001", ] } ] Loading src/tdoc_crawler/checkout.py +14 −17 Original line number Diff line number Diff line Loading @@ -227,37 +227,36 @@ def clear_checkout_specs(checkout_dir: Path) -> int: def checkout_specs( spec_numbers: list[str], checkout_dir: Path, database: TDocDatabase, release: str = "latest", doc_only: bool = False, cache_manager_name: str | None = None, ) -> list[Path]: """Checkout spec documents to the checkout directory. Args: spec_numbers: List of spec numbers to checkout checkout_dir: Base checkout directory database: TDocDatabase instance for metadata lookup release: Release version to checkout doc_only: If True, download only document files instead of full zip cache_manager_name: Optional cache manager name for HTTP caching Returns: List of paths to checked out specs """ sources = build_default_spec_sources() downloader = SpecDownloads(database) sources = build_default_spec_sources(cache_manager_name=cache_manager_name) downloader = SpecDownloads(database, cache_manager_name=cache_manager_name) return downloader.checkout_specs(spec_numbers, doc_only, checkout_dir, release, sources=sources) def build_default_spec_sources() -> list[SpecSource]: def build_default_spec_sources(cache_manager_name: str | None = None) -> list[SpecSource]: """Build the default list of spec sources. Returns: List of SpecSource instances for fetching spec metadata """ return [ cast("SpecSource", FunctionSpecSource("3gpp", fetch_threegpp_metadata)), cast("SpecSource", FunctionSpecSource("whatthespec", fetch_whatthespec_metadata)), cast("SpecSource", FunctionSpecSource("3gpp", fetch_threegpp_metadata, fetcher_kwargs={"cache_manager_name": cache_manager_name})), cast("SpecSource", FunctionSpecSource("whatthespec", fetch_whatthespec_metadata, fetcher_kwargs={"cache_manager_name": cache_manager_name})), ] Loading @@ -272,25 +271,23 @@ class CheckoutResult: def checkout_tdocs( results: list[TDocMetadata], checkout_dir: Path, force: bool = False, session: requests.Session | None = None, cache_manager_name: str | None = None, ) -> CheckoutResult: """Checkout multiple TDoc files to the checkout directory. Args: results: List of TDocMetadata to checkout checkout_dir: Base checkout directory force: If True, re-download even if already exists session: Optional requests.Session to reuse for downloads cache_manager_name: Optional cache manager name for HTTP caching Returns: CheckoutResult with success/error counts """ if not results: return CheckoutResult(success_count=0, error_count=0, errors=[]) checkout_dir.mkdir(parents=True, exist_ok=True) success_count = 0 error_count = 0 errors: list[str] = [] Loading @@ -310,16 +307,16 @@ def checkout_tdocs( def checkout_meeting_tdocs( meetings: list[MeetingMetadata], checkout_dir: Path, http_cache_path: Path, http_cache_dir: Path, session: requests.Session | None = None, cache_manager_name: str | None = None, ) -> CheckoutResult: """Checkout TDoc files from a list of meetings. Args: meetings: List of MeetingMetadata to checkout TDocs from checkout_dir: Base checkout directory http_cache_path: Path to HTTP cache database http_cache_dir: Path to HTTP cache database session: Optional requests.Session to reuse for downloads Returns: Loading @@ -336,7 +333,7 @@ def checkout_meeting_tdocs( errors.append(f"{meeting.short_name}: no files URL") continue try: tdocs = fetch_meeting_document_list(meeting.meeting_id, http_cache_path) tdocs = fetch_meeting_document_list(meeting.meeting_id, http_cache_dir) except DocumentListError as exc: errors.append(f"{meeting.short_name}: {exc}") continue Loading @@ -344,7 +341,7 @@ def checkout_meeting_tdocs( if metadata.tdoc_id not in unique: unique[metadata.tdoc_id] = metadata return checkout_tdocs(list(unique.values()), checkout_dir, force=False, session=session) return checkout_tdocs(list(unique.values()), force=False, session=session, cache_manager_name=cache_manager_name) __all__ = [ Loading src/tdoc_crawler/cli/helpers.py +1 −1 Original line number Diff line number Diff line Loading @@ -188,7 +188,7 @@ def resolve_meeting_id(database: TDocDatabase, meeting_name: str) -> int | None: """ # Query all meetings from database config = MeetingQueryConfig( cache_dir=database.db_path.parent, cache_dir=database.db_file.parent, working_groups=None, subgroups=None, limit=None, Loading src/tdoc_crawler/crawlers/meeting_doclist.py +3 −3 Original line number Diff line number Diff line Loading @@ -23,7 +23,7 @@ class DocumentListError(Exception): def fetch_meeting_document_list( meeting_id: int, cache_path: Path, cache_dir: Path, cache_ttl: int = 7200, cache_refresh_on_access: bool = True, timeout: int = 30, Loading @@ -32,7 +32,7 @@ def fetch_meeting_document_list( Args: meeting_id: 3GPP meeting identifier cache_path: Path to HTTP cache SQLite database cache_dir: Path to HTTP cache SQLite database cache_ttl: HTTP cache TTL in seconds cache_refresh_on_access: Whether to refresh cache TTL on access timeout: Request timeout in seconds Loading @@ -49,7 +49,7 @@ def fetch_meeting_document_list( # Create cached session (no credentials required) session = create_cached_session( cache_path=cache_path, cache_dir=cache_dir, ttl=cache_ttl, refresh_ttl_on_access=cache_refresh_on_access, max_retries=3, Loading Loading
.gitignore +2 −1 Original line number Diff line number Diff line Loading @@ -233,3 +233,4 @@ Thumbs.db *.swp .vscode/ .idea/ /scripts/cache
.vscode/launch.json +157 −152 Original line number Diff line number Diff line Loading @@ -4,7 +4,13 @@ // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387 "version": "0.2.0", "configurations": [ { "name": "demo.py", "type": "debugpy", "request": "launch", "program": "${workspaceFolder}/scripts/demo.py", "console": "integratedTerminal", }, { "name": "Debug: crawl-meetings (max. 5)", "type": "debugpy", Loading Loading @@ -147,7 +153,6 @@ "args": [ "open", "S4-260001", ] } ] Loading
src/tdoc_crawler/checkout.py +14 −17 Original line number Diff line number Diff line Loading @@ -227,37 +227,36 @@ def clear_checkout_specs(checkout_dir: Path) -> int: def checkout_specs( spec_numbers: list[str], checkout_dir: Path, database: TDocDatabase, release: str = "latest", doc_only: bool = False, cache_manager_name: str | None = None, ) -> list[Path]: """Checkout spec documents to the checkout directory. Args: spec_numbers: List of spec numbers to checkout checkout_dir: Base checkout directory database: TDocDatabase instance for metadata lookup release: Release version to checkout doc_only: If True, download only document files instead of full zip cache_manager_name: Optional cache manager name for HTTP caching Returns: List of paths to checked out specs """ sources = build_default_spec_sources() downloader = SpecDownloads(database) sources = build_default_spec_sources(cache_manager_name=cache_manager_name) downloader = SpecDownloads(database, cache_manager_name=cache_manager_name) return downloader.checkout_specs(spec_numbers, doc_only, checkout_dir, release, sources=sources) def build_default_spec_sources() -> list[SpecSource]: def build_default_spec_sources(cache_manager_name: str | None = None) -> list[SpecSource]: """Build the default list of spec sources. Returns: List of SpecSource instances for fetching spec metadata """ return [ cast("SpecSource", FunctionSpecSource("3gpp", fetch_threegpp_metadata)), cast("SpecSource", FunctionSpecSource("whatthespec", fetch_whatthespec_metadata)), cast("SpecSource", FunctionSpecSource("3gpp", fetch_threegpp_metadata, fetcher_kwargs={"cache_manager_name": cache_manager_name})), cast("SpecSource", FunctionSpecSource("whatthespec", fetch_whatthespec_metadata, fetcher_kwargs={"cache_manager_name": cache_manager_name})), ] Loading @@ -272,25 +271,23 @@ class CheckoutResult: def checkout_tdocs( results: list[TDocMetadata], checkout_dir: Path, force: bool = False, session: requests.Session | None = None, cache_manager_name: str | None = None, ) -> CheckoutResult: """Checkout multiple TDoc files to the checkout directory. Args: results: List of TDocMetadata to checkout checkout_dir: Base checkout directory force: If True, re-download even if already exists session: Optional requests.Session to reuse for downloads cache_manager_name: Optional cache manager name for HTTP caching Returns: CheckoutResult with success/error counts """ if not results: return CheckoutResult(success_count=0, error_count=0, errors=[]) checkout_dir.mkdir(parents=True, exist_ok=True) success_count = 0 error_count = 0 errors: list[str] = [] Loading @@ -310,16 +307,16 @@ def checkout_tdocs( def checkout_meeting_tdocs( meetings: list[MeetingMetadata], checkout_dir: Path, http_cache_path: Path, http_cache_dir: Path, session: requests.Session | None = None, cache_manager_name: str | None = None, ) -> CheckoutResult: """Checkout TDoc files from a list of meetings. Args: meetings: List of MeetingMetadata to checkout TDocs from checkout_dir: Base checkout directory http_cache_path: Path to HTTP cache database http_cache_dir: Path to HTTP cache database session: Optional requests.Session to reuse for downloads Returns: Loading @@ -336,7 +333,7 @@ def checkout_meeting_tdocs( errors.append(f"{meeting.short_name}: no files URL") continue try: tdocs = fetch_meeting_document_list(meeting.meeting_id, http_cache_path) tdocs = fetch_meeting_document_list(meeting.meeting_id, http_cache_dir) except DocumentListError as exc: errors.append(f"{meeting.short_name}: {exc}") continue Loading @@ -344,7 +341,7 @@ def checkout_meeting_tdocs( if metadata.tdoc_id not in unique: unique[metadata.tdoc_id] = metadata return checkout_tdocs(list(unique.values()), checkout_dir, force=False, session=session) return checkout_tdocs(list(unique.values()), force=False, session=session, cache_manager_name=cache_manager_name) __all__ = [ Loading
src/tdoc_crawler/cli/helpers.py +1 −1 Original line number Diff line number Diff line Loading @@ -188,7 +188,7 @@ def resolve_meeting_id(database: TDocDatabase, meeting_name: str) -> int | None: """ # Query all meetings from database config = MeetingQueryConfig( cache_dir=database.db_path.parent, cache_dir=database.db_file.parent, working_groups=None, subgroups=None, limit=None, Loading
src/tdoc_crawler/crawlers/meeting_doclist.py +3 −3 Original line number Diff line number Diff line Loading @@ -23,7 +23,7 @@ class DocumentListError(Exception): def fetch_meeting_document_list( meeting_id: int, cache_path: Path, cache_dir: Path, cache_ttl: int = 7200, cache_refresh_on_access: bool = True, timeout: int = 30, Loading @@ -32,7 +32,7 @@ def fetch_meeting_document_list( Args: meeting_id: 3GPP meeting identifier cache_path: Path to HTTP cache SQLite database cache_dir: Path to HTTP cache SQLite database cache_ttl: HTTP cache TTL in seconds cache_refresh_on_access: Whether to refresh cache TTL on access timeout: Request timeout in seconds Loading @@ -49,7 +49,7 @@ def fetch_meeting_document_list( # Create cached session (no credentials required) session = create_cached_session( cache_path=cache_path, cache_dir=cache_dir, ttl=cache_ttl, refresh_ttl_on_access=cache_refresh_on_access, max_retries=3, Loading