chore(vscode): add helpful debug launch configurations (f0426c04) · Commits · Jan Reimes / 3gpp-crawler

.gitignore

+2 −1

Original line number	Diff line number	Diff line
		@@ -233,3 +233,4 @@ Thumbs.db
		*.swp
		.vscode/
		.idea/
		/scripts/cache

.vscode/launch.json

+157 −152

Original line number	Diff line number	Diff line
		@@ -4,7 +4,13 @@
		// For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
		"version": "0.2.0",
		"configurations": [

		{
		"name": "demo.py",
		"type": "debugpy",
		"request": "launch",
		"program": "${workspaceFolder}/scripts/demo.py",
		"console": "integratedTerminal",
		},
		{
		"name": "Debug: crawl-meetings (max. 5)",
		"type": "debugpy",
		@@ -147,7 +153,6 @@
		"args": [
		"open",
		"S4-260001",

		]
		}
		]

src/tdoc_crawler/checkout.py

+14 −17

Original line number	Diff line number	Diff line
		@@ -227,37 +227,36 @@ def clear_checkout_specs(checkout_dir: Path) -> int:

		def checkout_specs(
		spec_numbers: list[str],
		checkout_dir: Path,
		database: TDocDatabase,
		release: str = "latest",
		doc_only: bool = False,
		cache_manager_name: str | None = None,
		) -> list[Path]:
		"""Checkout spec documents to the checkout directory.

		Args:
		spec_numbers: List of spec numbers to checkout
		checkout_dir: Base checkout directory
		database: TDocDatabase instance for metadata lookup
		release: Release version to checkout
		doc_only: If True, download only document files instead of full zip

		cache_manager_name: Optional cache manager name for HTTP caching
		Returns:
		List of paths to checked out specs
		"""
		sources = build_default_spec_sources()
		downloader = SpecDownloads(database)
		sources = build_default_spec_sources(cache_manager_name=cache_manager_name)
		downloader = SpecDownloads(database, cache_manager_name=cache_manager_name)
		return downloader.checkout_specs(spec_numbers, doc_only, checkout_dir, release, sources=sources)


		def build_default_spec_sources() -> list[SpecSource]:
		def build_default_spec_sources(cache_manager_name: str | None = None) -> list[SpecSource]:
		"""Build the default list of spec sources.

		Returns:
		List of SpecSource instances for fetching spec metadata
		"""
		return [
		cast("SpecSource", FunctionSpecSource("3gpp", fetch_threegpp_metadata)),
		cast("SpecSource", FunctionSpecSource("whatthespec", fetch_whatthespec_metadata)),
		cast("SpecSource", FunctionSpecSource("3gpp", fetch_threegpp_metadata, fetcher_kwargs={"cache_manager_name": cache_manager_name})),
		cast("SpecSource", FunctionSpecSource("whatthespec", fetch_whatthespec_metadata, fetcher_kwargs={"cache_manager_name": cache_manager_name})),
		]


		@@ -272,25 +271,23 @@ class CheckoutResult:

		def checkout_tdocs(
		results: list[TDocMetadata],
		checkout_dir: Path,
		force: bool = False,
		session: requests.Session | None = None,
		cache_manager_name: str | None = None,
		) -> CheckoutResult:
		"""Checkout multiple TDoc files to the checkout directory.

		Args:
		results: List of TDocMetadata to checkout
		checkout_dir: Base checkout directory
		force: If True, re-download even if already exists
		session: Optional requests.Session to reuse for downloads

		cache_manager_name: Optional cache manager name for HTTP caching
		Returns:
		CheckoutResult with success/error counts
		"""
		if not results:
		return CheckoutResult(success_count=0, error_count=0, errors=[])

		checkout_dir.mkdir(parents=True, exist_ok=True)
		success_count = 0
		error_count = 0
		errors: list[str] = []
		@@ -310,16 +307,16 @@ def checkout_tdocs(

		def checkout_meeting_tdocs(
		meetings: list[MeetingMetadata],
		checkout_dir: Path,
		http_cache_path: Path,
		http_cache_dir: Path,
		session: requests.Session | None = None,
		cache_manager_name: str | None = None,
		) -> CheckoutResult:
		"""Checkout TDoc files from a list of meetings.

		Args:
		meetings: List of MeetingMetadata to checkout TDocs from
		checkout_dir: Base checkout directory
		http_cache_path: Path to HTTP cache database
		http_cache_dir: Path to HTTP cache database
		session: Optional requests.Session to reuse for downloads

		Returns:
		@@ -336,7 +333,7 @@ def checkout_meeting_tdocs(
		errors.append(f"{meeting.short_name}: no files URL")
		continue
		try:
		tdocs = fetch_meeting_document_list(meeting.meeting_id, http_cache_path)
		tdocs = fetch_meeting_document_list(meeting.meeting_id, http_cache_dir)
		except DocumentListError as exc:
		errors.append(f"{meeting.short_name}: {exc}")
		continue
		@@ -344,7 +341,7 @@ def checkout_meeting_tdocs(
		if metadata.tdoc_id not in unique:
		unique[metadata.tdoc_id] = metadata

		return checkout_tdocs(list(unique.values()), checkout_dir, force=False, session=session)
		return checkout_tdocs(list(unique.values()), force=False, session=session, cache_manager_name=cache_manager_name)


		__all__ = [

src/tdoc_crawler/cli/helpers.py

+1 −1

Original line number	Diff line number	Diff line
		@@ -188,7 +188,7 @@ def resolve_meeting_id(database: TDocDatabase, meeting_name: str) -> int | None:
		"""
		# Query all meetings from database
		config = MeetingQueryConfig(
		cache_dir=database.db_path.parent,
		cache_dir=database.db_file.parent,
		working_groups=None,
		subgroups=None,
		limit=None,

src/tdoc_crawler/crawlers/meeting_doclist.py

+3 −3

Original line number	Diff line number	Diff line
		@@ -23,7 +23,7 @@ class DocumentListError(Exception):

		def fetch_meeting_document_list(
		meeting_id: int,
		cache_path: Path,
		cache_dir: Path,
		cache_ttl: int = 7200,
		cache_refresh_on_access: bool = True,
		timeout: int = 30,
		@@ -32,7 +32,7 @@ def fetch_meeting_document_list(

		Args:
		meeting_id: 3GPP meeting identifier
		cache_path: Path to HTTP cache SQLite database
		cache_dir: Path to HTTP cache SQLite database
		cache_ttl: HTTP cache TTL in seconds
		cache_refresh_on_access: Whether to refresh cache TTL on access
		timeout: Request timeout in seconds
		@@ -49,7 +49,7 @@ def fetch_meeting_document_list(

		# Create cached session (no credentials required)
		session = create_cached_session(
		cache_path=cache_path,
		cache_dir=cache_dir,
		ttl=cache_ttl,
		refresh_ttl_on_access=cache_refresh_on_access,
		max_retries=3,