Commit 6f74ac82 authored by Jan Reimes
Browse files

feat(cli): add new flags for full metadata and WhatTheSpec fetching

- Introduced `--full-metadata` flag to fetch complete TDoc metadata.
- Added `--use-whatthespec` flag to prioritize fetching via WhatTheSpec API.
- Updated `open_tdoc` and `checkout` functions to handle new flags.
- Enhanced `maybe_fetch_missing_tdocs` to support new fetching methods.
- Created a new module for centralized TDoc fetching logic.
- Added tests for new CLI flags functionality.
parent 85f35c77
Loading
Loading
Loading
Loading
+8 −2
Original line number Diff line number Diff line
@@ -29,6 +29,7 @@ from .args import (
    EolPasswordOption,
    EolUsernameOption,
    ForceOption,
    FullMetadataOption,
    IncludeWithoutFilesOption,
    IncrementalOption,
    LimitMeetingsOption,
@@ -47,6 +48,7 @@ from .args import (
    TDocIdArgument,
    TDocIdsArgument,
    TimeoutOption,
    UseWhatTheSpecOption,
    VerboseOption,
    WorkersOption,
    WorkingGroupOption,
@@ -385,6 +387,8 @@ def query_meetings(
def open_tdoc(
    tdoc_id: TDocIdArgument,
    cache_dir: CacheDirOption = DEFAULT_CACHE_DIR,
    full_metadata: FullMetadataOption = False,
    use_whatthespec: UseWhatTheSpecOption = False,
) -> None:
    """Download, extract, and open a TDoc file."""
    normalized_id = tdoc_id.strip().upper()
@@ -396,7 +400,7 @@ def open_tdoc(
    db_path = database_path(cache_dir)
    with TDocDatabase(db_path) as database:
        results = database.query_tdocs(config)
        results = maybe_fetch_missing_tdocs(database, cache_dir, config, results)
        results = maybe_fetch_missing_tdocs(database, cache_dir, config, results, full_metadata=full_metadata, use_whatthespec=use_whatthespec)
        if not results:
            console.print(f"[red]TDoc {normalized_id} not found[/red]")
            raise typer.Exit(code=1)
@@ -417,6 +421,8 @@ def checkout(
    tdoc_id: CheckoutTDocIdsArgument,
    cache_dir: CacheDirOption = DEFAULT_CACHE_DIR,
    force: ForceOption = False,
    full_metadata: FullMetadataOption = False,
    use_whatthespec: UseWhatTheSpecOption = False,
) -> None:
    """Download and extract TDoc(s) to checkout folder."""
    normalized_ids = [tid.strip().upper() for tid in tdoc_id]
@@ -428,7 +434,7 @@ def checkout(
    db_path = database_path(cache_dir)
    with TDocDatabase(db_path) as database:
        results = database.query_tdocs(config)
        results = maybe_fetch_missing_tdocs(database, cache_dir, config, results)
        results = maybe_fetch_missing_tdocs(database, cache_dir, config, results, full_metadata=full_metadata, use_whatthespec=use_whatthespec)

        # Check which TDocs were found
        found_ids = {r.tdoc_id for r in results}
+24 −0
Original line number Diff line number Diff line
@@ -26,6 +26,30 @@ MaxRetriesOption = Annotated[int, typer.Option("--max-retries", help="HTTP retry
TimeoutOption = Annotated[int, typer.Option("--timeout", help="HTTP timeout seconds")]
VerboseOption = Annotated[bool, typer.Option("--verbose", "-v", help="Enable verbose logging")]

TDocIdsArgument = Annotated[list[str] | None, typer.Argument(help="TDoc identifiers to query")]
OutputFormatOption = Annotated[str, typer.Option("--output", "-o", help="Output format")]

# New options for TDoc fetching.
# --full-metadata and --use-whatthespec select the fetch strategy for missing
# TDocs (see maybe_fetch_missing_tdocs): URL-only portal scrape (default),
# authenticated portal with full metadata, or the WhatTheSpec API.
FullMetadataOption = Annotated[bool, typer.Option("--full-metadata", help="Fetch full metadata instead of URL only")]
UseWhatTheSpecOption = Annotated[bool, typer.Option("--use-whatthespec", help="Use WhatTheSpec API for fetching")]
# Crawl scoping filters (repeatable list options; None means "no filter").
WorkingGroupOption = Annotated[list[str] | None, typer.Option("--working-group", "-w", help="Filter by working group")]
SubgroupOption = Annotated[list[str] | None, typer.Option("--sub-group", "-s", help="Filter by sub-working group")]
# Crawl mode / reset switches.
IncrementalOption = Annotated[bool, typer.Option("--incremental/--full", help="Toggle incremental mode")]
ClearTDocsOption = Annotated[bool, typer.Option("--clear-tdocs", help="Clear all TDocs before crawling")]
ClearDbOption = Annotated[bool, typer.Option("--clear-db", help="Clear all meetings and TDocs before crawling")]
# Crawl size limits (None means unlimited).
LimitTDocsOption = Annotated[int | None, typer.Option("--limit-tdocs", help="Limit number of TDocs")]
LimitMeetingsOption = Annotated[int | None, typer.Option("--limit-meetings", help="Limit meetings overall")]
LimitMeetingsPerWgOption = Annotated[int | None, typer.Option("--limit-meetings-per-wg", help="Limit meetings per working group")]
LimitWgsOption = Annotated[int | None, typer.Option("--limit-wgs", help="Limit number of working groups")]
# Concurrency / HTTP behavior.
WorkersOption = Annotated[int, typer.Option("--workers", help="Number of parallel subinterpreter workers")]
OverallTimeoutOption = Annotated[
    int | None,
    typer.Option("--overall-timeout", help="Maximum total crawl duration in seconds (None = unlimited)"),
]
MaxRetriesOption = Annotated[int, typer.Option("--max-retries", help="HTTP retry attempts")]
TimeoutOption = Annotated[int, typer.Option("--timeout", help="HTTP timeout seconds")]
VerboseOption = Annotated[bool, typer.Option("--verbose", "-v", help="Enable verbose logging")]

# Query-side arguments and output controls.
TDocIdsArgument = Annotated[list[str] | None, typer.Argument(help="TDoc identifiers to query")]
OutputFormatOption = Annotated[str, typer.Option("--output", "-o", help="Output format")]
LimitOption = Annotated[int | None, typer.Option("--limit", "-l", help="Maximum number of rows")]
+99 −5
Original line number Diff line number Diff line
@@ -19,6 +19,74 @@ console = get_console()
_logger = logging.getLogger(__name__)


def fetch_tdoc(
    tdoc_id: str,
    cache_dir: Path,
    http_cache: HttpCacheConfig,
    full_metadata: bool = False,
    use_whatthespec: bool = False,
    credentials: PortalCredentials | None = None,
    timeout: int = 30,
) -> TDocMetadata:
    """Fetch TDoc using the appropriate method based on flags.

    Dispatches to one of three strategies:

    * Method 3 (``use_whatthespec=True``): resolve via the WhatTheSpec API.
      Takes precedence over ``full_metadata``.
    * Method 2 (``full_metadata=True``): authenticated 3GPP portal fetch with
      complete metadata; requires ``credentials``.
    * Method 1 (default): unauthenticated portal scrape that yields only the
      download URL, wrapped in a minimal ``TDocMetadata``.

    Args:
        tdoc_id: TDoc identifier (e.g., "S4-260001").
        cache_dir: Directory for HTTP cache storage.
        http_cache: HTTP cache configuration.
        full_metadata: If True, fetch full metadata (requires credentials for portal method).
        use_whatthespec: If True, always use WhatTheSpec method regardless of full_metadata.
        credentials: Portal credentials (required for authenticated portal method).
        timeout: Request timeout in seconds.

    Returns:
        TDocMetadata with available information.

    Raises:
        ValueError: If ``full_metadata`` is requested without ``credentials``.
        Exception: If the underlying fetch fails for any reason.
    """
    # Import here to avoid circular imports
    from tdoc_crawler.crawlers import extract_tdoc_url_from_portal, resolve_via_whatthespec

    if use_whatthespec:
        # Always use WhatTheSpec method (Method 3)
        # Lazy %-style args so formatting is skipped unless DEBUG is enabled.
        _logger.debug("Fetching %s via WhatTheSpec API", tdoc_id)
        return resolve_via_whatthespec(tdoc_id, cache_dir, http_cache, timeout)

    if full_metadata:
        # Use authenticated portal method (Method 2)
        if credentials is None:
            raise ValueError("Portal credentials required for full metadata fetching")
        _logger.debug("Fetching %s via authenticated 3GPP portal", tdoc_id)
        return fetch_tdoc_metadata(tdoc_id, credentials, cache_dir, http_cache.ttl, http_cache.refresh_ttl_on_access, timeout)

    # Use unauthenticated portal method (Method 1) - URL only
    _logger.debug("Fetching %s via unauthenticated 3GPP portal", tdoc_id)
    # Extract URL and create minimal TDocMetadata; the scrape timeout is
    # capped at 15s so a large caller timeout does not stall this quick path.
    url = extract_tdoc_url_from_portal(tdoc_id, timeout=min(timeout, 15))
    return TDocMetadata(
        tdoc_id=tdoc_id,
        url=url,
        title="",
        meeting_id=0,
        source="",
        contact="",
        agenda_item_nbr=0,
        date=None,
        revision_of="",
        technical_committee="",
        working_group="",
        type="",
        status="",
        referenced_documents=[],
        filename="",
        size=0,
        validated=False,
        validation_failed=False,
    )


def fetch_missing_tdocs(
    database: TDocDatabase,
    cache_dir: Path,
@@ -139,15 +207,19 @@ def maybe_fetch_missing_tdocs(
    config: QueryConfig,
    results: list[TDocMetadata],
    credentials: PortalCredentials | None = None,
    full_metadata: bool = False,
    use_whatthespec: bool = False,
) -> list[TDocMetadata]:
    """Check for missing TDocs and fetch them if needed.
    """Fetch missing TDocs if any are requested but not found in database.

    Args:
        database: Database connection
        cache_dir: Cache directory path
        config: Query configuration
        results: Current query results
        credentials: Portal credentials (optional)
        cache_dir: Cache directory
        config: Query configuration with requested TDoc IDs
        results: Already found TDoc metadata
        credentials: Portal credentials (None if not available)
        full_metadata: If True, fetch full metadata instead of URL only
        use_whatthespec: If True, use WhatTheSpec API for fetching

    Returns:
        Updated list of TDocMetadata with newly fetched TDocs
@@ -160,6 +232,28 @@ def maybe_fetch_missing_tdocs(
    if not missing:
        return results

    # Handle use_whatthespec flag - always use WhatTheSpec method
    if use_whatthespec:
        console.print(f"[cyan]Fetching missing TDocs via WhatTheSpec: {', '.join(missing)}[/cyan]")
        _fetch_via_whatthespec(database, cache_dir, missing)
        refreshed = database.query_tdocs(config)
        return refreshed

    # Handle full_metadata flag - use authenticated portal method
    if full_metadata:
        if credentials is None:
            console.print("[red]Portal credentials required for full metadata fetching[/red]")
            return results
        console.print(f"[cyan]Fetching missing TDocs with full metadata: {', '.join(missing)}[/cyan]")
        fetch_result = fetch_missing_tdocs(database, cache_dir, missing, credentials)
        if fetch_result.errors:
            console.print(f"[yellow]{len(fetch_result.errors)} issues detected during targeted crawl[/yellow]")
            for error in fetch_result.errors[:3]:
                console.print(f"  - {error}")
        refreshed = database.query_tdocs(config)
        return refreshed

    # Default behavior - use unauthenticated portal method with WhatTheSpec fallback
    console.print(f"[cyan]Fetching missing TDocs: {', '.join(missing)}[/cyan]")
    fetch_result = fetch_missing_tdocs(database, cache_dir, missing, credentials)
    if fetch_result.errors:
+4 −0
Original line number Diff line number Diff line
@@ -12,6 +12,7 @@ __all__ = [
    "EXCLUDED_DIRS",
    "EXCLUDED_DIRS_NORMALIZED",
    "MEETING_CODE_REGISTRY",
    "TDOC_DOWNLOAD_URL",
    "TDOC_PATTERN",
    "TDOC_PATTERN_STR",
    "TDOC_SUBDIRS",
@@ -28,6 +29,7 @@ __all__ = [
    "TDocCrawler",
    "WhatTheSpecResolutionError",
    "convert_excel_row_to_tdoc_metadata",
    "extract_tdoc_url_from_portal",
    "fetch_meeting_document_list",
    "fetch_meeting_tdocs",
    "fetch_tdoc_metadata",
@@ -50,6 +52,7 @@ _ATTR_MODULES: dict[str, tuple[str, str]] = {
    "PortalAuthenticationError": ("tdoc_crawler.crawlers.portal", "PortalAuthenticationError"),
    "PortalParsingError": ("tdoc_crawler.crawlers.portal", "PortalParsingError"),
    "PortalSession": ("tdoc_crawler.crawlers.portal", "PortalSession"),
    "TDOC_DOWNLOAD_URL": ("tdoc_crawler.crawlers.constants", "TDOC_DOWNLOAD_URL"),
    "TDOC_PATTERN": ("tdoc_crawler.crawlers.constants", "TDOC_PATTERN"),
    "TDOC_PATTERN_STR": ("tdoc_crawler.crawlers.constants", "TDOC_PATTERN_STR"),
    "TDOC_SUBDIRS": ("tdoc_crawler.crawlers.constants", "TDOC_SUBDIRS"),
@@ -58,6 +61,7 @@ _ATTR_MODULES: dict[str, tuple[str, str]] = {
    "TDocCrawler": ("tdoc_crawler.crawlers.tdocs", "TDocCrawler"),
    "WhatTheSpecResolutionError": ("tdoc_crawler.crawlers.whatthespec", "WhatTheSpecResolutionError"),
    "convert_excel_row_to_tdoc_metadata": ("tdoc_crawler.crawlers.meeting_doclist", "convert_excel_row_to_tdoc_metadata"),
    "extract_tdoc_url_from_portal": ("tdoc_crawler.crawlers.portal", "extract_tdoc_url_from_portal"),
    "fetch_meeting_document_list": ("tdoc_crawler.crawlers.meeting_doclist", "fetch_meeting_document_list"),
    "fetch_meeting_tdocs": ("tdoc_crawler.crawlers.parallel", "fetch_meeting_tdocs"),
    "fetch_tdoc_metadata": ("tdoc_crawler.crawlers.portal", "fetch_tdoc_metadata"),
+1 −0
Original line number Diff line number Diff line
@@ -60,6 +60,7 @@ __all__ = [
    "MEETINGS_BASE_URL",
    "MEETING_CODE_REGISTRY",
    "PORTAL_BASE_URL",
    "TDOC_DOWNLOAD_URL",
    "TDOC_PATTERN",
    "TDOC_PATTERN_STR",
    "TDOC_SUBDIRS",
Loading