Commit 6f74ac82 authored by Jan Reimes
Browse files

feat(cli): add new flags for full metadata and WhatTheSpec fetching

- Introduced `--full-metadata` flag to fetch complete TDoc metadata.
- Added `--use-whatthespec` flag to prioritize fetching via WhatTheSpec API.
- Updated `open_tdoc` and `checkout` functions to handle new flags.
- Enhanced `maybe_fetch_missing_tdocs` to support new fetching methods.
- Created a new module for centralized TDoc fetching logic.
- Added tests for new CLI flags functionality.
parent 85f35c77
Loading
Loading
Loading
Loading
+8 −2
Original line number Diff line number Diff line
@@ -29,6 +29,7 @@ from .args import (
    EolPasswordOption,
    EolUsernameOption,
    ForceOption,
    FullMetadataOption,
    IncludeWithoutFilesOption,
    IncrementalOption,
    LimitMeetingsOption,
@@ -47,6 +48,7 @@ from .args import (
    TDocIdArgument,
    TDocIdsArgument,
    TimeoutOption,
    UseWhatTheSpecOption,
    VerboseOption,
    WorkersOption,
    WorkingGroupOption,
@@ -385,6 +387,8 @@ def query_meetings(
def open_tdoc(
    tdoc_id: TDocIdArgument,
    cache_dir: CacheDirOption = DEFAULT_CACHE_DIR,
    full_metadata: FullMetadataOption = False,
    use_whatthespec: UseWhatTheSpecOption = False,
) -> None:
    """Download, extract, and open a TDoc file."""
    normalized_id = tdoc_id.strip().upper()
@@ -396,7 +400,7 @@ def open_tdoc(
    db_path = database_path(cache_dir)
    with TDocDatabase(db_path) as database:
        results = database.query_tdocs(config)
        results = maybe_fetch_missing_tdocs(database, cache_dir, config, results)
        results = maybe_fetch_missing_tdocs(database, cache_dir, config, results, full_metadata=full_metadata, use_whatthespec=use_whatthespec)
        if not results:
            console.print(f"[red]TDoc {normalized_id} not found[/red]")
            raise typer.Exit(code=1)
@@ -417,6 +421,8 @@ def checkout(
    tdoc_id: CheckoutTDocIdsArgument,
    cache_dir: CacheDirOption = DEFAULT_CACHE_DIR,
    force: ForceOption = False,
    full_metadata: FullMetadataOption = False,
    use_whatthespec: UseWhatTheSpecOption = False,
) -> None:
    """Download and extract TDoc(s) to checkout folder."""
    normalized_ids = [tid.strip().upper() for tid in tdoc_id]
@@ -428,7 +434,7 @@ def checkout(
    db_path = database_path(cache_dir)
    with TDocDatabase(db_path) as database:
        results = database.query_tdocs(config)
        results = maybe_fetch_missing_tdocs(database, cache_dir, config, results)
        results = maybe_fetch_missing_tdocs(database, cache_dir, config, results, full_metadata=full_metadata, use_whatthespec=use_whatthespec)

        # Check which TDocs were found
        found_ids = {r.tdoc_id for r in results}
+24 −0
Original line number Diff line number Diff line
@@ -26,6 +26,30 @@ MaxRetriesOption = Annotated[int, typer.Option("--max-retries", help="HTTP retry
TimeoutOption = Annotated[int, typer.Option("--timeout", help="HTTP timeout seconds")]
VerboseOption = Annotated[bool, typer.Option("--verbose", "-v", help="Enable verbose logging")]

TDocIdsArgument = Annotated[list[str] | None, typer.Argument(help="TDoc identifiers to query")]
OutputFormatOption = Annotated[str, typer.Option("--output", "-o", help="Output format")]

# New options for TDoc fetching.
# --full-metadata and --use-whatthespec select the fetch strategy for missing
# TDocs (see maybe_fetch_missing_tdocs): URL-only portal scrape (default),
# authenticated portal with full metadata, or the WhatTheSpec API.
FullMetadataOption = Annotated[bool, typer.Option("--full-metadata", help="Fetch full metadata instead of URL only")]
UseWhatTheSpecOption = Annotated[bool, typer.Option("--use-whatthespec", help="Use WhatTheSpec API for fetching")]
# Crawl scoping filters (repeatable list options; None means "no filter").
WorkingGroupOption = Annotated[list[str] | None, typer.Option("--working-group", "-w", help="Filter by working group")]
SubgroupOption = Annotated[list[str] | None, typer.Option("--sub-group", "-s", help="Filter by sub-working group")]
# Crawl mode / reset switches.
IncrementalOption = Annotated[bool, typer.Option("--incremental/--full", help="Toggle incremental mode")]
ClearTDocsOption = Annotated[bool, typer.Option("--clear-tdocs", help="Clear all TDocs before crawling")]
ClearDbOption = Annotated[bool, typer.Option("--clear-db", help="Clear all meetings and TDocs before crawling")]
# Crawl size limits (None means unlimited).
LimitTDocsOption = Annotated[int | None, typer.Option("--limit-tdocs", help="Limit number of TDocs")]
LimitMeetingsOption = Annotated[int | None, typer.Option("--limit-meetings", help="Limit meetings overall")]
LimitMeetingsPerWgOption = Annotated[int | None, typer.Option("--limit-meetings-per-wg", help="Limit meetings per working group")]
LimitWgsOption = Annotated[int | None, typer.Option("--limit-wgs", help="Limit number of working groups")]
# Concurrency / HTTP behavior.
WorkersOption = Annotated[int, typer.Option("--workers", help="Number of parallel subinterpreter workers")]
OverallTimeoutOption = Annotated[
    int | None,
    typer.Option("--overall-timeout", help="Maximum total crawl duration in seconds (None = unlimited)"),
]
MaxRetriesOption = Annotated[int, typer.Option("--max-retries", help="HTTP retry attempts")]
TimeoutOption = Annotated[int, typer.Option("--timeout", help="HTTP timeout seconds")]
VerboseOption = Annotated[bool, typer.Option("--verbose", "-v", help="Enable verbose logging")]

# Query-side arguments and output controls.
TDocIdsArgument = Annotated[list[str] | None, typer.Argument(help="TDoc identifiers to query")]
OutputFormatOption = Annotated[str, typer.Option("--output", "-o", help="Output format")]
LimitOption = Annotated[int | None, typer.Option("--limit", "-l", help="Maximum number of rows")]
+99 −5
Original line number Diff line number Diff line
@@ -19,6 +19,74 @@ console = get_console()
_logger = logging.getLogger(__name__)


def fetch_tdoc(
    tdoc_id: str,
    cache_dir: Path,
    http_cache: HttpCacheConfig,
    full_metadata: bool = False,
    use_whatthespec: bool = False,
    credentials: PortalCredentials | None = None,
    timeout: int = 30,
) -> TDocMetadata:
    """Fetch TDoc using the appropriate method based on flags.

    Dispatches to one of three strategies:

    * Method 3 (``use_whatthespec=True``): resolve via the WhatTheSpec API.
      Takes precedence over ``full_metadata``.
    * Method 2 (``full_metadata=True``): authenticated 3GPP portal fetch with
      complete metadata; requires ``credentials``.
    * Method 1 (default): unauthenticated portal scrape that yields only the
      download URL, wrapped in a minimal ``TDocMetadata``.

    Args:
        tdoc_id: TDoc identifier (e.g., "S4-260001").
        cache_dir: Directory for HTTP cache storage.
        http_cache: HTTP cache configuration.
        full_metadata: If True, fetch full metadata (requires credentials for portal method).
        use_whatthespec: If True, always use WhatTheSpec method regardless of full_metadata.
        credentials: Portal credentials (required for authenticated portal method).
        timeout: Request timeout in seconds.

    Returns:
        TDocMetadata with available information.

    Raises:
        ValueError: If ``full_metadata`` is requested without ``credentials``.
        Exception: If the underlying fetch fails for any reason.
    """
    # Import here to avoid circular imports
    from tdoc_crawler.crawlers import extract_tdoc_url_from_portal, resolve_via_whatthespec

    if use_whatthespec:
        # Always use WhatTheSpec method (Method 3)
        # Lazy %-style args so formatting is skipped unless DEBUG is enabled.
        _logger.debug("Fetching %s via WhatTheSpec API", tdoc_id)
        return resolve_via_whatthespec(tdoc_id, cache_dir, http_cache, timeout)

    if full_metadata:
        # Use authenticated portal method (Method 2)
        if credentials is None:
            raise ValueError("Portal credentials required for full metadata fetching")
        _logger.debug("Fetching %s via authenticated 3GPP portal", tdoc_id)
        return fetch_tdoc_metadata(tdoc_id, credentials, cache_dir, http_cache.ttl, http_cache.refresh_ttl_on_access, timeout)

    # Use unauthenticated portal method (Method 1) - URL only
    _logger.debug("Fetching %s via unauthenticated 3GPP portal", tdoc_id)
    # Extract URL and create minimal TDocMetadata; the scrape timeout is
    # capped at 15s so a large caller timeout does not stall this quick path.
    url = extract_tdoc_url_from_portal(tdoc_id, timeout=min(timeout, 15))
    return TDocMetadata(
        tdoc_id=tdoc_id,
        url=url,
        title="",
        meeting_id=0,
        source="",
        contact="",
        agenda_item_nbr=0,
        date=None,
        revision_of="",
        technical_committee="",
        working_group="",
        type="",
        status="",
        referenced_documents=[],
        filename="",
        size=0,
        validated=False,
        validation_failed=False,
    )


def fetch_missing_tdocs(
    database: TDocDatabase,
    cache_dir: Path,
@@ -139,15 +207,19 @@ def maybe_fetch_missing_tdocs(
    config: QueryConfig,
    results: list[TDocMetadata],
    credentials: PortalCredentials | None = None,
    full_metadata: bool = False,
    use_whatthespec: bool = False,
) -> list[TDocMetadata]:
    """Check for missing TDocs and fetch them if needed.
    """Fetch missing TDocs if any are requested but not found in database.

    Args:
        database: Database connection
        cache_dir: Cache directory path
        config: Query configuration
        results: Current query results
        credentials: Portal credentials (optional)
        cache_dir: Cache directory
        config: Query configuration with requested TDoc IDs
        results: Already found TDoc metadata
        credentials: Portal credentials (None if not available)
        full_metadata: If True, fetch full metadata instead of URL only
        use_whatthespec: If True, use WhatTheSpec API for fetching

    Returns:
        Updated list of TDocMetadata with newly fetched TDocs
@@ -160,6 +232,28 @@ def maybe_fetch_missing_tdocs(
    if not missing:
        return results

    # Handle use_whatthespec flag - always use WhatTheSpec method
    if use_whatthespec:
        console.print(f"[cyan]Fetching missing TDocs via WhatTheSpec: {', '.join(missing)}[/cyan]")
        _fetch_via_whatthespec(database, cache_dir, missing)
        refreshed = database.query_tdocs(config)
        return refreshed

    # Handle full_metadata flag - use authenticated portal method
    if full_metadata:
        if credentials is None:
            console.print("[red]Portal credentials required for full metadata fetching[/red]")
            return results
        console.print(f"[cyan]Fetching missing TDocs with full metadata: {', '.join(missing)}[/cyan]")
        fetch_result = fetch_missing_tdocs(database, cache_dir, missing, credentials)
        if fetch_result.errors:
            console.print(f"[yellow]{len(fetch_result.errors)} issues detected during targeted crawl[/yellow]")
            for error in fetch_result.errors[:3]:
                console.print(f"  - {error}")
        refreshed = database.query_tdocs(config)
        return refreshed

    # Default behavior - use unauthenticated portal method with WhatTheSpec fallback
    console.print(f"[cyan]Fetching missing TDocs: {', '.join(missing)}[/cyan]")
    fetch_result = fetch_missing_tdocs(database, cache_dir, missing, credentials)
    if fetch_result.errors:
+4 −0
Original line number Diff line number Diff line
@@ -12,6 +12,7 @@ __all__ = [
    "EXCLUDED_DIRS",
    "EXCLUDED_DIRS_NORMALIZED",
    "MEETING_CODE_REGISTRY",
    "TDOC_DOWNLOAD_URL",
    "TDOC_PATTERN",
    "TDOC_PATTERN_STR",
    "TDOC_SUBDIRS",
@@ -28,6 +29,7 @@ __all__ = [
    "TDocCrawler",
    "WhatTheSpecResolutionError",
    "convert_excel_row_to_tdoc_metadata",
    "extract_tdoc_url_from_portal",
    "fetch_meeting_document_list",
    "fetch_meeting_tdocs",
    "fetch_tdoc_metadata",
@@ -50,6 +52,7 @@ _ATTR_MODULES: dict[str, tuple[str, str]] = {
    "PortalAuthenticationError": ("tdoc_crawler.crawlers.portal", "PortalAuthenticationError"),
    "PortalParsingError": ("tdoc_crawler.crawlers.portal", "PortalParsingError"),
    "PortalSession": ("tdoc_crawler.crawlers.portal", "PortalSession"),
    "TDOC_DOWNLOAD_URL": ("tdoc_crawler.crawlers.constants", "TDOC_DOWNLOAD_URL"),
    "TDOC_PATTERN": ("tdoc_crawler.crawlers.constants", "TDOC_PATTERN"),
    "TDOC_PATTERN_STR": ("tdoc_crawler.crawlers.constants", "TDOC_PATTERN_STR"),
    "TDOC_SUBDIRS": ("tdoc_crawler.crawlers.constants", "TDOC_SUBDIRS"),
@@ -58,6 +61,7 @@ _ATTR_MODULES: dict[str, tuple[str, str]] = {
    "TDocCrawler": ("tdoc_crawler.crawlers.tdocs", "TDocCrawler"),
    "WhatTheSpecResolutionError": ("tdoc_crawler.crawlers.whatthespec", "WhatTheSpecResolutionError"),
    "convert_excel_row_to_tdoc_metadata": ("tdoc_crawler.crawlers.meeting_doclist", "convert_excel_row_to_tdoc_metadata"),
    "extract_tdoc_url_from_portal": ("tdoc_crawler.crawlers.portal", "extract_tdoc_url_from_portal"),
    "fetch_meeting_document_list": ("tdoc_crawler.crawlers.meeting_doclist", "fetch_meeting_document_list"),
    "fetch_meeting_tdocs": ("tdoc_crawler.crawlers.parallel", "fetch_meeting_tdocs"),
    "fetch_tdoc_metadata": ("tdoc_crawler.crawlers.portal", "fetch_tdoc_metadata"),
+1 −0
Original line number Diff line number Diff line
@@ -60,6 +60,7 @@ __all__ = [
    "MEETINGS_BASE_URL",
    "MEETING_CODE_REGISTRY",
    "PORTAL_BASE_URL",
    "TDOC_DOWNLOAD_URL",
    "TDOC_PATTERN",
    "TDOC_PATTERN_STR",
    "TDOC_SUBDIRS",
Loading