Commit 07f7320b authored by Jan Reimes's avatar Jan Reimes
Browse files

feat(fetching): enhance TDoc fetching with batch processing and error handling

* Introduce `fetch_missing_tdocs_batch` for batch fetching of missing TDocs.
* Implement `fetch_via_whatthespec_batch` for batch fetching using WhatTheSpec API.
* Update `fetch_tdoc` to support new fetching methods and improve error handling.
* Refactor tests to accommodate new batch fetching methods and ensure coverage.
* Modify existing tests to use the new fetching functions and improve clarity.
* Ensure proper handling of validation errors and logging throughout the fetching process.
parent c54c5397
Loading
Loading
Loading
Loading
+17 −13
Original line number Diff line number Diff line
@@ -58,7 +58,6 @@ from tdoc_crawler.cli.args import (
    WorkingGroupOption,
)
from tdoc_crawler.cli.console import get_console
from tdoc_crawler.cli.fetching import maybe_fetch_missing_tdocs
from tdoc_crawler.cli.helpers import build_limits, collect_spec_numbers, launch_file, parse_subgroups, parse_working_groups
from tdoc_crawler.cli.printing import (
    meeting_to_dict,
@@ -75,6 +74,7 @@ from tdoc_crawler.crawlers import MeetingCrawler, TDocCrawler
from tdoc_crawler.crawlers.meeting_doclist import DocumentListError, fetch_meeting_document_list
from tdoc_crawler.credentials import set_credentials
from tdoc_crawler.database import TDocDatabase, database_path
from tdoc_crawler.fetching import fetch_missing_tdocs
from tdoc_crawler.models import MeetingCrawlConfig, MeetingMetadata, MeetingQueryConfig, OutputFormat, QueryConfig, SortOrder, TDocCrawlConfig, TDocMetadata
from tdoc_crawler.models.specs import SpecQueryFilters
from tdoc_crawler.specs import SpecCatalog
@@ -523,7 +523,12 @@ def query_tdocs(

        results = database.query_tdocs(config)
        if not no_fetch:
            results = maybe_fetch_missing_tdocs(database, config.cache_dir, config, results, None)
            result = fetch_missing_tdocs(database, config.cache_dir, config, results, None)
            if result.fetch_result and result.fetch_result.errors:
                console.print(f"[yellow]{len(result.fetch_result.errors)} issues detected during targeted crawl[/yellow]")
                for error in result.fetch_result.errors[:3]:
                    console.print(f"  - {error}")
            results = result.refreshed

    if not results:
        console.print("[yellow]No TDocs found[/yellow]")
@@ -695,7 +700,10 @@ def open_tdoc(
    db_path = database_path(cache_dir)
    with TDocDatabase(db_path) as database:
        results = database.query_tdocs(config)
        results = maybe_fetch_missing_tdocs(database, cache_dir, config, results, full_metadata=full_metadata, use_whatthespec=use_whatthespec)
        result = fetch_missing_tdocs(database, cache_dir, config, results, full_metadata=full_metadata, use_whatthespec=use_whatthespec)
        if result.fetch_result and result.fetch_result.errors:
            console.print(f"[yellow]{len(result.fetch_result.errors)} issues detected during targeted crawl[/yellow]")
        results = result.refreshed
        if not results:
            console.print(f"[red]TDoc {normalized_id} not found[/red]")
            raise typer.Exit(code=1)
@@ -729,14 +737,10 @@ def checkout(
    db_path = database_path(cache_dir)
    with TDocDatabase(db_path) as database:
        results = database.query_tdocs(config)
        results = maybe_fetch_missing_tdocs(database, cache_dir, config, results, full_metadata=full_metadata, use_whatthespec=use_whatthespec)

        # Check which TDocs were found
        found_ids = {r.tdoc_id for r in results}
        missing_ids = set(normalized_ids) - found_ids

        if missing_ids:
            console.print(f"[red]TDoc(s) not found: {', '.join(sorted(missing_ids))}[/red]")
        result = fetch_missing_tdocs(database, cache_dir, config, results, full_metadata=full_metadata, use_whatthespec=use_whatthespec)
        if result.fetch_result and result.fetch_result.errors:
            console.print(f"[yellow]{len(result.fetch_result.errors)} issues detected during targeted crawl[/yellow]")
        results = result.refreshed
        if not results:
            raise typer.Exit(code=1)

src/tdoc_crawler/cli/fetching.py

deleted (mode 100644 → 0)
+0 −217
Original line number Diff line number Diff line
"""Functions for fetching missing TDocs from the portal."""

from __future__ import annotations

import logging
from pathlib import Path

from pydantic import ValidationError

from tdoc_crawler.cli.console import get_console
from tdoc_crawler.cli.helpers import resolve_meeting_id
from tdoc_crawler.crawlers import TDocCrawlResult, WhatTheSpecResolutionError, create_portal_client, resolve_via_whatthespec
from tdoc_crawler.credentials import resolve_credentials
from tdoc_crawler.database import TDocDatabase
from tdoc_crawler.models import HttpCacheConfig, PortalCredentials, QueryConfig, TDocMetadata

console = get_console()
_logger = logging.getLogger(__name__)


# TODO: these functions do not belong in the CLI layer, they should be refactored into a separate service layer that can be called from both CLI and API contexts. The CLI should only handle user interaction and delegate fetching logic to a service module. This would improve separation of concerns and make the fetching logic reusable across different interfaces.
# TODO: the fetching logic is currently duplicated between the portal method and the whatthespec method. We should refactor to have a single fetching function that can use different data sources based on configuration, rather than having separate functions with similar logic. This would reduce code duplication and make it easier to maintain.
def fetch_missing_tdocs(
    database: TDocDatabase,
    cache_dir: Path,
    missing_ids: list[str],
    credentials: PortalCredentials | None = None,
) -> TDocCrawlResult:
    """Fetch missing TDocs using portal authentication.

    Args:
        database: Database connection
        cache_dir: Cache directory path
        missing_ids: List of TDoc IDs to fetch
        credentials: Portal credentials (optional)

    Returns:
        TDocCrawlResult with inserted/updated counts and errors
    """
    # Fall back to environment-based credential resolution only when the
    # caller did not supply credentials explicitly.
    creds = credentials if credentials is not None else resolve_credentials(None, None)

    # Without credentials the portal fetch cannot proceed; record the problem
    # so the caller may attempt the whatthespec fallback instead.
    if not creds:
        _logger.info("Portal credentials not available, skipping portal authentication fetch")
        return TDocCrawlResult(
            processed=len(missing_ids),
            inserted=0,
            updated=0,
            errors=["Portal credentials required for targeted fetch. Set EOL_USERNAME and EOL_PASSWORD."],
        )

    problems: list[str] = []
    n_inserted = 0
    n_updated = 0
    client = create_portal_client(credentials=creds, cache_dir=cache_dir)

    for doc_id in missing_ids:
        try:
            # Pull metadata for this TDoc from the authenticated portal.
            meta = client.fetch_tdoc_metadata(doc_id)

            # Best-effort: map the textual meeting name onto a meeting_id.
            if meta.meeting_name:
                resolved = resolve_meeting_id(database, meta.meeting_name)
                if resolved:
                    meta.meeting_id = resolved
                else:
                    _logger.warning(f"Could not resolve meeting '{meta.meeting_name}' to meeting_id for {doc_id}")

            was_inserted, was_updated = database.upsert_tdoc(meta)
            if was_inserted:
                n_inserted += 1
            elif was_updated:
                n_updated += 1

            _logger.info(f"Successfully fetched and stored {doc_id}")

        except ValidationError as exc:
            # Condense pydantic errors to "field: message" pairs rather than
            # surfacing a full stack trace to the user.
            details = "; ".join(
                f"{err['loc'][0] if err['loc'] else 'unknown'}: {err['msg']}" for err in exc.errors()
            )
            msg = f"Failed to fetch {doc_id}: {details}"
            _logger.warning(msg)
            problems.append(msg)
        except Exception as exc:
            msg = f"Failed to fetch {doc_id}: {exc}"
            _logger.error(msg)
            problems.append(msg)

    return TDocCrawlResult(
        processed=len(missing_ids),
        inserted=n_inserted,
        updated=n_updated,
        errors=problems,
    )


def _fetch_via_whatthespec(
    database: TDocDatabase,
    cache_dir: Path,
    missing_ids: list[str],
) -> None:
    """Fetch missing TDocs using whatthespec.net fallback.

    Best-effort: every failure is logged and never raised to the caller.

    Args:
        database: Database connection
        cache_dir: Cache directory path
        missing_ids: List of TDoc IDs to fetch
    """
    cache_config = HttpCacheConfig()

    for doc_id in missing_ids:
        try:
            meta = resolve_via_whatthespec(doc_id, cache_dir, cache_config)
            if not meta:
                _logger.warning(f"WhatTheSpec returned no data for {doc_id}")
                continue
            was_inserted, was_updated = database.upsert_tdoc(meta)
            if was_inserted or was_updated:
                _logger.info(f"Successfully fetched {doc_id} via whatthespec")
        except WhatTheSpecResolutionError as exc:
            _logger.warning(f"WhatTheSpec resolution failed for {doc_id}: {exc}")
        except Exception as exc:
            _logger.exception(f"Unexpected error fetching {doc_id} via whatthespec: {exc}")


def maybe_fetch_missing_tdocs(
    database: TDocDatabase,
    cache_dir: Path,
    config: QueryConfig,
    results: list[TDocMetadata],
    credentials: PortalCredentials | None = None,
    full_metadata: bool = False,
    use_whatthespec: bool = False,
) -> list[TDocMetadata]:
    """Fetch missing TDocs if any are requested but not found in database.

    Args:
        database: Database connection
        cache_dir: Cache directory
        config: Query configuration with requested TDoc IDs
        results: Already found TDoc metadata
        credentials: Portal credentials (None if not available)
        full_metadata: If True, fetch full metadata instead of URL only
        use_whatthespec: If True, use WhatTheSpec API for fetching

    Returns:
        Updated list of TDocMetadata with newly fetched TDocs
    """
    if not config.tdoc_ids:
        return results

    wanted = [tdoc_id.upper() for tdoc_id in config.tdoc_ids]
    present = {item.tdoc_id for item in results}
    to_fetch = [tdoc_id for tdoc_id in wanted if tdoc_id not in present]
    if not to_fetch:
        return results

    def report_issues(outcome: TDocCrawlResult) -> None:
        # Summarize crawl errors on the console, capped at three entries.
        if outcome.errors:
            console.print(f"[yellow]{len(outcome.errors)} issues detected during targeted crawl[/yellow]")
            for error in outcome.errors[:3]:
                console.print(f"  - {error}")

    # Explicit WhatTheSpec mode: skip the portal entirely.
    if use_whatthespec:
        console.print(f"[cyan]Fetching missing TDocs via WhatTheSpec: {', '.join(to_fetch)}[/cyan]")
        _fetch_via_whatthespec(database, cache_dir, to_fetch)
        return database.query_tdocs(config)

    # Full-metadata mode requires authenticated portal access.
    if full_metadata:
        if credentials is None:
            console.print("[red]Portal credentials required for full metadata fetching[/red]")
            return results
        console.print(f"[cyan]Fetching missing TDocs with full metadata: {', '.join(to_fetch)}[/cyan]")
        report_issues(fetch_missing_tdocs(database, cache_dir, to_fetch, credentials))
        return database.query_tdocs(config)

    # Default: unauthenticated portal fetch with a WhatTheSpec fallback.
    console.print(f"[cyan]Fetching missing TDocs: {', '.join(to_fetch)}[/cyan]")
    outcome = fetch_missing_tdocs(database, cache_dir, to_fetch, credentials)
    report_issues(outcome)

    # Only fall back to whatthespec when the portal step failed purely for
    # lack of credentials.
    lacks_credentials = credentials is None and any(
        "credentials required" in str(error).lower() for error in outcome.errors
    )
    if lacks_credentials:
        interim = database.query_tdocs(config)
        interim_ids = {item.tdoc_id for item in interim}
        leftover = [tdoc_id for tdoc_id in wanted if tdoc_id not in interim_ids]
        if leftover:
            console.print(f"[cyan]Trying whatthespec fallback for {len(leftover)} TDoc(s)[/cyan]")
            _fetch_via_whatthespec(database, cache_dir, leftover)

    refreshed = database.query_tdocs(config)
    final_ids = {item.tdoc_id for item in refreshed}
    unresolved = [tdoc_id for tdoc_id in wanted if tdoc_id not in final_ids]
    if unresolved:
        console.print(f"[yellow]Still missing: {', '.join(unresolved)}[/yellow]")
    else:
        console.print(
            f"[green]Added {outcome.inserted} and updated {outcome.updated} TDocs[/green]",
        )
    return refreshed
+245 −6
Original line number Diff line number Diff line
@@ -11,12 +11,18 @@ The central fetch_tdoc() function selects the appropriate method based on flags.
from __future__ import annotations

import logging
from decimal import Decimal
from pathlib import Path

from pydantic import ValidationError

from tdoc_crawler.crawlers.portal import create_portal_client
from tdoc_crawler.crawlers.whatthespec import resolve_via_whatthespec
from tdoc_crawler.crawlers.tdocs import TDocCrawlResult
from tdoc_crawler.crawlers.whatthespec import WhatTheSpecResolutionError, resolve_via_whatthespec
from tdoc_crawler.credentials import resolve_credentials
from tdoc_crawler.database import TDocDatabase, resolve_meeting_id
from tdoc_crawler.models.base import HttpCacheConfig, PortalCredentials
from tdoc_crawler.models.tdocs import TDocMetadata
from tdoc_crawler.models.tdocs import QueryConfig, TDocMetadata

logger = logging.getLogger(__name__)

@@ -48,7 +54,6 @@ def fetch_tdoc(
        Exception: If fetching fails for any reason.
    """
    if use_whatthespec:
        # Always use WhatTheSpec method (Method 3)
        logger.debug(f"Fetching {tdoc_id} via WhatTheSpec API")
        metadata = resolve_via_whatthespec(tdoc_id, cache_dir, http_cache, timeout)
        if metadata is None:
@@ -56,7 +61,6 @@ def fetch_tdoc(
        return metadata

    elif full_metadata:
        # Use authenticated portal method (Method 2)
        if credentials is None:
            raise ValueError("Portal credentials required for full metadata fetching")
        logger.debug(f"Fetching {tdoc_id} via authenticated 3GPP portal")
@@ -70,10 +74,10 @@ def fetch_tdoc(
        return client.fetch_tdoc_metadata(tdoc_id)

    else:
        # Use unauthenticated portal method (Method 1) - URL only
        logger.debug(f"Fetching {tdoc_id} via unauthenticated 3GPP portal")
        client = create_portal_client(cache_dir=cache_dir, timeout=min(timeout, 15))
        url = client.extract_tdoc_url(tdoc_id)

        return TDocMetadata(
            tdoc_id=tdoc_id.upper(),
            meeting_id=0,
@@ -83,8 +87,243 @@ def fetch_tdoc(
            contact="",
            tdoc_type="unknown",
            for_purpose="unknown",
            agenda_item_nbr=0,
            agenda_item_nbr=Decimal("0"),
            agenda_item_text="Unknown",
            status="",
            meeting_name=None,
            is_revision_of=None,
            file_size=None,
            date_created=None,
            validated=False,
            validation_failed=False,
        )


def fetch_missing_tdocs_batch(
    database: TDocDatabase,
    cache_dir: Path,
    missing_ids: list[str],
    credentials: PortalCredentials | None = None,
    http_cache: HttpCacheConfig | None = None,
    timeout: int = 30,
) -> TDocCrawlResult:
    """Fetch multiple missing TDocs using portal authentication.

    Args:
        database: Database connection used for meeting lookup and upserts.
        cache_dir: Cache directory path for the portal client.
        missing_ids: TDoc IDs to fetch.
        credentials: Portal credentials; resolved from the environment if None.
        http_cache: Optional HTTP cache settings forwarded to the client.
        timeout: Per-request timeout in seconds.

    Returns:
        TDocCrawlResult with processed/inserted/updated counts and errors.
    """
    # Resolve credentials from the environment only when none were supplied.
    creds = resolve_credentials(None, None) if credentials is None else credentials

    # No credentials means the authenticated portal path cannot run.
    if not creds:
        logger.info("Portal credentials not available, skipping portal authentication fetch")
        return TDocCrawlResult(
            processed=len(missing_ids),
            inserted=0,
            updated=0,
            errors=["Portal credentials required for targeted fetch. Set EOL_USERNAME and EOL_PASSWORD."],
        )

    n_inserted = 0
    n_updated = 0
    problems: list[str] = []

    client = create_portal_client(
        credentials=creds,
        cache_dir=cache_dir,
        cache_ttl=http_cache.ttl if http_cache else 3600,
        cache_refresh_on_access=http_cache.refresh_ttl_on_access if http_cache else False,
        timeout=timeout,
    )

    for doc_id in missing_ids:
        try:
            meta = client.fetch_tdoc_metadata(doc_id)

            # Best-effort: map the textual meeting name onto a meeting_id.
            if meta.meeting_name:
                resolved = resolve_meeting_id(database, meta.meeting_name)
                if resolved:
                    meta.meeting_id = resolved
                else:
                    logger.warning(f"Could not resolve meeting '{meta.meeting_name}' to meeting_id for {doc_id}")

            was_inserted, was_updated = database.upsert_tdoc(meta)
            if was_inserted:
                n_inserted += 1
            elif was_updated:
                n_updated += 1

            logger.info(f"Successfully fetched and stored {doc_id}")

        except ValidationError as exc:
            # Condense pydantic errors to "field: message" pairs rather than
            # a full traceback.
            details = "; ".join(
                f"{err['loc'][0] if err['loc'] else 'unknown'}: {err['msg']}" for err in exc.errors()
            )
            msg = f"Failed to fetch {doc_id}: {details}"
            logger.warning(msg)
            problems.append(msg)
        except Exception as exc:
            msg = f"Failed to fetch {doc_id}: {exc}"
            logger.error(msg)
            problems.append(msg)

    return TDocCrawlResult(
        processed=len(missing_ids),
        inserted=n_inserted,
        updated=n_updated,
        errors=problems,
    )


def fetch_via_whatthespec_batch(
    database: TDocDatabase,
    cache_dir: Path,
    missing_ids: list[str],
    http_cache: HttpCacheConfig | None = None,
) -> TDocCrawlResult:
    """Fetch multiple missing TDocs using WhatTheSpec.net API."""
    # Default cache configuration when none is provided by the caller.
    cache_config = http_cache if http_cache is not None else HttpCacheConfig()

    n_inserted = 0
    n_updated = 0
    problems: list[str] = []

    for doc_id in missing_ids:
        try:
            meta = resolve_via_whatthespec(doc_id, cache_dir, cache_config)
            if not meta:
                msg = f"WhatTheSpec returned no data for {doc_id}"
                logger.warning(msg)
                problems.append(msg)
                continue
            was_inserted, was_updated = database.upsert_tdoc(meta)
            if was_inserted:
                n_inserted += 1
            elif was_updated:
                n_updated += 1
            logger.info(f"Successfully fetched {doc_id} via whatthespec")
        except WhatTheSpecResolutionError as exc:
            msg = f"WhatTheSpec resolution failed for {doc_id}: {exc}"
            logger.warning(msg)
            problems.append(msg)
        except Exception as exc:
            msg = f"Unexpected error fetching {doc_id} via whatthespec: {exc}"
            logger.exception(msg)
            problems.append(msg)

    return TDocCrawlResult(
        processed=len(missing_ids),
        inserted=n_inserted,
        updated=n_updated,
        errors=problems,
    )


class FetchMissingResult:
    """Result from fetch_missing_tdocs orchestration.

    Attributes:
        refreshed: Query results after any fetching (unchanged input list
            when nothing needed fetching).
        fetch_result: Outcome of the crawl step, or None if no fetch ran.
        used_whatthespec_fallback: True when the WhatTheSpec path was used.
    """

    def __init__(
        self,
        refreshed: list[TDocMetadata],
        fetch_result: TDocCrawlResult | None = None,
        used_whatthespec_fallback: bool = False,
    ):
        self.refreshed = refreshed
        self.fetch_result = fetch_result
        self.used_whatthespec_fallback = used_whatthespec_fallback

    def __repr__(self) -> str:
        # Compact debug representation; avoids dumping every TDocMetadata.
        return (
            f"{type(self).__name__}(refreshed={len(self.refreshed)} tdocs, "
            f"fetch_result={self.fetch_result!r}, "
            f"used_whatthespec_fallback={self.used_whatthespec_fallback!r})"
        )


def fetch_missing_tdocs(
    database: TDocDatabase,
    cache_dir: Path,
    config: QueryConfig,
    results: list[TDocMetadata],
    credentials: PortalCredentials | None = None,
    full_metadata: bool = False,
    use_whatthespec: bool = False,
) -> FetchMissingResult:
    """Fetch missing TDocs if any are requested but not found in database.

    Orchestrates fetching based on flags, uses WhatTheSpec fallback when needed.

    Args:
        database: Database connection used for lookups, upserts, and refreshes.
        cache_dir: Cache directory passed to the fetchers.
        config: Query configuration; ``config.tdoc_ids`` drives what is fetched.
        results: TDocs already found in the database for this query.
        credentials: Portal credentials, or None if unavailable.
        full_metadata: If True, use the authenticated portal method.
        use_whatthespec: If True, always use the WhatTheSpec API.

    Returns:
        FetchMissingResult with the refreshed query results, the crawl
        outcome (None if nothing needed fetching), and whether the
        WhatTheSpec path was used.
    """
    # No explicit TDoc ids requested: nothing can be "missing".
    if not config.tdoc_ids:
        return FetchMissingResult(refreshed=results)

    # Requested ids are normalized to upper case before comparison; database
    # results are compared as-is (assumes stored tdoc_ids are already upper
    # case — TODO confirm against TDocDatabase).
    requested = [value.upper() for value in config.tdoc_ids]
    found = {item.tdoc_id for item in results}
    missing = [value for value in requested if value not in found]
    if not missing:
        return FetchMissingResult(refreshed=results)

    # Log what we're about to do
    if use_whatthespec:
        logger.info(f"Fetching missing TDocs via WhatTheSpec: {', '.join(missing)}")
    elif full_metadata:
        logger.info(f"Fetching missing TDocs with full metadata: {', '.join(missing)}")
    else:
        logger.info(f"Fetching missing TDocs: {', '.join(missing)}")

    # Handle use_whatthespec flag - always use WhatTheSpec method
    if use_whatthespec:
        fetch_result = fetch_via_whatthespec_batch(database, cache_dir, missing)
        # Re-run the query so newly upserted rows appear in the results.
        refreshed = database.query_tdocs(config)
        return FetchMissingResult(
            refreshed=refreshed,
            fetch_result=fetch_result,
            used_whatthespec_fallback=True,
        )

    # Handle full_metadata flag - use authenticated portal method
    if full_metadata:
        if credentials is None:
            # Synthesize an error result rather than raising, so CLI callers
            # can report the problem uniformly.
            fetch_result = TDocCrawlResult(
                processed=len(missing),
                inserted=0,
                updated=0,
                errors=["Portal credentials required for full metadata fetching"],
            )
            logger.warning("Portal credentials required for full metadata fetching")
            return FetchMissingResult(refreshed=results, fetch_result=fetch_result)

        fetch_result = fetch_missing_tdocs_batch(
            database=database,
            cache_dir=cache_dir,
            missing_ids=missing,
            credentials=credentials,
        )
        refreshed = database.query_tdocs(config)
        return FetchMissingResult(refreshed=refreshed, fetch_result=fetch_result)

    # Default behavior - use unauthenticated portal method with WhatTheSpec fallback
    fetch_result = fetch_missing_tdocs_batch(
        database=database,
        cache_dir=cache_dir,
        missing_ids=missing,
        credentials=credentials,
    )

    # Check if we should try whatthespec fallback: only when no credentials
    # were available AND the batch reported a credentials-related error.
    credentials_unavailable = credentials is None
    has_credentials_error = any("credentials required" in str(err).lower() for err in fetch_result.errors)
    should_try_whatthespec = credentials_unavailable and has_credentials_error

    used_fallback = False
    if should_try_whatthespec:
        # Refresh first so the fallback only targets ids still absent.
        refreshed = database.query_tdocs(config)
        refreshed_ids = {item.tdoc_id for item in refreshed}
        still_missing = [value for value in requested if value not in refreshed_ids]

        if still_missing:
            logger.info(f"Trying whatthespec fallback for {len(still_missing)} TDoc(s)")
            fallback_result = fetch_via_whatthespec_batch(database, cache_dir, still_missing)
            # Merge the fallback counts/errors into the overall crawl result.
            fetch_result = TDocCrawlResult(
                processed=fetch_result.processed + fallback_result.processed,
                inserted=fetch_result.inserted + fallback_result.inserted,
                updated=fetch_result.updated + fallback_result.updated,
                errors=fetch_result.errors + fallback_result.errors,
            )
            used_fallback = True

    # Final refresh reflects everything upserted by either method.
    refreshed = database.query_tdocs(config)
    return FetchMissingResult(
        refreshed=refreshed,
        fetch_result=fetch_result,
        used_whatthespec_fallback=used_fallback,
    )
+24 −14

File changed.

Preview size limit exceeded, changes collapsed.

+7 −6

File changed.

Preview size limit exceeded, changes collapsed.

Loading