Commit e277d273 authored by Jan Reimes
Browse files

tdocs: use TDocStatus enum in checkout and fix fetch/doclist URL handling

parent f5de3ed3
Loading
Loading
Loading
Loading
+21 −3
Original line number Diff line number Diff line
@@ -20,7 +20,7 @@ import requests
from tdoc_crawler.http_client import download_to_file
from tdoc_crawler.logging import get_logger
from tdoc_crawler.meetings.models import MeetingMetadata
from tdoc_crawler.tdocs.models import TDocMetadata
from tdoc_crawler.tdocs.models import TDocMetadata, TDocStatus
from tdoc_crawler.tdocs.sources.doclist import DocumentListError, fetch_meeting_document_list

logger = get_logger(__name__)
@@ -85,6 +85,11 @@ def checkout_tdoc(
    Raises:
        FileNotFoundError: If download fails or zip is empty
    """
    # Check if TDoc is withdrawn - if so, skip download and log debug message
    if metadata.status == TDocStatus.WITHDRAWN:
        logger.debug(f"Skipping checkout for withdrawn TDoc {metadata.tdoc_id}")
        raise FileNotFoundError("withdrawn")

    checkout_path = get_checkout_path(metadata, checkout_dir)

    if checkout_path.exists() and not force:
@@ -120,11 +125,19 @@ def prepare_tdoc_file(metadata: TDocMetadata, cache_dir: Path, return_dir: bool

    Returns:
        Path to the downloaded file, or the extract directory when return_dir is True.

    Raises:
        FileNotFoundError: If download fails for non-withdrawn TDocs
    """
    # Handle the case where metadata.url is None
    if metadata.url is None:
        raise ValueError(f"Cannot prepare TDoc file for {metadata.tdoc_id}: URL is None")

    # Check if TDoc is withdrawn - if so, skip download and log debug message
    if metadata.status == TDocStatus.WITHDRAWN:
        logger.debug(f"Skipping download for withdrawn TDoc {metadata.tdoc_id}")
        raise FileNotFoundError("withdrawn")

    downloads_dir = cache_dir / "checkout"
    downloads_dir.mkdir(parents=True, exist_ok=True)
    path = urlparse(metadata.url).path
@@ -244,6 +257,11 @@ def checkout_tdocs(
            checkout_tdoc(metadata, checkout_dir, force=force, session=session)
            success_count += 1
        except (FileNotFoundError, OSError, ValueError, zipfile.BadZipFile) as exc:
            error_message = str(exc)
            # For withdrawn TDocs, log debug instead of adding to errors
            if error_message == "withdrawn":  # This matches the FileNotFoundError message
                logger.debug(f"Skipped withdrawn TDoc {metadata.tdoc_id}")
            else:
                errors.append(f"{metadata.tdoc_id}: {exc}")
                error_count += 1

+1 −0
Original line number Diff line number Diff line
@@ -142,6 +142,7 @@ class TDocCrawler:
                    timeout=config.timeout,
                    http_cache=config.http_cache,
                    cache_manager_name=config.cache_manager_name,
                    files_url=meeting.files_url,
                )
                futures[future] = meeting

+1 −1
Original line number Diff line number Diff line
@@ -145,7 +145,7 @@ def fetch_tdoc(
            for_purpose="unknown",
            agenda_item_nbr=Decimal("0"),
            agenda_item_text="Unknown",
            status="",
            status=None,
            meeting_name=None,
            is_revision_of=None,
            file_size=None,
+33 −5
Original line number Diff line number Diff line
@@ -28,7 +28,11 @@ class DocumentListError(Exception):


def fetch_meeting_document_list(
    meeting_id: int, timeout: int = 30, http_cache: HttpCacheConfig | None = None, cache_manager_name: str | None = None
    meeting_id: int,
    timeout: int = 30,
    http_cache: HttpCacheConfig | None = None,
    cache_manager_name: str | None = None,
    files_url: str | None = None,
) -> list[TDocMetadata]:
    """Fetch all TDoc metadata for a meeting via document list Excel file.

@@ -37,6 +41,8 @@ def fetch_meeting_document_list(
        timeout: Request timeout in seconds
        http_cache: Optional HTTP cache configuration for caching document list responses
        cache_manager_name: Optional name of cache manager to use for HTTP caching
        files_url: Optional HTTP directory containing meeting documents (for URL construction)

    Returns:
        List of TDocMetadata instances for all TDocs in the meeting

@@ -67,7 +73,7 @@ def fetch_meeting_document_list(

        # Parse Excel file
        logger.debug(f"Parsing Excel document list for meeting {meeting_id}")
        return parse_excel_document_list(response.content, meeting_id)
        return parse_excel_document_list(response.content, meeting_id, files_url)

    except Exception as exc:
        if isinstance(exc, DocumentListError):
@@ -81,12 +87,14 @@ def fetch_meeting_document_list(
def parse_excel_document_list(
    excel_content: bytes,
    meeting_id: int,
    files_url: str | None = None,
) -> list[TDocMetadata]:
    """Parse Excel document list and convert to TDocMetadata instances.

    Args:
        excel_content: Raw Excel file content
        meeting_id: Meeting ID for reference
        files_url: Optional HTTP directory containing meeting documents (for URL construction)

    Returns:
        List of TDocMetadata instances
@@ -117,7 +125,7 @@ def parse_excel_document_list(
        tdoc_metadata_list = []
        for i, (_idx, row) in enumerate(df.iterrows()):
            try:
                tdoc_metadata = convert_excel_row_to_tdoc_metadata(row, meeting_id)
                tdoc_metadata = convert_excel_row_to_tdoc_metadata(row, meeting_id, files_url)
                if tdoc_metadata:
                    tdoc_metadata_list.append(tdoc_metadata)
                else:
@@ -136,12 +144,14 @@ def parse_excel_document_list(
def convert_excel_row_to_tdoc_metadata(
    row: pd.Series,
    meeting_id: int,
    files_url: str | None = None,
) -> TDocMetadata | None:
    """Convert a single Excel row to TDocMetadata.

    Args:
        row: pandas Series representing one Excel row
        meeting_id: Meeting ID for reference
        files_url: Optional HTTP directory containing meeting documents (for URL construction)

    Returns:
        TDocMetadata instance or None if conversion fails
@@ -164,8 +174,9 @@ def convert_excel_row_to_tdoc_metadata(
    is_revision_of = _get_column_value(row, ["Is revision of"])
    date_created = _get_column_value(row, ["Date", "Created", "Date Created", "Submission Date", "Reservation date"])

    # Generate URL (this will be validated/updated later by the directory crawler)
    url = f"https://www.3gpp.org/ftp/tsg_ran/.../{tdoc_id.upper()}.zip"
    # Generate URL from files_url if available
    # Format: files_url + "/Docs/" + tdoc_id + ".zip"
    url = _construct_tdoc_url(files_url, tdoc_id.upper()) if files_url else None

    now = datetime.now(UTC)

@@ -205,6 +216,23 @@ def convert_excel_row_to_tdoc_metadata(
        return None


def _construct_tdoc_url(files_url: str | None, tdoc_id: str) -> str | None:
    """Construct the full TDoc URL from files_url.

    Args:
        files_url: Base HTTP directory containing meeting documents
        tdoc_id: TDoc identifier (e.g., "S4-260454")

    Returns:
        Full URL to TDoc zip file (e.g., "https://.../Docs/S4-260454.zip")
        or None if files_url is not available
    """
    if not files_url:
        return None
    base = files_url.rstrip("/")
    return f"{base}/Docs/{tdoc_id.upper()}.zip"


def _extract_tdoc_id(row: pd.Series) -> str | None:
    """Extract TDoc ID from Excel row.

+9 −1
Original line number Diff line number Diff line
@@ -16,6 +16,7 @@ def fetch_meeting_document_list_subinterpreter(
    timeout: int,
    http_cache: HttpCacheConfig | None = None,
    cache_manager_name: str | None = None,
    files_url: str | None = None,
) -> str:
    """Fetch meeting document list in subinterpreter context.

@@ -26,13 +27,20 @@ def fetch_meeting_document_list_subinterpreter(
        timeout: Request timeout in seconds
        http_cache: Optional HTTP cache configuration for caching document list responses
        cache_manager_name: Optional name of cache manager to use for HTTP caching
        files_url: Optional HTTP directory containing meeting documents (for URL construction)

    Returns:
        JSON string containing list of TDocMetadata or error information
    """
    try:
        # Fetch document list for the meeting
        tdoc_metadata_list = fetch_meeting_document_list(meeting_id=meeting_id, timeout=timeout, http_cache=http_cache, cache_manager_name=cache_manager_name)
        tdoc_metadata_list = fetch_meeting_document_list(
            meeting_id=meeting_id,
            timeout=timeout,
            http_cache=http_cache,
            cache_manager_name=cache_manager_name,
            files_url=files_url,
        )

        # Serialize to JSON for inter-process communication
        serialized = []