Commit d0c1c75d authored by Jan Reimes
Browse files

feat(crawler): save meeting doclist Excel to checkout folder

When checking out meeting TDocs, the Excel document list is now saved to
the meeting folder (e.g., ~/.3gpp-crawler/checkout/TSG_SA/WG4_CODEC/TSGS4_126_Chicago/meeting_12345_doclist.xlsx)
for manual inspection. HTTP caching ensures no redundant downloads.
parent 91c93374
Loading
Loading
Loading
Loading
+2 −0
Original line number Diff line number Diff line
@@ -10,6 +10,7 @@ from tdoc_crawler.tdocs.operations.checkout import (
    clear_checkout_tdocs,
    get_checked_out_tdocs,
    get_checkout_path,
    get_meeting_checkout_folder,
    prepare_tdoc_file,
)
from tdoc_crawler.tdocs.operations.crawl import TDocCrawler, TDocCrawlResult
@@ -39,5 +40,6 @@ __all__ = [
    "fetch_via_whatthespec_batch",
    "get_checked_out_tdocs",
    "get_checkout_path",
    "get_meeting_checkout_folder",
    "prepare_tdoc_file",
]
+33 −1
Original line number Diff line number Diff line
@@ -268,6 +268,35 @@ def checkout_tdocs(
    return CheckoutResult(success_count=success_count, error_count=error_count, errors=errors)


def get_meeting_checkout_folder(meeting: MeetingMetadata, checkout_dir: Path) -> Path:
    """Calculate the checkout folder for a meeting based on its files_url.

    The folder mirrors the 3GPP server directory structure: the URL path
    segments that follow the first ``ftp`` segment are appended to
    ``checkout_dir``. If the URL path contains no ``ftp`` segment, all of its
    segments are used.

    Empty segments and the relative segments ``.`` and ``..`` are skipped, so
    the returned folder always stays inside ``checkout_dir`` even for a
    malformed URL.

    Args:
        meeting: MeetingMetadata containing the files_url
        checkout_dir: Base checkout directory

    Returns:
        Path to the meeting checkout folder

    Raises:
        ValueError: If the meeting has no files_url
    """
    if meeting.files_url is None:
        raise ValueError(f"Meeting {meeting.short_name} has no files_url")

    path_parts: list[str] = urlparse(meeting.files_url).path.split("/")

    # Keep only what lies below the server's "ftp" root, when there is one.
    if "ftp" in path_parts:
        path_parts = path_parts[path_parts.index("ftp") + 1 :]

    # Callers create this folder and write files into it, so never let ".."
    # climb out of checkout_dir. Empty and "." segments are dropped as well;
    # pathlib would ignore them anyway.
    safe_parts = [part for part in path_parts if part not in ("", ".", "..")]
    return checkout_dir.joinpath(*safe_parts)


def checkout_meeting_tdocs(
    meetings: list[MeetingMetadata],
    checkout_dir: Path,
@@ -298,7 +327,9 @@ def checkout_meeting_tdocs(
            errors.append(f"{meeting.short_name}: no files URL")
            continue
        try:
            tdocs = fetch_meeting_document_list(meeting.meeting_id, http_cache_file)
            meeting_folder = get_meeting_checkout_folder(meeting, checkout_dir)
            doclist_path = meeting_folder / f"meeting_{meeting.meeting_id}_doclist.xlsx"
            tdocs = fetch_meeting_document_list(meeting.meeting_id, http_cache_file, save_doclist_to=doclist_path)
        except DocumentListError as exc:
            errors.append(f"{meeting.short_name}: {exc}")
            continue
@@ -317,5 +348,6 @@ __all__ = [
    "clear_checkout_tdocs",
    "get_checked_out_tdocs",
    "get_checkout_path",
    "get_meeting_checkout_folder",
    "prepare_tdoc_file",
]
+8 −0
Original line number Diff line number Diff line
@@ -34,6 +34,7 @@ def fetch_meeting_document_list(
    http_config: HttpConfig | None = None,
    http_cache_file: Path | None = None,
    files_url: str | None = None,
    save_doclist_to: Path | None = None,
) -> list[TDocMetadata]:
    """Fetch all TDoc metadata for a meeting via document list Excel file.

@@ -43,6 +44,7 @@ def fetch_meeting_document_list(
        http_config: Optional HTTP configuration from ThreeGPPConfig
        http_cache_file: Optional explicit path to the HTTP cache database
        files_url: Optional HTTP directory containing meeting documents (for URL construction)
        save_doclist_to: Optional path to save the Excel document list file for inspection

    Returns:
        List of TDocMetadata instances for all TDocs in the meeting
@@ -72,6 +74,12 @@ def fetch_meeting_document_list(
            # Some responses might not set content-type correctly, check file signature
            raise DocumentListError(f"Expected Excel file for meeting {meeting_id}, got content-type: {content_type}")

        # Save doclist to file if path is provided
        if save_doclist_to is not None:
            save_doclist_to.parent.mkdir(parents=True, exist_ok=True)
            save_doclist_to.write_bytes(response.content)
            _logger.debug(f"Saved document list to {save_doclist_to}")

        # Parse Excel file
        _logger.debug(f"Parsing Excel document list for meeting {meeting_id}")
        return parse_excel_document_list(response.content, meeting_id, files_url)