Commit cb8d9d25 authored by Jan Reimes
Browse files

refactor: improve document list crawler with TYPE_CHECKING and error handling

parent 5e451054
Loading
Loading
Loading
Loading
+329 −331
Original line number Diff line number Diff line
@@ -4,6 +4,7 @@ from __future__ import annotations

import io
import logging
import re
from datetime import UTC, datetime
from decimal import Decimal
from pathlib import Path
@@ -15,6 +16,8 @@ from tdoc_crawler.http_client import create_cached_session

if TYPE_CHECKING:
    from tdoc_crawler.models.tdocs import TDocMetadata
else:
    from tdoc_crawler.models.tdocs import TDocMetadata  # noqa: PLC0415

logger = logging.getLogger(__name__)

@@ -65,9 +68,8 @@ def fetch_meeting_document_list(

        # Check if we got a valid Excel file
        content_type = response.headers.get("content-type", "").lower()
        if "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" not in content_type:
        if "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" not in content_type and not response.content.startswith(b"PK"):
            # Some responses might not set content-type correctly, check file signature
            if not response.content.startswith(b"PK"):
            raise DocumentListError(f"Expected Excel file for meeting {meeting_id}, got content-type: {content_type}")

        # Parse Excel file
@@ -139,8 +141,6 @@ def convert_excel_row_to_tdoc_metadata(
    Returns:
        TDocMetadata instance or None if conversion fails
    """
    from tdoc_crawler.models.tdocs import TDocMetadata

    # Map Excel columns to TDocMetadata fields
    # Try multiple possible column names to handle different Excel formats
    tdoc_id = _extract_tdoc_id(row)
@@ -229,8 +229,6 @@ def _is_valid_tdoc_id(tdoc_id: str) -> bool:
    Returns:
        True if valid TDoc ID format
    """
    import re

    # TDoc ID pattern: [RSC][1-6P] followed by 4-10 chars
    pattern = re.compile(r"^[RSC][1-6P].{4,10}$", re.IGNORECASE)
    return bool(pattern.match(tdoc_id.strip()))
@@ -325,7 +323,7 @@ def _parse_date(date_value: str | None) -> datetime | None:

__all__ = [
    "DocumentListError",
    "convert_excel_row_to_tdoc_metadata",
    "fetch_meeting_document_list",
    "parse_excel_document_list",
    "convert_excel_row_to_tdoc_metadata",
]
+298 −310
Original line number Diff line number Diff line
@@ -2,6 +2,7 @@

from __future__ import annotations

import io
from decimal import Decimal
from pathlib import Path
from unittest.mock import MagicMock, patch
@@ -9,15 +10,9 @@ from unittest.mock import MagicMock, patch
import pandas as pd
import pytest

from tdoc_crawler.crawlers import (
    DocumentListError,
    HybridCrawlResult,
    HybridTDocCrawler,
    fetch_meeting_document_list,
    parse_excel_document_list,
)
from tdoc_crawler.crawlers import DocumentListError, HybridCrawlResult, HybridTDocCrawler, fetch_meeting_document_list, parse_excel_document_list
from tdoc_crawler.database import TDocDatabase
from tdoc_crawler.models import TDocCrawlConfig, WorkingGroup
from tdoc_crawler.models import MeetingMetadata, TDocCrawlConfig, WorkingGroup
from tdoc_crawler.models.tdocs import TDocMetadata


@@ -165,7 +160,6 @@ class TestMeetingDocumentList:

        with patch.object(crawler, "_get_meetings_to_crawl") as mock_get_meetings:
            # Mock meeting
            from tdoc_crawler.models import MeetingMetadata

            mock_meeting = MeetingMetadata(
                meeting_id=12345,
@@ -213,8 +207,6 @@ class TestMeetingDocumentList:
        )

        with patch.object(crawler, "_get_meetings_to_crawl") as mock_get_meetings:
            from tdoc_crawler.models import MeetingMetadata

            mock_meeting = MeetingMetadata(
                meeting_id=12345,
                tbid=373,  # RAN
@@ -262,8 +254,6 @@ class TestMeetingDocumentList:
        )

        with patch.object(crawler, "_get_meetings_to_crawl") as mock_get_meetings:
            from tdoc_crawler.models import MeetingMetadata

            mock_meeting = MeetingMetadata(
                meeting_id=12345,
                tbid=373,  # RAN
@@ -290,8 +280,6 @@ class TestMeetingDocumentList:

def _create_test_excel_bytes(df: pd.DataFrame) -> bytes:
    """Create test Excel file bytes from DataFrame."""
    import io

    # Use xlsxwriter for writing Excel files (as per AGENTS.md)
    output = io.BytesIO()