Loading src/tdoc_crawler/crawlers/meeting_doclist.py +329 −331 Original line number Diff line number Diff line Loading @@ -4,6 +4,7 @@ from __future__ import annotations import io import logging import re from datetime import UTC, datetime from decimal import Decimal from pathlib import Path Loading @@ -15,6 +16,8 @@ from tdoc_crawler.http_client import create_cached_session if TYPE_CHECKING: from tdoc_crawler.models.tdocs import TDocMetadata else: from tdoc_crawler.models.tdocs import TDocMetadata # noqa: PLC0415 logger = logging.getLogger(__name__) Loading Loading @@ -65,9 +68,8 @@ def fetch_meeting_document_list( # Check if we got a valid Excel file content_type = response.headers.get("content-type", "").lower() if "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" not in content_type: if "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" not in content_type and not response.content.startswith(b"PK"): # Some responses might not set content-type correctly, check file signature if not response.content.startswith(b"PK"): raise DocumentListError(f"Expected Excel file for meeting {meeting_id}, got content-type: {content_type}") # Parse Excel file Loading Loading @@ -139,8 +141,6 @@ def convert_excel_row_to_tdoc_metadata( Returns: TDocMetadata instance or None if conversion fails """ from tdoc_crawler.models.tdocs import TDocMetadata # Map Excel columns to TDocMetadata fields # Try multiple possible column names to handle different Excel formats tdoc_id = _extract_tdoc_id(row) Loading Loading @@ -229,8 +229,6 @@ def _is_valid_tdoc_id(tdoc_id: str) -> bool: Returns: True if valid TDoc ID format """ import re # TDoc ID pattern: [RSC][1-6P] followed by 4-10 chars pattern = re.compile(r"^[RSC][1-6P].{4,10}$", re.IGNORECASE) return bool(pattern.match(tdoc_id.strip())) Loading Loading @@ -325,7 +323,7 @@ def _parse_date(date_value: str | None) -> datetime | None: __all__ = [ "DocumentListError", "convert_excel_row_to_tdoc_metadata", "fetch_meeting_document_list", "parse_excel_document_list", "convert_excel_row_to_tdoc_metadata", ] tests/test_meeting_document_list.py +298 −310 Original line number Diff line number Diff line Loading @@ -2,6 +2,7 @@ from __future__ import annotations import io from decimal import Decimal from pathlib import Path from unittest.mock import MagicMock, patch Loading @@ -9,15 +10,9 @@ from unittest.mock import MagicMock, patch import pandas as pd import pytest from tdoc_crawler.crawlers import ( DocumentListError, HybridCrawlResult, HybridTDocCrawler, fetch_meeting_document_list, parse_excel_document_list, ) from tdoc_crawler.crawlers import DocumentListError, HybridCrawlResult, HybridTDocCrawler, fetch_meeting_document_list, parse_excel_document_list from tdoc_crawler.database import TDocDatabase from tdoc_crawler.models import TDocCrawlConfig, WorkingGroup from tdoc_crawler.models import MeetingMetadata, TDocCrawlConfig, WorkingGroup from tdoc_crawler.models.tdocs import TDocMetadata Loading Loading @@ -165,7 +160,6 @@ class TestMeetingDocumentList: with patch.object(crawler, "_get_meetings_to_crawl") as mock_get_meetings: # Mock meeting from tdoc_crawler.models import MeetingMetadata mock_meeting = MeetingMetadata( meeting_id=12345, Loading Loading @@ -213,8 +207,6 @@ class TestMeetingDocumentList: ) with patch.object(crawler, "_get_meetings_to_crawl") as mock_get_meetings: from tdoc_crawler.models import MeetingMetadata mock_meeting = MeetingMetadata( meeting_id=12345, tbid=373, # RAN Loading Loading @@ -262,8 +254,6 @@ class TestMeetingDocumentList: ) with patch.object(crawler, "_get_meetings_to_crawl") as mock_get_meetings: from tdoc_crawler.models import MeetingMetadata mock_meeting = MeetingMetadata( meeting_id=12345, tbid=373, # RAN Loading @@ -290,8 +280,6 @@ class TestMeetingDocumentList: def _create_test_excel_bytes(df: pd.DataFrame) -> bytes: """Create test Excel file bytes from DataFrame.""" import io # Use xlsxwriter for writing Excel files (as per AGENTS.md) output = io.BytesIO() Loading Loading
src/tdoc_crawler/crawlers/meeting_doclist.py +329 −331 Original line number Diff line number Diff line Loading @@ -4,6 +4,7 @@ from __future__ import annotations import io import logging import re from datetime import UTC, datetime from decimal import Decimal from pathlib import Path Loading @@ -15,6 +16,8 @@ from tdoc_crawler.http_client import create_cached_session if TYPE_CHECKING: from tdoc_crawler.models.tdocs import TDocMetadata else: from tdoc_crawler.models.tdocs import TDocMetadata # noqa: PLC0415 logger = logging.getLogger(__name__) Loading Loading @@ -65,9 +68,8 @@ def fetch_meeting_document_list( # Check if we got a valid Excel file content_type = response.headers.get("content-type", "").lower() if "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" not in content_type: if "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" not in content_type and not response.content.startswith(b"PK"): # Some responses might not set content-type correctly, check file signature if not response.content.startswith(b"PK"): raise DocumentListError(f"Expected Excel file for meeting {meeting_id}, got content-type: {content_type}") # Parse Excel file Loading Loading @@ -139,8 +141,6 @@ def convert_excel_row_to_tdoc_metadata( Returns: TDocMetadata instance or None if conversion fails """ from tdoc_crawler.models.tdocs import TDocMetadata # Map Excel columns to TDocMetadata fields # Try multiple possible column names to handle different Excel formats tdoc_id = _extract_tdoc_id(row) Loading Loading @@ -229,8 +229,6 @@ def _is_valid_tdoc_id(tdoc_id: str) -> bool: Returns: True if valid TDoc ID format """ import re # TDoc ID pattern: [RSC][1-6P] followed by 4-10 chars pattern = re.compile(r"^[RSC][1-6P].{4,10}$", re.IGNORECASE) return bool(pattern.match(tdoc_id.strip())) Loading Loading @@ -325,7 +323,7 @@ def _parse_date(date_value: str | None) -> datetime | None: __all__ = [ "DocumentListError", "convert_excel_row_to_tdoc_metadata", "fetch_meeting_document_list", "parse_excel_document_list", "convert_excel_row_to_tdoc_metadata", ]
tests/test_meeting_document_list.py +298 −310 Original line number Diff line number Diff line Loading @@ -2,6 +2,7 @@ from __future__ import annotations import io from decimal import Decimal from pathlib import Path from unittest.mock import MagicMock, patch Loading @@ -9,15 +10,9 @@ from unittest.mock import MagicMock, patch import pandas as pd import pytest from tdoc_crawler.crawlers import ( DocumentListError, HybridCrawlResult, HybridTDocCrawler, fetch_meeting_document_list, parse_excel_document_list, ) from tdoc_crawler.crawlers import DocumentListError, HybridCrawlResult, HybridTDocCrawler, fetch_meeting_document_list, parse_excel_document_list from tdoc_crawler.database import TDocDatabase from tdoc_crawler.models import TDocCrawlConfig, WorkingGroup from tdoc_crawler.models import MeetingMetadata, TDocCrawlConfig, WorkingGroup from tdoc_crawler.models.tdocs import TDocMetadata Loading Loading @@ -165,7 +160,6 @@ class TestMeetingDocumentList: with patch.object(crawler, "_get_meetings_to_crawl") as mock_get_meetings: # Mock meeting from tdoc_crawler.models import MeetingMetadata mock_meeting = MeetingMetadata( meeting_id=12345, Loading Loading @@ -213,8 +207,6 @@ class TestMeetingDocumentList: ) with patch.object(crawler, "_get_meetings_to_crawl") as mock_get_meetings: from tdoc_crawler.models import MeetingMetadata mock_meeting = MeetingMetadata( meeting_id=12345, tbid=373, # RAN Loading Loading @@ -262,8 +254,6 @@ class TestMeetingDocumentList: ) with patch.object(crawler, "_get_meetings_to_crawl") as mock_get_meetings: from tdoc_crawler.models import MeetingMetadata mock_meeting = MeetingMetadata( meeting_id=12345, tbid=373, # RAN Loading @@ -290,8 +280,6 @@ class TestMeetingDocumentList: def _create_test_excel_bytes(df: pd.DataFrame) -> bytes: """Create test Excel file bytes from DataFrame.""" import io # Use xlsxwriter for writing Excel files (as per AGENTS.md) output = io.BytesIO() Loading