Commit 50fd1281 authored by Jan Reimes
Browse files

tests: update tests to accommodate schema refactor (meeting_name removal, subtb lookups)

parent b9e352e7
Loading
Loading
Loading
Loading
+2 −3
Original line number Diff line number Diff line
@@ -56,7 +56,7 @@ def parse_meeting_row(
    cells: list[Tag],
    working_group: WorkingGroup,
    subgroup: str | None,
    get_subtb: Callable[[str], int] | None = None,
    get_subtb: Callable[[str], int | None] | None = None,
) -> MeetingMetadata:
    """Parse a single meeting row from the table.

@@ -93,8 +93,6 @@ def parse_meeting_row(
        meeting_id=meeting_id,
        tbid=tbid,
        subtb=subtb,
        working_group=working_group,
        subgroup=subgroup,
        short_name=short_name,
        title=title or None,
        start_date=start_date,
@@ -102,6 +100,7 @@ def parse_meeting_row(
        location=location or None,
        files_url=files_url,
        portal_url=portal_url,
        tdoc_count=0,
    )


+11 −8
Original line number Diff line number Diff line
@@ -119,21 +119,24 @@ def parse_tdoc_portal_page(html: str, tdoc_id: str, url: str | None = None) -> T
        logger.warning(error_msg)
        raise PortalParsingError(error_msg)

    agenda_item_value = metadata.get("agenda_item_nbr") or "0"
    return TDocMetadata(
        tdoc_id=tdoc_id,
        meeting_id=0,  # Placeholder - caller must resolve via meeting_name
        title=metadata.get("title", ""),
        title=str(metadata.get("title") or ""),
        url=url,
        source=metadata.get("source", ""),
        contact=metadata.get("contact", ""),
        tdoc_type=metadata.get("tdoc_type", "unknown"),
        for_purpose=metadata.get("for", "unknown"),
        agenda_item_nbr=Decimal(metadata.get("agenda_item_nbr", "0")),
        agenda_item_text=metadata.get("agenda_item_text", "Unknown"),
        source=str(metadata.get("source") or ""),
        contact=str(metadata.get("contact") or ""),
        tdoc_type=str(metadata.get("tdoc_type") or "unknown"),
        for_purpose=str(metadata.get("for") or "unknown"),
        agenda_item_nbr=Decimal(str(agenda_item_value)),
        agenda_item_text=str(metadata.get("agenda_item_text") or "Unknown"),
        status=metadata.get("status"),
        meeting_name=metadata.get("meeting"),
        is_revision_of=metadata.get("is_revision_of"),
        file_size=None,
        date_created=None,
        validated=True,
        validation_failed=False,
    )


+1 −1
Original line number Diff line number Diff line
@@ -42,11 +42,11 @@ class TDocMetadata(BaseModel):
    status: str | None = Field(None, description="Document status as reported by the portal")

    # Optional metadata fields (from portal or determined otherwise)
    meeting_name: str | None = Field(None, description="Meeting name from portal (temporary, used for resolution)")
    is_revision_of: str | None = Field(None, description="Reference to a previous TDoc version")
    file_size: int | None = Field(None, description="File size in bytes, when available/downloaded")

    # fields for local database management

    date_created: datetime | None = Field(None, description="Original creation timestamp when provided")
    date_retrieved: datetime = Field(default_factory=utc_now, description="Timestamp of the last retrieval")
    date_updated: datetime = Field(default_factory=utc_now, description="Timestamp of the last database update")
+2 −9
Original line number Diff line number Diff line
@@ -13,7 +13,7 @@ from pydantic import ValidationError

from tdoc_crawler.config import resolve_cache_manager
from tdoc_crawler.credentials import resolve_credentials
from tdoc_crawler.database import MeetingDatabase, TDocDatabase
from tdoc_crawler.database import TDocDatabase
from tdoc_crawler.logging import get_logger
from tdoc_crawler.models.base import HttpCacheConfig
from tdoc_crawler.tdocs.models import TDocMetadata, TDocQueryConfig
@@ -216,14 +216,7 @@ def fetch_missing_tdocs_batch(
                errors.append(error_msg)
                continue

            # Resolve meeting_id if needed
            if metadata.meeting_name:
                with MeetingDatabase(database.db_file) as meeting_db:
                    meeting_id = meeting_db.resolve_meeting_id(metadata.meeting_name)
                if meeting_id is not None:
                    metadata.meeting_id = meeting_id
                else:
                    logger.warning(f"Could not resolve meeting '{metadata.meeting_name}' to meeting_id for {tdoc_id}")
            # meeting_id resolution is now done inside source modules

            inserted, updated = database.upsert_tdoc(metadata)
            if inserted:
+0 −1
Original line number Diff line number Diff line
@@ -190,7 +190,6 @@ def convert_excel_row_to_tdoc_metadata(
            agenda_item_nbr=agenda_nbr,
            agenda_item_text=agenda_text,
            status=status,
            meeting_name=None,  # Will be resolved from meeting database
            is_revision_of=is_revision_of,
            file_size=None,  # Not available in Excel
            date_created=parsed_date,
Loading