Loading src/tdoc_crawler/parsers/meetings.py +2 −3 Original line number Diff line number Diff line Loading @@ -56,7 +56,7 @@ def parse_meeting_row( cells: list[Tag], working_group: WorkingGroup, subgroup: str | None, get_subtb: Callable[[str], int] | None = None, get_subtb: Callable[[str], int | None] | None = None, ) -> MeetingMetadata: """Parse a single meeting row from the table. Loading Loading @@ -93,8 +93,6 @@ def parse_meeting_row( meeting_id=meeting_id, tbid=tbid, subtb=subtb, working_group=working_group, subgroup=subgroup, short_name=short_name, title=title or None, start_date=start_date, Loading @@ -102,6 +100,7 @@ def parse_meeting_row( location=location or None, files_url=files_url, portal_url=portal_url, tdoc_count=0, ) Loading src/tdoc_crawler/parsers/portal.py +11 −8 Original line number Diff line number Diff line Loading @@ -119,21 +119,24 @@ def parse_tdoc_portal_page(html: str, tdoc_id: str, url: str | None = None) -> T logger.warning(error_msg) raise PortalParsingError(error_msg) agenda_item_value = metadata.get("agenda_item_nbr") or "0" return TDocMetadata( tdoc_id=tdoc_id, meeting_id=0, # Placeholder - caller must resolve via meeting_name title=metadata.get("title", ""), title=str(metadata.get("title") or ""), url=url, source=metadata.get("source", ""), contact=metadata.get("contact", ""), tdoc_type=metadata.get("tdoc_type", "unknown"), for_purpose=metadata.get("for", "unknown"), agenda_item_nbr=Decimal(metadata.get("agenda_item_nbr", "0")), agenda_item_text=metadata.get("agenda_item_text", "Unknown"), source=str(metadata.get("source") or ""), contact=str(metadata.get("contact") or ""), tdoc_type=str(metadata.get("tdoc_type") or "unknown"), for_purpose=str(metadata.get("for") or "unknown"), agenda_item_nbr=Decimal(str(agenda_item_value)), agenda_item_text=str(metadata.get("agenda_item_text") or "Unknown"), status=metadata.get("status"), meeting_name=metadata.get("meeting"), is_revision_of=metadata.get("is_revision_of"), file_size=None, date_created=None, validated=True, validation_failed=False, ) Loading src/tdoc_crawler/tdocs/models.py +1 −1 Original line number Diff line number Diff line Loading @@ -42,11 +42,11 @@ class TDocMetadata(BaseModel): status: str | None = Field(None, description="Document status as reported by the portal") # Optional metadata fields (from portal or determined otherwise) meeting_name: str | None = Field(None, description="Meeting name from portal (temporary, used for resolution)") is_revision_of: str | None = Field(None, description="Reference to a previous TDoc version") file_size: int | None = Field(None, description="File size in bytes, when available/downloaded") # fields for local database management date_created: datetime | None = Field(None, description="Original creation timestamp when provided") date_retrieved: datetime = Field(default_factory=utc_now, description="Timestamp of the last retrieval") date_updated: datetime = Field(default_factory=utc_now, description="Timestamp of the last database update") Loading src/tdoc_crawler/tdocs/operations/fetch.py +2 −9 Original line number Diff line number Diff line Loading @@ -13,7 +13,7 @@ from pydantic import ValidationError from tdoc_crawler.config import resolve_cache_manager from tdoc_crawler.credentials import resolve_credentials from tdoc_crawler.database import MeetingDatabase, TDocDatabase from tdoc_crawler.database import TDocDatabase from tdoc_crawler.logging import get_logger from tdoc_crawler.models.base import HttpCacheConfig from tdoc_crawler.tdocs.models import TDocMetadata, TDocQueryConfig Loading Loading @@ -216,14 +216,7 @@ def fetch_missing_tdocs_batch( errors.append(error_msg) continue # Resolve meeting_id if needed if metadata.meeting_name: with MeetingDatabase(database.db_file) as meeting_db: meeting_id = meeting_db.resolve_meeting_id(metadata.meeting_name) if meeting_id is not None: metadata.meeting_id = meeting_id else: logger.warning(f"Could not resolve meeting '{metadata.meeting_name}' to meeting_id for {tdoc_id}") # meeting_id resolution is now done inside source modules inserted, updated = database.upsert_tdoc(metadata) if inserted: Loading src/tdoc_crawler/tdocs/sources/doclist.py +0 −1 Original line number Diff line number Diff line Loading @@ -190,7 +190,6 @@ def convert_excel_row_to_tdoc_metadata( agenda_item_nbr=agenda_nbr, agenda_item_text=agenda_text, status=status, meeting_name=None, # Will be resolved from meeting database is_revision_of=is_revision_of, file_size=None, # Not available in Excel date_created=parsed_date, Loading Loading
src/tdoc_crawler/parsers/meetings.py +2 −3 Original line number Diff line number Diff line Loading @@ -56,7 +56,7 @@ def parse_meeting_row( cells: list[Tag], working_group: WorkingGroup, subgroup: str | None, get_subtb: Callable[[str], int] | None = None, get_subtb: Callable[[str], int | None] | None = None, ) -> MeetingMetadata: """Parse a single meeting row from the table. Loading Loading @@ -93,8 +93,6 @@ def parse_meeting_row( meeting_id=meeting_id, tbid=tbid, subtb=subtb, working_group=working_group, subgroup=subgroup, short_name=short_name, title=title or None, start_date=start_date, Loading @@ -102,6 +100,7 @@ def parse_meeting_row( location=location or None, files_url=files_url, portal_url=portal_url, tdoc_count=0, ) Loading
src/tdoc_crawler/parsers/portal.py +11 −8 Original line number Diff line number Diff line Loading @@ -119,21 +119,24 @@ def parse_tdoc_portal_page(html: str, tdoc_id: str, url: str | None = None) -> T logger.warning(error_msg) raise PortalParsingError(error_msg) agenda_item_value = metadata.get("agenda_item_nbr") or "0" return TDocMetadata( tdoc_id=tdoc_id, meeting_id=0, # Placeholder - caller must resolve via meeting_name title=metadata.get("title", ""), title=str(metadata.get("title") or ""), url=url, source=metadata.get("source", ""), contact=metadata.get("contact", ""), tdoc_type=metadata.get("tdoc_type", "unknown"), for_purpose=metadata.get("for", "unknown"), agenda_item_nbr=Decimal(metadata.get("agenda_item_nbr", "0")), agenda_item_text=metadata.get("agenda_item_text", "Unknown"), source=str(metadata.get("source") or ""), contact=str(metadata.get("contact") or ""), tdoc_type=str(metadata.get("tdoc_type") or "unknown"), for_purpose=str(metadata.get("for") or "unknown"), agenda_item_nbr=Decimal(str(agenda_item_value)), agenda_item_text=str(metadata.get("agenda_item_text") or "Unknown"), status=metadata.get("status"), meeting_name=metadata.get("meeting"), is_revision_of=metadata.get("is_revision_of"), file_size=None, date_created=None, validated=True, validation_failed=False, ) Loading
src/tdoc_crawler/tdocs/models.py +1 −1 Original line number Diff line number Diff line Loading @@ -42,11 +42,11 @@ class TDocMetadata(BaseModel): status: str | None = Field(None, description="Document status as reported by the portal") # Optional metadata fields (from portal or determined otherwise) meeting_name: str | None = Field(None, description="Meeting name from portal (temporary, used for resolution)") is_revision_of: str | None = Field(None, description="Reference to a previous TDoc version") file_size: int | None = Field(None, description="File size in bytes, when available/downloaded") # fields for local database management date_created: datetime | None = Field(None, description="Original creation timestamp when provided") date_retrieved: datetime = Field(default_factory=utc_now, description="Timestamp of the last retrieval") date_updated: datetime = Field(default_factory=utc_now, description="Timestamp of the last database update") Loading
src/tdoc_crawler/tdocs/operations/fetch.py +2 −9 Original line number Diff line number Diff line Loading @@ -13,7 +13,7 @@ from pydantic import ValidationError from tdoc_crawler.config import resolve_cache_manager from tdoc_crawler.credentials import resolve_credentials from tdoc_crawler.database import MeetingDatabase, TDocDatabase from tdoc_crawler.database import TDocDatabase from tdoc_crawler.logging import get_logger from tdoc_crawler.models.base import HttpCacheConfig from tdoc_crawler.tdocs.models import TDocMetadata, TDocQueryConfig Loading Loading @@ -216,14 +216,7 @@ def fetch_missing_tdocs_batch( errors.append(error_msg) continue # Resolve meeting_id if needed if metadata.meeting_name: with MeetingDatabase(database.db_file) as meeting_db: meeting_id = meeting_db.resolve_meeting_id(metadata.meeting_name) if meeting_id is not None: metadata.meeting_id = meeting_id else: logger.warning(f"Could not resolve meeting '{metadata.meeting_name}' to meeting_id for {tdoc_id}") # meeting_id resolution is now done inside source modules inserted, updated = database.upsert_tdoc(metadata) if inserted: Loading
src/tdoc_crawler/tdocs/sources/doclist.py +0 −1 Original line number Diff line number Diff line Loading @@ -190,7 +190,6 @@ def convert_excel_row_to_tdoc_metadata( agenda_item_nbr=agenda_nbr, agenda_item_text=agenda_text, status=status, meeting_name=None, # Will be resolved from meeting database is_revision_of=is_revision_of, file_size=None, # Not available in Excel date_created=parsed_date, Loading