src/tdoc_crawler/meetings/operations/crawl.py (+72 −36)

```diff
@@ -5,6 +5,7 @@
 from __future__ import annotations
 
 from collections import defaultdict
 from collections.abc import Callable
 from dataclasses import dataclass
+from typing import Any
 
 from tdoc_crawler.constants.urls import MEETINGS_BASE_URL
 from tdoc_crawler.credentials import resolve_credentials
@@ -13,7 +14,11 @@
 from tdoc_crawler.http_client import create_cached_session
 from tdoc_crawler.logging import get_logger
 from tdoc_crawler.meetings.models import MeetingCrawlConfig, MeetingMetadata
 from tdoc_crawler.models.crawl_limits import CrawlLimits
-from tdoc_crawler.models.subworking_groups import CODE_INDEX, SUBWORKING_GROUP_RECORDS
+from tdoc_crawler.models.subworking_groups import (
+    CODE_INDEX,
+    SUBWORKING_GROUP_RECORDS,
+    SubWorkingGroupRecord,
+)
 from tdoc_crawler.models.working_groups import WorkingGroup
 from tdoc_crawler.parsers.meetings import parse_meeting_page
@@ -67,42 +72,15 @@ class MeetingCrawler:
         try:
             for working_group in working_groups:
-                # Filter records for the current working group
-                relevant_records = [r for r in SUBWORKING_GROUP_RECORDS if r.tbid == working_group.tbid]
-                for record in relevant_records:
-                    # Skip subgroup if subgroups filter is set and this subgroup is not in the list
-                    if config.subgroups and record.code not in config.subgroups:
-                        continue
-                    url = MEETINGS_BASE_URL.format(code=record.code)
-                    try:
-                        response = session.get(url, timeout=config.timeout)
-                        response.raise_for_status()
-                    except Exception as exc:
-                        message = f"Meeting crawl failed for {record.code}: {exc}"
-                        logger.warning(message)
-                        errors.append(message)
-                        continue
-
-                    # Create callback for subtb lookup using pre-built index
-                    def get_subtb(subgroup_code: str) -> int | None:
-                        """Get subtb from subgroup code via CODE_INDEX lookup."""
-                        record = CODE_INDEX.get(subgroup_code.upper())
-                        return record.subtb if record else None
-
-                    parsed_meetings = parse_meeting_page(
-                        response.text,
-                        working_group,
-                        record.code,
-                        get_subtb=get_subtb,
+                meetings.extend(
+                    self._crawl_working_group(
+                        session=session,
+                        working_group=working_group,
+                        config=config,
+                        existing_ids=existing_ids,
+                        errors=errors,
                     )
-
-                    for meeting in parsed_meetings:
-                        if config.incremental and meeting.meeting_id in existing_ids:
-                            continue
-                        # Filter out meetings without files_url unless explicitly included
-                        if not config.include_without_files and not meeting.files_url:
-                            continue
-                        meetings.append(meeting)
+                )
         finally:
             session.close()
@@ -123,6 +101,32 @@ class MeetingCrawler:
             errors=errors,
         )
 
+    def _crawl_working_group(
+        self,
+        session: Any,
+        working_group: WorkingGroup,
+        config: MeetingCrawlConfig,
+        existing_ids: set[int],
+        errors: list[str],
+    ) -> list[MeetingMetadata]:
+        """Crawl and parse meeting metadata for one working group."""
+        meetings: list[MeetingMetadata] = []
+        records = [record for record in SUBWORKING_GROUP_RECORDS if record.tbid == working_group.tbid and self._record_is_in_scope(record, config)]
+        for record in records:
+            html_text = self._fetch_meeting_page_text(session, record, config.timeout, errors)
+            if html_text is None:
+                continue
+            parsed_meetings = parse_meeting_page(
+                html_text,
+                working_group,
+                record.code,
+                get_subtb=self._get_subtb,
+            )
+            for meeting in parsed_meetings:
+                if self._include_meeting(meeting, config, existing_ids):
+                    meetings.append(meeting)
+        return meetings
+
     def _apply_limits(
         self,
         meetings: list[MeetingMetadata],
```
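The inline fetch logic on the left side of the hunk above and its extracted replacement `_fetch_meeting_page_text` (further down in this diff) share the same failure contract: a failed subgroup is logged, recorded in `errors`, and skipped, so one bad page never aborts the whole crawl. A minimal self-contained sketch of that contract; `FakeSession`, `FakeRecord`, `fetch_text`, and the example URL are illustrative stand-ins, not tdoc_crawler code:

```python
from dataclasses import dataclass


@dataclass
class FakeRecord:
    code: str


class FakeResponse:
    text = "<html/>"

    def raise_for_status(self) -> None:
        pass


class FakeSession:
    def get(self, url: str, timeout: float) -> FakeResponse:
        if "SA2" in url:  # simulate one subgroup failing
            raise TimeoutError("connect timed out")
        return FakeResponse()


def fetch_text(session: FakeSession, record: FakeRecord, timeout: float, errors: list[str]) -> str | None:
    # Stand-in URL template; the real code formats MEETINGS_BASE_URL.
    url = f"https://example.invalid/meetings/{record.code}"
    try:
        response = session.get(url, timeout=timeout)
        response.raise_for_status()
    except Exception as exc:
        # Record the failure and keep going instead of raising.
        errors.append(f"Meeting crawl failed for {record.code}: {exc}")
        return None
    return response.text


errors: list[str] = []
pages = [fetch_text(FakeSession(), FakeRecord(c), 5.0, errors) for c in ("SA1", "SA2", "SA3")]
print(pages)   # ['<html/>', None, '<html/>'] -- one failure does not abort the crawl
print(errors)  # ['Meeting crawl failed for SA2: connect timed out']
```

Collecting messages instead of raising lets the caller report every failing subgroup at the end rather than stopping at the first one.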
```diff
@@ -136,6 +140,38 @@ class MeetingCrawler:
         filtered = self._limit_meetings(filtered, limits.limit_meetings)
         return filtered
 
+    @staticmethod
+    def _get_subtb(subgroup_code: str) -> int | None:
+        """Resolve subgroup code to subtb via lookup index."""
+        record = CODE_INDEX.get(subgroup_code.upper())
+        return record.subtb if record else None
+
+    @staticmethod
+    def _record_is_in_scope(record: SubWorkingGroupRecord, config: MeetingCrawlConfig) -> bool:
+        """Check whether subgroup record should be crawled for the current config."""
+        return not config.subgroups or record.code in config.subgroups
+
+    @staticmethod
+    def _include_meeting(meeting: MeetingMetadata, config: MeetingCrawlConfig, existing_ids: set[int]) -> bool:
+        """Apply incremental and files-url filters before persisting meetings."""
+        if config.incremental and meeting.meeting_id in existing_ids:
+            return False
+        return config.include_without_files or bool(meeting.files_url)
+
+    @staticmethod
+    def _fetch_meeting_page_text(session: Any, record: SubWorkingGroupRecord, timeout: float, errors: list[str]) -> str | None:
+        """Fetch meeting page HTML text for one subgroup record."""
+        url = MEETINGS_BASE_URL.format(code=record.code)
+        try:
+            response = session.get(url, timeout=timeout)
+            response.raise_for_status()
+        except Exception as exc:
+            message = f"Meeting crawl failed for {record.code}: {exc}"
+            logger.warning(message)
+            errors.append(message)
+            return None
+        return response.text
+
     @staticmethod
     def _limit_working_groups(
         working_groups: list[WorkingGroup],
```
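Because the filters are now pure static predicates, the inclusion rules can be checked without any HTTP or parsing machinery. A sketch of the `_include_meeting` truth table, with `Cfg` and `Mtg` as stand-ins for the real `MeetingCrawlConfig` and `MeetingMetadata` models:

```python
from dataclasses import dataclass


@dataclass
class Cfg:  # stand-in for MeetingCrawlConfig
    incremental: bool
    include_without_files: bool


@dataclass
class Mtg:  # stand-in for MeetingMetadata
    meeting_id: int
    files_url: str | None


def include_meeting(meeting: Mtg, config: Cfg, existing_ids: set[int]) -> bool:
    # Same logic as MeetingCrawler._include_meeting above.
    if config.incremental and meeting.meeting_id in existing_ids:
        return False
    return config.include_without_files or bool(meeting.files_url)


existing = {101}
assert not include_meeting(Mtg(101, "https://x/files"), Cfg(True, False), existing)  # already crawled
assert include_meeting(Mtg(102, "https://x/files"), Cfg(True, False), existing)      # new, has files
assert not include_meeting(Mtg(103, None), Cfg(True, False), existing)               # no files_url
assert include_meeting(Mtg(103, None), Cfg(True, True), existing)                    # explicitly included
print("all inclusion cases pass")
```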
= ( "meeting", "title", "contact", "source", "tdoc_type", "for", "agenda_item_nbr", ) def _validate_page_content(soup: BeautifulSoup, tdoc_id: str) -> None: """Raise when portal HTML looks like a missing-document page.""" page_text = soup.get_text().lower() if any(indicator in page_text for indicator in error_indicators): if any(indicator in page_text for indicator in ERROR_INDICATORS): logger.warning(f"TDoc {tdoc_id} not found in portal") raise PortalParsingError(f"TDoc {tdoc_id} not found in portal") metadata: dict[str, str | None] = {} # Find the metadata table def _get_metadata_table(soup: BeautifulSoup, tdoc_id: str) -> Any: """Get TDoc metadata table from portal HTML.""" table = soup.find("table", {"class": "ultimate3gpp", "id": "tableTdocGeneralTabView"}) if not table: logger.warning(f"Metadata table not found for TDoc {tdoc_id}") raise PortalParsingError(f"Metadata table not found for TDoc {tdoc_id}") return table # Iterate over table rows rows = table.find_all("tr") for row in rows: cells = row.find_all("td") if len(cells) < 2: continue # Extract label from first cell def _normalize_label(cells: list[Any]) -> str | None: """Extract normalized key from first column label.""" label_cell = cells[0].get_text(strip=True) if not label_cell or not label_cell.endswith(":"): continue # Remove trailing colon and normalize label return None label = label_cell.rstrip(":").strip() label_key = label.lower().replace(" ", "_") return label.lower().replace(" ", "_") # Extract value from second cell def _normalize_value(cells: list[Any], label_key: str) -> str | None: """Extract normalized value from second column with status cleanup.""" value = cells[1].get_text(strip=True) if len(cells) > 1 else "" value = value.strip() if value else None # Skip empty values if not value: continue # Special handling for "status" field # Remove brackets and content within (e.g., "agreed(Download TDoc)" -> "agreed") if label_key == "status" and value: return None if label_key == "status": bracket_pos = value.find("(") if bracket_pos != -1: value = value[:bracket_pos].strip() return value[:bracket_pos].strip() return value # Store the value metadata[label_key] = value # Special handling for "Agenda item" field if label_key == "agenda_item" and value: # Parse "7.1 - Some text" format def _store_agenda_fields(metadata: dict[str, str | None], value: str) -> None: """Split agenda item into number and text if separator is present.""" parts = value.split(" - ", 1) if len(parts) == 2: agenda_nbr = parts[0].strip() agenda_text = parts[1].strip() metadata["agenda_item_nbr"] = agenda_nbr metadata["agenda_item_text"] = agenda_text else: # No separator found, treat whole thing as number metadata["agenda_item_nbr"] = parts[0].strip() metadata["agenda_item_text"] = parts[1].strip() return metadata["agenda_item_nbr"] = value # Check for mandatory fields mandatory_fields = ["meeting", "title", "contact", "source", "tdoc_type", "for", "agenda_item_nbr"] missing_fields = [field for field in mandatory_fields if field not in metadata or not metadata[field]] def _parse_metadata_table(table: Any) -> dict[str, str | None]: """Parse metadata rows from portal table.""" metadata: dict[str, str | None] = {} for row in table.find_all("tr"): cells = row.find_all("td") if len(cells) < 2: continue label_key = _normalize_label(cells) if label_key is None: continue value = _normalize_value(cells, label_key) if value is None: continue metadata[label_key] = value if label_key == "agenda_item": _store_agenda_fields(metadata, value) return metadata def 
```diff
+
+def _validate_mandatory_fields(metadata: dict[str, str | None], tdoc_id: str) -> None:
+    """Ensure all required portal metadata fields are available."""
+    missing_fields = [field for field in MANDATORY_FIELDS if field not in metadata or not metadata[field]]
     if missing_fields:
         error_msg = f"Missing mandatory fields for TDoc {tdoc_id}: {', '.join(missing_fields)}"
         logger.warning(error_msg)
         raise PortalParsingError(error_msg)
 
+
+def parse_tdoc_portal_page(html: str, tdoc_id: str, url: str | None = None) -> TDocMetadata:
+    """Parse TDoc metadata from portal HTML page.
+
+    Args:
+        html: HTML content of the TDoc portal page
+        tdoc_id: TDoc identifier for logging
+        url: Optional TDoc URL (if known)
+
+    Returns:
+        TDocMetadata instance with portal metadata
+
+    Raises:
+        PortalParsingError: If TDoc not found, metadata table not found,
+            or mandatory fields missing
+
+    Expected fields:
+        - meeting: Meeting identifier (required)
+        - title: Document title (required)
+        - contact: Contact person/organization (required)
+        - source: Source organization (required)
+        - tdoc_type: Document type classification (required)
+        - for: Purpose (agreement, discussion, etc.) (required)
+        - agenda_item: Associated agenda item (required)
+        - status: Document status (optional)
+        - is_revision_of: Reference to previous TDoc version (optional)
+    """
+    soup = BeautifulSoup(html, "html.parser")
+    _validate_page_content(soup, tdoc_id)
+    table = _get_metadata_table(soup, tdoc_id)
+    metadata = _parse_metadata_table(table)
+    _validate_mandatory_fields(metadata, tdoc_id)
 
     agenda_item_value = metadata.get("agenda_item_nbr") or "0"
     return TDocMetadata(
         tdoc_id=tdoc_id,
```
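One behavioural detail worth noting in `_validate_mandatory_fields`: a field counts as missing both when its key is absent and when its value is falsy (`None` or `""`). A tiny sketch of that check; the metadata dict and meeting identifier are invented for illustration:

```python
MANDATORY = ("meeting", "title", "contact", "source", "tdoc_type", "for", "agenda_item_nbr")


def missing_fields(metadata: dict[str, str | None]) -> list[str]:
    # Same test as _validate_mandatory_fields: absent keys and falsy values both fail.
    return [f for f in MANDATORY if f not in metadata or not metadata[f]]


partial = {"meeting": "SA2#160", "title": "Some contribution", "contact": None, "for": ""}
print(missing_fields(partial))
# ['contact', 'source', 'tdoc_type', 'for', 'agenda_item_nbr']
```

Validating before constructing `TDocMetadata` means the function either returns a fully populated record or raises `PortalParsingError`; callers never see a half-filled result.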