Commit 2d7b75e4 authored by Jan Reimes's avatar Jan Reimes
Browse files

refactor(meetings): simplify crawl and portal parsing flow

parent 2959733e
Loading
Loading
Loading
Loading
+72 −36
Original line number Diff line number Diff line
@@ -5,6 +5,7 @@ from __future__ import annotations
from collections import defaultdict
from collections.abc import Callable
from dataclasses import dataclass
from typing import Any

from tdoc_crawler.constants.urls import MEETINGS_BASE_URL
from tdoc_crawler.credentials import resolve_credentials
@@ -13,7 +14,11 @@ from tdoc_crawler.http_client import create_cached_session
from tdoc_crawler.logging import get_logger
from tdoc_crawler.meetings.models import MeetingCrawlConfig, MeetingMetadata
from tdoc_crawler.models.crawl_limits import CrawlLimits
from tdoc_crawler.models.subworking_groups import CODE_INDEX, SUBWORKING_GROUP_RECORDS
from tdoc_crawler.models.subworking_groups import (
    CODE_INDEX,
    SUBWORKING_GROUP_RECORDS,
    SubWorkingGroupRecord,
)
from tdoc_crawler.models.working_groups import WorkingGroup
from tdoc_crawler.parsers.meetings import parse_meeting_page

@@ -67,42 +72,15 @@ class MeetingCrawler:

        try:
            for working_group in working_groups:
                # Filter records for the current working group
                relevant_records = [r for r in SUBWORKING_GROUP_RECORDS if r.tbid == working_group.tbid]

                for record in relevant_records:
                    # Skip subgroup if subgroups filter is set and this subgroup is not in the list
                    if config.subgroups and record.code not in config.subgroups:
                        continue
                    url = MEETINGS_BASE_URL.format(code=record.code)
                    try:
                        response = session.get(url, timeout=config.timeout)
                        response.raise_for_status()
                    except Exception as exc:
                        message = f"Meeting crawl failed for {record.code}: {exc}"
                        logger.warning(message)
                        errors.append(message)
                        continue

                    # Create callback for subtb lookup using pre-built index
                    def get_subtb(subgroup_code: str) -> int | None:
                        """Get subtb from subgroup code via CODE_INDEX lookup."""
                        record = CODE_INDEX.get(subgroup_code.upper())
                        return record.subtb if record else None

                    parsed_meetings = parse_meeting_page(
                        response.text,
                        working_group,
                        record.code,
                        get_subtb=get_subtb,
                meetings.extend(
                    self._crawl_working_group(
                        session=session,
                        working_group=working_group,
                        config=config,
                        existing_ids=existing_ids,
                        errors=errors,
                    )
                )
                    for meeting in parsed_meetings:
                        if config.incremental and meeting.meeting_id in existing_ids:
                            continue
                        # Filter out meetings without files_url unless explicitly included
                        if not config.include_without_files and not meeting.files_url:
                            continue
                        meetings.append(meeting)
        finally:
            session.close()

@@ -123,6 +101,32 @@ class MeetingCrawler:
            errors=errors,
        )

    def _crawl_working_group(
        self,
        session: Any,
        working_group: WorkingGroup,
        config: MeetingCrawlConfig,
        existing_ids: set[int],
        errors: list[str],
    ) -> list[MeetingMetadata]:
        """Crawl and parse meeting metadata for one working group.

        Iterates the subgroup records belonging to this working group,
        fetches each subgroup's meeting page, and parses the meetings that
        survive the configured incremental/files-url filters. Fetch failures
        are recorded in ``errors`` and skipped, never raised.
        """
        collected: list[MeetingMetadata] = []
        for record in SUBWORKING_GROUP_RECORDS:
            # Only records of this working group, restricted by config scope.
            if record.tbid != working_group.tbid:
                continue
            if not self._record_is_in_scope(record, config):
                continue
            page_text = self._fetch_meeting_page_text(session, record, config.timeout, errors)
            if page_text is None:
                continue
            parsed = parse_meeting_page(
                page_text,
                working_group,
                record.code,
                get_subtb=self._get_subtb,
            )
            collected.extend(
                meeting
                for meeting in parsed
                if self._include_meeting(meeting, config, existing_ids)
            )
        return collected

    def _apply_limits(
        self,
        meetings: list[MeetingMetadata],
@@ -136,6 +140,38 @@ class MeetingCrawler:
        filtered = self._limit_meetings(filtered, limits.limit_meetings)
        return filtered

    @staticmethod
    def _get_subtb(subgroup_code: str) -> int | None:
        """Resolve a subgroup code to its subtb id via the lookup index.

        Returns None when the (upper-cased) code is unknown to CODE_INDEX.
        """
        match = CODE_INDEX.get(subgroup_code.upper())
        if match is None:
            return None
        return match.subtb

    @staticmethod
    def _record_is_in_scope(record: SubWorkingGroupRecord, config: MeetingCrawlConfig) -> bool:
        """Check whether subgroup record should be crawled for the current config."""
        return not config.subgroups or record.code in config.subgroups

    @staticmethod
    def _include_meeting(meeting: MeetingMetadata, config: MeetingCrawlConfig, existing_ids: set[int]) -> bool:
        """Apply incremental and files-url filters before persisting meetings."""
        if config.incremental and meeting.meeting_id in existing_ids:
            return False
        return config.include_without_files or bool(meeting.files_url)

    @staticmethod
    def _fetch_meeting_page_text(session: Any, record: SubWorkingGroupRecord, timeout: float, errors: list[str]) -> str | None:
        """Fetch the meeting page HTML for one subgroup record.

        Best-effort: any fetch/HTTP error is logged, appended to ``errors``,
        and turned into a None return so the caller can skip the record.
        """
        url = MEETINGS_BASE_URL.format(code=record.code)
        try:
            response = session.get(url, timeout=timeout)
            response.raise_for_status()
        except Exception as exc:
            failure = f"Meeting crawl failed for {record.code}: {exc}"
            logger.warning(failure)
            errors.append(failure)
            return None
        else:
            return response.text

    @staticmethod
    def _limit_working_groups(
        working_groups: list[WorkingGroup],
+100 −75
Original line number Diff line number Diff line
@@ -3,6 +3,7 @@
from __future__ import annotations

from decimal import Decimal
from typing import Any

import requests
from bs4 import BeautifulSoup
@@ -69,109 +70,133 @@ class PortalParsingError(Exception):
    """Raised when portal page parsing fails."""


def parse_tdoc_portal_page(html: str, tdoc_id: str, url: str | None = None) -> TDocMetadata:
    """Parse TDoc metadata from portal HTML page.

    Args:
        html: HTML content of the TDoc portal page
        tdoc_id: TDoc identifier for logging
        url: Optional TDoc URL (if known)

    Returns:
        TDocMetadata instance with portal metadata

    Raises:
        PortalParsingError: If TDoc not found, metadata table not found, or mandatory fields missing

    Expected fields:
        - meeting: Meeting identifier (required)
        - title: Document title (required)
        - contact: Contact person/organization (required)
        - source: Source organization (required)
        - tdoc_type: Document type classification (required)
        - for: Purpose (agreement, discussion, etc.) (required)
        - agenda_item: Associated agenda item (required)
        - status: Document status (optional)
        - is_revision_of: Reference to previous TDoc version (optional)
    """
    soup = BeautifulSoup(html, "html.parser")

    # Check for "not found" or error messages
    error_indicators = [
ERROR_INDICATORS: tuple[str, ...] = (
    "not found",
    "does not exist",
    "invalid",
    "no document",
    ]
)

MANDATORY_FIELDS: tuple[str, ...] = (
    "meeting",
    "title",
    "contact",
    "source",
    "tdoc_type",
    "for",
    "agenda_item_nbr",
)


def _validate_page_content(soup: BeautifulSoup, tdoc_id: str) -> None:
    """Raise when the portal HTML looks like a missing-document page.

    Args:
        soup: Parsed portal page.
        tdoc_id: TDoc identifier, used only for logging/error text.

    Raises:
        PortalParsingError: If any ERROR_INDICATORS substring occurs in the
            lower-cased page text.
    """
    page_text = soup.get_text().lower()
    if any(indicator in page_text for indicator in ERROR_INDICATORS):
        logger.warning(f"TDoc {tdoc_id} not found in portal")
        raise PortalParsingError(f"TDoc {tdoc_id} not found in portal")

    metadata: dict[str, str | None] = {}

    # Find the metadata table
def _get_metadata_table(soup: BeautifulSoup, tdoc_id: str) -> Any:
    """Locate the TDoc metadata table in the portal HTML.

    Raises:
        PortalParsingError: If the expected table element is missing.
    """
    table = soup.find("table", {"class": "ultimate3gpp", "id": "tableTdocGeneralTabView"})
    if table:
        return table
    logger.warning(f"Metadata table not found for TDoc {tdoc_id}")
    raise PortalParsingError(f"Metadata table not found for TDoc {tdoc_id}")

    # Iterate over table rows
    rows = table.find_all("tr")
    for row in rows:
        cells = row.find_all("td")
        if len(cells) < 2:
            continue

        # Extract label from first cell
def _normalize_label(cells: list[Any]) -> str | None:
    """Extract normalized key from first column label."""
    label_cell = cells[0].get_text(strip=True)
    if not label_cell or not label_cell.endswith(":"):
            continue

        # Remove trailing colon and normalize label
        return None
    label = label_cell.rstrip(":").strip()
        label_key = label.lower().replace(" ", "_")
    return label.lower().replace(" ", "_")


        # Extract value from second cell
def _normalize_value(cells: list[Any], label_key: str) -> str | None:
    """Extract normalized value from second column with status cleanup."""
    value = cells[1].get_text(strip=True) if len(cells) > 1 else ""
    value = value.strip() if value else None

        # Skip empty values
    if not value:
            continue

        # Special handling for "status" field
        # Remove brackets and content within (e.g., "agreed(Download TDoc)" -> "agreed")
        if label_key == "status" and value:
        return None
    if label_key == "status":
        bracket_pos = value.find("(")
        if bracket_pos != -1:
                value = value[:bracket_pos].strip()
            return value[:bracket_pos].strip()
    return value

        # Store the value
        metadata[label_key] = value

        # Special handling for "Agenda item" field
        if label_key == "agenda_item" and value:
            # Parse "7.1 - Some text" format
def _store_agenda_fields(metadata: dict[str, str | None], value: str) -> None:
    """Split agenda item into number and text if separator is present."""
    parts = value.split(" - ", 1)
    if len(parts) == 2:
                agenda_nbr = parts[0].strip()
                agenda_text = parts[1].strip()
                metadata["agenda_item_nbr"] = agenda_nbr
                metadata["agenda_item_text"] = agenda_text
            else:
                # No separator found, treat whole thing as number
        metadata["agenda_item_nbr"] = parts[0].strip()
        metadata["agenda_item_text"] = parts[1].strip()
        return
    metadata["agenda_item_nbr"] = value

    # Check for mandatory fields
    mandatory_fields = ["meeting", "title", "contact", "source", "tdoc_type", "for", "agenda_item_nbr"]
    missing_fields = [field for field in mandatory_fields if field not in metadata or not metadata[field]]

def _parse_metadata_table(table: Any) -> dict[str, str | None]:
    """Collect normalized label/value pairs from the portal metadata table.

    Rows with fewer than two cells, unlabeled rows, and empty values are
    skipped. Agenda item values are additionally split into number/text
    fields via _store_agenda_fields.
    """
    metadata: dict[str, str | None] = {}
    for row in table.find_all("tr"):
        cells = row.find_all("td")
        if len(cells) < 2:
            continue
        key = _normalize_label(cells)
        if key is None:
            continue
        text = _normalize_value(cells, key)
        if text is None:
            continue
        metadata[key] = text
        if key == "agenda_item":
            _store_agenda_fields(metadata, text)
    return metadata


def _validate_mandatory_fields(metadata: dict[str, str | None], tdoc_id: str) -> None:
    """Ensure every required portal metadata field has a non-empty value.

    Raises:
        PortalParsingError: Listing all MANDATORY_FIELDS that are absent
            or empty, in their declared order.
    """
    missing_fields = [name for name in MANDATORY_FIELDS if not metadata.get(name)]
    if missing_fields:
        error_msg = f"Missing mandatory fields for TDoc {tdoc_id}: {', '.join(missing_fields)}"
        logger.warning(error_msg)
        raise PortalParsingError(error_msg)


def parse_tdoc_portal_page(html: str, tdoc_id: str, url: str | None = None) -> TDocMetadata:
    """Parse TDoc metadata from portal HTML page.

    Args:
        html: HTML content of the TDoc portal page
        tdoc_id: TDoc identifier for logging
        url: Optional TDoc URL (if known)

    Returns:
        TDocMetadata instance with portal metadata

    Raises:
        PortalParsingError: If TDoc not found, metadata table not found, or mandatory fields missing

    Expected fields:
        - meeting: Meeting identifier (required)
        - title: Document title (required)
        - contact: Contact person/organization (required)
        - source: Source organization (required)
        - tdoc_type: Document type classification (required)
        - for: Purpose (agreement, discussion, etc.) (required)
        - agenda_item: Associated agenda item (required)
        - status: Document status (optional)
        - is_revision_of: Reference to previous TDoc version (optional)
    """
    soup = BeautifulSoup(html, "html.parser")

    _validate_page_content(soup, tdoc_id)
    table = _get_metadata_table(soup, tdoc_id)
    metadata = _parse_metadata_table(table)
    _validate_mandatory_fields(metadata, tdoc_id)

    agenda_item_value = metadata.get("agenda_item_nbr") or "0"
    return TDocMetadata(
        tdoc_id=tdoc_id,