Commit d115aeef authored by Jan Reimes
Browse files

fix(meetings/tdocs): switch to SUBWORKING_GROUP_RECORDS and align parsing/normalization types

parent 928d0a36
Loading
Loading
Loading
Loading
+2 −3
Original line number Diff line number Diff line
@@ -4,7 +4,6 @@ from collections.abc import Callable, Iterable
from datetime import UTC, datetime
from decimal import Decimal

from tdoc_crawler.database.base import DocDatabase
from tdoc_crawler.database.meetings import MeetingDatabase
from tdoc_crawler.logging import get_logger
from tdoc_crawler.models import WorkingGroup
@@ -14,11 +13,11 @@ from tdoc_crawler.utils.misc import utc_now
_logger = get_logger(__name__)


class TDocDatabase(DocDatabase):
class TDocDatabase(MeetingDatabase):
    """Unified database operations for TDocs and Meetings.

    This class provides a unified interface for both TDoc and Meeting operations
    by inheriting from DocDatabase. This maintains backward compatibility
    by inheriting from MeetingDatabase. This maintains backward compatibility
    with code that expects a single database interface.
    """

+16 −9
Original line number Diff line number Diff line
@@ -6,7 +6,6 @@ from collections import defaultdict
from collections.abc import Callable
from dataclasses import dataclass

from tdoc_crawler.constants.registry import MEETING_CODE_REGISTRY
from tdoc_crawler.constants.urls import MEETINGS_BASE_URL
from tdoc_crawler.credentials import resolve_credentials
from tdoc_crawler.database.meetings import MeetingDatabase
@@ -14,6 +13,7 @@ from tdoc_crawler.http_client import create_cached_session
from tdoc_crawler.logging import get_logger
from tdoc_crawler.meetings.models import MeetingCrawlConfig, MeetingMetadata
from tdoc_crawler.models.crawl_limits import CrawlLimits
from tdoc_crawler.models.subworking_groups import SUBWORKING_GROUP_RECORDS
from tdoc_crawler.models.working_groups import WorkingGroup
from tdoc_crawler.parsers.meetings import parse_meeting_page

@@ -68,30 +68,37 @@ class MeetingCrawler:

        try:
            for working_group in working_groups:
                for code, subgroup in MEETING_CODE_REGISTRY.get(working_group.value, []):
                # Filter records for the current working group
                relevant_records = [r for r in SUBWORKING_GROUP_RECORDS if r.tbid == working_group.tbid]

                for record in relevant_records:
                    # Skip subgroup if subgroups filter is set and this subgroup is not in the list
                    if config.subgroups and subgroup not in config.subgroups:
                    if config.subgroups and record.code not in config.subgroups:
                        continue
                    url = MEETINGS_BASE_URL.format(code=code)
                    url = MEETINGS_BASE_URL.format(code=record.code)
                    try:
                        response = session.get(url, timeout=config.timeout)
                        response.raise_for_status()
                    except Exception as exc:
                        message = f"Meeting crawl failed for {code}: {exc}"
                        message = f"Meeting crawl failed for {record.code}: {exc}"
                        logger.warning(message)
                        errors.append(message)
                        continue

                    # TODO: could this be made much more efficient by directly accessing properties of the enum/dataclasses?
                    # Create callback for subtb lookup
                    def get_subtb(subgroup_code: str) -> int | None:
                        """Get subtb from subgroup code via database lookup."""
                        subgroup_data = self.database.get_subgroup_by_code(subgroup_code)
                        return subgroup_data["subtb"] if subgroup_data else None
                        """Get subtb from subgroup code via record lookup."""
                        # Using SUBWORKING_GROUP_RECORDS directly as it's the source of truth
                        for r in SUBWORKING_GROUP_RECORDS:
                            if r.code == subgroup_code:
                                return r.subtb
                        return None

                    parsed_meetings = parse_meeting_page(
                        response.text,
                        working_group,
                        subgroup,
                        record.code,
                        get_subtb=get_subtb,
                    )
                    for meeting in parsed_meetings:
+2 −1
Original line number Diff line number Diff line
@@ -72,7 +72,8 @@ def resolve_via_whatthespec(

    record = payload[0] or {}
    resolved_id = str(record.get("name") or tdoc_id).strip().upper()
    meeting_name = str(record.get("meeting") or "")
    meeting_raw = record.get("meeting")
    meeting_name = str(meeting_raw) if meeting_raw else None
    agenda_item_nbr = parse_agenda_item_nbr(record.get("ainumber"))

    manager = resolve_cache_manager(cache_manager_name)
+4 −1
Original line number Diff line number Diff line
@@ -14,7 +14,7 @@ def normalize_tdoc_ids(ids: Iterable[str]) -> list[str]:
    return [str(value).strip().upper() for value in ids]


def normalize_portal_meeting_name(portal_meeting: str) -> str:
def normalize_portal_meeting_name(portal_meeting: str | None) -> str:
    """Normalize portal meeting name to database format.

    The portal uses format like "SA4#133-e" while the database uses "S4-133-e".
@@ -26,6 +26,9 @@ def normalize_portal_meeting_name(portal_meeting: str) -> str:
    Returns:
        Normalized meeting name (e.g., "S4-133-e")
    """
    if not portal_meeting:
        return ""

    # Replace "SA4#" with "S4-", "RAN1#" with "R1-", etc.
    normalized = portal_meeting.replace("#", "-")

+4 −6
Original line number Diff line number Diff line
@@ -77,11 +77,7 @@ def parse_working_groups(values: list[str] | None, subgroups: list[str] | None =
    for item in values:
        # Try alias normalization first (RP->RAN, SP->SA, CP->CT)
        normalized = normalize_working_group_alias(item)
        try:
            resolved.append(WorkingGroup(normalized.upper()))
        except ValueError as exc:
            _logger.warning(f"Unknown working group: {item}")
            raise typer.Exit(code=2) from exc
        resolved.append(normalized)
    if not resolved:
        _logger.warning("No valid working groups specified")
        raise typer.Exit(code=2)
@@ -89,16 +85,18 @@ def parse_working_groups(values: list[str] | None, subgroups: list[str] | None =


def parse_subgroups(values: list[str] | None) -> list[str] | None:
    """Parse and normalize subgroup aliases to canonical names."""
    """Parse and normalize subgroup aliases to canonical subgroup codes."""
    if not values:
        return None

    resolved: list[str] = []
    for item in values:
        # Convert SubWorkingGroup enums to their names (e.g., S4, R1, CP)
        normalized = normalize_subgroup_alias(item)
        if not normalized:
            _logger.warning(f"Unknown subgroup: {item}")
            raise typer.Exit(code=2)

        resolved.extend(normalized)

    return resolved