Commit 34e0cc28 authored by Jan Reimes
Browse files

feat(meetings): enhance meeting crawling with date filters and REST API

* Add start_date and end_date fields to MeetingCrawlConfig for filtering.
* Update crawl_meetings function to accept date parameters.
* Introduce MEETINGS_REST_URL for fetching meeting data via REST API.
* Remove legacy MeetingParser and related parsing logic.
parent 0f809b42
Loading
Loading
Loading
Loading
+2 −0
Original line number Diff line number Diff line
@@ -85,6 +85,8 @@ def crawl_meetings(
        limit_meetings=limit_meetings,
        limit_meetings_per_subwg=limit_meetings_per_subwg,
        limit_subwgs=limit_subwgs,
        start_date=_parse_date(start_date),
        end_date=_parse_date(end_date, is_end=True),
    )

    db_file = crawler_config.path.db_file
+2 −2
Original line number Diff line number Diff line
@@ -13,7 +13,7 @@ from tdoc_crawler.constants.patterns import (
)
from tdoc_crawler.constants.urls import (
    LOGIN_URL,
    MEETINGS_BASE_URL,
    MEETINGS_REST_URL,
    PORTAL_BASE_URL,
    SPEC_URL_TEMPLATE,
    TDOC_DOWNLOAD_URL,
@@ -25,7 +25,7 @@ __all__ = [
    "EXCLUDED_DIRS",
    "EXCLUDED_DIRS_NORMALIZED",
    "LOGIN_URL",
    "MEETINGS_BASE_URL",
    "MEETINGS_REST_URL",
    "PORTAL_BASE_URL",
    "SPEC_URL_TEMPLATE",
    "TDOC_DOWNLOAD_URL",
+3 −2
Original line number Diff line number Diff line
@@ -4,8 +4,9 @@ from __future__ import annotations

from typing import Final

MEETINGS_BASE_URL: Final[str] = "https://www.3gpp.org/dynareport?code=Meetings-{code}.htm"
PORTAL_BASE_URL: Final[str] = "https://portal.3gpp.org"

MEETINGS_REST_URL: Final[str] = f"{PORTAL_BASE_URL}/webservices/Rest/Meetings.svc/GetMeetings"
TDOC_VIEW_URL: Final[str] = f"{PORTAL_BASE_URL}/ngppapp/CreateTdoc.Aspx"
TDOC_DOWNLOAD_URL: Final[str] = f"{PORTAL_BASE_URL}/ngppapp/DownloadTDoc.aspx"
LOGIN_URL: Final[str] = f"{PORTAL_BASE_URL}/login.aspx"
@@ -25,7 +26,7 @@ BROWSER_HEADERS: dict[str, str] = {

__all__ = [
    "LOGIN_URL",
    "MEETINGS_BASE_URL",
    "MEETINGS_REST_URL",
    "PORTAL_BASE_URL",
    "SPEC_URL_TEMPLATE",
    "TDOC_DOWNLOAD_URL",
+2 −0
Original line number Diff line number Diff line
@@ -81,6 +81,8 @@ class MeetingCrawlConfig(BaseModel):
    limit_meetings: int | None = Field(None, description="Maximum meetings to crawl overall (negative = newest N)")
    limit_meetings_per_subwg: int | None = Field(None, description="Per sub-working group meeting limit")
    limit_subwgs: int | None = Field(None, description="Maximum number of sub-working groups to process")
    start_date: date | None = Field(None, description="Only crawl meetings starting from this date")
    end_date: date | None = Field(None, description="Only crawl meetings ending before this date")

    @field_validator("working_groups", mode="before")
    @classmethod
+62 −115
Original line number Diff line number Diff line
"""Meeting crawler for retrieving meeting metadata from 3GPP portal."""
"""Meeting crawler for retrieving meeting metadata from 3GPP portal REST API."""

from __future__ import annotations

from collections import defaultdict
from collections.abc import Callable
from dataclasses import dataclass
from typing import Any

from tdoc_crawler.constants.urls import MEETINGS_BASE_URL
from tdoc_crawler.credentials import resolve_credentials
from tdoc_crawler.database.meetings import MeetingDatabase
from tdoc_crawler.http_client import create_cached_session
from tdoc_crawler.logging import get_logger
from tdoc_crawler.meetings.models import MeetingCrawlConfig, MeetingMetadata
from tdoc_crawler.models.subworking_groups import (
    CODE_INDEX,
    SUBWORKING_GROUP_RECORDS,
    SubWorkingGroupRecord,
)
from tdoc_crawler.meetings.sources.portal import fetch_meetings
from tdoc_crawler.models.working_groups import WorkingGroup
from tdoc_crawler.parsers.meetings import parse_meeting_page

logger = get_logger(__name__)

@@ -35,10 +26,9 @@ class MeetingCrawlResult:


class MeetingCrawler:
    """Crawler fetching meeting metadata from the 3GPP portal."""
    """Crawler fetching meeting metadata from the 3GPP portal REST API."""

    def __init__(self, database: MeetingDatabase) -> None:

        if not isinstance(database, MeetingDatabase):
            msg = f"Expected MeetingDatabase, got {type(database)}"
            raise TypeError(msg)
@@ -49,84 +39,83 @@ class MeetingCrawler:
        config: MeetingCrawlConfig,
        progress_callback: Callable[[float, float], None] | None = None,
    ) -> MeetingCrawlResult:
        """Crawl meeting metadata from 3GPP portal.
        """Crawl meeting metadata from the portal REST API.

        Args:
            config: Configuration for crawl operation
            progress_callback: Optional callback for progress updates (processed, total)
            config: Configuration for crawl operation.
            progress_callback: Optional callback for progress updates.

        Returns:
            Summary of crawl results (processed, inserted, updated, errors)
            Summary of crawl results.
        """
        errors: list[str] = []
        meetings: list[MeetingMetadata] = []

        working_groups = self._limit_working_groups(config.working_groups, config.limit_subwgs)
        existing_ids: set[int] = set()
        if config.incremental:
            existing_ids = await self.database.get_existing_meeting_ids(working_groups)
        credentials = resolve_credentials(None, None)
        session = create_cached_session(http_config=config.http_config)
        if credentials is not None:
            session.auth = (credentials.username, credentials.password)
        # Determine date range from config
        start_date = config.start_date
        end_date = config.end_date

        # Build subgroup list for API filter
        subgroups: list[str] | None = None
        if config.subgroups:
            subgroups = list(config.subgroups)
        elif config.working_groups:
            # Convert WorkingGroup enum values to subgroup codes
            codes: set[str] = set()
            for wg in config.working_groups:
                if wg == WorkingGroup.RAN:
                    codes.add("RP")
                elif wg == WorkingGroup.SA:
                    codes.add("SP")
                elif wg == WorkingGroup.CT:
                    codes.add("CP")
                else:
                    codes.add(wg.value)
            subgroups = sorted(codes)

        if progress_callback:
            progress_callback(0, 0)

        try:
            for working_group in working_groups:
                meetings.extend(
                    self._crawl_working_group(
                        session=session,
                        working_group=working_group,
                        config=config,
                        existing_ids=existing_ids,
                        errors=errors,
                    ),
            meetings = fetch_meetings(
                subgroups=subgroups,
                start_date=start_date,
                end_date=end_date,
                timeout=config.timeout,
            )
        finally:
            session.close()
        except Exception as exc:
            msg = f"Portal meetings API failed: {exc}"
            logger.exception(msg)
            errors.append(msg)
            return MeetingCrawlResult(processed=0, inserted=0, updated=0, errors=errors)

        if progress_callback:
            progress_callback(len(meetings), len(meetings))

        # Apply incremental filter
        if config.incremental and meetings:
            existing_ids = await self.database.get_existing_meeting_ids(config.working_groups)
            meetings = [m for m in meetings if m.meeting_id not in existing_ids]

        # Apply meeting count limits
        meetings = self._apply_limits(meetings, config.limit_meetings, config.limit_meetings_per_subwg)

        filtered = self._apply_limits(meetings, config.limit_meetings, config.limit_meetings_per_subwg)
        inserted = 0
        updated = 0
        if filtered:
            # Pass progress callback to bulk_upsert_meetings to update after each DB operation
        if meetings:
            inserted, updated = await self.database.bulk_upsert_meetings(
                filtered,
                meetings,
                progress_callback=progress_callback,
            )

        return MeetingCrawlResult(
            processed=len(filtered),
            processed=len(meetings),
            inserted=inserted,
            updated=updated,
            errors=errors,
        )

    def _crawl_working_group(
        self,
        session: Any,
        working_group: WorkingGroup,
        config: MeetingCrawlConfig,
        existing_ids: set[int],
        errors: list[str],
    ) -> list[MeetingMetadata]:
        """Crawl and parse meeting metadata for one working group."""
        meetings: list[MeetingMetadata] = []
        records = [record for record in SUBWORKING_GROUP_RECORDS if record.tbid == working_group.tbid and self._record_is_in_scope(record, config)]
        for record in records:
            html_text = self._fetch_meeting_page_text(session, record, config.timeout, errors)
            if html_text is None:
                continue
            parsed_meetings = parse_meeting_page(
                html_text,
                working_group,
                record.code,
                get_subtb=self._get_subtb,
            )
            meetings.extend(meeting for meeting in parsed_meetings if self._include_meeting(meeting, config, existing_ids))
        return meetings

    @staticmethod
    def _apply_limits(
        self,
        meetings: list[MeetingMetadata],
        limit_meetings: int | None,
        limit_meetings_per_subwg: int | None,
@@ -135,50 +124,8 @@ class MeetingCrawler:
        if not meetings:
            return []
        filtered = list(meetings)
        filtered = self._limit_meetings_per_subwg(filtered, limit_meetings_per_subwg)
        return self._limit_meetings(filtered, limit_meetings)

    @staticmethod
    def _get_subtb(subgroup_code: str) -> int | None:
        """Resolve subgroup code to subtb via lookup index."""
        record = CODE_INDEX.get(subgroup_code.upper())
        return record.subtb if record else None

    @staticmethod
    def _record_is_in_scope(record: SubWorkingGroupRecord, config: MeetingCrawlConfig) -> bool:
        """Check whether subgroup record should be crawled for the current config."""
        return not config.subgroups or record.code in config.subgroups

    @staticmethod
    def _include_meeting(meeting: MeetingMetadata, config: MeetingCrawlConfig, existing_ids: set[int]) -> bool:
        """Apply incremental and files-url filters before persisting meetings."""
        if config.incremental and meeting.meeting_id in existing_ids:
            return False
        return config.include_without_files or bool(meeting.files_url)

    @staticmethod
    def _fetch_meeting_page_text(session: Any, record: SubWorkingGroupRecord, timeout: float, errors: list[str]) -> str | None:
        """Fetch meeting page HTML text for one subgroup record."""
        url = MEETINGS_BASE_URL.format(code=record.code)
        try:
            response = session.get(url, timeout=timeout)
            response.raise_for_status()
        except Exception as exc:
            message = f"Meeting crawl failed for {record.code}: {exc}"
            logger.warning(message)
            errors.append(message)
            return None
        return response.text

    @staticmethod
    def _limit_working_groups(
        working_groups: list[WorkingGroup],
        limit_subwgs: int | None,
    ) -> list[WorkingGroup]:
        """Apply working group limits from crawl configuration."""
        if limit_subwgs is None or limit_subwgs == 0:
            return working_groups
        return working_groups[:limit_subwgs] if limit_subwgs > 0 else working_groups[limit_subwgs:]
        filtered = MeetingCrawler._limit_meetings_per_subwg(filtered, limit_meetings_per_subwg)
        return MeetingCrawler._limit_meetings(filtered, limit_meetings)

    @staticmethod
    def _limit_meetings_per_subwg(
@@ -198,7 +145,7 @@ class MeetingCrawler:
        for wg_id, sequence in order.items():
            selected = sequence[:limit] if limit > 0 else sequence[limit:]
            allowed_ids[wg_id] = set(selected)
        return [meeting for meeting in meetings if meeting.meeting_id in allowed_ids.get(meeting.subtb or meeting.tbid, {meeting.meeting_id})]
        return [m for m in meetings if m.meeting_id in allowed_ids.get(m.subtb or m.tbid, {m.meeting_id})]

    @staticmethod
    def _limit_meetings(
@@ -213,7 +160,7 @@ class MeetingCrawler:
            if meeting.meeting_id not in sequence:
                sequence.append(meeting.meeting_id)
        allowed = set(sequence[:limit]) if limit > 0 else set(sequence[limit:])
        return [meeting for meeting in meetings if meeting.meeting_id in allowed]
        return [m for m in meetings if m.meeting_id in allowed]


__all__ = [
Loading