fix(meetings/tdocs): switch to SUBWORKING_GROUP_RECORDS and align parsing/normalization types (d115aeef) · Commits · Jan Reimes / 3gpp-crawler

src/tdoc_crawler/database/tdocs.py

+2 −3

Original line number	Diff line number	Diff line
		@@ -4,7 +4,6 @@ from collections.abc import Callable, Iterable
		from datetime import UTC, datetime
		from decimal import Decimal

		from tdoc_crawler.database.base import DocDatabase
		from tdoc_crawler.database.meetings import MeetingDatabase
		from tdoc_crawler.logging import get_logger
		from tdoc_crawler.models import WorkingGroup
		@@ -14,11 +13,11 @@ from tdoc_crawler.utils.misc import utc_now
		_logger = get_logger(__name__)


		class TDocDatabase(DocDatabase):
		class TDocDatabase(MeetingDatabase):
		"""Unified database operations for TDocs and Meetings.

		This class provides a unified interface for both TDoc and Meeting operations
		by inheriting from DocDatabase. This maintains backward compatibility
		by inheriting from MeetingDatabase. This maintains backward compatibility
		with code that expects a single database interface.
		"""

src/tdoc_crawler/meetings/operations/crawl.py

+16 −9

Original line number	Diff line number	Diff line
		@@ -6,7 +6,6 @@ from collections import defaultdict
		from collections.abc import Callable
		from dataclasses import dataclass

		from tdoc_crawler.constants.registry import MEETING_CODE_REGISTRY
		from tdoc_crawler.constants.urls import MEETINGS_BASE_URL
		from tdoc_crawler.credentials import resolve_credentials
		from tdoc_crawler.database.meetings import MeetingDatabase
		@@ -14,6 +13,7 @@ from tdoc_crawler.http_client import create_cached_session
		from tdoc_crawler.logging import get_logger
		from tdoc_crawler.meetings.models import MeetingCrawlConfig, MeetingMetadata
		from tdoc_crawler.models.crawl_limits import CrawlLimits
		from tdoc_crawler.models.subworking_groups import SUBWORKING_GROUP_RECORDS
		from tdoc_crawler.models.working_groups import WorkingGroup
		from tdoc_crawler.parsers.meetings import parse_meeting_page

		@@ -68,30 +68,37 @@ class MeetingCrawler:

		try:
		for working_group in working_groups:
		for code, subgroup in MEETING_CODE_REGISTRY.get(working_group.value, []):
		# Filter records for the current working group
		relevant_records = [r for r in SUBWORKING_GROUP_RECORDS if r.tbid == working_group.tbid]

		for record in relevant_records:
		# Skip subgroup if subgroups filter is set and this subgroup is not in the list
		if config.subgroups and subgroup not in config.subgroups:
		if config.subgroups and record.code not in config.subgroups:
		continue
		url = MEETINGS_BASE_URL.format(code=code)
		url = MEETINGS_BASE_URL.format(code=record.code)
		try:
		response = session.get(url, timeout=config.timeout)
		response.raise_for_status()
		except Exception as exc:
		message = f"Meeting crawl failed for {code}: {exc}"
		message = f"Meeting crawl failed for {record.code}: {exc}"
		logger.warning(message)
		errors.append(message)
		continue

		# TODO: can this be made much more efficient by directly accessing properties of the enum/dataclasses?
		# Create callback for subtb lookup
		def get_subtb(subgroup_code: str) -> int | None:
		"""Get subtb from subgroup code via database lookup."""
		subgroup_data = self.database.get_subgroup_by_code(subgroup_code)
		return subgroup_data["subtb"] if subgroup_data else None
		"""Get subtb from subgroup code via record lookup."""
		# Using SUBWORKING_GROUP_RECORDS directly as it's the source of truth
		for r in SUBWORKING_GROUP_RECORDS:
		if r.code == subgroup_code:
		return r.subtb
		return None

		parsed_meetings = parse_meeting_page(
		response.text,
		working_group,
		subgroup,
		record.code,
		get_subtb=get_subtb,
		)
		for meeting in parsed_meetings:

src/tdoc_crawler/tdocs/sources/whatthespec.py

+2 −1

Original line number	Diff line number	Diff line
		@@ -72,7 +72,8 @@ def resolve_via_whatthespec(

		record = payload[0] or {}
		resolved_id = str(record.get("name") or tdoc_id).strip().upper()
		meeting_name = str(record.get("meeting") or "")
		meeting_raw = record.get("meeting")
		meeting_name = str(meeting_raw) if meeting_raw else None
		agenda_item_nbr = parse_agenda_item_nbr(record.get("ainumber"))

		manager = resolve_cache_manager(cache_manager_name)

src/tdoc_crawler/utils/normalization.py

+4 −1

Original line number	Diff line number	Diff line
		@@ -14,7 +14,7 @@ def normalize_tdoc_ids(ids: Iterable[str]) -> list[str]:
		return [str(value).strip().upper() for value in ids]


		def normalize_portal_meeting_name(portal_meeting: str) -> str:
		def normalize_portal_meeting_name(portal_meeting: str | None) -> str:
		"""Normalize portal meeting name to database format.

		The portal uses format like "SA4#133-e" while the database uses "S4-133-e".
		@@ -26,6 +26,9 @@ def normalize_portal_meeting_name(portal_meeting: str) -> str:
		Returns:
		Normalized meeting name (e.g., "S4-133-e")
		"""
		if not portal_meeting:
		return ""

		# Replace "SA4#" with "S4-", "RAN1#" with "R1-", etc.
		normalized = portal_meeting.replace("#", "-")

src/tdoc_crawler/utils/parse.py

+4 −6

Original line number	Diff line number	Diff line
		@@ -77,11 +77,7 @@ def parse_working_groups(values: list[str] | None, subgroups: list[str] | None =
		for item in values:
		# Try alias normalization first (RP->RAN, SP->SA, CP->CT)
		normalized = normalize_working_group_alias(item)
		try:
		resolved.append(WorkingGroup(normalized.upper()))
		except ValueError as exc:
		_logger.warning(f"Unknown working group: {item}")
		raise typer.Exit(code=2) from exc
		resolved.append(normalized)
		if not resolved:
		_logger.warning("No valid working groups specified")
		raise typer.Exit(code=2)
		@@ -89,16 +85,18 @@ def parse_working_groups(values: list[str] | None, subgroups: list[str] | None =


		def parse_subgroups(values: list[str] | None) -> list[str] | None:
		"""Parse and normalize subgroup aliases to canonical names."""
		"""Parse and normalize subgroup aliases to canonical subgroup codes."""
		if not values:
		return None

		resolved: list[str] = []
		for item in values:
		# Convert SubWorkingGroup enums to their names (e.g., S4, R1, CP)
		normalized = normalize_subgroup_alias(item)
		if not normalized:
		_logger.warning(f"Unknown subgroup: {item}")
		raise typer.Exit(code=2)

		resolved.extend(normalized)

		return resolved