Loading src/tdoc_crawler/database/tdocs.py +2 −3 Original line number Diff line number Diff line Loading @@ -4,7 +4,6 @@ from collections.abc import Callable, Iterable from datetime import UTC, datetime from decimal import Decimal from tdoc_crawler.database.base import DocDatabase from tdoc_crawler.database.meetings import MeetingDatabase from tdoc_crawler.logging import get_logger from tdoc_crawler.models import WorkingGroup Loading @@ -14,11 +13,11 @@ from tdoc_crawler.utils.misc import utc_now _logger = get_logger(__name__) class TDocDatabase(DocDatabase): class TDocDatabase(MeetingDatabase): """Unified database operations for TDocs and Meetings. This class provides a unified interface for both TDoc and Meeting operations by inheriting from DocDatabase. This maintains backward compatibility by inheriting from MeetingDatabase. This maintains backward compatibility with code that expects a single database interface. """ Loading src/tdoc_crawler/meetings/operations/crawl.py +16 −9 Original line number Diff line number Diff line Loading @@ -6,7 +6,6 @@ from collections import defaultdict from collections.abc import Callable from dataclasses import dataclass from tdoc_crawler.constants.registry import MEETING_CODE_REGISTRY from tdoc_crawler.constants.urls import MEETINGS_BASE_URL from tdoc_crawler.credentials import resolve_credentials from tdoc_crawler.database.meetings import MeetingDatabase Loading @@ -14,6 +13,7 @@ from tdoc_crawler.http_client import create_cached_session from tdoc_crawler.logging import get_logger from tdoc_crawler.meetings.models import MeetingCrawlConfig, MeetingMetadata from tdoc_crawler.models.crawl_limits import CrawlLimits from tdoc_crawler.models.subworking_groups import SUBWORKING_GROUP_RECORDS from tdoc_crawler.models.working_groups import WorkingGroup from tdoc_crawler.parsers.meetings import parse_meeting_page Loading Loading @@ -68,30 +68,37 @@ class MeetingCrawler: try: for working_group in working_groups: for code, subgroup in MEETING_CODE_REGISTRY.get(working_group.value, []): # Filter records for the current working group relevant_records = [r for r in SUBWORKING_GROUP_RECORDS if r.tbid == working_group.tbid] for record in relevant_records: # Skip subgroup if subgroups filter is set and this subgroup is not in the list if config.subgroups and subgroup not in config.subgroups: if config.subgroups and record.code not in config.subgroups: continue url = MEETINGS_BASE_URL.format(code=code) url = MEETINGS_BASE_URL.format(code=record.code) try: response = session.get(url, timeout=config.timeout) response.raise_for_status() except Exception as exc: message = f"Meeting crawl failed for {code}: {exc}" message = f"Meeting crawl failed for {record.code}: {exc}" logger.warning(message) errors.append(message) continue # TODO: can be made much efficient by directly accessing properties of enum/dataclasses? # Create callback for subtb lookup def get_subtb(subgroup_code: str) -> int | None: """Get subtb from subgroup code via database lookup.""" subgroup_data = self.database.get_subgroup_by_code(subgroup_code) return subgroup_data["subtb"] if subgroup_data else None """Get subtb from subgroup code via record lookup.""" # Using SUBWORKING_GROUP_RECORDS directly as it's the source of truth for r in SUBWORKING_GROUP_RECORDS: if r.code == subgroup_code: return r.subtb return None parsed_meetings = parse_meeting_page( response.text, working_group, subgroup, record.code, get_subtb=get_subtb, ) for meeting in parsed_meetings: Loading src/tdoc_crawler/tdocs/sources/whatthespec.py +2 −1 Original line number Diff line number Diff line Loading @@ -72,7 +72,8 @@ def resolve_via_whatthespec( record = payload[0] or {} resolved_id = str(record.get("name") or tdoc_id).strip().upper() meeting_name = str(record.get("meeting") or "") meeting_raw = record.get("meeting") meeting_name = str(meeting_raw) if meeting_raw else None agenda_item_nbr = parse_agenda_item_nbr(record.get("ainumber")) manager = resolve_cache_manager(cache_manager_name) Loading src/tdoc_crawler/utils/normalization.py +4 −1 Original line number Diff line number Diff line Loading @@ -14,7 +14,7 @@ def normalize_tdoc_ids(ids: Iterable[str]) -> list[str]: return [str(value).strip().upper() for value in ids] def normalize_portal_meeting_name(portal_meeting: str) -> str: def normalize_portal_meeting_name(portal_meeting: str | None) -> str: """Normalize portal meeting name to database format. The portal uses format like "SA4#133-e" while the database uses "S4-133-e". Loading @@ -26,6 +26,9 @@ def normalize_portal_meeting_name(portal_meeting: str) -> str: Returns: Normalized meeting name (e.g., "S4-133-e") """ if not portal_meeting: return "" # Replace "SA4#" with "S4-", "RAN1#" with "R1-", etc. normalized = portal_meeting.replace("#", "-") Loading src/tdoc_crawler/utils/parse.py +4 −6 Original line number Diff line number Diff line Loading @@ -77,11 +77,7 @@ def parse_working_groups(values: list[str] | None, subgroups: list[str] | None = for item in values: # Try alias normalization first (RP->RAN, SP->SA, CP->CT) normalized = normalize_working_group_alias(item) try: resolved.append(WorkingGroup(normalized.upper())) except ValueError as exc: _logger.warning(f"Unknown working group: {item}") raise typer.Exit(code=2) from exc resolved.append(normalized) if not resolved: _logger.warning("No valid working groups specified") raise typer.Exit(code=2) Loading @@ -89,16 +85,18 @@ def parse_working_groups(values: list[str] | None, subgroups: list[str] | None = def parse_subgroups(values: list[str] | None) -> list[str] | None: """Parse and normalize subgroup aliases to canonical names.""" """Parse and normalize subgroup aliases to canonical subgroup codes.""" if not values: return None resolved: list[str] = [] for item in values: # Convert SubWorkingGroup enums to their names (e.g., S4, R1, CP) normalized = normalize_subgroup_alias(item) if not normalized: _logger.warning(f"Unknown subgroup: {item}") raise typer.Exit(code=2) resolved.extend(normalized) return resolved Loading Loading
src/tdoc_crawler/database/tdocs.py +2 −3 Original line number Diff line number Diff line Loading @@ -4,7 +4,6 @@ from collections.abc import Callable, Iterable from datetime import UTC, datetime from decimal import Decimal from tdoc_crawler.database.base import DocDatabase from tdoc_crawler.database.meetings import MeetingDatabase from tdoc_crawler.logging import get_logger from tdoc_crawler.models import WorkingGroup Loading @@ -14,11 +13,11 @@ from tdoc_crawler.utils.misc import utc_now _logger = get_logger(__name__) class TDocDatabase(DocDatabase): class TDocDatabase(MeetingDatabase): """Unified database operations for TDocs and Meetings. This class provides a unified interface for both TDoc and Meeting operations by inheriting from DocDatabase. This maintains backward compatibility by inheriting from MeetingDatabase. This maintains backward compatibility with code that expects a single database interface. """ Loading
src/tdoc_crawler/meetings/operations/crawl.py +16 −9 Original line number Diff line number Diff line Loading @@ -6,7 +6,6 @@ from collections import defaultdict from collections.abc import Callable from dataclasses import dataclass from tdoc_crawler.constants.registry import MEETING_CODE_REGISTRY from tdoc_crawler.constants.urls import MEETINGS_BASE_URL from tdoc_crawler.credentials import resolve_credentials from tdoc_crawler.database.meetings import MeetingDatabase Loading @@ -14,6 +13,7 @@ from tdoc_crawler.http_client import create_cached_session from tdoc_crawler.logging import get_logger from tdoc_crawler.meetings.models import MeetingCrawlConfig, MeetingMetadata from tdoc_crawler.models.crawl_limits import CrawlLimits from tdoc_crawler.models.subworking_groups import SUBWORKING_GROUP_RECORDS from tdoc_crawler.models.working_groups import WorkingGroup from tdoc_crawler.parsers.meetings import parse_meeting_page Loading Loading @@ -68,30 +68,37 @@ class MeetingCrawler: try: for working_group in working_groups: for code, subgroup in MEETING_CODE_REGISTRY.get(working_group.value, []): # Filter records for the current working group relevant_records = [r for r in SUBWORKING_GROUP_RECORDS if r.tbid == working_group.tbid] for record in relevant_records: # Skip subgroup if subgroups filter is set and this subgroup is not in the list if config.subgroups and subgroup not in config.subgroups: if config.subgroups and record.code not in config.subgroups: continue url = MEETINGS_BASE_URL.format(code=code) url = MEETINGS_BASE_URL.format(code=record.code) try: response = session.get(url, timeout=config.timeout) response.raise_for_status() except Exception as exc: message = f"Meeting crawl failed for {code}: {exc}" message = f"Meeting crawl failed for {record.code}: {exc}" logger.warning(message) errors.append(message) continue # TODO: can be made much efficient by directly accessing properties of enum/dataclasses? # Create callback for subtb lookup def get_subtb(subgroup_code: str) -> int | None: """Get subtb from subgroup code via database lookup.""" subgroup_data = self.database.get_subgroup_by_code(subgroup_code) return subgroup_data["subtb"] if subgroup_data else None """Get subtb from subgroup code via record lookup.""" # Using SUBWORKING_GROUP_RECORDS directly as it's the source of truth for r in SUBWORKING_GROUP_RECORDS: if r.code == subgroup_code: return r.subtb return None parsed_meetings = parse_meeting_page( response.text, working_group, subgroup, record.code, get_subtb=get_subtb, ) for meeting in parsed_meetings: Loading
src/tdoc_crawler/tdocs/sources/whatthespec.py +2 −1 Original line number Diff line number Diff line Loading @@ -72,7 +72,8 @@ def resolve_via_whatthespec( record = payload[0] or {} resolved_id = str(record.get("name") or tdoc_id).strip().upper() meeting_name = str(record.get("meeting") or "") meeting_raw = record.get("meeting") meeting_name = str(meeting_raw) if meeting_raw else None agenda_item_nbr = parse_agenda_item_nbr(record.get("ainumber")) manager = resolve_cache_manager(cache_manager_name) Loading
src/tdoc_crawler/utils/normalization.py +4 −1 Original line number Diff line number Diff line Loading @@ -14,7 +14,7 @@ def normalize_tdoc_ids(ids: Iterable[str]) -> list[str]: return [str(value).strip().upper() for value in ids] def normalize_portal_meeting_name(portal_meeting: str) -> str: def normalize_portal_meeting_name(portal_meeting: str | None) -> str: """Normalize portal meeting name to database format. The portal uses format like "SA4#133-e" while the database uses "S4-133-e". Loading @@ -26,6 +26,9 @@ def normalize_portal_meeting_name(portal_meeting: str) -> str: Returns: Normalized meeting name (e.g., "S4-133-e") """ if not portal_meeting: return "" # Replace "SA4#" with "S4-", "RAN1#" with "R1-", etc. normalized = portal_meeting.replace("#", "-") Loading
src/tdoc_crawler/utils/parse.py +4 −6 Original line number Diff line number Diff line Loading @@ -77,11 +77,7 @@ def parse_working_groups(values: list[str] | None, subgroups: list[str] | None = for item in values: # Try alias normalization first (RP->RAN, SP->SA, CP->CT) normalized = normalize_working_group_alias(item) try: resolved.append(WorkingGroup(normalized.upper())) except ValueError as exc: _logger.warning(f"Unknown working group: {item}") raise typer.Exit(code=2) from exc resolved.append(normalized) if not resolved: _logger.warning("No valid working groups specified") raise typer.Exit(code=2) Loading @@ -89,16 +85,18 @@ def parse_working_groups(values: list[str] | None, subgroups: list[str] | None = def parse_subgroups(values: list[str] | None) -> list[str] | None: """Parse and normalize subgroup aliases to canonical names.""" """Parse and normalize subgroup aliases to canonical subgroup codes.""" if not values: return None resolved: list[str] = [] for item in values: # Convert SubWorkingGroup enums to their names (e.g., S4, R1, CP) normalized = normalize_subgroup_alias(item) if not normalized: _logger.warning(f"Unknown subgroup: {item}") raise typer.Exit(code=2) resolved.extend(normalized) return resolved Loading