Loading src/tdoc_crawler/database/meetings.py +14 −12 Original line number Diff line number Diff line Loading @@ -8,7 +8,7 @@ from tdoc_crawler.database.base import DocDatabase from tdoc_crawler.logging import get_logger from tdoc_crawler.meetings.models import MeetingMetadata, MeetingQueryConfig from tdoc_crawler.models.base import SortOrder from tdoc_crawler.models.subworking_groups import CODE_INDEX from tdoc_crawler.models.subworking_groups import CODE_INDEX, SUBTB_INDEX from tdoc_crawler.models.working_groups import WORKING_GROUP_RECORDS, WorkingGroup from tdoc_crawler.utils.misc import utc_now from tdoc_crawler.utils.normalization import normalize_portal_meeting_name Loading Loading @@ -101,12 +101,12 @@ class MeetingDatabase(DocDatabase): meetings = self._table_rows("meetings") if config.working_groups: allowed = {wg.value for wg in config.working_groups} meetings = [meeting for meeting in meetings if meeting.working_group and meeting.working_group in allowed] allowed = {wg.tbid for wg in config.working_groups} meetings = [meeting for meeting in meetings if meeting.tbid in allowed] if config.subgroups: allowed_subgroups = {value.strip().upper() for value in config.subgroups} meetings = [meeting for meeting in meetings if (meeting.subgroup or "").upper() in allowed_subgroups] meetings = [meeting for meeting in meetings if meeting.subtb in SUBTB_INDEX and SUBTB_INDEX[meeting.subtb].code in allowed_subgroups] if not config.include_without_files: meetings = [meeting for meeting in meetings if meeting.files_url] Loading Loading @@ -138,8 +138,8 @@ class MeetingDatabase(DocDatabase): if not working_groups: return {meeting.meeting_id for meeting in meetings} allowed = {wg.value for wg in working_groups} return {meeting.meeting_id for meeting in meetings if meeting.working_group and meeting.working_group in allowed} allowed = {wg.tbid for wg in working_groups} return {meeting.meeting_id for meeting in meetings if meeting.tbid in allowed} def get_tdoc_count_for_meeting(self, meeting_id: int) -> int: """Get the number of TDocs associated with a meeting. Loading Loading @@ -238,10 +238,16 @@ class MeetingDatabase(DocDatabase): crawl_entries = self._table_rows("crawl_log") by_working_group: dict[str, int] = defaultdict(int) tbid_to_code = {working_group.tbid: working_group.value for working_group in WorkingGroup} for record in tdocs: meeting = meetings.get(record.meeting_id or -1) if meeting and meeting.working_group: by_working_group[meeting.working_group] += 1 if meeting is None: continue code = tbid_to_code.get(meeting.tbid) if code is None: _logger.debug("Unknown tbid in meeting %s: %s", meeting.meeting_id, meeting.tbid) continue by_working_group[code] += 1 recent_crawls = [ { Loading Loading @@ -304,10 +310,6 @@ class MeetingDatabase(DocDatabase): @staticmethod def _prepare_meeting(metadata: MeetingMetadata) -> MeetingMetadata: """Prepare meeting metadata for insertion (set defaults).""" if metadata.working_group is None and metadata.tbid: for working_group in WorkingGroup: if working_group.tbid == metadata.tbid: return metadata.model_copy(update={"working_group": working_group}) return metadata @staticmethod Loading src/tdoc_crawler/database/specs.py +212 −161 Original line number Diff line number Diff line Loading @@ -24,6 +24,72 @@ from tdoc_crawler.utils.normalization import normalize_release, normalize_spec_n _logger = get_logger(__name__) def _build_spec_candidate( *, compact: str, metadata_payload: dict[str, object], normalized: str, source_name: str, versions: list[str], ) -> tuple[Specification, list[SpecificationVersion]]: title = str(metadata_payload.get("title", "Unknown")) spec_type = str(metadata_payload.get("spec_type", "TS")) status = str(metadata_payload.get("status", "unknown")) working_group = str(metadata_payload.get("working_group", "unknown")) series = str(metadata_payload.get("series", f"{normalized.split('.', maxsplit=1)[0]}_series")) latest_version = metadata_payload.get("latest_version") if latest_version is None and versions: latest_version = versions[0] candidate = Specification( spec_number=normalized, spec_number_compact=compact, spec_type=spec_type, title=title, status=status, working_group=working_group, series=series, latest_version=str(latest_version) if latest_version is not None else None, ) spec_versions = _build_spec_versions( compact=compact, metadata_payload=metadata_payload, normalized=normalized, source_name=source_name, versions=versions, ) return candidate, spec_versions def _build_spec_versions( *, compact: str, metadata_payload: dict[str, object], normalized: str, source_name: str, versions: list[str], ) -> list[SpecificationVersion]: spec_versions: list[SpecificationVersion] = [] for i, version in enumerate(versions): file_name = f"{compact}-unknown.zip" if "specfile" in metadata_payload and isinstance(metadata_payload["specfile"], list): spec_files = metadata_payload["specfile"] if i < len(spec_files): file_name = str(spec_files[i]) elif "file_name" in metadata_payload: file_name = str(metadata_payload["file_name"]) spec_versions.append( SpecificationVersion( spec_number=normalized, version=str(version), file_name=file_name, source_name=source_name, ) ) return spec_versions def _version_matches_release(version: str, release_type: str, release_value: str, specificity: int) -> bool: """Check if a version string matches the release selector.""" try: Loading Loading @@ -207,7 +273,12 @@ class SpecDatabase(DocDatabase): """ return self._clear_tables(["spec_downloads", "spec_versions", "spec_source_records", "specs"]) def crawl_specs(self, spec_numbers: list[str], release: str, sources: list[SpecSource]) -> list[SpecCrawlResult]: def crawl_specs( self, spec_numbers: list[str], release: str, sources: list[SpecSource], ) -> list[SpecCrawlResult]: """Crawl and store spec metadata for the provided spec numbers. Args: Loading @@ -219,9 +290,36 @@ class SpecDatabase(DocDatabase): List of crawl outcomes for each requested spec. """ results: list[SpecCrawlResult] = [] release_type, release_value, specificity = normalize_release(release) resolved_release = release_value if release_value is not None else "" resolved_specificity = specificity if specificity is not None else 0 for raw_spec in spec_numbers: normalized = normalize_spec_number(raw_spec) compact = normalized.replace(".", "") results.append( self._crawl_single_spec( compact=compact, normalized=normalized, release=release, release_type=release_type, release_value=resolved_release, specificity=resolved_specificity, sources=sources, ) ) return results def _crawl_single_spec( self, *, compact: str, normalized: str, release: str, release_type: str, release_value: str, specificity: int, sources: list[SpecSource], ) -> SpecCrawlResult: outcomes: list[SpecCrawlSourceOutcome] = [] source_records: list[SpecificationSourceRecord] = [] spec_versions: list[SpecificationVersion] = [] Loading Loading @@ -271,51 +369,22 @@ class SpecDatabase(DocDatabase): ) ) title = str(metadata_payload.get("title", "Unknown")) spec_type = str(metadata_payload.get("spec_type", "TS")) status = str(metadata_payload.get("status", "unknown")) working_group = str(metadata_payload.get("working_group", "unknown")) series = str(metadata_payload.get("series", f"{normalized.split('.')[0]}_series")) latest_version = metadata_payload.get("latest_version") if latest_version is None and normalized_versions: latest_version = normalized_versions[0] candidate = Specification( spec_number=normalized, spec_number_compact=compact, spec_type=spec_type, title=title, status=status, working_group=working_group, series=series, latest_version=str(latest_version) if latest_version is not None else None, candidate, versions_payload = _build_spec_candidate( compact=compact, metadata_payload=metadata_payload, normalized=normalized, source_name=source_name, versions=normalized_versions, ) if aggregated is None: aggregated = candidate elif aggregated.latest_version is None and candidate.latest_version is not None: aggregated = aggregated.model_copy(update={"latest_version": candidate.latest_version}) for i, version in enumerate(normalized_versions): # Try to get specific file name for this version from payload file_name = f"{compact}-unknown.zip" if "specfile" in metadata_payload and isinstance(metadata_payload["specfile"], list): if i < len(metadata_payload["specfile"]): file_name = str(metadata_payload["specfile"][i]) elif "file_name" in metadata_payload: file_name = str(metadata_payload["file_name"]) spec_versions.append( SpecificationVersion( spec_number=normalized, version=str(version), file_name=file_name, source_name=source_name, ) ) spec_versions.extend(versions_payload) if not outcomes: results.append( SpecCrawlResult( return SpecCrawlResult( spec_number=normalized, release=release, status="error", Loading @@ -323,15 +392,11 @@ class SpecDatabase(DocDatabase): sources=[], message="no-sources", ) ) continue # Check if 3GPP source failed - if so, skip this spec entirely threegpp_outcome = next((o for o in outcomes if o.source_name == "3gpp"), None) if threegpp_outcome is not None and threegpp_outcome.status == "error": _logger.warning("Skipping spec %s due to 3GPP source error", normalized) results.append( SpecCrawlResult( return SpecCrawlResult( spec_number=normalized, release=release, status="skipped", Loading @@ -339,14 +404,10 @@ class SpecDatabase(DocDatabase): sources=outcomes, message="3gpp-source-error", ) ) continue release_type, release_value, specificity = normalize_release(release) if release_type in ("all", "latest"): release_matches = True elif release_type in ("exact", "prefix"): # release_value and specificity are guaranteed non-None here release_matches = any( _version_matches_release(v, release_type, release_value, specificity) # type: ignore[arg-type] for outcome in outcomes Loading @@ -357,8 +418,7 @@ class SpecDatabase(DocDatabase): release_matches = False if not release_matches: results.append( SpecCrawlResult( return SpecCrawlResult( spec_number=normalized, release=release, status="skipped", Loading @@ -366,12 +426,9 @@ class SpecDatabase(DocDatabase): sources=outcomes, message="release-not-found", ) ) continue if aggregated is None: results.append( SpecCrawlResult( return SpecCrawlResult( spec_number=normalized, release=release, status="error", Loading @@ -379,8 +436,6 @@ class SpecDatabase(DocDatabase): sources=outcomes, message="no-metadata", ) ) continue for record in source_records: self.upsert_spec_source_record(record) Loading @@ -388,17 +443,13 @@ class SpecDatabase(DocDatabase): for version in spec_versions: self.upsert_spec_version(version) results.append( SpecCrawlResult( return SpecCrawlResult( spec_number=normalized, release=release, status="stored", latest_version=aggregated.latest_version, sources=outcomes, ) ) return results def _spec_table_rows(self) -> list[Specification]: return self._table_rows("specs") Loading src/tdoc_crawler/database/tdocs.py +10 −11 Original line number Diff line number Diff line Loading @@ -7,6 +7,7 @@ from decimal import Decimal from tdoc_crawler.database.meetings import MeetingDatabase from tdoc_crawler.logging import get_logger from tdoc_crawler.models import WorkingGroup from tdoc_crawler.models.subworking_groups import SUBTB_INDEX from tdoc_crawler.tdocs.models import TDocMetadata, TDocQueryConfig from tdoc_crawler.utils.misc import utc_now Loading Loading @@ -113,15 +114,8 @@ class TDocDatabase(MeetingDatabase): meeting = meeting_map.get(record.meeting_id or -1) if meeting is None or meeting.tbid not in allowed_tbids: continue record = record.model_copy(update={"meeting_name": meeting.short_name}) filtered.append(record) records = filtered else: # Always ensure meeting_name is populated even without filters for i, record in enumerate(records): meeting = meeting_map.get(record.meeting_id or -1) if meeting: records[i] = record.model_copy(update={"meeting_name": meeting.short_name}) if config.start_date is not None: records = [record for record in records if record.date_retrieved and record.date_retrieved >= config.start_date] Loading Loading @@ -202,7 +196,13 @@ class TDocDatabase(MeetingDatabase): continue if allowed_tbids and meeting.tbid not in allowed_tbids: continue if allowed_subgroups and (meeting.subgroup or "").upper() not in allowed_subgroups: if allowed_subgroups: subgroup_code = None if meeting.subtb is not None: subgroup_record = SUBTB_INDEX.get(meeting.subtb) if subgroup_record: subgroup_code = subgroup_record.code if subgroup_code is None or subgroup_code.upper() not in allowed_subgroups: continue processed.add(record.meeting_id) return processed Loading Loading @@ -238,7 +238,6 @@ class TDocDatabase(MeetingDatabase): agenda_item_nbr=Decimal("0.0"), agenda_item_text="Unknown", status=None, meeting_name=None, is_revision_of=None, file_size=None, date_created=None, Loading @@ -258,7 +257,7 @@ class TDocDatabase(MeetingDatabase): def _get_tdoc(self, tdoc_id: str) -> TDocMetadata | None: """Get a TDoc by ID.""" try: return self.connection.model_from_table("tdocs", tdoc_id.upper()) # type: ignore[arg-type] return self.connection.model_from_table("tdocs", tdoc_id.upper()) except KeyError: return None Loading src/tdoc_crawler/meetings/models.py +17 −51 Original line number Diff line number Diff line Loading @@ -9,7 +9,7 @@ from pydantic import BaseModel, Field, field_validator, model_validator from tdoc_crawler.models.base import BaseConfigModel, SortOrder from tdoc_crawler.models.crawl_limits import CrawlLimits, _new_crawl_limits from tdoc_crawler.models.subworking_groups import CODE_INDEX, SUBTB_INDEX from tdoc_crawler.models.subworking_groups import SUBTB_INDEX from tdoc_crawler.models.working_groups import WorkingGroup from tdoc_crawler.utils.misc import utc_now Loading @@ -19,10 +19,14 @@ class MeetingMetadata(BaseModel): # parsed from the portal meeting_id: int = Field(..., description="Unique meeting identifier from the 3GPP portal") tbid: int = Field(..., description="Technical body identifier of the parent working group") subtb: int | None = Field(..., description="Sub-technical body identifier of the subworking group") working_group: str | None = Field(None, description="Working group (as string, e.g., 'SA')") subgroup: str | None = Field(None, description="Canonical subgroup code (e.g., 'R1')") tbid: int = Field( ..., description="Technical body identifier of the parent working group (FK to working_groups.tbid)", ) subtb: int | None = Field( ..., description="Sub-technical body identifier (FK to subworking_groups.subtb)", ) short_name: str = Field(..., description="Short meeting name (e.g., SA4#134)") title: str | None = Field(None, description="Descriptive meeting title") start_date: date | None = Field(None, description="Meeting start date") Loading @@ -41,53 +45,15 @@ class MeetingMetadata(BaseModel): @model_validator(mode="after") def _sync_relationships(self) -> MeetingMetadata: # working_group is now a string, so match tbid if not set if self.working_group is None: for working_group in WorkingGroup: if working_group.tbid == self.tbid: self.working_group = working_group.value break else: # Validate tbid matches working_group try: wg = WorkingGroup(self.working_group) except Exception: raise ValueError(f"Invalid working_group: {self.working_group}") if wg.tbid != self.tbid: msg = "tbid does not match provided working_group" raise ValueError(msg) if self.subtb is None and self.subgroup: record = CODE_INDEX.get(self.subgroup) if record is not None: self.subtb = record.subtb if self.subgroup is None and self.subtb is not None: if self.subtb is None: return self record = SUBTB_INDEX.get(self.subtb) if record is not None: self.subgroup = record.code if record is None: return self if record.tbid != self.tbid: msg = "tbid does not match subworking group" raise ValueError(msg) return self @field_validator("working_group", mode="before") @classmethod def _validate_working_group(cls, value: WorkingGroup | str | None) -> str | None: if value is None: return None # Accept enum or str, store as str if isinstance(value, WorkingGroup): return value.value if value not in {wg.value for wg in WorkingGroup}: raise ValueError(f"Invalid working_group: {value}") return value @field_validator("subgroup", mode="before") @classmethod def _normalize_subgroup(cls, value: str | None) -> str | None: if value is None: return None normalized = value.strip().upper() return normalized or None class MeetingCrawlConfig(BaseConfigModel): Loading src/tdoc_crawler/models/subworking_groups.py +4 −1 Original line number Diff line number Diff line Loading @@ -82,7 +82,10 @@ class SubWorkingGroupRecord(BaseModel): """Persistent representation of a subworking group.""" subtb: int = Field(..., description="Sub-technical body identifier") tbid: int = Field(..., description="Parent technical body identifier") tbid: int = Field( ..., description="Parent technical body identifier (FK to working_groups.tbid)", ) code: str = Field(..., description="Canonical subgroup code (e.g., 'S4', 'RP')") name: str = Field(..., description="Display name of the subgroup") Loading Loading
src/tdoc_crawler/database/meetings.py +14 −12 Original line number Diff line number Diff line Loading @@ -8,7 +8,7 @@ from tdoc_crawler.database.base import DocDatabase from tdoc_crawler.logging import get_logger from tdoc_crawler.meetings.models import MeetingMetadata, MeetingQueryConfig from tdoc_crawler.models.base import SortOrder from tdoc_crawler.models.subworking_groups import CODE_INDEX from tdoc_crawler.models.subworking_groups import CODE_INDEX, SUBTB_INDEX from tdoc_crawler.models.working_groups import WORKING_GROUP_RECORDS, WorkingGroup from tdoc_crawler.utils.misc import utc_now from tdoc_crawler.utils.normalization import normalize_portal_meeting_name Loading Loading @@ -101,12 +101,12 @@ class MeetingDatabase(DocDatabase): meetings = self._table_rows("meetings") if config.working_groups: allowed = {wg.value for wg in config.working_groups} meetings = [meeting for meeting in meetings if meeting.working_group and meeting.working_group in allowed] allowed = {wg.tbid for wg in config.working_groups} meetings = [meeting for meeting in meetings if meeting.tbid in allowed] if config.subgroups: allowed_subgroups = {value.strip().upper() for value in config.subgroups} meetings = [meeting for meeting in meetings if (meeting.subgroup or "").upper() in allowed_subgroups] meetings = [meeting for meeting in meetings if meeting.subtb in SUBTB_INDEX and SUBTB_INDEX[meeting.subtb].code in allowed_subgroups] if not config.include_without_files: meetings = [meeting for meeting in meetings if meeting.files_url] Loading Loading @@ -138,8 +138,8 @@ class MeetingDatabase(DocDatabase): if not working_groups: return {meeting.meeting_id for meeting in meetings} allowed = {wg.value for wg in working_groups} return {meeting.meeting_id for meeting in meetings if meeting.working_group and meeting.working_group in allowed} allowed = {wg.tbid for wg in working_groups} return {meeting.meeting_id for meeting in meetings if meeting.tbid in allowed} def get_tdoc_count_for_meeting(self, meeting_id: int) -> int: """Get the number of TDocs associated with a meeting. Loading Loading @@ -238,10 +238,16 @@ class MeetingDatabase(DocDatabase): crawl_entries = self._table_rows("crawl_log") by_working_group: dict[str, int] = defaultdict(int) tbid_to_code = {working_group.tbid: working_group.value for working_group in WorkingGroup} for record in tdocs: meeting = meetings.get(record.meeting_id or -1) if meeting and meeting.working_group: by_working_group[meeting.working_group] += 1 if meeting is None: continue code = tbid_to_code.get(meeting.tbid) if code is None: _logger.debug("Unknown tbid in meeting %s: %s", meeting.meeting_id, meeting.tbid) continue by_working_group[code] += 1 recent_crawls = [ { Loading Loading @@ -304,10 +310,6 @@ class MeetingDatabase(DocDatabase): @staticmethod def _prepare_meeting(metadata: MeetingMetadata) -> MeetingMetadata: """Prepare meeting metadata for insertion (set defaults).""" if metadata.working_group is None and metadata.tbid: for working_group in WorkingGroup: if working_group.tbid == metadata.tbid: return metadata.model_copy(update={"working_group": working_group}) return metadata @staticmethod Loading
src/tdoc_crawler/database/specs.py +212 −161 Original line number Diff line number Diff line Loading @@ -24,6 +24,72 @@ from tdoc_crawler.utils.normalization import normalize_release, normalize_spec_n _logger = get_logger(__name__) def _build_spec_candidate( *, compact: str, metadata_payload: dict[str, object], normalized: str, source_name: str, versions: list[str], ) -> tuple[Specification, list[SpecificationVersion]]: title = str(metadata_payload.get("title", "Unknown")) spec_type = str(metadata_payload.get("spec_type", "TS")) status = str(metadata_payload.get("status", "unknown")) working_group = str(metadata_payload.get("working_group", "unknown")) series = str(metadata_payload.get("series", f"{normalized.split('.', maxsplit=1)[0]}_series")) latest_version = metadata_payload.get("latest_version") if latest_version is None and versions: latest_version = versions[0] candidate = Specification( spec_number=normalized, spec_number_compact=compact, spec_type=spec_type, title=title, status=status, working_group=working_group, series=series, latest_version=str(latest_version) if latest_version is not None else None, ) spec_versions = _build_spec_versions( compact=compact, metadata_payload=metadata_payload, normalized=normalized, source_name=source_name, versions=versions, ) return candidate, spec_versions def _build_spec_versions( *, compact: str, metadata_payload: dict[str, object], normalized: str, source_name: str, versions: list[str], ) -> list[SpecificationVersion]: spec_versions: list[SpecificationVersion] = [] for i, version in enumerate(versions): file_name = f"{compact}-unknown.zip" if "specfile" in metadata_payload and isinstance(metadata_payload["specfile"], list): spec_files = metadata_payload["specfile"] if i < len(spec_files): file_name = str(spec_files[i]) elif "file_name" in metadata_payload: file_name = str(metadata_payload["file_name"]) spec_versions.append( SpecificationVersion( spec_number=normalized, version=str(version), file_name=file_name, source_name=source_name, ) ) return spec_versions def _version_matches_release(version: str, release_type: str, release_value: str, specificity: int) -> bool: """Check if a version string matches the release selector.""" try: Loading Loading @@ -207,7 +273,12 @@ class SpecDatabase(DocDatabase): """ return self._clear_tables(["spec_downloads", "spec_versions", "spec_source_records", "specs"]) def crawl_specs(self, spec_numbers: list[str], release: str, sources: list[SpecSource]) -> list[SpecCrawlResult]: def crawl_specs( self, spec_numbers: list[str], release: str, sources: list[SpecSource], ) -> list[SpecCrawlResult]: """Crawl and store spec metadata for the provided spec numbers. Args: Loading @@ -219,9 +290,36 @@ class SpecDatabase(DocDatabase): List of crawl outcomes for each requested spec. """ results: list[SpecCrawlResult] = [] release_type, release_value, specificity = normalize_release(release) resolved_release = release_value if release_value is not None else "" resolved_specificity = specificity if specificity is not None else 0 for raw_spec in spec_numbers: normalized = normalize_spec_number(raw_spec) compact = normalized.replace(".", "") results.append( self._crawl_single_spec( compact=compact, normalized=normalized, release=release, release_type=release_type, release_value=resolved_release, specificity=resolved_specificity, sources=sources, ) ) return results def _crawl_single_spec( self, *, compact: str, normalized: str, release: str, release_type: str, release_value: str, specificity: int, sources: list[SpecSource], ) -> SpecCrawlResult: outcomes: list[SpecCrawlSourceOutcome] = [] source_records: list[SpecificationSourceRecord] = [] spec_versions: list[SpecificationVersion] = [] Loading Loading @@ -271,51 +369,22 @@ class SpecDatabase(DocDatabase): ) ) title = str(metadata_payload.get("title", "Unknown")) spec_type = str(metadata_payload.get("spec_type", "TS")) status = str(metadata_payload.get("status", "unknown")) working_group = str(metadata_payload.get("working_group", "unknown")) series = str(metadata_payload.get("series", f"{normalized.split('.')[0]}_series")) latest_version = metadata_payload.get("latest_version") if latest_version is None and normalized_versions: latest_version = normalized_versions[0] candidate = Specification( spec_number=normalized, spec_number_compact=compact, spec_type=spec_type, title=title, status=status, working_group=working_group, series=series, latest_version=str(latest_version) if latest_version is not None else None, candidate, versions_payload = _build_spec_candidate( compact=compact, metadata_payload=metadata_payload, normalized=normalized, source_name=source_name, versions=normalized_versions, ) if aggregated is None: aggregated = candidate elif aggregated.latest_version is None and candidate.latest_version is not None: aggregated = aggregated.model_copy(update={"latest_version": candidate.latest_version}) for i, version in enumerate(normalized_versions): # Try to get specific file name for this version from payload file_name = f"{compact}-unknown.zip" if "specfile" in metadata_payload and isinstance(metadata_payload["specfile"], list): if i < len(metadata_payload["specfile"]): file_name = str(metadata_payload["specfile"][i]) elif "file_name" in metadata_payload: file_name = str(metadata_payload["file_name"]) spec_versions.append( SpecificationVersion( spec_number=normalized, version=str(version), file_name=file_name, source_name=source_name, ) ) spec_versions.extend(versions_payload) if not outcomes: results.append( SpecCrawlResult( return SpecCrawlResult( spec_number=normalized, release=release, status="error", Loading @@ -323,15 +392,11 @@ class SpecDatabase(DocDatabase): sources=[], message="no-sources", ) ) continue # Check if 3GPP source failed - if so, skip this spec entirely threegpp_outcome = next((o for o in outcomes if o.source_name == "3gpp"), None) if threegpp_outcome is not None and threegpp_outcome.status == "error": _logger.warning("Skipping spec %s due to 3GPP source error", normalized) results.append( SpecCrawlResult( return SpecCrawlResult( spec_number=normalized, release=release, status="skipped", Loading @@ -339,14 +404,10 @@ class SpecDatabase(DocDatabase): sources=outcomes, message="3gpp-source-error", ) ) continue release_type, release_value, specificity = normalize_release(release) if release_type in ("all", "latest"): release_matches = True elif release_type in ("exact", "prefix"): # release_value and specificity are guaranteed non-None here release_matches = any( _version_matches_release(v, release_type, release_value, specificity) # type: ignore[arg-type] for outcome in outcomes Loading @@ -357,8 +418,7 @@ class SpecDatabase(DocDatabase): release_matches = False if not release_matches: results.append( SpecCrawlResult( return SpecCrawlResult( spec_number=normalized, release=release, status="skipped", Loading @@ -366,12 +426,9 @@ class SpecDatabase(DocDatabase): sources=outcomes, message="release-not-found", ) ) continue if aggregated is None: results.append( SpecCrawlResult( return SpecCrawlResult( spec_number=normalized, release=release, status="error", Loading @@ -379,8 +436,6 @@ class SpecDatabase(DocDatabase): sources=outcomes, message="no-metadata", ) ) continue for record in source_records: self.upsert_spec_source_record(record) Loading @@ -388,17 +443,13 @@ class SpecDatabase(DocDatabase): for version in spec_versions: self.upsert_spec_version(version) results.append( SpecCrawlResult( return SpecCrawlResult( spec_number=normalized, release=release, status="stored", latest_version=aggregated.latest_version, sources=outcomes, ) ) return results def _spec_table_rows(self) -> list[Specification]: return self._table_rows("specs") Loading
src/tdoc_crawler/database/tdocs.py +10 −11 Original line number Diff line number Diff line Loading @@ -7,6 +7,7 @@ from decimal import Decimal from tdoc_crawler.database.meetings import MeetingDatabase from tdoc_crawler.logging import get_logger from tdoc_crawler.models import WorkingGroup from tdoc_crawler.models.subworking_groups import SUBTB_INDEX from tdoc_crawler.tdocs.models import TDocMetadata, TDocQueryConfig from tdoc_crawler.utils.misc import utc_now Loading Loading @@ -113,15 +114,8 @@ class TDocDatabase(MeetingDatabase): meeting = meeting_map.get(record.meeting_id or -1) if meeting is None or meeting.tbid not in allowed_tbids: continue record = record.model_copy(update={"meeting_name": meeting.short_name}) filtered.append(record) records = filtered else: # Always ensure meeting_name is populated even without filters for i, record in enumerate(records): meeting = meeting_map.get(record.meeting_id or -1) if meeting: records[i] = record.model_copy(update={"meeting_name": meeting.short_name}) if config.start_date is not None: records = [record for record in records if record.date_retrieved and record.date_retrieved >= config.start_date] Loading Loading @@ -202,7 +196,13 @@ class TDocDatabase(MeetingDatabase): continue if allowed_tbids and meeting.tbid not in allowed_tbids: continue if allowed_subgroups and (meeting.subgroup or "").upper() not in allowed_subgroups: if allowed_subgroups: subgroup_code = None if meeting.subtb is not None: subgroup_record = SUBTB_INDEX.get(meeting.subtb) if subgroup_record: subgroup_code = subgroup_record.code if subgroup_code is None or subgroup_code.upper() not in allowed_subgroups: continue processed.add(record.meeting_id) return processed Loading Loading @@ -238,7 +238,6 @@ class TDocDatabase(MeetingDatabase): agenda_item_nbr=Decimal("0.0"), agenda_item_text="Unknown", status=None, meeting_name=None, is_revision_of=None, file_size=None, date_created=None, Loading @@ -258,7 +257,7 @@ class TDocDatabase(MeetingDatabase): def _get_tdoc(self, tdoc_id: str) -> TDocMetadata | None: """Get a TDoc by ID.""" try: return self.connection.model_from_table("tdocs", tdoc_id.upper()) # type: ignore[arg-type] return self.connection.model_from_table("tdocs", tdoc_id.upper()) except KeyError: return None Loading
src/tdoc_crawler/meetings/models.py +17 −51 Original line number Diff line number Diff line Loading @@ -9,7 +9,7 @@ from pydantic import BaseModel, Field, field_validator, model_validator from tdoc_crawler.models.base import BaseConfigModel, SortOrder from tdoc_crawler.models.crawl_limits import CrawlLimits, _new_crawl_limits from tdoc_crawler.models.subworking_groups import CODE_INDEX, SUBTB_INDEX from tdoc_crawler.models.subworking_groups import SUBTB_INDEX from tdoc_crawler.models.working_groups import WorkingGroup from tdoc_crawler.utils.misc import utc_now Loading @@ -19,10 +19,14 @@ class MeetingMetadata(BaseModel): # parsed from the portal meeting_id: int = Field(..., description="Unique meeting identifier from the 3GPP portal") tbid: int = Field(..., description="Technical body identifier of the parent working group") subtb: int | None = Field(..., description="Sub-technical body identifier of the subworking group") working_group: str | None = Field(None, description="Working group (as string, e.g., 'SA')") subgroup: str | None = Field(None, description="Canonical subgroup code (e.g., 'R1')") tbid: int = Field( ..., description="Technical body identifier of the parent working group (FK to working_groups.tbid)", ) subtb: int | None = Field( ..., description="Sub-technical body identifier (FK to subworking_groups.subtb)", ) short_name: str = Field(..., description="Short meeting name (e.g., SA4#134)") title: str | None = Field(None, description="Descriptive meeting title") start_date: date | None = Field(None, description="Meeting start date") Loading @@ -41,53 +45,15 @@ class MeetingMetadata(BaseModel): @model_validator(mode="after") def _sync_relationships(self) -> MeetingMetadata: # working_group is now a string, so match tbid if not set if self.working_group is None: for working_group in WorkingGroup: if working_group.tbid == self.tbid: self.working_group = working_group.value break else: # Validate tbid matches working_group try: wg = WorkingGroup(self.working_group) except Exception: raise ValueError(f"Invalid working_group: {self.working_group}") if wg.tbid != self.tbid: msg = "tbid does not match provided working_group" raise ValueError(msg) if self.subtb is None and self.subgroup: record = CODE_INDEX.get(self.subgroup) if record is not None: self.subtb = record.subtb if self.subgroup is None and self.subtb is not None: if self.subtb is None: return self record = SUBTB_INDEX.get(self.subtb) if record is not None: self.subgroup = record.code if record is None: return self if record.tbid != self.tbid: msg = "tbid does not match subworking group" raise ValueError(msg) return self @field_validator("working_group", mode="before") @classmethod def _validate_working_group(cls, value: WorkingGroup | str | None) -> str | None: if value is None: return None # Accept enum or str, store as str if isinstance(value, WorkingGroup): return value.value if value not in {wg.value for wg in WorkingGroup}: raise ValueError(f"Invalid working_group: {value}") return value @field_validator("subgroup", mode="before") @classmethod def _normalize_subgroup(cls, value: str | None) -> str | None: if value is None: return None normalized = value.strip().upper() return normalized or None class MeetingCrawlConfig(BaseConfigModel): Loading
src/tdoc_crawler/models/subworking_groups.py +4 −1 Original line number Diff line number Diff line Loading @@ -82,7 +82,10 @@ class SubWorkingGroupRecord(BaseModel): """Persistent representation of a subworking group.""" subtb: int = Field(..., description="Sub-technical body identifier") tbid: int = Field(..., description="Parent technical body identifier") tbid: int = Field( ..., description="Parent technical body identifier (FK to working_groups.tbid)", ) code: str = Field(..., description="Canonical subgroup code (e.g., 'S4', 'RP')") name: str = Field(..., description="Display name of the subgroup") Loading