Commit b9e352e7 authored by Jan Reimes's avatar Jan Reimes
Browse files

models(db): normalize meeting/subworking_group schema; add validators and...

models(db): normalize meeting/subworking_group schema; add validators and remove duplicated label fields
parent b539daa5
Loading
Loading
Loading
Loading
+14 −12
Original line number Diff line number Diff line
@@ -8,7 +8,7 @@ from tdoc_crawler.database.base import DocDatabase
from tdoc_crawler.logging import get_logger
from tdoc_crawler.meetings.models import MeetingMetadata, MeetingQueryConfig
from tdoc_crawler.models.base import SortOrder
from tdoc_crawler.models.subworking_groups import CODE_INDEX
from tdoc_crawler.models.subworking_groups import CODE_INDEX, SUBTB_INDEX
from tdoc_crawler.models.working_groups import WORKING_GROUP_RECORDS, WorkingGroup
from tdoc_crawler.utils.misc import utc_now
from tdoc_crawler.utils.normalization import normalize_portal_meeting_name
@@ -101,12 +101,12 @@ class MeetingDatabase(DocDatabase):
        meetings = self._table_rows("meetings")

        if config.working_groups:
            allowed = {wg.value for wg in config.working_groups}
            meetings = [meeting for meeting in meetings if meeting.working_group and meeting.working_group in allowed]
            allowed = {wg.tbid for wg in config.working_groups}
            meetings = [meeting for meeting in meetings if meeting.tbid in allowed]

        if config.subgroups:
            allowed_subgroups = {value.strip().upper() for value in config.subgroups}
            meetings = [meeting for meeting in meetings if (meeting.subgroup or "").upper() in allowed_subgroups]
            meetings = [meeting for meeting in meetings if meeting.subtb in SUBTB_INDEX and SUBTB_INDEX[meeting.subtb].code in allowed_subgroups]

        if not config.include_without_files:
            meetings = [meeting for meeting in meetings if meeting.files_url]
@@ -138,8 +138,8 @@ class MeetingDatabase(DocDatabase):
        if not working_groups:
            return {meeting.meeting_id for meeting in meetings}

        allowed = {wg.value for wg in working_groups}
        return {meeting.meeting_id for meeting in meetings if meeting.working_group and meeting.working_group in allowed}
        allowed = {wg.tbid for wg in working_groups}
        return {meeting.meeting_id for meeting in meetings if meeting.tbid in allowed}

    def get_tdoc_count_for_meeting(self, meeting_id: int) -> int:
        """Get the number of TDocs associated with a meeting.
@@ -238,10 +238,16 @@ class MeetingDatabase(DocDatabase):
        crawl_entries = self._table_rows("crawl_log")

        by_working_group: dict[str, int] = defaultdict(int)
        tbid_to_code = {working_group.tbid: working_group.value for working_group in WorkingGroup}
        for record in tdocs:
            meeting = meetings.get(record.meeting_id or -1)
            if meeting and meeting.working_group:
                by_working_group[meeting.working_group] += 1
            if meeting is None:
                continue
            code = tbid_to_code.get(meeting.tbid)
            if code is None:
                _logger.debug("Unknown tbid in meeting %s: %s", meeting.meeting_id, meeting.tbid)
                continue
            by_working_group[code] += 1

        recent_crawls = [
            {
@@ -304,10 +310,6 @@ class MeetingDatabase(DocDatabase):
    @staticmethod
    def _prepare_meeting(metadata: MeetingMetadata) -> MeetingMetadata:
        """Prepare meeting metadata for insertion (set defaults)."""
        if metadata.working_group is None and metadata.tbid:
            for working_group in WorkingGroup:
                if working_group.tbid == metadata.tbid:
                    return metadata.model_copy(update={"working_group": working_group})
        return metadata

    @staticmethod
+212 −161
Original line number Diff line number Diff line
@@ -24,6 +24,72 @@ from tdoc_crawler.utils.normalization import normalize_release, normalize_spec_n
_logger = get_logger(__name__)


def _build_spec_candidate(
    *,
    compact: str,
    metadata_payload: dict[str, object],
    normalized: str,
    source_name: str,
    versions: list[str],
) -> tuple[Specification, list[SpecificationVersion]]:
    """Build a ``Specification`` plus its version records from one source payload.

    Missing payload fields fall back to fixed defaults: title ``"Unknown"``,
    spec_type ``"TS"``, status ``"unknown"``, working_group ``"unknown"``, and
    a series name derived from the spec-number prefix (e.g. ``"26_series"``).
    """
    default_series = f"{normalized.split('.', maxsplit=1)[0]}_series"

    # Latest version: an explicit payload value wins; otherwise fall back to
    # the first entry of the supplied version list, if any.
    newest = metadata_payload.get("latest_version")
    if newest is None and versions:
        newest = versions[0]

    spec_record = Specification(
        spec_number=normalized,
        spec_number_compact=compact,
        spec_type=str(metadata_payload.get("spec_type", "TS")),
        title=str(metadata_payload.get("title", "Unknown")),
        status=str(metadata_payload.get("status", "unknown")),
        working_group=str(metadata_payload.get("working_group", "unknown")),
        series=str(metadata_payload.get("series", default_series)),
        latest_version=None if newest is None else str(newest),
    )
    version_records = _build_spec_versions(
        compact=compact,
        metadata_payload=metadata_payload,
        normalized=normalized,
        source_name=source_name,
        versions=versions,
    )
    return spec_record, version_records


def _build_spec_versions(
    *,
    compact: str,
    metadata_payload: dict[str, object],
    normalized: str,
    source_name: str,
    versions: list[str],
) -> list[SpecificationVersion]:
    """Build one ``SpecificationVersion`` per entry in *versions*.

    File-name resolution, in priority order:
      1. ``metadata_payload["specfile"]`` when it is a list — taken positionally;
         indexes past its end fall back to ``"{compact}-unknown.zip"``.
      2. Otherwise ``metadata_payload["file_name"]`` when present (shared by
         every version).
      3. Otherwise ``"{compact}-unknown.zip"``.

    The payload inspection is hoisted out of the loop — it is invariant across
    iterations (the original re-checked the dict on every pass).
    """
    unknown_name = f"{compact}-unknown.zip"

    # A list-valued "specfile" provides per-version names; any other value
    # (or absence) means we use a single shared fallback name.
    raw_specfile = metadata_payload.get("specfile")
    per_version_files = raw_specfile if isinstance(raw_specfile, list) else None
    shared_name = str(metadata_payload.get("file_name", unknown_name))

    spec_versions: list[SpecificationVersion] = []
    for i, version in enumerate(versions):
        if per_version_files is not None:
            # NOTE: overflow past the specfile list deliberately falls back to
            # the "unknown" name, never to "file_name" (matches prior behavior).
            file_name = str(per_version_files[i]) if i < len(per_version_files) else unknown_name
        else:
            file_name = shared_name
        spec_versions.append(
            SpecificationVersion(
                spec_number=normalized,
                version=str(version),
                file_name=file_name,
                source_name=source_name,
            )
        )
    return spec_versions


def _version_matches_release(version: str, release_type: str, release_value: str, specificity: int) -> bool:
    """Check if a version string matches the release selector."""
    try:
@@ -207,7 +273,12 @@ class SpecDatabase(DocDatabase):
        """
        return self._clear_tables(["spec_downloads", "spec_versions", "spec_source_records", "specs"])

    def crawl_specs(self, spec_numbers: list[str], release: str, sources: list[SpecSource]) -> list[SpecCrawlResult]:
    def crawl_specs(
        self,
        spec_numbers: list[str],
        release: str,
        sources: list[SpecSource],
    ) -> list[SpecCrawlResult]:
        """Crawl and store spec metadata for the provided spec numbers.

        Args:
@@ -219,9 +290,36 @@ class SpecDatabase(DocDatabase):
            List of crawl outcomes for each requested spec.
        """
        results: list[SpecCrawlResult] = []
        release_type, release_value, specificity = normalize_release(release)
        resolved_release = release_value if release_value is not None else ""
        resolved_specificity = specificity if specificity is not None else 0
        for raw_spec in spec_numbers:
            normalized = normalize_spec_number(raw_spec)
            compact = normalized.replace(".", "")
            results.append(
                self._crawl_single_spec(
                    compact=compact,
                    normalized=normalized,
                    release=release,
                    release_type=release_type,
                    release_value=resolved_release,
                    specificity=resolved_specificity,
                    sources=sources,
                )
            )
        return results

    def _crawl_single_spec(
        self,
        *,
        compact: str,
        normalized: str,
        release: str,
        release_type: str,
        release_value: str,
        specificity: int,
        sources: list[SpecSource],
    ) -> SpecCrawlResult:
        outcomes: list[SpecCrawlSourceOutcome] = []
        source_records: list[SpecificationSourceRecord] = []
        spec_versions: list[SpecificationVersion] = []
@@ -271,51 +369,22 @@ class SpecDatabase(DocDatabase):
                )
            )

                title = str(metadata_payload.get("title", "Unknown"))
                spec_type = str(metadata_payload.get("spec_type", "TS"))
                status = str(metadata_payload.get("status", "unknown"))
                working_group = str(metadata_payload.get("working_group", "unknown"))
                series = str(metadata_payload.get("series", f"{normalized.split('.')[0]}_series"))
                latest_version = metadata_payload.get("latest_version")
                if latest_version is None and normalized_versions:
                    latest_version = normalized_versions[0]

                candidate = Specification(
                    spec_number=normalized,
                    spec_number_compact=compact,
                    spec_type=spec_type,
                    title=title,
                    status=status,
                    working_group=working_group,
                    series=series,
                    latest_version=str(latest_version) if latest_version is not None else None,
            candidate, versions_payload = _build_spec_candidate(
                compact=compact,
                metadata_payload=metadata_payload,
                normalized=normalized,
                source_name=source_name,
                versions=normalized_versions,
            )
            if aggregated is None:
                aggregated = candidate
            elif aggregated.latest_version is None and candidate.latest_version is not None:
                aggregated = aggregated.model_copy(update={"latest_version": candidate.latest_version})

                for i, version in enumerate(normalized_versions):
                    # Try to get specific file name for this version from payload
                    file_name = f"{compact}-unknown.zip"
                    if "specfile" in metadata_payload and isinstance(metadata_payload["specfile"], list):
                        if i < len(metadata_payload["specfile"]):
                            file_name = str(metadata_payload["specfile"][i])
                    elif "file_name" in metadata_payload:
                        file_name = str(metadata_payload["file_name"])

                    spec_versions.append(
                        SpecificationVersion(
                            spec_number=normalized,
                            version=str(version),
                            file_name=file_name,
                            source_name=source_name,
                        )
                    )
            spec_versions.extend(versions_payload)

        if not outcomes:
                results.append(
                    SpecCrawlResult(
            return SpecCrawlResult(
                spec_number=normalized,
                release=release,
                status="error",
@@ -323,15 +392,11 @@ class SpecDatabase(DocDatabase):
                sources=[],
                message="no-sources",
            )
                )
                continue

            # Check if 3GPP source failed - if so, skip this spec entirely
        threegpp_outcome = next((o for o in outcomes if o.source_name == "3gpp"), None)
        if threegpp_outcome is not None and threegpp_outcome.status == "error":
            _logger.warning("Skipping spec %s due to 3GPP source error", normalized)
                results.append(
                    SpecCrawlResult(
            return SpecCrawlResult(
                spec_number=normalized,
                release=release,
                status="skipped",
@@ -339,14 +404,10 @@ class SpecDatabase(DocDatabase):
                sources=outcomes,
                message="3gpp-source-error",
            )
                )
                continue

            release_type, release_value, specificity = normalize_release(release)
        if release_type in ("all", "latest"):
            release_matches = True
        elif release_type in ("exact", "prefix"):
                # release_value and specificity are guaranteed non-None here
            release_matches = any(
                _version_matches_release(v, release_type, release_value, specificity)  # type: ignore[arg-type]
                for outcome in outcomes
@@ -357,8 +418,7 @@ class SpecDatabase(DocDatabase):
            release_matches = False

        if not release_matches:
                results.append(
                    SpecCrawlResult(
            return SpecCrawlResult(
                spec_number=normalized,
                release=release,
                status="skipped",
@@ -366,12 +426,9 @@ class SpecDatabase(DocDatabase):
                sources=outcomes,
                message="release-not-found",
            )
                )
                continue

        if aggregated is None:
                results.append(
                    SpecCrawlResult(
            return SpecCrawlResult(
                spec_number=normalized,
                release=release,
                status="error",
@@ -379,8 +436,6 @@ class SpecDatabase(DocDatabase):
                sources=outcomes,
                message="no-metadata",
            )
                )
                continue

        for record in source_records:
            self.upsert_spec_source_record(record)
@@ -388,17 +443,13 @@ class SpecDatabase(DocDatabase):
        for version in spec_versions:
            self.upsert_spec_version(version)

            results.append(
                SpecCrawlResult(
        return SpecCrawlResult(
            spec_number=normalized,
            release=release,
            status="stored",
            latest_version=aggregated.latest_version,
            sources=outcomes,
        )
            )

        return results

    def _spec_table_rows(self) -> list[Specification]:
        return self._table_rows("specs")
+10 −11
Original line number Diff line number Diff line
@@ -7,6 +7,7 @@ from decimal import Decimal
from tdoc_crawler.database.meetings import MeetingDatabase
from tdoc_crawler.logging import get_logger
from tdoc_crawler.models import WorkingGroup
from tdoc_crawler.models.subworking_groups import SUBTB_INDEX
from tdoc_crawler.tdocs.models import TDocMetadata, TDocQueryConfig
from tdoc_crawler.utils.misc import utc_now

@@ -113,15 +114,8 @@ class TDocDatabase(MeetingDatabase):
                meeting = meeting_map.get(record.meeting_id or -1)
                if meeting is None or meeting.tbid not in allowed_tbids:
                    continue
                record = record.model_copy(update={"meeting_name": meeting.short_name})
                filtered.append(record)
            records = filtered
        else:
            # Always ensure meeting_name is populated even without filters
            for i, record in enumerate(records):
                meeting = meeting_map.get(record.meeting_id or -1)
                if meeting:
                    records[i] = record.model_copy(update={"meeting_name": meeting.short_name})

        if config.start_date is not None:
            records = [record for record in records if record.date_retrieved and record.date_retrieved >= config.start_date]
@@ -202,7 +196,13 @@ class TDocDatabase(MeetingDatabase):
                continue
            if allowed_tbids and meeting.tbid not in allowed_tbids:
                continue
            if allowed_subgroups and (meeting.subgroup or "").upper() not in allowed_subgroups:
            if allowed_subgroups:
                subgroup_code = None
                if meeting.subtb is not None:
                    subgroup_record = SUBTB_INDEX.get(meeting.subtb)
                    if subgroup_record:
                        subgroup_code = subgroup_record.code
                if subgroup_code is None or subgroup_code.upper() not in allowed_subgroups:
                    continue
            processed.add(record.meeting_id)
        return processed
@@ -238,7 +238,6 @@ class TDocDatabase(MeetingDatabase):
            agenda_item_nbr=Decimal("0.0"),
            agenda_item_text="Unknown",
            status=None,
            meeting_name=None,
            is_revision_of=None,
            file_size=None,
            date_created=None,
@@ -258,7 +257,7 @@ class TDocDatabase(MeetingDatabase):
    def _get_tdoc(self, tdoc_id: str) -> TDocMetadata | None:
        """Get a TDoc by ID."""
        try:
            return self.connection.model_from_table("tdocs", tdoc_id.upper())  # type: ignore[arg-type]
            return self.connection.model_from_table("tdocs", tdoc_id.upper())
        except KeyError:
            return None

+17 −51
Original line number Diff line number Diff line
@@ -9,7 +9,7 @@ from pydantic import BaseModel, Field, field_validator, model_validator

from tdoc_crawler.models.base import BaseConfigModel, SortOrder
from tdoc_crawler.models.crawl_limits import CrawlLimits, _new_crawl_limits
from tdoc_crawler.models.subworking_groups import CODE_INDEX, SUBTB_INDEX
from tdoc_crawler.models.subworking_groups import SUBTB_INDEX
from tdoc_crawler.models.working_groups import WorkingGroup
from tdoc_crawler.utils.misc import utc_now

@@ -19,10 +19,14 @@ class MeetingMetadata(BaseModel):

    # parsed from the portal
    meeting_id: int = Field(..., description="Unique meeting identifier from the 3GPP portal")
    tbid: int = Field(..., description="Technical body identifier of the parent working group")
    subtb: int | None = Field(..., description="Sub-technical body identifier of the subworking group")
    working_group: str | None = Field(None, description="Working group (as string, e.g., 'SA')")
    subgroup: str | None = Field(None, description="Canonical subgroup code (e.g., 'R1')")
    tbid: int = Field(
        ...,
        description="Technical body identifier of the parent working group (FK to working_groups.tbid)",
    )
    subtb: int | None = Field(
        ...,
        description="Sub-technical body identifier (FK to subworking_groups.subtb)",
    )
    short_name: str = Field(..., description="Short meeting name (e.g., SA4#134)")
    title: str | None = Field(None, description="Descriptive meeting title")
    start_date: date | None = Field(None, description="Meeting start date")
@@ -41,53 +45,15 @@ class MeetingMetadata(BaseModel):

    @model_validator(mode="after")
    def _sync_relationships(self) -> MeetingMetadata:
        # working_group is now a string, so match tbid if not set
        if self.working_group is None:
            for working_group in WorkingGroup:
                if working_group.tbid == self.tbid:
                    self.working_group = working_group.value
                    break
        else:
            # Validate tbid matches working_group
            try:
                wg = WorkingGroup(self.working_group)
            except Exception:
                raise ValueError(f"Invalid working_group: {self.working_group}")
            if wg.tbid != self.tbid:
                msg = "tbid does not match provided working_group"
                raise ValueError(msg)

        if self.subtb is None and self.subgroup:
            record = CODE_INDEX.get(self.subgroup)
            if record is not None:
                self.subtb = record.subtb

        if self.subgroup is None and self.subtb is not None:
        if self.subtb is None:
            return self
        record = SUBTB_INDEX.get(self.subtb)
            if record is not None:
                self.subgroup = record.code

        if record is None:
            return self
        if record.tbid != self.tbid:
            msg = "tbid does not match subworking group"
            raise ValueError(msg)
        return self

    @field_validator("working_group", mode="before")
    @classmethod
    def _validate_working_group(cls, value: WorkingGroup | str | None) -> str | None:
        if value is None:
            return None
        # Accept enum or str, store as str
        if isinstance(value, WorkingGroup):
            return value.value
        if value not in {wg.value for wg in WorkingGroup}:
            raise ValueError(f"Invalid working_group: {value}")
        return value

    @field_validator("subgroup", mode="before")
    @classmethod
    def _normalize_subgroup(cls, value: str | None) -> str | None:
        if value is None:
            return None
        normalized = value.strip().upper()
        return normalized or None


class MeetingCrawlConfig(BaseConfigModel):
+4 −1
Original line number Diff line number Diff line
@@ -82,7 +82,10 @@ class SubWorkingGroupRecord(BaseModel):
    """Persistent representation of a subworking group."""

    subtb: int = Field(..., description="Sub-technical body identifier")
    tbid: int = Field(..., description="Parent technical body identifier")
    tbid: int = Field(
        ...,
        description="Parent technical body identifier (FK to working_groups.tbid)",
    )
    code: str = Field(..., description="Canonical subgroup code (e.g., 'S4', 'RP')")
    name: str = Field(..., description="Display name of the subgroup")