Commit 25433cc1 authored by Jan Reimes's avatar Jan Reimes
Browse files

♻️ refactor(core): reorganize DB helpers and crawler entrypoints

* Move __init__ and crawl implementations into canonical positions for
  MeetingCrawler and TDocCrawler to improve readability (no behaviour
  change).
* Consolidate and hoist TDocDatabase helpers: introduce single change-
  detection helpers (_tdoc_changed, _meeting_changed) and centralize
  normalization helpers to remove duplicates.
* Add docstring for _table_exists and tidy connection/initialization
  ordering for clearer lifecycle handling.
* Reposition DatabaseError.__init__ for readability.
* Move QueryConfig working_groups validator earlier so input
  normalization runs before model initialization.
parent d855341f
Loading
Loading
Loading
Loading
+53 −53
Original line number Diff line number Diff line
@@ -105,59 +105,6 @@ class MeetingCrawlResult:
class MeetingCrawler:
    """Crawler fetching meeting metadata from the 3GPP portal."""

    def __init__(self, database: TDocDatabase) -> None:
        """Bind the crawler to the database facade used for persistence."""
        self.database = database

    def crawl(self, config: MeetingCrawlConfig, progress_callback: Callable[[float, float], None] | None = None) -> MeetingCrawlResult:
        """Fetch meeting metadata for the configured working groups.

        Downloads every registered meeting page for the (limit-reduced) set of
        working groups, parses the pages, drops already-known meetings when
        ``config.incremental`` is set, applies the crawl limits, and finally
        upserts the survivors into the database.

        Args:
            config: Crawl options (working groups, subgroup filter, limits,
                credentials, timeout, incremental flag).
            progress_callback: Optional callback forwarded to
                ``bulk_upsert_meetings`` so progress updates after each DB op.

        Returns:
            A ``MeetingCrawlResult`` with processed/inserted/updated counts and
            any per-page error messages collected along the way.
        """
        errors: list[str] = []
        collected: list[MeetingMetadata] = []

        groups = self._limit_working_groups(config.working_groups, config.limits)
        # Only pay for the existing-ID lookup when incremental mode is on.
        known_ids: set[int] = (
            self.database.get_existing_meeting_ids(groups) if config.incremental else set()
        )

        session = requests.Session()
        session.headers["User-Agent"] = "tdoc-crawler/0.0.1"
        if config.credentials is not None:
            session.auth = (config.credentials.username, config.credentials.password)

        try:
            for group in groups:
                for code, subgroup in MEETING_CODE_REGISTRY.get(group.value, []):
                    # Honour the optional subgroup filter.
                    if config.subgroups and subgroup not in config.subgroups:
                        continue
                    url = MEETINGS_BASE_URL.format(code=code)
                    try:
                        response = session.get(url, timeout=config.timeout)
                        response.raise_for_status()
                    except requests.RequestException as exc:
                        message = f"Meeting crawl failed for {code}: {exc}"
                        logger.warning(message)
                        errors.append(message)
                        continue
                    collected.extend(
                        meeting
                        for meeting in self._parse_meeting_page(response.text, group, subgroup)
                        if not (config.incremental and meeting.meeting_id in known_ids)
                    )
        finally:
            session.close()

        filtered = self._apply_limits(collected, config.limits)
        inserted = updated = 0
        if filtered:
            # bulk_upsert_meetings drives the progress callback per DB operation.
            inserted, updated = self.database.bulk_upsert_meetings(filtered, progress_callback=progress_callback)

        return MeetingCrawlResult(
            processed=len(filtered),
            inserted=inserted,
            updated=updated,
            errors=errors,
        )

    def _limit_working_groups(self, working_groups: list[WorkingGroup], limits: CrawlLimits) -> list[WorkingGroup]:
        if limits.limit_wgs is None or limits.limit_wgs == 0:
            return working_groups
@@ -322,6 +269,59 @@ class MeetingCrawler:
        allowed = set(sequence[:limit]) if limit > 0 else set(sequence[limit:])
        return [meeting for meeting in meetings if meeting.meeting_id in allowed]

    def __init__(self, database: TDocDatabase) -> None:
        """Bind the crawler to the database facade used for persistence."""
        self.database = database

    def crawl(self, config: MeetingCrawlConfig, progress_callback: Callable[[float, float], None] | None = None) -> MeetingCrawlResult:
        """Fetch meeting metadata for the configured working groups.

        Downloads every registered meeting page for the (limit-reduced) set of
        working groups, parses the pages, drops already-known meetings when
        ``config.incremental`` is set, applies the crawl limits, and finally
        upserts the survivors into the database.

        Args:
            config: Crawl options (working groups, subgroup filter, limits,
                credentials, timeout, incremental flag).
            progress_callback: Optional callback forwarded to
                ``bulk_upsert_meetings`` so progress updates after each DB op.

        Returns:
            A ``MeetingCrawlResult`` with processed/inserted/updated counts and
            any per-page error messages collected along the way.
        """
        errors: list[str] = []
        collected: list[MeetingMetadata] = []

        groups = self._limit_working_groups(config.working_groups, config.limits)
        # Only pay for the existing-ID lookup when incremental mode is on.
        known_ids: set[int] = (
            self.database.get_existing_meeting_ids(groups) if config.incremental else set()
        )

        session = requests.Session()
        session.headers["User-Agent"] = "tdoc-crawler/0.0.1"
        if config.credentials is not None:
            session.auth = (config.credentials.username, config.credentials.password)

        try:
            for group in groups:
                for code, subgroup in MEETING_CODE_REGISTRY.get(group.value, []):
                    # Honour the optional subgroup filter.
                    if config.subgroups and subgroup not in config.subgroups:
                        continue
                    url = MEETINGS_BASE_URL.format(code=code)
                    try:
                        response = session.get(url, timeout=config.timeout)
                        response.raise_for_status()
                    except requests.RequestException as exc:
                        message = f"Meeting crawl failed for {code}: {exc}"
                        logger.warning(message)
                        errors.append(message)
                        continue
                    collected.extend(
                        meeting
                        for meeting in self._parse_meeting_page(response.text, group, subgroup)
                        if not (config.incremental and meeting.meeting_id in known_ids)
                    )
        finally:
            session.close()

        filtered = self._apply_limits(collected, config.limits)
        inserted = updated = 0
        if filtered:
            # bulk_upsert_meetings drives the progress callback per DB operation.
            inserted, updated = self.database.bulk_upsert_meetings(filtered, progress_callback=progress_callback)

        return MeetingCrawlResult(
            processed=len(filtered),
            inserted=inserted,
            updated=updated,
            errors=errors,
        )


__all__ = [
    "MEETING_CODE_REGISTRY",
+37 −37
Original line number Diff line number Diff line
@@ -29,43 +29,6 @@ class TDocCrawlResult:
class TDocCrawler:
    """Configuration-driven crawler orchestrating subinterpreter workers."""

    def __init__(self, database: TDocDatabase) -> None:
        """Bind the crawler to the database facade used for persistence."""
        self.database = database

    def crawl(self, config: TDocCrawlConfig, progress_callback: Callable[[float, float], None] | None = None) -> TDocCrawlResult:
        """Execute a crawl using the provided configuration.

        Resolves the meeting list, applies limits, optionally filters out
        already-stored TDoc IDs, fans the work out to parallel workers, and
        bulk-upserts whatever was collected.

        Args:
            config: Crawl options (working groups, limits, incremental flag,
                optional explicit target IDs).
            progress_callback: Optional callback forwarded to
                ``bulk_upsert_tdocs``.

        Returns:
            A ``TDocCrawlResult`` with processed/inserted/updated counts and
            worker error messages.
        """
        meetings = self._get_meetings_to_crawl(config)
        if not meetings:
            logger.warning("No meetings found matching criteria")
            return TDocCrawlResult(processed=0, inserted=0, updated=0, errors=["No meetings found"])

        meetings = self._apply_meeting_limits(meetings, config.limits)
        if not meetings:
            logger.warning("Meetings filtered out by limits configuration")
            return TDocCrawlResult(processed=0, inserted=0, updated=0, errors=["No meetings found after applying limits"])

        # Known IDs are compared case-insensitively, hence the upper-casing.
        known_ids: set[str] = set()
        if config.incremental:
            known_ids = {existing.upper() for existing in self.database.get_existing_tdoc_ids(config.working_groups)}

        wanted = set(config.target_ids) if config.target_ids else None

        results: list[TDocMetadata] = []
        # Workers append into `results` in place; errors come back as the return value.
        errors = asyncio.run(self._crawl_meetings_parallel(meetings, config, results, known_ids, wanted))

        inserted = updated = 0
        if results:
            inserted, updated = self.database.bulk_upsert_tdocs(results, progress_callback=progress_callback)

        return TDocCrawlResult(
            processed=len(results),
            inserted=inserted,
            updated=updated,
            errors=errors,
        )

    def _apply_meeting_limits(self, meetings: list, limits: CrawlLimits) -> list:
        """Apply crawl limits to the meeting list."""

@@ -280,6 +243,43 @@ class TDocCrawler:

        return False

    def __init__(self, database: TDocDatabase) -> None:
        """Bind the crawler to the database facade used for persistence."""
        self.database = database

    def crawl(self, config: TDocCrawlConfig, progress_callback: Callable[[float, float], None] | None = None) -> TDocCrawlResult:
        """Execute a crawl using the provided configuration.

        Resolves the meeting list, applies limits, optionally filters out
        already-stored TDoc IDs, fans the work out to parallel workers, and
        bulk-upserts whatever was collected.

        Args:
            config: Crawl options (working groups, limits, incremental flag,
                optional explicit target IDs).
            progress_callback: Optional callback forwarded to
                ``bulk_upsert_tdocs``.

        Returns:
            A ``TDocCrawlResult`` with processed/inserted/updated counts and
            worker error messages.
        """
        meetings = self._get_meetings_to_crawl(config)
        if not meetings:
            logger.warning("No meetings found matching criteria")
            return TDocCrawlResult(processed=0, inserted=0, updated=0, errors=["No meetings found"])

        meetings = self._apply_meeting_limits(meetings, config.limits)
        if not meetings:
            logger.warning("Meetings filtered out by limits configuration")
            return TDocCrawlResult(processed=0, inserted=0, updated=0, errors=["No meetings found after applying limits"])

        # Known IDs are compared case-insensitively, hence the upper-casing.
        known_ids: set[str] = set()
        if config.incremental:
            known_ids = {existing.upper() for existing in self.database.get_existing_tdoc_ids(config.working_groups)}

        wanted = set(config.target_ids) if config.target_ids else None

        results: list[TDocMetadata] = []
        # Workers append into `results` in place; errors come back as the return value.
        errors = asyncio.run(self._crawl_meetings_parallel(meetings, config, results, known_ids, wanted))

        inserted = updated = 0
        if results:
            inserted, updated = self.database.bulk_upsert_tdocs(results, progress_callback=progress_callback)

        return TDocCrawlResult(
            processed=len(results),
            inserted=inserted,
            updated=updated,
            errors=errors,
        )


__all__ = [
    "TDocCrawlResult",
+82 −82
Original line number Diff line number Diff line
@@ -27,35 +27,23 @@ from tdoc_crawler.models.subworking_groups import SUBWORKING_GROUP_RECORDS
class TDocDatabase:
    """High-level facade for all database operations."""

    def __init__(self, db_path: Path) -> None:
        """Remember the target path; the connection is opened in ``__enter__``."""
        self.db_path = db_path
        self._database: DataBase | None = None

    # ------------------------------------------------------------------
    # Context manager lifecycle
    # ------------------------------------------------------------------
    def __enter__(self) -> TDocDatabase:
        """Open the database, creating parent directories and seeding reference data.

        Raises:
            DatabaseError: With code ``database-initialization-failed`` when
                any step of the setup fails; the original exception is chained.
        """
        try:
            self.db_path.parent.mkdir(parents=True, exist_ok=True)
            self._database = DataBase(self.db_path)
            self._ensure_reference_data()
        except Exception as error:
            raise DatabaseError("database-initialization-failed", detail=str(error)) from error
        return self

    def __exit__(self, exc_type, exc, exc_tb) -> None:
        """Drop the connection reference; any in-flight exception propagates."""
        self._database = None

    # ------------------------------------------------------------------
    # Core accessors and utilities
    # ------------------------------------------------------------------
    @property
    def connection(self) -> DataBase:
        """Expose the underlying DataBase instance (read-only)."""
    @staticmethod
    def _tdoc_changed(current: TDocMetadata, candidate: TDocMetadata) -> bool:
        """Return True when any model field other than ``date_updated`` differs."""
        return any(
            getattr(current, field) != getattr(candidate, field)
            for field in TDocMetadata.model_fields
            if field != "date_updated"
        )

        if self._database is None:
            raise DatabaseError.connection_not_open()
        return self._database
    @staticmethod
    def _meeting_changed(current: MeetingMetadata, candidate: MeetingMetadata) -> bool:
        """Return True when any field differs, ignoring bookkeeping columns."""
        return any(
            getattr(current, field) != getattr(candidate, field)
            for field in MeetingMetadata.model_fields
            if field not in {"updated_at", "last_synced", "tdoc_count"}
        )

    def _ensure_reference_data(self) -> None:
        """Populate reference tables for working and subworking groups."""
@@ -121,6 +109,72 @@ class TDocDatabase:
        except KeyError:
            return None

    # ------------------------------------------------------------------
    # Normalisation helpers
    # ------------------------------------------------------------------
    def _prepare_tdoc(self, metadata: TDocMetadata) -> TDocMetadata:
        """Stamp ``date_retrieved`` with the current UTC time when it is unset."""
        if metadata.date_retrieved is not None:
            return metadata
        return metadata.model_copy(update={"date_retrieved": utc_now()})

    def _prepare_meeting(self, metadata: MeetingMetadata) -> MeetingMetadata:
        """Derive ``working_group`` from ``tbid`` when the group is not set."""
        if metadata.working_group is not None or not metadata.tbid:
            return metadata
        for candidate in WorkingGroup:
            if candidate.tbid == metadata.tbid:
                return metadata.model_copy(update={"working_group": candidate})
        # No working group matches this tbid; leave the record untouched.
        return metadata

    # ------------------------------------------------------------------
    # Data clearing methods
    # ------------------------------------------------------------------
    def _table_exists(self, table_name: str) -> bool:
        """Check if a table exists in the database.

        Queries ``sqlite_master`` via the underlying connection's raw handle.

        Args:
            table_name: Name of the table to check

        Returns:
            True if table exists, False otherwise
        """
        row = self.connection._db.execute(
            "SELECT name FROM sqlite_master WHERE type='table' AND name=?",
            (table_name,),
        ).fetchone()
        return row is not None

    def __init__(self, db_path: Path) -> None:
        """Remember the target path; the connection is opened in ``__enter__``."""
        self.db_path = db_path
        self._database: DataBase | None = None

    # ------------------------------------------------------------------
    # Context manager lifecycle
    # ------------------------------------------------------------------
    def __enter__(self) -> TDocDatabase:
        """Open the database, creating parent directories and seeding reference data.

        Raises:
            DatabaseError: With code ``database-initialization-failed`` when
                any step of the setup fails; the original exception is chained.
        """
        try:
            self.db_path.parent.mkdir(parents=True, exist_ok=True)
            self._database = DataBase(self.db_path)
            self._ensure_reference_data()
        except Exception as error:
            raise DatabaseError("database-initialization-failed", detail=str(error)) from error
        return self

    def __exit__(self, exc_type, exc, exc_tb) -> None:
        """Drop the connection reference; any in-flight exception propagates."""
        self._database = None

    # ------------------------------------------------------------------
    # Core accessors and utilities
    # ------------------------------------------------------------------
    @property
    def connection(self) -> DataBase:
        """Expose the underlying DataBase instance (read-only).

        Raises:
            DatabaseError: When accessed outside the context-manager lifetime.
        """
        database = self._database
        if database is None:
            raise DatabaseError.connection_not_open()
        return database

    # ------------------------------------------------------------------
    # TDoc operations
    # ------------------------------------------------------------------
@@ -513,60 +567,6 @@ class TDocDatabase:
            "recent_crawls": recent_crawls,
        }

    # ------------------------------------------------------------------
    # Normalisation helpers
    # ------------------------------------------------------------------
    def _prepare_tdoc(self, metadata: TDocMetadata) -> TDocMetadata:
        """Stamp ``date_retrieved`` with the current UTC time when it is unset."""
        if metadata.date_retrieved is not None:
            return metadata
        return metadata.model_copy(update={"date_retrieved": utc_now()})

    def _prepare_meeting(self, metadata: MeetingMetadata) -> MeetingMetadata:
        """Derive ``working_group`` from ``tbid`` when the group is not set."""
        if metadata.working_group is not None or not metadata.tbid:
            return metadata
        for candidate in WorkingGroup:
            if candidate.tbid == metadata.tbid:
                return metadata.model_copy(update={"working_group": candidate})
        # No working group matches this tbid; leave the record untouched.
        return metadata

    @staticmethod
    def _tdoc_changed(current: TDocMetadata, candidate: TDocMetadata) -> bool:
        """Return True when any model field other than ``date_updated`` differs."""
        return any(
            getattr(current, field) != getattr(candidate, field)
            for field in TDocMetadata.model_fields
            if field != "date_updated"
        )

    @staticmethod
    def _meeting_changed(current: MeetingMetadata, candidate: MeetingMetadata) -> bool:
        """Return True when any field differs, ignoring bookkeeping columns."""
        return any(
            getattr(current, field) != getattr(candidate, field)
            for field in MeetingMetadata.model_fields
            if field not in {"updated_at", "last_synced", "tdoc_count"}
        )

    # ------------------------------------------------------------------
    # Data clearing methods
    # ------------------------------------------------------------------
    def _table_exists(self, table_name: str) -> bool:
        """Check if a table exists in the database.

        Queries ``sqlite_master`` via the underlying connection's raw handle.

        Args:
            table_name: Name of the table to check

        Returns:
            True if table exists, False otherwise
        """
        row = self.connection._db.execute(
            "SELECT name FROM sqlite_master WHERE type='table' AND name=?",
            (table_name,),
        ).fetchone()
        return row is not None

    def clear_tdocs(self) -> int:
        """Clear all TDoc records from database.

+4 −4
Original line number Diff line number Diff line
@@ -6,10 +6,6 @@ from __future__ import annotations
class DatabaseError(RuntimeError):
    """Raised when a database constraint or execution fails."""

    def __init__(self, code: str, *, detail: str | None = None) -> None:
        """Build the message as ``code`` alone or ``code:detail`` when detail is set."""
        if detail:
            message = f"{code}:{detail}"
        else:
            message = code
        super().__init__(message)

    @classmethod
    def connection_not_open(cls) -> DatabaseError:
        """Factory for the ``connection-not-open`` error."""
        return cls("connection-not-open")
@@ -25,3 +21,7 @@ class DatabaseError(RuntimeError):
    @classmethod
    def missing_datetime(cls) -> DatabaseError:
        """Factory for the ``missing-required-datetime`` error."""
        return cls("missing-required-datetime")

    def __init__(self, code: str, *, detail: str | None = None) -> None:
        """Build the message as ``code`` alone or ``code:detail`` when detail is set."""
        if detail:
            message = f"{code}:{detail}"
        else:
            message = code
        super().__init__(message)
+12 −12
Original line number Diff line number Diff line
@@ -122,6 +122,18 @@ class QueryConfig(BaseConfigModel):
    limit: int | None = Field(None, ge=1, description="Maximum results")
    order: SortOrder = Field(SortOrder.DESC, description="Sort order applied to date_retrieved")

    @field_validator("working_groups", mode="before")
    @classmethod
    def _normalize_working_groups(cls, value: Iterable[str | WorkingGroup] | None) -> list[WorkingGroup] | None:
        """Coerce raw strings into WorkingGroup enum members before validation."""
        if value is None:
            return None
        return [
            item if isinstance(item, WorkingGroup) else WorkingGroup(item)
            for item in value
        ]

    def __init__(self, **data: object) -> None:
        """Normalize identifiers and accept enum values from strings."""

@@ -137,18 +149,6 @@ class QueryConfig(BaseConfigModel):
            data["output_format"] = OutputFormat(output_format.lower())
        super().__init__(**data)

    @field_validator("working_groups", mode="before")
    @classmethod
    def _normalize_working_groups(cls, value: Iterable[str | WorkingGroup] | None) -> list[WorkingGroup] | None:
        """Coerce raw strings into WorkingGroup enum members before validation."""
        if value is None:
            return None
        return [
            item if isinstance(item, WorkingGroup) else WorkingGroup(item)
            for item in value
        ]


# Legacy alias
CrawlConfig = TDocCrawlConfig