Commit cbb8b0d0 authored by Jan Reimes
Browse files

refactor(database): remove unused database methods

Remove unused methods: DatabaseError.*, MeetingDatabase.get_tdoc_count_for_meeting,
MeetingDatabase.update_meeting_tdoc_count, MeetingDatabase.get_subgroup_by_code,
SpecDatabase.log_spec_download, TDocDatabase.get_processed_meetings,
TDocDatabase.cache_invalid_tdoc, TDocDatabase.get_cached_invalid_tdocs.
parent 7fbb8964
Loading
Loading
Loading
Loading
+0 −17
Original line number Diff line number Diff line
@@ -69,14 +69,6 @@ class DocDatabase:
            raise DatabaseError.connection_not_open()
        return self._database

    def clear_all_data(self) -> dict[str, int]:
        """Remove every TDoc and meeting record from the database.

        Returns:
            Mapping of table name to deleted count
        """
        all_tables = list(self.model_map.keys())
        return self._clear_tables(all_tables)

    def clear_tdocs(self) -> int:
        """Clear all TDoc records from database.

@@ -86,15 +78,6 @@ class DocDatabase:
        counts = self._clear_tables(["tdocs"])
        return counts.get("tdocs", 0)

    def clear_meetings(self) -> int:
        """Delete every meeting record from the database.

        Returns:
            Number of meetings deleted
        """
        deleted = self._clear_tables(["meetings"])
        return deleted.get("meetings", 0)

    def clear_specs(self) -> dict[str, int]:
        """Clear all spec-related records from database.

+0 −12
Original line number Diff line number Diff line
@@ -13,15 +13,3 @@ class DatabaseError(RuntimeError):
    @classmethod
    def connection_not_open(cls) -> DatabaseError:
        """Build the error raised when an operation runs before the connection is opened."""
        return cls("connection-not-open")

    @classmethod
    def crawl_log_persist_failed(cls) -> DatabaseError:
        """Build the error raised when a crawl-log entry could not be persisted."""
        return cls("crawl-log-persist-failed")

    @classmethod
    def parse_failure(cls, entity: str, detail: str) -> DatabaseError:
        """Build the error raised when *entity* could not be parsed; *detail* carries context."""
        message = f"{entity}-parse-failed"
        return cls(message, detail=detail)

    @classmethod
    def missing_datetime(cls) -> DatabaseError:
        """Build the error raised when a required datetime value is absent."""
        return cls("missing-required-datetime")
+1 −52
Original line number Diff line number Diff line
@@ -8,7 +8,7 @@ from tdoc_crawler.database.base import DocDatabase
from tdoc_crawler.logging import get_logger
from tdoc_crawler.meetings.models import MeetingMetadata, MeetingQueryConfig
from tdoc_crawler.models.base import SortOrder
from tdoc_crawler.models.subworking_groups import CODE_INDEX, SUBTB_INDEX
from tdoc_crawler.models.subworking_groups import SUBTB_INDEX
from tdoc_crawler.models.working_groups import WORKING_GROUP_RECORDS, WorkingGroup
from tdoc_crawler.utils.misc import utc_now
from tdoc_crawler.utils.normalization import normalize_portal_meeting_name
@@ -147,37 +147,6 @@ class MeetingDatabase(DocDatabase):
        allowed = {wg.tbid for wg in working_groups}
        return {meeting.meeting_id for meeting in meetings if meeting.tbid in allowed}

    def get_tdoc_count_for_meeting(self, meeting_id: int) -> int:
        """Get the number of TDocs associated with a meeting.

        Args:
            meeting_id: The meeting identifier

        Returns:
            Number of TDocs for this meeting
        """
        matching = [t for t in self._table_rows("tdocs") if t.meeting_id == meeting_id]
        return len(matching)

    def update_meeting_tdoc_count(self, meeting_id: int, tdoc_count: int) -> None:
        """Update the tdoc_count field for a meeting.

        Silently does nothing when the meeting is not found.

        Args:
            meeting_id: The meeting identifier
            tdoc_count: The new TDoc count
        """
        meeting = self._get_meeting(meeting_id)
        if meeting is None:
            return

        refreshed = meeting.model_copy(
            update={"tdoc_count": tdoc_count, "updated_at": utc_now()}
        )
        self.connection.add("meetings", refreshed, pk="meeting_id")

    def resolve_meeting_id(self, meeting_name: str) -> int | None:
        """Resolve meeting name to meeting_id from database.

@@ -279,26 +248,6 @@ class MeetingDatabase(DocDatabase):
            "recent_crawls": recent_crawls,
        }

    @staticmethod
    def get_subgroup_by_code(code: str) -> dict[str, int | str] | None:
        """Look up subgroup metadata by its code.

        Args:
            code: Subgroup code (e.g., "S4", "R1")

        Returns:
            Dictionary with subgroup metadata or None if not found
        """
        normalized = code.strip().upper()
        record = CODE_INDEX.get(normalized)
        if record is None:
            return None
        return {field: getattr(record, field) for field in ("subtb", "tbid", "code", "name")}

    def _meeting_map(self) -> dict[int, MeetingMetadata]:
        """Build a lookup from meeting ID to its metadata record."""
        mapping: dict[int, MeetingMetadata] = {}
        for row in self._table_rows("meetings"):
            mapping[row.meeting_id] = row
        return mapping
+0 −15
Original line number Diff line number Diff line
@@ -12,7 +12,6 @@ from tdoc_crawler.database.base import DocDatabase
from tdoc_crawler.logging import get_logger
from tdoc_crawler.specs.models import (
    Specification,
    SpecificationDownload,
    SpecificationSourceRecord,
    SpecificationVersion,
    SpecQueryFilters,
@@ -188,20 +187,6 @@ class SpecDatabase(DocDatabase):
        except Exception:
            return []

    def log_spec_download(self, download: SpecificationDownload) -> None:
        """Persist download/extraction outcomes for a spec version."""
        key = download.record_id or f"{download.spec_number}:{download.version}"
        # SQLite cannot store Path objects directly, so stringify them first.
        normalized = download.model_copy(
            update={
                "record_id": key,
                "checkout_path": str(download.checkout_path),
                "document_path": str(download.document_path),
                "attachment_paths": [str(path) for path in download.attachment_paths],
            }
        )
        self.connection.add("spec_downloads", normalized, pk="record_id")

    def query_specs(self, filters: SpecQueryFilters) -> list[SpecQueryResult]:
        """Query stored spec metadata."""
        specs = self._spec_table_rows()
+0 −84
Original line number Diff line number Diff line
@@ -4,7 +4,6 @@ import fnmatch
import sqlite3
from collections.abc import Callable, Iterable
from datetime import UTC, datetime
from decimal import Decimal
from typing import Any

from tdoc_crawler.database.meetings import MeetingDatabase
@@ -160,89 +159,6 @@ class TDocDatabase(MeetingDatabase):
                result.add(record.tdoc_id)
        return result

    def get_processed_meetings(
        self,
        working_groups: Iterable[WorkingGroup] | None = None,
        subgroups: Iterable[str] | None = None,
    ) -> set[int]:
        """Get set of meeting IDs that have TDocs stored.

        Args:
            working_groups: Optional list of working groups to filter by
            subgroups: Optional list of subgroup codes to filter by

        Returns:
            Set of meeting IDs
        """
        records = self._table_rows("tdocs")

        # TDocDatabase inherits from MeetingDatabase, so the meeting map can be
        # read through the already-open connection; no need to instantiate a
        # second MeetingDatabase and poke its private _database attribute.
        meeting_map = self._meeting_map()

        allowed_tbids = {wg.tbid for wg in working_groups} if working_groups else None
        allowed_subgroups = {value.strip().upper() for value in subgroups} if subgroups else None

        processed: set[int] = set()
        for record in records:
            meeting_id = record.meeting_id
            if meeting_id is None:
                # TDoc not linked to any meeting — cannot attribute it.
                continue
            meeting = meeting_map.get(meeting_id)
            if meeting is None:
                # TDoc references a meeting that is no longer stored.
                continue
            if not self._meeting_matches_filters(meeting, allowed_tbids, allowed_subgroups):
                continue
            processed.add(meeting_id)
        return processed

    def cache_invalid_tdoc(
        self,
        tdoc_id: str,
        url: str,
        working_group: WorkingGroup,
        subgroup: str,
    ) -> None:
        """Cache an invalid TDoc (for deduplication during crawling).

        Args:
            tdoc_id: TDoc identifier
            url: TDoc URL
            working_group: Working group
            subgroup: Subgroup code
        """
        # Accepted for interface compatibility; not stored on the record.
        _ = (working_group, subgroup)
        if self._get_tdoc(tdoc_id) is not None:
            return

        # Placeholder values mark the record as a known-invalid entry.
        placeholder = {
            "tdoc_id": tdoc_id,
            "url": url,
            "meeting_id": 0,
            "title": "Unknown",
            "contact": "Unknown",
            "source": "Unknown",
            "tdoc_type": "unknown",
            "for_purpose": "unknown",
            "agenda_item_nbr": Decimal("0.0"),
            "agenda_item_text": "Unknown",
            "status": None,
            "is_revision_of": None,
            "file_size": None,
            "date_created": None,
            "validated": False,
            "validation_failed": True,
        }
        metadata = TDocMetadata(**placeholder)
        self.connection.add("tdocs", metadata.model_copy(update={"date_updated": utc_now()}), pk="tdoc_id")

    def get_cached_invalid_tdocs(self) -> set[str]:
        """Get set of TDoc IDs marked as invalid.

        Returns:
            Set of invalid TDoc IDs
        """
        invalid: set[str] = set()
        for record in self._table_rows("tdocs"):
            if record.validation_failed:
                invalid.add(record.tdoc_id)
        return invalid

    def _apply_pattern_filters(self, records: list[TDocMetadata], config: TDocQueryConfig) -> list[TDocMetadata]:
        """Apply all supported include/exclude glob filters."""
        filtered = records