Commit cbb8b0d0 authored by Jan Reimes
Browse files

refactor(database): remove unused database methods

Remove unused methods: DatabaseError.*, MeetingDatabase.get_tdoc_count_for_meeting,
MeetingDatabase.update_meeting_tdoc_count, MeetingDatabase.get_subgroup_by_code,
SpecDatabase.log_spec_download, TDocDatabase.get_processed_meetings,
TDocDatabase.cache_invalid_tdoc, TDocDatabase.get_cached_invalid_tdocs.
parent 7fbb8964
Loading
Loading
Loading
Loading
+0 −17
Original line number Diff line number Diff line
@@ -69,14 +69,6 @@ class DocDatabase:
            raise DatabaseError.connection_not_open()
        return self._database

    def clear_all_data(self) -> dict[str, int]:
        """Remove every TDoc and meeting record from the database.

        Returns:
            Mapping of table name to deleted count
        """
        all_tables = list(self.model_map.keys())
        return self._clear_tables(all_tables)

    def clear_tdocs(self) -> int:
        """Clear all TDoc records from database.

@@ -86,15 +78,6 @@ class DocDatabase:
        counts = self._clear_tables(["tdocs"])
        return counts.get("tdocs", 0)

    def clear_meetings(self) -> int:
        """Delete every meeting record from the database.

        Returns:
            Number of meetings deleted
        """
        deleted = self._clear_tables(["meetings"])
        return deleted.get("meetings", 0)

    def clear_specs(self) -> dict[str, int]:
        """Clear all spec-related records from database.

+0 −12
Original line number Diff line number Diff line
@@ -13,15 +13,3 @@ class DatabaseError(RuntimeError):
    @classmethod
    def connection_not_open(cls) -> DatabaseError:
        """Build the error raised when an operation runs before the connection is opened."""
        return cls("connection-not-open")

    @classmethod
    def crawl_log_persist_failed(cls) -> DatabaseError:
        """Build the error raised when a crawl-log entry could not be persisted."""
        return cls("crawl-log-persist-failed")

    @classmethod
    def parse_failure(cls, entity: str, detail: str) -> DatabaseError:
        """Build the error raised when *entity* could not be parsed; *detail* carries context."""
        message = f"{entity}-parse-failed"
        return cls(message, detail=detail)

    @classmethod
    def missing_datetime(cls) -> DatabaseError:
        """Build the error raised when a required datetime value is absent."""
        return cls("missing-required-datetime")
+1 −52
Original line number Diff line number Diff line
@@ -8,7 +8,7 @@ from tdoc_crawler.database.base import DocDatabase
from tdoc_crawler.logging import get_logger
from tdoc_crawler.meetings.models import MeetingMetadata, MeetingQueryConfig
from tdoc_crawler.models.base import SortOrder
from tdoc_crawler.models.subworking_groups import CODE_INDEX, SUBTB_INDEX
from tdoc_crawler.models.subworking_groups import SUBTB_INDEX
from tdoc_crawler.models.working_groups import WORKING_GROUP_RECORDS, WorkingGroup
from tdoc_crawler.utils.misc import utc_now
from tdoc_crawler.utils.normalization import normalize_portal_meeting_name
@@ -147,37 +147,6 @@ class MeetingDatabase(DocDatabase):
        allowed = {wg.tbid for wg in working_groups}
        return {meeting.meeting_id for meeting in meetings if meeting.tbid in allowed}

    def get_tdoc_count_for_meeting(self, meeting_id: int) -> int:
        """Get the number of TDocs associated with a meeting.

        Args:
            meeting_id: The meeting identifier

        Returns:
            Number of TDocs for this meeting
        """
        matching = [t for t in self._table_rows("tdocs") if t.meeting_id == meeting_id]
        return len(matching)

    def update_meeting_tdoc_count(self, meeting_id: int, tdoc_count: int) -> None:
        """Update the tdoc_count field for a meeting.

        Silently does nothing when the meeting is not found.

        Args:
            meeting_id: The meeting identifier
            tdoc_count: The new TDoc count
        """
        meeting = self._get_meeting(meeting_id)
        if meeting is None:
            return

        refreshed = meeting.model_copy(
            update={"tdoc_count": tdoc_count, "updated_at": utc_now()}
        )
        self.connection.add("meetings", refreshed, pk="meeting_id")

    def resolve_meeting_id(self, meeting_name: str) -> int | None:
        """Resolve meeting name to meeting_id from database.

@@ -279,26 +248,6 @@ class MeetingDatabase(DocDatabase):
            "recent_crawls": recent_crawls,
        }

    @staticmethod
    def get_subgroup_by_code(code: str) -> dict[str, int | str] | None:
        """Look up subgroup metadata by its code.

        Args:
            code: Subgroup code (e.g., "S4", "R1")

        Returns:
            Dictionary with subgroup metadata or None if not found
        """
        normalized = code.strip().upper()
        record = CODE_INDEX.get(normalized)
        if record is None:
            return None
        return {field: getattr(record, field) for field in ("subtb", "tbid", "code", "name")}

    def _meeting_map(self) -> dict[int, MeetingMetadata]:
        """Build a lookup from meeting ID to its metadata record."""
        mapping: dict[int, MeetingMetadata] = {}
        for row in self._table_rows("meetings"):
            mapping[row.meeting_id] = row
        return mapping
+0 −15
Original line number Diff line number Diff line
@@ -12,7 +12,6 @@ from tdoc_crawler.database.base import DocDatabase
from tdoc_crawler.logging import get_logger
from tdoc_crawler.specs.models import (
    Specification,
    SpecificationDownload,
    SpecificationSourceRecord,
    SpecificationVersion,
    SpecQueryFilters,
@@ -188,20 +187,6 @@ class SpecDatabase(DocDatabase):
        except Exception:
            return []

    def log_spec_download(self, download: SpecificationDownload) -> None:
        """Persist download/extraction outcomes for a spec version."""
        key = download.record_id or f"{download.spec_number}:{download.version}"
        # SQLite cannot store Path objects directly, so stringify them first.
        normalized = download.model_copy(
            update={
                "record_id": key,
                "checkout_path": str(download.checkout_path),
                "document_path": str(download.document_path),
                "attachment_paths": [str(path) for path in download.attachment_paths],
            }
        )
        self.connection.add("spec_downloads", normalized, pk="record_id")

    def query_specs(self, filters: SpecQueryFilters) -> list[SpecQueryResult]:
        """Query stored spec metadata."""
        specs = self._spec_table_rows()
+0 −84
Original line number Diff line number Diff line
@@ -4,7 +4,6 @@ import fnmatch
import sqlite3
from collections.abc import Callable, Iterable
from datetime import UTC, datetime
from decimal import Decimal
from typing import Any

from tdoc_crawler.database.meetings import MeetingDatabase
@@ -160,89 +159,6 @@ class TDocDatabase(MeetingDatabase):
                result.add(record.tdoc_id)
        return result

    def get_processed_meetings(
        self,
        working_groups: Iterable[WorkingGroup] | None = None,
        subgroups: Iterable[str] | None = None,
    ) -> set[int]:
        """Get set of meeting IDs that have TDocs stored.

        Args:
            working_groups: Optional list of working groups to filter by
            subgroups: Optional list of subgroup codes to filter by

        Returns:
            Set of meeting IDs
        """
        records = self._table_rows("tdocs")

        # TDocDatabase inherits from MeetingDatabase, so the meeting map can be
        # read through the already-open connection; no need to instantiate a
        # second MeetingDatabase and poke its private _database attribute.
        meeting_map = self._meeting_map()

        allowed_tbids = {wg.tbid for wg in working_groups} if working_groups else None
        allowed_subgroups = {value.strip().upper() for value in subgroups} if subgroups else None

        processed: set[int] = set()
        for record in records:
            meeting_id = record.meeting_id
            if meeting_id is None:
                # TDoc not linked to any meeting — cannot attribute it.
                continue
            meeting = meeting_map.get(meeting_id)
            if meeting is None:
                # TDoc references a meeting that is no longer stored.
                continue
            if not self._meeting_matches_filters(meeting, allowed_tbids, allowed_subgroups):
                continue
            processed.add(meeting_id)
        return processed

    def cache_invalid_tdoc(
        self,
        tdoc_id: str,
        url: str,
        working_group: WorkingGroup,
        subgroup: str,
    ) -> None:
        """Cache an invalid TDoc (for deduplication during crawling).

        Args:
            tdoc_id: TDoc identifier
            url: TDoc URL
            working_group: Working group
            subgroup: Subgroup code
        """
        # Accepted for interface compatibility; not stored on the record.
        _ = (working_group, subgroup)
        if self._get_tdoc(tdoc_id) is not None:
            return

        # Placeholder values mark the record as a known-invalid entry.
        placeholder = {
            "tdoc_id": tdoc_id,
            "url": url,
            "meeting_id": 0,
            "title": "Unknown",
            "contact": "Unknown",
            "source": "Unknown",
            "tdoc_type": "unknown",
            "for_purpose": "unknown",
            "agenda_item_nbr": Decimal("0.0"),
            "agenda_item_text": "Unknown",
            "status": None,
            "is_revision_of": None,
            "file_size": None,
            "date_created": None,
            "validated": False,
            "validation_failed": True,
        }
        metadata = TDocMetadata(**placeholder)
        self.connection.add("tdocs", metadata.model_copy(update={"date_updated": utc_now()}), pk="tdoc_id")

    def get_cached_invalid_tdocs(self) -> set[str]:
        """Get set of TDoc IDs marked as invalid.

        Returns:
            Set of invalid TDoc IDs
        """
        invalid: set[str] = set()
        for record in self._table_rows("tdocs"):
            if record.validation_failed:
                invalid.add(record.tdoc_id)
        return invalid

    def _apply_pattern_filters(self, records: list[TDocMetadata], config: TDocQueryConfig) -> list[TDocMetadata]:
        """Apply all supported include/exclude glob filters."""
        filtered = records