Commit 25433cc1 authored by Jan Reimes's avatar Jan Reimes
Browse files

♻️ refactor(core): reorganize DB helpers and crawler entrypoints

* Move __init__ and crawl implementations into canonical positions for
  MeetingCrawler and TDocCrawler to improve readability (no behaviour
  change).
* Consolidate and hoist TDocDatabase helpers: introduce single change-
  detection helpers (_tdoc_changed, _meeting_changed) and centralize
  normalization helpers to remove duplicates.
* Add docstring for _table_exists and tidy connection/initialization
  ordering for clearer lifecycle handling.
* Reposition DatabaseError.__init__ for readability.
* Move QueryConfig working_groups validator earlier so input
  normalization runs before model initialization.
parent d855341f
Loading
Loading
Loading
Loading
+53 −53
Original line number Diff line number Diff line
@@ -105,59 +105,6 @@ class MeetingCrawlResult:
class MeetingCrawler:
    """Crawler fetching meeting metadata from the 3GPP portal."""

    def __init__(self, database: TDocDatabase) -> None:
        """Bind the crawler to the database facade used for persistence."""
        self.database = database

    def crawl(self, config: MeetingCrawlConfig, progress_callback: Callable[[float, float], None] | None = None) -> MeetingCrawlResult:
        """Fetch meeting metadata for the configured working groups.

        Downloads every registered meeting page for the (limit-reduced) set of
        working groups, parses the pages, drops already-known meetings when
        ``config.incremental`` is set, applies the crawl limits, and finally
        upserts the survivors into the database.

        Args:
            config: Crawl options (working groups, subgroup filter, limits,
                credentials, timeout, incremental flag).
            progress_callback: Optional callback forwarded to
                ``bulk_upsert_meetings`` so progress updates after each DB op.

        Returns:
            A ``MeetingCrawlResult`` with processed/inserted/updated counts and
            any per-page error messages collected along the way.
        """
        errors: list[str] = []
        collected: list[MeetingMetadata] = []

        groups = self._limit_working_groups(config.working_groups, config.limits)
        # Only pay for the existing-ID lookup when incremental mode is on.
        known_ids: set[int] = (
            self.database.get_existing_meeting_ids(groups) if config.incremental else set()
        )

        session = requests.Session()
        session.headers["User-Agent"] = "tdoc-crawler/0.0.1"
        if config.credentials is not None:
            session.auth = (config.credentials.username, config.credentials.password)

        try:
            for group in groups:
                for code, subgroup in MEETING_CODE_REGISTRY.get(group.value, []):
                    # Honour the optional subgroup filter.
                    if config.subgroups and subgroup not in config.subgroups:
                        continue
                    url = MEETINGS_BASE_URL.format(code=code)
                    try:
                        response = session.get(url, timeout=config.timeout)
                        response.raise_for_status()
                    except requests.RequestException as exc:
                        message = f"Meeting crawl failed for {code}: {exc}"
                        logger.warning(message)
                        errors.append(message)
                        continue
                    collected.extend(
                        meeting
                        for meeting in self._parse_meeting_page(response.text, group, subgroup)
                        if not (config.incremental and meeting.meeting_id in known_ids)
                    )
        finally:
            session.close()

        filtered = self._apply_limits(collected, config.limits)
        inserted = updated = 0
        if filtered:
            # bulk_upsert_meetings drives the progress callback per DB operation.
            inserted, updated = self.database.bulk_upsert_meetings(filtered, progress_callback=progress_callback)

        return MeetingCrawlResult(
            processed=len(filtered),
            inserted=inserted,
            updated=updated,
            errors=errors,
        )

    def _limit_working_groups(self, working_groups: list[WorkingGroup], limits: CrawlLimits) -> list[WorkingGroup]:
        if limits.limit_wgs is None or limits.limit_wgs == 0:
            return working_groups
@@ -322,6 +269,59 @@ class MeetingCrawler:
        allowed = set(sequence[:limit]) if limit > 0 else set(sequence[limit:])
        return [meeting for meeting in meetings if meeting.meeting_id in allowed]

    def __init__(self, database: TDocDatabase) -> None:
        """Bind the crawler to the database facade used for persistence."""
        self.database = database

    def crawl(self, config: MeetingCrawlConfig, progress_callback: Callable[[float, float], None] | None = None) -> MeetingCrawlResult:
        """Fetch meeting metadata for the configured working groups.

        Downloads every registered meeting page for the (limit-reduced) set of
        working groups, parses the pages, drops already-known meetings when
        ``config.incremental`` is set, applies the crawl limits, and finally
        upserts the survivors into the database.

        Args:
            config: Crawl options (working groups, subgroup filter, limits,
                credentials, timeout, incremental flag).
            progress_callback: Optional callback forwarded to
                ``bulk_upsert_meetings`` so progress updates after each DB op.

        Returns:
            A ``MeetingCrawlResult`` with processed/inserted/updated counts and
            any per-page error messages collected along the way.
        """
        errors: list[str] = []
        collected: list[MeetingMetadata] = []

        groups = self._limit_working_groups(config.working_groups, config.limits)
        # Only pay for the existing-ID lookup when incremental mode is on.
        known_ids: set[int] = (
            self.database.get_existing_meeting_ids(groups) if config.incremental else set()
        )

        session = requests.Session()
        session.headers["User-Agent"] = "tdoc-crawler/0.0.1"
        if config.credentials is not None:
            session.auth = (config.credentials.username, config.credentials.password)

        try:
            for group in groups:
                for code, subgroup in MEETING_CODE_REGISTRY.get(group.value, []):
                    # Honour the optional subgroup filter.
                    if config.subgroups and subgroup not in config.subgroups:
                        continue
                    url = MEETINGS_BASE_URL.format(code=code)
                    try:
                        response = session.get(url, timeout=config.timeout)
                        response.raise_for_status()
                    except requests.RequestException as exc:
                        message = f"Meeting crawl failed for {code}: {exc}"
                        logger.warning(message)
                        errors.append(message)
                        continue
                    collected.extend(
                        meeting
                        for meeting in self._parse_meeting_page(response.text, group, subgroup)
                        if not (config.incremental and meeting.meeting_id in known_ids)
                    )
        finally:
            session.close()

        filtered = self._apply_limits(collected, config.limits)
        inserted = updated = 0
        if filtered:
            # bulk_upsert_meetings drives the progress callback per DB operation.
            inserted, updated = self.database.bulk_upsert_meetings(filtered, progress_callback=progress_callback)

        return MeetingCrawlResult(
            processed=len(filtered),
            inserted=inserted,
            updated=updated,
            errors=errors,
        )


__all__ = [
    "MEETING_CODE_REGISTRY",
+37 −37
Original line number Diff line number Diff line
@@ -29,43 +29,6 @@ class TDocCrawlResult:
class TDocCrawler:
    """Configuration-driven crawler orchestrating subinterpreter workers."""

    def __init__(self, database: TDocDatabase) -> None:
        """Bind the crawler to the database facade used for persistence."""
        self.database = database

    def crawl(self, config: TDocCrawlConfig, progress_callback: Callable[[float, float], None] | None = None) -> TDocCrawlResult:
        """Execute a crawl using the provided configuration.

        Resolves the meeting list, applies limits, optionally filters out
        already-stored TDoc IDs, fans the work out to parallel workers, and
        bulk-upserts whatever was collected.

        Args:
            config: Crawl options (working groups, limits, incremental flag,
                optional explicit target IDs).
            progress_callback: Optional callback forwarded to
                ``bulk_upsert_tdocs``.

        Returns:
            A ``TDocCrawlResult`` with processed/inserted/updated counts and
            worker error messages.
        """
        meetings = self._get_meetings_to_crawl(config)
        if not meetings:
            logger.warning("No meetings found matching criteria")
            return TDocCrawlResult(processed=0, inserted=0, updated=0, errors=["No meetings found"])

        meetings = self._apply_meeting_limits(meetings, config.limits)
        if not meetings:
            logger.warning("Meetings filtered out by limits configuration")
            return TDocCrawlResult(processed=0, inserted=0, updated=0, errors=["No meetings found after applying limits"])

        # Known IDs are compared case-insensitively, hence the upper-casing.
        known_ids: set[str] = set()
        if config.incremental:
            known_ids = {existing.upper() for existing in self.database.get_existing_tdoc_ids(config.working_groups)}

        wanted = set(config.target_ids) if config.target_ids else None

        results: list[TDocMetadata] = []
        # Workers append into `results` in place; errors come back as the return value.
        errors = asyncio.run(self._crawl_meetings_parallel(meetings, config, results, known_ids, wanted))

        inserted = updated = 0
        if results:
            inserted, updated = self.database.bulk_upsert_tdocs(results, progress_callback=progress_callback)

        return TDocCrawlResult(
            processed=len(results),
            inserted=inserted,
            updated=updated,
            errors=errors,
        )

    def _apply_meeting_limits(self, meetings: list, limits: CrawlLimits) -> list:
        """Apply crawl limits to the meeting list."""

@@ -280,6 +243,43 @@ class TDocCrawler:

        return False

    def __init__(self, database: TDocDatabase) -> None:
        """Bind the crawler to the database facade used for persistence."""
        self.database = database

    def crawl(self, config: TDocCrawlConfig, progress_callback: Callable[[float, float], None] | None = None) -> TDocCrawlResult:
        """Execute a crawl using the provided configuration.

        Resolves the meeting list, applies limits, optionally filters out
        already-stored TDoc IDs, fans the work out to parallel workers, and
        bulk-upserts whatever was collected.

        Args:
            config: Crawl options (working groups, limits, incremental flag,
                optional explicit target IDs).
            progress_callback: Optional callback forwarded to
                ``bulk_upsert_tdocs``.

        Returns:
            A ``TDocCrawlResult`` with processed/inserted/updated counts and
            worker error messages.
        """
        meetings = self._get_meetings_to_crawl(config)
        if not meetings:
            logger.warning("No meetings found matching criteria")
            return TDocCrawlResult(processed=0, inserted=0, updated=0, errors=["No meetings found"])

        meetings = self._apply_meeting_limits(meetings, config.limits)
        if not meetings:
            logger.warning("Meetings filtered out by limits configuration")
            return TDocCrawlResult(processed=0, inserted=0, updated=0, errors=["No meetings found after applying limits"])

        # Known IDs are compared case-insensitively, hence the upper-casing.
        known_ids: set[str] = set()
        if config.incremental:
            known_ids = {existing.upper() for existing in self.database.get_existing_tdoc_ids(config.working_groups)}

        wanted = set(config.target_ids) if config.target_ids else None

        results: list[TDocMetadata] = []
        # Workers append into `results` in place; errors come back as the return value.
        errors = asyncio.run(self._crawl_meetings_parallel(meetings, config, results, known_ids, wanted))

        inserted = updated = 0
        if results:
            inserted, updated = self.database.bulk_upsert_tdocs(results, progress_callback=progress_callback)

        return TDocCrawlResult(
            processed=len(results),
            inserted=inserted,
            updated=updated,
            errors=errors,
        )


__all__ = [
    "TDocCrawlResult",
+82 −82
Original line number Diff line number Diff line
@@ -27,35 +27,23 @@ from tdoc_crawler.models.subworking_groups import SUBWORKING_GROUP_RECORDS
class TDocDatabase:
    """High-level facade for all database operations."""

    def __init__(self, db_path: Path) -> None:
        """Remember the target path; the connection is opened in ``__enter__``."""
        self.db_path = db_path
        self._database: DataBase | None = None

    # ------------------------------------------------------------------
    # Context manager lifecycle
    # ------------------------------------------------------------------
    def __enter__(self) -> TDocDatabase:
        """Open the database, creating parent directories and seeding reference data.

        Raises:
            DatabaseError: With code ``database-initialization-failed`` when
                any step of the setup fails; the original exception is chained.
        """
        try:
            self.db_path.parent.mkdir(parents=True, exist_ok=True)
            self._database = DataBase(self.db_path)
            self._ensure_reference_data()
        except Exception as error:
            raise DatabaseError("database-initialization-failed", detail=str(error)) from error
        return self

    def __exit__(self, exc_type, exc, exc_tb) -> None:
        """Drop the connection reference; any in-flight exception propagates."""
        self._database = None

    # ------------------------------------------------------------------
    # Core accessors and utilities
    # ------------------------------------------------------------------
    @property
    def connection(self) -> DataBase:
        """Expose the underlying DataBase instance (read-only)."""
    @staticmethod
    def _tdoc_changed(current: TDocMetadata, candidate: TDocMetadata) -> bool:
        """Return True when any model field other than ``date_updated`` differs."""
        return any(
            getattr(current, field) != getattr(candidate, field)
            for field in TDocMetadata.model_fields
            if field != "date_updated"
        )

        if self._database is None:
            raise DatabaseError.connection_not_open()
        return self._database
    @staticmethod
    def _meeting_changed(current: MeetingMetadata, candidate: MeetingMetadata) -> bool:
        """Return True when any field differs, ignoring bookkeeping columns."""
        return any(
            getattr(current, field) != getattr(candidate, field)
            for field in MeetingMetadata.model_fields
            if field not in {"updated_at", "last_synced", "tdoc_count"}
        )

    def _ensure_reference_data(self) -> None:
        """Populate reference tables for working and subworking groups."""
@@ -121,6 +109,72 @@ class TDocDatabase:
        except KeyError:
            return None

    # ------------------------------------------------------------------
    # Normalisation helpers
    # ------------------------------------------------------------------
    def _prepare_tdoc(self, metadata: TDocMetadata) -> TDocMetadata:
        """Stamp ``date_retrieved`` with the current UTC time when it is unset."""
        if metadata.date_retrieved is not None:
            return metadata
        return metadata.model_copy(update={"date_retrieved": utc_now()})

    def _prepare_meeting(self, metadata: MeetingMetadata) -> MeetingMetadata:
        """Derive ``working_group`` from ``tbid`` when the group is not set."""
        if metadata.working_group is not None or not metadata.tbid:
            return metadata
        for candidate in WorkingGroup:
            if candidate.tbid == metadata.tbid:
                return metadata.model_copy(update={"working_group": candidate})
        # No working group matches this tbid; leave the record untouched.
        return metadata

    # ------------------------------------------------------------------
    # Data clearing methods
    # ------------------------------------------------------------------
    def _table_exists(self, table_name: str) -> bool:
        """Check if a table exists in the database.

        Queries ``sqlite_master`` via the underlying connection's raw handle.

        Args:
            table_name: Name of the table to check

        Returns:
            True if table exists, False otherwise
        """
        row = self.connection._db.execute(
            "SELECT name FROM sqlite_master WHERE type='table' AND name=?",
            (table_name,),
        ).fetchone()
        return row is not None

    def __init__(self, db_path: Path) -> None:
        """Remember the target path; the connection is opened in ``__enter__``."""
        self.db_path = db_path
        self._database: DataBase | None = None

    # ------------------------------------------------------------------
    # Context manager lifecycle
    # ------------------------------------------------------------------
    def __enter__(self) -> TDocDatabase:
        """Open the database, creating parent directories and seeding reference data.

        Raises:
            DatabaseError: With code ``database-initialization-failed`` when
                any step of the setup fails; the original exception is chained.
        """
        try:
            self.db_path.parent.mkdir(parents=True, exist_ok=True)
            self._database = DataBase(self.db_path)
            self._ensure_reference_data()
        except Exception as error:
            raise DatabaseError("database-initialization-failed", detail=str(error)) from error
        return self

    def __exit__(self, exc_type, exc, exc_tb) -> None:
        """Drop the connection reference; any in-flight exception propagates."""
        self._database = None

    # ------------------------------------------------------------------
    # Core accessors and utilities
    # ------------------------------------------------------------------
    @property
    def connection(self) -> DataBase:
        """Expose the underlying DataBase instance (read-only).

        Raises:
            DatabaseError: When accessed outside the context-manager lifetime.
        """
        database = self._database
        if database is None:
            raise DatabaseError.connection_not_open()
        return database

    # ------------------------------------------------------------------
    # TDoc operations
    # ------------------------------------------------------------------
@@ -513,60 +567,6 @@ class TDocDatabase:
            "recent_crawls": recent_crawls,
        }

    # ------------------------------------------------------------------
    # Normalisation helpers
    # ------------------------------------------------------------------
    def _prepare_tdoc(self, metadata: TDocMetadata) -> TDocMetadata:
        """Stamp ``date_retrieved`` with the current UTC time when it is unset."""
        if metadata.date_retrieved is not None:
            return metadata
        return metadata.model_copy(update={"date_retrieved": utc_now()})

    def _prepare_meeting(self, metadata: MeetingMetadata) -> MeetingMetadata:
        """Derive ``working_group`` from ``tbid`` when the group is not set."""
        if metadata.working_group is not None or not metadata.tbid:
            return metadata
        for candidate in WorkingGroup:
            if candidate.tbid == metadata.tbid:
                return metadata.model_copy(update={"working_group": candidate})
        # No working group matches this tbid; leave the record untouched.
        return metadata

    @staticmethod
    def _tdoc_changed(current: TDocMetadata, candidate: TDocMetadata) -> bool:
        """Return True when any model field other than ``date_updated`` differs."""
        return any(
            getattr(current, field) != getattr(candidate, field)
            for field in TDocMetadata.model_fields
            if field != "date_updated"
        )

    @staticmethod
    def _meeting_changed(current: MeetingMetadata, candidate: MeetingMetadata) -> bool:
        """Return True when any field differs, ignoring bookkeeping columns."""
        return any(
            getattr(current, field) != getattr(candidate, field)
            for field in MeetingMetadata.model_fields
            if field not in {"updated_at", "last_synced", "tdoc_count"}
        )

    # ------------------------------------------------------------------
    # Data clearing methods
    # ------------------------------------------------------------------
    def _table_exists(self, table_name: str) -> bool:
        """Check if a table exists in the database.

        Queries ``sqlite_master`` via the underlying connection's raw handle.

        Args:
            table_name: Name of the table to check

        Returns:
            True if table exists, False otherwise
        """
        row = self.connection._db.execute(
            "SELECT name FROM sqlite_master WHERE type='table' AND name=?",
            (table_name,),
        ).fetchone()
        return row is not None

    def clear_tdocs(self) -> int:
        """Clear all TDoc records from database.

+4 −4
Original line number Diff line number Diff line
@@ -6,10 +6,6 @@ from __future__ import annotations
class DatabaseError(RuntimeError):
    """Raised when a database constraint or execution fails."""

    def __init__(self, code: str, *, detail: str | None = None) -> None:
        """Build the message as ``code`` alone or ``code:detail`` when detail is set."""
        if detail:
            message = f"{code}:{detail}"
        else:
            message = code
        super().__init__(message)

    @classmethod
    def connection_not_open(cls) -> DatabaseError:
        """Factory for the ``connection-not-open`` error."""
        return cls("connection-not-open")
@@ -25,3 +21,7 @@ class DatabaseError(RuntimeError):
    @classmethod
    def missing_datetime(cls) -> DatabaseError:
        """Factory for the ``missing-required-datetime`` error."""
        return cls("missing-required-datetime")

    def __init__(self, code: str, *, detail: str | None = None) -> None:
        """Build the message as ``code`` alone or ``code:detail`` when detail is set."""
        if detail:
            message = f"{code}:{detail}"
        else:
            message = code
        super().__init__(message)
+12 −12
Original line number Diff line number Diff line
@@ -122,6 +122,18 @@ class QueryConfig(BaseConfigModel):
    limit: int | None = Field(None, ge=1, description="Maximum results")
    order: SortOrder = Field(SortOrder.DESC, description="Sort order applied to date_retrieved")

    @field_validator("working_groups", mode="before")
    @classmethod
    def _normalize_working_groups(cls, value: Iterable[str | WorkingGroup] | None) -> list[WorkingGroup] | None:
        """Coerce raw strings into WorkingGroup enum members before validation."""
        if value is None:
            return None
        return [
            item if isinstance(item, WorkingGroup) else WorkingGroup(item)
            for item in value
        ]

    def __init__(self, **data: object) -> None:
        """Normalize identifiers and accept enum values from strings."""

@@ -137,18 +149,6 @@ class QueryConfig(BaseConfigModel):
            data["output_format"] = OutputFormat(output_format.lower())
        super().__init__(**data)

    @field_validator("working_groups", mode="before")
    @classmethod
    def _normalize_working_groups(cls, value: Iterable[str | WorkingGroup] | None) -> list[WorkingGroup] | None:
        """Coerce raw strings into WorkingGroup enum members before validation."""
        if value is None:
            return None
        return [
            item if isinstance(item, WorkingGroup) else WorkingGroup(item)
            for item in value
        ]


# Legacy alias
CrawlConfig = TDocCrawlConfig