Commit 77db5be7 authored by Jan Reimes's avatar Jan Reimes
Browse files

feat(cli): add title and status filtering options to command line args

- Introduced `--title` option to filter results by title.
- Added `--status` option to filter results by status.
- Updated relevant documentation to reflect these changes.
parent cce4d575
Loading
Loading
Loading
Loading
+8 −0
Original line number Diff line number Diff line
@@ -459,3 +459,11 @@ After several implementation steps, AGENTS.md files might need updates. When exp
2. Propose changes based on accumulated knowledge
3. Update only after explicit user confirmation
4. For any AGENTS.md files in the project structure, you MUST NEVER add a tree view of files or directories. This changes too often, and coding agents can easily retrieve the project's file structure using `ls` or `tree` commands when needed. AGENTS.md should focus on guidelines and patterns, not on file listings.

## Eager imports in __init__.py

Most eager imports in all __init__.py files are unnecessary, as they exist only to provide convenient/short imports for other modules. Inside a package, it is always fine to import the absolute/full module path instead.

Eager imports may only make sense for:
- *very* relevant types that are also used by consumers of this API
- constants and very simple types (like enums, or types without additional dependencies)
+6 −3
Original line number Diff line number Diff line
@@ -76,7 +76,7 @@ from tdoc_crawler.database.specs import SpecDatabase
from tdoc_crawler.http_client import create_cached_session
from tdoc_crawler.logging import set_verbosity
from tdoc_crawler.meetings.models import MeetingCrawlConfig, MeetingQueryConfig
from tdoc_crawler.meetings.operations import MeetingCrawler
from tdoc_crawler.meetings.operations.crawl import MeetingCrawler
from tdoc_crawler.models.base import OutputFormat, SortOrder
from tdoc_crawler.models.crawl_limits import CrawlLimits
from tdoc_crawler.specs.downloads import SpecDownloads
@@ -333,10 +333,13 @@ def crawl_meetings(
            if removed_specs:
                console.print("[yellow]Cleared checkout entries for specs[/yellow]")

    with TDocDatabase(db_file) as database:
        # TDocDatabase inherits from DocDatabase which handles crawl logging
    with MeetingDatabase(db_file) as database:
        # MeetingDatabase inherits from DocDatabase which handles crawl logging
        crawl_id = database.log_crawl_start("meeting", [wg.value for wg in config.working_groups], config.incremental)

        # Create crawler instance
        crawler = MeetingCrawler(database)

        # Create progress bar for meeting crawling
        with Progress(
            SpinnerColumn(),
+3 −0
Original line number Diff line number Diff line
@@ -57,7 +57,10 @@ FullMetadataOption = Annotated[bool, typer.Option("--full-metadata", help="Fetch

# Options - Specs
ClearSpecsOption = Annotated[bool, typer.Option("--clear-specs", help="Clear all specs before crawling")]
TitleOption = Annotated[str | None, typer.Option("--title", help="Filter by title contains")]
StatusOption = Annotated[str | None, typer.Option("--status", help="Filter by status")]
SpecOption = Annotated[list[str] | None, typer.Option("--spec", help="Spec number(s) (dotted or undotted)")]

SpecFileOption = Annotated[Path | None, typer.Option("--spec-file", help="File with spec numbers")]
ReleaseOption = Annotated[
    str,
+12 −11
Original line number Diff line number Diff line
@@ -103,7 +103,7 @@ class DocDatabase:
        """
        return self._clear_tables(["spec_downloads", "spec_versions", "spec_source_records", "specs"])

    def log_crawl_start(self, crawl_type: str, filters: list[str] | None, incremental: bool) -> int:
    def log_crawl_start(self, crawl_type: str, filters: list[str] | None, incremental: bool) -> str:
        """Log the start of a crawl operation.

        Args:
@@ -114,24 +114,23 @@ class DocDatabase:
        Returns:
            The ID of the created crawl log entry
        """
        from tdoc_crawler.models import CrawlLogEntry

        entry = CrawlLogEntry(
            crawl_type=crawl_type,
            filters=json.dumps(filters or []),
            working_groups=filters or [],
            incremental=incremental,
            start_time=datetime.now(),
            end_time=None,
            items_added=0,
            items_updated=0,
            errors_count=0,
            status="RUNNING",
        )
        self.connection.add("crawl_log", entry, pk=True)
        return entry.id
        self.connection.add("crawl_log", entry, pk="log_id")
        return entry.log_id

    def log_crawl_end(
        self,
        crawl_id: int,
        crawl_id: str,
        items_added: int,
        items_updated: int,
        errors_count: int,
@@ -144,11 +143,13 @@ class DocDatabase:
            items_updated: Number of existing items updated
            errors_count: Number of errors encountered
        """
        self.connection._db.execute(
            "UPDATE crawl_log SET end_time = ?, items_added = ?, items_updated = ?, errors_count = ? WHERE id = ?",
            (datetime.now(), items_added, items_updated, errors_count, crawl_id),
        # DataBase._db is a sqlite3.Connection
        db = self.connection._db
        db.execute(
            "UPDATE crawl_log SET end_time = ?, items_added = ?, items_updated = ?, errors_count = ?, status = ? WHERE log_id = ?",
            (datetime.now().isoformat(), items_added, items_updated, errors_count, "COMPLETED", crawl_id),
        )
        self.connection._db.commit()
        db.commit()

    def _ensure_reference_data(self) -> None:
        """Populate reference tables for working and subworking groups."""
+0 −60
Original line number Diff line number Diff line
@@ -8,7 +8,6 @@ from datetime import datetime
from tdoc_crawler.database.base import DocDatabase
from tdoc_crawler.meetings.models import MeetingMetadata, MeetingQueryConfig
from tdoc_crawler.models.base import SortOrder
from tdoc_crawler.models.crawl_log import CrawlLogEntry
from tdoc_crawler.models.subworking_groups import CODE_INDEX
from tdoc_crawler.models.working_groups import WORKING_GROUP_RECORDS, WorkingGroup
from tdoc_crawler.utils.misc import utc_now
@@ -248,65 +247,6 @@ class MeetingDatabase(DocDatabase):
    # ------------------------------------------------------------------
    # Crawl logging and statistics
    # ------------------------------------------------------------------
    def log_crawl_start(
        self,
        crawl_type: str,
        working_groups: Iterable[WorkingGroup] | None,
        incremental: bool,
    ) -> str:
        """Log the start of a crawl operation.

        Args:
            crawl_type: Type of crawl (e.g., "meeting", "tdoc")
            working_groups: List of working groups being crawled
            incremental: Whether this is an incremental crawl

        Returns:
            Crawl log ID
        """
        entry = CrawlLogEntry(
            crawl_type=crawl_type,
            end_time=None,
            working_groups=[wg.value for wg in working_groups or []],
            incremental=incremental,
            items_added=0,
            items_updated=0,
            errors_count=0,
            status="RUNNING",
        )
        self.connection.add("crawl_log", entry, pk="log_id")
        return entry.log_id

    def log_crawl_end(
        self,
        crawl_id: str,
        *,
        items_added: int,
        items_updated: int,
        errors_count: int,
        status: str = "COMPLETED",
    ) -> None:
        """Log the completion of a crawl operation.

        Args:
            crawl_id: Crawl log ID returned by log_crawl_start
            items_added: Number of new items added
            items_updated: Number of existing items updated
            errors_count: Number of errors encountered
            status: Final status (default: "COMPLETED")
        """
        existing = self.connection.model_from_table("crawl_log", crawl_id)
        updated = existing.model_copy(
            update={
                "end_time": utc_now(),
                "items_added": items_added,
                "items_updated": items_updated,
                "errors_count": errors_count,
                "status": status,
            }
        )
        self.connection.add("crawl_log", updated, pk="log_id")

    def get_statistics(self) -> dict[str, object]:
        """Get database statistics.

Loading