Commit 77db5be7 authored by Jan Reimes's avatar Jan Reimes
Browse files

feat(cli): add title and status filtering options to command line args

- Introduced `--title` option to filter results by title.
- Added `--status` option to filter results by status.
- Updated relevant documentation to reflect these changes.
parent cce4d575
Loading
Loading
Loading
Loading
+8 −0
Original line number Diff line number Diff line
@@ -459,3 +459,11 @@ After several implementation steps, AGENTS.md files might need updates. When exp
2. Propose changes based on accumulated knowledge
3. Update only after explicit user confirmation
4. For any AGENTS.md files in the project structure, you MUST NEVER add a tree view of files or directories. This changes too often, and coding agents can easily retrieve the project's file structure using `ls` or `tree` commands when needed. AGENTS.md should focus on guidelines and patterns, not on file listings.

## Eager imports in __init__.py

Most eager imports in all __init__.py files are unnecessary, as they exist only to provide convenient/short imports for other modules. Inside a package, it is always fine to import the absolute/full module path instead.

Eager imports may only make sense for:
- *very* relevant types that are also used by consumers of this API
- constants and very simple types (like enums, or types without additional dependencies)
+6 −3
Original line number Diff line number Diff line
@@ -76,7 +76,7 @@ from tdoc_crawler.database.specs import SpecDatabase
from tdoc_crawler.http_client import create_cached_session
from tdoc_crawler.logging import set_verbosity
from tdoc_crawler.meetings.models import MeetingCrawlConfig, MeetingQueryConfig
from tdoc_crawler.meetings.operations import MeetingCrawler
from tdoc_crawler.meetings.operations.crawl import MeetingCrawler
from tdoc_crawler.models.base import OutputFormat, SortOrder
from tdoc_crawler.models.crawl_limits import CrawlLimits
from tdoc_crawler.specs.downloads import SpecDownloads
@@ -333,10 +333,13 @@ def crawl_meetings(
            if removed_specs:
                console.print("[yellow]Cleared checkout entries for specs[/yellow]")

    with TDocDatabase(db_file) as database:
        # TDocDatabase inherits from DocDatabase which handles crawl logging
    with MeetingDatabase(db_file) as database:
        # MeetingDatabase inherits from DocDatabase which handles crawl logging
        crawl_id = database.log_crawl_start("meeting", [wg.value for wg in config.working_groups], config.incremental)

        # Create crawler instance
        crawler = MeetingCrawler(database)

        # Create progress bar for meeting crawling
        with Progress(
            SpinnerColumn(),
+3 −0
Original line number Diff line number Diff line
@@ -57,7 +57,10 @@ FullMetadataOption = Annotated[bool, typer.Option("--full-metadata", help="Fetch

# Options - Specs
ClearSpecsOption = Annotated[bool, typer.Option("--clear-specs", help="Clear all specs before crawling")]
TitleOption = Annotated[str | None, typer.Option("--title", help="Filter by title contains")]
StatusOption = Annotated[str | None, typer.Option("--status", help="Filter by status")]
SpecOption = Annotated[list[str] | None, typer.Option("--spec", help="Spec number(s) (dotted or undotted)")]

SpecFileOption = Annotated[Path | None, typer.Option("--spec-file", help="File with spec numbers")]
ReleaseOption = Annotated[
    str,
+12 −11
Original line number Diff line number Diff line
@@ -103,7 +103,7 @@ class DocDatabase:
        """
        return self._clear_tables(["spec_downloads", "spec_versions", "spec_source_records", "specs"])

    def log_crawl_start(self, crawl_type: str, filters: list[str] | None, incremental: bool) -> int:
    def log_crawl_start(self, crawl_type: str, filters: list[str] | None, incremental: bool) -> str:
        """Log the start of a crawl operation.

        Args:
@@ -114,24 +114,23 @@ class DocDatabase:
        Returns:
            The ID of the created crawl log entry
        """
        from tdoc_crawler.models import CrawlLogEntry

        entry = CrawlLogEntry(
            crawl_type=crawl_type,
            filters=json.dumps(filters or []),
            working_groups=filters or [],
            incremental=incremental,
            start_time=datetime.now(),
            end_time=None,
            items_added=0,
            items_updated=0,
            errors_count=0,
            status="RUNNING",
        )
        self.connection.add("crawl_log", entry, pk=True)
        return entry.id
        self.connection.add("crawl_log", entry, pk="log_id")
        return entry.log_id

    def log_crawl_end(
        self,
        crawl_id: int,
        crawl_id: str,
        items_added: int,
        items_updated: int,
        errors_count: int,
@@ -144,11 +143,13 @@ class DocDatabase:
            items_updated: Number of existing items updated
            errors_count: Number of errors encountered
        """
        self.connection._db.execute(
            "UPDATE crawl_log SET end_time = ?, items_added = ?, items_updated = ?, errors_count = ? WHERE id = ?",
            (datetime.now(), items_added, items_updated, errors_count, crawl_id),
        # DataBase._db is a sqlite3.Connection
        db = self.connection._db
        db.execute(
            "UPDATE crawl_log SET end_time = ?, items_added = ?, items_updated = ?, errors_count = ?, status = ? WHERE log_id = ?",
            (datetime.now().isoformat(), items_added, items_updated, errors_count, "COMPLETED", crawl_id),
        )
        self.connection._db.commit()
        db.commit()

    def _ensure_reference_data(self) -> None:
        """Populate reference tables for working and subworking groups."""
+0 −60
Original line number Diff line number Diff line
@@ -8,7 +8,6 @@ from datetime import datetime
from tdoc_crawler.database.base import DocDatabase
from tdoc_crawler.meetings.models import MeetingMetadata, MeetingQueryConfig
from tdoc_crawler.models.base import SortOrder
from tdoc_crawler.models.crawl_log import CrawlLogEntry
from tdoc_crawler.models.subworking_groups import CODE_INDEX
from tdoc_crawler.models.working_groups import WORKING_GROUP_RECORDS, WorkingGroup
from tdoc_crawler.utils.misc import utc_now
@@ -248,65 +247,6 @@ class MeetingDatabase(DocDatabase):
    # ------------------------------------------------------------------
    # Crawl logging and statistics
    # ------------------------------------------------------------------
    def log_crawl_start(
        self,
        crawl_type: str,
        working_groups: Iterable[WorkingGroup] | None,
        incremental: bool,
    ) -> str:
        """Log the start of a crawl operation.

        Args:
            crawl_type: Type of crawl (e.g., "meeting", "tdoc")
            working_groups: List of working groups being crawled
            incremental: Whether this is an incremental crawl

        Returns:
            Crawl log ID
        """
        entry = CrawlLogEntry(
            crawl_type=crawl_type,
            end_time=None,
            working_groups=[wg.value for wg in working_groups or []],
            incremental=incremental,
            items_added=0,
            items_updated=0,
            errors_count=0,
            status="RUNNING",
        )
        self.connection.add("crawl_log", entry, pk="log_id")
        return entry.log_id

    def log_crawl_end(
        self,
        crawl_id: str,
        *,
        items_added: int,
        items_updated: int,
        errors_count: int,
        status: str = "COMPLETED",
    ) -> None:
        """Log the completion of a crawl operation.

        Args:
            crawl_id: Crawl log ID returned by log_crawl_start
            items_added: Number of new items added
            items_updated: Number of existing items updated
            errors_count: Number of errors encountered
            status: Final status (default: "COMPLETED")
        """
        existing = self.connection.model_from_table("crawl_log", crawl_id)
        updated = existing.model_copy(
            update={
                "end_time": utc_now(),
                "items_added": items_added,
                "items_updated": items_updated,
                "errors_count": errors_count,
                "status": status,
            }
        )
        self.connection.add("crawl_log", updated, pk="log_id")

    def get_statistics(self) -> dict[str, object]:
        """Get database statistics.

Loading