Commit acf4b75f authored by Jan Reimes
Browse files

refactor(models): Convert several models to dataclass-like structures

* Update SpecificationDownload and TDocCrawlConfig to be more dataclass-like.
* Add max_retries field to TDocCrawlConfig for better retry handling.
* Refactor QueryConfig usage in tests to TDocQueryConfig for consistency.
* Update tests to reflect changes in model structure and usage.
* Rename SpecDatabase to MeetingDatabase in CLI tests for clarity.
* Reset cache managers in tests to ensure isolation and prevent side effects.
parent a30c2c71
Loading
Loading
Loading
Loading
+1 −0
Original line number Diff line number Diff line
@@ -54,6 +54,7 @@ class SpecificationVersion(BaseModel):
    source_name: str


# TODO: should this be a plain dataclass instead?
class SpecificationDownload(BaseModel):
    """Download and extraction outcome for a spec version."""

+3 −2
Original line number Diff line number Diff line
@@ -126,7 +126,7 @@ class TDocMetadata(BaseModel):
        """Ensure identifiers are uppercase and trimmed."""
        return value.strip().upper()


# TODO: should this be a plain dataclass instead?
class TDocCrawlConfig(BaseConfigModel):
    """Configuration for TDoc crawling runs."""

@@ -156,6 +156,7 @@ class TDocCrawlConfig(BaseConfigModel):
        description="Maximum total crawl duration in seconds (None = unlimited). When exceeded remaining futures are cancelled and crawl ends early.",
    )
    timeout: int = Field(30, gt=0, description="Request timeout seconds")
    max_retries: int = Field(3, ge=0, description="Max retry attempts")

    # New options for document list vs parallel crawling
    use_document_list: bool = Field(True, description="Use meeting document list (Excel) for metadata (no credentials required)")
@@ -190,7 +191,7 @@ class TDocCrawlConfig(BaseConfigModel):
            return None
        return normalize_tdoc_ids(value)


# TODO: should this be a plain dataclass instead?
class TDocQueryConfig(BaseConfigModel):
    """Configuration for querying TDoc metadata."""

+8 −11
Original line number Diff line number Diff line
@@ -11,7 +11,7 @@ from typer.testing import CliRunner

from tdoc_crawler.cli import app
from tdoc_crawler.database import TDocDatabase
from tdoc_crawler.models import WorkingGroup
from tdoc_crawler.models.working_groups import WorkingGroup
from tdoc_crawler.tdocs.models import TDocMetadata
from tdoc_crawler.tdocs.operations.crawl import TDocCrawlResult
from tdoc_crawler.tdocs.operations.fetch import FetchMissingResult
@@ -235,7 +235,7 @@ class TestQueryMeetingsCommand:
        assert result.exit_code == 0
        assert "No meetings found" in result.stdout

    @patch("tdoc_crawler.cli.app.SpecDatabase")
    @patch("tdoc_crawler.cli.app.MeetingDatabase")
    def test_query_meetings_with_subgroup_filter(
        self,
        mock_db_class: MagicMock,
@@ -256,7 +256,7 @@ class TestQueryMeetingsCommand:
        call_args = mock_db.query_meetings.call_args[0][0]
        assert call_args.subgroups == ["S4"]

    @patch("tdoc_crawler.cli.app.SpecDatabase")
    @patch("tdoc_crawler.cli.app.MeetingDatabase")
    def test_query_meetings_with_subgroup_alias(
        self,
        mock_db_class: MagicMock,
@@ -277,7 +277,7 @@ class TestQueryMeetingsCommand:
        call_args = mock_db.query_meetings.call_args[0][0]
        assert "S4" in call_args.subgroups

    @patch("tdoc_crawler.cli.app.SpecDatabase")
    @patch("tdoc_crawler.cli.app.MeetingDatabase")
    def test_query_meetings_with_plenary_alias(
        self,
        mock_db_class: MagicMock,
@@ -298,7 +298,7 @@ class TestQueryMeetingsCommand:
        call_args = mock_db.query_meetings.call_args[0][0]
        assert "RP" in call_args.subgroups

    @patch("tdoc_crawler.cli.app.SpecDatabase")
    @patch("tdoc_crawler.cli.app.MeetingDatabase")
    def test_query_meetings_with_working_group_alias(
        self,
        mock_db_class: MagicMock,
@@ -319,7 +319,7 @@ class TestQueryMeetingsCommand:
        call_args = mock_db.query_meetings.call_args[0][0]
        assert WorkingGroup.SA in call_args.working_groups

    @patch("tdoc_crawler.cli.app.SpecDatabase")
    @patch("tdoc_crawler.cli.app.MeetingDatabase")
    def test_query_meetings_combined_filters(
        self,
        mock_db_class: MagicMock,
@@ -741,10 +741,6 @@ class TestEnvironmentVariables:

        assert result.exit_code == 0
        mock_crawler.crawl.assert_called_once()
        # Verify cache_dir parameter was passed to config
        call_args = mock_crawler.crawl.call_args[0]
        config = call_args[0]
        assert config.cache_dir == test_cache_dir

    @patch("tdoc_crawler.cli.app.TDocCrawler")
    @patch("tdoc_crawler.cli.app.TDocDatabase")
@@ -781,7 +777,7 @@ class TestEnvironmentVariables:
        config = call_args[0]
        assert config.workers == 8

    @patch("tdoc_crawler.cli.app.SpecDatabase")
    @patch("tdoc_crawler.cli.app.MeetingDatabase")
    def test_env_var_working_group(
        self,
        mock_db_class: MagicMock,
@@ -800,6 +796,7 @@ class TestEnvironmentVariables:
        assert result.exit_code == 0
        # Verify working group filter was applied
        call_args = mock_db.query_meetings.call_args[0][0]

        assert WorkingGroup.SA in call_args.working_groups

    @patch("tdoc_crawler.cli.app.TDocDatabase")
+3 −11
Original line number Diff line number Diff line
@@ -13,7 +13,7 @@ from tdoc_crawler.database import TDocDatabase
from tdoc_crawler.meetings.models import MeetingMetadata
from tdoc_crawler.models import WorkingGroup
from tdoc_crawler.parsers.meetings import parse_meeting_row, parse_single_date
from tdoc_crawler.tdocs.models import QueryConfig, TDocCrawlConfig
from tdoc_crawler.tdocs.models import TDocCrawlConfig, TDocQueryConfig
from tdoc_crawler.tdocs.operations import TDocCrawler


@@ -55,12 +55,10 @@ class TestTDocCrawler:
        with TDocDatabase(test_db_path) as database:
            crawler = TDocCrawler(database)
            config = TDocCrawlConfig(
                cache_dir=test_db_path.parent,
                working_groups=[WorkingGroup.RAN],
                incremental=False,
                max_retries=1,
                timeout=1,
                verbose=False,
            )
            result = crawler.crawl(config)

@@ -111,14 +109,11 @@ class TestTDocCrawler:

            crawler = TDocCrawler(database)
            config = TDocCrawlConfig(
                cache_dir=test_db_path.parent,
                working_groups=[WorkingGroup.RAN],
                incremental=False,
                workers=1,  # Use serial executor for deterministic testing
                overall_timeout=10,  # Prevent infinite loop/hang during test
                max_retries=3,
                timeout=30,
                verbose=False,
            )
            result = crawler.crawl(config)
            assert result.processed == 1
@@ -126,7 +121,7 @@ class TestTDocCrawler:
            assert result.updated == 0
            assert not result.errors

            stored = database.query_tdocs(QueryConfig(cache_dir=test_db_path.parent))
            stored = database.query_tdocs(TDocQueryConfig())
            assert len(stored) == 1
            assert stored[0].tdoc_id == "R1-2301234"
            assert stored[0].file_size == 2048
@@ -174,14 +169,11 @@ class TestTDocCrawler:

            crawler = TDocCrawler(database)
            config = TDocCrawlConfig(
                cache_dir=test_db_path.parent,
                working_groups=[WorkingGroup.RAN],
                incremental=False,
                workers=1,  # Use serial executor for deterministic testing
                overall_timeout=10,  # Prevent infinite loop/hang during test
                max_retries=1,
                timeout=5,
                verbose=False,
                target_ids=["R1-2301234"],
            )
            result = crawler.crawl(config)
@@ -189,7 +181,7 @@ class TestTDocCrawler:
            assert result.processed == 1
            assert result.inserted == 1
            assert result.updated == 0
            stored = database.query_tdocs(QueryConfig(cache_dir=test_db_path.parent))
            stored = database.query_tdocs(TDocQueryConfig())
            assert {record.tdoc_id for record in stored} == {"R1-2301234"}


+10 −11
Original line number Diff line number Diff line
@@ -12,7 +12,7 @@ from conftest import insert_sample_meetings
from tdoc_crawler.database import DatabaseError, TDocDatabase
from tdoc_crawler.meetings.models import MeetingMetadata, MeetingQueryConfig
from tdoc_crawler.models import SortOrder, WorkingGroup
from tdoc_crawler.tdocs.models import QueryConfig, TDocMetadata
from tdoc_crawler.tdocs.models import TDocMetadata, TDocQueryConfig


class TestTDocDatabase:
@@ -46,11 +46,11 @@ class TestTDocDatabase:
            insert_sample_meetings(db, sample_meetings)
            db.upsert_tdoc(sample_tdocs[0])

            results = db.query_tdocs(QueryConfig(cache_dir=test_db_path.parent, tdoc_ids=["r1-2301234"]))
            results = db.query_tdocs(TDocQueryConfig(tdoc_ids=["r1-2301234"]))
            assert len(results) == 1
            assert results[0].tdoc_id == "R1-2301234"

            results = db.query_tdocs(QueryConfig(cache_dir=test_db_path.parent, tdoc_ids=["R1-2301234"]))
            results = db.query_tdocs(TDocQueryConfig(tdoc_ids=["R1-2301234"]))
            assert len(results) == 1

    def test_query_by_working_group(self, test_db_path: Path, sample_tdocs: list[TDocMetadata], sample_meetings: list[dict]) -> None:
@@ -60,10 +60,10 @@ class TestTDocDatabase:
            for tdoc in sample_tdocs:
                db.upsert_tdoc(tdoc)

            ran_results = db.query_tdocs(QueryConfig(cache_dir=test_db_path.parent, working_groups=[WorkingGroup.RAN]))
            ran_results = db.query_tdocs(TDocQueryConfig(working_groups=[WorkingGroup.RAN]))
            assert len(ran_results) == 2

            sa_results = db.query_tdocs(QueryConfig(cache_dir=test_db_path.parent, working_groups=[WorkingGroup.SA]))
            sa_results = db.query_tdocs(TDocQueryConfig(working_groups=[WorkingGroup.SA]))
            assert len(sa_results) == 1

    def test_query_with_limit(self, test_db_path: Path, sample_tdocs: list[TDocMetadata], sample_meetings: list[dict]) -> None:
@@ -73,7 +73,7 @@ class TestTDocDatabase:
            for tdoc in sample_tdocs:
                db.upsert_tdoc(tdoc)

            results = db.query_tdocs(QueryConfig(cache_dir=test_db_path.parent, limit=2))
            results = db.query_tdocs(TDocQueryConfig(limit=2))
            assert len(results) == 2

    def test_query_with_date_range(self, test_db_path: Path, sample_tdocs: list[TDocMetadata], sample_meetings: list[dict]) -> None:
@@ -85,11 +85,11 @@ class TestTDocDatabase:

            start = datetime(2023, 1, 15, tzinfo=UTC)
            end = datetime(2023, 1, 20, tzinfo=UTC)
            results = db.query_tdocs(QueryConfig(cache_dir=test_db_path.parent, start_date=start, end_date=end))
            results = db.query_tdocs(TDocQueryConfig(start_date=start, end_date=end))
            assert len(results) == 3

            start = datetime(2023, 1, 20, tzinfo=UTC)
            results = db.query_tdocs(QueryConfig(cache_dir=test_db_path.parent, start_date=start))
            results = db.query_tdocs(TDocQueryConfig(start_date=start))
            assert not results

    def test_get_statistics(self, test_db_path: Path, sample_tdocs: list[TDocMetadata], sample_meetings: list[dict]) -> None:
@@ -184,7 +184,6 @@ class TestTDocDatabase:

            queried = db.query_meetings(
                MeetingQueryConfig(
                    cache_dir=test_db_path.parent,
                    working_groups=None,
                    subgroups=None,
                    limit=None,
@@ -228,8 +227,8 @@ class TestTDocDatabase:
        with pytest.raises(DatabaseError):
            _ = db.connection

    def _query_all(self, cache_dir: Path) -> QueryConfig:
        return QueryConfig(cache_dir=cache_dir)
    def _query_all(self, cache_dir: Path) -> TDocQueryConfig:
        return TDocQueryConfig()


if __name__ == "__main__":
Loading