Commit 1ad70f87 authored by Jan Reimes's avatar Jan Reimes
Browse files

refactor(ai): rename tdoc_id to document_id in tests and related functions

- Updated test files to replace 'tdoc_id' with 'document_id' for consistency.
- Modified function calls and assertions in test cases across multiple files.
- Ensured that all references to document identification are aligned with new naming conventions.
parent 7ddba15d
Loading
Loading
Loading
Loading
+12 −12
Original line number Diff line number Diff line
@@ -6,17 +6,17 @@ from pathlib import Path

from tdoc_crawler.ai.models import DocumentClassification
from tdoc_crawler.ai.operations import classify
from tdoc_crawler.ai.operations.classify import _score_filename, classify_tdoc_files
from tdoc_crawler.ai.operations.classify import _score_filename, classify_document_files


class TestClassifyTdocFiles:
    """Tests for classify_tdoc_files function."""
    """Tests for classify_document_files function."""

    def test_single_file_gets_confidence_one(self, test_data_dir: Path) -> None:
        """Test that a single file gets confidence 1.0."""
        tdoc_folder = test_data_dir / "S4-251003"
        if tdoc_folder.exists():
            result = classify_tdoc_files("S4-251003", tdoc_folder)
            result = classify_document_files("S4-251003", tdoc_folder)
            assert len(result) == 1
            assert result[0].confidence == 1.0
            assert result[0].is_main_document is True
@@ -26,7 +26,7 @@ class TestClassifyTdocFiles:
        # Use a folder with 2 files
        tdoc_folder = test_data_dir / "26260-j10"
        if tdoc_folder.exists() and len(list(tdoc_folder.glob("*"))) >= 2:
            result = classify_tdoc_files("26260-J10", tdoc_folder)
            result = classify_document_files("26260-J10", tdoc_folder)
            # Find the main document
            main = next((r for r in result if r.is_main_document), None)
            assert main is not None
@@ -51,7 +51,7 @@ class TestClassifyTdocFiles:
        # Test with various folders
        for tdoc_folder in test_data_dir.iterdir():
            if tdoc_folder.is_dir():
                result = classify_tdoc_files(tdoc_folder.name, tdoc_folder)
                result = classify_document_files(tdoc_folder.name, tdoc_folder)
                for classification in result:
                    assert 0.0 <= classification.confidence <= 1.0

@@ -60,7 +60,7 @@ class TestClassifyTdocFiles:
        tdoc_folder = test_data_dir / "S4-251003"

        if tdoc_folder.exists():
            result = classify_tdoc_files("S4-251003", tdoc_folder)
            result = classify_document_files("S4-251003", tdoc_folder)
            for classification in result:
                assert classification.decisive_heuristic is not None
                assert len(classification.decisive_heuristic) > 0
@@ -70,17 +70,17 @@ class TestClassifyTdocFiles:
        (tmp_path / "agenda.xlsx").write_text("xlsx placeholder", encoding="utf-8")
        (tmp_path / "slides.pptx").write_text("pptx placeholder", encoding="utf-8")

        result = classify_tdoc_files("S4-260999", tmp_path)
        result = classify_document_files("S4-260999", tmp_path)
        assert result == []


class TestClassifyModuleExports:
    """Tests that classify module exports required functions."""

    def test_classify_tdoc_files_exported(self) -> None:
        """Verify classify_tdoc_files is exported."""
        assert hasattr(classify, "classify_tdoc_files")
        assert callable(classify.classify_tdoc_files)
    def test_classify_document_files_exported(self) -> None:
        """Verify classify_document_files is exported."""
        assert hasattr(classify, "classify_document_files")
        assert callable(classify.classify_document_files)

    def test_document_classification_exported(self) -> None:
        """Verify DocumentClassification is exported."""
@@ -93,7 +93,7 @@ Command: uv run pytest tests/ai/test_ai_classification.py -q
Observed failure:
- TestClassifyTdocFiles.test_non_docx_files_are_ignored
    AssertionError: assert [DocumentClassification(...)] == []
    Cause: classify_tdoc_files currently includes non-DOCX files (e.g., .xlsx/.pptx)
    Cause: classify_document_files currently includes non-DOCX files (e.g., .xlsx/.pptx)
    instead of filtering classification candidates to DOCX only.

Checkpoint status: RED (1 failed, 8 passed)
+9 −9
Original line number Diff line number Diff line
@@ -74,8 +74,8 @@ class TestAiCli:
        assert result.exit_code == 0

    def test_process_delegates_to_library(self, runner: CliRunner, tmp_path: Path) -> None:
        """Test process delegates to process_tdoc/process_all."""
        with patch("tdoc_crawler.cli.ai.process_tdoc") as mock:
        """Test process delegates to process_document/process_all."""
        with patch("tdoc_crawler.cli.ai.process_document") as mock:
            mock.return_value = None
            result = runner.invoke(
                app,
@@ -100,15 +100,15 @@ class TestAiCli:
    def test_status_delegates_to_get_status(self, runner: CliRunner) -> None:
        """Test status delegates to get_status."""
        with patch("tdoc_crawler.cli.ai.get_status") as mock:
            mock.return_value = ProcessingStatus(tdoc_id="SP-123456", current_stage=PipelineStage.COMPLETED)
            mock.return_value = ProcessingStatus(document_id="SP-123456", current_stage=PipelineStage.COMPLETED)
            result = runner.invoke(app, ["ai", "status", "--tdoc-id", "SP-123456"])
            assert result.exit_code == 0

    def test_status_without_tdoc_id_lists_all(self, runner: CliRunner) -> None:
        """Status command supports listing all statuses without --tdoc-id."""
        statuses = [
            ProcessingStatus(tdoc_id="SP-123456", current_stage=PipelineStage.COMPLETED),
            ProcessingStatus(tdoc_id="SP-123457", current_stage=PipelineStage.EXTRACTING),
            ProcessingStatus(document_id="SP-123456", current_stage=PipelineStage.COMPLETED),
            ProcessingStatus(document_id="SP-123457", current_stage=PipelineStage.EXTRACTING),
        ]
        with patch("tdoc_crawler.cli.ai.get_status") as mock_get_status:
            mock_get_status.return_value = statuses
@@ -121,7 +121,7 @@ class TestAiCli:
    def test_json_flag_produces_valid_json(self, runner: CliRunner) -> None:
        """Test --json flag produces valid JSON output."""
        with patch("tdoc_crawler.cli.ai.get_status") as mock:
            mock.return_value = ProcessingStatus(tdoc_id="SP-123456", current_stage=PipelineStage.COMPLETED)
            mock.return_value = ProcessingStatus(document_id="SP-123456", current_stage=PipelineStage.COMPLETED)
            result = runner.invoke(app, ["ai", "status", "--tdoc-id", "SP-123456", "--json"])
            assert result.exit_code == 0
            # Should be valid JSON
@@ -157,7 +157,7 @@ class TestAiCli:
        assert result.exit_code in (0, 1)

    def test_process_without_workspace_uses_default(self, runner: CliRunner, tmp_path: Path) -> None:
        with patch("tdoc_crawler.cli.ai.process_tdoc") as mock:
        with patch("tdoc_crawler.cli.ai.process_document") as mock:
            mock.return_value = None
            result = runner.invoke(app, ["ai", "process", "--tdoc-id", "SP-123456", "--checkout-path", str(tmp_path / "checkout")])
            assert result.exit_code == 0
@@ -166,7 +166,7 @@ class TestAiCli:

    def test_status_without_workspace_uses_default(self, runner: CliRunner) -> None:
        with patch("tdoc_crawler.cli.ai.get_status") as mock:
            mock.return_value = ProcessingStatus(tdoc_id="SP-123456", current_stage=PipelineStage.COMPLETED)
            mock.return_value = ProcessingStatus(document_id="SP-123456", current_stage=PipelineStage.COMPLETED)
            result = runner.invoke(app, ["ai", "status", "--tdoc-id", "SP-123456"])
            assert result.exit_code == 0
            kwargs = mock.call_args.kwargs
@@ -181,7 +181,7 @@ class TestAiCli:
            assert kwargs.get("workspace") is None or kwargs.get("workspace") == "default"

    def test_process_with_explicit_workspace_uses_it(self, runner: CliRunner, tmp_path: Path) -> None:
        with patch("tdoc_crawler.cli.ai.process_tdoc") as mock:
        with patch("tdoc_crawler.cli.ai.process_document") as mock:
            mock.return_value = None
            result = runner.invoke(app, ["ai", "process", "--tdoc-id", "SP-123456", "--workspace", "myws", "--checkout-path", str(tmp_path / "checkout")])
            assert result.exit_code == 0
+62 −2
Original line number Diff line number Diff line
@@ -65,7 +65,7 @@ class TestGraph:

        # Simulate calling build_graph which should use storage
        result_nodes, result_edges = graph.build_graph(
            tdoc_id="S4-250002", markdown=markdown_content, meeting_id="SA4#123", storage=mock_storage, workspace="test_ws"
            document_id="S4-250002", markdown=markdown_content, meeting_id="SA4#123", storage=mock_storage, workspace="test_ws"
        )

        # Verify query_graph was called to fetch existing data (incremental check)
@@ -137,7 +137,7 @@ class TestGraph:

        # Build graph - should extract references and create edges
        result_nodes, result_edges = graph.build_graph(
            tdoc_id="S4-252000", markdown=markdown_content, meeting_id="SA4#123", storage=mock_storage, workspace="test_ws"
            document_id="S4-252000", markdown=markdown_content, meeting_id="SA4#123", storage=mock_storage, workspace="test_ws"
        )

        # Verify REFERENCES edges were created
@@ -162,3 +162,63 @@ class TestGraphModuleExports:
        """Verify query_graph is exported."""
        assert hasattr(graph, "query_graph")
        assert callable(graph.query_graph)


class TestEntityExtractors:
    """Test entity extraction functions for graph-RAG."""

    def test_extract_company_entities(self) -> None:
        """Known vendor names should be picked out of free-form text."""
        from tdoc_crawler.ai.operations.graph import extract_company_entities

        sample = "This document was submitted by Huawei and Nokia for discussion at 3GPP."
        found = extract_company_entities(sample)

        # Both mentioned vendors must be present, and nothing prevents extras.
        assert "Huawei" in found
        assert "Nokia" in found
        assert len(found) >= 2

    def test_extract_work_items(self) -> None:
        """Work items are recognized in both 'WI-NNNNN' and prose forms."""
        from tdoc_crawler.ai.operations.graph import extract_work_items

        sample = "This relates to WI-12345 and Work Item 67890 for 5G enhancement."
        work_items = extract_work_items(sample)

        # The prose form "Work Item 67890" is expected to normalize to WI-67890.
        assert "WI-12345" in work_items
        assert "WI-67890" in work_items

    def test_extract_change_requests(self) -> None:
        """Change requests are recognized in both 'CR-NNNNNN' and prose forms."""
        from tdoc_crawler.ai.operations.graph import extract_change_requests

        sample = "This CR-001234 and Change Request 5678 propose modifications to the spec."
        change_requests = extract_change_requests(sample)

        # The prose form "Change Request 5678" is expected to normalize to CR-5678.
        assert "CR-001234" in change_requests
        assert "CR-5678" in change_requests

    def test_extract_all_entity_types(self) -> None:
        """All three extractors operate independently on the same text."""
        from tdoc_crawler.ai.operations.graph import extract_change_requests
        from tdoc_crawler.ai.operations.graph import extract_company_entities
        from tdoc_crawler.ai.operations.graph import extract_work_items

        sample = """
        Samsung proposes WI-99999 to address CR-11111.
        This work item relates to change request CP-230001.
        Ericsson and Qualcomm support this proposal.
        """

        company_names = extract_company_entities(sample)
        work_items = extract_work_items(sample)
        change_requests = extract_change_requests(sample)

        assert "Samsung" in company_names
        assert "Ericsson" in company_names
        assert "Qualcomm" in company_names
        assert "WI-99999" in work_items
        assert "CR-11111" in change_requests
        # The CP-prefixed document id may normalize either way; accept both.
        assert "CR-CP-230001" in change_requests or "CR-230001" in change_requests
+10 −10
Original line number Diff line number Diff line
@@ -32,7 +32,7 @@ class TestRunPipeline:
    def test_batch_processing(self, mock_run_pipeline: MagicMock, mock_ai_storage: MagicMock, mock_storage: MagicMock, test_data_dir: Path) -> None:
        """Test processing multiple TDocs."""
        mock_ai_storage.return_value = mock_storage
        mock_run_pipeline.return_value = ProcessingStatus(tdoc_id="foo")
        mock_run_pipeline.return_value = ProcessingStatus(document_id="foo")

        tdoc_folders = [d for d in test_data_dir.iterdir() if d.is_dir()][:3]

@@ -43,7 +43,7 @@ class TestRunPipeline:
    def test_resume_from_interrupted_stage(self, mock_storage: MagicMock, test_data_dir: Path) -> None:
        """Test resume from interrupted stage works."""
        # Create a status with partial completion
        mock_status = ProcessingStatus(tdoc_id="S4-251003")
        mock_status = ProcessingStatus(document_id="S4-251003")
        mock_status.classified_at = utc_now()
        mock_storage.get_status.return_value = mock_status

@@ -58,7 +58,7 @@ class TestRunPipeline:
        """Test incremental processing only processes new items."""
        mock_ai_storage.return_value = mock_storage
        # Status shows already completed
        mock_status = ProcessingStatus(tdoc_id="S4-251003")
        mock_status = ProcessingStatus(document_id="S4-251003")
        mock_status.current_stage = PipelineStage.COMPLETED
        mock_storage.get_status.return_value = mock_status

@@ -98,7 +98,7 @@ class TestRunPipeline:
            # Note: "26260-j10" is NOT in the workspace
        ]

        mock_run_pipeline.return_value = ProcessingStatus(tdoc_id="S4-251003", current_stage=PipelineStage.COMPLETED)
        mock_run_pipeline.return_value = ProcessingStatus(document_id="S4-251003", current_stage=PipelineStage.COMPLETED)

        tdoc_ids = ["S4-251003", "26260-j10"]
        # Only S4-251003 should be processed because of the workspace scope
@@ -138,8 +138,8 @@ class TestProcessTdocApi:

    def test_process_all_new_only_filters_completed_statuses(self, monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None:
        """new_only mode should return only non-completed items."""
        completed = ProcessingStatus(tdoc_id="S4-251003", current_stage=PipelineStage.COMPLETED)
        pending = ProcessingStatus(tdoc_id="S4-260001", current_stage=PipelineStage.PENDING)
        completed = ProcessingStatus(document_id="S4-251003", current_stage=PipelineStage.COMPLETED)
        pending = ProcessingStatus(document_id="S4-260001", current_stage=PipelineStage.PENDING)

        def fake_process_all(
            tdoc_ids: list[str],
@@ -150,19 +150,19 @@ class TestProcessTdocApi:
            workspace: str | None = None,
        ) -> dict[str, ProcessingStatus]:
            return {
                completed.tdoc_id: completed,
                pending.tdoc_id: pending,
                completed.document_id: completed,
                pending.document_id: pending,
            }

        monkeypatch.setattr("tdoc_crawler.ai._pipeline_process_all_impl", fake_process_all)

        result = process_all_api(
            new_only=True,
            tdoc_ids=[completed.tdoc_id, pending.tdoc_id],
            tdoc_ids=[completed.document_id, pending.document_id],
            checkout_base=tmp_path,
        )

        assert [status.tdoc_id for status in result] == [pending.tdoc_id]
        assert [status.document_id for status in result] == [pending.document_id]


class TestPipelineModuleExports:
+2 −2
Original line number Diff line number Diff line
@@ -58,12 +58,12 @@ def test_storage_artifact_isolation_by_workspace(tmp_path: Path) -> None:
    storage = AiStorage(tmp_path / "lancedb", embedding_dimension=3)

    # Save status for workspace A
    status_a = ProcessingStatus(tdoc_id="DOC-1", current_stage=PipelineStage.COMPLETED)
    status_a = ProcessingStatus(document_id="DOC-1", current_stage=PipelineStage.COMPLETED)
    storage.save_status(status_a, workspace="ws_a")

    # Save chunks for workspace A
    chunk_a = DocumentChunk(
        tdoc_id="DOC-1",
        document_id="DOC-1",
        chunk_id="C1",
        chunk_index=0,
        text="text a",
Loading