Commit b26e1dad authored by Jan Reimes

fix(tests): fix test failures after 3gpp-ai removal

- Add CacheManager._instance reset autouse fixture in conftest.py
- Rewrite whatthespec tests to mock create_cached_session instead of
  requests.Session.get (code uses hishel CachedSession); see the sketch below
- Add monkeypatch.delenv for TDC_EOL_USERNAME/PASSWORD in
  test_credentials.py to prevent env var leakage
- Use requests.RequestException instead of plain Exception in
  test_meeting_document_list.py network error test
- Remove 3gpp-ai dependent tests from test_checkout.py and
  test_graph_errors.py
- Remove AiConfig test from test_config_defaults.py
parent b9171c26
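
The whatthespec diff itself is not included in this view, so here is a minimal sketch of the mocking approach described in the commit message. The patch target tdoc_crawler.whatthespec.create_cached_session, the get_spec_page helper, and the response fields are assumed names for illustration, not the repository's actual test code.

from unittest.mock import Mock, patch


def test_spec_lookup_patches_cached_session_factory() -> None:
    """Sketch: patch the hishel session factory, not requests.Session.get.

    The production code builds its own hishel CachedSession via
    create_cached_session(), so patching requests.Session.get never
    intercepts its traffic. All names here are assumptions.
    """
    fake_response = Mock(status_code=200, text="<html>TS 26.114</html>")
    fake_session = Mock()
    fake_session.get.return_value = fake_response

    # Patch the factory where the code under test looks it up.
    with patch(
        "tdoc_crawler.whatthespec.create_cached_session",
        return_value=fake_session,
    ):
        from tdoc_crawler.whatthespec import get_spec_page  # assumed helper

        get_spec_page("26.114")

    # Every request must have gone through the injected fake session.
    fake_session.get.assert_called_once()

The essential point, whatever the real helper names are, is that the patch target is the session factory the code actually calls, which is why patching requests.Session.get stopped working once the code moved to hishel's CachedSession.
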
conftest.py +12 −0
@@ -9,6 +9,7 @@ from pathlib import Path
import pytest
from packaging.version import Version

from tdoc_crawler.config import CacheManager
from tdoc_crawler.config.settings import PathConfig
from tdoc_crawler.database import MeetingDatabase, TDocDatabase
from tdoc_crawler.meetings.models import MeetingMetadata
@@ -21,6 +22,17 @@ def pytest_ignore_collect(collection_path: object, config: pytest.Config) -> boo
    return "/tests/ai/" in path_text or path_text.endswith("/tests/ai")


@pytest.fixture(autouse=True)
def _reset_cache_manager() -> None:
    """Reset CacheManager singleton before each test.

    The CLI callback registers a CacheManager singleton on every invocation.
    Without resetting between tests, the second test that invokes the CLI
    will fail with "CacheManager already registered".
    """
    CacheManager._instance = None


@pytest.fixture
def test_cache_dir(tmp_path: Path) -> Path:
    """Create a temporary cache directory for tests.
test_checkout.py +1 −106
@@ -7,11 +7,10 @@ from unittest.mock import Mock, patch

import pytest
from packaging.version import Version
from threegpp_ai.operations.classify import classify_document_files
from threegpp_ai.operations.workspaces import _checkout_tdoc_if_needed, resolve_tdoc_checkout_path

from tdoc_crawler.tdocs.models import TDocMetadata
from tdoc_crawler.tdocs.operations.checkout import checkout_tdoc, get_checked_out_tdocs, get_checkout_path
from tdoc_crawler.workspaces import resolve_tdoc_checkout_path


@pytest.fixture
@@ -261,107 +260,3 @@ class TestResolveTdocCheckoutPath:

        result = resolve_tdoc_checkout_path("S4-999999", checkout_base)
        assert result is None


class TestCheckoutTdocIfNeeded:
    """Tests for _checkout_tdoc_if_needed function behavior."""

    def test_empty_folder_only_ai_subfolder_triggers_redownload(
        self,
        tmp_path: Path,
        sample_tdoc_metadata: TDocMetadata,
    ) -> None:
        """Test that folder with only .ai subfolder triggers re-download."""
        checkout_base = tmp_path / "checkout"

        # Create folder structure with only .ai subfolder (simulating processed but deleted docs)
        tdoc_path = checkout_base / "TSG_SA" / "WG4" / "TSGS4_131" / "Docs" / "S4-251234"
        ai_folder = tdoc_path / ".ai"
        ai_folder.mkdir(parents=True)
        (ai_folder / "processed.md").write_text("processed content")

        # Mock checkout_tdoc to simulate re-download
        with patch("threegpp_ai.operations.workspaces.checkout_tdoc") as mock_checkout:
            mock_checkout.return_value = tdoc_path

            result = _checkout_tdoc_if_needed("S4-251234", sample_tdoc_metadata, checkout_base)

            # Should trigger re-download because folder only has .ai/ subfolder
            mock_checkout.assert_called_once()
            assert result == tdoc_path

    def test_populated_folder_does_not_trigger_redownload(
        self,
        tmp_path: Path,
        sample_tdoc_metadata: TDocMetadata,
    ) -> None:
        """Test that folder with actual document files does NOT trigger re-download."""
        checkout_base = tmp_path / "checkout"

        # Create folder with actual document files
        tdoc_path = checkout_base / "TSG_SA" / "WG4" / "TSGS4_131" / "Docs" / "S4-251234"
        tdoc_path.mkdir(parents=True)
        (tdoc_path / "S4-251234.docx").write_text("document content")
        ai_folder = tdoc_path / ".ai"
        ai_folder.mkdir(parents=True)
        (ai_folder / "processed.md").write_text("processed content")

        # Mock checkout_tdoc to verify it's NOT called
        with patch("threegpp_ai.operations.workspaces.checkout_tdoc") as mock_checkout:
            result = _checkout_tdoc_if_needed("S4-251234", sample_tdoc_metadata, checkout_base)

            # Should NOT trigger re-download because folder has actual files
            mock_checkout.assert_not_called()
            assert result == tdoc_path

    def test_nonexistent_folder_triggers_checkout(
        self,
        tmp_path: Path,
        sample_tdoc_metadata: TDocMetadata,
    ) -> None:
        """Test that non-existent folder triggers checkout."""
        checkout_base = tmp_path / "checkout"
        checkout_base.mkdir(parents=True)

        # Mock both resolve_tdoc_checkout_path (returns None) and checkout_tdoc
        with (
            patch("threegpp_ai.operations.workspaces.resolve_tdoc_checkout_path") as mock_resolve,
            patch("threegpp_ai.operations.workspaces.checkout_tdoc") as mock_checkout,
        ):
            mock_resolve.return_value = None  # Simulate non-existent folder
            new_path = checkout_base / "TSG_SA" / "WG4" / "TSGS4_131" / "Docs" / "S4-251234"
            new_path.mkdir(parents=True)
            (new_path / "test.docx").write_text("content")
            mock_checkout.return_value = new_path

            result = _checkout_tdoc_if_needed("S4-251234", sample_tdoc_metadata, checkout_base)

            # Should trigger checkout because folder doesn't exist
            mock_checkout.assert_called_once()
            assert result == new_path


class TestClassifyDocumentFiles:
    """Tests for classify_document_files error messages."""

    def test_empty_folder_warning_includes_redownload_suggestion(
        self,
        tmp_path: Path,
        caplog: pytest.LogCaptureFixture,
    ) -> None:
        """Test that empty folder warning includes re-download suggestion."""
        folder_path = tmp_path / "empty_folder"
        folder_path.mkdir(parents=True)
        # Add only .ai subfolder
        ai_folder = folder_path / ".ai"
        ai_folder.mkdir(parents=True)
        (ai_folder / "processed.md").write_text("content")

        with caplog.at_level("WARNING"):
            result = classify_document_files("S4-251234", folder_path)

        assert result == []
        assert "No document files found" in caplog.text
        assert ".pdf/.docx/.xlsx/.pptx" in caplog.text
        assert "tdoc-crawler checkout S4-251234" in caplog.text
        assert "Folder contents:" in caplog.text
test_config_defaults.py +0 −23
@@ -9,8 +9,6 @@ from __future__ import annotations
import re
from pathlib import Path

from threegpp_ai.config import AiConfig


def test_embedding_model_env_comment_matches_value() -> None:
    """Verify .env.example TDC_AI_EMBEDDING_MODEL comment and value are consistent.
@@ -42,24 +40,3 @@ def test_embedding_model_env_comment_matches_value() -> None:
            assert value == recommended_model.group(1), (
                f".env.example embedding model comment ({recommended_model.group(1)}) differs from value ({value}). Update comment to match value."
            )


def test_embedding_model_not_hardcoded_in_code() -> None:
    """Verify embedding model is read from config, not hardcoded.

    The embedding model should be configurable via TDC_AI_EMBEDDING_MODEL
    environment variable and stored in AiConfig.embedding_model.
    """
    # This test verifies the configuration structure exists
    # The actual default value is defined in AiConfig (or LightRAGConfig)
    # and TDC_AI_EMBEDDING_MODEL should be the documented way to override it

    # AiConfig should accept embedding_model via env var or config
    # If TDC_AI_EMBEDDING_MODEL is set, it should be used
    config = AiConfig()

    # Verify the config has the expected structure
    assert hasattr(config, "llm_model"), "AiConfig should have llm_model field"

    # Note: embedding_model field may not exist yet in AiConfig -
    # this test documents the expected configuration structure
test_credentials.py +5 −1
@@ -66,8 +66,12 @@ class TestCredentialsWithConfig:

        assert result is None

    def test_resolve_config_prompt_flag(self) -> None:
    def test_resolve_config_prompt_flag(self, monkeypatch: pytest.MonkeyPatch) -> None:
        """credentials_config.prompt controls prompting."""
        # Clear env vars that might provide credentials before prompting
        monkeypatch.delenv("TDC_EOL_USERNAME", raising=False)
        monkeypatch.delenv("TDC_EOL_PASSWORD", raising=False)

        config = CredentialsConfig(prompt="true")

        # Should attempt to prompt (but won't in test due to non-TTY)
test_graph_errors.py +4 −95
"""Tests for graph error handling and deprecated import verification.
"""Tests for deprecated import verification and datetime scoping.

This test module verifies:
1. No deprecated imports remain in the codebase (AiStorage, EmbeddingsManager, lancedb, etc.)
2. Graph error handling readiness for future LightRAG integration
3. Datetime scoping is correct in CLI operations
2. Datetime scoping is correct in CLI operations
"""

from __future__ import annotations

import sys
from datetime import UTC, datetime

import pytest
import threegpp_ai
import threegpp_ai.operations.workspaces as workspaces_ops
from threegpp_ai import cli
from threegpp_ai.operations.workspaces import (
    checkout_spec_to_workspace,
    checkout_tdoc_to_workspace,
    ensure_ai_subfolder,
)

from tdoc_crawler.utils.date_parser import parse_partial_date


class TestDeprecatedImports:
    """Verify deprecated modules are not imported in the codebase."""

    def test_no_aistorage_import(self) -> None:
        """AiStorage deprecated class should not be imported."""
        # Verify AiStorage not in module namespaces
        assert not hasattr(threegpp_ai, "AiStorage"), "AiStorage should not be exported"
        assert not hasattr(threegpp_ai.operations, "AiStorage"), "AiStorage should not be in operations"

    def test_no_embeddings_manager_import(self) -> None:
        """EmbeddingsManager deprecated class should not be imported."""
        # Verify EmbeddingsManager not in module namespaces
        assert not hasattr(threegpp_ai, "EmbeddingsManager"), "EmbeddingsManager should not be exported"

    def test_no_lancedb_import(self) -> None:
        """Lancedb deprecated module should not be imported."""
        # Verify lancedb not in loaded modules
        assert "lancedb" not in sys.modules, "lancedb module should not be loaded"
        assert "tdoc_ai.storage.lancedb" not in sys.modules, "tdoc_ai.storage.lancedb should not be loaded"

    def test_no_sentence_transformers_import(self) -> None:
        """sentence_transformers deprecated module should not be imported."""
        # Verify sentence_transformers not in loaded modules
        assert "sentence_transformers" not in sys.modules, "sentence_transformers should not be loaded"

    def test_no_legacy_pipeline_import(self) -> None:
        """Legacy pipeline module should not be imported."""
        # Verify legacy pipeline not in loaded modules
        assert "tdoc_ai.operations.pipeline" not in sys.modules, "Legacy pipeline should not be loaded"


class TestDatetimeScoping:
    """Verify datetime variables are properly scoped."""

    def test_datetime_module_level_import(self) -> None:
        """Datetime should be imported at module level in cli.py."""
        # Verify datetime is available in cli module
        assert hasattr(cli, "datetime"), "datetime should be imported in cli module"
        assert hasattr(cli, "UTC"), "UTC should be imported in cli module"

    def test_datetime_combine_usage(self) -> None:
        """datetime.combine should be used with proper scoping."""
        # Test that datetime.combine works correctly with parse_partial_date
        start_date_str = "2025-01"
        parsed_date = parse_partial_date(start_date_str)
        combined = datetime.combine(parsed_date, datetime.min.time(), tzinfo=UTC)
@@ -76,7 +41,6 @@ class TestDatetimeScoping:

    def test_datetime_combine_end_date(self) -> None:
        """datetime.combine should work for end dates with is_end flag."""
        # Test end date parsing
        end_date_str = "2025-03"
        parsed_date = parse_partial_date(end_date_str, is_end=True)
        combined = datetime.combine(parsed_date, datetime.max.time(), tzinfo=UTC)
@@ -85,58 +49,3 @@ class TestDatetimeScoping:
        assert combined.tzinfo == UTC
        assert combined.year == 2025
        assert combined.month == 3


class TestGraphErrorHandling:
    """Test graph error handling readiness.

    Note: Currently there is NO LightRAG integration or graph building code
    in the 3gpp-ai package. These tests document the expected behavior when
    graph operations are added in a future phase.
    """

    def test_graph_operations_not_yet_implemented(self) -> None:
        """Document that graph operations are not yet implemented.

        This test verifies the current state: no LightRAG or graph building
        code exists in the codebase. When LightRAG integration is added,
        this test should be updated to verify actual graph error handling.
        """
        # Verify no LightRAG classes are exported
        assert not hasattr(threegpp_ai, "LightRAGConfig"), "LightRAGConfig not yet implemented"
        assert not hasattr(threegpp_ai, "TDocRAG"), "TDocRAG not yet implemented"
        assert not hasattr(threegpp_ai, "TDocProcessor"), "TDocProcessor not yet implemented"

        # Verify no graph-related functions in workspaces
        assert not hasattr(workspaces_ops, "build_graph"), "build_graph not yet implemented"
        assert not hasattr(workspaces_ops, "insert_to_graph"), "insert_to_graph not yet implemented"

    def test_workspace_process_continues_without_graph(self) -> None:
        """Workspace processing should work without graph operations.

        Currently workspace processing only does extraction, not graph building.
        This test documents that extraction works independently.
        """
        # Verify core workspace operations exist and work without graph
        assert checkout_tdoc_to_workspace is not None
        assert checkout_spec_to_workspace is not None
        assert ensure_ai_subfolder is not None

    @pytest.mark.skip(reason="LightRAG integration not yet implemented")
    def test_graph_error_caught_and_logged(self) -> None:
        """When LightRAG is integrated, graph errors should be caught.

        Future test: Verify that when rag.insert() raises ValueError or
        RuntimeError, the error is caught, logged with document ID, and
        processing continues for other documents.
        """
        pass

    @pytest.mark.skip(reason="LightRAG integration not yet implemented")
    def test_graph_error_doesnt_crash_workspace_process(self) -> None:
        """When LightRAG is integrated, graph errors should not crash processing.

        Future test: Verify that graph building failures for individual
        documents don't stop the entire workspace process.
        """
        pass