Commit bb061a1c authored by Jan Reimes's avatar Jan Reimes
Browse files

test: update AI tests to use container-based storage

- Update ai_storage fixture to create storage via AiServiceContainer
- Patch environment variable to point to test cache directory
- Update all AI tests to use ai_storage fixture instead of ad-hoc AiStorage()
- Fix vector dimensions in tests to match embedding_dimension (384 for all-MiniLM-L6-v2)
- Use enum values (SourceKind.TDOC) instead of strings in tests
- Update test signatures to accept ai_storage fixture parameter
- Remove unused imports from tests/conftest.py
parent 14bd8fc2
Loading
Loading
Loading
Loading
+14 −6
Original line number Diff line number Diff line
@@ -11,7 +11,6 @@ from unittest.mock import MagicMock
import pytest
from typer.testing import CliRunner

from tdoc_crawler.ai.storage import AiStorage
from tdoc_crawler.http_client import download_to_file

# Mock kreuzberg which might not be installed in all environments.
@@ -128,11 +127,11 @@ def runner() -> CliRunner:

@pytest.fixture
def ai_storage(test_cache_dir: Path) -> AiStorage:
    """Create a temporary AI storage for tests using container.

    This fixture creates an AiStorage instance via container
    with a temporary LanceDB directory for testing AI commands
    that require workspace and embedding storage.

    Args:
        test_cache_dir: Test cache directory from root conftest

    Returns:
        AiStorage instance with temporary storage
    """
    from unittest.mock import patch

    from tdoc_crawler.ai.container import AiServiceContainer

    # Ensure the LanceDB directory exists before the container reads it.
    lancedb_dir = test_cache_dir / "ai" / "lancedb"
    lancedb_dir.mkdir(parents=True, exist_ok=True)

    # Patch environment variable to point to test cache directory
    with patch.dict("os.environ", {"TDOC_CRAWLER_AI_CACHE_DIR": str(test_cache_dir / "ai")}):
        # Get container which will read from env
        container = AiServiceContainer.get_instance()
        return container.get_storage()


@pytest.fixture
+26 −32
Original line number Diff line number Diff line
"""US1 red tests for workspace membership storage isolation."""
"""Tests for workspace operations."""

from __future__ import annotations

from pathlib import Path

from tdoc_crawler.ai.models import DocumentChunk, PipelineStage, ProcessingStatus
from tdoc_crawler.ai.models import DocumentChunk, PipelineStage, ProcessingStatus, SourceKind
from tdoc_crawler.ai.operations import workspaces
from tdoc_crawler.ai.storage import AiStorage


def test_workspace_member_isolation_across_workspaces() -> None:
    """US1: same source item can exist in different workspaces without collision."""
    member_a = workspaces.make_workspace_member(
        workspace="radio-core",
        source_item_id="SP-123456",
        source_path="checkout/radio-core/SP-123456",
        source_kind=SourceKind.TDOC,
    )
    member_b = workspaces.make_workspace_member(
        workspace="edge-ai",
        source_item_id="SP-123456",
        source_path="checkout/edge-ai/SP-123456",
        source_kind=SourceKind.TDOC,
    )

    workspaces.add_workspace_members("radio-core", [member_a])
    workspaces.add_workspace_members("edge-ai", [member_b])

    radio_members = workspaces.list_workspace_members("radio-core")
    edge_members = workspaces.list_workspace_members("edge-ai")

    assert len(radio_members) == 1
    assert len(edge_members) == 1
    assert radio_members[0].workspace_name == "radio-core"
    assert edge_members[0].workspace_name == "edge-ai"
    # Verify member IDs match what was added
    assert radio_members[0].source_item_id == "SP-123456"
    assert edge_members[0].source_item_id == "SP-123456"


def test_remove_workspace_member_operation_expected() -> None:
    """US1 red: explicit remove operation should exist on storage API."""
    member = workspaces.make_workspace_member(
        workspace="radio-core",
        source_item_id="SP-123456",
        source_path="checkout/radio-core/SP-123456",
        source_kind=SourceKind.TDOC,
    )
    workspaces.add_workspace_members("radio-core", [member])

    # Removing an existing member should report success.
    removed = workspaces.remove_workspace_member("radio-core", "SP-123456")
    assert removed


def test_storage_artifact_isolation_by_workspace(tmp_path: Path) -> None:
def test_storage_artifact_isolation_by_workspace(ai_storage: AiStorage) -> None:
    """US3: artifacts (status, chunks) are isolated by workspace."""
    storage = AiStorage(tmp_path / "lancedb", embedding_dimension=3)

    # Save status for workspace A
    status_a = ProcessingStatus(document_id="DOC-1", current_stage=PipelineStage.COMPLETED)
    storage.save_status(status_a, workspace="ws_a")
    ai_storage.save_status(status_a, workspace="ws_a")

    # Save chunks for workspace A
    # Vector dimension must match storage's embedding_dimension (usually 384 for all-MiniLM-L6-v2)
    chunk_a = DocumentChunk(
        document_id="DOC-1",
        chunk_id="C1",
@@ -70,15 +64,15 @@ def test_storage_artifact_isolation_by_workspace(tmp_path: Path) -> None:
        metadata={"test": "a"},
        char_offset_start=0,
        char_offset_end=6,
        vector=[0.1, 0.2, 0.3],
        vector=[0.1] * 384,  # 384-dimensional vector
        embedding_model="test",
    )
    storage.save_chunks([chunk_a], workspace="ws_a")
    ai_storage.save_chunks([chunk_a], workspace="ws_a")

    # Should not exist in workspace B
    assert storage.get_status("DOC-1", workspace="ws_b") is None
    assert len(storage.get_chunks("DOC-1", workspace="ws_b")) == 0
    assert ai_storage.get_status("DOC-1", workspace="ws_b") is None
    assert len(ai_storage.get_chunks("DOC-1", workspace="ws_b")) == 0

    # Should exist in workspace A
    assert storage.get_status("DOC-1", workspace="ws_a") is not None
    assert len(storage.get_chunks("DOC-1", workspace="ws_a")) == 1
    assert ai_storage.get_status("DOC-1", workspace="ws_a") is not None
    assert len(ai_storage.get_chunks("DOC-1", workspace="ws_a")) == 1
+18 −16
Original line number Diff line number Diff line
@@ -3,7 +3,6 @@
from __future__ import annotations

import json
from pathlib import Path
from unittest.mock import patch

import pytest
@@ -11,7 +10,6 @@ import pytest
from tdoc_crawler.ai.models import DocumentSummary, LlmConfigError
from tdoc_crawler.ai.operations import summarize
from tdoc_crawler.ai.operations.summarize import _count_words, _should_skip_summary
from tdoc_crawler.ai.storage import AiStorage


class TestSummarization:
@@ -44,16 +42,19 @@ class TestSummarization:
        assert summary.decisions is not None
        assert summary.affected_specs is not None

    def test_skip_unchanged_tdoc(self, ai_storage: AiStorage) -> None:
        """Test that unchanged TDoc is skipped for summarization."""
        # Set a hash in storage
        ai_storage.save_summary_hash("SP-123456", "abc123")

        # Check if summarization should be skipped
        from tdoc_crawler.ai.container import AiServiceContainer

        result = _should_skip_summary("SP-123456", "abc123", storage=AiServiceContainer.get_instance().get_storage())
        assert result is True

    def test_summarize_document_parses_structured_llm_output(self, tmp_path: Path) -> None:
    def test_summarize_document_parses_structured_llm_output(self, ai_storage: AiStorage) -> None:
        """Summarizer should parse structured JSON output from LLM."""
        ai_store = AiStorage(tmp_path / "lancedb")
        tdoc_id = "S4-260001"
        markdown = "# Title\n\nDiscussion on TS 26.260 and CR updates."
        abstract = " ".join(["word"] * 180)
@@ -70,7 +71,7 @@ class TestSummarization:
            mock_client = get_client.return_value
            mock_client.complete.side_effect = [abstract, structured_json]

            summary = summarize.summarize_document(tdoc_id, markdown, storage=ai_store)
            summary = summarize.summarize_document(tdoc_id, markdown, storage=ai_storage)

        assert isinstance(summary, DocumentSummary)
        assert summary.key_points == ["Point A", "Point B"]
@@ -81,24 +82,25 @@ class TestSummarization:

    def test_missing_llm_model_config_raises_error(self, monkeypatch: pytest.MonkeyPatch) -> None:
        """Missing/invalid LLM config should raise LlmConfigError."""
        from tdoc_crawler.ai.container import AiServiceContainer

        # Force container acquisition to fail as a config error would.
        monkeypatch.setattr(
            AiServiceContainer,
            "get_instance",
            lambda: (_ for _ in ()).throw(ValueError("llm_model missing")),
        )

        with pytest.raises(LlmConfigError):
            summarize.summarize_document("S4-260001", "# sample markdown")

    def test_unreachable_llm_endpoint_raises_error(self, ai_storage: AiStorage) -> None:
        """Connection failures must surface as LlmConfigError."""
        with patch("tdoc_crawler.ai.operations.summarize._get_llm_client") as get_client:
            mock_client = get_client.return_value
            mock_client.complete.side_effect = ConnectionError("endpoint unreachable")

            with pytest.raises(LlmConfigError):
                summarize.summarize_document("S4-260001", "# markdown", storage=ai_storage)


class TestSummarizationModuleExports:
+22 −30
Original line number Diff line number Diff line
@@ -4,12 +4,11 @@ from __future__ import annotations

import importlib.util
from pathlib import Path
from typing import Any

from tdoc_crawler.ai.storage import AiStorage


def _load_workspaces_module() -> Any:
def _load_workspaces_module() -> types.ModuleType:
    file_path = Path(__file__).resolve().parents[2] / "src" / "tdoc_crawler" / "ai" / "operations" / "workspaces.py"
    spec = importlib.util.spec_from_file_location("workspace_ops", file_path)
    if spec is None or spec.loader is None:
@@ -43,76 +42,69 @@ def test_is_default_workspace() -> None:
    assert not workspace_ops.is_default_workspace("radio-core")


def test_create_and_list_workspaces(ai_storage: AiStorage) -> None:
    """US1: create/list workspace operations are available."""
    workspace_ops.create_workspace(ai_storage, "radio-core")

    workspaces = workspace_ops.list_workspaces(ai_storage)
    workspace_names = [item.workspace_name for item in workspaces]
    # The implicit default workspace plus the one just created.
    assert "default" in workspace_names
    assert "radio-core" in workspace_names


def test_get_workspace_operation_expected(ai_storage: AiStorage) -> None:
    """US1 red: workspace get operation should be exposed in operations module."""
    workspace_ops.create_workspace(ai_storage, "radio-core")
    fetched = workspace_ops.get_workspace(ai_storage, "radio-core")
    assert fetched is not None
    assert fetched.workspace_name == "radio-core"


def test_resolve_workspace_none_returns_default_workspace(ai_storage: AiStorage) -> None:
    """Resolve None workspace to default."""
    ws = workspace_ops.resolve_workspace(ai_storage, None)
    assert ws is not None
    assert ws.workspace_name == "default"


def test_ensure_default_workspace_creates_if_missing(ai_storage: AiStorage) -> None:
    """Ensure default workspace creates it if missing."""
    # Fresh storage has no default workspace yet
    workspace_ops.ensure_default_workspace(ai_storage)
    ws = ai_storage.get_workspace("default")
    assert ws is not None
    assert ws.workspace_name == "default"


def test_ensure_default_workspace_noop_if_exists(ai_storage: AiStorage) -> None:
    """Ensure default workspace is a no-op if it exists."""
    workspace_ops.ensure_default_workspace(ai_storage)

    # Second call should not raise or fail
    workspace_ops.ensure_default_workspace(ai_storage)
    ws = ai_storage.get_workspace("default")
    assert ws is not None


def test_workspace_auto_build_flag(ai_storage: AiStorage) -> None:
    """T003 [US2]: Create workspace with auto_build=True, verify flag persists.

    Expected: FAIL - auto_build field doesn't exist yet.
    """
    workspace_ops.create_workspace(ai_storage, "auto-ws", auto_build=True)

    fetched = workspace_ops.get_workspace(ai_storage, "auto-ws")
    assert fetched is not None
    assert fetched.auto_build is True


def test_workspace_auto_build_default_off(ai_storage: AiStorage) -> None:
    """T004 [US2]: Create workspace without auto_build, verify default is False.

    Expected: FAIL - auto_build field doesn't exist yet.
    """
    workspace_ops.create_workspace(ai_storage, "manual-ws")

    fetched = workspace_ops.get_workspace(ai_storage, "manual-ws")
    assert fetched is not None
    assert fetched.auto_build is False
+0 −1
Original line number Diff line number Diff line
@@ -2,7 +2,6 @@

from __future__ import annotations

from collections.abc import Iterator
from datetime import UTC, date, datetime
from decimal import Decimal
from pathlib import Path