Commit 3fd808f5 authored by Jan Reimes's avatar Jan Reimes
Browse files

🧪 test(3gpp-ai): remove obsolete LightRAG tests, update remaining tests

parent 361714a6
Loading
Loading
Loading
Loading
+3 −53
Original line number Diff line number Diff line
@@ -3,9 +3,10 @@
from __future__ import annotations

import hashlib
import importlib.util
import zipfile
from pathlib import Path
from unittest.mock import MagicMock, patch
from unittest.mock import MagicMock

import pytest
from typer.testing import CliRunner
@@ -13,17 +14,8 @@ from typer.testing import CliRunner
from tdoc_crawler.http_client import download_to_file

_TDOC_AI_AVAILABLE = True
try:
    from threegpp_ai.config import AiConfig
    from threegpp_ai.operations import workspaces as workspace_ops
except ModuleNotFoundError:
if importlib.util.find_spec("threegpp_ai") is None:
    _TDOC_AI_AVAILABLE = False
    AiConfig = None  # type: ignore[assignment]
    workspace_ops = None

# Removed classes - keep as None so fixture type annotations don't NameError
AiStorage = None  # type: ignore[assignment]
EmbeddingsManager = None  # type: ignore[assignment]


@pytest.fixture(autouse=True)
@@ -152,45 +144,3 @@ def reset_ai_service_container() -> None:
    With the refactored design, no singleton exists, so this is a no-op.
    """
    return  # No cleanup needed - no singleton


@pytest.fixture
def ai_storage(test_cache_dir: Path) -> AiStorage:
    """Provide a temporary AI storage backend for tests.

    Builds an AiStorage through the EmbeddingsManager factory while the
    AI cache environment variable points at a per-test directory, so no
    real user data is touched.

    Args:
        test_cache_dir: Per-test cache directory from the root conftest.

    Returns:
        AiStorage instance backed by a temporary LanceDB directory.
    """
    ai_dir = test_cache_dir / "ai"
    (ai_dir / "lancedb").mkdir(parents=True, exist_ok=True)

    # Redirect the cache env var so AiConfig.from_env() resolves to the
    # temporary location for the duration of the factory call.
    with patch.dict("os.environ", {"TDOC_CRAWLER_AI_CACHE_DIR": str(ai_dir)}):
        manager = EmbeddingsManager(AiConfig.from_env())
        return manager.storage


@pytest.fixture
def test_workspace(ai_storage: AiStorage) -> str:
    """Create the default workspace inside the test storage.

    Args:
        ai_storage: AI storage fixture (requested to ensure storage exists).

    Returns:
        Name of the created workspace ("default").
    """
    name = "default"
    # auto_build=False keeps the fixture cheap: no index build is triggered.
    workspace_ops.create_workspace(name, auto_build=False)
    return name
+1 −1
Original line number Diff line number Diff line
@@ -14,7 +14,7 @@ def test_cli_help_lists_top_level_groups() -> None:
    assert "summarize" in result.stdout
    assert "convert" in result.stdout
    assert "workspace" in result.stdout
    assert "rag" in result.stdout
    assert "config" in result.stdout


def test_workspace_help_lists_expected_subcommands() -> None:
+0 −117
Original line number Diff line number Diff line
"""Tests for shared structured extraction payload behavior."""

from __future__ import annotations

from pathlib import Path

import pytest
from threegpp_ai.lightrag import processor as processor_module
from threegpp_ai.lightrag.processor import DocumentProcessor, ProcessingResultStatus
from threegpp_ai.operations.extraction_result import (
    ExtractedTableElement,
    build_structured_extraction_result,
    from_docling_result,
)


def test_build_structured_extraction_result_defaults() -> None:
    """A bare builder call should yield empty collections and zero counts."""
    payload = build_structured_extraction_result("hello")

    assert payload.content == "hello"
    # Optional collections all default to empty containers.
    assert payload.tables == []
    assert payload.figures == []
    assert payload.equations == []
    assert payload.metadata == {}
    # Derived counters mirror the empty collections.
    assert (payload.table_count, payload.figure_count, payload.equation_count) == (0, 0, 0)


def test_from_docling_result_maps_tables_and_figures() -> None:
    """Converter should map available tables/pictures into structured elements."""

    class StubTableData:
        """Imitates docling table data: a cell grid plus a page number."""

        def __init__(self, grid: list[list[str]]) -> None:
            self.grid = grid
            self.page_number = 3

    class StubTable:
        """Imitates a docling table item that exports markdown."""

        def __init__(self, grid: list[list[str]]) -> None:
            self.data = StubTableData(grid)
            self.markdown = "|a|b|\n|-|-|\n|c|d|"

        def export_to_markdown(self, *, doc: object = None) -> str:
            return self.markdown

    class StubDoc:
        """Imitates a docling document with one table and no pictures."""

        def __init__(self) -> None:
            self.tables = [StubTable([["a", "b"], ["c", "d"]])]
            self.pictures = []

        def export_to_markdown(self) -> str:
            # The $$...$$ span should be picked up as an equation.
            return "body text\n\n$$ x = y + z $$"

    class StubResult:
        def __init__(self) -> None:
            self.document = StubDoc()
            self.metadata = {"source": "test"}

    mapped = from_docling_result(StubResult())

    # Body text and the equation survive in the content, with anchor comments.
    assert mapped.content.startswith("body text")
    assert "$$ x = y + z $$" in mapped.content
    assert "<!-- table:id=table_1" in mapped.content
    assert "<!-- equation:id=equation_1" in mapped.content
    assert (mapped.table_count, mapped.figure_count, mapped.equation_count) == (1, 0, 1)

    first_table = mapped.tables[0]
    assert isinstance(first_table, ExtractedTableElement)
    assert first_table.element_id == "table_1"
    assert first_table.page_number == 3
    assert first_table.row_count == 2
    assert first_table.column_count == 2


@pytest.mark.asyncio
async def test_processor_process_file_reports_structured_counts(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None:
    """Processor result should include structured extraction counters."""

    async def _noop_insert(_: str, **__: object) -> None:
        return None

    def _stub_extract(*_args: object, **_kwargs: object):
        # Fresh payload per call, mirroring the real extractor's behavior.
        return build_structured_extraction_result(
            content="x" * 120,
            tables=[
                ExtractedTableElement(
                    element_id="table_1",
                    page_number=1,
                    row_count=1,
                    column_count=1,
                    cells=[["v"]],
                )
            ],
            figures=[],
            equations=[],
        )

    # Patch extract_document_structured where the processor imports it so
    # no real docling extraction runs.
    monkeypatch.setattr(processor_module, "extract_document_structured", _stub_extract)

    processor = DocumentProcessor()
    monkeypatch.setattr(processor.rag, "insert", _noop_insert)

    # A .md input skips PDF conversion, exercising only the processor logic.
    file_path = tmp_path / "doc.md"
    file_path.write_text("placeholder content " * 10, encoding="utf-8")

    result = await processor.process_file(file_path, metadata={"document_id": "test-doc"})

    assert result.status == ProcessingResultStatus.SUCCESS
    # chars_extracted includes the metadata enrichment header, hence strictly > 120.
    assert result.chars_extracted > 120
    assert result.table_count == 1
    assert result.figure_count == 0
    assert result.equation_count == 0
+83 −0
Original line number Diff line number Diff line
"""Tests for extraction profile policy, config validation, and CLI exposure."""

from __future__ import annotations

from pathlib import Path

import pytest
from pydantic import ValidationError
from threegpp_ai.cli import app
from threegpp_ai.config import AiConfig
from threegpp_ai.operations import extraction as extraction_ops
from threegpp_ai.operations.extraction_result import build_structured_extraction_result
from typer.testing import CliRunner


def test_config_accepts_profile_literals() -> None:
    """Every documented profile literal must round-trip through AiConfig."""
    valid_profiles = ("default", "balanced", "optimum", "custom")
    for name in valid_profiles:
        assert AiConfig(extraction_profile=name).extraction_profile == name


def test_config_rejects_invalid_profile() -> None:
    """An unknown profile literal must fail pydantic validation."""
    with pytest.raises(ValidationError):
        # "invalid" is outside the accepted Literal set.
        AiConfig(extraction_profile="invalid")


def test_workspace_process_help_exposes_profile_options() -> None:
    """`workspace process --help` should advertise profile and custom flags."""
    result = CliRunner().invoke(app, ["workspace", "process", "--help"])

    assert result.exit_code == 0
    expected_flags = (
        "--profile",
        "--custom-ocr",
        "--custom-layout",
        "--custom-tables",
        "--custom-figures",
        "--custom-equations",
        "--custom-enrichment",
    )
    for flag in expected_flags:
        assert flag in result.stdout


def test_workspace_add_members_help_exposes_profile_options() -> None:
    """`workspace add-members --help` should advertise the profile flags."""
    result = CliRunner().invoke(app, ["workspace", "add-members", "--help"])

    assert result.exit_code == 0
    for flag in ("--profile", "--custom-ocr"):
        assert flag in result.stdout


def test_auto_profile_selection_is_deterministic(tmp_path: Path) -> None:
    """Resolving the extraction policy twice for the same file must agree."""
    doc = tmp_path / "doc.md"
    doc.write_text("x" * 1024, encoding="utf-8")

    # Tuple comparison covers both the selected profile and its settings.
    first = extraction_ops.resolve_extraction_policy(doc)
    second = extraction_ops.resolve_extraction_policy(doc)
    assert first == second


def test_extraction_metadata_includes_profile_and_effective_settings(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None:
    """Custom-profile extraction should record the profile and its settings."""
    doc = tmp_path / "doc.md"
    doc.write_text("content", encoding="utf-8")

    # Force a cache hit so no real extraction backend is exercised.
    monkeypatch.setattr(extraction_ops, "_check_cached_extraction", lambda *_args, **_kwargs: (True, "cached"))
    monkeypatch.setattr(extraction_ops, "read_cached_artifacts", lambda *_args, **_kwargs: build_structured_extraction_result("cached"))

    result = extraction_ops.extract_document_structured(
        file_path=doc,
        profile="custom",
        custom_extract_tables=False,
        custom_extract_figures=False,
        custom_extract_equations=True,
        custom_extract_enrichment=False,
    )

    assert result.metadata["extraction_profile"] == "custom"
    assert result.metadata["cache_hit"] is True
    effective = result.metadata["effective_extraction_settings"]
    # Each custom toggle must surface verbatim in the effective settings.
    for key, expected in (("tables", False), ("figures", False), ("equations", True), ("enrichment", False)):
        assert effective[key] is expected
+8 −9
Original line number Diff line number Diff line
@@ -2,7 +2,6 @@

from __future__ import annotations

import asyncio
from pathlib import Path
from types import SimpleNamespace

@@ -69,10 +68,10 @@ def test_convert_tdoc_to_markdown_records_conversion_metric(monkeypatch: pytest.
        mock_extract,
    )

    output = asyncio.run(convert_ops.convert_tdoc_to_markdown("S4-260001", force=True))
    output = convert_ops.convert_tdoc_to_markdown("S4-260001", force=True)

    assert output == "# markdown"
    conversion_metrics = tracker.by_type(MetricType.CONVERSION)
    conversion_metrics = [m for m in tracker.metrics if m.metric_type is MetricType.CONVERSION]
    assert len(conversion_metrics) == 1
    assert conversion_metrics[0].success is True

@@ -108,11 +107,11 @@ def test_convert_tdoc_to_markdown_writes_table_sidecar(monkeypatch: pytest.Monke
        mock_extract,
    )

    result = asyncio.run(convert_ops.convert_tdoc_to_markdown("S4-260001", force=True))
    result = convert_ops.convert_tdoc_to_markdown("S4-260001", force=True)

    # Verify conversion succeeded and returned content
    assert "# markdown" in result
    conversion_metrics = tracker.by_type(MetricType.CONVERSION)
    conversion_metrics = [m for m in tracker.metrics if m.metric_type is MetricType.CONVERSION]
    assert len(conversion_metrics) == 1
    assert conversion_metrics[0].success is True

@@ -129,14 +128,14 @@ def test_summarize_tdoc_records_summarization_metric(monkeypatch: pytest.MonkeyP
    )
    monkeypatch.setattr(summarize_ops, "_get_llm_client", _DummyClient)
    monkeypatch.setattr(
        summarize_ops.AiConfig,
        "from_env",
        staticmethod(lambda: SimpleNamespace(llm_model="test-model")),
        summarize_ops,
        "AiConfig",
        lambda: SimpleNamespace(llm_model="test-model"),
    )

    result = summarize_ops.summarize_tdoc("S4-260001")

    assert result.word_count > 0
    summary_metrics = tracker.by_type(MetricType.SUMMARIZATION)
    summary_metrics = [m for m in tracker.metrics if m.metric_type is MetricType.SUMMARIZATION]
    assert len(summary_metrics) == 1
    assert summary_metrics[0].success is True
Loading