🧪 test(3gpp-ai): remove obsolete LightRAG tests, update remaining tests (3fd808f5) · Commits · Jan Reimes / 3gpp-crawler

tests/ai/conftest.py

+3 −53

Original line number	Diff line number	Diff line
		@@ -3,9 +3,10 @@
		from __future__ import annotations

		import hashlib
		import importlib.util
		import zipfile
		from pathlib import Path
		from unittest.mock import MagicMock, patch
		from unittest.mock import MagicMock

		import pytest
		from typer.testing import CliRunner
		@@ -13,17 +14,8 @@ from typer.testing import CliRunner
		from tdoc_crawler.http_client import download_to_file

		_TDOC_AI_AVAILABLE = True
		try:
		from threegpp_ai.config import AiConfig
		from threegpp_ai.operations import workspaces as workspace_ops
		except ModuleNotFoundError:
		if importlib.util.find_spec("threegpp_ai") is None:
		_TDOC_AI_AVAILABLE = False
		AiConfig = None # type: ignore[assignment]
		workspace_ops = None

		# Removed classes - keep as None so fixture type annotations don't NameError
		AiStorage = None # type: ignore[assignment]
		EmbeddingsManager = None # type: ignore[assignment]


		@pytest.fixture(autouse=True)
		@@ -152,45 +144,3 @@ def reset_ai_service_container() -> None:
		With the refactored design, no singleton exists, so this is a no-op.
		"""
		return # No cleanup needed - no singleton


		@pytest.fixture
		def ai_storage(test_cache_dir: Path) -> AiStorage:
		"""Create a temporary AI storage for tests.

		This fixture creates an AiStorage instance using the new
		EmbeddingsManager factory with a temporary LanceDB directory
		for testing AI commands that require workspace and embedding storage.

		Args:
		test_cache_dir: Test cache directory from root conftest

		Returns:
		AiStorage instance with temporary storage
		"""
		lancedb_dir = test_cache_dir / "ai" / "lancedb"
		lancedb_dir.mkdir(parents=True, exist_ok=True)

		# Patch environment variable to point to test cache directory
		with patch.dict("os.environ", {"TDOC_CRAWLER_AI_CACHE_DIR": str(test_cache_dir / "ai")}):
		# Use new factory method
		embeddings_manager = EmbeddingsManager(AiConfig.from_env())
		return embeddings_manager.storage


		@pytest.fixture
		def test_workspace(ai_storage: AiStorage) -> str:
		"""Create a test workspace for AI tests.

		This fixture creates a default workspace in the test storage
		and returns its name for use in tests that require a workspace.

		Args:
		ai_storage: AI storage fixture

		Returns:
		Workspace name ("default")
		"""
		workspace_name = "default"
		workspace_ops.create_workspace(workspace_name, auto_build=False)
		return workspace_name

tests/ai/test_cli_aggregator.py

+1 −1

Original line number	Diff line number	Diff line
		@@ -14,7 +14,7 @@ def test_cli_help_lists_top_level_groups() -> None:
		assert "summarize" in result.stdout
		assert "convert" in result.stdout
		assert "workspace" in result.stdout
		assert "rag" in result.stdout
		assert "config" in result.stdout


		def test_workspace_help_lists_expected_subcommands() -> None:

tests/ai/test_extraction_elements.py

deleted 100644 → 0

+0 −117

Original line number	Diff line number	Diff line
		"""Tests for shared structured extraction payload behavior."""

		from __future__ import annotations

		from pathlib import Path

		import pytest
		from threegpp_ai.lightrag import processor as processor_module
		from threegpp_ai.lightrag.processor import DocumentProcessor, ProcessingResultStatus
		from threegpp_ai.operations.extraction_result import (
		ExtractedTableElement,
		build_structured_extraction_result,
		from_docling_result,
		)


		def test_build_structured_extraction_result_defaults() -> None:
		"""Builder should create a payload with empty optional collections."""
		result = build_structured_extraction_result("hello")
		assert result.content == "hello"
		assert result.tables == []
		assert result.figures == []
		assert result.equations == []
		assert result.metadata == {}
		assert result.table_count == 0
		assert result.figure_count == 0
		assert result.equation_count == 0


		def test_from_docling_result_maps_tables_and_figures() -> None:
		"""Converter should map available tables/pictures into structured elements."""

		class FakeTableData:
		def __init__(self, cells: list[list[str]]) -> None:
		self.grid = cells
		self.page_number = 3

		class FakeTable:
		def __init__(self, cells: list[list[str]]) -> None:
		self.data = FakeTableData(cells)
		self.markdown = "|a|b|\n|-|-|\n|c|d|"

		def export_to_markdown(self, *, doc: object = None) -> str:
		return self.markdown

		class FakeDoc:
		def __init__(self) -> None:
		self.tables = [FakeTable([["a", "b"], ["c", "d"]])]
		self.pictures = []

		def export_to_markdown(self) -> str:
		return "body text\n\n$$ x = y + z $$"

		class FakeResult:
		def __init__(self) -> None:
		self.document = FakeDoc()
		self.metadata = {"source": "test"}

		mapped = from_docling_result(FakeResult())
		assert mapped.content.startswith("body text")
		assert "$$ x = y + z $$" in mapped.content
		assert mapped.table_count == 1
		assert mapped.figure_count == 0
		assert mapped.equation_count == 1
		assert "<!-- table:id=table_1" in mapped.content
		assert "<!-- equation:id=equation_1" in mapped.content

		table = mapped.tables[0]
		assert isinstance(table, ExtractedTableElement)
		assert table.element_id == "table_1"
		assert table.page_number == 3
		assert table.row_count == 2
		assert table.column_count == 2


		@pytest.mark.asyncio
		async def test_processor_process_file_reports_structured_counts(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None:
		"""Processor result should include structured extraction counters."""

		async def fake_insert(_: str, **__: object) -> None:
		return None

		# Mock extract_document_structured in the processor module where it's imported
		# This prevents actual docling extraction
		monkeypatch.setattr(
		processor_module,
		"extract_document_structured",
		lambda *args, **kwargs: build_structured_extraction_result(
		content="x" * 120,
		tables=[
		ExtractedTableElement(
		element_id="table_1",
		page_number=1,
		row_count=1,
		column_count=1,
		cells=[["v"]],
		)
		],
		figures=[],
		equations=[],
		),
		)

		processor = DocumentProcessor()
		monkeypatch.setattr(processor.rag, "insert", fake_insert)

		# Use a .md file to skip PDF conversion, testing only the processor logic
		file_path = tmp_path / "doc.md"
		file_path.write_text("placeholder content " * 10, encoding="utf-8")

		result = await processor.process_file(file_path, metadata={"document_id": "test-doc"})
		assert result.status == ProcessingResultStatus.SUCCESS
		# chars_extracted includes metadata enrichment header, so we check it's > 120
		assert result.chars_extracted > 120
		assert result.table_count == 1
		assert result.figure_count == 0
		assert result.equation_count == 0

tests/ai/test_extraction_profiles.py

0 → 100644

+83 −0

Original line number	Diff line number	Diff line
		"""Tests for extraction profile policy, config validation, and CLI exposure."""

		from __future__ import annotations

		from pathlib import Path

		import pytest
		from pydantic import ValidationError
		from threegpp_ai.cli import app
		from threegpp_ai.config import AiConfig
		from threegpp_ai.operations import extraction as extraction_ops
		from threegpp_ai.operations.extraction_result import build_structured_extraction_result
		from typer.testing import CliRunner


		def test_config_accepts_profile_literals() -> None:
		for profile in ("default", "balanced", "optimum", "custom"):
		config = AiConfig(extraction_profile=profile)
		assert config.extraction_profile == profile


		def test_config_rejects_invalid_profile() -> None:
		with pytest.raises(ValidationError):
		AiConfig(extraction_profile="invalid")


		def test_workspace_process_help_exposes_profile_options() -> None:
		runner = CliRunner()
		result = runner.invoke(app, ["workspace", "process", "--help"])

		assert result.exit_code == 0
		assert "--profile" in result.stdout
		assert "--custom-ocr" in result.stdout
		assert "--custom-layout" in result.stdout
		assert "--custom-tables" in result.stdout
		assert "--custom-figures" in result.stdout
		assert "--custom-equations" in result.stdout
		assert "--custom-enrichment" in result.stdout


		def test_workspace_add_members_help_exposes_profile_options() -> None:
		runner = CliRunner()
		result = runner.invoke(app, ["workspace", "add-members", "--help"])

		assert result.exit_code == 0
		assert "--profile" in result.stdout
		assert "--custom-ocr" in result.stdout


		def test_auto_profile_selection_is_deterministic(tmp_path: Path) -> None:
		file_path = tmp_path / "doc.md"
		file_path.write_text("x" * 1024, encoding="utf-8")

		selected_a, settings_a = extraction_ops.resolve_extraction_policy(file_path)
		selected_b, settings_b = extraction_ops.resolve_extraction_policy(file_path)

		assert selected_a == selected_b
		assert settings_a == settings_b


		def test_extraction_metadata_includes_profile_and_effective_settings(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None:
		file_path = tmp_path / "doc.md"
		file_path.write_text("content", encoding="utf-8")

		monkeypatch.setattr(extraction_ops, "_check_cached_extraction", lambda *_args, **_kwargs: (True, "cached"))
		monkeypatch.setattr(extraction_ops, "read_cached_artifacts", lambda *_args, **_kwargs: build_structured_extraction_result("cached"))

		result = extraction_ops.extract_document_structured(
		file_path=file_path,
		profile="custom",
		custom_extract_tables=False,
		custom_extract_figures=False,
		custom_extract_equations=True,
		custom_extract_enrichment=False,
		)

		assert result.metadata["extraction_profile"] == "custom"
		settings = result.metadata["effective_extraction_settings"]
		assert settings["tables"] is False
		assert settings["figures"] is False
		assert settings["equations"] is True
		assert settings["enrichment"] is False
		assert result.metadata["cache_hit"] is True

tests/ai/test_operations_metrics.py

+8 −9

Original line number	Diff line number	Diff line
		@@ -2,7 +2,6 @@

		from __future__ import annotations

		import asyncio
		from pathlib import Path
		from types import SimpleNamespace

		@@ -69,10 +68,10 @@ def test_convert_tdoc_to_markdown_records_conversion_metric(monkeypatch: pytest.
		mock_extract,
		)

		output = asyncio.run(convert_ops.convert_tdoc_to_markdown("S4-260001", force=True))
		output = convert_ops.convert_tdoc_to_markdown("S4-260001", force=True)

		assert output == "# markdown"
		conversion_metrics = tracker.by_type(MetricType.CONVERSION)
		conversion_metrics = [m for m in tracker.metrics if m.metric_type is MetricType.CONVERSION]
		assert len(conversion_metrics) == 1
		assert conversion_metrics[0].success is True

		@@ -108,11 +107,11 @@ def test_convert_tdoc_to_markdown_writes_table_sidecar(monkeypatch: pytest.Monke
		mock_extract,
		)

		result = asyncio.run(convert_ops.convert_tdoc_to_markdown("S4-260001", force=True))
		result = convert_ops.convert_tdoc_to_markdown("S4-260001", force=True)

		# Verify conversion succeeded and returned content
		assert "# markdown" in result
		conversion_metrics = tracker.by_type(MetricType.CONVERSION)
		conversion_metrics = [m for m in tracker.metrics if m.metric_type is MetricType.CONVERSION]
		assert len(conversion_metrics) == 1
		assert conversion_metrics[0].success is True

		@@ -129,14 +128,14 @@ def test_summarize_tdoc_records_summarization_metric(monkeypatch: pytest.MonkeyP
		)
		monkeypatch.setattr(summarize_ops, "_get_llm_client", _DummyClient)
		monkeypatch.setattr(
		summarize_ops.AiConfig,
		"from_env",
		staticmethod(lambda: SimpleNamespace(llm_model="test-model")),
		summarize_ops,
		"AiConfig",
		lambda: SimpleNamespace(llm_model="test-model"),
		)

		result = summarize_ops.summarize_tdoc("S4-260001")

		assert result.word_count > 0
		summary_metrics = tracker.by_type(MetricType.SUMMARIZATION)
		summary_metrics = [m for m in tracker.metrics if m.metric_type is MetricType.SUMMARIZATION]
		assert len(summary_metrics) == 1
		assert summary_metrics[0].success is True