Loading tests/ai/conftest.py +3 −53 Original line number Diff line number Diff line Loading @@ -3,9 +3,10 @@ from __future__ import annotations import hashlib import importlib.util import zipfile from pathlib import Path from unittest.mock import MagicMock, patch from unittest.mock import MagicMock import pytest from typer.testing import CliRunner Loading @@ -13,17 +14,8 @@ from typer.testing import CliRunner from tdoc_crawler.http_client import download_to_file _TDOC_AI_AVAILABLE = True try: from threegpp_ai.config import AiConfig from threegpp_ai.operations import workspaces as workspace_ops except ModuleNotFoundError: if importlib.util.find_spec("threegpp_ai") is None: _TDOC_AI_AVAILABLE = False AiConfig = None # type: ignore[assignment] workspace_ops = None # Removed classes - keep as None so fixture type annotations don't NameError AiStorage = None # type: ignore[assignment] EmbeddingsManager = None # type: ignore[assignment] @pytest.fixture(autouse=True) Loading Loading @@ -152,45 +144,3 @@ def reset_ai_service_container() -> None: With the refactored design, no singleton exists, so this is a no-op. """ return # No cleanup needed - no singleton @pytest.fixture def ai_storage(test_cache_dir: Path) -> AiStorage: """Create a temporary AI storage for tests. This fixture creates an AiStorage instance using the new EmbeddingsManager factory with a temporary LanceDB directory for testing AI commands that require workspace and embedding storage. 
Args: test_cache_dir: Test cache directory from root conftest Returns: AiStorage instance with temporary storage """ lancedb_dir = test_cache_dir / "ai" / "lancedb" lancedb_dir.mkdir(parents=True, exist_ok=True) # Patch environment variable to point to test cache directory with patch.dict("os.environ", {"TDOC_CRAWLER_AI_CACHE_DIR": str(test_cache_dir / "ai")}): # Use new factory method embeddings_manager = EmbeddingsManager(AiConfig.from_env()) return embeddings_manager.storage @pytest.fixture def test_workspace(ai_storage: AiStorage) -> str: """Create a test workspace for AI tests. This fixture creates a default workspace in the test storage and returns its name for use in tests that require a workspace. Args: ai_storage: AI storage fixture Returns: Workspace name ("default") """ workspace_name = "default" workspace_ops.create_workspace(workspace_name, auto_build=False) return workspace_name tests/ai/test_cli_aggregator.py +1 −1 Original line number Diff line number Diff line Loading @@ -14,7 +14,7 @@ def test_cli_help_lists_top_level_groups() -> None: assert "summarize" in result.stdout assert "convert" in result.stdout assert "workspace" in result.stdout assert "rag" in result.stdout assert "config" in result.stdout def test_workspace_help_lists_expected_subcommands() -> None: Loading tests/ai/test_extraction_elements.pydeleted 100644 → 0 +0 −117 Original line number Diff line number Diff line """Tests for shared structured extraction payload behavior.""" from __future__ import annotations from pathlib import Path import pytest from threegpp_ai.lightrag import processor as processor_module from threegpp_ai.lightrag.processor import DocumentProcessor, ProcessingResultStatus from threegpp_ai.operations.extraction_result import ( ExtractedTableElement, build_structured_extraction_result, from_docling_result, ) def test_build_structured_extraction_result_defaults() -> None: """Builder should create a payload with empty optional collections.""" result = 
build_structured_extraction_result("hello") assert result.content == "hello" assert result.tables == [] assert result.figures == [] assert result.equations == [] assert result.metadata == {} assert result.table_count == 0 assert result.figure_count == 0 assert result.equation_count == 0 def test_from_docling_result_maps_tables_and_figures() -> None: """Converter should map available tables/pictures into structured elements.""" class FakeTableData: def __init__(self, cells: list[list[str]]) -> None: self.grid = cells self.page_number = 3 class FakeTable: def __init__(self, cells: list[list[str]]) -> None: self.data = FakeTableData(cells) self.markdown = "|a|b|\n|-|-|\n|c|d|" def export_to_markdown(self, *, doc: object = None) -> str: return self.markdown class FakeDoc: def __init__(self) -> None: self.tables = [FakeTable([["a", "b"], ["c", "d"]])] self.pictures = [] def export_to_markdown(self) -> str: return "body text\n\n$$ x = y + z $$" class FakeResult: def __init__(self) -> None: self.document = FakeDoc() self.metadata = {"source": "test"} mapped = from_docling_result(FakeResult()) assert mapped.content.startswith("body text") assert "$$ x = y + z $$" in mapped.content assert mapped.table_count == 1 assert mapped.figure_count == 0 assert mapped.equation_count == 1 assert "<!-- table:id=table_1" in mapped.content assert "<!-- equation:id=equation_1" in mapped.content table = mapped.tables[0] assert isinstance(table, ExtractedTableElement) assert table.element_id == "table_1" assert table.page_number == 3 assert table.row_count == 2 assert table.column_count == 2 @pytest.mark.asyncio async def test_processor_process_file_reports_structured_counts(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None: """Processor result should include structured extraction counters.""" async def fake_insert(_: str, **__: object) -> None: return None # Mock extract_document_structured in the processor module where it's imported # This prevents actual docling extraction 
monkeypatch.setattr( processor_module, "extract_document_structured", lambda *args, **kwargs: build_structured_extraction_result( content="x" * 120, tables=[ ExtractedTableElement( element_id="table_1", page_number=1, row_count=1, column_count=1, cells=[["v"]], ) ], figures=[], equations=[], ), ) processor = DocumentProcessor() monkeypatch.setattr(processor.rag, "insert", fake_insert) # Use a .md file to skip PDF conversion, testing only the processor logic file_path = tmp_path / "doc.md" file_path.write_text("placeholder content " * 10, encoding="utf-8") result = await processor.process_file(file_path, metadata={"document_id": "test-doc"}) assert result.status == ProcessingResultStatus.SUCCESS # chars_extracted includes metadata enrichment header, so we check it's > 120 assert result.chars_extracted > 120 assert result.table_count == 1 assert result.figure_count == 0 assert result.equation_count == 0 tests/ai/test_extraction_profiles.py 0 → 100644 +83 −0 Original line number Diff line number Diff line """Tests for extraction profile policy, config validation, and CLI exposure.""" from __future__ import annotations from pathlib import Path import pytest from pydantic import ValidationError from threegpp_ai.cli import app from threegpp_ai.config import AiConfig from threegpp_ai.operations import extraction as extraction_ops from threegpp_ai.operations.extraction_result import build_structured_extraction_result from typer.testing import CliRunner def test_config_accepts_profile_literals() -> None: for profile in ("default", "balanced", "optimum", "custom"): config = AiConfig(extraction_profile=profile) assert config.extraction_profile == profile def test_config_rejects_invalid_profile() -> None: with pytest.raises(ValidationError): AiConfig(extraction_profile="invalid") def test_workspace_process_help_exposes_profile_options() -> None: runner = CliRunner() result = runner.invoke(app, ["workspace", "process", "--help"]) assert result.exit_code == 0 assert "--profile" 
in result.stdout assert "--custom-ocr" in result.stdout assert "--custom-layout" in result.stdout assert "--custom-tables" in result.stdout assert "--custom-figures" in result.stdout assert "--custom-equations" in result.stdout assert "--custom-enrichment" in result.stdout def test_workspace_add_members_help_exposes_profile_options() -> None: runner = CliRunner() result = runner.invoke(app, ["workspace", "add-members", "--help"]) assert result.exit_code == 0 assert "--profile" in result.stdout assert "--custom-ocr" in result.stdout def test_auto_profile_selection_is_deterministic(tmp_path: Path) -> None: file_path = tmp_path / "doc.md" file_path.write_text("x" * 1024, encoding="utf-8") selected_a, settings_a = extraction_ops.resolve_extraction_policy(file_path) selected_b, settings_b = extraction_ops.resolve_extraction_policy(file_path) assert selected_a == selected_b assert settings_a == settings_b def test_extraction_metadata_includes_profile_and_effective_settings(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None: file_path = tmp_path / "doc.md" file_path.write_text("content", encoding="utf-8") monkeypatch.setattr(extraction_ops, "_check_cached_extraction", lambda *_args, **_kwargs: (True, "cached")) monkeypatch.setattr(extraction_ops, "read_cached_artifacts", lambda *_args, **_kwargs: build_structured_extraction_result("cached")) result = extraction_ops.extract_document_structured( file_path=file_path, profile="custom", custom_extract_tables=False, custom_extract_figures=False, custom_extract_equations=True, custom_extract_enrichment=False, ) assert result.metadata["extraction_profile"] == "custom" settings = result.metadata["effective_extraction_settings"] assert settings["tables"] is False assert settings["figures"] is False assert settings["equations"] is True assert settings["enrichment"] is False assert result.metadata["cache_hit"] is True tests/ai/test_operations_metrics.py +8 −9 Original line number Diff line number Diff line Loading @@ -2,7 +2,6 
@@ from __future__ import annotations import asyncio from pathlib import Path from types import SimpleNamespace Loading Loading @@ -69,10 +68,10 @@ def test_convert_tdoc_to_markdown_records_conversion_metric(monkeypatch: pytest. mock_extract, ) output = asyncio.run(convert_ops.convert_tdoc_to_markdown("S4-260001", force=True)) output = convert_ops.convert_tdoc_to_markdown("S4-260001", force=True) assert output == "# markdown" conversion_metrics = tracker.by_type(MetricType.CONVERSION) conversion_metrics = [m for m in tracker.metrics if m.metric_type is MetricType.CONVERSION] assert len(conversion_metrics) == 1 assert conversion_metrics[0].success is True Loading Loading @@ -108,11 +107,11 @@ def test_convert_tdoc_to_markdown_writes_table_sidecar(monkeypatch: pytest.Monke mock_extract, ) result = asyncio.run(convert_ops.convert_tdoc_to_markdown("S4-260001", force=True)) result = convert_ops.convert_tdoc_to_markdown("S4-260001", force=True) # Verify conversion succeeded and returned content assert "# markdown" in result conversion_metrics = tracker.by_type(MetricType.CONVERSION) conversion_metrics = [m for m in tracker.metrics if m.metric_type is MetricType.CONVERSION] assert len(conversion_metrics) == 1 assert conversion_metrics[0].success is True Loading @@ -129,14 +128,14 @@ def test_summarize_tdoc_records_summarization_metric(monkeypatch: pytest.MonkeyP ) monkeypatch.setattr(summarize_ops, "_get_llm_client", _DummyClient) monkeypatch.setattr( summarize_ops.AiConfig, "from_env", staticmethod(lambda: SimpleNamespace(llm_model="test-model")), summarize_ops, "AiConfig", lambda: SimpleNamespace(llm_model="test-model"), ) result = summarize_ops.summarize_tdoc("S4-260001") assert result.word_count > 0 summary_metrics = tracker.by_type(MetricType.SUMMARIZATION) summary_metrics = [m for m in tracker.metrics if m.metric_type is MetricType.SUMMARIZATION] assert len(summary_metrics) == 1 assert summary_metrics[0].success is True Loading
tests/ai/conftest.py +3 −53 Original line number Diff line number Diff line Loading @@ -3,9 +3,10 @@ from __future__ import annotations import hashlib import importlib.util import zipfile from pathlib import Path from unittest.mock import MagicMock, patch from unittest.mock import MagicMock import pytest from typer.testing import CliRunner Loading @@ -13,17 +14,8 @@ from typer.testing import CliRunner from tdoc_crawler.http_client import download_to_file _TDOC_AI_AVAILABLE = True try: from threegpp_ai.config import AiConfig from threegpp_ai.operations import workspaces as workspace_ops except ModuleNotFoundError: if importlib.util.find_spec("threegpp_ai") is None: _TDOC_AI_AVAILABLE = False AiConfig = None # type: ignore[assignment] workspace_ops = None # Removed classes - keep as None so fixture type annotations don't NameError AiStorage = None # type: ignore[assignment] EmbeddingsManager = None # type: ignore[assignment] @pytest.fixture(autouse=True) Loading Loading @@ -152,45 +144,3 @@ def reset_ai_service_container() -> None: With the refactored design, no singleton exists, so this is a no-op. """ return # No cleanup needed - no singleton @pytest.fixture def ai_storage(test_cache_dir: Path) -> AiStorage: """Create a temporary AI storage for tests. This fixture creates an AiStorage instance using the new EmbeddingsManager factory with a temporary LanceDB directory for testing AI commands that require workspace and embedding storage. 
Args: test_cache_dir: Test cache directory from root conftest Returns: AiStorage instance with temporary storage """ lancedb_dir = test_cache_dir / "ai" / "lancedb" lancedb_dir.mkdir(parents=True, exist_ok=True) # Patch environment variable to point to test cache directory with patch.dict("os.environ", {"TDOC_CRAWLER_AI_CACHE_DIR": str(test_cache_dir / "ai")}): # Use new factory method embeddings_manager = EmbeddingsManager(AiConfig.from_env()) return embeddings_manager.storage @pytest.fixture def test_workspace(ai_storage: AiStorage) -> str: """Create a test workspace for AI tests. This fixture creates a default workspace in the test storage and returns its name for use in tests that require a workspace. Args: ai_storage: AI storage fixture Returns: Workspace name ("default") """ workspace_name = "default" workspace_ops.create_workspace(workspace_name, auto_build=False) return workspace_name
tests/ai/test_cli_aggregator.py +1 −1 Original line number Diff line number Diff line Loading @@ -14,7 +14,7 @@ def test_cli_help_lists_top_level_groups() -> None: assert "summarize" in result.stdout assert "convert" in result.stdout assert "workspace" in result.stdout assert "rag" in result.stdout assert "config" in result.stdout def test_workspace_help_lists_expected_subcommands() -> None: Loading
tests/ai/test_extraction_elements.pydeleted 100644 → 0 +0 −117 Original line number Diff line number Diff line """Tests for shared structured extraction payload behavior.""" from __future__ import annotations from pathlib import Path import pytest from threegpp_ai.lightrag import processor as processor_module from threegpp_ai.lightrag.processor import DocumentProcessor, ProcessingResultStatus from threegpp_ai.operations.extraction_result import ( ExtractedTableElement, build_structured_extraction_result, from_docling_result, ) def test_build_structured_extraction_result_defaults() -> None: """Builder should create a payload with empty optional collections.""" result = build_structured_extraction_result("hello") assert result.content == "hello" assert result.tables == [] assert result.figures == [] assert result.equations == [] assert result.metadata == {} assert result.table_count == 0 assert result.figure_count == 0 assert result.equation_count == 0 def test_from_docling_result_maps_tables_and_figures() -> None: """Converter should map available tables/pictures into structured elements.""" class FakeTableData: def __init__(self, cells: list[list[str]]) -> None: self.grid = cells self.page_number = 3 class FakeTable: def __init__(self, cells: list[list[str]]) -> None: self.data = FakeTableData(cells) self.markdown = "|a|b|\n|-|-|\n|c|d|" def export_to_markdown(self, *, doc: object = None) -> str: return self.markdown class FakeDoc: def __init__(self) -> None: self.tables = [FakeTable([["a", "b"], ["c", "d"]])] self.pictures = [] def export_to_markdown(self) -> str: return "body text\n\n$$ x = y + z $$" class FakeResult: def __init__(self) -> None: self.document = FakeDoc() self.metadata = {"source": "test"} mapped = from_docling_result(FakeResult()) assert mapped.content.startswith("body text") assert "$$ x = y + z $$" in mapped.content assert mapped.table_count == 1 assert mapped.figure_count == 0 assert mapped.equation_count == 1 assert "<!-- table:id=table_1" in 
mapped.content assert "<!-- equation:id=equation_1" in mapped.content table = mapped.tables[0] assert isinstance(table, ExtractedTableElement) assert table.element_id == "table_1" assert table.page_number == 3 assert table.row_count == 2 assert table.column_count == 2 @pytest.mark.asyncio async def test_processor_process_file_reports_structured_counts(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None: """Processor result should include structured extraction counters.""" async def fake_insert(_: str, **__: object) -> None: return None # Mock extract_document_structured in the processor module where it's imported # This prevents actual docling extraction monkeypatch.setattr( processor_module, "extract_document_structured", lambda *args, **kwargs: build_structured_extraction_result( content="x" * 120, tables=[ ExtractedTableElement( element_id="table_1", page_number=1, row_count=1, column_count=1, cells=[["v"]], ) ], figures=[], equations=[], ), ) processor = DocumentProcessor() monkeypatch.setattr(processor.rag, "insert", fake_insert) # Use a .md file to skip PDF conversion, testing only the processor logic file_path = tmp_path / "doc.md" file_path.write_text("placeholder content " * 10, encoding="utf-8") result = await processor.process_file(file_path, metadata={"document_id": "test-doc"}) assert result.status == ProcessingResultStatus.SUCCESS # chars_extracted includes metadata enrichment header, so we check it's > 120 assert result.chars_extracted > 120 assert result.table_count == 1 assert result.figure_count == 0 assert result.equation_count == 0
tests/ai/test_extraction_profiles.py 0 → 100644 +83 −0 Original line number Diff line number Diff line """Tests for extraction profile policy, config validation, and CLI exposure.""" from __future__ import annotations from pathlib import Path import pytest from pydantic import ValidationError from threegpp_ai.cli import app from threegpp_ai.config import AiConfig from threegpp_ai.operations import extraction as extraction_ops from threegpp_ai.operations.extraction_result import build_structured_extraction_result from typer.testing import CliRunner def test_config_accepts_profile_literals() -> None: for profile in ("default", "balanced", "optimum", "custom"): config = AiConfig(extraction_profile=profile) assert config.extraction_profile == profile def test_config_rejects_invalid_profile() -> None: with pytest.raises(ValidationError): AiConfig(extraction_profile="invalid") def test_workspace_process_help_exposes_profile_options() -> None: runner = CliRunner() result = runner.invoke(app, ["workspace", "process", "--help"]) assert result.exit_code == 0 assert "--profile" in result.stdout assert "--custom-ocr" in result.stdout assert "--custom-layout" in result.stdout assert "--custom-tables" in result.stdout assert "--custom-figures" in result.stdout assert "--custom-equations" in result.stdout assert "--custom-enrichment" in result.stdout def test_workspace_add_members_help_exposes_profile_options() -> None: runner = CliRunner() result = runner.invoke(app, ["workspace", "add-members", "--help"]) assert result.exit_code == 0 assert "--profile" in result.stdout assert "--custom-ocr" in result.stdout def test_auto_profile_selection_is_deterministic(tmp_path: Path) -> None: file_path = tmp_path / "doc.md" file_path.write_text("x" * 1024, encoding="utf-8") selected_a, settings_a = extraction_ops.resolve_extraction_policy(file_path) selected_b, settings_b = extraction_ops.resolve_extraction_policy(file_path) assert selected_a == selected_b assert settings_a == settings_b 
def test_extraction_metadata_includes_profile_and_effective_settings(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None:
    """Extraction with the "custom" profile reports the profile, its toggles, and a cache hit.

    Both cache helpers on ``extraction_ops`` are replaced with stubs, so the
    result is built from a canned "cached" payload.
    """

    def _stub_cache_check(*_args: object, **_kwargs: object) -> tuple[bool, str]:
        # NOTE(review): presumably (hit flag, cache key/path) - confirm against extraction_ops.
        return (True, "cached")

    def _stub_cached_artifacts(*_args: object, **_kwargs: object):
        return build_structured_extraction_result("cached")

    source_file = tmp_path / "doc.md"
    source_file.write_text("content", encoding="utf-8")

    monkeypatch.setattr(extraction_ops, "_check_cached_extraction", _stub_cache_check)
    monkeypatch.setattr(extraction_ops, "read_cached_artifacts", _stub_cached_artifacts)

    outcome = extraction_ops.extract_document_structured(
        file_path=source_file,
        profile="custom",
        custom_extract_tables=False,
        custom_extract_figures=False,
        custom_extract_equations=True,
        custom_extract_enrichment=False,
    )

    assert outcome.metadata["extraction_profile"] == "custom"

    effective = outcome.metadata["effective_extraction_settings"]
    # Each toggle must come back as the exact bool passed in above.
    expected_toggles = (
        ("tables", False),
        ("figures", False),
        ("equations", True),
        ("enrichment", False),
    )
    for toggle, wanted in expected_toggles:
        assert effective[toggle] is wanted

    assert outcome.metadata["cache_hit"] is True
tests/ai/test_operations_metrics.py +8 −9 Original line number Diff line number Diff line Loading @@ -2,7 +2,6 @@ from __future__ import annotations import asyncio from pathlib import Path from types import SimpleNamespace Loading Loading @@ -69,10 +68,10 @@ def test_convert_tdoc_to_markdown_records_conversion_metric(monkeypatch: pytest. mock_extract, ) output = asyncio.run(convert_ops.convert_tdoc_to_markdown("S4-260001", force=True)) output = convert_ops.convert_tdoc_to_markdown("S4-260001", force=True) assert output == "# markdown" conversion_metrics = tracker.by_type(MetricType.CONVERSION) conversion_metrics = [m for m in tracker.metrics if m.metric_type is MetricType.CONVERSION] assert len(conversion_metrics) == 1 assert conversion_metrics[0].success is True Loading Loading @@ -108,11 +107,11 @@ def test_convert_tdoc_to_markdown_writes_table_sidecar(monkeypatch: pytest.Monke mock_extract, ) result = asyncio.run(convert_ops.convert_tdoc_to_markdown("S4-260001", force=True)) result = convert_ops.convert_tdoc_to_markdown("S4-260001", force=True) # Verify conversion succeeded and returned content assert "# markdown" in result conversion_metrics = tracker.by_type(MetricType.CONVERSION) conversion_metrics = [m for m in tracker.metrics if m.metric_type is MetricType.CONVERSION] assert len(conversion_metrics) == 1 assert conversion_metrics[0].success is True Loading @@ -129,14 +128,14 @@ def test_summarize_tdoc_records_summarization_metric(monkeypatch: pytest.MonkeyP ) monkeypatch.setattr(summarize_ops, "_get_llm_client", _DummyClient) monkeypatch.setattr( summarize_ops.AiConfig, "from_env", staticmethod(lambda: SimpleNamespace(llm_model="test-model")), summarize_ops, "AiConfig", lambda: SimpleNamespace(llm_model="test-model"), ) result = summarize_ops.summarize_tdoc("S4-260001") assert result.word_count > 0 summary_metrics = tracker.by_type(MetricType.SUMMARIZATION) summary_metrics = [m for m in tracker.metrics if m.metric_type is MetricType.SUMMARIZATION] assert 
len(summary_metrics) == 1 assert summary_metrics[0].success is True