Commit e339010b authored by Jan Reimes's avatar Jan Reimes
Browse files

chore(tests): remove outdated and redundant test files

* Delete test_ai_storage_boundary.py as its functionality is covered elsewhere.
* Remove test_ai_summarization.py due to outdated tests and redundancy.
* Eliminate test_ai_workspace_contract.py as the legacy pipeline is removed.
* Clean up test operations by removing unnecessary module loading in test_ai_workspaces.py.
parent 5732a721
Loading
Loading
Loading
Loading

tests/.gitignore

0 → 100644
+24 −0
Original line number Diff line number Diff line
.ai

# AI test data (binary files that are downloaded on-demand)
*.docx
*.xlsx
*.pptx
*.pdf
*.zip

# AI test data directories
data/
!data/README.md

# Specific AI test data files
26253-j10/
26260-j10/
S4-251003/
S4-251971/
S4-260001/
S4-260002/
S4-260003/

# But keep the README
!data/README.md
 No newline at end of file

tests/ai/test_ai_cli.py

deleted100644 → 0
+0 −194
Original line number Diff line number Diff line
"""Tests for AI CLI commands."""

from __future__ import annotations

import json
from pathlib import Path
from unittest.mock import patch

import pytest
import tdoc_crawler.cli.ai as ai_cli
from typer.testing import CliRunner

from tdoc_crawler.cli.app import app


class TestAiCli:
    """Exercise the top-level AI CLI commands via the Typer runner."""

    def test_help_lists_all_subcommands(self, runner: CliRunner) -> None:
        """`ai --help` advertises every expected subcommand."""
        outcome = runner.invoke(app, ["ai", "--help"])
        assert outcome.exit_code == 0
        for name in ("summarize", "convert", "workspace"):
            assert name in outcome.output

    def test_summarize_command_exists(self, runner: CliRunner) -> None:
        """The summarize subcommand responds to --help."""
        assert runner.invoke(app, ["ai", "summarize", "--help"]).exit_code == 0

    def test_workspace_command_group_exists(self, runner: CliRunner) -> None:
        """US1 red: the workspace command group responds to --help."""
        assert runner.invoke(app, ["ai", "workspace", "--help"]).exit_code == 0

    def test_workspace_create_command_exists(self, runner: CliRunner) -> None:
        """US1 red: workspace create responds to --help."""
        assert runner.invoke(app, ["ai", "workspace", "create", "--help"]).exit_code == 0

    def test_workspace_create_auto_build_flag(self, runner: CliRunner, tmp_path: Path) -> None:
        """T005 [US2]: CLI integration test for --auto-build flag."""
        outcome = runner.invoke(
            app,
            ["ai", "workspace", "create", "test-auto-ws", "--auto-build", "--json"],
        )
        assert outcome.exit_code == 0, f"CLI command failed: {outcome.output}"
        body = json.loads(outcome.output)
        assert "auto_build" in body
        assert body["auto_build"] is True

    def test_workspace_add_members_command_exists(self, runner: CliRunner) -> None:
        """US1 red: workspace member registration responds to --help."""
        assert runner.invoke(app, ["ai", "workspace", "add-members", "--help"]).exit_code == 0

    def test_ai_help_excludes_removed_commands(self, runner: CliRunner) -> None:
        """Removed legacy commands no longer appear in `ai --help`."""
        outcome = runner.invoke(app, ["ai", "--help"])
        assert outcome.exit_code == 0
        for removed in ("process", "status", "query", "graph"):
            assert removed not in outcome.output


class TestAiModuleExports:
    """Tests that AI CLI module exports required functions."""

    # NOTE(review): this class previously contained an exact duplicate of
    # TestAiCli.test_workspace_create_auto_build_flag (with a dead
    # `tmp_path / "lancedb"` expression), and the class docstring was
    # stranded *below* that method, turning it into a no-op string
    # statement. The duplicate is removed — the identical test still runs
    # on TestAiCli — and the docstring moved to its proper position.

    def test_ai_cli_exported(self) -> None:
        """Verify the ai CLI module imports successfully and is not None."""
        assert ai_cli is not None


class TestCliRedesign:
    """Tests for CLI redesign (US4)."""

    @pytest.fixture
    def runner(self) -> CliRunner:
        """Create a CLI runner for testing."""
        return CliRunner()

    def test_summarize_command_exists(self, runner: CliRunner) -> None:
        """T006 [US4]: ai summarize command exists with --words, --format options.

        Expected: PASS - command exists.
        """
        with patch("tdoc_crawler.cli.ai.summarize_document") as mock_summarize:
            # Lightweight stand-in for the real summarize result object so the
            # CLI can render output without touching the LLM layer.
            mock_summarize.return_value = type(
                "SummarizeResult",
                (),
                {
                    "summary": "Test summary",
                    "keywords": ["test"],
                    "metadata": {},
                    "to_markdown": lambda self: "# Test",
                    "to_json": lambda self: '{"summary": "test"}',
                    "to_yaml": lambda self: "summary: test",
                },
            )()
            result = runner.invoke(
                app,
                ["ai", "summarize", "SP-123456", "--words", "200", "--format", "markdown"],
            )
            # Should succeed and produce summary with keywords, metadata
            assert result.exit_code == 0, f"summarize command failed: {result.output}"
            assert "summary" in result.output.lower() or "keyword" in result.output.lower()

    def test_summarize_fetches_remote(self, runner: CliRunner) -> None:
        """T007 [US4]: ai summarize fetches TDoc remotely if not local."""
        result = runner.invoke(
            app,
            ["ai", "summarize", "NONEXISTENT-TDOC", "--words", "150"],
        )
        # Should attempt to fetch remotely (may fail at fetch, but command should exist)
        assert result.exit_code in (0, 1), f"summarize command structure incorrect: {result.output}"

    @pytest.mark.skip(reason="Requires mocking LLM layer - tested in test_summarize_module.py")
    def test_summarize_output_formats(self, runner: CliRunner) -> None:
        """T008 [US4]: ai summarize supports markdown, json, yaml formats.

        Note: This integration test requires full LLM mocking chain.
        Unit tests in test_summarize_module.py cover the functionality.
        The former invocation body (json/yaml runs with a stale second
        docstring) was unreachable dead code after a leading ``pass`` and
        has been removed.
        """

    def test_convert_command_exists(self, runner: CliRunner) -> None:
        """T009 [US4]: ai convert command exists and shows help.

        Expected: PASS - command exists.
        """
        result = runner.invoke(
            app,
            ["ai", "convert", "--help"],
        )
        assert result.exit_code == 0, f"convert --help failed: {result.output}"
        assert "Convert a single TDoc to markdown format" in result.output

    def test_removed_commands_unavailable(self, runner: CliRunner) -> None:
        """T012 [US4]: removed ai process/status/graph commands are unavailable."""
        result_process = runner.invoke(app, ["ai", "process", "--help"])
        assert result_process.exit_code != 0, "ai process should be removed"

        result_status = runner.invoke(app, ["ai", "status", "--help"])
        assert result_status.exit_code != 0, "ai status should be removed"

        result_graph = runner.invoke(app, ["ai", "graph", "--help"])
        assert result_graph.exit_code != 0, "ai graph should be removed"

tests/ai/test_ai_config.py

deleted100644 → 0
+0 −123
Original line number Diff line number Diff line
"""Tests for AI configuration model and validation."""

from __future__ import annotations

import re
from pathlib import Path

import pytest
from threegpp_ai.config import AiConfig
from threegpp_ai.models import DocumentSummary

from tdoc_crawler.config import CacheManager


def test_from_env_reads_supported_variables(monkeypatch: pytest.MonkeyPatch) -> None:
    """AiConfig.from_env reads TDC_AI_* variables."""
    env_values = {
        "TDC_AI_LLM_MODEL": "ollama/llama3.2",
        "TDC_AI_EMBEDDING_MODEL": "huggingface/BAAI/bge-small-en-v1.5",
        "TDC_AI_EMBEDDING_BACKEND": "openvino",
        "TDC_AI_MAX_CHUNK_SIZE": "1200",
        "TDC_AI_CHUNK_OVERLAP": "120",
        "TDC_AI_PARALLELISM": "6",
    }
    for key, value in env_values.items():
        monkeypatch.setenv(key, value)

    CacheManager(name="default").register(force=True)
    config = AiConfig.from_env(cache_manager_name="default")

    # String-valued settings come through verbatim; numeric ones are coerced.
    assert config.llm_model == env_values["TDC_AI_LLM_MODEL"]
    assert config.embedding_model == env_values["TDC_AI_EMBEDDING_MODEL"]
    assert config.embedding_backend == env_values["TDC_AI_EMBEDDING_BACKEND"]
    assert config.max_chunk_size == 1200
    assert config.chunk_overlap == 120
    assert config.parallelism == 6


def test_model_identifier_requires_provider_prefix() -> None:
    """Model identifiers must follow <provider>/<model_name>."""
    # Identifiers missing the provider segment are rejected for both fields.
    for bad_kwargs in ({"llm_model": "llama3.2"}, {"embedding_model": "bge-small-en-v1.5"}):
        with pytest.raises(ValueError, match="<provider>/<model_name>"):
            AiConfig(**bad_kwargs)


def test_model_identifier_allows_nested_model_paths() -> None:
    """The model-name segment may itself contain additional slashes."""
    llm = "openai/gpt-4o-mini"
    embedding = "huggingface/BAAI/bge-small-en-v1.5"

    config = AiConfig(llm_model=llm, embedding_model=embedding)

    assert (config.llm_model, config.embedding_model) == (llm, embedding)


def test_invalid_provider_is_rejected() -> None:
    """An unknown provider segment raises a validation error."""
    bad_identifier = "not-a-provider/gpt-x"
    with pytest.raises(ValueError, match="provider"):
        AiConfig(llm_model=bad_identifier)


def test_embedding_backend_defaults_to_torch() -> None:
    """When unset, the embedding backend falls back to torch."""
    assert AiConfig(llm_model="openai/gpt-4o-mini").embedding_backend == "torch"


def test_embedding_backend_rejects_invalid_values() -> None:
    """A backend outside the supported set is rejected."""
    kwargs = {"embedding_backend": "cuda", "llm_model": "openai/gpt-4o-mini"}
    with pytest.raises(ValueError, match="embedding_backend"):
        AiConfig(**kwargs)


def test_default_store_path_resolves_under_cache_dir(tmp_path: Path) -> None:
    """Default AI store path resolves to <cache_dir>/.ai/<embedding_model>."""
    CacheManager(root_path=tmp_path, name="test-ai-config").register(force=True)
    cache_dir = str(AiConfig(cache_manager_name="test-ai-config").ai_cache_dir)

    # The resolved path must sit under the hidden .ai directory and carry an
    # embedding-model-specific subdirectory.
    assert ".ai" in cache_dir
    assert any(marker in cache_dir for marker in ("sentence-transformers", "all-MiniLM-L6-v2"))


def test_no_hardcoded_models_in_ai_package() -> None:
    """T001 [US3]: Scan ai/ package for hardcoded model identifiers.

    Assert zero matches for model patterns (gpt-4o, llama3, ollama/llama,
    gpt-4o-mini) outside of config.py where defaults are defined.
    """
    ai_package = Path(__file__).resolve().parents[2] / "src" / "tdoc_crawler" / "ai"
    # Match any quoted literal that starts with one of the model prefixes.
    # The previous pattern (`gpt-4o[^"\']` etc.) required an extra character
    # after the prefix, so an exact "gpt-4o" literal slipped through;
    # "gpt-4o-mini" is covered here by the gpt-4o prefix.
    hardcoded_patterns = re.compile(
        r'["\'](?:gpt-4o|llama3|ollama/llama)[^"\']*["\']',
        re.IGNORECASE,
    )
    config_file = ai_package / "config.py"

    violations: list[str] = []
    for py_file in ai_package.rglob("*.py"):
        if py_file == config_file:
            continue  # Skip config.py where defaults are defined

        content = py_file.read_text(encoding="utf-8")
        matches = hardcoded_patterns.findall(content)
        if matches:
            violations.append(f"{py_file.relative_to(ai_package)}: {matches}")

    # Join outside the f-string: a backslash inside an f-string expression is
    # a SyntaxError before Python 3.12.
    details = "\n  ".join(violations)
    assert not violations, f"Found hardcoded model identifiers in: {details}"


def test_model_config_propagation_to_litellm_client() -> None:
    """T002 [US3]: Verify LiteLLMClient uses AiConfig().llm_model when no explicit model passed.

    Guards the fix where DocumentSummary.llm_model must default to
    AiConfig().llm_model via default_factory instead of a hardcoded value.
    """
    # AiConfig requires a registered cache manager before it can be built.
    CacheManager(name="test-config-prop").register(force=True)

    expected = AiConfig(cache_manager_name="test-config-prop").llm_model
    actual = DocumentSummary.model_fields["llm_model"].default_factory()

    assert actual == expected, f"DocumentSummary.llm_model default ({actual}) does not match AiConfig().llm_model ({expected})"

tests/ai/test_ai_extraction.py

deleted100644 → 0
+0 −127
Original line number Diff line number Diff line
"""Tests for DOCX-to-Markdown extraction."""

from __future__ import annotations

import tempfile
from pathlib import Path
from unittest.mock import MagicMock

import pytest
from threegpp_ai.models import ExtractionError
from threegpp_ai.operations import extract
from threegpp_ai.operations.extract import (
    compute_source_hash,
    extract_docx_to_markdown,
    extract_from_folder,
)


class TestExtractDocxToMarkdown:
    """Tests for extract_docx_to_markdown function."""

    def test_single_docx_to_markdown(self, mock_storage: MagicMock, test_data_dir: Path) -> None:
        """Test basic DOCX to Markdown conversion."""
        docx_path = test_data_dir / "S4-251003" / "S4-251003 - On nominal transmission levels in ATIAS.docx"
        # Skip (rather than silently pass) when the on-demand fixture is
        # absent, so missing test data is visible in the report — consistent
        # with test_extraction_writes_markdown_artifact below.
        if not docx_path.exists():
            pytest.skip("S4-251003 fixture not available")

        result = extract_docx_to_markdown("S4-251003", docx_path, mock_storage)
        assert isinstance(result, str)
        assert len(result) > 0

    def test_table_preservation(self, mock_storage: MagicMock, test_data_dir: Path) -> None:
        """Test that tables are preserved in Markdown output."""
        docx_path = test_data_dir / "26260-j10" / "26260-j10.docx"
        if not docx_path.exists():
            pytest.skip("26260-j10 fixture not available")

        result = extract_docx_to_markdown("26260-J10", docx_path, mock_storage)
        # Tables should be represented in Markdown
        assert "|" in result or "table" in result.lower()

    def test_heading_hierarchy(self, mock_storage: MagicMock, test_data_dir: Path) -> None:
        """Test heading hierarchy is preserved."""
        docx_dir = test_data_dir / "S4-251003"
        # Find any docx file in the fixture folder
        docx_files = list(docx_dir.glob("*.docx")) if docx_dir.exists() else []
        if not docx_files:
            pytest.skip("S4-251003 fixture not available")

        result = extract_docx_to_markdown("S4-251003", docx_files[0], mock_storage)
        # Should contain markdown headings
        assert "#" in result

    def test_3gpp_section_numbering(self, mock_storage: MagicMock, test_data_dir: Path) -> None:
        """Test 3GPP section numbering is handled."""
        pytest.skip("Requires test data with 3GPP section numbering")

    def test_extraction_writes_markdown_artifact(self, mock_storage: MagicMock, test_data_dir: Path) -> None:
        """Extraction should persist a Markdown artifact for the processed TDoc."""
        docx_path = test_data_dir / "S4-251003" / "S4-251003 - On nominal transmission levels in ATIAS.docx"
        if not docx_path.exists():
            pytest.skip("S4-251003 fixture not available")

        markdown = extract_docx_to_markdown("S4-251003", docx_path, mock_storage)
        assert markdown

        artifact_path = docx_path.parent / ".ai" / "S4-251003.md"
        assert artifact_path.exists()

    def test_idempotent_rerun_skips_unchanged(self, mock_storage: MagicMock) -> None:
        """Test that re-run with unchanged file skips processing."""
        with tempfile.NamedTemporaryFile(suffix=".docx", delete=False) as f:
            # Create empty temp file
            f.write(b"PK\x03\x04")  # Minimal DOCX header
            temp_path = Path(f.name)

        try:
            # Hashing the same unchanged file twice must be deterministic,
            # which is what lets re-runs skip work.
            hash1 = compute_source_hash(temp_path)
            hash2 = compute_source_hash(temp_path)
            assert hash1 == hash2  # Same content = same hash
        finally:
            temp_path.unlink()

    def test_corrupt_file_raises_extraction_error(self, mock_storage: MagicMock) -> None:
        """Test that corrupt file raises ExtractionError."""
        # Create corrupt DOCX
        with tempfile.NamedTemporaryFile(suffix=".docx", delete=False) as f:
            f.write(b"not a valid docx")
            corrupt_path = Path(f.name)

        try:
            with pytest.raises(ExtractionError):
                extract_docx_to_markdown("TEST", corrupt_path, mock_storage)
        finally:
            corrupt_path.unlink()

    def test_no_docx_folder_raises_extraction_error(self, mock_storage: MagicMock, tmp_path: Path) -> None:
        """Folders without DOCX should fail extraction for status/error tracking."""
        empty_folder = tmp_path / "empty"
        empty_folder.mkdir()

        with pytest.raises(ExtractionError):
            extract_from_folder("TEST", empty_folder, mock_storage)


class TestExtractModuleExports:
    """Tests that extract module exports required functions."""

    @staticmethod
    def _assert_callable_export(name: str) -> None:
        """Assert the extract module exposes a callable attribute *name*."""
        exported = getattr(extract, name, None)
        assert exported is not None
        assert callable(exported)

    def test_extract_docx_to_markdown_exported(self) -> None:
        """Verify extract_docx_to_markdown is exported."""
        self._assert_callable_export("extract_docx_to_markdown")

    def test_extract_from_folder_exported(self) -> None:
        """Verify extract_from_folder is exported."""
        self._assert_callable_export("extract_from_folder")

    def test_compute_source_hash_exported(self) -> None:
        """Verify compute_source_hash is exported."""
        self._assert_callable_export("compute_source_hash")


# Recorded TDD "red" checkpoint for US1/T021: the command run and the failure
# observed before the fix landed. Module-level note only; nothing imports it.
# NOTE(review): the captured traceback names 'tdoc_ai.models.ExtractionError'
# while this file imports from threegpp_ai — presumably an older package name
# captured verbatim; verify the note is still accurate before relying on it.
US1_T021_RED_CHECKPOINT = """
Command: uv run pytest tests/ai/test_ai_extraction.py -q

Observed failure:
- TestExtractDocxToMarkdown.test_no_docx_folder_raises_extraction_error
    Failed: DID NOT RAISE <class 'tdoc_ai.models.ExtractionError'>
"""

tests/ai/test_ai_graph.py

deleted100644 → 0
+0 −213

File deleted.

Preview size limit exceeded, changes collapsed.

Loading