Commit 1ad70f87 authored by Jan Reimes's avatar Jan Reimes
Browse files

refactor(ai): rename tdoc_id to document_id in tests and related functions

- Updated test files to replace 'tdoc_id' with 'document_id' for consistency.
- Modified function calls and assertions in test cases across multiple files.
- Ensured that all references to document identification are aligned with new naming conventions.
parent 7ddba15d
Loading
Loading
Loading
Loading
+12 −12
Original line number Diff line number Diff line
@@ -6,17 +6,17 @@ from pathlib import Path

from tdoc_crawler.ai.models import DocumentClassification
from tdoc_crawler.ai.operations import classify
from tdoc_crawler.ai.operations.classify import _score_filename, classify_tdoc_files
from tdoc_crawler.ai.operations.classify import _score_filename, classify_document_files


class TestClassifyTdocFiles:
    """Tests for classify_tdoc_files function."""
    """Tests for classify_document_files function."""

    def test_single_file_gets_confidence_one(self, test_data_dir: Path) -> None:
        """Test that a single file gets confidence 1.0."""
        tdoc_folder = test_data_dir / "S4-251003"
        if tdoc_folder.exists():
            result = classify_tdoc_files("S4-251003", tdoc_folder)
            result = classify_document_files("S4-251003", tdoc_folder)
            assert len(result) == 1
            assert result[0].confidence == 1.0
            assert result[0].is_main_document is True
@@ -26,7 +26,7 @@ class TestClassifyTdocFiles:
        # Use a folder with 2 files
        tdoc_folder = test_data_dir / "26260-j10"
        if tdoc_folder.exists() and len(list(tdoc_folder.glob("*"))) >= 2:
            result = classify_tdoc_files("26260-J10", tdoc_folder)
            result = classify_document_files("26260-J10", tdoc_folder)
            # Find the main document
            main = next((r for r in result if r.is_main_document), None)
            assert main is not None
@@ -51,7 +51,7 @@ class TestClassifyTdocFiles:
        # Test with various folders
        for tdoc_folder in test_data_dir.iterdir():
            if tdoc_folder.is_dir():
                result = classify_tdoc_files(tdoc_folder.name, tdoc_folder)
                result = classify_document_files(tdoc_folder.name, tdoc_folder)
                for classification in result:
                    assert 0.0 <= classification.confidence <= 1.0

@@ -60,7 +60,7 @@ class TestClassifyTdocFiles:
        tdoc_folder = test_data_dir / "S4-251003"

        if tdoc_folder.exists():
            result = classify_tdoc_files("S4-251003", tdoc_folder)
            result = classify_document_files("S4-251003", tdoc_folder)
            for classification in result:
                assert classification.decisive_heuristic is not None
                assert len(classification.decisive_heuristic) > 0
@@ -70,17 +70,17 @@ class TestClassifyTdocFiles:
        (tmp_path / "agenda.xlsx").write_text("xlsx placeholder", encoding="utf-8")
        (tmp_path / "slides.pptx").write_text("pptx placeholder", encoding="utf-8")

        result = classify_tdoc_files("S4-260999", tmp_path)
        result = classify_document_files("S4-260999", tmp_path)
        assert result == []


class TestClassifyModuleExports:
    """Tests that classify module exports required functions."""

    def test_classify_tdoc_files_exported(self) -> None:
        """Verify classify_tdoc_files is exported."""
        assert hasattr(classify, "classify_tdoc_files")
        assert callable(classify.classify_tdoc_files)
    def test_classify_document_files_exported(self) -> None:
        """Verify classify_document_files is exported."""
        assert hasattr(classify, "classify_document_files")
        assert callable(classify.classify_document_files)

    def test_document_classification_exported(self) -> None:
        """Verify DocumentClassification is exported."""
@@ -93,7 +93,7 @@ Command: uv run pytest tests/ai/test_ai_classification.py -q
Observed failure:
- TestClassifyTdocFiles.test_non_docx_files_are_ignored
    AssertionError: assert [DocumentClassification(...)] == []
    Cause: classify_tdoc_files currently includes non-DOCX files (e.g., .xlsx/.pptx)
    Cause: classify_document_files currently includes non-DOCX files (e.g., .xlsx/.pptx)
    instead of filtering classification candidates to DOCX only.

Checkpoint status: RED (1 failed, 8 passed)
+9 −9
Original line number Diff line number Diff line
@@ -74,8 +74,8 @@ class TestAiCli:
        assert result.exit_code == 0

    def test_process_delegates_to_library(self, runner: CliRunner, tmp_path: Path) -> None:
        """Test process delegates to process_tdoc/process_all."""
        with patch("tdoc_crawler.cli.ai.process_tdoc") as mock:
        """Test process delegates to process_document/process_all."""
        with patch("tdoc_crawler.cli.ai.process_document") as mock:
            mock.return_value = None
            result = runner.invoke(
                app,
@@ -100,15 +100,15 @@ class TestAiCli:
    def test_status_delegates_to_get_status(self, runner: CliRunner) -> None:
        """Test status delegates to get_status."""
        with patch("tdoc_crawler.cli.ai.get_status") as mock:
            mock.return_value = ProcessingStatus(tdoc_id="SP-123456", current_stage=PipelineStage.COMPLETED)
            mock.return_value = ProcessingStatus(document_id="SP-123456", current_stage=PipelineStage.COMPLETED)
            result = runner.invoke(app, ["ai", "status", "--tdoc-id", "SP-123456"])
            assert result.exit_code == 0

    def test_status_without_tdoc_id_lists_all(self, runner: CliRunner) -> None:
        """Status command supports listing all statuses without --tdoc-id."""
        statuses = [
            ProcessingStatus(tdoc_id="SP-123456", current_stage=PipelineStage.COMPLETED),
            ProcessingStatus(tdoc_id="SP-123457", current_stage=PipelineStage.EXTRACTING),
            ProcessingStatus(document_id="SP-123456", current_stage=PipelineStage.COMPLETED),
            ProcessingStatus(document_id="SP-123457", current_stage=PipelineStage.EXTRACTING),
        ]
        with patch("tdoc_crawler.cli.ai.get_status") as mock_get_status:
            mock_get_status.return_value = statuses
@@ -121,7 +121,7 @@ class TestAiCli:
    def test_json_flag_produces_valid_json(self, runner: CliRunner) -> None:
        """Test --json flag produces valid JSON output."""
        with patch("tdoc_crawler.cli.ai.get_status") as mock:
            mock.return_value = ProcessingStatus(tdoc_id="SP-123456", current_stage=PipelineStage.COMPLETED)
            mock.return_value = ProcessingStatus(document_id="SP-123456", current_stage=PipelineStage.COMPLETED)
            result = runner.invoke(app, ["ai", "status", "--tdoc-id", "SP-123456", "--json"])
            assert result.exit_code == 0
            # Should be valid JSON
@@ -157,7 +157,7 @@ class TestAiCli:
        assert result.exit_code in (0, 1)

    def test_process_without_workspace_uses_default(self, runner: CliRunner, tmp_path: Path) -> None:
        with patch("tdoc_crawler.cli.ai.process_tdoc") as mock:
        with patch("tdoc_crawler.cli.ai.process_document") as mock:
            mock.return_value = None
            result = runner.invoke(app, ["ai", "process", "--tdoc-id", "SP-123456", "--checkout-path", str(tmp_path / "checkout")])
            assert result.exit_code == 0
@@ -166,7 +166,7 @@ class TestAiCli:

    def test_status_without_workspace_uses_default(self, runner: CliRunner) -> None:
        with patch("tdoc_crawler.cli.ai.get_status") as mock:
            mock.return_value = ProcessingStatus(tdoc_id="SP-123456", current_stage=PipelineStage.COMPLETED)
            mock.return_value = ProcessingStatus(document_id="SP-123456", current_stage=PipelineStage.COMPLETED)
            result = runner.invoke(app, ["ai", "status", "--tdoc-id", "SP-123456"])
            assert result.exit_code == 0
            kwargs = mock.call_args.kwargs
@@ -181,7 +181,7 @@ class TestAiCli:
            assert kwargs.get("workspace") is None or kwargs.get("workspace") == "default"

    def test_process_with_explicit_workspace_uses_it(self, runner: CliRunner, tmp_path: Path) -> None:
        with patch("tdoc_crawler.cli.ai.process_tdoc") as mock:
        with patch("tdoc_crawler.cli.ai.process_document") as mock:
            mock.return_value = None
            result = runner.invoke(app, ["ai", "process", "--tdoc-id", "SP-123456", "--workspace", "myws", "--checkout-path", str(tmp_path / "checkout")])
            assert result.exit_code == 0
+62 −2
Original line number Diff line number Diff line
@@ -65,7 +65,7 @@ class TestGraph:

        # Simulate calling build_graph which should use storage
        result_nodes, result_edges = graph.build_graph(
            tdoc_id="S4-250002", markdown=markdown_content, meeting_id="SA4#123", storage=mock_storage, workspace="test_ws"
            document_id="S4-250002", markdown=markdown_content, meeting_id="SA4#123", storage=mock_storage, workspace="test_ws"
        )

        # Verify query_graph was called to fetch existing data (incremental check)
@@ -137,7 +137,7 @@ class TestGraph:

        # Build graph - should extract references and create edges
        result_nodes, result_edges = graph.build_graph(
            tdoc_id="S4-252000", markdown=markdown_content, meeting_id="SA4#123", storage=mock_storage, workspace="test_ws"
            document_id="S4-252000", markdown=markdown_content, meeting_id="SA4#123", storage=mock_storage, workspace="test_ws"
        )

        # Verify REFERENCES edges were created
@@ -162,3 +162,63 @@ class TestGraphModuleExports:
        """Verify query_graph is exported."""
        assert hasattr(graph, "query_graph")
        assert callable(graph.query_graph)


class TestEntityExtractors:
    """Test entity extraction functions for graph-RAG."""

    def test_extract_company_entities(self) -> None:
        """Known vendor names should be picked out of free-form text."""
        from tdoc_crawler.ai.operations.graph import extract_company_entities

        sample = "This document was submitted by Huawei and Nokia for discussion at 3GPP."
        found = extract_company_entities(sample)

        # Both mentioned vendors must be present, and nothing prevents extras.
        assert "Huawei" in found
        assert "Nokia" in found
        assert len(found) >= 2

    def test_extract_work_items(self) -> None:
        """Work items are recognized in both 'WI-NNNNN' and prose forms."""
        from tdoc_crawler.ai.operations.graph import extract_work_items

        sample = "This relates to WI-12345 and Work Item 67890 for 5G enhancement."
        work_items = extract_work_items(sample)

        # The prose form "Work Item 67890" is expected to normalize to WI-67890.
        assert "WI-12345" in work_items
        assert "WI-67890" in work_items

    def test_extract_change_requests(self) -> None:
        """Change requests are recognized in both 'CR-NNNNNN' and prose forms."""
        from tdoc_crawler.ai.operations.graph import extract_change_requests

        sample = "This CR-001234 and Change Request 5678 propose modifications to the spec."
        change_requests = extract_change_requests(sample)

        # The prose form "Change Request 5678" is expected to normalize to CR-5678.
        assert "CR-001234" in change_requests
        assert "CR-5678" in change_requests

    def test_extract_all_entity_types(self) -> None:
        """All three extractors operate independently on the same text."""
        from tdoc_crawler.ai.operations.graph import extract_change_requests
        from tdoc_crawler.ai.operations.graph import extract_company_entities
        from tdoc_crawler.ai.operations.graph import extract_work_items

        sample = """
        Samsung proposes WI-99999 to address CR-11111.
        This work item relates to change request CP-230001.
        Ericsson and Qualcomm support this proposal.
        """

        company_names = extract_company_entities(sample)
        work_items = extract_work_items(sample)
        change_requests = extract_change_requests(sample)

        assert "Samsung" in company_names
        assert "Ericsson" in company_names
        assert "Qualcomm" in company_names
        assert "WI-99999" in work_items
        assert "CR-11111" in change_requests
        # The CP-prefixed document id may normalize either way; accept both.
        assert "CR-CP-230001" in change_requests or "CR-230001" in change_requests
+10 −10
Original line number Diff line number Diff line
@@ -32,7 +32,7 @@ class TestRunPipeline:
    def test_batch_processing(self, mock_run_pipeline: MagicMock, mock_ai_storage: MagicMock, mock_storage: MagicMock, test_data_dir: Path) -> None:
        """Test processing multiple TDocs."""
        mock_ai_storage.return_value = mock_storage
        mock_run_pipeline.return_value = ProcessingStatus(tdoc_id="foo")
        mock_run_pipeline.return_value = ProcessingStatus(document_id="foo")

        tdoc_folders = [d for d in test_data_dir.iterdir() if d.is_dir()][:3]

@@ -43,7 +43,7 @@ class TestRunPipeline:
    def test_resume_from_interrupted_stage(self, mock_storage: MagicMock, test_data_dir: Path) -> None:
        """Test resume from interrupted stage works."""
        # Create a status with partial completion
        mock_status = ProcessingStatus(tdoc_id="S4-251003")
        mock_status = ProcessingStatus(document_id="S4-251003")
        mock_status.classified_at = utc_now()
        mock_storage.get_status.return_value = mock_status

@@ -58,7 +58,7 @@ class TestRunPipeline:
        """Test incremental processing only processes new items."""
        mock_ai_storage.return_value = mock_storage
        # Status shows already completed
        mock_status = ProcessingStatus(tdoc_id="S4-251003")
        mock_status = ProcessingStatus(document_id="S4-251003")
        mock_status.current_stage = PipelineStage.COMPLETED
        mock_storage.get_status.return_value = mock_status

@@ -98,7 +98,7 @@ class TestRunPipeline:
            # Note: "26260-j10" is NOT in the workspace
        ]

        mock_run_pipeline.return_value = ProcessingStatus(tdoc_id="S4-251003", current_stage=PipelineStage.COMPLETED)
        mock_run_pipeline.return_value = ProcessingStatus(document_id="S4-251003", current_stage=PipelineStage.COMPLETED)

        tdoc_ids = ["S4-251003", "26260-j10"]
        # Only S4-251003 should be processed because of the workspace scope
@@ -138,8 +138,8 @@ class TestProcessTdocApi:

    def test_process_all_new_only_filters_completed_statuses(self, monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None:
        """new_only mode should return only non-completed items."""
        completed = ProcessingStatus(tdoc_id="S4-251003", current_stage=PipelineStage.COMPLETED)
        pending = ProcessingStatus(tdoc_id="S4-260001", current_stage=PipelineStage.PENDING)
        completed = ProcessingStatus(document_id="S4-251003", current_stage=PipelineStage.COMPLETED)
        pending = ProcessingStatus(document_id="S4-260001", current_stage=PipelineStage.PENDING)

        def fake_process_all(
            tdoc_ids: list[str],
@@ -150,19 +150,19 @@ class TestProcessTdocApi:
            workspace: str | None = None,
        ) -> dict[str, ProcessingStatus]:
            return {
                completed.tdoc_id: completed,
                pending.tdoc_id: pending,
                completed.document_id: completed,
                pending.document_id: pending,
            }

        monkeypatch.setattr("tdoc_crawler.ai._pipeline_process_all_impl", fake_process_all)

        result = process_all_api(
            new_only=True,
            tdoc_ids=[completed.tdoc_id, pending.tdoc_id],
            tdoc_ids=[completed.document_id, pending.document_id],
            checkout_base=tmp_path,
        )

        assert [status.tdoc_id for status in result] == [pending.tdoc_id]
        assert [status.document_id for status in result] == [pending.document_id]


class TestPipelineModuleExports:
+2 −2
Original line number Diff line number Diff line
@@ -58,12 +58,12 @@ def test_storage_artifact_isolation_by_workspace(tmp_path: Path) -> None:
    storage = AiStorage(tmp_path / "lancedb", embedding_dimension=3)

    # Save status for workspace A
    status_a = ProcessingStatus(tdoc_id="DOC-1", current_stage=PipelineStage.COMPLETED)
    status_a = ProcessingStatus(document_id="DOC-1", current_stage=PipelineStage.COMPLETED)
    storage.save_status(status_a, workspace="ws_a")

    # Save chunks for workspace A
    chunk_a = DocumentChunk(
        tdoc_id="DOC-1",
        document_id="DOC-1",
        chunk_id="C1",
        chunk_index=0,
        text="text a",
Loading