Fix: Classify all file types matching STRUCTURED_PATTERNS and UNSTRUCTURED_PATTERNS (100b8fbb) · Commits · Jan Reimes / 3gpp-crawler

.vscode/launch.json

+13 −0

Original line number	Diff line number	Diff line
		@@ -11,6 +11,19 @@
		"program": "${workspaceFolder}/scripts/demo.py",
		"console": "integratedTerminal",
		},
		{
		"name": "Debug: ai workspace process",
		"type": "debugpy",
		"request": "launch",
		"module": "tdoc_crawler",
		"console": "integratedTerminal",
		"justMyCode": false,
		"args": [
		"ai",
		"workspace",
		"process",
		]
		},
		{
		"name": "Debug: crawl-meetings (max. 5)",
		"type": "debugpy",

src/tdoc_crawler/ai/operations/classify.py

+15 −5

Original line number	Diff line number	Diff line
		@@ -35,7 +35,7 @@ REVISION_PATTERNS = [
		]

		STRUCTURED_PATTERNS = [
		r"\.(docx?\|pdf)$", # Word and PDF are structured
		r"\.(docx?\|doc?\|pdf)$", # Word and PDF are structured
		r"(?i)\breport\b",
		r"(?i)\bspecification\b",
		r"(?i)\bstandard\b",
		@@ -46,7 +46,7 @@ STRUCTURED_PATTERNS = [
		UNSTRUCTURED_PATTERNS = [
		r"\.(pptx?\|ppt)$", # PowerPoint
		r"\.(xlsx?\|xls)$", # Excel
		r"\.(txt\|csv)$",
		r"\.(txt\|csv)$", # Misc. text data
		]


		@@ -158,10 +158,20 @@ def classify_document_files(
		logger.warning(f"Folder not found: {folder_path}")
		return []

		files = [file_path for file_path in folder_path.glob("*.docx") if file_path.is_file() and not file_path.name.startswith(".")]
		# Find all files matching structured and unstructured patterns
		# STRUCTURED: .doc, .docx, .pdf
		# UNSTRUCTURED: .ppt, .pptx, .xls, .xlsx, .txt, .csv
		extensions = [".doc", ".docx", ".pdf", ".ppt", ".pptx", ".xls", ".xlsx", ".txt", "*.csv"]

		files = []
		for ext in extensions:
		files.extend([file_path for file_path in folder_path.glob(ext) if file_path.is_file() and not file_path.name.startswith(".")])

		# Remove duplicates (in case a file matches multiple patterns)
		files = list(dict.fromkeys(files)) # Preserves order, removes duplicates

		if not files:
		logger.warning(f"No DOCX files found in {folder_path} for document {document_id}")
		logger.warning(f"No document files found in {folder_path} for document {document_id}")
		return []

		if len(files) == 1:
		@@ -179,7 +189,7 @@ def classify_document_files(
		)
		]

		# Multiple DOCX files - need classification
		# Multiple files - need classification
		scores: dict[Path, float] = {}
		file_sizes: dict[Path, int] = {}
		for file in files:

tests/ai/test_ai_classification.py

+19 −3

Original line number	Diff line number	Diff line
		@@ -65,13 +65,29 @@ class TestClassifyTdocFiles:
		assert classification.decisive_heuristic is not None
		assert len(classification.decisive_heuristic) > 0

		def test_non_docx_files_are_ignored(self, tmp_path: Path) -> None:
		"""US2 classification should only consider DOCX candidates."""
		def test_non_docx_files_are_classified(self, tmp_path: Path) -> None:
		"""Non-DOCX files (.pptx, .xlsx) should be classified."""
		(tmp_path / "agenda.xlsx").write_text("xlsx placeholder", encoding="utf-8")
		(tmp_path / "slides.pptx").write_text("pptx placeholder", encoding="utf-8")

		result = classify_document_files("S4-260999", tmp_path)
		assert result == []

		# Should find and classify both files
		assert len(result) == 2

		# Extract file paths for easier assertion
		result_paths = [classification.file_path for classification in result]
		assert "agenda.xlsx" in result_paths
		assert "slides.pptx" in result_paths

		# Exactly one of the two files should be marked as the main document; the other is secondary
		# PPTX and XLSX have lower scores due to UNSTRUCTURED_PATTERNS
		main_docs = [c for c in result if c.is_main_document]
		assert len(main_docs) == 1

		# PPTX typically gets a lower score due to being unstructured
		# Check that at least one classification has a confidence above zero
		assert any(c for c in result if c.confidence > 0.0)


		class TestClassifyModuleExports: