Commit 100b8fbb authored by Jan Reimes
Browse files

Fix: Classify all file types matching STRUCTURED_PATTERNS and UNSTRUCTURED_PATTERNS

The _score_filename() function already handles all file types defined in
STRUCTURED_PATTERNS (.doc, .docx, .pdf) and
UNSTRUCTURED_PATTERNS (.ppt, .pptx, .xls, .xlsx, .txt, .csv),
but classify_document_files() was only searching for *.docx files.

Fixed by:
- Expanding glob patterns to match all supported file extensions
- Using separate globs for: .doc, .docx, .pdf, .ppt, .pptx, .xls, .xlsx, .txt, .csv
- Removing duplicates if a file matches multiple patterns
- Making the warning message more generic
- Updating the comment to remove the DOCX-specific reference

Renamed test test_non_docx_files_are_ignored() to
test_non_docx_files_are_classified() to reflect the new behavior,
where .pptx and .xlsx files are classified rather than ignored.
parent 349b5036
Loading
Loading
Loading
Loading
+13 −0
Original line number Diff line number Diff line
@@ -11,6 +11,19 @@
			"program": "${workspaceFolder}/scripts/demo.py",
			"console": "integratedTerminal",
		},
		{
			"name": "Debug: ai workspace process",
			"type": "debugpy",
			"request": "launch",
			"module": "tdoc_crawler",
			"console": "integratedTerminal",
			"justMyCode": false,
			"args": [
				"ai",
				"workspace",
				"process",
			]
		},
		{
			"name": "Debug: crawl-meetings (max. 5)",
			"type": "debugpy",
+15 −5
Original line number Diff line number Diff line
@@ -35,7 +35,7 @@ REVISION_PATTERNS = [
]

STRUCTURED_PATTERNS = [
    r"\.(docx?|pdf)$",  # Word and PDF are structured
    r"\.(docx?|doc?|pdf)$",  # Word and PDF are structured
    r"(?i)\breport\b",
    r"(?i)\bspecification\b",
    r"(?i)\bstandard\b",
@@ -46,7 +46,7 @@ STRUCTURED_PATTERNS = [
UNSTRUCTURED_PATTERNS = [
    r"\.(pptx?|ppt)$",  # PowerPoint
    r"\.(xlsx?|xls)$",  # Excel
    r"\.(txt|csv)$",
    r"\.(txt|csv)$",  # Misc. text data
]


@@ -158,10 +158,20 @@ def classify_document_files(
        logger.warning(f"Folder not found: {folder_path}")
        return []

    files = [file_path for file_path in folder_path.glob("*.docx") if file_path.is_file() and not file_path.name.startswith(".")]
    # Find all files matching structured and unstructured patterns
    # STRUCTURED: .doc, .docx, .pdf
    # UNSTRUCTURED: .ppt, .pptx, .xls, .xlsx, .txt, .csv
    extensions = ["*.doc", "*.docx", "*.pdf", "*.ppt", "*.pptx", "*.xls", "*.xlsx", "*.txt", "*.csv"]

    files = []
    for ext in extensions:
        files.extend([file_path for file_path in folder_path.glob(ext) if file_path.is_file() and not file_path.name.startswith(".")])

    # Remove duplicates (in case a file matches multiple patterns)
    files = list(dict.fromkeys(files))  # Preserves order, removes duplicates

    if not files:
        logger.warning(f"No DOCX files found in {folder_path} for document {document_id}")
        logger.warning(f"No document files found in {folder_path} for document {document_id}")
        return []

    if len(files) == 1:
@@ -179,7 +189,7 @@ def classify_document_files(
            )
        ]

    # Multiple DOCX files - need classification
    # Multiple files - need classification
    scores: dict[Path, float] = {}
    file_sizes: dict[Path, int] = {}
    for file in files:
+19 −3
Original line number Diff line number Diff line
@@ -65,13 +65,29 @@ class TestClassifyTdocFiles:
                assert classification.decisive_heuristic is not None
                assert len(classification.decisive_heuristic) > 0

    def test_non_docx_files_are_ignored(self, tmp_path: Path) -> None:
        """US2 classification should only consider DOCX candidates."""
    def test_non_docx_files_are_classified(self, tmp_path: Path) -> None:
        """Non-DOCX files (.pptx, .xlsx) should be classified."""
        (tmp_path / "agenda.xlsx").write_text("xlsx placeholder", encoding="utf-8")
        (tmp_path / "slides.pptx").write_text("pptx placeholder", encoding="utf-8")

        result = classify_document_files("S4-260999", tmp_path)
        assert result == []

        # Should find and classify both files
        assert len(result) == 2

        # Extract file paths for easier assertion
        result_paths = [classification.file_path for classification in result]
        assert "agenda.xlsx" in result_paths
        assert "slides.pptx" in result_paths

        # Exactly one of the two files should be selected as the main document;
        # the other is secondary. PPTX and XLSX have lower scores due to UNSTRUCTURED_PATTERNS
        main_docs = [c for c in result if c.is_main_document]
        assert len(main_docs) == 1

        # PPTX typically gets a lower score because it is unstructured;
        # here we only check that at least one classification has a positive confidence
        assert any(c for c in result if c.confidence > 0.0)


class TestClassifyModuleExports: