Commit 349b5036 authored by Jan Reimes's avatar Jan Reimes
Browse files

Add support for Office 2003 .doc files using doc2txt

Added extraction support for legacy Office 2003 .doc files using the
doc2txt package (which wraps the antiword CLI tool).

Changes:
- Added extract_doc_to_markdown() function for .doc file extraction
- Updated extract_from_folder() to detect and route both .doc and .docx files
- .doc files use doc2txt/antiword, .docx files use kreuzberg
- Both extraction paths support change detection and artifact caching
- Added doc2txt import (package already in dependencies)
- Updated module docstring and exports

The implementation follows the same pattern as DOCX extraction:
- Hash-based idempotency (skip if content unchanged)
- Artifact caching in .ai folder
- Status tracking in LanceDB
parent 6275b599
Loading
Loading
Loading
Loading
+1 −0
Original line number Diff line number Diff line
@@ -39,6 +39,7 @@ dependencies = [
    "litellm>=1.81.15",
    "sentence-transformers>=2.7.0",
    "tokenizers>=0.22.2",
    "doc2txt>=1.0.8",
]

[project.urls]
+111 −12
Original line number Diff line number Diff line
"""DOCX-to-Markdown extraction using Kreuzberg."""
"""Document-to-Markdown extraction using Kreuzberg (DOCX) and doc2txt (DOC)."""

from __future__ import annotations

@@ -8,6 +8,7 @@ import re
import zipfile
from pathlib import Path

from doc2txt import extract_text as doc2txt_extract
from kreuzberg import ExtractionConfig, KeywordAlgorithm, KeywordConfig, LanguageDetectionConfig, extract_file_sync

from tdoc_crawler.ai.models import ExtractionError, ProcessingStatus
@@ -48,6 +49,82 @@ def compute_source_hash(docx_path: Path) -> str:
    return hasher.hexdigest()


def extract_doc_to_markdown(
    document_id: str,
    doc_path: Path,
    storage: AiStorage,
    workspace: str | None = None,
) -> str:
    """Extract content from a legacy Office 2003 .doc file to Markdown using doc2txt.

    Mirrors the DOCX extraction path: hash-based idempotency (skip when the
    source is unchanged), artifact caching, and status tracking in storage.

    Args:
        document_id: Document identifier.
        doc_path: Path to the .doc file.
        storage: AiStorage instance for status tracking.
        workspace: Optional workspace scope (defaults to "default").

    Returns:
        Extracted content as a Markdown string. When the source hash is
        unchanged, returns the cached artifact text (or "" if the artifact
        file is missing).

    Raises:
        ExtractionError: If the file is missing, not a .doc, or extraction fails.
    """
    if not doc_path.exists():
        msg = f"DOC file not found: {doc_path}"
        raise ExtractionError(msg)

    # Idiomatic inequality check (was: `not x == y`).
    if doc_path.suffix.lower() != ".doc":
        msg = f"File must be .doc format: {doc_path}"
        raise ExtractionError(msg)

    normalized_workspace = normalize_workspace_name(workspace)

    # Change detection (idempotency): skip extraction when the recorded
    # source hash matches the current one and a prior extraction completed.
    current_hash = compute_source_hash(doc_path)
    artifact_path = _artifact_path(doc_path, document_id)

    try:
        status = storage.get_status(document_id, workspace=normalized_workspace)
        # getattr with a None default is equivalent to the hasattr chain here:
        # current_hash is a hex digest and is never None.
        if (
            status is not None
            and getattr(status, "source_hash", None) == current_hash
            and getattr(status, "extracted_at", None)
        ):
            logger.info(f"Skipping extraction for {document_id} - content unchanged")
            if artifact_path.exists():
                return artifact_path.read_text(encoding="utf-8")
            # Hash matched but the cached artifact is gone; return empty rather
            # than re-extracting, preserving the original behavior.
            return ""
    except Exception as error:
        # Best-effort lookup: a status-store failure must not block extraction.
        logger.debug(f"Unable to load prior extraction status for {document_id}: {error}")

    try:
        # Extract .doc to text using doc2txt (antiword wrapper)
        logger.info(f"Extracting {doc_path.name} using doc2txt (antiword)")
        markdown = doc2txt_extract(str(doc_path), optimize_format=True)

        if not markdown:
            msg = f"No content extracted from {doc_path.name}"
            raise ExtractionError(msg)

        artifact_output = _write_markdown_artifact(doc_path, document_id, markdown)

        # Save hash for future idempotency
        try:
            status = storage.get_status(document_id, workspace=normalized_workspace)
            if status is None:
                status = ProcessingStatus(document_id=document_id)
            status.source_hash = current_hash
            status.extracted_at = utc_now()
            status.error_message = None
            # Note: doc2txt doesn't provide keywords or language detection
            storage.save_status(status, workspace=normalized_workspace)
        except Exception as e:
            # Non-fatal: the artifact was already written; we only lose the
            # skip-on-unchanged optimization for the next run.
            logger.warning(f"Failed to save extraction status: {e}")

        logger.info(f"Extracted {len(markdown)} chars from {doc_path.name} to {artifact_output}")
        return markdown

    except ExtractionError:
        # Don't re-wrap our own domain error (e.g. the empty-content case
        # above) in a second ExtractionError with a mangled message.
        raise
    except Exception as e:
        msg = f"Failed to extract DOC: {e}"
        raise ExtractionError(msg) from e


def _fallback_extract_docx_to_markdown(docx_path: Path) -> str:
    """Fallback DOCX text extraction using document.xml parsing."""
    try:
@@ -172,6 +249,10 @@ def extract_from_folder(
) -> str:
    """Extract content from all relevant files in a TDoc folder.

    Supports both:
    - .doc files (Office 2003, uses doc2txt/antiword)
    - .docx files (Office 2007+, uses kreuzberg)

    Args:
        document_id: Document identifier.
        folder_path: Path to TDoc checkout folder.
@@ -187,32 +268,50 @@ def extract_from_folder(
        logger.warning(msg)
        raise ExtractionError(msg)

    # Find DOCX files
    # Find both .doc and .docx files
    doc_files = sorted(folder_path.glob("*.doc"))
    docx_files = sorted(folder_path.glob("*.docx"))

    if not docx_files:
        msg = f"No DOCX files found in {folder_path}"
    if not doc_files and not docx_files:
        msg = f"No .doc or .docx files found in {folder_path}"
        logger.warning(msg)
        raise ExtractionError(msg)

    # Use specified main doc or find it
    # Use specified main doc or auto-detect
    if main_doc_path:
        if not main_doc_path.exists():
            msg = f"Configured main document does not exist: {main_doc_path}"
            raise ExtractionError(msg)
        docx_to_extract = main_doc_path
    elif len(docx_files) == 1:
        docx_to_extract = docx_files[0]

        # Route based on file extension
        if main_doc_path.suffix.lower() == ".doc":
            return extract_doc_to_markdown(document_id, main_doc_path, storage, workspace=workspace)
        else:
        # Multiple files - need classification (defer to pipeline)
        logger.info(f"Multiple DOCX files found, classification needed: {docx_files}")
        docx_to_extract = docx_files[0]  # Just use first for now
            return extract_docx_to_markdown(document_id, main_doc_path, storage, workspace=workspace)

    # Auto-detect which file type to use
    # Prefer .docx over .doc if both exist
    if docx_files:
        docx_to_extract = docx_files[0]
        logger.info(f"Using DOCX file: {docx_to_extract.name}")
        if len(docx_files) > 1:
            logger.info(f"Multiple DOCX files found, using first: {docx_files}")
        return extract_docx_to_markdown(document_id, docx_to_extract, storage, workspace=workspace)
    elif doc_files:
        doc_to_extract = doc_files[0]
        logger.info(f"Using DOC file (Office 2003): {doc_to_extract.name}")
        if len(doc_files) > 1:
            logger.info(f"Multiple DOC files found, using first: {doc_files}")
        return extract_doc_to_markdown(document_id, doc_to_extract, storage, workspace=workspace)
    else:
        # Shouldn't reach here due to earlier check
        msg = f"No supported documents found in {folder_path}"
        raise ExtractionError(msg)


# Public API: the source-hash helper plus the three extraction entry points.
__all__ = [
    "compute_source_hash",
    "extract_doc_to_markdown",
    "extract_docx_to_markdown",
    "extract_from_folder",
]