Commit 349b5036 authored by Jan Reimes's avatar Jan Reimes
Browse files

Add support for Office 2003 .doc files using doc2txt

Added extraction support for legacy Office 2003 .doc files using the
doc2txt package (which wraps the antiword CLI tool).

Changes:
- Added extract_doc_to_markdown() function for .doc file extraction
- Updated extract_from_folder() to detect and route both .doc and .docx files
- .doc files use doc2txt/antiword, .docx files use kreuzberg
- Both extraction paths support change detection and artifact caching
- Added doc2txt import (package already in dependencies)
- Updated module docstring and exports

The implementation follows the same pattern as DOCX extraction:
- Hash-based idempotency (skip if content unchanged)
- Artifact caching in .ai folder
- Status tracking in LanceDB
parent 6275b599
Loading
Loading
Loading
Loading
+1 −0
Original line number Diff line number Diff line
@@ -39,6 +39,7 @@ dependencies = [
    "litellm>=1.81.15",
    "sentence-transformers>=2.7.0",
    "tokenizers>=0.22.2",
    "doc2txt>=1.0.8",
]

[project.urls]
+111 −12
Original line number Diff line number Diff line
"""DOCX-to-Markdown extraction using Kreuzberg."""
"""Document-to-Markdown extraction using Kreuzberg (DOCX) and doc2txt (DOC)."""

from __future__ import annotations

@@ -8,6 +8,7 @@ import re
import zipfile
from pathlib import Path

from doc2txt import extract_text as doc2txt_extract
from kreuzberg import ExtractionConfig, KeywordAlgorithm, KeywordConfig, LanguageDetectionConfig, extract_file_sync

from tdoc_crawler.ai.models import ExtractionError, ProcessingStatus
@@ -48,6 +49,82 @@ def compute_source_hash(docx_path: Path) -> str:
    return hasher.hexdigest()


def extract_doc_to_markdown(
    document_id: str,
    doc_path: Path,
    storage: AiStorage,
    workspace: str | None = None,
) -> str:
    """Extract content from a legacy Office 2003 .doc file to Markdown using doc2txt.

    Mirrors the DOCX extraction path: hash-based idempotency (skip when the
    source is unchanged), artifact caching, and status tracking in storage.

    Args:
        document_id: Document identifier.
        doc_path: Path to the .doc file.
        storage: AiStorage instance for status tracking.
        workspace: Optional workspace scope (defaults to "default").

    Returns:
        Extracted content as a Markdown string. When the source hash is
        unchanged, returns the cached artifact text (or "" if the artifact
        file is missing).

    Raises:
        ExtractionError: If the file is missing, not a .doc, or extraction fails.
    """
    if not doc_path.exists():
        msg = f"DOC file not found: {doc_path}"
        raise ExtractionError(msg)

    # Idiomatic inequality check (was: `not x == y`).
    if doc_path.suffix.lower() != ".doc":
        msg = f"File must be .doc format: {doc_path}"
        raise ExtractionError(msg)

    normalized_workspace = normalize_workspace_name(workspace)

    # Change detection (idempotency): skip extraction when the recorded
    # source hash matches the current one and a prior extraction completed.
    current_hash = compute_source_hash(doc_path)
    artifact_path = _artifact_path(doc_path, document_id)

    try:
        status = storage.get_status(document_id, workspace=normalized_workspace)
        # getattr with a None default is equivalent to the hasattr chain here:
        # current_hash is a hex digest and is never None.
        if (
            status is not None
            and getattr(status, "source_hash", None) == current_hash
            and getattr(status, "extracted_at", None)
        ):
            logger.info(f"Skipping extraction for {document_id} - content unchanged")
            if artifact_path.exists():
                return artifact_path.read_text(encoding="utf-8")
            # Hash matched but the cached artifact is gone; return empty rather
            # than re-extracting, preserving the original behavior.
            return ""
    except Exception as error:
        # Best-effort lookup: a status-store failure must not block extraction.
        logger.debug(f"Unable to load prior extraction status for {document_id}: {error}")

    try:
        # Extract .doc to text using doc2txt (antiword wrapper)
        logger.info(f"Extracting {doc_path.name} using doc2txt (antiword)")
        markdown = doc2txt_extract(str(doc_path), optimize_format=True)

        if not markdown:
            msg = f"No content extracted from {doc_path.name}"
            raise ExtractionError(msg)

        artifact_output = _write_markdown_artifact(doc_path, document_id, markdown)

        # Save hash for future idempotency
        try:
            status = storage.get_status(document_id, workspace=normalized_workspace)
            if status is None:
                status = ProcessingStatus(document_id=document_id)
            status.source_hash = current_hash
            status.extracted_at = utc_now()
            status.error_message = None
            # Note: doc2txt doesn't provide keywords or language detection
            storage.save_status(status, workspace=normalized_workspace)
        except Exception as e:
            # Non-fatal: the artifact was already written; we only lose the
            # skip-on-unchanged optimization for the next run.
            logger.warning(f"Failed to save extraction status: {e}")

        logger.info(f"Extracted {len(markdown)} chars from {doc_path.name} to {artifact_output}")
        return markdown

    except ExtractionError:
        # Don't re-wrap our own domain error (e.g. the empty-content case
        # above) in a second ExtractionError with a mangled message.
        raise
    except Exception as e:
        msg = f"Failed to extract DOC: {e}"
        raise ExtractionError(msg) from e


def _fallback_extract_docx_to_markdown(docx_path: Path) -> str:
    """Fallback DOCX text extraction using document.xml parsing."""
    try:
@@ -172,6 +249,10 @@ def extract_from_folder(
) -> str:
    """Extract content from all relevant files in a TDoc folder.

    Supports both:
    - .doc files (Office 2003, uses doc2txt/antiword)
    - .docx files (Office 2007+, uses kreuzberg)

    Args:
        document_id: Document identifier.
        folder_path: Path to TDoc checkout folder.
@@ -187,32 +268,50 @@ def extract_from_folder(
        logger.warning(msg)
        raise ExtractionError(msg)

    # Find DOCX files
    # Find both .doc and .docx files
    doc_files = sorted(folder_path.glob("*.doc"))
    docx_files = sorted(folder_path.glob("*.docx"))

    if not docx_files:
        msg = f"No DOCX files found in {folder_path}"
    if not doc_files and not docx_files:
        msg = f"No .doc or .docx files found in {folder_path}"
        logger.warning(msg)
        raise ExtractionError(msg)

    # Use specified main doc or find it
    # Use specified main doc or auto-detect
    if main_doc_path:
        if not main_doc_path.exists():
            msg = f"Configured main document does not exist: {main_doc_path}"
            raise ExtractionError(msg)
        docx_to_extract = main_doc_path
    elif len(docx_files) == 1:
        docx_to_extract = docx_files[0]

        # Route based on file extension
        if main_doc_path.suffix.lower() == ".doc":
            return extract_doc_to_markdown(document_id, main_doc_path, storage, workspace=workspace)
        else:
        # Multiple files - need classification (defer to pipeline)
        logger.info(f"Multiple DOCX files found, classification needed: {docx_files}")
        docx_to_extract = docx_files[0]  # Just use first for now
            return extract_docx_to_markdown(document_id, main_doc_path, storage, workspace=workspace)

    # Auto-detect which file type to use
    # Prefer .docx over .doc if both exist
    if docx_files:
        docx_to_extract = docx_files[0]
        logger.info(f"Using DOCX file: {docx_to_extract.name}")
        if len(docx_files) > 1:
            logger.info(f"Multiple DOCX files found, using first: {docx_files}")
        return extract_docx_to_markdown(document_id, docx_to_extract, storage, workspace=workspace)
    elif doc_files:
        doc_to_extract = doc_files[0]
        logger.info(f"Using DOC file (Office 2003): {doc_to_extract.name}")
        if len(doc_files) > 1:
            logger.info(f"Multiple DOC files found, using first: {doc_files}")
        return extract_doc_to_markdown(document_id, doc_to_extract, storage, workspace=workspace)
    else:
        # Shouldn't reach here due to earlier check
        msg = f"No supported documents found in {folder_path}"
        raise ExtractionError(msg)


# Public API: the source-hash helper plus the three extraction entry points.
__all__ = [
    "compute_source_hash",
    "extract_doc_to_markdown",
    "extract_docx_to_markdown",
    "extract_from_folder",
]