🔧 chore(tdoc): add support for additional office formats in file scanning (5f566870) · Commits · Jan Reimes / 3gpp-crawler

src/tdoc_crawler/extraction/fetch_spec.py

+13 −3

Original line number	Diff line number	Diff line
		@@ -13,6 +13,7 @@ from pathlib import Path

		from tdoc_crawler.config.settings import PathConfig
		from tdoc_crawler.database.specs import SpecDatabase
		from tdoc_crawler.extraction.conversion import OFFICE_FORMATS
		from tdoc_crawler.logging import get_logger
		from tdoc_crawler.specs.downloads import SpecDownloads
		from tdoc_crawler.utils.normalization import normalize_spec_number
		@@ -27,12 +28,13 @@ class SpecFiles:
		checkout_dir: Path
		docx_path: Path \| None = None
		doc_path: Path \| None = None
		office_path: Path \| None = None
		pdf_path: Path \| None = None

		@property
		def primary_path(self) -> Path \| None:
		"""Return the primary document path, preferring PDF > DOCX > DOC."""
		return self.pdf_path or self.docx_path or self.doc_path
		"""Return the primary document path, preferring PDF > DOCX > DOC > other office formats."""
		return self.pdf_path or self.docx_path or self.doc_path or self.office_path


		def fetch_spec_files(spec_number: str, release: str \| None = None, force_download: bool = False) -> SpecFiles:
		@@ -83,7 +85,13 @@ def fetch_spec_files(spec_number: str, release: str \| None = None, force_downloa
		# Scan ONLY the version-specific extracted directory
		files = _scan_spec_dir(extracted_dir)
		if files.primary_path is not None:
		return SpecFiles(checkout_dir=spec_dir, docx_path=files.docx_path, doc_path=files.doc_path, pdf_path=files.pdf_path)
		return SpecFiles(
		checkout_dir=spec_dir,
		docx_path=files.docx_path,
		doc_path=files.doc_path,
		office_path=files.office_path,
		pdf_path=files.pdf_path,
		)

		# Fallback: scan the entire spec directory (covers legacy layouts)
		if spec_dir.exists():
		@@ -138,6 +146,8 @@ def _scan_spec_dir(spec_dir: Path) -> SpecFiles:
		files.docx_path = file_path
		elif suffix == ".doc":
		files.doc_path = file_path
		elif suffix in OFFICE_FORMATS:
		files.office_path = file_path

		return files

src/tdoc_crawler/extraction/fetch_tdoc.py

+56 −7

Original line number	Diff line number	Diff line
		@@ -2,13 +2,17 @@

		from __future__ import annotations

		import asyncio
		from dataclasses import dataclass
		from pathlib import Path

		from tdoc_crawler.config.settings import PathConfig
		from tdoc_crawler.database.tdocs import TDocDatabase
		from tdoc_crawler.extraction.conversion import OFFICE_FORMATS
		from tdoc_crawler.http_client import create_cached_session
		from tdoc_crawler.logging import get_logger
		from tdoc_crawler.models.workspaces import TDocNotFoundError
		from tdoc_crawler.tdocs.models import TDocMetadata
		from tdoc_crawler.tdocs.operations.checkout import checkout_tdoc, get_checkout_path
		from tdoc_crawler.tdocs.sources.whatthespec import resolve_via_whatthespec
		from tdoc_crawler.workspaces.utils import resolve_tdoc_checkout_path
		@@ -23,12 +27,13 @@ class TDocFiles:
		checkout_dir: Path
		docx_path: Path \| None = None
		doc_path: Path \| None = None
		office_path: Path \| None = None
		pdf_path: Path \| None = None

		@property
		def primary_path(self) -> Path \| None:
		"""Return the primary document path, preferring PDF > DOCX > DOC."""
		return self.pdf_path or self.docx_path or self.doc_path
		"""Return the primary document path, preferring PDF > DOCX > DOC > other office formats."""
		return self.pdf_path or self.docx_path or self.doc_path or self.office_path


		def fetch_tdoc_files(document_id: str, force_download: bool = False) -> TDocFiles:
		@@ -36,8 +41,8 @@ def fetch_tdoc_files(document_id: str, force_download: bool = False) -> TDocFile

		Pipeline:
		1. Check if TDoc already exists in local checkout (filesystem scan)
		2. If found, return immediately — no network call needed
		3. Otherwise resolve via WhatTheSpec and download if needed
		2. Look up TDoc in local database (populated by crawl command)
		3. Resolve via WhatTheSpec API and download if needed

		Args:
		document_id: TDoc identifier (e.g., "S4-260001")
		@@ -60,10 +65,15 @@ def fetch_tdoc_files(document_id: str, force_download: bool = False) -> TDocFile
		if files.primary_path is not None:
		return files

		# Step 2: Resolve via WhatTheSpec and download if needed
		# Step 2: Look up in local database (populated by crawl command)
		metadata = asyncio.run(_lookup_tdoc_in_db(normalized_id))

		# Step 3: Fall back to WhatTheSpec API if database has no record
		if metadata is None:
		metadata = resolve_via_whatthespec(document_id)

		if metadata is None:
		raise TDocNotFoundError(f"TDoc {document_id} not found via WhatTheSpec or local database")
		raise TDocNotFoundError(f"TDoc {document_id} not found in local database or WhatTheSpec")

		checkout_path = get_checkout_path(metadata, checkout_dir)

		@@ -74,6 +84,43 @@ def fetch_tdoc_files(document_id: str, force_download: bool = False) -> TDocFile
		return _scan_checkout_dir(checkout_path)


		async def _lookup_tdoc_in_db(tdoc_id: str) -> TDocMetadata | None:
		    """Fetch TDoc metadata from the local database, if a usable record exists.

		    The local database is populated by the crawl command; a stored record
		    carries the download URL needed for checkout. A record that is missing
		    or has an empty URL is reported as "not found".

		    Args:
		        tdoc_id: Normalized TDoc identifier (uppercase)

		    Returns:
		        TDocMetadata copied from the stored record when it exists and has
		        a non-empty URL, None otherwise
		    """
		    # Attributes copied one-to-one from the stored record onto TDocMetadata.
		    copied_fields = (
		        "tdoc_id",
		        "meeting_id",
		        "title",
		        "url",
		        "source",
		        "contact",
		        "tdoc_type",
		        "for_purpose",
		        "agenda_item_nbr",
		        "agenda_item_text",
		        "status",
		        "is_revision_of",
		        "file_size",
		        "date_created",
		        "validated",
		        "validation_failed",
		    )

		    async with TDocDatabase(PathConfig().db_file) as database:
		        row = await database._get_tdoc(tdoc_id)
		        if row is not None and row.url:
		            # Read the attributes while the database context is still open.
		            return TDocMetadata(**{name: getattr(row, name) for name in copied_fields})

		    # No record, or a record without a URL.
		    return None


		def _scan_checkout_dir(checkout_path: Path) -> TDocFiles:
		"""Scan a checkout directory for available document files."""
		files = TDocFiles(checkout_dir=checkout_path)
		@@ -88,6 +135,8 @@ def _scan_checkout_dir(checkout_path: Path) -> TDocFiles:
		files.docx_path = file_path
		elif suffix == ".doc":
		files.doc_path = file_path
		elif suffix in OFFICE_FORMATS:
		files.office_path = file_path

		return files