Commit 5f566870 authored by Jan Reimes
Browse files

🔧 chore(tdoc): add support for additional office formats in file scanning

parent 217eca31
Loading
Loading
Loading
Loading
+13 −3
Original line number Diff line number Diff line
@@ -13,6 +13,7 @@ from pathlib import Path

from tdoc_crawler.config.settings import PathConfig
from tdoc_crawler.database.specs import SpecDatabase
from tdoc_crawler.extraction.conversion import OFFICE_FORMATS
from tdoc_crawler.logging import get_logger
from tdoc_crawler.specs.downloads import SpecDownloads
from tdoc_crawler.utils.normalization import normalize_spec_number
@@ -27,12 +28,13 @@ class SpecFiles:
    checkout_dir: Path
    docx_path: Path | None = None
    doc_path: Path | None = None
    office_path: Path | None = None
    pdf_path: Path | None = None

    @property
    def primary_path(self) -> Path | None:
        """Return the primary document path, preferring PDF > DOCX > DOC."""
        return self.pdf_path or self.docx_path or self.doc_path
        """Return the primary document path, preferring PDF > DOCX > DOC > other office formats."""
        return self.pdf_path or self.docx_path or self.doc_path or self.office_path


def fetch_spec_files(spec_number: str, release: str | None = None, force_download: bool = False) -> SpecFiles:
@@ -83,7 +85,13 @@ def fetch_spec_files(spec_number: str, release: str | None = None, force_downloa
            # Scan ONLY the version-specific extracted directory
            files = _scan_spec_dir(extracted_dir)
            if files.primary_path is not None:
                return SpecFiles(checkout_dir=spec_dir, docx_path=files.docx_path, doc_path=files.doc_path, pdf_path=files.pdf_path)
                return SpecFiles(
                    checkout_dir=spec_dir,
                    docx_path=files.docx_path,
                    doc_path=files.doc_path,
                    office_path=files.office_path,
                    pdf_path=files.pdf_path,
                )

        # Fallback: scan the entire spec directory (covers legacy layouts)
        if spec_dir.exists():
@@ -138,6 +146,8 @@ def _scan_spec_dir(spec_dir: Path) -> SpecFiles:
                    files.docx_path = file_path
                elif suffix == ".doc":
                    files.doc_path = file_path
                elif suffix in OFFICE_FORMATS:
                    files.office_path = file_path

    return files

+56 −7
Original line number Diff line number Diff line
@@ -2,13 +2,17 @@

from __future__ import annotations

import asyncio
from dataclasses import dataclass
from pathlib import Path

from tdoc_crawler.config.settings import PathConfig
from tdoc_crawler.database.tdocs import TDocDatabase
from tdoc_crawler.extraction.conversion import OFFICE_FORMATS
from tdoc_crawler.http_client import create_cached_session
from tdoc_crawler.logging import get_logger
from tdoc_crawler.models.workspaces import TDocNotFoundError
from tdoc_crawler.tdocs.models import TDocMetadata
from tdoc_crawler.tdocs.operations.checkout import checkout_tdoc, get_checkout_path
from tdoc_crawler.tdocs.sources.whatthespec import resolve_via_whatthespec
from tdoc_crawler.workspaces.utils import resolve_tdoc_checkout_path
@@ -23,12 +27,13 @@ class TDocFiles:
    checkout_dir: Path
    docx_path: Path | None = None
    doc_path: Path | None = None
    office_path: Path | None = None
    pdf_path: Path | None = None

    @property
    def primary_path(self) -> Path | None:
        """Return the primary document path, preferring PDF > DOCX > DOC."""
        return self.pdf_path or self.docx_path or self.doc_path
        """Return the primary document path, preferring PDF > DOCX > DOC > other office formats."""
        return self.pdf_path or self.docx_path or self.doc_path or self.office_path


def fetch_tdoc_files(document_id: str, force_download: bool = False) -> TDocFiles:
@@ -36,8 +41,8 @@ def fetch_tdoc_files(document_id: str, force_download: bool = False) -> TDocFile

    Pipeline:
    1. Check if TDoc already exists in local checkout (filesystem scan)
    2. If found, return immediately — no network call needed
    3. Otherwise resolve via WhatTheSpec and download if needed
    2. Look up TDoc in local database (populated by crawl command)
    3. Resolve via WhatTheSpec API and download if needed

    Args:
        document_id: TDoc identifier (e.g., "S4-260001")
@@ -60,10 +65,15 @@ def fetch_tdoc_files(document_id: str, force_download: bool = False) -> TDocFile
            if files.primary_path is not None:
                return files

    # Step 2: Resolve via WhatTheSpec and download if needed
    # Step 2: Look up in local database (populated by crawl command)
    metadata = asyncio.run(_lookup_tdoc_in_db(normalized_id))

    # Step 3: Fall back to WhatTheSpec API if database has no record
    if metadata is None:
        metadata = resolve_via_whatthespec(document_id)

    if metadata is None:
        raise TDocNotFoundError(f"TDoc {document_id} not found via WhatTheSpec or local database")
        raise TDocNotFoundError(f"TDoc {document_id} not found in local database or WhatTheSpec")

    checkout_path = get_checkout_path(metadata, checkout_dir)

@@ -74,6 +84,43 @@ def fetch_tdoc_files(document_id: str, force_download: bool = False) -> TDocFile
    return _scan_checkout_dir(checkout_path)


async def _lookup_tdoc_in_db(tdoc_id: str) -> TDocMetadata | None:
    """Fetch TDoc metadata from the local database, if a usable record exists.

    Opens the database file given by ``PathConfig().db_file``, looks up
    ``tdoc_id`` and copies the stored record's fields into a new
    ``TDocMetadata`` instance.

    Args:
        tdoc_id: Normalized TDoc identifier to look up.

    Returns:
        A ``TDocMetadata`` built from the stored record, or ``None`` when
        there is no record or the record's ``url`` is empty.
    """
    # Attributes copied one-to-one from the database record to the metadata.
    copied_fields = (
        "tdoc_id",
        "meeting_id",
        "title",
        "url",
        "source",
        "contact",
        "tdoc_type",
        "for_purpose",
        "agenda_item_nbr",
        "agenda_item_text",
        "status",
        "is_revision_of",
        "file_size",
        "date_created",
        "validated",
        "validation_failed",
    )

    # The connection is only needed for the lookup itself; the record is
    # inspected after the context manager has exited.
    async with TDocDatabase(PathConfig().db_file) as database:
        row = await database._get_tdoc(tdoc_id)

    if row is None:
        return None
    if not row.url:
        # A record without a URL is treated the same as a missing record.
        return None

    return TDocMetadata(**{name: getattr(row, name) for name in copied_fields})


def _scan_checkout_dir(checkout_path: Path) -> TDocFiles:
    """Scan a checkout directory for available document files."""
    files = TDocFiles(checkout_dir=checkout_path)
@@ -88,6 +135,8 @@ def _scan_checkout_dir(checkout_path: Path) -> TDocFiles:
                    files.docx_path = file_path
                elif suffix == ".doc":
                    files.doc_path = file_path
                elif suffix in OFFICE_FORMATS:
                    files.office_path = file_path

    return files