Loading src/tdoc_crawler/extraction/fetch_spec.py +13 −3 Original line number Diff line number Diff line Loading @@ -13,6 +13,7 @@ from pathlib import Path from tdoc_crawler.config.settings import PathConfig from tdoc_crawler.database.specs import SpecDatabase from tdoc_crawler.extraction.conversion import OFFICE_FORMATS from tdoc_crawler.logging import get_logger from tdoc_crawler.specs.downloads import SpecDownloads from tdoc_crawler.utils.normalization import normalize_spec_number Loading @@ -27,12 +28,13 @@ class SpecFiles: checkout_dir: Path docx_path: Path | None = None doc_path: Path | None = None office_path: Path | None = None pdf_path: Path | None = None @property def primary_path(self) -> Path | None: """Return the primary document path, preferring PDF > DOCX > DOC.""" return self.pdf_path or self.docx_path or self.doc_path """Return the primary document path, preferring PDF > DOCX > DOC > other office formats.""" return self.pdf_path or self.docx_path or self.doc_path or self.office_path def fetch_spec_files(spec_number: str, release: str | None = None, force_download: bool = False) -> SpecFiles: Loading Loading @@ -83,7 +85,13 @@ def fetch_spec_files(spec_number: str, release: str | None = None, force_downloa # Scan ONLY the version-specific extracted directory files = _scan_spec_dir(extracted_dir) if files.primary_path is not None: return SpecFiles(checkout_dir=spec_dir, docx_path=files.docx_path, doc_path=files.doc_path, pdf_path=files.pdf_path) return SpecFiles( checkout_dir=spec_dir, docx_path=files.docx_path, doc_path=files.doc_path, office_path=files.office_path, pdf_path=files.pdf_path, ) # Fallback: scan the entire spec directory (covers legacy layouts) if spec_dir.exists(): Loading Loading @@ -138,6 +146,8 @@ def _scan_spec_dir(spec_dir: Path) -> SpecFiles: files.docx_path = file_path elif suffix == ".doc": files.doc_path = file_path elif suffix in OFFICE_FORMATS: files.office_path = file_path return files Loading src/tdoc_crawler/extraction/fetch_tdoc.py +56 −7 Original line number Diff line number Diff line Loading @@ -2,13 +2,17 @@ from __future__ import annotations import asyncio from dataclasses import dataclass from pathlib import Path from tdoc_crawler.config.settings import PathConfig from tdoc_crawler.database.tdocs import TDocDatabase from tdoc_crawler.extraction.conversion import OFFICE_FORMATS from tdoc_crawler.http_client import create_cached_session from tdoc_crawler.logging import get_logger from tdoc_crawler.models.workspaces import TDocNotFoundError from tdoc_crawler.tdocs.models import TDocMetadata from tdoc_crawler.tdocs.operations.checkout import checkout_tdoc, get_checkout_path from tdoc_crawler.tdocs.sources.whatthespec import resolve_via_whatthespec from tdoc_crawler.workspaces.utils import resolve_tdoc_checkout_path Loading @@ -23,12 +27,13 @@ class TDocFiles: checkout_dir: Path docx_path: Path | None = None doc_path: Path | None = None office_path: Path | None = None pdf_path: Path | None = None @property def primary_path(self) -> Path | None: """Return the primary document path, preferring PDF > DOCX > DOC.""" return self.pdf_path or self.docx_path or self.doc_path """Return the primary document path, preferring PDF > DOCX > DOC > other office formats.""" return self.pdf_path or self.docx_path or self.doc_path or self.office_path def fetch_tdoc_files(document_id: str, force_download: bool = False) -> TDocFiles: Loading @@ -36,8 +41,8 @@ def fetch_tdoc_files(document_id: str, force_download: bool = False) -> TDocFile Pipeline: 1. Check if TDoc already exists in local checkout (filesystem scan) 2. If found, return immediately — no network call needed 3. Otherwise resolve via WhatTheSpec and download if needed 2. Look up TDoc in local database (populated by crawl command) 3. Resolve via WhatTheSpec API and download if needed Args: document_id: TDoc identifier (e.g., "S4-260001") Loading @@ -60,10 +65,15 @@ def fetch_tdoc_files(document_id: str, force_download: bool = False) -> TDocFile if files.primary_path is not None: return files # Step 2: Resolve via WhatTheSpec and download if needed # Step 2: Look up in local database (populated by crawl command) metadata = asyncio.run(_lookup_tdoc_in_db(normalized_id)) # Step 3: Fall back to WhatTheSpec API if database has no record if metadata is None: metadata = resolve_via_whatthespec(document_id) if metadata is None: raise TDocNotFoundError(f"TDoc {document_id} not found via WhatTheSpec or local database") raise TDocNotFoundError(f"TDoc {document_id} not found in local database or WhatTheSpec") checkout_path = get_checkout_path(metadata, checkout_dir) Loading @@ -74,6 +84,43 @@ def fetch_tdoc_files(document_id: str, force_download: bool = False) -> TDocFile return _scan_checkout_dir(checkout_path) async def _lookup_tdoc_in_db(tdoc_id: str) -> TDocMetadata | None: """Look up TDoc metadata in the local database. The database is populated by the crawl command and contains TDoc metadata including the FTP download URL, which is sufficient for checkout. Args: tdoc_id: Normalized TDoc identifier (uppercase) Returns: TDocMetadata if found with a valid URL, None otherwise """ db_file = PathConfig().db_file async with TDocDatabase(db_file) as db: record = await db._get_tdoc(tdoc_id) if record is None or not record.url: return None return TDocMetadata( tdoc_id=record.tdoc_id, meeting_id=record.meeting_id, title=record.title, url=record.url, source=record.source, contact=record.contact, tdoc_type=record.tdoc_type, for_purpose=record.for_purpose, agenda_item_nbr=record.agenda_item_nbr, agenda_item_text=record.agenda_item_text, status=record.status, is_revision_of=record.is_revision_of, file_size=record.file_size, date_created=record.date_created, validated=record.validated, validation_failed=record.validation_failed, ) def _scan_checkout_dir(checkout_path: Path) -> TDocFiles: """Scan a checkout directory for available document files.""" files = TDocFiles(checkout_dir=checkout_path) Loading @@ -88,6 +135,8 @@ def _scan_checkout_dir(checkout_path: Path) -> TDocFiles: files.docx_path = file_path elif suffix == ".doc": files.doc_path = file_path elif suffix in OFFICE_FORMATS: files.office_path = file_path return files Loading Loading
src/tdoc_crawler/extraction/fetch_spec.py +13 −3 Original line number Diff line number Diff line Loading @@ -13,6 +13,7 @@ from pathlib import Path from tdoc_crawler.config.settings import PathConfig from tdoc_crawler.database.specs import SpecDatabase from tdoc_crawler.extraction.conversion import OFFICE_FORMATS from tdoc_crawler.logging import get_logger from tdoc_crawler.specs.downloads import SpecDownloads from tdoc_crawler.utils.normalization import normalize_spec_number Loading @@ -27,12 +28,13 @@ class SpecFiles: checkout_dir: Path docx_path: Path | None = None doc_path: Path | None = None office_path: Path | None = None pdf_path: Path | None = None @property def primary_path(self) -> Path | None: """Return the primary document path, preferring PDF > DOCX > DOC.""" return self.pdf_path or self.docx_path or self.doc_path """Return the primary document path, preferring PDF > DOCX > DOC > other office formats.""" return self.pdf_path or self.docx_path or self.doc_path or self.office_path def fetch_spec_files(spec_number: str, release: str | None = None, force_download: bool = False) -> SpecFiles: Loading Loading @@ -83,7 +85,13 @@ def fetch_spec_files(spec_number: str, release: str | None = None, force_downloa # Scan ONLY the version-specific extracted directory files = _scan_spec_dir(extracted_dir) if files.primary_path is not None: return SpecFiles(checkout_dir=spec_dir, docx_path=files.docx_path, doc_path=files.doc_path, pdf_path=files.pdf_path) return SpecFiles( checkout_dir=spec_dir, docx_path=files.docx_path, doc_path=files.doc_path, office_path=files.office_path, pdf_path=files.pdf_path, ) # Fallback: scan the entire spec directory (covers legacy layouts) if spec_dir.exists(): Loading Loading @@ -138,6 +146,8 @@ def _scan_spec_dir(spec_dir: Path) -> SpecFiles: files.docx_path = file_path elif suffix == ".doc": files.doc_path = file_path elif suffix in OFFICE_FORMATS: files.office_path = file_path return files Loading
src/tdoc_crawler/extraction/fetch_tdoc.py +56 −7 Original line number Diff line number Diff line Loading @@ -2,13 +2,17 @@ from __future__ import annotations import asyncio from dataclasses import dataclass from pathlib import Path from tdoc_crawler.config.settings import PathConfig from tdoc_crawler.database.tdocs import TDocDatabase from tdoc_crawler.extraction.conversion import OFFICE_FORMATS from tdoc_crawler.http_client import create_cached_session from tdoc_crawler.logging import get_logger from tdoc_crawler.models.workspaces import TDocNotFoundError from tdoc_crawler.tdocs.models import TDocMetadata from tdoc_crawler.tdocs.operations.checkout import checkout_tdoc, get_checkout_path from tdoc_crawler.tdocs.sources.whatthespec import resolve_via_whatthespec from tdoc_crawler.workspaces.utils import resolve_tdoc_checkout_path Loading @@ -23,12 +27,13 @@ class TDocFiles: checkout_dir: Path docx_path: Path | None = None doc_path: Path | None = None office_path: Path | None = None pdf_path: Path | None = None @property def primary_path(self) -> Path | None: """Return the primary document path, preferring PDF > DOCX > DOC.""" return self.pdf_path or self.docx_path or self.doc_path """Return the primary document path, preferring PDF > DOCX > DOC > other office formats.""" return self.pdf_path or self.docx_path or self.doc_path or self.office_path def fetch_tdoc_files(document_id: str, force_download: bool = False) -> TDocFiles: Loading @@ -36,8 +41,8 @@ def fetch_tdoc_files(document_id: str, force_download: bool = False) -> TDocFile Pipeline: 1. Check if TDoc already exists in local checkout (filesystem scan) 2. If found, return immediately — no network call needed 3. Otherwise resolve via WhatTheSpec and download if needed 2. Look up TDoc in local database (populated by crawl command) 3. Resolve via WhatTheSpec API and download if needed Args: document_id: TDoc identifier (e.g., "S4-260001") Loading @@ -60,10 +65,15 @@ def fetch_tdoc_files(document_id: str, force_download: bool = False) -> TDocFile if files.primary_path is not None: return files # Step 2: Resolve via WhatTheSpec and download if needed # Step 2: Look up in local database (populated by crawl command) metadata = asyncio.run(_lookup_tdoc_in_db(normalized_id)) # Step 3: Fall back to WhatTheSpec API if database has no record if metadata is None: metadata = resolve_via_whatthespec(document_id) if metadata is None: raise TDocNotFoundError(f"TDoc {document_id} not found via WhatTheSpec or local database") raise TDocNotFoundError(f"TDoc {document_id} not found in local database or WhatTheSpec") checkout_path = get_checkout_path(metadata, checkout_dir) Loading @@ -74,6 +84,43 @@ def fetch_tdoc_files(document_id: str, force_download: bool = False) -> TDocFile return _scan_checkout_dir(checkout_path) async def _lookup_tdoc_in_db(tdoc_id: str) -> TDocMetadata | None: """Look up TDoc metadata in the local database. The database is populated by the crawl command and contains TDoc metadata including the FTP download URL, which is sufficient for checkout. Args: tdoc_id: Normalized TDoc identifier (uppercase) Returns: TDocMetadata if found with a valid URL, None otherwise """ db_file = PathConfig().db_file async with TDocDatabase(db_file) as db: record = await db._get_tdoc(tdoc_id) if record is None or not record.url: return None return TDocMetadata( tdoc_id=record.tdoc_id, meeting_id=record.meeting_id, title=record.title, url=record.url, source=record.source, contact=record.contact, tdoc_type=record.tdoc_type, for_purpose=record.for_purpose, agenda_item_nbr=record.agenda_item_nbr, agenda_item_text=record.agenda_item_text, status=record.status, is_revision_of=record.is_revision_of, file_size=record.file_size, date_created=record.date_created, validated=record.validated, validation_failed=record.validation_failed, ) def _scan_checkout_dir(checkout_path: Path) -> TDocFiles: """Scan a checkout directory for available document files.""" files = TDocFiles(checkout_dir=checkout_path) Loading @@ -88,6 +135,8 @@ def _scan_checkout_dir(checkout_path: Path) -> TDocFiles: files.docx_path = file_path elif suffix == ".doc": files.doc_path = file_path elif suffix in OFFICE_FORMATS: files.office_path = file_path return files Loading