Commit 73b8b8bc authored by Jan Reimes's avatar Jan Reimes
Browse files

feat(extraction): add spec document fetching functionality

* Implement fetch_spec_files to retrieve spec documents from checkout.
* Enhance convert_for_wiki to handle source_kind for document resolution.
* Add source_kind parameter to convert_for_wiki for better flexibility.
parent 37552687
Loading
Loading
Loading
Loading
+1 −0
Original line number Diff line number Diff line
@@ -244,6 +244,7 @@ def workspace_process(
            result_path = convert_for_wiki(
                document_id=source_id,
                wiki_source_dir=wiki_source_dir,
                source_kind=member.source_kind,
                profile=extraction_profile,
                force=force,
            )
+20 −8
Original line number Diff line number Diff line
@@ -15,12 +15,14 @@ import opendataloader_pdf
from rich.console import Console

from tdoc_crawler.extraction.conversion import ensure_pdf
from tdoc_crawler.extraction.fetch_spec import fetch_spec_files
from tdoc_crawler.extraction.fetch_tdoc import fetch_tdoc_files
from tdoc_crawler.extraction.hybrid_server import ensure_hybrid_server
from tdoc_crawler.extraction.metrics import MetricType, get_metrics_tracker, timed_operation
from tdoc_crawler.extraction.profiles import DEFAULT_EXTRACTION_PROFILE, ExtractionProfile
from tdoc_crawler.models.workspaces import SourceKind
from tdoc_crawler.tdocs.sources.whatthespec import resolve_via_whatthespec
from tdoc_crawler.utils.normalization import normalize_tdoc_id
from tdoc_crawler.utils.normalization import normalize_spec_number, normalize_tdoc_id

logger = logging.getLogger(__name__)
console = Console()
@@ -183,6 +185,7 @@ def convert_for_wiki(
    document_id: str,
    wiki_source_dir: Path,
    *,
    source_kind: SourceKind = SourceKind.TDOC,
    profile: ExtractionProfile | None = None,
    force: bool = False,
) -> Path | None:
@@ -193,8 +196,9 @@ def convert_for_wiki(
    consistency: the source_pdf in JSON points to the actual PDF that was analyzed.

    Args:
        document_id: Document identifier (TDoc ID).
        document_id: Document identifier (TDoc ID or spec number).
        wiki_source_dir: Target directory under wiki/<workspace>/sources/<doc-id>/.
        source_kind: Type of source document (TDOC or SPEC). Defaults to TDOC.
        profile: Extraction profile to use. Defaults to DEFAULT_EXTRACTION_PROFILE.
        force: Force reconversion.

@@ -207,7 +211,15 @@ def convert_for_wiki(

    wiki_source_dir.mkdir(parents=True, exist_ok=True)

    # TDoc pipeline
    # Resolve primary document based on source kind
    primary: Path | None = None
    if source_kind == SourceKind.SPEC:
        normalized_id = normalize_spec_number(document_id)
        spec_files = fetch_spec_files(normalized_id, force_download=force)
        primary = spec_files.primary_path
        if primary is None:
            raise ConversionError(f"No document files found for spec {normalized_id}")
    else:
        normalized_id = normalize_tdoc_id(document_id)
        tdoc_files = fetch_tdoc_files(normalized_id, force_download=force)
        primary = tdoc_files.primary_path
+88 −0
Original line number Diff line number Diff line
"""Fetch spec documents from checkout or 3GPP sources.

Analogous to fetch_tdoc.py but for specification documents.
Resolves spec files by number (e.g. "26.131") to their checked-out files.
"""

from __future__ import annotations

from dataclasses import dataclass
from pathlib import Path

from tdoc_crawler.config.settings import PathConfig
from tdoc_crawler.logging import get_logger
from tdoc_crawler.utils.normalization import normalize_spec_number

logger = get_logger(__name__)


@dataclass
class SpecFiles:
    """Locations of the document variants available for a single spec.

    Each ``*_path`` attribute stays ``None`` when no file of that type is
    known; ``checkout_dir`` is always set.
    """

    # Directory the spec files belong to.
    checkout_dir: Path
    # Word (.docx) variant, if any.
    docx_path: Path | None = None
    # Legacy Word (.doc) variant, if any.
    doc_path: Path | None = None
    # PDF variant, if any.
    pdf_path: Path | None = None

    @property
    def primary_path(self) -> Path | None:
        """Pick the preferred document: PDF first, then DOCX, then DOC.

        Returns:
            The first available path in preference order, or ``None`` when
            no document path is set.
        """
        for candidate in (self.pdf_path, self.docx_path):
            if candidate:
                return candidate
        # Last resort; this is None when no variant is available at all.
        return self.doc_path


def fetch_spec_files(spec_number: str, force_download: bool = False) -> SpecFiles:
    """Locate the checked-out document files for a spec.

    Nothing is downloaded here: the spec is expected to already be present
    in the local checkout (populated during the ``workspace add`` checkout
    phase), under ``<checkout_dir>/Specs/archive/<NN>_series/<spec_number>/``.

    Args:
        spec_number: Spec number (e.g., "26.131"). It is normalized before
            the directory lookup.
        force_download: Accepted for signature parity with the TDoc fetcher
            but ignored — spec files are only looked up in the checkout.

    Returns:
        SpecFiles describing the documents found in the spec directory.
        All document paths are ``None`` when the directory holds no
        matching files.

    Raises:
        FileNotFoundError: If the spec directory does not exist.
    """
    checkout_root = PathConfig().checkout_dir
    normalized = normalize_spec_number(spec_number)

    # The series folder is named after the part before the first dot,
    # e.g. "26.131" lives in "26_series".
    series_prefix = normalized.partition(".")[0]
    spec_dir = checkout_root / "Specs" / "archive" / f"{series_prefix}_series" / normalized

    if spec_dir.exists():
        return _scan_spec_dir(spec_dir)

    raise FileNotFoundError(f"Spec {normalized} not found at {spec_dir}")


def _scan_spec_dir(spec_dir: Path) -> SpecFiles:
    """Scan a spec checkout directory for available document files.

    Walks the directory tree recursively and records the PDF, DOCX and DOC
    files it finds. Files whose name starts with a dot and files located
    inside a ``.ai`` directory below ``spec_dir`` are ignored.

    When several files share the same extension, the one that sorts last
    by path is kept.

    Args:
        spec_dir: Directory to scan.

    Returns:
        SpecFiles with ``checkout_dir`` set to ``spec_dir`` and one path per
        document type found. All document paths are ``None`` when
        ``spec_dir`` is not a directory or contains no matching files.
    """
    files = SpecFiles(checkout_dir=spec_dir)

    if not spec_dir.is_dir():
        return files

    for file_path in sorted(spec_dir.rglob("*")):
        # Cheap name test first; is_file() needs a filesystem stat.
        if file_path.name.startswith(".") or not file_path.is_file():
            continue

        # Only look at path components *below* spec_dir. Testing the full
        # path would discard every file whenever the checkout itself lives
        # somewhere under a directory named ".ai".
        if ".ai" in file_path.relative_to(spec_dir).parts:
            continue

        suffix = file_path.suffix.lower()
        if suffix == ".pdf":
            files.pdf_path = file_path
        elif suffix == ".docx":
            files.docx_path = file_path
        elif suffix == ".doc":
            files.doc_path = file_path

    return files


# Names exported by ``from ... import *``; the underscore-prefixed
# _scan_spec_dir helper is not part of this list.
__all__ = ["SpecFiles", "fetch_spec_files"]