feat(extraction): add spec document fetching functionality (73b8b8bc) · Commits · Jan Reimes / 3gpp-crawler

src/tdoc_crawler/cli/_workspace_commands.py

+1 −0

Original line number	Diff line number	Diff line
		@@ -244,6 +244,7 @@ def workspace_process(
		result_path = convert_for_wiki(
		document_id=source_id,
		wiki_source_dir=wiki_source_dir,
		source_kind=member.source_kind,
		profile=extraction_profile,
		force=force,
		)

src/tdoc_crawler/extraction/convert.py

+20 −8

Original line number	Diff line number	Diff line
		@@ -15,12 +15,14 @@ import opendataloader_pdf
		from rich.console import Console

		from tdoc_crawler.extraction.conversion import ensure_pdf
		from tdoc_crawler.extraction.fetch_spec import fetch_spec_files
		from tdoc_crawler.extraction.fetch_tdoc import fetch_tdoc_files
		from tdoc_crawler.extraction.hybrid_server import ensure_hybrid_server
		from tdoc_crawler.extraction.metrics import MetricType, get_metrics_tracker, timed_operation
		from tdoc_crawler.extraction.profiles import DEFAULT_EXTRACTION_PROFILE, ExtractionProfile
		from tdoc_crawler.models.workspaces import SourceKind
		from tdoc_crawler.tdocs.sources.whatthespec import resolve_via_whatthespec
		from tdoc_crawler.utils.normalization import normalize_tdoc_id
		from tdoc_crawler.utils.normalization import normalize_spec_number, normalize_tdoc_id

		logger = logging.getLogger(__name__)
		console = Console()
		@@ -183,6 +185,7 @@ def convert_for_wiki(
		document_id: str,
		wiki_source_dir: Path,
		*,
		source_kind: SourceKind = SourceKind.TDOC,
		profile: ExtractionProfile \| None = None,
		force: bool = False,
		) -> Path \| None:
		@@ -193,8 +196,9 @@ def convert_for_wiki(
		consistency: the source_pdf in JSON points to the actual PDF that was analyzed.

		Args:
		document_id: Document identifier (TDoc ID).
		document_id: Document identifier (TDoc ID or spec number).
		wiki_source_dir: Target directory under wiki/<workspace>/sources/<doc-id>/.
		source_kind: Type of source document (TDOC or SPEC). Defaults to TDOC.
		profile: Extraction profile to use. Defaults to DEFAULT_EXTRACTION_PROFILE.
		force: Force reconversion.

		@@ -207,7 +211,15 @@ def convert_for_wiki(

		wiki_source_dir.mkdir(parents=True, exist_ok=True)

		# TDoc pipeline
		# Resolve primary document based on source kind
		primary: Path \| None = None
		if source_kind == SourceKind.SPEC:
		normalized_id = normalize_spec_number(document_id)
		spec_files = fetch_spec_files(normalized_id, force_download=force)
		primary = spec_files.primary_path
		if primary is None:
		raise ConversionError(f"No document files found for spec {normalized_id}")
		else:
		normalized_id = normalize_tdoc_id(document_id)
		tdoc_files = fetch_tdoc_files(normalized_id, force_download=force)
		primary = tdoc_files.primary_path

src/tdoc_crawler/extraction/fetch_spec.py

0 → 100644

+88 −0

Original line number	Diff line number	Diff line
		"""Fetch spec documents from checkout or 3GPP sources.

		Analogous to fetch_tdoc.py but for specification documents.
		Resolves spec files by number (e.g. "26.131") to their checked-out files.
		"""

		from __future__ import annotations

		from dataclasses import dataclass
		from pathlib import Path

		from tdoc_crawler.config.settings import PathConfig
		from tdoc_crawler.logging import get_logger
		from tdoc_crawler.utils.normalization import normalize_spec_number

		logger = get_logger(__name__)


		@dataclass
		class SpecFiles:
		    """Paths to the document files available for one spec.

		    Attributes:
		        checkout_dir: Directory the document files were looked up in.
		        docx_path: Path to a ``.docx`` file, or None when none is recorded.
		        doc_path: Path to a ``.doc`` file, or None when none is recorded.
		        pdf_path: Path to a ``.pdf`` file, or None when none is recorded.
		    """

		    checkout_dir: Path
		    docx_path: Path | None = None
		    doc_path: Path | None = None
		    pdf_path: Path | None = None

		    @property
		    def primary_path(self) -> Path | None:
		        """Return the primary document path, preferring PDF > DOCX > DOC.

		        Returns:
		            The first path that is set, in that order of preference, or
		            None when no document path is set at all.
		        """
		        return self.pdf_path or self.docx_path or self.doc_path


		def fetch_spec_files(spec_number: str, force_download: bool = False) -> SpecFiles:
		    """Locate the checked-out document files for a spec.

		    Nothing is downloaded here. The function builds the path
		    ``<checkout_dir>/Specs/archive/<series>_series/<spec_number>/`` from the
		    configured checkout directory and scans it for document files.

		    Args:
		        spec_number: Spec number (e.g., "26.131"). It is passed through
		            ``normalize_spec_number`` before the path is built.
		        force_download: Ignored for specs — files are looked up from checkout.

		    Returns:
		        SpecFiles for the spec directory. Its path fields are all None when
		        the directory holds no .pdf/.docx/.doc file, so callers should check
		        ``primary_path`` rather than rely on an exception.

		    Raises:
		        FileNotFoundError: If the spec directory does not exist or the path
		            is not a directory.
		    """
		    checkout_dir = PathConfig().checkout_dir
		    normalized = normalize_spec_number(spec_number)
		    # The series folder is named after the part before the first dot,
		    # so a normalized number like "26.131" maps to "26_series".
		    series = f"{normalized.split('.', maxsplit=1)[0]}_series"

		    # Spec checkout path: checkout_dir/Specs/archive/{series}/{spec_number}/
		    spec_dir = checkout_dir / "Specs" / "archive" / series / normalized

		    # is_dir() instead of exists(): a stray regular file at this path cannot
		    # contain spec documents and must be reported the same way as a missing
		    # directory instead of producing an empty result.
		    if not spec_dir.is_dir():
		        raise FileNotFoundError(f"Spec {normalized} not found at {spec_dir}")

		    return _scan_spec_dir(spec_dir)


		def _scan_spec_dir(spec_dir: Path) -> SpecFiles:
		    """Scan a spec checkout directory for available document files.

		    Walks the tree below ``spec_dir`` in sorted path order and records files
		    with a .pdf, .docx or .doc suffix (compared case-insensitively). Files
		    whose name starts with "." and files located inside a ".ai" directory
		    below ``spec_dir`` are skipped. When several files share a suffix, the
		    one that sorts last is the one kept.

		    Args:
		        spec_dir: Directory to scan.

		    Returns:
		        SpecFiles with ``checkout_dir`` set to ``spec_dir``. All document
		        paths stay None when ``spec_dir`` is not a directory or contains no
		        matching file.
		    """
		    files = SpecFiles(checkout_dir=spec_dir)

		    if not spec_dir.is_dir():
		        return files

		    for file_path in sorted(spec_dir.rglob("*")):
		        if file_path.name.startswith("."):
		            continue
		        # Only look at the components below spec_dir: a ".ai" directory
		        # somewhere above the checkout must not hide every spec file.
		        if ".ai" in file_path.relative_to(spec_dir).parts:
		            continue
		        if not file_path.is_file():
		            continue

		        suffix = file_path.suffix.lower()
		        if suffix == ".pdf":
		            files.pdf_path = file_path
		        elif suffix == ".docx":
		            files.docx_path = file_path
		        elif suffix == ".doc":
		            files.doc_path = file_path

		    return files


		__all__ = ["SpecFiles", "fetch_spec_files"]