src/tdoc_crawler/cli/_workspace_commands.py  +1 −0

@@ -244,6 +244,7 @@ def workspace_process(
         result_path = convert_for_wiki(
             document_id=source_id,
             wiki_source_dir=wiki_source_dir,
+            source_kind=member.source_kind,
             profile=extraction_profile,
             force=force,
         )
src/tdoc_crawler/extraction/convert.py  +20 −8

@@ -15,12 +15,14 @@ import opendataloader_pdf
 from rich.console import Console

 from tdoc_crawler.extraction.conversion import ensure_pdf
+from tdoc_crawler.extraction.fetch_spec import fetch_spec_files
 from tdoc_crawler.extraction.fetch_tdoc import fetch_tdoc_files
 from tdoc_crawler.extraction.hybrid_server import ensure_hybrid_server
 from tdoc_crawler.extraction.metrics import MetricType, get_metrics_tracker, timed_operation
 from tdoc_crawler.extraction.profiles import DEFAULT_EXTRACTION_PROFILE, ExtractionProfile
+from tdoc_crawler.models.workspaces import SourceKind
 from tdoc_crawler.tdocs.sources.whatthespec import resolve_via_whatthespec
-from tdoc_crawler.utils.normalization import normalize_tdoc_id
+from tdoc_crawler.utils.normalization import normalize_spec_number, normalize_tdoc_id

 logger = logging.getLogger(__name__)
 console = Console()

@@ -183,6 +185,7 @@ def convert_for_wiki(
     document_id: str,
     wiki_source_dir: Path,
     *,
+    source_kind: SourceKind = SourceKind.TDOC,
     profile: ExtractionProfile | None = None,
     force: bool = False,
 ) -> Path | None:

@@ -193,8 +196,9 @@ def convert_for_wiki(
     consistency: the source_pdf in JSON points to the actual PDF that was analyzed.

     Args:
-        document_id: Document identifier (TDoc ID).
+        document_id: Document identifier (TDoc ID or spec number).
         wiki_source_dir: Target directory under wiki/<workspace>/sources/<doc-id>/.
+        source_kind: Type of source document (TDOC or SPEC). Defaults to TDOC.
         profile: Extraction profile to use. Defaults to DEFAULT_EXTRACTION_PROFILE.
         force: Force reconversion.

@@ -207,7 +211,15 @@
     wiki_source_dir.mkdir(parents=True, exist_ok=True)

-    # TDoc pipeline
+    # Resolve primary document based on source kind
+    primary: Path | None = None
+    if source_kind == SourceKind.SPEC:
+        normalized_id = normalize_spec_number(document_id)
+        spec_files = fetch_spec_files(normalized_id, force_download=force)
+        primary = spec_files.primary_path
+        if primary is None:
+            raise ConversionError(f"No document files found for spec {normalized_id}")
+    else:
+        normalized_id = normalize_tdoc_id(document_id)
+        tdoc_files = fetch_tdoc_files(normalized_id, force_download=force)
+        primary = tdoc_files.primary_path
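For orientation, a minimal call sketch of the new branch follows. It is not taken from the diff: the spec number, workspace name, and target path are placeholders, and only the convert_for_wiki() keyword arguments shown above are used.

# Hypothetical call sketch: convert a checked-out spec instead of a TDoc by
# passing source_kind=SourceKind.SPEC. Paths and IDs below are illustrative.
from pathlib import Path

from tdoc_crawler.extraction.convert import convert_for_wiki
from tdoc_crawler.models.workspaces import SourceKind

# Target directory follows the documented wiki/<workspace>/sources/<doc-id>/ layout;
# "demo" is a placeholder workspace name.
result_path = convert_for_wiki(
    document_id="26.131",
    wiki_source_dir=Path("wiki/demo/sources/26.131"),
    source_kind=SourceKind.SPEC,
    force=False,
)
if result_path is not None:
    print(f"Converted spec written to {result_path}")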
src/tdoc_crawler/extraction/fetch_spec.py  0 → 100644 (new file)  +88 −0

"""Fetch spec documents from checkout or 3GPP sources.

Analogous to fetch_tdoc.py but for specification documents.
Resolves spec files by number (e.g. "26.131") to their checked-out files.
"""

from __future__ import annotations

from dataclasses import dataclass
from pathlib import Path

from tdoc_crawler.config.settings import PathConfig
from tdoc_crawler.logging import get_logger
from tdoc_crawler.utils.normalization import normalize_spec_number

logger = get_logger(__name__)


@dataclass
class SpecFiles:
    """Paths to available spec file types."""

    checkout_dir: Path
    docx_path: Path | None = None
    doc_path: Path | None = None
    pdf_path: Path | None = None

    @property
    def primary_path(self) -> Path | None:
        """Return the primary document path, preferring PDF > DOCX > DOC."""
        return self.pdf_path or self.docx_path or self.doc_path


def fetch_spec_files(spec_number: str, force_download: bool = False) -> SpecFiles:
    """Fetch spec files from local checkout directory.

    Specs are checked out during the ``workspace add`` checkout phase.
    This function locates the checked-out files for a given spec number.

    Args:
        spec_number: Spec number (e.g., "26.131").
        force_download: Ignored for specs — files are looked up from checkout.

    Returns:
        SpecFiles with paths to available documents.

    Raises:
        FileNotFoundError: If spec directory or files cannot be found.
    """
    checkout_dir = PathConfig().checkout_dir
    normalized = normalize_spec_number(spec_number)
    series = f"{normalized.split('.', maxsplit=1)[0]}_series"

    # Spec checkout path: checkout_dir/Specs/archive/{series}/{spec_number}/
    spec_dir = checkout_dir / "Specs" / "archive" / series / normalized
    if not spec_dir.exists():
        raise FileNotFoundError(f"Spec {normalized} not found at {spec_dir}")

    return _scan_spec_dir(spec_dir)


def _scan_spec_dir(spec_dir: Path) -> SpecFiles:
    """Scan a spec checkout directory for available document files.

    Searches the directory tree for document files, excluding hidden files
    and the .ai subfolder.
    """
    files = SpecFiles(checkout_dir=spec_dir)
    if spec_dir.is_dir():
        for file_path in sorted(spec_dir.rglob("*")):
            if file_path.is_file() and not file_path.name.startswith("."):
                # Skip files inside .ai directories
                if ".ai" in file_path.parts:
                    continue
                suffix = file_path.suffix.lower()
                if suffix == ".pdf":
                    files.pdf_path = file_path
                elif suffix == ".docx":
                    files.docx_path = file_path
                elif suffix == ".doc":
                    files.doc_path = file_path
    return files


__all__ = ["SpecFiles", "fetch_spec_files"]
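As a quick check of the lookup logic, a hedged usage sketch follows. It assumes spec 26.131 has already been checked out under PathConfig().checkout_dir/Specs/archive/26_series/26.131/, per the docstrings above; the spec number is only an example.

# Usage sketch (illustrative only): resolve checked-out files for a spec number.
from tdoc_crawler.extraction.fetch_spec import fetch_spec_files

try:
    spec_files = fetch_spec_files("26.131")
except FileNotFoundError as exc:
    # Raised when the spec has not been checked out yet.
    print(f"Spec not available locally: {exc}")
else:
    # primary_path prefers PDF over DOCX over DOC and is None if nothing was found.
    print(f"Checkout dir: {spec_files.checkout_dir}")
    print(f"Primary file: {spec_files.primary_path}")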