Commit eca05647 authored by Jan Reimes's avatar Jan Reimes
Browse files

feat(workspace): enhance workspace member handling and spec fetching

* Add release handling in workspace_process and workspace_add functions.
* Remove unused checkout.py file to streamline extraction logic.
* Introduce new normalization functions for spec number extraction.
* Update fetch_spec_files to support downloading based on release.
parent 1af60ce2
Loading
Loading
Loading
Loading
+2 −1
Original line number Diff line number Diff line
@@ -247,6 +247,7 @@ def workspace_process(
                source_kind=member.source_kind,
                profile=extraction_profile,
                force=force,
                release=member.release,
            )
            if result_path:
                console.print(f"[green]  Processed {source_id} -> {result_path.name}[/green]")
@@ -361,7 +362,7 @@ def workspace_add(
                    source_path=item,
                    source_kind=source_kind,
                    added_by="cli",
                    release=release if release != "latest" else None,
                    release=release,
                )
            )
    else:
+0 −96
Original line number Diff line number Diff line
"""Workspace item checkout and registration logic.

Handles the checkout phase of workspace item management:
downloading/fetching documents and creating workspace member records.

This module is intentionally free of document processing logic
(PDF conversion, markdown extraction, VLM). That responsibility
belongs in the `workspace process` command exclusively.
"""

from __future__ import annotations

from pathlib import Path

from tdoc_crawler.config.settings import PathConfig
from tdoc_crawler.config.workspace_registry import WorkspaceMember
from tdoc_crawler.logging import get_logger
from tdoc_crawler.models.workspaces import SourceKind
from tdoc_crawler.utils.normalization import normalize_release_version
from tdoc_crawler.workspaces import (
    checkout_spec_to_workspace,
    checkout_tdoc_to_workspace,
    make_workspace_member,
    resolve_spec_release_from_db,
)

_logger = get_logger(__name__)


async def checkout_single_item(
    *,
    item: str,
    workspace: str,
    source_kind: SourceKind,
    checkout: bool,
    release: str | None,
    path_config: PathConfig,
) -> tuple[WorkspaceMember | None, str | None]:
    """Optionally fetch one workspace item and build its member record.

    When ``checkout`` is true, TDOC and SPEC items are fetched into the
    workspace through the matching checkout helper and the member's source
    path points at the fetched location. For any other source kind, or when
    ``checkout`` is false, the member's source path is the raw ``item``.

    Args:
        item: Item ID to checkout (TDoc ID, spec number, or path).
        workspace: Target workspace name.
        source_kind: Type of source (TDOC, SPEC, OTHER).
        checkout: Whether to download/fetch the document.
        release: Spec release version; only consulted for SPEC items. A
            missing release is passed to the spec checkout as ``"latest"``.
        path_config: Supplies ``checkout_dir`` and ``db_file``.

    Returns:
        A ``(member, skip_reason)`` pair. On success ``member`` is the new
        WorkspaceMember and ``skip_reason`` is ``None``. When the checkout
        helper returns no path, ``member`` is ``None`` and ``skip_reason``
        explains why the item was skipped.
    """
    is_spec = source_kind == SourceKind.SPEC

    # Without a successful checkout the member simply points at the raw item.
    location = item

    if checkout and source_kind == SourceKind.TDOC:
        tdoc_path = await checkout_tdoc_to_workspace(
            item,
            path_config.checkout_dir,
            workspace,
            db_file=path_config.db_file,
        )
        if tdoc_path is None:
            return None, "TDoc not found in database or meeting not crawled"
        location = str(tdoc_path)
    elif checkout and is_spec:
        spec_path = await checkout_spec_to_workspace(
            item,
            path_config.checkout_dir,
            workspace,
            release or "latest",
            db_file=path_config.db_file,
        )
        if spec_path is None:
            return None, "Spec not found in database"
        location = str(spec_path)

    # Spec members carry a "-REL<version>" suffix, but only when a release
    # was requested and the lookup produced a usable value.
    member_id = item
    if is_spec and release:
        resolved, _ = await resolve_spec_release_from_db(item, release)
        if resolved:
            member_id = f"{item}-REL{normalize_release_version(resolved)}"

    return make_workspace_member(member_id, location, source_kind), None


__all__ = ["checkout_single_item"]
+8 −3
Original line number Diff line number Diff line
@@ -23,7 +23,7 @@ from tdoc_crawler.extraction.metrics import MetricType, get_metrics_tracker, tim
from tdoc_crawler.extraction.profiles import DEFAULT_EXTRACTION_PROFILE, ExtractionProfile
from tdoc_crawler.models.workspaces import SourceKind
from tdoc_crawler.tdocs.sources.whatthespec import resolve_via_whatthespec
from tdoc_crawler.utils.normalization import normalize_spec_number, normalize_tdoc_id
from tdoc_crawler.utils.normalization import extract_base_spec_number, extract_release_suffix, normalize_spec_number, normalize_tdoc_id

logger = logging.getLogger(__name__)
console = Console()
@@ -192,6 +192,7 @@ def convert_for_wiki(
    source_kind: SourceKind = SourceKind.TDOC,
    profile: ExtractionProfile | None = None,
    force: bool = False,
    release: str | None = None,
) -> Path | None:
    """Convert a document for wiki ingestion using the specified profile.

@@ -205,6 +206,7 @@ def convert_for_wiki(
        source_kind: Type of source document (TDOC or SPEC). Defaults to TDOC.
        profile: Extraction profile to use. Defaults to DEFAULT_EXTRACTION_PROFILE.
        force: Force reconversion.
        release: Release identifier for spec downloads (e.g., "19.0.0", "latest").

    Returns:
        Path to the primary output file (PDF for pdf-only, MD for default/advanced),
@@ -218,8 +220,11 @@ def convert_for_wiki(
    # Resolve primary document based on source kind
    primary: Path | None = None
    if source_kind == SourceKind.SPEC:
        normalized_id = normalize_spec_number(document_id)
        spec_files = fetch_spec_files(normalized_id, force_download=force)
        base_spec = extract_base_spec_number(document_id)
        normalized_id = normalize_spec_number(base_spec)
        # Resolve release: prefer suffix from document_id, fall back to explicit param
        resolved_release = extract_release_suffix(document_id) or release
        spec_files = fetch_spec_files(normalized_id, release=resolved_release, force_download=force)
        primary = spec_files.primary_path
        if primary is None:
            raise ConversionError(f"No document files found for spec {normalized_id}")
+52 −10
Original line number Diff line number Diff line
@@ -2,15 +2,19 @@

Analogous to fetch_tdoc.py but for specification documents.
Resolves spec files by number (e.g. "26.131") to their checked-out files.
If files are not present locally and a release is provided, downloads from 3GPP FTP.
"""

from __future__ import annotations

import asyncio
from dataclasses import dataclass
from pathlib import Path

from tdoc_crawler.config.settings import PathConfig
from tdoc_crawler.database.specs import SpecDatabase
from tdoc_crawler.logging import get_logger
from tdoc_crawler.specs.downloads import SpecDownloads
from tdoc_crawler.utils.normalization import normalize_spec_number

logger = get_logger(__name__)
@@ -31,21 +35,26 @@ class SpecFiles:
        return self.pdf_path or self.docx_path or self.doc_path


def fetch_spec_files(spec_number: str, force_download: bool = False) -> SpecFiles:
    """Fetch spec files from local checkout directory.
def fetch_spec_files(spec_number: str, release: str | None = None, force_download: bool = False) -> SpecFiles:
    """Fetch spec files from local checkout directory, downloading if needed.

    Specs are checked out during the ``workspace add`` checkout phase.
    This function locates the checked-out files for a given spec number.
    Pipeline:
    1. Check if spec already exists in local checkout (filesystem scan)
    2. If found, return immediately — no network call needed
    3. Otherwise download from 3GPP FTP via SpecDownloads if a release is provided

    Args:
        spec_number: Spec number (e.g., "26.131").
        force_download: Ignored for specs — files are looked up from checkout.
        release: Release identifier (e.g., "19.0.0", "latest"). When ``None``
            and files are not found locally, no download is attempted.
        force_download: Force re-download even if exists locally.

    Returns:
        SpecFiles with paths to available documents.

    Raises:
        FileNotFoundError: If spec directory or files cannot be found.
        FileNotFoundError: If spec directory or files cannot be found and no
            download is possible.
    """
    checkout_dir = PathConfig().checkout_dir
    normalized = normalize_spec_number(spec_number)
@@ -54,10 +63,43 @@ def fetch_spec_files(spec_number: str, force_download: bool = False) -> SpecFile
    # Spec checkout path: checkout_dir/Specs/archive/{series}/{spec_number}/
    spec_dir = checkout_dir / "Specs" / "archive" / series / normalized

    if not spec_dir.exists():
    # Step 1: Check local checkout first
    if spec_dir.exists() and not force_download:
        files = _scan_spec_dir(spec_dir)
        if files.primary_path is not None:
            return files

    # Step 2: Download from 3GPP FTP if release is available
    if release is not None or force_download:
        _download_spec(normalized, release or "latest", checkout_dir)

        # Re-scan after download
        if spec_dir.exists():
            files = _scan_spec_dir(spec_dir)
            if files.primary_path is not None:
                return files

    raise FileNotFoundError(f"Spec {normalized} not found at {spec_dir}")

    return _scan_spec_dir(spec_dir)

def _download_spec(spec_number: str, release: str, checkout_dir: Path) -> None:
    """Best-effort checkout of one spec into ``checkout_dir``.

    Opens a ``SpecDatabase`` on the configured database file for the duration
    of the operation, hands it to ``SpecDownloads`` and runs the async
    checkout to completion with ``asyncio.run``. Any exception raised while
    running the checkout is logged as a warning and not re-raised, so callers
    have to inspect the checkout directory to learn whether it succeeded.

    Args:
        spec_number: Spec number forwarded as the single entry of ``specs``.
        release: Release identifier forwarded to the checkout call.
        checkout_dir: Directory the spec is checked out into.
    """
    database_path = PathConfig().db_file

    async def _checkout() -> None:
        # The database is only kept open while the checkout is in flight.
        async with SpecDatabase(database_path) as spec_db:
            await SpecDownloads(spec_db).checkout_specs_async(
                specs=[spec_number],
                doc_only=False,
                checkout_dir=checkout_dir,
                release=release,
            )

    try:
        asyncio.run(_checkout())
    except Exception as exc:
        logger.warning("Failed to download spec %s (release=%s): %s", spec_number, release, exc)


def _scan_spec_dir(spec_dir: Path) -> SpecFiles:
+60 −0
Original line number Diff line number Diff line
@@ -11,6 +11,7 @@ _UNDOTTED_BODY_PATTERN = re.compile(r"^(?P<series>\d{2})(?P<increment>\d{1,3})$"
_OFFSET_PATTERN = re.compile(r"^(?P<left>.+?)\s*\+\s*(?P<offset>-?\d+)\s*$")
_RANGE_SPLIT_PATTERN = re.compile(r"\s*([-:])\s*")
_RELEASE_PREFIX_PATTERN = re.compile(r"^(?:v|rel|rel[-])", re.IGNORECASE)
_SPEC_REL_SUFFIX_PATTERN = re.compile(r"^(?P<spec>\d+(?:\.\d+)?)-REL(?P<version>[\d.]+)$", re.IGNORECASE)


def normalize_release_label(release: str | None) -> str | None:
@@ -104,6 +105,65 @@ def normalize_spec_number(value: str) -> str:
    return f"{series}.{increment}"


def normalize_spec_number_compact(value: str) -> str:
    """Return the spec number as series and increment joined without a dot.

    The value is parsed with ``_parse_spec_number``; only the first two of
    its four results (series and increment) are used.

    Args:
        value: Spec number in dotted or undotted form.

    Returns:
        Compact spec number (e.g., "26132").

    Raises:
        ValueError: When the spec number is not in a supported format.
    """
    parsed = _parse_spec_number(value)
    series, increment, _, _ = parsed
    return f"{series}{increment}"


def extract_base_spec_number(source_item_id: str) -> str:
    """Strip a trailing ``-REL<version>`` marker from a source item ID.

    The identifier is matched as a whole against the spec/release suffix
    pattern. On a match only the spec-number part is returned; any other
    identifier is handed back unchanged.

    Examples:
        ``"26260-REL19.0.0"`` → ``"26260"``
        ``"26.260-REL19.0.0"`` → ``"26.260"``
        ``"S4-250638"`` → ``"S4-250638"``

    Args:
        source_item_id: Workspace source item identifier.

    Returns:
        Spec number portion without the release suffix.
    """
    found = _SPEC_REL_SUFFIX_PATTERN.match(source_item_id)
    return source_item_id if found is None else found["spec"]


def extract_release_suffix(source_item_id: str) -> str | None:
    """Return the version that follows ``-REL`` in a source item ID.

    The identifier is matched as a whole against the spec/release suffix
    pattern; identifiers that do not match yield ``None``.

    Examples:
        ``"26260-REL19.0.0"`` → ``"19.0.0"``
        ``"26.260-REL17.0.0"`` → ``"17.0.0"``
        ``"26260"`` → ``None``

    Args:
        source_item_id: Workspace source item identifier.

    Returns:
        Release version string, or ``None`` when no ``-REL`` suffix is present.
    """
    found = _SPEC_REL_SUFFIX_PATTERN.match(source_item_id)
    return None if found is None else found["version"]


def expand_spec_ranges(spec_input: str) -> Generator[str]:
    """Expand spec range syntax into individual spec numbers.

Loading