Commit d9548de1 authored by Jan Reimes's avatar Jan Reimes
Browse files

feat(workspace): add auto-crawl option for workspace document addition

* Introduce AutoCrawlSpecsOption to enable/disable auto-crawling of spec metadata.
* Implement _resolve_spec_release_for_add function to resolve spec releases during workspace addition.
* Update workspace_add command to utilize resolved releases for added documents.
* Add resolve_spec_release function to handle spec release resolution with optional metadata crawling.
parent eca05647
Loading
Loading
Loading
Loading
+28 −1
Original line number Diff line number Diff line
@@ -17,6 +17,7 @@ from tdoc_crawler.cli._shared import console, ensure_hybrid_server_for_profile
from tdoc_crawler.cli.args import (
    AgendaPatternExcludeOption,
    AgendaPatternOption,
    AutoCrawlSpecsOption,
    EndDateOption,
    LimitOption,
    ReleaseOption,
@@ -35,6 +36,7 @@ from tdoc_crawler.extraction.profiles import DEFAULT_EXTRACTION_PROFILE, Extract
from tdoc_crawler.logging import get_logger
from tdoc_crawler.models.base import OutputFormat, SortOrder
from tdoc_crawler.models.workspaces import SourceKind
from tdoc_crawler.specs.operations.checkout import resolve_spec_release
from tdoc_crawler.tdocs.models import TDocQueryConfig
from tdoc_crawler.utils.date_parser import parse_partial_date
from tdoc_crawler.workspaces import (
@@ -262,6 +264,29 @@ def workspace_process(
    console.print(f"\n[yellow]Processing complete: {processed} succeeded, {failed} failed[/yellow]")


def _resolve_spec_release_for_add(
    item: str,
    release: str,
    source_kind: SourceKind,
    auto_crawl: bool,
) -> str:
    """Resolve a spec release selector to a concrete version for workspace add.

    Resolution is best-effort: any failure is reported on the console as a
    warning and the caller gets the original *release* back, so adding a
    document never aborts because its release could not be resolved.

    Args:
        item: Identifier of the item being added; passed to
            ``resolve_spec_release`` as the spec number.
        release: Release selector supplied by the user.
        source_kind: Kind of the item; only ``SourceKind.SPEC`` is resolved.
        auto_crawl: Forwarded to ``resolve_spec_release`` as ``auto_crawl``.

    Returns:
        The resolved version for specs, or *release* unchanged for non-spec
        kinds or when resolution fails.
    """
    if source_kind != SourceKind.SPEC:
        return release

    # Keep the try body down to the single call that is expected to fail.
    try:
        resolved = asyncio.run(resolve_spec_release(item, release, auto_crawl=auto_crawl))
    except ValueError as exc:
        console.print(f"[yellow]  Warning: {exc}. Using unresolved release '{release}'.[/yellow]")
    except Exception as exc:
        # Deliberately broad: this is a CLI boundary and resolution is optional.
        console.print(f"[yellow]  Warning: Failed to resolve release for {item}: {exc}[/yellow]")
    else:
        # Separate the selector from the resolved version so the two values
        # do not run together in the output.
        console.print(f"[dim]  Resolved {item} release '{release}' -> {resolved}[/dim]")
        return resolved
    return release


@app.command("add", help="Add documents to an existing workspace.")
def workspace_add(
    items: WorkspaceItemsArgument = None,
@@ -278,6 +303,7 @@ def workspace_add(
    source_ex: SourcePatternExcludeOption = None,
    title: TitlePatternOption = None,
    title_ex: TitlePatternExcludeOption = None,
    auto_crawl_specs: AutoCrawlSpecsOption = True,
) -> None:
    """Add documents to a workspace.

@@ -356,13 +382,14 @@ def workspace_add(
    elif items:
        # Direct mode: add items by ID
        for item in items:
            resolved_release = _resolve_spec_release_for_add(item, release, source_kind, auto_crawl_specs)
            members.append(
                make_workspace_member(
                    source_item_id=item,
                    source_path=item,
                    source_kind=source_kind,
                    added_by="cli",
                    release=release,
                    release=resolved_release,
                )
            )
    else:
+5 −0
Original line number Diff line number Diff line
@@ -147,6 +147,11 @@ HttpCacheOption = Annotated[
    ),
]

# Boolean on/off flag pair (--auto-crawl-specs / --no-auto-crawl-specs).
# No default is declared here; the command parameter annotated with this
# alias supplies it.
AutoCrawlSpecsOption = Annotated[
    bool,
    typer.Option("--auto-crawl-specs/--no-auto-crawl-specs", help="Auto-crawl spec metadata when not in database"),
]

NoProgressOption = Annotated[
    bool,
    typer.Option("--no-progress", help="Disable progress bar (useful for scripts and CI)"),
+55 −0
Original line number Diff line number Diff line
@@ -10,11 +10,65 @@ import shutil
from pathlib import Path
from typing import cast

from tdoc_crawler.config.settings import PathConfig
from tdoc_crawler.database.specs import SpecDatabase
from tdoc_crawler.specs.downloads import SpecDownloads
from tdoc_crawler.specs.sources.base import FunctionSpecSource, SpecSource
from tdoc_crawler.specs.sources.threegpp import fetch_threegpp_metadata
from tdoc_crawler.specs.sources.whatthespec import fetch_whatthespec_metadata
from tdoc_crawler.utils.normalization import normalize_spec_number, resolve_release_to_full_version


async def resolve_spec_release(
    spec_number: str,
    release: str,
    *,
    auto_crawl: bool = True,
    cache_manager_name: str | None = None,
) -> str:
    """Turn a release selector into a concrete version string for one spec.

    The versions recorded for the spec are read from the spec database and
    handed, together with *release*, to ``resolve_release_to_full_version``.
    When the database has no versions for the spec and *auto_crawl* is
    enabled, the spec is crawled once through the default spec sources and
    the versions are read again if the crawl stored anything.

    Args:
        spec_number: Spec number in any format (e.g., ``"26.260"``, ``"26260"``).
        release: Release selector (e.g., ``"latest"``, ``"17"``, ``"17.1.0"``).
        auto_crawl: Crawl spec metadata when the database has no versions.
        cache_manager_name: Optional cache manager name passed on when
            building the default spec sources.

    Returns:
        The version string produced by ``resolve_release_to_full_version``.

    Raises:
        ValueError: If no versions are known for the spec, even after an
            optional crawl.
    """
    spec_id = normalize_spec_number(spec_number)
    database_path = PathConfig().db_file

    async def _load_known_versions() -> list[str]:
        # Open a short-lived connection per lookup, as the crawl below does.
        async with SpecDatabase(database_path) as db:
            records = await db.get_spec_versions(spec_id)
        return [record.version for record in records]

    known = await _load_known_versions()

    if auto_crawl and not known:
        spec_sources = build_default_spec_sources(cache_manager_name=cache_manager_name)
        async with SpecDatabase(database_path) as db:
            crawl_results = await db.crawl_specs([spec_id], release, spec_sources)
        # Re-query only when at least one crawl result was actually stored.
        if any(outcome.status == "stored" and outcome.latest_version for outcome in crawl_results):
            known = await _load_known_versions()

    if known:
        return resolve_release_to_full_version(release, known)
    raise ValueError(f"No versions found for spec {spec_id} (release={release})")


def clear_checkout_specs(checkout_dir: Path) -> int:
@@ -93,4 +147,5 @@ __all__ = [
    "checkout_specs",
    "checkout_specs_async",
    "clear_checkout_specs",
    "resolve_spec_release",
]