Commit d3f3a69b authored by Jan Reimes
Browse files

cli(specs): add crawl-specs command and spec input helpers

parent 211f8c6f
Loading
Loading
Loading
Loading
+61 −2
Original line number Diff line number Diff line
@@ -19,6 +19,9 @@ from tdoc_crawler.crawlers import MeetingCrawler, TDocCrawler
from tdoc_crawler.credentials import set_credentials
from tdoc_crawler.database import TDocDatabase
from tdoc_crawler.models import MeetingCrawlConfig, MeetingQueryConfig, OutputFormat, QueryConfig, SortOrder, TDocCrawlConfig
from tdoc_crawler.specs import SpecCatalog
from tdoc_crawler.specs.sources import fetch_threegpp_metadata, fetch_whatthespec_metadata
from tdoc_crawler.specs.sources.base import FunctionSpecSource

from .args import (
    CacheDirOption,
@@ -43,6 +46,9 @@ from .args import (
    OutputFormatOption,
    OverallTimeoutOption,
    PromptCredentialsOption,
    ReleaseOption,
    SpecFileOption,
    SpecOption,
    StartDateOption,
    SubgroupOption,
    TDocIdArgument,
@@ -55,8 +61,23 @@ from .args import (
)
from .console import get_console
from .fetching import maybe_fetch_missing_tdocs
from .helpers import build_limits, database_path, launch_file, parse_subgroups, parse_working_groups, prepare_tdoc_file
from .printing import meeting_to_dict, print_meeting_table, print_tdoc_table, tdoc_to_dict
from .helpers import (
    build_limits,
    collect_spec_numbers,
    database_path,
    launch_file,
    parse_subgroups,
    parse_working_groups,
    prepare_tdoc_file,
)
from .printing import (
    meeting_to_dict,
    print_meeting_table,
    print_spec_crawl_table,
    print_tdoc_table,
    spec_crawl_to_dict,
    tdoc_to_dict,
)

load_dotenv()

@@ -503,6 +524,44 @@ def stats(
    console.print(table)


@app.command("crawl-specs", rich_help_panel=HELP_PANEL_CRAWLING)
def crawl_specs(
    spec: SpecOption = None,
    spec_file: SpecFileOption = None,
    release: ReleaseOption = "latest",
    output_format: OutputFormatOption = OutputFormat.TABLE.value,
    cache_dir: CacheDirOption = DEFAULT_CACHE_DIR,
) -> None:
    """Crawl spec metadata from configured sources."""
    # Flow: gather spec numbers -> validate the output format -> crawl every
    # spec against the two metadata sources -> render as table, JSON or YAML.
    # Exits with code 2 (typer.Exit) when the output format is not a valid
    # OutputFormat value; collect_spec_numbers may exit earlier on its own.
    spec_numbers = collect_spec_numbers(spec, spec_file)

    try:
        chosen_format = OutputFormat(output_format.lower())
    except ValueError as exc:
        console.print("[red]Invalid output format; use table, json, or yaml")
        raise typer.Exit(code=2) from exc

    # Each source pairs a short name with the function that fetches its metadata.
    metadata_sources = [
        FunctionSpecSource(source_name, fetcher)
        for source_name, fetcher in (
            ("3gpp", fetch_threegpp_metadata),
            ("whatthespec", fetch_whatthespec_metadata),
        )
    ]

    # The database is only held open for the duration of the crawl itself.
    with TDocDatabase(database_path(cache_dir)) as database:
        crawl_results = SpecCatalog(database).crawl_specs(spec_numbers, release, metadata_sources)

    if not crawl_results:
        console.print("[yellow]No specs crawled[/yellow]")
        return

    # Anything that is neither JSON nor YAML falls back to the table view.
    if chosen_format is not OutputFormat.JSON and chosen_format is not OutputFormat.YAML:
        print_spec_crawl_table(crawl_results)
        return

    payload = [spec_crawl_to_dict(item) for item in crawl_results]
    if chosen_format is OutputFormat.JSON:
        rendered = json.dumps(payload, indent=2)
    else:
        rendered = yaml.dump(payload, sort_keys=False)
    console.print(rendered)


# Register command aliases: the short names "ct" and "cm" are bound to the same
# callables as the full crawl commands (crawl_tdocs / crawl_meetings). They are
# registered with hidden=True and placed in the same help panel as the crawl commands.
app.command("ct", rich_help_panel=HELP_PANEL_CRAWLING, hidden=True)(crawl_tdocs)
app.command("cm", rich_help_panel=HELP_PANEL_CRAWLING, hidden=True)(crawl_meetings)
+6 −0
Original line number Diff line number Diff line
@@ -74,3 +74,9 @@ IncludeWithoutFilesOption = Annotated[
TDocIdArgument = Annotated[str, typer.Argument(help="TDoc identifier to download and open")]
CheckoutTDocIdsArgument = Annotated[list[str], typer.Argument(help="TDoc identifier(s) to checkout")]
ForceOption = Annotated[bool, typer.Option("--force", "-f", help="Re-download even if already checked out")]

# Spec-related CLI option aliases (Annotated types consumed by Typer commands).

# --spec: zero or more spec numbers collected into a list; None when the option is absent.
SpecOption = Annotated[list[str] | None, typer.Option("--spec", help="Spec number(s) (dotted or undotted)")]
# --spec-file: optional path to a file listing spec numbers.
SpecFileOption = Annotated[Path | None, typer.Option("--spec-file", help="File with spec numbers")]
# --release: free-form release selector string; the default is supplied by each command.
ReleaseOption = Annotated[str, typer.Option("--release", help="Spec release selector")]
# --doc-only / --no-doc-only: boolean flag pair.
DocOnlyOption = Annotated[bool, typer.Option("--doc-only/--no-doc-only", help="Attempt document-only download")]
# --checkout-dir: optional base directory for spec checkouts.
CheckoutDirOption = Annotated[Path | None, typer.Option("--checkout-dir", help="Spec checkout base directory")]
+37 −0
Original line number Diff line number Diff line
@@ -107,6 +107,43 @@ def parse_subgroups(values: list[str] | None) -> list[str] | None:
    return resolved


def collect_spec_numbers(specs: list[str] | None, spec_file: Path | None) -> list[str]:
    """Collect spec numbers from CLI options and stdin.

    Args:
        specs: Spec numbers provided on the command line. Use "-" to read stdin.
        spec_file: Optional file containing spec numbers (one per line).

    Returns:
        List of spec numbers in input order.
    """
    collected: list[str] = []

    if specs:
        for item in specs:
            if item == "-":
                stdin_text = sys.stdin.read()
                collected.extend(line.strip() for line in stdin_text.splitlines() if line.strip())
            else:
                stripped = item.strip()
                if stripped:
                    collected.append(stripped)

    if spec_file is not None:
        try:
            file_text = spec_file.read_text(encoding="utf-8")
        except OSError as exc:
            console.print(f"[red]Failed to read spec file: {exc}")
            raise typer.Exit(code=2) from exc
        collected.extend(line.strip() for line in file_text.splitlines() if line.strip())

    if not collected:
        console.print("[red]No spec numbers provided[/red]")
        raise typer.Exit(code=2)

    return collected


def build_limits(
    limit_tdocs: int | None,
    limit_meetings: int | None,
+43 −0
Original line number Diff line number Diff line
@@ -7,6 +7,7 @@ from typing import Any
from rich.table import Table

from tdoc_crawler.models import MeetingMetadata, TDocMetadata
from tdoc_crawler.specs import SpecCrawlResult

from .console import get_console

@@ -78,3 +79,45 @@ def print_meeting_table(results: list[MeetingMetadata]) -> None:
        )

    console.print(table)


def spec_crawl_to_dict(result: SpecCrawlResult) -> dict[str, Any]:
    """Flatten a SpecCrawlResult into a plain dictionary for JSON/YAML output.

    The top-level keys are ``spec_number``, ``release``, ``status``,
    ``latest_version``, ``sources`` and ``message`` (in that order). ``sources``
    is a list with one dictionary per entry of ``result.sources``, each holding
    ``source_name``, ``status``, ``versions`` and ``message``. Attribute values
    are copied as-is without conversion.
    """
    payload: dict[str, Any] = dict(
        spec_number=result.spec_number,
        release=result.release,
        status=result.status,
        latest_version=result.latest_version,
    )

    # One entry per source outcome, in the order the result lists them.
    source_entries: list[dict[str, Any]] = []
    for outcome in result.sources:
        source_entries.append(
            dict(
                source_name=outcome.source_name,
                status=outcome.status,
                versions=outcome.versions,
                message=outcome.message,
            )
        )

    payload["sources"] = source_entries
    payload["message"] = result.message
    return payload


def print_spec_crawl_table(results: list[SpecCrawlResult]) -> None:
    """Render spec crawl results as a Rich table and print it via ``console``.

    Each row shows the spec number, status, release, latest version ("-" when
    falsy) and a comma-separated ``name:status`` summary of the per-source
    outcomes ("-" when there are none).

    Note: only the first 100 results are rendered as rows, while the table
    title reports the total ``len(results)``.
    """
    table = Table(title=f"Spec crawl results ({len(results)} rows)")

    column_styles = (
        ("Spec", "cyan"),
        ("Status", "magenta"),
        ("Release", "yellow"),
        ("Latest", "green"),
        ("Sources", "blue"),
    )
    for heading, colour in column_styles:
        table.add_column(heading, style=colour)

    for entry in results[:100]:
        source_summary = ", ".join(f"{outcome.source_name}:{outcome.status}" for outcome in entry.sources)
        if not source_summary:
            source_summary = "-"
        latest = entry.latest_version or "-"
        table.add_row(entry.spec_number, entry.status, entry.release, latest, source_summary)

    console.print(table)