Commit 2ffa247f authored by Jan Reimes's avatar Jan Reimes
Browse files

♻️ refactor(cli): move helper functions to utils module

parent 55b4ab48
Loading
Loading
Loading
Loading
+1 −35
Original line number Diff line number Diff line
@@ -37,43 +37,9 @@ Assume `cli/` could be separated as an optional package. If a function would be
| `app.py` | Typer command definitions and CLI entry points |
| `args.py` | Typer Annotated types for arguments and options |
| `console.py` | Rich Console singleton for CLI output |
| `helpers.py` | Helper functions - check classification rules above |
| `fetching.py` | TDoc fetching - check classification rules below |
| `utils.py` | Helper functions - check classification rules above |
| `printing.py` | Table and output formatting for CLI |

## Function Classification

### `helpers.py`

**CLI Functions (stay in cli/):**

- `parse_working_groups()` - CLI argument parsing
- `parse_subgroups()` - CLI argument parsing
- `collect_spec_numbers()` - CLI stdin/file input handling
- `build_limits()` - CLI config builder wrapper
- `launch_file()` - System calls for opening files
- `resolve_http_cache_config()` - CLI/env var configuration parsing
- `infer_working_groups_from_ids()` - CLI string inference

**Library Functions (moved to core):**

- `normalize_portal_meeting_name()``tdoc_crawler.specs.normalization`
- `resolve_meeting_id()``tdoc_crawler.database`
- `download_to_path()``tdoc_crawler.http_client`
- `prepare_tdoc_file()``tdoc_crawler.checkout`
- `database_path()``tdoc_crawler.database`

### `fetching.py`

**CLI Functions (stay in cli/):**

- `fetch_missing_tdocs()` - Uses CLI console output
- `_fetch_via_whatthespec()` - Uses CLI console output
- `maybe_fetch_missing_tdocs()` - CLI console and flag handling

**Library Functions:**

- `fetch_tdoc()` - Import from `tdoc_crawler.fetching` (not duplicated in CLI)

## Lessons Learned

+29 −30
Original line number Diff line number Diff line
@@ -26,6 +26,7 @@ from tdoc_crawler.checkout import (
from tdoc_crawler.cli.args import (
    DEFAULT_VERBOSITY,
    CacheDirOption,
    CheckoutDirOption,
    CheckoutOption,
    CheckoutTDocIdsArgument,
    ClearDbOption,
@@ -64,7 +65,6 @@ from tdoc_crawler.cli.args import (
    WorkingGroupOption,
)
from tdoc_crawler.cli.console import get_console
from tdoc_crawler.cli.helpers import build_limits, collect_spec_numbers, launch_file, parse_subgroups, parse_working_groups
from tdoc_crawler.cli.printing import (
    meeting_to_dict,
    print_checkout_results,
@@ -76,17 +76,17 @@ from tdoc_crawler.cli.printing import (
    spec_query_to_dict,
    tdoc_to_dict,
)
from tdoc_crawler.cli.utils import launch_file
from tdoc_crawler.config import CacheManager
from tdoc_crawler.crawlers import MeetingCrawler, TDocCrawler
from tdoc_crawler.credentials import resolve_credentials, set_credentials
from tdoc_crawler.database import SpecDatabase, TDocDatabase
from tdoc_crawler.database import TDocDatabase
from tdoc_crawler.fetching import fetch_missing_tdocs
from tdoc_crawler.http_client import create_cached_session
from tdoc_crawler.logging import set_verbosity
from tdoc_crawler.models import MeetingCrawlConfig, MeetingQueryConfig, OutputFormat, QueryConfig, SortOrder, TDocCrawlConfig
from tdoc_crawler.models.specs import SpecQueryFilters
from tdoc_crawler.specs import SpecCatalog
from tdoc_crawler.specs.downloads import SpecDownloads
from tdoc_crawler.models import CrawlLimits, MeetingCrawlConfig, MeetingQueryConfig, OutputFormat, QueryConfig, SortOrder, SpecQueryFilters, TDocCrawlConfig
from tdoc_crawler.specs import SpecDatabase, SpecDownloads
from tdoc_crawler.utils.parse import collect_spec_numbers, parse_subgroups, parse_working_groups

load_dotenv()

@@ -126,7 +126,8 @@ def crawl_tdocs(

    subgroups = parse_subgroups(subgroup)
    working_groups = parse_working_groups(working_group, subgroups)
    limits = build_limits(limit_tdocs, limit_meetings, limit_meetings_per_wg, limit_wgs)

    limits = CrawlLimits.build(limit_tdocs, limit_meetings, limit_meetings_per_wg, limit_wgs)
    config = TDocCrawlConfig(
        cache_dir=manager.root,
        working_groups=working_groups,
@@ -158,7 +159,7 @@ def crawl_tdocs(
        scope_parts.append(f"working groups: {', '.join(wg.value for wg in working_groups)}")
    console.print(f"[cyan]Crawling TDocs ({', '.join(scope_parts)})[/cyan]")

    with SpecDatabase(db_file) as database:
    with TDocDatabase(db_file) as database:
        checkout_dir = manager.checkout_dir
        # Clear TDocs if requested
        if clear_tdocs:
@@ -275,7 +276,7 @@ def crawl_meetings(

    subgroups = parse_subgroups(subgroup)
    working_groups = parse_working_groups(working_group, subgroups)
    limits = build_limits(None, limit_meetings, limit_meetings_per_wg, limit_wgs)
    limits = CrawlLimits.build(None, limit_meetings, limit_meetings_per_wg, limit_wgs)
    set_credentials(eol_username, eol_password, prompt_credentials)
    config = MeetingCrawlConfig(
        cache_dir=manager.root,
@@ -297,7 +298,7 @@ def crawl_meetings(
        scope_parts.append(f"working groups: {', '.join(wg.value for wg in working_groups)}")
    console.print(f"[cyan]Crawling meetings ({', '.join(scope_parts)})[/cyan]")

    with TDocDatabase(db_file) as database:
    with SpecDatabase(db_file) as database:
        checkout_dir = manager.checkout_dir
        # Clear all data if requested
        if clear_db:
@@ -369,7 +370,7 @@ def crawl_meetings(
            order=SortOrder.DESC,
            include_without_files=False,
        )
        with TDocDatabase(db_file) as database:
        with SpecDatabase(db_file) as database:
            meetings = database.query_meetings(query_config)

        with create_cached_session(manager.http_cache_dir) as session:
@@ -515,7 +516,7 @@ def query_meetings(
    )

    db_file = manager.db_file
    with TDocDatabase(db_file) as database:
    with SpecDatabase(db_file) as database:
        checkout_dir = manager.checkout_dir
        if clear_tdocs:
            deleted_count = database.clear_tdocs()
@@ -589,7 +590,7 @@ def query_specs(
        raise typer.Exit(code=2) from exc

    db_file = manager.db_file
    with TDocDatabase(db_file) as database:
    with SpecDatabase(db_file) as database:
        checkout_dir = manager.checkout_dir
        if clear_tdocs:
            deleted_count = database.clear_tdocs()
@@ -605,8 +606,7 @@ def query_specs(
            if removed_specs:
                console.print("[yellow]Cleared checkout entries for specs[/yellow]")

        catalog = SpecCatalog(database)
        results = catalog.query_specs(filters, release="latest")
        results = database.query_specs(filters)

    if not results:
        console.print("[yellow]No specs found[/yellow]")
@@ -614,7 +614,7 @@ def query_specs(

    if checkout:
        spec_list = [result.spec_number for result in results]
        with TDocDatabase(db_file) as database:
        with SpecDatabase(db_file) as database:
            checkout_specs(spec_list, manager.checkout_dir, database, release="latest")

    if output is OutputFormat.JSON:
@@ -670,13 +670,13 @@ def open_tdoc(
            metadata = results[0]

        try:
            target_file = prepare_tdoc_file(metadata, manager.root, session=session)
            tdoc_file = prepare_tdoc_file(metadata, manager.root, session=session)
        except (FileNotFoundError, OSError, ValueError, zipfile.BadZipFile) as exc:
            console.print(f"[red]Failed to prepare TDoc {normalized_id}: {exc}")
            raise typer.Exit(code=1) from exc

        console.print(f"[green]Opening {target_file}")
        launch_file(target_file)
        console.print(f"[green]Opening {tdoc_file}")
        launch_file(tdoc_file)


@app.command("checkout", rich_help_panel=HELP_PANEL_MAIN)
@@ -763,7 +763,7 @@ def stats(
        console.print(f"[red]Database not found: {db_file}[/red]")
        raise typer.Exit(code=1)

    with TDocDatabase(db_file) as database:
    with SpecDatabase(db_file) as database:
        stats_dict = cast(dict[str, Any], database.get_statistics())

    table = Table(title="TDoc database statistics")
@@ -811,7 +811,7 @@ def crawl_specs(
    sources = build_default_spec_sources()

    db_file = manager.db_file
    with TDocDatabase(db_file) as database:
    with SpecDatabase(db_file) as database:
        checkout_dir = manager.checkout_dir
        if clear_tdocs:
            deleted_count = database.clear_tdocs()
@@ -827,15 +827,14 @@ def crawl_specs(
            if removed_specs:
                console.print("[yellow]Cleared checkout entries for specs[/yellow]")

        catalog = SpecCatalog(database)
        results = catalog.crawl_specs(specs, release, sources)
        results = database.crawl_specs(specs, release, sources)

    if not results:
        console.print("[yellow]No specs crawled[/yellow]")
        return

    if checkout:
        with TDocDatabase(db_file) as database:
        with SpecDatabase(db_file) as database:
            checkout_specs(
                [result.spec_number for result in results],
                manager.checkout_dir,
@@ -857,7 +856,7 @@ def checkout_spec(
    spec_file: SpecFileOption = None,
    release: ReleaseOption = "latest",
    doc_only: DocOnlyOption = False,
    # checkout_dir: CheckoutDirOption = None,
    checkout_dir: CheckoutDirOption = None,
    cache_dir: CacheDirOption = None,
    verbosity: VerbosityOption = DEFAULT_VERBOSITY,
) -> None:
@@ -876,7 +875,7 @@ def checkout_spec(
    sources = build_default_spec_sources()

    db_file = manager.db_file
    with TDocDatabase(db_file) as database:
    with SpecDatabase(db_file) as database:
        downloader = SpecDownloads(database)
        results = downloader.checkout_specs(specs, doc_only, effective_checkout_dir, release, sources=sources)

@@ -901,12 +900,12 @@ def open_spec(
    sources = build_default_spec_sources()

    db_file = manager.db_file
    with TDocDatabase(db_file) as database:
    with SpecDatabase(db_file) as database:
        downloader = SpecDownloads(database)
        try:
            path = downloader.open_spec(normalized, doc_only, checkout_dir, release, sources=sources)
            console.print(f"[green]Opening {path}[/green]")
            launch_file(path)
            spec_file = downloader.open_spec(normalized, doc_only, checkout_dir, release, sources=sources)
            console.print(f"[green]Opening {spec_file}[/green]")
            launch_file(spec_file)
        except Exception as exc:
            console.print(f"[red]Failed to open spec: {exc}[/red]")
            raise typer.Exit(code=1)
+1 −0
Original line number Diff line number Diff line
@@ -78,6 +78,7 @@ ClearDbOption = Annotated[bool, typer.Option("--clear-db", help="Clear all meeti
CheckoutOption = Annotated[
    bool, typer.Option("--checkout/--no-checkout", help="Download and extract metadata results to checkout folder", envvar="TDC_CHECKOUT")
]
CheckoutDirOption = Annotated[Path | None, typer.Option("--checkout-dir", help="Directory for checkout files", envvar="TDC_CHECKOUT_DIR")]
WorkersOption = Annotated[int, typer.Option("--workers", help="Number of parallel subinterpreter workers", envvar="TDC_WORKERS")]
MaxRetriesOption = Annotated[int, typer.Option("--max-retries", help="HTTP retry attempts", envvar="TDC_MAX_RETRIES")]
TimeoutOption = Annotated[int, typer.Option("--timeout", help="HTTP timeout seconds", envvar="TDC_TIMEOUT")]

src/tdoc_crawler/cli/helpers.py

deleted 100644 → 0
+0 −223
Original line number Diff line number Diff line
"""Helper functions for CLI operations."""

from __future__ import annotations

import os
import subprocess
import sys
from collections.abc import Iterable
from pathlib import Path

import click
import typer

from tdoc_crawler.cli.console import get_console
from tdoc_crawler.crawlers import normalize_subgroup_alias, normalize_working_group_alias
from tdoc_crawler.database import TDocDatabase
from tdoc_crawler.logging import get_logger
from tdoc_crawler.models import CrawlLimits, HttpCacheConfig, MeetingQueryConfig, SortOrder, WorkingGroup
from tdoc_crawler.specs.normalization import expand_spec_ranges_batch, normalize_portal_meeting_name

console = get_console()
_logger = get_logger(__name__)

DEFAULT_DATABASE_FILENAME = "tdoc_crawler.db"
DOWNLOAD_TIMEOUT = 60
ALLOWED_DOWNLOAD_SCHEMES = ("ftp://", "http://", "https://")


def infer_working_groups_from_subgroups(subgroups: list[str]) -> list[WorkingGroup]:
    """Infer working groups from subgroup codes.

    The first character of each subgroup code selects the working group
    (R -> RAN, S -> SA, C -> CT), mirroring the prefix-map approach used by
    ``infer_working_groups_from_ids`` for consistency.

    Args:
        subgroups: List of subgroup codes (e.g., ["S4", "R1"])

    Returns:
        List of inferred working groups without duplicates; all working
        groups when nothing could be inferred.
    """
    prefix_map = {
        "R": WorkingGroup.RAN,
        "S": WorkingGroup.SA,
        "C": WorkingGroup.CT,
    }
    working_groups: list[WorkingGroup] = []
    for subgroup in subgroups:
        if not subgroup:
            # Empty strings carry no prefix to classify.
            continue
        wg = prefix_map.get(subgroup[0].upper())
        if wg is not None and wg not in working_groups:
            working_groups.append(wg)

    # Fall back to all working groups when no prefix matched.
    return working_groups or [WorkingGroup.RAN, WorkingGroup.SA, WorkingGroup.CT]


def parse_working_groups(values: list[str] | None, subgroups: list[str] | None = None) -> list[WorkingGroup]:
    """Parse and normalize working group names, expanding plenary aliases.

    Args:
        values: Explicit working group values from CLI
        subgroups: Optional subgroup list to infer working groups from

    Returns:
        List of working groups to crawl
    """
    if not values:
        # No explicit groups: infer from subgroups when given, else crawl all.
        if subgroups:
            return infer_working_groups_from_subgroups(subgroups)
        return [WorkingGroup.RAN, WorkingGroup.SA, WorkingGroup.CT]

    groups: list[WorkingGroup] = []
    for raw in values:
        # Alias normalization maps plenary names (RP->RAN, SP->SA, CP->CT).
        canonical = normalize_working_group_alias(raw)
        try:
            groups.append(WorkingGroup(canonical.upper()))
        except ValueError as exc:
            console.print(f"[red]Unknown working group: {raw}")
            raise typer.Exit(code=2) from exc

    if not groups:
        console.print("[red]No valid working groups specified")
        raise typer.Exit(code=2)
    return groups


def parse_subgroups(values: list[str] | None) -> list[str] | None:
    """Parse and normalize subgroup aliases to canonical names."""
    if not values:
        return None

    resolved: list[str] = []
    for item in values:
        normalized = normalize_subgroup_alias(item)
        if not normalized:
            console.print(f"[red]Unknown subgroup: {item}")
            raise typer.Exit(code=2)
        resolved.extend(normalized)

    return resolved


def collect_spec_numbers(specs: list[str] | None, spec_file: Path | None) -> list[str]:
    """Collect spec numbers from CLI arguments or a file."""
    collected: list[str] = []

    if specs:
        for spec in specs:
            if spec == "-":
                # Read from stdin
                for line in sys.stdin:
                    line_stripped = line.strip()
                    if line_stripped:
                        collected.append(line_stripped)
            else:
                collected.append(spec.strip())

    if spec_file and spec_file.exists():
        try:
            with spec_file.open("r", encoding="utf-8") as f:
                for line in f:
                    line_stripped = line.strip()
                    if line_stripped:
                        collected.append(line_stripped)
        except OSError as exc:
            raise click.FileError(str(spec_file), hint=f"Cannot read spec file: {exc}")

    if not collected:
        return []

    try:
        expanded = expand_spec_ranges_batch(collected)
    except ValueError as e:
        raise click.UsageError(str(e))

    return expanded


def build_limits(
    limit_tdocs: int | None,
    limit_meetings: int | None,
    limit_meetings_per_wg: int | None,
    limit_wgs: int | None,
) -> CrawlLimits:
    """Build CrawlLimits configuration from individual parameters."""
    # Thin adapter: gather the per-dimension limits from the CLI options
    # and forward them to CrawlLimits as keyword arguments.
    limit_kwargs = {
        "limit_tdocs": limit_tdocs,
        "limit_meetings": limit_meetings,
        "limit_meetings_per_wg": limit_meetings_per_wg,
        "limit_wgs": limit_wgs,
    }
    return CrawlLimits(**limit_kwargs)


def infer_working_groups_from_ids(ids: Iterable[str]) -> list[WorkingGroup]:
    """Infer working groups from TDoc IDs based on first character."""
    prefix_map = {
        "R": WorkingGroup.RAN,
        "S": WorkingGroup.SA,
        "T": WorkingGroup.CT,
        "C": WorkingGroup.CT,
    }
    # dict keys preserve first-seen order while deduplicating groups.
    seen: dict[WorkingGroup, None] = {}
    for tdoc_id in ids:
        if not tdoc_id:
            continue
        group = prefix_map.get(tdoc_id[0].upper())
        if group is not None:
            seen.setdefault(group, None)
    return list(seen) or [WorkingGroup.RAN, WorkingGroup.SA, WorkingGroup.CT]


def launch_file(path: Path) -> None:
    """Launch file in system's default application.

    Exits with code 1 when the file is missing or the opener fails.
    """
    if not path.exists():
        console.print(f"[red]File not found: {path}")
        raise typer.Exit(code=1)
    try:
        if sys.platform.startswith("win"):
            # Windows: delegate to the shell's file-association handler.
            os.startfile(path)  # noqa: S606
            return
        # macOS and other Unix-likes use an absolute-path opener binary.
        opener = Path("/usr/bin/open") if sys.platform == "darwin" else Path("/usr/bin/xdg-open")
        if opener.exists():
            subprocess.run([str(opener), str(path)], check=False)  # noqa: S603
        elif sys.platform == "darwin":
            console.print("[yellow]/usr/bin/open not available[/yellow]")
        else:
            console.print("[yellow]xdg-open command not available[/yellow]")
    except OSError as exc:
        console.print(f"[red]Failed to open file: {exc}")
        raise typer.Exit(code=1) from exc


def resolve_http_cache_config(cache_ttl: int | None = None, cache_refresh_on_access: bool | None = None) -> HttpCacheConfig:
    """Resolve HTTP cache configuration from CLI parameters and environment variables.

    Precedence per setting: CLI parameter, then environment variable
    (HTTP_CACHE_TTL / HTTP_CACHE_REFRESH_ON_ACCESS), then built-in default.

    Args:
        cache_ttl: TTL for cache entries (CLI parameter)
        cache_refresh_on_access: Whether to refresh TTL on access (CLI parameter)

    Returns:
        HttpCacheConfig instance with resolved values
    """
    ttl = cache_ttl
    if ttl is None:
        env_ttl = os.getenv("HTTP_CACHE_TTL")
        # NOTE(review): a non-numeric HTTP_CACHE_TTL raises ValueError here —
        # confirm whether a friendlier error is wanted. Default: 7200s.
        ttl = int(env_ttl) if env_ttl else 7200

    refresh = cache_refresh_on_access
    if refresh is None:
        env_refresh = os.getenv("HTTP_CACHE_REFRESH_ON_ACCESS", "").lower()
        # Accept the common truthy spellings; unset defaults to True.
        refresh = env_refresh in ("true", "1", "yes", "on", "t", "y") if env_refresh else True

    return HttpCacheConfig(ttl=ttl, refresh_ttl_on_access=refresh)
+40 −0
Original line number Diff line number Diff line
"""CLI utilities."""

from __future__ import annotations

import os
import subprocess
import sys
from pathlib import Path

import typer

from tdoc_crawler.cli.console import get_console

console = get_console()


def launch_file(path: Path) -> None:
    """Launch file in system's default application.

    Args:
        path: File to open; must exist on disk.

    Raises:
        typer.Exit: With code 1 if the file is missing or opening fails.
    """
    if not path.exists():
        console.print(f"[red]File not found: {path}")
        raise typer.Exit(code=1)
    try:
        if sys.platform.startswith("win"):
            # Windows: use the shell's file-association handler.
            os.startfile(path)  # noqa: S606
        elif sys.platform == "darwin":
            # macOS: absolute path avoids a PATH lookup for the opener.
            open_cmd = Path("/usr/bin/open")
            if open_cmd.exists():
                subprocess.run([str(open_cmd), str(path)], check=False)  # noqa: S603
            else:
                console.print("[yellow]/usr/bin/open not available[/yellow]")
        else:
            # Linux and other Unix-like systems
            xdg_cmd = Path("/usr/bin/xdg-open")
            if xdg_cmd.exists():
                subprocess.run([str(xdg_cmd), str(path)], check=False)  # noqa: S603
            else:
                console.print("[yellow]xdg-open command not available[/yellow]")
    except OSError as exc:
        console.print(f"[red]Failed to open file: {exc}")
        raise typer.Exit(code=1) from exc