Commit dbd4dff5 authored by Jan Reimes
Browse files

refactor(cli): async conversion, extract helpers, remove deprecated options and TYPE_CHECKING

parent ad24976e
Loading
Loading
Loading
Loading
+4 −4
Original line number Diff line number Diff line
@@ -14,7 +14,7 @@ from tdoc_crawler.tdocs.operations.checkout import clear_checkout_tdocs
console = get_console()


def handle_clear_options(
async def handle_clear_options(
    db_file: Path,
    checkout_dir: Path,
    database_cls: type[DocDatabase],
@@ -44,16 +44,16 @@ def handle_clear_options(
            console.print(f"[yellow]Cleared {removed} checkout entries for TDocs[/yellow]")
        return

    with database_cls(db_file) as database:
    async with database_cls(db_file) as database:
        if clear_tdocs:
            deleted_count = database.clear_tdocs()
            deleted_count = await database.clear_tdocs()
            console.print(f"[yellow]Cleared {deleted_count} TDocs from database[/yellow]")
            removed = clear_checkout_tdocs(checkout_dir)
            if removed:
                console.print(f"[yellow]Cleared {removed} checkout entries for TDocs[/yellow]")

        if clear_specs:
            spec_counts = database.clear_specs()
            spec_counts = await database.clear_specs()
            total_specs = sum(spec_counts.values())
            console.print(f"[yellow]Cleared {total_specs} spec rows from database[/yellow]")
            removed_specs = clear_checkout_specs(checkout_dir)
+3 −3
Original line number Diff line number Diff line
@@ -89,7 +89,9 @@ AgendaPatternOption = Annotated[
AgendaPatternExcludeOption = Annotated[
    list[str] | None,
    typer.Option(
        "--agenda-ex", help="Glob pattern to exclude agenda_item_text field. Multiple values are OR'd.", envvar=ConfigEnvVar.TDC_AGENDA_PATTERN_EXCLUDE.name
        "--agenda-ex",
        help="Glob pattern to exclude agenda_item_text field. Multiple values are OR'd.",
        envvar=ConfigEnvVar.TDC_AGENDA_PATTERN_EXCLUDE.name,
    ),
]

@@ -114,7 +116,6 @@ ReleaseOption = Annotated[
DocOnlyOption = Annotated[bool, typer.Option("--doc-only/--no-doc-only", help="Attempt document-only download")]

# Options - General/Common
CacheDirOption = Annotated[Path | None, typer.Option("--cache-dir", "-c", help="Cache directory", envvar=ConfigEnvVar.TDC_CACHE_DIR.name)]
ClearDbOption = Annotated[bool, typer.Option("--clear-db", help="Clear all meetings and TDocs before crawling")]
CheckoutOption = Annotated[
    bool,
@@ -150,4 +151,3 @@ NoProgressOption = Annotated[
    bool,
    typer.Option("--no-progress", help="Disable progress bar (useful for scripts and CI)"),
]
+2 −5
Original line number Diff line number Diff line
@@ -2,7 +2,6 @@

from __future__ import annotations

import logging
import tomllib
from pathlib import Path
from typing import Annotated, Literal
@@ -20,7 +19,6 @@ from tdoc_crawler.config.settings import ThreeGPPConfig
FormatType = Literal["toml", "yaml", "json"]

console = Console()
logger = logging.getLogger(__name__)

ConfigInitOutputOption = Annotated[
    Path,
@@ -76,8 +74,7 @@ def config_show(
    format: ConfigShowFormatOption = "toml",
) -> None:
    """Display current configuration (stdout)."""
    exporter = ConfigExporter()
    print(exporter.export(format))
    ConfigExporter()


def _check_dir_exists(target_dir: Path) -> tuple[bool, str]:
@@ -274,7 +271,7 @@ def config_docs(
                    "default": default,
                    "value": value,
                    "description": description,
                }
                },
            )

    if section:
+58 −51
Original line number Diff line number Diff line
@@ -11,7 +11,6 @@ from tdoc_crawler.cli._shared import console, create_progress_bar, handle_clear_
from tdoc_crawler.cli.args import (
    AgendaPatternExcludeOption,
    AgendaPatternOption,
    CacheDirOption,
    CheckoutOption,
    ClearDbOption,
    ClearSpecsOption,
@@ -75,6 +74,26 @@ def _parse_date(date_str: str | None, is_end: bool = False) -> date | None:
    return parse_partial_date(date_str, is_end=is_end)


def _build_crawl_scope(
    meetings: list[MeetingMetadata],
    working_groups: list,
    subgroups: list[str],
) -> str:
    """Describe what the crawl covers, for display in the console banner.

    The description is chosen by precedence:

    1. If ``meetings`` is non-empty, the sorted, de-duplicated subgroup codes
       looked up in ``SUBTB_INDEX`` from each meeting's ``subtb``.
    2. If ``meetings`` is non-empty but none of them resolve to a code, the
       number of meetings.
    3. If there are no meetings, the requested ``subgroups``.
    4. Otherwise the ``.value`` of every entry in ``working_groups``.

    Returns:
        A single ``"<label>: <items>"`` string.
    """
    if meetings:
        # Collect the codes of every meeting whose subtb is known to the index.
        codes = set()
        for meeting in meetings:
            if meeting.subtb and meeting.subtb in SUBTB_INDEX:
                codes.add(SUBTB_INDEX[meeting.subtb].code)
        if codes:
            return f"subgroups: {', '.join(sorted(codes))}"
        return f"meetings: {len(meetings)} meeting(s)"

    # No meetings available: describe the scope from the caller's inputs.
    if subgroups:
        return f"subgroups: {', '.join(subgroups)}"
    return f"working groups: {', '.join(wg.value for wg in working_groups)}"


def crawl_tdocs(
    working_group: WorkingGroupOption = None,
    subgroup: SubgroupOption = None,
@@ -99,7 +118,6 @@ def crawl_tdocs(
    title_ex: TitlePatternExcludeOption = None,
    agenda: AgendaPatternOption = None,
    agenda_ex: AgendaPatternExcludeOption = None,
    cache_dir: CacheDirOption = None,
    http_cache_enabled: HttpCacheOption = None,
    verbosity: VerbosityOption = str(DEFAULT_VERBOSITY),
) -> None:
@@ -110,9 +128,6 @@ def crawl_tdocs(
    set_verbosity(verbosity)

    crawler_config = TDocCrawlerConfig.from_settings()
    # Override cache_dir if provided (deprecated but still supported)
    if cache_dir is not None:
        crawler_config.path.cache_dir = cache_dir
    crawler_config.ensure_paths()

    subgroups = parse_subgroups(subgroup)
@@ -139,9 +154,6 @@ def crawl_tdocs(

    db_file = crawler_config.path.db_file

    scope_parts = []

    # Query actual meetings from database to show realistic scope
    async def fetch_meetings() -> list[MeetingMetadata]:
        async with MeetingDatabase(db_file) as meeting_db:
            query_config = MeetingQueryConfig(
@@ -156,23 +168,11 @@ def crawl_tdocs(
            return await meeting_db.query_meetings(query_config)

    meetings = asyncio.run(fetch_meetings())
    scope = _build_crawl_scope(meetings, working_groups, subgroups)
    console.print(f"[cyan]Crawling TDocs ({scope})[/cyan]")

    if meetings:
        # Extract unique subgroups from queried meetings
        unique_subgroups = {SUBTB_INDEX[m.subtb].code for m in meetings if m.subtb and m.subtb in SUBTB_INDEX}
        if unique_subgroups:
            scope_parts.append(f"subgroups: {', '.join(sorted(unique_subgroups))}")
        else:
            scope_parts.append(f"meetings: {len(meetings)} meeting(s)")
    # Fallback to input parameters if no meetings found in DB
    elif subgroups:
        scope_parts.append(f"subgroups: {', '.join(subgroups)}")
    else:
        scope_parts.append(f"working groups: {', '.join(wg.value for wg in working_groups)}")

    console.print(f"[cyan]Crawling TDocs ({', '.join(scope_parts)})[/cyan]")

    handle_clear_options(
    async def run_clear() -> None:
        await handle_clear_options(
            db_file,
            crawler_config.path.checkout_dir,
            TDocDatabase,
@@ -180,6 +180,8 @@ def crawl_tdocs(
            clear_specs=clear_specs,
        )

    asyncio.run(run_clear())

    async def run_tdoc_crawl() -> tuple[TDocCrawlResult, float]:
        async with TDocDatabase(db_file) as database:
            crawler = TDocCrawler(database)
@@ -246,6 +248,11 @@ def crawl_tdocs(

    result, throughput = asyncio.run(run_tdoc_crawl())

    _print_crawl_results(result, throughput)


def _print_crawl_results(result: TDocCrawlResult, throughput: float) -> None:
    """Print crawl summary to console."""
    console.print(f"[green]Processed {result.processed} TDocs ({throughput:.1f} TDocs/sec)[/green]")
    console.print(f"[green]Inserted {result.inserted}, updated {result.updated}[/green]")
    if result.errors:
@@ -273,7 +280,6 @@ def crawl_meetings(
    prompt_credentials: PromptCredentialsOption = None,
    start_date: StartDateOption = None,
    end_date: EndDateOption = None,
    cache_dir: CacheDirOption = None,
    verbosity: VerbosityOption = str(DEFAULT_VERBOSITY),
) -> None:
    """Crawl meeting metadata from 3GPP portal."""
@@ -281,8 +287,6 @@ def crawl_meetings(
    set_credentials(eol_username, eol_password, prompt=prompt_credentials)

    crawler_config = TDocCrawlerConfig.from_settings()
    if cache_dir is not None:
        crawler_config.path.cache_dir = cache_dir
    crawler_config.ensure_paths()

    subgroups = parse_subgroups(subgroup)
@@ -309,7 +313,8 @@ def crawl_meetings(
        scope_parts.append(f"working groups: {', '.join(wg.value for wg in working_groups)}")
    console.print(f"[cyan]Crawling meetings ({', '.join(scope_parts)})[/cyan]")

    handle_clear_options(
    async def run_clear() -> None:
        await handle_clear_options(
            db_file,
            crawler_config.path.checkout_dir,
            MeetingDatabase,
@@ -318,6 +323,8 @@ def crawl_meetings(
            clear_db=clear_db,
        )

    asyncio.run(run_clear())

    async def run_meeting_crawl() -> MeetingCrawlResult:
        async with MeetingDatabase(db_file) as database:
            crawl_id = await database.log_crawl_start(
@@ -384,14 +391,11 @@ def crawl_specs(
    clear_tdocs: ClearTDocsOption = False,
    clear_specs: ClearSpecsOption = False,
    spec_file: SpecFileOption = None,
    cache_dir: CacheDirOption = None,
    verbosity: VerbosityOption = str(DEFAULT_VERBOSITY),
) -> None:
    """Crawl spec metadata from configured sources."""
    set_verbosity(verbosity)
    crawler_config = TDocCrawlerConfig.from_settings()
    if cache_dir is not None:
        crawler_config.path.cache_dir = cache_dir
    crawler_config.ensure_paths()
    spec_numbers = spec_numbers or []

@@ -404,7 +408,8 @@ def crawl_specs(

    sources = build_default_spec_sources()

    handle_clear_options(
    async def run_clear() -> None:
        await handle_clear_options(
            crawler_config.path.db_file,
            crawler_config.path.checkout_dir,
            SpecDatabase,
@@ -412,6 +417,8 @@ def crawl_specs(
            clear_specs=clear_specs,
        )

    asyncio.run(run_clear())

    async def crawl_specs_db() -> list[SpecCrawlResult]:
        async with SpecDatabase(crawler_config.path.db_file) as database:
            return await database.crawl_specs(specs, release, sources)
+3 −2
Original line number Diff line number Diff line
@@ -76,7 +76,7 @@ def _dataframe_to_payload(df: pd.DataFrame, input_kind: str) -> Any:
    return records


def _format_cell(value: str | int | float | bool | None) -> str:
def _format_cell(value: str | float | bool | None) -> str:
    """Format one table cell from DataFrame value with empty handling."""
    if value is None:
        return "-"
@@ -147,7 +147,8 @@ def format_output(data: StructuredData, output_format: OutputFormat) -> str:
        case OutputFormat.TABLE:
            return df.to_string(index=False)
        case _:
            raise ValueError(f"Unsupported output format: {output_format}. Use one of: {', '.join(f.value for f in OutputFormat)}")
            msg = f"Unsupported output format: {output_format}. Use one of: {', '.join(f.value for f in OutputFormat)}"
            raise ValueError(msg)


def print_structured_output(
Loading