Commit 15f7d5bc authored by Jan Reimes

♻️ refactor(cli): split crawl and query commands into subpackages

parent 83cbe215

tdoc_crawler/cli/crawl/__init__.py  +14 −0

"""Crawl command functions — re-exported from submodules."""

from tdoc_crawler.cli.crawl.meetings import crawl_meetings
from tdoc_crawler.cli.crawl.specs import crawl_specs
from tdoc_crawler.cli.crawl.tdocs import crawl_tdocs

HELP_PANEL = "Crawling Commands"

__all__ = [
    "HELP_PANEL",
    "crawl_meetings",
    "crawl_specs",
    "crawl_tdocs",
]
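
The package re-export keeps the old flat module's public surface, so downstream imports stay unchanged. How these commands attach to the CLI is outside this commit; a minimal sketch, assuming a top-level Typer `app` and kebab-case command names (both hypothetical):

import typer

from tdoc_crawler.cli.crawl import HELP_PANEL, crawl_meetings, crawl_specs, crawl_tdocs

app = typer.Typer()

# rich_help_panel groups all three commands under "Crawling Commands" in --help.
app.command("crawl-tdocs", rich_help_panel=HELP_PANEL)(crawl_tdocs)
app.command("crawl-meetings", rich_help_panel=HELP_PANEL)(crawl_meetings)
app.command("crawl-specs", rich_help_panel=HELP_PANEL)(crawl_specs)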

tdoc_crawler/cli/crawl/_helpers.py  +57 −0

"""Crawl shared helpers."""

from __future__ import annotations

from datetime import date

from tdoc_crawler.cli._shared import console
from tdoc_crawler.meetings.models import MeetingMetadata
from tdoc_crawler.meetings.operations.crawl import MeetingCrawlResult
from tdoc_crawler.models.subworking_groups import SUBTB_INDEX
from tdoc_crawler.models.working_groups import WorkingGroup
from tdoc_crawler.tdocs.operations.crawl import TDocCrawlResult
from tdoc_crawler.utils.date_parser import parse_partial_date


def _parse_date(date_str: str | None, is_end: bool = False) -> date | None:
    """Parse a partial date string, returning None for empty/whitespace input."""
    if not date_str or not date_str.strip():
        return None
    return parse_partial_date(date_str, is_end=is_end)
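

# Guard behavior, illustrated. The partial-date expansion itself is delegated
# to parse_partial_date, so the expansions below are assumptions (a bare year
# presumably snaps to the start of the range, or its end when is_end=True):
#   _parse_date(None)                 -> None
#   _parse_date("   ")                -> None
#   _parse_date("2024")               -> date(2024, 1, 1)    (assumed)
#   _parse_date("2024", is_end=True)  -> date(2024, 12, 31)  (assumed)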


def _build_scope_description(
    meetings: list[MeetingMetadata],
    subgroups: list[str] | None,
    working_groups: list[WorkingGroup],
) -> list[str]:
    scope_parts = []
    if meetings:
        unique_subgroups = {SUBTB_INDEX[m.subtb].code for m in meetings if m.subtb and m.subtb in SUBTB_INDEX}
        if unique_subgroups:
            scope_parts.append(f"subgroups: {', '.join(sorted(unique_subgroups))}")
        else:
            scope_parts.append(f"meetings: {len(meetings)} meeting(s)")
    elif subgroups:
        scope_parts.append(f"subgroups: {', '.join(subgroups)}")
    else:
        scope_parts.append(f"working groups: {', '.join(wg.value for wg in working_groups)}")
    return scope_parts
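

# Precedence, illustrated: subgroup codes resolved from the meetings win,
# then explicit subgroups, then working groups (WorkingGroup.SA4 with value
# "SA4" is an assumed member):
#   _build_scope_description([], ["S4", "SA4"], [])         -> ["subgroups: S4, SA4"]
#   _build_scope_description([], None, [WorkingGroup.SA4])  -> ["working groups: SA4"]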


def _print_crawl_result(result: TDocCrawlResult, throughput: float) -> None:
    console.print(f"[green]Processed {result.processed} TDocs ({throughput:.1f} TDocs/sec)[/green]")
    console.print(f"[green]Inserted {result.inserted}, updated {result.updated}[/green]")
    if result.errors:
        console.print(f"[yellow]{len(result.errors)} issues detected[/yellow]")
        for error in result.errors[:5]:
            console.print(f"  - {error}")


def _print_meeting_crawl_result(result: MeetingCrawlResult) -> None:
    console.print(f"[green]Processed {result.processed} meetings[/green]")
    console.print(f"[green]Inserted {result.inserted}, updated {result.updated}[/green]")
    if result.errors:
        console.print(f"[yellow]{len(result.errors)} issues detected[/yellow]")
        for error in result.errors[:5]:
            console.print(f"  - {error}")

tdoc_crawler/cli/crawl/meetings.py  +158 −0

"""Crawl meetings command."""

from __future__ import annotations

import asyncio

from tdoc_crawler.cli._shared import console, create_progress_bar, handle_clear_options
from tdoc_crawler.cli.args import (
    CacheDirOption,
    CheckoutOption,
    ClearDbOption,
    ClearSpecsOption,
    ClearTDocsOption,
    EndDateOption,
    EolPasswordOption,
    EolUsernameOption,
    IncludeWithoutFilesOption,
    IncrementalOption,
    LimitMeetingsOption,
    LimitMeetingsPerSubWgOption,
    LimitSubWgsOption,
    MaxRetriesOption,
    PromptCredentialsOption,
    StartDateOption,
    SubgroupOption,
    TimeoutOption,
    VerbosityOption,
    WorkingGroupOption,
)
from tdoc_crawler.cli.crawl._helpers import _parse_date, _print_meeting_crawl_result
from tdoc_crawler.config import ThreeGPPConfig
from tdoc_crawler.credentials import set_credentials
from tdoc_crawler.database import MeetingDatabase
from tdoc_crawler.http_client import create_cached_session
from tdoc_crawler.logging import DEFAULT_LEVEL as DEFAULT_VERBOSITY
from tdoc_crawler.logging import set_verbosity
from tdoc_crawler.meetings.models import MeetingCrawlConfig, MeetingMetadata, MeetingQueryConfig
from tdoc_crawler.meetings.operations.crawl import MeetingCrawler, MeetingCrawlResult
from tdoc_crawler.models.base import SortOrder
from tdoc_crawler.tdocs.operations.checkout import checkout_meeting_tdocs
from tdoc_crawler.utils.parse import parse_subgroups, parse_working_groups


def crawl_meetings(
    working_group: WorkingGroupOption = None,
    subgroup: SubgroupOption = None,
    limit_meetings: LimitMeetingsOption = None,
    limit_meetings_per_subwg: LimitMeetingsPerSubWgOption = None,
    limit_subwgs: LimitSubWgsOption = None,
    checkout: CheckoutOption = False,
    incremental: IncrementalOption = True,
    include_without_files: IncludeWithoutFilesOption = False,
    clear_db: ClearDbOption = False,
    clear_tdocs: ClearTDocsOption = False,
    clear_specs: ClearSpecsOption = False,
    timeout: TimeoutOption = 30,
    max_retries: MaxRetriesOption = 3,
    eol_username: EolUsernameOption = None,
    eol_password: EolPasswordOption = None,
    prompt_credentials: PromptCredentialsOption = None,
    start_date: StartDateOption = None,
    end_date: EndDateOption = None,
    cache_dir: CacheDirOption = None,
    verbosity: VerbosityOption = str(DEFAULT_VERBOSITY),
) -> None:
    """Crawl meeting metadata from 3GPP portal."""
    set_verbosity(verbosity)
    set_credentials(eol_username, eol_password, prompt=prompt_credentials)

    crawler_config = ThreeGPPConfig.from_settings()
    if cache_dir is not None:
        crawler_config.path.cache_dir = cache_dir
    crawler_config.ensure_paths()

    subgroups = parse_subgroups(subgroup)
    working_groups = parse_working_groups(working_group, subgroups)

    config = MeetingCrawlConfig(
        working_groups=working_groups,
        subgroups=subgroups,
        incremental=incremental,
        include_without_files=include_without_files,
        max_retries=max_retries,
        timeout=timeout,
        limit_meetings=limit_meetings,
        limit_meetings_per_subwg=limit_meetings_per_subwg,
        limit_subwgs=limit_subwgs,
    )

    db_file = crawler_config.path.db_file

    scope_parts = []
    if subgroups:
        scope_parts.append(f"subgroups: {', '.join(subgroups)}")
    else:
        scope_parts.append(f"working groups: {', '.join(wg.value for wg in working_groups)}")
    console.print(f"[cyan]Crawling meetings ({', '.join(scope_parts)})[/cyan]")

    handle_clear_options(
        db_file,
        crawler_config.path.checkout_dir,
        MeetingDatabase,
        clear_tdocs=clear_tdocs,
        clear_specs=clear_specs,
        clear_db=clear_db,
    )

    async def run_meeting_crawl() -> MeetingCrawlResult:
        async with MeetingDatabase(db_file) as database:
            crawl_id = await database.log_crawl_start(
                "meeting",
                [wg.value for wg in config.working_groups],
                config.incremental,
            )

            crawler = MeetingCrawler(database)

            progress, task = create_progress_bar("[cyan]Crawling meetings...")

            with progress:

                def update_progress(completed: float, total: float) -> None:
                    progress.update(task, completed=completed, total=total)

                result = await crawler.crawl(config, progress_callback=update_progress)

            await database.log_crawl_end(
                crawl_id,
                items_added=result.inserted,
                items_updated=result.updated,
                errors_count=len(result.errors),
            )

            return result

    result = asyncio.run(run_meeting_crawl())

    _print_meeting_crawl_result(result)

    if checkout:
        query_config = MeetingQueryConfig(
            working_groups=working_groups,
            subgroups=subgroups,
            limit=limit_meetings if limit_meetings and limit_meetings > 0 else None,
            order=SortOrder.DESC,
            include_without_files=False,
            start_date=_parse_date(start_date),
            end_date=_parse_date(end_date, is_end=True),
        )

        async def fetch_checkout_meetings() -> list[MeetingMetadata]:
            async with MeetingDatabase(db_file) as database:
                return await database.query_meetings(query_config)

        meetings = asyncio.run(fetch_checkout_meetings())

        with create_cached_session() as session:
            checkout_meeting_tdocs(
                meetings,
                crawler_config.path.checkout_dir,
                crawler_config.path.http_cache_file,
                session=session,
            )
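
The progress wiring above is just a closure over Rich's `Progress.update`. A standalone sketch of the same pattern, assuming `create_progress_bar` returns a `(Progress, TaskID)` pair as its usage here implies:

import time

from rich.progress import Progress

with Progress() as progress:
    task = progress.add_task("[cyan]Crawling meetings...", total=None)

    def update_progress(completed: float, total: float) -> None:
        # The crawler reports absolute counts; total becomes known mid-crawl.
        progress.update(task, completed=completed, total=total)

    # Stand-in for: await crawler.crawl(config, progress_callback=update_progress)
    for done in range(1, 101):
        time.sleep(0.01)
        update_progress(done, 100)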

tdoc_crawler/cli/crawl/specs.py  +95 −0

"""Crawl specs command."""

from __future__ import annotations

import asyncio

import typer

from tdoc_crawler.cli._shared import console, handle_clear_options
from tdoc_crawler.cli.args import (
    CacheDirOption,
    CheckoutOption,
    ClearSpecsOption,
    ClearTDocsOption,
    OutputFormatOption,
    ReleaseOption,
    SpecArgument,
    SpecFileOption,
    VerbosityOption,
)
from tdoc_crawler.cli.formatting import format_output
from tdoc_crawler.cli.printing import print_spec_crawl_table, spec_crawl_to_dict
from tdoc_crawler.config import ThreeGPPConfig
from tdoc_crawler.database.specs import SpecCrawlResult, SpecDatabase
from tdoc_crawler.logging import DEFAULT_LEVEL as DEFAULT_VERBOSITY
from tdoc_crawler.logging import set_verbosity
from tdoc_crawler.models.base import OutputFormat
from tdoc_crawler.specs.operations.checkout import build_default_spec_sources, checkout_specs
from tdoc_crawler.utils.parse import collect_spec_numbers


def crawl_specs(
    spec_numbers: SpecArgument = None,
    release: ReleaseOption = "latest",
    checkout: CheckoutOption = False,
    output_format: OutputFormatOption = OutputFormat.TABLE.value,
    clear_tdocs: ClearTDocsOption = False,
    clear_specs: ClearSpecsOption = False,
    spec_file: SpecFileOption = None,
    cache_dir: CacheDirOption = None,
    verbosity: VerbosityOption = str(DEFAULT_VERBOSITY),
) -> None:
    """Crawl spec metadata from configured sources."""
    set_verbosity(verbosity)
    crawler_config = ThreeGPPConfig.from_settings()
    if cache_dir is not None:
        crawler_config.path.cache_dir = cache_dir
    crawler_config.ensure_paths()
    spec_numbers = spec_numbers or []

    specs = collect_spec_numbers(spec_numbers, spec_file)
    try:
        output = OutputFormat(output_format.lower())
    except ValueError as exc:
        console.print("[red]Invalid output format; use table, json, ison, toon, or yaml")
        raise typer.Exit(code=2) from exc

    sources = build_default_spec_sources()

    handle_clear_options(
        crawler_config.path.db_file,
        crawler_config.path.checkout_dir,
        SpecDatabase,
        clear_tdocs=clear_tdocs,
        clear_specs=clear_specs,
    )

    async def crawl_specs_db() -> list[SpecCrawlResult]:
        async with SpecDatabase(crawler_config.path.db_file) as database:
            return await database.crawl_specs(specs, release, sources)

    results = asyncio.run(crawl_specs_db())

    if not results:
        console.print("[yellow]No specs crawled[/yellow]")
        return

    if checkout:

        async def checkout_specs_db() -> None:
            async with SpecDatabase(crawler_config.path.db_file) as database:
                checkout_specs(
                    [result.spec_number for result in results],
                    crawler_config.path.checkout_dir,
                    database,
                    release=release,
                )

        asyncio.run(checkout_specs_db())

    if output is OutputFormat.TABLE:
        print_spec_crawl_table(results)
    else:
        data = [spec_crawl_to_dict(result) for result in results]
        console.print(format_output(data, output))
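
The format validation leans on the enum constructor raising `ValueError` for unknown values. A standalone sketch of the pattern; the enum members are assumptions mirroring the error message:

import enum

class OutputFormat(str, enum.Enum):
    TABLE = "table"
    JSON = "json"
    ISON = "ison"  # assumed member, mirroring the error message
    TOON = "toon"  # assumed member, mirroring the error message
    YAML = "yaml"

def parse_format(raw: str) -> OutputFormat:
    try:
        # Value lookup; unknown strings raise ValueError.
        return OutputFormat(raw.lower())
    except ValueError as exc:
        raise SystemExit(f"invalid output format: {raw!r}") from exc

assert parse_format("Table") is OutputFormat.TABLE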

tdoc_crawler/cli/crawl.py → tdoc_crawler/cli/crawl/tdocs.py  +203 −0

"""Crawling commands for TDoc, meeting, and spec metadata."""
"""Crawl TDocs command."""

from __future__ import annotations

import asyncio
from datetime import date, datetime

import typer
from datetime import datetime

from tdoc_crawler.cli._shared import console, create_progress_bar, handle_clear_options
from tdoc_crawler.cli.args import (
@@ -13,14 +11,10 @@ from tdoc_crawler.cli.args import (
    AgendaPatternOption,
    CacheDirOption,
    CheckoutOption,
    ClearDbOption,
    ClearSpecsOption,
    ClearTDocsOption,
    EndDateOption,
    EolPasswordOption,
    EolUsernameOption,
    HttpCacheOption,
    IncludeWithoutFilesOption,
    IncrementalOption,
    LimitMeetingsOption,
    LimitMeetingsPerSubWgOption,
@@ -28,13 +22,8 @@ from tdoc_crawler.cli.args import (
    LimitTDocsOption,
    MaxRetriesOption,
    NoProgressOption,
    OutputFormatOption,
    PromptCredentialsOption,
    ReleaseOption,
    SourcePatternExcludeOption,
    SourcePatternOption,
    SpecArgument,
    SpecFileOption,
    StartDateOption,
    SubgroupOption,
    TimeoutOption,
@@ -44,73 +33,18 @@ from tdoc_crawler.cli.args import (
    WorkersOption,
    WorkingGroupOption,
)
from tdoc_crawler.cli.formatting import format_output
from tdoc_crawler.cli.printing import print_spec_crawl_table, spec_crawl_to_dict
from tdoc_crawler.cli.crawl._helpers import _build_scope_description, _parse_date, _print_crawl_result
from tdoc_crawler.config import ThreeGPPConfig
from tdoc_crawler.credentials import set_credentials
from tdoc_crawler.database import MeetingDatabase, TDocDatabase
from tdoc_crawler.database.specs import SpecCrawlResult, SpecDatabase
from tdoc_crawler.http_client import create_cached_session
from tdoc_crawler.logging import DEFAULT_LEVEL as DEFAULT_VERBOSITY
from tdoc_crawler.logging import set_verbosity
from tdoc_crawler.meetings.models import MeetingCrawlConfig, MeetingMetadata, MeetingQueryConfig
from tdoc_crawler.meetings.operations.crawl import MeetingCrawler, MeetingCrawlResult
from tdoc_crawler.models.base import OutputFormat, SortOrder
from tdoc_crawler.models.subworking_groups import SUBTB_INDEX
from tdoc_crawler.models.working_groups import WorkingGroup
from tdoc_crawler.specs.operations.checkout import build_default_spec_sources, checkout_specs
from tdoc_crawler.meetings.models import MeetingQueryConfig
from tdoc_crawler.models.base import SortOrder
from tdoc_crawler.tdocs.models import TDocCrawlConfig, TDocQueryConfig
from tdoc_crawler.tdocs.operations import TDocCrawler
from tdoc_crawler.tdocs.operations.checkout import checkout_meeting_tdocs, checkout_tdocs
from tdoc_crawler.tdocs.operations.crawl import TDocCrawlResult
from tdoc_crawler.utils.date_parser import parse_partial_date
from tdoc_crawler.utils.parse import collect_spec_numbers, parse_subgroups, parse_working_groups

HELP_PANEL = "Crawling Commands"


def _parse_date(date_str: str | None, is_end: bool = False) -> date | None:
    """Parse a partial date string, returning None for empty/whitespace input."""
    if not date_str or not date_str.strip():
        return None
    return parse_partial_date(date_str, is_end=is_end)


def _build_scope_description(
    meetings: list[MeetingMetadata],
    subgroups: list[str] | None,
    working_groups: list[WorkingGroup],
) -> list[str]:
    scope_parts = []
    if meetings:
        unique_subgroups = {SUBTB_INDEX[m.subtb].code for m in meetings if m.subtb and m.subtb in SUBTB_INDEX}
        if unique_subgroups:
            scope_parts.append(f"subgroups: {', '.join(sorted(unique_subgroups))}")
        else:
            scope_parts.append(f"meetings: {len(meetings)} meeting(s)")
    elif subgroups:
        scope_parts.append(f"subgroups: {', '.join(subgroups)}")
    else:
        scope_parts.append(f"working groups: {', '.join(wg.value for wg in working_groups)}")
    return scope_parts


def _print_crawl_result(result: TDocCrawlResult, throughput: float) -> None:
    console.print(f"[green]Processed {result.processed} TDocs ({throughput:.1f} TDocs/sec)[/green]")
    console.print(f"[green]Inserted {result.inserted}, updated {result.updated}[/green]")
    if result.errors:
        console.print(f"[yellow]{len(result.errors)} issues detected[/yellow]")
        for error in result.errors[:5]:
            console.print(f"  - {error}")


def _print_meeting_crawl_result(result: MeetingCrawlResult) -> None:
    console.print(f"[green]Processed {result.processed} meetings[/green]")
    console.print(f"[green]Inserted {result.inserted}, updated {result.updated}[/green]")
    if result.errors:
        console.print(f"[yellow]{len(result.errors)} issues detected[/yellow]")
        for error in result.errors[:5]:
            console.print(f"  - {error}")
from tdoc_crawler.tdocs.operations.checkout import checkout_tdocs
from tdoc_crawler.utils.parse import parse_subgroups, parse_working_groups


def crawl_tdocs(
@@ -148,7 +82,6 @@ def crawl_tdocs(
    set_verbosity(verbosity)

    crawler_config = ThreeGPPConfig.from_settings()
    # Override cache_dir if provided (deprecated but still supported)
    if cache_dir is not None:
        crawler_config.path.cache_dir = cache_dir
    crawler_config.ensure_paths()
@@ -177,7 +110,7 @@ def crawl_tdocs(

    db_file = crawler_config.path.db_file

    async def fetch_meetings() -> list[MeetingMetadata]:
    async def fetch_meetings() -> list:
        async with MeetingDatabase(db_file) as meeting_db:
            query_config = MeetingQueryConfig(
                working_groups=working_groups,
@@ -203,7 +136,7 @@ def crawl_tdocs(
        clear_specs=clear_specs,
    )

    async def run_tdoc_crawl() -> tuple[TDocCrawlResult, float]:
    async def run_tdoc_crawl() -> tuple:
        async with TDocDatabase(db_file) as database:
            crawler = TDocCrawler(database)
            crawl_id = await database.log_crawl_start(
@@ -215,7 +148,6 @@ def crawl_tdocs(
            crawl_start_time = datetime.now()

            if no_progress:
                # No progress bar - just run the crawl
                result = await crawler.crawl(config, progress_callback=None)
            else:
                progress, task = create_progress_bar("[cyan]Crawling TDocs...")
@@ -269,194 +201,3 @@ def crawl_tdocs(

    result, throughput = asyncio.run(run_tdoc_crawl())
    _print_crawl_result(result, throughput)
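

# `throughput` is presumably TDocs per wall-clock second measured from
# crawl_start_time; the exact computation sits in the elided hunk above.
# Assumed shape of the calculation (hypothetical helper, not in this commit):

def _compute_throughput(processed: int, start: datetime) -> float:
    # Guard a zero-length crawl so we never divide by zero.
    elapsed = (datetime.now() - start).total_seconds()
    return processed / elapsed if elapsed > 0 else 0.0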


def crawl_meetings(
    working_group: WorkingGroupOption = None,
    subgroup: SubgroupOption = None,
    limit_meetings: LimitMeetingsOption = None,
    limit_meetings_per_subwg: LimitMeetingsPerSubWgOption = None,
    limit_subwgs: LimitSubWgsOption = None,
    checkout: CheckoutOption = False,
    incremental: IncrementalOption = True,
    include_without_files: IncludeWithoutFilesOption = False,
    clear_db: ClearDbOption = False,
    clear_tdocs: ClearTDocsOption = False,
    clear_specs: ClearSpecsOption = False,
    timeout: TimeoutOption = 30,
    max_retries: MaxRetriesOption = 3,
    eol_username: EolUsernameOption = None,
    eol_password: EolPasswordOption = None,
    prompt_credentials: PromptCredentialsOption = None,
    start_date: StartDateOption = None,
    end_date: EndDateOption = None,
    cache_dir: CacheDirOption = None,
    verbosity: VerbosityOption = str(DEFAULT_VERBOSITY),
) -> None:
    """Crawl meeting metadata from 3GPP portal."""
    set_verbosity(verbosity)
    set_credentials(eol_username, eol_password, prompt=prompt_credentials)

    crawler_config = ThreeGPPConfig.from_settings()
    if cache_dir is not None:
        crawler_config.path.cache_dir = cache_dir
    crawler_config.ensure_paths()

    subgroups = parse_subgroups(subgroup)
    working_groups = parse_working_groups(working_group, subgroups)

    config = MeetingCrawlConfig(
        working_groups=working_groups,
        subgroups=subgroups,
        incremental=incremental,
        include_without_files=include_without_files,
        max_retries=max_retries,
        timeout=timeout,
        limit_meetings=limit_meetings,
        limit_meetings_per_subwg=limit_meetings_per_subwg,
        limit_subwgs=limit_subwgs,
    )

    db_file = crawler_config.path.db_file

    scope_parts = []
    if subgroups:
        scope_parts.append(f"subgroups: {', '.join(subgroups)}")
    else:
        scope_parts.append(f"working groups: {', '.join(wg.value for wg in working_groups)}")
    console.print(f"[cyan]Crawling meetings ({', '.join(scope_parts)})[/cyan]")

    handle_clear_options(
        db_file,
        crawler_config.path.checkout_dir,
        MeetingDatabase,
        clear_tdocs=clear_tdocs,
        clear_specs=clear_specs,
        clear_db=clear_db,
    )

    async def run_meeting_crawl() -> MeetingCrawlResult:
        async with MeetingDatabase(db_file) as database:
            crawl_id = await database.log_crawl_start(
                "meeting",
                [wg.value for wg in config.working_groups],
                config.incremental,
            )

            crawler = MeetingCrawler(database)

            progress, task = create_progress_bar("[cyan]Crawling meetings...")

            with progress:

                def update_progress(completed: float, total: float) -> None:
                    progress.update(task, completed=completed, total=total)

                result = await crawler.crawl(config, progress_callback=update_progress)

            await database.log_crawl_end(
                crawl_id,
                items_added=result.inserted,
                items_updated=result.updated,
                errors_count=len(result.errors),
            )

            return result

    result = asyncio.run(run_meeting_crawl())

    _print_meeting_crawl_result(result)

    if checkout:
        query_config = MeetingQueryConfig(
            working_groups=working_groups,
            subgroups=subgroups,
            limit=limit_meetings if limit_meetings and limit_meetings > 0 else None,
            order=SortOrder.DESC,
            include_without_files=False,
            start_date=_parse_date(start_date),
            end_date=_parse_date(end_date, is_end=True),
        )

        async def fetch_checkout_meetings() -> list[MeetingMetadata]:
            async with MeetingDatabase(db_file) as database:
                return await database.query_meetings(query_config)

        meetings = asyncio.run(fetch_checkout_meetings())

        with create_cached_session() as session:
            checkout_meeting_tdocs(meetings, crawler_config.path.checkout_dir, crawler_config.path.http_cache_file, session=session)


def crawl_specs(
    spec_numbers: SpecArgument = None,
    release: ReleaseOption = "latest",
    checkout: CheckoutOption = False,
    output_format: OutputFormatOption = OutputFormat.TABLE.value,
    clear_tdocs: ClearTDocsOption = False,
    clear_specs: ClearSpecsOption = False,
    spec_file: SpecFileOption = None,
    cache_dir: CacheDirOption = None,
    verbosity: VerbosityOption = str(DEFAULT_VERBOSITY),
) -> None:
    """Crawl spec metadata from configured sources."""
    set_verbosity(verbosity)
    crawler_config = ThreeGPPConfig.from_settings()
    if cache_dir is not None:
        crawler_config.path.cache_dir = cache_dir
    crawler_config.ensure_paths()
    spec_numbers = spec_numbers or []

    specs = collect_spec_numbers(spec_numbers, spec_file)
    try:
        output = OutputFormat(output_format.lower())
    except ValueError as exc:
        console.print("[red]Invalid output format; use table, json, ison, toon, or yaml")
        raise typer.Exit(code=2) from exc

    sources = build_default_spec_sources()

    handle_clear_options(
        crawler_config.path.db_file,
        crawler_config.path.checkout_dir,
        SpecDatabase,
        clear_tdocs=clear_tdocs,
        clear_specs=clear_specs,
    )

    async def crawl_specs_db() -> list[SpecCrawlResult]:
        async with SpecDatabase(crawler_config.path.db_file) as database:
            return await database.crawl_specs(specs, release, sources)

    results = asyncio.run(crawl_specs_db())

    if not results:
        console.print("[yellow]No specs crawled[/yellow]")
        return

    if checkout:

        async def checkout_specs_db() -> None:
            async with SpecDatabase(crawler_config.path.db_file) as database:
                checkout_specs(
                    [result.spec_number for result in results],
                    crawler_config.path.checkout_dir,
                    database,
                    release=release,
                )

        asyncio.run(checkout_specs_db())

    if output is OutputFormat.TABLE:
        print_spec_crawl_table(results)
    else:
        data = [spec_crawl_to_dict(result) for result in results]
        console.print(format_output(data, output))


__all__ = [
    "HELP_PANEL",
    "crawl_meetings",
    "crawl_specs",
    "crawl_tdocs",
]