Commit 40c77e3f authored by Jan Reimes

♻️ refactor(cli): replace CacheManager with PathConfig and ThreeGPPConfig in all CLI commands

- tdoc_app: remove CacheManager registration from callback; use crawler_config.path.* directly; add load_dotenv() before imports
- crawl: remove CacheManager; load TDocCrawlerConfig.from_settings() in crawl_meetings/crawl_specs and override cache_dir via path.cache_dir
- spec_app: remove CacheManager registration; apply --cache-dir via config.path.cache_dir
- specs/query: replace CacheManager(cache_dir).register() with PathConfig(cache_dir=cache_dir) if cache_dir else PathConfig()
- config/config_app: replace TDocCrawlerConfig references with ThreeGPPConfig
- demo.bat: enable TDC_AI_VLM=1
parent 2b093931
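At a glance, the refactor swaps the registered CacheManager singleton for plain config state. A minimal sketch of the two styles, using only the class and attribute names visible in the diffs below (PathConfig's defaults are an assumption):

```python
from pathlib import Path

from tdoc_crawler.config import PathConfig

# Before (removed): a CacheManager was instantiated and registered globally,
# and every command asked it for paths:
#     manager = CacheManager(cache_dir).register()
#     db_file = manager.db_file

# After: paths are ordinary config state with no registration step.
path = PathConfig(cache_dir=Path("./cache"))
db_file = path.db_file
checkout_dir = path.checkout_dir
```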
+1 −1
@@ -3,7 +3,7 @@ cls
call .venv\scripts\activate.bat

SET TDC_AI_CONVERT_MD=1
-SET TDC_AI_VLM=0
+SET TDC_AI_VLM=1

tdoc-crawler crawl-meetings -s S4
tdoc-crawler crawl --start-date 2016
+5 −11
@@ -3,15 +3,11 @@
from __future__ import annotations

from pathlib import Path
-from typing import TYPE_CHECKING

import typer
from rich.console import Console

-from tdoc_crawler.config import TDocCrawlerConfig
-
-if TYPE_CHECKING:
-    pass
+from tdoc_crawler.config import ThreeGPPConfig

console = Console()

@@ -19,8 +15,8 @@ console = Console()
def load_cli_config(
    ctx: typer.Context,
    config_file: Path | None = None,
-) -> TDocCrawlerConfig:
-    """Load and validate TDocCrawlerConfig for CLI use.
+) -> ThreeGPPConfig:
+    """Load and validate ThreeGPPConfig for CLI use.

    Loads config from (in precedence order):
    1. Explicit --config file (if provided)
@@ -36,19 +32,17 @@ def load_cli_config(
        config_file: Optional explicit config file path from --config

    Returns:
-        Validated TDocCrawlerConfig instance
+        Validated ThreeGPPConfig instance

    Raises:
        typer.Exit: If config validation fails
    """
    try:
        # Load config with optional explicit file
-        config = TDocCrawlerConfig.from_settings(
+        config = ThreeGPPConfig.from_settings(
            config_file=config_file,
            cwd=Path.cwd(),
        )

        # Store in context for subcommand access
        ctx.obj = config

        return config
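For context, a sketch of how a Typer app would wire this helper in. The module that exports load_cli_config is not shown in this commit, so that import path is assumed; model_dump_json is the Pydantic v2 call, inferred from the model_dump usage elsewhere in this commit:

```python
from pathlib import Path

import typer

from tdoc_crawler.cli.config import load_cli_config  # assumed module path
from tdoc_crawler.config import ThreeGPPConfig

app = typer.Typer()

@app.callback()
def main(ctx: typer.Context, config: Path | None = typer.Option(None, "--config")) -> None:
    # Validates the config and stores it on ctx.obj for subcommands.
    load_cli_config(ctx, config_file=config)

@app.command()
def show(ctx: typer.Context) -> None:
    cfg: ThreeGPPConfig = ctx.obj  # set by load_cli_config above
    typer.echo(cfg.model_dump_json(indent=2))
```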
+7 −7
@@ -15,7 +15,7 @@ from rich.table import Table

from tdoc_crawler.cli.constants import HELP_PANEL_CONFIG, SECTION_DESCRIPTIONS
from tdoc_crawler.config.export import ConfigExporter
-from tdoc_crawler.config.settings import TDocCrawlerConfig
+from tdoc_crawler.config.settings import ThreeGPPConfig

FormatType = Literal["toml", "yaml", "json"]

@@ -93,7 +93,7 @@ def _check_path_exists(path: Path) -> tuple[bool, str]:
        return False, f" ({e})"


-def _validate_config_values(config: TDocCrawlerConfig) -> list[tuple[str, str]]:
+def _validate_config_values(config: ThreeGPPConfig) -> list[tuple[str, str]]:
    """Validate config values and return list of (severity, message) tuples."""
    issues: list[tuple[str, str]] = []

@@ -136,7 +136,7 @@ def _validate_config_values(config: TDocCrawlerConfig) -> list[tuple[str, str]]:
    return issues


-def _validate_from_file(file: Path) -> TDocCrawlerConfig:
+def _validate_from_file(file: Path) -> ThreeGPPConfig:
    """Load config from a specific file with validation."""
    if not file.exists():
        rprint(f"[red]Error: File not found:[/red] {file}")
@@ -156,7 +156,7 @@ def _validate_from_file(file: Path) -> TDocCrawlerConfig:

    # Load and validate
    try:
-        return TDocCrawlerConfig.from_settings(config_file=file)
+        return ThreeGPPConfig.from_settings(config_file=file)
    except ValidationError as e:
        rprint(f"[red]Validation error in {file}:[/red]")
        for error in e.errors():
@@ -214,7 +214,7 @@ def config_validate(
        config = _validate_from_file(file)
    else:
        try:
-            config = TDocCrawlerConfig.from_settings()
+            config = ThreeGPPConfig.from_settings()
        except ValidationError as e:
            rprint("[red]Validation error in discovered config:[/red]")
            for error in e.errors():
@@ -245,8 +245,8 @@ def config_docs(
        "crawl": [],
    }

-    # Introspect TDocCrawlerConfig
-    config = TDocCrawlerConfig()
+    # Introspect ThreeGPPConfig
+    config = ThreeGPPConfig()
    config_data = config.model_dump()

    for section_name, section_model in [
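The config_docs introspection above relies on Pydantic's model_dump(), which flattens a nested config into plain dicts per section. A self-contained sketch of the same pattern with stand-in models (the real ThreeGPPConfig sections are only partially visible in this diff):

```python
from pydantic import BaseModel

class PathSection(BaseModel):  # hypothetical stand-in for a config section
    cache_dir: str = "~/.cache/tdoc-crawler"
    db_file: str = "tdocs.sqlite"

class DemoConfig(BaseModel):
    path: PathSection = PathSection()

config = DemoConfig()
for section_name, values in config.model_dump().items():
    print(section_name, values)  # e.g. path {'cache_dir': ..., 'db_file': ...}
```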
+24 −15
@@ -46,7 +46,7 @@ from tdoc_crawler.cli.args import (
)
from tdoc_crawler.cli.formatting import format_output
from tdoc_crawler.cli.printing import print_spec_crawl_table, spec_crawl_to_dict
-from tdoc_crawler.config import CacheManager, TDocCrawlerConfig
+from tdoc_crawler.config import TDocCrawlerConfig
from tdoc_crawler.credentials import set_credentials
from tdoc_crawler.database import MeetingDatabase, TDocDatabase
from tdoc_crawler.database.specs import SpecCrawlResult, SpecDatabase
@@ -56,7 +56,6 @@ from tdoc_crawler.logging import set_verbosity
from tdoc_crawler.meetings.models import MeetingCrawlConfig, MeetingMetadata, MeetingQueryConfig
from tdoc_crawler.meetings.operations.crawl import MeetingCrawler, MeetingCrawlResult
from tdoc_crawler.models.base import OutputFormat, SortOrder
-from tdoc_crawler.models.crawl_limits import CrawlLimits
from tdoc_crawler.models.subworking_groups import SUBTB_INDEX
from tdoc_crawler.specs.operations.checkout import build_default_spec_sources, checkout_specs
from tdoc_crawler.tdocs.models import TDocCrawlConfig, TDocQueryConfig
@@ -131,7 +130,10 @@ def crawl_tdocs(
        overall_timeout=overall_timeout,
        timeout=timeout,
        max_retries=max_retries,
-        limits=CrawlLimits.build(limit_tdocs, limit_meetings, limit_meetings_per_subwg, limit_subwgs),
+        limit_tdocs=limit_tdocs,
+        limit_meetings=limit_meetings,
+        limit_meetings_per_subwg=limit_meetings_per_subwg,
+        limit_subwgs=limit_subwgs,
        target_ids=None,
    )

@@ -278,11 +280,13 @@ def crawl_meetings(
    set_verbosity(verbosity)
    set_credentials(eol_username, eol_password, prompt=prompt_credentials)

-    manager = CacheManager(cache_dir).register()
+    crawler_config = TDocCrawlerConfig.from_settings()
+    if cache_dir is not None:
+        crawler_config.path.cache_dir = cache_dir
+    crawler_config.ensure_paths()

    subgroups = parse_subgroups(subgroup)
    working_groups = parse_working_groups(working_group, subgroups)
-    limits = CrawlLimits.build(None, limit_meetings, limit_meetings_per_subwg, limit_subwgs)

    config = MeetingCrawlConfig(
        working_groups=working_groups,
@@ -291,10 +295,12 @@ def crawl_meetings(
        include_without_files=include_without_files,
        max_retries=max_retries,
        timeout=timeout,
-        limits=limits,
+        limit_meetings=limit_meetings,
+        limit_meetings_per_subwg=limit_meetings_per_subwg,
+        limit_subwgs=limit_subwgs,
    )

-    db_file = manager.db_file
+    db_file = crawler_config.path.db_file

    scope_parts = []
    if subgroups:
@@ -305,7 +311,7 @@ def crawl_meetings(

    handle_clear_options(
        db_file,
-        manager.checkout_dir,
+        crawler_config.path.checkout_dir,
        MeetingDatabase,
        clear_tdocs=clear_tdocs,
        clear_specs=clear_specs,
@@ -367,7 +373,7 @@ def crawl_meetings(
        meetings = asyncio.run(fetch_checkout_meetings())

        with create_cached_session() as session:
-            checkout_meeting_tdocs(meetings, manager.checkout_dir, manager.http_cache_file, session=session)
+            checkout_meeting_tdocs(meetings, crawler_config.path.checkout_dir, crawler_config.path.http_cache_file, session=session)


def crawl_specs(
@@ -383,7 +389,10 @@ def crawl_specs(
) -> None:
    """Crawl spec metadata from configured sources."""
    set_verbosity(verbosity)
-    manager = CacheManager(cache_dir).register()
+    crawler_config = TDocCrawlerConfig.from_settings()
+    if cache_dir is not None:
+        crawler_config.path.cache_dir = cache_dir
+    crawler_config.ensure_paths()
    spec_numbers = spec_numbers or []

    specs = collect_spec_numbers(spec_numbers, spec_file)
@@ -396,15 +405,15 @@ def crawl_specs(
    sources = build_default_spec_sources()

    handle_clear_options(
-        manager.db_file,
-        manager.checkout_dir,
+        crawler_config.path.db_file,
+        crawler_config.path.checkout_dir,
        SpecDatabase,
        clear_tdocs=clear_tdocs,
        clear_specs=clear_specs,
    )

    async def crawl_specs_db() -> list[SpecCrawlResult]:
-        async with SpecDatabase(manager.db_file) as database:
+        async with SpecDatabase(crawler_config.path.db_file) as database:
            return await database.crawl_specs(specs, release, sources)

    results = asyncio.run(crawl_specs_db())
@@ -416,10 +425,10 @@ def crawl_specs(
    if checkout:

        async def checkout_specs_db() -> None:
-            async with SpecDatabase(manager.db_file) as database:
+            async with SpecDatabase(crawler_config.path.db_file) as database:
                checkout_specs(
                    [result.spec_number for result in results],
-                    manager.checkout_dir,
+                    crawler_config.path.checkout_dir,
                    database,
                    release=release,
                )
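Both crawl commands now share the same load-then-override sequence. Extracted as a sketch: the helper name is hypothetical, but the four statements are taken verbatim from the hunks above.

```python
from pathlib import Path

from tdoc_crawler.config import TDocCrawlerConfig

def load_crawl_config(cache_dir: Path | None) -> TDocCrawlerConfig:
    """Hypothetical helper capturing the pattern in crawl_meetings/crawl_specs."""
    crawler_config = TDocCrawlerConfig.from_settings()
    if cache_dir is not None:
        crawler_config.path.cache_dir = cache_dir  # explicit --cache-dir wins
    crawler_config.ensure_paths()  # assumed to create any missing directories
    return crawler_config
```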
+13 −14
@@ -45,7 +45,7 @@ from tdoc_crawler.cli.printing import (
    spec_query_to_dict,
    tdoc_to_dict,
)
-from tdoc_crawler.config import CacheManager
+from tdoc_crawler.config import PathConfig
from tdoc_crawler.database import MeetingDatabase, TDocDatabase
from tdoc_crawler.database.specs import SpecDatabase
from tdoc_crawler.http_client import create_cached_session
@@ -89,7 +89,7 @@ def query_tdocs(
) -> None:
    """Query TDoc metadata from database."""
    set_verbosity(verbosity)
-    manager = CacheManager(cache_dir).register()
+    path = PathConfig(cache_dir=cache_dir) if cache_dir else PathConfig()

    working_groups = parse_working_groups(working_group)
    try:
@@ -131,10 +131,10 @@ def query_tdocs(
        agenda_pattern_exclude=agenda_ex,
    )

-    db_file = manager.db_file
+    db_file = path.db_file
    handle_clear_options(
        db_file,
-        manager.checkout_dir,
+        path.checkout_dir,
        TDocDatabase,
        clear_tdocs=clear_tdocs,
        clear_specs=clear_specs,
@@ -150,7 +150,6 @@ def query_tdocs(
                        config,
                        results,
                        session=session,
-                        cache_manager_name=manager.name,
                    )
                    if result.fetch_result and result.fetch_result.errors:
                        console.print(
@@ -169,7 +168,7 @@ def query_tdocs(

    if checkout:
        with create_cached_session() as session:
-            checkout_tdocs(results, manager.checkout_dir, force=False, session=session)
+            checkout_tdocs(results, path.checkout_dir, force=False, session=session)

    # Build meeting map for enriched output
    async def load_meeting_map() -> dict:
@@ -207,7 +206,7 @@ def query_meetings(
) -> None:
    """Query meeting metadata from database."""
    set_verbosity(verbosity)
-    manager = CacheManager(cache_dir).register()
+    path = PathConfig(cache_dir=cache_dir) if cache_dir else PathConfig()
    working_groups = parse_working_groups(working_group)
    subgroups = parse_subgroups(subgroup)
    try:
@@ -224,10 +223,10 @@ def query_meetings(
        include_without_files=include_without_files,
    )

-    db_file = manager.db_file
+    db_file = path.db_file
    handle_clear_options(
        db_file,
-        manager.checkout_dir,
+        path.checkout_dir,
        MeetingDatabase,
        clear_tdocs=clear_tdocs,
        clear_specs=clear_specs,
@@ -245,7 +244,7 @@ def query_meetings(

    if checkout:
        with create_cached_session() as session:
-            checkout_meeting_tdocs(meetings, manager.checkout_dir, manager.http_cache_file, session=session)
+            checkout_meeting_tdocs(meetings, path.checkout_dir, path.http_cache_file, session=session)

    try:
        output = OutputFormat(output_format.lower())
@@ -276,7 +275,7 @@ def query_specs(
) -> None:
    """Query spec metadata from database."""
    set_verbosity(verbosity)
-    manager = CacheManager(cache_dir).register()
+    path = PathConfig(cache_dir=cache_dir) if cache_dir else PathConfig()
    specs = collect_spec_numbers(spec_numbers, spec_file)
    working_groups = parse_working_groups(working_group)
    wg_filter = working_groups[0].value if working_groups else None
@@ -294,10 +293,10 @@ def query_specs(
        console.print("[red]Invalid output format; use table, json, jsonl, toon, or yaml")
        raise typer.Exit(code=2) from exc

-    db_file = manager.db_file
+    db_file = path.db_file
    handle_clear_options(
        db_file,
-        manager.checkout_dir,
+        path.checkout_dir,
        SpecDatabase,
        clear_tdocs=clear_tdocs,
        clear_specs=clear_specs,
@@ -318,7 +317,7 @@ def query_specs(

        async def load_specs_for_checkout() -> None:
            async with SpecDatabase(db_file) as database:
-                checkout_specs(spec_list, manager.checkout_dir, database, release="latest")
+                checkout_specs(spec_list, path.checkout_dir, database, release="latest")

        asyncio.run(load_specs_for_checkout())
