Commit 43ec85de authored by Jan Reimes
Browse files

refactor: fix TODOs - CLI params, DRY violations, version resolution

- Rename misleading CLI parameters: --limit-meetings-per-wg → --limit-meetings-per-subwg, --limit-wgs → --limit-subwgs (args.py, crawl.py, crawl_limits.py, meetings/crawl.py, tdocs/crawl.py)
- Extract TDoc ID normalization to shared utils (tdocs/utils.py) to eliminate duplication
- Add resolve_release_to_full_version() to normalize release selectors (e.g., 'latest', '17', '17.0') to full three-part versions such as '17.0.0' (utils/normalization.py, cli/ai.py)
parent 69f47d1b
Loading
Loading
Loading
Loading
+21 −5
Original line number Diff line number Diff line
@@ -3,6 +3,7 @@
from __future__ import annotations

import json
import logging
import platform
from datetime import UTC, datetime
from functools import cache
@@ -22,6 +23,7 @@ from tdoc_ai import (
    ensure_ai_subfolder,
    get_active_workspace,
    get_status,
    list_statuses,
    make_workspace_member,
    normalize_workspace_name,
    query_graph,
@@ -86,9 +88,11 @@ from tdoc_crawler.config import CacheManager
from tdoc_crawler.database import TDocDatabase
from tdoc_crawler.tdocs.models import TDocQueryConfig
from tdoc_crawler.utils.date_parser import parse_partial_date
from tdoc_crawler.utils.normalization import resolve_release_to_full_version

ai_app = typer.Typer(help="AI document processing commands")
console = Console()
_logger = logging.getLogger(__name__)


@cache
@@ -307,10 +311,10 @@ def ai_status(
                console.print(f"  {status} {stage}")
    else:
        # Get status for all documents in workspace
        # Ensure cache manager is registered before calling get_status
        # Ensure cache manager is registered before calling list_statuses
        _get_cache_manager()
        # get_status without document_id returns a list
        statuses = get_status(workspace=workspace)
        # list_statuses returns all statuses in workspace
        statuses = list_statuses(workspace=workspace)

        if json_output:
            # Convert all ProcessingStatus objects to dicts
@@ -633,8 +637,20 @@ def workspace_add_members(
                ensure_ai_subfolder(checkout_path)

        # For specs, include release version in source_item_id to allow multiple versions
        # TODO: the variable release should always be fully resolved to an actual version with three digits, e.g., "17.0.0" instead of "latest", "17.0" or "17"
        source_item_id = f"{item}-REL{release}" if source_kind == SourceKind.SPEC and release else item
        # Resolve release to full 3-digit version (e.g., "17.0.0" instead of "latest" or "17")
        resolved_release: str | None = release
        if source_kind == SourceKind.SPEC and release:
            try:
                # Get available versions for this spec from database
                with TDocDatabase(manager.db_file) as db:
                    available_versions = db.get_spec_versions(item)
                    if available_versions:
                        version_list = [v.version for v in available_versions]
                        resolved_release = resolve_release_to_full_version(release, version_list)
            except Exception as e:
                _logger.debug(f"Could not resolve release version for {item}: {e}")
                resolved_release = release
        source_item_id = f"{item}-REL{resolved_release}" if source_kind == SourceKind.SPEC and release else item
        members.append(make_workspace_member(workspace, source_item_id, source_path, source_kind))

    # Report skipped items
+3 −5
Original line number Diff line number Diff line
@@ -23,12 +23,10 @@ SubgroupOption = Annotated[
    typer.Option("--sub-group", "-s", help="Filter by sub-working group (e.g., 'R2/RAN2, SA4, CT1, CP, ...')", envvar="TDC_SUB_GROUP"),
]
LimitMeetingsOption = Annotated[int | None, typer.Option("--limit-meetings", help="Limit meetings overall", envvar="TDC_LIMIT_MEETINGS")]
# TODO: This parameter does not make sense for working groups, but would for sub-workinggroups. Consider renaming to --limit-meetings-per-swg.
LimitMeetingsPerWgOption = Annotated[
    int | None, typer.Option("--limit-meetings-per-wg", help="Limit meetings per working group", envvar="TDC_LIMIT_MEETINGS_PER_WG")
LimitMeetingsPerSubWgOption = Annotated[
    int | None, typer.Option("--limit-meetings-per-subwg", help="Limit meetings per sub-working group", envvar="TDC_LIMIT_MEETINGS_PER_SUBWG")
]
# TODO: This parameter does not make sense for working groups, but would for sub-workinggroups.
LimitWgsOption = Annotated[int | None, typer.Option("--limit-wgs", help="Limit number of working groups")]
LimitSubWgsOption = Annotated[int | None, typer.Option("--limit-subwgs", help="Limit number of sub-working groups")]
LimitOption = Annotated[int | None, typer.Option("--limit", "-l", help="Maximum number of rows")]
OrderOption = Annotated[str, typer.Option("--order", help="Sort order (asc|desc)")]
StartDateOption = Annotated[
+8 −8
Original line number Diff line number Diff line
@@ -25,9 +25,9 @@ from tdoc_crawler.cli.args import (
    IncludeWithoutFilesOption,
    IncrementalOption,
    LimitMeetingsOption,
    LimitMeetingsPerWgOption,
    LimitMeetingsPerSubWgOption,
    LimitSubWgsOption,
    LimitTDocsOption,
    LimitWgsOption,
    MaxRetriesOption,
    NoProgressOption,
    OutputFormatOption,
@@ -76,8 +76,8 @@ def crawl_tdocs(
    subgroup: SubgroupOption = None,
    limit_tdocs: LimitTDocsOption = None,
    limit_meetings: LimitMeetingsOption = None,
    limit_meetings_per_wg: LimitMeetingsPerWgOption = None,
    limit_wgs: LimitWgsOption = None,
    limit_meetings_per_subwg: LimitMeetingsPerSubWgOption = None,
    limit_subwgs: LimitSubWgsOption = None,
    checkout: CheckoutOption = False,
    incremental: IncrementalOption = True,
    clear_tdocs: ClearTDocsOption = False,
@@ -109,7 +109,7 @@ def crawl_tdocs(
    subgroups = parse_subgroups(subgroup)
    working_groups = parse_working_groups(working_group, subgroups)

    limits = CrawlLimits.build(limit_tdocs, limit_meetings, limit_meetings_per_wg, limit_wgs)
    limits = CrawlLimits.build(limit_tdocs, limit_meetings, limit_meetings_per_subwg, limit_subwgs)

    http_cache = HttpCacheConfig.resolve_http_cache_config(
        cache_ttl=None, cache_refresh_on_access=None, max_retries=max_retries, cache_file=manager.http_cache_file
@@ -256,8 +256,8 @@ def crawl_meetings(
    working_group: WorkingGroupOption = None,
    subgroup: SubgroupOption = None,
    limit_meetings: LimitMeetingsOption = None,
    limit_meetings_per_wg: LimitMeetingsPerWgOption = None,
    limit_wgs: LimitWgsOption = None,
    limit_meetings_per_subwg: LimitMeetingsPerSubWgOption = None,
    limit_subwgs: LimitSubWgsOption = None,
    checkout: CheckoutOption = False,
    incremental: IncrementalOption = True,
    include_without_files: IncludeWithoutFilesOption = False,
@@ -282,7 +282,7 @@ def crawl_meetings(

    subgroups = parse_subgroups(subgroup)
    working_groups = parse_working_groups(working_group, subgroups)
    limits = CrawlLimits.build(None, limit_meetings, limit_meetings_per_wg, limit_wgs)
    limits = CrawlLimits.build(None, limit_meetings, limit_meetings_per_subwg, limit_subwgs)

    config = MeetingCrawlConfig(
        working_groups=working_groups,
+5 −5
Original line number Diff line number Diff line
@@ -136,7 +136,7 @@ class MeetingCrawler:
        if not meetings:
            return []
        filtered = list(meetings)
        filtered = self._limit_meetings_per_wg(filtered, limits.limit_meetings_per_wg)
        filtered = self._limit_meetings_per_subwg(filtered, limits.limit_meetings_per_subwg)
        filtered = self._limit_meetings(filtered, limits.limit_meetings)
        return filtered

@@ -178,17 +178,17 @@ class MeetingCrawler:
        limits: CrawlLimits,
    ) -> list[WorkingGroup]:
        """Apply working group limits from crawl configuration."""
        if limits.limit_wgs is None or limits.limit_wgs == 0:
        if limits.limit_subwgs is None or limits.limit_subwgs == 0:
            return working_groups
        limit = limits.limit_wgs
        limit = limits.limit_subwgs
        return working_groups[:limit] if limit > 0 else working_groups[limit:]

    @staticmethod
    def _limit_meetings_per_wg(
    def _limit_meetings_per_subwg(
        meetings: list[MeetingMetadata],
        limit: int | None,
    ) -> list[MeetingMetadata]:
        """Limit number of meetings per working group."""
        """Limit number of meetings per sub-working group."""
        if limit is None or limit == 0:
            return meetings
        order: dict[WorkingGroup, list[int]] = defaultdict(list)
+9 −9
Original line number Diff line number Diff line
@@ -20,26 +20,26 @@ class CrawlLimits(BaseConfigModel):
        None,
        description="Maximum meetings to crawl overall (negative for newest N)",
    )
    limit_meetings_per_wg: int | None = Field(
    limit_meetings_per_subwg: int | None = Field(
        None,
        description="Per working group meeting limit",
        description="Per sub-working group meeting limit",
    )
    limit_wgs: int | None = Field(None, description="Maximum number of working groups to process")
    limit_subwgs: int | None = Field(None, description="Maximum number of sub-working groups to process")

    @classmethod
    def build(
        cls,
        limit_tdocs: int | None,
        limit_meetings: int | None,
        limit_meetings_per_wg: int | None,
        limit_wgs: int | None,
        limit_meetings_per_subwg: int | None,
        limit_subwgs: int | None,
    ) -> Self:
        """Build CrawlLimits configuration from individual parameters."""
        return cls(
            limit_tdocs=limit_tdocs,
            limit_meetings=limit_meetings,
            limit_meetings_per_wg=limit_meetings_per_wg,
            limit_wgs=limit_wgs,
            limit_meetings_per_subwg=limit_meetings_per_subwg,
            limit_subwgs=limit_subwgs,
        )


@@ -48,8 +48,8 @@ def _new_crawl_limits() -> CrawlLimits:
    return CrawlLimits(
        limit_tdocs=None,
        limit_meetings=None,
        limit_meetings_per_wg=None,
        limit_wgs=None,
        limit_meetings_per_subwg=None,
        limit_subwgs=None,
    )


Loading