Commit 9d814237 authored by Jan Reimes's avatar Jan Reimes
Browse files

feat(normalization): add release normalization function and update usages

- Introduced `normalize_release` function to handle various release formats.
- Updated filtering logic in `SpecDownloads` and `SpecDatabase` to utilize the new normalization.
- Enhanced argument help descriptions in `args.py` for clarity.
parent 00d1227e
Loading
Loading
Loading
Loading
+20 −5
Original line number Diff line number Diff line
@@ -14,8 +14,14 @@ CheckoutTDocIdsArgument = Annotated[list[str], typer.Argument(help="TDoc identif
SpecArgument = Annotated[list[str] | None, typer.Argument(help="Spec number(s) to query (dotted or undotted)")]

# Options - TDocs/Meetings
WorkingGroupOption = Annotated[list[str] | None, typer.Option("--working-group", "-w", help="Filter by working group", envvar="TDC_WORKING_GROUP")]
SubgroupOption = Annotated[list[str] | None, typer.Option("--sub-group", "-s", help="Filter by sub-working group", envvar="TDC_SUB_GROUP")]
WorkingGroupOption = Annotated[
    list[str] | None,
    typer.Option("--working-group", "-w", help="Filter by working group (e.g., 'R2', 'SA2')", envvar="TDC_WORKING_GROUP"),
]
SubgroupOption = Annotated[
    list[str] | None,
    typer.Option("--sub-group", "-s", help="Filter by sub-working group (e.g., 'R2-102')", envvar="TDC_SUB_GROUP"),
]
LimitMeetingsOption = Annotated[int | None, typer.Option("--limit-meetings", help="Limit meetings overall", envvar="TDC_LIMIT_MEETINGS")]
LimitMeetingsPerWgOption = Annotated[
    int | None, typer.Option("--limit-meetings-per-wg", help="Limit meetings per working group", envvar="TDC_LIMIT_MEETINGS_PER_WG")
@@ -23,8 +29,14 @@ LimitMeetingsPerWgOption = Annotated[
LimitWgsOption = Annotated[int | None, typer.Option("--limit-wgs", help="Limit number of working groups")]
LimitOption = Annotated[int | None, typer.Option("--limit", "-l", help="Maximum number of rows")]
OrderOption = Annotated[str, typer.Option("--order", help="Sort order (asc|desc)")]
StartDateOption = Annotated[str | None, typer.Option("--start-date", help="Filter from ISO timestamp", envvar="TDC_START_DATE")]
EndDateOption = Annotated[str | None, typer.Option("--end-date", help="Filter until ISO timestamp", envvar="TDC_END_DATE")]
StartDateOption = Annotated[
    str | None,
    typer.Option("--start-date", help="Filter from ISO timestamp (YYYY-MM-DD or YYYY-MM-DDTHH:MM:SS)", envvar="TDC_START_DATE"),
]
EndDateOption = Annotated[
    str | None,
    typer.Option("--end-date", help="Filter until ISO timestamp (YYYY-MM-DD or YYYY-MM-DDTHH:MM:SS)", envvar="TDC_END_DATE"),
]
NoFetchOption = Annotated[
    bool,
    typer.Option("--no-fetch", help="Disable automatic fetching of missing TDocs from portal"),
@@ -80,7 +92,10 @@ CheckoutDirOption = Annotated[Path | None, typer.Option("--checkout-dir", help="
WorkersOption = Annotated[int, typer.Option("--workers", help="Number of parallel subinterpreter workers", envvar="TDC_WORKERS")]
MaxRetriesOption = Annotated[int, typer.Option("--max-retries", help="HTTP retry attempts", envvar="TDC_MAX_RETRIES")]
TimeoutOption = Annotated[int, typer.Option("--timeout", help="HTTP timeout seconds", envvar="TDC_TIMEOUT")]
VerbosityOption = Annotated[str, typer.Option("--verbosity", "-v", help="Logging verbosity level", envvar="TDC_VERBOSITY")]
VerbosityOption = Annotated[
    str,
    typer.Option("--verbosity", "-v", help="Logging verbosity level (DEBUG, INFO, WARNING, ERROR, CRITICAL)", envvar="TDC_VERBOSITY"),
]

UseWhatTheSpecOption = Annotated[
    bool, typer.Option("--use-whatthespec/--no-use-whatthespec", help="Use WhatTheSpec API for fetching", envvar="TDC_USE_WHATTHESPEC")
+33 −2
Original line number Diff line number Diff line
@@ -19,11 +19,29 @@ from tdoc_crawler.specs.models import (
    SpecQueryResult,
)
from tdoc_crawler.specs.sources.base import SpecSource
from tdoc_crawler.utils.normalization import normalize_spec_number
from tdoc_crawler.utils.normalization import normalize_release, normalize_spec_number

_logger = get_logger(__name__)


def _version_matches_release(version: str, release_type: str, release_value: str, specificity: int) -> bool:
    """Tell whether a dotted version string satisfies a release selector.

    Args:
        version: Candidate version, e.g. ``"18.2.0"``.
        release_type: ``"exact"`` for a full string comparison; any other value
            is treated as a prefix comparison.
        release_value: Selector value, e.g. ``"18"``, ``"18.1"`` or ``"18.1.0"``.
        specificity: How many leading components of ``release_value`` must agree
            with ``version`` in a prefix comparison.

    Returns:
        ``True`` when the version matches; ``False`` otherwise, including when
        ``version`` contains a non-numeric component.
    """
    # A version that does not parse as dot-separated integers never matches,
    # not even in "exact" mode.
    try:
        version_numbers = tuple(int(piece) for piece in version.split("."))
    except ValueError:
        return False

    if release_type == "exact":
        return version == release_value

    # Prefix mode: the version needs at least as many components as are compared.
    if len(version_numbers) < specificity:
        return False

    wanted = release_value.split(".")
    for index in range(specificity):
        if version_numbers[index] != int(wanted[index]):
            return False
    return True


@dataclass(frozen=True)
class SpecCrawlSourceOutcome:
    """Outcome for a single spec source crawl."""
@@ -324,7 +342,20 @@ class SpecDatabase(DocDatabase):
                )
                continue

            release_matches = release == "latest" or any(release in outcome.versions for outcome in outcomes if outcome.status == "ok")
            release_type, release_value, specificity = normalize_release(release)
            if release_type in ("all", "latest"):
                release_matches = True
            elif release_type in ("exact", "prefix"):
                # release_value and specificity are guaranteed non-None here
                release_matches = any(
                    _version_matches_release(v, release_type, release_value, specificity)  # type: ignore[arg-type]
                    for outcome in outcomes
                    if outcome.status == "ok"
                    for v in outcome.versions
                )
            else:
                release_matches = False

            if not release_matches:
                results.append(
                    SpecCrawlResult(
+24 −15
Original line number Diff line number Diff line
@@ -13,7 +13,7 @@ from tdoc_crawler.database.specs import SpecDatabase
from tdoc_crawler.http_client import download_to_file
from tdoc_crawler.logging import get_logger
from tdoc_crawler.specs.sources.base import SpecSource
from tdoc_crawler.utils.normalization import normalize_spec_number
from tdoc_crawler.utils.normalization import normalize_release, normalize_spec_number

_logger = get_logger(__name__)

@@ -139,20 +139,29 @@ class SpecDownloads:

        versions.sort(key=lambda x: parse_version(x.version), reverse=True)

        # If specific release requested, filter versions by major version
        if release != "latest":
            try:
                release_major = int(release.split(".", maxsplit=1)[0])
                # Filter versions that match this major release
                filtered_versions = [v for v in versions if parse_version(v.version)[0] == release_major]
        # If specific release requested, use normalize_release to handle various formats
        release_type, release_value, specificity = normalize_release(release)
        if release_type in {"latest", "all"}:
            pass  # Use all versions, already sorted by version desc
        elif release_type in ("exact", "prefix"):
            # Filter versions by release prefix
            filtered_versions: list = []
            for v in versions:
                v_parts = parse_version(v.version)
                if release_type == "exact":
                    # Exact match: specificity == 3, match full version
                    if v.version == release_value:
                        filtered_versions.append(v)
                # Prefix match: match major (specificity=1) or major.minor (specificity=2)
                elif len(v_parts) >= specificity:
                    prefix_parts = release_value.split(".")
                    if all(v_parts[i] == int(prefix_parts[i]) for i in range(specificity)):
                        filtered_versions.append(v)
            if filtered_versions:
                versions = filtered_versions
            else:
                msg = f"No versions found for spec {normalized} with release {release}"
                raise ValueError(msg)
            except (ValueError, IndexError) as e:
                msg = f"Invalid release format: {release}. Expected format like '17', '17.1', or '17.1.0'"
                raise ValueError(msg) from e

        target = versions[0]

+2 −0
Original line number Diff line number Diff line
@@ -126,6 +126,7 @@ class TDocMetadata(BaseModel):
        """Ensure identifiers are uppercase and trimmed."""
        return value.strip().upper()


# TODO: should this be a dataclass instead?
class TDocCrawlConfig(BaseConfigModel):
    """Configuration for TDoc crawling runs."""
@@ -191,6 +192,7 @@ class TDocCrawlConfig(BaseConfigModel):
            return None
        return normalize_tdoc_ids(value)


# TODO: should this be a dataclass instead?
class TDocQueryConfig(BaseConfigModel):
    """Configuration for querying TDoc metadata."""
+61 −0
Original line number Diff line number Diff line
@@ -7,6 +7,7 @@ _DOTTED_BODY_PATTERN = re.compile(r"^(?P<series>\d{2})\s*\.\s*(?P<increment>\d{1
_UNDOTTED_BODY_PATTERN = re.compile(r"^(?P<series>\d{2})(?P<increment>\d{1,3})$")
_OFFSET_PATTERN = re.compile(r"^(?P<left>.+?)\s*\+\s*(?P<offset>-?\d+)\s*$")
_RANGE_SPLIT_PATTERN = re.compile(r"\s*([-:])\s*")
# Optional release-selector prefix: "rel-", "rel" or "v" (case-insensitive).
# The hyphen is an optional part of the "rel" alternative: regex alternatives are
# tried left to right, so a separate, later "rel-" alternative is never reached
# and "Rel-18" would be stripped to "-18" instead of "18".
_RELEASE_PREFIX_PATTERN = re.compile(r"^(?:rel-?|v)", re.IGNORECASE)


def normalize_tdoc_ids(ids: Iterable[str]) -> list[str]:
@@ -186,3 +187,63 @@ def expand_spec_ranges_batch(spec_inputs: list[str]) -> list[str]:
            # Skip invalid spec inputs silently
            continue
    return expanded


def normalize_release(release: str) -> tuple[str, str | None, int | None]:
    """Normalize a release selector string.

    Supports formats:
    - 'latest' - Returns ('latest', None, None)
    - 'all' - Returns ('all', None, None)
    - '18.0.0' - Full version (returns ('exact', '18.0.0', 3))
    - '18.1' - Partial major.minor (returns ('prefix', '18.1', 2))
    - '18' - Partial major only (returns ('prefix', '18', 1))
    - 'v18', 'V18', 'Rel-18', 'rel18', 'REL18' - Prefix variants (normalized to '18')

    Args:
        release: Release selector string in various formats.

    Returns:
        Tuple of (type, normalized_value, specificity):
        - type: 'latest', 'all', 'exact', or 'prefix'
        - normalized_value: Cleaned release string without prefixes
        - specificity: Number of version components (1=major, 2=major.minor, 3=major.minor.patch)

    Raises:
        ValueError: If the release format is invalid.
    """
    cleaned = release.strip()

    # Handle special values (case-insensitive)
    upper_cleaned = cleaned.upper()
    if upper_cleaned == "LATEST":
        return ("latest", None, None)
    if upper_cleaned == "ALL":
        return ("all", None, None)

    # Strip prefixes: v, rel, rel- (case-insensitive)
    normalized = _RELEASE_PREFIX_PATTERN.sub("", cleaned)

    # Validate that we have a valid version number after stripping
    if not normalized:
        raise ValueError(f"Invalid release format: '{release}'. Expected 'latest', 'all', or a version number like '18', '18.1', '18.1.0'.")

    # Split into components and validate
    parts = normalized.split(".")
    if len(parts) > 3:
        raise ValueError(f"Invalid release format: '{release}'. Version has too many components (max 3: major.minor.patch).")

    # Validate each part is a number
    try:
        [int(p) for p in parts]
    except ValueError as exc:
        raise ValueError(f"Invalid release format: '{release}'. Version components must be numbers.") from exc

    specificity = len(parts)

    # Check for exact match (all 3 components)
    if specificity == 3:
        return ("exact", normalized, 3)

    # Partial match (1 or 2 components)
    return ("prefix", normalized, specificity)