Commit c510f401 authored by Jan Reimes's avatar Jan Reimes
Browse files

feat(normalization): add spec range expansion and batch processing

- Implement normalize_spec_number, expand_spec_ranges, and expand_spec_ranges_batch
- Update CLI helper collect_spec_numbers to read stdin and files, and expand ranges
- Add comprehensive tests covering range syntax, offsets, tabs, and batch expansion
parent 51f42da1
Loading
Loading
Loading
Loading
+27 −24
Original line number Diff line number Diff line
@@ -14,6 +14,7 @@ from contextlib import suppress
from pathlib import Path
from urllib.parse import urlparse

import click
import requests
import typer

@@ -108,40 +109,42 @@ def parse_subgroups(values: list[str] | None) -> list[str] | None:


def collect_spec_numbers(specs: list[str] | None, spec_file: Path | None) -> list[str]:
    """Collect spec numbers from CLI options and stdin.

    Args:
        specs: Spec numbers provided on the command line. Use "-" to read stdin.
        spec_file: Optional file containing spec numbers (one per line).

    Returns:
        List of spec numbers in input order.
    """
    """Collect spec numbers from CLI arguments or a file."""
    collected: list[str] = []

    if specs:
        for item in specs:
            if item == "-":
                stdin_text = sys.stdin.read()
                collected.extend(line.strip() for line in stdin_text.splitlines() if line.strip())
        for spec in specs:
            if spec == "-":
                # Read from stdin
                for line in sys.stdin:
                    line_stripped = line.strip()
                    if line_stripped:
                        collected.append(line_stripped)
            else:
                stripped = item.strip()
                if stripped:
                    collected.append(stripped)
                collected.append(spec.strip())

    if spec_file is not None:
    if spec_file and spec_file.exists():
        try:
            file_text = spec_file.read_text(encoding="utf-8")
            with spec_file.open("r", encoding="utf-8") as f:
                for line in f:
                    line_stripped = line.strip()
                    if line_stripped:
                        collected.append(line_stripped)
        except OSError as exc:
            console.print(f"[red]Failed to read spec file: {exc}")
            raise typer.Exit(code=2) from exc
        collected.extend(line.strip() for line in file_text.splitlines() if line.strip())
            raise click.FileError(str(spec_file), hint=f"Cannot read spec file: {exc}")

    if not collected:
        console.print("[red]No spec numbers provided[/red]")
        raise typer.Exit(code=2)
        return []

    # Import here to avoid circular imports
    from tdoc_crawler.specs.normalization import expand_spec_ranges_batch

    try:
        expanded = expand_spec_ranges_batch(collected)
    except ValueError as e:
        raise click.UsageError(str(e))

    return collected
    return expanded


def build_limits(
+125 −19
Original line number Diff line number Diff line
"""Normalization helpers for spec identifiers."""

import re
from collections.abc import Generator

# Optional run of uppercase letters ("prefix"), then a "body" of at least two
# digits that may be followed by a dot and exactly three more digits.
_SPEC_PATTERN = re.compile(r"^(?P<prefix>[A-Z]+)?(?P<body>\d{2,}(?:\.\d{3})?)$")
# Dotted body: two-digit series, a dot (whitespace tolerated on either side),
# then an increment of one to three digits.
_DOTTED_BODY_PATTERN = re.compile(r"^(?P<series>\d{2})\s*\.\s*(?P<increment>\d{1,3})$")
# Undotted body: two-digit series immediately followed by a one- to
# three-digit increment.
_UNDOTTED_BODY_PATTERN = re.compile(r"^(?P<series>\d{2})(?P<increment>\d{1,3})$")
# Offset syntax "<left>+<offset>": "left" is captured lazily, "offset" is an
# integer with an optional leading minus sign.
_OFFSET_PATTERN = re.compile(r"^(?P<left>.+?)\s*\+\s*(?P<offset>-?\d+)\s*$")
# Range separator: "-" or ":" with optional surrounding whitespace. The
# separator is in a capture group, so re.split() returns it as its own item.
_RANGE_SPLIT_PATTERN = re.compile(r"\s*([-:])\s*")


def _strip_prefixes(value: str) -> str:
    """Return *value* upper-cased with a leading "3GPP" and/or "TS"/"TR" removed.

    Surrounding whitespace is trimmed before and after each prefix is removed.
    At most one "3GPP" and one "TS"/"TR" prefix are stripped, in that order.
    """
    text = value.strip().upper()
    text = text.removeprefix("3GPP").strip()
    if text.startswith(("TS", "TR")):
        text = text[2:]
    return text.strip()


def _parse_spec_number(value: str) -> tuple[str, str, str, int]:
    """Split a spec number into its components.

    Args:
        value: Spec number, optionally carrying "3GPP" and/or "TS"/"TR" prefixes.

    Returns:
        A tuple ``(series, increment, format_kind, increment_digits)`` where
        ``increment`` is zero-padded to three digits, ``format_kind`` is
        ``"dotted"`` or ``"undotted"``, and ``increment_digits`` is the number
        of increment digits actually present in the input.

    Raises:
        ValueError: If the input is empty after prefix removal or does not
            match the dotted/undotted layout.
    """
    body = _strip_prefixes(value)
    if not body:
        raise ValueError("Spec number is required")

    # A dot anywhere in the body selects the dotted layout.
    if "." in body:
        format_kind, pattern = "dotted", _DOTTED_BODY_PATTERN
    else:
        format_kind, pattern = "undotted", _UNDOTTED_BODY_PATTERN

    # Inner whitespace (spaces, tabs) is not significant.
    compact = re.sub(r"\s+", "", body)
    parsed = pattern.match(compact)
    if parsed is None:
        raise ValueError(f"Unsupported spec number format: {value}")

    raw_increment = parsed.group("increment")
    return parsed.group("series"), raw_increment.zfill(3), format_kind, len(raw_increment)


def _validate_range(start_num: int, end_num: int, spec_input: str) -> None:
    """Check that an inclusive increment range is ordered and not too large.

    Args:
        start_num: First increment of the range (inclusive).
        end_num: Last increment of the range (inclusive).
        spec_input: The user input the range came from; quoted in error
            messages so the offending entry can be identified.

    Raises:
        ValueError: If ``start_num`` is greater than ``end_num`` or the range
            would cover more than 1330 specs.
    """
    if start_num > end_num:
        raise ValueError(f"Invalid range in {spec_input!r}: start {start_num} > end {end_num}")

    # Largest accepted distance between start and end (i.e. 1330 specs).
    max_span = 1329
    span = end_num - start_num
    if span > max_span:
        raise ValueError(f"Range too large in {spec_input!r}: {span + 1} specs")


def normalize_spec_number(value: str) -> str:
@@ -17,26 +65,84 @@ def normalize_spec_number(value: str) -> str:
    Raises:
        ValueError: When the spec number is not in a supported format.
    """
    raw = value.strip().upper().replace(" ", "")
    if not raw:
    series, increment, _, _ = _parse_spec_number(value)
    return f"{series}.{increment}"


def expand_spec_ranges(spec_input: str) -> Generator[str]:
    """Expand spec range syntax into individual spec numbers.

    Supports formats:
    - 26.260-26.266  (26.260 through 26.266, both inclusive)
    - 26.260:26.266  (same as above, alias)
    - 26.260+6       (26.260 plus the six following specs)
    - Flexible prefixes: "3GPP TS 26.260-26.266", "TR 26260-26.266", etc.
    - A single spec without range syntax is yielded in normalized form.

    This is a generator: validation runs when iteration starts, so errors
    surface on the first ``next()`` rather than at call time.

    Args:
        spec_input: Input string that may contain range syntax.

    Yields:
        Individual normalized spec numbers (e.g., "26.260", "26.261", ...)

    Raises:
        ValueError: If the input is empty, the range or offset syntax is
            invalid, the series numbers don't match, or the range leaves the
            three-digit increment space.
    """
    cleaned = spec_input.strip()
    if not cleaned:
        raise ValueError("Spec number is required")

    if "+" in cleaned:
        offset_match = _OFFSET_PATTERN.match(cleaned)
        if not offset_match:
            raise ValueError("Invalid offset format")
        offset = int(offset_match.group("offset"))
        if offset < 0:
            raise ValueError("Offset must be non-negative")
        series, start_str, _, _ = _parse_spec_number(offset_match.group("left"))
        start_num = int(start_str)
        end_num = start_num + offset
        # Increments have at most three digits; without this guard an input
        # such as "26.990+20" would emit malformed numbers like "26.1000".
        if end_num > 999:
            raise ValueError(f"Offset runs past increment 999: {spec_input}")
    else:
        parts = _RANGE_SPLIT_PATTERN.split(cleaned, maxsplit=1)
        if len(parts) != 3:
            # No range separator: a single spec number.
            yield normalize_spec_number(cleaned)
            return
        left, _sep, right = parts
        if not right.strip():
            raise ValueError("Missing end value in range")
        series, num1, format1, digits1 = _parse_spec_number(left)
        series2, num2, format2, digits2 = _parse_spec_number(right)
        if series != series2:
            raise ValueError(f"Series numbers don't match: {series} vs {series2}")
        # Mixing dotted and undotted endpoints is only accepted when neither
        # side has a one-digit increment (e.g. "26.260-266" is rejected).
        if format1 != format2 and (digits1 == 1 or digits2 == 1):
            raise ValueError("Both range endpoints must use the same format")
        start_num = int(num1)
        end_num = int(num2)

    _validate_range(start_num, end_num, spec_input)
    for num in range(start_num, end_num + 1):
        yield f"{series}.{num:03d}"


def expand_spec_ranges_batch(spec_inputs: list[str]) -> list[str]:
    """Expand all spec range inputs in a batch.

    Args:
        spec_inputs: List of spec inputs that may contain range syntax.

    Returns:
        List of expanded and normalized individual spec numbers, in input
        order. Invalid spec inputs are silently skipped.
    """
    expanded: list[str] = []
    for spec_input in spec_inputs:
        try:
            # Materialize first so that an input failing partway through its
            # expansion contributes nothing to the result.
            items = list(expand_spec_ranges(spec_input))
        except ValueError:
            # Best-effort by design: skip invalid spec inputs silently.
            continue
        expanded.extend(items)
    return expanded
+292 −1
Original line number Diff line number Diff line
"""Tests for spec normalization utilities."""

import os
import tempfile
from pathlib import Path

import pytest

from tdoc_crawler.specs.normalization import normalize_spec_number
from tdoc_crawler.cli.helpers import collect_spec_numbers
from tdoc_crawler.specs.normalization import expand_spec_ranges, expand_spec_ranges_batch, normalize_spec_number


def test_normalize_dotted_spec_number() -> None:
@@ -17,6 +22,292 @@ def test_normalize_prefixed_spec_number() -> None:
    assert normalize_spec_number("TS23.501") == "23.501"


def test_normalize_prefix_spacing_variants() -> None:
    """Spacing between prefixes and digits must not change the normalized result."""
    cases = {
        "3GPPTS26132": "26.132",
        "3GPP     TS    26 132": "26.132",
        "TR 26.071": "26.071",
        "TS 2671": "26.071",
    }
    for raw, expected in cases.items():
        assert normalize_spec_number(raw) == expected


def test_normalize_tab_separator_variants() -> None:
    """Tabs are accepted as separators between and inside the components."""
    cases = [
        # Tab between 3GPP and TS/TR
        ("3GPP\tTS\t26.123", "26.123"),
        ("3GPP\tTR\t26.123", "26.123"),
        # Tab between TS/TR and number
        ("TS\t26.123", "26.123"),
        ("TR\t26123", "26.123"),
        # Tab within undotted number
        ("26\t123", "26.123"),
        ("26\t12", "26.012"),
        # Mixed tabs and spaces
        ("3GPP  \t  TS  \t  26  \t  123", "26.123"),
        ("3GPP\t TS \t 26.123", "26.123"),
    ]
    for raw, expected in cases:
        assert normalize_spec_number(raw) == expected


def test_normalize_no_whitespace_between_prefixes() -> None:
    """Prefixes glued to each other and to the number still normalize correctly."""
    glued_inputs = (
        # No space between 3GPP and TS/TR
        "3GPPTS26123",
        "3GPPTR26123",
        # No space between TS/TR and number
        "TS26123",
        "TR26123",
        # Combination: 3GPP + TS/TR + number without any separator
        "3GPPTS26123",
        "3GPPTR26123",
    )
    for raw in glued_inputs:
        assert normalize_spec_number(raw) == "26.123"


def test_normalize_invalid_spec_number() -> None:
    """An input without any digits is rejected with ValueError."""
    not_a_spec = "ABC"
    with pytest.raises(ValueError):
        normalize_spec_number(not_a_spec)


def test_expand_range_hyphen() -> None:
    """A hyphen-separated range expands to every spec from start to end inclusive."""
    expanded = list(expand_spec_ranges("26.260-26.266"))
    assert expanded == [f"26.{increment}" for increment in range(260, 267)]


def test_expand_range_colon() -> None:
    """A colon-separated range expands exactly like the hyphen form."""
    expanded = list(expand_spec_ranges("26.260:26.266"))
    assert expanded == [f"26.{increment}" for increment in range(260, 267)]


def test_expand_offset_plus() -> None:
    """The "+N" syntax yields the start spec and the N specs that follow it."""
    expanded = list(expand_spec_ranges("26.260+6"))
    assert expanded == [f"26.{increment}" for increment in range(260, 267)]


def test_expand_range_with_prefix() -> None:
    """Prefixes on one or both range endpoints are accepted."""
    wanted = ["26.260", "26.261", "26.262", "26.263", "26.264", "26.265", "26.266"]
    for text in ("TS 26.260-26.266", "3GPP TR 26.260 - TS 26266"):
        assert list(expand_spec_ranges(text)) == wanted


def test_expand_single_spec() -> None:
    """A single spec without range syntax comes back as one normalized item."""
    for text in ("26.260", "TS 26.260"):
        assert list(expand_spec_ranges(text)) == ["26.260"]


def test_expand_range_mixed_formats() -> None:
    """Dotted/undotted endpoints and prefixed offsets expand correctly."""
    seven_specs = [f"26.{increment}" for increment in range(260, 267)]

    assert list(expand_spec_ranges("TR 26260-26.266")) == seven_specs
    assert list(expand_spec_ranges("3GPP TS 26.260+6")) == seven_specs
    assert list(expand_spec_ranges("TR 23.501+2")) == ["23.501", "23.502", "23.503"]


def test_expand_range_invalid() -> None:
    """Malformed range and offset inputs raise ValueError."""
    invalid_inputs = (
        "26.260-25.266",  # Different series
        "26.260-",  # Missing end
        "26.260+",  # Missing offset value
    )
    for text in invalid_inputs:
        with pytest.raises(ValueError):
            list(expand_spec_ranges(text))


def test_expand_range_reverse() -> None:
    """A range whose start is greater than its end is rejected."""
    descending = "26.266-26.260"  # Start > end
    with pytest.raises(ValueError):
        list(expand_spec_ranges(descending))


def test_expand_range_batch() -> None:
    """Batch expansion concatenates the expansion of each input in order."""
    expanded = expand_spec_ranges_batch(["26.260-26.262", "TS 38.331", "TR 23.501+2"])
    assert expanded == [
        "26.260",
        "26.261",
        "26.262",
        "38.331",
        "23.501",
        "23.502",
        "23.503",
    ]


def test_expand_range_batch_invalid() -> None:
    """Batch expansion drops invalid inputs and keeps the valid ones."""
    mixed_inputs = ["26.260-26.262", "invalid", "38.331"]
    assert expand_spec_ranges_batch(mixed_inputs) == ["26.260", "26.261", "26.262", "38.331"]


def test_expand_range_with_tab_separators() -> None:
    """Tabs around separators, inside numbers and after prefixes are tolerated."""
    wanted = [f"26.{increment}" for increment in range(260, 267)]
    tabbed_inputs = (
        "26.260-\t26.266",  # Tab after the range separator
        "26.260\t-\t26.266",  # Tabs around hyphen
        "26\t260-26\t266",  # Tab within spec numbers (undotted)
        "3GPP\tTS\t26.260-26.266",  # Tab between prefix and number
        "26.260+\t6",  # Tab in offset
    )
    for text in tabbed_inputs:
        assert list(expand_spec_ranges(text)) == wanted


def test_expand_range_with_no_whitespace() -> None:
    """Ranges still expand when prefixes are glued to the numbers."""
    wanted = [f"26.{increment}" for increment in range(260, 267)]
    glued_inputs = (
        "3GPPTS26260-26.266",  # No space between 3GPP and TS in range
        "TS26260-TS26266",  # No space between TS and number in range
        "3GPPTS26260-26266",  # Mixed: prefixed start, bare end
    )
    for text in glued_inputs:
        assert list(expand_spec_ranges(text)) == wanted


def test_expand_all_range_variants() -> None:
    """Every supported range spelling yields the same expansion in batch mode."""
    full_range = [f"26.{increment}" for increment in range(260, 267)]
    equivalent_inputs = (
        "26.260-26.266",  # Hyphen range with full 5-digit format
        "26.260:26.266",  # Colon range (alias for hyphen)
        "26.260+6",  # Offset mode
        "3GPP TS 26.260-26.266",  # Prefixes with whitespace
        "TR 26260-26.266",  # Undotted start, dotted end
        "3GPP TR 26.260 - TS 26266",  # Different prefixes, spaces around hyphen
        "TS 26.260:TR 26.266",  # Prefix on both sides but different
    )
    for text in equivalent_inputs:
        assert list(expand_spec_ranges_batch([text])) == full_range

    # 26.260-266 is ambiguous and therefore silently skipped
    assert list(expand_spec_ranges_batch(["26.260-266"])) == []

    # Dotted with one-digit suffix is allowed and normalized
    assert list(expand_spec_ranges_batch(["26.2-26.4"])) == ["26.002", "26.003", "26.004"]


def test_expand_edge_cases() -> None:
    """Boundary inputs for batch range expansion."""
    cases = [
        # Single spec (no range)
        ("26.260", ["26.260"]),
        # Range of 1 (start == end)
        ("26.260-26.260", ["26.260"]),
        # Reverse range (silently skipped in batch mode)
        ("26.266-26.260", []),
        # Offset 0
        ("26.260+0", ["26.260"]),
        # Offset 1
        ("26.260+1", ["26.260", "26.261"]),
        # Multi-spec range in another series
        ("38.331-38.335", ["38.331", "38.332", "38.333", "38.334", "38.335"]),
        # Cross hundred boundary
        ("23.498-23.502", ["23.498", "23.499", "23.500", "23.501", "23.502"]),
    ]
    for text, expected in cases:
        assert list(expand_spec_ranges_batch([text])) == expected


def test_expand_invalid_ranges() -> None:
    """Invalid inputs are silently skipped in batch mode; valid ones survive."""
    skipped_inputs = (
        "26.260-",  # Missing second part after hyphen
        "26.260+",  # Missing offset number
        "26.260+-1",  # Negative offset
        "26.260+abc",  # Non-numeric offset
        "abc",  # Completely wrong format
    )
    for text in skipped_inputs:
        assert list(expand_spec_ranges_batch([text])) == []

    # Mixed dotted/undotted formats with full increment digits is allowed
    mixed = list(expand_spec_ranges_batch(["26.260-26266"]))
    assert mixed == ["26.260", "26.261", "26.262", "26.263", "26.264", "26.265", "26.266"]

    # Valid specs are still processed alongside invalid ones
    survivors = list(expand_spec_ranges_batch(["abc", "26.260", "invalid", "26.261"]))
    assert survivors == ["26.260", "26.261"]


def test_expand_with_collect_spec_numbers() -> None:
    """The CLI helper expands ranges from both positional arguments and a spec file."""
    wanted = ["26.260", "26.261", "26.262", "38.331", "38.332"]

    # Ranges passed as positional arguments
    assert collect_spec_numbers(["26.260-26.262", "38.331+1"], None) == wanted

    # Ranges read from a file; the malformed last line should be ignored/skipped
    file_lines = ["26.260-26.262\n", "TR 38.331+1\n", "3GPP TS a23.501\n"]
    with tempfile.NamedTemporaryFile(mode="w", suffix=".txt", delete=False) as handle:
        handle.writelines(file_lines)
        temp_path = Path(handle.name)
    try:
        assert collect_spec_numbers(specs=None, spec_file=temp_path) == wanted
    finally:
        os.unlink(temp_path)