feat(normalization): add spec range expansion and batch processing (c510f401) · Commits · Jan Reimes / 3gpp-crawler

src/tdoc_crawler/cli/helpers.py

+27 −24

Original line number	Diff line number	Diff line
		@@ -14,6 +14,7 @@ from contextlib import suppress
		from pathlib import Path
		from urllib.parse import urlparse

		import click
		import requests
		import typer

		@@ -108,40 +109,42 @@ def parse_subgroups(values: list[str] | None) -> list[str] | None:


		def collect_spec_numbers(specs: list[str] | None, spec_file: Path | None) -> list[str]:
		    """Collect spec inputs from CLI arguments, stdin and an optional file, then expand them.

		    Args:
		        specs: Spec inputs provided on the command line. An item equal to "-"
		            is replaced by the non-blank lines read from stdin.
		        spec_file: Optional file containing spec inputs (one per line). A path
		            that does not exist is ignored.

		    Returns:
		        The result of ``expand_spec_ranges_batch`` for the collected inputs,
		        in input order, or an empty list when nothing was collected.

		    Raises:
		        click.FileError: If the spec file exists but cannot be read or decoded.
		        click.UsageError: If range expansion raises ``ValueError``.
		    """
		    collected: list[str] = []

		    for spec in specs or ():
		        if spec == "-":
		            # Read from stdin, one spec input per non-blank line.
		            collected.extend(text for line in sys.stdin if (text := line.strip()))
		        elif text := spec.strip():
		            # Blank arguments carry no spec input, so they are dropped here.
		            collected.append(text)

		    if spec_file is not None and spec_file.exists():
		        try:
		            with spec_file.open("r", encoding="utf-8") as handle:
		                collected.extend(text for line in handle if (text := line.strip()))
		        except (OSError, UnicodeDecodeError) as exc:
		            # UnicodeDecodeError is a ValueError, not an OSError, so it is listed explicitly.
		            raise click.FileError(str(spec_file), hint=f"Cannot read spec file: {exc}") from exc

		    if not collected:
		        return []

		    # Import here to avoid circular imports
		    from tdoc_crawler.specs.normalization import expand_spec_ranges_batch

		    try:
		        return expand_spec_ranges_batch(collected)
		    except ValueError as exc:
		        raise click.UsageError(str(exc)) from exc


		def build_limits(

src/tdoc_crawler/specs/normalization.py

+125 −19

Original line number	Diff line number	Diff line
		"""Normalization helpers for spec identifiers."""

		import re
		from collections.abc import Generator

		_SPEC_PATTERN = re.compile(r"^(?P<prefix>[A-Z]+)?(?P<body>\d{2,}(?:\.\d{3})?)$")
		# Two-digit series, a dot (optionally surrounded by whitespace), then a 1-3 digit increment.
		_DOTTED_BODY_PATTERN = re.compile(r"^(?P<series>\d{2})\s*\.\s*(?P<increment>\d{1,3})$")
		# Two-digit series immediately followed by a 1-3 digit increment (no dot).
		_UNDOTTED_BODY_PATTERN = re.compile(r"^(?P<series>\d{2})(?P<increment>\d{1,3})$")
		# "<spec>+<offset>"; whitespace around "+" is optional. A leading "-" is captured
		# so that the caller can reject negative offsets with its own message.
		_OFFSET_PATTERN = re.compile(r"^(?P<left>.+?)\s*\+\s*(?P<offset>-?\d+)\s*$")
		# Range separator "-" or ":" with optional surrounding whitespace; the separator
		# is captured, so a successful split yields [left, separator, right].
		_RANGE_SPLIT_PATTERN = re.compile(r"\s*([-:])\s*")


		def _strip_prefixes(value: str) -> str:
		    """Strip optional 3GPP and TS/TR prefixes from a spec string.

		    The input is trimmed and upper-cased before matching, so prefixes are
		    recognised case-insensitively and the returned text is upper-case.

		    Args:
		        value: Raw spec string, e.g. "3GPP TS 26.132".

		    Returns:
		        The remainder after the prefixes, with surrounding whitespace removed.
		    """
		    cleaned = value.strip().upper().removeprefix("3GPP").strip()
		    if cleaned.startswith(("TS", "TR")):
		        cleaned = cleaned[2:]
		    return cleaned.strip()


		def _parse_spec_number(value: str) -> tuple[str, str, str, int]:
		    """Parse a spec number into its components.

		    Args:
		        value: Spec string, optionally carrying 3GPP / TS / TR prefixes.

		    Returns:
		        A tuple ``(series, increment, format_kind, increment_digits)`` where
		        ``increment`` is zero-padded to three digits, ``format_kind`` is
		        "dotted" or "undotted", and ``increment_digits`` is the number of
		        increment digits actually present in the input.

		    Raises:
		        ValueError: If nothing remains after stripping prefixes, or the body
		            matches neither the dotted nor the undotted pattern.
		    """
		    body = _strip_prefixes(value)
		    if not body:
		        raise ValueError("Spec number is required")

		    # Whitespace is removed before matching in both forms, so one code path
		    # serves dotted and undotted input; only the pattern and label differ.
		    compact = re.sub(r"\s+", "", body)
		    if "." in compact:
		        pattern, format_kind = _DOTTED_BODY_PATTERN, "dotted"
		    else:
		        pattern, format_kind = _UNDOTTED_BODY_PATTERN, "undotted"

		    match = pattern.match(compact)
		    if match is None:
		        raise ValueError(f"Unsupported spec number format: {value}")

		    increment_raw = match.group("increment")
		    return match.group("series"), increment_raw.zfill(3), format_kind, len(increment_raw)


		def _validate_range(start_num: int, end_num: int, spec_input: str) -> None:
		    """Validate the inclusive increment range derived from a spec range input.

		    Args:
		        start_num: First increment of the range.
		        end_num: Last increment of the range (inclusive).
		        spec_input: The original user input, quoted in error messages.

		    Raises:
		        ValueError: If the range is reversed, ends beyond the three-digit
		            increment space, or spans too many specs.
		    """
		    if start_num > end_num:
		        raise ValueError(f"Invalid range: start {start_num} > end {end_num} (input: {spec_input!r})")
		    # Increments are at most three digits; anything above 999 would be
		    # rendered as e.g. "26.1000", which the spec parser itself rejects.
		    if end_num > 999:
		        raise ValueError(f"Range end {end_num} exceeds the maximum increment 999 (input: {spec_input!r})")
		    # NOTE(review): the origin of the 1329 limit is not visible here; it is kept unchanged.
		    if end_num - start_num > 1329:
		        raise ValueError(f"Range too large: {end_num - start_num + 1} specs (input: {spec_input!r})")


		def normalize_spec_number(value: str) -> str:
		@@ -17,26 +65,84 @@ def normalize_spec_number(value: str) -> str:
		Raises:
		ValueError: When the spec number is not in a supported format.
		"""
		raw = value.strip().upper().replace(" ", "")
		if not raw:
		series, increment, _, _ = _parse_spec_number(value)
		return f"{series}.{increment}"


		def expand_spec_ranges(spec_input: str) -> Generator[str, None, None]:
		    """Expand spec range syntax into individual spec numbers.

		    Supports formats:
		    - 26.260-26.266 (260,261,262,263,264,265,266)
		    - 26.260:26.266 (same as above, alias)
		    - 26.260+6 (260,261,262,263,264,265,266)
		    - Flexible prefixes: "3GPP TS 26.260-26.266", "TR 26260-26.266", etc.

		    An input without range syntax yields its single normalized spec number.
		    Because this is a generator, validation runs when iteration starts, not
		    when the function is called.

		    Args:
		        spec_input: Input string that may contain range syntax.

		    Yields:
		        Individual normalized spec numbers (e.g., "26.260", "26.261", ...)

		    Raises:
		        ValueError: If the range syntax is invalid or series numbers don't match.
		    """
		    cleaned = spec_input.strip()
		    if not cleaned:
		        raise ValueError("Spec number is required")

		    if "+" in cleaned:
		        # Offset form: "<start>+<count>" covers start .. start + count.
		        offset_match = _OFFSET_PATTERN.match(cleaned)
		        if offset_match is None:
		            raise ValueError("Invalid offset format")
		        offset = int(offset_match.group("offset"))
		        if offset < 0:
		            raise ValueError("Offset must be non-negative")
		        series, start_str, _, _ = _parse_spec_number(offset_match.group("left"))
		        start_num = int(start_str)
		        end_num = start_num + offset
		    else:
		        parts = _RANGE_SPLIT_PATTERN.split(cleaned, maxsplit=1)
		        if len(parts) != 3:
		            # No range separator: a single spec number.
		            yield normalize_spec_number(cleaned)
		            return

		        left, _sep, right = parts
		        if not right.strip():
		            raise ValueError("Missing end value in range")
		        series, num1, format1, digits1 = _parse_spec_number(left)
		        series2, num2, format2, digits2 = _parse_spec_number(right)
		        if series != series2:
		            raise ValueError(f"Series numbers don't match: {series} vs {series2}")
		        # Mixing dotted and undotted endpoints is only rejected when one of
		        # them has a single-digit increment (e.g. "26.260-266" parses its
		        # right side as series 26, increment 6).
		        if format1 != format2 and (digits1 == 1 or digits2 == 1):
		            raise ValueError("Both range endpoints must use the same format")
		        start_num = int(num1)
		        end_num = int(num2)

		    _validate_range(start_num, end_num, spec_input)
		    for num in range(start_num, end_num + 1):
		        yield f"{series}.{num:03d}"


		def expand_spec_ranges_batch(spec_inputs: list[str]) -> list[str]:
		    """Expand all spec range inputs in a batch.

		    Args:
		        spec_inputs: List of spec inputs that may contain range syntax.

		    Returns:
		        List of expanded and normalized individual spec numbers, in input
		        order. Invalid spec inputs are silently skipped.
		    """
		    expanded: list[str] = []
		    for spec_input in spec_inputs:
		        try:
		            # Materialise first: the generator validates lazily, so an input
		            # that fails must not leave partial output in the result.
		            numbers = list(expand_spec_ranges(spec_input))
		        except ValueError:
		            # Deliberate best-effort: skip invalid spec inputs silently.
		            continue
		        expanded.extend(numbers)
		    return expanded

tests/test_specs_normalization.py

+292 −1

Original line number	Diff line number	Diff line
		"""Tests for spec normalization utilities."""

		import os
		import tempfile
		from pathlib import Path

		import pytest

		from tdoc_crawler.specs.normalization import normalize_spec_number
		from tdoc_crawler.cli.helpers import collect_spec_numbers
		from tdoc_crawler.specs.normalization import expand_spec_ranges, expand_spec_ranges_batch, normalize_spec_number


		def test_normalize_dotted_spec_number() -> None:
		@@ -17,6 +22,292 @@ def test_normalize_prefixed_spec_number() -> None:
		assert normalize_spec_number("TS23.501") == "23.501"


		def test_normalize_prefix_spacing_variants() -> None:
		    """Prefixes with or without spacing normalize to the dotted form."""
		    cases = [
		        ("3GPPTS26132", "26.132"),
		        ("3GPP TS 26 132", "26.132"),
		        ("TR 26.071", "26.071"),
		        ("TS 2671", "26.071"),
		    ]
		    for raw, expected in cases:
		        assert normalize_spec_number(raw) == expected


		def test_normalize_tab_separator_variants() -> None:
		    """Tab characters are accepted as separators between components."""
		    cases = [
		        # Tab between 3GPP and TS/TR
		        ("3GPP\tTS\t26.123", "26.123"),
		        ("3GPP\tTR\t26.123", "26.123"),
		        # Tab between TS/TR and number
		        ("TS\t26.123", "26.123"),
		        ("TR\t26123", "26.123"),
		        # Tab within undotted number
		        ("26\t123", "26.123"),
		        ("26\t12", "26.012"),
		        # Mixed tabs and spaces
		        ("3GPP \t TS \t 26 \t 123", "26.123"),
		        ("3GPP\t TS \t 26.123", "26.123"),
		    ]
		    for raw, expected in cases:
		        assert normalize_spec_number(raw) == expected


		def test_normalize_no_whitespace_between_prefixes() -> None:
		    """Test no whitespace between 3GPP and TS/TR prefixes."""
		    # Each input appears once; the former "combination" assertions repeated
		    # the 3GPPTS/3GPPTR inputs verbatim and added no coverage.
		    compact_inputs = (
		        # No space between 3GPP and TS/TR, nor before the number
		        "3GPPTS26123",
		        "3GPPTR26123",
		        # No space between TS/TR and number
		        "TS26123",
		        "TR26123",
		    )
		    for raw in compact_inputs:
		        assert normalize_spec_number(raw) == "26.123"


		def test_normalize_invalid_spec_number() -> None:
		    """An input that is not a spec number raises ValueError."""
		    not_a_spec = "ABC"
		    with pytest.raises(ValueError):
		        normalize_spec_number(not_a_spec)


		def test_expand_range_hyphen() -> None:
		    """A hyphen-separated range expands to every spec in between."""
		    expanded = list(expand_spec_ranges("26.260-26.266"))
		    assert expanded == ["26.260", "26.261", "26.262", "26.263", "26.264", "26.265", "26.266"]


		def test_expand_range_colon() -> None:
		    """A colon-separated range expands like the hyphen form."""
		    expanded = list(expand_spec_ranges("26.260:26.266"))
		    assert expanded == ["26.260", "26.261", "26.262", "26.263", "26.264", "26.265", "26.266"]


		def test_expand_offset_plus() -> None:
		    """The "+N" offset syntax expands to the start spec plus N successors."""
		    expanded = list(expand_spec_ranges("26.260+6"))
		    assert expanded == ["26.260", "26.261", "26.262", "26.263", "26.264", "26.265", "26.266"]


		def test_expand_range_with_prefix() -> None:
		    """Ranges expand identically when endpoints carry prefixes."""
		    expected = ["26.260", "26.261", "26.262", "26.263", "26.264", "26.265", "26.266"]
		    for spec_input in ("TS 26.260-26.266", "3GPP TR 26.260 - TS 26266"):
		        assert list(expand_spec_ranges(spec_input)) == expected


		def test_expand_single_spec() -> None:
		    """A single spec without range syntax is yielded as one normalized item."""
		    for spec_input in ("26.260", "TS 26.260"):
		        assert list(expand_spec_ranges(spec_input)) == ["26.260"]


		def test_expand_range_mixed_formats() -> None:
		    """Ranges and offsets expand when dotted, undotted and prefixed forms are mixed."""
		    seven_specs = ["26.260", "26.261", "26.262", "26.263", "26.264", "26.265", "26.266"]
		    cases = [
		        ("TR 26260-26.266", seven_specs),
		        ("3GPP TS 26.260+6", seven_specs),
		        ("TR 23.501+2", ["23.501", "23.502", "23.503"]),
		    ]
		    for spec_input, expected in cases:
		        assert list(expand_spec_ranges(spec_input)) == expected


		def test_expand_range_invalid() -> None:
		    """Malformed range inputs raise ValueError once the generator is consumed."""
		    invalid_inputs = (
		        "26.260-25.266",  # Different series
		        "26.260-",  # Missing end
		        "26.260+",  # Missing offset value
		    )
		    for spec_input in invalid_inputs:
		        with pytest.raises(ValueError):
		            list(expand_spec_ranges(spec_input))


		def test_expand_range_reverse() -> None:
		    """A range whose start exceeds its end is rejected."""
		    reversed_range = "26.266-26.260"  # Start > end
		    with pytest.raises(ValueError):
		        list(expand_spec_ranges(reversed_range))


		def test_expand_range_batch() -> None:
		    """Batch expansion concatenates the expansion of each input in order."""
		    expanded = expand_spec_ranges_batch(["26.260-26.262", "TS 38.331", "TR 23.501+2"])
		    assert expanded == ["26.260", "26.261", "26.262", "38.331", "23.501", "23.502", "23.503"]


		def test_expand_range_batch_invalid() -> None:
		    """Batch expansion silently skips inputs that cannot be parsed."""
		    expanded = expand_spec_ranges_batch(["26.260-26.262", "invalid", "38.331"])
		    assert expanded == ["26.260", "26.261", "26.262", "38.331"]


		def test_expand_range_with_tab_separators() -> None:
		    """Range expansion tolerates tab characters wherever whitespace is allowed."""
		    expected = ["26.260", "26.261", "26.262", "26.263", "26.264", "26.265", "26.266"]
		    tabbed_inputs = (
		        "26.260-\t26.266",  # Tab in range separator
		        "26.260\t-\t26.266",  # Tabs around hyphen
		        "26\t260-26\t266",  # Tab within spec numbers (undotted)
		        "3GPP\tTS\t26.260-26.266",  # Tab between prefix and number
		        "26.260+\t6",  # Tab in offset
		    )
		    for spec_input in tabbed_inputs:
		        assert list(expand_spec_ranges(spec_input)) == expected


		def test_expand_range_with_no_whitespace() -> None:
		    """Range expansion works when prefixes are glued to the numbers."""
		    expected = ["26.260", "26.261", "26.262", "26.263", "26.264", "26.265", "26.266"]
		    glued_inputs = (
		        "3GPPTS26260-26.266",  # No space between 3GPP and TS in range
		        "TS26260-TS26266",  # No space between TS and number in range
		        "3GPPTS26260-26266",  # Mixed: prefixed start, bare end
		    )
		    for spec_input in glued_inputs:
		        assert list(expand_spec_ranges(spec_input)) == expected


		def test_expand_all_range_variants() -> None:
		    """Test all range syntax variants mentioned in requirements."""
		    full_range = ["26.260", "26.261", "26.262", "26.263", "26.264", "26.265", "26.266"]
		    equivalent_inputs = (
		        "26.260-26.266",  # Hyphen range with full 5-digit format
		        "26.260:26.266",  # Colon range (alias for hyphen)
		        "26.260+6",  # Offset mode
		        "3GPP TS 26.260-26.266",  # Mixed prefixes with whitespace
		        "TR 26260-26.266",  # Undotted start, dotted end
		        "3GPP TR 26.260 - TS 26266",  # Complex mixed prefixes with whitespace
		        "TS 26.260:TR 26.266",  # Prefix on both sides but different
		    )
		    # expand_spec_ranges_batch already returns a list, so no list() wrapper is needed.
		    for spec_input in equivalent_inputs:
		        assert expand_spec_ranges_batch([spec_input]) == full_range

		    # 26.260-266 (ambiguous) is silently skipped
		    assert expand_spec_ranges_batch(["26.260-266"]) == []

		    # Dotted with one-digit suffix is allowed and normalized
		    assert expand_spec_ranges_batch(["26.2-26.4"]) == ["26.002", "26.003", "26.004"]


		def test_expand_edge_cases() -> None:
		    """Test edge cases for range expansion."""
		    cases = [
		        # Single spec (no range)
		        ("26.260", ["26.260"]),
		        # Range of 1 (start == end)
		        ("26.260-26.260", ["26.260"]),
		        # Reverse range (silently skipped in batch mode)
		        ("26.266-26.260", []),
		        # Offset 0
		        ("26.260+0", ["26.260"]),
		        # Offset 1
		        ("26.260+1", ["26.260", "26.261"]),
		        # Large range
		        ("38.331-38.335", ["38.331", "38.332", "38.333", "38.334", "38.335"]),
		        # Cross hundred boundary
		        ("23.498-23.502", ["23.498", "23.499", "23.500", "23.501", "23.502"]),
		    ]
		    # expand_spec_ranges_batch already returns a list, so no list() wrapper is needed.
		    for spec_input, expected in cases:
		        assert expand_spec_ranges_batch([spec_input]) == expected


		def test_expand_invalid_ranges() -> None:
		    """Test invalid range syntax - invalid specs are silently skipped in batch mode."""
		    skipped_inputs = (
		        "26.260-",  # Missing second part after hyphen
		        "26.260+",  # Missing offset number
		        "26.260+-1",  # Negative offset
		        "26.260+abc",  # Non-numeric offset
		        "abc",  # Completely wrong format
		    )
		    # expand_spec_ranges_batch already returns a list, so no list() wrapper is needed.
		    for spec_input in skipped_inputs:
		        assert expand_spec_ranges_batch([spec_input]) == []

		    # Mixed dotted/undotted formats with full increment digits is allowed
		    assert expand_spec_ranges_batch(["26.260-26266"]) == [
		        "26.260",
		        "26.261",
		        "26.262",
		        "26.263",
		        "26.264",
		        "26.265",
		        "26.266",
		    ]

		    # Valid specs are still processed alongside invalid ones
		    assert expand_spec_ranges_batch(["abc", "26.260", "invalid", "26.261"]) == ["26.260", "26.261"]


		def test_expand_with_collect_spec_numbers() -> None:
		    """Test that range expansion works with CLI helper."""
		    # Positional arguments using ranges
		    result = collect_spec_numbers(["26.260-26.262", "38.331+1"], None)
		    assert result == ["26.260", "26.261", "26.262", "38.331", "38.332"]

		    # File containing ranges. The temporary directory is removed on exit even
		    # when an assertion fails, and the file is written as UTF-8 explicitly.
		    with tempfile.TemporaryDirectory() as tmp_dir:
		        spec_file = Path(tmp_dir) / "specs.txt"
		        spec_file.write_text(
		            "26.260-26.262\n"
		            "TR 38.331+1\n"
		            "3GPP TS a23.501\n",  # Should be ignored/skipped
		            encoding="utf-8",
		        )
		        result = collect_spec_numbers(specs=None, spec_file=spec_file)
		    assert result == ["26.260", "26.261", "26.262", "38.331", "38.332"]