Commit 5732a721 authored by Jan Reimes
Browse files

feat(metadata): add release normalization and related tests

* Implement normalize_release_label function for consistent release formatting.
* Update RAGMetadata to normalize release during initialization.
* Add test_release_normalization_variants to validate release handling.
* Modify workspace operations to utilize new release resolution logic.
parent 78d34a64
Loading
Loading
Loading
Loading
+10 −0
Original line number Diff line number Diff line
@@ -50,6 +50,16 @@ class TestRAGMetadata:
        assert metadata.release == "Rel-18"
        assert metadata.wg == "SA4"

    def test_release_normalization_variants(self) -> None:
        """Check that differently written release selectors yield the expected labels."""
        # Raw selector -> label expected on the constructed metadata object.
        expected_by_input = (
            (" rel18 ", "Rel-18"),
            ("18.1.0", "Rel-18.1.0"),
            ("LATEST", "latest"),
        )

        # Build every instance first so construction errors surface before any assertion.
        built = [(RAGMetadata(tdoc_id="S4-250001", release=raw), expected) for raw, expected in expected_by_input]

        for metadata, expected in built:
            assert metadata.release == expected


class TestEnrichText:
    """Test enrich_text function."""
+2 −0
Original line number Diff line number Diff line
@@ -53,6 +53,7 @@ from threegpp_ai.operations.workspaces import (
    make_workspace_member,
    normalize_workspace_name,
    remove_invalid_members,
    resolve_spec_release_from_db,
    resolve_tdoc_checkout_path,
    resolve_workspace,
)
@@ -102,6 +103,7 @@ __all__ = [
    "make_workspace_member",
    "normalize_workspace_name",
    "remove_invalid_members",
    "resolve_spec_release_from_db",
    "resolve_tdoc_checkout_path",
    "resolve_workspace",
    "set_active_workspace",
+4 −1
Original line number Diff line number Diff line
@@ -56,7 +56,10 @@ ConvertMdOption = Annotated[
]
# CLI option alias for the optional ``--release`` selector (spec items only, per the help text).
# NOTE(review): two ``typer.Option("--release", ...)`` entries are listed below; the first
# looks like the pre-change help text from the diff — confirm only one is meant to remain.
WorkspaceReleaseOption = Annotated[
    str | None,
    typer.Option("--release", help="Spec release version (e.g., 16.3.0, 17.0.0). Only applies to specs."),
    typer.Option(
        "--release",
        help="Spec release selector: latest, 18, 18.1, 18.1.0, Rel-18. Only applies to specs.",
    ),
]
# CLI option alias for the optional ``--limit`` integer (maximum items to add).
WorkspaceLimitOption = Annotated[int | None, typer.Option("--limit", help="Maximum items to add")]
# CLI option alias for the ``--include-inactive`` boolean flag.
WorkspaceIncludeInactiveOption = Annotated[bool, typer.Option("--include-inactive", help="Include inactive members")]
+4 −18
Original line number Diff line number Diff line
@@ -20,12 +20,11 @@ from dotenv import load_dotenv
from rich.progress import MofNCompleteColumn, Progress, SpinnerColumn, TextColumn, TimeElapsedColumn
from rich.table import Table
from tdoc_crawler.config import CacheManager, resolve_cache_manager
from tdoc_crawler.database import SpecDatabase, TDocDatabase
from tdoc_crawler.database import TDocDatabase
from tdoc_crawler.logging import get_console, get_logger
from tdoc_crawler.models.base import OutputFormat, SortOrder
from tdoc_crawler.tdocs.models import TDocQueryConfig
from tdoc_crawler.utils.date_parser import parse_partial_date
from tdoc_crawler.utils.normalization import resolve_release_to_full_version

from threegpp_ai import (
    SourceKind,
@@ -43,6 +42,7 @@ from threegpp_ai import (
    make_workspace_member,
    normalize_workspace_name,
    remove_invalid_members,
    resolve_spec_release_from_db,
    set_active_workspace,
    summarize_document,
)
@@ -85,6 +85,7 @@ from threegpp_ai.lightrag.processor import DocumentProcessor
from threegpp_ai.lightrag.rag import PROVIDER_ALIASES, PROVIDERS
from threegpp_ai.operations.conversion import OFFICE_FORMATS, convert_to_pdf
from threegpp_ai.operations.convert import convert_document_to_markdown
from threegpp_ai.operations.extraction import extract_document_structured
from threegpp_ai.operations.workspace_registry import WorkspaceRegistry

# Load environment variables from .env file
@@ -252,19 +253,6 @@ def _resolve_workspace_items(
    return [row.tdoc_id for row in rows]


def _resolve_spec_release(item: str, release: str) -> str:
    """Resolve ``release`` against the versions recorded for ``item`` in the spec database.

    Args:
        item: Spec identifier passed to ``SpecDatabase.get_spec_versions``.
        release: Release selector supplied by the caller.

    Returns:
        The value produced by ``resolve_release_to_full_version`` when the database
        lists at least one version for ``item``; otherwise ``release`` unchanged.
        Any exception raised during the lookup is logged at debug level and the
        most recently resolved value (initially ``release``) is returned.
    """
    cache = resolve_cache_manager()
    outcome = release
    try:
        with SpecDatabase(cache.db_file) as spec_db:
            entries = spec_db.get_spec_versions(item)
            if entries:
                known_versions = [entry.version for entry in entries]
                outcome = resolve_release_to_full_version(release, known_versions)
    except Exception as exc:
        # Best-effort lookup: a failing database must not block processing the item.
        _logger.debug("Could not resolve release for %s: %s", item, exc)
    return outcome


def _process_single_item(
    *,
    item: str,
@@ -339,14 +327,12 @@ def _process_single_item(
                # Generic extraction (specs, other) - uses file path directly
                doc_file = _resolve_process_file(Path(source_path))
                if doc_file:
                    from threegpp_ai.operations.extraction import extract_document_structured

                    extract_document_structured(doc_file, metadata=None, force=False)
            was_md_extracted = True
        except Exception as e:
            _logger.debug("Failed to extract markdown for %s: %s", item, e)

    resolved_release = _resolve_spec_release(item, release) if source_kind == SourceKind.SPEC and release else None
    resolved_release = resolve_spec_release_from_db(item, release) if source_kind == SourceKind.SPEC and release else None
    source_item_id = f"{item}-REL{resolved_release}" if resolved_release else item
    member = make_workspace_member(workspace, source_item_id, source_path, source_kind)
    return member, None, was_converted, was_md_extracted
+24 −0
Original line number Diff line number Diff line
@@ -7,6 +7,27 @@ structured 3GPP metadata to document text before LightRAG insertion.
from __future__ import annotations

from pydantic import BaseModel, Field
from tdoc_crawler.utils.normalization import normalize_release


def normalize_release_label(release: str | None) -> str | None:
    """Turn a release string into a stable label for metadata display.

    Args:
        release: Raw release text, or ``None``.

    Returns:
        ``None`` when ``release`` is ``None`` or blank after stripping.
        ``"latest"`` or ``"all"`` when ``normalize_release`` reports that type.
        ``"Rel-<value>"`` (leading dashes removed from the value) when a
        normalized value is available. Otherwise the stripped input itself,
        including when ``normalize_release`` raises ``ValueError``.
    """
    if release is None:
        return None

    stripped = release.strip()
    if not stripped:
        return None

    try:
        kind, value, _ = normalize_release(stripped)
        if kind == "latest":
            return "latest"
        if kind == "all":
            return "all"
        if value is not None:
            return f"Rel-{value.lstrip('-')}"
    except ValueError:
        # Unrecognised selector: fall through and keep the cleaned input as the label.
        pass
    return stripped


class RAGMetadata(BaseModel):
@@ -54,6 +75,9 @@ class RAGMetadata(BaseModel):
        if self.spec_refs:
            self.spec_refs = [ref.strip() for ref in self.spec_refs if ref.strip()]

        # Normalize release to a consistent label for downstream headers.
        self.release = normalize_release_label(self.release)


def enrich_text(metadata: RAGMetadata, text: str) -> str:
    r"""Prepend normalized metadata to document text for deterministic extraction.
Loading