Commit 08089377 authored by Jan Reimes
Browse files

fix: Skip spec checkout when the spec already exists and enable markdown extraction for specs

- workspaces.py: Fix spec version matching to use 3GPP version codes (h00, i10) instead of release numbers (19, 18)
  - Resolves release parameter to actual version codes from database before checking existing checkouts
  - Prevents re-downloading specs that already exist with matching version
- cli.py: Enable markdown extraction for both TDocs and specs during add-members
  - Changed convert_md check from SourceKind.TDOC only to all source kinds
  - Uses unified convert_document_to_markdown() for all document types
  - Fixes missing tables/figures/equations extraction for specs

These fixes ensure:
1. Specs are not re-checked out when already present with matching version
2. Existing DOCX files are not overwritten
3. Markdown extraction with artifacts works for specs (not just TDocs)
4. TDC_AI_CONVERT_MD=1 environment variable now works for specs
parent 5cae87f3
Loading
Loading
Loading
Loading
+8 −5
Original line number Diff line number Diff line
@@ -327,15 +327,18 @@ def _process_single_item(
        pdf_path = _convert_member_to_pdf(member_for_convert)
        was_converted = pdf_path is not None

    # Optional markdown extraction (only for TDocs)
    # Optional markdown extraction (for TDocs and specs)
    was_md_extracted = False
    if convert_md and source_kind == SourceKind.TDOC:
    if convert_md:
        try:
            # Extract markdown - this will save to .ai folder
            convert_tdoc_to_markdown(document_id=item, force=False)
            # Extract markdown using unified pipeline - this will save to .ai folder
            # For TDocs: uses TDoc ID; for specs: uses spec number
            from threegpp_ai.operations.convert import convert_document_to_markdown

            convert_document_to_markdown(document_id=item, output_path=None, force=False)
            was_md_extracted = True
        except Exception as e:
            _logger.debug(f"Failed to extract markdown for {item}: {e}")
            _logger.debug("Failed to extract markdown for %s: %s", item, e)

    resolved_release = _resolve_spec_release(item, release) if source_kind == SourceKind.SPEC and release else None
    source_item_id = f"{item}-REL{resolved_release}" if resolved_release else item
+25 −5
Original line number Diff line number Diff line
@@ -520,7 +520,25 @@ def checkout_spec_to_workspace(
    # First check if already checked out with the SAME release version
    specs_dir = checkout_base / "Specs"
    if specs_dir.exists():
        # Search for spec with matching release version using both normalized and undotted forms
        # Resolve release to actual version(s) to match against 3GPP version codes (e.g., "h00", "i10")
        version_codes: list[str] = []
        try:
            cache_manager = resolve_cache_manager("default")
            db_path = db_file if db_file is not None else cache_manager.db_file
            with SpecDatabase(db_path) as db:
                versions = db.get_spec_versions(spec_number)
                if versions:
                    # Filter versions that match the requested release
                    for entry in versions:
                        if release == "latest" or release.startswith(entry.release.split(".")[0]):
                            # Extract the version code from the version string (e.g., "h00" from "26.260-h00")
                            if "-" in entry.version:
                                version_code = entry.version.split("-")[1]
                                version_codes.append(version_code)
        except Exception as exc:
            _logger.debug("Could not resolve version codes for %s: %s", spec_number, exc)

        # Search for spec with matching version code
        for spec_dir in specs_dir.rglob("*"):
            if not spec_dir.is_dir():
                continue
@@ -528,10 +546,12 @@ def checkout_spec_to_workspace(
            # Check if this directory matches our spec (normalized or undotted form)
            if normalized not in dir_name and undotted not in dir_name:
                continue
            # Check if release version matches (only for non-latest)
            if release != "latest" and release not in dir_name:
            # Check if version code matches (only for non-latest)
            if release != "latest":
                # Check if any resolved version code is in the directory name
                if not any(vc in dir_name for vc in version_codes):
                    continue
            _logger.debug(f"Spec {spec_number} (release {release}) already checked out at {spec_dir}")
            _logger.debug("Spec %s (release %s) already checked out at %s", spec_number, release, spec_dir)
            return spec_dir

    # Need to checkout the spec