Commit a43e7aa2 authored by Jan Reimes's avatar Jan Reimes
Browse files

feat(ai): adapt 3gpp-ai package to async database layer

Convert sync database context managers to async (with/await) for
SpecDatabase, TDocDatabase, and MeetingDatabase in workspaces.py,
convert.py, and cli.py. Add asyncio.run() wrappers for CLI entry
points. Update test files to use async database patterns.
parent e311e66a
Loading
Loading
Loading
Loading
+35 −27
Original line number Diff line number Diff line
@@ -249,12 +249,16 @@ def _resolve_workspace_items(
        limit=limit,
        order=SortOrder.DESC,
    )
-    with TDocDatabase(manager.db_file) as db:
-        rows = db.query_tdocs(config)
+
+    async def _fetch_tdocs() -> list[Any]:
+        async with TDocDatabase(manager.db_file) as db:
+            return await db.query_tdocs(config)
+
+    rows = asyncio.run(_fetch_tdocs())
    return [row.tdoc_id for row in rows]


-def _process_single_item(
+async def _process_single_item(
    *,
    item: str,
    workspace: str,
@@ -288,11 +292,11 @@ def _process_single_item(
    if checkout:
        checkout_path = None
        if source_kind == SourceKind.TDOC:
-            checkout_path = checkout_tdoc_to_workspace(item, manager.checkout_dir, workspace, db_file=manager.db_file)
+            checkout_path = await checkout_tdoc_to_workspace(item, manager.checkout_dir, workspace, db_file=manager.db_file)
            if checkout_path is None:
                return None, "TDoc not found in database or meeting not crawled", False, False
        elif source_kind == SourceKind.SPEC:
-            checkout_path = checkout_spec_to_workspace(
+            checkout_path = await checkout_spec_to_workspace(
                item,
                manager.checkout_dir,
                workspace,
@@ -323,7 +327,7 @@ def _process_single_item(
        try:
            if source_kind == SourceKind.TDOC:
                # TDoc extraction - uses TDoc ID to fetch files
-                convert_document_to_markdown(document_id=item, output_path=None, force=False)
+                asyncio.run(convert_document_to_markdown(document_id=item, output_path=None, force=False))
            else:
                # Generic extraction (specs, other) - uses file path directly
                doc_file = _resolve_process_file(Path(source_path))
@@ -333,7 +337,7 @@ def _process_single_item(
        except Exception as e:
            _logger.debug("Failed to extract markdown for %s: %s", item, e)

-    resolved_release = resolve_spec_release_from_db(item, release) if source_kind == SourceKind.SPEC and release else None
+    resolved_release = await resolve_spec_release_from_db(item, release) if source_kind == SourceKind.SPEC and release else None
    source_item_id = f"{item}-REL{resolved_release}" if resolved_release else item
    member = make_workspace_member(workspace, source_item_id, source_path, source_kind)
    return member, None, was_converted, was_md_extracted
@@ -351,8 +355,9 @@ def _build_workspace_members(
    members: list[Any] = []
    skipped: list[tuple[str, str]] = []

+    async def _build_members() -> tuple[list[Any], list[tuple[str, str]]]:
        for item in items:
-        member, skip_reason, _, _ = _process_single_item(
+            member, skip_reason, _, _ = await _process_single_item(
                item=item,
                workspace=workspace,
                source_kind=source_kind,
@@ -365,9 +370,10 @@ def _build_workspace_members(
                skipped.append((item, skip_reason))
            else:
                members.append(member)

        return members, skipped

+    return asyncio.run(_build_members())


def _resolve_process_file(path: Path) -> Path | None:
    """Resolve the actual document file from a path or directory.
@@ -439,14 +445,14 @@ def _convert_member_to_pdf(member: Any) -> Path | None:
        return None


-def _try_build_tdoc_metadata(source_item_id: str) -> RAGMetadata | None:
+async def _try_build_tdoc_metadata(source_item_id: str) -> RAGMetadata | None:
    if not source_item_id.startswith(("S", "R", "C", "T")):
        return None

    manager = resolve_cache_manager()
    try:
-        with TDocDatabase(manager.db_file) as db:
-            rows = db.query_tdocs(TDocQueryConfig(tdoc_ids=[source_item_id], order=SortOrder.ASC, limit=1))
+        async with TDocDatabase(manager.db_file) as db:
+            rows = await db.query_tdocs(TDocQueryConfig(tdoc_ids=[source_item_id], order=SortOrder.ASC, limit=1))
            if not rows:
                return None
            row = rows[0]
@@ -497,7 +503,7 @@ async def _process_workspace_members(
                    on_progress(len(results), member.source_item_id)
                continue

-            metadata = _try_build_tdoc_metadata(member.source_item_id)
+            metadata = await _try_build_tdoc_metadata(member.source_item_id)
            process_result = await processor.process_file(file_path, workspace, metadata=metadata)
            results.append(
                {
@@ -537,7 +543,9 @@ def ai_convert(
    json_output: JsonOutputOption = False,
) -> None:
    """Convert one TDoc and optionally persist markdown output."""
-    markdown_or_path = convert_tdoc_to_markdown(document_id=document_id, output_path=output, force=force)
+    markdown_or_path = asyncio.run(
+        convert_tdoc_to_markdown(document_id=document_id, output_path=output, force=force),
+    )

    if output:
        if json_output:
+2 −1
Original line number Diff line number Diff line
@@ -7,6 +7,7 @@ structured 3GPP metadata to document text before LightRAG insertion.
from __future__ import annotations

from pydantic import BaseModel, Field
+from tdoc_crawler.tdocs.utils import normalize_tdoc_id
from tdoc_crawler.utils.normalization import normalize_release


@@ -69,7 +70,7 @@ class RAGMetadata(BaseModel):
        """Normalize metadata after initialization."""
        # Normalize tdoc_id: uppercase and strip whitespace
        if self.tdoc_id:
-            self.tdoc_id = self.tdoc_id.strip().upper()
+            self.tdoc_id = normalize_tdoc_id(self.tdoc_id)

        # Normalize spec_refs: strip whitespace from each reference
        if self.spec_refs:
+3 −1
Original line number Diff line number Diff line
@@ -11,6 +11,8 @@ from dataclasses import dataclass
from enum import StrEnum
from typing import Any

+from tdoc_crawler.tdocs.utils import normalize_tdoc_id
+
from .metadata import normalize_release_label
from .rag import TDocRAG

@@ -141,7 +143,7 @@ class EntitySeeder:
            True if seeding was successful.
        """
        # Normalize tdoc_id
-        tdoc_id = tdoc_id.strip().upper()
+        tdoc_id = normalize_tdoc_id(tdoc_id)

        # Create TDoc entity
        description = f"3GPP Temporary Document {tdoc_id}"
+4 −3
Original line number Diff line number Diff line
@@ -16,6 +16,7 @@ from tdoc_crawler.database.meetings import MeetingDatabase
from tdoc_crawler.logging import get_logger
from tdoc_crawler.tdocs.models import TDocMetadata
from tdoc_crawler.tdocs.sources.whatthespec import resolve_via_whatthespec
+from tdoc_crawler.tdocs.utils import normalize_tdoc_id

from threegpp_ai.models import ExtractionError
from threegpp_ai.operations.conversion import (
@@ -131,7 +132,7 @@ def convert_tdoc_metadata(
        ValueError: If TDoc cannot be found via WhatTheSpec.
    """
    # Normalize TDoc ID
-    normalized_id = document_id.strip().upper()
+    normalized_id = normalize_tdoc_id(document_id)

    # Fetch metadata from WhatTheSpec
    logger.info("Fetching TDoc metadata for %s via WhatTheSpec", normalized_id)
@@ -193,7 +194,7 @@ def convert_document_to_markdown(
    )

    # Get TDoc metadata for header
-    normalized_id = document_id.strip().upper()
+    normalized_id = normalize_tdoc_id(document_id)
    metadata = resolve_via_whatthespec(normalized_id)
    if metadata:
        header = _format_tdoc_metadata(metadata)
@@ -232,7 +233,7 @@ def extract_document_structured_from_tdoc(
        ExtractionError: If no source file is available for the TDoc.
        ConversionError: If conversion/extraction fails.
    """
-    normalized_id = document_id.strip().upper()
+    normalized_id = normalize_tdoc_id(document_id)

    with timed_operation(get_metrics_tracker(), normalized_id, MetricType.CONVERSION):
        tdoc_files = fetch_tdoc_files(normalized_id, force_download=force)
+2 −1
Original line number Diff line number Diff line
@@ -7,6 +7,7 @@ import logging
import re

from litellm import exceptions as litellm_exceptions
+from tdoc_crawler.tdocs.utils import normalize_tdoc_id
from tdoc_crawler.utils.misc import utc_now

from threegpp_ai.config import AiConfig
@@ -316,7 +317,7 @@ def summarize_tdoc(
        LlmConfigError: If LLM endpoint is unreachable or misconfigured.
        ValueError: If format is not supported.
    """
-    normalized_id = document_id.strip().upper()
+    normalized_id = normalize_tdoc_id(document_id)

    with timed_operation(get_metrics_tracker(), normalized_id, MetricType.SUMMARIZATION):
        extraction = extract_tdoc_structured(normalized_id, force=force)
Loading