Commit a43e7aa2 authored by Jan Reimes's avatar Jan Reimes
Browse files

feat(ai): adapt 3gpp-ai package to async database layer

Convert sync database context managers to async (with/await) for
SpecDatabase, TDocDatabase, and MeetingDatabase in workspaces.py,
convert.py, and cli.py. Add asyncio.run() wrappers for CLI entry
points. Update test files to use async database patterns.
parent e311e66a
Loading
Loading
Loading
Loading
+35 −27
Original line number Diff line number Diff line
@@ -249,12 +249,16 @@ def _resolve_workspace_items(
        limit=limit,
        order=SortOrder.DESC,
    )
-    with TDocDatabase(manager.db_file) as db:
-        rows = db.query_tdocs(config)
+
+    async def _fetch_tdocs() -> list[Any]:
+        async with TDocDatabase(manager.db_file) as db:
+            return await db.query_tdocs(config)
+
+    rows = asyncio.run(_fetch_tdocs())
    return [row.tdoc_id for row in rows]


-def _process_single_item(
+async def _process_single_item(
    *,
    item: str,
    workspace: str,
@@ -288,11 +292,11 @@ def _process_single_item(
    if checkout:
        checkout_path = None
        if source_kind == SourceKind.TDOC:
-            checkout_path = checkout_tdoc_to_workspace(item, manager.checkout_dir, workspace, db_file=manager.db_file)
+            checkout_path = await checkout_tdoc_to_workspace(item, manager.checkout_dir, workspace, db_file=manager.db_file)
            if checkout_path is None:
                return None, "TDoc not found in database or meeting not crawled", False, False
        elif source_kind == SourceKind.SPEC:
-            checkout_path = checkout_spec_to_workspace(
+            checkout_path = await checkout_spec_to_workspace(
                item,
                manager.checkout_dir,
                workspace,
@@ -323,7 +327,7 @@ def _process_single_item(
        try:
            if source_kind == SourceKind.TDOC:
                # TDoc extraction - uses TDoc ID to fetch files
-                convert_document_to_markdown(document_id=item, output_path=None, force=False)
+                asyncio.run(convert_document_to_markdown(document_id=item, output_path=None, force=False))
            else:
                # Generic extraction (specs, other) - uses file path directly
                doc_file = _resolve_process_file(Path(source_path))
@@ -333,7 +337,7 @@ def _process_single_item(
        except Exception as e:
            _logger.debug("Failed to extract markdown for %s: %s", item, e)

-    resolved_release = resolve_spec_release_from_db(item, release) if source_kind == SourceKind.SPEC and release else None
+    resolved_release = await resolve_spec_release_from_db(item, release) if source_kind == SourceKind.SPEC and release else None
    source_item_id = f"{item}-REL{resolved_release}" if resolved_release else item
    member = make_workspace_member(workspace, source_item_id, source_path, source_kind)
    return member, None, was_converted, was_md_extracted
@@ -351,8 +355,9 @@ def _build_workspace_members(
    members: list[Any] = []
    skipped: list[tuple[str, str]] = []

+    async def _build_members() -> tuple[list[Any], list[tuple[str, str]]]:
        for item in items:
-        member, skip_reason, _, _ = _process_single_item(
+            member, skip_reason, _, _ = await _process_single_item(
                item=item,
                workspace=workspace,
                source_kind=source_kind,
@@ -365,9 +370,10 @@ def _build_workspace_members(
                skipped.append((item, skip_reason))
            else:
                members.append(member)

        return members, skipped

+    return asyncio.run(_build_members())


def _resolve_process_file(path: Path) -> Path | None:
    """Resolve the actual document file from a path or directory.
@@ -439,14 +445,14 @@ def _convert_member_to_pdf(member: Any) -> Path | None:
        return None


-def _try_build_tdoc_metadata(source_item_id: str) -> RAGMetadata | None:
+async def _try_build_tdoc_metadata(source_item_id: str) -> RAGMetadata | None:
    if not source_item_id.startswith(("S", "R", "C", "T")):
        return None

    manager = resolve_cache_manager()
    try:
-        with TDocDatabase(manager.db_file) as db:
-            rows = db.query_tdocs(TDocQueryConfig(tdoc_ids=[source_item_id], order=SortOrder.ASC, limit=1))
+        async with TDocDatabase(manager.db_file) as db:
+            rows = await db.query_tdocs(TDocQueryConfig(tdoc_ids=[source_item_id], order=SortOrder.ASC, limit=1))
            if not rows:
                return None
            row = rows[0]
@@ -497,7 +503,7 @@ async def _process_workspace_members(
                    on_progress(len(results), member.source_item_id)
                continue

-            metadata = _try_build_tdoc_metadata(member.source_item_id)
+            metadata = await _try_build_tdoc_metadata(member.source_item_id)
            process_result = await processor.process_file(file_path, workspace, metadata=metadata)
            results.append(
                {
@@ -537,7 +543,9 @@ def ai_convert(
    json_output: JsonOutputOption = False,
) -> None:
    """Convert one TDoc and optionally persist markdown output."""
-    markdown_or_path = convert_tdoc_to_markdown(document_id=document_id, output_path=output, force=force)
+    markdown_or_path = asyncio.run(
+        convert_tdoc_to_markdown(document_id=document_id, output_path=output, force=force),
+    )

    if output:
        if json_output:
+2 −1
Original line number Diff line number Diff line
@@ -7,6 +7,7 @@ structured 3GPP metadata to document text before LightRAG insertion.
from __future__ import annotations

from pydantic import BaseModel, Field
+from tdoc_crawler.tdocs.utils import normalize_tdoc_id
from tdoc_crawler.utils.normalization import normalize_release


@@ -69,7 +70,7 @@ class RAGMetadata(BaseModel):
        """Normalize metadata after initialization."""
        # Normalize tdoc_id: uppercase and strip whitespace
        if self.tdoc_id:
-            self.tdoc_id = self.tdoc_id.strip().upper()
+            self.tdoc_id = normalize_tdoc_id(self.tdoc_id)

        # Normalize spec_refs: strip whitespace from each reference
        if self.spec_refs:
+3 −1
Original line number Diff line number Diff line
@@ -11,6 +11,8 @@ from dataclasses import dataclass
from enum import StrEnum
from typing import Any

+from tdoc_crawler.tdocs.utils import normalize_tdoc_id
+
from .metadata import normalize_release_label
from .rag import TDocRAG

@@ -141,7 +143,7 @@ class EntitySeeder:
            True if seeding was successful.
        """
        # Normalize tdoc_id
-        tdoc_id = tdoc_id.strip().upper()
+        tdoc_id = normalize_tdoc_id(tdoc_id)

        # Create TDoc entity
        description = f"3GPP Temporary Document {tdoc_id}"
+4 −3
Original line number Diff line number Diff line
@@ -16,6 +16,7 @@ from tdoc_crawler.database.meetings import MeetingDatabase
from tdoc_crawler.logging import get_logger
from tdoc_crawler.tdocs.models import TDocMetadata
from tdoc_crawler.tdocs.sources.whatthespec import resolve_via_whatthespec
+from tdoc_crawler.tdocs.utils import normalize_tdoc_id

from threegpp_ai.models import ExtractionError
from threegpp_ai.operations.conversion import (
@@ -131,7 +132,7 @@ def convert_tdoc_metadata(
        ValueError: If TDoc cannot be found via WhatTheSpec.
    """
    # Normalize TDoc ID
-    normalized_id = document_id.strip().upper()
+    normalized_id = normalize_tdoc_id(document_id)

    # Fetch metadata from WhatTheSpec
    logger.info("Fetching TDoc metadata for %s via WhatTheSpec", normalized_id)
@@ -193,7 +194,7 @@ def convert_document_to_markdown(
    )

    # Get TDoc metadata for header
-    normalized_id = document_id.strip().upper()
+    normalized_id = normalize_tdoc_id(document_id)
    metadata = resolve_via_whatthespec(normalized_id)
    if metadata:
        header = _format_tdoc_metadata(metadata)
@@ -232,7 +233,7 @@ def extract_document_structured_from_tdoc(
        ExtractionError: If no source file is available for the TDoc.
        ConversionError: If conversion/extraction fails.
    """
-    normalized_id = document_id.strip().upper()
+    normalized_id = normalize_tdoc_id(document_id)

    with timed_operation(get_metrics_tracker(), normalized_id, MetricType.CONVERSION):
        tdoc_files = fetch_tdoc_files(normalized_id, force_download=force)
+2 −1
Original line number Diff line number Diff line
@@ -7,6 +7,7 @@ import logging
import re

from litellm import exceptions as litellm_exceptions
+from tdoc_crawler.tdocs.utils import normalize_tdoc_id
from tdoc_crawler.utils.misc import utc_now

from threegpp_ai.config import AiConfig
@@ -316,7 +317,7 @@ def summarize_tdoc(
        LlmConfigError: If LLM endpoint is unreachable or misconfigured.
        ValueError: If format is not supported.
    """
-    normalized_id = document_id.strip().upper()
+    normalized_id = normalize_tdoc_id(document_id)

    with timed_operation(get_metrics_tracker(), normalized_id, MetricType.SUMMARIZATION):
        extraction = extract_tdoc_structured(normalized_id, force=force)
Loading