fix(circular-import): resolve PLC0415 by moving LiteLLMClient to shared llm_client.py (9b1cf380) · Commits · Jan Reimes / 3gpp-crawler

packages/3gpp-ai/threegpp_ai/operations/figure_descriptor.py

+2 −2

Original line number	Diff line number	Diff line
		@@ -5,6 +5,8 @@ from __future__ import annotations
		import logging
		from pathlib import Path

		from .llm_client import LiteLLMClient

		logger = logging.getLogger(__name__)


		@@ -45,8 +47,6 @@ def describe_figures(
		def _describe_figure(image_path: Path, *, caption: str \| None = None, model: str \| None = None) -> str \| None:
		"""Describe a single figure image using the summarize LiteLLM client."""
		try:
		from threegpp_ai.operations.summarize import LiteLLMClient

		prompt = "Describe this technical figure in 2-3 concise sentences."
		if caption:
		prompt = f"{prompt} Caption hint: {caption}"

packages/3gpp-ai/threegpp_ai/operations/llm_client.py

0 → 100644

+90 −0

Original line number	Diff line number	Diff line
		"""Generic LiteLLM client wrapper."""

		from __future__ import annotations

		import base64
		import logging
		import mimetypes
		from pathlib import Path

		import litellm

		from threegpp_ai.config import AiConfig

		logger = logging.getLogger(__name__)

		# Default system prompt for technical document analysis.
		# Used as the default value of the ``system_prompt`` parameter of
		# ``LiteLLMClient.complete`` below.
		DEFAULT_SYSTEM_PROMPT = """You are a technical document analyzer specializing in 3GPP TDoc documents.
		Generate concise, informative summaries following the specified structure."""


		class LiteLLMClient:
		    """Singleton wrapper around ``litellm.completion``.

		    ``LiteLLMClient()`` always returns the same object. The instance carries
		    no state of its own and ``complete`` is a static method, so it may be
		    called on either the class or an instance.
		    """

		    _instance: LiteLLMClient | None = None

		    def __new__(cls) -> LiteLLMClient:
		        """Return the shared instance, creating it on first use."""
		        if cls._instance is None:
		            cls._instance = super().__new__(cls)
		        return cls._instance

		    def __init__(self) -> None:
		        """Initialize the singleton instance (there is no state to set up)."""

		    @staticmethod
		    def complete(
		        prompt: str,
		        system_prompt: str = DEFAULT_SYSTEM_PROMPT,
		        max_tokens: int = 256000,
		        model: str | None = None,
		        images: list[Path] | None = None,
		    ) -> str:
		        """Generate completion from prompt.

		        Configuration is re-read from the environment on every call via
		        ``AiConfig.from_env()``.

		        Args:
		            prompt: User prompt.
		            system_prompt: System prompt.
		            max_tokens: Maximum tokens in response.
		            model: Model identifier (e.g., 'openai/gpt-4o-mini'). Defaults to config.llm_model.
		            images: Optional list of image paths for vision-capable models. Each
		                file is read and embedded in the user message as a base64 data URI.

		        Returns:
		            Generated text (the message content of the first choice).

		        Raises:
		            Exception: Any error raised while reading the images or calling
		                LiteLLM is logged and re-raised unchanged.
		        """
		        cfg = AiConfig.from_env()
		        if cfg.llm_api_base is not None and cfg.llm_api_key is None:
		            # Only a warning: the request is still attempted below.
		            msg = f"LLM API base URL is set ({cfg.llm_api_base}) but API key is missing. Please set TDC_AI_LLM_API_KEY environment variable."
		            logger.warning(msg)

		        try:
		            user_content: str | list[dict[str, object]]
		            if images:
		                user_content = [{"type": "text", "text": prompt}]
		                for image_path in images:
		                    mime_type, _ = mimetypes.guess_type(str(image_path))
		                    # Fall back to PNG when the extension is not recognised.
		                    mime_type = mime_type or "image/png"
		                    encoded = base64.b64encode(image_path.read_bytes()).decode("ascii")
		                    user_content.append(
		                        {
		                            "type": "image_url",
		                            # Use the object form ({"url": ...}) of the OpenAI
		                            # content-part schema rather than a bare string.
		                            "image_url": {"url": f"data:{mime_type};base64,{encoded}"},
		                        }
		                    )
		            else:
		                user_content = prompt

		            response = litellm.completion(
		                model=model or cfg.llm_model,
		                messages=[
		                    {"role": "system", "content": system_prompt},
		                    {"role": "user", "content": user_content},
		                ],
		                max_tokens=max_tokens,
		                api_key=cfg.llm_api_key,
		                base_url=cfg.llm_api_base,
		            )
		            return response.choices[0].message.content
		        except Exception as e:
		            logger.error("LLM completion failed: %s", e)
		            raise

packages/3gpp-ai/threegpp_ai/operations/summarize.py

+4 −80

Original line number	Diff line number	Diff line
		@@ -2,20 +2,17 @@

		from __future__ import annotations

		import base64
		import json
		import logging
		import mimetypes
		import re
		from pathlib import Path

		import litellm

		from tdoc_crawler.utils.misc import utc_now
		from threegpp_ai.config import AiConfig
		from threegpp_ai.models import DocumentSummary, LlmConfigError, SummarizeResult
		from threegpp_ai.operations.convert import extract_tdoc_structured
		from threegpp_ai.operations.metrics import MetricType, get_metrics_tracker, timed_operation

		from .convert import extract_tdoc_structured
		from .llm_client import LiteLLMClient
		from .metrics import MetricType, get_metrics_tracker, timed_operation

		logger = logging.getLogger(__name__)

		@@ -41,9 +38,6 @@ def _truncate_text(text: str, max_chars: int) -> str:


		# Prompt templates
		SUMMARY_SYSTEM_PROMPT = """You are a technical document analyzer specializing in 3GPP TDoc documents.
		Generate concise, informative summaries following the specified structure."""

		ABSTRACT_PROMPT = """Generate a brief abstract (150-250 words) for this document:

		{content}
		@@ -87,76 +81,6 @@ def _get_llm_client() -> LiteLLMClient:
		return LiteLLMClient()


		class LiteLLMClient:
		    """Singleton wrapper around ``litellm.completion``.

		    ``LiteLLMClient()`` always returns the same object. The instance carries
		    no state of its own and ``complete`` is a static method, so it may be
		    called on either the class or an instance.
		    """

		    _instance: LiteLLMClient | None = None

		    def __new__(cls) -> LiteLLMClient:
		        """Return the shared instance, creating it on first use."""
		        if cls._instance is None:
		            cls._instance = super().__new__(cls)
		        return cls._instance

		    def __init__(self) -> None:
		        """Log that the client was requested.

		        Python calls ``__init__`` on every ``LiteLLMClient()`` expression, so
		        this message is emitted each time, not only on first creation.
		        """
		        logger.info("LiteLLM client initialized")

		    @staticmethod
		    def complete(
		        prompt: str,
		        system_prompt: str = SUMMARY_SYSTEM_PROMPT,
		        max_tokens: int = 256000,
		        model: str | None = None,
		        images: list[Path] | None = None,
		    ) -> str:
		        """Generate completion from prompt.

		        Configuration is re-read from the environment on every call via
		        ``AiConfig.from_env()``.

		        Args:
		            prompt: User prompt.
		            system_prompt: System prompt.
		            max_tokens: Maximum tokens in response.
		            model: Model identifier (e.g., 'openai/gpt-4o-mini'). Defaults to config.llm_model.
		            images: Optional list of image paths for vision-capable models. Each
		                file is read and embedded in the user message as a base64 data URI.

		        Returns:
		            Generated text (the message content of the first choice).

		        Raises:
		            Exception: Any error raised while reading the images or calling
		                LiteLLM is logged and re-raised unchanged.
		        """
		        cfg = AiConfig.from_env()
		        if cfg.llm_api_base is not None and cfg.llm_api_key is None:
		            # Only a warning: the request is still attempted below.
		            msg = f"LLM API base URL is set ({cfg.llm_api_base}) but API key is missing. Please set TDC_AI_LLM_API_KEY environment variable."
		            logger.warning(msg)

		        try:
		            user_content: str | list[dict[str, object]]
		            if images:
		                user_content = [{"type": "text", "text": prompt}]
		                for image_path in images:
		                    mime_type, _ = mimetypes.guess_type(str(image_path))
		                    # Fall back to PNG when the extension is not recognised.
		                    mime_type = mime_type or "image/png"
		                    encoded = base64.b64encode(image_path.read_bytes()).decode("ascii")
		                    user_content.append(
		                        {
		                            "type": "image_url",
		                            # Use the object form ({"url": ...}) of the OpenAI
		                            # content-part schema rather than a bare string.
		                            "image_url": {"url": f"data:{mime_type};base64,{encoded}"},
		                        }
		                    )
		            else:
		                user_content = prompt

		            response = litellm.completion(
		                model=model or cfg.llm_model,
		                messages=[
		                    {"role": "system", "content": system_prompt},
		                    {"role": "user", "content": user_content},
		                ],
		                max_tokens=max_tokens,
		                api_key=cfg.llm_api_key,
		                base_url=cfg.llm_api_base,
		            )
		            return response.choices[0].message.content
		        except Exception as e:
		            logger.error("LLM completion failed: %s", e)
		            raise


		def _strip_code_fences(payload: str \| None) -> str:
		"""Strip optional markdown code fences from LLM payloads."""
		if payload is None:

tests/ai/test_ai_extraction_artifacts.py

0 → 100644

+257 −0

Original line number	Diff line number	Diff line
		"""Integration tests for extraction artifact storage.

		Tests the folder-based storage pattern for tables, figures, and equations
		extracted from TDoc documents.
		"""

		import shutil
		import tempfile
		from collections.abc import Iterator
		from pathlib import Path

		import pytest
		from threegpp_ai.models import (
		    ExtractedEquationElement,
		    ExtractedFigureElement,
		    ExtractedTableElement,
		)
		from threegpp_ai.operations.extraction_result import (
		    build_structured_extraction_result,
		    has_cached_artifacts,
		    persist_equations_from_extraction,
		    persist_figures_from_extraction,
		    persist_tables_from_extraction,
		    read_cached_artifacts,
		)


		class TestArtifactStorage:
		    """Test artifact storage utilities."""

		    @pytest.fixture
		    def temp_ai_dir(self) -> Iterator[Path]:
		        """Yield a fresh ``.ai`` directory inside a temporary folder.

		        The temporary folder is removed again after the test has finished.
		        """
		        tmpdir = Path(tempfile.mkdtemp())
		        try:
		            ai_dir = tmpdir / ".ai"
		            ai_dir.mkdir()
		            yield ai_dir
		        finally:
		            # Also runs when setup fails part-way, so no temp folder leaks.
		            shutil.rmtree(tmpdir)

		    @pytest.fixture
		    def sample_tables(self) -> list[ExtractedTableElement]:
		        """Create sample table elements."""
		        return [
		            ExtractedTableElement(
		                element_id="table_1",
		                page_number=1,
		                row_count=2,
		                column_count=3,
		                cells=[["A1", "B1", "C1"], ["A2", "B2", "C2"]],
		                markdown="| A1 | B1 | C1 |\n| A2 | B2 | C2 |",
		                caption="Test table caption",
		            ),
		            ExtractedTableElement(
		                element_id="table_2",
		                page_number=3,
		                row_count=4,
		                column_count=2,
		                cells=[["X1", "Y1"], ["X2", "Y2"], ["X3", "Y3"], ["X4", "Y4"]],
		                markdown="| X1 | Y1 |\n| X2 | Y2 |\n| X3 | Y3 |\n| X4 | Y4 |",
		            ),
		        ]

		    @pytest.fixture
		    def sample_equations(self) -> list[ExtractedEquationElement]:
		        """Create sample equation elements."""
		        return [
		            ExtractedEquationElement(
		                element_id="equation_1",
		                page_number=2,
		                latex=r"E = mc^2",
		                raw_text="E = mc^2",
		            ),
		            ExtractedEquationElement(
		                element_id="equation_2",
		                page_number=5,
		                latex=r"\int_0^\infty e^{-x} dx = 1",
		                raw_text="integral from 0 to infinity",
		            ),
		        ]

		    @pytest.fixture
		    def sample_figures(self) -> list[ExtractedFigureElement]:
		        """Create sample figure elements with image bytes in metadata."""
		        return [
		            ExtractedFigureElement(
		                element_id="figure_1",
		                page_number=1,
		                image_path="/path/to/figure_1.png",
		                image_format="png",
		                caption="Test figure caption",
		                metadata={"image_bytes": b"\x89PNG\r\n\x1a\n" + b"\x00" * 100},
		            ),
		            ExtractedFigureElement(
		                element_id="figure_2",
		                page_number=4,
		                image_path="/path/to/figure_2.jpg",
		                image_format="jpeg",
		                metadata={"image_bytes": b"\xff\xd8\xff" + b"\x00" * 100},
		            ),
		        ]

		    def test_persist_tables_creates_folder_structure(self, temp_ai_dir: Path, sample_tables: list[ExtractedTableElement]) -> None:
		        """Tables are persisted in individual JSON files under tables/ subfolder."""
		        doc_stem = "S4-250638"

		        # The tables/ folder is deliberately not pre-created, so this test
		        # really checks that persisting creates the folder structure.
		        paths = persist_tables_from_extraction(sample_tables, temp_ai_dir, doc_stem)

		        assert len(paths) == 2
		        assert (temp_ai_dir / "tables" / "S4-250638_table_1_1.json").exists()
		        assert (temp_ai_dir / "tables" / "S4-250638_table_3_2.json").exists()

		    def test_persist_equations_creates_folder_structure(self, temp_ai_dir: Path, sample_equations: list[ExtractedEquationElement]) -> None:
		        """Equations are persisted in individual JSON files under equations/ subfolder."""
		        doc_stem = "S4-250638"

		        # The equations/ folder is deliberately not pre-created (see above).
		        paths = persist_equations_from_extraction(sample_equations, temp_ai_dir, doc_stem)

		        assert len(paths) == 2
		        assert (temp_ai_dir / "equations" / "S4-250638_equation_2_1.json").exists()
		        assert (temp_ai_dir / "equations" / "S4-250638_equation_5_2.json").exists()

		    def test_persist_figures_creates_folder_structure(self, temp_ai_dir: Path, sample_figures: list[ExtractedFigureElement]) -> None:
		        """Figures are persisted with metadata under figures/ subfolder."""
		        doc_stem = "S4-250638"
		        # Unlike tables and equations, the figures helper is handed the
		        # figures/ folder itself, so the test creates it up front.
		        figures_dir = temp_ai_dir / "figures"
		        figures_dir.mkdir(parents=True, exist_ok=True)

		        paths = persist_figures_from_extraction(sample_figures, figures_dir, doc_stem)

		        assert len(paths) == 2
		        assert "figure_1" in paths
		        assert "figure_2" in paths

		    def test_read_cached_artifacts_reconstructs_result(
		        self,
		        temp_ai_dir: Path,
		        sample_tables: list[ExtractedTableElement],
		        sample_equations: list[ExtractedEquationElement],
		        sample_figures: list[ExtractedFigureElement],
		    ) -> None:
		        """read_cached_artifacts reconstructs StructuredExtractionResult from folder storage."""
		        doc_stem = "S4-250638"

		        # Persist all artifacts
		        persist_tables_from_extraction(sample_tables, temp_ai_dir, doc_stem)
		        persist_equations_from_extraction(sample_equations, temp_ai_dir, doc_stem)
		        figures_dir = temp_ai_dir / "figures"
		        figures_dir.mkdir(parents=True, exist_ok=True)
		        persist_figures_from_extraction(sample_figures, figures_dir, doc_stem)

		        # Read back
		        cached = read_cached_artifacts(temp_ai_dir, doc_stem)

		        assert cached is not None
		        assert len(cached.tables) == 2
		        assert len(cached.equations) == 2
		        assert len(cached.figures) == 2

		        # Verify table data integrity
		        assert cached.tables[0].element_id == "table_1"
		        assert cached.tables[0].page_number == 1
		        assert cached.tables[0].cells == [["A1", "B1", "C1"], ["A2", "B2", "C2"]]

		        # Verify equation data integrity
		        assert cached.equations[0].element_id == "equation_1"
		        assert cached.equations[0].latex == r"E = mc^2"

		    def test_has_cached_artifacts_checks_existence(self, temp_ai_dir: Path, sample_tables: list[ExtractedTableElement]) -> None:
		        """has_cached_artifacts correctly reports which artifact types exist."""
		        doc_stem = "S4-250638"

		        # Initially nothing cached
		        assert has_cached_artifacts(temp_ai_dir, doc_stem, {"tables"}) is False
		        assert has_cached_artifacts(temp_ai_dir, doc_stem, {"tables", "equations"}) is False

		        # Persist tables
		        persist_tables_from_extraction(sample_tables, temp_ai_dir, doc_stem)

		        # Now tables exist
		        assert has_cached_artifacts(temp_ai_dir, doc_stem, {"tables"}) is True
		        assert has_cached_artifacts(temp_ai_dir, doc_stem, {"tables", "equations"}) is False

		    def test_has_cached_artifacts_partial_types(self, temp_ai_dir: Path, sample_tables: list[ExtractedTableElement]) -> None:
		        """has_cached_artifacts returns True only if ALL requested types exist."""
		        doc_stem = "S4-250638"

		        # Persist tables only
		        persist_tables_from_extraction(sample_tables, temp_ai_dir, doc_stem)

		        # tables=True, figures=False, equations=False
		        assert has_cached_artifacts(temp_ai_dir, doc_stem, {"tables"}) is True
		        assert has_cached_artifacts(temp_ai_dir, doc_stem, {"tables", "figures"}) is False

		    def test_read_cached_artifacts_returns_none_when_empty(self, temp_ai_dir: Path) -> None:
		        """read_cached_artifacts returns None if no artifacts exist."""
		        doc_stem = "S4-250638"
		        cached = read_cached_artifacts(temp_ai_dir, doc_stem)
		        assert cached is None

		    def test_build_structured_extraction_with_artifacts(
		        self,
		        temp_ai_dir: Path,
		        sample_tables: list[ExtractedTableElement],
		        sample_equations: list[ExtractedEquationElement],
		        sample_figures: list[ExtractedFigureElement],
		    ) -> None:
		        """build_structured_extraction_result creates proper result with artifacts."""
		        doc_stem = "S4-250638"
		        content = "# Test Document\n\nSome content here."

		        # Persist artifacts
		        persist_tables_from_extraction(sample_tables, temp_ai_dir, doc_stem)
		        persist_equations_from_extraction(sample_equations, temp_ai_dir, doc_stem)
		        figures_dir = temp_ai_dir / "figures"
		        figures_dir.mkdir(parents=True, exist_ok=True)
		        persist_figures_from_extraction(sample_figures, figures_dir, doc_stem)

		        # Read and build. Fail right here if nothing was cached instead of
		        # silently substituting empty lists.
		        cached = read_cached_artifacts(temp_ai_dir, doc_stem)
		        assert cached is not None
		        result = build_structured_extraction_result(
		            content,
		            tables=cached.tables,
		            figures=cached.figures,
		            equations=cached.equations,
		        )

		        assert result.content == content
		        assert result.table_count == 2
		        assert result.figure_count == 2
		        assert result.equation_count == 2

		    def test_artifact_filename_includes_page_and_index(self, temp_ai_dir: Path, sample_tables: list[ExtractedTableElement]) -> None:
		        """Artifact filenames encode page number and index for traceability."""
		        doc_stem = "S4-250999"

		        persist_tables_from_extraction(sample_tables, temp_ai_dir, doc_stem)

		        # First table: page=1, index=1 -> S4-250999_table_1_1.json
		        assert (temp_ai_dir / "tables" / "S4-250999_table_1_1.json").exists()

		        # Second table: page=3, index=2 -> S4-250999_table_3_2.json
		        assert (temp_ai_dir / "tables" / "S4-250999_table_3_2.json").exists()

		    def test_empty_artifacts_list_handled_gracefully(self, temp_ai_dir: Path) -> None:
		        """Empty artifact lists are handled without creating files."""
		        doc_stem = "S4-250638"
		        empty_tables: list[ExtractedTableElement] = []

		        paths = persist_tables_from_extraction(empty_tables, temp_ai_dir, doc_stem)

		        assert len(paths) == 0
		        tables_dir = temp_ai_dir / "tables"
		        # Either the directory was never created, or it exists but is empty.
		        assert not tables_dir.exists() or not any(tables_dir.iterdir())