Commit 709c0c21 authored by Jan Reimes's avatar Jan Reimes
Browse files

ai: resolve nested imports, remove redundant embedding wrappers, and use...

ai: resolve nested imports, remove redundant embedding wrappers, and use container-based embedding calls
parent 821cf120
Loading
Loading
Loading
Loading
+1 −10
Original line number Diff line number Diff line
@@ -16,9 +16,8 @@ from tdoc_crawler.ai.models import (
    ProcessingStatus,
)
from tdoc_crawler.ai.operations.convert import convert_tdoc as convert_document
from tdoc_crawler.ai.operations.embeddings import query_embeddings
from tdoc_crawler.ai.operations.graph import query_graph
from tdoc_crawler.ai.operations.pipeline import get_status, process_all, process_tdoc
from tdoc_crawler.ai.operations.pipeline import get_status, process_all
from tdoc_crawler.ai.operations.pipeline import process_tdoc as process_document
from tdoc_crawler.ai.operations.summarize import SummarizeResult
from tdoc_crawler.ai.operations.summarize import summarize_tdoc as summarize_document
@@ -53,13 +52,6 @@ from tdoc_crawler.config import CacheManager

litellm.suppress_debug_info = True  # Suppress provider/model info logs from litellm

# Backward-compatible internal aliases used by some tests and monkeypatching.
_pipeline_get_status_impl = get_status
_pipeline_process_tdoc_impl = process_tdoc
_pipeline_process_all_impl = process_all
_query_embeddings = query_embeddings
_query_graph = query_graph


__all__ = [
    "DEFAULT_WORKSPACE",
@@ -96,7 +88,6 @@ __all__ = [
    "normalize_workspace_name",
    "process_all",
    "process_document",
    "query_embeddings",
    "query_graph",
    "remove_invalid_members",
    "resolve_tdoc_checkout_path",
+150 −201
Original line number Diff line number Diff line
@@ -23,6 +23,7 @@ logger = logging.getLogger(__name__)
# Chunk size settings
DEFAULT_MAX_CHARS = 500  # sections longer than 2x this are re-split by paragraph
DEFAULT_OVERLAP = 50  # characters of overlap carried between adjacent chunks
# Stride used when numbering sub-chunks: chunk_index = section_idx * MAX_NBR_CHUNKS + sub_idx.
# Implicitly assumes a section never yields 10000+ sub-chunks — TODO confirm.
MAX_NBR_CHUNKS = 10000


class EmbeddingsManager:
@@ -97,7 +98,7 @@ class EmbeddingsManager:
            return []

        # Create chunks
        chunks = _create_chunks(document_id, markdown_content, self._model_name)
        chunks = self._create_chunks(document_id, markdown_content, self._model_name)

        if not chunks:
            return []
@@ -137,10 +138,6 @@ class EmbeddingsManager:
        Returns:
            List of (DocumentChunk, score) tuples.
        """
        if self._storage is None:
            from tdoc_crawler.ai.container import AiServiceContainer

            self._storage = AiServiceContainer.get_instance().get_storage()
        storage = self._storage
        normalized_workspace = normalize_workspace_name(workspace)

@@ -154,65 +151,7 @@ class EmbeddingsManager:
        # Search in storage
        return storage.search_chunks(query_vector, top_k, workspace=normalized_workspace)


def _chunk_by_headings(markdown: str) -> list[dict[str, str]]:
    """Split markdown content by heading sections.

    Args:
        markdown: Markdown content.

    Returns:
        List of dicts with 'section' and 'content' keys.
    """
    # Split by markdown headings (# ## ### etc)
    heading_pattern = r"(?m)^(#{1,6})\s+(.+)$"

    parts = re.split(heading_pattern, markdown)
    chunks: list[dict[str, str]] = []

    current_section = "Introduction"
    current_content: list[str] = []

    for i, part in enumerate(parts):
        if i % 3 == 0:
            # Content between headings
            if part.strip():
                current_content.append(part.strip())
        elif i % 3 == 2:
            # Heading text
            # Save previous chunk if exists
            if current_content:
                chunks.append(
                    {
                        "section": current_section,
                        "content": "\n\n".join(current_content),
                    }
                )
                current_content = []

            current_section = part.strip()

    # Add final chunk
    if current_content:
        chunks.append(
            {
                "section": current_section,
                "content": "\n\n".join(current_content),
            }
        )

    # If no headings found, treat as single chunk
    if not chunks:
        chunks.append(
            {
                "section": "Document",
                "content": markdown,
            }
        )

    return chunks


    @classmethod
    def _chunk_by_paragraphs(text: str, max_chars: int = DEFAULT_MAX_CHARS) -> list[str]:
        """Split text into chunks by paragraphs.

@@ -274,8 +213,8 @@ def _chunk_by_paragraphs(text: str, max_chars: int = DEFAULT_MAX_CHARS) -> list[

        return overlapped_chunks


def _create_chunks(document_id: str, markdown: str, model_name: str) -> list[DocumentChunk]:
    @classmethod
    def _create_chunks(cls, document_id: str, markdown: str, model_name: str) -> list[DocumentChunk]:
        """Create document chunks from markdown.

        Args:
@@ -287,7 +226,7 @@ def _create_chunks(document_id: str, markdown: str, model_name: str) -> list[Doc
            List of DocumentChunk objects.
        """
        # First try heading-based chunking
    sections = _chunk_by_headings(markdown)
        sections = cls._chunk_by_headings(markdown)

        chunks: list[DocumentChunk] = []

@@ -297,14 +236,14 @@ def _create_chunks(document_id: str, markdown: str, model_name: str) -> list[Doc

            # If section is too long, chunk by paragraphs
            if len(content) > DEFAULT_MAX_CHARS * 2:
            sub_chunks = _chunk_by_paragraphs(content)
                sub_chunks = cls._chunk_by_paragraphs(content)
                for j, sub_content in enumerate(sub_chunks):
                    chunks.append(
                        DocumentChunk(
                        chunk_id=f"{document_id}:{i * 100 + j}",
                            chunk_id=f"{document_id}:{i * MAX_NBR_CHUNKS + j}",
                            document_id=document_id,
                            section_heading=f"{section_name} ({j + 1})",
                        chunk_index=i * 100 + j,
                            chunk_index=i * MAX_NBR_CHUNKS + j,
                            text=sub_content,
                            char_offset_start=0,
                            char_offset_end=len(sub_content),
@@ -332,55 +271,65 @@ def _create_chunks(document_id: str, markdown: str, model_name: str) -> list[Doc
        logger.info(f"Created {len(chunks)} chunks for {document_id}")
        return chunks


def generate_embeddings(
    document_id: str,
    markdown: str | Path,
    storage: AiStorage | None = None,
    workspace: str | None = None,
) -> list[DocumentChunk]:
    """Generate embeddings for markdown content.
    @staticmethod
    def _chunk_by_headings(markdown: str) -> list[dict[str, str]]:
        """Split markdown content by heading sections.

        Args:
        document_id: Document identifier.
        markdown: Markdown content to embed.
        storage: Optional storage instance (deprecated, uses singleton now).
        workspace: Optional workspace scope (defaults to "default").
            markdown: Markdown content.

        Returns:
        List of DocumentChunk objects with embeddings.
            List of dicts with 'section' and 'content' keys.
        """
    from tdoc_crawler.ai.container import AiServiceContainer
        # Split by markdown headings (# ## ### etc)
        heading_pattern = r"(?m)^(#{1,6})\s+(.+)$"

    container = AiServiceContainer.get_instance()
    manager = container.get_embeddings_manager()
    return manager.generate_embeddings(document_id, markdown, workspace)
        parts = re.split(heading_pattern, markdown)
        chunks: list[dict[str, str]] = []

        current_section = "Introduction"
        current_content: list[str] = []

def query_embeddings(
    query: str,
    workspace: str,
    top_k: int = 5,
) -> list[tuple[DocumentChunk, float]]:
    """Query embeddings using semantic search.
        for i, part in enumerate(parts):
            if i % 3 == 0:
                # Content between headings
                if part.strip():
                    current_content.append(part.strip())
            elif i % 3 == 2:
                # Heading text
                # Save previous chunk if exists
                if current_content:
                    chunks.append(
                        {
                            "section": current_section,
                            "content": "\n\n".join(current_content),
                        }
                    )
                    current_content = []

    Args:
        query: Search query.
        workspace: Workspace scope (required).
        top_k: Number of results to return.
                current_section = part.strip()

    Returns:
        List of (DocumentChunk, score) tuples.
    """
    from tdoc_crawler.ai.container import AiServiceContainer
        # Add final chunk
        if current_content:
            chunks.append(
                {
                    "section": current_section,
                    "content": "\n\n".join(current_content),
                }
            )

    container = AiServiceContainer.get_instance()
    manager = container.get_embeddings_manager()
    return manager.query_embeddings(query, workspace, top_k)
        # If no headings found, treat as single chunk
        if not chunks:
            chunks.append(
                {
                    "section": "Document",
                    "content": markdown,
                }
            )

        return chunks


# Public API of the embeddings module.
# NOTE(review): this commit removes the module-level generate_embeddings /
# query_embeddings wrapper functions — confirm these two names are still
# bound in this module (e.g. re-exported), otherwise `from ... import *`
# and explicit imports of them will fail.
__all__ = [
    "EmbeddingsManager",
    "generate_embeddings",
    "query_embeddings",
]
+4 −2
Original line number Diff line number Diff line
@@ -17,7 +17,6 @@ from tdoc_crawler.ai.models import (
    ProcessingStatus,
)
from tdoc_crawler.ai.operations.classify import classify_document_files
from tdoc_crawler.ai.operations.embeddings import generate_embeddings
from tdoc_crawler.ai.operations.extract import extract_from_folder
from tdoc_crawler.ai.operations.summarize import summarize_document
from tdoc_crawler.ai.operations.workspaces import list_workspace_members, normalize_workspace_name
@@ -256,7 +255,10 @@ def _run_embedding_stage(
        msg = f"Extracted markdown artifact not found for embedding: {artifact_path}"
        raise FileNotFoundError(msg)

    generate_embeddings(document_id, artifact_path, storage=storage, workspace=workspace)
    # Get embeddings manager from container
    container = AiServiceContainer.get_instance()
    embeddings_manager = container.get_embeddings_manager()
    embeddings_manager.generate_embeddings(document_id, artifact_path, workspace=workspace)

    status.embedded_at = utc_now()
    status.error_message = None
+1 −1
Original line number Diff line number Diff line
@@ -469,7 +469,7 @@ def summarize_tdoc(
        raise LlmConfigError(msg) from exc

    # Extract keywords
    keywords_prompt = KEYWORDS_PROMPT.format(content=content[:4000])
    keywords_prompt = KEYWORDS_PROMPT.format(content=content)  # TODO: limit content size for keyword extraction as well, maybe 5000 chars?
    try:
        keywords_raw = client.complete(keywords_prompt, model=config.llm_model, max_tokens=200)
        keywords = _parse_keywords(keywords_raw)
+9 −8
Original line number Diff line number Diff line
@@ -10,12 +10,16 @@ import json
import logging
from dataclasses import dataclass, field
from pathlib import Path
from typing import Any
from typing import TYPE_CHECKING, Any

from tdoc_crawler.ai.models import SourceKind
from tdoc_crawler.config import resolve_cache_manager
from tdoc_crawler.config import CacheManager, resolve_cache_manager
from tdoc_crawler.utils.misc import utc_now

if TYPE_CHECKING:
    from tdoc_crawler.ai.operations.workspace_registry import WorkspaceRegistry
    from tdoc_crawler.ai.operations.workspaces import WorkspaceMetadata

logger = logging.getLogger(__name__)

DEFAULT_WORKSPACE = "default"
@@ -381,8 +385,6 @@ def get_active_workspace(cache_manager_name: str | None = None) -> str:
    Returns:
        Name of the active workspace, or DEFAULT_WORKSPACE if none set.
    """
    from tdoc_crawler.config import CacheManager  # noqa: PLC0415

    manager_name = cache_manager_name or "default"
    try:
        resolve_cache_manager(manager_name)
@@ -400,17 +402,16 @@ def set_active_workspace(name: str, cache_manager_name: str | None = None) -> No
        name: Workspace name to set as active.
        cache_manager_name: Optional cache manager name.
    """
    # Ensure cache manager is registered before loading registry
    from tdoc_crawler.config import CacheManager
    # Local import to avoid circular dependency with workspaces.py
    from tdoc_crawler.ai.operations.workspaces import normalize_workspace_name  # noqa: PLC0415

    # Ensure cache manager is registered before loading registry
    manager_name = cache_manager_name or "default"
    try:
        resolve_cache_manager(manager_name)
    except ValueError:
        CacheManager(name=manager_name).register()

    from tdoc_crawler.ai.operations.workspaces import normalize_workspace_name

    registry = WorkspaceRegistry.load(cache_manager_name=cache_manager_name)
    normalized_name = normalize_workspace_name(name)
    if normalized_name not in registry.workspaces:
Loading