Commit 709c0c21 authored by Jan Reimes's avatar Jan Reimes
Browse files

ai: resolve nested imports, remove redundant embedding wrappers, and use...

ai: resolve nested imports, remove redundant embedding wrappers, and use container-based embedding calls
parent 821cf120
Loading
Loading
Loading
Loading
+1 −10
Original line number Diff line number Diff line
@@ -16,9 +16,8 @@ from tdoc_crawler.ai.models import (
    ProcessingStatus,
)
from tdoc_crawler.ai.operations.convert import convert_tdoc as convert_document
from tdoc_crawler.ai.operations.embeddings import query_embeddings
from tdoc_crawler.ai.operations.graph import query_graph
from tdoc_crawler.ai.operations.pipeline import get_status, process_all, process_tdoc
from tdoc_crawler.ai.operations.pipeline import get_status, process_all
from tdoc_crawler.ai.operations.pipeline import process_tdoc as process_document
from tdoc_crawler.ai.operations.summarize import SummarizeResult
from tdoc_crawler.ai.operations.summarize import summarize_tdoc as summarize_document
@@ -53,13 +52,6 @@ from tdoc_crawler.config import CacheManager

litellm.suppress_debug_info = True  # Suppress provider/model info logs from litellm

# Backward-compatible internal aliases used by some tests and monkeypatching.
_pipeline_get_status_impl = get_status
_pipeline_process_tdoc_impl = process_tdoc
_pipeline_process_all_impl = process_all
_query_embeddings = query_embeddings
_query_graph = query_graph


__all__ = [
    "DEFAULT_WORKSPACE",
@@ -96,7 +88,6 @@ __all__ = [
    "normalize_workspace_name",
    "process_all",
    "process_document",
    "query_embeddings",
    "query_graph",
    "remove_invalid_members",
    "resolve_tdoc_checkout_path",
+150 −201
Original line number Diff line number Diff line
@@ -23,6 +23,7 @@ logger = logging.getLogger(__name__)
# Chunk size settings
DEFAULT_MAX_CHARS = 500  # sections longer than 2x this are re-split by paragraph
DEFAULT_OVERLAP = 50  # characters of overlap carried between adjacent chunks
# Stride used when numbering sub-chunks: chunk_index = section_idx * MAX_NBR_CHUNKS + sub_idx.
# Implicitly assumes a section never yields 10000+ sub-chunks — TODO confirm.
MAX_NBR_CHUNKS = 10000


class EmbeddingsManager:
@@ -97,7 +98,7 @@ class EmbeddingsManager:
            return []

        # Create chunks
        chunks = _create_chunks(document_id, markdown_content, self._model_name)
        chunks = self._create_chunks(document_id, markdown_content, self._model_name)

        if not chunks:
            return []
@@ -137,10 +138,6 @@ class EmbeddingsManager:
        Returns:
            List of (DocumentChunk, score) tuples.
        """
        if self._storage is None:
            from tdoc_crawler.ai.container import AiServiceContainer

            self._storage = AiServiceContainer.get_instance().get_storage()
        storage = self._storage
        normalized_workspace = normalize_workspace_name(workspace)

@@ -154,65 +151,7 @@ class EmbeddingsManager:
        # Search in storage
        return storage.search_chunks(query_vector, top_k, workspace=normalized_workspace)


def _chunk_by_headings(markdown: str) -> list[dict[str, str]]:
    """Split markdown content by heading sections.

    Args:
        markdown: Markdown content.

    Returns:
        List of dicts with 'section' and 'content' keys.
    """
    # Split by markdown headings (# ## ### etc)
    heading_pattern = r"(?m)^(#{1,6})\s+(.+)$"

    parts = re.split(heading_pattern, markdown)
    chunks: list[dict[str, str]] = []

    current_section = "Introduction"
    current_content: list[str] = []

    for i, part in enumerate(parts):
        if i % 3 == 0:
            # Content between headings
            if part.strip():
                current_content.append(part.strip())
        elif i % 3 == 2:
            # Heading text
            # Save previous chunk if exists
            if current_content:
                chunks.append(
                    {
                        "section": current_section,
                        "content": "\n\n".join(current_content),
                    }
                )
                current_content = []

            current_section = part.strip()

    # Add final chunk
    if current_content:
        chunks.append(
            {
                "section": current_section,
                "content": "\n\n".join(current_content),
            }
        )

    # If no headings found, treat as single chunk
    if not chunks:
        chunks.append(
            {
                "section": "Document",
                "content": markdown,
            }
        )

    return chunks


    @classmethod
    def _chunk_by_paragraphs(text: str, max_chars: int = DEFAULT_MAX_CHARS) -> list[str]:
        """Split text into chunks by paragraphs.

@@ -274,8 +213,8 @@ def _chunk_by_paragraphs(text: str, max_chars: int = DEFAULT_MAX_CHARS) -> list[

        return overlapped_chunks


def _create_chunks(document_id: str, markdown: str, model_name: str) -> list[DocumentChunk]:
    @classmethod
    def _create_chunks(cls, document_id: str, markdown: str, model_name: str) -> list[DocumentChunk]:
        """Create document chunks from markdown.

        Args:
@@ -287,7 +226,7 @@ def _create_chunks(document_id: str, markdown: str, model_name: str) -> list[Doc
            List of DocumentChunk objects.
        """
        # First try heading-based chunking
    sections = _chunk_by_headings(markdown)
        sections = cls._chunk_by_headings(markdown)

        chunks: list[DocumentChunk] = []

@@ -297,14 +236,14 @@ def _create_chunks(document_id: str, markdown: str, model_name: str) -> list[Doc

            # If section is too long, chunk by paragraphs
            if len(content) > DEFAULT_MAX_CHARS * 2:
            sub_chunks = _chunk_by_paragraphs(content)
                sub_chunks = cls._chunk_by_paragraphs(content)
                for j, sub_content in enumerate(sub_chunks):
                    chunks.append(
                        DocumentChunk(
                        chunk_id=f"{document_id}:{i * 100 + j}",
                            chunk_id=f"{document_id}:{i * MAX_NBR_CHUNKS + j}",
                            document_id=document_id,
                            section_heading=f"{section_name} ({j + 1})",
                        chunk_index=i * 100 + j,
                            chunk_index=i * MAX_NBR_CHUNKS + j,
                            text=sub_content,
                            char_offset_start=0,
                            char_offset_end=len(sub_content),
@@ -332,55 +271,65 @@ def _create_chunks(document_id: str, markdown: str, model_name: str) -> list[Doc
        logger.info(f"Created {len(chunks)} chunks for {document_id}")
        return chunks


def generate_embeddings(
    document_id: str,
    markdown: str | Path,
    storage: AiStorage | None = None,
    workspace: str | None = None,
) -> list[DocumentChunk]:
    """Generate embeddings for markdown content.
    @staticmethod
    def _chunk_by_headings(markdown: str) -> list[dict[str, str]]:
        """Split markdown content by heading sections.

        Args:
        document_id: Document identifier.
        markdown: Markdown content to embed.
        storage: Optional storage instance (deprecated, uses singleton now).
        workspace: Optional workspace scope (defaults to "default").
            markdown: Markdown content.

        Returns:
        List of DocumentChunk objects with embeddings.
            List of dicts with 'section' and 'content' keys.
        """
    from tdoc_crawler.ai.container import AiServiceContainer
        # Split by markdown headings (# ## ### etc)
        heading_pattern = r"(?m)^(#{1,6})\s+(.+)$"

    container = AiServiceContainer.get_instance()
    manager = container.get_embeddings_manager()
    return manager.generate_embeddings(document_id, markdown, workspace)
        parts = re.split(heading_pattern, markdown)
        chunks: list[dict[str, str]] = []

        current_section = "Introduction"
        current_content: list[str] = []

def query_embeddings(
    query: str,
    workspace: str,
    top_k: int = 5,
) -> list[tuple[DocumentChunk, float]]:
    """Query embeddings using semantic search.
        for i, part in enumerate(parts):
            if i % 3 == 0:
                # Content between headings
                if part.strip():
                    current_content.append(part.strip())
            elif i % 3 == 2:
                # Heading text
                # Save previous chunk if exists
                if current_content:
                    chunks.append(
                        {
                            "section": current_section,
                            "content": "\n\n".join(current_content),
                        }
                    )
                    current_content = []

    Args:
        query: Search query.
        workspace: Workspace scope (required).
        top_k: Number of results to return.
                current_section = part.strip()

    Returns:
        List of (DocumentChunk, score) tuples.
    """
    from tdoc_crawler.ai.container import AiServiceContainer
        # Add final chunk
        if current_content:
            chunks.append(
                {
                    "section": current_section,
                    "content": "\n\n".join(current_content),
                }
            )

    container = AiServiceContainer.get_instance()
    manager = container.get_embeddings_manager()
    return manager.query_embeddings(query, workspace, top_k)
        # If no headings found, treat as single chunk
        if not chunks:
            chunks.append(
                {
                    "section": "Document",
                    "content": markdown,
                }
            )

        return chunks


# Public API of the embeddings module.
# NOTE(review): this commit removes the module-level generate_embeddings /
# query_embeddings wrapper functions — confirm these two names are still
# bound in this module (e.g. re-exported), otherwise `from ... import *`
# and explicit imports of them will fail.
__all__ = [
    "EmbeddingsManager",
    "generate_embeddings",
    "query_embeddings",
]
+4 −2
Original line number Diff line number Diff line
@@ -17,7 +17,6 @@ from tdoc_crawler.ai.models import (
    ProcessingStatus,
)
from tdoc_crawler.ai.operations.classify import classify_document_files
from tdoc_crawler.ai.operations.embeddings import generate_embeddings
from tdoc_crawler.ai.operations.extract import extract_from_folder
from tdoc_crawler.ai.operations.summarize import summarize_document
from tdoc_crawler.ai.operations.workspaces import list_workspace_members, normalize_workspace_name
@@ -256,7 +255,10 @@ def _run_embedding_stage(
        msg = f"Extracted markdown artifact not found for embedding: {artifact_path}"
        raise FileNotFoundError(msg)

    generate_embeddings(document_id, artifact_path, storage=storage, workspace=workspace)
    # Get embeddings manager from container
    container = AiServiceContainer.get_instance()
    embeddings_manager = container.get_embeddings_manager()
    embeddings_manager.generate_embeddings(document_id, artifact_path, workspace=workspace)

    status.embedded_at = utc_now()
    status.error_message = None
+1 −1
Original line number Diff line number Diff line
@@ -469,7 +469,7 @@ def summarize_tdoc(
        raise LlmConfigError(msg) from exc

    # Extract keywords
    keywords_prompt = KEYWORDS_PROMPT.format(content=content[:4000])
    keywords_prompt = KEYWORDS_PROMPT.format(content=content)  # TODO: limit content size for keyword extraction as well, maybe 5000 chars?
    try:
        keywords_raw = client.complete(keywords_prompt, model=config.llm_model, max_tokens=200)
        keywords = _parse_keywords(keywords_raw)
+9 −8
Original line number Diff line number Diff line
@@ -10,12 +10,16 @@ import json
import logging
from dataclasses import dataclass, field
from pathlib import Path
from typing import Any
from typing import TYPE_CHECKING, Any

from tdoc_crawler.ai.models import SourceKind
from tdoc_crawler.config import resolve_cache_manager
from tdoc_crawler.config import CacheManager, resolve_cache_manager
from tdoc_crawler.utils.misc import utc_now

if TYPE_CHECKING:
    from tdoc_crawler.ai.operations.workspace_registry import WorkspaceRegistry
    from tdoc_crawler.ai.operations.workspaces import WorkspaceMetadata

logger = logging.getLogger(__name__)

DEFAULT_WORKSPACE = "default"
@@ -381,8 +385,6 @@ def get_active_workspace(cache_manager_name: str | None = None) -> str:
    Returns:
        Name of the active workspace, or DEFAULT_WORKSPACE if none set.
    """
    from tdoc_crawler.config import CacheManager  # noqa: PLC0415

    manager_name = cache_manager_name or "default"
    try:
        resolve_cache_manager(manager_name)
@@ -400,17 +402,16 @@ def set_active_workspace(name: str, cache_manager_name: str | None = None) -> No
        name: Workspace name to set as active.
        cache_manager_name: Optional cache manager name.
    """
    # Ensure cache manager is registered before loading registry
    from tdoc_crawler.config import CacheManager
    # Local import to avoid circular dependency with workspaces.py
    from tdoc_crawler.ai.operations.workspaces import normalize_workspace_name  # noqa: PLC0415

    # Ensure cache manager is registered before loading registry
    manager_name = cache_manager_name or "default"
    try:
        resolve_cache_manager(manager_name)
    except ValueError:
        CacheManager(name=manager_name).register()

    from tdoc_crawler.ai.operations.workspaces import normalize_workspace_name

    registry = WorkspaceRegistry.load(cache_manager_name=cache_manager_name)
    normalized_name = normalize_workspace_name(name)
    if normalized_name not in registry.workspaces:
Loading