Commit 639af911 authored by Jan Reimes
Browse files

♻️ refactor(3gpp-ai): migrate to pydantic-settings and remove CacheManager dependency

- config.py: rewrite AiConfig and LightRAGSettings as pydantic BaseSettings with AliasChoices; add ThreeGPPAIConfig extending ThreeGPPConfig; remove from_env() factory methods and litellm provider validation
- Add config_app.py and config_exporter.py for AI-specific config CLI commands and export
- cli: remove CacheManager/resolve_cache_manager; load ThreeGPPAIConfig.from_settings() in _app_init callback; replace all resolve_cache_manager() calls with PathConfig(); rename manager → path_config in _process_single_item
- workspace_registry: replace cache_manager_name field with registry_path: Path | None; remove CacheManager registration boilerplate from get/set_active_workspace
- workspaces, convert, fetch_tdoc: replace resolve_cache_manager() with PathConfig() for db_file, checkout_dir, ai_embed_dir
- rag.py: replace resolve_cache_manager() with PathConfig().ai_embed_dir()
- models.py, llm_client.py, summarize.py: replace AiConfig.from_env() with AiConfig()
parent 40c77e3f
Loading
Loading
Loading
Loading
+42 −23
Original line number | Diff line number | Diff line
@@ -11,13 +11,13 @@ import shutil
from collections.abc import Callable
from datetime import UTC, datetime
from pathlib import Path
from typing import Any
from typing import Annotated, Any

import typer
from dotenv import load_dotenv
from rich.progress import BarColumn, MofNCompleteColumn, Progress, SpinnerColumn, TextColumn, TimeElapsedColumn
from tdoc_crawler.cli.formatting import TableColumnSpec, print_structured_output
from tdoc_crawler.config import CacheManager, resolve_cache_manager
from tdoc_crawler.config.settings import PathConfig
from tdoc_crawler.database import TDocDatabase
from tdoc_crawler.logging import get_console, get_logger, set_verbosity
from tdoc_crawler.models.base import OutputFormat, SortOrder
@@ -84,6 +84,8 @@ from threegpp_ai.args import (
    WorkspaceProcessVlmOption,
    WorkspaceReleaseOption,
)
from threegpp_ai.config import ThreeGPPAIConfig
from threegpp_ai.config_app import config_app
from threegpp_ai.lightrag.config import LightRAGConfig, QueryMode, StorageBackend
from threegpp_ai.lightrag.metadata import RAGMetadata
from threegpp_ai.lightrag.processor import DocumentProcessor
@@ -104,6 +106,7 @@ workspace_app = typer.Typer(help="Manage GraphRAG workspaces")
providers_app = typer.Typer(help="List and manage AI providers")
app.add_typer(workspace_app, name="workspace")
app.add_typer(providers_app, name="providers")
app.add_typer(config_app, name="config")

console = get_console()
_logger = get_logger(__name__)
@@ -194,9 +197,26 @@ def providers_list(


@app.callback()
def _app_init(cache_dir: CacheDirOption = None) -> None:
    """Register a CacheManager so all sub-commands can resolve file paths."""
    CacheManager(cache_dir).register(force=True)
def _app_init(
    ctx: typer.Context,
    config_file: Annotated[
        Path | None,
        typer.Option(
            "--config",
            "-c",
            help="Path to configuration file (overrides discovered config)",
            exists=True,
            readable=True,
        ),
    ] = None,
    cache_dir: CacheDirOption = None,
) -> None:
    """Load configuration so all sub-commands can resolve file paths."""
    config = ThreeGPPAIConfig.from_settings(config_file=config_file)
    if cache_dir:
        config.path.cache_dir = cache_dir

    ctx.obj = config


def _resolve_workspace_name(workspace: str | None) -> str:
@@ -245,7 +265,7 @@ def _resolve_workspace_items(
        console.print("[red]Error: filtering without explicit items is only supported for TDocs[/red]")
        raise typer.Exit(1)

    manager = resolve_cache_manager()
    manager = PathConfig()
    config = TDocQueryConfig(
        output_format=OutputFormat.TABLE,
        tdoc_ids=None,
@@ -281,7 +301,7 @@ async def _process_single_item(
    release: str | None,
    convert_pdf: bool,
    convert_md: bool = False,
    manager: CacheManager,
    path_config: PathConfig,
    vlm_options: VlmOptions | None = None,
    accelerator_config: AcceleratorConfig | None = None,
) -> tuple[Any | None, str | None, bool, bool]:
@@ -295,7 +315,7 @@ async def _process_single_item(
        release: Spec release version
        convert_pdf: Whether to convert to PDF
        convert_md: Whether to extract markdown (implies convert_pdf)
        manager: CacheManager for paths
        path_config: PathConfig for file system paths
        vlm_options: Optional VLM features for extraction.
        accelerator_config: Optional accelerator settings for GPU/CPU and threading.

@@ -310,16 +330,16 @@ async def _process_single_item(
    if checkout:
        checkout_path = None
        if source_kind == SourceKind.TDOC:
            checkout_path = await checkout_tdoc_to_workspace(item, manager.checkout_dir, workspace, db_file=manager.db_file)
            checkout_path = await checkout_tdoc_to_workspace(item, path_config.checkout_dir, workspace, db_file=path_config.db_file)
            if checkout_path is None:
                return None, "TDoc not found in database or meeting not crawled", False, False
        elif source_kind == SourceKind.SPEC:
            checkout_path = await checkout_spec_to_workspace(
                item,
                manager.checkout_dir,
                path_config.checkout_dir,
                workspace,
                release or "latest",
                db_file=manager.db_file,
                db_file=path_config.db_file,
            )
            if checkout_path is None:
                return None, "Spec not found in database", False, False
@@ -449,7 +469,7 @@ async def _try_build_tdoc_metadata(source_item_id: str) -> RAGMetadata | None:
    if not source_item_id.startswith(("S", "R", "C", "T")):
        return None

    manager = resolve_cache_manager()
    manager = PathConfig()
    try:
        async with TDocDatabase(manager.db_file) as db:
            rows = await db.query_tdocs(TDocQueryConfig(tdoc_ids=[source_item_id], order=SortOrder.ASC, limit=1))
@@ -494,7 +514,7 @@ async def _process_workspace_members(
    """
    processor = DocumentProcessor(LightRAGConfig.from_env())
    results: list[dict[str, Any]] = []
    manager = resolve_cache_manager()
    path_config = PathConfig()

    await processor.rag.start(workspace)
    try:
@@ -508,17 +528,17 @@ async def _process_workspace_members(
                if member.source_kind == SourceKind.TDOC:
                    checkout_path = await checkout_tdoc_to_workspace(
                        member.source_item_id,
                        manager.checkout_dir,
                        path_config.checkout_dir,
                        workspace,
                        db_file=manager.db_file,
                        db_file=path_config.db_file,
                    )
                elif member.source_kind == SourceKind.SPEC:
                    checkout_path = await checkout_spec_to_workspace(
                        member.source_item_id,
                        manager.checkout_dir,
                        path_config.checkout_dir,
                        workspace,
                        "latest",
                        db_file=manager.db_file,
                        db_file=path_config.db_file,
                    )
                if checkout_path is not None:
                    file_path = _resolve_process_file(checkout_path)
@@ -826,11 +846,11 @@ def workspace_clear(
    workspace: WorkspaceNameOption = None,
) -> None:
    workspace_name = _resolve_workspace_name(workspace)
    manager = resolve_cache_manager()
    path_config = typer.get_current_context().obj.path

    config = LightRAGConfig.from_env()
    embedding_model_safe = config.embedding.model.replace(":", "-").replace("/", "-")
    working_dir = manager.ai_embed_dir(embedding_model_safe) / workspace_name
    working_dir = path_config.ai_embed_dir(embedding_model_safe) / workspace_name

    if not working_dir.exists():
        console.print(f"[yellow]No LightRAG artifacts found for '{workspace_name}'[/yellow]")
@@ -857,7 +877,7 @@ def _checkout_and_convert_items(
    Returns:
        Tuple of (members, skipped_items, converted_count, md_extracted_count)
    """
    manager = resolve_cache_manager()
    manager = PathConfig()
    members: list[Any] = []
    skipped: list[tuple[str, str]] = []
    converted_count = 0
@@ -886,7 +906,7 @@ def _checkout_and_convert_items(
                    release=release,
                    convert_pdf=convert_pdf,
                    convert_md=convert_md,
                    manager=manager,
                    path_config=manager,
                    vlm_options=vlm_options,
                    accelerator_config=accelerator_config,
                )
@@ -1034,7 +1054,6 @@ def _embed_members(
        Number of successfully embedded documents.
    """
    processor = DocumentProcessor(LightRAGConfig.from_env())
    resolve_cache_manager()
    embedded = 0

    async def _run() -> None:
@@ -1091,7 +1110,7 @@ def workspace_list_members(
        console.print(f"[red]Workspace '{workspace_name}' not found[/red]")
        raise typer.Exit(1)

    manager = resolve_cache_manager()
    manager = typer.get_current_context().obj.path
    checkout_base = manager.checkout_dir

    member_rows = [
+183 −141
Original line number | Diff line number | Diff line
"""Configuration for the AI document processing pipeline."""
"""AI processing pipeline configuration for 3GPP documents.

This module extends the base ThreeGPPConfig with AI-specific settings
organized under the [ai] section in TOML/YAML/JSON configuration files.
"""

from __future__ import annotations

import os
from pathlib import Path
from typing import Literal

import litellm
from pydantic import Field, field_validator, model_validator
from tdoc_crawler.models import BaseConfigModel
from pydantic import AliasChoices, Field, field_validator, model_validator
from pydantic_settings import BaseSettings, SettingsConfigDict
from tdoc_crawler.config.env_vars import ConfigEnvVar
from tdoc_crawler.config.settings import ThreeGPPConfig

DEFAULT_EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
DEFAULT_LLM_MODEL = "openrouter/openrouter/free"

type Backend = Literal["torch", "onnx", "openvino"]

# Graph query level type
type GraphQueryLevel = Literal["simple", "medium", "advanced"]

# Type aliases
Backend = Literal["torch", "onnx", "openvino"]
GraphQueryLevel = Literal["simple", "medium", "advanced"]
QueryMode = Literal["naive", "local", "global", "hybrid", "mix", "bypass"]
StorageBackend = Literal["file", "pg0"]

def _env_int(name: str) -> int | None:
    value = os.getenv(name)
    if value is None or value == "":
        return None
    return int(value)

class LightRAGSettings(BaseSettings):
    """LightRAG-specific configuration (nested under ai.lightrag)."""

def _validate_model_identifier(value: str, field_name: str) -> str:
    if "/" not in value:
        msg = f"{field_name} must be in '<provider>/<model_name>' format"
        raise ValueError(msg)
    model_config = SettingsConfigDict(extra="ignore")

    provider, model_name = value.split("/", 1)
    provider_normalized = provider.strip().lower()
    model_name_normalized = model_name.strip()

    if not provider_normalized:
        msg = f"{field_name} provider segment cannot be empty"
        raise ValueError(msg)
    if not model_name_normalized:
        msg = f"{field_name} model_name segment cannot be empty"
        raise ValueError(msg)

    supported_providers = set(litellm.LITELLM_CHAT_PROVIDERS + litellm.openai_compatible_providers)
    # Storage backend
    db_backend: StorageBackend = Field(
        "file",
        validation_alias=AliasChoices(ConfigEnvVar.LIGHTRAG_DB_BACKEND.name, "db_backend"),
        description="Storage backend to use (file or pg0)",
    )
    pg0_instance_name: str = Field(
        "3gpp-crawler",
        description="pg0 instance name",
    )
    pg0_port: int = Field(
        15432,
        ge=1,
        le=65535,
        description="pg0 PostgreSQL port",
    )
    pg0_database: str = Field(
        "tdoc",
        description="pg0 database name",
    )

    if provider_normalized not in supported_providers:
        msg = (
            f"{field_name} provider '{provider}' is not supported by litellm. "
            f"See https://docs.litellm.ai/docs/providers for the full list of {len(supported_providers)} supported providers."
    # Workspace / query
    workspace: str = Field(
        "default",
        description="Default workspace name",
    )
    default_query_mode: QueryMode = Field(
        "hybrid",
        description="Default query mode",
    )
        raise ValueError(msg)

    return f"{provider_normalized}/{model_name_normalized}"
    # Feature toggles
    shared_storage: bool = Field(
        False,
        validation_alias=AliasChoices(ConfigEnvVar.LIGHTRAG_SHARED_STORAGE.name, "shared_storage"),
        description="Enable shared embedding storage across workspaces (deduplication)",
    )
    extract_tables: bool = Field(
        True,
        description="Enable extraction and indexing of table elements",
    )
    extract_figures: bool = Field(
        True,
        description="Enable extraction and indexing of figure elements",
    )
    extract_equations: bool = Field(
        True,
        description="Enable extraction and indexing of equation elements",
    )
    figure_description_enabled: bool = Field(
        True,
        description="Enable figure description generation with vision-capable models",
    )


def _validate_embedding_model_format(value: str) -> str:
    """Validate embedding model - accepts any HuggingFace-style model ID.
class AiConfig(BaseSettings):
    """AI processing pipeline configuration.

    Unlike LLM models, embedding models via sentence-transformers don't require
    LiteLLM provider validation. Accepts formats like:
    - sentence-transformers/all-MiniLM-L6-v2
    - perplexity-ai/pplx-embed-v1-0.6b
    Lives in 3gpp-ai package. Only primitive fields + format validators.
    No litellm import — provider validation is a separate concern.
    """
    if "/" not in value:
        msg = "embedding_model must be in '<provider>/<model_name>' format"
        raise ValueError(msg)

    provider, model_name = value.split("/", 1)
    provider_normalized = provider.strip().lower()
    model_name_normalized = model_name.strip()

    if not provider_normalized:
        msg = "embedding_model provider segment cannot be empty"
        raise ValueError(msg)
    if not model_name_normalized:
        msg = "embedding_model model_name segment cannot be empty"
        raise ValueError(msg)

    return f"{provider_normalized}/{model_name_normalized}"


class AiConfig(BaseConfigModel):
    """Configuration for the AI processing pipeline."""
    model_config = SettingsConfigDict(extra="ignore")

    # Embedding
    embedding_model: str = Field(
        DEFAULT_EMBEDDING_MODEL,
        validation_alias=AliasChoices(ConfigEnvVar.TDC_AI_EMBEDDING_MODEL.name, "embedding_model"),
        description="Embedding model in <provider>/<model_name> format",
    )
    embedding_backend: Backend = Field(
        "torch",
        validation_alias=AliasChoices(ConfigEnvVar.TDC_AI_EMBEDDING_BACKEND.name, "embedding_backend"),
        description="Sentence-transformers backend (torch, onnx, openvino)",
    )
    max_chunk_size: int = Field(1000, ge=1, description="Max tokens per chunk")
    chunk_overlap: int = Field(100, ge=0, description="Token overlap between chunks")
    embedding_api_base: str | None = Field(
        None,
        validation_alias=AliasChoices(ConfigEnvVar.TDC_AI_EMBEDDING_API_BASE.name, "embedding_api_base"),
        description="Override Embedding API base URL",
    )
    embedding_api_key: str | None = Field(
        None,
        validation_alias=AliasChoices(ConfigEnvVar.TDC_AI_EMBEDDING_API_KEY.name, "embedding_api_key"),
        description="Override Embedding API key",
    )

    # LLM
    llm_model: str = Field(
        DEFAULT_LLM_MODEL,
        validation_alias=AliasChoices(ConfigEnvVar.TDC_AI_LLM_MODEL.name, "llm_model"),
        description="LLM model in <provider>/<model_name> format",
    )
    llm_api_base: str | None = Field(None, description="Override LLM API base URL")
    llm_api_key: str | None = Field(None, description="Override LLM API key (takes precedence over provider-specific env vars)")
    llm_api_base: str | None = Field(
        None,
        validation_alias=AliasChoices(ConfigEnvVar.TDC_AI_LLM_API_BASE.name, "llm_api_base"),
        description="Override LLM API base URL",
    )
    llm_api_key: str | None = Field(
        None,
        validation_alias=AliasChoices(ConfigEnvVar.TDC_AI_LLM_API_KEY.name, "llm_api_key"),
        description="Override LLM API key (takes precedence over provider env vars)",
    )

    abstract_min_words: int = Field(150, ge=1, description="Minimum abstract word count")
    abstract_max_words: int = Field(250, ge=1, description="Maximum abstract word count")
    parallelism: int = Field(4, ge=1, le=32, description="Concurrent TDoc processing")
    # Chunking
    max_chunk_size: int = Field(
        1000,
        ge=1,
        validation_alias=AliasChoices(ConfigEnvVar.TDC_AI_MAX_CHUNK_SIZE.name, "max_chunk_size"),
        description="Max tokens per chunk",
    )
    chunk_overlap: int = Field(
        100,
        ge=0,
        validation_alias=AliasChoices(ConfigEnvVar.TDC_AI_CHUNK_OVERLAP.name, "chunk_overlap"),
        description="Token overlap between chunks",
    )

    # Processing
    abstract_min_words: int = Field(
        150,
        ge=1,
        validation_alias=AliasChoices(ConfigEnvVar.TDC_AI_ABSTRACT_MIN_WORDS.name, "abstract_min_words"),
        description="Minimum abstract word count",
    )
    abstract_max_words: int = Field(
        250,
        ge=1,
        validation_alias=AliasChoices(ConfigEnvVar.TDC_AI_ABSTRACT_MAX_WORDS.name, "abstract_max_words"),
        description="Maximum abstract word count",
    )
    parallelism: int = Field(
        4,
        ge=1,
        le=32,
        validation_alias=AliasChoices(ConfigEnvVar.TDC_AI_PARALLELISM.name, "parallelism"),
        description="Concurrent TDoc processing",
    )
    convert_pdf: bool = Field(
        False,
        validation_alias=AliasChoices(ConfigEnvVar.TDC_AI_CONVERT_PDF.name, "convert_pdf"),
        description="Convert PDF documents to markdown",
    )
    convert_md: bool = Field(
        False,
        validation_alias=AliasChoices(ConfigEnvVar.TDC_AI_CONVERT_MD.name, "convert_md"),
        description="Enable markdown conversion for documents",
    )
    vlm: bool = Field(
        False,
        validation_alias=AliasChoices(ConfigEnvVar.TDC_AI_VLM.name, "vlm"),
        description="Use Vision-Language Models for figure analysis",
    )
    device: str = Field(
        "auto",
        validation_alias=AliasChoices(ConfigEnvVar.TDC_AI_DEVICE.name, "device"),
        description="Device to use for local models (auto|cpu|cuda|mps)",
    )
    num_threads: int | None = Field(
        None,
        validation_alias=AliasChoices(ConfigEnvVar.TDC_AI_NUM_THREADS.name, "num_threads"),
        description="Number of threads for parallel processing",
    )
    batch_size: int | None = Field(
        None,
        validation_alias=AliasChoices(ConfigEnvVar.TDC_AI_BATCH_SIZE.name, "batch_size"),
        description="Batch size for processing",
    )

    # Graph
    graph_query_level: GraphQueryLevel = Field(
        "simple",
        validation_alias=AliasChoices(ConfigEnvVar.TDC_GRAPH_QUERY_LEVEL.name, "graph_query_level"),
        description="Level of graph query answer generation (simple|medium|advanced)",
    )

    @classmethod
    def from_env(cls, **overrides: str | int | Path | None) -> AiConfig:
        """Create config from environment variables."""
        data: dict[str, str | int | Path | None] = {}

        # Set cache_manager_name for use in _resolve_paths validator
        if cache_manager_name := overrides.get("cache_manager_name"):
            data["cache_manager_name"] = cache_manager_name

        if embedding_model := os.getenv("TDC_AI_EMBEDDING_MODEL"):
            data["embedding_model"] = embedding_model
        if embedding_backend := os.getenv("TDC_AI_EMBEDDING_BACKEND"):
            data["embedding_backend"] = embedding_backend
        if llm_model := os.getenv("TDC_AI_LLM_MODEL"):
            data["llm_model"] = llm_model
        if llm_api_base := os.getenv("TDC_AI_LLM_API_BASE"):
            data["llm_api_base"] = llm_api_base

        # Check for TDC_AI_LLM_API_KEY - takes precedence over provider-specific keys
        data["llm_api_key"] = os.environ.get("TDC_AI_LLM_API_KEY")

        max_chunk_size = _env_int("TDC_AI_MAX_CHUNK_SIZE")
        if max_chunk_size is not None:
            data["max_chunk_size"] = max_chunk_size

        chunk_overlap = _env_int("TDC_AI_CHUNK_OVERLAP")
        if chunk_overlap is not None:
            data["chunk_overlap"] = chunk_overlap

        abstract_min_words = _env_int("TDC_AI_ABSTRACT_MIN_WORDS")
        if abstract_min_words is not None:
            data["abstract_min_words"] = abstract_min_words

        abstract_max_words = _env_int("TDC_AI_ABSTRACT_MAX_WORDS")
        if abstract_max_words is not None:
            data["abstract_max_words"] = abstract_max_words

        parallelism = _env_int("TDC_AI_PARALLELISM")
        if parallelism is not None:
            data["parallelism"] = parallelism

        if graph_query_level := os.getenv("TDC_GRAPH_QUERY_LEVEL"):
            data["graph_query_level"] = graph_query_level

        data.update(overrides)
        # Filter out None values to let defaults apply
        filtered_data = {k: v for k, v in data.items() if v is not None}
        return cls(**filtered_data)
    # LightRAG nested
    lightrag: LightRAGSettings = Field(default_factory=LightRAGSettings)

    @model_validator(mode="after")
    def _validate_bounds(self) -> AiConfig:
@@ -172,33 +219,28 @@ class AiConfig(BaseConfigModel):
    @field_validator("embedding_model")
    @classmethod
    def _validate_embedding_model(cls, value: str) -> str:
        return _validate_embedding_model_format(value)

    @field_validator("embedding_backend")
    @classmethod
    def _validate_embedding_backend(cls, value: str) -> str:
        normalized = value.strip().lower()
        allowed = {"torch", "onnx", "openvino"}
        if normalized not in allowed:
            msg = "embedding_backend must be one of: torch, onnx, openvino"
        if "/" not in value:
            msg = "embedding_model must be in '<provider>/<model_name>' format"
            raise ValueError(msg)
        return normalized
        return value

    @field_validator("llm_model")
    @classmethod
    def _validate_llm_model(cls, value: str) -> str:
        return _validate_model_identifier(value, "llm_model")

    @field_validator("graph_query_level")
    @classmethod
    def _validate_graph_query_level(cls, value: GraphQueryLevel | str) -> GraphQueryLevel:
        if isinstance(value, str):
            value = value.strip().lower()
            if value not in ["simple", "medium", "advanced"]:
                msg = "graph_query_level must be one of: simple, medium, advanced"
        if "/" not in value:
            msg = "llm_model must be in '<provider>/<model_name>' format"
            raise ValueError(msg)
            return value  # type: ignore[return-value]
        return value


__all__ = ["AiConfig", "Backend", "GraphQueryLevel"]
class ThreeGPPAIConfig(ThreeGPPConfig):
    """Extended config for 3gpp-ai, adding [ai] section.

    Inherits from_settings() from ThreeGPPConfig — loads all base
    sections (path, http, credentials, crawl) plus [ai].
    """

    ai: AiConfig = Field(default_factory=AiConfig)


__all__ = ["AiConfig", "Backend", "GraphQueryLevel", "LightRAGSettings", "ThreeGPPAIConfig"]
+534 −0

File added.

Preview size limit exceeded, changes collapsed.

+428 −0

File added.

Preview size limit exceeded, changes collapsed.

+3 −4
Original line number | Diff line number | Diff line
@@ -25,7 +25,7 @@ from lightrag.llm.ollama import ollama_embed, ollama_model_complete
from lightrag.llm.openai import openai_complete, openai_embed
from lightrag.llm.zhipu import zhipu_complete, zhipu_embedding
from lightrag.utils import EmbeddingFunc
from tdoc_crawler.config import resolve_cache_manager
from tdoc_crawler.config.settings import PathConfig
from tdoc_crawler.logging import get_logger

from .config import LightRAGConfig, QueryMode, StorageBackend
@@ -199,12 +199,11 @@ class TDocRAG:
            self._pg0_manager.start()
            logger.info("Using pg0 at %s", self._pg0_manager.uri)

        # Prepare working directory using CacheManager (single source of truth)
        # Prepare working directory using PathConfig (single source of truth)
        # Structure: ~/.3gpp-crawler/lightrag/{embedding_model}/
        # LightRAG will create workspace subdirectory internally
        manager = resolve_cache_manager()
        embedding_model_safe = self.config.embedding.model.replace(":", "-").replace("/", "-")
        working_dir = manager.ai_embed_dir(embedding_model_safe)
        working_dir = PathConfig().ai_embed_dir(embedding_model_safe)
        working_dir.mkdir(parents=True, exist_ok=True)
        self._working_dir = working_dir
        logger.info("Using working directory: %s", working_dir)
Loading