Commit c9838d05 authored by Jan Reimes's avatar Jan Reimes
Browse files

feat(config): update environment variable defaults and add helper function

* Set TDC_AI_CONVERT_PDF and TDC_AI_CONVERT_MD to true by default.
* Modify DEFAULT_EMBEDDING_MODEL to use the updated model path.
* Introduce _env_bool helper function for parsing boolean environment variables.
* Patch zhipu functions to remove unsupported kwargs and improve compatibility.
parent e6129944
Loading
Loading
Loading
Loading
+2 −2
Original line number Diff line number Diff line
@@ -101,12 +101,12 @@ TDC_AI_CHUNK_OVERLAP=100

# Whether to convert office documents to PDF during workspace add-members (default: false)
# Set to "true", "1", or "yes" to enable; anything else disables it
# TDC_AI_CONVERT_PDF=false
TDC_AI_CONVERT_PDF=1

# Whether to extract markdown from PDFs during workspace add-members (default: false)
# Set to "true", "1", or "yes" to enable; anything else disables it
# When enabled, implies TDC_AI_CONVERT_PDF=true
# TDC_AI_CONVERT_MD=false
TDC_AI_CONVERT_MD=1

# Summary constraints
TDC_AI_ABSTRACT_MIN_WORDS=150
+47 −50
Original line number Diff line number Diff line
@@ -26,7 +26,7 @@ DEFAULT_LLM_MODEL: str = os.getenv("TDC_AI_LLM_MODEL", "openrouter/openrouter/fr
DEFAULT_LLM_API_BASE: str = os.getenv("TDC_AI_LLM_API_BASE", "http://localhost:11434")

# Embedding defaults (read from TDC_AI_EMBEDDING_MODEL or use default)
# NOTE: the default carries an explicit "ollama/" provider prefix so the
# model string routes to the Ollama backend when the env var is unset.
DEFAULT_EMBEDDING_MODEL: str = os.getenv("TDC_AI_EMBEDDING_MODEL", "ollama/qwen3-embedding:0.6b")
DEFAULT_EMBEDDING_API_BASE: str = os.getenv("TDC_AI_EMBEDDING_API_BASE", "http://localhost:11434")

# Workspace default
@@ -38,6 +38,24 @@ DEFAULT_PG0_PORT: int = 15432
DEFAULT_PG0_DATABASE: str = "tdoc"


# =============================================================================
# Helpers
# =============================================================================


def _env_bool(key: str, default: bool = True) -> bool:
    """Parse a boolean environment variable.

    Returns True for: "1", "true", "yes", "on"
    Returns False for: "0", "false", "no", "off"
    Returns default if the variable is not set.
    """
    value = os.getenv(key)
    if value is None:
        return default
    return value.lower() in {"1", "true", "yes", "on"}


# =============================================================================
# Enums
# =============================================================================
@@ -228,54 +246,33 @@ class LightRAGConfig(BaseSettings):
        Returns:
            LightRAGConfig instance configured from environment
        """
        # Read TDC_AI_* environment variables
        llm_model = os.getenv("TDC_AI_LLM_MODEL")
        llm_api_base = os.getenv("TDC_AI_LLM_API_BASE")
        llm_api_key = os.getenv("TDC_AI_LLM_API_KEY")
        embedding_model = os.getenv("TDC_AI_EMBEDDING_MODEL")
        embedding_api_base = os.getenv("TDC_AI_EMBEDDING_API_BASE")
        embedding_api_key = os.getenv("TDC_AI_EMBEDDING_API_KEY")
        extract_tables = os.getenv("LIGHTRAG_EXTRACT_TABLES")
        extract_figures = os.getenv("LIGHTRAG_EXTRACT_FIGURES")
        extract_equations = os.getenv("LIGHTRAG_EXTRACT_EQUATIONS")
        figure_description_enabled = os.getenv("LIGHTRAG_FIGURE_DESCRIPTION_ENABLED")

        # Build configuration dictionary
        config_data = {}

        # Configure LLM if TDC_AI_LLM_MODEL is set
        if llm_model:
            config_data["llm"] = {
                "model": llm_model,
                "api_base": llm_api_base or DEFAULT_LLM_API_BASE,
                "api_key": llm_api_key,
            }
        config_data: dict = {}

        # Configure embedding if TDC_AI_EMBEDDING_MODEL is set
        if embedding_model:
            config_data["embedding"] = {
                "model": embedding_model,
                "api_base": embedding_api_base or DEFAULT_EMBEDDING_API_BASE,
                "api_key": embedding_api_key,
        # LLM config - pass through any set values, filter None
        llm_config = {
            "model": os.getenv("TDC_AI_LLM_MODEL"),
            "api_base": os.getenv("TDC_AI_LLM_API_BASE"),
            "api_key": os.getenv("TDC_AI_LLM_API_KEY"),
        }

        # Extraction toggles are part of LightRAGConfig itself.
        # Keep explicit mapping here so from_env behavior is predictable.
        if extract_tables is not None:
            config_data["extract_tables"] = extract_tables.lower() in {"1", "true", "yes", "on"}
        if extract_figures is not None:
            config_data["extract_figures"] = extract_figures.lower() in {"1", "true", "yes", "on"}
        if extract_equations is not None:
            config_data["extract_equations"] = extract_equations.lower() in {"1", "true", "yes", "on"}
        if figure_description_enabled is not None:
            config_data["figure_description_enabled"] = figure_description_enabled.lower() in {
                "1",
                "true",
                "yes",
                "on",
        llm_config = {k: v for k, v in llm_config.items() if v is not None}
        if llm_config:
            config_data["llm"] = llm_config

        # Embedding config - pass through any set values, filter None
        embedding_config = {
            "model": os.getenv("TDC_AI_EMBEDDING_MODEL"),
            "api_base": os.getenv("TDC_AI_EMBEDDING_API_BASE"),
            "api_key": os.getenv("TDC_AI_EMBEDDING_API_KEY"),
        }
        embedding_config = {k: v for k, v in embedding_config.items() if v is not None}
        if embedding_config:
            config_data["embedding"] = embedding_config

        # Apply overrides
        config_data.update(overrides)
        # Extraction toggles default to True
        config_data["extract_tables"] = _env_bool("LIGHTRAG_EXTRACT_TABLES", True)
        config_data["extract_figures"] = _env_bool("LIGHTRAG_EXTRACT_FIGURES", True)
        config_data["extract_equations"] = _env_bool("LIGHTRAG_EXTRACT_EQUATIONS", True)
        config_data["figure_description_enabled"] = _env_bool("LIGHTRAG_FIGURE_DESCRIPTION_ENABLED", True)

        config_data.update(overrides)
        return cls(**config_data)
+77 −7
Original line number Diff line number Diff line
@@ -11,9 +11,12 @@ from __future__ import annotations
import logging
from collections.abc import Callable
from dataclasses import dataclass
from functools import wraps
from pathlib import Path
from typing import Any

# Monkey-patch zhipu functions to filter out base_url
import lightrag.llm.zhipu as zhipu_module
from lightrag import LightRAG, QueryParam
from lightrag.kg import STORAGES
from lightrag.kg.shared_storage import initialize_pipeline_status
@@ -24,6 +27,62 @@ from lightrag.llm.openai import openai_complete, openai_embed
from lightrag.llm.zhipu import zhipu_complete, zhipu_embedding
from lightrag.utils import EmbeddingFunc

# Patch zhipu_complete_if_cache: keep a handle on the upstream function so the
# wrapper can delegate to it after scrubbing arguments it does not accept.
original_zhipu_complete_if_cache = zhipu_module.zhipu_complete_if_cache


@wraps(original_zhipu_complete_if_cache)
async def patched_zhipu_complete_if_cache(
    prompt,
    model="glm-4-flashx",
    api_key=None,
    system_prompt=None,
    history_messages=None,
    enable_cot=False,
    **kwargs,
):
    """Delegate to the original zhipu completion after dropping unsupported kwargs."""
    # LightRAG forwards these internally; the upstream zhipu helper rejects them.
    unsupported = ("hashing_kv", "keyword_extraction", "base_url")
    cleaned = {key: val for key, val in kwargs.items() if key not in unsupported}
    return await original_zhipu_complete_if_cache(
        prompt=prompt,
        model=model,
        api_key=api_key,
        system_prompt=system_prompt,
        history_messages=[] if history_messages is None else history_messages,
        enable_cot=enable_cot,
        **cleaned,
    )


# Patch zhipu_embedding
# Keep a reference to the upstream function so the wrapper can delegate to it.
original_zhipu_embedding = zhipu_module.zhipu_embedding


@wraps(original_zhipu_embedding)
async def patched_zhipu_embedding(
    texts: list[str],
    model: str = "embedding-3",
    api_key: str | None = None,
    **kwargs,
):
    """Call the original zhipu_embedding with unsupported kwargs stripped.

    The upstream zhipu embedding helper does not accept ``base_url``, but
    LightRAG may forward it; drop it before delegating.
    """
    # Remove unsupported kwargs including base_url
    kwargs = {k: v for k, v in kwargs.items() if k not in ["base_url"]}
    return await original_zhipu_embedding(
        texts=texts,
        model=model,
        api_key=api_key,
        **kwargs,
    )


# Apply the patches.
# Rebind inside lightrag.llm.zhipu so callers that resolve the functions via
# the module pick up the filtered wrappers; also rebind the local names so
# code in this module that imported them directly uses the patched versions.
zhipu_module.zhipu_complete_if_cache = patched_zhipu_complete_if_cache
zhipu_module.zhipu_embedding = patched_zhipu_embedding
zhipu_complete_if_cache = patched_zhipu_complete_if_cache
zhipu_embedding = patched_zhipu_embedding

from tdoc_crawler.config import resolve_cache_manager

from .config import LightRAGConfig, QueryMode, StorageBackend
@@ -59,8 +118,8 @@ PROVIDERS: dict[str, ProviderConfig] = {
}

# Provider aliases map user-facing names onto a canonical provider plus an
# optional base-URL override. Z.ai endpoints are OpenAI-compatible, so the
# "zai*" aliases resolve to the "openai" provider with explicit base URLs.
PROVIDER_ALIASES: dict[str, ProviderAlias] = {
    "zai": ProviderAlias(canonical="openai", base_url="https://api.z.ai/api/paas/v4"),
    "zai-coding-plan": ProviderAlias(canonical="openai", base_url="https://api.z.ai/api/coding/paas/v4"),
    "openrouter": ProviderAlias(canonical="openai", base_url="https://openrouter.ai/api/v1"),
    "nvidia": ProviderAlias(canonical="openai", base_url="https://integrate.api.nvidia.com/v1/"),
}
@@ -318,15 +377,26 @@ class TDocRAG:
    def _build_provider_kwargs(self, provider: str, model_name: str, is_embedding: bool = False) -> dict[str, Any]:
        """Build provider-specific kwargs for LLM/embedding functions.

        Args:
            provider: Provider name or alias (resolved via _resolve_provider).
            model_name: Model identifier; only used for Ollama embedding calls.
            is_embedding: Select the embedding config instead of the LLM config.

        Returns:
            Keyword arguments suitable for the provider's LightRAG function.
        """
        config = self.config.embedding if is_embedding else self.config.llm

        # Resolve alias (e.g. "zai", "openrouter") to canonical provider name
        canonical_provider = _resolve_provider(provider)

        if canonical_provider == "ollama":
            # Ollama uses different kwargs for embedding vs completion
            kwargs = {"host": config.api_base}
            if is_embedding:
                kwargs["embed_model"] = model_name
            return kwargs
        if canonical_provider == "openai":
            # LightRAG's openai_complete expects base_url, not api_base.
            # An alias-specific base URL (if any) overrides the configured one.
            base_url = config.api_base
            if provider in PROVIDER_ALIASES and PROVIDER_ALIASES[provider].base_url:
                base_url = PROVIDER_ALIASES[provider].base_url
            return {"api_key": config.api_key, "base_url": base_url}
        if canonical_provider == "zhipu":
            # zhipu_complete_if_cache only accepts api_key, not base_url
            return {"api_key": config.api_key}
        if canonical_provider == "jina":
            return {"api_key": config.api_key}
        return {}