Commit 267b7d85 authored by Jan Reimes's avatar Jan Reimes
Browse files

feat(config): add export functionality for TDocCrawlerConfig

* Implement ConfigExporter to export configuration in TOML, YAML, and JSON formats.
* Create TDocCrawlerConfig with nested models for paths, HTTP settings, credentials, and crawling behavior.
* Add sources module for discovering, loading, and merging configuration files.
* Include tests for export functionality and config file discovery.
parent a36cc758
Loading
Loading
Loading
Loading
+54 −1
Original line number Diff line number Diff line
@@ -62,6 +62,7 @@ Notes:
| Pydantic model | `src/tdoc_crawler/models/` | Data validation, serialization |
| HTTP caching | `src/tdoc_crawler/http_client.py` | `create_cached_session()` |
| Path management | `src/tdoc_crawler/config/__init__.py` | `CacheManager`, `resolve_cache_manager()` |
| Configuration | `src/tdoc_crawler/config/settings.py` | `TDocCrawlerConfig`, pydantic-settings |
| Test structure | `tests/test_crawler.py` | Fixtures, mocking, isolation |

## Heuristics (quick decisions)
@@ -113,7 +114,49 @@ Notes:
| TSG | Technical Specification Group (SA, RAN, CT) |
| Portal | 3GPP EOL authenticated portal |

## CacheManager Pattern (CRITICAL)
## Configuration System (NEW in v1.0)

**Two complementary systems:**

1. **`TDocCrawlerConfig`** (pydantic-settings) — Type-safe configuration from files/env vars
2. **`CacheManager`** (runtime paths) — File system path resolution

### TDocCrawlerConfig (Settings)

Use for **all configurable behavior** (timeouts, credentials, limits, etc.):

```python
from tdoc_crawler.config import TDocCrawlerConfig

# Load with automatic discovery (3gpp-crawler.toml, env vars)
config = TDocCrawlerConfig.from_settings()

# Or with explicit config file
config = TDocCrawlerConfig.from_settings(config_file=Path("./my-config.toml"))

# Access nested config
config.path.cache_dir      # Path to cache directory
config.http.timeout        # HTTP timeout in seconds
config.credentials.username  # Portal username
config.crawl.workers       # Concurrent crawl workers
```

**Config file discovery order** (later overrides earlier):
1. Global: `~/.config/3gpp-crawler/config.toml`
2. Project: `3gpp-crawler.toml`, `.3gpp-crawler.toml`, `.3gpp-crawler/config.toml`
3. Config dir: `.config/.3gpp-crawler/conf.d/*.toml` (alphabetical)

**Precedence:** CLI args > Config file > Environment variables > Defaults

**Supported formats:** TOML (primary), YAML, JSON

**Environment variable prefixes:**
- `TDC_*` — Path settings
- `TDC_EOL_*` — Portal credentials
- `TDC_CRAWL_*` — Crawl filters
- `HTTP_CACHE_*` — HTTP cache settings

### CacheManager Pattern (Runtime Paths)

**Single Source of Truth:** All file paths MUST use `CacheManager` from `src/tdoc_crawler/config/__init__.py`.

@@ -179,6 +222,16 @@ except CacheManagerNotRegisteredError:
manager = resolve_cache_manager()
```

### Configuration CLI Commands

```bash
# Generate default config file
tdoc-crawler config init --output 3gpp-crawler.toml

# Show current configuration (env + files + defaults)
tdoc-crawler config show
```

## Antipatterns (what NOT to do)

Errors are often masked by trying to be too clever and/or too careful with error handling or by not following the established patterns. Always prefer simplicity and clarity over complex workarounds.
+2 −2
Original line number Diff line number Diff line
@@ -4,7 +4,7 @@ from __future__ import annotations

from tdoc_crawler.cli.config import load_cli_config
from tdoc_crawler.cli.config_cmd import config_app
from tdoc_crawler.cli.tdoc_app import tdoc_app
from tdoc_crawler.cli.spec_app import spec_app
from tdoc_crawler.cli.tdoc_app import tdoc_app

__all__ = ["load_cli_config", "config_app", "tdoc_app", "spec_app"]
__all__ = ["config_app", "load_cli_config", "spec_app", "tdoc_app"]
+95 −0
Original line number Diff line number Diff line
"""Backward compatibility layer for environment variable mapping.

This module provides mappings between legacy environment variable names and
their corresponding config field paths, enabling seamless migration from
env-var-based configuration to the new config-file-based approach.

All existing TDC_*, HTTP_CACHE_*, and LIGHTRAG_* environment variables
remain functional via pydantic's AliasChoices mechanism.
"""

from __future__ import annotations

import logging

# Module-level logger; used by log_deprecation_warning below.
logger = logging.getLogger(__name__)

# Maps environment variable names to their config field paths.
# Used for documentation and validation purposes.
# NOTE: the actual env-var binding happens via pydantic's AliasChoices (see
# module docstring); this table mirrors those bindings for tooling/docs and
# is not itself consulted at load time.
ENV_VAR_MAPPINGS: dict[str, str] = {
    # Path/Cache (TDC_*)
    "TDC_CACHE_DIR": "path.cache_dir",
    "TDC_AI_STORE_PATH": "path.ai_cache_dir",
    # Credentials (TDC_*)
    "TDC_EOL_USERNAME": "credentials.username",
    "TDC_EOL_PASSWORD": "credentials.password",
    "TDC_EOL_PROMPT": "credentials.prompt",
    # HTTP/SQLite (overrides)
    "TDC_VERIFY_SSL": "http.verify_ssl",
    "TDC_TIMEOUT": "http.timeout",
    "TDC_MAX_RETRIES": "http.max_retries",
    # HTTP Cache (HTTP_CACHE_*)
    "HTTP_CACHE_TTL": "http.cache_ttl",
    "HTTP_CACHE_ENABLED": "http.cache_enabled",
    "HTTP_CACHE_REFRESH_ON_ACCESS": "http.cache_refresh_on_access",
    # Crawl filters (TDC_ prefixed)
    "TDC_WORKING_GROUP": "crawl.working_group",
    "TDC_SUB_GROUP": "crawl.sub_group",
    "TDC_START_DATE": "crawl.date_start",
    "TDC_END_DATE": "crawl.date_end",
    "TDC_SOURCE_LIKE": "crawl.source_like",
    "TDC_AGENDA_LIKE": "crawl.agenda_like",
    "TDC_TITLE_LIKE": "crawl.title_like",
    "TDC_LIMIT_TDOCS": "crawl.limit",
    "TDC_WORKERS": "crawl.workers",
    # AI/LightRAG (TDC_AI_* and LIGHTRAG_*)
    "TDC_AI_LLM_MODEL": "ai.llm_model",
    "TDC_AI_LLM_API_BASE": "ai.llm_api_base",
    "TDC_AI_LLM_API_KEY": "ai.llm_api_key",
    "TDC_AI_EMBEDDING_MODEL": "ai.embedding_model",
    "TDC_AI_EMBEDDING_API_BASE": "ai.embedding_api_base",
    "TDC_AI_EMBEDDING_API_KEY": "ai.embedding_api_key",
    "TDC_AI_MAX_CHUNK_SIZE": "ai.max_chunk_size",
    "TDC_AI_CHUNK_OVERLAP": "ai.chunk_overlap",
    "TDC_AI_CONVERT_PDF": "ai.convert_pdf",
    "TDC_AI_CONVERT_MD": "ai.convert_md",
    "TDC_AI_VLM": "ai.vlm",
    "TDC_AI_ABSTRACT_MIN_WORDS": "ai.abstract_min_words",
    "TDC_AI_ABSTRACT_MAX_WORDS": "ai.abstract_max_words",
    "TDC_AI_PARALLELISM": "ai.parallelism",
    "TDC_GRAPH_QUERY_LEVEL": "ai.graph_query_level",
    # Both the TDC_-prefixed and legacy unprefixed names map to the same
    # field, so either spelling works during migration.
    "TDC_LIGHTRAG_SHARED_STORAGE": "ai.lightrag.shared_storage",
    "LIGHTRAG_SHARED_STORAGE": "ai.lightrag.shared_storage",
    "LIGHTRAG_DB_BACKEND": "ai.lightrag.db_backend",
}

# Deprecated environment variables that will produce warnings.
# Format: "OLD_VAR": "Use NEW_VAR instead"
# Currently empty: no variable has been deprecated yet; entries added here
# are picked up automatically by log_deprecation_warning.
DEPRECATED_ENV_VARS: dict[str, str] = {}


def log_deprecation_warning(env_var_name: str) -> None:
    """Log a message for a set environment variable of interest.

    Emits a warning (with migration guidance) when the variable appears in
    ``DEPRECATED_ENV_VARS``; otherwise records a debug entry noting that the
    variable has no documented mapping.

    Args:
        env_var_name: Name of the deprecated environment variable.
    """
    replacement = DEPRECATED_ENV_VARS.get(env_var_name)
    if replacement is None:
        logger.debug(
            "Environment variable '%s' is set but has no documented mapping",
            env_var_name,
        )
        return
    logger.warning(
        "Environment variable '%s' is deprecated. %s",
        env_var_name,
        replacement,
    )


__all__ = [
    "DEPRECATED_ENV_VARS",
    "ENV_VAR_MAPPINGS",
    "log_deprecation_warning",
]
+201 −0
Original line number Diff line number Diff line
"""Export TDocCrawlerConfig to various file formats with documentation."""

from __future__ import annotations

import json
from pathlib import Path
from typing import Any, Literal

import yaml

from tdoc_crawler.config.settings import TDocCrawlerConfig

FormatType = Literal["toml", "yaml", "json"]


def _default_value_for_field(info: Any) -> Any:
    """Extract the default value from a pydantic field info."""
    default = info.default
    if callable(default):
        return None
    return default


class ConfigExporter:
    """Export configuration to file formats with documentation comments.

    Fixes over the naive serialization:
    * TOML has no null value, so unset (None) fields are emitted as
      commented-out keys instead of the invalid ``key = null``.
    * All string-like values (including ``Path``) are escaped, so e.g.
      Windows backslash paths cannot break the generated TOML.
    """

    def __init__(self, config: TDocCrawlerConfig | None = None):
        """Initialize with config instance (or create with defaults)."""
        self.config = config or TDocCrawlerConfig()

    def export(self, format: FormatType = "toml") -> str:
        """Export config to a string in the specified format.

        Args:
            format: Output format ("toml", "yaml", or "json").

        Returns:
            Serialized configuration, with comments where the format allows.

        Raises:
            ValueError: If the format is not supported.
        """
        if format == "toml":
            return self._export_toml()
        if format == "yaml":
            return self._export_yaml()
        if format == "json":
            return self._export_json()
        raise ValueError(f"Unsupported format: {format}")

    def save(self, path: Path, format: FormatType = "toml", force: bool = False) -> None:
        """Save config to file.

        Args:
            path: Output file path.
            format: Output format (toml, yaml, json).
            force: If True, overwrite existing file.

        Raises:
            FileExistsError: If file exists and force is False.
        """
        if path.exists() and not force:
            raise FileExistsError(f"File exists: {path}. Use --force to overwrite.")

        content = self.export(format)
        path.write_text(content, encoding="utf-8")

    def _get_all_fields(self) -> dict[str, Any]:
        """Collect every field of the known sub-configs with its metadata.

        Returns:
            Mapping of "section.field" -> dict with keys "value" (current
            value from model_dump), "default", "description", and "section".
            Sections absent on this config instance are skipped.
        """
        fields: dict[str, Any] = {}
        config_data = self.config.model_dump()

        # Walk through all known subconfig sections.
        for section_name in ("path", "http", "credentials", "crawl", "output", "http_cache", "ai"):
            if not hasattr(self.config, section_name):
                continue
            section_model = type(getattr(self.config, section_name))
            section_data = config_data.get(section_name, {})
            # model_fields exists on pydantic BaseModel subclasses; guard for
            # non-model attributes.
            section_fields = getattr(section_model, "model_fields", {})

            for field_name, field_info in section_fields.items():
                fields[f"{section_name}.{field_name}"] = {
                    "value": section_data.get(field_name),
                    "default": _default_value_for_field(field_info),
                    "description": getattr(field_info, "description", None),
                    "section": section_name,
                }

        return fields

    def _export_toml(self) -> str:
        """Export to TOML with comments from field descriptions.

        Unset (None) fields are written as commented-out keys so the output
        is always valid TOML and shows the user what can be configured.
        """
        lines = ["# 3GPP Crawler Configuration", ""]
        lines.append("# This file was generated by `tdoc-crawler config init`")
        lines.append("# Default values are defined in TDocCrawlerConfig using Field(default=...)")
        lines.append("")

        current_section = None
        fields = self._get_all_fields()

        for key, meta in fields.items():
            section = meta["section"]
            field_name = key.split(".", 1)[1] if "." in key else key
            description = meta["description"]
            value = meta["value"]
            default = meta["default"]

            # Emit a [section] header whenever the section changes; fields
            # arrive grouped by section from _get_all_fields.
            if section != current_section:
                if current_section is not None:
                    lines.append("")
                lines.append(f"[{section}]")
                lines.append("")
                current_section = section

            if description:
                lines.append(f"# {description}")
            if default is not None:
                lines.append(f"# Default: {default}")

            if value is None:
                # TOML has no null: comment the key out so the file stays
                # parseable and the user can uncomment and fill it in.
                lines.append(f"# {field_name} =")
            else:
                lines.append(f"{field_name} = {self._format_toml_value(value)}")
            lines.append("")

        return "\n".join(lines)

    def _export_yaml(self) -> str:
        """Export to YAML with section comments.

        YAML has no per-field inline comment support in yaml.dump, so we
        post-process the dump and insert a one-line comment above each known
        top-level section.
        """
        lines = [
            "# 3GPP Crawler Configuration",
            "# This file was generated by `tdoc-crawler config init`",
            "# Default values are defined in TDocCrawlerConfig using Field(default=...)",
            "",
        ]

        config_data = self.config.model_dump()
        yaml_str = yaml.dump(config_data, default_flow_style=False, sort_keys=False)

        # One-line descriptions for the known top-level sections.
        sections_with_comments = {
            "path": "File system paths",
            "http": "HTTP client and caching configuration",
            "credentials": "ETSI Online (EOL) portal credentials",
            "crawl": "Crawling behavior and filter configuration",
            "output": "Output configuration",
            "http_cache": "HTTP cache settings",
            "ai": "AI/LLM configuration",
        }

        result_lines = []
        for line in yaml_str.splitlines():
            stripped = line.rstrip()
            # A top-level section header dumps as "name:" with no value.
            if stripped.endswith(":") and not stripped.startswith("#"):
                section_name = stripped[:-1]
                if section_name in sections_with_comments:
                    result_lines.append(f"# {sections_with_comments[section_name]}")
            result_lines.append(line)

        return "\n".join(lines + result_lines)

    def _export_json(self) -> str:
        """Export to JSON (comments not supported, so just data)."""
        config_data = self._resolve_paths_for_json(self.config.model_dump())
        return json.dumps(config_data, indent=2)

    def _resolve_paths_for_json(self, data: dict[str, Any]) -> dict[str, Any]:
        """Recursively convert Path objects to strings for JSON serialization.

        Dicts are walked recursively; lists are converted one level deep
        (matching the structures model_dump produces).
        """
        result: dict[str, Any] = {}
        for key, value in data.items():
            if isinstance(value, Path):
                result[key] = str(value)
            elif isinstance(value, dict):
                result[key] = self._resolve_paths_for_json(value)
            elif isinstance(value, list):
                result[key] = [str(v) if isinstance(v, Path) else v for v in value]
            else:
                result[key] = value
        return result

    def _format_toml_value(self, value: Any) -> str:
        """Format a Python value for TOML output."""
        return self._toml_value_to_string(value)

    def _toml_value_to_string(self, value: Any) -> str:
        """Convert a value to a TOML-compatible literal string.

        TOML has no null: callers should omit None-valued keys (see
        _export_toml); a bare None falls back to an empty string, and None
        items inside lists are dropped, so the output always parses.
        """
        if value is None:
            return '""'
        if isinstance(value, bool):
            # Must precede the int/float check: bool is a subclass of int.
            return "true" if value else "false"
        if isinstance(value, (int, float)):
            return str(value)
        if isinstance(value, list):
            items = (self._toml_value_to_string(v) for v in value if v is not None)
            return "[" + ", ".join(items) + "]"
        # Strings, Paths, and arbitrary objects all render as escaped basic
        # strings; escaping backslashes keeps Windows paths valid TOML.
        text = value if isinstance(value, str) else str(value)
        escaped = text.replace("\\", "\\\\").replace('"', '\\"')
        return f'"{escaped}"'


__all__ = ["ConfigExporter", "FormatType"]
+336 −0

File added.

Preview size limit exceeded, changes collapsed.

Loading