Commit e3734d06 authored by Jan Reimes's avatar Jan Reimes
Browse files

🧪 test(config): update tests for CacheManager removal and add ruff E402 per-file ignores

- conftest.py: replace CacheManager fixtures with PathConfig/ThreeGPPConfig equivalents
- test_http_client.py: replace cache_manager_name with http_cache_file params; remove CacheManager mocking
- test_whatthespec.py, test_targeted_fetch.py, test_specs_downloads.py: update to pass http_cache_file/db_file directly
- test_cli.py, test_new_cli_flags.py: replace CacheManager setup with PathConfig; add missing type annotations
- test_ai_workspaces.py: remove stale resolve_cache_manager monkeypatches; fix checkout_specs → checkout_specs_async
- pool_executor tests: remove stray blank lines
- ruff.toml: add per-file E402 ignores for tdoc_app.py and threegpp_ai/cli.py (load_dotenv before imports)
- docs/plans/: add config-consolidation plan documents
- 3gpp-ai.toml: add AI package config file
parent 639af911
Loading
Loading
Loading
Loading

3gpp-ai.toml

0 → 100644
+175 −0
Original line number Diff line number Diff line
# 3GPP AI Configuration

# Generated by: 3gpp-ai config init
#
# Configuration precedence (highest to lowest):
# 1. CLI arguments
# 2. This config file
# 3. Environment variables (TDC_*, LIGHTRAG_*)
# 4. Hard-coded defaults
#
# Sensitive values use ${ENV_VAR} syntax for security.
# Environment variables are interpolated at runtime.

[path]

# Root cache directory for storing downloaded files and metadata
# Default: C:\Users\Jan.Reimes\.3gpp-crawler
cache_dir = 'C:\Users\Jan.Reimes\.3gpp-crawler'

# SQLite database filename for storing crawl metadata
# Default: 3gpp_crawler.db
db_filename = "3gpp_crawler.db"

# Subdirectory name for checked-out documents
# Default: checkout
checkout_dirname = "checkout"

# Subdirectory name for AI-related cache (embeddings, graphs)
# Default: lightrag
ai_cache_dirname = "lightrag"

[http]

# Time-to-live for HTTP cache entries in seconds
# Default: 7200
cache_ttl = 7200

# Enable HTTP response caching
# Default: True
cache_enabled = true

# Refresh cache TTL on each access
# Default: True
cache_refresh_on_access = true

# Verify SSL certificates for HTTPS requests
# Default: True
verify_ssl = true

# Maximum number of retry attempts for failed requests
# Default: 3
max_retries = 3

# HTTP request timeout in seconds
# Default: 30
timeout = 60

[credentials]

# Username for ETSI Online (EOL) portal authentication
username = "reimes"

# Password for ETSI Online (EOL) portal authentication
# Sensitive value: prefer ${ENV_VAR} interpolation over a literal (see header)
password = "1y4RDXua9HOuUC"

# Custom prompt message for interactive credential entry
prompt = "false"

[crawl]

# Filter by working group (e.g., S4, RAN1, CT3)
working_group = null

# Filter by sub-working group
sub_group = null

# Start date filter (YYYY-MM-DD, YYYY-MM, or YYYY format)
date_start = null

# End date filter (YYYY-MM-DD, YYYY-MM, or YYYY format)
date_end = null

# SQL LIKE pattern to match document source
source_like = null

# SQL LIKE pattern to match agenda item
agenda_like = null

# SQL LIKE pattern to match document title
title_like = null

# Maximum number of documents to crawl
# Default: 1000
limit = 1000

# Number of concurrent workers for crawling
# Default: 4
workers = 4

[llm]

# LLM model name in <provider>/<model> format
# Default: openrouter/openrouter/free
model = "openai/glm-4.7"

# LLM API base URL
# Default: http://localhost:11434
api_base = "https://api.z.ai/api/coding/paas/v4"

# API key for LLM provider
# Environment: TDC_AI_LLM_API_KEY
api_key = "${TDC_AI_LLM_API_KEY}"

[embedding]

# Embedding model name in <provider>/<model> format
# Default: ollama/qwen3-embedding:0.6b
model = "ollama/embeddinggemma:latest"

# Embedding API base URL
# Default: http://localhost:11434
api_base = "http://localhost:11434"

# API key for embedding provider
api_key = null

[database]

# Storage backend to use (file or pg0)
# Default: file
backend = "file"

# pg0 instance name
# Default: 3gpp-crawler
pg0_instance_name = "3gpp-crawler"

# pg0 PostgreSQL port
# Default: 15432
pg0_port = 15432

# pg0 database name
# Default: tdoc
pg0_database = "tdoc"

[extraction]

# Enable extraction and indexing of table elements
# Default: True
tables = true

# Enable extraction and indexing of figure elements
# Default: True
figures = true

# Enable extraction and indexing of equation elements
# Default: True
equations = true

# Enable figure description generation with vision-capable models
# Default: True
figure_description = true

[workspace]

# Default workspace name
# Default: default
default_name = "default"

# Default query mode (naive, local, global, hybrid, mix, bypass)
# Default: hybrid
default_query_mode = "hybrid"

# Enable shared embedding storage across workspaces (deduplication)
# Default: False
shared_storage = false
+939 −0

File added.

Preview size limit exceeded, changes collapsed.

+134 −0
Original line number Diff line number Diff line
# Test Suite Migration Plan — CacheManager → PathConfig

## Context

Config consolidation (Phases 0-6) removed these classes from source code:
`CacheManager`, `resolve_cache_manager`, `register_cache_manager`, `reset_cache_managers`, `HttpCacheConfig`, `BaseConfigModel`, `CrawlLimits`. The test suite still references them.

3gpp-ai test files still reference `resolve_cache_manager`.

## Source-Code Change Required

### 1. `http_client/session.py`: Add `cache_file` param to `create_cached_session`

The function needs to know where to write the HTTP cache file. Currently, it always resolves from config:
`cache_file = config.path.http_cache_file` via `ThreeGPPConfig.from_settings()`. This makes testing difficult, as tests can't control the cache location.

**Add:** `cache_file: Path | None = None` parameter. When provided, use it directly. Otherwise, resolve from config; in particular, when `http_config` and `cache_file` are both `None`, resolve from the default config via `ThreeGPPConfig.from_settings()` (expensive). This avoids a redundant call to `ThreeGPPConfig.from_settings()` in every test and gives tests an explicit path, as an alternative to using `monkeypatch.setenv` to point `TDC_CACHE_DIR` to `tmp_path`.

```python
def create_cached_session(
    http_config=http_config,
    cache_file=tmp_path / "http_cache.sqlite3",
) -> ...
```

Both branches now just fall through to `ThreeGPPConfig.from_settings()`.

## Test File Changes — priority order

| File | Count | Risk | Notes |
|---|---|---|---|
| **Priority 0:** `conftest.py` (shared fixtures) | ~2 | Highest | |

**Current:**

```python
from tdoc_crawler.config import CacheManager, register_cache_manager, resolve_cache_manager

...
cache_dir = tmp_path / "test-cache"
cache_dir.mkdir(parents=True, exist_ok=True)
cache_manager = CacheManager(cache_dir, name="default", ensure_paths=True)
register_cache_manager(cache_manager, force=True)
return cache_dir
  # ...

@pytest.fixture
def test_db_path(test_cache_dir: Path) -> Path:
    return resolve_cache_manager().db_file
  # ...

# Register default cache manager at module level for all tests
_cache_manager = CacheManager(ensure_paths=False, name="default")
_cache_manager.register(force=True)
```

**After:** Delete the `CacheManager` import and `register_cache_manager`, `resolve_cache_manager`
imports.
Replace with direct `PathConfig` properties.

`test_cache_dir` now returns `tmp_path / "test-cache"` (created as directory). `test_db_path` returns `test_cache_dir / "3gpp_crawler.db"` (simple path computation). Remove `resolve_cache_manager()` — no longer exists.

`CacheManager(...).register()` is module-level code. Replace it with:

```python
def _cleanup_cache_managers():
   """Remove CacheManager references at module level."""
   pass
```

- **8 test files:** `conftest.py`, `test_http_client.py` (18 refs), `test_whatthespec.py` (12+ refs), `test_targeted_fetch.py` (8 refs), `test_new_cli_flags.py` (6 refs), `test_cli.py` (3 refs), `test_specs_downloads.py` (3 refs), `tests/ai/test_ai_workspaces.py` (2 refs)

### Detailed migration patterns

#### Pattern A: `CacheManager(tmp_path).register()` → `cache_file: Path`

Previously, a test created a `CacheManager` for `session.py`, then got the `db_file`/`checkout_dir`/`http_cache_file` properties from it. Tests now pass an explicit `cache_file` to `create_cached_session()` — no more `cache_manager_name`.

| Before | After |
|---|---|
| `CacheManager(root_path=tmp_path).register()` | `cache_file = tmp_path / "http_cache.sqlite3"`, then `session = create_cached_session(cache_file=cache_file)` |

#### Pattern B: `HttpCacheConfig(ttl=...)` → `HttpConfig(cache_ttl=...)`

Field names changed in Phase 3. Tests use `HttpConfig(cache_ttl=3600, ...)` instead of `HttpCacheConfig(ttl=3600, ...)`.

#### Pattern C: `resolve_cache_manager(name).xxx` → `PathConfig().xxx`

`resolve_cache_manager()` was removed from the source code. In test fixtures, use `PathConfig()` directly or use the `PathConfig` properties: `db_file`, `http_cache_file`, `checkout_dir`.

In tests, use `test_cache_dir / "3gpp_crawler.db"`, `test_cache_dir / "http-cache.sqlite3"`, etc. directly via path computation.

#### Pattern D: `@patch("...CacheManager")` → `@patch("...ThreeGPPConfig")`

`test_new_cli_flags.py` mocks `CacheManager` in `tdoc_crawler.cli.tdoc_app`. It should mock `ThreeGPPConfig.from_settings()` instead, and verify that `config.path.cache_dir` and `config.path.http_cache_file` are set correctly.

#### Pattern E: `SpecDownloads(cache_manager_name=...)` → `SpecDownloads(database=...)`

`SpecDownloads.__init__` no longer takes `cache_manager_name`. Tests pass the `database` fixture directly.

#### Pattern F: `monkeypatch.setattr(workspace_ops, "resolve_cache_manager", ...)` → `monkeypatch.setattr(workspace_ops, "PathConfig", ...)`

`test_ai_workspaces.py` patches `resolve_cache_manager`. Replace with a `PathConfig` patch that returns `SimpleNamespace(db_file=...)`.

### Phase ordering

| File | Notes | Priority | Risk |
|---|---|---|---|
| 1. `session.py` (source) | add `cache_file` param | High | Low |
| 2. `conftest.py` | Highest — all others depend on it | High | Low |
| 3. `test_http_client.py` | 18 CacheManager refs | High | Medium |
| 4. `test_whatthespec.py` | 12 CacheManager + HttpCacheConfig refs | Medium | Medium |
| 5. `test_targeted_fetch.py` | 8 CacheManager refs | Medium | Medium |
| 6. `test_new_cli_flags.py` | 6 `@patch("...CacheManager")` | Medium | Medium |
| 7. `test_cli.py` | 3 CacheManager refs | Low | Low |
| 8. `test_specs_downloads.py` | 3 CacheManager refs | Low | Low |
| 9. `tests/ai/test_ai_workspaces.py` | 2 `resolve_cache_manager` mock | Low | Low |

### Implementation approach

Since tests are independent, we recommend implementing in parallel batches:

**Batch 1** (source fix: session.py + conftest.py)
**Batch 2** (test_http_client.py + test_whatthespec.py in parallel)
**Batch 3** (test_targeted_fetch.py + test_new_cli_flags.py + test_cli.py in parallel)

**Batch 4** (test_specs_downloads.py + tests/ai/test_ai_workspaces.py in parallel)
+3 −0
Original line number Diff line number Diff line
@@ -66,6 +66,9 @@ max-locals = 20
[lint.per-file-ignores]
"tests/*.py" = ["S101", "S106", "PLR6301", "S603", "PLW1510"]
"tests/**/*.py" = ["S101", "S106", "PLR6301", "S603", "PLW1510"]
# load_dotenv() must run before all other imports to populate env vars before pydantic-settings reads them
"src/tdoc_crawler/cli/tdoc_app.py" = ["E402"]
"packages/3gpp-ai/threegpp_ai/cli.py" = ["E402"]

[lint.pydocstyle]
convention = "google"
+2 −4
Original line number Diff line number Diff line
@@ -126,7 +126,6 @@ def test_checkout_spec_to_workspace_reuses_latest_resolved_release(tmp_path: Pat
            return versions

    monkeypatch.setattr(workspace_ops, "SpecDatabase", _FakeSpecDb)
    monkeypatch.setattr(workspace_ops, "resolve_cache_manager", lambda _name="default": SimpleNamespace(db_file=tmp_path / "db.sqlite3"))

    called = {"count": 0}

@@ -134,7 +133,7 @@ def test_checkout_spec_to_workspace_reuses_latest_resolved_release(tmp_path: Pat
        called["count"] += 1
        return []

    monkeypatch.setattr(workspace_ops, "checkout_specs", _checkout_specs_not_expected)
    monkeypatch.setattr(workspace_ops, "checkout_specs_async", _checkout_specs_not_expected)

    resolved = asyncio.run(workspace_ops.checkout_spec_to_workspace("26260", checkout_base, "default", release="latest"))

@@ -171,7 +170,6 @@ def test_checkout_spec_to_workspace_falls_back_to_checkout_when_release_mismatch
            return versions

    monkeypatch.setattr(workspace_ops, "SpecDatabase", _FakeSpecDb)
    monkeypatch.setattr(workspace_ops, "resolve_cache_manager", lambda _name="default": SimpleNamespace(db_file=tmp_path / "db.sqlite3"))

    checked_out = checkout_base / "Specs" / "26.260-h10"
    checked_out.mkdir(parents=True)
@@ -180,7 +178,7 @@ def test_checkout_spec_to_workspace_falls_back_to_checkout_when_release_mismatch
        assert kwargs["release"] == "17"
        return [checked_out]

    monkeypatch.setattr(workspace_ops, "checkout_specs", _checkout_specs)
    monkeypatch.setattr(workspace_ops, "checkout_specs_async", _checkout_specs)

    resolved = asyncio.run(workspace_ops.checkout_spec_to_workspace("26260", checkout_base, "default", release="17"))

Loading