Commit b9171c26 authored by Jan Reimes
Browse files

feat(config): add .llm-wiki path methods to CacheManager and update dependencies

- Add llm_wiki_dir, workspace_llm_wiki_dir, workspace_sources_dir,
  workspace_assets_dir, workspace_schema_path to CacheManager
- Register workspace CLI sub-app in tdoc_app.py
- Add convert-lo, doc2txt, liteparse, opendataloader-pdf dependencies
  to pyproject.toml
- Add test_llm_wiki_paths.py and test_workspaces.py
parent f27eb52c
Loading
Loading
Loading
Loading
+6 −0
Original line number Diff line number Diff line
@@ -18,7 +18,11 @@ dependencies = [
    "aiointerpreters>=0.4.0",
    "beautifulsoup4>=4.14.2",
    "brotli>=1.2.0",
    "convert-lo",
    "doc2txt>=1.0.8",
    "hishel>=1.1.8",
    "liteparse>=1.2.0",
    "opendataloader-pdf[hybrid]>=2.2.0",
    "packaging>=25.0",
    "pandas>=3.0.0",
    "pydantic>=2.12.2",
@@ -109,5 +113,7 @@ bump = true
members = ["packages/*"]

[tool.uv.sources]
convert-lo = { workspace = true }
pool-executors = { workspace = true }
toon-format = { git = "https://github.com/toon-format/toon-python.git" }
doc2txt = { git = "https://github.com/Quantatirsk/doc2txt-pypi.git" }
+5 −0
Original line number Diff line number Diff line
@@ -49,6 +49,11 @@ tdoc_app = typer.Typer(help="3GPP Crawler - TDocs and Meetings")
# Register the config sub-app: its commands are exposed under the "config" command group.
tdoc_app.add_typer(config_app, name="config", help="Manage configuration")

# Register the workspace sub-app: its commands are exposed under the "workspace" command group.
# NOTE(review): this import sits below module-level code instead of with the
# imports at the top of the file — confirm whether that placement is required
# (e.g. to avoid a circular import) or whether it can be hoisted.
from tdoc_crawler.cli._workspace_commands import app as workspace_app

tdoc_app.add_typer(workspace_app, name="workspace", help="Manage extraction workspaces")


@tdoc_app.callback()
def _app_callback(
+22 −0
Original line number Diff line number Diff line
@@ -30,6 +30,7 @@ DEFAULT_DATABASE_FILENAME = "3gpp_crawler.db"
# File name of the HTTP cache store (presumably resolved inside the cache directory — confirm).
DEFAULT_HTTP_CACHE_FILENAME = "http-cache.sqlite3"
# Directory name used for checkouts (presumably inside the cache directory — confirm).
DEFAULT_CHECKOUT_DIRNAME = "checkout"
# Sub-directory of the cache directory that holds the AI cache.
DEFAULT_AI_CACHE_DIRNAME = "lightrag"
# Sub-directory of the cache directory that holds the per-workspace LLM wiki trees.
DEFAULT_LLM_WIKI_DIRNAME = ".llm-wiki"


class CacheManagerNotRegisteredError(RuntimeError):
@@ -103,6 +104,27 @@ class CacheManager:
        """Path to AI cache directory for embeddings and graphs."""
        return self._cache_dir / DEFAULT_AI_CACHE_DIRNAME

    @property
    def llm_wiki_dir(self) -> Path:
        """Return the root directory under which all workspace LLM wikis live.

        The path is derived from the cache directory only; nothing is created
        on disk.
        """
        cache_root = self._cache_dir
        return cache_root.joinpath(DEFAULT_LLM_WIKI_DIRNAME)

    def workspace_llm_wiki_dir(self, workspace_name: str) -> Path:
        """Return the LLM wiki directory that belongs to ``workspace_name``.

        The name is appended to ``llm_wiki_dir`` exactly as given: it is not
        normalized or validated here, and nothing is created on disk.
        """
        wiki_root = self.llm_wiki_dir
        return wiki_root.joinpath(workspace_name)

    def workspace_sources_dir(self, workspace_name: str) -> Path:
        """Return the ``sources`` directory inside the workspace's LLM wiki directory.

        Only the path is computed; the directory is not created.
        """
        workspace_root = self.workspace_llm_wiki_dir(workspace_name)
        return workspace_root.joinpath("sources")

    def workspace_assets_dir(self, workspace_name: str) -> Path:
        """Return the ``assets`` directory inside the workspace's LLM wiki directory.

        Only the path is computed; the directory is not created.
        """
        workspace_root = self.workspace_llm_wiki_dir(workspace_name)
        return workspace_root.joinpath("assets")

    def workspace_schema_path(self, workspace_name: str) -> Path:
        """Return the location of the workspace's ``schema.json`` file.

        The file sits in a ``.llmwiki`` directory inside the workspace's LLM
        wiki directory. Only the path is computed; no file is read or written.
        """
        workspace_root = self.workspace_llm_wiki_dir(workspace_name)
        return workspace_root.joinpath(".llmwiki", "schema.json")

    @property
    def ai_workspace_file(self) -> Path:
        """Path to workspace registry file."""
+113 −0
Original line number Diff line number Diff line
"""Tests for .llm-wiki path functionality."""

from __future__ import annotations

from pathlib import Path

import pytest

from tdoc_crawler.config import CacheManager
from tdoc_crawler.workspaces import create_workspace, delete_workspace


@pytest.fixture
def temp_cache_dir(tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> Path:
    """Create a temporary cache directory and register a CacheManager for it.

    Args:
        tmp_path: Per-test temporary directory provided by pytest.
        monkeypatch: Used to reset the CacheManager singleton for this test only.

    Returns:
        Path to the freshly created ``.3gpp-crawler`` cache directory.
    """
    # Reset the CacheManager singleton so a new manager can be registered.
    # Going through monkeypatch (rather than assigning directly) means the
    # previous value is restored at teardown, so this test's manager does not
    # leak into tests that run afterwards.
    monkeypatch.setattr(CacheManager, "_instance", None, raising=False)
    cache_dir = tmp_path / ".3gpp-crawler"
    cache_dir.mkdir()
    CacheManager(cache_dir).register()
    return cache_dir


class TestCacheManagerLlmWikiPaths:
    """Verify the paths CacheManager derives for the ``.llm-wiki`` tree."""

    def test_llm_wiki_dir(self, temp_cache_dir: Path) -> None:
        """The wiki root sits directly inside the cache directory."""
        wiki_root = CacheManager(temp_cache_dir).llm_wiki_dir
        assert wiki_root == temp_cache_dir.joinpath(".llm-wiki")

    def test_workspace_llm_wiki_dir(self, temp_cache_dir: Path) -> None:
        """A workspace's wiki directory is named after the workspace."""
        workspace_root = CacheManager(temp_cache_dir).workspace_llm_wiki_dir("test-ws")
        assert workspace_root == temp_cache_dir.joinpath(".llm-wiki", "test-ws")

    def test_workspace_sources_dir(self, temp_cache_dir: Path) -> None:
        """The sources directory is ``sources`` inside the workspace wiki directory."""
        sources = CacheManager(temp_cache_dir).workspace_sources_dir("test-ws")
        assert sources == temp_cache_dir.joinpath(".llm-wiki", "test-ws", "sources")

    def test_workspace_assets_dir(self, temp_cache_dir: Path) -> None:
        """The assets directory is ``assets`` inside the workspace wiki directory."""
        assets = CacheManager(temp_cache_dir).workspace_assets_dir("test-ws")
        assert assets == temp_cache_dir.joinpath(".llm-wiki", "test-ws", "assets")

    def test_workspace_schema_path(self, temp_cache_dir: Path) -> None:
        """The schema file is ``.llmwiki/schema.json`` inside the workspace wiki directory."""
        schema = CacheManager(temp_cache_dir).workspace_schema_path("test-ws")
        assert schema == temp_cache_dir.joinpath(".llm-wiki", "test-ws", ".llmwiki", "schema.json")


class TestWorkspaceLlmWikiStructure:
    """Verify the on-disk ``.llm-wiki`` layout handled by the workspace helpers."""

    def test_create_workspace_creates_llm_wiki_structure(self, temp_cache_dir: Path) -> None:
        """Creating a workspace creates its wiki dir, sources, assets and schema file."""
        create_workspace("test-ws")

        cache = CacheManager(temp_cache_dir)
        expected_paths = (
            cache.workspace_llm_wiki_dir("test-ws"),
            cache.workspace_sources_dir("test-ws"),
            cache.workspace_assets_dir("test-ws"),
            cache.workspace_schema_path("test-ws"),
        )
        for path in expected_paths:
            assert path.exists()

    def test_create_workspace_schema_content(self, temp_cache_dir: Path) -> None:
        """The generated schema.json mentions the expected keys and page kind."""
        create_workspace("test-ws")

        schema_file = CacheManager(temp_cache_dir).workspace_schema_path("test-ws")
        schema_text = schema_file.read_text(encoding="utf-8")
        for token in ('"pageKinds"', '"concept"', '"defaultKind"'):
            assert token in schema_text

    def test_delete_workspace_with_llm_wiki(self, temp_cache_dir: Path) -> None:
        """Deleting a workspace leaves its .llm-wiki directory in place by default."""
        create_workspace("test-ws")

        workspace_root = CacheManager(temp_cache_dir).workspace_llm_wiki_dir("test-ws")
        assert workspace_root.exists()

        # A default delete_workspace call is expected to leave the wiki folder on disk.
        delete_workspace("test-ws")
        assert workspace_root.exists()

    def test_sources_dir_is_under_llm_wiki(self, temp_cache_dir: Path) -> None:
        """The sources directory is a direct child of the workspace wiki directory."""
        cache = CacheManager(temp_cache_dir)
        sources = cache.workspace_sources_dir("test-ws")
        assert sources.parent == cache.workspace_llm_wiki_dir("test-ws")

    def test_assets_dir_is_under_llm_wiki(self, temp_cache_dir: Path) -> None:
        """The assets directory is a direct child of the workspace wiki directory."""
        cache = CacheManager(temp_cache_dir)
        assets = cache.workspace_assets_dir("test-ws")
        assert assets.parent == cache.workspace_llm_wiki_dir("test-ws")

    def test_schema_is_under_llmwiki(self, temp_cache_dir: Path) -> None:
        """schema.json is a direct child of the workspace's ``.llmwiki`` directory."""
        cache = CacheManager(temp_cache_dir)
        schema_file = cache.workspace_schema_path("test-ws")
        hidden_dir = cache.workspace_llm_wiki_dir("test-ws") / ".llmwiki"
        assert schema_file.parent == hidden_dir
+227 −0
Original line number Diff line number Diff line
"""Tests for workspace infrastructure moved to main app."""

from __future__ import annotations

from pathlib import Path

import pytest

from tdoc_crawler.config import CacheManager
from tdoc_crawler.config.workspace_registry import WorkspaceRegistry
from tdoc_crawler.models.workspaces import (
    DEFAULT_WORKSPACE,
    DocumentClassification,
    SourceKind,
    Workspace,
    normalize_workspace_name,
)
from tdoc_crawler.workspaces import (
    create_workspace,
    delete_workspace,
    ensure_default_workspace,
    get_active_workspace,
    get_workspace,
    list_workspaces,
    set_active_workspace,
)


@pytest.fixture
def temp_cache_dir(tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> Path:
    """Create a temporary cache directory and register a CacheManager for it.

    Args:
        tmp_path: Per-test temporary directory provided by pytest.
        monkeypatch: Used to reset the CacheManager singleton for this test only.

    Returns:
        Path to the freshly created ``.3gpp-crawler`` cache directory.
    """
    # Reset the CacheManager singleton so a new manager can be registered.
    # Going through monkeypatch (rather than assigning directly) means the
    # previous value is restored at teardown, so this test's manager does not
    # leak into tests that run afterwards.
    monkeypatch.setattr(CacheManager, "_instance", None, raising=False)
    cache_dir = tmp_path / ".3gpp-crawler"
    cache_dir.mkdir()
    CacheManager(cache_dir).register()
    return cache_dir


class TestWorkspaceModels:
    """Check construction and validation of the workspace model classes."""

    def test_workspace_creation(self) -> None:
        """A new Workspace keeps its name and is neither default nor auto-building."""
        workspace = Workspace(workspace_name="test-workspace")
        assert workspace.workspace_name == "test-workspace"
        assert workspace.is_default is False
        assert workspace.auto_build is False

    def test_workspace_normalization(self) -> None:
        """Surrounding whitespace is stripped and the name is lower-cased."""
        workspace = Workspace(workspace_name="  Test Workspace  ")
        assert workspace.workspace_name == "test workspace"

    def test_workspace_empty_name_raises(self) -> None:
        """An empty workspace name is rejected with a ValueError."""
        with pytest.raises(ValueError, match="workspace_name must not be empty"):
            Workspace(workspace_name="")

    def test_source_kind_enum(self) -> None:
        """Each SourceKind member carries the expected string value."""
        expected_values = (
            (SourceKind.TDOC, "tdoc"),
            (SourceKind.SPEC, "spec"),
            (SourceKind.OTHER, "other"),
        )
        for member, raw_value in expected_values:
            assert member.value == raw_value

    def test_document_classification(self) -> None:
        """A valid DocumentClassification exposes the values it was built with."""
        fields = {
            "document_id": "S4-12345",
            "file_path": "test/path.pdf",
            "is_main_document": True,
            "confidence": 0.95,
            "decisive_heuristic": "size",
            "file_size_bytes": 1024,
        }
        classification = DocumentClassification(**fields)
        assert classification.document_id == "S4-12345"
        assert classification.confidence == 0.95

    def test_document_classification_invalid_confidence(self) -> None:
        """A confidence of 1.5 is rejected with a ValueError."""
        fields = {
            "document_id": "S4-12345",
            "file_path": "test.pdf",
            "is_main_document": True,
            "confidence": 1.5,
            "decisive_heuristic": "size",
            "file_size_bytes": 1024,
        }
        with pytest.raises(ValueError):
            DocumentClassification(**fields)


class TestNormalizeWorkspaceName:
    """Check how normalize_workspace_name treats missing, blank and mixed-case names."""

    def test_normalize_none(self) -> None:
        """None falls back to the default workspace name."""
        normalized = normalize_workspace_name(None)
        assert normalized == DEFAULT_WORKSPACE

    def test_normalize_empty(self) -> None:
        """An empty string falls back to the default workspace name."""
        normalized = normalize_workspace_name("")
        assert normalized == DEFAULT_WORKSPACE

    def test_normalize_whitespace(self) -> None:
        """A whitespace-only string falls back to the default workspace name."""
        normalized = normalize_workspace_name("   ")
        assert normalized == DEFAULT_WORKSPACE

    def test_normalize_lowercase(self) -> None:
        """Upper-case characters are converted to lower case."""
        normalized = normalize_workspace_name("MyWorkspace")
        assert normalized == "myworkspace"

    def test_normalize_strips(self) -> None:
        """Leading and trailing whitespace is removed."""
        normalized = normalize_workspace_name("  my-workspace  ")
        assert normalized == "my-workspace"


class TestWorkspaceRegistry:
    """Exercise WorkspaceRegistry operations on a freshly created default registry."""

    def test_registry_create_default(self, temp_cache_dir: Path) -> None:
        """A default registry contains the default workspace and marks it active."""
        default_registry = WorkspaceRegistry._create_default()
        assert DEFAULT_WORKSPACE in default_registry.workspaces
        assert default_registry.active == DEFAULT_WORKSPACE

    def test_registry_create_workspace(self, temp_cache_dir: Path) -> None:
        """A created workspace is stored together with its description."""
        reg = WorkspaceRegistry._create_default()
        reg.create_workspace("test-ws", description="Test workspace")
        assert "test-ws" in reg.workspaces
        stored = reg.workspaces["test-ws"]
        assert stored.description == "Test workspace"

    def test_registry_delete_workspace(self, temp_cache_dir: Path) -> None:
        """Deleting an existing workspace returns True and removes the entry."""
        reg = WorkspaceRegistry._create_default()
        reg.create_workspace("test-ws")
        deleted = reg.delete_workspace("test-ws")
        assert deleted is True
        assert "test-ws" not in reg.workspaces

    def test_registry_delete_default_fails(self, temp_cache_dir: Path) -> None:
        """Deleting the default workspace is refused and returns False."""
        reg = WorkspaceRegistry._create_default()
        deleted = reg.delete_workspace(DEFAULT_WORKSPACE)
        assert deleted is False

    def test_registry_set_active(self, temp_cache_dir: Path) -> None:
        """set_active switches the registry's active workspace."""
        reg = WorkspaceRegistry._create_default()
        reg.create_workspace("test-ws")
        reg.set_active("test-ws")
        assert reg.active == "test-ws"

    def test_registry_save_load(self, temp_cache_dir: Path) -> None:
        """A saved registry can be loaded again with its workspaces and active name."""
        original = WorkspaceRegistry._create_default()
        original.create_workspace("test-ws")
        original.save()

        reloaded = WorkspaceRegistry.load()
        assert "test-ws" in reloaded.workspaces
        assert reloaded.active == DEFAULT_WORKSPACE

    def test_registry_get_workspace(self, temp_cache_dir: Path) -> None:
        """get_workspace returns metadata carrying the workspace's name."""
        reg = WorkspaceRegistry._create_default()
        reg.create_workspace("test-ws")
        found = reg.get_workspace("test-ws")
        assert found is not None
        assert found.name == "test-ws"

    def test_registry_get_nonexistent(self, temp_cache_dir: Path) -> None:
        """get_workspace returns None for an unknown workspace name."""
        reg = WorkspaceRegistry._create_default()
        missing = reg.get_workspace("nonexistent")
        assert missing is None


class TestWorkspaceCRUD:
    """Exercise the module-level workspace create/read/update/delete helpers."""

    def test_create_workspace(self, temp_cache_dir: Path) -> None:
        """create_workspace returns a registry that contains the new workspace."""
        resulting_registry = create_workspace("test-ws")
        assert "test-ws" in resulting_registry.workspaces

    def test_list_workspaces(self, temp_cache_dir: Path) -> None:
        """list_workspaces reports every workspace that was created."""
        create_workspace("ws1")
        create_workspace("ws2")
        listed_names = [entry.name for entry in list_workspaces()]
        assert "ws1" in listed_names
        assert "ws2" in listed_names

    def test_get_workspace(self, temp_cache_dir: Path) -> None:
        """get_workspace returns the entry of a workspace that exists."""
        create_workspace("test-ws")
        found = get_workspace("test-ws")
        assert found is not None
        assert found.name == "test-ws"

    def test_get_workspace_none(self, temp_cache_dir: Path) -> None:
        """get_workspace returns None for an unknown workspace name."""
        missing = get_workspace("nonexistent")
        assert missing is None

    def test_set_active_workspace(self, temp_cache_dir: Path) -> None:
        """The workspace passed to set_active_workspace is reported as active."""
        create_workspace("test-ws")
        set_active_workspace("test-ws")
        current = get_active_workspace()
        assert current == "test-ws"

    def test_delete_workspace(self, temp_cache_dir: Path) -> None:
        """Deleting an existing workspace returns True and it can no longer be fetched."""
        create_workspace("test-ws")
        deleted = delete_workspace("test-ws")
        assert deleted is True
        assert get_workspace("test-ws") is None

    def test_delete_default_workspace_fails(self, temp_cache_dir: Path) -> None:
        """Deleting the default workspace is refused and returns False."""
        deleted = delete_workspace(DEFAULT_WORKSPACE)
        assert deleted is False

    def test_ensure_default_workspace(self, temp_cache_dir: Path) -> None:
        """ensure_default_workspace returns a registry containing the default workspace."""
        resulting_registry = ensure_default_workspace()
        assert DEFAULT_WORKSPACE in resulting_registry.workspaces