Commit 27bf372b authored by Jan Reimes's avatar Jan Reimes
Browse files

Default --skip-existing to True, codify idempotent processing rule

- SkipExistingOption default changed from False to True: running
  'workspace process' twice without --force is now safe by default.
- Added idempotent processing rule to cli/AGENTS.md documenting the
  two-layer protection (fast-path skip + convert_for_wiki guards).
- Added TestSpecSourceDirNaming regression tests (5 tests) for spec
  source directory naming with -REL suffix.
parent 7b2370b9
Loading
Loading
Loading
Loading
+41 −0
Original line number Diff line number Diff line
@@ -26,6 +26,47 @@ CLI-only code conventions for the `tdoc_crawler` CLI.

**Rule of thumb:** If useful to a Python developer using `tdoc_crawler` as a library, it belongs in the core.

## CRITICAL: Spec Source Directory Naming

Spec source directories under `wiki/<workspace>/sources/` **MUST** include the `-REL<version>` suffix:

```
# CORRECT
wiki/atias/sources/26260-REL18.0.0/
wiki/atias/sources/26131-REL19.0.0/

# WRONG — NEVER do this
wiki/atias/sources/26260/
wiki/atias/sources/26131/
```

When processing a spec workspace member:
1. If the member ID already has `-REL` (e.g. `26260-REL18.0.0`), use it as-is.
2. If the member ID is bare (e.g. `26131`), resolve the release from the database and construct the full ID.
3. If the release cannot be resolved, fall back to the bare ID.

This applies to **all** code paths that create or reference spec source directories:
- `_effective_source_id()` / `_resolve_spec_source_id()`
- `_process_member()``wiki_source_dir` construction
- `_should_skip_member()` — existing output detection

A regression test exists in `TestSpecSourceDirNaming` (test_workspaces.py).

## Idempotent Processing

`workspace process` MUST be idempotent — running it twice without `--force` produces the same result as running it once. Existing output files are never overwritten.

Two-layer protection:

1. **`_should_skip_member()`** (fast path): Checks if output artifacts exist before entering `convert_for_wiki()`. Controlled by `--skip-existing` (default: True).

2. **`convert_for_wiki()`** (hard guarantee): Each profile branch checks for existing output and returns early when `force=False`:
   - `PDF_ONLY`: `ensure_pdf()` skips if PDF exists
   - `MARKDOWN_ONLY`: skips if `.md` exists
   - `DEFAULT`/`ADVANCED`: skips if both `.md` and `.json` exist

The `--force` flag is the ONLY way to overwrite existing output. It defaults to `False`.

## Module Responsibilities

| Module | Responsibility |
+5 −1
Original line number Diff line number Diff line
@@ -210,7 +210,11 @@ ProcessLimitOption = Annotated[
]
SkipExistingOption = Annotated[
    bool,
    typer.Option("--skip-existing/--process-existing", help="Skip members that already have artifacts", envvar=ConfigEnvVar.TDC_SKIP_EXISTING.name),
    typer.Option(
        "--skip-existing/--no-skip-existing",
        help="Skip members that already have artifacts (default: True)",
        envvar=ConfigEnvVar.TDC_SKIP_EXISTING.name,
    ),
]
ProfileOption = Annotated[
    str,
+1 −1
Original line number Diff line number Diff line
@@ -197,7 +197,7 @@ def workspace_process(
    workspace_name: str = typer.Argument(None, help="Workspace name (default: active workspace)"),
    force: WorkspaceProcessForceOption = False,
    limit: ProcessLimitOption = None,
    skip_existing: SkipExistingOption = False,
    skip_existing: SkipExistingOption = True,
    profile: ProfileOption = DEFAULT_EXTRACTION_PROFILE.value,
    figures: FiguresModeOption = "embed",
    tables: TablesModeOption = "embed",
+85 −0
Original line number Diff line number Diff line
@@ -225,3 +225,88 @@ class TestWorkspaceCRUD:
        """Test ensuring default workspace exists."""
        registry = ensure_default_workspace()
        assert DEFAULT_WORKSPACE in registry.workspaces


class TestSpecSourceDirNaming:
    """Regression: spec source dirs MUST include -REL suffix.

    This test prevents the recurring bug where spec members without an
    explicit --release produce source directories like ``sources/26131/``
    instead of ``sources/26131-REL19.0.0/``.
    """

    def test_spec_with_rel_suffix_unchanged(self) -> None:
        """Spec IDs that already have -REL must pass through unchanged."""
        from tdoc_crawler.cli.workspace.process import _resolve_spec_source_id

        assert _resolve_spec_source_id("26260-REL18.0.0") == "26260-REL18.0.0"
        assert _resolve_spec_source_id("26131-REL17.0.0") == "26131-REL17.0.0"

    def test_tdoc_id_unchanged(self) -> None:
        """TDoc IDs are never modified."""
        from unittest.mock import patch

        from tdoc_crawler.cli.workspace.process import _effective_source_id
        from tdoc_crawler.config.workspace_registry import WorkspaceMember
        from tdoc_crawler.models.workspaces import SourceKind

        member = WorkspaceMember(source_item_id="S4-201115", source_kind=SourceKind.TDOC, source_path="")
        assert _effective_source_id(member) == "S4-201115"

    def test_spec_without_suffix_resolves(self) -> None:
        """Spec without -REL must resolve to full ID via DB."""
        from unittest.mock import patch

        from tdoc_crawler.cli.workspace.process import _effective_source_id
        from tdoc_crawler.config.workspace_registry import WorkspaceMember
        from tdoc_crawler.models.workspaces import SourceKind

        member = WorkspaceMember(source_item_id="26131", source_kind=SourceKind.SPEC, source_path="")

        # Mock resolve_spec_release_from_db to return a known version
        with patch(
            "tdoc_crawler.cli.workspace.process.resolve_spec_release_from_db",
            return_value=("19.0.0", []),
        ):
            result = _effective_source_id(member)

        assert result == "26131-REL19.0.0"
        assert "-REL" in result

    def test_spec_unknown_release_falls_back(self) -> None:
        """Spec with unresolvable release falls back to bare ID."""
        from unittest.mock import patch

        from tdoc_crawler.cli.workspace.process import _effective_source_id
        from tdoc_crawler.config.workspace_registry import WorkspaceMember
        from tdoc_crawler.models.workspaces import SourceKind

        member = WorkspaceMember(source_item_id="99999", source_kind=SourceKind.SPEC, source_path="")

        # resolve returns "latest" when spec not in DB
        with patch(
            "tdoc_crawler.cli.workspace.process.resolve_spec_release_from_db",
            return_value=("latest", []),
        ):
            result = _effective_source_id(member)

        # Must NOT produce 99999-RELlatest
        assert result == "99999"

    def test_spec_resolve_exception_falls_back(self) -> None:
        """Spec resolve failure falls back to bare ID without -REL."""
        from unittest.mock import patch

        from tdoc_crawler.cli.workspace.process import _effective_source_id
        from tdoc_crawler.config.workspace_registry import WorkspaceMember
        from tdoc_crawler.models.workspaces import SourceKind

        member = WorkspaceMember(source_item_id="26131", source_kind=SourceKind.SPEC, source_path="")

        with patch(
            "tdoc_crawler.cli.workspace.process.resolve_spec_release_from_db",
            side_effect=RuntimeError("DB connection failed"),
        ):
            result = _effective_source_id(member)

        assert result == "26131"