Commit a3a05d16 authored by Jan Reimes
Browse files

Replace bare str CLI options with StrEnum types

Added FiguresMode, TablesMode, and DeviceType StrEnums in
extraction/profiles.py alongside ExtractionProfile. Updated the
option types in cli/args.py and the defaults in
cli/workspace/process.py, and removed the manual string validation.
DoclingConfig type hints now accept the enum types. Codified the
rule in cli/AGENTS.md.
parent 27bf372b
Loading
Loading
Loading
Loading
+28 −0
Original line number Diff line number Diff line
@@ -82,3 +82,31 @@ The `--force` flag is the ONLY way to overwrite existing output. It defaults to
- CLI (`cli/`) → Core (`tdoc_crawler/`)
- Core submodules → Common layer (`models/`)
- **Never:** Core importing from CLI

## CRITICAL: Multi-Value CLI Options Must Use StrEnum

CLI options whose value must be one of a fixed set of strings (two or more choices) **MUST** be typed with a `StrEnum` class — never a bare `str` with manual validation.

```python
# CORRECT — StrEnum defined alongside ExtractionProfile in profiles.py
class FiguresMode(StrEnum):
    EMBED = "embed"
    REFERENCE = "reference"

# CLI option uses the enum type directly
FiguresModeOption = Annotated[
    FiguresMode,
    typer.Option("--figures", ...),
]

# WRONG — bare str with manual if/else validation
FiguresModeOption = Annotated[str, typer.Option("--figures", ...)]
# ... then later:
if figures not in ("embed", "reference"):  # ← NEVER do this
    raise typer.Exit(1)
```

**Why:** Typer validates enum values automatically. No manual string checks, no typos, no forgotten cases. The enum is the single source of truth for valid values.

**Where to define:** Extraction-related enums go in `extraction/profiles.py` alongside `ExtractionProfile`. Domain-agnostic enums go in `models/base.py`.

**Current enums:** `ExtractionProfile`, `FiguresMode`, `TablesMode`, `DeviceType`, `SourceKind`, `TDocStatus`, `OutputFormat`, `SortOrder`.
+6 −5
Original line number Diff line number Diff line
@@ -8,6 +8,7 @@ from typing import Annotated
import typer

from tdoc_crawler.config import ConfigEnvVar
from tdoc_crawler.extraction.profiles import DeviceType, ExtractionProfile, FiguresMode, TablesMode

# Arguments
TDocIdsArgument = Annotated[list[str] | None, typer.Argument(help="TDoc identifiers to query")]
@@ -217,11 +218,11 @@ SkipExistingOption = Annotated[
    ),
]
ProfileOption = Annotated[
    str,
    typer.Option("--profile", help="Extraction profile: pdf-only, default, or advanced", envvar=ConfigEnvVar.TDC_PROFILE.name),
    ExtractionProfile,
    typer.Option("--profile", help="Extraction profile: pdf-only, markdown-only, default, or advanced", envvar=ConfigEnvVar.TDC_PROFILE.name),
]
FiguresModeOption = Annotated[
    str,
    FiguresMode,
    typer.Option(
        "--figures",
        help=(
@@ -232,7 +233,7 @@ FiguresModeOption = Annotated[
    ),
]
TablesModeOption = Annotated[
    str,
    TablesMode,
    typer.Option(
        "--tables",
        help="Table handling: embed (in markdown) or csv (separate CSV files)",
@@ -240,7 +241,7 @@ TablesModeOption = Annotated[
    ),
]
DeviceOption = Annotated[
    str,
    DeviceType,
    typer.Option(
        "--device",
        help="Accelerator device: auto (detect), cpu, cuda, or mps",
+6 −22
Original line number Diff line number Diff line
@@ -24,7 +24,7 @@ from tdoc_crawler.cli.args import (
from tdoc_crawler.config import resolve_cache_manager
from tdoc_crawler.config.workspace_registry import WorkspaceMember
from tdoc_crawler.extraction.convert import ConversionError, DoclingConfig, convert_for_wiki
from tdoc_crawler.extraction.profiles import DEFAULT_EXTRACTION_PROFILE, ExtractionProfile
from tdoc_crawler.extraction.profiles import DEFAULT_EXTRACTION_PROFILE, DeviceType, ExtractionProfile, FiguresMode, TablesMode
from tdoc_crawler.logging import DEFAULT_LEVEL as DEFAULT_VERBOSITY
from tdoc_crawler.logging import get_logger, set_verbosity
from tdoc_crawler.models.workspaces import SourceKind
@@ -198,10 +198,10 @@ def workspace_process(
    force: WorkspaceProcessForceOption = False,
    limit: ProcessLimitOption = None,
    skip_existing: SkipExistingOption = True,
    profile: ProfileOption = DEFAULT_EXTRACTION_PROFILE.value,
    figures: FiguresModeOption = "embed",
    tables: TablesModeOption = "embed",
    device: DeviceOption = "auto",
    profile: ProfileOption = DEFAULT_EXTRACTION_PROFILE,
    figures: FiguresModeOption = FiguresMode.EMBED,
    tables: TablesModeOption = TablesMode.EMBED,
    device: DeviceOption = DeviceType.AUTO,
    docx_direct: DocxDirectOption = False,
    md_yaml_frontmatter: MdYamlFrontmatterOption = True,
    verbosity: VerbosityOption = str(DEFAULT_VERBOSITY),
@@ -213,23 +213,7 @@ def workspace_process(
        workspace_name = get_active_workspace()

    normalized = normalize_workspace_name(workspace_name)

    try:
        extraction_profile = ExtractionProfile(profile)
    except ValueError:
        console.print(f"[red]Invalid profile '{profile}'. Use: pdf-only, markdown-only, default, advanced[/red]")
        raise typer.Exit(1)

    # Validate figure/table mode options
    if figures not in ("embed", "reference"):
        console.print(f"[red]Invalid figures mode '{figures}'. Use: embed, reference[/red]")
        raise typer.Exit(1)
    if tables not in ("embed", "csv"):
        console.print(f"[red]Invalid tables mode '{tables}'. Use: embed, csv[/red]")
        raise typer.Exit(1)
    if device not in ("auto", "cpu", "cuda", "mps"):
        console.print(f"[red]Invalid device '{device}'. Use: auto, cpu, cuda, mps[/red]")
        raise typer.Exit(1)
    extraction_profile = profile

    docling_config = DoclingConfig(figures_mode=figures, tables_mode=tables, device=device)

+3 −3
Original line number Diff line number Diff line
@@ -36,7 +36,7 @@ from tdoc_crawler.extraction.errors import ConversionError
from tdoc_crawler.extraction.fetch_tdoc import fetch_spec_files as fetch_spec_files_from_tdoc
from tdoc_crawler.extraction.fetch_tdoc import fetch_tdoc_files
from tdoc_crawler.extraction.metrics import MetricType, get_metrics_tracker, timed_operation
from tdoc_crawler.extraction.profiles import DEFAULT_EXTRACTION_PROFILE, ExtractionProfile
from tdoc_crawler.extraction.profiles import DEFAULT_EXTRACTION_PROFILE, ExtractionProfile, FiguresMode
from tdoc_crawler.models.workspaces import SourceKind
from tdoc_crawler.utils.async_helpers import run_async
from tdoc_crawler.utils.normalization import normalize_tdoc_id
@@ -297,7 +297,7 @@ def _run_markdown_only(
    primary: Path,
    output_dir: Path,
    *,
    figures_mode: str = "embed",
    figures_mode: FiguresMode | str = FiguresMode.EMBED,
) -> Path:
    """Convert a PDF document to Markdown using pymupdf4llm."""
    media_dir = output_dir / "media"
@@ -442,7 +442,7 @@ def convert_for_wiki(
            return md_file
        # Office formats → LibreOffice PDF first; native PDFs pass through.
        input_for_md = ensure_pdf(primary, wiki_source_dir, force=force)
        figures_mode = docling_config.figures_mode if docling_config else "embed"
        figures_mode = docling_config.figures_mode if docling_config else FiguresMode.EMBED
        with timed_operation(get_metrics_tracker(), document_id, MetricType.CONVERSION):
            result = _run_markdown_only(input_for_md, wiki_source_dir, figures_mode=figures_mode)
        if md_yaml_frontmatter:
+14 −14
Original line number Diff line number Diff line
@@ -7,7 +7,7 @@ import json
import logging
import os
from pathlib import Path
from typing import Any, Literal
from typing import Any

from docling.datamodel.base_models import InputFormat
from docling.document_converter import DocumentConverter, PdfFormatOption, WordFormatOption
@@ -16,7 +16,7 @@ from docling_core.types.doc.base import ImageRefMode
from tdoc_crawler.extraction.docling.filter import _DOCLING_PIPELINE_LOGGER, _DoclingBadAllocFilter
from tdoc_crawler.extraction.docling.pipeline import _build_pipeline_options
from tdoc_crawler.extraction.errors import ConversionError
from tdoc_crawler.extraction.profiles import ExtractionProfile
from tdoc_crawler.extraction.profiles import DeviceType, ExtractionProfile, FiguresMode, TablesMode

logger = logging.getLogger(__name__)

@@ -31,23 +31,23 @@ class DoclingConfig:

    Args:
        figures_mode: How to handle figures in output.
            - ``"embed"``: Placeholder in markdown, data in JSON (default).
            - ``"reference"``: Extract figure images, reference via URI in markdown.
            - :attr:`FiguresMode.EMBED`: Placeholder in markdown, data in JSON (default).
            - :attr:`FiguresMode.REFERENCE`: Extract figure images, reference via URI.
        tables_mode: How to handle tables in output.
            - ``"embed"``: Tables embedded in markdown (default).
            - ``"csv"``: Tables exported as separate CSV files alongside the JSON.
            - :attr:`TablesMode.EMBED`: Tables embedded in markdown (default).
            - :attr:`TablesMode.CSV`: Tables exported as separate CSV files.
        device: Accelerator device for Docling models.
            - ``"auto"``: Auto-detect (CUDA if available, else CPU).
            - ``"cpu"``: Force CPU-only (avoids CUDA warnings).
            - ``"cuda"``: Force NVIDIA CUDA GPU.
            - ``"mps"``: Force Apple Metal Performance Shaders.
            - :attr:`DeviceType.AUTO`: Auto-detect (CUDA if available, else CPU).
            - :attr:`DeviceType.CPU`: Force CPU-only.
            - :attr:`DeviceType.CUDA`: Force NVIDIA CUDA GPU.
            - :attr:`DeviceType.MPS`: Force Apple Metal Performance Shaders.
    """

    def __init__(
        self,
        figures_mode: Literal["embed", "reference"] = "embed",
        tables_mode: Literal["embed", "csv"] = "embed",
        device: Literal["auto", "cpu", "cuda", "mps"] = "auto",
        figures_mode: FiguresMode | str = FiguresMode.EMBED,
        tables_mode: TablesMode | str = TablesMode.EMBED,
        device: DeviceType | str = DeviceType.AUTO,
    ) -> None:
        self.figures_mode = figures_mode
        self.tables_mode = tables_mode
@@ -57,7 +57,7 @@ class DoclingConfig:
def _get_or_create_converter(
    profile: ExtractionProfile,
    *,
    device: Literal["auto", "cpu", "cuda", "mps"] = "auto",
    device: DeviceType | str = DeviceType.AUTO,
    docx_direct: bool = False,
) -> DocumentConverter:
    """Return a cached ``DocumentConverter`` matching *profile*, *device*, and *docx_direct*.
Loading