Commit 60126818 authored by Jan Reimes
Browse files

refactor(workspace): rename and update artifact handling

* Update delete_artifacts option to clarify its purpose in workspace_delete.
* Refactor checkout logic to remove references to .ai folders.
* Modify get_cached_pdf_path to accept output_dir and adjust logic accordingly.
* Rename delete_ai_folder to delete_artifact_folder for consistency.
parent cf866811
Loading
Loading
Loading
Loading
+1 −1
Original line number Diff line number Diff line
@@ -118,7 +118,7 @@ def workspace_deactivate() -> None:
def workspace_delete(
    workspace_name: str = typer.Argument(..., help="Workspace name"),
    force: bool = typer.Option(False, "--force", help="Permanently delete workspace and all artifacts"),
    delete_artifacts: bool = typer.Option(False, "--delete-artifacts", help="Delete all .ai artifacts for workspace members"),
    delete_artifacts: bool = typer.Option(False, "--delete-artifacts", help="Delete all workspace artifacts for members"),
    delete_llm_wiki: bool = typer.Option(False, "--delete-llm-wiki", help="Delete the .llm-wiki folder for this workspace"),
) -> None:
    """Permanently delete a workspace and all associated files."""
+3 −4
Original line number Diff line number Diff line
"""Workspace item checkout and registration logic.

Handles the checkout phase of workspace item management:
downloading/fetching documents, setting up .ai folders, and creating
workspace member records.
downloading/fetching documents and creating workspace member records.

This module is intentionally free of document processing logic
(PDF conversion, markdown extraction, VLM). That responsibility
@@ -39,8 +38,8 @@ async def checkout_single_item(
) -> tuple[WorkspaceMember | None, str | None]:
    """Checkout a single workspace item and create a member record.

    Downloads the document if checkout is enabled, sets up the .ai subfolder,
    and returns a WorkspaceMember ready for registration.
    Downloads the document if checkout is enabled and returns a
    WorkspaceMember ready for registration.

    Args:
        item: Item ID to checkout (TDoc ID, spec number, or path).
+6 −10
Original line number Diff line number Diff line
@@ -68,25 +68,23 @@ def is_office_format(source_file: Path) -> bool:
    return source_file.suffix.lower() in OFFICE_FORMATS


def get_cached_pdf_path(source_file: Path) -> Path | None:
def get_cached_pdf_path(source_file: Path, output_dir: Path) -> Path | None:
    """Get the path to a cached PDF conversion if it exists.

    The cached PDF is stored in a `.ai` subdirectory next to the original file.

    Args:
        source_file: Path to the original Office document.
        output_dir: Directory where the cached PDF would be stored.

    Returns:
        Path to cached PDF if it exists, None otherwise.
    """
    ai_dir = source_file.parent / ".ai"
    cached_pdf = ai_dir / f"{source_file.stem}.pdf"
    cached_pdf = output_dir / f"{source_file.stem}.pdf"
    return cached_pdf if cached_pdf.exists() else None


def convert_to_pdf(
    source_file: Path,
    output_dir: Path | None = None,
    output_dir: Path,
    *,
    force: bool = False,
    config: ConverterConfig | None = None,
@@ -100,8 +98,7 @@ def convert_to_pdf(

    Args:
        source_file: Path to the Office document (DOCX, DOC, PPT, etc.)
        output_dir: Optional output directory for the PDF. If None, uses
            the `.ai` subdirectory next to the source file.
        output_dir: Output directory for the PDF.
        force: If True, re-convert even if a cached PDF exists.
        config: Optional converter configuration. If None, uses defaults
            from environment variables.
@@ -120,7 +117,6 @@ def convert_to_pdf(
        raise ConversionError(f"Unsupported file format: {source_file.suffix}")

    config = config or ConverterConfig.from_env()
    output_dir = output_dir or source_file.parent / ".ai"
    output_file = output_dir / f"{source_file.stem}.pdf"

    # Check for cached conversion
@@ -161,7 +157,7 @@ def ensure_pdf(

    Args:
        source_file: Path to source document.
        output_dir: Directory to place the PDF.
        output_dir: Directory to place the PDF (typically workspace sources dir).
        force: Force reconversion.
        config: Optional converter configuration.

+10 −11
Original line number Diff line number Diff line
@@ -8,6 +8,7 @@ from __future__ import annotations

import json
import logging
import tempfile
from pathlib import Path
from typing import Any

@@ -138,6 +139,7 @@ def _ensure_converted(
    force: bool = False,
    config: OpendataloaderConfig | None = None,
    source_pdf: Path | None = None,
    output_dir: Path | None = None,
) -> tuple[str, Path, str]:
    """Fetch TDoc and ensure markdown conversion exists.

@@ -151,10 +153,11 @@ def _ensure_converted(
        source_pdf: If provided, opendataloader processes this PDF instead of the
            original DOCX. This ensures all profiles use the same PDF that was
            generated for the wiki dir.
        output_dir: Directory for conversion artifacts. If None, uses a temp dir.

    Returns:
        Tuple of (raw_markdown, json_path, normalized_id).
        json_path may not exist on disk ÔÇö caller must check.
        json_path may not exist on disk — caller must check.

    Raises:
        ConversionError: If no document files found or conversion fails.
@@ -166,16 +169,17 @@ def _ensure_converted(
    if primary is None:
        raise ConversionError(f"No document files found for {normalized_id}")

    ai_dir = _get_ai_directory(primary)
    md_file = ai_dir / f"{primary.stem}.md"
    json_file = ai_dir / f"{primary.stem}.json"
    artifact_dir = output_dir or Path(tempfile.mkdtemp(prefix="3gpp-conv-"))
    artifact_dir.mkdir(parents=True, exist_ok=True)
    md_file = artifact_dir / f"{primary.stem}.md"
    json_file = artifact_dir / f"{primary.stem}.json"

    if md_file.exists() and not force:
        markdown_content = md_file.read_text(encoding="utf-8")
    else:
        # Use the provided PDF (from wiki dir) or fall back to original source
        input_file = source_pdf if source_pdf is not None and source_pdf.exists() else primary
        markdown_content, _ = _run_opendataloader(input_file, ai_dir, config=config)
        markdown_content, _ = _run_opendataloader(input_file, artifact_dir, config=config)
        md_file.write_text(markdown_content, encoding="utf-8")

    return markdown_content, json_file, normalized_id
@@ -237,7 +241,7 @@ def convert_for_wiki(
        hybrid="docling-fast",
        hybrid_mode="full" if profile == ExtractionProfile.ADVANCED else None,
    )
    markdown_content, json_path, _ = _ensure_converted(document_id, force=force, config=config, source_pdf=pdf_path)
    markdown_content, json_path, _ = _ensure_converted(document_id, force=force, config=config, source_pdf=pdf_path, output_dir=wiki_source_dir)

    # Write markdown to wiki source dir
    md_file = wiki_source_dir / f"{primary.stem}.md"
@@ -323,11 +327,6 @@ def convert_tdoc_metadata(
    return convert_document_to_markdown(document_id, force=force, config=config)


def _get_ai_directory(source_file: Path) -> Path:
    """Get the .ai directory adjacent to the source file."""
    return source_file.parent / ".ai"


def _ensure_hybrid_server_if_needed(config: OpendataloaderConfig) -> None:
    """Auto-start the hybrid server if the config requires hybrid mode.

+1 −5
Original line number Diff line number Diff line
@@ -63,17 +63,13 @@ def fetch_spec_files(spec_number: str, force_download: bool = False) -> SpecFile
def _scan_spec_dir(spec_dir: Path) -> SpecFiles:
    """Scan a spec checkout directory for available document files.

    Searches the directory tree for document files, excluding hidden files
    and the .ai subfolder.
    Searches the directory tree for document files, excluding hidden files.
    """
    files = SpecFiles(checkout_dir=spec_dir)

    if spec_dir.is_dir():
        for file_path in sorted(spec_dir.rglob("*")):
            if file_path.is_file() and not file_path.name.startswith("."):
                # Skip files inside .ai directories
                if ".ai" in file_path.parts:
                    continue
                suffix = file_path.suffix.lower()
                if suffix == ".pdf":
                    files.pdf_path = file_path
Loading