Commit e249a4ce authored by Jan Reimes
Browse files

feat(ai): add delete_ai_folder and --delete-artifacts option

Adds ability to delete .ai processing artifacts from checkout folders.
- New delete_ai_folder() function to remove .ai folders
- delete_workspace() now accepts delete_artifacts flag
- New CLI command: 3gpp-ai clear [--path] [--workspace] [--dry-run]
- Fixes classify_document_files to not fail on PermissionError
parent 7b434091
Loading
Loading
Loading
Loading
+2 −0
Original line number Diff line number Diff line
@@ -23,6 +23,7 @@ from threegpp_ai.operations.workspaces import (
    checkout_spec_to_workspace,
    checkout_tdoc_to_workspace,
    create_workspace,
    delete_ai_folder,
    delete_workspace,
    ensure_ai_subfolder,
    ensure_default_workspace,
@@ -56,6 +57,7 @@ __all__ = [
    "convert_document",
    "convert_tdoc_to_markdown",
    "create_workspace",
    "delete_ai_folder",
    "delete_workspace",
    "ensure_ai_subfolder",
    "ensure_default_workspace",
+103 −4
Original line number Diff line number Diff line
@@ -31,6 +31,7 @@ from threegpp_ai import (
    checkout_tdoc_to_workspace,
    convert_tdoc_to_markdown,
    create_workspace,
    delete_ai_folder,
    delete_workspace,
    ensure_ai_subfolder,
    get_active_workspace,
@@ -86,6 +87,7 @@ from threegpp_ai.args import (
    WorkspaceNameArgument,
    WorkspaceNameOption,
    WorkspaceProcessForceOption,
    WorkspaceProcessSkipExistingOption,
    WorkspaceProcessVlmOption,
    WorkspaceReleaseOption,
)
@@ -431,6 +433,7 @@ async def _process_workspace_members(
    on_progress: Callable[[int, str], None] | None = None,
    checkout: bool = True,
    convert_md: bool = False,
    skip_existing: bool = False,
    vlm_options: VlmOptions | None = None,
    accelerator_config: AcceleratorConfig | None = None,
    profile: str | None = None,
@@ -449,6 +452,7 @@ async def _process_workspace_members(
        on_progress: Optional callback(completed_count, source_item_id) called after each member
        checkout: Whether to checkout documents if not available
        convert_md: Whether to extract markdown (implies PDF conversion)
        skip_existing: If True, skip extraction for components that already exist.
        vlm_options: Optional VLM features for extraction.
        accelerator_config: Optional accelerator settings for GPU/CPU and threading.
        profile: Extraction profile override.
@@ -524,6 +528,7 @@ async def _process_workspace_members(
                file_path,
                metadata=None,
                force=False,
                skip_existing=skip_existing,
                vlm_options=vlm_options,
                accelerator_config=accelerator_config,
                profile=profile,
@@ -948,8 +953,8 @@ def workspace_list_members(
        console.print(f"[red]Workspace '{workspace_name}' not found[/red]")
        raise typer.Exit(1)

    manager = typer.get_current_context().obj.path
    checkout_base = manager.checkout_dir
    manager = PathConfig()
    checkout_base = manager.cache_dir / manager.checkout_dirname

    member_rows = [
        {
@@ -1000,6 +1005,7 @@ def workspace_list_members(
def workspace_process(
    workspace: WorkspaceNameOption = None,
    force: WorkspaceProcessForceOption = False,
    skip_existing: WorkspaceProcessSkipExistingOption = False,
    limit: WorkspaceLimitOption = None,
    output_format: OutputFormatOption = OutputFormat.TABLE.value,
    checkout: WorkspaceCheckoutOption = True,
@@ -1064,6 +1070,7 @@ def workspace_process(
                on_progress=on_progress,
                checkout=checkout,
                convert_md=True,
                skip_existing=skip_existing,
                vlm_options=vlm_options,
                accelerator_config=accelerator_config,
                profile=profile,
@@ -1104,13 +1111,105 @@ def workspace_process(
@workspace_app.command("delete", help="Delete a workspace")
def workspace_delete(
    name: WorkspaceNameArgument,
    delete_artifacts: Annotated[
        bool,
        typer.Option(
            "--delete-artifacts",
            help="Also delete all .ai processing artifacts in the checkout folder for this workspace's members",
        ),
    ] = False,
) -> None:
    result = delete_workspace(name)
    if not result:
    deleted = delete_workspace(name, delete_artifacts=delete_artifacts)
    if not deleted:
        console.print(f"[red]Failed to delete workspace '{name}'[/red]")
        raise typer.Exit(1)
    console.print(f"[green]Deleted workspace: {name}[/green]")


@app.command("clear", help="Delete all .ai processing artifacts from checkout/cache folders")
def clear_artifacts(
    path: Annotated[
        Path | None,
        typer.Option(
            "--path",
            "-p",
            help="Path to clear .ai folders from. Defaults to checkout directory.",
        ),
    ] = None,
    workspace: Annotated[
        str | None,
        typer.Option(
            "--workspace",
            "-w",
            help="Clear only .ai folders for members of this workspace.",
        ),
    ] = None,
    dry_run: Annotated[
        bool,
        typer.Option(
            "--dry-run",
            help="Show what would be deleted without actually deleting.",
        ),
    ] = False,
) -> None:
    """Delete .ai processing artifacts from checkout folders.

    Use this to force re-extraction of documents.

    Without ``--workspace``, every ``.ai`` directory found recursively below
    ``--path`` (default: the checkout directory inside the cache directory) is
    cleared. With ``--workspace``, only the ``.ai`` directories belonging to the
    members of that workspace (active and inactive) are cleared and ``--path``
    is not consulted.

    With ``--dry-run`` nothing is removed in either mode; the command only
    reports how many items each ``.ai`` directory would account for.
    """
    if workspace is not None:
        # Restrict the operation to the .ai folders of this workspace's members.
        normalized = normalize_workspace_name(workspace)
        members = list_workspace_members(normalized, include_inactive=True)
        candidates = {_member_ai_dir(Path(member.source_path)) for member in members}
    else:
        if path is None:
            path_config = PathConfig()
            checkout_path = path_config.cache_dir / path_config.checkout_dirname
        else:
            checkout_path = path
        candidates = set(checkout_path.rglob(".ai"))

    # Only existing directories can be cleared; rglob may also match plain
    # files named ".ai", and member folders may never have been processed.
    ai_dirs = sorted(candidate for candidate in candidates if candidate.is_dir())
    if not ai_dirs:
        console.print("[yellow]No .ai folders found[/yellow]")
        return

    total_deleted = 0
    for ai_dir in ai_dirs:
        if dry_run:
            # Never touch the filesystem in dry-run mode, whichever mode is active.
            count = _count_ai_items(ai_dir)
            console.print(f"  {ai_dir}: {count} items (dry-run)")
        else:
            count = delete_ai_folder(ai_dir)
            if count > 0:
                console.print(f"  {ai_dir}: {count} items")
        total_deleted += count

    if dry_run:
        console.print(f"\n[yellow]Dry-run: would delete {total_deleted} items from {len(ai_dirs)} .ai folders[/yellow]")
    else:
        console.print(f"\n[green]Deleted {total_deleted} items from {len(ai_dirs)} .ai folders[/green]")


def _member_ai_dir(source: Path) -> Path:
    """Return the ``.ai`` directory that belongs to a workspace member's source path."""
    # Compare the name, not the suffix: pathlib reports an empty suffix for a
    # path whose final component is just ".ai".
    if source.name == ".ai":
        return source
    # A document folder keeps its artifacts in its own ".ai" subfolder; a
    # document file keeps them next to itself.
    if source.is_dir():
        return source / ".ai"
    return source.parent / ".ai"


def _count_ai_items(ai_dir: Path) -> int:
    """Count what deleting ``ai_dir`` would report: its files plus the folder itself."""
    return sum(1 for entry in ai_dir.rglob("*") if entry.is_file()) + 1


# Invoke the CLI application object when this module is executed directly.
if __name__ == "__main__":
    app()
+51 −11
Original line number Diff line number Diff line
@@ -135,13 +135,13 @@ def list_workspaces() -> list[WorkspaceRegistry]:
    return list(registry.workspaces.values())


def delete_workspace(workspace: str | None, preserve_artifacts: bool = True) -> bool:
    """Delete a workspace and keep legacy artifact cleanup as a compatibility flag.
def delete_workspace(workspace: str | None, *, delete_artifacts: bool = False) -> bool:
    """Delete a workspace and optionally delete all .ai artifacts for its members.

    Args:
        workspace: Workspace name to delete.
        preserve_artifacts: Reserved compatibility option. Legacy retrieval
            artifacts are no longer managed by this package.
        delete_artifacts: If True, delete all .ai processing folders for workspace members
            in the checkout directory.

    Returns:
        True if deleted, False if not found or if attempting to delete default.
@@ -151,6 +151,9 @@ def delete_workspace(workspace: str | None, preserve_artifacts: bool = True) ->
        _logger.warning("Cannot delete the default workspace")
        return False

    # Get members before deleting workspace
    members = list_workspace_members(normalized_workspace, include_inactive=True)

    registry = _get_registry()
    deleted = registry.delete_workspace(normalized_workspace)
    if not deleted:
@@ -158,13 +161,50 @@ def delete_workspace(workspace: str | None, preserve_artifacts: bool = True) ->

    registry.save()

    if not preserve_artifacts:
        _logger.warning("preserve_artifacts=False is deprecated and has no effect")
    # Delete .ai artifacts if requested
    if delete_artifacts and members:
        _logger.info("Deleting .ai artifacts for %d members", len(members))
        for member in members:
            source_path = Path(member.source_path)
            # Check if source_path is already the .ai folder (ends with .ai as directory)
            if source_path.name == ".ai":
                # Source path is the .ai folder itself
                delete_ai_folder(source_path)
            else:
                # Source path is the document folder - delete .ai subfolder
                delete_ai_folder(source_path / ".ai")

    _logger.info(f"Deleted workspace '{normalized_workspace}' (preserve_artifacts={preserve_artifacts})")
    _logger.info("Deleted workspace '%s' (delete_artifacts=%s)", normalized_workspace, delete_artifacts)
    return True


def delete_ai_folder(ai_path: Path) -> int:
    """Delete an .ai folder, including nested subfolders, and return the item count.

    Deletion is best-effort: filesystem errors (for example a permission
    problem) are logged as a warning and the count reached so far is returned
    instead of raising.

    Args:
        ai_path: Path to the .ai folder.

    Returns:
        Number of files removed, plus one for the .ai folder itself when it was
        removed successfully. ``0`` if ``ai_path`` is not an existing directory.
    """
    import os

    if not ai_path.is_dir():
        return 0

    count = 0
    try:
        # Walk bottom-up so every directory is already empty when it is removed.
        # Walking a listing per directory also avoids deleting entries while a
        # recursive glob is still iterating over them.
        for root, dirnames, filenames in os.walk(ai_path, topdown=False):
            root_path = Path(root)
            for filename in filenames:
                (root_path / filename).unlink()
                count += 1
            for dirname in dirnames:
                child = root_path / dirname
                if child.is_symlink():
                    # Symlinked directories are not descended into; drop the link only.
                    child.unlink()
                else:
                    child.rmdir()
        # Remove .ai folder itself
        ai_path.rmdir()
        count += 1
        _logger.debug("Deleted .ai folder: %s", ai_path)
    except OSError as e:
        _logger.warning("Failed to delete .ai folder %s: %s", ai_path, e)
    return count


def get_active_workspace() -> str:
    """Get the name of the active workspace."""
    registry = _get_registry()
@@ -461,8 +501,7 @@ def _checkout_tdoc_if_needed(tdoc_id: str, metadata: TDocMetadata, checkout_base
        # Check if folder has actual document files (not just .ai subfolder)
        # Recursively check for files, excluding the .ai/ subfolder which contains processing outputs
        has_files = any(
            f.is_file() and f.parent.name != ".ai" and not any(part == ".ai" for part in f.relative_to(existing_path).parts)
            for f in existing_path.rglob("*")
            f.is_file() and f.parent.name != ".ai" and not any(part == ".ai" for part in f.relative_to(existing_path).parts) for f in existing_path.rglob("*")
        )
        if has_files:
            _logger.info(f"TDoc {tdoc_id} already checked out at {existing_path}")
@@ -623,6 +662,7 @@ __all__ = [
    "checkout_tdoc_to_workspace",
    "create_workspace",
    "deactivate_workspace_member",
    "delete_ai_folder",
    "delete_workspace",
    "ensure_ai_subfolder",
    "ensure_default_workspace",