Commit 62f844f7 authored by Jan Reimes's avatar Jan Reimes
Browse files

feat(workspace): add PDF conversion option and improve workspace deletion

* Add ConvertPdfOption to args.py for optional PDF conversion during add-members.
* Implement _process_single_item function in cli.py to handle PDF conversion.
* Enhance workspace_add_members function to support PDF conversion.
* Update delete_workspace function in workspaces.py to optionally remove LightRAG artifacts.
parent 5b7e6649
Loading
Loading
Loading
Loading
+8 −1
Original line number Diff line number Diff line
@@ -41,6 +41,10 @@ WorkspaceAutoBuildOption = Annotated[
WorkspaceItemsArgument = Annotated[list[str] | None, typer.Argument(help="Source item IDs to add (optional if filters provided)")]
WorkspaceKindOption = Annotated[str, typer.Option("--kind", help="Source kind (tdoc, spec, other)")]
WorkspaceCheckoutOption = Annotated[bool, typer.Option("--checkout/--no-checkout", help="Checkout/download documents if not present")]
# Boolean flag controlling whether office documents are converted to PDF during
# the add-members command; may also be set via the TDC_AI_CONVERT_PDF env var.
ConvertPdfOption = Annotated[
    bool,
    typer.Option("--convert-pdf/--no-convert-pdf", "-cp", help="Convert office documents to PDF during add-members", envvar="TDC_AI_CONVERT_PDF"),
]
WorkspaceReleaseOption = Annotated[
    str | None,
    typer.Option("--release", help="Spec release version (e.g., 16.3.0, 17.0.0). Only applies to specs."),
@@ -50,7 +54,10 @@ WorkspaceIncludeInactiveOption = Annotated[bool, typer.Option("--include-inactiv
WorkspaceProcessForceOption = Annotated[bool, typer.Option("--force", help="Force reprocessing of all members")]
WorkspacePreserveArtifactsOption = Annotated[
    bool,
    typer.Option("--preserve-artifacts/--delete-artifacts", help="Preserve LightRAG artifacts"),
    typer.Option(
        "--preserve-artifacts/--delete-artifacts",
        help="Preserve LightRAG artifacts (embeddings, index). --delete-artifacts removes only LightRAG data, not document artifacts (.ai folders)",
    ),
]

# Filter options for workspace add-members
+178 −51
Original line number Diff line number Diff line
@@ -51,6 +51,7 @@ from threegpp_ai.args import (
    ConvertDocumentArgument,
    ConvertForceOption,
    ConvertOutputOption,
    ConvertPdfOption,
    EndDateOption,
    JsonOutputOption,
    SourcePatternExcludeOption,
@@ -77,7 +78,7 @@ from threegpp_ai.args import (
from threegpp_ai.lightrag.cli import app as rag_app
from threegpp_ai.lightrag.config import LightRAGConfig
from threegpp_ai.lightrag.metadata import RAGMetadata
from threegpp_ai.lightrag.processor import TDocProcessor
from threegpp_ai.lightrag.processor import OFFICE_FORMATS, TDocProcessor
from threegpp_ai.operations.workspace_registry import WorkspaceRegistry

app = typer.Typer(help="3GPP AI - Document Processing and RAG")
@@ -165,27 +166,40 @@ def _resolve_spec_release(item: str, release: str) -> str:
    return resolved


def _build_workspace_members(
def _process_single_item(
    *,
    item: str,
    workspace: str,
    items: list[str],
    source_kind: SourceKind,
    checkout: bool,
    release: str | None,
) -> tuple[list[Any], list[tuple[str, str]]]:
    manager = resolve_cache_manager()
    members: list[Any] = []
    skipped: list[tuple[str, str]] = []
    convert_pdf: bool,
    manager: CacheManager,
) -> tuple[Any | None, str | None, bool]:
    """Process a single workspace item (checkout + optional PDF conversion).

    for item in items:
    Args:
        item: Item ID to process
        workspace: Workspace name
        source_kind: Type of source (TDOC, SPEC, OTHER)
        checkout: Whether to checkout documents
        release: Spec release version
        convert_pdf: Whether to convert to PDF
        manager: CacheManager for paths

    Returns:
        Tuple of (member, skip_reason, was_converted)
        - member: WorkspaceMember if successful, None if skipped
        - skip_reason: Reason if skipped, None if successful
        - was_converted: True if PDF conversion was performed
    """
    source_path = item
    if checkout:
        checkout_path = None
        if source_kind == SourceKind.TDOC:
            checkout_path = checkout_tdoc_to_workspace(item, manager.checkout_dir, workspace, db_file=manager.db_file)
            if checkout_path is None:
                    skipped.append((item, "TDoc not found in database or meeting not crawled"))
                    continue
                return None, "TDoc not found in database or meeting not crawled", False
        elif source_kind == SourceKind.SPEC:
            checkout_path = checkout_spec_to_workspace(
                item,
@@ -195,16 +209,51 @@ def _build_workspace_members(
                db_file=manager.db_file,
            )
            if checkout_path is None:
                    skipped.append((item, "Spec not found in database"))
                    continue
                return None, "Spec not found in database", False

        if checkout_path is not None:
            source_path = str(checkout_path)
            ensure_ai_subfolder(checkout_path)

    # Optional PDF conversion
    was_converted = False
    if convert_pdf:
        member_for_convert = make_workspace_member(workspace, item, source_path, source_kind)
        pdf_path = _convert_member_to_pdf(member_for_convert)
        was_converted = pdf_path is not None

    resolved_release = _resolve_spec_release(item, release) if source_kind == SourceKind.SPEC and release else None
    source_item_id = f"{item}-REL{resolved_release}" if resolved_release else item
        members.append(make_workspace_member(workspace, source_item_id, source_path, source_kind))
    member = make_workspace_member(workspace, source_item_id, source_path, source_kind)
    return member, None, was_converted


def _build_workspace_members(
    *,
    workspace: str,
    items: list[str],
    source_kind: SourceKind,
    checkout: bool,
    release: str | None,
) -> tuple[list[Any], list[tuple[str, str]]]:
    """Build workspace members for *items*, collecting any skipped entries.

    Per-item work (checkout, release resolution) is delegated to
    _process_single_item with PDF conversion disabled — the CLI layer
    performs conversion itself so it can report progress.

    Returns:
        Tuple of (members, skipped), where skipped holds (item, reason) pairs.
    """
    cache = resolve_cache_manager()
    built: list[Any] = []
    rejected: list[tuple[str, str]] = []

    for entry in items:
        member, reason, _was_converted = _process_single_item(
            item=entry,
            workspace=workspace,
            source_kind=source_kind,
            checkout=checkout,
            release=release,
            convert_pdf=False,  # Conversion handled separately in CLI
            manager=cache,
        )
        if reason:
            rejected.append((entry, reason))
        else:
            built.append(member)

    return built, rejected

@@ -222,6 +271,39 @@ def _resolve_process_file(path: Path) -> Path | None:
    return None


def _convert_member_to_pdf(member: Any) -> Path | None:
    """Convert a workspace member's document to PDF if it's an office format.

    Args:
        member: WorkspaceMember with source_path to the document.

    Returns:
        Path to converted PDF in .ai subfolder, or None if conversion not needed/failed.
    """
    # Locate the actual document file under the member's source path.
    doc_file = _resolve_process_file(Path(member.source_path))
    if doc_file is None:
        _logger.debug("No document file found for %s", member.source_item_id)
        return None

    # Only office-format documents need conversion.
    if doc_file.suffix.lower() not in OFFICE_FORMATS:
        _logger.debug("Document %s is not an office format, skipping PDF conversion", doc_file.name)
        return None

    # Reuse a previously converted PDF from the .ai subfolder when available.
    candidate = ensure_ai_subfolder(doc_file.parent) / f"{doc_file.stem}.pdf"
    if candidate.exists():
        _logger.debug("PDF already exists for %s at %s", doc_file.name, candidate)
        return candidate

    # NOTE(review): relies on a private TDocProcessor method — consider
    # exposing a public conversion API on TDocProcessor instead.
    return TDocProcessor._convert_to_pdf(doc_file)


def _try_build_tdoc_metadata(source_item_id: str) -> RAGMetadata | None:
    if not source_item_id.startswith(("S", "R", "C", "T")):
        return None
@@ -468,6 +550,7 @@ def workspace_add_members(
    items: WorkspaceItemsArgument = None,
    kind: WorkspaceKindOption = "tdoc",
    checkout: WorkspaceCheckoutOption = True,
    convert_pdf: ConvertPdfOption = False,
    release: WorkspaceReleaseOption = None,
    start_date: StartDateOption = None,
    end_date: EndDateOption = None,
@@ -483,8 +566,19 @@ def workspace_add_members(
    kind_normalized = kind.lower().rstrip("s")
    source_kind = SourceKind(kind_normalized) if kind_normalized in {entry.value for entry in SourceKind} else SourceKind.OTHER

    # Phase 1: Resolve items - either directly provided or via database query
    if items is not None:
        resolved_items = items
    else:
        # Database query can be slow - show spinner only
        with Progress(
            SpinnerColumn(),
            TextColumn("[progress.description]{task.description}"),
            console=console,
        ) as progress:
            task = progress.add_task("[cyan]Querying database...", total=None)
            resolved_items = _resolve_workspace_items(
        items=items,
                items=None,
                source_kind=source_kind,
                start_date=start_date,
                end_date=end_date,
@@ -500,19 +594,52 @@ def workspace_add_members(
            console.print("[yellow]No items match the provided filters[/yellow]")
            return

    members, skipped = _build_workspace_members(
    # Phase 2: Checkout + optional PDF conversion - single progress bar, one tick per document
    manager = resolve_cache_manager()
    members: list[Any] = []
    skipped: list[tuple[str, str]] = []
    converted_count = 0

    with Progress(
        SpinnerColumn(),
        TextColumn("[progress.description]{task.description}"),
        MofNCompleteColumn(),
        TimeElapsedColumn(),
        console=console,
    ) as progress:
        task = progress.add_task(
            f"[cyan]Processing {len(resolved_items)} item(s)...",
            total=len(resolved_items),
        )
        for item in resolved_items:
            member, skip_reason, was_converted = _process_single_item(
                item=item,
                workspace=workspace_name,
        items=resolved_items,
                source_kind=source_kind,
                checkout=checkout,
                release=release,
                convert_pdf=convert_pdf,
                manager=manager,
            )
            if skip_reason:
                skipped.append((item, skip_reason))
                progress.update(task, advance=1, description=f"[cyan]{item} (skipped)")
            else:
                members.append(member)
                if was_converted:
                    converted_count += 1
                    progress.update(task, advance=1, description=f"[cyan]{item} (converted)")
                else:
                    progress.update(task, advance=1, description=f"[cyan]{item}")

    if skipped:
        console.print("\n[yellow]Skipped invalid items:[/yellow]")
        for item_id, reason in skipped:
            console.print(f"  - {item_id}: {reason}")

    if converted_count > 0:
        console.print(f"[green]Converted {converted_count} document(s) to PDF[/green]")

    added = add_workspace_members(workspace_name, members)
    console.print(f"[green]Added {added} member(s) to '{workspace_name}'[/green]")

+3 −2
Original line number Diff line number Diff line
@@ -311,7 +311,8 @@ class TDocRAG:
                kwargs["embed_model"] = model_name
            return kwargs
        if provider in ("openai", "zhipu"):
            return {"api_key": config.api_key, "api_base": config.api_base}
            # LightRAG's openai_complete/zhipu_complete expect base_url, not api_base
            return {"api_key": config.api_key, "base_url": config.api_base}
        if provider == "jina":
            return {"api_key": config.api_key}
        return {}
+31 −12
Original line number Diff line number Diff line
@@ -3,6 +3,8 @@
from __future__ import annotations

import logging
import os
import shutil
from pathlib import Path
from typing import Any

@@ -99,17 +101,16 @@ def list_workspaces() -> list[WorkspaceRegistry]:


def delete_workspace(workspace: str | None, preserve_artifacts: bool = True) -> bool:
    """Delete a workspace while preserving artifacts from other workspaces.
    """Delete a workspace and optionally remove its LightRAG artifacts.

    Note: Artifacts in LanceDB are now scoped by workspace name in the document_id.
    When a workspace is deleted, its artifacts become orphaned (not associated with
    any active workspace) but are preserved.
    When preserve_artifacts=False, only deletes LightRAG embeddings and workspace index
    from ~/.3gpp-crawler/lightrag/{model}/{workspace}/. Does NOT delete document artifacts
    (PDFs, markdown) in the .ai subfolder of each document checkout.

    Args:
        workspace: Workspace name to delete.
        preserve_artifacts: If True (default), keep artifacts. If False, also delete
                          artifacts for this workspace (not implemented - artifacts are
                          always preserved to prevent data loss).
        preserve_artifacts: If True (default), keep LightRAG artifacts. If False, delete
                          embeddings and workspace index for this workspace only.

    Returns:
        True if deleted, False if not found or if attempting to delete default.
@@ -121,11 +122,29 @@ def delete_workspace(workspace: str | None, preserve_artifacts: bool = True) ->

    registry = _get_registry()
    deleted = registry.delete_workspace(normalized_workspace)
    if deleted:
    if not deleted:
        return False

    registry.save()
        _logger.info(f"Deleted workspace '{normalized_workspace}' (artifacts preserved)")

    # Delete LightRAG artifacts if requested
    if not preserve_artifacts:
        try:
            # Get embedding model from environment to locate artifacts
            embedding_model = os.getenv("TDC_AI_EMBEDDING_MODEL", "sentence-transformers/all-MiniLM-L6-v2")
            manager = resolve_cache_manager()
            workspace_artifacts_dir = manager.ai_embed_dir(embedding_model) / normalized_workspace

            if workspace_artifacts_dir.exists():
                shutil.rmtree(workspace_artifacts_dir)
                _logger.info(f"Deleted LightRAG artifacts for '{normalized_workspace}' from {workspace_artifacts_dir}")
            else:
                _logger.debug(f"No LightRAG artifacts found for '{normalized_workspace}'")
        except Exception as e:
            _logger.warning(f"Failed to delete LightRAG artifacts for '{normalized_workspace}': {e}")

    _logger.info(f"Deleted workspace '{normalized_workspace}' (preserve_artifacts={preserve_artifacts})")
    return True
    return False


def get_active_workspace() -> str: