Commit 49a6798b authored by Jan Reimes
Browse files

fix(ai): correct workspace document allocation and checkout

- Add resolve_tdoc_checkout_path() to find TDocs in checkout structure
- Add checkout_tdoc_to_workspace() to checkout TDocs when adding to workspace
- Add ensure_ai_subfolder() to create .ai subfolder for processed outputs
- Update add-members CLI to checkout/download TDocs and store actual paths
- Update process_all to use source_path from workspace members
- Add --checkout/--no-checkout flag to add-members command
- Fix imports and linter issues

Workspace members now store actual checkout paths instead of just IDs,
and TDocs are automatically downloaded when added to a workspace
(unless --no-checkout is given).
parent d581e7ba
Loading
Loading
Loading
Loading
+6 −0
Original line number Diff line number Diff line
@@ -25,14 +25,17 @@ from tdoc_crawler.ai.operations.summarize import SummarizeResult, summarize_tdoc
from tdoc_crawler.ai.operations.workspaces import (
    DEFAULT_WORKSPACE,
    add_workspace_members,
    checkout_tdoc_to_workspace,
    create_workspace,
    delete_workspace,
    ensure_ai_subfolder,
    ensure_default_workspace,
    get_workspace,
    is_default_workspace,
    list_workspaces,
    make_workspace_member,
    normalize_workspace_name,
    resolve_tdoc_checkout_path,
    resolve_workspace,
)
from tdoc_crawler.ai.storage import AiStorage
@@ -100,9 +103,11 @@ __all__ = [
    "ProcessingStatus",
    "SummarizeResult",
    "add_workspace_members",
    "checkout_tdoc_to_workspace",
    "convert_tdoc",
    "create_workspace",
    "delete_workspace",
    "ensure_ai_subfolder",
    "ensure_default_workspace",
    "get_workspace",
    "is_default_workspace",
@@ -111,6 +116,7 @@ __all__ = [
    "normalize_workspace_name",
    "query_embeddings",
    "query_graph",
    "resolve_tdoc_checkout_path",
    "resolve_workspace",
    "summarize_tdoc",
]
+12 −3
Original line number Diff line number Diff line
@@ -6,6 +6,7 @@ import logging
import re
from collections.abc import Callable
from pathlib import Path
from typing import Any

from tdoc_crawler.ai.models import (
    DocumentClassification,
@@ -363,10 +364,12 @@ def process_all(
    storage = AiStorage(checkout_base / ".ai" / "lancedb")
    normalized_workspace = normalize_workspace_name(workspace)

    # Filter tdoc_ids by workspace members if applicable
    # Get workspace members and build a lookup map
    members_map: dict[str, Any] = {}
    if normalized_workspace != "default":
        members = storage.list_workspace_members(normalized_workspace)
        member_ids = {m.source_item_id for m in members if m.is_active and m.source_kind == "tdoc"}
        members_map = {m.source_item_id: m for m in members if m.is_active and m.source_kind == "tdoc"}
        tdoc_ids = [tid for tid in tdoc_ids if tid in member_ids]

    results: dict[str, ProcessingStatus] = {}
@@ -377,11 +380,17 @@ def process_all(
                logger.info(f"Skipping {tdoc_id} in new_only mode - already completed")
                continue

        folder_path = checkout_base / tdoc_id
        # Use source_path from workspace member if available, otherwise fallback to default
        member = members_map.get(tdoc_id)
        folder_path = (
            Path(member.source_path)
            if member and member.source_path and Path(member.source_path).exists()
            else checkout_base / tdoc_id
        )

        if not folder_path.exists():
            logger.warning(f"Checkout folder not found: {folder_path}")
            continue

        try:
            status = run_pipeline(
                tdoc_id,
+94 −0
Original line number Diff line number Diff line
@@ -7,6 +7,8 @@ from pathlib import Path
from typing import Any

from tdoc_crawler.ai.models import WorkspaceMember
from tdoc_crawler.tdocs.operations.checkout import checkout_tdoc
from tdoc_crawler.tdocs.sources.whatthespec import resolve_via_whatthespec

DEFAULT_WORKSPACE = "default"

@@ -128,16 +130,108 @@ def make_workspace_member(
    )


def resolve_tdoc_checkout_path(tdoc_id: str, checkout_base: Path) -> Path | None:
    """Resolve a TDoc ID to its checkout path.

    The checkout path follows the 3GPP FTP structure:
    <checkout_base>/<group>/<meeting>/Docs/<tdoc_id>/

    Args:
        tdoc_id: TDoc identifier (e.g., "S4-251971")
        checkout_base: Base checkout directory

    Returns:
        Path to the TDoc checkout folder if found, None otherwise
    """
    # Search for the TDoc in the checkout directory
    # The structure is: <checkout_base>/*/*/Docs/<tdoc_id>/
    for docs_dir in checkout_base.rglob("Docs"):
        tdoc_path = docs_dir / tdoc_id
        if tdoc_path.exists() and tdoc_path.is_dir():
            return tdoc_path
    return None


def checkout_tdoc_to_workspace(
    tdoc_id: str,
    checkout_base: Path,
    storage: Any,
    workspace: str | None,
) -> Path | None:
    """Ensure a TDoc is checked out and return its checkout folder.

    Despite the name, this function does not register the TDoc as a
    workspace member: it only locates an existing checkout or performs a
    new one. Creating the member record is left to the caller.

    This is a best-effort operation: every failure is logged and reported
    as ``None`` rather than raised.

    Args:
        tdoc_id: TDoc identifier
        checkout_base: Base checkout directory
        storage: AiStorage instance (currently unused by this function)
        workspace: Workspace name (currently unused by this function)

    Returns:
        Path to the checked out TDoc folder, or None if checkout failed
    """
    # Reuse an existing checkout so no metadata lookup or download is needed.
    existing_path = resolve_tdoc_checkout_path(tdoc_id, checkout_base)
    if existing_path:
        _logger.debug(f"TDoc {tdoc_id} already checked out at {existing_path}")
        return existing_path

    try:
        # Resolve TDoc metadata
        metadata = resolve_via_whatthespec(tdoc_id)
        if not metadata:
            _logger.warning(f"Could not resolve TDoc {tdoc_id}")
            return None

        # Checkout the TDoc
        result = checkout_tdoc(metadata, checkout_base)
        if result.error:
            _logger.warning(f"Failed to checkout TDoc {tdoc_id}: {result.error}")
            return None

        # Prefer the path reported by the checkout itself
        checkout_path = result.checkout_path
        if checkout_path and checkout_path.exists():
            _logger.info(f"Checked out TDoc {tdoc_id} to {checkout_path}")
            return checkout_path

        # Fall back to searching the tree if no usable path was reported
        return resolve_tdoc_checkout_path(tdoc_id, checkout_base)

    except Exception as e:
        # Deliberate catch-all: a failed checkout must not abort the caller.
        _logger.warning(f"Error checking out TDoc {tdoc_id}: {e}")
        # Keep the traceback available for diagnosis without adding noise to
        # normal output.
        _logger.debug("Traceback for failed checkout of TDoc %s", tdoc_id, exc_info=True)
        return None


def ensure_ai_subfolder(checkout_path: Path) -> Path:
    """Create (if needed) and return the ``.ai`` folder of a checkout.

    Missing parent directories are created as well, and an already existing
    ``.ai`` directory is left untouched.

    Args:
        checkout_path: Path to the checked out document folder

    Returns:
        Path to the .ai subfolder
    """
    outputs_dir = checkout_path / ".ai"
    outputs_dir.mkdir(exist_ok=True, parents=True)
    return outputs_dir



# Public API of this module: the names exposed by a wildcard import.
__all__ = [
    "DEFAULT_WORKSPACE",
    "add_workspace_members",
    "checkout_tdoc_to_workspace",
    "create_workspace",
    "delete_workspace",
    "ensure_ai_subfolder",
    "ensure_default_workspace",
    "get_workspace",
    "is_default_workspace",
    "list_workspaces",
    "make_workspace_member",
    "normalize_workspace_name",
    "resolve_tdoc_checkout_path",
    "resolve_workspace",
]
+22 −3
Original line number Diff line number Diff line
@@ -12,9 +12,11 @@ from rich.table import Table

from tdoc_crawler.ai import (
    AiStorage,
    checkout_tdoc_to_workspace,
    convert_tdoc,
    create_workspace,
    delete_workspace,
    ensure_ai_subfolder,
    list_workspaces,
    make_workspace_member,
    normalize_workspace_name,
@@ -310,7 +312,8 @@ def workspace_deactivate(
def workspace_add_members(
    workspace: Annotated[str | None, typer.Option("--workspace", "-w", help="Workspace name")] = None,
    items: Annotated[list[str], typer.Argument(..., help="Source item IDs to add")] = None,  # type: ignore[assignment]
    kind: Annotated[str, typer.Option("--kind", help="Source kind (tdoc, spec, other)")] = "tdoc",
    kind: Annotated[str, typer.Option("--kind", help="Source kind (tdoc, spec, other)"),] = "tdoc",
    checkout: Annotated[bool, typer.Option("--checkout/--no-checkout", help="Checkout/download TDocs if not present")] = True,
    json_output: Annotated[bool, typer.Option("--json", help="Output as JSON")] = False,
) -> None:
    """Add source items to a workspace."""
@@ -320,7 +323,24 @@ def workspace_add_members(
    storage = AiStorage(manager.root / ".ai" / "lancedb")

    source_kind = SourceKind(kind.lower()) if kind.lower() in [e.value for e in SourceKind] else SourceKind.OTHER
    members = [make_workspace_member(workspace, item, item, source_kind) for item in items]

    # Build members with actual paths
    members = []
    checkout_base = manager.root / "checkout"

    for item in items:
        source_path = item

        # For TDocs/specs, try to resolve/checkout to get actual path
        if checkout and source_kind in (SourceKind.TDOC, SourceKind.SPEC):
            checkout_path = checkout_tdoc_to_workspace(item, checkout_base, storage, workspace)
            if checkout_path:
                source_path = str(checkout_path)
                # Ensure .ai subfolder exists
                ensure_ai_subfolder(checkout_path)

        members.append(make_workspace_member(workspace, item, source_path, source_kind))

    count = storage.add_workspace_members(workspace, members)

    if json_output:
@@ -328,7 +348,6 @@ def workspace_add_members(
    else:
        console.print(f"[green]Added {count} member(s) to workspace '{normalize_workspace_name(workspace)}'[/green]")


@_workspace_app.command("list-members")
def workspace_list_members(
    workspace: Annotated[str | None, typer.Option("--workspace", "-w", help="Workspace name")] = None,