Commit e06a1ea7 authored by Jan Reimes
Browse files

refactor(3gpp-ai): extract workspaces module into subpackage

- Split monolith workspaces.py (658 lines) into workspaces/ subpackage:
  crud.py (workspace CRUD), members.py (member management),
  utils.py (shared helpers, checkout functions)
- Add TypedDict definitions for type-safe dict returns
- Replace Any with WorkspaceMember, TDocMetadata, ProcessingResultDict
- Fix import: take is_default_workspace/normalize_workspace_name from workspace_names instead of workspaces
- Format _workspace_commands.py line wrapping
parent cff8ac02
Loading
Loading
Loading
Loading
+4 −2
Original line number Diff line number Diff line
@@ -12,6 +12,10 @@ from threegpp_ai.models import SourceKind, SummarizeResult, WorkspaceNotFoundErr
from threegpp_ai.operations.convert import convert_tdoc_metadata as convert_document
from threegpp_ai.operations.convert import convert_tdoc_to_markdown
from threegpp_ai.operations.summarize import summarize_tdoc as summarize_document
from threegpp_ai.operations.workspace_names import (
    is_default_workspace,
    normalize_workspace_name,
)
from threegpp_ai.operations.workspace_registry import (
    DEFAULT_WORKSPACE,
    WorkspaceRegistry,
@@ -29,11 +33,9 @@ from threegpp_ai.operations.workspaces import (
    ensure_default_workspace,
    get_workspace,
    get_workspace_member_counts,
    is_default_workspace,
    list_workspace_members,
    list_workspaces,
    make_workspace_member,
    normalize_workspace_name,
    remove_invalid_members,
    resolve_spec_release_from_db,
    resolve_tdoc_checkout_path,
+1 −1
Original line number Diff line number Diff line
@@ -3,6 +3,7 @@
from __future__ import annotations

from pathlib import Path
from typing import Any

import typer
from rich.console import Console
@@ -11,7 +12,6 @@ from rich.progress import (
    MofNCompleteColumn,
    Progress,
    SpinnerColumn,
    Task,
    TextColumn,
    TimeElapsedColumn,
)
+56 −36
Original line number Diff line number Diff line
@@ -10,13 +10,13 @@ import asyncio
from collections.abc import Callable
from datetime import UTC, datetime
from pathlib import Path
from typing import Any
from typing import TypedDict

from tdoc_crawler.config.settings import PathConfig
from tdoc_crawler.database import TDocDatabase
from tdoc_crawler.logging import get_logger
from tdoc_crawler.models.base import OutputFormat, SortOrder
from tdoc_crawler.tdocs.models import TDocQueryConfig
from tdoc_crawler.tdocs.models import TDocMetadata, TDocQueryConfig
from tdoc_crawler.utils.date_parser import parse_partial_date

from threegpp_ai import SourceKind
@@ -24,6 +24,7 @@ from threegpp_ai.operations.checkout import checkout_single_item as _checkout_si
from threegpp_ai.operations.classify import pick_main_document
from threegpp_ai.operations.conversion import OFFICE_FORMATS, convert_to_pdf
from threegpp_ai.operations.convert import extract_document_structured_from_tdoc
from threegpp_ai.operations.workspace_registry import WorkspaceMember
from threegpp_ai.operations.workspaces import (
    checkout_spec_to_workspace,
    checkout_tdoc_to_workspace,
@@ -36,6 +37,17 @@ from ._shared import create_progress_bar
_logger = get_logger(__name__)


class ProcessingResultDict(TypedDict, total=False):
    """Dictionary representing the result of processing a workspace member."""

    source_item_id: str
    status: str
    reason: str
    pdf_path: Path
    markdown_path: Path
    error: str


async def checkout_single_item(
    *,
    item: str,
@@ -44,7 +56,7 @@ async def checkout_single_item(
    checkout: bool,
    release: str | None,
    path_config: PathConfig,
) -> tuple[Any | None, str | None]:
) -> tuple[WorkspaceMember | None, str | None]:
    """Checkout a single workspace item and return a member record.

    Delegates to :func:`threegpp_ai.operations.checkout.checkout_single_item`.
@@ -90,7 +102,7 @@ def resolve_process_file(path: Path) -> Path | None:
    return None


def convert_member_to_pdf(member: Any) -> Path | None:
def convert_member_to_pdf(member: WorkspaceMember) -> Path | None:
    """Convert a workspace member's document to PDF if it's an office format.

    Args:
@@ -124,13 +136,13 @@ def convert_member_to_pdf(member: Any) -> Path | None:

async def process_workspace_members(
    workspace: str,
    members: list[Any],
    members: list[WorkspaceMember],
    *,
    on_progress: Callable[[int, str], None] | None = None,
    checkout: bool = True,
    convert_md: bool = False,
    skip_existing: bool = False,
) -> list[dict[str, Any]]:
) -> list[ProcessingResultDict]:
    """Process workspace members with optional progress callback.

    Args:
@@ -144,7 +156,7 @@ async def process_workspace_members(
    Returns:
        List of processing results.
    """
    results: list[dict[str, Any]] = []
    results: list[ProcessingResultDict] = []
    path_config = PathConfig()

    for member in members:
@@ -171,11 +183,13 @@ async def process_workspace_members(
                file_path = resolve_process_file(checkout_path)

        if file_path is None or not file_path.exists():
            results.append({
            results.append(
                {
                    "source_item_id": member.source_item_id,
                    "status": "skipped",
                    "reason": "path or supported file not found",
            })
                }
            )
            if on_progress:
                on_progress(len(results), member.source_item_id)
            continue
@@ -187,34 +201,40 @@ async def process_workspace_members(
                if pdf_path is not None:
                    file_path = pdf_path
                elif doc_file.suffix.lower() not in {".pdf", ".txt", ".md"}:
                    results.append({
                    results.append(
                        {
                            "source_item_id": member.source_item_id,
                            "status": "skipped",
                            "reason": "office document could not be converted to PDF",
                    })
                        }
                    )
                    if on_progress:
                        on_progress(len(results), member.source_item_id)
                    continue

        try:
            extract_document_structured_from_tdoc(member.source_item_id, force=False)
            results.append({
            results.append(
                {
                    "source_item_id": member.source_item_id,
                    "file": str(file_path),
                    "status": "success",
                    "chars_extracted": 0,
                    "reason": None,
                    "error": None,
            })
                }
            )
        except Exception as exc:
            results.append({
            results.append(
                {
                    "source_item_id": member.source_item_id,
                    "file": str(file_path),
                    "status": "error",
                    "chars_extracted": 0,
                    "reason": "extraction_failed",
                    "error": str(exc),
            })
                }
            )

        if on_progress:
            on_progress(len(results), member.source_item_id)
@@ -282,7 +302,7 @@ def resolve_workspace_items(
        order=SortOrder.DESC,
    )

    async def _fetch_tdocs() -> list[Any]:
    async def _fetch_tdocs() -> list[TDocMetadata]:
        async with TDocDatabase(manager.db_file) as db:
            return await db.query_tdocs(config)

@@ -297,7 +317,7 @@ async def checkout_items(
    source_kind: SourceKind,
    checkout: bool,
    release: str | None,
) -> tuple[list[Any], list[tuple[str, str]]]:
) -> tuple[list[WorkspaceMember], list[tuple[str, str]]]:
    """Checkout items and return member records for workspace registration.

    Args:
@@ -311,7 +331,7 @@ async def checkout_items(
        Tuple of (members, skipped_items).
    """
    manager = PathConfig()
    members: list[Any] = []
    members: list[WorkspaceMember] = []
    skipped: list[tuple[str, str]] = []

    with create_progress_bar(
+19 −15
Original line number Diff line number Diff line
@@ -288,13 +288,15 @@ def _resolve_and_checkout(
        return

    console.print(f"[green]Checking out {len(resolved_items)} item(s) to workspace '{workspace_name}'...[/green]")
    members, skipped = asyncio.run(checkout_items(
    members, skipped = asyncio.run(
        checkout_items(
            resolved_items,
            workspace_name=workspace_name,
            source_kind=source_kind,
            checkout=checkout,
            release=release,
    ))
        )
    )

    _show_checkout_results(members, skipped, output)

@@ -334,14 +336,16 @@ def workspace_process(
    def _on_progress(completed: int, item_id: str) -> None:
        console.print(f"[dim]{completed}/{len(members)}: {item_id}[/dim]")

    results = asyncio.run(process_workspace_members(
    results = asyncio.run(
        process_workspace_members(
            normalized,
            members,
            on_progress=_on_progress,
            checkout=True,
            convert_md=force,
            skip_existing=skip_existing,
    ))
        )
    )

    success = [r for r in results if r["status"] == "success"]
    errors = [r for r in results if r["status"] == "error"]
+2 −2
Original line number Diff line number Diff line
@@ -12,7 +12,6 @@ belongs in the `workspace process` command exclusively.
from __future__ import annotations

from pathlib import Path
from typing import Any

from tdoc_crawler.config.settings import PathConfig
from tdoc_crawler.logging import get_logger
@@ -25,6 +24,7 @@ from threegpp_ai import (
    make_workspace_member,
    resolve_spec_release_from_db,
)
from threegpp_ai.operations.workspace_registry import WorkspaceMember

_logger = get_logger(__name__)

@@ -37,7 +37,7 @@ async def checkout_single_item(
    checkout: bool,
    release: str | None,
    path_config: PathConfig,
) -> tuple[Any | None, str | None]:
) -> tuple[WorkspaceMember | None, str | None]:
    """Checkout a single workspace item and create a member record.

    Downloads the document if checkout is enabled, sets up the .ai subfolder,
Loading