Commit 7ddba15d authored by Jan Reimes
Browse files

feat(workspace): implement workspace registry management

* Introduce WorkspaceRegistry and WorkspaceMetadata classes for managing workspaces.
* Store workspace metadata in a JSON file at ~/.tdoc-crawler/.ai/workspaces.json.
* Add functions to create, delete, and list workspaces, as well as set and get the active workspace.
* Update existing functions to utilize the new workspace registry.
* Refactor related code in storage and CLI to accommodate document_id instead of tdoc_id.
* Enhance document summarization and processing functions to align with workspace management.
parent 54ae80bf
Loading
Loading
Loading
Loading
+19 −13
Original line number | Diff line number | Diff line
@@ -15,13 +15,15 @@ from tdoc_crawler.ai.models import (
    PipelineStage,
    ProcessingStatus,
)
from tdoc_crawler.ai.operations.convert import convert_tdoc
from tdoc_crawler.ai.operations.convert import convert_tdoc as convert_document
from tdoc_crawler.ai.operations.embeddings import query_embeddings as _query_embeddings
from tdoc_crawler.ai.operations.graph import query_graph as _query_graph
from tdoc_crawler.ai.operations.pipeline import get_status as _pipeline_get_status_impl
from tdoc_crawler.ai.operations.pipeline import process_all as _pipeline_process_all_impl
from tdoc_crawler.ai.operations.pipeline import process_tdoc as _pipeline_process_tdoc_impl
from tdoc_crawler.ai.operations.summarize import SummarizeResult, summarize_tdoc
from tdoc_crawler.ai.operations.pipeline import process_tdoc as process_document
from tdoc_crawler.ai.operations.summarize import SummarizeResult
from tdoc_crawler.ai.operations.summarize import summarize_tdoc as summarize_document
from tdoc_crawler.ai.operations.workspaces import (
    DEFAULT_WORKSPACE,
    add_workspace_members,
@@ -31,6 +33,7 @@ from tdoc_crawler.ai.operations.workspaces import (
    delete_workspace,
    ensure_ai_subfolder,
    ensure_default_workspace,
    get_active_workspace,
    get_workspace,
    is_default_workspace,
    list_workspaces,
@@ -38,30 +41,31 @@ from tdoc_crawler.ai.operations.workspaces import (
    normalize_workspace_name,
    resolve_tdoc_checkout_path,
    resolve_workspace,
    set_active_workspace,
)
from tdoc_crawler.ai.storage import AiStorage
from tdoc_crawler.config import CacheManager


def _pipeline_get_status(tdoc_id: str, workspace: str) -> ProcessingStatus | None:
def _pipeline_get_status(document_id: str, workspace: str) -> ProcessingStatus | None:
    """Get processing status for a TDoc."""
    return _pipeline_get_status_impl(tdoc_id, workspace=workspace)
    return _pipeline_get_status_impl(document_id, workspace=workspace)


def get_status(tdoc_id: str, workspace: str | None = None) -> ProcessingStatus | None:
def get_status(document_id: str, workspace: str | None = None) -> ProcessingStatus | None:
    """Get processing status for a TDoc."""
    return _pipeline_get_status_impl(tdoc_id, workspace=workspace)
    return _pipeline_get_status_impl(document_id, workspace=workspace)


def process_tdoc(
    tdoc_id: str,
    document_id: str,
    checkout_path: Path,
    force_rerun: bool = False,
    workspace: str | None = None,
) -> ProcessingStatus:
    """Process a single TDoc through the AI pipeline."""
    return _pipeline_process_tdoc_impl(
        tdoc_id,
        document_id,
        checkout_path,
        force_rerun=force_rerun,
        workspace=workspace,
@@ -69,7 +73,7 @@ def process_tdoc(


def process_all(
    tdoc_ids: list[str],
    document_ids: list[str],
    checkout_base: Path,
    new_only: bool = False,
    force_rerun: bool = False,
@@ -78,7 +82,7 @@ def process_all(
) -> dict[str, ProcessingStatus]:
    """Process multiple TDocs through the AI pipeline."""
    return _pipeline_process_all_impl(
        tdoc_ids,
        document_ids,
        checkout_base,
        new_only=new_only,
        force_rerun=force_rerun,
@@ -121,11 +125,12 @@ __all__ = [
    "add_workspace_members",
    "checkout_spec_to_workspace",
    "checkout_tdoc_to_workspace",
    "convert_tdoc",
    "convert_document",
    "create_workspace",
    "delete_workspace",
    "ensure_ai_subfolder",
    "ensure_default_workspace",
    "get_active_workspace",
    "get_status",
    "get_workspace",
    "is_default_workspace",
@@ -133,10 +138,11 @@ __all__ = [
    "make_workspace_member",
    "normalize_workspace_name",
    "process_all",
    "process_tdoc",
    "process_document",
    "query_embeddings",
    "query_graph",
    "resolve_tdoc_checkout_path",
    "resolve_workspace",
    "summarize_tdoc",
    "set_active_workspace",
    "summarize_document",
]
+2 −2
Original line number | Diff line number | Diff line
@@ -140,9 +140,9 @@ class AiConfig(BaseConfigModel):
    def _resolve_paths(self) -> AiConfig:
        if self.ai_store_path is None:
            # Include embedding model in path to avoid dimension conflicts
            # e.g., ~/.tdoc-crawler/.ai/lancedb/sentence-transformers/all-MiniLM-L6-v2
            # e.g., ~/.tdoc-crawler/.ai/sentence-transformers/all-MiniLM-L6-v2
            # Keep slash to group models by provider
            self.ai_store_path = self.cache_dir / ".ai" / "lancedb" / self.embedding_model
            self.ai_store_path = self.cache_dir / ".ai" / self.embedding_model
        return self

    @model_validator(mode="after")
+23 −23
Original line number | Diff line number | Diff line
@@ -13,7 +13,7 @@ from pydantic import BaseModel, Field, field_validator, model_validator
from tdoc_crawler.utils.misc import utc_now


def _normalize_tdoc_id(value: str) -> str:
def _normalize_document_id(value: str) -> str:
    return value.strip().upper()


@@ -33,7 +33,7 @@ class PipelineStage(StrEnum):
class GraphNodeType(StrEnum):
    """Types of nodes in the knowledge graph."""

    TDOC = "tdoc"
    DOCUMENT = "document"
    MEETING = "meeting"
    SPEC = "spec"
    WORK_ITEM = "work_item"
@@ -165,7 +165,7 @@ class ArtifactScope(BaseModel):
class ProcessingStatus(BaseModel):
    """Processing state for a single TDoc."""

    tdoc_id: str = Field(..., description="TDoc identifier (normalized via .upper())")
    document_id: str = Field(..., description="Document identifier (normalized via .upper())")
    current_stage: PipelineStage = Field(PipelineStage.PENDING, description="Current pipeline stage")
    classified_at: datetime | None = Field(None, description="Timestamp when classification completed")
    extracted_at: datetime | None = Field(None, description="Timestamp when extraction completed")
@@ -178,16 +178,16 @@ class ProcessingStatus(BaseModel):
    keywords: list[str] | None = Field(None, description="Keywords extracted from document content")
    detected_language: str | None = Field(None, description="Primary language detected in document")

    @field_validator("tdoc_id")
    @field_validator("document_id")
    @classmethod
    def _normalize_tdoc_id(cls, value: str) -> str:
        return _normalize_tdoc_id(value)
    def _normalize_document_id(cls, value: str) -> str:
        return _normalize_document_id(value)


class DocumentClassification(BaseModel):
    """Classification of a file within a TDoc folder."""

    tdoc_id: str = Field(..., description="TDoc identifier (normalized via .upper())")
    document_id: str = Field(..., description="Document identifier (normalized via .upper())")
    file_path: str = Field(..., description="Relative path within checkout folder")
    is_main_document: bool = Field(..., description="Whether this file is the main document")
    confidence: float = Field(..., description="Confidence score between 0.0 and 1.0")
@@ -195,10 +195,10 @@ class DocumentClassification(BaseModel):
    file_size_bytes: int = Field(..., ge=0, description="File size in bytes")
    classified_at: datetime = Field(default_factory=utc_now, description="Classification timestamp")

    @field_validator("tdoc_id")
    @field_validator("document_id")
    @classmethod
    def _normalize_tdoc_id(cls, value: str) -> str:
        return _normalize_tdoc_id(value)
    def _normalize_document_id(cls, value: str) -> str:
        return _normalize_document_id(value)

    @field_validator("confidence")
    @classmethod
@@ -212,8 +212,8 @@ class DocumentClassification(BaseModel):
class DocumentChunk(BaseModel):
    """A chunk of extracted document text with its embedding."""

    chunk_id: str = Field(..., description="Unique chunk identifier '{tdoc_id}:{chunk_index}'")
    tdoc_id: str = Field(..., description="TDoc identifier (normalized via .upper())")
    chunk_id: str = Field(..., description="Unique chunk identifier '{document_id}:{chunk_index}'")
    document_id: str = Field(..., description="Document identifier (normalized via .upper())")
    section_heading: str | None = Field(None, description="Heading for the chunk's section")
    chunk_index: int = Field(..., ge=0, description="Position within the document")
    text: str = Field(..., description="Chunk text content")
@@ -239,30 +239,30 @@ class DocumentChunk(BaseModel):
    def embedding(self, value: list[float]) -> None:
        self.vector = value

    @field_validator("tdoc_id")
    @field_validator("document_id")
    @classmethod
    def _normalize_tdoc_id(cls, value: str) -> str:
        return _normalize_tdoc_id(value)
    def _normalize_document_id(cls, value: str) -> str:
        return _normalize_document_id(value)


class QueryResult(BaseModel):
    """Result from embedding similarity query."""

    tdoc_id: str = Field(..., description="TDoc identifier (normalized via .upper())")
    document_id: str = Field(..., description="Document identifier (normalized via .upper())")
    section: str = Field("", description="Section heading or empty string")
    content: str = Field(..., description="Text content that matched the query")
    score: float = Field(..., ge=0.0, le=1.0, description="Similarity score (0.0-1.0)")

    @field_validator("tdoc_id")
    @field_validator("document_id")
    @classmethod
    def _normalize_tdoc_id(cls, value: str) -> str:
        return _normalize_tdoc_id(value)
    def _normalize_document_id(cls, value: str) -> str:
        return _normalize_document_id(value)


class DocumentSummary(BaseModel):
    """AI-generated summary for a TDoc."""

    tdoc_id: str = Field(..., description="TDoc identifier (normalized via .upper())")
    document_id: str = Field(..., description="Document identifier (normalized via .upper())")
    abstract: str = Field(..., description="150-250 word abstract")
    key_points: list[str] = Field(default_factory=list, description="Key findings")
    action_items: list[str] = Field(default_factory=list, description="Action items")
@@ -272,10 +272,10 @@ class DocumentSummary(BaseModel):
    prompt_version: str = Field("v1", description="Prompt template version")
    generated_at: datetime = Field(default_factory=utc_now, description="Generation timestamp")

    @field_validator("tdoc_id")
    @field_validator("document_id")
    @classmethod
    def _normalize_tdoc_id(cls, value: str) -> str:
        return _normalize_tdoc_id(value)
    def _normalize_document_id(cls, value: str) -> str:
        return _normalize_document_id(value)


class GraphNode(BaseModel):
+9 −9
Original line number | Diff line number | Diff line
@@ -137,17 +137,17 @@ def _determine_heuristic(
    return best_file, "file_size_fallback", 0.5


def classify_tdoc_files(
    tdoc_id: str,
def classify_document_files(
    document_id: str,
    folder_path: Path,
) -> list[DocumentClassification]:
    """Classify files in a TDoc folder to identify the main document.
    """Classify files in a document folder to identify the main document.

    Uses heuristic scoring based on filename patterns, file size, and file type
    to determine which file is likely the main document.

    Args:
        tdoc_id: TDoc identifier (e.g., "SP-123456").
        document_id: Document identifier (e.g., "SP-123456").
        folder_path: Path to the TDoc checkout folder.

    Returns:
@@ -161,7 +161,7 @@ def classify_tdoc_files(
    files = [file_path for file_path in folder_path.glob("*.docx") if file_path.is_file() and not file_path.name.startswith(".")]

    if not files:
        logger.warning(f"No DOCX files found in {folder_path} for TDoc {tdoc_id}")
        logger.warning(f"No DOCX files found in {folder_path} for document {document_id}")
        return []

    if len(files) == 1:
@@ -169,7 +169,7 @@ def classify_tdoc_files(
        file = files[0]
        return [
            DocumentClassification(
                tdoc_id=tdoc_id,
                document_id=document_id,
                file_path=str(file.relative_to(folder_path)),
                is_main_document=True,
                confidence=1.0,
@@ -210,7 +210,7 @@ def classify_tdoc_files(

        result.append(
            DocumentClassification(
                tdoc_id=tdoc_id,
                document_id=document_id,
                file_path=str(file.relative_to(folder_path)),
                is_main_document=is_main,
                confidence=conf,
@@ -220,12 +220,12 @@ def classify_tdoc_files(
            )
        )

    logger.info(f"Classified {len(files)} files for {tdoc_id}, main: {best_file.name} (confidence: {confidence:.2f})")
    logger.info(f"Classified {len(files)} files for {document_id}, main: {best_file.name} (confidence: {confidence:.2f})")

    return result


__all__ = [
    "_score_filename",
    "classify_tdoc_files",
    "classify_document_files",
]
+4 −4
Original line number | Diff line number | Diff line
@@ -49,7 +49,7 @@ def _format_markdown(metadata: TDocMetadata) -> str:
    lines.append(f"# {metadata.title}\n")

    # TDoc ID
    lines.append(f"**TDoc ID:** {metadata.tdoc_id}\n")
    lines.append(f"**TDoc ID:** {metadata.document_id}\n")

    # Meeting info
    meeting_name = _get_meeting_info(metadata.meeting_id)
@@ -88,7 +88,7 @@ def _format_markdown(metadata: TDocMetadata) -> str:


def convert_tdoc(
    tdoc_id: str,
    document_id: str,
    output_path: Path | None = None,
) -> str:
    """Convert a TDoc to markdown format.
@@ -97,7 +97,7 @@ def convert_tdoc(
    representation containing title, meeting info, source, and description.

    Args:
        tdoc_id: TDoc identifier (e.g., "S4-260001").
        document_id: Document identifier (e.g., "S4-260001").
        output_path: Optional path to write markdown file. If None, returns
            the markdown string.

@@ -109,7 +109,7 @@ def convert_tdoc(
        ValueError: If TDoc cannot be found via WhatTheSpec.
    """
    # Normalize TDoc ID
    normalized_id = tdoc_id.strip().upper()
    normalized_id = document_id.strip().upper()

    # Fetch metadata from WhatTheSpec
    logger.info(f"Fetching TDoc metadata for {normalized_id} via WhatTheSpec")
Loading