Commit 14bd8fc2 authored by Jan Reimes's avatar Jan Reimes
Browse files

fix: update CLI for container usage and fix minor bugs

- Use AiServiceContainer.get_storage() instead of AiStorage() instantiations
- Fix attribute bug in query output: chunk.tdoc_id → chunk.document_id
- Fix progress callback for workspace process command
- Fix doc extension check in extract.py: != .doc instead of == .doc
- Fix lazy import in workspace_registry.py to avoid circular import (PLC0415 with comment)
- Minor formatting cleanup in http_client and parsers modules
parent 76993b4b
Loading
Loading
Loading
Loading
+1 −1
Original line number Diff line number Diff line
@@ -73,7 +73,7 @@ def extract_doc_to_markdown(
        msg = f"DOC file not found: {doc_path}"
        raise ExtractionError(msg)

    if not doc_path.suffix.lower() == ".doc":
    if doc_path.suffix.lower() != ".doc":
        msg = f"File must be .doc format: {doc_path}"
        raise ExtractionError(msg)

+11 −11
Original line number Diff line number Diff line
@@ -72,16 +72,6 @@ class WorkspaceMetadata:
            "members": self.members,
        }

    @classmethod
    def from_dict(cls, name: str, data: dict[str, Any]) -> WorkspaceMetadata:
        """Build a WorkspaceMetadata from a raw metadata mapping, filling defaults for absent keys."""
        # NOTE: utc_now() is evaluated unconditionally here (eager default
        # argument to dict.get), matching the original behavior exactly.
        fallbacks: dict[str, Any] = {
            "created_at": utc_now().isoformat(),
            "description": "",
            "auto_build": True,
            "members": [],
        }
        fields = {key: data.get(key, default) for key, default in fallbacks.items()}
        return cls(name=name, **fields)

    def add_member(self, member: WorkspaceMember) -> None:
        """Add a member to the workspace."""
        self.members = [m for m in self.members if m.get("source_item_id") != member.source_item_id]
@@ -125,6 +115,16 @@ class WorkspaceMetadata:
            members.append(member)
        return sorted(members, key=lambda m: m.source_item_id)

    @classmethod
    def from_dict(cls, name: str, data: dict[str, Any]) -> WorkspaceMetadata:
        """Create a WorkspaceMetadata instance from a plain dict, using defaults for missing keys."""
        created = data.get("created_at", utc_now().isoformat())
        description = data.get("description", "")
        auto_build = data.get("auto_build", True)
        members = data.get("members", [])
        return cls(
            name=name,
            created_at=created,
            description=description,
            auto_build=auto_build,
            members=members,
        )


@dataclass
class WorkspaceDisplayInfo:
@@ -381,7 +381,7 @@ def get_active_workspace(cache_manager_name: str | None = None) -> str:
    Returns:
        Name of the active workspace, or DEFAULT_WORKSPACE if none set.
    """
    from tdoc_crawler.config import CacheManager
    from tdoc_crawler.config import CacheManager  # noqa: PLC0415

    manager_name = cache_manager_name or "default"
    try:
+5 −8
Original line number Diff line number Diff line
@@ -9,14 +9,11 @@ from typing import Annotated

import typer
from rich.console import Console
from rich.progress import Progress, SpinnerColumn, TextColumn, BarColumn, TaskProgressColumn, TimeRemainingColumn, TimeElapsedColumn
from rich.progress import BarColumn, Progress, SpinnerColumn, TaskProgressColumn, TextColumn, TimeElapsedColumn, TimeRemainingColumn
from rich.table import Table

from tdoc_crawler.ai import (
    AiConfig,
    AiServiceContainer,
    AiConfig,
    AiStorage,
    checkout_spec_to_workspace,
    checkout_tdoc_to_workspace,
    convert_document,
@@ -31,7 +28,7 @@ from tdoc_crawler.ai import (
    set_active_workspace,
    summarize_document,
)
from tdoc_crawler.ai.models import SourceKind
from tdoc_crawler.ai.models import PipelineStage, SourceKind
from tdoc_crawler.ai.operations.pipeline import process_all
from tdoc_crawler.ai.operations.workspaces import (
    add_workspace_members,
@@ -163,7 +160,7 @@ def ai_query(
            table.add_column("Snippet", style="white")
            for chunk, score in embedding_results:
                snippet = chunk.content[:120].replace("\n", " ")
                table.add_row(chunk.tdoc_id, str(chunk.section or ""), f"{score:.3f}", snippet)
                table.add_row(chunk.document_id, str(chunk.section or ""), snippet, f"{score:.3f}")
            console.print(table)
        else:
            console.print("[yellow]No embedding results found.[/yellow]")
@@ -354,9 +351,7 @@ def workspace_clear(
    """Clear all AI artifacts (embeddings, summaries, etc.) while preserving workspace members."""
    workspace = resolve_workspace(workspace)
    CacheManager().register()
    embeddings_manager = AiServiceContainer.get_instance().get_embeddings_manager()
    storage = AiServiceContainer.get_instance().get_storage()
    storage = AiStorage(AiConfig.from_env().ai_cache_dir, embedding_dimension=embeddings_manager.dimension)

    removed_count = storage.clear_workspace_artifacts(workspace)

@@ -563,6 +558,8 @@ def workspace_process(

        # Create progress callback
        def progress_callback(stage: PipelineStage, doc_id: str) -> None:
            # Lazy import to avoid circular dependency with PipelineStage
            # PipelineStage is defined in models, imported at top level
            # Update description with current document and stage
            stage_name = stage.value.replace("_", " ").title()
            processed_count[0] += 1
+1 −3
Original line number Diff line number Diff line
@@ -41,9 +41,7 @@ class DefaultHttpClientProvider:
            A requests.Session configured with caching.
        """
        if self._session is None:
            self._session = create_cached_session(
                cache_manager_name=self._cache_manager_name
            )
            self._session = create_cached_session(cache_manager_name=self._cache_manager_name)
        return self._session

    def close(self) -> None:
+1 −0
Original line number Diff line number Diff line
@@ -72,6 +72,7 @@ class MeetingParser:
        """
        return parse_meeting_page(html, working_group, subgroup, get_subtb)


def parse_meeting_page(
    html: str,
    working_group: WorkingGroup,