Commit 7ddba15d authored by Jan Reimes
Browse files

feat(workspace): implement workspace registry management

* Introduce WorkspaceRegistry and WorkspaceMetadata classes for managing workspaces.
* Store workspace metadata in a JSON file at ~/.tdoc-crawler/.ai/workspaces.json.
* Add functions to create, delete, and list workspaces, as well as set and get the active workspace.
* Update existing functions to utilize the new workspace registry.
* Refactor related code in storage and CLI to accommodate document_id instead of tdoc_id.
* Enhance document summarization and processing functions to align with workspace management.
parent 54ae80bf
Loading
Loading
Loading
Loading
+19 −13
Original line number | Diff line number | Diff line
@@ -15,13 +15,15 @@ from tdoc_crawler.ai.models import (
    PipelineStage,
    ProcessingStatus,
)
from tdoc_crawler.ai.operations.convert import convert_tdoc
from tdoc_crawler.ai.operations.convert import convert_tdoc as convert_document
from tdoc_crawler.ai.operations.embeddings import query_embeddings as _query_embeddings
from tdoc_crawler.ai.operations.graph import query_graph as _query_graph
from tdoc_crawler.ai.operations.pipeline import get_status as _pipeline_get_status_impl
from tdoc_crawler.ai.operations.pipeline import process_all as _pipeline_process_all_impl
from tdoc_crawler.ai.operations.pipeline import process_tdoc as _pipeline_process_tdoc_impl
from tdoc_crawler.ai.operations.summarize import SummarizeResult, summarize_tdoc
from tdoc_crawler.ai.operations.pipeline import process_tdoc as process_document
from tdoc_crawler.ai.operations.summarize import SummarizeResult
from tdoc_crawler.ai.operations.summarize import summarize_tdoc as summarize_document
from tdoc_crawler.ai.operations.workspaces import (
    DEFAULT_WORKSPACE,
    add_workspace_members,
@@ -31,6 +33,7 @@ from tdoc_crawler.ai.operations.workspaces import (
    delete_workspace,
    ensure_ai_subfolder,
    ensure_default_workspace,
    get_active_workspace,
    get_workspace,
    is_default_workspace,
    list_workspaces,
@@ -38,30 +41,31 @@ from tdoc_crawler.ai.operations.workspaces import (
    normalize_workspace_name,
    resolve_tdoc_checkout_path,
    resolve_workspace,
    set_active_workspace,
)
from tdoc_crawler.ai.storage import AiStorage
from tdoc_crawler.config import CacheManager


def _pipeline_get_status(tdoc_id: str, workspace: str) -> ProcessingStatus | None:
def _pipeline_get_status(document_id: str, workspace: str) -> ProcessingStatus | None:
    """Get processing status for a TDoc."""
    return _pipeline_get_status_impl(tdoc_id, workspace=workspace)
    return _pipeline_get_status_impl(document_id, workspace=workspace)


def get_status(tdoc_id: str, workspace: str | None = None) -> ProcessingStatus | None:
def get_status(document_id: str, workspace: str | None = None) -> ProcessingStatus | None:
    """Get processing status for a TDoc."""
    return _pipeline_get_status_impl(tdoc_id, workspace=workspace)
    return _pipeline_get_status_impl(document_id, workspace=workspace)


def process_tdoc(
    tdoc_id: str,
    document_id: str,
    checkout_path: Path,
    force_rerun: bool = False,
    workspace: str | None = None,
) -> ProcessingStatus:
    """Process a single TDoc through the AI pipeline."""
    return _pipeline_process_tdoc_impl(
        tdoc_id,
        document_id,
        checkout_path,
        force_rerun=force_rerun,
        workspace=workspace,
@@ -69,7 +73,7 @@ def process_tdoc(


def process_all(
    tdoc_ids: list[str],
    document_ids: list[str],
    checkout_base: Path,
    new_only: bool = False,
    force_rerun: bool = False,
@@ -78,7 +82,7 @@ def process_all(
) -> dict[str, ProcessingStatus]:
    """Process multiple TDocs through the AI pipeline."""
    return _pipeline_process_all_impl(
        tdoc_ids,
        document_ids,
        checkout_base,
        new_only=new_only,
        force_rerun=force_rerun,
@@ -121,11 +125,12 @@ __all__ = [
    "add_workspace_members",
    "checkout_spec_to_workspace",
    "checkout_tdoc_to_workspace",
    "convert_tdoc",
    "convert_document",
    "create_workspace",
    "delete_workspace",
    "ensure_ai_subfolder",
    "ensure_default_workspace",
    "get_active_workspace",
    "get_status",
    "get_workspace",
    "is_default_workspace",
@@ -133,10 +138,11 @@ __all__ = [
    "make_workspace_member",
    "normalize_workspace_name",
    "process_all",
    "process_tdoc",
    "process_document",
    "query_embeddings",
    "query_graph",
    "resolve_tdoc_checkout_path",
    "resolve_workspace",
    "summarize_tdoc",
    "set_active_workspace",
    "summarize_document",
]
+2 −2
Original line number | Diff line number | Diff line
@@ -140,9 +140,9 @@ class AiConfig(BaseConfigModel):
    def _resolve_paths(self) -> AiConfig:
        if self.ai_store_path is None:
            # Include embedding model in path to avoid dimension conflicts
            # e.g., ~/.tdoc-crawler/.ai/lancedb/sentence-transformers/all-MiniLM-L6-v2
            # e.g., ~/.tdoc-crawler/.ai/sentence-transformers/all-MiniLM-L6-v2
            # Keep slash to group models by provider
            self.ai_store_path = self.cache_dir / ".ai" / "lancedb" / self.embedding_model
            self.ai_store_path = self.cache_dir / ".ai" / self.embedding_model
        return self

    @model_validator(mode="after")
+23 −23
Original line number | Diff line number | Diff line
@@ -13,7 +13,7 @@ from pydantic import BaseModel, Field, field_validator, model_validator
from tdoc_crawler.utils.misc import utc_now


def _normalize_tdoc_id(value: str) -> str:
def _normalize_document_id(value: str) -> str:
    return value.strip().upper()


@@ -33,7 +33,7 @@ class PipelineStage(StrEnum):
class GraphNodeType(StrEnum):
    """Types of nodes in the knowledge graph."""

    TDOC = "tdoc"
    DOCUMENT = "document"
    MEETING = "meeting"
    SPEC = "spec"
    WORK_ITEM = "work_item"
@@ -165,7 +165,7 @@ class ArtifactScope(BaseModel):
class ProcessingStatus(BaseModel):
    """Processing state for a single TDoc."""

    tdoc_id: str = Field(..., description="TDoc identifier (normalized via .upper())")
    document_id: str = Field(..., description="Document identifier (normalized via .upper())")
    current_stage: PipelineStage = Field(PipelineStage.PENDING, description="Current pipeline stage")
    classified_at: datetime | None = Field(None, description="Timestamp when classification completed")
    extracted_at: datetime | None = Field(None, description="Timestamp when extraction completed")
@@ -178,16 +178,16 @@ class ProcessingStatus(BaseModel):
    keywords: list[str] | None = Field(None, description="Keywords extracted from document content")
    detected_language: str | None = Field(None, description="Primary language detected in document")

    @field_validator("tdoc_id")
    @field_validator("document_id")
    @classmethod
    def _normalize_tdoc_id(cls, value: str) -> str:
        return _normalize_tdoc_id(value)
    def _normalize_document_id(cls, value: str) -> str:
        return _normalize_document_id(value)


class DocumentClassification(BaseModel):
    """Classification of a file within a TDoc folder."""

    tdoc_id: str = Field(..., description="TDoc identifier (normalized via .upper())")
    document_id: str = Field(..., description="Document identifier (normalized via .upper())")
    file_path: str = Field(..., description="Relative path within checkout folder")
    is_main_document: bool = Field(..., description="Whether this file is the main document")
    confidence: float = Field(..., description="Confidence score between 0.0 and 1.0")
@@ -195,10 +195,10 @@ class DocumentClassification(BaseModel):
    file_size_bytes: int = Field(..., ge=0, description="File size in bytes")
    classified_at: datetime = Field(default_factory=utc_now, description="Classification timestamp")

    @field_validator("tdoc_id")
    @field_validator("document_id")
    @classmethod
    def _normalize_tdoc_id(cls, value: str) -> str:
        return _normalize_tdoc_id(value)
    def _normalize_document_id(cls, value: str) -> str:
        return _normalize_document_id(value)

    @field_validator("confidence")
    @classmethod
@@ -212,8 +212,8 @@ class DocumentClassification(BaseModel):
class DocumentChunk(BaseModel):
    """A chunk of extracted document text with its embedding."""

    chunk_id: str = Field(..., description="Unique chunk identifier '{tdoc_id}:{chunk_index}'")
    tdoc_id: str = Field(..., description="TDoc identifier (normalized via .upper())")
    chunk_id: str = Field(..., description="Unique chunk identifier '{document_id}:{chunk_index}'")
    document_id: str = Field(..., description="Document identifier (normalized via .upper())")
    section_heading: str | None = Field(None, description="Heading for the chunk's section")
    chunk_index: int = Field(..., ge=0, description="Position within the document")
    text: str = Field(..., description="Chunk text content")
@@ -239,30 +239,30 @@ class DocumentChunk(BaseModel):
    def embedding(self, value: list[float]) -> None:
        self.vector = value

    @field_validator("tdoc_id")
    @field_validator("document_id")
    @classmethod
    def _normalize_tdoc_id(cls, value: str) -> str:
        return _normalize_tdoc_id(value)
    def _normalize_document_id(cls, value: str) -> str:
        return _normalize_document_id(value)


class QueryResult(BaseModel):
    """Result from embedding similarity query."""

    tdoc_id: str = Field(..., description="TDoc identifier (normalized via .upper())")
    document_id: str = Field(..., description="Document identifier (normalized via .upper())")
    section: str = Field("", description="Section heading or empty string")
    content: str = Field(..., description="Text content that matched the query")
    score: float = Field(..., ge=0.0, le=1.0, description="Similarity score (0.0-1.0)")

    @field_validator("tdoc_id")
    @field_validator("document_id")
    @classmethod
    def _normalize_tdoc_id(cls, value: str) -> str:
        return _normalize_tdoc_id(value)
    def _normalize_document_id(cls, value: str) -> str:
        return _normalize_document_id(value)


class DocumentSummary(BaseModel):
    """AI-generated summary for a TDoc."""

    tdoc_id: str = Field(..., description="TDoc identifier (normalized via .upper())")
    document_id: str = Field(..., description="Document identifier (normalized via .upper())")
    abstract: str = Field(..., description="150-250 word abstract")
    key_points: list[str] = Field(default_factory=list, description="Key findings")
    action_items: list[str] = Field(default_factory=list, description="Action items")
@@ -272,10 +272,10 @@ class DocumentSummary(BaseModel):
    prompt_version: str = Field("v1", description="Prompt template version")
    generated_at: datetime = Field(default_factory=utc_now, description="Generation timestamp")

    @field_validator("tdoc_id")
    @field_validator("document_id")
    @classmethod
    def _normalize_tdoc_id(cls, value: str) -> str:
        return _normalize_tdoc_id(value)
    def _normalize_document_id(cls, value: str) -> str:
        return _normalize_document_id(value)


class GraphNode(BaseModel):
+9 −9
Original line number | Diff line number | Diff line
@@ -137,17 +137,17 @@ def _determine_heuristic(
    return best_file, "file_size_fallback", 0.5


def classify_tdoc_files(
    tdoc_id: str,
def classify_document_files(
    document_id: str,
    folder_path: Path,
) -> list[DocumentClassification]:
    """Classify files in a TDoc folder to identify the main document.
    """Classify files in a document folder to identify the main document.

    Uses heuristic scoring based on filename patterns, file size, and file type
    to determine which file is likely the main document.

    Args:
        tdoc_id: TDoc identifier (e.g., "SP-123456").
        document_id: Document identifier (e.g., "SP-123456").
        folder_path: Path to the TDoc checkout folder.

    Returns:
@@ -161,7 +161,7 @@ def classify_tdoc_files(
    files = [file_path for file_path in folder_path.glob("*.docx") if file_path.is_file() and not file_path.name.startswith(".")]

    if not files:
        logger.warning(f"No DOCX files found in {folder_path} for TDoc {tdoc_id}")
        logger.warning(f"No DOCX files found in {folder_path} for document {document_id}")
        return []

    if len(files) == 1:
@@ -169,7 +169,7 @@ def classify_tdoc_files(
        file = files[0]
        return [
            DocumentClassification(
                tdoc_id=tdoc_id,
                document_id=document_id,
                file_path=str(file.relative_to(folder_path)),
                is_main_document=True,
                confidence=1.0,
@@ -210,7 +210,7 @@ def classify_tdoc_files(

        result.append(
            DocumentClassification(
                tdoc_id=tdoc_id,
                document_id=document_id,
                file_path=str(file.relative_to(folder_path)),
                is_main_document=is_main,
                confidence=conf,
@@ -220,12 +220,12 @@ def classify_tdoc_files(
            )
        )

    logger.info(f"Classified {len(files)} files for {tdoc_id}, main: {best_file.name} (confidence: {confidence:.2f})")
    logger.info(f"Classified {len(files)} files for {document_id}, main: {best_file.name} (confidence: {confidence:.2f})")

    return result


__all__ = [
    "_score_filename",
    "classify_tdoc_files",
    "classify_document_files",
]
+4 −4
Original line number | Diff line number | Diff line
@@ -49,7 +49,7 @@ def _format_markdown(metadata: TDocMetadata) -> str:
    lines.append(f"# {metadata.title}\n")

    # TDoc ID
    lines.append(f"**TDoc ID:** {metadata.tdoc_id}\n")
    lines.append(f"**TDoc ID:** {metadata.document_id}\n")

    # Meeting info
    meeting_name = _get_meeting_info(metadata.meeting_id)
@@ -88,7 +88,7 @@ def _format_markdown(metadata: TDocMetadata) -> str:


def convert_tdoc(
    tdoc_id: str,
    document_id: str,
    output_path: Path | None = None,
) -> str:
    """Convert a TDoc to markdown format.
@@ -97,7 +97,7 @@ def convert_tdoc(
    representation containing title, meeting info, source, and description.

    Args:
        tdoc_id: TDoc identifier (e.g., "S4-260001").
        document_id: Document identifier (e.g., "S4-260001").
        output_path: Optional path to write markdown file. If None, returns
            the markdown string.

@@ -109,7 +109,7 @@ def convert_tdoc(
        ValueError: If TDoc cannot be found via WhatTheSpec.
    """
    # Normalize TDoc ID
    normalized_id = tdoc_id.strip().upper()
    normalized_id = document_id.strip().upper()

    # Fetch metadata from WhatTheSpec
    logger.info(f"Fetching TDoc metadata for {normalized_id} via WhatTheSpec")
Loading