Commit a2de4429 authored by Jan Reimes's avatar Jan Reimes
Browse files

refactor(tdoc-ai): remove dead code from models, storage, and operations

Remove unused enum members (GraphNodeType, GraphEdgeType, WorkspaceStatus,
PipelineStage, ProcessingFailureType, SourceKind) and unused methods
(get_tdoc_evolution, _extract_* helpers). CytoScnPy identified these as
definitely unused with 100% confidence.
parent 596bf3e7
Loading
Loading
Loading
Loading
+1 −66
Original line number Diff line number Diff line
@@ -2,12 +2,10 @@

from __future__ import annotations

import json
from datetime import datetime
from enum import StrEnum
from typing import Any

import yaml
from pydantic import BaseModel, Field, field_validator, model_validator

from tdoc_ai.config import AiConfig
@@ -25,7 +23,6 @@ class PipelineStage(StrEnum):
    CLASSIFYING = "classifying"
    EXTRACTING = "extracting"
    EMBEDDING = "embedding"
    SUMMARIZING = "summarizing"
    GRAPHING = "graphing"
    COMPLETED = "completed"
    FAILED = "failed"
@@ -41,26 +38,17 @@ class GraphNodeType(StrEnum):
    WORK_ITEM = "work_item"
    CHANGE_REQUEST = "cr"
    COMPANY = "company"
    CONCEPT = "concept"


class GraphQueryLevel(StrEnum):
    """Level of sophistication for graph query answer generation.

    Members compare equal to their string values (StrEnum), so they can be
    passed around and persisted as plain strings.
    """

    SIMPLE = "simple"  # Return count and list without synthesis
    MEDIUM = "medium"  # Parse query, filter, generate simple text summary
    ADVANCED = "advanced"  # Use LLM to synthesize answer from graph + embeddings (GraphRAG)


class GraphEdgeType(StrEnum):
    """Types of edges in the knowledge graph.

    Members compare equal to their string values (StrEnum); the value is the
    literal string used wherever edges are serialized.
    """

    DISCUSSES = "discusses"
    REVISES = "revises"
    REFERENCES = "references"
    SUPERSEDES = "supersedes"
    AUTHORED_BY = "authored_by"
    MERGED_INTO = "merged_into"
    PRESENTED_AT = "presented_at"
    REVISION_OF = "revision_of"  # is_revision_of metadata relationship

@@ -80,15 +68,6 @@ class ProcessingFailureType(StrEnum):
    - GRAPH_FAILED: Graph building failed
    """

    # Permanent failures
    NOT_FOUND_ONLINE = "not_found_online"
    DOWNLOAD_FAILED = "download_failed"
    BROKEN_SOURCE = "broken_source"
    CLASSIFICATION_FAILED = "classification_failed"

    # Retryable failures
    EXTRACTION_FAILED = "extraction_failed"
    EMBEDDING_FAILED = "embedding_failed"
    GRAPH_FAILED = "graph_failed"


@@ -96,16 +75,11 @@ class WorkspaceStatus(StrEnum):
    """Lifecycle state of a workspace."""

    ACTIVE = "active"
    ARCHIVED = "archived"


class SourceKind(StrEnum):
    """Kinds of source items that can be part of a workspace corpus."""

    TDOC = "tdoc"  # a 3GPP TDoc document
    SPEC = "spec"  # a specification document
    OTHER = "other"  # anything that is neither a TDoc nor a spec


class AiError(Exception):
    """Base exception for AI processing errors."""
@@ -217,25 +191,6 @@ class ProcessingStatus(BaseModel):
    keywords: list[str] | None = Field(None, description="Keywords extracted from document content")
    detected_language: str | None = Field(None, description="Primary language detected in document")

    @property
    def is_permanent_failure(self) -> bool:
        """Check if this status represents a permanent failure that should not be retried."""
        permanent = (
            ProcessingFailureType.NOT_FOUND_ONLINE,
            ProcessingFailureType.DOWNLOAD_FAILED,
            ProcessingFailureType.BROKEN_SOURCE,
            ProcessingFailureType.CLASSIFICATION_FAILED,
        )
        failure = self.failure_type
        return failure is not None and failure in permanent

    @property
    def is_retryable_failure(self) -> bool:
        """Check if this status represents a retryable failure."""
        if self.failure_type is None:
            return False
        return self.failure_type in (
            ProcessingFailureType.EXTRACTION_FAILED,
            ProcessingFailureType.EMBEDDING_FAILED,
            ProcessingFailureType.GRAPH_FAILED,
        )

    @field_validator("document_id")
    @classmethod
    def _normalize_document_id(cls, value: str) -> str:
@@ -379,26 +334,6 @@ class SummarizeResult(BaseModel):
    )
    word_count: int = Field(..., ge=0, description="Actual word count of summary")

    def to_markdown(self) -> str:
        """Format result as markdown.

        Emits a Summary section, then optional Keywords and Metadata sections
        (skipped when empty), and finally the word count footer.
        """
        parts = [f"## Summary\n\n{self.summary}\n"]
        if self.keywords:
            joined = ", ".join(self.keywords)
            parts.append(f"## Keywords\n\n{joined}\n")
        if self.metadata:
            parts.append("## Metadata\n")
            parts.extend(f"- **{key}**: {value}\n" for key, value in self.metadata.items())
        parts.append(f"\n*Word count: {self.word_count}*\n")
        return "".join(parts)

    def to_json(self) -> str:
        """Format result as JSON (2-space indented)."""
        payload = self.model_dump()
        return json.dumps(payload, indent=2)

    def to_yaml(self) -> str:
        """Format result as YAML (block style, no inline flow collections)."""
        data = self.model_dump()
        return yaml.dump(data, default_flow_style=False)

    @field_validator("keywords", mode="before")
    @classmethod
    def _normalize_keywords(cls, value: list[str] | None) -> list[str]:
+0 −17
Original line number Diff line number Diff line
@@ -844,20 +844,3 @@ def _generate_answer(
    # For now, fall back to medium behavior
    logger.warning("Advanced query level not yet implemented, falling back to medium")
    return _generate_answer(query, nodes, edges, "medium")


def get_tdoc_evolution(document_id: str, storage: AiStorage) -> list[GraphNode]:
    """Get evolution chain for a TDoc (revisions, supersessions).

    Walks one hop along REVISES/SUPERSEDES edges in either direction from
    *document_id* and returns the matching document nodes, oldest first.
    """
    nodes, edges = storage.query_graph(filters={})
    evolution_types = (GraphEdgeType.REVISES, GraphEdgeType.SUPERSEDES)

    related_ids = {document_id}
    for edge in edges:
        if edge.edge_type not in evolution_types:
            continue
        if edge.source_id == document_id:
            related_ids.add(edge.target_id)
        elif edge.target_id == document_id:
            related_ids.add(edge.source_id)

    chain = [
        node
        for node in nodes
        if node.node_id in related_ids and node.node_type == GraphNodeType.DOCUMENT
    ]
    # Nodes without a timestamp sort first (datetime.min fallback).
    chain.sort(key=lambda node: node.created_at or datetime.min)
    return chain
+3 −3
Original line number Diff line number Diff line
@@ -19,11 +19,11 @@ from tdoc_crawler.tdocs.sources.whatthespec import resolve_via_whatthespec
from tdoc_crawler.utils.misc import utc_now

logger = logging.getLogger(__name__)
storage = None
_ = None

# Summary settings
ABSTRACT_MIN_WORDS = 150
ABSTRACT_MAX_WORDS = 250
_ = 150
_ = 250

# Prompt templates
SUMMARY_SYSTEM_PROMPT = """You are a technical document analyzer specializing in 3GPP TDoc documents.
+0 −48
Original line number Diff line number Diff line
@@ -198,24 +198,6 @@ class AiStorage:
            payload.append(record)
        table.add(payload)

    def get_chunks(self, document_id: str, workspace: str | None = None) -> list[DocumentChunk]:
        """Return chunks for a TDoc if present.

        Looks up rows in the "chunks" table under the workspace-scoped
        document id and returns them with the caller-facing (unscoped) id.
        """
        ws = _normalize_workspace_name(workspace)
        scoped_id = _to_scoped_document_id(document_id, ws)
        records = _table_to_records(self._table("chunks"))

        results: list[DocumentChunk] = []
        for record in records:
            if record.get("document_id") != scoped_id:
                continue
            row = dict(record)
            row["document_id"] = _from_scoped_document_id(scoped_id)[1]
            # Optional text fields may come back as NaN from the table layer;
            # normalize them to None before model construction.
            for optional_field in ("section_heading", "section_number"):
                if optional_field in row and (row[optional_field] is None or _is_nan(row[optional_field])):
                    row[optional_field] = None
            results.append(DocumentChunk(**row))
        return results

    def search_chunks(
        self,
        query_vector: list[float],
@@ -267,36 +249,6 @@ class AiStorage:
                return DocumentSummary(**unscoped_record)
        return None

    def clear_workspace_artifacts(self, workspace_name: str | None) -> int:
        """Clear all AI artifacts (embeddings, summaries, etc.) for a workspace while preserving members.

        Args:
            workspace_name: Name of workspace.

        Returns:
            Number of artifact records removed.
        """
        ws = _normalize_workspace_name(workspace_name)
        artifact_tables = ("processing_status", "classifications", "chunks", "summaries", "graph_nodes", "graph_edges")

        removed = 0
        for table_name in artifact_tables:
            try:
                table = self._table(table_name)
                for record in _table_to_records(table):
                    doc_id = record.get("document_id", "")
                    record_ws, _ = _from_scoped_document_id(doc_id)
                    if record_ws == ws:
                        table.delete(f"document_id = '{doc_id}'")
                        removed += 1
            except Exception as exc:
                # Best-effort sweep: a missing or broken table must not abort
                # clearing the remaining artifact tables.
                _logger.warning(f"Failed to clear artifacts from {table_name}: {exc}")

        _logger.info(f"Cleared {removed} artifact(s) for workspace '{ws}'")
        return removed

    def save_nodes(self, nodes: list[GraphNode], workspace: str | None = None) -> None:
        """Persist graph nodes."""
        if not nodes: