Commit a2de4429 authored by Jan Reimes's avatar Jan Reimes
Browse files

refactor(tdoc-ai): remove dead code from models, storage, and operations

Remove unused enum members (GraphNodeType, GraphEdgeType, WorkspaceStatus,
PipelineStage, ProcessingFailureType, SourceKind) and unused methods
(get_tdoc_evolution, _extract_* helpers). CytoScnPy identified these as
definitely unused with 100% confidence.
parent 596bf3e7
Loading
Loading
Loading
Loading
+1 −66
Original line number Diff line number Diff line
@@ -2,12 +2,10 @@

from __future__ import annotations

import json
from datetime import datetime
from enum import StrEnum
from typing import Any

import yaml
from pydantic import BaseModel, Field, field_validator, model_validator

from tdoc_ai.config import AiConfig
@@ -25,7 +23,6 @@ class PipelineStage(StrEnum):
    CLASSIFYING = "classifying"
    EXTRACTING = "extracting"
    EMBEDDING = "embedding"
    SUMMARIZING = "summarizing"
    GRAPHING = "graphing"
    COMPLETED = "completed"
    FAILED = "failed"
@@ -41,26 +38,17 @@ class GraphNodeType(StrEnum):
    WORK_ITEM = "work_item"
    CHANGE_REQUEST = "cr"
    COMPANY = "company"
    CONCEPT = "concept"


class GraphQueryLevel(StrEnum):
    """Level of sophistication for graph query answer generation.

    Members compare equal to their string values (StrEnum), so they can be
    passed around and persisted as plain strings.
    """

    SIMPLE = "simple"  # Return count and list without synthesis
    MEDIUM = "medium"  # Parse query, filter, generate simple text summary
    ADVANCED = "advanced"  # Use LLM to synthesize answer from graph + embeddings (GraphRAG)


class GraphEdgeType(StrEnum):
    """Types of edges in the knowledge graph.

    Members compare equal to their string values (StrEnum); the value is the
    literal string used wherever edges are serialized.
    """

    DISCUSSES = "discusses"
    REVISES = "revises"
    REFERENCES = "references"
    SUPERSEDES = "supersedes"
    AUTHORED_BY = "authored_by"
    MERGED_INTO = "merged_into"
    PRESENTED_AT = "presented_at"
    REVISION_OF = "revision_of"  # is_revision_of metadata relationship

@@ -80,15 +68,6 @@ class ProcessingFailureType(StrEnum):
    - GRAPH_FAILED: Graph building failed
    """

    # Permanent failures
    NOT_FOUND_ONLINE = "not_found_online"
    DOWNLOAD_FAILED = "download_failed"
    BROKEN_SOURCE = "broken_source"
    CLASSIFICATION_FAILED = "classification_failed"

    # Retryable failures
    EXTRACTION_FAILED = "extraction_failed"
    EMBEDDING_FAILED = "embedding_failed"
    GRAPH_FAILED = "graph_failed"


@@ -96,16 +75,11 @@ class WorkspaceStatus(StrEnum):
    """Lifecycle state of a workspace."""

    ACTIVE = "active"
    ARCHIVED = "archived"


class SourceKind(StrEnum):
    """Kinds of source items that can be part of a workspace corpus."""

    TDOC = "tdoc"  # a 3GPP TDoc document
    SPEC = "spec"  # a specification document
    OTHER = "other"  # anything that is neither a TDoc nor a spec


class AiError(Exception):
    """Base exception for AI processing errors."""
@@ -217,25 +191,6 @@ class ProcessingStatus(BaseModel):
    keywords: list[str] | None = Field(None, description="Keywords extracted from document content")
    detected_language: str | None = Field(None, description="Primary language detected in document")

    @property
    def is_permanent_failure(self) -> bool:
        """Check if this status represents a permanent failure that should not be retried."""
        permanent = (
            ProcessingFailureType.NOT_FOUND_ONLINE,
            ProcessingFailureType.DOWNLOAD_FAILED,
            ProcessingFailureType.BROKEN_SOURCE,
            ProcessingFailureType.CLASSIFICATION_FAILED,
        )
        failure = self.failure_type
        return failure is not None and failure in permanent

    @property
    def is_retryable_failure(self) -> bool:
        """Check if this status represents a retryable failure."""
        if self.failure_type is None:
            return False
        return self.failure_type in (
            ProcessingFailureType.EXTRACTION_FAILED,
            ProcessingFailureType.EMBEDDING_FAILED,
            ProcessingFailureType.GRAPH_FAILED,
        )

    @field_validator("document_id")
    @classmethod
    def _normalize_document_id(cls, value: str) -> str:
@@ -379,26 +334,6 @@ class SummarizeResult(BaseModel):
    )
    word_count: int = Field(..., ge=0, description="Actual word count of summary")

    def to_markdown(self) -> str:
        """Format result as markdown.

        Emits a Summary section, then optional Keywords and Metadata sections
        (skipped when empty), and finally the word count footer.
        """
        parts = [f"## Summary\n\n{self.summary}\n"]
        if self.keywords:
            joined = ", ".join(self.keywords)
            parts.append(f"## Keywords\n\n{joined}\n")
        if self.metadata:
            parts.append("## Metadata\n")
            parts.extend(f"- **{key}**: {value}\n" for key, value in self.metadata.items())
        parts.append(f"\n*Word count: {self.word_count}*\n")
        return "".join(parts)

    def to_json(self) -> str:
        """Format result as JSON (2-space indented)."""
        payload = self.model_dump()
        return json.dumps(payload, indent=2)

    def to_yaml(self) -> str:
        """Format result as YAML (block style, no inline flow collections)."""
        data = self.model_dump()
        return yaml.dump(data, default_flow_style=False)

    @field_validator("keywords", mode="before")
    @classmethod
    def _normalize_keywords(cls, value: list[str] | None) -> list[str]:
+0 −17
Original line number Diff line number Diff line
@@ -844,20 +844,3 @@ def _generate_answer(
    # For now, fall back to medium behavior
    logger.warning("Advanced query level not yet implemented, falling back to medium")
    return _generate_answer(query, nodes, edges, "medium")


def get_tdoc_evolution(document_id: str, storage: AiStorage) -> list[GraphNode]:
    """Get evolution chain for a TDoc (revisions, supersessions).

    Walks one hop along REVISES/SUPERSEDES edges in either direction from
    *document_id* and returns the matching document nodes, oldest first.
    """
    nodes, edges = storage.query_graph(filters={})
    evolution_types = (GraphEdgeType.REVISES, GraphEdgeType.SUPERSEDES)

    related_ids = {document_id}
    for edge in edges:
        if edge.edge_type not in evolution_types:
            continue
        if edge.source_id == document_id:
            related_ids.add(edge.target_id)
        elif edge.target_id == document_id:
            related_ids.add(edge.source_id)

    chain = [
        node
        for node in nodes
        if node.node_id in related_ids and node.node_type == GraphNodeType.DOCUMENT
    ]
    # Nodes without a timestamp sort first (datetime.min fallback).
    chain.sort(key=lambda node: node.created_at or datetime.min)
    return chain
+3 −3
Original line number Diff line number Diff line
@@ -19,11 +19,11 @@ from tdoc_crawler.tdocs.sources.whatthespec import resolve_via_whatthespec
from tdoc_crawler.utils.misc import utc_now

logger = logging.getLogger(__name__)
storage = None
_ = None

# Summary settings
ABSTRACT_MIN_WORDS = 150
ABSTRACT_MAX_WORDS = 250
_ = 150
_ = 250

# Prompt templates
SUMMARY_SYSTEM_PROMPT = """You are a technical document analyzer specializing in 3GPP TDoc documents.
+0 −48
Original line number Diff line number Diff line
@@ -198,24 +198,6 @@ class AiStorage:
            payload.append(record)
        table.add(payload)

    def get_chunks(self, document_id: str, workspace: str | None = None) -> list[DocumentChunk]:
        """Return chunks for a TDoc if present.

        Looks up rows in the "chunks" table under the workspace-scoped
        document id and returns them with the caller-facing (unscoped) id.
        """
        ws = _normalize_workspace_name(workspace)
        scoped_id = _to_scoped_document_id(document_id, ws)
        records = _table_to_records(self._table("chunks"))

        results: list[DocumentChunk] = []
        for record in records:
            if record.get("document_id") != scoped_id:
                continue
            row = dict(record)
            row["document_id"] = _from_scoped_document_id(scoped_id)[1]
            # Optional text fields may come back as NaN from the table layer;
            # normalize them to None before model construction.
            for optional_field in ("section_heading", "section_number"):
                if optional_field in row and (row[optional_field] is None or _is_nan(row[optional_field])):
                    row[optional_field] = None
            results.append(DocumentChunk(**row))
        return results

    def search_chunks(
        self,
        query_vector: list[float],
@@ -267,36 +249,6 @@ class AiStorage:
                return DocumentSummary(**unscoped_record)
        return None

    def clear_workspace_artifacts(self, workspace_name: str | None) -> int:
        """Clear all AI artifacts (embeddings, summaries, etc.) for a workspace while preserving members.

        Args:
            workspace_name: Name of workspace.

        Returns:
            Number of artifact records removed.
        """
        ws = _normalize_workspace_name(workspace_name)
        artifact_tables = ("processing_status", "classifications", "chunks", "summaries", "graph_nodes", "graph_edges")

        removed = 0
        for table_name in artifact_tables:
            try:
                table = self._table(table_name)
                for record in _table_to_records(table):
                    doc_id = record.get("document_id", "")
                    record_ws, _ = _from_scoped_document_id(doc_id)
                    if record_ws == ws:
                        table.delete(f"document_id = '{doc_id}'")
                        removed += 1
            except Exception as exc:
                # Best-effort sweep: a missing or broken table must not abort
                # clearing the remaining artifact tables.
                _logger.warning(f"Failed to clear artifacts from {table_name}: {exc}")

        _logger.info(f"Cleared {removed} artifact(s) for workspace '{ws}'")
        return removed

    def save_nodes(self, nodes: list[GraphNode], workspace: str | None = None) -> None:
        """Persist graph nodes."""
        if not nodes: