Commit 9e3dca91 authored by Jan Reimes's avatar Jan Reimes
Browse files

feat(3gpp-ai): add structured summarize context and wiki output mode

parent 3fd808f5
Loading
Loading
Loading
Loading
+10 −2
Original line number Diff line number Diff line
@@ -104,12 +104,19 @@ Generates LLM-powered summary of TDoc content.

**Pipeline:**

1. Get markdown via `convert_tdoc_to_markdown()`
1. Get structured extraction via `extract_tdoc_structured()`
1. Prefer structured context (equations/tables/figures with provenance) and fall back to markdown-only when structured artifacts are unavailable
1. Truncate to `SUMMARY_INPUT_LIMIT` (8000 chars)
1. Generate summary via LiteLLM
1. Extract keywords via LiteLLM
1. Return `SummarizeResult`

**Output mode selector:**

- `--output-mode standard` (default): current summarize output shape
- `--output-mode wiki`: wiki-ready rendering with section headers and citation-friendly layout
- The mode only changes how the CLI renders the result; the summarize operation's contract remains unchanged

## File Type Priority

The pipeline prefers formats in this order:
@@ -152,7 +159,8 @@ To force re-conversion:
3gpp-ai convert <tdoc_id> [--output FILE] [--force]

# Summarize TDoc
3gpp-ai summarize <tdoc_id> [--max-words N] [--force] [--json-output]
3gpp-ai summarize <tdoc_id> [--words N] [--force]
3gpp-ai summarize <tdoc_id> [--output-mode standard|wiki]
```

## Error Handling
+23 −0
Original line number Diff line number Diff line
@@ -23,6 +23,29 @@ CacheDirOption = Annotated[
SummarizeDocumentArgument = Annotated[str, typer.Argument(help="Document ID to summarize")]
SummarizeWordsOption = Annotated[int, typer.Option("--words", "-w", help="Target/Maximum word count")]
SummarizeForceOption = Annotated[bool, typer.Option("--force", "-f", help="Force reconversion even if cached")]
# `--quality-policy`: optional quality policy mode. The help text lists
# strict/balanced/permissive, but the alias is typed as plain `str | None`, so this
# declaration by itself does not restrict the accepted values. The value can also be
# supplied through the TDC_AI_QUALITY_POLICY_MODE environment variable.
SummarizeQualityPolicyOption = Annotated[
    str | None,
    typer.Option(
        "--quality-policy",
        help="Quality policy mode: strict, balanced, permissive",
        envvar="TDC_AI_QUALITY_POLICY_MODE",
    ),
]
# `--allow-failed-quality/--no-allow-failed-quality`: boolean on/off flag pair that,
# per its help text, lets summarize run when the extraction quality status is failed.
# The value can also be supplied through the TDC_AI_ALLOW_FAILED_QUALITY environment variable.
SummarizeAllowFailedQualityOption = Annotated[
    bool,
    typer.Option(
        "--allow-failed-quality/--no-allow-failed-quality",
        help="Allow summarize when extraction quality status is failed",
        envvar="TDC_AI_ALLOW_FAILED_QUALITY",
    ),
]
# `--output-mode`: selects the summarize output shape ("standard" or "wiki" per the
# help text). No environment variable is bound to this option.
# NOTE(review): typed as plain `str`, so the allowed values are not enforced by this
# declaration; the consuming command has to validate the value itself.
SummarizeOutputModeOption = Annotated[
    str,
    typer.Option(
        "--output-mode",
        help="Summarize output shape: standard or wiki",
    ),
]

# Convert
ConvertDocumentArgument = Annotated[str, typer.Argument(help="Document ID to convert")]
+31 −1
Original line number Diff line number Diff line
@@ -68,8 +68,11 @@ from threegpp_ai.args import (
    SourcePatternExcludeOption,
    SourcePatternOption,
    StartDateOption,
    SummarizeAllowFailedQualityOption,
    SummarizeDocumentArgument,
    SummarizeForceOption,
    SummarizeOutputModeOption,
    SummarizeQualityPolicyOption,
    SummarizeWordsOption,
    TitlePatternExcludeOption,
    TitlePatternOption,
@@ -564,9 +567,36 @@ def ai_summarize(
    document_id: SummarizeDocumentArgument,
    words: SummarizeWordsOption = 200,
    force: SummarizeForceOption = False,
    quality_policy: SummarizeQualityPolicyOption = None,
    allow_failed_quality: SummarizeAllowFailedQualityOption = False,
    output_mode: SummarizeOutputModeOption = "standard",
) -> None:
    """Summarize one TDoc through the 3gpp-ai pipeline."""
    result = summarize_document(document_id=document_id, max_words=words, force=force)
    normalized_mode = output_mode.strip().lower()
    if normalized_mode not in {"standard", "wiki"}:
        raise typer.BadParameter("--output-mode must be one of: standard, wiki")

    result = summarize_document(
        document_id=document_id,
        max_words=words,
        force=force,
        quality_policy_mode=quality_policy,
        allow_failed_quality=allow_failed_quality,
    )
    if normalized_mode == "wiki":
        console.print(f"## Wiki Summary for {document_id}")
        console.print("### Abstract")
        console.print(result.summary)
        if result.keywords:
            console.print("### Keywords")
            for keyword in result.keywords:
                console.print(f"- {keyword}")
        extraction_status = result.metadata.get("extraction_status")
        if extraction_status:
            console.print("### Source Quality")
            console.print(f"Extraction status: {extraction_status}")
        return

    console.print(f"## Summary for {document_id}")
    console.print(result.summary)

+15 −1
Original line number Diff line number Diff line
@@ -18,6 +18,7 @@ DEFAULT_LLM_MODEL = "openrouter/openrouter/free"
# Type aliases
# Closed set of extraction profile names.
ExtractionProfile = Literal["default", "balanced", "optimum", "custom"]
# Allowed values for `AiConfig.graph_query_level`.
GraphQueryLevel = Literal["simple", "medium", "advanced"]
# Allowed values for `AiConfig.quality_policy_mode` (policy for downstream handling
# of the extraction quality status).
QualityPolicyMode = Literal["strict", "balanced", "permissive"]


class AiConfig(BaseSettings):
@@ -166,6 +167,18 @@ class AiConfig(BaseSettings):
        description="Enable figure description generation with vision-capable models",
    )

    # Extraction quality policy
    quality_policy_mode: QualityPolicyMode = Field(
        "balanced",
        validation_alias=AliasChoices("TDC_AI_QUALITY_POLICY_MODE", "quality_policy_mode"),
        description="Policy for downstream handling of extraction quality status (strict|balanced|permissive)",
    )
    allow_failed_quality: bool = Field(
        False,
        validation_alias=AliasChoices("TDC_AI_ALLOW_FAILED_QUALITY", "allow_failed_quality"),
        description="Allow one-off summarize execution for failed extraction quality status",
    )

    # Graph
    graph_query_level: GraphQueryLevel = Field(
        "simple",
@@ -183,6 +196,7 @@ class AiConfig(BaseSettings):
            raise ValueError(msg)
        return self


class ThreeGPPAIConfig(ThreeGPPConfig):
    """Extended config for 3gpp-ai, adding [ai] section.

@@ -193,4 +207,4 @@ class ThreeGPPAIConfig(ThreeGPPConfig):
    ai: AiConfig = Field(default_factory=AiConfig)


__all__ = ["AiConfig", "ExtractionProfile", "GraphQueryLevel", "ThreeGPPAIConfig"]
__all__ = ["AiConfig", "ExtractionProfile", "GraphQueryLevel", "QualityPolicyMode", "ThreeGPPAIConfig"]
+96 −0
Original line number Diff line number Diff line
@@ -40,6 +40,23 @@ class GraphEdgeType(StrEnum):
    REVISION_OF = auto()  # is_revision_of metadata relationship


class ExtractionQualityStatus(StrEnum):
    """Deterministic extraction quality status."""

    # Used as the type of `ExtractionQualityReport.status`. Members are plain
    # lowercase strings, so they compare equal to their literal values.
    # NOTE(review): the exact criteria separating the three outcomes live in the
    # quality-gate implementation, which is not defined in this module — confirm there.
    OK = "ok"
    PARTIAL = "partial"
    FAILED = "failed"


class ExtractionQualityReasonCode(StrEnum):
    """Stable machine-readable reason codes for quality gate outcomes."""

    # Referenced by `QualityGateCheckResult.reason_code` (reason for a single failed
    # gate) and `ExtractionQualityReport.reason_codes` (codes collected per report).
    # The string values are spelled out explicitly rather than derived from the
    # member names, so renaming a member does not change the emitted code.
    # NOTE(review): the per-code meanings below are inferred from the names — confirm
    # against the gate implementation.
    MISSING_ARTIFACT = "missing_artifact"  # presumably: an expected extraction artifact is absent
    MISSING_METADATA = "missing_metadata"  # presumably: required metadata is absent
    INVALID_PROVENANCE = "invalid_provenance"  # presumably: provenance/source-anchor data failed validation
    COVERAGE_MISMATCH = "coverage_mismatch"  # presumably: reported counts disagree with extracted elements


class SourceKind(StrEnum):
    """Kinds of source items that can be part of a workspace corpus."""

@@ -174,8 +191,13 @@ class ExtractedTableElement(BaseModel):
    row_count: int = Field(0, ge=0, description="Number of table rows")
    column_count: int = Field(0, ge=0, description="Number of table columns")
    cells: list[list[str]] = Field(default_factory=list, description="Normalized table cell matrix")
    cell_metadata: list[list[dict[str, Any] | None]] = Field(
        default_factory=list,
        description="Optional per-cell metadata matrix aligned with `cells`",
    )
    markdown: str | None = Field(None, description="Markdown representation of the table")
    caption: str | None = Field(None, description="Detected table caption")
    source_anchor_id: str | None = Field(None, description="Normalized source anchor identifier for provenance")


class ExtractedFigureElement(BaseModel):
@@ -187,6 +209,12 @@ class ExtractedFigureElement(BaseModel):
    image_format: str | None = Field(None, description="Image format, e.g. png, jpeg")
    caption: str | None = Field(None, description="Detected figure caption")
    description: str | None = Field(None, description="Optional generated figure description")
    source_anchor_id: str | None = Field(None, description="Normalized source anchor identifier for provenance")
    is_partial: bool = Field(False, description="Whether figure record is partial and contains degraded fields")
    partial_reason_codes: list[str] = Field(
        default_factory=list,
        description="Machine-readable diagnostics describing why figure payload is partial",
    )
    metadata: dict[str, Any] = Field(default_factory=dict, description="Provider/extraction metadata")


@@ -198,6 +226,59 @@ class ExtractedEquationElement(BaseModel):
    equation_number: str | None = Field(None, description="Equation label/number if available")
    latex: str = Field(..., description="Equation content in LaTeX-compatible format")
    raw_text: str | None = Field(None, description="Original extracted equation text")
    source_anchor_id: str | None = Field(None, description="Normalized source anchor identifier for provenance")
    normalized_text: str | None = Field(None, description="Normalized equation text for machine-readable comparison")
    equation_type: str | None = Field(None, description="Equation syntax family, e.g. latex")
    display_mode: str | None = Field(None, description="Equation rendering mode: inline or display")


class PageMetadataContract(BaseModel):
    """Canonical page-level metadata contract for extraction artifacts."""

    # Required field; validation rejects values below 1.
    page_number: int = Field(
        ...,
        ge=1,
        description="1-based page number",
    )

    # Per-page element tallies: each defaults to 0 and must be non-negative.
    table_count: int = Field(
        default=0,
        ge=0,
        description="Number of extracted tables on this page",
    )
    figure_count: int = Field(
        default=0,
        ge=0,
        description="Number of extracted figures on this page",
    )
    equation_count: int = Field(
        default=0,
        ge=0,
        description="Number of extracted equations on this page",
    )


class DocumentMetadataContract(BaseModel):
    """Canonical document-level metadata contract for extraction outputs."""

    # Identification and source location; every one of these is optional and
    # defaults to None.
    document_id: str | None = Field(
        default=None,
        description="Document identifier if available",
    )
    title: str | None = Field(
        default=None,
        description="Document title if available",
    )
    source_path: str | None = Field(
        default=None,
        description="Resolved source path if available",
    )
    file_extension: str | None = Field(
        default=None,
        description="Source file extension, e.g. .pdf",
    )

    # None when no page count was detected; otherwise validation requires >= 1.
    total_pages: int | None = Field(
        default=None,
        ge=1,
        description="Detected total page count if available",
    )

    extraction_profile: str | None = Field(
        default=None,
        description="Resolved extraction profile",
    )

    # NOTE(review): declared as a plain `str` defaulting to "ok", so values outside
    # ok|partial|failed are not rejected by this model, even though
    # `ExtractionQualityStatus` in this module enumerates the same three values.
    extraction_status: str = Field(
        default="ok",
        description="Extraction status: ok|partial|failed",
    )

    config_hash: str | None = Field(
        default=None,
        description="Deterministic extraction config fingerprint",
    )


class QualityGateCheckResult(BaseModel):
    """Single quality gate check result."""

    # Both fields below are required (no default).
    gate: str = Field(
        ...,
        description="Gate identifier",
    )
    passed: bool = Field(
        ...,
        description="Whether the gate passed",
    )

    # Optional diagnostics; both default to None. The model does not itself require
    # a reason code to be present when `passed` is False.
    reason_code: ExtractionQualityReasonCode | None = Field(
        default=None,
        description="Reason code when gate fails",
    )
    message: str | None = Field(
        default=None,
        description="Optional human-readable gate detail",
    )


class ExtractionQualityReport(BaseModel):
    """Deterministic quality report persisted per document."""

    # Required overall outcome.
    status: ExtractionQualityStatus = Field(
        ...,
        description="Final deterministic extraction status",
    )

    # NOTE(review): the description promises unique codes, but a plain list does not
    # enforce uniqueness — presumably the producer de-duplicates; confirm there.
    reason_codes: list[ExtractionQualityReasonCode] = Field(
        default_factory=list,
        description="Unique reason codes for failed checks",
    )

    # One entry per evaluated gate; empty by default.
    checks: list[QualityGateCheckResult] = Field(
        default_factory=list,
        description="Gate-by-gate outcomes",
    )

    # Numeric metrics keyed by name; empty by default.
    gate_metrics_summary: dict[str, int | float] = Field(
        default_factory=dict,
        description="Compact gate metrics summary",
    )


class StructuredExtractionResult(BaseModel):
@@ -207,6 +288,11 @@ class StructuredExtractionResult(BaseModel):
    tables: list[ExtractedTableElement] = Field(default_factory=list, description="Extracted tables")
    figures: list[ExtractedFigureElement] = Field(default_factory=list, description="Extracted figures")
    equations: list[ExtractedEquationElement] = Field(default_factory=list, description="Extracted equations")
    document_metadata: DocumentMetadataContract | None = Field(
        None,
        description="Canonical document-level metadata contract",
    )
    pages: list[PageMetadataContract] = Field(default_factory=list, description="Canonical page-level metadata contracts")
    metadata: dict[str, Any] = Field(default_factory=dict, description="Document-level extraction metadata")

    @property
@@ -308,15 +394,25 @@ __all__ = [
    "AiError",
    "ConversionError",
    "DocumentClassification",
    "DocumentMetadataContract",
    "DocumentSummary",
    "EmbeddingDimensionError",
    "ExtractedEquationElement",
    "ExtractedFigureElement",
    "ExtractedTableElement",
    "ExtractionError",
    "ExtractionQualityReasonCode",
    "ExtractionQualityReport",
    "ExtractionQualityStatus",
    "GraphEdge",
    "GraphEdgeType",
    "GraphNode",
    "GraphNodeType",
    "LlmConfigError",
    "PageMetadataContract",
    "QualityGateCheckResult",
    "SourceKind",
    "StructuredExtractionResult",
    "SummarizeResult",
    "TDocNotFoundError",
    "Workspace",
Loading