Loading packages/3gpp-ai/docs/PIPELINE.md +10 −2 Original line number Diff line number Diff line Loading @@ -104,12 +104,19 @@ Generates LLM-powered summary of TDoc content. **Pipeline:** 1. Get markdown via `convert_tdoc_to_markdown()` 1. Get structured extraction via `extract_tdoc_structured()` 1. Prefer structured context (equations/tables/figures with provenance) and fall back to markdown-only when structured artifacts are unavailable 1. Truncate to `SUMMARY_INPUT_LIMIT` (8000 chars) 1. Generate summary via LiteLLM 1. Extract keywords via LiteLLM 1. Return `SummarizeResult` **Output mode selector:** - `--output-mode standard` (default): current summarize output shape - `--output-mode wiki`: wiki-ready rendering with section headers and citation-friendly layout - Mode only changes CLI rendering shape; summarize operation contract remains unchanged ## File Type Priority The pipeline prefers formats in this order: Loading Loading @@ -152,7 +159,8 @@ To force re-conversion: 3gpp-ai convert <tdoc_id> [--output FILE] [--force] # Summarize TDoc 3gpp-ai summarize <tdoc_id> [--max-words N] [--force] [--json-output] 3gpp-ai summarize <tdoc_id> [--words N] [--force] 3gpp-ai summarize <tdoc_id> [--output-mode standard|wiki] ``` ## Error Handling Loading packages/3gpp-ai/threegpp_ai/args.py +23 −0 Original line number Diff line number Diff line Loading @@ -23,6 +23,29 @@ CacheDirOption = Annotated[ SummarizeDocumentArgument = Annotated[str, typer.Argument(help="Document ID to summarize")] SummarizeWordsOption = Annotated[int, typer.Option("--words", "-w", help="Target/Maximum word count")] SummarizeForceOption = Annotated[bool, typer.Option("--force", "-f", help="Force reconversion even if cached")] SummarizeQualityPolicyOption = Annotated[ str | None, typer.Option( "--quality-policy", help="Quality policy mode: strict, balanced, permissive", envvar="TDC_AI_QUALITY_POLICY_MODE", ), ] SummarizeAllowFailedQualityOption = Annotated[ bool, typer.Option( "--allow-failed-quality/--no-allow-failed-quality", help="Allow summarize when extraction quality status is failed", envvar="TDC_AI_ALLOW_FAILED_QUALITY", ), ] SummarizeOutputModeOption = Annotated[ str, typer.Option( "--output-mode", help="Summarize output shape: standard or wiki", ), ] # Convert ConvertDocumentArgument = Annotated[str, typer.Argument(help="Document ID to convert")] Loading packages/3gpp-ai/threegpp_ai/cli.py +31 −1 Original line number Diff line number Diff line Loading @@ -68,8 +68,11 @@ from threegpp_ai.args import ( SourcePatternExcludeOption, SourcePatternOption, StartDateOption, SummarizeAllowFailedQualityOption, SummarizeDocumentArgument, SummarizeForceOption, SummarizeOutputModeOption, SummarizeQualityPolicyOption, SummarizeWordsOption, TitlePatternExcludeOption, TitlePatternOption, Loading Loading @@ -564,9 +567,36 @@ def ai_summarize( document_id: SummarizeDocumentArgument, words: SummarizeWordsOption = 200, force: SummarizeForceOption = False, quality_policy: SummarizeQualityPolicyOption = None, allow_failed_quality: SummarizeAllowFailedQualityOption = False, output_mode: SummarizeOutputModeOption = "standard", ) -> None: """Summarize one TDoc through the 3gpp-ai pipeline.""" result = summarize_document(document_id=document_id, max_words=words, force=force) normalized_mode = output_mode.strip().lower() if normalized_mode not in {"standard", "wiki"}: raise typer.BadParameter("--output-mode must be one of: standard, wiki") result = summarize_document( document_id=document_id, max_words=words, force=force, quality_policy_mode=quality_policy, allow_failed_quality=allow_failed_quality, ) if normalized_mode == "wiki": console.print(f"## Wiki Summary for {document_id}") console.print("### Abstract") console.print(result.summary) if result.keywords: console.print("### Keywords") for keyword in result.keywords: console.print(f"- {keyword}") extraction_status = result.metadata.get("extraction_status") if extraction_status: console.print("### Source Quality") console.print(f"Extraction status: {extraction_status}") return console.print(f"## Summary for {document_id}") console.print(result.summary) Loading packages/3gpp-ai/threegpp_ai/config.py +15 −1 Original line number Diff line number Diff line Loading @@ -18,6 +18,7 @@ DEFAULT_LLM_MODEL = "openrouter/openrouter/free" # Type aliases ExtractionProfile = Literal["default", "balanced", "optimum", "custom"] GraphQueryLevel = Literal["simple", "medium", "advanced"] QualityPolicyMode = Literal["strict", "balanced", "permissive"] class AiConfig(BaseSettings): Loading Loading @@ -166,6 +167,18 @@ class AiConfig(BaseSettings): description="Enable figure description generation with vision-capable models", ) # Extraction quality policy quality_policy_mode: QualityPolicyMode = Field( "balanced", validation_alias=AliasChoices("TDC_AI_QUALITY_POLICY_MODE", "quality_policy_mode"), description="Policy for downstream handling of extraction quality status (strict|balanced|permissive)", ) allow_failed_quality: bool = Field( False, validation_alias=AliasChoices("TDC_AI_ALLOW_FAILED_QUALITY", "allow_failed_quality"), description="Allow one-off summarize execution for failed extraction quality status", ) # Graph graph_query_level: GraphQueryLevel = Field( "simple", Loading @@ -183,6 +196,7 @@ class AiConfig(BaseSettings): raise ValueError(msg) return self class ThreeGPPAIConfig(ThreeGPPConfig): """Extended config for 3gpp-ai, adding [ai] section. Loading @@ -193,4 +207,4 @@ class ThreeGPPAIConfig(ThreeGPPConfig): ai: AiConfig = Field(default_factory=AiConfig) __all__ = ["AiConfig", "ExtractionProfile", "GraphQueryLevel", "ThreeGPPAIConfig"] __all__ = ["AiConfig", "ExtractionProfile", "GraphQueryLevel", "QualityPolicyMode", "ThreeGPPAIConfig"] packages/3gpp-ai/threegpp_ai/models.py +96 −0 Original line number Diff line number Diff line Loading @@ -40,6 +40,23 @@ class GraphEdgeType(StrEnum): REVISION_OF = auto() # is_revision_of metadata relationship class ExtractionQualityStatus(StrEnum): """Deterministic extraction quality status.""" OK = "ok" PARTIAL = "partial" FAILED = "failed" class ExtractionQualityReasonCode(StrEnum): """Stable machine-readable reason codes for quality gate outcomes.""" MISSING_ARTIFACT = "missing_artifact" MISSING_METADATA = "missing_metadata" INVALID_PROVENANCE = "invalid_provenance" COVERAGE_MISMATCH = "coverage_mismatch" class SourceKind(StrEnum): """Kinds of source items that can be part of a workspace corpus.""" Loading Loading @@ -174,8 +191,13 @@ class ExtractedTableElement(BaseModel): row_count: int = Field(0, ge=0, description="Number of table rows") column_count: int = Field(0, ge=0, description="Number of table columns") cells: list[list[str]] = Field(default_factory=list, description="Normalized table cell matrix") cell_metadata: list[list[dict[str, Any] | None]] = Field( default_factory=list, description="Optional per-cell metadata matrix aligned with `cells`", ) markdown: str | None = Field(None, description="Markdown representation of the table") caption: str | None = Field(None, description="Detected table caption") source_anchor_id: str | None = Field(None, description="Normalized source anchor identifier for provenance") class ExtractedFigureElement(BaseModel): Loading @@ -187,6 +209,12 @@ class ExtractedFigureElement(BaseModel): image_format: str | None = Field(None, description="Image format, e.g. png, jpeg") caption: str | None = Field(None, description="Detected figure caption") description: str | None = Field(None, description="Optional generated figure description") source_anchor_id: str | None = Field(None, description="Normalized source anchor identifier for provenance") is_partial: bool = Field(False, description="Whether figure record is partial and contains degraded fields") partial_reason_codes: list[str] = Field( default_factory=list, description="Machine-readable diagnostics describing why figure payload is partial", ) metadata: dict[str, Any] = Field(default_factory=dict, description="Provider/extraction metadata") Loading @@ -198,6 +226,59 @@ class ExtractedEquationElement(BaseModel): equation_number: str | None = Field(None, description="Equation label/number if available") latex: str = Field(..., description="Equation content in LaTeX-compatible format") raw_text: str | None = Field(None, description="Original extracted equation text") source_anchor_id: str | None = Field(None, description="Normalized source anchor identifier for provenance") normalized_text: str | None = Field(None, description="Normalized equation text for machine-readable comparison") equation_type: str | None = Field(None, description="Equation syntax family, e.g. latex") display_mode: str | None = Field(None, description="Equation rendering mode: inline or display") class PageMetadataContract(BaseModel): """Canonical page-level metadata contract for extraction artifacts.""" page_number: int = Field(..., ge=1, description="1-based page number") table_count: int = Field(0, ge=0, description="Number of extracted tables on this page") figure_count: int = Field(0, ge=0, description="Number of extracted figures on this page") equation_count: int = Field(0, ge=0, description="Number of extracted equations on this page") class DocumentMetadataContract(BaseModel): """Canonical document-level metadata contract for extraction outputs.""" document_id: str | None = Field(None, description="Document identifier if available") title: str | None = Field(None, description="Document title if available") source_path: str | None = Field(None, description="Resolved source path if available") file_extension: str | None = Field(None, description="Source file extension, e.g. .pdf") total_pages: int | None = Field(None, ge=1, description="Detected total page count if available") extraction_profile: str | None = Field(None, description="Resolved extraction profile") extraction_status: str = Field("ok", description="Extraction status: ok|partial|failed") config_hash: str | None = Field(None, description="Deterministic extraction config fingerprint") class QualityGateCheckResult(BaseModel): """Single quality gate check result.""" gate: str = Field(..., description="Gate identifier") passed: bool = Field(..., description="Whether the gate passed") reason_code: ExtractionQualityReasonCode | None = Field( None, description="Reason code when gate fails", ) message: str | None = Field(None, description="Optional human-readable gate detail") class ExtractionQualityReport(BaseModel): """Deterministic quality report persisted per document.""" status: ExtractionQualityStatus = Field(..., description="Final deterministic extraction status") reason_codes: list[ExtractionQualityReasonCode] = Field( default_factory=list, description="Unique reason codes for failed checks", ) checks: list[QualityGateCheckResult] = Field(default_factory=list, description="Gate-by-gate outcomes") gate_metrics_summary: dict[str, int | float] = Field( default_factory=dict, description="Compact gate metrics summary", ) class StructuredExtractionResult(BaseModel): Loading @@ -207,6 +288,11 @@ class StructuredExtractionResult(BaseModel): tables: list[ExtractedTableElement] = Field(default_factory=list, description="Extracted tables") figures: list[ExtractedFigureElement] = Field(default_factory=list, description="Extracted figures") equations: list[ExtractedEquationElement] = Field(default_factory=list, description="Extracted equations") document_metadata: DocumentMetadataContract | None = Field( None, description="Canonical document-level metadata contract", ) pages: list[PageMetadataContract] = Field(default_factory=list, description="Canonical page-level metadata contracts") metadata: dict[str, Any] = Field(default_factory=dict, description="Document-level extraction metadata") @property Loading Loading @@ -308,15 +394,25 @@ __all__ = [ "AiError", "ConversionError", "DocumentClassification", "DocumentMetadataContract", "DocumentSummary", "EmbeddingDimensionError", "ExtractedEquationElement", "ExtractedFigureElement", "ExtractedTableElement", "ExtractionError", "ExtractionQualityReasonCode", "ExtractionQualityReport", "ExtractionQualityStatus", "GraphEdge", "GraphEdgeType", "GraphNode", "GraphNodeType", "LlmConfigError", "PageMetadataContract", "QualityGateCheckResult", "SourceKind", "StructuredExtractionResult", "SummarizeResult", "TDocNotFoundError", "Workspace", Loading Loading
packages/3gpp-ai/docs/PIPELINE.md +10 −2 Original line number Diff line number Diff line Loading @@ -104,12 +104,19 @@ Generates LLM-powered summary of TDoc content. **Pipeline:** 1. Get markdown via `convert_tdoc_to_markdown()` 1. Get structured extraction via `extract_tdoc_structured()` 1. Prefer structured context (equations/tables/figures with provenance) and fall back to markdown-only when structured artifacts are unavailable 1. Truncate to `SUMMARY_INPUT_LIMIT` (8000 chars) 1. Generate summary via LiteLLM 1. Extract keywords via LiteLLM 1. Return `SummarizeResult` **Output mode selector:** - `--output-mode standard` (default): current summarize output shape - `--output-mode wiki`: wiki-ready rendering with section headers and citation-friendly layout - Mode only changes CLI rendering shape; summarize operation contract remains unchanged ## File Type Priority The pipeline prefers formats in this order: Loading Loading @@ -152,7 +159,8 @@ To force re-conversion: 3gpp-ai convert <tdoc_id> [--output FILE] [--force] # Summarize TDoc 3gpp-ai summarize <tdoc_id> [--max-words N] [--force] [--json-output] 3gpp-ai summarize <tdoc_id> [--words N] [--force] 3gpp-ai summarize <tdoc_id> [--output-mode standard|wiki] ``` ## Error Handling Loading
packages/3gpp-ai/threegpp_ai/args.py +23 −0 Original line number Diff line number Diff line Loading @@ -23,6 +23,29 @@ CacheDirOption = Annotated[ SummarizeDocumentArgument = Annotated[str, typer.Argument(help="Document ID to summarize")] SummarizeWordsOption = Annotated[int, typer.Option("--words", "-w", help="Target/Maximum word count")] SummarizeForceOption = Annotated[bool, typer.Option("--force", "-f", help="Force reconversion even if cached")] SummarizeQualityPolicyOption = Annotated[ str | None, typer.Option( "--quality-policy", help="Quality policy mode: strict, balanced, permissive", envvar="TDC_AI_QUALITY_POLICY_MODE", ), ] SummarizeAllowFailedQualityOption = Annotated[ bool, typer.Option( "--allow-failed-quality/--no-allow-failed-quality", help="Allow summarize when extraction quality status is failed", envvar="TDC_AI_ALLOW_FAILED_QUALITY", ), ] SummarizeOutputModeOption = Annotated[ str, typer.Option( "--output-mode", help="Summarize output shape: standard or wiki", ), ] # Convert ConvertDocumentArgument = Annotated[str, typer.Argument(help="Document ID to convert")] Loading
packages/3gpp-ai/threegpp_ai/cli.py +31 −1 Original line number Diff line number Diff line Loading @@ -68,8 +68,11 @@ from threegpp_ai.args import ( SourcePatternExcludeOption, SourcePatternOption, StartDateOption, SummarizeAllowFailedQualityOption, SummarizeDocumentArgument, SummarizeForceOption, SummarizeOutputModeOption, SummarizeQualityPolicyOption, SummarizeWordsOption, TitlePatternExcludeOption, TitlePatternOption, Loading Loading @@ -564,9 +567,36 @@ def ai_summarize( document_id: SummarizeDocumentArgument, words: SummarizeWordsOption = 200, force: SummarizeForceOption = False, quality_policy: SummarizeQualityPolicyOption = None, allow_failed_quality: SummarizeAllowFailedQualityOption = False, output_mode: SummarizeOutputModeOption = "standard", ) -> None: """Summarize one TDoc through the 3gpp-ai pipeline.""" result = summarize_document(document_id=document_id, max_words=words, force=force) normalized_mode = output_mode.strip().lower() if normalized_mode not in {"standard", "wiki"}: raise typer.BadParameter("--output-mode must be one of: standard, wiki") result = summarize_document( document_id=document_id, max_words=words, force=force, quality_policy_mode=quality_policy, allow_failed_quality=allow_failed_quality, ) if normalized_mode == "wiki": console.print(f"## Wiki Summary for {document_id}") console.print("### Abstract") console.print(result.summary) if result.keywords: console.print("### Keywords") for keyword in result.keywords: console.print(f"- {keyword}") extraction_status = result.metadata.get("extraction_status") if extraction_status: console.print("### Source Quality") console.print(f"Extraction status: {extraction_status}") return console.print(f"## Summary for {document_id}") console.print(result.summary) Loading
packages/3gpp-ai/threegpp_ai/config.py +15 −1 Original line number Diff line number Diff line Loading @@ -18,6 +18,7 @@ DEFAULT_LLM_MODEL = "openrouter/openrouter/free" # Type aliases ExtractionProfile = Literal["default", "balanced", "optimum", "custom"] GraphQueryLevel = Literal["simple", "medium", "advanced"] QualityPolicyMode = Literal["strict", "balanced", "permissive"] class AiConfig(BaseSettings): Loading Loading @@ -166,6 +167,18 @@ class AiConfig(BaseSettings): description="Enable figure description generation with vision-capable models", ) # Extraction quality policy quality_policy_mode: QualityPolicyMode = Field( "balanced", validation_alias=AliasChoices("TDC_AI_QUALITY_POLICY_MODE", "quality_policy_mode"), description="Policy for downstream handling of extraction quality status (strict|balanced|permissive)", ) allow_failed_quality: bool = Field( False, validation_alias=AliasChoices("TDC_AI_ALLOW_FAILED_QUALITY", "allow_failed_quality"), description="Allow one-off summarize execution for failed extraction quality status", ) # Graph graph_query_level: GraphQueryLevel = Field( "simple", Loading @@ -183,6 +196,7 @@ class AiConfig(BaseSettings): raise ValueError(msg) return self class ThreeGPPAIConfig(ThreeGPPConfig): """Extended config for 3gpp-ai, adding [ai] section. Loading @@ -193,4 +207,4 @@ class ThreeGPPAIConfig(ThreeGPPConfig): ai: AiConfig = Field(default_factory=AiConfig) __all__ = ["AiConfig", "ExtractionProfile", "GraphQueryLevel", "ThreeGPPAIConfig"] __all__ = ["AiConfig", "ExtractionProfile", "GraphQueryLevel", "QualityPolicyMode", "ThreeGPPAIConfig"]
packages/3gpp-ai/threegpp_ai/models.py +96 −0 Original line number Diff line number Diff line Loading @@ -40,6 +40,23 @@ class GraphEdgeType(StrEnum): REVISION_OF = auto() # is_revision_of metadata relationship class ExtractionQualityStatus(StrEnum): """Deterministic extraction quality status.""" OK = "ok" PARTIAL = "partial" FAILED = "failed" class ExtractionQualityReasonCode(StrEnum): """Stable machine-readable reason codes for quality gate outcomes.""" MISSING_ARTIFACT = "missing_artifact" MISSING_METADATA = "missing_metadata" INVALID_PROVENANCE = "invalid_provenance" COVERAGE_MISMATCH = "coverage_mismatch" class SourceKind(StrEnum): """Kinds of source items that can be part of a workspace corpus.""" Loading Loading @@ -174,8 +191,13 @@ class ExtractedTableElement(BaseModel): row_count: int = Field(0, ge=0, description="Number of table rows") column_count: int = Field(0, ge=0, description="Number of table columns") cells: list[list[str]] = Field(default_factory=list, description="Normalized table cell matrix") cell_metadata: list[list[dict[str, Any] | None]] = Field( default_factory=list, description="Optional per-cell metadata matrix aligned with `cells`", ) markdown: str | None = Field(None, description="Markdown representation of the table") caption: str | None = Field(None, description="Detected table caption") source_anchor_id: str | None = Field(None, description="Normalized source anchor identifier for provenance") class ExtractedFigureElement(BaseModel): Loading @@ -187,6 +209,12 @@ class ExtractedFigureElement(BaseModel): image_format: str | None = Field(None, description="Image format, e.g. png, jpeg") caption: str | None = Field(None, description="Detected figure caption") description: str | None = Field(None, description="Optional generated figure description") source_anchor_id: str | None = Field(None, description="Normalized source anchor identifier for provenance") is_partial: bool = Field(False, description="Whether figure record is partial and contains degraded fields") partial_reason_codes: list[str] = Field( default_factory=list, description="Machine-readable diagnostics describing why figure payload is partial", ) metadata: dict[str, Any] = Field(default_factory=dict, description="Provider/extraction metadata") Loading @@ -198,6 +226,59 @@ class ExtractedEquationElement(BaseModel): equation_number: str | None = Field(None, description="Equation label/number if available") latex: str = Field(..., description="Equation content in LaTeX-compatible format") raw_text: str | None = Field(None, description="Original extracted equation text") source_anchor_id: str | None = Field(None, description="Normalized source anchor identifier for provenance") normalized_text: str | None = Field(None, description="Normalized equation text for machine-readable comparison") equation_type: str | None = Field(None, description="Equation syntax family, e.g. latex") display_mode: str | None = Field(None, description="Equation rendering mode: inline or display") class PageMetadataContract(BaseModel): """Canonical page-level metadata contract for extraction artifacts.""" page_number: int = Field(..., ge=1, description="1-based page number") table_count: int = Field(0, ge=0, description="Number of extracted tables on this page") figure_count: int = Field(0, ge=0, description="Number of extracted figures on this page") equation_count: int = Field(0, ge=0, description="Number of extracted equations on this page") class DocumentMetadataContract(BaseModel): """Canonical document-level metadata contract for extraction outputs.""" document_id: str | None = Field(None, description="Document identifier if available") title: str | None = Field(None, description="Document title if available") source_path: str | None = Field(None, description="Resolved source path if available") file_extension: str | None = Field(None, description="Source file extension, e.g. .pdf") total_pages: int | None = Field(None, ge=1, description="Detected total page count if available") extraction_profile: str | None = Field(None, description="Resolved extraction profile") extraction_status: str = Field("ok", description="Extraction status: ok|partial|failed") config_hash: str | None = Field(None, description="Deterministic extraction config fingerprint") class QualityGateCheckResult(BaseModel): """Single quality gate check result.""" gate: str = Field(..., description="Gate identifier") passed: bool = Field(..., description="Whether the gate passed") reason_code: ExtractionQualityReasonCode | None = Field( None, description="Reason code when gate fails", ) message: str | None = Field(None, description="Optional human-readable gate detail") class ExtractionQualityReport(BaseModel): """Deterministic quality report persisted per document.""" status: ExtractionQualityStatus = Field(..., description="Final deterministic extraction status") reason_codes: list[ExtractionQualityReasonCode] = Field( default_factory=list, description="Unique reason codes for failed checks", ) checks: list[QualityGateCheckResult] = Field(default_factory=list, description="Gate-by-gate outcomes") gate_metrics_summary: dict[str, int | float] = Field( default_factory=dict, description="Compact gate metrics summary", ) class StructuredExtractionResult(BaseModel): Loading @@ -207,6 +288,11 @@ class StructuredExtractionResult(BaseModel): tables: list[ExtractedTableElement] = Field(default_factory=list, description="Extracted tables") figures: list[ExtractedFigureElement] = Field(default_factory=list, description="Extracted figures") equations: list[ExtractedEquationElement] = Field(default_factory=list, description="Extracted equations") document_metadata: DocumentMetadataContract | None = Field( None, description="Canonical document-level metadata contract", ) pages: list[PageMetadataContract] = Field(default_factory=list, description="Canonical page-level metadata contracts") metadata: dict[str, Any] = Field(default_factory=dict, description="Document-level extraction metadata") @property Loading Loading @@ -308,15 +394,25 @@ __all__ = [ "AiError", "ConversionError", "DocumentClassification", "DocumentMetadataContract", "DocumentSummary", "EmbeddingDimensionError", "ExtractedEquationElement", "ExtractedFigureElement", "ExtractedTableElement", "ExtractionError", "ExtractionQualityReasonCode", "ExtractionQualityReport", "ExtractionQualityStatus", "GraphEdge", "GraphEdgeType", "GraphNode", "GraphNodeType", "LlmConfigError", "PageMetadataContract", "QualityGateCheckResult", "SourceKind", "StructuredExtractionResult", "SummarizeResult", "TDocNotFoundError", "Workspace", Loading