Commit 3a440a93 authored by Jan Reimes

fix(extraction): remove broken spec routing from convert_for_wiki, fix SpecDownloads init

- Remove the _convert_spec_for_wiki function (its nested asyncio.run() call
  plus the spec checkout pipeline hangs on network requests)
- Remove the source_kind parameter from convert_for_wiki (all items go through
  the TDoc pipeline via WhatTheSpec, which now handles unknown statuses)
- Fix the SpecDownloads.__init__() call in checkout.py: it was passing a
  cache_manager_name kwarg that doesn't exist on the constructor
- Update _workspace_commands.py caller to match new signature
parent 09d1f150
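
For context on the first and third bullets, a minimal sketch (illustrative stand-ins, not this repo's code) of why each pattern fails:

import asyncio

async def convert_spec() -> None:
    # Nested sync-over-async bridge: asyncio.run() raises RuntimeError when a
    # loop is already running, and blocking on network I/O at this point can
    # hang the whole pipeline instead of failing fast.
    asyncio.run(asyncio.sleep(0))

try:
    asyncio.run(convert_spec())
except RuntimeError as exc:
    print(exc)  # asyncio.run() cannot be called from a running event loop

class SpecDownloads:
    # Stand-in constructor; the real signature lives in checkout.py.
    def __init__(self, cache_dir: str) -> None:
        self.cache_dir = cache_dir

try:
    SpecDownloads(cache_dir="/tmp", cache_manager_name="specs")  # type: ignore[call-arg]
except TypeError as exc:
    print(exc)  # __init__() got an unexpected keyword argument 'cache_manager_name'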
+0 −80
@@ -19,9 +19,6 @@
 # Checkout directory name (default: checkout)
 # TDC_CHECKOUT_DIRNAME=checkout

-# AI cache directory name (default: lightrag)
-# TDC_AI_CACHE_DIRNAME=lightrag
-
 # ============================================================================
 # ETSI ONLINE (EOL) CREDENTIALS
 # ============================================================================
@@ -90,83 +87,6 @@ TDC_MAX_RETRIES=3
 # Number of parallel subinterpreter workers (default: 4)
 TDC_WORKERS=4

-# ============================================================================
-# AI CONFIGURATION (3GPP-AI)
-# ============================================================================
-# These settings are used by the 3gpp-ai package for document embeddings,
-# knowledge graphs, and LLM-based processing.
-# See: packages/3gpp-ai/docs/config.md
-
-# LLM model in format <provider>/<model_name>
-# Recommended: openrouter/openrouter/free (free tier, no subscription required)
-# API key: <provider-uppercase>_API_KEY (or set TDC_AI_LLM_API_KEY directly)
-TDC_AI_LLM_MODEL=openrouter/openrouter/free
-
-# Optional custom base URL for LLM provider/proxy
-# TDC_AI_LLM_API_BASE=
-
-# Optional API key for LLM provider (overrides default provider-specific env vars)
-# TDC_AI_LLM_API_KEY=
-
-# Embedding model in format <provider>/<model_name>
-# Recommended: ollama/vuongnguyen2212/CodeRankEmbed:latest (CodeRank embedding, self-hosted)
-TDC_AI_EMBEDDING_MODEL=ollama/vuongnguyen2212/CodeRankEmbed:latest
-
-# Maximum tokens per chunk (default: 1000)
-TDC_AI_MAX_CHUNK_SIZE=1000
-
-# Token overlap between chunks (default: 100)
-TDC_AI_CHUNK_OVERLAP=100
-
-# Minimum abstract word count (default: 150)
-TDC_AI_ABSTRACT_MIN_WORDS=150
-
-# Maximum abstract word count (default: 250)
-TDC_AI_ABSTRACT_MAX_WORDS=250
-
-# Number of parallel workers for AI processing (default: 4)
-TDC_AI_PARALLELISM=4
-
-# Convert office documents to PDF during workspace add-members (default: false)
-# Set to "true", "1", or "yes" to enable
-# TDC_AI_CONVERT_PDF=false
-
-# Extract markdown from PDFs during workspace add-members (default: false)
-# When enabled, implies TDC_AI_CONVERT_PDF=true
-# TDC_AI_CONVERT_MD=false
-
-# Enable VLM for figure descriptions and formula enrichment (default: false)
-# Set to "true", "1", or "yes" to enable
-# TDC_AI_VLM=false
-
-# ============================================================================
-# OPENDATALOADER PDF SETTINGS
-# ============================================================================
-# OpenDataLoader is used for PDF extraction (replaces previous docling-based pipeline)
-# Requires Java 11+ installed on system PATH
-# See: https://github.com/opendataloader-project/opendataloader-pdf
-
-# Enable hybrid AI mode for complex PDF pages (default: off)
-# Options: off, docling-fast, docling-full
-# Requires: pip install "opendataloader-pdf[hybrid]" and opendataloader-pdf-hybrid server running
-# TDC_AI_HYBRID_MODE=off
-
-# URL for hybrid AI server when enabled (default: http://localhost:5002)
-# TDC_AI_HYBRID_URL=http://localhost:5002
-
-# ============================================================================
-# GRAPH QUERY CONFIGURATION
-# ============================================================================
-
-# Graph query level: simple|medium|advanced (default: simple)
-# simple: Return count and list without synthesis
-# medium: Parse query keywords, filter nodes, generate simple text summary
-# advanced: Use LLM to synthesize answer from graph + embeddings (GraphRAG)
-# TDC_GRAPH_QUERY_LEVEL=simple
-
-# Enable shared embedding storage across workspaces (default: true)
-# TDC_LIGHTRAG_SHARED_STORAGE=true
-
 # ============================================================================
 # ADDITIONAL SERVICES
 # ============================================================================
+0 −1
@@ -212,7 +212,6 @@ def workspace_process(
             result_path = convert_for_wiki(
                 document_id=source_id,
                 wiki_source_dir=wiki_source_dir,
-                source_kind=member.source_kind,
                 profile=extraction_profile,
                 force=force,
             )
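
The convert_for_wiki signature implied by this updated caller — a hypothetical sketch, with parameter types guessed rather than taken from the repo:

from pathlib import Path

def convert_for_wiki(*, document_id: str, wiki_source_dir: Path, profile: str, force: bool = False) -> Path:
    """Route every item through the TDoc pipeline; no source_kind branching."""
    ...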
+1 −86
@@ -3,7 +3,7 @@
 from __future__ import annotations

 from pathlib import Path
-from typing import Annotated, Literal
+from typing import Annotated

 import typer

@@ -53,10 +53,6 @@ IncrementalOption = Annotated[
 ]
 LimitTDocsOption = Annotated[int | None, typer.Option("--limit-tdocs", help="Limit number of TDocs", envvar=ConfigEnvVar.TDC_LIMIT_TDOCS.name)]
 ClearTDocsOption = Annotated[bool, typer.Option("--clear-tdocs", help="Clear all TDocs before crawling")]
-_ = Annotated[
-    int | None,
-    typer.Option("--overall-timeout", help="Maximum total crawl duration in seconds (None = unlimited)", envvar=ConfigEnvVar.TDC_OVERALL_TIMEOUT.name),
-]
 OutputFormatOption = Annotated[str, typer.Option("--output", "-o", help="Output format (table, json, ison, toon, yaml)", envvar=ConfigEnvVar.TDC_OUTPUT.name)]
 EolUsernameOption = Annotated[str | None, typer.Option("--eol-username", help="ETSI Online account username", envvar=ConfigEnvVar.TDC_EOL_USERNAME.name)]
 EolPasswordOption = Annotated[str | None, typer.Option("--eol-password", help="ETSI Online account password", envvar=ConfigEnvVar.TDC_EOL_PASSWORD.name)]
@@ -102,7 +98,6 @@ AgendaPatternExcludeOption = Annotated[
 ClearSpecsOption = Annotated[bool, typer.Option("--clear-specs", help="Clear all specs before crawling")]
 TitleOption = Annotated[str | None, typer.Option("--title", help="Filter by title contains")]
 StatusOption = Annotated[str | None, typer.Option("--status", help="Filter by status")]
-_ = Annotated[list[str] | None, typer.Option("--spec", help="Spec number(s) (dotted or undotted)")]

 SpecFileOption = Annotated[Path | None, typer.Option("--spec-file", help="File with spec numbers")]
 ReleaseOption = Annotated[
@@ -156,83 +151,3 @@ NoProgressOption = Annotated[
    typer.Option("--no-progress", help="Disable progress bar (useful for scripts and CI)"),
]

# Options - AI
SummarizeDocumentArgument = Annotated[str, typer.Argument(help="Document ID to summarize")]
SummarizeWordsOption = Annotated[int, typer.Option("--words", "-w", help="Target/Maximum word count")]
SummarizeForceOption = Annotated[
    bool,
    typer.Option("--force", "-f", help="Force reconversion even if cached"),
]
JsonOutputOption = Annotated[bool, typer.Option("--json", help="Output as JSON")]

ConvertDocumentArgument = Annotated[str, typer.Argument(help="Document ID to convert")]
ConvertOutputOption = Annotated[
    Path | None,
    typer.Option("--output", "-o", help="Output file path (optional, prints to stdout if not specified)"),
]
ConvertForceOption = Annotated[
    bool,
    typer.Option("--force", "-f", help="Force reconversion even if cached"),
]

QueryArgument = Annotated[str | None, typer.Argument(help="Semantic search query")]
WorkspaceNameOption = Annotated[str | None, typer.Option("--workspace", "-w", help="Workspace name")]
EmbeddingTopKOption = Annotated[int, typer.Option("--top-k", "-k", help="Number of embedding results to return")]
QueryMaxWordsOption = Annotated[int, typer.Option("--words", help="Maximum word count for LLM answer")]

_ = Annotated[str | None, typer.Option("--tdoc-id", "-t", help="TDoc ID to process")]
EmbeddingBackendOption = Annotated[
    Literal["torch", "onnx", "openvino"],
    typer.Option(
        "--accelerate",
        "-a",
        help="Embedding backend (torch, onnx, openvino)",
        envvar=ConfigEnvVar.TDC_AI_EMBEDDING_BACKEND.name,
    ),
]
_ = Annotated[str | None, typer.Option("--checkout-path", help="Path to checkout document")]
CheckoutBaseOption = Annotated[str | None, typer.Option("--checkout-base", help="Base path for checkout")]
ProcessAllOption = Annotated[bool, typer.Option("--all", help="Process all documents in workspace")]
ProcessNewOnlyOption = Annotated[bool, typer.Option("--new-only", help="Process only new documents")]
ProcessForceOption = Annotated[bool, typer.Option("--force", help="Force reprocessing")]
ProcessLimitOption = Annotated[int | None, typer.Option("--limit", "-l", help="Limit number of documents to process (for testing)")]

StatusTDocIdOption = Annotated[str | None, typer.Option("--tdoc-id", "-t", help="TDoc ID to check status for")]

WorkspaceNameArgument = Annotated[str, typer.Argument(help="Workspace name")]
WorkspaceActivateArgument = Annotated[str, typer.Argument(help="Workspace name to activate")]
WorkspaceAutoBuildOption = Annotated[
    bool,
    typer.Option("--auto-build", help="Automatically process documents added to this workspace"),
]
WorkspaceActivateOption = Annotated[
    bool,
    typer.Option(
        True,
        "--activate/--no-activate",
        help="Activate workspace after creation",
        envvar=ConfigEnvVar.TDC_AI_WORKSPACE_ACTIVATE.name,
    ),
]

WorkspaceItemsArgument = Annotated[list[str] | None, typer.Argument(..., help="Source item IDs to add (optional if filters provided)")]
WorkspaceKindOption = Annotated[str, typer.Option("--kind", help="Source kind (tdoc, spec, other)")]
WorkspaceCheckoutOption = Annotated[
    bool,
    typer.Option("--checkout/--no-checkout", help="Checkout/download documents if not present"),
]
WorkspaceReleaseOption = Annotated[
    str | None,
    typer.Option("--release", help="Spec release version (e.g., 16.3.0, 17.0.0). Only applies to specs."),
]
WorkspaceLimitOption = Annotated[int | None, typer.Option("--limit", help="Maximum items to add")]
WorkspaceIncludeInactiveOption = Annotated[bool, typer.Option("--include-inactive", help="Include inactive members")]

WorkspaceProcessNewOnlyOption = Annotated[bool, typer.Option("--new-only", help="Process only TDocs not already completed")]
WorkspaceProcessForceOption = Annotated[bool, typer.Option("--force", help="Force reprocessing of all TDocs")]

WorkspacePreserveArtifactsOption = Annotated[
    bool,
    typer.Option("--preserve-artifacts/--no-preserve-artifacts", help="Preserve artifacts"),
]
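
How the surviving Annotated aliases are consumed — an illustrative command, not one from the repo; typer reads the CLI metadata out of the alias:

from typing import Annotated

import typer

OutputFormatOption = Annotated[str, typer.Option("--output", "-o", help="Output format")]

app = typer.Typer()

@app.command()
def list_tdocs(output: OutputFormatOption = "table") -> None:
    typer.echo(f"format={output}")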
+0 −1
@@ -24,7 +24,6 @@ from __future__ import annotations
 from pathlib import Path
 from typing import ClassVar

-
 # Single source of truth for default filenames/dirnames
 DEFAULT_DATABASE_FILENAME = "3gpp_crawler.db"
 DEFAULT_HTTP_CACHE_FILENAME = "http-cache.sqlite3"
+1 −28
"""Environment variable name constants mapped to their corresponding config field paths.

All TDC_*, HTTP_CACHE_*, and LIGHTRAG_* environment variables remain functional
All TDC_* and HTTP_CACHE_* environment variables remain functional
via pydantic's AliasChoices mechanism.

This module provides a StrEnum `ConfigEnvVar` where:
@@ -54,30 +54,8 @@ class ConfigEnvVar(StrEnum):
     TDC_TITLE_LIKE = "crawl.title_like"
     TDC_LIMIT_TDOCS = "crawl.limit"
     TDC_WORKERS = "crawl.workers"
-    # AI/LightRAG (TDC_AI_* and LIGHTRAG_*)
-    TDC_AI_LLM_MODEL = "ai.llm_model"
-    TDC_AI_LLM_API_BASE = "ai.llm_api_base"
-    TDC_AI_LLM_API_KEY = "ai.llm_api_key"
-    TDC_AI_EMBEDDING_MODEL = "ai.embedding_model"
-    TDC_AI_EMBEDDING_API_BASE = "ai.embedding_api_base"
-    TDC_AI_EMBEDDING_API_KEY = "ai.embedding_api_key"
-    TDC_AI_MAX_CHUNK_SIZE = "ai.max_chunk_size"
-    TDC_AI_CHUNK_OVERLAP = "ai.chunk_overlap"
-    TDC_AI_CONVERT_PDF = "ai.convert_pdf"
-    TDC_AI_CONVERT_MD = "ai.convert_md"
-    TDC_AI_VLM = "ai.vlm"
-    TDC_AI_ABSTRACT_MIN_WORDS = "ai.abstract_min_words"
-    TDC_AI_ABSTRACT_MAX_WORDS = "ai.abstract_max_words"
-    TDC_AI_PARALLELISM = "ai.parallelism"
-    TDC_GRAPH_QUERY_LEVEL = "ai.graph_query_level"
-    TDC_LIGHTRAG_SHARED_STORAGE = "ai.lightrag.shared_storage"
-    LIGHTRAG_SHARED_STORAGE = "ai.lightrag.shared_storage"
-    LIGHTRAG_DB_BACKEND = "ai.lightrag.db_backend"
-    # AI-specific (not in settings.py but used in CLI args)
-    TDC_AI_OUTPUT_FORMAT = "ai.output_format"
     TDC_LIMIT_MEETINGS = "crawl.limit_meetings"
     TDC_LIMIT_MEETINGS_PER_SUBWG = "crawl.limit_meetings_per_subwg"
     TDC_OVERALL_TIMEOUT = "crawl.overall_timeout"
     TDC_OUTPUT = "output_format"
     TDC_SOURCE_PATTERN = "crawl.source_pattern"
     TDC_SOURCE_PATTERN_EXCLUDE = "crawl.source_pattern_exclude"
@@ -89,11 +67,6 @@ class ConfigEnvVar(StrEnum):
     TDC_CHECKOUT_DIR = "path.checkout_dir"
     TDC_VERBOSITY = "verbosity"
     TDC_USE_WHATTHESPEC = "http.use_whatthespec"
-    TDC_AI_EMBEDDING_BACKEND = "ai.embedding_backend"
-    TDC_AI_WORKSPACE_ACTIVATE = "ai.workspace_activate"
-    TDC_AI_DEVICE = "ai.device"
-    TDC_AI_NUM_THREADS = "ai.num_threads"
-    TDC_AI_BATCH_SIZE = "ai.batch_size"


 TOML_PATH_TO_ENV_VAR: dict[str, str] = {e.value: e.name for e in ConfigEnvVar}
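
How the AliasChoices mechanism named in the docstring works — a minimal pydantic-settings sketch with illustrative class and field names, not this module's actual settings model:

from pydantic import AliasChoices, Field
from pydantic_settings import BaseSettings

class CrawlSettings(BaseSettings):
    # Either TDC_WORKERS or WORKERS in the environment populates this field,
    # so older env var spellings keep working after a rename.
    workers: int = Field(default=4, validation_alias=AliasChoices("TDC_WORKERS", "WORKERS"))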