Commit 3a440a93 authored by Jan Reimes

fix(extraction): remove broken spec routing from convert_for_wiki, fix SpecDownloads init

- Remove the _convert_spec_for_wiki function (its nested asyncio.run() call
  plus the spec checkout pipeline hangs on network requests)
- Remove the source_kind parameter from convert_for_wiki (all items go through
  the TDoc pipeline via WhatTheSpec, which now handles unknown statuses)
- Fix the SpecDownloads.__init__() call in checkout.py: it was passing a
  cache_manager_name kwarg that doesn't exist on the constructor
- Update _workspace_commands.py caller to match new signature
parent 09d1f150
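
For context on the first and third bullets, a minimal sketch (illustrative stand-ins, not this repo's code) of why each pattern fails:

import asyncio

async def convert_spec() -> None:
    # Nested sync-over-async bridge: asyncio.run() raises RuntimeError when a
    # loop is already running, and blocking on network I/O at this point can
    # hang the whole pipeline instead of failing fast.
    asyncio.run(asyncio.sleep(0))

try:
    asyncio.run(convert_spec())
except RuntimeError as exc:
    print(exc)  # asyncio.run() cannot be called from a running event loop

class SpecDownloads:
    # Stand-in constructor; the real signature lives in checkout.py.
    def __init__(self, cache_dir: str) -> None:
        self.cache_dir = cache_dir

try:
    SpecDownloads(cache_dir="/tmp", cache_manager_name="specs")  # type: ignore[call-arg]
except TypeError as exc:
    print(exc)  # __init__() got an unexpected keyword argument 'cache_manager_name'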
+0 −80
@@ -19,9 +19,6 @@
 # Checkout directory name (default: checkout)
 # TDC_CHECKOUT_DIRNAME=checkout

-# AI cache directory name (default: lightrag)
-# TDC_AI_CACHE_DIRNAME=lightrag
-
 # ============================================================================
 # ETSI ONLINE (EOL) CREDENTIALS
 # ============================================================================
@@ -90,83 +87,6 @@ TDC_MAX_RETRIES=3
 # Number of parallel subinterpreter workers (default: 4)
 TDC_WORKERS=4

-# ============================================================================
-# AI CONFIGURATION (3GPP-AI)
-# ============================================================================
-# These settings are used by the 3gpp-ai package for document embeddings,
-# knowledge graphs, and LLM-based processing.
-# See: packages/3gpp-ai/docs/config.md
-
-# LLM model in format <provider>/<model_name>
-# Recommended: openrouter/openrouter/free (free tier, no subscription required)
-# API key: <provider-uppercase>_API_KEY (or set TDC_AI_LLM_API_KEY directly)
-TDC_AI_LLM_MODEL=openrouter/openrouter/free
-
-# Optional custom base URL for LLM provider/proxy
-# TDC_AI_LLM_API_BASE=
-
-# Optional API key for LLM provider (overrides default provider-specific env vars)
-# TDC_AI_LLM_API_KEY=
-
-# Embedding model in format <provider>/<model_name>
-# Recommended: ollama/vuongnguyen2212/CodeRankEmbed:latest (CodeRank embedding, self-hosted)
-TDC_AI_EMBEDDING_MODEL=ollama/vuongnguyen2212/CodeRankEmbed:latest
-
-# Maximum tokens per chunk (default: 1000)
-TDC_AI_MAX_CHUNK_SIZE=1000
-
-# Token overlap between chunks (default: 100)
-TDC_AI_CHUNK_OVERLAP=100
-
-# Minimum abstract word count (default: 150)
-TDC_AI_ABSTRACT_MIN_WORDS=150
-
-# Maximum abstract word count (default: 250)
-TDC_AI_ABSTRACT_MAX_WORDS=250
-
-# Number of parallel workers for AI processing (default: 4)
-TDC_AI_PARALLELISM=4
-
-# Convert office documents to PDF during workspace add-members (default: false)
-# Set to "true", "1", or "yes" to enable
-# TDC_AI_CONVERT_PDF=false
-
-# Extract markdown from PDFs during workspace add-members (default: false)
-# When enabled, implies TDC_AI_CONVERT_PDF=true
-# TDC_AI_CONVERT_MD=false
-
-# Enable VLM for figure descriptions and formula enrichment (default: false)
-# Set to "true", "1", or "yes" to enable
-# TDC_AI_VLM=false
-
-# ============================================================================
-# OPENDATALOADER PDF SETTINGS
-# ============================================================================
-# OpenDataLoader is used for PDF extraction (replaces previous docling-based pipeline)
-# Requires Java 11+ installed on system PATH
-# See: https://github.com/opendataloader-project/opendataloader-pdf
-
-# Enable hybrid AI mode for complex PDF pages (default: off)
-# Options: off, docling-fast, docling-full
-# Requires: pip install "opendataloader-pdf[hybrid]" and opendataloader-pdf-hybrid server running
-# TDC_AI_HYBRID_MODE=off
-
-# URL for hybrid AI server when enabled (default: http://localhost:5002)
-# TDC_AI_HYBRID_URL=http://localhost:5002
-
-# ============================================================================
-# GRAPH QUERY CONFIGURATION
-# ============================================================================
-
-# Graph query level: simple|medium|advanced (default: simple)
-# simple: Return count and list without synthesis
-# medium: Parse query keywords, filter nodes, generate simple text summary
-# advanced: Use LLM to synthesize answer from graph + embeddings (GraphRAG)
-# TDC_GRAPH_QUERY_LEVEL=simple
-
-# Enable shared embedding storage across workspaces (default: true)
-# TDC_LIGHTRAG_SHARED_STORAGE=true
-
 # ============================================================================
 # ADDITIONAL SERVICES
 # ============================================================================
+0 −1
@@ -212,7 +212,6 @@ def workspace_process(
             result_path = convert_for_wiki(
                 document_id=source_id,
                 wiki_source_dir=wiki_source_dir,
-                source_kind=member.source_kind,
                 profile=extraction_profile,
                 force=force,
             )
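
The convert_for_wiki signature implied by this updated caller — a hypothetical sketch, with parameter types guessed rather than taken from the repo:

from pathlib import Path

def convert_for_wiki(*, document_id: str, wiki_source_dir: Path, profile: str, force: bool = False) -> Path:
    """Route every item through the TDoc pipeline; no source_kind branching."""
    ...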
+1 −86
@@ -3,7 +3,7 @@
 from __future__ import annotations

 from pathlib import Path
-from typing import Annotated, Literal
+from typing import Annotated

 import typer

@@ -53,10 +53,6 @@ IncrementalOption = Annotated[
 ]
 LimitTDocsOption = Annotated[int | None, typer.Option("--limit-tdocs", help="Limit number of TDocs", envvar=ConfigEnvVar.TDC_LIMIT_TDOCS.name)]
 ClearTDocsOption = Annotated[bool, typer.Option("--clear-tdocs", help="Clear all TDocs before crawling")]
-_ = Annotated[
-    int | None,
-    typer.Option("--overall-timeout", help="Maximum total crawl duration in seconds (None = unlimited)", envvar=ConfigEnvVar.TDC_OVERALL_TIMEOUT.name),
-]
 OutputFormatOption = Annotated[str, typer.Option("--output", "-o", help="Output format (table, json, ison, toon, yaml)", envvar=ConfigEnvVar.TDC_OUTPUT.name)]
 EolUsernameOption = Annotated[str | None, typer.Option("--eol-username", help="ETSI Online account username", envvar=ConfigEnvVar.TDC_EOL_USERNAME.name)]
 EolPasswordOption = Annotated[str | None, typer.Option("--eol-password", help="ETSI Online account password", envvar=ConfigEnvVar.TDC_EOL_PASSWORD.name)]
@@ -102,7 +98,6 @@ AgendaPatternExcludeOption = Annotated[
 ClearSpecsOption = Annotated[bool, typer.Option("--clear-specs", help="Clear all specs before crawling")]
 TitleOption = Annotated[str | None, typer.Option("--title", help="Filter by title contains")]
 StatusOption = Annotated[str | None, typer.Option("--status", help="Filter by status")]
-_ = Annotated[list[str] | None, typer.Option("--spec", help="Spec number(s) (dotted or undotted)")]

 SpecFileOption = Annotated[Path | None, typer.Option("--spec-file", help="File with spec numbers")]
 ReleaseOption = Annotated[
@@ -156,83 +151,3 @@ NoProgressOption = Annotated[
    typer.Option("--no-progress", help="Disable progress bar (useful for scripts and CI)"),
]

# Options - AI
SummarizeDocumentArgument = Annotated[str, typer.Argument(help="Document ID to summarize")]
SummarizeWordsOption = Annotated[int, typer.Option("--words", "-w", help="Target/Maximum word count")]
SummarizeForceOption = Annotated[
    bool,
    typer.Option("--force", "-f", help="Force reconversion even if cached"),
]
JsonOutputOption = Annotated[bool, typer.Option("--json", help="Output as JSON")]

ConvertDocumentArgument = Annotated[str, typer.Argument(help="Document ID to convert")]
ConvertOutputOption = Annotated[
    Path | None,
    typer.Option("--output", "-o", help="Output file path (optional, prints to stdout if not specified)"),
]
ConvertForceOption = Annotated[
    bool,
    typer.Option("--force", "-f", help="Force reconversion even if cached"),
]

QueryArgument = Annotated[str | None, typer.Argument(help="Semantic search query")]
WorkspaceNameOption = Annotated[str | None, typer.Option("--workspace", "-w", help="Workspace name")]
EmbeddingTopKOption = Annotated[int, typer.Option("--top-k", "-k", help="Number of embedding results to return")]
QueryMaxWordsOption = Annotated[int, typer.Option("--words", help="Maximum word count for LLM answer")]

_ = Annotated[str | None, typer.Option("--tdoc-id", "-t", help="TDoc ID to process")]
EmbeddingBackendOption = Annotated[
    Literal["torch", "onnx", "openvino"],
    typer.Option(
        "--accelerate",
        "-a",
        help="Embedding backend (torch, onnx, openvino)",
        envvar=ConfigEnvVar.TDC_AI_EMBEDDING_BACKEND.name,
    ),
]
_ = Annotated[str | None, typer.Option("--checkout-path", help="Path to checkout document")]
CheckoutBaseOption = Annotated[str | None, typer.Option("--checkout-base", help="Base path for checkout")]
ProcessAllOption = Annotated[bool, typer.Option("--all", help="Process all documents in workspace")]
ProcessNewOnlyOption = Annotated[bool, typer.Option("--new-only", help="Process only new documents")]
ProcessForceOption = Annotated[bool, typer.Option("--force", help="Force reprocessing")]
ProcessLimitOption = Annotated[int | None, typer.Option("--limit", "-l", help="Limit number of documents to process (for testing)")]

StatusTDocIdOption = Annotated[str | None, typer.Option("--tdoc-id", "-t", help="TDoc ID to check status for")]

WorkspaceNameArgument = Annotated[str, typer.Argument(help="Workspace name")]
WorkspaceActivateArgument = Annotated[str, typer.Argument(help="Workspace name to activate")]
WorkspaceAutoBuildOption = Annotated[
    bool,
    typer.Option("--auto-build", help="Automatically process documents added to this workspace"),
]
WorkspaceActivateOption = Annotated[
    bool,
    typer.Option(
        True,
        "--activate/--no-activate",
        help="Activate workspace after creation",
        envvar=ConfigEnvVar.TDC_AI_WORKSPACE_ACTIVATE.name,
    ),
]

WorkspaceItemsArgument = Annotated[list[str] | None, typer.Argument(..., help="Source item IDs to add (optional if filters provided)")]
WorkspaceKindOption = Annotated[str, typer.Option("--kind", help="Source kind (tdoc, spec, other)")]
WorkspaceCheckoutOption = Annotated[
    bool,
    typer.Option("--checkout/--no-checkout", help="Checkout/download documents if not present"),
]
WorkspaceReleaseOption = Annotated[
    str | None,
    typer.Option("--release", help="Spec release version (e.g., 16.3.0, 17.0.0). Only applies to specs."),
]
WorkspaceLimitOption = Annotated[int | None, typer.Option("--limit", help="Maximum items to add")]
WorkspaceIncludeInactiveOption = Annotated[bool, typer.Option("--include-inactive", help="Include inactive members")]

WorkspaceProcessNewOnlyOption = Annotated[bool, typer.Option("--new-only", help="Process only TDocs not already completed")]
WorkspaceProcessForceOption = Annotated[bool, typer.Option("--force", help="Force reprocessing of all TDocs")]

WorkspacePreserveArtifactsOption = Annotated[
    bool,
    typer.Option("--preserve-artifacts/--no-preserve-artifacts", help="Preserve artifacts"),
]
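
How the surviving Annotated aliases are consumed — an illustrative command, not one from the repo; typer reads the CLI metadata out of the alias:

from typing import Annotated

import typer

OutputFormatOption = Annotated[str, typer.Option("--output", "-o", help="Output format")]

app = typer.Typer()

@app.command()
def list_tdocs(output: OutputFormatOption = "table") -> None:
    typer.echo(f"format={output}")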
+0 −1
@@ -24,7 +24,6 @@ from __future__ import annotations
 from pathlib import Path
 from typing import ClassVar

-
 # Single source of truth for default filenames/dirnames
 DEFAULT_DATABASE_FILENAME = "3gpp_crawler.db"
 DEFAULT_HTTP_CACHE_FILENAME = "http-cache.sqlite3"
+1 −28
"""Environment variable name constants mapped to their corresponding config field paths.

All TDC_*, HTTP_CACHE_*, and LIGHTRAG_* environment variables remain functional
All TDC_* and HTTP_CACHE_* environment variables remain functional
via pydantic's AliasChoices mechanism.

This module provides a StrEnum `ConfigEnvVar` where:
@@ -54,30 +54,8 @@ class ConfigEnvVar(StrEnum):
     TDC_TITLE_LIKE = "crawl.title_like"
     TDC_LIMIT_TDOCS = "crawl.limit"
     TDC_WORKERS = "crawl.workers"
-    # AI/LightRAG (TDC_AI_* and LIGHTRAG_*)
-    TDC_AI_LLM_MODEL = "ai.llm_model"
-    TDC_AI_LLM_API_BASE = "ai.llm_api_base"
-    TDC_AI_LLM_API_KEY = "ai.llm_api_key"
-    TDC_AI_EMBEDDING_MODEL = "ai.embedding_model"
-    TDC_AI_EMBEDDING_API_BASE = "ai.embedding_api_base"
-    TDC_AI_EMBEDDING_API_KEY = "ai.embedding_api_key"
-    TDC_AI_MAX_CHUNK_SIZE = "ai.max_chunk_size"
-    TDC_AI_CHUNK_OVERLAP = "ai.chunk_overlap"
-    TDC_AI_CONVERT_PDF = "ai.convert_pdf"
-    TDC_AI_CONVERT_MD = "ai.convert_md"
-    TDC_AI_VLM = "ai.vlm"
-    TDC_AI_ABSTRACT_MIN_WORDS = "ai.abstract_min_words"
-    TDC_AI_ABSTRACT_MAX_WORDS = "ai.abstract_max_words"
-    TDC_AI_PARALLELISM = "ai.parallelism"
-    TDC_GRAPH_QUERY_LEVEL = "ai.graph_query_level"
-    TDC_LIGHTRAG_SHARED_STORAGE = "ai.lightrag.shared_storage"
-    LIGHTRAG_SHARED_STORAGE = "ai.lightrag.shared_storage"
-    LIGHTRAG_DB_BACKEND = "ai.lightrag.db_backend"
-    # AI-specific (not in settings.py but used in CLI args)
-    TDC_AI_OUTPUT_FORMAT = "ai.output_format"
     TDC_LIMIT_MEETINGS = "crawl.limit_meetings"
     TDC_LIMIT_MEETINGS_PER_SUBWG = "crawl.limit_meetings_per_subwg"
     TDC_OVERALL_TIMEOUT = "crawl.overall_timeout"
     TDC_OUTPUT = "output_format"
     TDC_SOURCE_PATTERN = "crawl.source_pattern"
     TDC_SOURCE_PATTERN_EXCLUDE = "crawl.source_pattern_exclude"
@@ -89,11 +67,6 @@ class ConfigEnvVar(StrEnum):
     TDC_CHECKOUT_DIR = "path.checkout_dir"
     TDC_VERBOSITY = "verbosity"
     TDC_USE_WHATTHESPEC = "http.use_whatthespec"
-    TDC_AI_EMBEDDING_BACKEND = "ai.embedding_backend"
-    TDC_AI_WORKSPACE_ACTIVATE = "ai.workspace_activate"
-    TDC_AI_DEVICE = "ai.device"
-    TDC_AI_NUM_THREADS = "ai.num_threads"
-    TDC_AI_BATCH_SIZE = "ai.batch_size"


 TOML_PATH_TO_ENV_VAR: dict[str, str] = {e.value: e.name for e in ConfigEnvVar}
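
How the AliasChoices mechanism named in the docstring works — a minimal pydantic-settings sketch with illustrative class and field names, not this module's actual settings model:

from pydantic import AliasChoices, Field
from pydantic_settings import BaseSettings

class CrawlSettings(BaseSettings):
    # Either TDC_WORKERS or WORKERS in the environment populates this field,
    # so older env var spellings keep working after a rename.
    workers: int = Field(default=4, validation_alias=AliasChoices("TDC_WORKERS", "WORKERS"))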