refactor(ai): deduplicate release normalization, extract check_pdf_status, add type safety (115e2321) · Commits · Jan Reimes / 3gpp-crawler

.config/mise/config.toml

+3 −0

Original line number	Diff line number	Diff line
		@@ -4,6 +4,7 @@
		"github:djinn-soul/CytoScnPy" = "latest"
		"github:AlmogBaku/debug-skill" = "latest"
		"github:rtk-ai/rtk" = "latest"
		#"github:arabold/docs-mcp-server" = "latest"

		"ruff" = "latest"
		"ty" = "latest"
		@@ -80,6 +81,7 @@ shell = "pwsh -NoProfile -Command"
		run = [
		'cls',
		'bun x add-mcp -y -a {{usage.ai_agent}} "grepai mcp-serve"',
		#'bun x add-mcp -y -a {{usage.ai_agent}} -n docs-mcp-server "grepai mcp-serve"',
		'bun x add-mcp -y -a {{usage.ai_agent}} -n cytoscnpy-mcp "cytoscnpy mcp-server"'
		]

		@@ -108,6 +110,7 @@ run = [
		"bun x skills add https://github.com/AlmogBaku/debug-skill -a universal -y", # for dab tool -> AlmogBaku/debug-skill
		"bun x skills add yoanbernabeu/grepai-skills -a universal -y",
		"bun x skills add AlmogBaku/debug-skill -a universal -y",
		"bun x skills add https://github.com/arabold/docs-mcp-server -a universal -y",

		# 3GPP skills (TODO: fix - requires well-known endpoint)
		"bun x skills add https://forge.3gpp.org/rep/reimes/awesome-3gpp-skills/-/tree/main/skills -a universal -y",

packages/3gpp-ai/threegpp_ai/args.py

+9 −2

Original line number	Diff line number	Diff line
		@@ -3,12 +3,19 @@
		from __future__ import annotations

		from pathlib import Path
		from typing import Annotated
		from typing import Annotated, Literal

		import typer

		# Common
		JsonOutputOption = Annotated[bool, typer.Option("--json", help="Output as JSON")]
		# Shared ``--output-format`` CLI option. The value is restricted to the
		# literals "text", "json" or "yaml" and may also be supplied through the
		# TDC_AI_OUTPUT_FORMAT environment variable.
		OutputFormatOption = Annotated[
		Literal["text", "json", "yaml"],
		typer.Option(
		"--output-format",
		help="Output format: 'text' (default), 'json', or 'yaml'",
		envvar="TDC_AI_OUTPUT_FORMAT",
		),
		]
		CacheDirOption = Annotated[
		Path \| None,
		typer.Option("--cache-dir", "-c", help="Cache directory", envvar="TDC_CACHE_DIR"),

packages/3gpp-ai/threegpp_ai/cli.py

+46 −23

Original line number	Diff line number	Diff line
		@@ -23,6 +23,7 @@ from tdoc_crawler.logging import get_console, get_logger
		from tdoc_crawler.models.base import OutputFormat, SortOrder
		from tdoc_crawler.tdocs.models import TDocQueryConfig
		from tdoc_crawler.utils.date_parser import parse_partial_date
		from tdoc_crawler.utils.normalization import normalize_release_version

		from threegpp_ai import (
		SourceKind,
		@@ -54,7 +55,7 @@ from threegpp_ai.args import (
		ConvertOutputOption,
		ConvertPdfOption,
		EndDateOption,
		JsonOutputOption,
		OutputFormatOption,
		SourcePatternExcludeOption,
		SourcePatternOption,
		StartDateOption,
		@@ -85,7 +86,8 @@ from threegpp_ai.operations.classify import pick_main_document
		from threegpp_ai.operations.conversion import OFFICE_FORMATS, convert_to_pdf
		from threegpp_ai.operations.convert import convert_document_to_markdown
		from threegpp_ai.operations.extraction import extract_document_structured
		from threegpp_ai.operations.workspace_registry import WorkspaceRegistry
		from threegpp_ai.operations.workspace_registry import WorkspaceRegistry, normalize_spec_member_id
		from threegpp_ai.operations.workspace_utils import check_pdf_status

		# Load environment variables from .env file
		load_dotenv()
		@@ -214,6 +216,18 @@ def _resolve_workspace_name(workspace: str \| None) -> str:
		raise typer.Exit(1)


		def _get_relative_path(source_path: str, base_path: Path) -> str:
		"""Convert absolute path to relative path from base, or return original if not under base."""
		try:
		source = Path(source_path)
		base = Path(base_path)
		# Try to compute relative path
		return str(source.relative_to(base))
		except ValueError:
		# Path is not relative to base, return as-is
		return source_path


		def _resolve_workspace_items(
		*,
		items: list[str] \| None,
		@@ -341,8 +355,10 @@ async def _process_single_item(
		except Exception as e:
		_logger.debug("Failed to extract markdown for %s: %s", item, e)

		resolved_release = await resolve_spec_release_from_db(item, release) if source_kind == SourceKind.SPEC and release else None
		source_item_id = f"{item}-REL{resolved_release}" if resolved_release else item
		resolved_release = None
		if source_kind == SourceKind.SPEC and release:
		resolved_release, _ = await resolve_spec_release_from_db(item, release)
		source_item_id = f"{item}-REL{normalize_release_version(resolved_release)}" if resolved_release else item
		member = make_workspace_member(workspace, source_item_id, source_path, source_kind)
		return member, None, was_converted, was_md_extracted

		@@ -544,7 +560,7 @@ def ai_convert(
		document_id: ConvertDocumentArgument,
		output: ConvertOutputOption = None,
		force: ConvertForceOption = False,
		json_output: JsonOutputOption = False,
		output_format: OutputFormatOption = "text",
		) -> None:
		"""Convert one TDoc and optionally persist markdown output."""
		markdown_or_path = asyncio.run(
		@@ -552,7 +568,7 @@ def ai_convert(
		)

		if output:
		if json_output:
		if output_format == "json":
		_print_output(
		{"output": str(output)},
		OutputFormat.JSON,
		@@ -562,7 +578,7 @@ def ai_convert(
		console.print(f"[green]Converted {document_id} to {output}[/green]")
		return

		if json_output:
		if output_format == "json":
		_print_output(
		{"markdown": markdown_or_path},
		OutputFormat.JSON,
		@@ -578,14 +594,14 @@ def workspace_create(
		name: WorkspaceNameArgument,
		auto_build: WorkspaceAutoBuildOption = False,
		activate: WorkspaceActivateOption = True,
		json_output: JsonOutputOption = False,
		output_format: OutputFormatOption = "text",
		) -> None:
		registry = create_workspace(name, auto_build=auto_build)
		workspace = registry.get_workspace(name)
		if activate:
		set_active_workspace(name)

		if json_output:
		if output_format == "json":
		_print_output(
		{"name": workspace.name if workspace else name, "auto_build": auto_build},
		OutputFormat.JSON,
		@@ -600,7 +616,7 @@ def workspace_create(

		@workspace_app.command("list", help="List all workspaces")
		def workspace_list(
		json_output: JsonOutputOption = False,
		output_format: OutputFormatOption = "text",
		) -> None:
		registry = WorkspaceRegistry.load()
		workspaces = registry.list_workspaces()
		@@ -616,7 +632,7 @@ def workspace_list(
		for entry in workspaces
		]

		if json_output:
		if output_format == "json":
		_print_output(
		workspace_rows,
		OutputFormat.JSON,
		@@ -651,7 +667,7 @@ def workspace_query(
		),
		] = QueryMode.HYBRID,
		workspace: WorkspaceNameOption = None,
		json_output: JsonOutputOption = False,
		output_format: OutputFormatOption = "text",
		) -> None:
		"""Query the LightRAG knowledge graph."""
		workspace_name = _resolve_workspace_name(workspace)
		@@ -672,7 +688,7 @@ def workspace_query(

		result = asyncio.run(_run())

		if json_output:
		if output_format == "json":
		_print_output(
		{"query": query, "mode": mode.value, "result": result},
		OutputFormat.JSON,
		@@ -739,7 +755,7 @@ def workspace_status(
		@workspace_app.command("info", help="Show detailed information about a workspace")
		def workspace_info(
		name: WorkspaceNameArgument,
		json_output: JsonOutputOption = False,
		output_format: OutputFormatOption = "text",
		) -> None:
		workspace = get_workspace(name)
		if workspace is None:
		@@ -747,7 +763,7 @@ def workspace_info(
		raise typer.Exit(1)

		counts = get_workspace_member_counts(name)
		if json_output:
		if output_format == "json":
		_print_output(
		{"name": workspace.name, "auto_build": workspace.auto_build, "member_counts": counts},
		OutputFormat.JSON,
		@@ -932,7 +948,7 @@ def workspace_add_members(
		def workspace_list_members(
		workspace: WorkspaceNameOption = None,
		include_inactive: WorkspaceIncludeInactiveOption = False,
		json_output: JsonOutputOption = False,
		output: ProvidersOutputOption = OutputFormat.TABLE,
		) -> None:
		workspace_name = _resolve_workspace_name(workspace)

		@@ -942,9 +958,12 @@ def workspace_list_members(
		console.print(f"[red]Workspace '{workspace_name}' not found[/red]")
		raise typer.Exit(1)

		manager = resolve_cache_manager()
		checkout_base = manager.checkout_dir

		member_rows = [
		{
		"source_item_id": entry.source_item_id,
		"source_item_id": normalize_spec_member_id(entry.source_item_id),
		"source_path": entry.source_path,
		"source_kind": entry.source_kind.value,
		"is_active": entry.is_active,
		@@ -953,31 +972,35 @@ def workspace_list_members(
		for entry in members
		]

		if json_output:
		if output is not OutputFormat.TABLE:
		_print_output(
		member_rows,
		OutputFormat.JSON,
		output,
		table_title=f"Members: {workspace_name}",
		)
		return

		table_rows = [
		{
		"idx": str(i + 1),
		"source_id": row["source_item_id"],
		"kind": row["source_kind"],
		"path": row["source_path"],
		"path": _get_relative_path(str(row["source_path"]), checkout_base),
		"pdf": check_pdf_status(str(row["source_path"])),
		"active": "Yes" if row["is_active"] else "No",
		}
		for row in member_rows
		for i, row in enumerate(member_rows)
		]
		_print_output(
		table_rows,
		OutputFormat.TABLE,
		table_title=f"Members: {workspace_name}",
		table_columns=[
		TableColumnSpec("idx", "#", style="dim"),
		TableColumnSpec("source_id", "Source ID", style="cyan"),
		TableColumnSpec("kind", "Kind", style="green"),
		TableColumnSpec("path", "Path", style="white"),
		TableColumnSpec("pdf", "PDF", style="blue"),
		TableColumnSpec("active", "Active", style="yellow"),
		],
		)
		@@ -988,7 +1011,7 @@ def workspace_process(
		workspace: WorkspaceNameOption = None,
		force: WorkspaceProcessForceOption = False,
		limit: WorkspaceLimitOption = None,
		json_output: JsonOutputOption = False,
		output_format: OutputFormatOption = "text",
		) -> None:
		workspace_name = _resolve_workspace_name(workspace)

		@@ -1039,7 +1062,7 @@ def workspace_process(
		"results": results,
		}

		if json_output:
		if output_format == "json":
		_print_output(payload, OutputFormat.JSON, table_title="Workspace Process")
		return

packages/3gpp-ai/threegpp_ai/lightrag/cli.py

+6 −6

Original line number	Diff line number	Diff line
		@@ -9,7 +9,7 @@ from __future__ import annotations

		import asyncio
		import json
		from typing import Annotated
		from typing import Annotated, Literal

		import typer
		from rich.console import Console
		@@ -37,10 +37,10 @@ def query_graph(
		str,
		typer.Option("--workspace", "-w", help="Workspace name"),
		] = "default",
		json_output: Annotated[
		bool,
		typer.Option("--json", "-j", help="Output raw JSON"),
		] = False,
		output_format: Annotated[
		Literal["text", "json", "yaml"],
		typer.Option("--output-format", help="Output format: 'text' (default), 'json', or 'yaml'"),
		] = "text",
		) -> None:
		"""Query the LightRAG knowledge graph.

		@@ -62,7 +62,7 @@ def query_graph(

		result = asyncio.run(_run())

		if json_output:
		if output_format == "json":
		typer.echo(json.dumps({"query": query, "mode": mode.value, "result": result}))
		else:
		console.print(f"\n[bold]Query:[/bold] {query}")

packages/3gpp-ai/threegpp_ai/operations/extraction.py

+9 −3

Original line number	Diff line number	Diff line
		@@ -17,7 +17,7 @@ from typing import Any

		from convert_lo import LibreOfficeFormat
		from convert_lo.converter import Converter
		from kreuzberg import extract_file_sync
		from kreuzberg import ExtractionConfig, ImageExtractionConfig, PdfConfig, extract_file_sync
		from tdoc_crawler.logging import get_logger

		from threegpp_ai.models import ConversionError, ExtractionError
		@@ -40,6 +40,12 @@ OFFICE_FORMATS = {".doc", ".docx", ".ppt", ".pptx", ".xls", ".xlsx"}
		# All supported formats (PDF + Office formats + text files)
		SUPPORTED_FORMATS = {".pdf", ".txt", ".md"} \| OFFICE_FORMATS

		# Kreuzberg config that enables image extraction from PDFs (disabled by default)
		# Module-level so the same settings are passed as ``config=`` to every
		# ``extract_file_sync`` call in this module.
		_EXTRACTION_CONFIG = ExtractionConfig(
		images=ImageExtractionConfig(),
		pdf_options=PdfConfig(extract_images=True),
		)


		def _is_supported_format(file_path: Path) -> bool:
		"""Check if a file format is supported for extraction."""
		@@ -164,7 +170,7 @@ def _convert_via_libreoffice(
		converter = Converter()
		with tempfile.TemporaryDirectory() as tmpdir:
		conversion_result = converter.convert(file_path, LibreOfficeFormat.PDF, Path(tmpdir))
		result = extract_file_sync(str(conversion_result.output_path))
		result = extract_file_sync(str(conversion_result.output_path), config=_EXTRACTION_CONFIG)
		return _build_structured_from_result(
		result,
		ai_dir=ai_dir,
		@@ -265,7 +271,7 @@ def extract_document_structured(

		# No valid cache found, perform fresh extraction
		if file_path.suffix.lower() == ".pdf" or file_path.suffix.lower() in {".txt", ".md"}:
		result = extract_file_sync(str(file_path))
		result = extract_file_sync(str(file_path), config=_EXTRACTION_CONFIG)
		extraction = _build_structured_from_result(
		result,
		ai_dir=ai_dir,