feat(cli): add options for figures, tables, and device handling (7c74dd3c) · Commits · Jan Reimes / 3gpp-crawler

.config/mise/config.toml

+1 −1

Original line number	Diff line number	Diff line
		@@ -90,7 +90,7 @@ run = [
		run = [
		"cls",
		# python & programming skills
		"npx -y skills add https://github.com/jr2804/prompts -a universal -y -s python-ultimate -s code-deduplication, -s coding-discipline -s output-quality",
		"npx -y skills add https://github.com/jr2804/prompts -a universal -y -s python-ultimate -s code-deduplication -s docling-document-intelligence -s coding-discipline -s output-quality",
		"npx -y skills add https://github.com/jiatastic/open-python-skills -a universal -y -s ty-skills -s pydantic",
		"npx -y skills add https://github.com/glaforge/deslopify -a universal -y",

pyproject.toml

+3 −2

Original line number	Diff line number	Diff line
		@@ -19,10 +19,10 @@ dependencies = [
		"beautifulsoup4>=4.14.2",
		"brotli>=1.2.0",
		"convert-lo",
		"doc2txt>=1.0.8",
		# "doc2txt>=1.0.8",
		"hishel>=1.1.8",
		"liteparse>=1.2.0",
		"opendataloader-pdf[hybrid]>=2.2.0",
		"docling>=2.92.0",
		"packaging>=25.0",
		"pandas>=3.0.0",
		"pydantic>=2.12.2",
		@@ -41,6 +41,7 @@ dependencies = [
		"toon-format",
		"pydantic-settings>=2.13.1",
		"niquests>=3.18.4",
		"opencv-python-headless>=4.13.0.92",
		]

		[project.urls]

src/tdoc_crawler/cli/_shared.py

+0 −22

Original line number	Diff line number	Diff line
		@@ -7,8 +7,6 @@ from pathlib import Path
		from rich.progress import BarColumn, MofNCompleteColumn, Progress, SpinnerColumn, TaskID, TextColumn

		from tdoc_crawler.database.base import DocDatabase
		from tdoc_crawler.extraction.hybrid_server import HybridServerConfig, ensure_hybrid_server
		from tdoc_crawler.extraction.profiles import ExtractionProfile
		from tdoc_crawler.logging import get_console
		from tdoc_crawler.specs.operations.checkout import clear_checkout_specs
		from tdoc_crawler.tdocs.operations.checkout import clear_checkout_tdocs
		@@ -84,28 +82,8 @@ def create_progress_bar(description: str, total: float = 100) -> tuple[Progress,
		return progress, task


		def ensure_hybrid_server_for_profile(profile: ExtractionProfile) -> bool:
		"""Ensure hybrid server is available for extraction profiles that require it.

		Only ``ExtractionProfile.ADVANCED`` triggers a server check; every other
		profile returns ``True`` without contacting the server.

		Args:
		profile: Extraction profile selected for the current run.

		Returns:
		``True`` when the profile needs no server or the returned status
		reports ``running``; ``False`` after the failure reason has been
		printed to the console.
		"""
		# Any profile other than ADVANCED short-circuits here.
		if profile != ExtractionProfile.ADVANCED:
		return True

		# Server config with both formula and picture enrichment switched on.
		config = HybridServerConfig(
		enrich_formula=True,
		enrich_picture=True,
		)
		console.print(f"[dim]Ensuring hybrid server for profile '{profile.value}'...[/dim]")
		# auto_start=True is passed to the helper; its progress messages are echoed
		# to the console in dim style. The first element of the returned pair is unused.
		_, status = ensure_hybrid_server(config=config, auto_start=True, progress_callback=lambda message: console.print(f"[dim]{message}[/dim]"))
		if status.running:
		return True

		# status.error may be empty/None, so fall back to a generic reason.
		error = status.error or "unknown startup failure"
		console.print(f"[red]Hybrid server is not available: {error}[/red]")
		return False


		__all__ = [
		"console",
		"create_progress_bar",
		"ensure_hybrid_server_for_profile",
		"handle_clear_options",
		]

src/tdoc_crawler/cli/args.py

+21 −0

Original line number	Diff line number	Diff line
		@@ -196,6 +196,27 @@ ProfileOption = Annotated[
		str,
		typer.Option("--profile", help="Extraction profile: pdf-only, default, or advanced"),
		]
		# --figures: declared as a plain ``str``; the accepted values (embed,
		# reference) are only listed in the help text, so checking them is left to
		# the command that consumes the option.
		FiguresModeOption = Annotated[
		str,
		typer.Option(
		"--figures",
		help="Figure handling: embed (placeholder in markdown) or reference (extract image files)",
		),
		]
		# --tables: plain ``str`` as well; help text names embed and csv as the
		# supported modes.
		TablesModeOption = Annotated[
		str,
		typer.Option(
		"--tables",
		help="Table handling: embed (in markdown) or csv (separate CSV files)",
		),
		]
		# --device: plain ``str``; help text names auto, cpu, cuda and mps.
		DeviceOption = Annotated[
		str,
		typer.Option(
		"--device",
		help="Accelerator device: auto (detect), cpu, cuda, or mps",
		),
		]
		WorkspaceNameOption = Annotated[
		str | None,
		typer.Option("-w", "--workspace", help="Workspace name (default: active workspace)"),

src/tdoc_crawler/cli/workspace/process.py

+34 −9

Original line number	Diff line number	Diff line
		@@ -9,17 +9,20 @@ from typing import Any

		import typer

		from tdoc_crawler.cli._shared import console, create_progress_bar, ensure_hybrid_server_for_profile
		from tdoc_crawler.cli._shared import console, create_progress_bar
		from tdoc_crawler.cli.args import (
		DeviceOption,
		FiguresModeOption,
		MdYamlFrontmatterOption,
		ProcessLimitOption,
		ProfileOption,
		SkipExistingOption,
		TablesModeOption,
		VerbosityOption,
		WorkspaceProcessForceOption,
		)
		from tdoc_crawler.config import resolve_cache_manager
		from tdoc_crawler.extraction.convert import convert_for_wiki
		from tdoc_crawler.extraction.convert import ConversionError, DoclingConfig, convert_for_wiki
		from tdoc_crawler.extraction.profiles import DEFAULT_EXTRACTION_PROFILE, ExtractionProfile
		from tdoc_crawler.logging import DEFAULT_LEVEL as DEFAULT_VERBOSITY
		from tdoc_crawler.logging import get_logger, set_verbosity
		@@ -95,7 +98,7 @@ def _should_skip_member(


		def _read_page_count(json_dir: Path) -> int:
		"""Read the number of pages from opendataloader JSON output.
		"""Read page count from Docling JSON output.

		Returns 0 if the JSON file is not found or cannot be read.
		"""
		@@ -104,7 +107,9 @@ def _read_page_count(json_dir: Path) -> int:
		return 0
		try:
		data = json.loads(json_files[0].read_text(encoding="utf-8"))
		return int(data.get("number of pages", 0))
		# DoclingDocument stores pages as a list
		pages = data.get("pages", [])
		return len(pages) if isinstance(pages, list) else 0
		except json.JSONDecodeError, OSError, ValueError:
		return 0

		@@ -115,11 +120,12 @@ def _process_member(
		extraction_profile: ExtractionProfile,
		force: bool,
		md_yaml_frontmatter: bool,
		docling_config: DoclingConfig,
		) -> tuple[str, bool, bool, int]:
		"""Process a single workspace member.

		Returns:
		Tuple of (source_id, succeeded, failed, page_count).
		Tuple of ``(source_id, succeeded, failed, page_count)``.
		"""
		source_id = member.source_item_id
		wiki_source_dir = wiki_source_dir_base / source_id
		@@ -132,6 +138,7 @@ def _process_member(
		source_kind=member.source_kind,
		profile=extraction_profile,
		force=force,
		docling_config=docling_config,
		)
		if result_path:
		suffix = result_path.suffix.lstrip(".")
		@@ -139,6 +146,10 @@ def _process_member(
		return source_id, True, False, _read_page_count(wiki_source_dir)
		logger.debug("No output for %s", source_id)
		return source_id, False, False, 0
		except (ConversionError, FileNotFoundError) as e:
		console.print(f"[red] Failed {source_id}: {e}[/red]")
		logger.error("Failed to process %s: %s", source_id, e)
		return source_id, False, True, 0
		except Exception as e:
		console.print(f"[red] Failed {source_id}: {e}[/red]")
		logger.error("Failed to process %s: %s", source_id, e)
		@@ -151,6 +162,9 @@ def workspace_process(
		limit: ProcessLimitOption = None,
		skip_existing: SkipExistingOption = False,
		profile: ProfileOption = DEFAULT_EXTRACTION_PROFILE.value,
		figures: FiguresModeOption = "embed",
		tables: TablesModeOption = "embed",
		device: DeviceOption = "auto",
		md_yaml_frontmatter: MdYamlFrontmatterOption = True,
		verbosity: VerbosityOption = str(DEFAULT_VERBOSITY),
		) -> None:
		@@ -168,7 +182,20 @@ def workspace_process(
		console.print(f"[red]Invalid profile '{profile}'. Use: pdf-only, default, advanced[/red]")
		raise typer.Exit(1)

		console.print(f"[yellow]Processing workspace '{normalized}' with profile '{extraction_profile.value}'...[/yellow]")
		# Validate figure/table mode options
		if figures not in ("embed", "reference"):
		console.print(f"[red]Invalid figures mode '{figures}'. Use: embed, reference[/red]")
		raise typer.Exit(1)
		if tables not in ("embed", "csv"):
		console.print(f"[red]Invalid tables mode '{tables}'. Use: embed, csv[/red]")
		raise typer.Exit(1)
		if device not in ("auto", "cpu", "cuda", "mps"):
		console.print(f"[red]Invalid device '{device}'. Use: auto, cpu, cuda, mps[/red]")
		raise typer.Exit(1)

		docling_config = DoclingConfig(figures_mode=figures, tables_mode=tables, device=device)

		console.print(f"[yellow]Processing workspace '{normalized}' with profile '{extraction_profile.value}' (figures={figures}, tables={tables})...[/yellow]")

		try:
		members = list_workspace_members(normalized, include_inactive=False)
		@@ -183,9 +210,6 @@ def workspace_process(
		if limit is not None:
		members = members[:limit]

		if not ensure_hybrid_server_for_profile(extraction_profile):
		raise typer.Exit(1)

		cache_manager = resolve_cache_manager()
		metadata = get_workspace(normalized)
		if metadata is not None:
		@@ -220,6 +244,7 @@ def workspace_process(
		extraction_profile,
		force,
		md_yaml_frontmatter,
		docling_config,
		)
		if succeeded:
		processed += 1