Commit 60126818 authored by Jan Reimes
Browse files

refactor(workspace): rename and update artifact handling

* Update delete_artifacts option to clarify its purpose in workspace_delete.
* Refactor checkout logic to remove references to .ai folders.
* Modify get_cached_pdf_path to accept output_dir and adjust logic accordingly.
* Rename delete_ai_folder to delete_artifact_folder for consistency.
parent cf866811
Loading
Loading
Loading
Loading
+1 −1
Original line number Diff line number Diff line
@@ -118,7 +118,7 @@ def workspace_deactivate() -> None:
def workspace_delete(
    workspace_name: str = typer.Argument(..., help="Workspace name"),
    force: bool = typer.Option(False, "--force", help="Permanently delete workspace and all artifacts"),
    delete_artifacts: bool = typer.Option(False, "--delete-artifacts", help="Delete all .ai artifacts for workspace members"),
    delete_artifacts: bool = typer.Option(False, "--delete-artifacts", help="Delete all workspace artifacts for members"),
    delete_llm_wiki: bool = typer.Option(False, "--delete-llm-wiki", help="Delete the .llm-wiki folder for this workspace"),
) -> None:
    """Permanently delete a workspace and all associated files."""
+3 −4
Original line number Diff line number Diff line
"""Workspace item checkout and registration logic.

Handles the checkout phase of workspace item management:
downloading/fetching documents, setting up .ai folders, and creating
workspace member records.
downloading/fetching documents and creating workspace member records.

This module is intentionally free of document processing logic
(PDF conversion, markdown extraction, VLM). That responsibility
@@ -39,8 +38,8 @@ async def checkout_single_item(
) -> tuple[WorkspaceMember | None, str | None]:
    """Checkout a single workspace item and create a member record.

    Downloads the document if checkout is enabled, sets up the .ai subfolder,
    and returns a WorkspaceMember ready for registration.
    Downloads the document if checkout is enabled and returns a
    WorkspaceMember ready for registration.

    Args:
        item: Item ID to checkout (TDoc ID, spec number, or path).
+6 −10
Original line number Diff line number Diff line
@@ -68,25 +68,23 @@ def is_office_format(source_file: Path) -> bool:
    return source_file.suffix.lower() in OFFICE_FORMATS


def get_cached_pdf_path(source_file: Path) -> Path | None:
def get_cached_pdf_path(source_file: Path, output_dir: Path) -> Path | None:
    """Get the path to a cached PDF conversion if it exists.

    The cached PDF is stored in a `.ai` subdirectory next to the original file.

    Args:
        source_file: Path to the original Office document.
        output_dir: Directory where the cached PDF would be stored.

    Returns:
        Path to cached PDF if it exists, None otherwise.
    """
    ai_dir = source_file.parent / ".ai"
    cached_pdf = ai_dir / f"{source_file.stem}.pdf"
    cached_pdf = output_dir / f"{source_file.stem}.pdf"
    return cached_pdf if cached_pdf.exists() else None


def convert_to_pdf(
    source_file: Path,
    output_dir: Path | None = None,
    output_dir: Path,
    *,
    force: bool = False,
    config: ConverterConfig | None = None,
@@ -100,8 +98,7 @@ def convert_to_pdf(

    Args:
        source_file: Path to the Office document (DOCX, DOC, PPT, etc.)
        output_dir: Optional output directory for the PDF. If None, uses
            the `.ai` subdirectory next to the source file.
        output_dir: Output directory for the PDF.
        force: If True, re-convert even if a cached PDF exists.
        config: Optional converter configuration. If None, uses defaults
            from environment variables.
@@ -120,7 +117,6 @@ def convert_to_pdf(
        raise ConversionError(f"Unsupported file format: {source_file.suffix}")

    config = config or ConverterConfig.from_env()
    output_dir = output_dir or source_file.parent / ".ai"
    output_file = output_dir / f"{source_file.stem}.pdf"

    # Check for cached conversion
@@ -161,7 +157,7 @@ def ensure_pdf(

    Args:
        source_file: Path to source document.
        output_dir: Directory to place the PDF.
        output_dir: Directory to place the PDF (typically workspace sources dir).
        force: Force reconversion.
        config: Optional converter configuration.

+10 −11
Original line number Diff line number Diff line
@@ -8,6 +8,7 @@ from __future__ import annotations

import json
import logging
import tempfile
from pathlib import Path
from typing import Any

@@ -138,6 +139,7 @@ def _ensure_converted(
    force: bool = False,
    config: OpendataloaderConfig | None = None,
    source_pdf: Path | None = None,
    output_dir: Path | None = None,
) -> tuple[str, Path, str]:
    """Fetch TDoc and ensure markdown conversion exists.

@@ -151,10 +153,11 @@ def _ensure_converted(
        source_pdf: If provided, opendataloader processes this PDF instead of the
            original DOCX. This ensures all profiles use the same PDF that was
            generated for the wiki dir.
        output_dir: Directory for conversion artifacts. If None, uses a temp dir.

    Returns:
        Tuple of (raw_markdown, json_path, normalized_id).
        json_path may not exist on disk ÔÇö caller must check.
        json_path may not exist on disk — caller must check.

    Raises:
        ConversionError: If no document files found or conversion fails.
@@ -166,16 +169,17 @@ def _ensure_converted(
    if primary is None:
        raise ConversionError(f"No document files found for {normalized_id}")

    ai_dir = _get_ai_directory(primary)
    md_file = ai_dir / f"{primary.stem}.md"
    json_file = ai_dir / f"{primary.stem}.json"
    artifact_dir = output_dir or Path(tempfile.mkdtemp(prefix="3gpp-conv-"))
    artifact_dir.mkdir(parents=True, exist_ok=True)
    md_file = artifact_dir / f"{primary.stem}.md"
    json_file = artifact_dir / f"{primary.stem}.json"

    if md_file.exists() and not force:
        markdown_content = md_file.read_text(encoding="utf-8")
    else:
        # Use the provided PDF (from wiki dir) or fall back to original source
        input_file = source_pdf if source_pdf is not None and source_pdf.exists() else primary
        markdown_content, _ = _run_opendataloader(input_file, ai_dir, config=config)
        markdown_content, _ = _run_opendataloader(input_file, artifact_dir, config=config)
        md_file.write_text(markdown_content, encoding="utf-8")

    return markdown_content, json_file, normalized_id
@@ -237,7 +241,7 @@ def convert_for_wiki(
        hybrid="docling-fast",
        hybrid_mode="full" if profile == ExtractionProfile.ADVANCED else None,
    )
    markdown_content, json_path, _ = _ensure_converted(document_id, force=force, config=config, source_pdf=pdf_path)
    markdown_content, json_path, _ = _ensure_converted(document_id, force=force, config=config, source_pdf=pdf_path, output_dir=wiki_source_dir)

    # Write markdown to wiki source dir
    md_file = wiki_source_dir / f"{primary.stem}.md"
@@ -323,11 +327,6 @@ def convert_tdoc_metadata(
    return convert_document_to_markdown(document_id, force=force, config=config)


def _get_ai_directory(source_file: Path) -> Path:
    """Get the .ai directory adjacent to the source file."""
    return source_file.parent / ".ai"


def _ensure_hybrid_server_if_needed(config: OpendataloaderConfig) -> None:
    """Auto-start the hybrid server if the config requires hybrid mode.

+1 −5
Original line number Diff line number Diff line
@@ -63,17 +63,13 @@ def fetch_spec_files(spec_number: str, force_download: bool = False) -> SpecFile
def _scan_spec_dir(spec_dir: Path) -> SpecFiles:
    """Scan a spec checkout directory for available document files.

    Searches the directory tree for document files, excluding hidden files
    and the .ai subfolder.
    Searches the directory tree for document files, excluding hidden files.
    """
    files = SpecFiles(checkout_dir=spec_dir)

    if spec_dir.is_dir():
        for file_path in sorted(spec_dir.rglob("*")):
            if file_path.is_file() and not file_path.name.startswith("."):
                # Skip files inside .ai directories
                if ".ai" in file_path.parts:
                    continue
                suffix = file_path.suffix.lower()
                if suffix == ".pdf":
                    files.pdf_path = file_path
Loading