Commit c5a40772 authored by Jan Reimes's avatar Jan Reimes
Browse files

feat(conversion): add remote PDF conversion fallback support

* Introduce remote PDF conversion options in .env.example.
* Update workspace.md to document --remote-fallback CLI option.
* Add RemoteFallbackOption to args.py for command-line interface.
* Implement remote conversion logic in convert.py with fallback handling.
parent 72288d43
Loading
Loading
Loading
Loading
+32 −0
Original line number Diff line number Diff line
@@ -103,6 +103,38 @@ TDC_WORKERS=4
# Sign up at: https://huggingface.co/
# HF_TOKEN=

# ============================================================================
# PDF REMOTE CONVERTER (optional fallback when LibreOffice fails)
# ============================================================================
# At least one provider must be configured to enable remote conversion.
# Providers are tried in failover order: CloudConvert -> Adobe -> Zamzar.
# Install: uv add pdf-remote-converter (not needed if it is already listed as a dependency in pyproject.toml)

# Generic fallback API key (used when provider-specific key is missing)
# PDF_REMOTE_CONVERTER_API_KEY=

# Default provider name (cloudconvert, adobe, or zamzar)
# PDF_REMOTE_CONVERTER_DEFAULT_PROVIDER=cloudconvert

# Enable/disable remote PDF fallback when LibreOffice fails (default: true)
# TDC_REMOTE_FALLBACK=true

# HTTP cache directory for remote converter
# PDF_REMOTE_CONVERTER_CACHE_DIR=~/.cache/pdf-remote-converter

# --- CloudConvert (300 credits/month, no file size limit) ---
# Sign up at: https://cloudconvert.com/
# CLOUDCONVERT_API_KEY=

# --- Adobe PDF Services (500 credits/month, no file size limit) ---
# Sign up at: https://developer.adobe.com/document-services
# ADOBE_CLIENT_ID=
# ADOBE_CLIENT_SECRET=

# --- Zamzar (100 conversions/month, 1 MB free-tier limit) ---
# Sign up at: https://developers.zamzar.com/
# ZAMZAR_API_KEY=

# ============================================================================
# NOTE
# ============================================================================
+26 −0
Original line number Diff line number Diff line
@@ -93,6 +93,32 @@ Extract structured data from all workspace members. This is the core pipeline

By default (`--skip-existing`), existing output is preserved. Use `--force` to overwrite.

### `--remote-fallback` / `--no-remote-fallback`

Controls whether an online PDF conversion service is used when LibreOffice fails (e.g. for large or complex documents). Default: **enabled**.

When enabled, the processing pipeline tries LibreOffice first. If that fails with a `ConversionError`, and at least one remote provider has API keys configured, the document is sent to the online service as a fallback. A warning is logged when the fallback triggers.

Non-`ConversionError` failures (e.g. missing input file) propagate immediately without triggering the fallback.

To use the fallback, configure at least one provider in your environment (see `.env.example`):

| Provider | Env Vars | Free Quota | File Size Limit |
|----------|----------|------------|-----------------|
| CloudConvert | `CLOUDCONVERT_API_KEY` | 300 credits/mo | None |
| Adobe | `ADOBE_CLIENT_ID` + `ADOBE_CLIENT_SECRET` | 500 credits/mo | None |
| Zamzar | `ZAMZAR_API_KEY` | 100 conversions/mo | 1 MB (free tier) |

Providers are tried in failover order: CloudConvert → Adobe → Zamzar.

```bash
# Disable fallback (LibreOffice only)
3gpp-crawler workspace process --no-remote-fallback

# Or via environment variable
TDC_REMOTE_FALLBACK=false 3gpp-crawler workspace process
```

### `workspace clear-invalid`

Remove members whose source path no longer exists on disk.
+2 −0
Original line number Diff line number Diff line
@@ -43,6 +43,7 @@ dependencies = [
    "opencv-python-headless>=4.13.0.92",
    "pymupdf>=1.27.2.3",
    "pymupdf4llm>=1.27.2.3",
    "pdf-remote-converter",
]

[project.urls]
@@ -120,3 +121,4 @@ convert-lo = { workspace = true }
pool-executors = { workspace = true }
toon-format = { git = "https://github.com/toon-format/toon-python.git" }
doc2txt = { git = "https://github.com/Quantatirsk/doc2txt-pypi.git" }
pdf-remote-converter = { git = "https://forge.3gpp.org/rep/reimes/pdf-remote-converter" }
+8 −0
Original line number Diff line number Diff line
@@ -253,6 +253,14 @@ DocxDirectOption = Annotated[
        envvar=ConfigEnvVar.TDC_DOCX_DIRECT.name,
    ),
]
# Boolean CLI option exposed as the flag pair
# --remote-fallback / --no-remote-fallback. The value can also be supplied
# through the environment variable named by ConfigEnvVar.TDC_REMOTE_FALLBACK.
# The default is not set here; each command that uses this alias provides it
# in its own signature.
RemoteFallbackOption = Annotated[
    bool,
    typer.Option(
        "--remote-fallback/--no-remote-fallback",
        help="Use online PDF service when LibreOffice fails (requires API keys, see .env.example)",
        envvar=ConfigEnvVar.TDC_REMOTE_FALLBACK.name,
    ),
]
WorkspaceNameOption = Annotated[
    str | None,
    typer.Option("-w", "--workspace", help="Workspace name (default: active workspace)", envvar=ConfigEnvVar.TDC_WORKSPACE.name),
+7 −0
Original line number Diff line number Diff line
@@ -18,6 +18,7 @@ from tdoc_crawler.cli.args import (
    MdYamlFrontmatterOption,
    ProcessLimitOption,
    ProfileOption,
    RemoteFallbackOption,
    SkipExistingOption,
    TablesModeOption,
    VerbosityOption,
@@ -254,6 +255,7 @@ def _process_member(
    md_yaml_frontmatter: bool,
    docling_config: DoclingConfig,
    docx_direct: bool = False,
    remote_fallback: bool = True,
) -> tuple[str, bool, bool, int]:
    """Process a single workspace member.

@@ -278,6 +280,7 @@ def _process_member(
            docx_direct=docx_direct,
            md_yaml_frontmatter=md_yaml_frontmatter,
            db_metadata=db_metadata,
            remote_fallback=remote_fallback,
        )
        suffix = result_path.suffix.lstrip(".")
        logger.debug("%s [%s] → %s", source_id, extraction_profile.value, suffix)
@@ -301,6 +304,7 @@ def _run_processing_loop(
    md_yaml_frontmatter: bool,
    docling_config: DoclingConfig,
    docx_direct: bool,
    remote_fallback: bool,
) -> tuple[int, int, int, int]:
    """Run the processing loop over workspace members.

@@ -336,6 +340,7 @@ def _run_processing_loop(
                md_yaml_frontmatter,
                docling_config,
                docx_direct,
                remote_fallback,
            )
            if succeeded:
                processed += 1
@@ -371,6 +376,7 @@ def workspace_process(
    tables: TablesModeOption = TablesMode.EMBED,
    device: DeviceOption = DeviceType.AUTO,
    docx_direct: DocxDirectOption = False,
    remote_fallback: RemoteFallbackOption = True,
    md_yaml_frontmatter: MdYamlFrontmatterOption = True,
    verbosity: VerbosityOption = str(DEFAULT_VERBOSITY),
) -> None:
@@ -416,6 +422,7 @@ def workspace_process(
        md_yaml_frontmatter,
        docling_config,
        docx_direct,
        remote_fallback,
    )

    summary = f"Processing complete: {processed} succeeded, {failed} failed, {skipped} skipped"
Loading