Commit a00078aa authored by Jan Reimes
Browse files

refactor(adobe, base, cloudconvert, zamzar): improve conversion methods and validations

* Rename conversion methods to _perform_convert for clarity.
* Implement file size validation in AbstractProvider.
* Remove redundant format checks in provider-specific implementations.
* Update Zamzar provider to use job-based conversion workflow.
parent b474edeb
Loading
Loading
Loading
Loading
+1 −4
Original line number Diff line number Diff line
@@ -55,12 +55,9 @@ class AdobeProvider(AbstractProvider):
        """Return True when the provider is available and configured."""
        return bool(self.client_id and self.client_secret) and self.quota_remaining > 0

    def convert(self, input_path: Path, output_path: Path) -> ConversionResult:
    def _perform_convert(self, input_path: Path, output_path: Path) -> ConversionResult:
        """Convert the input Office document to PDF using Adobe PDF Services."""
        input_format = input_path.suffix.lstrip(".").lower()
        if input_format not in self.supported_formats:
            raise InvalidFormatError(f"Format '{input_format}' is not supported by Adobe.")

        try:
            credentials = ServicePrincipalCredentials(client_id=self.client_id, client_secret=self.client_secret)
            pdf_services = PDFServices(credentials=credentials)
+41 −3
Original line number Diff line number Diff line
@@ -2,10 +2,10 @@

from __future__ import annotations

from dataclasses import dataclass
from pathlib import Path
from typing import Protocol

from pdf_remote_converter.exceptions import FileTooLargeError, InvalidFormatError
from pdf_remote_converter.providers.models import ConversionResult


@@ -28,6 +28,10 @@ class ProviderBackend(Protocol):
    def quota_remaining(self) -> int:
        """Return the remaining quota for the provider."""

    @property
    def max_file_size(self) -> int | None:
        """Return the maximum file size in bytes, or None if no limit.

        Returns:
            The largest input size, in bytes, that the provider accepts, or
            None when the provider does not restrict input size.
        """

    def convert(self, input_path: Path, output_path: Path) -> ConversionResult:
        """Convert an input file to PDF.

        Args:
            input_path: Path of the file to convert.
            output_path: Path for the resulting PDF (presumably where the
                implementation writes it; confirm against the providers).

        Returns:
            A ConversionResult describing the finished conversion.
        """

@@ -35,7 +39,6 @@ class ProviderBackend(Protocol):
        """Return whether the provider is available and configured."""


@dataclass
class AbstractProvider:
    """Base class with shared provider behavior."""

@@ -44,6 +47,24 @@ class AbstractProvider:
    monthly_quota: int
    quota_remaining: int

    def __init__(
        self,
        name: str,
        supported_formats: set[str],
        monthly_quota: int,
        quota_remaining: int,
    ) -> None:
        """Initialize the provider with shared configuration.

        Args:
            name: Provider name; convert() includes it in its error messages.
            supported_formats: Accepted file extensions. convert() compares
                them against the input suffix with the leading dot stripped
                and lower-cased, so entries should be in that form.
            monthly_quota: Total monthly allowance (presumably a number of
                conversions; confirm against callers).
            quota_remaining: Remaining allowance; is_healthy() is True only
                while this is greater than zero.
        """
        self.name = name
        self.supported_formats = supported_formats
        self.monthly_quota = monthly_quota
        self.quota_remaining = quota_remaining

    @property
    def max_file_size(self) -> int | None:
        """Return the maximum file size in bytes, or None if no limit.

        The base implementation always returns None, in which case convert()
        skips the size check. Subclasses override this property to declare a
        limit.
        """
        return None

    def is_healthy(self) -> bool:
        """Return True when the provider is available for use.

        The base check only looks at the quota: the provider is healthy while
        quota_remaining is greater than zero.
        """
        return self.quota_remaining > 0
@@ -51,6 +72,23 @@ class AbstractProvider:
    def convert(self, input_path: Path, output_path: Path) -> ConversionResult:
        """Validate the input file, then convert it to PDF.

        The file extension is checked against ``supported_formats`` and, when
        ``max_file_size`` is not None, the on-disk size is checked against
        that limit. Only then is the work handed to ``_perform_convert()``.

        Args:
            input_path: File to convert; its suffix determines the format.
            output_path: Passed through unchanged to ``_perform_convert()``.

        Returns:
            Whatever ``_perform_convert()`` returns.

        Raises:
            InvalidFormatError: The lower-cased extension is not supported.
            FileTooLargeError: The file is larger than ``max_file_size``.
        """
        extension = input_path.suffix.lstrip(".").lower()
        format_is_supported = extension in self.supported_formats
        if not format_is_supported:
            raise InvalidFormatError(f"Format '{extension}' is not supported by {self.name}.")

        # The size is only read from disk when the provider declares a limit.
        has_limit = self.max_file_size is not None
        if has_limit and input_path.stat().st_size > self.max_file_size:
            raise FileTooLargeError(f"{self.name} supports files up to {self.max_file_size} bytes.")

        return self._perform_convert(input_path, output_path)

    def _perform_convert(self, input_path: Path, output_path: Path) -> ConversionResult:
        """Perform the actual conversion.

        Subclasses must implement this method to provide provider-specific logic.
        convert() calls it only after the format and file size checks have passed.

        Args:
            input_path: The already-validated file to convert.
            output_path: The output path handed to convert().

        Returns:
            A ConversionResult describing the finished conversion.

        Raises:
            NotImplementedError: Always, in this base implementation.
        """
        raise NotImplementedError
+1 −6
Original line number Diff line number Diff line
@@ -12,7 +12,6 @@ from hishel.httpx import SyncCacheClient
from pdf_remote_converter.exceptions import (
    AuthenticationError,
    ConversionError,
    InvalidFormatError,
    ProviderUnavailableError,
    QuotaExceededError,
    RateLimitError,
@@ -40,12 +39,8 @@ class CloudConvertProvider(AbstractProvider):
        """Return True when the provider is available and configured."""
        return bool(self.api_key) and self.quota_remaining > 0

    def convert(self, input_path: Path, output_path: Path) -> ConversionResult:
    def _perform_convert(self, input_path: Path, output_path: Path) -> ConversionResult:
        """Convert the input Office document to PDF using CloudConvert."""
        input_format = input_path.suffix.lstrip(".").lower()
        if input_format not in self.supported_formats:
            raise InvalidFormatError(f"Format '{input_format}' is not supported by CloudConvert.")

        job_payload = {
            "tasks": {
                "import-my-file": {"operation": "import/upload"},
+21 −34
Original line number Diff line number Diff line
@@ -13,8 +13,6 @@ from hishel.httpx import SyncCacheClient
from pdf_remote_converter.exceptions import (
    AuthenticationError,
    ConversionError,
    FileTooLargeError,
    InvalidFormatError,
    ProviderUnavailableError,
    QuotaExceededError,
    RateLimitError,
@@ -28,6 +26,7 @@ class ZamzarProvider(AbstractProvider):
    """Zamzar API implementation."""

    BASE_URL = "https://api.zamzar.com/v1"
    MAX_FILE_SIZE = 1 * 1024 * 1024  # 1MB free tier limit

    def __init__(self, api_key: str, http_client: SyncCacheClient | None = None) -> None:
        """Initialize Zamzar provider with API key and HTTP client."""
@@ -36,44 +35,32 @@ class ZamzarProvider(AbstractProvider):
        self.supported_formats = {"doc", "docx", "xls", "xlsx", "ppt", "pptx"}
        self.monthly_quota = 100
        self.quota_remaining = self.monthly_quota
        self.zamzar_max_file_size = 1 * 1024 * 1024
        self.name = "zamzar"

    @property
    def max_file_size(self) -> int | None:
        """Return the maximum file size in bytes (1MB for Zamzar free tier).

        The value is the class constant MAX_FILE_SIZE (1 * 1024 * 1024 bytes).
        """
        return self.MAX_FILE_SIZE

    def is_healthy(self) -> bool:
        """Report whether the Zamzar provider can be used right now.

        The provider is healthy when an API key is configured and some quota
        remains.
        """
        # Without an API key the quota is irrelevant.
        if not self.api_key:
            return False
        return self.quota_remaining > 0

    def convert(self, input_path: Path, output_path: Path) -> ConversionResult:
    def _perform_convert(self, input_path: Path, output_path: Path) -> ConversionResult:
        """Convert the input Office document to PDF using Zamzar."""
        input_format = input_path.suffix.lstrip(".").lower()
        if input_format not in self.supported_formats:
            raise InvalidFormatError(f"Format '{input_format}' is not supported by Zamzar.")

        file_size = input_path.stat().st_size
        if file_size > self.zamzar_max_file_size:
            raise FileTooLargeError("Zamzar free tier supports files up to 1MB.")

        upload_response = self._request_json(
            "POST",
            f"{self.BASE_URL}/files",
            files={"file": (input_path.name, input_path.read_bytes())},
        )
        upload_data = self._extract_data(upload_response)
        source_file_id = upload_data.get("id")
        if not source_file_id:
            raise ConversionError("Zamzar upload did not return a file id.")

        conversion_response = self._request_json(
        # Create conversion job using /jobs endpoint
        job_response = self._request_json(
            "POST",
            f"{self.BASE_URL}/conversions",
            json={"source_file_id": source_file_id, "target_format": "pdf"},
            f"{self.BASE_URL}/jobs",
            files={"source_file": (input_path.name, input_path.read_bytes())},
            data={"target_format": "pdf"},
        )
        conversion_data = self._extract_data(conversion_response)
        conversion_id = conversion_data.get("id")
        if not conversion_id:
            raise ConversionError("Zamzar conversion did not return a conversion id.")
        job_data = self._extract_data(job_response)
        job_id = job_data.get("id")
        if not job_id:
            raise ConversionError("Zamzar job creation did not return a job id.")

        target_file_id = self._poll_conversion(conversion_id)
        target_file_id = self._poll_job(job_id)
        download_response = self._request_raw(
            "GET",
            f"{self.BASE_URL}/files/{target_file_id}/content",
@@ -83,7 +70,7 @@ class ZamzarProvider(AbstractProvider):
        return ConversionResult(
            output_path=output_path,
            provider=self.name,
            from_cache=bool(upload_response.extensions.get("from_cache")),
            from_cache=bool(job_response.extensions.get("from_cache")),
            credits_used=1,
        )

@@ -125,9 +112,9 @@ class ZamzarProvider(AbstractProvider):
            raise QuotaExceededError(message)
        raise ConversionError(message)

    def _poll_conversion(self, conversion_id: str) -> str:
        """Poll the conversion status until completion and return target file id."""
        poll_url = f"{self.BASE_URL}/conversions/{conversion_id}"
    def _poll_job(self, job_id: int) -> str:
        """Poll the job status until completion and return target file id."""
        poll_url = f"{self.BASE_URL}/jobs/{job_id}"
        for _ in range(60):
            response = self._request_json("GET", poll_url)
            data = self._extract_data(response)