Commit beb28abd authored by Jan Reimes
Browse files

feat(checkout): add support for OTHER source checkout from URLs and local files

* Implemented _checkout_other function to handle both URLs and local files.
* Added content type sniffing and extension guessing for downloaded files.
* Enhanced checkout_single_item to manage OTHER source kind with appropriate handling.
* Introduced tests for OTHER source checkout functionality, covering various scenarios.
parent b0089e48
Loading
Loading
Loading
Loading
+6 −0
Original line number Diff line number Diff line
@@ -43,6 +43,12 @@ tdoc-crawler query --agenda "*atias*" --start-date 2018

3gpp-crawler workspace add 26250:26258 --kind spec

:: add other sources
3gpp-crawler workspace add https://www.itu.int/dms_pubrec/itu-r/rec/bs/R-REC-BS.775-1-199407-S!!PDF-E.pdf --kind other
3gpp-crawler workspace add https://www.itu.int/dms_pub/itu-r/opb/rep/r-rep-bs.2159-6-2013-pdf-e.pdf --kind other
3gpp-crawler workspace add https://en.wikipedia.org/wiki/Ambisonics --kind other


:: overview
3gpp-crawler workspace members

+1 −0
Original line number Diff line number Diff line
@@ -43,6 +43,7 @@ dependencies = [
    "opencv-python-headless>=4.13.0.92",
    "pymupdf>=1.27.2.3",
    "pymupdf4llm>=1.27.2.3",
    "markdownify>=0.14",
    "pdf-remote-converter",
]

+27 −1
Original line number Diff line number Diff line
@@ -8,7 +8,7 @@ from typing import Annotated
import typer

from tdoc_crawler.config import ConfigEnvVar
from tdoc_crawler.extraction.profiles import DeviceType, ExtractionProfile, FiguresMode, TablesMode
from tdoc_crawler.extraction.profiles import DeviceType, ExtractionProfile, FiguresMode, SpreadsheetMode, TablesMode

# Arguments
TDocIdsArgument = Annotated[list[str] | None, typer.Argument(help="TDoc identifiers to query")]
@@ -274,6 +274,32 @@ DryRunOption = Annotated[
    typer.Option("--dry-run", help="Show what would be removed without removing"),
]

# Reusable CLI parameter type for the ``--spreadsheet-mode`` flag.
# The value is typed as the SpreadsheetMode enum; no default is set here, so
# each command that annotates a parameter with this alias supplies its own
# default in its signature.
SpreadsheetModeOption = Annotated[
    SpreadsheetMode,
    typer.Option(
        "--spreadsheet-mode",
        help="Spreadsheet handling: data (extract tables as markdown/CSV) or visual (convert to PDF via LibreOffice)",
    ),
]

# Reusable CLI parameter type for the ``--checkout-prefix`` flag (plain string).
# NOTE: the help text spells out the default ("OTHER_"), but the actual default
# is set by the command signature that uses this alias; keep the two in sync.
CheckoutPrefixOption = Annotated[
    str,
    typer.Option(
        "--checkout-prefix",
        help="Prefix for checkout directory names when adding OTHER sources (default: OTHER_)",
    ),
]

# Reusable CLI parameter type for the ``--depth`` flag (integer).
# ``min``/``max`` restrict accepted values to the range 1..5.
# NOTE: the help text advertises a default of 1, but no default is defined
# here; the consuming command must supply it in its signature.
# NOTE(review): no command using this alias is visible in this view; confirm
# it is wired into a command before relying on it.
DepthOption = Annotated[
    int,
    typer.Option(
        "--depth",
        help="Maximum crawl depth for URL-based OTHER sources (default: 1)",
        min=1,
        max=5,
    ),
]

WorkspaceDirOption = Annotated[
    Path | None,
    typer.Option("--dir", "-d", help="Custom workspace data directory (default: ~/.3gpp-crawler/workspaces/<name>)"),
+23 −0
Original line number Diff line number Diff line
@@ -13,6 +13,7 @@ from tdoc_crawler.cli.args import (
    AgendaPatternExcludeOption,
    AgendaPatternOption,
    AutoCrawlSpecsOption,
    CheckoutPrefixOption,
    DryRunOption,
    EndDateOption,
    IncludeInactiveOption,
@@ -29,6 +30,7 @@ from tdoc_crawler.cli.args import (
)
from tdoc_crawler.config import PathConfig
from tdoc_crawler.database.tdocs import TDocDatabase
from tdoc_crawler.extraction.checkout import checkout_single_item
from tdoc_crawler.models.base import OutputFormat, SortOrder
from tdoc_crawler.models.workspaces import SourceKind
from tdoc_crawler.tdocs.models import TDocQueryConfig
@@ -113,6 +115,7 @@ def workspace_add(
    title: TitlePatternOption = None,
    title_ex: TitlePatternExcludeOption = None,
    auto_crawl_specs: AutoCrawlSpecsOption = True,
    checkout_prefix: CheckoutPrefixOption = "OTHER_",
) -> None:
    """Add documents to a workspace.

@@ -193,6 +196,26 @@ def workspace_add(
                console.print(f"[yellow]Skipped {skipped} invalid spec input(s).[/yellow]")
            items = expanded
        for item in items:
            if source_kind == SourceKind.OTHER:
                try:
                    checked_out, skip_reason = asyncio.run(
                        checkout_single_item(
                            item=item,
                            source_kind=source_kind,
                            checkout_prefix=checkout_prefix,
                            workspace=normalized,
                            checkout=True,
                            release=None,
                            path_config=PathConfig(),
                        )
                    )
                    if checked_out is None:
                        console.print(f"[yellow]  Warning: failed to checkout OTHER item '{item}': {skip_reason}[/yellow]")
                        continue
                    members.append(checked_out)
                except Exception as exc:
                    console.print(f"[yellow]  Warning: failed to checkout OTHER item '{item}': {exc}[/yellow]")
                continue
            resolved_release = _resolve_spec_release_for_add(item, release, source_kind, auto_crawl_specs)
            if resolved_release is None:
                continue
+9 −1
Original line number Diff line number Diff line
@@ -20,6 +20,7 @@ from tdoc_crawler.cli.args import (
    ProfileOption,
    RemoteFallbackOption,
    SkipExistingOption,
    SpreadsheetModeOption,
    TablesModeOption,
    VerbosityOption,
    WorkspaceProcessForceOption,
@@ -30,7 +31,7 @@ from tdoc_crawler.config.workspace_registry import WorkspaceMember
from tdoc_crawler.database.base import DocDatabase
from tdoc_crawler.database.oxyde_models import Specification, TDocMetadata
from tdoc_crawler.extraction.convert import ConversionError, DoclingConfig, convert_for_wiki
from tdoc_crawler.extraction.profiles import DEFAULT_EXTRACTION_PROFILE, DeviceType, ExtractionProfile, FiguresMode, TablesMode
from tdoc_crawler.extraction.profiles import DEFAULT_EXTRACTION_PROFILE, DeviceType, ExtractionProfile, FiguresMode, SpreadsheetMode, TablesMode
from tdoc_crawler.logging import DEFAULT_LEVEL as DEFAULT_VERBOSITY
from tdoc_crawler.logging import get_logger, set_verbosity
from tdoc_crawler.models.workspaces import SourceKind
@@ -256,6 +257,7 @@ def _process_member(
    docling_config: DoclingConfig,
    docx_direct: bool = False,
    remote_fallback: bool = True,
    spreadsheet_mode: SpreadsheetMode = SpreadsheetMode.DATA,
) -> tuple[str, bool, bool, int]:
    """Process a single workspace member.

@@ -274,6 +276,7 @@ def _process_member(
            document_id=source_id,
            wiki_source_dir=wiki_source_dir,
            source_kind=member.source_kind,
            source_path=member.source_path,
            profile=extraction_profile,
            force=force,
            docling_config=docling_config,
@@ -281,6 +284,7 @@ def _process_member(
            md_yaml_frontmatter=md_yaml_frontmatter,
            db_metadata=db_metadata,
            remote_fallback=remote_fallback,
            spreadsheet_mode=spreadsheet_mode,
        )
        suffix = result_path.suffix.lstrip(".")
        logger.debug("%s [%s] → %s", source_id, extraction_profile.value, suffix)
@@ -305,6 +309,7 @@ def _run_processing_loop(
    docling_config: DoclingConfig,
    docx_direct: bool,
    remote_fallback: bool,
    spreadsheet_mode: SpreadsheetMode = SpreadsheetMode.DATA,
) -> tuple[int, int, int, int]:
    """Run the processing loop over workspace members.

@@ -341,6 +346,7 @@ def _run_processing_loop(
                docling_config,
                docx_direct,
                remote_fallback,
                spreadsheet_mode,
            )
            if succeeded:
                processed += 1
@@ -374,6 +380,7 @@ def workspace_process(
    profile: ProfileOption = DEFAULT_EXTRACTION_PROFILE,
    figures: FiguresModeOption = FiguresMode.EMBED,
    tables: TablesModeOption = TablesMode.EMBED,
    spreadsheet_mode: SpreadsheetModeOption = SpreadsheetMode.DATA,
    device: DeviceOption = DeviceType.AUTO,
    docx_direct: DocxDirectOption = False,
    remote_fallback: RemoteFallbackOption = True,
@@ -423,6 +430,7 @@ def workspace_process(
        docling_config,
        docx_direct,
        remote_fallback,
        spreadsheet_mode,
    )

    summary = f"Processing complete: {processed} succeeded, {failed} failed, {skipped} skipped"
Loading