Commit ac47c0b0 authored by Jan Reimes
Browse files

feat(checkout): enhance checkout process with forced download and extraction

- Implement forced download and extraction of TDocs when specified.
- When forced, remove the TDoc's existing extract directory, zip, and downloaded file from the cache's checkout directory before downloading.
- Update prepare_tdoc_file to return the extraction directory if requested.
- Improve logging for checked out TDocs.
parent 186685ec
Loading
Loading
Loading
Loading
+36 −12
Original line number Diff line number Diff line
@@ -8,7 +8,9 @@ directory structure as the server.
from __future__ import annotations

import logging
import zipfile
import posixpath
import shutil
from contextlib import suppress
from pathlib import Path
from urllib.parse import urlparse

@@ -151,22 +153,44 @@ def checkout_tdoc(
    # Create checkout directory
    checkout_path.mkdir(parents=True, exist_ok=True)

    # Download the file
    temp_zip_path = checkout_path / f"{metadata.tdoc_id}.zip"
    cache_dir = checkout_dir.parent if checkout_dir.name == "checkout" else checkout_dir

    try:
        _download_file(metadata.url, temp_zip_path)
    if force:
        downloads_dir = cache_dir / "checkout"
        extract_dir = downloads_dir / metadata.tdoc_id
        if extract_dir.exists():
            shutil.rmtree(extract_dir)
        zip_path = downloads_dir / f"{metadata.tdoc_id}.zip"
        with suppress(FileNotFoundError):
            zip_path.unlink()
        filename = posixpath.basename(urlparse(metadata.url).path)
        if filename:
            with suppress(FileNotFoundError):
                (downloads_dir / filename).unlink()

        # Extract the zip file
        with zipfile.ZipFile(temp_zip_path) as archive:
            archive.extractall(checkout_path)
    import importlib

        logger.info(f"Checked out {metadata.tdoc_id} to {checkout_path}")
    from tdoc_crawler.cli.helpers import prepare_tdoc_file

    cli_helpers = importlib.import_module("tdoc_crawler.cli.helpers")
    original_download = cli_helpers.download_to_path
    try:
        cli_helpers.download_to_path = _download_file
        prepared_path = prepare_tdoc_file(metadata, cache_dir, return_dir=True)
    finally:
        # Clean up the zip file
        if temp_zip_path.exists():
            temp_zip_path.unlink()
        cli_helpers.download_to_path = original_download
    if prepared_path.is_dir():
        if prepared_path != checkout_path:
            shutil.copytree(prepared_path, checkout_path, dirs_exist_ok=True)
            shutil.rmtree(prepared_path)
    else:
        target_path = checkout_path / prepared_path.name
        shutil.copy2(prepared_path, target_path)
        if prepared_path != target_path:
            with suppress(FileNotFoundError):
                prepared_path.unlink()

    logger.info(f"Checked out {metadata.tdoc_id} to {checkout_path}")

    return checkout_path

+13 −4
Original line number Diff line number Diff line
@@ -263,8 +263,17 @@ def download_to_path(url: str, destination: Path) -> None:
        target.write(response.content)


def prepare_tdoc_file(metadata: TDocMetadata, cache_dir: Path) -> Path:
    """Prepare TDoc file for opening (download and extract if needed)."""
def prepare_tdoc_file(metadata: TDocMetadata, cache_dir: Path, return_dir: bool = False) -> Path:
    """Prepare TDoc file for opening (download and extract if needed).

    Args:
        metadata: TDoc metadata with download URL.
        cache_dir: Cache directory for downloads and extracted files.
        return_dir: When True and the TDoc is a zip, return the extract directory.

    Returns:
        Path to the downloaded file, or the extract directory when return_dir is True.
    """
    downloads_dir = cache_dir / "checkout"
    downloads_dir.mkdir(parents=True, exist_ok=True)
    path = urlparse(metadata.url).path
@@ -279,7 +288,7 @@ def prepare_tdoc_file(metadata: TDocMetadata, cache_dir: Path) -> Path:
        if extract_dir.exists():
            files = sorted(p for p in extract_dir.rglob("*") if p.is_file())
            if files:
                return files[0]
                return extract_dir if return_dir else files[0]
            shutil.rmtree(extract_dir)
        zip_path = downloads_dir / f"{metadata.tdoc_id}.zip"
        download_to_path(metadata.url, zip_path)
@@ -292,7 +301,7 @@ def prepare_tdoc_file(metadata: TDocMetadata, cache_dir: Path) -> Path:
        files = sorted(p for p in extract_dir.rglob("*") if p.is_file())
        if not files:
            raise FileNotFoundError("no-files-in-archive")
        return files[0]
        return extract_dir if return_dir else files[0]

    target_suffix = suffix or ""
    target_name = filename if filename else f"{metadata.tdoc_id}{target_suffix or '.bin'}"
+31 −10
Original line number Diff line number Diff line
@@ -3,18 +3,39 @@
from __future__ import annotations

# Re-export all public symbols
from .base import BaseConfigModel  # noqa: F401
from .base import DEFAULT_CACHE_DIR, HttpCacheConfig, OutputFormat, PortalCredentials, SortOrder, utc_now
from .base import (
    DEFAULT_CACHE_DIR,
    BaseConfigModel,  # noqa: F401
    HttpCacheConfig,
    OutputFormat,
    PortalCredentials,
    SortOrder,
    utc_now,
)
from .crawl_limits import CrawlLimits  # noqa: F401
from .crawl_log import CrawlLogEntry  # noqa: F401
from .meetings import MeetingMetadata  # noqa: F401
from .meetings import MeetingCrawlConfig, MeetingQueryConfig
from .subworking_groups import SUBTB_INDEX  # noqa: F401
from .subworking_groups import CODE_INDEX, SUBWORKING_GROUP_RECORDS, SubWorkingGroupRecord
from .tdocs import TDocCrawlConfig  # noqa: F401
from .tdocs import CrawlConfig, QueryConfig, TDocMetadata
from .working_groups import WorkingGroup  # noqa: F401
from .working_groups import WORKING_GROUP_RECORDS, WorkingGroupRecord
from .meetings import (
    MeetingCrawlConfig,
    MeetingMetadata,  # noqa: F401
    MeetingQueryConfig,
)
from .subworking_groups import (
    CODE_INDEX,
    SUBTB_INDEX,  # noqa: F401
    SUBWORKING_GROUP_RECORDS,
    SubWorkingGroupRecord,
)
from .tdocs import (
    CrawlConfig,
    QueryConfig,
    TDocCrawlConfig,  # noqa: F401
    TDocMetadata,
)
from .working_groups import (
    WORKING_GROUP_RECORDS,
    WorkingGroup,  # noqa: F401
    WorkingGroupRecord,
)

__all__ = [
    "CODE_INDEX",