Commit d11f9a60 authored by Jan Reimes's avatar Jan Reimes
Browse files

feat(checkout): implement TDoc checkout functionality

* Add functions to download and extract TDocs from the 3GPP server.
* Calculate checkout paths that mirror the server's directory structure.
* Include command in CLI for checking out TDocs with progress reporting.
* Implement tests for checkout functionality, including success and error cases.
parent c4cbcda5
Loading
Loading
Loading
Loading
+168 −0
Original line number Diff line number Diff line
"""TDoc checkout functionality for downloading and extracting documents.

This module provides functionality to download TDocs from the 3GPP server
and extract them to a local checkout folder while maintaining the same
directory structure as the server.
"""

from __future__ import annotations

import logging
import shutil
import zipfile
from pathlib import Path
from urllib.parse import urlparse

from tdoc_crawler.models import TDocMetadata

logger = logging.getLogger(__name__)


def get_checkout_path(metadata: TDocMetadata, checkout_dir: Path) -> Path:
    """Calculate the checkout path for a TDoc based on its URL.

    The checkout path mirrors the 3GPP server directory structure.
    For example:
    - URL: https://www.3gpp.org/ftp/tsg_sa/SA4/s4-251234.zip
    - Checkout: checkout_dir/tsg_sa/SA4/s4-251234/

    Args:
        metadata: TDoc metadata containing the URL
        checkout_dir: Base checkout directory

    Returns:
        Path to the checkout directory for this TDoc
    """
    # Split the URL path manually instead of using Path so a leading slash
    # never makes it absolute on Windows.
    components = urlparse(metadata.url).path.lstrip("/").split("/")

    # Mirror only the part of the server tree below the 'ftp' root; if the
    # URL has no 'ftp' component, mirror the whole path.
    if "ftp" in components:
        components = components[components.index("ftp") + 1 :]

    # Drop the trailing filename; the TDoc ID becomes the folder name instead.
    components = components[:-1]

    target = checkout_dir
    for part in components:
        target = target / part
    return target / metadata.tdoc_id


def checkout_tdoc(
    metadata: TDocMetadata,
    checkout_dir: Path,
    *,
    force: bool = False,
) -> Path:
    """Download and extract a TDoc to the checkout folder.

    Args:
        metadata: TDoc metadata containing URL and ID
        checkout_dir: Base checkout directory
        force: If True, re-download even if already exists

    Returns:
        Path to the extracted TDoc directory

    Raises:
        FileNotFoundError: If download fails or zip is empty
        ValueError: If URL scheme is not supported
        zipfile.BadZipFile: If the downloaded file is not a valid zip
    """
    checkout_path = get_checkout_path(metadata, checkout_dir)

    # Reuse an existing checkout unless a re-download is forced.
    if checkout_path.exists() and not force:
        logger.debug(f"TDoc {metadata.tdoc_id} already checked out at {checkout_path}")
        return checkout_path

    # Remember whether we are creating the directory: if the download or
    # extraction fails we must remove it again, otherwise a later call
    # without force would mistake the leftover empty directory for a
    # completed checkout.
    created_here = not checkout_path.exists()
    checkout_path.mkdir(parents=True, exist_ok=True)

    # Download into the checkout directory, then extract in place.
    temp_zip_path = checkout_path / f"{metadata.tdoc_id}.zip"

    try:
        _download_file(metadata.url, temp_zip_path)

        # NOTE(review): extractall trusts member paths inside the archive.
        # Acceptable for the 3GPP server, but a hostile zip could write
        # outside checkout_path (zip-slip) — confirm the source is trusted.
        with zipfile.ZipFile(temp_zip_path) as archive:
            archive.extractall(checkout_path)

        logger.info(f"Checked out {metadata.tdoc_id} to {checkout_path}")

    except Exception:
        # Roll back a directory we created so the failed checkout is not
        # misreported as complete by future calls.
        if created_here:
            shutil.rmtree(checkout_path, ignore_errors=True)
        raise

    finally:
        # Clean up the zip file (it may already be gone if we rolled back).
        if temp_zip_path.exists():
            temp_zip_path.unlink()

    return checkout_path


def _download_file(url: str, destination: Path) -> None:
    """Download a file from URL to destination path.

    Args:
        url: Source URL
        destination: Destination path

    Raises:
        ValueError: If URL scheme is not supported
        FileNotFoundError: If download fails
    """
    from urllib.request import urlopen

    destination.parent.mkdir(parents=True, exist_ok=True)

    # Reject anything that is not ftp/http/https (e.g. file:// URLs).
    if not url.lower().startswith(("ftp://", "http://", "https://")):
        raise ValueError(f"unsupported-url-scheme: {url}")

    # Stream the response straight to disk; any network or I/O error is
    # surfaced as FileNotFoundError with the original cause chained.
    try:
        with urlopen(url, timeout=300) as response:  # noqa: S310
            with destination.open("wb") as target:
                shutil.copyfileobj(response, target)
    except Exception as exc:
        raise FileNotFoundError(f"failed-to-download: {url}") from exc


def get_checked_out_tdocs(checkout_dir: Path) -> list[str]:
    """Get list of TDoc IDs that are already checked out.

    A checked-out TDoc is the topmost directory below ``checkout_dir`` that
    directly contains at least one file; descent stops there. This fixes the
    previous rglob-based scan, which also reported subdirectories created by
    zip extraction *inside* a TDoc folder as separate TDoc IDs.

    Args:
        checkout_dir: Base checkout directory

    Returns:
        List of TDoc IDs (directory names)
    """
    if not checkout_dir.exists():
        return []

    tdoc_ids: list[str] = []
    # Walk the mirrored server tree, pruning at the first directory that
    # holds files — that directory's name is the TDoc ID.
    pending = [entry for entry in checkout_dir.iterdir() if entry.is_dir()]
    while pending:
        directory = pending.pop()
        entries = list(directory.iterdir())
        if any(entry.is_file() for entry in entries):
            tdoc_ids.append(directory.name)  # TDoc folder; do not descend
        else:
            pending.extend(entry for entry in entries if entry.is_dir())

    return tdoc_ids


# Public API of this module, alphabetically ordered; _download_file stays private.
__all__ = [
    "checkout_tdoc",
    "get_checked_out_tdocs",
    "get_checkout_path",
]
+56 −0
Original line number Diff line number Diff line
@@ -17,6 +17,7 @@ from rich.console import Console
from rich.progress import BarColumn, MofNCompleteColumn, Progress, SpinnerColumn, TextColumn
from rich.table import Table

from tdoc_crawler.checkout import checkout_tdoc
from tdoc_crawler.crawlers import MeetingCrawler, TDocCrawler
from tdoc_crawler.database import TDocDatabase
from tdoc_crawler.models import MeetingCrawlConfig, MeetingQueryConfig, OutputFormat, QueryConfig, SortOrder, TDocCrawlConfig
@@ -383,6 +384,61 @@ def open_tdoc(
    launch_file(target_file)


@app.command()
def checkout(
    tdoc_id: Annotated[list[str], typer.Argument(help="TDoc identifier(s) to checkout")],
    cache_dir: Annotated[Path, typer.Option("--cache-dir", "-c", help="Cache directory")] = DEFAULT_CACHE_DIR,
    force: Annotated[bool, typer.Option("--force", "-f", help="Re-download even if already checked out")] = False,
) -> None:
    """Download and extract TDoc(s) to checkout folder.

    Looks up each requested TDoc in the local database (fetching missing
    metadata when possible), then downloads and extracts each one under
    ``<cache_dir>/checkout``, reporting per-TDoc progress and a summary.
    Exits with code 1 only when none of the requested TDocs were found.
    """
    # TDoc IDs are matched case-insensitively by normalizing to upper case.
    normalized_ids = [tid.strip().upper() for tid in tdoc_id]
    config = QueryConfig(
        cache_dir=cache_dir,
        tdoc_ids=normalized_ids,
    )

    db_path = database_path(cache_dir)
    with TDocDatabase(db_path) as database:
        results = database.query_tdocs(config)
        results = maybe_fetch_missing_tdocs(database, cache_dir, config, results)

        # Report IDs with no metadata; abort only when nothing at all matched.
        found_ids = {r.tdoc_id for r in results}
        missing_ids = set(normalized_ids) - found_ids

        if missing_ids:
            console.print(f"[red]TDoc(s) not found: {', '.join(sorted(missing_ids))}[/red]")
            if not results:
                raise typer.Exit(code=1)

    checkout_dir = cache_dir / "checkout"
    success_count = 0
    error_count = 0

    with Progress(
        SpinnerColumn(),
        TextColumn("[progress.description]{task.description}"),
        BarColumn(),
        MofNCompleteColumn(),
        console=console,
    ) as progress:
        task = progress.add_task("Checking out TDocs...", total=len(results))

        for metadata in results:
            try:
                checkout_path = checkout_tdoc(metadata, checkout_dir, force=force)
                # Fix: ID and path were previously concatenated with no separator.
                progress.console.print(f"[green]✓ {metadata.tdoc_id}: {checkout_path}")
                success_count += 1
            except (FileNotFoundError, OSError, ValueError, zipfile.BadZipFile) as exc:
                progress.console.print(f"[red]✗ {metadata.tdoc_id}: {exc}")
                error_count += 1
            progress.advance(task)

    console.print(f"\n[cyan]Checked out {success_count} TDoc(s)[/cyan]")
    if error_count:
        console.print(f"[red]Failed: {error_count} TDoc(s)[/red]")

@app.command()
def stats(
    cache_dir: Annotated[Path, typer.Option("--cache-dir", "-c", help="Cache directory")] = DEFAULT_CACHE_DIR,
+31 −5
Original line number Diff line number Diff line
@@ -3,13 +3,39 @@
from __future__ import annotations

# Re-export all public symbols
from .base import DEFAULT_CACHE_DIR, BaseConfigModel, HttpCacheConfig, OutputFormat, PortalCredentials, SortOrder, utc_now  # noqa: F401
from .base import (
    DEFAULT_CACHE_DIR,
    BaseConfigModel,  # noqa: F401
    HttpCacheConfig,
    OutputFormat,
    PortalCredentials,
    SortOrder,
    utc_now,
)
from .crawl_limits import CrawlLimits  # noqa: F401
from .crawl_log import CrawlLogEntry  # noqa: F401
from .meetings import MeetingCrawlConfig, MeetingMetadata, MeetingQueryConfig  # noqa: F401
from .subworking_groups import CODE_INDEX, SUBTB_INDEX, SUBWORKING_GROUP_RECORDS, SubWorkingGroupRecord  # noqa: F401
from .tdocs import CrawlConfig, QueryConfig, TDocCrawlConfig, TDocMetadata  # noqa: F401
from .working_groups import WORKING_GROUP_RECORDS, WorkingGroup, WorkingGroupRecord  # noqa: F401
from .meetings import (
    MeetingCrawlConfig,
    MeetingMetadata,  # noqa: F401
    MeetingQueryConfig,
)
from .subworking_groups import (
    CODE_INDEX,
    SUBTB_INDEX,  # noqa: F401
    SUBWORKING_GROUP_RECORDS,
    SubWorkingGroupRecord,
)
from .tdocs import (
    CrawlConfig,
    QueryConfig,
    TDocCrawlConfig,  # noqa: F401
    TDocMetadata,
)
from .working_groups import (
    WORKING_GROUP_RECORDS,
    WorkingGroup,  # noqa: F401
    WorkingGroupRecord,
)

__all__ = [
    "CODE_INDEX",

tests/test_checkout.py

0 → 100644
+181 −0
Original line number Diff line number Diff line
"""Tests for TDoc checkout functionality."""

import zipfile
from decimal import Decimal
from pathlib import Path
from unittest.mock import Mock, patch

import pytest

from tdoc_crawler.checkout import checkout_tdoc, get_checked_out_tdocs, get_checkout_path
from tdoc_crawler.models import TDocMetadata


@pytest.fixture
def sample_tdoc_metadata() -> TDocMetadata:
    """Create sample TDoc metadata for testing."""
    fields = {
        "tdoc_id": "S4-251234",
        "url": "https://www.3gpp.org/ftp/tsg_sa/SA4/s4-251234.zip",
        "title": "Test TDoc",
        "meeting_id": 113,
        "source": "Test Source",
        "contact": "test@example.com",
        "agenda_item_nbr": Decimal("1.0"),
    }
    return TDocMetadata(**fields)


@pytest.fixture
def checkout_dir(tmp_path: Path) -> Path:
    """Provide a per-test temporary checkout directory path."""
    return tmp_path.joinpath("checkout")


class TestGetCheckoutPath:
    """Tests for get_checkout_path function."""

    def test_standard_url(self, sample_tdoc_metadata: TDocMetadata, checkout_dir: Path) -> None:
        """Test checkout path calculation for standard 3GPP URL."""
        result = get_checkout_path(sample_tdoc_metadata, checkout_dir)
        # The server tree below 'ftp' is mirrored, with the TDoc ID as leaf.
        assert result == checkout_dir.joinpath("tsg_sa", "SA4", "S4-251234")

    def test_url_with_meeting_in_path(self, checkout_dir: Path) -> None:
        """Test checkout path for URL with meeting in path."""
        tdoc = TDocMetadata(
            tdoc_id="R1-2301234",
            url="https://www.3gpp.org/ftp/tsg_ran/RAN1/r1-2301234.zip",
            title="Test",
            meeting_id=123,
            source="Test",
            contact="test@example.com",
            agenda_item_nbr=Decimal("1.0"),
        )
        result = get_checkout_path(tdoc, checkout_dir)
        assert result == checkout_dir.joinpath("tsg_ran", "RAN1", "R1-2301234")

    def test_url_without_ftp_in_path(self, checkout_dir: Path) -> None:
        """Test checkout path for URL without 'ftp' component."""
        tdoc = TDocMetadata(
            tdoc_id="S4-251234",
            url="some/path/s4-251234.zip",
            title="Test",
            meeting_id=113,
            source="Test",
            contact="test@example.com",
            agenda_item_nbr=Decimal("1.0"),
        )
        # Without an 'ftp' component the full URL path is mirrored.
        result = get_checkout_path(tdoc, checkout_dir)
        assert result == checkout_dir.joinpath("some", "path", "S4-251234")


class TestCheckoutTDoc:
    """Tests for checkout_tdoc function."""

    # Patch the module-level downloader so no network access happens.
    @patch("tdoc_crawler.checkout._download_file")
    def test_successful_checkout(
        self,
        mock_download: Mock,
        sample_tdoc_metadata: TDocMetadata,
        checkout_dir: Path,
    ) -> None:
        """Test successful TDoc checkout."""
        checkout_path = get_checkout_path(sample_tdoc_metadata, checkout_dir)

        # Mock download to create the zip file
        # (a real zip on disk, so the subsequent extraction path is exercised)
        def mock_download_impl(url: str, dest: Path) -> None:
            dest.parent.mkdir(parents=True, exist_ok=True)
            with zipfile.ZipFile(dest, "w") as zf:
                zf.writestr("test.txt", "test content")

        mock_download.side_effect = mock_download_impl

        result = checkout_tdoc(sample_tdoc_metadata, checkout_dir)

        assert result == checkout_path
        assert result.exists()
        assert (result / "test.txt").exists()
        assert not (result / "S4-251234.zip").exists()  # Zip should be cleaned up

    @patch("tdoc_crawler.checkout._download_file")
    def test_already_checked_out(
        self,
        mock_download: Mock,
        sample_tdoc_metadata: TDocMetadata,
        checkout_dir: Path,
    ) -> None:
        """Test that already checked out TDoc is skipped."""
        # Pre-create the checkout directory with content to simulate a
        # previous successful checkout.
        checkout_path = get_checkout_path(sample_tdoc_metadata, checkout_dir)
        checkout_path.mkdir(parents=True)
        (checkout_path / "existing.txt").write_text("existing")

        result = checkout_tdoc(sample_tdoc_metadata, checkout_dir)

        assert result == checkout_path
        mock_download.assert_not_called()

    @patch("tdoc_crawler.checkout._download_file")
    def test_force_recheckout(
        self,
        mock_download: Mock,
        sample_tdoc_metadata: TDocMetadata,
        checkout_dir: Path,
    ) -> None:
        """Test force option re-downloads existing TDoc."""
        checkout_path = get_checkout_path(sample_tdoc_metadata, checkout_dir)
        checkout_path.mkdir(parents=True)

        def mock_download_impl(url: str, dest: Path) -> None:
            with zipfile.ZipFile(dest, "w") as zf:
                zf.writestr("new.txt", "new content")

        mock_download.side_effect = mock_download_impl

        # force=True must download even though the directory already exists.
        result = checkout_tdoc(sample_tdoc_metadata, checkout_dir, force=True)

        assert result == checkout_path
        mock_download.assert_called_once()

    def test_invalid_url_scheme(self, checkout_dir: Path) -> None:
        """Test that invalid URL scheme raises ValueError."""
        # file:// is rejected by the downloader's scheme allow-list.
        metadata = TDocMetadata(
            tdoc_id="S4-251234",
            url="file:///local/path/s4-251234.zip",
            title="Test",
            meeting_id=113,
            source="Test",
            contact="test@example.com",
            agenda_item_nbr=Decimal("1.0"),
        )

        with pytest.raises(ValueError, match="unsupported-url-scheme"):
            checkout_tdoc(metadata, checkout_dir)


class TestGetCheckedOutTdocs:
    """Tests for get_checked_out_tdocs function."""

    def test_empty_checkout_dir(self, checkout_dir: Path) -> None:
        """Test empty checkout directory returns empty list."""
        # checkout_dir fixture only computes the path; nothing exists on disk.
        result = get_checked_out_tdocs(checkout_dir)
        assert result == []

    def test_nonexistent_checkout_dir(self, tmp_path: Path) -> None:
        """Test non-existent checkout directory returns empty list."""
        result = get_checked_out_tdocs(tmp_path / "nonexistent")
        assert result == []

    def test_checked_out_tdocs(self, checkout_dir: Path) -> None:
        """Test finding checked out TDocs."""
        # Create some checked out TDoc directories
        # (mirroring the server layout: group / working group / TDoc ID)
        (checkout_dir / "tsg_sa" / "SA4" / "S4-251234").mkdir(parents=True)
        (checkout_dir / "tsg_sa" / "SA4" / "S4-251234" / "file.txt").write_text("content")

        (checkout_dir / "tsg_ran" / "RAN1" / "R1-2301234").mkdir(parents=True)
        (checkout_dir / "tsg_ran" / "RAN1" / "R1-2301234" / "doc.txt").write_text("content")

        result = get_checked_out_tdocs(checkout_dir)

        assert sorted(result) == ["R1-2301234", "S4-251234"]