Commit d11f9a60 authored by Jan Reimes's avatar Jan Reimes
Browse files

feat(checkout): implement TDoc checkout functionality

* Add functions to download and extract TDocs from the 3GPP server.
* Calculate checkout paths that mirror the server's directory structure.
* Include command in CLI for checking out TDocs with progress reporting.
* Implement tests for checkout functionality, including success and error cases.
parent c4cbcda5
Loading
Loading
Loading
Loading
+168 −0
Original line number Diff line number Diff line
"""TDoc checkout functionality for downloading and extracting documents.

This module provides functionality to download TDocs from the 3GPP server
and extract them to a local checkout folder while maintaining the same
directory structure as the server.
"""

from __future__ import annotations

import logging
import shutil
import zipfile
from pathlib import Path
from urllib.parse import urlparse

from tdoc_crawler.models import TDocMetadata

logger = logging.getLogger(__name__)


def get_checkout_path(metadata: TDocMetadata, checkout_dir: Path) -> Path:
    """Calculate the checkout path for a TDoc based on its URL.

    The checkout path mirrors the 3GPP server directory structure.
    For example:
    - URL: https://www.3gpp.org/ftp/tsg_sa/SA4/s4-251234.zip
    - Checkout: checkout_dir/tsg_sa/SA4/s4-251234/

    Args:
        metadata: TDoc metadata containing the URL
        checkout_dir: Base checkout directory

    Returns:
        Path to the checkout directory for this TDoc
    """
    # Split the URL path manually instead of using Path so a leading slash
    # never makes it absolute on Windows.
    components = urlparse(metadata.url).path.lstrip("/").split("/")

    # Mirror only the part of the server tree below the 'ftp' root; if the
    # URL has no 'ftp' component, mirror the whole path.
    if "ftp" in components:
        components = components[components.index("ftp") + 1 :]

    # Drop the trailing filename; the TDoc ID becomes the folder name instead.
    components = components[:-1]

    target = checkout_dir
    for part in components:
        target = target / part
    return target / metadata.tdoc_id


def checkout_tdoc(
    metadata: TDocMetadata,
    checkout_dir: Path,
    *,
    force: bool = False,
) -> Path:
    """Download and extract a TDoc to the checkout folder.

    Args:
        metadata: TDoc metadata containing URL and ID
        checkout_dir: Base checkout directory
        force: If True, re-download even if already exists

    Returns:
        Path to the extracted TDoc directory

    Raises:
        FileNotFoundError: If download fails or zip is empty
        ValueError: If URL scheme is not supported
        zipfile.BadZipFile: If the downloaded file is not a valid zip
    """
    checkout_path = get_checkout_path(metadata, checkout_dir)

    # Reuse an existing checkout unless a re-download is forced.
    if checkout_path.exists() and not force:
        logger.debug(f"TDoc {metadata.tdoc_id} already checked out at {checkout_path}")
        return checkout_path

    # Remember whether we are creating the directory: if the download or
    # extraction fails we must remove it again, otherwise a later call
    # without force would mistake the leftover empty directory for a
    # completed checkout.
    created_here = not checkout_path.exists()
    checkout_path.mkdir(parents=True, exist_ok=True)

    # Download into the checkout directory, then extract in place.
    temp_zip_path = checkout_path / f"{metadata.tdoc_id}.zip"

    try:
        _download_file(metadata.url, temp_zip_path)

        # NOTE(review): extractall trusts member paths inside the archive.
        # Acceptable for the 3GPP server, but a hostile zip could write
        # outside checkout_path (zip-slip) — confirm the source is trusted.
        with zipfile.ZipFile(temp_zip_path) as archive:
            archive.extractall(checkout_path)

        logger.info(f"Checked out {metadata.tdoc_id} to {checkout_path}")

    except Exception:
        # Roll back a directory we created so the failed checkout is not
        # misreported as complete by future calls.
        if created_here:
            shutil.rmtree(checkout_path, ignore_errors=True)
        raise

    finally:
        # Clean up the zip file (it may already be gone if we rolled back).
        if temp_zip_path.exists():
            temp_zip_path.unlink()

    return checkout_path


def _download_file(url: str, destination: Path) -> None:
    """Download a file from URL to destination path.

    Args:
        url: Source URL
        destination: Destination path

    Raises:
        ValueError: If URL scheme is not supported
        FileNotFoundError: If download fails
    """
    from urllib.request import urlopen

    destination.parent.mkdir(parents=True, exist_ok=True)

    # Reject anything that is not ftp/http/https (e.g. file:// URLs).
    if not url.lower().startswith(("ftp://", "http://", "https://")):
        raise ValueError(f"unsupported-url-scheme: {url}")

    # Stream the response straight to disk; any network or I/O error is
    # surfaced as FileNotFoundError with the original cause chained.
    try:
        with urlopen(url, timeout=300) as response:  # noqa: S310
            with destination.open("wb") as target:
                shutil.copyfileobj(response, target)
    except Exception as exc:
        raise FileNotFoundError(f"failed-to-download: {url}") from exc


def get_checked_out_tdocs(checkout_dir: Path) -> list[str]:
    """Get list of TDoc IDs that are already checked out.

    A checked-out TDoc is the topmost directory below ``checkout_dir`` that
    directly contains at least one file; descent stops there. This fixes the
    previous rglob-based scan, which also reported subdirectories created by
    zip extraction *inside* a TDoc folder as separate TDoc IDs.

    Args:
        checkout_dir: Base checkout directory

    Returns:
        List of TDoc IDs (directory names)
    """
    if not checkout_dir.exists():
        return []

    tdoc_ids: list[str] = []
    # Walk the mirrored server tree, pruning at the first directory that
    # holds files — that directory's name is the TDoc ID.
    pending = [entry for entry in checkout_dir.iterdir() if entry.is_dir()]
    while pending:
        directory = pending.pop()
        entries = list(directory.iterdir())
        if any(entry.is_file() for entry in entries):
            tdoc_ids.append(directory.name)  # TDoc folder; do not descend
        else:
            pending.extend(entry for entry in entries if entry.is_dir())

    return tdoc_ids


# Public API of this module, alphabetically ordered; _download_file stays private.
__all__ = [
    "checkout_tdoc",
    "get_checked_out_tdocs",
    "get_checkout_path",
]
+56 −0
Original line number Diff line number Diff line
@@ -17,6 +17,7 @@ from rich.console import Console
from rich.progress import BarColumn, MofNCompleteColumn, Progress, SpinnerColumn, TextColumn
from rich.table import Table

from tdoc_crawler.checkout import checkout_tdoc
from tdoc_crawler.crawlers import MeetingCrawler, TDocCrawler
from tdoc_crawler.database import TDocDatabase
from tdoc_crawler.models import MeetingCrawlConfig, MeetingQueryConfig, OutputFormat, QueryConfig, SortOrder, TDocCrawlConfig
@@ -383,6 +384,61 @@ def open_tdoc(
    launch_file(target_file)


@app.command()
def checkout(
    tdoc_id: Annotated[list[str], typer.Argument(help="TDoc identifier(s) to checkout")],
    cache_dir: Annotated[Path, typer.Option("--cache-dir", "-c", help="Cache directory")] = DEFAULT_CACHE_DIR,
    force: Annotated[bool, typer.Option("--force", "-f", help="Re-download even if already checked out")] = False,
) -> None:
    """Download and extract TDoc(s) to checkout folder.

    Looks up each requested TDoc in the local database (fetching missing
    metadata when possible), then downloads and extracts each one under
    ``<cache_dir>/checkout``, reporting per-TDoc progress and a summary.
    Exits with code 1 only when none of the requested TDocs were found.
    """
    # TDoc IDs are matched case-insensitively by normalizing to upper case.
    normalized_ids = [tid.strip().upper() for tid in tdoc_id]
    config = QueryConfig(
        cache_dir=cache_dir,
        tdoc_ids=normalized_ids,
    )

    db_path = database_path(cache_dir)
    with TDocDatabase(db_path) as database:
        results = database.query_tdocs(config)
        results = maybe_fetch_missing_tdocs(database, cache_dir, config, results)

        # Report IDs with no metadata; abort only when nothing at all matched.
        found_ids = {r.tdoc_id for r in results}
        missing_ids = set(normalized_ids) - found_ids

        if missing_ids:
            console.print(f"[red]TDoc(s) not found: {', '.join(sorted(missing_ids))}[/red]")
            if not results:
                raise typer.Exit(code=1)

    checkout_dir = cache_dir / "checkout"
    success_count = 0
    error_count = 0

    with Progress(
        SpinnerColumn(),
        TextColumn("[progress.description]{task.description}"),
        BarColumn(),
        MofNCompleteColumn(),
        console=console,
    ) as progress:
        task = progress.add_task("Checking out TDocs...", total=len(results))

        for metadata in results:
            try:
                checkout_path = checkout_tdoc(metadata, checkout_dir, force=force)
                # Fix: ID and path were previously concatenated with no separator.
                progress.console.print(f"[green]✓ {metadata.tdoc_id}: {checkout_path}")
                success_count += 1
            except (FileNotFoundError, OSError, ValueError, zipfile.BadZipFile) as exc:
                progress.console.print(f"[red]✗ {metadata.tdoc_id}: {exc}")
                error_count += 1
            progress.advance(task)

    console.print(f"\n[cyan]Checked out {success_count} TDoc(s)[/cyan]")
    if error_count:
        console.print(f"[red]Failed: {error_count} TDoc(s)[/red]")

@app.command()
def stats(
    cache_dir: Annotated[Path, typer.Option("--cache-dir", "-c", help="Cache directory")] = DEFAULT_CACHE_DIR,
+31 −5
Original line number Diff line number Diff line
@@ -3,13 +3,39 @@
from __future__ import annotations

# Re-export all public symbols
from .base import DEFAULT_CACHE_DIR, BaseConfigModel, HttpCacheConfig, OutputFormat, PortalCredentials, SortOrder, utc_now  # noqa: F401
from .base import (
    DEFAULT_CACHE_DIR,
    BaseConfigModel,  # noqa: F401
    HttpCacheConfig,
    OutputFormat,
    PortalCredentials,
    SortOrder,
    utc_now,
)
from .crawl_limits import CrawlLimits  # noqa: F401
from .crawl_log import CrawlLogEntry  # noqa: F401
from .meetings import MeetingCrawlConfig, MeetingMetadata, MeetingQueryConfig  # noqa: F401
from .subworking_groups import CODE_INDEX, SUBTB_INDEX, SUBWORKING_GROUP_RECORDS, SubWorkingGroupRecord  # noqa: F401
from .tdocs import CrawlConfig, QueryConfig, TDocCrawlConfig, TDocMetadata  # noqa: F401
from .working_groups import WORKING_GROUP_RECORDS, WorkingGroup, WorkingGroupRecord  # noqa: F401
from .meetings import (
    MeetingCrawlConfig,
    MeetingMetadata,  # noqa: F401
    MeetingQueryConfig,
)
from .subworking_groups import (
    CODE_INDEX,
    SUBTB_INDEX,  # noqa: F401
    SUBWORKING_GROUP_RECORDS,
    SubWorkingGroupRecord,
)
from .tdocs import (
    CrawlConfig,
    QueryConfig,
    TDocCrawlConfig,  # noqa: F401
    TDocMetadata,
)
from .working_groups import (
    WORKING_GROUP_RECORDS,
    WorkingGroup,  # noqa: F401
    WorkingGroupRecord,
)

__all__ = [
    "CODE_INDEX",

tests/test_checkout.py

0 → 100644
+181 −0
Original line number Diff line number Diff line
"""Tests for TDoc checkout functionality."""

import zipfile
from decimal import Decimal
from pathlib import Path
from unittest.mock import Mock, patch

import pytest

from tdoc_crawler.checkout import checkout_tdoc, get_checked_out_tdocs, get_checkout_path
from tdoc_crawler.models import TDocMetadata


@pytest.fixture
def sample_tdoc_metadata() -> TDocMetadata:
    """Create sample TDoc metadata for testing."""
    fields = {
        "tdoc_id": "S4-251234",
        "url": "https://www.3gpp.org/ftp/tsg_sa/SA4/s4-251234.zip",
        "title": "Test TDoc",
        "meeting_id": 113,
        "source": "Test Source",
        "contact": "test@example.com",
        "agenda_item_nbr": Decimal("1.0"),
    }
    return TDocMetadata(**fields)


@pytest.fixture
def checkout_dir(tmp_path: Path) -> Path:
    """Provide a per-test temporary checkout directory path."""
    return tmp_path.joinpath("checkout")


class TestGetCheckoutPath:
    """Tests for get_checkout_path function."""

    def test_standard_url(self, sample_tdoc_metadata: TDocMetadata, checkout_dir: Path) -> None:
        """Test checkout path calculation for standard 3GPP URL."""
        result = get_checkout_path(sample_tdoc_metadata, checkout_dir)
        # The server tree below 'ftp' is mirrored, with the TDoc ID as leaf.
        assert result == checkout_dir.joinpath("tsg_sa", "SA4", "S4-251234")

    def test_url_with_meeting_in_path(self, checkout_dir: Path) -> None:
        """Test checkout path for URL with meeting in path."""
        tdoc = TDocMetadata(
            tdoc_id="R1-2301234",
            url="https://www.3gpp.org/ftp/tsg_ran/RAN1/r1-2301234.zip",
            title="Test",
            meeting_id=123,
            source="Test",
            contact="test@example.com",
            agenda_item_nbr=Decimal("1.0"),
        )
        result = get_checkout_path(tdoc, checkout_dir)
        assert result == checkout_dir.joinpath("tsg_ran", "RAN1", "R1-2301234")

    def test_url_without_ftp_in_path(self, checkout_dir: Path) -> None:
        """Test checkout path for URL without 'ftp' component."""
        tdoc = TDocMetadata(
            tdoc_id="S4-251234",
            url="some/path/s4-251234.zip",
            title="Test",
            meeting_id=113,
            source="Test",
            contact="test@example.com",
            agenda_item_nbr=Decimal("1.0"),
        )
        # Without an 'ftp' component the full URL path is mirrored.
        result = get_checkout_path(tdoc, checkout_dir)
        assert result == checkout_dir.joinpath("some", "path", "S4-251234")


class TestCheckoutTDoc:
    """Tests for checkout_tdoc function."""

    # Patch the module-level downloader so no network access happens.
    @patch("tdoc_crawler.checkout._download_file")
    def test_successful_checkout(
        self,
        mock_download: Mock,
        sample_tdoc_metadata: TDocMetadata,
        checkout_dir: Path,
    ) -> None:
        """Test successful TDoc checkout."""
        checkout_path = get_checkout_path(sample_tdoc_metadata, checkout_dir)

        # Mock download to create the zip file
        # (a real zip on disk, so the subsequent extraction path is exercised)
        def mock_download_impl(url: str, dest: Path) -> None:
            dest.parent.mkdir(parents=True, exist_ok=True)
            with zipfile.ZipFile(dest, "w") as zf:
                zf.writestr("test.txt", "test content")

        mock_download.side_effect = mock_download_impl

        result = checkout_tdoc(sample_tdoc_metadata, checkout_dir)

        assert result == checkout_path
        assert result.exists()
        assert (result / "test.txt").exists()
        assert not (result / "S4-251234.zip").exists()  # Zip should be cleaned up

    @patch("tdoc_crawler.checkout._download_file")
    def test_already_checked_out(
        self,
        mock_download: Mock,
        sample_tdoc_metadata: TDocMetadata,
        checkout_dir: Path,
    ) -> None:
        """Test that already checked out TDoc is skipped."""
        # Pre-create the checkout directory with content to simulate a
        # previous successful checkout.
        checkout_path = get_checkout_path(sample_tdoc_metadata, checkout_dir)
        checkout_path.mkdir(parents=True)
        (checkout_path / "existing.txt").write_text("existing")

        result = checkout_tdoc(sample_tdoc_metadata, checkout_dir)

        assert result == checkout_path
        mock_download.assert_not_called()

    @patch("tdoc_crawler.checkout._download_file")
    def test_force_recheckout(
        self,
        mock_download: Mock,
        sample_tdoc_metadata: TDocMetadata,
        checkout_dir: Path,
    ) -> None:
        """Test force option re-downloads existing TDoc."""
        checkout_path = get_checkout_path(sample_tdoc_metadata, checkout_dir)
        checkout_path.mkdir(parents=True)

        def mock_download_impl(url: str, dest: Path) -> None:
            with zipfile.ZipFile(dest, "w") as zf:
                zf.writestr("new.txt", "new content")

        mock_download.side_effect = mock_download_impl

        # force=True must download even though the directory already exists.
        result = checkout_tdoc(sample_tdoc_metadata, checkout_dir, force=True)

        assert result == checkout_path
        mock_download.assert_called_once()

    def test_invalid_url_scheme(self, checkout_dir: Path) -> None:
        """Test that invalid URL scheme raises ValueError."""
        # file:// is rejected by the downloader's scheme allow-list.
        metadata = TDocMetadata(
            tdoc_id="S4-251234",
            url="file:///local/path/s4-251234.zip",
            title="Test",
            meeting_id=113,
            source="Test",
            contact="test@example.com",
            agenda_item_nbr=Decimal("1.0"),
        )

        with pytest.raises(ValueError, match="unsupported-url-scheme"):
            checkout_tdoc(metadata, checkout_dir)


class TestGetCheckedOutTdocs:
    """Tests for get_checked_out_tdocs function."""

    def test_empty_checkout_dir(self, checkout_dir: Path) -> None:
        """Test empty checkout directory returns empty list."""
        # checkout_dir fixture only computes the path; nothing exists on disk.
        result = get_checked_out_tdocs(checkout_dir)
        assert result == []

    def test_nonexistent_checkout_dir(self, tmp_path: Path) -> None:
        """Test non-existent checkout directory returns empty list."""
        result = get_checked_out_tdocs(tmp_path / "nonexistent")
        assert result == []

    def test_checked_out_tdocs(self, checkout_dir: Path) -> None:
        """Test finding checked out TDocs."""
        # Create some checked out TDoc directories
        # (mirroring the server layout: group / working group / TDoc ID)
        (checkout_dir / "tsg_sa" / "SA4" / "S4-251234").mkdir(parents=True)
        (checkout_dir / "tsg_sa" / "SA4" / "S4-251234" / "file.txt").write_text("content")

        (checkout_dir / "tsg_ran" / "RAN1" / "R1-2301234").mkdir(parents=True)
        (checkout_dir / "tsg_ran" / "RAN1" / "R1-2301234" / "doc.txt").write_text("content")

        result = get_checked_out_tdocs(checkout_dir)

        assert sorted(result) == ["R1-2301234", "S4-251234"]