feat(checkout): enhance checkout process with forced download and extraction (ac47c0b0) · Commits · Jan Reimes / 3gpp-crawler

src/tdoc_crawler/checkout.py

+36 −12

Original line number	Diff line number	Diff line
		@@ -8,7 +8,9 @@ directory structure as the server.
		from __future__ import annotations

		import logging
		import zipfile
		import posixpath
		import shutil
		from contextlib import suppress
		from pathlib import Path
		from urllib.parse import urlparse

		@@ -151,22 +153,44 @@ def checkout_tdoc(
		# Create checkout directory
		checkout_path.mkdir(parents=True, exist_ok=True)

		# Download the file
		temp_zip_path = checkout_path / f"{metadata.tdoc_id}.zip"
		cache_dir = checkout_dir.parent if checkout_dir.name == "checkout" else checkout_dir

		try:
		_download_file(metadata.url, temp_zip_path)
		if force:
		downloads_dir = cache_dir / "checkout"
		extract_dir = downloads_dir / metadata.tdoc_id
		if extract_dir.exists():
		shutil.rmtree(extract_dir)
		zip_path = downloads_dir / f"{metadata.tdoc_id}.zip"
		with suppress(FileNotFoundError):
		zip_path.unlink()
		filename = posixpath.basename(urlparse(metadata.url).path)
		if filename:
		with suppress(FileNotFoundError):
		(downloads_dir / filename).unlink()

		# Extract the zip file
		with zipfile.ZipFile(temp_zip_path) as archive:
		archive.extractall(checkout_path)
		import importlib

		logger.info(f"Checked out {metadata.tdoc_id} to {checkout_path}")
		from tdoc_crawler.cli.helpers import prepare_tdoc_file

		cli_helpers = importlib.import_module("tdoc_crawler.cli.helpers")
		original_download = cli_helpers.download_to_path
		try:
		cli_helpers.download_to_path = _download_file
		prepared_path = prepare_tdoc_file(metadata, cache_dir, return_dir=True)
		finally:
		# Clean up the zip file
		if temp_zip_path.exists():
		temp_zip_path.unlink()
		cli_helpers.download_to_path = original_download
		if prepared_path.is_dir():
		if prepared_path != checkout_path:
		shutil.copytree(prepared_path, checkout_path, dirs_exist_ok=True)
		shutil.rmtree(prepared_path)
		else:
		target_path = checkout_path / prepared_path.name
		shutil.copy2(prepared_path, target_path)
		if prepared_path != target_path:
		with suppress(FileNotFoundError):
		prepared_path.unlink()

		logger.info(f"Checked out {metadata.tdoc_id} to {checkout_path}")

		return checkout_path

src/tdoc_crawler/cli/helpers.py

+13 −4

Original line number	Diff line number	Diff line
		@@ -263,8 +263,17 @@ def download_to_path(url: str, destination: Path) -> None:
		target.write(response.content)


		def prepare_tdoc_file(metadata: TDocMetadata, cache_dir: Path) -> Path:
		"""Prepare TDoc file for opening (download and extract if needed)."""
		def prepare_tdoc_file(metadata: TDocMetadata, cache_dir: Path, return_dir: bool = False) -> Path:
		"""Prepare TDoc file for opening (download and extract if needed).

		Args:
		metadata: TDoc metadata with download URL.
		cache_dir: Cache directory for downloads and extracted files.
		return_dir: When True and the TDoc is a zip, return the extract directory.

		Returns:
		Path to the downloaded file, or the extract directory when return_dir is True.
		"""
		downloads_dir = cache_dir / "checkout"
		downloads_dir.mkdir(parents=True, exist_ok=True)
		path = urlparse(metadata.url).path
		@@ -279,7 +288,7 @@ def prepare_tdoc_file(metadata: TDocMetadata, cache_dir: Path) -> Path:
		if extract_dir.exists():
		files = sorted(p for p in extract_dir.rglob("*") if p.is_file())
		if files:
		return files[0]
		return extract_dir if return_dir else files[0]
		shutil.rmtree(extract_dir)
		zip_path = downloads_dir / f"{metadata.tdoc_id}.zip"
		download_to_path(metadata.url, zip_path)
		@@ -292,7 +301,7 @@ def prepare_tdoc_file(metadata: TDocMetadata, cache_dir: Path) -> Path:
		files = sorted(p for p in extract_dir.rglob("*") if p.is_file())
		if not files:
		raise FileNotFoundError("no-files-in-archive")
		return files[0]
		return extract_dir if return_dir else files[0]

		target_suffix = suffix or ""
		target_name = filename if filename else f"{metadata.tdoc_id}{target_suffix or '.bin'}"

src/tdoc_crawler/models/__init__.py

+31 −10

Original line number	Diff line number	Diff line
		@@ -3,18 +3,39 @@
		from __future__ import annotations

		# Re-export all public symbols
		from .base import BaseConfigModel # noqa: F401
		from .base import DEFAULT_CACHE_DIR, HttpCacheConfig, OutputFormat, PortalCredentials, SortOrder, utc_now
		from .base import (
		DEFAULT_CACHE_DIR,
		BaseConfigModel, # noqa: F401
		HttpCacheConfig,
		OutputFormat,
		PortalCredentials,
		SortOrder,
		utc_now,
		)
		from .crawl_limits import CrawlLimits # noqa: F401
		from .crawl_log import CrawlLogEntry # noqa: F401
		from .meetings import MeetingMetadata # noqa: F401
		from .meetings import MeetingCrawlConfig, MeetingQueryConfig
		from .subworking_groups import SUBTB_INDEX # noqa: F401
		from .subworking_groups import CODE_INDEX, SUBWORKING_GROUP_RECORDS, SubWorkingGroupRecord
		from .tdocs import TDocCrawlConfig # noqa: F401
		from .tdocs import CrawlConfig, QueryConfig, TDocMetadata
		from .working_groups import WorkingGroup # noqa: F401
		from .working_groups import WORKING_GROUP_RECORDS, WorkingGroupRecord
		from .meetings import (
		MeetingCrawlConfig,
		MeetingMetadata, # noqa: F401
		MeetingQueryConfig,
		)
		from .subworking_groups import (
		CODE_INDEX,
		SUBTB_INDEX, # noqa: F401
		SUBWORKING_GROUP_RECORDS,
		SubWorkingGroupRecord,
		)
		from .tdocs import (
		CrawlConfig,
		QueryConfig,
		TDocCrawlConfig, # noqa: F401
		TDocMetadata,
		)
		from .working_groups import (
		WORKING_GROUP_RECORDS,
		WorkingGroup, # noqa: F401
		WorkingGroupRecord,
		)

		__all__ = [
		"CODE_INDEX",