tdocs: use TDocStatus enum in checkout and fix fetch/doclist URL handling (e277d273) · Commits · Jan Reimes / 3gpp-crawler

src/tdoc_crawler/tdocs/operations/checkout.py

+21 −3

Original line number	Diff line number	Diff line
		@@ -20,7 +20,7 @@ import requests
		from tdoc_crawler.http_client import download_to_file
		from tdoc_crawler.logging import get_logger
		from tdoc_crawler.meetings.models import MeetingMetadata
		from tdoc_crawler.tdocs.models import TDocMetadata
		from tdoc_crawler.tdocs.models import TDocMetadata, TDocStatus
		from tdoc_crawler.tdocs.sources.doclist import DocumentListError, fetch_meeting_document_list

		logger = get_logger(__name__)
		@@ -85,6 +85,11 @@ def checkout_tdoc(
		Raises:
		FileNotFoundError: If download fails or zip is empty
		"""
		# Check if TDoc is withdrawn - if so, skip download and log debug message
		if metadata.status == TDocStatus.WITHDRAWN:
		logger.debug(f"Skipping checkout for withdrawn TDoc {metadata.tdoc_id}")
		raise FileNotFoundError("withdrawn")

		checkout_path = get_checkout_path(metadata, checkout_dir)

		if checkout_path.exists() and not force:
		@@ -120,11 +125,19 @@ def prepare_tdoc_file(metadata: TDocMetadata, cache_dir: Path, return_dir: bool

		Returns:
		Path to the downloaded file, or the extract directory when return_dir is True.

		Raises:
		FileNotFoundError: If download fails for non-withdrawn TDocs
		"""
		# Handle the case where metadata.url is None
		if metadata.url is None:
		raise ValueError(f"Cannot prepare TDoc file for {metadata.tdoc_id}: URL is None")

		# Check if TDoc is withdrawn - if so, skip download and log debug message
		if metadata.status == TDocStatus.WITHDRAWN:
		logger.debug(f"Skipping download for withdrawn TDoc {metadata.tdoc_id}")
		raise FileNotFoundError("withdrawn")

		downloads_dir = cache_dir / "checkout"
		downloads_dir.mkdir(parents=True, exist_ok=True)
		path = urlparse(metadata.url).path
		@@ -244,6 +257,11 @@ def checkout_tdocs(
		checkout_tdoc(metadata, checkout_dir, force=force, session=session)
		success_count += 1
		except (FileNotFoundError, OSError, ValueError, zipfile.BadZipFile) as exc:
		error_message = str(exc)
		# For withdrawn TDocs, log debug instead of adding to errors
		if error_message == "withdrawn": # This matches the FileNotFoundError message
		logger.debug(f"Skipped withdrawn TDoc {metadata.tdoc_id}")
		else:
		errors.append(f"{metadata.tdoc_id}: {exc}")
		error_count += 1

src/tdoc_crawler/tdocs/operations/crawl.py

+1 −0

Original line number	Diff line number	Diff line
		@@ -142,6 +142,7 @@ class TDocCrawler:
		timeout=config.timeout,
		http_cache=config.http_cache,
		cache_manager_name=config.cache_manager_name,
		files_url=meeting.files_url,
		)
		futures[future] = meeting

src/tdoc_crawler/tdocs/operations/fetch.py

+1 −1

Original line number	Diff line number	Diff line
		@@ -145,7 +145,7 @@ def fetch_tdoc(
		for_purpose="unknown",
		agenda_item_nbr=Decimal("0"),
		agenda_item_text="Unknown",
		status="",
		status=None,
		meeting_name=None,
		is_revision_of=None,
		file_size=None,

src/tdoc_crawler/tdocs/sources/doclist.py

+33 −5

Original line number	Diff line number	Diff line
		@@ -28,7 +28,11 @@ class DocumentListError(Exception):


		def fetch_meeting_document_list(
		meeting_id: int, timeout: int = 30, http_cache: HttpCacheConfig \| None = None, cache_manager_name: str \| None = None
		meeting_id: int,
		timeout: int = 30,
		http_cache: HttpCacheConfig \| None = None,
		cache_manager_name: str \| None = None,
		files_url: str \| None = None,
		) -> list[TDocMetadata]:
		"""Fetch all TDoc metadata for a meeting via document list Excel file.

		@@ -37,6 +41,8 @@ def fetch_meeting_document_list(
		timeout: Request timeout in seconds
		http_cache: Optional HTTP cache configuration for caching document list responses
		cache_manager_name: Optional name of cache manager to use for HTTP caching
		files_url: Optional HTTP directory containing meeting documents (for URL construction)

		Returns:
		List of TDocMetadata instances for all TDocs in the meeting

		@@ -67,7 +73,7 @@ def fetch_meeting_document_list(

		# Parse Excel file
		logger.debug(f"Parsing Excel document list for meeting {meeting_id}")
		return parse_excel_document_list(response.content, meeting_id)
		return parse_excel_document_list(response.content, meeting_id, files_url)

		except Exception as exc:
		if isinstance(exc, DocumentListError):
		@@ -81,12 +87,14 @@ def fetch_meeting_document_list(
		def parse_excel_document_list(
		excel_content: bytes,
		meeting_id: int,
		files_url: str \| None = None,
		) -> list[TDocMetadata]:
		"""Parse Excel document list and convert to TDocMetadata instances.

		Args:
		excel_content: Raw Excel file content
		meeting_id: Meeting ID for reference
		files_url: Optional HTTP directory containing meeting documents (for URL construction)

		Returns:
		List of TDocMetadata instances
		@@ -117,7 +125,7 @@ def parse_excel_document_list(
		tdoc_metadata_list = []
		for i, (_idx, row) in enumerate(df.iterrows()):
		try:
		tdoc_metadata = convert_excel_row_to_tdoc_metadata(row, meeting_id)
		tdoc_metadata = convert_excel_row_to_tdoc_metadata(row, meeting_id, files_url)
		if tdoc_metadata:
		tdoc_metadata_list.append(tdoc_metadata)
		else:
		@@ -136,12 +144,14 @@ def parse_excel_document_list(
		def convert_excel_row_to_tdoc_metadata(
		row: pd.Series,
		meeting_id: int,
		files_url: str \| None = None,
		) -> TDocMetadata \| None:
		"""Convert a single Excel row to TDocMetadata.

		Args:
		row: pandas Series representing one Excel row
		meeting_id: Meeting ID for reference
		files_url: Optional HTTP directory containing meeting documents (for URL construction)

		Returns:
		TDocMetadata instance or None if conversion fails
		@@ -164,8 +174,9 @@ def convert_excel_row_to_tdoc_metadata(
		is_revision_of = _get_column_value(row, ["Is revision of"])
		date_created = _get_column_value(row, ["Date", "Created", "Date Created", "Submission Date", "Reservation date"])

		# Generate URL (this will be validated/updated later by the directory crawler)
		url = f"https://www.3gpp.org/ftp/tsg_ran/.../{tdoc_id.upper()}.zip"
		# Generate URL from files_url if available
		# Format: files_url + "/Docs/" + tdoc_id + ".zip"
		url = _construct_tdoc_url(files_url, tdoc_id.upper()) if files_url else None

		now = datetime.now(UTC)

		@@ -205,6 +216,23 @@ def convert_excel_row_to_tdoc_metadata(
		return None


		def _construct_tdoc_url(files_url: str \| None, tdoc_id: str) -> str \| None:
		"""Construct the full TDoc URL from files_url.

		Args:
		files_url: Base HTTP directory containing meeting documents
		tdoc_id: TDoc identifier (e.g., "S4-260454")

		Returns:
		Full URL to TDoc zip file (e.g., "https://.../Docs/S4-260454.zip")
		or None if files_url is not available
		"""
		if not files_url:
		return None
		base = files_url.rstrip("/")
		return f"{base}/Docs/{tdoc_id.upper()}.zip"


		def _extract_tdoc_id(row: pd.Series) -> str \| None:
		"""Extract TDoc ID from Excel row.

src/tdoc_crawler/workers/tdoc_worker.py

+9 −1

Original line number	Diff line number	Diff line
		@@ -16,6 +16,7 @@ def fetch_meeting_document_list_subinterpreter(
		timeout: int,
		http_cache: HttpCacheConfig \| None = None,
		cache_manager_name: str \| None = None,
		files_url: str \| None = None,
		) -> str:
		"""Fetch meeting document list in subinterpreter context.

		@@ -26,13 +27,20 @@ def fetch_meeting_document_list_subinterpreter(
		timeout: Request timeout in seconds
		http_cache: Optional HTTP cache configuration for caching document list responses
		cache_manager_name: Optional name of cache manager to use for HTTP caching
		files_url: Optional HTTP directory containing meeting documents (for URL construction)

		Returns:
		JSON string containing list of TDocMetadata or error information
		"""
		try:
		# Fetch document list for the meeting
		tdoc_metadata_list = fetch_meeting_document_list(meeting_id=meeting_id, timeout=timeout, http_cache=http_cache, cache_manager_name=cache_manager_name)
		tdoc_metadata_list = fetch_meeting_document_list(
		meeting_id=meeting_id,
		timeout=timeout,
		http_cache=http_cache,
		cache_manager_name=cache_manager_name,
		files_url=files_url,
		)

		# Serialize to JSON for inter-process communication
		serialized = []