Loading src/tdoc_crawler/tdocs/operations/checkout.py +21 −3 Original line number Diff line number Diff line Loading @@ -20,7 +20,7 @@ import requests from tdoc_crawler.http_client import download_to_file from tdoc_crawler.logging import get_logger from tdoc_crawler.meetings.models import MeetingMetadata from tdoc_crawler.tdocs.models import TDocMetadata from tdoc_crawler.tdocs.models import TDocMetadata, TDocStatus from tdoc_crawler.tdocs.sources.doclist import DocumentListError, fetch_meeting_document_list logger = get_logger(__name__) Loading Loading @@ -85,6 +85,11 @@ def checkout_tdoc( Raises: FileNotFoundError: If download fails or zip is empty """ # Check if TDoc is withdrawn - if so, skip download and log debug message if metadata.status == TDocStatus.WITHDRAWN: logger.debug(f"Skipping checkout for withdrawn TDoc {metadata.tdoc_id}") raise FileNotFoundError("withdrawn") checkout_path = get_checkout_path(metadata, checkout_dir) if checkout_path.exists() and not force: Loading Loading @@ -120,11 +125,19 @@ def prepare_tdoc_file(metadata: TDocMetadata, cache_dir: Path, return_dir: bool Returns: Path to the downloaded file, or the extract directory when return_dir is True. Raises: FileNotFoundError: If download fails for non-withdrawn TDocs """ # Handle the case where metadata.url is None if metadata.url is None: raise ValueError(f"Cannot prepare TDoc file for {metadata.tdoc_id}: URL is None") # Check if TDoc is withdrawn - if so, skip download and log debug message if metadata.status == TDocStatus.WITHDRAWN: logger.debug(f"Skipping download for withdrawn TDoc {metadata.tdoc_id}") raise FileNotFoundError("withdrawn") downloads_dir = cache_dir / "checkout" downloads_dir.mkdir(parents=True, exist_ok=True) path = urlparse(metadata.url).path Loading Loading @@ -244,6 +257,11 @@ def checkout_tdocs( checkout_tdoc(metadata, checkout_dir, force=force, session=session) success_count += 1 except (FileNotFoundError, OSError, ValueError, zipfile.BadZipFile) as exc: error_message = str(exc) # For withdrawn TDocs, log debug instead of adding to errors if error_message == "withdrawn": # This matches the FileNotFoundError message logger.debug(f"Skipped withdrawn TDoc {metadata.tdoc_id}") else: errors.append(f"{metadata.tdoc_id}: {exc}") error_count += 1 Loading src/tdoc_crawler/tdocs/operations/crawl.py +1 −0 Original line number Diff line number Diff line Loading @@ -142,6 +142,7 @@ class TDocCrawler: timeout=config.timeout, http_cache=config.http_cache, cache_manager_name=config.cache_manager_name, files_url=meeting.files_url, ) futures[future] = meeting Loading src/tdoc_crawler/tdocs/operations/fetch.py +1 −1 Original line number Diff line number Diff line Loading @@ -145,7 +145,7 @@ def fetch_tdoc( for_purpose="unknown", agenda_item_nbr=Decimal("0"), agenda_item_text="Unknown", status="", status=None, meeting_name=None, is_revision_of=None, file_size=None, Loading src/tdoc_crawler/tdocs/sources/doclist.py +33 −5 Original line number Diff line number Diff line Loading @@ -28,7 +28,11 @@ class DocumentListError(Exception): def fetch_meeting_document_list( meeting_id: int, timeout: int = 30, http_cache: HttpCacheConfig | None = None, cache_manager_name: str | None = None meeting_id: int, timeout: int = 30, http_cache: HttpCacheConfig | None = None, cache_manager_name: str | None = None, files_url: str | None = None, ) -> list[TDocMetadata]: """Fetch all TDoc metadata for a meeting via document list Excel file. Loading @@ -37,6 +41,8 @@ def fetch_meeting_document_list( timeout: Request timeout in seconds http_cache: Optional HTTP cache configuration for caching document list responses cache_manager_name: Optional name of cache manager to use for HTTP caching files_url: Optional HTTP directory containing meeting documents (for URL construction) Returns: List of TDocMetadata instances for all TDocs in the meeting Loading Loading @@ -67,7 +73,7 @@ def fetch_meeting_document_list( # Parse Excel file logger.debug(f"Parsing Excel document list for meeting {meeting_id}") return parse_excel_document_list(response.content, meeting_id) return parse_excel_document_list(response.content, meeting_id, files_url) except Exception as exc: if isinstance(exc, DocumentListError): Loading @@ -81,12 +87,14 @@ def fetch_meeting_document_list( def parse_excel_document_list( excel_content: bytes, meeting_id: int, files_url: str | None = None, ) -> list[TDocMetadata]: """Parse Excel document list and convert to TDocMetadata instances. Args: excel_content: Raw Excel file content meeting_id: Meeting ID for reference files_url: Optional HTTP directory containing meeting documents (for URL construction) Returns: List of TDocMetadata instances Loading Loading @@ -117,7 +125,7 @@ def parse_excel_document_list( tdoc_metadata_list = [] for i, (_idx, row) in enumerate(df.iterrows()): try: tdoc_metadata = convert_excel_row_to_tdoc_metadata(row, meeting_id) tdoc_metadata = convert_excel_row_to_tdoc_metadata(row, meeting_id, files_url) if tdoc_metadata: tdoc_metadata_list.append(tdoc_metadata) else: Loading @@ -136,12 +144,14 @@ def parse_excel_document_list( def convert_excel_row_to_tdoc_metadata( row: pd.Series, meeting_id: int, files_url: str | None = None, ) -> TDocMetadata | None: """Convert a single Excel row to TDocMetadata. Args: row: pandas Series representing one Excel row meeting_id: Meeting ID for reference files_url: Optional HTTP directory containing meeting documents (for URL construction) Returns: TDocMetadata instance or None if conversion fails Loading @@ -164,8 +174,9 @@ def convert_excel_row_to_tdoc_metadata( is_revision_of = _get_column_value(row, ["Is revision of"]) date_created = _get_column_value(row, ["Date", "Created", "Date Created", "Submission Date", "Reservation date"]) # Generate URL (this will be validated/updated later by the directory crawler) url = f"https://www.3gpp.org/ftp/tsg_ran/.../{tdoc_id.upper()}.zip" # Generate URL from files_url if available # Format: files_url + "/Docs/" + tdoc_id + ".zip" url = _construct_tdoc_url(files_url, tdoc_id.upper()) if files_url else None now = datetime.now(UTC) Loading Loading @@ -205,6 +216,23 @@ def convert_excel_row_to_tdoc_metadata( return None def _construct_tdoc_url(files_url: str | None, tdoc_id: str) -> str | None: """Construct the full TDoc URL from files_url. Args: files_url: Base HTTP directory containing meeting documents tdoc_id: TDoc identifier (e.g., "S4-260454") Returns: Full URL to TDoc zip file (e.g., "https://.../Docs/S4-260454.zip") or None if files_url is not available """ if not files_url: return None base = files_url.rstrip("/") return f"{base}/Docs/{tdoc_id.upper()}.zip" def _extract_tdoc_id(row: pd.Series) -> str | None: """Extract TDoc ID from Excel row. Loading src/tdoc_crawler/workers/tdoc_worker.py +9 −1 Original line number Diff line number Diff line Loading @@ -16,6 +16,7 @@ def fetch_meeting_document_list_subinterpreter( timeout: int, http_cache: HttpCacheConfig | None = None, cache_manager_name: str | None = None, files_url: str | None = None, ) -> str: """Fetch meeting document list in subinterpreter context. Loading @@ -26,13 +27,20 @@ def fetch_meeting_document_list_subinterpreter( timeout: Request timeout in seconds http_cache: Optional HTTP cache configuration for caching document list responses cache_manager_name: Optional name of cache manager to use for HTTP caching files_url: Optional HTTP directory containing meeting documents (for URL construction) Returns: JSON string containing list of TDocMetadata or error information """ try: # Fetch document list for the meeting tdoc_metadata_list = fetch_meeting_document_list(meeting_id=meeting_id, timeout=timeout, http_cache=http_cache, cache_manager_name=cache_manager_name) tdoc_metadata_list = fetch_meeting_document_list( meeting_id=meeting_id, timeout=timeout, http_cache=http_cache, cache_manager_name=cache_manager_name, files_url=files_url, ) # Serialize to JSON for inter-process communication serialized = [] Loading Loading
src/tdoc_crawler/tdocs/operations/checkout.py +21 −3 Original line number Diff line number Diff line Loading @@ -20,7 +20,7 @@ import requests from tdoc_crawler.http_client import download_to_file from tdoc_crawler.logging import get_logger from tdoc_crawler.meetings.models import MeetingMetadata from tdoc_crawler.tdocs.models import TDocMetadata from tdoc_crawler.tdocs.models import TDocMetadata, TDocStatus from tdoc_crawler.tdocs.sources.doclist import DocumentListError, fetch_meeting_document_list logger = get_logger(__name__) Loading Loading @@ -85,6 +85,11 @@ def checkout_tdoc( Raises: FileNotFoundError: If download fails or zip is empty """ # Check if TDoc is withdrawn - if so, skip download and log debug message if metadata.status == TDocStatus.WITHDRAWN: logger.debug(f"Skipping checkout for withdrawn TDoc {metadata.tdoc_id}") raise FileNotFoundError("withdrawn") checkout_path = get_checkout_path(metadata, checkout_dir) if checkout_path.exists() and not force: Loading Loading @@ -120,11 +125,19 @@ def prepare_tdoc_file(metadata: TDocMetadata, cache_dir: Path, return_dir: bool Returns: Path to the downloaded file, or the extract directory when return_dir is True. Raises: FileNotFoundError: If download fails for non-withdrawn TDocs """ # Handle the case where metadata.url is None if metadata.url is None: raise ValueError(f"Cannot prepare TDoc file for {metadata.tdoc_id}: URL is None") # Check if TDoc is withdrawn - if so, skip download and log debug message if metadata.status == TDocStatus.WITHDRAWN: logger.debug(f"Skipping download for withdrawn TDoc {metadata.tdoc_id}") raise FileNotFoundError("withdrawn") downloads_dir = cache_dir / "checkout" downloads_dir.mkdir(parents=True, exist_ok=True) path = urlparse(metadata.url).path Loading Loading @@ -244,6 +257,11 @@ def checkout_tdocs( checkout_tdoc(metadata, checkout_dir, force=force, session=session) success_count += 1 except (FileNotFoundError, OSError, ValueError, zipfile.BadZipFile) as exc: error_message = str(exc) # For withdrawn TDocs, log debug instead of adding to errors if error_message == "withdrawn": # This matches the FileNotFoundError message logger.debug(f"Skipped withdrawn TDoc {metadata.tdoc_id}") else: errors.append(f"{metadata.tdoc_id}: {exc}") error_count += 1 Loading
src/tdoc_crawler/tdocs/operations/crawl.py +1 −0 Original line number Diff line number Diff line Loading @@ -142,6 +142,7 @@ class TDocCrawler: timeout=config.timeout, http_cache=config.http_cache, cache_manager_name=config.cache_manager_name, files_url=meeting.files_url, ) futures[future] = meeting Loading
src/tdoc_crawler/tdocs/operations/fetch.py +1 −1 Original line number Diff line number Diff line Loading @@ -145,7 +145,7 @@ def fetch_tdoc( for_purpose="unknown", agenda_item_nbr=Decimal("0"), agenda_item_text="Unknown", status="", status=None, meeting_name=None, is_revision_of=None, file_size=None, Loading
src/tdoc_crawler/tdocs/sources/doclist.py +33 −5 Original line number Diff line number Diff line Loading @@ -28,7 +28,11 @@ class DocumentListError(Exception): def fetch_meeting_document_list( meeting_id: int, timeout: int = 30, http_cache: HttpCacheConfig | None = None, cache_manager_name: str | None = None meeting_id: int, timeout: int = 30, http_cache: HttpCacheConfig | None = None, cache_manager_name: str | None = None, files_url: str | None = None, ) -> list[TDocMetadata]: """Fetch all TDoc metadata for a meeting via document list Excel file. Loading @@ -37,6 +41,8 @@ def fetch_meeting_document_list( timeout: Request timeout in seconds http_cache: Optional HTTP cache configuration for caching document list responses cache_manager_name: Optional name of cache manager to use for HTTP caching files_url: Optional HTTP directory containing meeting documents (for URL construction) Returns: List of TDocMetadata instances for all TDocs in the meeting Loading Loading @@ -67,7 +73,7 @@ def fetch_meeting_document_list( # Parse Excel file logger.debug(f"Parsing Excel document list for meeting {meeting_id}") return parse_excel_document_list(response.content, meeting_id) return parse_excel_document_list(response.content, meeting_id, files_url) except Exception as exc: if isinstance(exc, DocumentListError): Loading @@ -81,12 +87,14 @@ def fetch_meeting_document_list( def parse_excel_document_list( excel_content: bytes, meeting_id: int, files_url: str | None = None, ) -> list[TDocMetadata]: """Parse Excel document list and convert to TDocMetadata instances. Args: excel_content: Raw Excel file content meeting_id: Meeting ID for reference files_url: Optional HTTP directory containing meeting documents (for URL construction) Returns: List of TDocMetadata instances Loading Loading @@ -117,7 +125,7 @@ def parse_excel_document_list( tdoc_metadata_list = [] for i, (_idx, row) in enumerate(df.iterrows()): try: tdoc_metadata = convert_excel_row_to_tdoc_metadata(row, meeting_id) tdoc_metadata = convert_excel_row_to_tdoc_metadata(row, meeting_id, files_url) if tdoc_metadata: tdoc_metadata_list.append(tdoc_metadata) else: Loading @@ -136,12 +144,14 @@ def parse_excel_document_list( def convert_excel_row_to_tdoc_metadata( row: pd.Series, meeting_id: int, files_url: str | None = None, ) -> TDocMetadata | None: """Convert a single Excel row to TDocMetadata. Args: row: pandas Series representing one Excel row meeting_id: Meeting ID for reference files_url: Optional HTTP directory containing meeting documents (for URL construction) Returns: TDocMetadata instance or None if conversion fails Loading @@ -164,8 +174,9 @@ def convert_excel_row_to_tdoc_metadata( is_revision_of = _get_column_value(row, ["Is revision of"]) date_created = _get_column_value(row, ["Date", "Created", "Date Created", "Submission Date", "Reservation date"]) # Generate URL (this will be validated/updated later by the directory crawler) url = f"https://www.3gpp.org/ftp/tsg_ran/.../{tdoc_id.upper()}.zip" # Generate URL from files_url if available # Format: files_url + "/Docs/" + tdoc_id + ".zip" url = _construct_tdoc_url(files_url, tdoc_id.upper()) if files_url else None now = datetime.now(UTC) Loading Loading @@ -205,6 +216,23 @@ def convert_excel_row_to_tdoc_metadata( return None def _construct_tdoc_url(files_url: str | None, tdoc_id: str) -> str | None: """Construct the full TDoc URL from files_url. Args: files_url: Base HTTP directory containing meeting documents tdoc_id: TDoc identifier (e.g., "S4-260454") Returns: Full URL to TDoc zip file (e.g., "https://.../Docs/S4-260454.zip") or None if files_url is not available """ if not files_url: return None base = files_url.rstrip("/") return f"{base}/Docs/{tdoc_id.upper()}.zip" def _extract_tdoc_id(row: pd.Series) -> str | None: """Extract TDoc ID from Excel row. Loading
src/tdoc_crawler/workers/tdoc_worker.py +9 −1 Original line number Diff line number Diff line Loading @@ -16,6 +16,7 @@ def fetch_meeting_document_list_subinterpreter( timeout: int, http_cache: HttpCacheConfig | None = None, cache_manager_name: str | None = None, files_url: str | None = None, ) -> str: """Fetch meeting document list in subinterpreter context. Loading @@ -26,13 +27,20 @@ def fetch_meeting_document_list_subinterpreter( timeout: Request timeout in seconds http_cache: Optional HTTP cache configuration for caching document list responses cache_manager_name: Optional name of cache manager to use for HTTP caching files_url: Optional HTTP directory containing meeting documents (for URL construction) Returns: JSON string containing list of TDocMetadata or error information """ try: # Fetch document list for the meeting tdoc_metadata_list = fetch_meeting_document_list(meeting_id=meeting_id, timeout=timeout, http_cache=http_cache, cache_manager_name=cache_manager_name) tdoc_metadata_list = fetch_meeting_document_list( meeting_id=meeting_id, timeout=timeout, http_cache=http_cache, cache_manager_name=cache_manager_name, files_url=files_url, ) # Serialize to JSON for inter-process communication serialized = [] Loading