Commit a29a998f authored by Jan Reimes's avatar Jan Reimes
Browse files

feat(portal): introduce PortalClient for unified portal interactions

- Consolidate session management, authentication, and TDoc fetching into PortalClient.
- Update fetch_missing_tdocs to utilize create_portal_client for metadata fetching.
- Ensure backward compatibility with existing functions and classes.
- Add comprehensive tests for PortalClient functionality and initialization.
parent c4ddce12
Loading
Loading
Loading
Loading
+5 −3
Original line number Diff line number Diff line
@@ -9,7 +9,7 @@ from pydantic import ValidationError

from tdoc_crawler.cli.console import get_console
from tdoc_crawler.cli.helpers import resolve_meeting_id
from tdoc_crawler.crawlers import TDocCrawlResult, WhatTheSpecResolutionError, fetch_tdoc_metadata, resolve_via_whatthespec
from tdoc_crawler.crawlers import TDocCrawlResult, WhatTheSpecResolutionError, create_portal_client, resolve_via_whatthespec
from tdoc_crawler.credentials import resolve_credentials
from tdoc_crawler.database import TDocDatabase
from tdoc_crawler.models import HttpCacheConfig, PortalCredentials, QueryConfig, TDocMetadata
@@ -53,10 +53,12 @@ def fetch_missing_tdocs(
    inserted_count = 0
    updated_count = 0

    client = create_portal_client(credentials=credentials, cache_dir=cache_dir)

    for tdoc_id in missing_ids:
        try:
            # Fetch metadata from portal (now returns TDocMetadata directly)
            metadata = fetch_tdoc_metadata(tdoc_id, credentials)
            # Fetch metadata from portal using PortalClient
            metadata = client.fetch_tdoc_metadata(tdoc_id)

            # Resolve meeting_id from meeting name if available
            if metadata.meeting_name:
+4 −2
Original line number Diff line number Diff line
@@ -18,10 +18,11 @@ from tdoc_crawler.crawlers.meetings import MeetingCrawler, MeetingCrawlResult, n
from tdoc_crawler.crawlers.parallel import fetch_meeting_tdocs
from tdoc_crawler.crawlers.portal import (
    PortalAuthenticationError,
    PortalClient,
    PortalParsingError,
    PortalSession,
    create_portal_client,
    extract_tdoc_url_from_portal,
    fetch_tdoc_metadata,
    parse_tdoc_portal_page,
)
from tdoc_crawler.crawlers.tdocs import TDocCrawler, TDocCrawlResult
@@ -42,16 +43,17 @@ __all__ = [
    "MeetingCrawlResult",
    "MeetingCrawler",
    "PortalAuthenticationError",
    "PortalClient",
    "PortalParsingError",
    "PortalSession",
    "TDocCrawlResult",
    "TDocCrawler",
    "WhatTheSpecResolutionError",
    "convert_excel_row_to_tdoc_metadata",
    "create_portal_client",
    "extract_tdoc_url_from_portal",
    "fetch_meeting_document_list",
    "fetch_meeting_tdocs",
    "fetch_tdoc_metadata",
    "normalize_subgroup_alias",
    "normalize_working_group_alias",
    "parse_excel_document_list",
+296 −91
Original line number Diff line number Diff line
"""3GPP Portal authentication and TDoc metadata parsing."""
"""3GPP Portal authentication and TDoc metadata parsing.

This module provides the PortalClient class as the primary interface for
interacting with the 3GPP portal. It consolidates:

- Session management and authentication
- TDoc URL extraction (unauthenticated)
- TDoc metadata fetching (authenticated)
- HTML parsing utilities

For backward compatibility, standalone functions are also provided that delegate
to PortalClient under the hood.
"""

from __future__ import annotations

import logging
import re
import tempfile
from decimal import Decimal
from pathlib import Path
from typing import Any

import requests
from bs4 import BeautifulSoup
from pydantic import ValidationError

from tdoc_crawler.crawlers.constants import LOGIN_URL, PORTAL_BASE_URL, TDOC_DOWNLOAD_URL, TDOC_VIEW_URL
from tdoc_crawler.http_client import create_cached_session
@@ -19,6 +29,17 @@ from tdoc_crawler.models.tdocs import TDocMetadata

logger = logging.getLogger(__name__)

# Browser-like default headers; per the original note these avoid 403
# Forbidden responses from the portal. They are applied as session-wide
# defaults when the client builds its HTTP session.
_BROWSER_HEADERS: dict[str, str] = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
    "Accept-Language": "en-US,en;q=0.5",
    "Accept-Encoding": "gzip, deflate, br",
    "DNT": "1",  # Do Not Track
    "Connection": "keep-alive",
    "Upgrade-Insecure-Requests": "1",
}


class PortalAuthenticationError(Exception):
    """Raised when portal authentication fails.

    Raised when the login endpoint rejects the supplied credentials,
    or when a previously authenticated session has expired and the
    portal redirects back to the login page.
    """
@@ -29,7 +50,11 @@ class PortalParsingError(Exception):


class PortalSession:
    """Manages authenticated session with 3GPP portal."""
    """Manages authenticated session with 3GPP portal.

    Note: This class is kept for backward compatibility. New code should use
    PortalClient which provides a unified interface.
    """

    def __init__(self, credentials: PortalCredentials, cache_dir: Path, cache_ttl: int = 7200, cache_refresh_on_access: bool = True, timeout: int = 30) -> None:
        """Initialize portal session.
@@ -291,58 +316,6 @@ def parse_tdoc_portal_page(html: str, tdoc_id: str, url: str | None = None) -> T
        logger.warning(error_msg)
        raise PortalParsingError(error_msg)

    # URL is extracted from the status field download link during parsing
    # If no download link was found, url will be None

    # Parse agenda_item_nbr as Decimal
    try:
        agenda_nbr_decimal = Decimal(str(metadata["agenda_item_nbr"]))
    except Exception as exc:
        error_msg = f"Invalid agenda item number '{metadata['agenda_item_nbr']}' for TDoc {tdoc_id}: {exc}"
        logger.warning(error_msg)
        raise PortalParsingError(error_msg) from exc

    # Create TDocMetadata instance
    # At this point we've validated all mandatory fields exist and are not None
    try:
        tdoc_metadata = TDocMetadata(
            tdoc_id=tdoc_id.upper(),
            meeting_id=0,  # Will be resolved later from meeting_name
            meeting_name=metadata.get("meeting"),  # Store original meeting name for resolution
            title=str(metadata["title"]),
            url=url,
            source=str(metadata["source"]),
            contact=str(metadata["contact"]),
            tdoc_type=str(metadata.get("tdoc_type", "unknown")),
            for_purpose=str(metadata.get("for", "unknown")),
            agenda_item_nbr=agenda_nbr_decimal,
            agenda_item_text=str(metadata.get("agenda_item_text", "Unknown")),
            status=metadata.get("status"),
            is_revision_of=metadata.get("is_revision_of"),
            file_size=None,
            date_created=None,
            validated=True,
            validation_failed=False,
        )
        logger.debug(f"Successfully parsed metadata for TDoc {tdoc_id}")
        return tdoc_metadata

    except ValidationError as exc:
        # Extract and display only the actual validation errors, not the full stack trace
        validation_errors = []
        for error in exc.errors():
            field = error["loc"][0] if error["loc"] else "unknown"
            message = error["msg"]
            validation_errors.append(f"{field}: {message}")

        error_msg = f"Failed to create TDocMetadata for TDoc {tdoc_id}: {'; '.join(validation_errors)}"
        logger.warning(error_msg)
        raise PortalParsingError(error_msg) from exc
    except Exception as exc:
        error_msg = f"Failed to create TDocMetadata for TDoc {tdoc_id}: {exc}"
        logger.warning(error_msg)
        raise PortalParsingError(error_msg) from exc


def extract_tdoc_url_from_portal(tdoc_id: str, cache_dir: Path | None = None, timeout: int = 15) -> str:
    """Extract direct FTP download URL for a TDoc using unauthenticated DownloadTDoc.aspx endpoint.
@@ -430,46 +403,278 @@ def extract_tdoc_url_from_portal(tdoc_id: str, cache_dir: Path | None = None, ti
        session.close()


# TODO: ambigious name with PortalSession.fetch_tdoc_metadata!
def fetch_tdoc_metadata(
    tdoc_id: str,
    credentials: PortalCredentials,
class PortalClient:
    """Unified 3GPP portal client with authentication and TDoc fetching.

    This class consolidates all portal-related functionality into a single interface:
    - Session management with browser headers
    - Authentication with EOL credentials
    - Unauthenticated TDoc URL extraction
    - Authenticated TDoc metadata fetching
    - HTML parsing utilities

    Example:
        >>> client = PortalClient(credentials=creds)
        >>> url = client.extract_tdoc_url("S4-251364")
        >>> metadata = client.fetch_tdoc_metadata("S4-251364", url)
    """

    def __init__(
        self,
        credentials: PortalCredentials | None = None,
        cache_dir: Path | None = None,
        cache_ttl: int = 7200,
        cache_refresh_on_access: bool = True,
        timeout: int = 30,
    url: str | None = None,
) -> TDocMetadata:
    """Fetch TDoc metadata from 3GPP portal (convenience function).
    ) -> None:
        """Initialize portal client.

        Args:
        tdoc_id: TDoc identifier (e.g., 'S4-251364')
        credentials: ETSI Online Account credentials
        cache_dir: Directory for HTTP cache storage (defaults to temp directory)
        cache_ttl: HTTP cache TTL in seconds
        cache_refresh_on_access: Whether to refresh cache TTL on access
        timeout: Request timeout in seconds
        url: Optional TDoc URL (if known)
            credentials: Optional credentials (required for full metadata fetching).
            cache_dir: Directory for HTTP cache storage.
            cache_ttl: Cache TTL in seconds.
            cache_refresh_on_access: Whether to refresh TTL on access.
            timeout: Request timeout in seconds.
        """
        self.credentials = credentials
        self.cache_dir = cache_dir or Path.home() / ".tdoc-crawler"
        self.timeout = timeout
        self._cache_ttl = cache_ttl
        self._cache_refresh_on_access = cache_refresh_on_access
        self._authenticated = False
        self._session: requests.Session | None = None

    def close(self) -> None:
        """Dispose of the HTTP session, if one was ever created.

        Safe to call multiple times; subsequent calls are no-ops until
        a new session is lazily created again.
        """
        if self._session is None:
            return
        self._session.close()
        self._session = None

    def __enter__(self) -> PortalClient:
        """Enter context manager; returns this client unchanged."""
        return self

    def __exit__(self, *args: Any) -> None:
        """Exit context manager and close the underlying session.

        Exception details (type, value, traceback) are ignored here, so
        any in-flight exception propagates to the caller normally.
        """
        self.close()

    def authenticate(self) -> None:
        """Authenticate with the 3GPP portal using EOL credentials.

        Idempotent: returns immediately if this client has already
        authenticated. Network failures surface as requests exceptions
        via ``raise_for_status``.

        Raises:
            PortalAuthenticationError: If the login endpoint rejects the
                credentials (it signals this with the body text "failed").
            ValueError: If no credentials were provided.
        """
        if self._authenticated:
            return

        if self.credentials is None:
            raise ValueError("Credentials required for authentication")

        logger.info("Authenticating with 3GPP portal...")

        session = self._get_session()

        # Step 1: Visit the login page to establish session and get cookies
        logger.debug("Visiting login page to establish session...")
        initial_response = session.get(LOGIN_URL, timeout=self.timeout)
        initial_response.raise_for_status()

        # Step 2: Call the AJAX login endpoint
        login_api_url = f"{PORTAL_BASE_URL}/ETSIPages/LoginEOL.ashx"
        login_payload = {
            "username": self.credentials.username,
            "password": self.credentials.password,
        }

        logger.debug(f"Calling login API at {login_api_url}")

        # AJAX-style headers (X-Requested-With, Referer) mirror what the
        # portal's own login page sends.
        login_response = session.post(
            login_api_url,
            json=login_payload,
            headers={
                "Content-Type": "application/json; charset=UTF-8",
                "Accept": "application/json, text/javascript, */*; q=0.01",
                "X-Requested-With": "XMLHttpRequest",
                "Referer": LOGIN_URL,
            },
            timeout=self.timeout,
        )
        login_response.raise_for_status()

        response_text = login_response.text.strip()
        logger.debug(f"Login API response: {response_text}")

        # A 200 response can still mean a rejected login: the endpoint
        # reports failure with the literal body "failed" (case-insensitive).
        if response_text.lower() == "failed":
            raise PortalAuthenticationError("Authentication failed - check credentials")

        self._authenticated = True
        logger.info("Successfully authenticated with 3GPP portal")

    def extract_tdoc_url(self, tdoc_id: str) -> str:
        """Extract direct FTP download URL for a TDoc (unauthenticated).

        Args:
            tdoc_id: TDoc identifier (e.g., 'S4-251364').

        Returns:
        TDocMetadata instance with portal metadata
            Direct FTP/HTTPS URL for the TDoc download.

        Raises:
        PortalAuthenticationError: If authentication fails
        PortalParsingError: If page parsing fails or TDoc not found
            PortalParsingError: If TDoc ID is invalid or URL extraction fails.
            requests.RequestException: For network errors.
        """
    # Use temporary directory if no cache_dir provided for backward compatibility
    if cache_dir is None:
        cache_dir = Path(tempfile.gettempdir()) / "tdoc_crawler_test"
        cache_dir.mkdir(exist_ok=True)
        logger.debug(f"Extracting TDoc URL from DownloadTDoc endpoint for {tdoc_id}")

        download_url = f"{TDOC_DOWNLOAD_URL}?contributionUid={tdoc_id}"
        session = self._get_session()

        try:
            response = session.get(download_url, timeout=self.timeout)
            response.raise_for_status()

            if "cannot be found" in response.text.lower() or "not found" in response.text.lower():
                raise PortalParsingError(f"TDoc {tdoc_id} not found on portal")

            # Extract URL from JavaScript redirect pattern
            pattern = r"window\.location\.href\s*=\s*['\"]([^'\"]+)['\"]"
            match = re.search(pattern, response.text)

            if not match:
                # Try CDATA section pattern
                cdata_pattern = r"<!\[CDATA\[(.*?)\]\]>"
                cdata_matches = re.findall(cdata_pattern, response.text)
                for cdata_match in cdata_matches:
                    inner_match = re.search(r"window\.location\.href\s*=\s*['\"]([^'\"]+)['\"]", cdata_match)
                    if inner_match:
                        match = inner_match
                        break

            if not match:
                raise PortalParsingError(f"Failed to extract URL for TDoc {tdoc_id}: JavaScript redirect not found")

            extracted_url = match.group(1).strip()

            if not extracted_url.startswith(("http://", "https://", "ftp://")):
                raise PortalParsingError(f"Invalid URL format for TDoc {tdoc_id}: {extracted_url}")

            logger.debug(f"Successfully extracted TDoc URL for {tdoc_id}: {extracted_url}")
            return extracted_url

        except requests.RequestException:
            raise
        except PortalParsingError:
            raise
        except Exception as exc:
            error_msg = f"Failed to extract URL for TDoc {tdoc_id}: {exc}"
            logger.error(error_msg)
            raise PortalParsingError(error_msg) from exc

    def fetch_tdoc_metadata(self, tdoc_id: str, url: str | None = None) -> TDocMetadata:
        """Fetch full TDoc metadata from portal (requires authentication).

        Args:
            tdoc_id: TDoc identifier (e.g., 'S4-251364').
            url: Optional TDoc URL (if known, skips URL extraction).

        Returns:
            TDocMetadata instance with full portal metadata.

    # Try to extract URL from unauthenticated endpoint first
        Raises:
            PortalAuthenticationError: If authentication fails.
            PortalParsingError: If page parsing fails or TDoc not found.
            ValueError: If no credentials were provided.
        """
        # Extract URL if not provided
        if url is None:
            try:
            url = extract_tdoc_url_from_portal(tdoc_id, cache_dir=cache_dir, timeout=min(timeout, 15))
                url = self.extract_tdoc_url(tdoc_id)
                logger.debug(f"Using extracted URL for {tdoc_id}")
            except Exception as e:
            logger.debug(f"URL extraction failed for {tdoc_id}, falling back to authenticated method: {e}")
            # Continue with authenticated method below
                logger.debug(f"URL extraction failed for {tdoc_id}: {e}")
                # Continue - authenticated method can still work without pre-extracted URL

        # Ensure authenticated
        self.authenticate()

        # Fetch TDoc page
        view_url = f"{TDOC_VIEW_URL}?mode=view&contributionUid={tdoc_id}"
        logger.debug(f"Fetching TDoc metadata from {view_url}")

        session = self._get_session()
        response = session.get(view_url, timeout=self.timeout)
        response.raise_for_status()

        # Check if redirected to login (session expired)
        if "login.aspx" in response.url.lower():
            self._authenticated = False
            raise PortalAuthenticationError("Session expired - re-authentication required")

        # Parse the page
        return self.parse_tdoc_page(response.text, tdoc_id, url)

    @staticmethod
    def parse_tdoc_page(html: str, tdoc_id: str, url: str | None = None) -> TDocMetadata:
        """Parse TDoc metadata from portal HTML page.

        This is a static method for cases where you have raw HTML content
        and want to parse it without making HTTP requests.

        Args:
            html: HTML content of the TDoc portal page.
            tdoc_id: TDoc identifier for logging.
            url: Optional TDoc URL (if known).

    with PortalSession(credentials, cache_dir, cache_ttl, cache_refresh_on_access, timeout) as session:
        return session.fetch_tdoc_metadata(tdoc_id, url)
        Returns:
            TDocMetadata instance with portal metadata.

        Raises:
            PortalParsingError: If TDoc not found, metadata table not found, or mandatory fields missing.
        """
        return parse_tdoc_portal_page(html, tdoc_id, url)

    def _get_session(self) -> requests.Session:
        """Lazily build and memoize the shared HTTP-cached session.

        On first use, constructs a cached session with this client's
        cache settings and installs the browser-style default headers;
        every later call returns the same instance.

        Returns:
            The memoized requests.Session configured for portal access.
        """
        session = self._session
        if session is None:
            session = create_cached_session(
                cache_dir=self.cache_dir,
                ttl=self._cache_ttl,
                refresh_ttl_on_access=self._cache_refresh_on_access,
                max_retries=3,
            )
            session.headers.update(_BROWSER_HEADERS)
            self._session = session
        return session


def create_portal_client(
    credentials: PortalCredentials | None = None,
    cache_dir: Path | None = None,
    cache_ttl: int = 7200,
    cache_refresh_on_access: bool = True,
    timeout: int = 30,
) -> PortalClient:
    """Build a PortalClient configured with the given settings.

    Recommended entry point for new code that needs portal access; it
    simply forwards every argument to the PortalClient constructor.

    Args:
        credentials: Optional credentials (required for full metadata).
        cache_dir: Directory for HTTP cache storage.
        cache_ttl: Cache TTL in seconds.
        cache_refresh_on_access: Whether to refresh TTL on access.
        timeout: Request timeout in seconds.

    Returns:
        Configured PortalClient instance.
    """
    client = PortalClient(
        credentials=credentials,
        cache_dir=cache_dir,
        cache_ttl=cache_ttl,
        cache_refresh_on_access=cache_refresh_on_access,
        timeout=timeout,
    )
    return client
+21 −19
Original line number Diff line number Diff line
@@ -13,7 +13,7 @@ from __future__ import annotations
import logging
from pathlib import Path

from tdoc_crawler.crawlers.portal import extract_tdoc_url_from_portal, fetch_tdoc_metadata
from tdoc_crawler.crawlers.portal import create_portal_client
from tdoc_crawler.crawlers.whatthespec import resolve_via_whatthespec
from tdoc_crawler.models.base import HttpCacheConfig, PortalCredentials
from tdoc_crawler.models.tdocs import TDocMetadata
@@ -47,42 +47,44 @@ def fetch_tdoc(
    Raises:
        Exception: If fetching fails for any reason.
    """
    # Import here to avoid circular imports

    if use_whatthespec:
        # Always use WhatTheSpec method (Method 3)
        logger.debug(f"Fetching {tdoc_id} via WhatTheSpec API")
        return resolve_via_whatthespec(tdoc_id, cache_dir, http_cache, timeout)
        metadata = resolve_via_whatthespec(tdoc_id, cache_dir, http_cache, timeout)
        if metadata is None:
            raise ValueError(f"TDoc {tdoc_id} not found via WhatTheSpec")
        return metadata

    elif full_metadata:
        # Use authenticated portal method (Method 2)
        if credentials is None:
            raise ValueError("Portal credentials required for full metadata fetching")
        logger.debug(f"Fetching {tdoc_id} via authenticated 3GPP portal")
        return fetch_tdoc_metadata(tdoc_id, credentials, cache_dir, http_cache.ttl, http_cache.refresh_ttl_on_access, timeout)
        client = create_portal_client(
            credentials=credentials,
            cache_dir=cache_dir,
            cache_ttl=http_cache.ttl,
            cache_refresh_on_access=http_cache.refresh_ttl_on_access,
            timeout=timeout,
        )
        return client.fetch_tdoc_metadata(tdoc_id)

    else:
        # Use unauthenticated portal method (Method 1) - URL only
        logger.debug(f"Fetching {tdoc_id} via unauthenticated 3GPP portal")
        # Extract URL and create minimal TDocMetadata
        url = extract_tdoc_url_from_portal(tdoc_id, timeout=min(timeout, 15))
        client = create_portal_client(cache_dir=cache_dir, timeout=min(timeout, 15))
        url = client.extract_tdoc_url(tdoc_id)
        return TDocMetadata(
            tdoc_id=tdoc_id,
            url=url,
            title="",
            tdoc_id=tdoc_id.upper(),
            meeting_id=0,
            title="",
            url=url,
            source="",
            contact="",
            tdoc_type="unknown",
            for_purpose="unknown",
            agenda_item_nbr=0,
            date=None,
            revision_of="",
            technical_committee="",
            working_group="",
            type="",
            status="",
            referenced_documents=[],
            filename="",
            size=0,
            agenda_item_text="Unknown",
            validated=False,
            validation_failed=False,
        )
+335 −0

File added.

Preview size limit exceeded, changes collapsed.

Loading