Commit 93d10c5f authored by jr2804's avatar jr2804
Browse files

feat(portal): implement portal authentication and TDoc metadata fetching

* Add PortalSession class for managing authenticated sessions with the 3GPP portal.
* Implement fetch_tdoc_metadata function to retrieve TDoc metadata.
* Introduce error handling for authentication and parsing failures.
* Update existing models and database interactions to support new functionality.
* Add tests for portal authentication and metadata fetching.
parent c5165596
Loading
Loading
Loading
Loading
+182 −23
Original line number Diff line number Diff line
@@ -23,7 +23,7 @@ from dotenv import load_dotenv
from rich.console import Console
from rich.table import Table

from tdoc_crawler.crawlers import MeetingCrawler, TDocCrawler, TDocCrawlResult
from tdoc_crawler.crawlers import MeetingCrawler, TDocCrawler, TDocCrawlResult, fetch_tdoc_metadata
from tdoc_crawler.database import TDocDatabase
from tdoc_crawler.models import (
    CrawlLimits,
@@ -221,28 +221,170 @@ def _launch_file(path: Path) -> None:
        raise typer.Exit(code=1) from exc


def _fetch_missing_tdocs(database: TDocDatabase, cache_dir: Path, missing_ids: list[str]) -> TDocCrawlResult:
    working_groups = _infer_working_groups_from_ids(missing_ids)
    config = TDocCrawlConfig(
        cache_dir=cache_dir,
        working_groups=working_groups,
        incremental=False,
        max_retries=3,
        timeout=30,
        verbose=False,
        limits=_build_limits(None, None, None, None),
        target_ids=missing_ids,
# Module-level logger for the CLI helpers in this file.
_logger = logging.getLogger(__name__)


def _normalize_portal_meeting_name(portal_meeting: str) -> str:
    """Normalize portal meeting name to database format.

    The portal uses format like "SA4#133-e" while the database uses "S4-133-e".
    This function converts portal format to database format.

    Args:
        portal_meeting: Meeting name from portal (e.g., "SA4#133-e")

    Returns:
        Normalized meeting name (e.g., "S4-133-e")
    """
    # Replace "SA4#" with "S4-", "RAN1#" with "R1-", etc.
    normalized = portal_meeting.replace("#", "-")

    # Handle full working group names (SA, RAN, CT)
    for full_name, short_prefix in [("SA", "S"), ("RAN", "R"), ("CT", "C")]:
        # Match patterns like "SA4-" and replace with "S4-"
        if normalized.startswith(f"{full_name}"):
            # Extract the subgroup number if present
            for i, char in enumerate(normalized[len(full_name) :]):
                if not char.isdigit():
                    subgroup_num = normalized[len(full_name) : len(full_name) + i] if i > 0 else ""
                    rest = normalized[len(full_name) + i :]
                    if subgroup_num:
                        normalized = f"{short_prefix}{subgroup_num}{rest}"
                    break
            break

    return normalized


def _resolve_meeting_id(database: TDocDatabase, meeting_name: str) -> int | None:
    """Resolve meeting name to meeting_id from database.

    Tries the name exactly as given first, then its normalized form
    (portal format converted to database format).

    Args:
        database: Database connection
        meeting_name: Meeting identifier (e.g., "SA4#133-e" or "S4-133-e")

    Returns:
        Meeting ID if found, None otherwise
    """
    candidates = [meeting_name]
    normalized = _normalize_portal_meeting_name(meeting_name)
    if normalized != meeting_name:
        candidates.append(normalized)

    # Same case-insensitive lookup for each candidate; first hit wins.
    for candidate in candidates:
        cursor = database.connection.execute(
            "SELECT meeting_id FROM meetings WHERE short_name = ? COLLATE NOCASE",
            (candidate,),
        )
        row = cursor.fetchone()
        if row:
            return row[0]

    return None


def _fetch_missing_tdocs(
    database: TDocDatabase,
    cache_dir: Path,
    missing_ids: list[str],
    credentials: PortalCredentials | None = None,
) -> TDocCrawlResult:
    """Fetch missing TDocs using portal authentication.

    Args:
        database: Database connection
        cache_dir: Cache directory path (kept for interface parity; not
            used by the portal-based fetch path)
        missing_ids: List of TDoc IDs to fetch
        credentials: Portal credentials (optional)

    Returns:
        TDocCrawlResult with inserted/updated counts and errors
    """
    errors: list[str] = []

    if not credentials:
        errors.append("Portal credentials required for targeted fetch. Set EOL_USERNAME and EOL_PASSWORD.")
        return TDocCrawlResult(processed=len(missing_ids), inserted=0, updated=0, errors=errors)

    inserted_count = 0
    updated_count = 0

    # Map the TDoc ID's leading letter to a working group; built once
    # outside the loop instead of per iteration.
    working_group_map = {"R": WorkingGroup.RAN, "S": WorkingGroup.SA, "C": WorkingGroup.CT, "T": WorkingGroup.CT}

    for tdoc_id in missing_ids:
        try:
            # Fetch metadata from portal
            portal_data = fetch_tdoc_metadata(tdoc_id, credentials)

            if not portal_data:
                errors.append(f"Portal returned no data for {tdoc_id}")
                continue

            # Resolve meeting_id from meeting name
            meeting_id = None
            meeting_name = portal_data.get("meeting")
            if meeting_name:
                meeting_id = _resolve_meeting_id(database, meeting_name)
                if not meeting_id:
                    _logger.warning("Could not resolve meeting '%s' to meeting_id for %s", meeting_name, tdoc_id)

            # Infer working group from TDoc ID prefix; default to RAN.
            working_group = working_group_map.get(tdoc_id[0].upper(), WorkingGroup.RAN)

            # Build TDoc URL (placeholder -- the real file location is not
            # known when the record comes from the portal, not the FTP).
            url = f"https://www.3gpp.org/ftp/tsg_{working_group.value.lower()}/.../{tdoc_id}.zip"

            # Create TDocMetadata object (all fields without defaults must be provided)
            metadata = TDocMetadata(
                tdoc_id=tdoc_id.upper(),
                url=url,
                working_group=working_group,
                subgroup=None,
                meeting=meeting_name,
                meeting_id=meeting_id,
                file_size=None,
                title=portal_data.get("title"),
                contact=portal_data.get("contact"),
                tdoc_type=portal_data.get("tdoc_type"),
                for_purpose=portal_data.get("for_purpose"),
                agenda_item=portal_data.get("agenda_item"),
                status=portal_data.get("status"),
                is_revision_of=portal_data.get("is_revision_of"),
                document_type=None,
                checksum=None,
                source_path=None,
                date_created=None,
                validated=True,
                validation_failed=False,
            )

            # Insert/update in database
            inserted, updated = database.upsert_tdoc(metadata)
            if inserted:
                inserted_count += 1
            elif updated:
                updated_count += 1

            _logger.info("Successfully fetched and stored %s", tdoc_id)

        except Exception as exc:
            # Record the failure and continue with the remaining IDs;
            # logger.exception also captures the traceback for debugging.
            error_msg = f"Failed to fetch {tdoc_id}: {exc}"
            _logger.exception(error_msg)
            errors.append(error_msg)

    return TDocCrawlResult(
        processed=len(missing_ids),
        inserted=inserted_count,
        updated=updated_count,
        errors=errors,
    )


def _maybe_fetch_missing_tdocs(
@@ -250,6 +392,7 @@ def _maybe_fetch_missing_tdocs(
    cache_dir: Path,
    config: QueryConfig,
    results: list[TDocMetadata],
    credentials: PortalCredentials | None = None,
) -> list[TDocMetadata]:
    if not config.tdoc_ids:
        return results
@@ -260,7 +403,7 @@ def _maybe_fetch_missing_tdocs(
        return results

    console.print(f"[cyan]Fetching missing TDocs: {', '.join(missing)}[/cyan]")
    fetch_result = _fetch_missing_tdocs(database, cache_dir, missing)
    fetch_result = _fetch_missing_tdocs(database, cache_dir, missing, credentials)
    if fetch_result.errors:
        console.print(f"[yellow]{len(fetch_result.errors)} issues detected during targeted crawl[/yellow]")
        for error in fetch_result.errors[:3]:
@@ -296,12 +439,19 @@ def crawl(
    config = TDocCrawlConfig(
        cache_dir=cache_dir,
        working_groups=working_groups,
        subgroups=None,
        meeting_ids=None,
        start_date=None,
        end_date=None,
        incremental=incremental,
        force_revalidate=False,
        workers=4,
        max_retries=max_retries,
        timeout=timeout,
        verbose=verbose,
        limits=limits,
        target_ids=None,
        credentials=None,
    )

    database_path = _database_path(config.cache_dir)
@@ -393,6 +543,9 @@ def query(
    order: str = typer.Option(SortOrder.DESC.value, "--order", help="Sort order (asc|desc)"),
    start_date: str | None = typer.Option(None, "--start-date", help="Filter from ISO timestamp"),
    end_date: str | None = typer.Option(None, "--end-date", help="Filter until ISO timestamp"),
    no_fetch: bool = typer.Option(False, "--no-fetch", help="Disable automatic fetching of missing TDocs from portal"),
    eol_username: str | None = typer.Option(None, "--eol-username", help="ETSI Online Account username"),
    eol_password: str | None = typer.Option(None, "--eol-password", help="ETSI Online Account password"),
) -> None:
    working_groups = _parse_working_groups(working_group)
    try:
@@ -423,10 +576,16 @@ def query(
        order=sort_order,
    )

    # Resolve credentials (only if --no-fetch is not set)
    credentials = None
    if not no_fetch:
        credentials = _resolve_credentials(eol_username, eol_password, prompt=True)

    database_path = _database_path(config.cache_dir)
    with TDocDatabase(database_path) as database:
        results = database.query_tdocs(config)
        results = _maybe_fetch_missing_tdocs(database, config.cache_dir, config, results)
        if not no_fetch:
            results = _maybe_fetch_missing_tdocs(database, config.cache_dir, config, results, credentials)

    if not results:
        console.print("[yellow]No TDocs found[/yellow]")
+12 −0
Original line number Diff line number Diff line
@@ -10,6 +10,13 @@ from .meetings import (
    normalize_subgroup_alias,
    normalize_working_group_alias,
)
from .portal import (
    PortalAuthenticationError,
    PortalParsingError,
    PortalSession,
    fetch_tdoc_metadata,
    parse_tdoc_portal_page,
)
from .tdocs import EXCLUDED_DIRS, TDOC_PATTERN, TDocCrawler, TDocCrawlResult

__all__ = [
@@ -17,9 +24,14 @@ __all__ = [
    "MEETING_CODE_REGISTRY",
    "MeetingCrawler",
    "MeetingCrawlResult",
    "PortalAuthenticationError",
    "PortalParsingError",
    "PortalSession",
    "TDOC_PATTERN",
    "TDocCrawler",
    "TDocCrawlResult",
    "fetch_tdoc_metadata",
    "normalize_subgroup_alias",
    "normalize_working_group_alias",
    "parse_tdoc_portal_page",
]
+21 −21
Original line number Diff line number Diff line
@@ -23,31 +23,31 @@ DATE_PATTERN = re.compile(r"(\d{4}[\-\u2010-\u2015]\d{2}[\-\u2010-\u2015]\d{2})"

MEETING_CODE_REGISTRY: dict[WorkingGroup, list[tuple[str, str | None]]] = {
    WorkingGroup.RAN: [
        ("RP", "RAN Plenary"),
        ("R1", "RAN1"),
        ("R2", "RAN2"),
        ("R3", "RAN3"),
        ("R4", "RAN4"),
        ("R5", "RAN5"),
        ("R6", "RAN6"),
        ("RP", "RP"),  # RAN Plenary
        ("R1", "R1"),
        ("R2", "R2"),
        ("R3", "R3"),
        ("R4", "R4"),
        ("R5", "R5"),
        ("R6", "R6"),
    ],
    WorkingGroup.SA: [
        ("SP", "SA Plenary"),
        ("S1", "SA1"),
        ("S2", "SA2"),
        ("S3", "SA3"),
        ("S4", "SA4"),
        ("S5", "SA5"),
        ("S6", "SA6"),
        ("SP", "SP"),  # SA Plenary
        ("S1", "S1"),
        ("S2", "S2"),
        ("S3", "S3"),
        ("S4", "S4"),
        ("S5", "S5"),
        ("S6", "S6"),
    ],
    WorkingGroup.CT: [
        ("CP", "CT Plenary"),
        ("C1", "CT1"),
        ("C2", "CT2"),
        ("C3", "CT3"),
        ("C4", "CT4"),
        ("C5", "CT5"),
        ("C6", "CT6"),
        ("CP", "CP"),  # CT Plenary
        ("C1", "C1"),
        ("C2", "C2"),
        ("C3", "C3"),
        ("C4", "C4"),
        ("C5", "C5"),
        ("C6", "C6"),
    ],
}

+363 −0
Original line number Diff line number Diff line
"""3GPP Portal authentication and TDoc metadata parsing."""

from __future__ import annotations

import logging
import re
from typing import TYPE_CHECKING

import requests
from bs4 import BeautifulSoup

if TYPE_CHECKING:
    from tdoc_crawler.models.base import PortalCredentials

# Module-level logger, named after the module per logging convention.
logger = logging.getLogger(__name__)

# Portal endpoints used by PortalSession below.
PORTAL_BASE_URL = "https://portal.3gpp.org"
TDOC_VIEW_URL = f"{PORTAL_BASE_URL}/ngppapp/CreateTdoc.Aspx"  # TDoc detail page
LOGIN_URL = f"{PORTAL_BASE_URL}/login.aspx"  # interactive login page


class PortalAuthenticationError(Exception):
    """Signals a failed login or an expired session with the 3GPP portal."""


class PortalParsingError(Exception):
    """Signals that a portal HTML page could not be parsed for metadata."""


class PortalSession:
    """Manages authenticated session with 3GPP portal."""

    def __init__(self, credentials: PortalCredentials, timeout: int = 30) -> None:
        """Initialize portal session.

        Args:
            credentials: ETSI Online Account credentials
            timeout: Request timeout in seconds
        """
        self.credentials = credentials
        self.timeout = timeout
        self.session = requests.Session()
        self._authenticated = False

        # Set browser-like headers to avoid 403 Forbidden.
        # "br" is deliberately NOT advertised in Accept-Encoding: requests
        # only decodes brotli when the optional brotli/brotlicffi package is
        # installed, so advertising it risks an undecodable response body.
        self.session.headers.update({
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
            "Accept-Language": "en-US,en;q=0.5",
            "Accept-Encoding": "gzip, deflate",
            "DNT": "1",
            "Connection": "keep-alive",
            "Upgrade-Insecure-Requests": "1",
        })

    def __enter__(self) -> PortalSession:
        """Enter context manager."""
        return self

    def __exit__(self, exc_type, exc_val, exc_tb) -> None:
        """Exit context manager and close the underlying HTTP session."""
        self.session.close()

    def authenticate(self) -> None:
        """Authenticate with the 3GPP portal using EOL credentials.

        The portal uses JavaScript-based authentication via AJAX call to the
        LoginEOL.ashx endpoint. We need to first visit the login page to
        establish a session, then call the AJAX endpoint.

        Raises:
            PortalAuthenticationError: If authentication fails
        """
        if self._authenticated:
            return

        logger.info("Authenticating with 3GPP portal...")

        # Step 1: Visit the login page to establish session and get cookies
        logger.debug("Visiting login page to establish session...")
        initial_response = self.session.get(LOGIN_URL, timeout=self.timeout)
        initial_response.raise_for_status()

        # Step 2: Call the AJAX login endpoint. The portal's JavaScript
        # login() function POSTs JSON to /ETSIPages/LoginEOL.ashx.
        login_api_url = f"{PORTAL_BASE_URL}/ETSIPages/LoginEOL.ashx"

        # Build JSON payload matching the JavaScript login() function
        login_payload = {
            "username": self.credentials.username,
            "password": self.credentials.password,
        }

        logger.debug("Calling login API at %s", login_api_url)

        # Submit login via AJAX API endpoint
        login_response = self.session.post(
            login_api_url,
            json=login_payload,
            headers={
                "Content-Type": "application/json; charset=UTF-8",
                "Accept": "application/json, text/javascript, */*; q=0.01",
                "X-Requested-With": "XMLHttpRequest",
                "Referer": LOGIN_URL,
            },
            timeout=self.timeout,
        )
        login_response.raise_for_status()

        # The portal's JavaScript treats a literal "Failed" response body as
        # a login failure; anything else means the session cookies are set.
        response_text = login_response.text.strip()
        logger.debug("Login API response: %s", response_text)

        if response_text.lower() == "failed":
            raise PortalAuthenticationError("Authentication failed - check credentials")

        self._authenticated = True
        logger.info("Successfully authenticated with 3GPP portal")

    def fetch_tdoc_metadata(self, tdoc_id: str) -> dict[str, str | None] | None:
        """Fetch TDoc metadata from portal.

        Args:
            tdoc_id: TDoc identifier (e.g., 'S4-251364')

        Returns:
            Dictionary with parsed metadata fields or None if TDoc not found

        Raises:
            PortalAuthenticationError: If authentication is required but fails
            PortalParsingError: If page parsing fails
        """
        # Ensure authenticated (no-op when the session is already live)
        self.authenticate()

        # Fetch TDoc page
        url = f"{TDOC_VIEW_URL}?mode=view&contributionUid={tdoc_id}"
        logger.debug("Fetching TDoc metadata from %s", url)

        response = self.session.get(url, timeout=self.timeout)
        response.raise_for_status()

        # A redirect back to the login page means the session expired.
        if "login.aspx" in response.url.lower():
            self._authenticated = False
            raise PortalAuthenticationError("Session expired - re-authentication required")

        # Parse the page
        return parse_tdoc_portal_page(response.text, tdoc_id)


def _element_value(element) -> str | None:
    """Extract the user-visible value from an input/select/text element."""
    if element.name == "input":
        raw_value = element.get("value", "")
        return str(raw_value).strip() if raw_value else None
    if element.name == "select":
        selected = element.find("option", {"selected": True})
        return selected.get_text(strip=True) if selected else None
    return element.get_text(strip=True)


def parse_tdoc_portal_page(html: str, tdoc_id: str) -> dict[str, str | None] | None:
    """Parse TDoc metadata from portal HTML page.

    Args:
        html: HTML content of the TDoc portal page
        tdoc_id: TDoc identifier for logging

    Returns:
        Dictionary with parsed metadata fields or None if TDoc not found

    Expected fields:
        - meeting: Meeting identifier (required)
        - title: Document title (required)
        - contact: Contact person/organization (required)
        - tdoc_type: Document type classification (required)
        - for_purpose: Purpose (agreement, discussion, etc.) (required)
        - agenda_item: Associated agenda item (required)
        - status: Document status (required)
        - is_revision_of: Reference to previous TDoc version (optional)
    """
    soup = BeautifulSoup(html, "html.parser")

    # Bail out early when the page reports the document as missing.
    error_indicators = [
        "not found",
        "does not exist",
        "invalid",
        "no document",
    ]
    page_text = soup.get_text().lower()
    if any(indicator in page_text for indicator in error_indicators):
        logger.warning("TDoc %s not found in portal", tdoc_id)
        return None

    metadata: dict[str, str | None] = {
        "meeting": None,
        "title": None,
        "contact": None,
        "tdoc_type": None,
        "for_purpose": None,
        "agenda_item": None,
        "status": None,
        "is_revision_of": None,
    }

    # Pass 1: <label> elements and the form control they describe. The
    # control may be referenced by the label's 'for' attribute, be a
    # following sibling, or sit in the next <td> of the same row.
    for label in soup.find_all("label"):
        label_text = label.get_text(strip=True).lower()

        value_element = None

        # Check 'for' attribute
        label_for = label.get("for")
        if label_for:
            value_element = soup.find(id=label_for)

        # If not found, check next siblings
        if not value_element:
            for sibling in label.find_next_siblings():
                if sibling.name in ("input", "select", "span", "div"):
                    value_element = sibling
                    break

        # If still not found, check parent and next sibling
        if not value_element and label.parent:
            next_td = label.parent.find_next_sibling("td")
            if next_td:
                value_element = next_td.find(["input", "select", "span", "div"])

        if not value_element:
            continue

        value = _element_value(value_element)
        if not value:
            continue

        # Map label text to metadata field
        if "meeting" in label_text:
            metadata["meeting"] = value
        elif "title" in label_text:
            metadata["title"] = value
        elif "contact" in label_text:
            metadata["contact"] = value
        elif "type" in label_text and "tdoc" in label_text:
            metadata["tdoc_type"] = value
        elif label_text.startswith("for") or "purpose" in label_text:
            metadata["for_purpose"] = value
        elif "agenda" in label_text:
            metadata["agenda_item"] = value
        elif "status" in label_text:
            metadata["status"] = value
        elif "revision" in label_text:
            metadata["is_revision_of"] = value

    # Pass 2: plain <td>label</td><td>value</td> table rows (alternative
    # page structure). NOTE(review): this pass can overwrite values found
    # in pass 1 -- confirm that precedence is intended.
    for table in soup.find_all("table"):
        for row in table.find_all("tr"):
            cells = row.find_all(["td", "th"])
            if len(cells) < 2:
                continue

            label_cell = cells[0].get_text(strip=True).lower()
            value_cell = cells[1]

            # Extract value from an embedded control, else from cell text.
            value_input = value_cell.find(["input", "select", "span"])
            value = _element_value(value_input) if value_input else value_cell.get_text(strip=True)

            if not value:
                continue

            # Map to metadata fields
            if "meeting" in label_cell:
                metadata["meeting"] = value
            elif "title" in label_cell:
                metadata["title"] = value
            elif "contact" in label_cell:
                metadata["contact"] = value
            elif "type" in label_cell:
                metadata["tdoc_type"] = value
            elif label_cell.startswith("for"):
                metadata["for_purpose"] = value
            elif "agenda" in label_cell:
                metadata["agenda_item"] = value
            elif "status" in label_cell:
                metadata["status"] = value
            elif "revision" in label_cell:
                metadata["is_revision_of"] = value

    # Keep is_revision_of only when it looks like a TDoc ID (e.g.
    # "S4-251363"), not a portal URL or query-string fragment.
    revision_value = metadata.get("is_revision_of")
    if revision_value:
        if "CreateTDoc.aspx" in revision_value or "contributionId" in revision_value:
            metadata["is_revision_of"] = None
        elif not re.match(r"^[RSTC]\d+-\d+", revision_value, re.IGNORECASE):
            metadata["is_revision_of"] = None

    # All fields except is_revision_of are required for a usable record.
    required_fields = ["meeting", "title", "contact", "tdoc_type", "for_purpose", "agenda_item", "status"]
    missing_fields = [field for field in required_fields if not metadata.get(field)]

    if missing_fields:
        logger.warning("TDoc %s: Missing required fields: %s. Parsed: %s", tdoc_id, ", ".join(missing_fields), metadata)
        return None

    logger.debug("Successfully parsed metadata for TDoc %s", tdoc_id)
    return metadata


def fetch_tdoc_metadata(
    tdoc_id: str,
    credentials: PortalCredentials,
    timeout: int = 30,
) -> dict[str, str | None] | None:
    """Fetch TDoc metadata from 3GPP portal (convenience function).

    Opens a short-lived authenticated portal session, retrieves the metadata
    for a single TDoc, and closes the session again.

    Args:
        tdoc_id: TDoc identifier (e.g., 'S4-251364')
        credentials: ETSI Online Account credentials
        timeout: Request timeout in seconds

    Returns:
        Dictionary with parsed metadata fields or None if TDoc not found

    Raises:
        PortalAuthenticationError: If authentication fails
        PortalParsingError: If page parsing fails
    """
    portal = PortalSession(credentials, timeout)
    with portal:
        return portal.fetch_tdoc_metadata(tdoc_id)
+36 −25

File changed.

Preview size limit exceeded, changes collapsed.

Loading