Commit 93d10c5f authored by jr2804's avatar jr2804
Browse files

feat(portal): implement portal authentication and TDoc metadata fetching

* Add PortalSession class for managing authenticated sessions with the 3GPP portal.
* Implement fetch_tdoc_metadata function to retrieve TDoc metadata.
* Introduce error handling for authentication and parsing failures.
* Update existing models and database interactions to support new functionality.
* Add tests for portal authentication and metadata fetching.
parent c5165596
Loading
Loading
Loading
Loading
+182 −23
Original line number Diff line number Diff line
@@ -23,7 +23,7 @@ from dotenv import load_dotenv
from rich.console import Console
from rich.table import Table

from tdoc_crawler.crawlers import MeetingCrawler, TDocCrawler, TDocCrawlResult
from tdoc_crawler.crawlers import MeetingCrawler, TDocCrawler, TDocCrawlResult, fetch_tdoc_metadata
from tdoc_crawler.database import TDocDatabase
from tdoc_crawler.models import (
    CrawlLimits,
@@ -221,28 +221,170 @@ def _launch_file(path: Path) -> None:
        raise typer.Exit(code=1) from exc


def _fetch_missing_tdocs(database: TDocDatabase, cache_dir: Path, missing_ids: list[str]) -> TDocCrawlResult:
    working_groups = _infer_working_groups_from_ids(missing_ids)
    config = TDocCrawlConfig(
        cache_dir=cache_dir,
        working_groups=working_groups,
        incremental=False,
        max_retries=3,
        timeout=30,
        verbose=False,
        limits=_build_limits(None, None, None, None),
        target_ids=missing_ids,
# Module-level logger for the CLI helpers in this file.
_logger = logging.getLogger(__name__)


def _normalize_portal_meeting_name(portal_meeting: str) -> str:
    """Normalize portal meeting name to database format.

    The portal uses format like "SA4#133-e" while the database uses "S4-133-e".
    This function converts portal format to database format.

    Args:
        portal_meeting: Meeting name from portal (e.g., "SA4#133-e")

    Returns:
        Normalized meeting name (e.g., "S4-133-e")
    """
    # Replace "SA4#" with "S4-", "RAN1#" with "R1-", etc.
    normalized = portal_meeting.replace("#", "-")

    # Handle full working group names (SA, RAN, CT)
    for full_name, short_prefix in [("SA", "S"), ("RAN", "R"), ("CT", "C")]:
        # Match patterns like "SA4-" and replace with "S4-"
        if normalized.startswith(f"{full_name}"):
            # Extract the subgroup number if present
            for i, char in enumerate(normalized[len(full_name) :]):
                if not char.isdigit():
                    subgroup_num = normalized[len(full_name) : len(full_name) + i] if i > 0 else ""
                    rest = normalized[len(full_name) + i :]
                    if subgroup_num:
                        normalized = f"{short_prefix}{subgroup_num}{rest}"
                    break
            break

    return normalized


def _resolve_meeting_id(database: TDocDatabase, meeting_name: str) -> int | None:
    """Resolve meeting name to meeting_id from database.

    Tries the name exactly as given first, then its normalized form
    (portal format converted to database format).

    Args:
        database: Database connection
        meeting_name: Meeting identifier (e.g., "SA4#133-e" or "S4-133-e")

    Returns:
        Meeting ID if found, None otherwise
    """
    candidates = [meeting_name]
    normalized = _normalize_portal_meeting_name(meeting_name)
    if normalized != meeting_name:
        candidates.append(normalized)

    # Same case-insensitive lookup for each candidate; first hit wins.
    for candidate in candidates:
        cursor = database.connection.execute(
            "SELECT meeting_id FROM meetings WHERE short_name = ? COLLATE NOCASE",
            (candidate,),
        )
        row = cursor.fetchone()
        if row:
            return row[0]

    return None


def _fetch_missing_tdocs(
    database: TDocDatabase,
    cache_dir: Path,
    missing_ids: list[str],
    credentials: PortalCredentials | None = None,
) -> TDocCrawlResult:
    """Fetch missing TDocs using portal authentication.

    Args:
        database: Database connection
        cache_dir: Cache directory path (kept for interface parity; not
            used by the portal-based fetch path)
        missing_ids: List of TDoc IDs to fetch
        credentials: Portal credentials (optional)

    Returns:
        TDocCrawlResult with inserted/updated counts and errors
    """
    errors: list[str] = []

    if not credentials:
        errors.append("Portal credentials required for targeted fetch. Set EOL_USERNAME and EOL_PASSWORD.")
        return TDocCrawlResult(processed=len(missing_ids), inserted=0, updated=0, errors=errors)

    inserted_count = 0
    updated_count = 0

    # Map the TDoc ID's leading letter to a working group; built once
    # outside the loop instead of per iteration.
    working_group_map = {"R": WorkingGroup.RAN, "S": WorkingGroup.SA, "C": WorkingGroup.CT, "T": WorkingGroup.CT}

    for tdoc_id in missing_ids:
        try:
            # Fetch metadata from portal
            portal_data = fetch_tdoc_metadata(tdoc_id, credentials)

            if not portal_data:
                errors.append(f"Portal returned no data for {tdoc_id}")
                continue

            # Resolve meeting_id from meeting name
            meeting_id = None
            meeting_name = portal_data.get("meeting")
            if meeting_name:
                meeting_id = _resolve_meeting_id(database, meeting_name)
                if not meeting_id:
                    _logger.warning("Could not resolve meeting '%s' to meeting_id for %s", meeting_name, tdoc_id)

            # Infer working group from TDoc ID prefix; default to RAN.
            working_group = working_group_map.get(tdoc_id[0].upper(), WorkingGroup.RAN)

            # Build TDoc URL (placeholder -- the real file location is not
            # known when the record comes from the portal, not the FTP).
            url = f"https://www.3gpp.org/ftp/tsg_{working_group.value.lower()}/.../{tdoc_id}.zip"

            # Create TDocMetadata object (all fields without defaults must be provided)
            metadata = TDocMetadata(
                tdoc_id=tdoc_id.upper(),
                url=url,
                working_group=working_group,
                subgroup=None,
                meeting=meeting_name,
                meeting_id=meeting_id,
                file_size=None,
                title=portal_data.get("title"),
                contact=portal_data.get("contact"),
                tdoc_type=portal_data.get("tdoc_type"),
                for_purpose=portal_data.get("for_purpose"),
                agenda_item=portal_data.get("agenda_item"),
                status=portal_data.get("status"),
                is_revision_of=portal_data.get("is_revision_of"),
                document_type=None,
                checksum=None,
                source_path=None,
                date_created=None,
                validated=True,
                validation_failed=False,
            )

            # Insert/update in database
            inserted, updated = database.upsert_tdoc(metadata)
            if inserted:
                inserted_count += 1
            elif updated:
                updated_count += 1

            _logger.info("Successfully fetched and stored %s", tdoc_id)

        except Exception as exc:
            # Record the failure and continue with the remaining IDs;
            # logger.exception also captures the traceback for debugging.
            error_msg = f"Failed to fetch {tdoc_id}: {exc}"
            _logger.exception(error_msg)
            errors.append(error_msg)

    return TDocCrawlResult(
        processed=len(missing_ids),
        inserted=inserted_count,
        updated=updated_count,
        errors=errors,
    )


def _maybe_fetch_missing_tdocs(
@@ -250,6 +392,7 @@ def _maybe_fetch_missing_tdocs(
    cache_dir: Path,
    config: QueryConfig,
    results: list[TDocMetadata],
    credentials: PortalCredentials | None = None,
) -> list[TDocMetadata]:
    if not config.tdoc_ids:
        return results
@@ -260,7 +403,7 @@ def _maybe_fetch_missing_tdocs(
        return results

    console.print(f"[cyan]Fetching missing TDocs: {', '.join(missing)}[/cyan]")
    fetch_result = _fetch_missing_tdocs(database, cache_dir, missing)
    fetch_result = _fetch_missing_tdocs(database, cache_dir, missing, credentials)
    if fetch_result.errors:
        console.print(f"[yellow]{len(fetch_result.errors)} issues detected during targeted crawl[/yellow]")
        for error in fetch_result.errors[:3]:
@@ -296,12 +439,19 @@ def crawl(
    config = TDocCrawlConfig(
        cache_dir=cache_dir,
        working_groups=working_groups,
        subgroups=None,
        meeting_ids=None,
        start_date=None,
        end_date=None,
        incremental=incremental,
        force_revalidate=False,
        workers=4,
        max_retries=max_retries,
        timeout=timeout,
        verbose=verbose,
        limits=limits,
        target_ids=None,
        credentials=None,
    )

    database_path = _database_path(config.cache_dir)
@@ -393,6 +543,9 @@ def query(
    order: str = typer.Option(SortOrder.DESC.value, "--order", help="Sort order (asc|desc)"),
    start_date: str | None = typer.Option(None, "--start-date", help="Filter from ISO timestamp"),
    end_date: str | None = typer.Option(None, "--end-date", help="Filter until ISO timestamp"),
    no_fetch: bool = typer.Option(False, "--no-fetch", help="Disable automatic fetching of missing TDocs from portal"),
    eol_username: str | None = typer.Option(None, "--eol-username", help="ETSI Online Account username"),
    eol_password: str | None = typer.Option(None, "--eol-password", help="ETSI Online Account password"),
) -> None:
    working_groups = _parse_working_groups(working_group)
    try:
@@ -423,10 +576,16 @@ def query(
        order=sort_order,
    )

    # Resolve credentials (only if --no-fetch is not set)
    credentials = None
    if not no_fetch:
        credentials = _resolve_credentials(eol_username, eol_password, prompt=True)

    database_path = _database_path(config.cache_dir)
    with TDocDatabase(database_path) as database:
        results = database.query_tdocs(config)
        results = _maybe_fetch_missing_tdocs(database, config.cache_dir, config, results)
        if not no_fetch:
            results = _maybe_fetch_missing_tdocs(database, config.cache_dir, config, results, credentials)

    if not results:
        console.print("[yellow]No TDocs found[/yellow]")
+12 −0
Original line number Diff line number Diff line
@@ -10,6 +10,13 @@ from .meetings import (
    normalize_subgroup_alias,
    normalize_working_group_alias,
)
from .portal import (
    PortalAuthenticationError,
    PortalParsingError,
    PortalSession,
    fetch_tdoc_metadata,
    parse_tdoc_portal_page,
)
from .tdocs import EXCLUDED_DIRS, TDOC_PATTERN, TDocCrawler, TDocCrawlResult

__all__ = [
@@ -17,9 +24,14 @@ __all__ = [
    "MEETING_CODE_REGISTRY",
    "MeetingCrawler",
    "MeetingCrawlResult",
    "PortalAuthenticationError",
    "PortalParsingError",
    "PortalSession",
    "TDOC_PATTERN",
    "TDocCrawler",
    "TDocCrawlResult",
    "fetch_tdoc_metadata",
    "normalize_subgroup_alias",
    "normalize_working_group_alias",
    "parse_tdoc_portal_page",
]
+21 −21
Original line number Diff line number Diff line
@@ -23,31 +23,31 @@ DATE_PATTERN = re.compile(r"(\d{4}[\-\u2010-\u2015]\d{2}[\-\u2010-\u2015]\d{2})"

MEETING_CODE_REGISTRY: dict[WorkingGroup, list[tuple[str, str | None]]] = {
    WorkingGroup.RAN: [
        ("RP", "RAN Plenary"),
        ("R1", "RAN1"),
        ("R2", "RAN2"),
        ("R3", "RAN3"),
        ("R4", "RAN4"),
        ("R5", "RAN5"),
        ("R6", "RAN6"),
        ("RP", "RP"),  # RAN Plenary
        ("R1", "R1"),
        ("R2", "R2"),
        ("R3", "R3"),
        ("R4", "R4"),
        ("R5", "R5"),
        ("R6", "R6"),
    ],
    WorkingGroup.SA: [
        ("SP", "SA Plenary"),
        ("S1", "SA1"),
        ("S2", "SA2"),
        ("S3", "SA3"),
        ("S4", "SA4"),
        ("S5", "SA5"),
        ("S6", "SA6"),
        ("SP", "SP"),  # SA Plenary
        ("S1", "S1"),
        ("S2", "S2"),
        ("S3", "S3"),
        ("S4", "S4"),
        ("S5", "S5"),
        ("S6", "S6"),
    ],
    WorkingGroup.CT: [
        ("CP", "CT Plenary"),
        ("C1", "CT1"),
        ("C2", "CT2"),
        ("C3", "CT3"),
        ("C4", "CT4"),
        ("C5", "CT5"),
        ("C6", "CT6"),
        ("CP", "CP"),  # CT Plenary
        ("C1", "C1"),
        ("C2", "C2"),
        ("C3", "C3"),
        ("C4", "C4"),
        ("C5", "C5"),
        ("C6", "C6"),
    ],
}

+363 −0
Original line number Diff line number Diff line
"""3GPP Portal authentication and TDoc metadata parsing."""

from __future__ import annotations

import logging
import re
from typing import TYPE_CHECKING

import requests
from bs4 import BeautifulSoup

if TYPE_CHECKING:
    from tdoc_crawler.models.base import PortalCredentials

# Module-level logger, named after the module per logging convention.
logger = logging.getLogger(__name__)

# Portal endpoints used by PortalSession below.
PORTAL_BASE_URL = "https://portal.3gpp.org"
TDOC_VIEW_URL = f"{PORTAL_BASE_URL}/ngppapp/CreateTdoc.Aspx"  # TDoc detail page
LOGIN_URL = f"{PORTAL_BASE_URL}/login.aspx"  # interactive login page


class PortalAuthenticationError(Exception):
    """Signals a failed login or an expired session with the 3GPP portal."""


class PortalParsingError(Exception):
    """Signals that a portal HTML page could not be parsed for metadata."""


class PortalSession:
    """Manages authenticated session with 3GPP portal."""

    def __init__(self, credentials: PortalCredentials, timeout: int = 30) -> None:
        """Initialize portal session.

        Args:
            credentials: ETSI Online Account credentials
            timeout: Request timeout in seconds
        """
        self.credentials = credentials
        self.timeout = timeout
        self.session = requests.Session()
        self._authenticated = False

        # Set browser-like headers to avoid 403 Forbidden.
        # "br" is deliberately NOT advertised in Accept-Encoding: requests
        # only decodes brotli when the optional brotli/brotlicffi package is
        # installed, so advertising it risks an undecodable response body.
        self.session.headers.update({
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
            "Accept-Language": "en-US,en;q=0.5",
            "Accept-Encoding": "gzip, deflate",
            "DNT": "1",
            "Connection": "keep-alive",
            "Upgrade-Insecure-Requests": "1",
        })

    def __enter__(self) -> PortalSession:
        """Enter context manager."""
        return self

    def __exit__(self, exc_type, exc_val, exc_tb) -> None:
        """Exit context manager and close the underlying HTTP session."""
        self.session.close()

    def authenticate(self) -> None:
        """Authenticate with the 3GPP portal using EOL credentials.

        The portal uses JavaScript-based authentication via AJAX call to the
        LoginEOL.ashx endpoint. We need to first visit the login page to
        establish a session, then call the AJAX endpoint.

        Raises:
            PortalAuthenticationError: If authentication fails
        """
        if self._authenticated:
            return

        logger.info("Authenticating with 3GPP portal...")

        # Step 1: Visit the login page to establish session and get cookies
        logger.debug("Visiting login page to establish session...")
        initial_response = self.session.get(LOGIN_URL, timeout=self.timeout)
        initial_response.raise_for_status()

        # Step 2: Call the AJAX login endpoint. The portal's JavaScript
        # login() function POSTs JSON to /ETSIPages/LoginEOL.ashx.
        login_api_url = f"{PORTAL_BASE_URL}/ETSIPages/LoginEOL.ashx"

        # Build JSON payload matching the JavaScript login() function
        login_payload = {
            "username": self.credentials.username,
            "password": self.credentials.password,
        }

        logger.debug("Calling login API at %s", login_api_url)

        # Submit login via AJAX API endpoint
        login_response = self.session.post(
            login_api_url,
            json=login_payload,
            headers={
                "Content-Type": "application/json; charset=UTF-8",
                "Accept": "application/json, text/javascript, */*; q=0.01",
                "X-Requested-With": "XMLHttpRequest",
                "Referer": LOGIN_URL,
            },
            timeout=self.timeout,
        )
        login_response.raise_for_status()

        # The portal's JavaScript treats a literal "Failed" response body as
        # a login failure; anything else means the session cookies are set.
        response_text = login_response.text.strip()
        logger.debug("Login API response: %s", response_text)

        if response_text.lower() == "failed":
            raise PortalAuthenticationError("Authentication failed - check credentials")

        self._authenticated = True
        logger.info("Successfully authenticated with 3GPP portal")

    def fetch_tdoc_metadata(self, tdoc_id: str) -> dict[str, str | None] | None:
        """Fetch TDoc metadata from portal.

        Args:
            tdoc_id: TDoc identifier (e.g., 'S4-251364')

        Returns:
            Dictionary with parsed metadata fields or None if TDoc not found

        Raises:
            PortalAuthenticationError: If authentication is required but fails
            PortalParsingError: If page parsing fails
        """
        # Ensure authenticated (no-op when the session is already live)
        self.authenticate()

        # Fetch TDoc page
        url = f"{TDOC_VIEW_URL}?mode=view&contributionUid={tdoc_id}"
        logger.debug("Fetching TDoc metadata from %s", url)

        response = self.session.get(url, timeout=self.timeout)
        response.raise_for_status()

        # A redirect back to the login page means the session expired.
        if "login.aspx" in response.url.lower():
            self._authenticated = False
            raise PortalAuthenticationError("Session expired - re-authentication required")

        # Parse the page
        return parse_tdoc_portal_page(response.text, tdoc_id)


def _element_value(element) -> str | None:
    """Extract the user-visible value from an input/select/text element."""
    if element.name == "input":
        raw_value = element.get("value", "")
        return str(raw_value).strip() if raw_value else None
    if element.name == "select":
        selected = element.find("option", {"selected": True})
        return selected.get_text(strip=True) if selected else None
    return element.get_text(strip=True)


def parse_tdoc_portal_page(html: str, tdoc_id: str) -> dict[str, str | None] | None:
    """Parse TDoc metadata from portal HTML page.

    Args:
        html: HTML content of the TDoc portal page
        tdoc_id: TDoc identifier for logging

    Returns:
        Dictionary with parsed metadata fields or None if TDoc not found

    Expected fields:
        - meeting: Meeting identifier (required)
        - title: Document title (required)
        - contact: Contact person/organization (required)
        - tdoc_type: Document type classification (required)
        - for_purpose: Purpose (agreement, discussion, etc.) (required)
        - agenda_item: Associated agenda item (required)
        - status: Document status (required)
        - is_revision_of: Reference to previous TDoc version (optional)
    """
    soup = BeautifulSoup(html, "html.parser")

    # Bail out early when the page reports the document as missing.
    error_indicators = [
        "not found",
        "does not exist",
        "invalid",
        "no document",
    ]
    page_text = soup.get_text().lower()
    if any(indicator in page_text for indicator in error_indicators):
        logger.warning("TDoc %s not found in portal", tdoc_id)
        return None

    metadata: dict[str, str | None] = {
        "meeting": None,
        "title": None,
        "contact": None,
        "tdoc_type": None,
        "for_purpose": None,
        "agenda_item": None,
        "status": None,
        "is_revision_of": None,
    }

    # Pass 1: <label> elements and the form control they describe. The
    # control may be referenced by the label's 'for' attribute, be a
    # following sibling, or sit in the next <td> of the same row.
    for label in soup.find_all("label"):
        label_text = label.get_text(strip=True).lower()

        value_element = None

        # Check 'for' attribute
        label_for = label.get("for")
        if label_for:
            value_element = soup.find(id=label_for)

        # If not found, check next siblings
        if not value_element:
            for sibling in label.find_next_siblings():
                if sibling.name in ("input", "select", "span", "div"):
                    value_element = sibling
                    break

        # If still not found, check parent and next sibling
        if not value_element and label.parent:
            next_td = label.parent.find_next_sibling("td")
            if next_td:
                value_element = next_td.find(["input", "select", "span", "div"])

        if not value_element:
            continue

        value = _element_value(value_element)
        if not value:
            continue

        # Map label text to metadata field
        if "meeting" in label_text:
            metadata["meeting"] = value
        elif "title" in label_text:
            metadata["title"] = value
        elif "contact" in label_text:
            metadata["contact"] = value
        elif "type" in label_text and "tdoc" in label_text:
            metadata["tdoc_type"] = value
        elif label_text.startswith("for") or "purpose" in label_text:
            metadata["for_purpose"] = value
        elif "agenda" in label_text:
            metadata["agenda_item"] = value
        elif "status" in label_text:
            metadata["status"] = value
        elif "revision" in label_text:
            metadata["is_revision_of"] = value

    # Pass 2: plain <td>label</td><td>value</td> table rows (alternative
    # page structure). NOTE(review): this pass can overwrite values found
    # in pass 1 -- confirm that precedence is intended.
    for table in soup.find_all("table"):
        for row in table.find_all("tr"):
            cells = row.find_all(["td", "th"])
            if len(cells) < 2:
                continue

            label_cell = cells[0].get_text(strip=True).lower()
            value_cell = cells[1]

            # Extract value from an embedded control, else from cell text.
            value_input = value_cell.find(["input", "select", "span"])
            value = _element_value(value_input) if value_input else value_cell.get_text(strip=True)

            if not value:
                continue

            # Map to metadata fields
            if "meeting" in label_cell:
                metadata["meeting"] = value
            elif "title" in label_cell:
                metadata["title"] = value
            elif "contact" in label_cell:
                metadata["contact"] = value
            elif "type" in label_cell:
                metadata["tdoc_type"] = value
            elif label_cell.startswith("for"):
                metadata["for_purpose"] = value
            elif "agenda" in label_cell:
                metadata["agenda_item"] = value
            elif "status" in label_cell:
                metadata["status"] = value
            elif "revision" in label_cell:
                metadata["is_revision_of"] = value

    # Keep is_revision_of only when it looks like a TDoc ID (e.g.
    # "S4-251363"), not a portal URL or query-string fragment.
    revision_value = metadata.get("is_revision_of")
    if revision_value:
        if "CreateTDoc.aspx" in revision_value or "contributionId" in revision_value:
            metadata["is_revision_of"] = None
        elif not re.match(r"^[RSTC]\d+-\d+", revision_value, re.IGNORECASE):
            metadata["is_revision_of"] = None

    # All fields except is_revision_of are required for a usable record.
    required_fields = ["meeting", "title", "contact", "tdoc_type", "for_purpose", "agenda_item", "status"]
    missing_fields = [field for field in required_fields if not metadata.get(field)]

    if missing_fields:
        logger.warning("TDoc %s: Missing required fields: %s. Parsed: %s", tdoc_id, ", ".join(missing_fields), metadata)
        return None

    logger.debug("Successfully parsed metadata for TDoc %s", tdoc_id)
    return metadata


def fetch_tdoc_metadata(
    tdoc_id: str,
    credentials: PortalCredentials,
    timeout: int = 30,
) -> dict[str, str | None] | None:
    """Fetch TDoc metadata from 3GPP portal (convenience function).

    Opens a short-lived authenticated portal session, retrieves the metadata
    for a single TDoc, and closes the session again.

    Args:
        tdoc_id: TDoc identifier (e.g., 'S4-251364')
        credentials: ETSI Online Account credentials
        timeout: Request timeout in seconds

    Returns:
        Dictionary with parsed metadata fields or None if TDoc not found

    Raises:
        PortalAuthenticationError: If authentication fails
        PortalParsingError: If page parsing fails
    """
    portal = PortalSession(credentials, timeout)
    with portal:
        return portal.fetch_tdoc_metadata(tdoc_id)
+36 −25

File changed.

Preview size limit exceeded, changes collapsed.

Loading