Commit eaa07db9 authored by Jan Reimes's avatar Jan Reimes
Browse files

🔥 chore(portal): remove deprecated PortalSession class, cleaned imports

- Removed the PortalSession class for better clarity and to encourage the use of PortalClient.
- Updated tests to utilize PortalClient instead of PortalSession.
- Cleaned up unused imports and commented-out code in the portal module.
parent fd78d859
Loading
Loading
Loading
Loading
+1 −10
Original line number Diff line number Diff line
@@ -2,13 +2,7 @@

from __future__ import annotations

from tdoc_crawler.clients.portal import (
    PortalAuthenticationError,
    PortalClient,
    # PortalSession,
    # create_portal_client,
    # extract_tdoc_url_from_portal,
)
from tdoc_crawler.clients.portal import PortalAuthenticationError, PortalClient

# Re-export PortalParsingError from parsers for backward compatibility
from tdoc_crawler.parsers.portal import PortalParsingError
@@ -17,7 +11,4 @@ __all__ = [
    "PortalAuthenticationError",
    "PortalClient",
    "PortalParsingError",
    # "PortalSession",
    # "create_portal_client",
    # "extract_tdoc_url_from_portal",
]
+4 −218
Original line number Diff line number Diff line
@@ -44,220 +44,6 @@ class PortalAuthenticationError(Exception):
    """Raised when portal authentication fails."""


'''
class PortalSession:
    """Manages authenticated session with 3GPP portal.

    Note: This class is kept for backward compatibility. New code should use
    PortalClient which provides a unified interface.
    """

    def __init__(
        self,
        credentials: PortalCredentials,
        cache_dir: Path,
        cache_ttl: int = 7200,
        cache_refresh_on_access: bool = True,
        timeout: int = 30,
    ) -> None:
        """Initialize portal session.

        Args:
            credentials: ETSI Online Account credentials
            cache_dir: Directory for HTTP cache storage
            cache_ttl: HTTP cache TTL in seconds
            cache_refresh_on_access: Whether to refresh cache TTL on access
            timeout: Request timeout in seconds
        """
        self.credentials = credentials
        self.cache_dir = cache_dir
        self.timeout = timeout
        self.session = create_cached_session(
            cache_dir=cache_dir,
            ttl=cache_ttl,
            refresh_ttl_on_access=cache_refresh_on_access,
            max_retries=3,
        )
        self._authenticated = False

        # Set browser-like headers to avoid 403 Forbidden
        self.session.headers.update(_BROWSER_HEADERS)

    def __enter__(self) -> PortalSession:
        """Enter context manager."""
        return self

    def __exit__(self, *args) -> None:
        """Exit context manager and close session."""
        self.session.close()

    def authenticate(self) -> None:
        """Authenticate with the 3GPP portal using EOL credentials.

        The portal uses JavaScript-based authentication via AJAX call to LoginEOL.ashx endpoint.
        We need to first visit the login page to establish a session, then call the AJAX endpoint.

        Raises:
            PortalAuthenticationError: If authentication fails
        """
        if self._authenticated:
            return

        logger.info("Authenticating with 3GPP portal...")

        # Step 1: Visit the login page to establish session and get cookies
        logger.debug("Visiting login page to establish session...")
        initial_response = self.session.get(LOGIN_URL, timeout=self.timeout)
        initial_response.raise_for_status()

        # Step 2: Call the AJAX login endpoint
        login_api_url = f"{PORTAL_BASE_URL}/ETSIPages/LoginEOL.ashx"

        # Build JSON payload matching the JavaScript login() function
        login_payload = {
            "username": self.credentials.username,
            "password": self.credentials.password,
        }

        logger.debug(f"Calling login API at {login_api_url}")

        # Submit login via AJAX API endpoint
        login_response = self.session.post(
            login_api_url,
            json=login_payload,
            headers={
                "Content-Type": "application/json; charset=UTF-8",
                "Accept": "application/json, text/javascript, */*; q=0.01",
                "X-Requested-With": "XMLHttpRequest",
                "Referer": LOGIN_URL,
            },
            timeout=self.timeout,
        )
        login_response.raise_for_status()

        # Check response - the JavaScript checks if responseText == "Failed"
        response_text = login_response.text.strip()
        logger.debug(f"Login API response: {response_text}")

        if response_text.lower() == "failed":
            raise PortalAuthenticationError("Authentication failed - check credentials")

        # If response is not "Failed", authentication succeeded
        self._authenticated = True
        logger.info("Successfully authenticated with 3GPP portal")

    def fetch_tdoc_metadata(self, tdoc_id: str, url: str | None = None) -> TDocMetadata:
        """Fetch TDoc metadata from portal.

        Args:
            tdoc_id: TDoc identifier (e.g., 'S4-251364')
            url: Optional TDoc URL (if known)

        Returns:
            TDocMetadata instance with portal metadata

        Raises:
            PortalAuthenticationError: If authentication is required but fails
            PortalParsingError: If page parsing fails or TDoc not found
        """
        # If no URL provided, try to extract it from unauthenticated endpoint first
        if url is None:
            try:
                url = extract_tdoc_url_from_portal(tdoc_id, cache_dir=self.cache_dir, timeout=min(self.timeout, 15))
                logger.debug(f"Using URL extracted from unauthenticated endpoint for {tdoc_id}")
            except Exception as e:
                logger.debug(f"URL extraction failed for {tdoc_id}, using authenticated method: {e}")

        # Ensure authenticated
        self.authenticate()

        # Fetch TDoc page
        view_url = f"{TDOC_VIEW_URL}?mode=view&contributionUid={tdoc_id}"
        logger.debug(f"Fetching TDoc metadata from {view_url}")

        response = self.session.get(view_url, timeout=self.timeout)
        response.raise_for_status()

        # Check if redirected to login (session expired)
        if "login.aspx" in response.url.lower():
            self._authenticated = False
            raise PortalAuthenticationError("Session expired - re-authentication required")

        # Parse the page using the parser module
        return parse_tdoc_portal_page(response.text, tdoc_id, url)


def extract_tdoc_url_from_portal(tdoc_id: str, cache_dir: Path | None = None, timeout: int = 15) -> str:
    """Extract direct FTP download URL for a TDoc using unauthenticated DownloadTDoc.aspx endpoint.

    Args:
        tdoc_id: TDoc identifier (e.g., 'S4-251364')
        cache_dir: Directory for HTTP cache storage (optional)
        timeout: Request timeout in seconds (default 15 seconds)

    Returns:
        Direct FTP URL (e.g., 'https://www.3gpp.org/ftp/tsg_sa/WG4_CODEC/TSGS4_133-e/Docs/S4-251364.zip')

    Raises:
        PortalParsingError: If TDoc ID is invalid or URL extraction fails
        requests.RequestException: For network errors
    """
    logger.debug(f"Attempting to extract TDoc URL from DownloadTDoc endpoint for {tdoc_id}")

    # Build the URL for the DownloadTDoc endpoint
    download_url = f"{TDOC_DOWNLOAD_URL}?contributionUid={tdoc_id}"

    # Create a session with browser-like headers to avoid 403 Forbidden
    session = create_cached_session(cache_dir) if cache_dir is not None else requests.Session()
    session.headers.update(_BROWSER_HEADERS)

    try:
        # Fetch the DownloadTDoc page
        response = session.get(download_url, timeout=timeout)
        response.raise_for_status()

        # Check if the response contains an error message
        if "cannot be found" in response.text.lower() or "not found" in response.text.lower():
            raise PortalParsingError(f"TDoc {tdoc_id} not found on portal")

        # Extract URL from JavaScript redirect pattern
        pattern = r"window\\.location\\.href\\s*=\\s*['\"]([^'\"]+)['\"]"
        match = re.search(pattern, response.text)

        if not match:
            # Also try for the pattern in the CDATA section
            cdata_pattern = r"<!\\[CDATA\\[(.*?)\\]\\]>"
            cdata_matches = re.findall(cdata_pattern, response.text)
            for cdata_match in cdata_matches:
                inner_match = re.search(r"window\\.location\\.href\\s*=\\s*['\"]([^'\"]+)['\"]", cdata_match)
                if inner_match:
                    match = inner_match
                    break

        if not match:
            raise PortalParsingError(f"Failed to extract URL for TDoc {tdoc_id}: JavaScript redirect not found")

        extracted_url = match.group(1).strip()

        # Validate that the extracted URL is a proper URL format
        if not extracted_url.startswith(("http://", "https://", "ftp://")):
            raise PortalParsingError(f"Invalid URL format for TDoc {tdoc_id}: {extracted_url}")

        logger.debug(f"Successfully extracted TDoc URL for {tdoc_id}: {extracted_url}")
        return extracted_url

    except requests.RequestException as e:
        logger.error(f"Network error extracting URL for TDoc {tdoc_id}: {e}")
        raise
    except Exception as e:
        error_msg = f"Failed to extract URL for TDoc {tdoc_id}: {e}"
        logger.error(error_msg)
        raise PortalParsingError(error_msg) from e
    finally:
        session.close()
'''


class PortalClient:
    """Unified 3GPP portal client with authentication and TDoc fetching.

@@ -370,7 +156,10 @@ class PortalClient:
        login_response.raise_for_status()

        response_text = login_response.text.strip()
        if response_text.lower() == "failed":
            # response is too comprehensive, even for DEBUG level - log it only when failing
            logger.debug(f"Login API response: {response_text}")
            raise PortalAuthenticationError("Authentication failed - check credentials")

        if response_text.lower() == "failed":
            raise PortalAuthenticationError("Authentication failed - check credentials")
@@ -528,7 +317,4 @@ class PortalClient:
__all__ = [
    "PortalAuthenticationError",
    "PortalClient",
    # "PortalSession",
    # "create_portal_client",
    # "extract_tdoc_url_from_portal",
]
+2 −0
Original line number Diff line number Diff line
@@ -94,6 +94,7 @@ class SubWorkingGroupRecord(BaseModel):
                return working_group
        raise ValueError


@cache
def _generate_swb_group_records() -> tuple[SubWorkingGroupRecord, ...]:
    """Generate SubWorkingGroupRecord entries for all defined SubWorkingGroup enums."""
@@ -106,6 +107,7 @@ def _generate_swb_group_records() -> tuple[SubWorkingGroupRecord, ...]:
        records.append(record)
    return tuple(records)


SUBWORKING_GROUP_RECORDS: tuple[SubWorkingGroupRecord, ...] = _generate_swb_group_records()

SUBTB_INDEX: dict[int, SubWorkingGroupRecord] = {record.subtb: record for record in SUBWORKING_GROUP_RECORDS}
+7 −36
Original line number Diff line number Diff line
@@ -2,15 +2,13 @@

from __future__ import annotations

import contextlib
import os
import tempfile
from pathlib import Path

import pytest
import requests

from tdoc_crawler.clients.portal import PortalAuthenticationError, PortalSession
from tdoc_crawler.clients.portal import PortalAuthenticationError, PortalClient
from tdoc_crawler.models.base import PortalCredentials
from tdoc_crawler.parsers.portal import PortalParsingError

@@ -80,13 +78,9 @@ def test_fetch_tdoc_metadata_success(credentials: PortalCredentials) -> None:
    # Using a recent TDoc - if this fails, the portal structure may have changed
    tdoc_id = "S4-260001"

    # Create a temporary cache file (not a directory)
    with tempfile.NamedTemporaryFile(suffix=".sqlite3", delete=False) as f:
        cache_file = Path(f.name)

    try:
        with PortalSession(credentials=credentials, cache_dir=cache_file, timeout=30) as session:
            metadata = session.fetch_tdoc_metadata(tdoc_id)
        with PortalClient(credentials=credentials, timeout=30) as client:
            metadata = client.fetch_tdoc_metadata(tdoc_id)

        # If we get here, parsing succeeded - verify basic fields
        assert metadata is not None, f"Expected metadata for {tdoc_id}"
@@ -95,28 +89,14 @@ def test_fetch_tdoc_metadata_success(credentials: PortalCredentials) -> None:
    except PortalParsingError as exc:
        # The portal structure may have changed - this is not a refactoring issue
        pytest.skip(f"Portal structure may have changed: {exc}")
    finally:
        # Clean up temporary cache file
        with contextlib.suppress(OSError):
            cache_file.unlink()


def test_fetch_tdoc_metadata_invalid_tdoc(credentials: PortalCredentials) -> None:
    """Test fetching metadata for non-existent TDoc."""
    tdoc_id = "S4-999999999"

    # Create a temporary cache file (not a directory)
    with tempfile.NamedTemporaryFile(suffix=".sqlite3", delete=False) as f:
        cache_file = Path(f.name)

    try:
        # Should raise PortalParsingError for non-existent TDoc
        with pytest.raises(PortalParsingError), PortalSession(credentials=credentials, cache_dir=cache_file, timeout=30) as session:
            session.fetch_tdoc_metadata(tdoc_id)
    finally:
        # Clean up temporary cache file
        with contextlib.suppress(OSError):
            cache_file.unlink()
    with pytest.raises(PortalParsingError), PortalClient(credentials=credentials, timeout=30) as client:
        client.fetch_tdoc_metadata(tdoc_id)


def test_fetch_tdoc_metadata_invalid_credentials() -> None:
@@ -125,14 +105,5 @@ def test_fetch_tdoc_metadata_invalid_credentials() -> None:
    invalid_credentials = PortalCredentials(username="invalid_user", password="invalid_password")  # noqa: S106
    tdoc_id = "S4-251364"

    # Create a temporary cache file (not a directory)
    with tempfile.NamedTemporaryFile(suffix=".sqlite3", delete=False) as f:
        cache_file = Path(f.name)

    try:
        with pytest.raises(PortalAuthenticationError), PortalSession(credentials=invalid_credentials, cache_dir=cache_file, timeout=30) as session:
            session.fetch_tdoc_metadata(tdoc_id)
    finally:
        # Clean up temporary cache file
        with contextlib.suppress(OSError):
            cache_file.unlink()
    with pytest.raises(PortalAuthenticationError), PortalClient(credentials=invalid_credentials, timeout=30) as client:
        client.fetch_tdoc_metadata(tdoc_id)
+8 −64
Original line number Diff line number Diff line
@@ -8,12 +8,11 @@ from unittest.mock import MagicMock, patch

import pytest

from tdoc_crawler.clients.portal import PortalAuthenticationError, PortalClient, PortalSession, create_portal_client, extract_tdoc_url_from_portal
from tdoc_crawler.clients.portal import PortalAuthenticationError, PortalClient
from tdoc_crawler.clients.portal import PortalClient as ReExportedPortalClient
from tdoc_crawler.clients.portal import create_portal_client as ReExportedCreatePortalClient
from tdoc_crawler.constants.urls import TDOC_DOWNLOAD_URL
from tdoc_crawler.models.base import PortalCredentials
from tdoc_crawler.parsers.portal import PortalParsingError, parse_tdoc_portal_page
from tdoc_crawler.parsers.portal import PortalParsingError
from tdoc_crawler.tdocs.models import TDocMetadata


@@ -34,73 +33,43 @@ class TestPortalClientInit:
        assert client.credentials == creds
        assert client.credentials.username == "test"

    def test_init_with_custom_cache_dir(self) -> None:
        """Client can be created with custom cache directory."""
        cache_dir = Path("/custom/cache")
        client = PortalClient(cache_dir=cache_dir)
        assert client.cache_dir == cache_dir

    def test_init_with_custom_timeout(self) -> None:
        """Client can be created with custom timeout."""
        client = PortalClient(timeout=60)
        assert client.timeout == 60

    def test_default_cache_dir(self) -> None:
        """Default cache dir is ~/.tdoc-crawler."""
        client = PortalClient()
        expected = Path.home() / ".tdoc-crawler"
        assert client.cache_dir == expected


class TestCreatePortalClient:
    """Tests for create_portal_client factory function."""

    def test_create_portal_client(self) -> None:
        """Factory function creates PortalClient instance."""
        creds = PortalCredentials(username="test", password="test")
        client = create_portal_client(credentials=creds, timeout=45)
        assert isinstance(client, PortalClient)
        assert client.credentials == creds
        assert client.timeout == 45

    def test_create_portal_client_defaults(self) -> None:
        """Factory function uses sensible defaults."""
        client = create_portal_client()
        assert client.timeout == 30
        assert client._cache_ttl == 7200


class TestPortalClientSession:
    """Tests for PortalClient session management."""

    def test_get_session_creates_session(self) -> None:
    def test_get_session_creates_session(self, test_cache_dir: Path) -> None:
        """Getting session creates it if not exists."""
        client = PortalClient()
        session = client._get_session()
        assert session is not None
        assert client._session is not None

    def test_get_session_returns_same_session(self) -> None:
    def test_get_session_returns_same_session(self, test_cache_dir: Path) -> None:
        """Getting session returns the same instance."""
        client = PortalClient()
        session1 = client._get_session()
        session2 = client._get_session()
        assert session1 is session2

    def test_close_closes_session(self) -> None:
    def test_close_closes_session(self, test_cache_dir: Path) -> None:
        """Close method closes the session."""
        client = PortalClient()
        client._get_session()
        client.close()
        assert client._session is None

    def test_context_manager(self) -> None:
    def test_context_manager(self, test_cache_dir: Path) -> None:
        """Client works as context manager."""
        with PortalClient() as client:
            assert client._get_session() is not None
        assert client._session is None

    def test_close_idempotent(self) -> None:
    def test_close_idempotent(self, test_cache_dir: Path) -> None:
        """Close can be called multiple times safely."""
        client = PortalClient()
        client._get_session()
@@ -111,7 +80,7 @@ class TestPortalClientSession:
class TestPortalClientBrowserHeaders:
    """Tests that browser headers are properly set."""

    def test_session_has_browser_headers(self) -> None:
    def test_session_has_browser_headers(self, test_cache_dir: Path) -> None:
        """Session has browser-like headers to avoid 403 Forbidden."""
        client = PortalClient()
        session = client._get_session()
@@ -297,34 +266,9 @@ class TestPortalClientParseTdocPage:
        assert "url" in params


class TestBackwardCompatibility:
    """Tests to verify backward compatibility with existing API."""

    def test_portal_session_still_available(self) -> None:
        """PortalSession class is still available for backward compatibility."""
        assert PortalSession is not None

    def test_extract_tdoc_url_from_portal_still_available(self) -> None:
        """extract_tdoc_url_from_portal function is still available for backward compatibility."""
        assert extract_tdoc_url_from_portal is not None

    def test_parse_tdoc_portal_page_still_available(self) -> None:
        """parse_tdoc_portal_page function is still available for backward compatibility."""
        assert parse_tdoc_portal_page is not None

    def test_exceptions_still_available(self) -> None:
        """Exception classes are still available."""
        assert issubclass(PortalAuthenticationError, Exception)
        assert issubclass(PortalParsingError, Exception)


class TestPortalClientReExports:
    """Tests for re-exports in crawlers package."""

    def test_portal_client_re_exported(self) -> None:
        """PortalClient is re-exported from crawlers package."""
        assert ReExportedPortalClient is not None

    def test_create_portal_client_re_exported(self) -> None:
        """create_portal_client is re-exported from crawlers package."""
        assert ReExportedCreatePortalClient is not None