Commit b27ccfb2 authored by Jan Reimes
Browse files

🔒 fix(portal): add authentication session handling and credential resolution

parent 21eaa269
Loading
Loading
Loading
Loading
+18 −3
Original line number Diff line number Diff line
@@ -79,7 +79,7 @@ from tdoc_crawler.cli.printing import (
    tdoc_to_dict,
)
from tdoc_crawler.crawlers import MeetingCrawler, TDocCrawler
from tdoc_crawler.credentials import set_credentials
from tdoc_crawler.credentials import resolve_credentials, set_credentials
from tdoc_crawler.database import TDocDatabase, database_path
from tdoc_crawler.fetching import fetch_missing_tdocs
from tdoc_crawler.logging import set_verbosity
@@ -609,10 +609,13 @@ def open_tdoc(
    cache_dir: CacheDirOption = DEFAULT_CACHE_DIR,
    full_metadata: FullMetadataOption = False,
    use_whatthespec: UseWhatTheSpecOption = False,
    eol_username: EolUsernameOption = None,
    eol_password: EolPasswordOption = None,
    verbosity: VerbosityOption = DEFAULT_VERBOSITY,
) -> None:
    """Download, extract, and open a TDoc file."""
    set_verbosity(verbosity)
    set_credentials(eol_username, eol_password, prompt=None)
    normalized_id = tdoc_id.strip().upper()
    config = QueryConfig(
        cache_dir=cache_dir,
@@ -622,7 +625,11 @@ def open_tdoc(
    db_path = database_path(cache_dir)
    with TDocDatabase(db_path) as database:
        results = database.query_tdocs(config)
        result = fetch_missing_tdocs(database, cache_dir, config, results, full_metadata=full_metadata, use_whatthespec=use_whatthespec)

        credentials = resolve_credentials(eol_username, eol_password, prompt=None)
        result = fetch_missing_tdocs(
            database, cache_dir, config, results, credentials=credentials, full_metadata=full_metadata, use_whatthespec=use_whatthespec
        )
        if result.fetch_result and result.fetch_result.errors:
            console.print(f"[yellow]{len(result.fetch_result.errors)} issues detected during targeted crawl[/yellow]")
        results = result.refreshed
@@ -648,10 +655,13 @@ def checkout(
    force: ForceOption = False,
    full_metadata: FullMetadataOption = False,
    use_whatthespec: UseWhatTheSpecOption = False,
    eol_username: EolUsernameOption = None,
    eol_password: EolPasswordOption = None,
    verbosity: VerbosityOption = DEFAULT_VERBOSITY,
) -> None:
    """Download and extract TDoc(s) to checkout folder."""
    set_verbosity(verbosity)
    set_credentials(eol_username, eol_password, prompt=None)
    normalized_ids = [tid.strip().upper() for tid in tdoc_id]
    config = QueryConfig(
        cache_dir=cache_dir,
@@ -661,7 +671,12 @@ def checkout(
    db_path = database_path(cache_dir)
    with TDocDatabase(db_path) as database:
        results = database.query_tdocs(config)
        result = fetch_missing_tdocs(database, cache_dir, config, results, full_metadata=full_metadata, use_whatthespec=use_whatthespec)
        from tdoc_crawler.credentials import resolve_credentials

        credentials = resolve_credentials(eol_username, eol_password, prompt=None)
        result = fetch_missing_tdocs(
            database, cache_dir, config, results, credentials=credentials, full_metadata=full_metadata, use_whatthespec=use_whatthespec
        )
        if result.fetch_result and result.fetch_result.errors:
            console.print(f"[yellow]{len(result.fetch_result.errors)} issues detected during targeted crawl[/yellow]")
        results = result.refreshed
+35 −1
Original line number Diff line number Diff line
@@ -16,6 +16,7 @@ from __future__ import annotations

import logging
import re
from decimal import Decimal
from pathlib import Path
from typing import Any

@@ -316,6 +317,25 @@ def parse_tdoc_portal_page(html: str, tdoc_id: str, url: str | None = None) -> T
        logger.warning(error_msg)
        raise PortalParsingError(error_msg)

    # Create and return TDocMetadata instance
    # Note: meeting_id is not available from portal parsing, caller must resolve it
    return TDocMetadata(
        tdoc_id=tdoc_id,
        meeting_id=0,  # Placeholder - caller must resolve via meeting_name
        title=metadata.get("title", ""),
        url=url,
        source=metadata.get("source", ""),
        contact=metadata.get("contact", ""),
        tdoc_type=metadata.get("tdoc_type", "unknown"),
        for_purpose=metadata.get("for", "unknown"),
        agenda_item_nbr=Decimal(metadata.get("agenda_item_nbr", "0")),
        agenda_item_text=metadata.get("agenda_item_text", "Unknown"),
        status=metadata.get("status"),
        meeting_name=metadata.get("meeting"),
        is_revision_of=metadata.get("is_revision_of"),
        validated=True,
    )


def extract_tdoc_url_from_portal(tdoc_id: str, cache_dir: Path | None = None, timeout: int = 15) -> str:
    """Extract direct FTP download URL for a TDoc using unauthenticated DownloadTDoc.aspx endpoint.
@@ -473,7 +493,9 @@ class PortalClient:

        logger.info("Authenticating with 3GPP portal...")

        session = self._get_session()
        # Use a non-cached session for authentication to avoid
        # hishel cache interfering with session cookies
        session = self._get_auth_session()

        # Step 1: Visit the login page to establish session and get cookies
        logger.debug("Visiting login page to establish session...")
@@ -508,6 +530,8 @@ class PortalClient:
        if response_text.lower() == "failed":
            raise PortalAuthenticationError("Authentication failed - check credentials")

        # Step 3: Store the authenticated session on the client and mark it as authenticated
        self._session = session
        self._authenticated = True
        logger.info("Successfully authenticated with 3GPP portal")

@@ -633,6 +657,16 @@ class PortalClient:
        """
        return parse_tdoc_portal_page(html, tdoc_id, url)

    def _get_auth_session(self) -> requests.Session:
        """Build a fresh, non-cached session for the login flow.

        A plain ``requests.Session`` is constructed on every call; nothing is
        stored on the instance here. The module's ``_BROWSER_HEADERS`` are
        merged into the new session's default headers.

        Returns:
            Non-cached session with browser-like headers.
        """
        auth_session = requests.Session()
        auth_session.headers.update(_BROWSER_HEADERS)
        return auth_session

    def _get_session(self) -> requests.Session:
        """Get or create a cached session with browser headers.

+7 −4
Original line number Diff line number Diff line
@@ -173,12 +173,14 @@ class TestPortalClientAuthenticate:
        """Authentication succeeds with valid credentials."""
        creds = PortalCredentials(username="test", password="test")
        client = PortalClient(credentials=creds)
        with patch.object(client, "_get_session") as mock_get_session:
        with patch.object(client, "_get_auth_session") as mock_get_auth_session:
            mock_session = MagicMock()
            mock_response = MagicMock()
            mock_response.raise_for_status = MagicMock()
            mock_response.text = "Success"  # Non-failed response
            mock_session.get.return_value = mock_response
            mock_get_session.return_value = mock_session
            mock_session.post.return_value = mock_response
            mock_get_auth_session.return_value = mock_session

            client.authenticate()
            assert client._authenticated is True
@@ -197,13 +199,14 @@ class TestPortalClientAuthenticate:
        """Authentication fails with invalid credentials."""
        creds = PortalCredentials(username="test", password="wrong")
        client = PortalClient(credentials=creds)
        with patch.object(client, "_get_session") as mock_get_session:
        with patch.object(client, "_get_auth_session") as mock_get_auth_session:
            mock_session = MagicMock()
            mock_response = MagicMock()
            mock_response.raise_for_status = MagicMock()
            mock_response.text = "Failed"
            mock_session.get.return_value = mock_response
            mock_session.post.return_value = mock_response
            mock_get_session.return_value = mock_session
            mock_get_auth_session.return_value = mock_session

            with pytest.raises(PortalAuthenticationError):
                client.authenticate()