Loading src/tdoc_crawler/cli/app.py +18 −3 Original line number Diff line number Diff line Loading @@ -79,7 +79,7 @@ from tdoc_crawler.cli.printing import ( tdoc_to_dict, ) from tdoc_crawler.crawlers import MeetingCrawler, TDocCrawler from tdoc_crawler.credentials import set_credentials from tdoc_crawler.credentials import resolve_credentials, set_credentials from tdoc_crawler.database import TDocDatabase, database_path from tdoc_crawler.fetching import fetch_missing_tdocs from tdoc_crawler.logging import set_verbosity Loading Loading @@ -609,10 +609,13 @@ def open_tdoc( cache_dir: CacheDirOption = DEFAULT_CACHE_DIR, full_metadata: FullMetadataOption = False, use_whatthespec: UseWhatTheSpecOption = False, eol_username: EolUsernameOption = None, eol_password: EolPasswordOption = None, verbosity: VerbosityOption = DEFAULT_VERBOSITY, ) -> None: """Download, extract, and open a TDoc file.""" set_verbosity(verbosity) set_credentials(eol_username, eol_password, prompt=None) normalized_id = tdoc_id.strip().upper() config = QueryConfig( cache_dir=cache_dir, Loading @@ -622,7 +625,11 @@ def open_tdoc( db_path = database_path(cache_dir) with TDocDatabase(db_path) as database: results = database.query_tdocs(config) result = fetch_missing_tdocs(database, cache_dir, config, results, full_metadata=full_metadata, use_whatthespec=use_whatthespec) credentials = resolve_credentials(eol_username, eol_password, prompt=None) result = fetch_missing_tdocs( database, cache_dir, config, results, credentials=credentials, full_metadata=full_metadata, use_whatthespec=use_whatthespec ) if result.fetch_result and result.fetch_result.errors: console.print(f"[yellow]{len(result.fetch_result.errors)} issues detected during targeted crawl[/yellow]") results = result.refreshed Loading @@ -648,10 +655,13 @@ def checkout( force: ForceOption = False, full_metadata: FullMetadataOption = False, use_whatthespec: UseWhatTheSpecOption = False, eol_username: EolUsernameOption = None, eol_password: EolPasswordOption = None, verbosity: VerbosityOption = DEFAULT_VERBOSITY, ) -> None: """Download and extract TDoc(s) to checkout folder.""" set_verbosity(verbosity) set_credentials(eol_username, eol_password, prompt=None) normalized_ids = [tid.strip().upper() for tid in tdoc_id] config = QueryConfig( cache_dir=cache_dir, Loading @@ -661,7 +671,12 @@ def checkout( db_path = database_path(cache_dir) with TDocDatabase(db_path) as database: results = database.query_tdocs(config) result = fetch_missing_tdocs(database, cache_dir, config, results, full_metadata=full_metadata, use_whatthespec=use_whatthespec) from tdoc_crawler.credentials import resolve_credentials credentials = resolve_credentials(eol_username, eol_password, prompt=None) result = fetch_missing_tdocs( database, cache_dir, config, results, credentials=credentials, full_metadata=full_metadata, use_whatthespec=use_whatthespec ) if result.fetch_result and result.fetch_result.errors: console.print(f"[yellow]{len(result.fetch_result.errors)} issues detected during targeted crawl[/yellow]") results = result.refreshed Loading src/tdoc_crawler/crawlers/portal.py +35 −1 Original line number Diff line number Diff line Loading @@ -16,6 +16,7 @@ from __future__ import annotations import logging import re from decimal import Decimal from pathlib import Path from typing import Any Loading Loading @@ -316,6 +317,25 @@ def parse_tdoc_portal_page(html: str, tdoc_id: str, url: str | None = None) -> T logger.warning(error_msg) raise PortalParsingError(error_msg) # Create and return TDocMetadata instance # Note: meeting_id is not available from portal parsing, caller must resolve it return TDocMetadata( tdoc_id=tdoc_id, meeting_id=0, # Placeholder - caller must resolve via meeting_name title=metadata.get("title", ""), url=url, source=metadata.get("source", ""), contact=metadata.get("contact", ""), tdoc_type=metadata.get("tdoc_type", "unknown"), for_purpose=metadata.get("for", "unknown"), agenda_item_nbr=Decimal(metadata.get("agenda_item_nbr", "0")), agenda_item_text=metadata.get("agenda_item_text", "Unknown"), status=metadata.get("status"), meeting_name=metadata.get("meeting"), is_revision_of=metadata.get("is_revision_of"), validated=True, ) def extract_tdoc_url_from_portal(tdoc_id: str, cache_dir: Path | None = None, timeout: int = 15) -> str: """Extract direct FTP download URL for a TDoc using unauthenticated DownloadTDoc.aspx endpoint. Loading Loading @@ -473,7 +493,9 @@ class PortalClient: logger.info("Authenticating with 3GPP portal...") session = self._get_session() # Use a non-cached session for authentication to avoid # hishel cache interfering with session cookies session = self._get_auth_session() # Step 1: Visit the login page to establish session and get cookies logger.debug("Visiting login page to establish session...") Loading Loading @@ -508,6 +530,8 @@ class PortalClient: if response_text.lower() == "failed": raise PortalAuthenticationError("Authentication failed - check credentials") # Step 3: Store the authenticated session and copy cookies to cached session self._session = session self._authenticated = True logger.info("Successfully authenticated with 3GPP portal") Loading Loading @@ -633,6 +657,16 @@ class PortalClient: """ return parse_tdoc_portal_page(html, tdoc_id, url) def _get_auth_session(self) -> requests.Session: """Get a non-cached session for authentication. Returns: Non-cached session with browser-like headers. """ session = requests.Session() session.headers.update(_BROWSER_HEADERS) return session def _get_session(self) -> requests.Session: """Get or create a cached session with browser headers. Loading tests/test_portal_client.py +7 −4 Original line number Diff line number Diff line Loading @@ -173,12 +173,14 @@ class TestPortalClientAuthenticate: """Authentication succeeds with valid credentials.""" creds = PortalCredentials(username="test", password="test") client = PortalClient(credentials=creds) with patch.object(client, "_get_session") as mock_get_session: with patch.object(client, "_get_auth_session") as mock_get_auth_session: mock_session = MagicMock() mock_response = MagicMock() mock_response.raise_for_status = MagicMock() mock_response.text = "Success" # Non-failed response mock_session.get.return_value = mock_response mock_get_session.return_value = mock_session mock_session.post.return_value = mock_response mock_get_auth_session.return_value = mock_session client.authenticate() assert client._authenticated is True Loading @@ -197,13 +199,14 @@ class TestPortalClientAuthenticate: """Authentication fails with invalid credentials.""" creds = PortalCredentials(username="test", password="wrong") client = PortalClient(credentials=creds) with patch.object(client, "_get_session") as mock_get_session: with patch.object(client, "_get_auth_session") as mock_get_auth_session: mock_session = MagicMock() mock_response = MagicMock() mock_response.raise_for_status = MagicMock() mock_response.text = "Failed" mock_session.get.return_value = mock_response mock_session.post.return_value = mock_response mock_get_session.return_value = mock_session mock_get_auth_session.return_value = mock_session with pytest.raises(PortalAuthenticationError): client.authenticate() Loading Loading
src/tdoc_crawler/cli/app.py +18 −3 Original line number Diff line number Diff line Loading @@ -79,7 +79,7 @@ from tdoc_crawler.cli.printing import ( tdoc_to_dict, ) from tdoc_crawler.crawlers import MeetingCrawler, TDocCrawler from tdoc_crawler.credentials import set_credentials from tdoc_crawler.credentials import resolve_credentials, set_credentials from tdoc_crawler.database import TDocDatabase, database_path from tdoc_crawler.fetching import fetch_missing_tdocs from tdoc_crawler.logging import set_verbosity Loading Loading @@ -609,10 +609,13 @@ def open_tdoc( cache_dir: CacheDirOption = DEFAULT_CACHE_DIR, full_metadata: FullMetadataOption = False, use_whatthespec: UseWhatTheSpecOption = False, eol_username: EolUsernameOption = None, eol_password: EolPasswordOption = None, verbosity: VerbosityOption = DEFAULT_VERBOSITY, ) -> None: """Download, extract, and open a TDoc file.""" set_verbosity(verbosity) set_credentials(eol_username, eol_password, prompt=None) normalized_id = tdoc_id.strip().upper() config = QueryConfig( cache_dir=cache_dir, Loading @@ -622,7 +625,11 @@ def open_tdoc( db_path = database_path(cache_dir) with TDocDatabase(db_path) as database: results = database.query_tdocs(config) result = fetch_missing_tdocs(database, cache_dir, config, results, full_metadata=full_metadata, use_whatthespec=use_whatthespec) credentials = resolve_credentials(eol_username, eol_password, prompt=None) result = fetch_missing_tdocs( database, cache_dir, config, results, credentials=credentials, full_metadata=full_metadata, use_whatthespec=use_whatthespec ) if result.fetch_result and result.fetch_result.errors: console.print(f"[yellow]{len(result.fetch_result.errors)} issues detected during targeted crawl[/yellow]") results = result.refreshed Loading @@ -648,10 +655,13 @@ def checkout( force: ForceOption = False, full_metadata: FullMetadataOption = False, use_whatthespec: UseWhatTheSpecOption = False, eol_username: EolUsernameOption = None, eol_password: EolPasswordOption = None, verbosity: VerbosityOption = DEFAULT_VERBOSITY, ) -> None: """Download and extract TDoc(s) to checkout folder.""" set_verbosity(verbosity) set_credentials(eol_username, eol_password, prompt=None) normalized_ids = [tid.strip().upper() for tid in tdoc_id] config = QueryConfig( cache_dir=cache_dir, Loading @@ -661,7 +671,12 @@ def checkout( db_path = database_path(cache_dir) with TDocDatabase(db_path) as database: results = database.query_tdocs(config) result = fetch_missing_tdocs(database, cache_dir, config, results, full_metadata=full_metadata, use_whatthespec=use_whatthespec) from tdoc_crawler.credentials import resolve_credentials credentials = resolve_credentials(eol_username, eol_password, prompt=None) result = fetch_missing_tdocs( database, cache_dir, config, results, credentials=credentials, full_metadata=full_metadata, use_whatthespec=use_whatthespec ) if result.fetch_result and result.fetch_result.errors: console.print(f"[yellow]{len(result.fetch_result.errors)} issues detected during targeted crawl[/yellow]") results = result.refreshed Loading
src/tdoc_crawler/crawlers/portal.py +35 −1 Original line number Diff line number Diff line Loading @@ -16,6 +16,7 @@ from __future__ import annotations import logging import re from decimal import Decimal from pathlib import Path from typing import Any Loading Loading @@ -316,6 +317,25 @@ def parse_tdoc_portal_page(html: str, tdoc_id: str, url: str | None = None) -> T logger.warning(error_msg) raise PortalParsingError(error_msg) # Create and return TDocMetadata instance # Note: meeting_id is not available from portal parsing, caller must resolve it return TDocMetadata( tdoc_id=tdoc_id, meeting_id=0, # Placeholder - caller must resolve via meeting_name title=metadata.get("title", ""), url=url, source=metadata.get("source", ""), contact=metadata.get("contact", ""), tdoc_type=metadata.get("tdoc_type", "unknown"), for_purpose=metadata.get("for", "unknown"), agenda_item_nbr=Decimal(metadata.get("agenda_item_nbr", "0")), agenda_item_text=metadata.get("agenda_item_text", "Unknown"), status=metadata.get("status"), meeting_name=metadata.get("meeting"), is_revision_of=metadata.get("is_revision_of"), validated=True, ) def extract_tdoc_url_from_portal(tdoc_id: str, cache_dir: Path | None = None, timeout: int = 15) -> str: """Extract direct FTP download URL for a TDoc using unauthenticated DownloadTDoc.aspx endpoint. Loading Loading @@ -473,7 +493,9 @@ class PortalClient: logger.info("Authenticating with 3GPP portal...") session = self._get_session() # Use a non-cached session for authentication to avoid # hishel cache interfering with session cookies session = self._get_auth_session() # Step 1: Visit the login page to establish session and get cookies logger.debug("Visiting login page to establish session...") Loading Loading @@ -508,6 +530,8 @@ class PortalClient: if response_text.lower() == "failed": raise PortalAuthenticationError("Authentication failed - check credentials") # Step 3: Store the authenticated session and copy cookies to cached session self._session = session self._authenticated = True logger.info("Successfully authenticated with 3GPP portal") Loading Loading @@ -633,6 +657,16 @@ class PortalClient: """ return parse_tdoc_portal_page(html, tdoc_id, url) def _get_auth_session(self) -> requests.Session: """Get a non-cached session for authentication. Returns: Non-cached session with browser-like headers. """ session = requests.Session() session.headers.update(_BROWSER_HEADERS) return session def _get_session(self) -> requests.Session: """Get or create a cached session with browser headers. Loading
tests/test_portal_client.py +7 −4 Original line number Diff line number Diff line Loading @@ -173,12 +173,14 @@ class TestPortalClientAuthenticate: """Authentication succeeds with valid credentials.""" creds = PortalCredentials(username="test", password="test") client = PortalClient(credentials=creds) with patch.object(client, "_get_session") as mock_get_session: with patch.object(client, "_get_auth_session") as mock_get_auth_session: mock_session = MagicMock() mock_response = MagicMock() mock_response.raise_for_status = MagicMock() mock_response.text = "Success" # Non-failed response mock_session.get.return_value = mock_response mock_get_session.return_value = mock_session mock_session.post.return_value = mock_response mock_get_auth_session.return_value = mock_session client.authenticate() assert client._authenticated is True Loading @@ -197,13 +199,14 @@ class TestPortalClientAuthenticate: """Authentication fails with invalid credentials.""" creds = PortalCredentials(username="test", password="wrong") client = PortalClient(credentials=creds) with patch.object(client, "_get_session") as mock_get_session: with patch.object(client, "_get_auth_session") as mock_get_auth_session: mock_session = MagicMock() mock_response = MagicMock() mock_response.raise_for_status = MagicMock() mock_response.text = "Failed" mock_session.get.return_value = mock_response mock_session.post.return_value = mock_response mock_get_session.return_value = mock_session mock_get_auth_session.return_value = mock_session with pytest.raises(PortalAuthenticationError): client.authenticate() Loading