Loading src/tdoc_crawler/crawlers/__init__.py +34 −55 Original line number Diff line number Diff line Loading @@ -2,11 +2,40 @@ from __future__ import annotations from importlib import import_module from typing import Any # No direct imports for any symbols listed in __all__; all are dynamically imported via __getattr__ # TODO: Why is this complicated dynamic import needed? Must be simplified dramatically! from .constants import ( EXCLUDED_DIRS, EXCLUDED_DIRS_NORMALIZED, MEETING_CODE_REGISTRY, TDOC_DOWNLOAD_URL, TDOC_PATTERN, TDOC_PATTERN_STR, TDOC_SUBDIRS, TDOC_SUBDIRS_NORMALIZED, ) from .hybrid import HybridCrawlResult, HybridTDocCrawler from .meeting_doclist import ( DocumentListError, convert_excel_row_to_tdoc_metadata, fetch_meeting_document_list, parse_excel_document_list, ) from .meetings import ( MeetingCrawler, MeetingCrawlResult, normalize_subgroup_alias, normalize_working_group_alias, ) from .parallel import fetch_meeting_tdocs from .portal import ( PortalAuthenticationError, PortalParsingError, PortalSession, extract_tdoc_url_from_portal, fetch_tdoc_metadata, parse_tdoc_portal_page, ) from .tdocs import TDocCrawler, TDocCrawlResult from .whatthespec import WhatTheSpecResolutionError, resolve_via_whatthespec __all__ = [ "EXCLUDED_DIRS", Loading Loading @@ -39,53 +68,3 @@ __all__ = [ "parse_tdoc_portal_page", "resolve_via_whatthespec", ] # TODO: overcomplicated dynamic import mechanism; simplify! 
_ATTR_MODULES: dict[str, tuple[str, str]] = { "DocumentListError": ("tdoc_crawler.crawlers.meeting_doclist", "DocumentListError"), "EXCLUDED_DIRS": ("tdoc_crawler.crawlers.constants", "EXCLUDED_DIRS"), "EXCLUDED_DIRS_NORMALIZED": ("tdoc_crawler.crawlers.constants", "EXCLUDED_DIRS_NORMALIZED"), "HybridCrawlResult": ("tdoc_crawler.crawlers.hybrid", "HybridCrawlResult"), "HybridTDocCrawler": ("tdoc_crawler.crawlers.hybrid", "HybridTDocCrawler"), "MEETING_CODE_REGISTRY": ("tdoc_crawler.crawlers.constants", "MEETING_CODE_REGISTRY"), "MeetingCrawlResult": ("tdoc_crawler.crawlers.meetings", "MeetingCrawlResult"), "MeetingCrawler": ("tdoc_crawler.crawlers.meetings", "MeetingCrawler"), "PortalAuthenticationError": ("tdoc_crawler.crawlers.portal", "PortalAuthenticationError"), "PortalParsingError": ("tdoc_crawler.crawlers.portal", "PortalParsingError"), "PortalSession": ("tdoc_crawler.crawlers.portal", "PortalSession"), "TDOC_DOWNLOAD_URL": ("tdoc_crawler.crawlers.constants", "TDOC_DOWNLOAD_URL"), "TDOC_PATTERN": ("tdoc_crawler.crawlers.constants", "TDOC_PATTERN"), "TDOC_PATTERN_STR": ("tdoc_crawler.crawlers.constants", "TDOC_PATTERN_STR"), "TDOC_SUBDIRS": ("tdoc_crawler.crawlers.constants", "TDOC_SUBDIRS"), "TDOC_SUBDIRS_NORMALIZED": ("tdoc_crawler.crawlers.constants", "TDOC_SUBDIRS_NORMALIZED"), "TDocCrawlResult": ("tdoc_crawler.crawlers.tdocs", "TDocCrawlResult"), "TDocCrawler": ("tdoc_crawler.crawlers.tdocs", "TDocCrawler"), "WhatTheSpecResolutionError": ("tdoc_crawler.crawlers.whatthespec", "WhatTheSpecResolutionError"), "convert_excel_row_to_tdoc_metadata": ("tdoc_crawler.crawlers.meeting_doclist", "convert_excel_row_to_tdoc_metadata"), "extract_tdoc_url_from_portal": ("tdoc_crawler.crawlers.portal", "extract_tdoc_url_from_portal"), "fetch_meeting_document_list": ("tdoc_crawler.crawlers.meeting_doclist", "fetch_meeting_document_list"), "fetch_meeting_tdocs": ("tdoc_crawler.crawlers.parallel", "fetch_meeting_tdocs"), "fetch_tdoc_metadata": 
("tdoc_crawler.crawlers.portal", "fetch_tdoc_metadata"), "normalize_subgroup_alias": ("tdoc_crawler.crawlers.meetings", "normalize_subgroup_alias"), "normalize_working_group_alias": ("tdoc_crawler.crawlers.meetings", "normalize_working_group_alias"), "parse_excel_document_list": ("tdoc_crawler.crawlers.meeting_doclist", "parse_excel_document_list"), "parse_tdoc_portal_page": ("tdoc_crawler.crawlers.portal", "parse_tdoc_portal_page"), "resolve_via_whatthespec": ("tdoc_crawler.crawlers.whatthespec", "resolve_via_whatthespec"), } # TODO: overcomplicated dynamic import mechanism; simplify! def __getattr__(name: str) -> Any: try: module_name, attr_name = _ATTR_MODULES[name] except KeyError as exc: raise AttributeError(f"module 'tdoc_crawler.crawlers' has no attribute {name!r}") from exc module = import_module(module_name) value = getattr(module, attr_name) globals()[name] = value return value # TODO: overcomplicated dynamic import mechanism; simplify! def __dir__() -> list[str]: return sorted(set(globals()) | set(__all__)) src/tdoc_crawler/crawlers/portal.py +2 −0 Original line number Diff line number Diff line Loading @@ -15,8 +15,10 @@ from requests.adapters import HTTPAdapter from urllib3.util import Retry from tdoc_crawler.crawlers.constants import LOGIN_URL, PORTAL_BASE_URL, TDOC_DOWNLOAD_URL, TDOC_VIEW_URL from tdoc_crawler.models.base import PortalCredentials from tdoc_crawler.models.tdocs import TDocMetadata # TODO: cache_dir not used at all in PortalSession? should be used for caching HTTP requests to portal pages via hishel package - wrong HTTPAdapter? See tdoc_crawler.models.base.HttpCacheConfig def create_cached_session(cache_dir: Path, ttl: int = 7200, refresh_ttl_on_access: bool = True, max_retries: int = 3) -> requests.Session: """Create an HTTP session with caching.""" Loading Loading
src/tdoc_crawler/crawlers/__init__.py +34 −55 Original line number Diff line number Diff line Loading @@ -2,11 +2,40 @@ from __future__ import annotations from importlib import import_module from typing import Any # No direct imports for any symbols listed in __all__; all are dynamically imported via __getattr__ # TODO: Why is this complicated dynamic import needed? Must be simplified dramatically! from .constants import ( EXCLUDED_DIRS, EXCLUDED_DIRS_NORMALIZED, MEETING_CODE_REGISTRY, TDOC_DOWNLOAD_URL, TDOC_PATTERN, TDOC_PATTERN_STR, TDOC_SUBDIRS, TDOC_SUBDIRS_NORMALIZED, ) from .hybrid import HybridCrawlResult, HybridTDocCrawler from .meeting_doclist import ( DocumentListError, convert_excel_row_to_tdoc_metadata, fetch_meeting_document_list, parse_excel_document_list, ) from .meetings import ( MeetingCrawler, MeetingCrawlResult, normalize_subgroup_alias, normalize_working_group_alias, ) from .parallel import fetch_meeting_tdocs from .portal import ( PortalAuthenticationError, PortalParsingError, PortalSession, extract_tdoc_url_from_portal, fetch_tdoc_metadata, parse_tdoc_portal_page, ) from .tdocs import TDocCrawler, TDocCrawlResult from .whatthespec import WhatTheSpecResolutionError, resolve_via_whatthespec __all__ = [ "EXCLUDED_DIRS", Loading Loading @@ -39,53 +68,3 @@ __all__ = [ "parse_tdoc_portal_page", "resolve_via_whatthespec", ] # TODO: overcomplicated dynamic import mechanism; simplify! 
# NOTE(review): every name listed in ``__all__`` is imported eagerly at the
# top of this module, so the former lazy-import table (``_ATTR_MODULES``) and
# the module-level ``__getattr__`` hook (PEP 562) duplicated those bindings
# for no benefit — each table entry had a matching eager ``from .x import y``.
# This resolves the repeated "overcomplicated dynamic import mechanism;
# simplify!" TODOs: attribute access now goes through the ordinary eager
# imports (unknown names raise AttributeError via normal module semantics,
# same as the old KeyError -> AttributeError translation), and only the
# ``__dir__`` override is kept so ``dir()`` output is unchanged.


def __dir__() -> list[str]:
    """Return the module's attribute names for ``dir()``.

    Union of the module globals and ``__all__``, sorted — byte-identical
    output to the previous implementation.  With eager imports every
    ``__all__`` entry is already present in ``globals()``, so the union is
    kept only for robustness should an export ever be removed from the
    import block but not from ``__all__``.
    """
    return sorted(set(globals()) | set(__all__))
src/tdoc_crawler/crawlers/portal.py +2 −0 Original line number Diff line number Diff line Loading @@ -15,8 +15,10 @@ from requests.adapters import HTTPAdapter from urllib3.util import Retry from tdoc_crawler.crawlers.constants import LOGIN_URL, PORTAL_BASE_URL, TDOC_DOWNLOAD_URL, TDOC_VIEW_URL from tdoc_crawler.models.base import PortalCredentials from tdoc_crawler.models.tdocs import TDocMetadata # TODO: cache_dir not used at all in PortalSession? should be used for caching HTTP requests to portal pages via hishel package - wrong HTTPAdapter? See tdoc_crawler.models.base.HttpCacheConfig def create_cached_session(cache_dir: Path, ttl: int = 7200, refresh_ttl_on_access: bool = True, max_retries: int = 3) -> requests.Session: """Create an HTTP session with caching.""" Loading