Commit 211f8c6f authored by Jan Reimes
Browse files

crawlers: simplify package exports and import PortalCredentials/constants

parent 30a6d552
Loading
Loading
Loading
Loading
+34 −55
Original line number Diff line number Diff line
@@ -2,11 +2,40 @@

from __future__ import annotations

from importlib import import_module
from typing import Any

# No direct imports for any symbols listed in __all__; all are dynamically imported via __getattr__
# TODO: Why is this complicated dynamic import needed? Must be simplified dramatically!
from .constants import (
    EXCLUDED_DIRS,
    EXCLUDED_DIRS_NORMALIZED,
    MEETING_CODE_REGISTRY,
    TDOC_DOWNLOAD_URL,
    TDOC_PATTERN,
    TDOC_PATTERN_STR,
    TDOC_SUBDIRS,
    TDOC_SUBDIRS_NORMALIZED,
)
from .hybrid import HybridCrawlResult, HybridTDocCrawler
from .meeting_doclist import (
    DocumentListError,
    convert_excel_row_to_tdoc_metadata,
    fetch_meeting_document_list,
    parse_excel_document_list,
)
from .meetings import (
    MeetingCrawler,
    MeetingCrawlResult,
    normalize_subgroup_alias,
    normalize_working_group_alias,
)
from .parallel import fetch_meeting_tdocs
from .portal import (
    PortalAuthenticationError,
    PortalParsingError,
    PortalSession,
    extract_tdoc_url_from_portal,
    fetch_tdoc_metadata,
    parse_tdoc_portal_page,
)
from .tdocs import TDocCrawler, TDocCrawlResult
from .whatthespec import WhatTheSpecResolutionError, resolve_via_whatthespec

__all__ = [
    "EXCLUDED_DIRS",
@@ -39,53 +68,3 @@ __all__ = [
    "parse_tdoc_portal_page",
    "resolve_via_whatthespec",
]
# TODO: overcomplicated dynamic import mechanism; simplify!
# Lazy-export table: public name -> (fully qualified module, attribute name).
# Every export keeps its own name inside its defining submodule, so only the
# (submodule, symbol) pairs are spelled out; the pairs stay in the same
# (ASCII-sorted) order so the resulting dict's insertion order is unchanged.
_ATTR_MODULES: dict[str, tuple[str, str]] = {
    symbol: (f"tdoc_crawler.crawlers.{submodule}", symbol)
    for submodule, symbol in (
        ("meeting_doclist", "DocumentListError"),
        ("constants", "EXCLUDED_DIRS"),
        ("constants", "EXCLUDED_DIRS_NORMALIZED"),
        ("hybrid", "HybridCrawlResult"),
        ("hybrid", "HybridTDocCrawler"),
        ("constants", "MEETING_CODE_REGISTRY"),
        ("meetings", "MeetingCrawlResult"),
        ("meetings", "MeetingCrawler"),
        ("portal", "PortalAuthenticationError"),
        ("portal", "PortalParsingError"),
        ("portal", "PortalSession"),
        ("constants", "TDOC_DOWNLOAD_URL"),
        ("constants", "TDOC_PATTERN"),
        ("constants", "TDOC_PATTERN_STR"),
        ("constants", "TDOC_SUBDIRS"),
        ("constants", "TDOC_SUBDIRS_NORMALIZED"),
        ("tdocs", "TDocCrawlResult"),
        ("tdocs", "TDocCrawler"),
        ("whatthespec", "WhatTheSpecResolutionError"),
        ("meeting_doclist", "convert_excel_row_to_tdoc_metadata"),
        ("portal", "extract_tdoc_url_from_portal"),
        ("meeting_doclist", "fetch_meeting_document_list"),
        ("parallel", "fetch_meeting_tdocs"),
        ("portal", "fetch_tdoc_metadata"),
        ("meetings", "normalize_subgroup_alias"),
        ("meetings", "normalize_working_group_alias"),
        ("meeting_doclist", "parse_excel_document_list"),
        ("portal", "parse_tdoc_portal_page"),
        ("whatthespec", "resolve_via_whatthespec"),
    )
}


# TODO: overcomplicated dynamic import mechanism; simplify!
def __getattr__(name: str) -> Any:
    """Resolve a lazily exported attribute of this package on first access.

    The name is looked up in ``_ATTR_MODULES``; the module it maps to is
    imported, the attribute is read from it, and the result is stored in this
    module's globals so that later lookups find it directly.

    Raises:
        AttributeError: If ``name`` is not a key of ``_ATTR_MODULES``.
    """
    try:
        target = _ATTR_MODULES[name]
    except KeyError as exc:
        raise AttributeError(f"module 'tdoc_crawler.crawlers' has no attribute {name!r}") from exc

    module_path, attribute = target
    resolved = getattr(import_module(module_path), attribute)
    # Cache the resolved object in the module namespace.
    globals()[name] = resolved
    return resolved


# TODO: overcomplicated dynamic import mechanism; simplify!
def __dir__() -> list[str]:
    """Return the sorted union of this module's global names and ``__all__``."""
    names = set(__all__)
    names.update(globals())
    return sorted(names)
+2 −0
Original line number Diff line number Diff line
@@ -15,8 +15,10 @@ from requests.adapters import HTTPAdapter
from urllib3.util import Retry

from tdoc_crawler.crawlers.constants import LOGIN_URL, PORTAL_BASE_URL, TDOC_DOWNLOAD_URL, TDOC_VIEW_URL
from tdoc_crawler.models.base import PortalCredentials
from tdoc_crawler.models.tdocs import TDocMetadata


# TODO: Is cache_dir unused in PortalSession? It should be used to cache HTTP requests to portal pages via the hishel package (is the wrong HTTPAdapter in use?). See tdoc_crawler.models.base.HttpCacheConfig
def create_cached_session(cache_dir: Path, ttl: int = 7200, refresh_ttl_on_access: bool = True, max_retries: int = 3) -> requests.Session:
    """Create an HTTP session with caching."""