Loading src/tdoc_crawler/crawlers/__init__.py +34 −55 Original line number Diff line number Diff line Loading @@ -2,11 +2,40 @@ from __future__ import annotations from importlib import import_module from typing import Any # No direct imports for any symbols listed in __all__; all are dynamically imported via __getattr__ # TODO: Why is this complicated dynamic import needed? Must be simplified dramatically! from .constants import ( EXCLUDED_DIRS, EXCLUDED_DIRS_NORMALIZED, MEETING_CODE_REGISTRY, TDOC_DOWNLOAD_URL, TDOC_PATTERN, TDOC_PATTERN_STR, TDOC_SUBDIRS, TDOC_SUBDIRS_NORMALIZED, ) from .hybrid import HybridCrawlResult, HybridTDocCrawler from .meeting_doclist import ( DocumentListError, convert_excel_row_to_tdoc_metadata, fetch_meeting_document_list, parse_excel_document_list, ) from .meetings import ( MeetingCrawler, MeetingCrawlResult, normalize_subgroup_alias, normalize_working_group_alias, ) from .parallel import fetch_meeting_tdocs from .portal import ( PortalAuthenticationError, PortalParsingError, PortalSession, extract_tdoc_url_from_portal, fetch_tdoc_metadata, parse_tdoc_portal_page, ) from .tdocs import TDocCrawler, TDocCrawlResult from .whatthespec import WhatTheSpecResolutionError, resolve_via_whatthespec __all__ = [ "EXCLUDED_DIRS", Loading Loading @@ -39,53 +68,3 @@ __all__ = [ "parse_tdoc_portal_page", "resolve_via_whatthespec", ] # TODO: overcomplicated dynamic import mechanism; simplify! 
_ATTR_MODULES: dict[str, tuple[str, str]] = { "DocumentListError": ("tdoc_crawler.crawlers.meeting_doclist", "DocumentListError"), "EXCLUDED_DIRS": ("tdoc_crawler.crawlers.constants", "EXCLUDED_DIRS"), "EXCLUDED_DIRS_NORMALIZED": ("tdoc_crawler.crawlers.constants", "EXCLUDED_DIRS_NORMALIZED"), "HybridCrawlResult": ("tdoc_crawler.crawlers.hybrid", "HybridCrawlResult"), "HybridTDocCrawler": ("tdoc_crawler.crawlers.hybrid", "HybridTDocCrawler"), "MEETING_CODE_REGISTRY": ("tdoc_crawler.crawlers.constants", "MEETING_CODE_REGISTRY"), "MeetingCrawlResult": ("tdoc_crawler.crawlers.meetings", "MeetingCrawlResult"), "MeetingCrawler": ("tdoc_crawler.crawlers.meetings", "MeetingCrawler"), "PortalAuthenticationError": ("tdoc_crawler.crawlers.portal", "PortalAuthenticationError"), "PortalParsingError": ("tdoc_crawler.crawlers.portal", "PortalParsingError"), "PortalSession": ("tdoc_crawler.crawlers.portal", "PortalSession"), "TDOC_DOWNLOAD_URL": ("tdoc_crawler.crawlers.constants", "TDOC_DOWNLOAD_URL"), "TDOC_PATTERN": ("tdoc_crawler.crawlers.constants", "TDOC_PATTERN"), "TDOC_PATTERN_STR": ("tdoc_crawler.crawlers.constants", "TDOC_PATTERN_STR"), "TDOC_SUBDIRS": ("tdoc_crawler.crawlers.constants", "TDOC_SUBDIRS"), "TDOC_SUBDIRS_NORMALIZED": ("tdoc_crawler.crawlers.constants", "TDOC_SUBDIRS_NORMALIZED"), "TDocCrawlResult": ("tdoc_crawler.crawlers.tdocs", "TDocCrawlResult"), "TDocCrawler": ("tdoc_crawler.crawlers.tdocs", "TDocCrawler"), "WhatTheSpecResolutionError": ("tdoc_crawler.crawlers.whatthespec", "WhatTheSpecResolutionError"), "convert_excel_row_to_tdoc_metadata": ("tdoc_crawler.crawlers.meeting_doclist", "convert_excel_row_to_tdoc_metadata"), "extract_tdoc_url_from_portal": ("tdoc_crawler.crawlers.portal", "extract_tdoc_url_from_portal"), "fetch_meeting_document_list": ("tdoc_crawler.crawlers.meeting_doclist", "fetch_meeting_document_list"), "fetch_meeting_tdocs": ("tdoc_crawler.crawlers.parallel", "fetch_meeting_tdocs"), "fetch_tdoc_metadata": 
("tdoc_crawler.crawlers.portal", "fetch_tdoc_metadata"), "normalize_subgroup_alias": ("tdoc_crawler.crawlers.meetings", "normalize_subgroup_alias"), "normalize_working_group_alias": ("tdoc_crawler.crawlers.meetings", "normalize_working_group_alias"), "parse_excel_document_list": ("tdoc_crawler.crawlers.meeting_doclist", "parse_excel_document_list"), "parse_tdoc_portal_page": ("tdoc_crawler.crawlers.portal", "parse_tdoc_portal_page"), "resolve_via_whatthespec": ("tdoc_crawler.crawlers.whatthespec", "resolve_via_whatthespec"), } # TODO: overcomplicated dynamic import mechanism; simplify! def __getattr__(name: str) -> Any: try: module_name, attr_name = _ATTR_MODULES[name] except KeyError as exc: raise AttributeError(f"module 'tdoc_crawler.crawlers' has no attribute {name!r}") from exc module = import_module(module_name) value = getattr(module, attr_name) globals()[name] = value return value # TODO: overcomplicated dynamic import mechanism; simplify! def __dir__() -> list[str]: return sorted(set(globals()) | set(__all__)) src/tdoc_crawler/crawlers/portal.py +2 −0 Original line number Diff line number Diff line Loading @@ -15,8 +15,10 @@ from requests.adapters import HTTPAdapter from urllib3.util import Retry from tdoc_crawler.crawlers.constants import LOGIN_URL, PORTAL_BASE_URL, TDOC_DOWNLOAD_URL, TDOC_VIEW_URL from tdoc_crawler.models.base import PortalCredentials from tdoc_crawler.models.tdocs import TDocMetadata # TODO: cache_dir not used at all in PortalSession? should be used for caching HTTP requests to portal pages via hishel package - wrong HTTPAdapter? See tdoc_crawler.models.base.HttpCacheConfig def create_cached_session(cache_dir: Path, ttl: int = 7200, refresh_ttl_on_access: bool = True, max_retries: int = 3) -> requests.Session: """Create an HTTP session with caching.""" Loading Loading
src/tdoc_crawler/crawlers/__init__.py +34 −55 Original line number Diff line number Diff line Loading @@ -2,11 +2,40 @@ from __future__ import annotations from importlib import import_module from typing import Any # No direct imports for any symbols listed in __all__; all are dynamically imported via __getattr__ # TODO: Why is this complicated dynamic import needed? Must be simplified dramatically! from .constants import ( EXCLUDED_DIRS, EXCLUDED_DIRS_NORMALIZED, MEETING_CODE_REGISTRY, TDOC_DOWNLOAD_URL, TDOC_PATTERN, TDOC_PATTERN_STR, TDOC_SUBDIRS, TDOC_SUBDIRS_NORMALIZED, ) from .hybrid import HybridCrawlResult, HybridTDocCrawler from .meeting_doclist import ( DocumentListError, convert_excel_row_to_tdoc_metadata, fetch_meeting_document_list, parse_excel_document_list, ) from .meetings import ( MeetingCrawler, MeetingCrawlResult, normalize_subgroup_alias, normalize_working_group_alias, ) from .parallel import fetch_meeting_tdocs from .portal import ( PortalAuthenticationError, PortalParsingError, PortalSession, extract_tdoc_url_from_portal, fetch_tdoc_metadata, parse_tdoc_portal_page, ) from .tdocs import TDocCrawler, TDocCrawlResult from .whatthespec import WhatTheSpecResolutionError, resolve_via_whatthespec __all__ = [ "EXCLUDED_DIRS", Loading Loading @@ -39,53 +68,3 @@ __all__ = [ "parse_tdoc_portal_page", "resolve_via_whatthespec", ] # TODO: overcomplicated dynamic import mechanism; simplify! 
# NOTE(review): every name listed in ``__all__`` is imported eagerly at the
# top of this module, so the former lazy-import table (``_ATTR_MODULES``) and
# the module-level ``__getattr__`` hook (PEP 562) duplicated those bindings
# for no benefit — each table entry had a matching eager ``from .x import y``.
# This resolves the repeated "overcomplicated dynamic import mechanism;
# simplify!" TODOs: attribute access now goes through the ordinary eager
# imports (unknown names raise AttributeError via normal module semantics,
# same as the old KeyError -> AttributeError translation), and only the
# ``__dir__`` override is kept so ``dir()`` output is unchanged.


def __dir__() -> list[str]:
    """Return the module's attribute names for ``dir()``.

    Union of the module globals and ``__all__``, sorted — byte-identical
    output to the previous implementation.  With eager imports every
    ``__all__`` entry is already present in ``globals()``, so the union is
    kept only for robustness should an export ever be removed from the
    import block but not from ``__all__``.
    """
    return sorted(set(globals()) | set(__all__))
src/tdoc_crawler/crawlers/portal.py +2 −0 Original line number Diff line number Diff line Loading @@ -15,8 +15,10 @@ from requests.adapters import HTTPAdapter from urllib3.util import Retry from tdoc_crawler.crawlers.constants import LOGIN_URL, PORTAL_BASE_URL, TDOC_DOWNLOAD_URL, TDOC_VIEW_URL from tdoc_crawler.models.base import PortalCredentials from tdoc_crawler.models.tdocs import TDocMetadata # TODO: cache_dir not used at all in PortalSession? should be used for caching HTTP requests to portal pages via hishel package - wrong HTTPAdapter? See tdoc_crawler.models.base.HttpCacheConfig def create_cached_session(cache_dir: Path, ttl: int = 7200, refresh_ttl_on_access: bool = True, max_retries: int = 3) -> requests.Session: """Create an HTTP session with caching.""" Loading