Loading src/tdoc_crawler/cli/app.py +10 −19 Original line number Diff line number Diff line Loading @@ -13,31 +13,22 @@ import typer import yaml from dotenv import load_dotenv from rich.console import Console from rich.progress import BarColumn, MofNCompleteColumn, Progress, SpinnerColumn, TextColumn from rich.progress import (BarColumn, MofNCompleteColumn, Progress, SpinnerColumn, TextColumn) from rich.table import Table from tdoc_crawler.crawlers import MeetingCrawler, TDocCrawler from tdoc_crawler.database import TDocDatabase from tdoc_crawler.models import ( MeetingCrawlConfig, MeetingQueryConfig, OutputFormat, QueryConfig, SortOrder, TDocCrawlConfig, ) from tdoc_crawler.models import (MeetingCrawlConfig, MeetingQueryConfig, OutputFormat, QueryConfig, SortOrder, TDocCrawlConfig) from .fetching import maybe_fetch_missing_tdocs from .helpers import ( build_limits, database_path, launch_file, parse_subgroups, parse_working_groups, prepare_tdoc_file, resolve_credentials, ) from .printing import meeting_to_dict, print_meeting_table, print_tdoc_table, tdoc_to_dict from .helpers import (build_limits, database_path, launch_file, parse_subgroups, parse_working_groups, prepare_tdoc_file, resolve_credentials) from .printing import (meeting_to_dict, print_meeting_table, print_tdoc_table, tdoc_to_dict) load_dotenv() Loading src/tdoc_crawler/cli/helpers.py +4 −7 Original line number Diff line number Diff line Loading @@ -18,14 +18,11 @@ from urllib.request import urlopen import typer from rich.console import Console from tdoc_crawler.crawlers import normalize_subgroup_alias, normalize_working_group_alias from tdoc_crawler.crawlers import (normalize_subgroup_alias, normalize_working_group_alias) from tdoc_crawler.database import TDocDatabase from tdoc_crawler.models import ( CrawlLimits, PortalCredentials, TDocMetadata, WorkingGroup, ) from tdoc_crawler.models import (CrawlLimits, PortalCredentials, TDocMetadata, WorkingGroup) console = Console() _logger = logging.getLogger(__name__) Loading src/tdoc_crawler/crawlers/meetings.py +6 −7 Original line number Diff line number Diff line Loading @@ -13,14 +13,13 @@ from urllib.parse import urljoin import requests from bs4 import BeautifulSoup, Tag from tdoc_crawler.crawlers.constants import ( DATE_PATTERN, from tdoc_crawler.crawlers.constants import (DATE_PATTERN, MEETING_CODE_REGISTRY, MEETINGS_BASE_URL, PORTAL_BASE_URL, ) PORTAL_BASE_URL) from tdoc_crawler.database import TDocDatabase from tdoc_crawler.models import CrawlLimits, MeetingCrawlConfig, MeetingMetadata, WorkingGroup from tdoc_crawler.models import (CrawlLimits, MeetingCrawlConfig, MeetingMetadata, WorkingGroup) logger = logging.getLogger(__name__) Loading src/tdoc_crawler/crawlers/parallel.py +4 −7 Original line number Diff line number Diff line Loading @@ -15,13 +15,10 @@ from bs4 import BeautifulSoup from requests.adapters import HTTPAdapter from urllib3.util.retry import Retry from tdoc_crawler.crawlers.constants import ( EXCLUDED_DIRS, from tdoc_crawler.crawlers.constants import (EXCLUDED_DIRS, EXCLUDED_DIRS_NORMALIZED, TDOC_PATTERN_STR, TDOC_SUBDIRS, TDOC_SUBDIRS_NORMALIZED, ) TDOC_PATTERN_STR, TDOC_SUBDIRS, TDOC_SUBDIRS_NORMALIZED) if TYPE_CHECKING: # pragma: no cover - helpful for type checkers only pass Loading src/tdoc_crawler/crawlers/portal.py +13 −10 Original line number Diff line number Diff line Loading @@ -9,7 +9,8 @@ from typing import TYPE_CHECKING import requests from bs4 import BeautifulSoup from tdoc_crawler.crawlers.constants import LOGIN_URL, PORTAL_BASE_URL, TDOC_VIEW_URL from tdoc_crawler.crawlers.constants import (LOGIN_URL, PORTAL_BASE_URL, TDOC_VIEW_URL) if TYPE_CHECKING: from tdoc_crawler.models.base import PortalCredentials Loading Loading @@ -42,7 +43,8 @@ class PortalSession: self._authenticated = False # Set browser-like headers to avoid 403 Forbidden self.session.headers.update({ self.session.headers.update( { "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36", "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8", "Accept-Language": "en-US,en;q=0.5", Loading @@ -50,7 +52,8 @@ class PortalSession: "DNT": "1", "Connection": "keep-alive", "Upgrade-Insecure-Requests": "1", }) } ) def __enter__(self) -> PortalSession: """Enter context manager.""" Loading Loading
src/tdoc_crawler/cli/app.py +10 −19 Original line number Diff line number Diff line Loading @@ -13,31 +13,22 @@ import typer import yaml from dotenv import load_dotenv from rich.console import Console from rich.progress import BarColumn, MofNCompleteColumn, Progress, SpinnerColumn, TextColumn from rich.progress import (BarColumn, MofNCompleteColumn, Progress, SpinnerColumn, TextColumn) from rich.table import Table from tdoc_crawler.crawlers import MeetingCrawler, TDocCrawler from tdoc_crawler.database import TDocDatabase from tdoc_crawler.models import ( MeetingCrawlConfig, MeetingQueryConfig, OutputFormat, QueryConfig, SortOrder, TDocCrawlConfig, ) from tdoc_crawler.models import (MeetingCrawlConfig, MeetingQueryConfig, OutputFormat, QueryConfig, SortOrder, TDocCrawlConfig) from .fetching import maybe_fetch_missing_tdocs from .helpers import ( build_limits, database_path, launch_file, parse_subgroups, parse_working_groups, prepare_tdoc_file, resolve_credentials, ) from .printing import meeting_to_dict, print_meeting_table, print_tdoc_table, tdoc_to_dict from .helpers import (build_limits, database_path, launch_file, parse_subgroups, parse_working_groups, prepare_tdoc_file, resolve_credentials) from .printing import (meeting_to_dict, print_meeting_table, print_tdoc_table, tdoc_to_dict) load_dotenv() Loading
src/tdoc_crawler/cli/helpers.py +4 −7 Original line number Diff line number Diff line Loading @@ -18,14 +18,11 @@ from urllib.request import urlopen import typer from rich.console import Console from tdoc_crawler.crawlers import normalize_subgroup_alias, normalize_working_group_alias from tdoc_crawler.crawlers import (normalize_subgroup_alias, normalize_working_group_alias) from tdoc_crawler.database import TDocDatabase from tdoc_crawler.models import ( CrawlLimits, PortalCredentials, TDocMetadata, WorkingGroup, ) from tdoc_crawler.models import (CrawlLimits, PortalCredentials, TDocMetadata, WorkingGroup) console = Console() _logger = logging.getLogger(__name__) Loading
src/tdoc_crawler/crawlers/meetings.py +6 −7 Original line number Diff line number Diff line Loading @@ -13,14 +13,13 @@ from urllib.parse import urljoin import requests from bs4 import BeautifulSoup, Tag from tdoc_crawler.crawlers.constants import ( DATE_PATTERN, from tdoc_crawler.crawlers.constants import (DATE_PATTERN, MEETING_CODE_REGISTRY, MEETINGS_BASE_URL, PORTAL_BASE_URL, ) PORTAL_BASE_URL) from tdoc_crawler.database import TDocDatabase from tdoc_crawler.models import CrawlLimits, MeetingCrawlConfig, MeetingMetadata, WorkingGroup from tdoc_crawler.models import (CrawlLimits, MeetingCrawlConfig, MeetingMetadata, WorkingGroup) logger = logging.getLogger(__name__) Loading
src/tdoc_crawler/crawlers/parallel.py +4 −7 Original line number Diff line number Diff line Loading @@ -15,13 +15,10 @@ from bs4 import BeautifulSoup from requests.adapters import HTTPAdapter from urllib3.util.retry import Retry from tdoc_crawler.crawlers.constants import ( EXCLUDED_DIRS, from tdoc_crawler.crawlers.constants import (EXCLUDED_DIRS, EXCLUDED_DIRS_NORMALIZED, TDOC_PATTERN_STR, TDOC_SUBDIRS, TDOC_SUBDIRS_NORMALIZED, ) TDOC_PATTERN_STR, TDOC_SUBDIRS, TDOC_SUBDIRS_NORMALIZED) if TYPE_CHECKING: # pragma: no cover - helpful for type checkers only pass Loading
src/tdoc_crawler/crawlers/portal.py +13 −10 Original line number Diff line number Diff line Loading @@ -9,7 +9,8 @@ from typing import TYPE_CHECKING import requests from bs4 import BeautifulSoup from tdoc_crawler.crawlers.constants import LOGIN_URL, PORTAL_BASE_URL, TDOC_VIEW_URL from tdoc_crawler.crawlers.constants import (LOGIN_URL, PORTAL_BASE_URL, TDOC_VIEW_URL) if TYPE_CHECKING: from tdoc_crawler.models.base import PortalCredentials Loading Loading @@ -42,7 +43,8 @@ class PortalSession: self._authenticated = False # Set browser-like headers to avoid 403 Forbidden self.session.headers.update({ self.session.headers.update( { "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36", "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8", "Accept-Language": "en-US,en;q=0.5", Loading @@ -50,7 +52,8 @@ class PortalSession: "DNT": "1", "Connection": "keep-alive", "Upgrade-Insecure-Requests": "1", }) } ) def __enter__(self) -> PortalSession: """Enter context manager.""" Loading