Loading pyproject.toml +4 −0 Original line number Diff line number Diff line Loading @@ -64,6 +64,10 @@ tdoc-crawler = "tdoc_crawler.cli:app" [tool.pytest.ini_options] testpaths = ["tests"] pythonpath = ["src"] # Suppress Pydantic deprecation warning from pydantic-sqlite library (external dependency) filterwarnings = [ "ignore:.*Accessing the 'model_fields' attribute on the instance is deprecated.*:DeprecationWarning", ] [tool.coverage.report] Loading src/tdoc_crawler/cli/app.py +4 −10 Original line number Diff line number Diff line Loading @@ -13,22 +13,16 @@ import typer import yaml from dotenv import load_dotenv from rich.console import Console from rich.progress import (BarColumn, MofNCompleteColumn, Progress, SpinnerColumn, TextColumn) from rich.progress import BarColumn, MofNCompleteColumn, Progress, SpinnerColumn, TextColumn from rich.table import Table from tdoc_crawler.crawlers import MeetingCrawler, TDocCrawler from tdoc_crawler.database import TDocDatabase from tdoc_crawler.models import (MeetingCrawlConfig, MeetingQueryConfig, OutputFormat, QueryConfig, SortOrder, TDocCrawlConfig) from tdoc_crawler.models import MeetingCrawlConfig, MeetingQueryConfig, OutputFormat, QueryConfig, SortOrder, TDocCrawlConfig from .fetching import maybe_fetch_missing_tdocs from .helpers import (build_limits, database_path, launch_file, parse_subgroups, parse_working_groups, prepare_tdoc_file, resolve_credentials) from .printing import (meeting_to_dict, print_meeting_table, print_tdoc_table, tdoc_to_dict) from .helpers import build_limits, database_path, launch_file, parse_subgroups, parse_working_groups, prepare_tdoc_file, resolve_credentials from .printing import meeting_to_dict, print_meeting_table, print_tdoc_table, tdoc_to_dict load_dotenv() Loading src/tdoc_crawler/cli/helpers.py +4 −13 Original line number Diff line number Diff line Loading @@ -18,12 +18,9 @@ from urllib.request import urlopen import typer from rich.console import Console from tdoc_crawler.crawlers import (normalize_subgroup_alias, normalize_working_group_alias) from tdoc_crawler.crawlers import normalize_subgroup_alias, normalize_working_group_alias from tdoc_crawler.database import TDocDatabase from tdoc_crawler.models import (CrawlLimits, HttpCacheConfig, MeetingQueryConfig, PortalCredentials, SortOrder, TDocMetadata, WorkingGroup) from tdoc_crawler.models import CrawlLimits, HttpCacheConfig, MeetingQueryConfig, PortalCredentials, SortOrder, TDocMetadata, WorkingGroup console = Console() _logger = logging.getLogger(__name__) Loading Loading @@ -344,19 +341,13 @@ def resolve_http_cache_config(cache_ttl: int | None = None, cache_refresh_on_acc ttl = cache_ttl else: env_ttl = os.getenv("HTTP_CACHE_TTL") if env_ttl: ttl = int(env_ttl) else: ttl = 7200 # default TTL of 2 hours ttl = int(env_ttl) if env_ttl else 7200 # Handle refresh on access - check CLI param, then env var, then default if cache_refresh_on_access is not None: refresh_on_access = cache_refresh_on_access else: env_refresh = os.getenv("HTTP_CACHE_REFRESH_ON_ACCESS", "").lower() if env_refresh: refresh_on_access = env_refresh in ("true", "1", "yes", "on", "t", "y") else: refresh_on_access = True # default to True refresh_on_access = env_refresh in ("true", "1", "yes", "on", "t", "y") if env_refresh else True return HttpCacheConfig(ttl=ttl, refresh_ttl_on_access=refresh_on_access) src/tdoc_crawler/crawlers/hybrid.py +2 −5 Original line number Diff line number Diff line Loading @@ -9,12 +9,9 @@ from collections.abc import Callable from dataclasses import dataclass from tdoc_crawler.crawlers.executor_adapter import Runner from tdoc_crawler.crawlers.parallel import ( fetch_meeting_document_list_subinterpreter, fetch_meeting_tdocs) from tdoc_crawler.crawlers.parallel import fetch_meeting_document_list_subinterpreter, fetch_meeting_tdocs from tdoc_crawler.database import TDocDatabase from tdoc_crawler.models import (CrawlLimits, MeetingMetadata, MeetingQueryConfig, SortOrder, TDocCrawlConfig, TDocMetadata, WorkingGroup) from tdoc_crawler.models import CrawlLimits, MeetingMetadata, MeetingQueryConfig, SortOrder, TDocCrawlConfig, TDocMetadata, WorkingGroup logger = logging.getLogger(__name__) Loading src/tdoc_crawler/crawlers/meetings.py +2 −6 Original line number Diff line number Diff line Loading @@ -12,14 +12,10 @@ from urllib.parse import urljoin from bs4 import BeautifulSoup, Tag from tdoc_crawler.crawlers.constants import (DATE_PATTERN, MEETING_CODE_REGISTRY, MEETINGS_BASE_URL, PORTAL_BASE_URL) from tdoc_crawler.crawlers.constants import DATE_PATTERN, MEETING_CODE_REGISTRY, MEETINGS_BASE_URL, PORTAL_BASE_URL from tdoc_crawler.database import TDocDatabase from tdoc_crawler.http_client import create_cached_session from tdoc_crawler.models import (CrawlLimits, MeetingCrawlConfig, MeetingMetadata, WorkingGroup) from tdoc_crawler.models import CrawlLimits, MeetingCrawlConfig, MeetingMetadata, WorkingGroup logger = logging.getLogger(__name__) Loading Loading
pyproject.toml +4 −0 Original line number Diff line number Diff line Loading @@ -64,6 +64,10 @@ tdoc-crawler = "tdoc_crawler.cli:app" [tool.pytest.ini_options] testpaths = ["tests"] pythonpath = ["src"] # Suppress Pydantic deprecation warning from pydantic-sqlite library (external dependency) filterwarnings = [ "ignore:.*Accessing the 'model_fields' attribute on the instance is deprecated.*:DeprecationWarning", ] [tool.coverage.report] Loading
src/tdoc_crawler/cli/app.py +4 −10 Original line number Diff line number Diff line Loading @@ -13,22 +13,16 @@ import typer import yaml from dotenv import load_dotenv from rich.console import Console from rich.progress import (BarColumn, MofNCompleteColumn, Progress, SpinnerColumn, TextColumn) from rich.progress import BarColumn, MofNCompleteColumn, Progress, SpinnerColumn, TextColumn from rich.table import Table from tdoc_crawler.crawlers import MeetingCrawler, TDocCrawler from tdoc_crawler.database import TDocDatabase from tdoc_crawler.models import (MeetingCrawlConfig, MeetingQueryConfig, OutputFormat, QueryConfig, SortOrder, TDocCrawlConfig) from tdoc_crawler.models import MeetingCrawlConfig, MeetingQueryConfig, OutputFormat, QueryConfig, SortOrder, TDocCrawlConfig from .fetching import maybe_fetch_missing_tdocs from .helpers import (build_limits, database_path, launch_file, parse_subgroups, parse_working_groups, prepare_tdoc_file, resolve_credentials) from .printing import (meeting_to_dict, print_meeting_table, print_tdoc_table, tdoc_to_dict) from .helpers import build_limits, database_path, launch_file, parse_subgroups, parse_working_groups, prepare_tdoc_file, resolve_credentials from .printing import meeting_to_dict, print_meeting_table, print_tdoc_table, tdoc_to_dict load_dotenv() Loading
src/tdoc_crawler/cli/helpers.py +4 −13 Original line number Diff line number Diff line Loading @@ -18,12 +18,9 @@ from urllib.request import urlopen import typer from rich.console import Console from tdoc_crawler.crawlers import (normalize_subgroup_alias, normalize_working_group_alias) from tdoc_crawler.crawlers import normalize_subgroup_alias, normalize_working_group_alias from tdoc_crawler.database import TDocDatabase from tdoc_crawler.models import (CrawlLimits, HttpCacheConfig, MeetingQueryConfig, PortalCredentials, SortOrder, TDocMetadata, WorkingGroup) from tdoc_crawler.models import CrawlLimits, HttpCacheConfig, MeetingQueryConfig, PortalCredentials, SortOrder, TDocMetadata, WorkingGroup console = Console() _logger = logging.getLogger(__name__) Loading Loading @@ -344,19 +341,13 @@ def resolve_http_cache_config(cache_ttl: int | None = None, cache_refresh_on_acc ttl = cache_ttl else: env_ttl = os.getenv("HTTP_CACHE_TTL") if env_ttl: ttl = int(env_ttl) else: ttl = 7200 # default TTL of 2 hours ttl = int(env_ttl) if env_ttl else 7200 # Handle refresh on access - check CLI param, then env var, then default if cache_refresh_on_access is not None: refresh_on_access = cache_refresh_on_access else: env_refresh = os.getenv("HTTP_CACHE_REFRESH_ON_ACCESS", "").lower() if env_refresh: refresh_on_access = env_refresh in ("true", "1", "yes", "on", "t", "y") else: refresh_on_access = True # default to True refresh_on_access = env_refresh in ("true", "1", "yes", "on", "t", "y") if env_refresh else True return HttpCacheConfig(ttl=ttl, refresh_ttl_on_access=refresh_on_access)
src/tdoc_crawler/crawlers/hybrid.py +2 −5 Original line number Diff line number Diff line Loading @@ -9,12 +9,9 @@ from collections.abc import Callable from dataclasses import dataclass from tdoc_crawler.crawlers.executor_adapter import Runner from tdoc_crawler.crawlers.parallel import ( fetch_meeting_document_list_subinterpreter, fetch_meeting_tdocs) from tdoc_crawler.crawlers.parallel import fetch_meeting_document_list_subinterpreter, fetch_meeting_tdocs from tdoc_crawler.database import TDocDatabase from tdoc_crawler.models import (CrawlLimits, MeetingMetadata, MeetingQueryConfig, SortOrder, TDocCrawlConfig, TDocMetadata, WorkingGroup) from tdoc_crawler.models import CrawlLimits, MeetingMetadata, MeetingQueryConfig, SortOrder, TDocCrawlConfig, TDocMetadata, WorkingGroup logger = logging.getLogger(__name__) Loading
src/tdoc_crawler/crawlers/meetings.py +2 −6 Original line number Diff line number Diff line Loading @@ -12,14 +12,10 @@ from urllib.parse import urljoin from bs4 import BeautifulSoup, Tag from tdoc_crawler.crawlers.constants import (DATE_PATTERN, MEETING_CODE_REGISTRY, MEETINGS_BASE_URL, PORTAL_BASE_URL) from tdoc_crawler.crawlers.constants import DATE_PATTERN, MEETING_CODE_REGISTRY, MEETINGS_BASE_URL, PORTAL_BASE_URL from tdoc_crawler.database import TDocDatabase from tdoc_crawler.http_client import create_cached_session from tdoc_crawler.models import (CrawlLimits, MeetingCrawlConfig, MeetingMetadata, WorkingGroup) from tdoc_crawler.models import CrawlLimits, MeetingCrawlConfig, MeetingMetadata, WorkingGroup logger = logging.getLogger(__name__) Loading