Loading src/tdoc_crawler/config/service.py +0 −12 Original line number Diff line number Diff line Loading @@ -76,17 +76,5 @@ class ConfigService: self._http_cache = HttpCacheConfig.resolve_http_cache_config(cache_file=self.cache_manager.http_cache_file) return self._http_cache @classmethod def from_env(cls, cache_manager_name: str | None = None) -> ConfigService: """Create ConfigService loading settings from environment variables. Args: cache_manager_name: Optional name for the CacheManager to use. Returns: ConfigService instance configured from the environment. """ return cls(cache_manager_name=cache_manager_name) __all__ = ["ConfigService"] src/tdoc_crawler/database/base.py +0 −7 Original line number Diff line number Diff line Loading @@ -59,13 +59,6 @@ class DocDatabase: # ------------------------------------------------------------------ # Core accessors and utilities # ------------------------------------------------------------------ @property def connection(self) -> AsyncDatabase: """Expose the underlying AsyncDatabase instance (read-only).""" if self._database is None: raise DatabaseError.connection_not_open() return self._database async def clear_tdocs(self) -> int: """Clear all TDoc records from database. Loading src/tdoc_crawler/specs/models.py +1 −13 Original line number Diff line number Diff line Loading @@ -15,7 +15,7 @@ from pydantic import BaseModel, Field, field_validator from rich.console import Console, ConsoleOptions, RenderResult from rich.text import Text from tdoc_crawler.utils.parse import SpecificationVersionNumber, parse_spec_version, parse_spec_version_nbr from tdoc_crawler.utils.parse import SpecificationVersionNumber, parse_spec_version_nbr class Specification(BaseModel): Loading @@ -30,13 +30,6 @@ class Specification(BaseModel): series: str latest_version: SpecificationVersionNumber | None = None @property def latest_version_number(self) -> Version | None: """Return latest version as comparable Version object.""" if self.latest_version is None: return None return parse_spec_version(self.latest_version) def __rich_console__(self, console: Console, options: ConsoleOptions) -> RenderResult: _ = (console, options) yield Text(f"{self.spec_number} - {self.title}") Loading Loading @@ -79,11 +72,6 @@ class SpecificationVersion(BaseModel): file_name: str source_name: str @property def version_number(self) -> Version: """Return version as comparable Version object.""" return parse_spec_version(self.version) @field_validator("version", mode="before") @classmethod def _normalize_version(cls, value: Version | SpecificationVersionNumber) -> SpecificationVersionNumber: Loading src/tdoc_crawler/tdocs/models.py +2 −58 Original line number Diff line number Diff line Loading @@ -6,7 +6,6 @@ from collections.abc import Iterable from datetime import date, datetime from enum import StrEnum, auto import requests from packaging.version import Version from pydantic import BaseModel, Field, field_serializer, field_validator Loading @@ -19,9 +18,9 @@ from tdoc_crawler.models.base import ( ) from tdoc_crawler.models.crawl_limits import CrawlLimits, _new_crawl_limits from tdoc_crawler.models.working_groups import WorkingGroup from tdoc_crawler.tdocs.utils import normalize_tdoc_id, normalize_tdoc_ids from tdoc_crawler.utils.misc import utc_now from tdoc_crawler.utils.parse import AgendaItemNumber, parse_agenda_item_nbr, parse_agenda_item_version from tdoc_crawler.utils.normalization import normalize_tdoc_id, normalize_tdoc_ids from tdoc_crawler.utils.parse import AgendaItemNumber, parse_agenda_item_nbr _logger = get_logger(__name__) Loading Loading @@ -98,11 +97,6 @@ class TDocMetadata(BaseModel): agenda_item_text: str = Field("Unknown", description="Associated agenda item (text identifier)") status: str | None = Field(None, description="Document status as reported by the portal") @property def agenda_item_version(self) -> Version: """Return agenda item number as comparable Version object.""" return parse_agenda_item_version(self.agenda_item_nbr) @property def is_valid(self) -> bool: """Check if TDoc metadata is valid and downloadable. Loading @@ -122,56 +116,6 @@ class TDocMetadata(BaseModel): # Check for placeholder/corrupt URL patterns return not ("/.../" in self.url or "..." in self.url) @property def has_valid_url(self) -> bool: """Check if URL is present and valid (alias for is_valid).""" return self.is_valid def validate_url_accessible(self, timeout: int = 10) -> bool: """Validate that the URL is accessible via HTTP. Makes an HTTP HEAD request to check if the URL returns a successful status code (2xx or 3xx). This is a lightweight check that doesn't download the full file. Args: timeout: Request timeout in seconds (default: 10) Returns: True if URL is accessible (2xx/3xx status), False otherwise Note: Returns False for network errors, timeouts, or 4xx/5xx responses. Logs warnings for failed requests. """ if not self.is_valid: _logger.debug(f"URL validation skipped for {self.tdoc_id}: basic validation failed") return False url = self.url if url is None: return False is_accessible = False try: # Use HEAD request to check accessibility without downloading response = requests.head(url, timeout=timeout, allow_redirects=True) # Check for successful response (2xx or 3xx) if response.status_code < 400: _logger.debug(f"URL accessible for {self.tdoc_id}: {url} (status {response.status_code})") is_accessible = True else: _logger.warning(f"URL returned error for {self.tdoc_id}: {url} (status {response.status_code})") except requests.exceptions.Timeout: _logger.warning(f"URL validation timeout for {self.tdoc_id}: {url}") except requests.exceptions.ConnectionError as exc: _logger.warning(f"URL validation connection error for {self.tdoc_id}: {exc}") except requests.exceptions.RequestException as exc: _logger.warning(f"URL validation failed for {self.tdoc_id}: {exc}") return is_accessible @classmethod @field_serializer("agenda_item_nbr") def _serialize_agenda_item_nbr(cls, value: AgendaItemNumber) -> str: Loading Loading
src/tdoc_crawler/config/service.py +0 −12 Original line number Diff line number Diff line Loading @@ -76,17 +76,5 @@ class ConfigService: self._http_cache = HttpCacheConfig.resolve_http_cache_config(cache_file=self.cache_manager.http_cache_file) return self._http_cache @classmethod def from_env(cls, cache_manager_name: str | None = None) -> ConfigService: """Create ConfigService loading settings from environment variables. Args: cache_manager_name: Optional name for the CacheManager to use. Returns: ConfigService instance configured from the environment. """ return cls(cache_manager_name=cache_manager_name) __all__ = ["ConfigService"]
src/tdoc_crawler/database/base.py +0 −7 Original line number Diff line number Diff line Loading @@ -59,13 +59,6 @@ class DocDatabase: # ------------------------------------------------------------------ # Core accessors and utilities # ------------------------------------------------------------------ @property def connection(self) -> AsyncDatabase: """Expose the underlying AsyncDatabase instance (read-only).""" if self._database is None: raise DatabaseError.connection_not_open() return self._database async def clear_tdocs(self) -> int: """Clear all TDoc records from database. Loading
src/tdoc_crawler/specs/models.py +1 −13 Original line number Diff line number Diff line Loading @@ -15,7 +15,7 @@ from pydantic import BaseModel, Field, field_validator from rich.console import Console, ConsoleOptions, RenderResult from rich.text import Text from tdoc_crawler.utils.parse import SpecificationVersionNumber, parse_spec_version, parse_spec_version_nbr from tdoc_crawler.utils.parse import SpecificationVersionNumber, parse_spec_version_nbr class Specification(BaseModel): Loading @@ -30,13 +30,6 @@ class Specification(BaseModel): series: str latest_version: SpecificationVersionNumber | None = None @property def latest_version_number(self) -> Version | None: """Return latest version as comparable Version object.""" if self.latest_version is None: return None return parse_spec_version(self.latest_version) def __rich_console__(self, console: Console, options: ConsoleOptions) -> RenderResult: _ = (console, options) yield Text(f"{self.spec_number} - {self.title}") Loading Loading @@ -79,11 +72,6 @@ class SpecificationVersion(BaseModel): file_name: str source_name: str @property def version_number(self) -> Version: """Return version as comparable Version object.""" return parse_spec_version(self.version) @field_validator("version", mode="before") @classmethod def _normalize_version(cls, value: Version | SpecificationVersionNumber) -> SpecificationVersionNumber: Loading
src/tdoc_crawler/tdocs/models.py +2 −58 Original line number Diff line number Diff line Loading @@ -6,7 +6,6 @@ from collections.abc import Iterable from datetime import date, datetime from enum import StrEnum, auto import requests from packaging.version import Version from pydantic import BaseModel, Field, field_serializer, field_validator Loading @@ -19,9 +18,9 @@ from tdoc_crawler.models.base import ( ) from tdoc_crawler.models.crawl_limits import CrawlLimits, _new_crawl_limits from tdoc_crawler.models.working_groups import WorkingGroup from tdoc_crawler.tdocs.utils import normalize_tdoc_id, normalize_tdoc_ids from tdoc_crawler.utils.misc import utc_now from tdoc_crawler.utils.parse import AgendaItemNumber, parse_agenda_item_nbr, parse_agenda_item_version from tdoc_crawler.utils.normalization import normalize_tdoc_id, normalize_tdoc_ids from tdoc_crawler.utils.parse import AgendaItemNumber, parse_agenda_item_nbr _logger = get_logger(__name__) Loading Loading @@ -98,11 +97,6 @@ class TDocMetadata(BaseModel): agenda_item_text: str = Field("Unknown", description="Associated agenda item (text identifier)") status: str | None = Field(None, description="Document status as reported by the portal") @property def agenda_item_version(self) -> Version: """Return agenda item number as comparable Version object.""" return parse_agenda_item_version(self.agenda_item_nbr) @property def is_valid(self) -> bool: """Check if TDoc metadata is valid and downloadable. Loading @@ -122,56 +116,6 @@ class TDocMetadata(BaseModel): # Check for placeholder/corrupt URL patterns return not ("/.../" in self.url or "..." in self.url) @property def has_valid_url(self) -> bool: """Check if URL is present and valid (alias for is_valid).""" return self.is_valid def validate_url_accessible(self, timeout: int = 10) -> bool: """Validate that the URL is accessible via HTTP. Makes an HTTP HEAD request to check if the URL returns a successful status code (2xx or 3xx). This is a lightweight check that doesn't download the full file. Args: timeout: Request timeout in seconds (default: 10) Returns: True if URL is accessible (2xx/3xx status), False otherwise Note: Returns False for network errors, timeouts, or 4xx/5xx responses. Logs warnings for failed requests. """ if not self.is_valid: _logger.debug(f"URL validation skipped for {self.tdoc_id}: basic validation failed") return False url = self.url if url is None: return False is_accessible = False try: # Use HEAD request to check accessibility without downloading response = requests.head(url, timeout=timeout, allow_redirects=True) # Check for successful response (2xx or 3xx) if response.status_code < 400: _logger.debug(f"URL accessible for {self.tdoc_id}: {url} (status {response.status_code})") is_accessible = True else: _logger.warning(f"URL returned error for {self.tdoc_id}: {url} (status {response.status_code})") except requests.exceptions.Timeout: _logger.warning(f"URL validation timeout for {self.tdoc_id}: {url}") except requests.exceptions.ConnectionError as exc: _logger.warning(f"URL validation connection error for {self.tdoc_id}: {exc}") except requests.exceptions.RequestException as exc: _logger.warning(f"URL validation failed for {self.tdoc_id}: {exc}") return is_accessible @classmethod @field_serializer("agenda_item_nbr") def _serialize_agenda_item_nbr(cls, value: AgendaItemNumber) -> str: Loading