Commit e7ca255a authored by Jan Reimes
Browse files

♻️ refactor(tdocs): remove redundant version parsing properties

parent a6e15087
Loading
Loading
Loading
Loading
+0 −12
Original line number Diff line number Diff line
@@ -76,17 +76,5 @@ class ConfigService:
            self._http_cache = HttpCacheConfig.resolve_http_cache_config(cache_file=self.cache_manager.http_cache_file)
        return self._http_cache

    @classmethod
    def from_env(cls, cache_manager_name: str | None = None) -> ConfigService:
        """Alternate constructor that builds the service from its defaults.

        The only thing done here is forwarding ``cache_manager_name`` to the
        class constructor.
        NOTE(review): any environment-variable lookup presumably happens
        inside the constructor or the config objects it creates -- confirm.

        Args:
            cache_manager_name: Optional name of the CacheManager to use;
                passed through unchanged (including ``None``).

        Returns:
            A newly constructed instance of ``cls``.
        """
        service = cls(cache_manager_name=cache_manager_name)
        return service


__all__ = ["ConfigService"]
+0 −7
Original line number Diff line number Diff line
@@ -59,13 +59,6 @@ class DocDatabase:
    # ------------------------------------------------------------------
    # Core accessors and utilities
    # ------------------------------------------------------------------
    @property
    def connection(self) -> AsyncDatabase:
        """Return the underlying AsyncDatabase handle.

        Raises:
            DatabaseError: Built via ``DatabaseError.connection_not_open()``
                when no database handle is currently set.
        """
        database = self._database
        if database is not None:
            return database
        # No handle stored on the instance: report it rather than return None.
        raise DatabaseError.connection_not_open()

    async def clear_tdocs(self) -> int:
        """Clear all TDoc records from database.

+1 −13
Original line number Diff line number Diff line
@@ -15,7 +15,7 @@ from pydantic import BaseModel, Field, field_validator
from rich.console import Console, ConsoleOptions, RenderResult
from rich.text import Text

from tdoc_crawler.utils.parse import SpecificationVersionNumber, parse_spec_version, parse_spec_version_nbr
from tdoc_crawler.utils.parse import SpecificationVersionNumber, parse_spec_version_nbr


class Specification(BaseModel):
@@ -30,13 +30,6 @@ class Specification(BaseModel):
    series: str
    latest_version: SpecificationVersionNumber | None = None

    @property
    def latest_version_number(self) -> Version | None:
        """Parse ``latest_version`` into a comparable Version.

        Returns:
            ``None`` when ``latest_version`` is unset, otherwise the result of
            ``parse_spec_version`` applied to it.
        """
        raw_version = self.latest_version
        return None if raw_version is None else parse_spec_version(raw_version)

    def __rich_console__(self, console: Console, options: ConsoleOptions) -> RenderResult:
        _ = (console, options)
        yield Text(f"{self.spec_number} - {self.title}")
@@ -79,11 +72,6 @@ class SpecificationVersion(BaseModel):
    file_name: str
    source_name: str

    @property
    def version_number(self) -> Version:
        """Parse ``version`` with ``parse_spec_version`` into a comparable Version."""
        parsed = parse_spec_version(self.version)
        return parsed

    @field_validator("version", mode="before")
    @classmethod
    def _normalize_version(cls, value: Version | SpecificationVersionNumber) -> SpecificationVersionNumber:
+2 −58
Original line number Diff line number Diff line
@@ -6,7 +6,6 @@ from collections.abc import Iterable
from datetime import date, datetime
from enum import StrEnum, auto

import requests
from packaging.version import Version
from pydantic import BaseModel, Field, field_serializer, field_validator

@@ -19,9 +18,9 @@ from tdoc_crawler.models.base import (
)
from tdoc_crawler.models.crawl_limits import CrawlLimits, _new_crawl_limits
from tdoc_crawler.models.working_groups import WorkingGroup
from tdoc_crawler.tdocs.utils import normalize_tdoc_id, normalize_tdoc_ids
from tdoc_crawler.utils.misc import utc_now
from tdoc_crawler.utils.parse import AgendaItemNumber, parse_agenda_item_nbr, parse_agenda_item_version
from tdoc_crawler.utils.normalization import normalize_tdoc_id, normalize_tdoc_ids
from tdoc_crawler.utils.parse import AgendaItemNumber, parse_agenda_item_nbr

_logger = get_logger(__name__)

@@ -98,11 +97,6 @@ class TDocMetadata(BaseModel):
    agenda_item_text: str = Field("Unknown", description="Associated agenda item (text identifier)")
    status: str | None = Field(None, description="Document status as reported by the portal")

    @property
    def agenda_item_version(self) -> Version:
        """Parse ``agenda_item_nbr`` into a comparable Version.

        Delegates to ``parse_agenda_item_version``.
        """
        parsed = parse_agenda_item_version(self.agenda_item_nbr)
        return parsed

    @property
    def is_valid(self) -> bool:
        """Check if TDoc metadata is valid and downloadable.
@@ -122,56 +116,6 @@ class TDocMetadata(BaseModel):
        # Check for placeholder/corrupt URL patterns
        return not ("/.../" in self.url or "..." in self.url)

    @property
    def has_valid_url(self) -> bool:
        """Alias for ``is_valid``: report whether the URL is present and valid."""
        validity = self.is_valid
        return validity

    def validate_url_accessible(self, timeout: int = 10) -> bool:
        """Probe the document URL with an HTTP HEAD request.

        The basic ``is_valid`` check runs first; only when it passes is a
        HEAD request (following redirects) sent, so no response body is
        downloaded.

        Args:
            timeout: Request timeout in seconds (default: 10)

        Returns:
            True when the response status code is below 400, False otherwise
            (basic validation failed, URL missing, status >= 400, or any
            ``requests`` exception).

        Note:
            Request failures and error statuses are logged as warnings;
            nothing is raised for ``requests`` errors.
        """
        if not self.is_valid:
            _logger.debug(f"URL validation skipped for {self.tdoc_id}: basic validation failed")
            return False

        url = self.url
        if url is None:
            return False

        try:
            # HEAD keeps the probe lightweight: headers only, no file download.
            head_response = requests.head(url, timeout=timeout, allow_redirects=True)

            if head_response.status_code >= 400:
                _logger.warning(f"URL returned error for {self.tdoc_id}: {url} (status {head_response.status_code})")
                return False

            # Anything below 400 (2xx/3xx) counts as reachable.
            _logger.debug(f"URL accessible for {self.tdoc_id}: {url} (status {head_response.status_code})")
            return True
        except requests.exceptions.Timeout:
            _logger.warning(f"URL validation timeout for {self.tdoc_id}: {url}")
        except requests.exceptions.ConnectionError as exc:
            _logger.warning(f"URL validation connection error for {self.tdoc_id}: {exc}")
        except requests.exceptions.RequestException as exc:
            _logger.warning(f"URL validation failed for {self.tdoc_id}: {exc}")

        # Reached only after one of the handlers above logged a failure.
        return False

    @classmethod
    @field_serializer("agenda_item_nbr")
    def _serialize_agenda_item_nbr(cls, value: AgendaItemNumber) -> str: