Commit 56fa82a0 authored by Jan Reimes
Browse files

Eliminate duplicate TDocMetadata — use single Oxyde DB model

The Pydantic TDocMetadata in tdocs.models is deleted. All code now
imports TDocMetadata from database.oxyde_models directly.

Root cause of the AttributeError: the crawl pipeline produced Pydantic
instances (which have no tbid/file_size fields) and passed them to the DB
layer, which expects the Oxyde model (which has tbid/file_size). The bug was
having two models for the same concept, not the missing getattr guards.

Changes:
- Delete Pydantic TDocMetadata class from tdocs/models.py
- Update 11 source modules to import from database.oxyde_models
- Simplify extraction/fetch_tdoc.py (removed manual field-by-field
  conversion between the two models)
- Update 6 test files: agenda_item_nbr is now a str, and the validators were removed
- Keep TDocStatus enum in tdocs/models.py (used by checkout logic)
parent 83639502
Loading
Loading
Loading
Loading
+1 −1
Original line number Diff line number Diff line
@@ -7,6 +7,7 @@ from datetime import date
from typing import Any

from tdoc_crawler.cli.formatting import TableColumnSpec, print_structured_output
from tdoc_crawler.database.oxyde_models import TDocMetadata
from tdoc_crawler.database.specs import SpecCrawlResult
from tdoc_crawler.logging import get_console
from tdoc_crawler.meetings.models import MeetingMetadata
@@ -14,7 +15,6 @@ from tdoc_crawler.models.base import OutputFormat
from tdoc_crawler.models.subworking_groups import SUBTB_INDEX
from tdoc_crawler.models.working_groups import WorkingGroup
from tdoc_crawler.specs.models import SpecQueryResult
from tdoc_crawler.tdocs.models import TDocMetadata

console = get_console()

+1 −1
Original line number Diff line number Diff line
@@ -22,11 +22,11 @@ import niquests as requests
from tdoc_crawler.config.settings import HttpConfig
from tdoc_crawler.constants.urls import LOGIN_URL, PORTAL_BASE_URL, TDOC_DOWNLOAD_URL, TDOC_VIEW_URL
from tdoc_crawler.credentials import resolve_credentials
from tdoc_crawler.database.oxyde_models import TDocMetadata
from tdoc_crawler.http_client import create_cached_session, resolve_ssl_verify
from tdoc_crawler.logging import get_logger
from tdoc_crawler.models.base import PortalCredentials
from tdoc_crawler.parsers.portal import PortalParsingError, parse_tdoc_portal_page
from tdoc_crawler.tdocs.models import TDocMetadata

logger = get_logger(__name__)

+4 −16
Original line number Diff line number Diff line
@@ -7,7 +7,7 @@ from pathlib import Path

from tdoc_crawler.config import resolve_cache_manager
from tdoc_crawler.config.settings import PathConfig
from tdoc_crawler.database.oxyde_models import TDocMetadata as TDocRecord
from tdoc_crawler.database.oxyde_models import TDocMetadata
from tdoc_crawler.database.tdocs import TDocDatabase
from tdoc_crawler.http_client import create_cached_session
from tdoc_crawler.models.workspaces import TDocNotFoundError
@@ -54,7 +54,7 @@ def fetch_tdoc_files(document_id: str, force_download: bool = False) -> TDocFile
    """
    normalized_id = normalize_tdoc_id(document_id)

    async def _resolve_metadata() -> TDocRecord | None:
    async def _resolve_metadata() -> TDocMetadata | None:
        manager = resolve_cache_manager()
        async with TDocDatabase(manager.db_file) as db:
            # 1. Check database
@@ -65,21 +65,9 @@ def fetch_tdoc_files(document_id: str, force_download: bool = False) -> TDocFile
            # 2. Resolve via WhatTheSpec
            metadata = resolve_via_whatthespec(normalized_id)
            if metadata:
                # Convert Pydantic to Oxyde record
                new_record = TDocRecord(
                    tdoc_id=metadata.tdoc_id,
                    meeting_id=metadata.meeting_id,
                    title=metadata.title,
                    url=metadata.url,
                    source=metadata.source,
                    agenda_item_nbr=metadata.agenda_item_nbr,
                    agenda_item_text=metadata.agenda_item_text,
                    status=metadata.status,
                    is_withdrawn=metadata.is_withdrawn,
                )
                # 3. Put into database
                await db.upsert_tdoc(new_record)
                return new_record
                await db.upsert_tdoc(metadata)
                return metadata
        return None

    metadata = run_async(_resolve_metadata())
+1 −1
Original line number Diff line number Diff line
@@ -7,9 +7,9 @@ from typing import Any
import niquests as requests
from bs4 import BeautifulSoup

from tdoc_crawler.database.oxyde_models import TDocMetadata
from tdoc_crawler.http_client import create_cached_session
from tdoc_crawler.logging import get_logger
from tdoc_crawler.tdocs.models import TDocMetadata
from tdoc_crawler.utils.parse import parse_agenda_item_nbr

logger = get_logger(__name__)
+5 −91
Original line number Diff line number Diff line
@@ -6,22 +6,16 @@ from collections.abc import Iterable
from datetime import date, datetime
from enum import StrEnum, auto

from packaging.version import Version
from pydantic import BaseModel, Field, field_serializer, field_validator
from pydantic import BaseModel, Field, field_validator

from tdoc_crawler.config.settings import HttpConfig
from tdoc_crawler.logging import get_logger
from tdoc_crawler.meetings.utils import normalize_subgroup_alias, normalize_working_group_alias
from tdoc_crawler.models.base import (
    OutputFormat,
    SortOrder,
)
from tdoc_crawler.models.working_groups import WorkingGroup
from tdoc_crawler.utils.misc import utc_now
from tdoc_crawler.utils.normalization import normalize_tdoc_id, normalize_tdoc_ids
from tdoc_crawler.utils.parse import AgendaItemNumber, parse_agenda_item_nbr

_logger = get_logger(__name__)
from tdoc_crawler.utils.normalization import normalize_tdoc_ids


class TDocStatus(StrEnum):
@@ -78,88 +72,9 @@ class TDocStatus(StrEnum):
        return None


class TDocMetadata(BaseModel):
    """Persistent representation for a TDoc entry.

    Pydantic model that bundles the metadata fields read from the portal with
    bookkeeping fields used for local database management. Field validators
    normalize ``tdoc_id``, ``agenda_item_nbr`` and ``status`` on construction.
    """

    # Mandatory metadata fields from portal
    tdoc_id: str = Field(..., description="Unique TDoc identifier (case-normalized)")
    meeting_id: int = Field(..., description="Foreign key reference to the meetings table")
    title: str = Field(..., description="Document title as published on the portal")
    url: str | None = Field(None, description="Full URL to the TDoc resource, None if not available")
    # NOTE(review): `source` and `contact` carry the same description text;
    # confirm what distinguishes the two fields.
    source: str = Field(..., description="Contact person or organization")
    contact: str = Field(..., description="Contact person or organization")

    # Fields with defaults ("unknown"/"Unknown"/None), except agenda_item_nbr which is required
    tdoc_type: str = Field("unknown", description="TDoc classification as reported by the portal")
    for_purpose: str = Field("unknown", description="Purpose of the contribution (agreement, information, etc.)")
    agenda_item_nbr: AgendaItemNumber = Field(..., description="Associated agenda item number (hierarchical identifier, e.g., 1.2.3)")
    agenda_item_text: str = Field("Unknown", description="Associated agenda item (text identifier)")
    status: str | None = Field(None, description="Document status as reported by the portal")

    # NOTE(review): @classmethod is applied *outside* @field_serializer here,
    # the reverse of the validators below (where @classmethod is innermost).
    # Confirm that pydantic actually registers this serializer in this order.
    @classmethod
    @field_serializer("agenda_item_nbr")
    def _serialize_agenda_item_nbr(cls, value: AgendaItemNumber) -> str:
        """Serialize the agenda item number for storage and JSON output.

        Returns the value unchanged.
        """
        return value

    @field_validator("agenda_item_nbr", mode="before")
    @classmethod
    def _validate_agenda_item_nbr(cls, value: Version | AgendaItemNumber | float | None) -> AgendaItemNumber:
        """Normalize the raw agenda item number by delegating to ``parse_agenda_item_nbr``."""
        return parse_agenda_item_nbr(value)

    # NOTE(review): same decorator order as the agenda item serializer above
    # (@classmethod outermost) -- confirm this serializer is registered.
    @classmethod
    @field_serializer("status")
    def _serialize_status(cls, value: TDocStatus | str | None) -> str | None:
        """Serialize the status for database storage.

        ``None`` is passed through, a ``TDocStatus`` member is replaced by its
        ``.value``, and any other value is returned unchanged.
        """
        if value is None:
            return None
        if isinstance(value, TDocStatus):
            return value.value
        return value

    @field_validator("status", mode="before")
    @classmethod
    def _validate_status(cls, value: str | TDocStatus | None) -> str | None:
        """Validate and normalize a status value against the TDocStatus enum.

        Args:
            value: Status string, enum member, or None

        Returns:
            The ``.value`` of the matching TDocStatus member, or None when the
            input is None or ``TDocStatus._missing_`` finds no match.
        """
        if value is None:
            return None

        if isinstance(value, TDocStatus):
            return value.value

        # Use _missing_ directly for case-insensitive lookup; return None for unknown statuses
        status = TDocStatus._missing_(value)
        if status is None:
            return None
        return status.value

    @field_validator("tdoc_id")
    @classmethod
    def _normalize_tdoc_id(cls, value: str) -> str:
        """Normalize the identifier via ``normalize_tdoc_id`` (presumably uppercase and trimmed)."""
        return normalize_tdoc_id(value)

    # Optional metadata fields (from portal or determined otherwise)
    is_revision_of: str | None = Field(None, description="Reference to a previous TDoc version")
    file_size: int | None = Field(None, description="File size in bytes, when available/downloaded")

    # fields for local database management

    date_created: datetime | None = Field(None, description="Original creation timestamp when provided")
    # Both timestamps below default to utc_now() at instantiation time.
    date_retrieved: datetime = Field(default_factory=utc_now, description="Timestamp of the last retrieval")
    date_updated: datetime = Field(default_factory=utc_now, description="Timestamp of the last database update")
    validated: bool = Field(False, description="Flag indicating successful portal validation")
    validation_failed: bool = Field(False, description="Flag indicating cached failed validation")
# TDocMetadata is defined in tdoc_crawler.database.oxyde_models.
# Import it from there directly:
#   from tdoc_crawler.database.oxyde_models import TDocMetadata


class TDocCrawlConfig(BaseModel):
@@ -266,7 +181,6 @@ class TDocQueryConfig(BaseModel):

__all__ = [
    "TDocCrawlConfig",
    "TDocMetadata",
    "TDocQueryConfig",
    "TDocStatus",
]
Loading