Commit e33d8cfa authored by Jan Reimes
Browse files

fix(parse): update agenda item number handling to use Version type

* Add packaging dependency for version handling.
* Modify agenda_item_nbr to use parse_agenda_item_nbr in TDocDatabase.
* Update agenda_item_nbr parsing in portal.py and doclist.py.
* Change agenda_item_nbr type to AgendaItemNumber in TDocMetadata.
parent f6c360b7
Loading
Loading
Loading
Loading
+1 −0
Original line number Diff line number Diff line
@@ -20,6 +20,7 @@ dependencies = [
    "beautifulsoup4>=4.14.2",
    "brotli>=1.2.0",
    "hishel>=1.1.8",
    "packaging>=25.0",
    "pandas>=3.0.0",
    "pydantic>=2.12.2",
    "pydantic-sqlite>=0.4.0",
+1 −1
Original line number Diff line number Diff line
@@ -117,7 +117,7 @@ def tdoc_to_dict(
    """
    data = result.model_dump(mode="json")

    # Add agenda_item_nbr as plain value (not Decimal for JSON)
    # Add agenda_item_nbr as plain value for JSON output
    data["agenda_item"] = str(result.agenda_item_nbr) if result.agenda_item_nbr is not None else None

    # Add short form of agenda item text
+2 −1
Original line number Diff line number Diff line
@@ -12,6 +12,7 @@ from tdoc_crawler.models import WorkingGroup
from tdoc_crawler.models.subworking_groups import SUBTB_INDEX
from tdoc_crawler.tdocs.models import TDocMetadata, TDocQueryConfig
from tdoc_crawler.utils.misc import utc_now
from tdoc_crawler.utils.parse import parse_agenda_item_nbr

_logger = get_logger(__name__)

@@ -187,7 +188,7 @@ class TDocDatabase(MeetingDatabase):
                        url=row[3],
                        source=row[4],
                        contact=row[5],
                        agenda_item_nbr=row[6],
                        agenda_item_nbr=parse_agenda_item_nbr(row[6]),
                    )
                return None
        except sqlite3.Error:
+2 −2
Original line number Diff line number Diff line
@@ -2,7 +2,6 @@

from __future__ import annotations

from decimal import Decimal
from typing import Any

import requests
@@ -11,6 +10,7 @@ from bs4 import BeautifulSoup
from tdoc_crawler.http_client import create_cached_session
from tdoc_crawler.logging import get_logger
from tdoc_crawler.tdocs.models import TDocMetadata
from tdoc_crawler.utils.parse import parse_agenda_item_nbr

logger = get_logger(__name__)

@@ -189,7 +189,7 @@ def parse_tdoc_portal_page(html: str, tdoc_id: str, url: str | None = None) -> T
        contact=str(metadata.get("contact") or ""),
        tdoc_type=str(metadata.get("tdoc_type") or "unknown"),
        for_purpose=str(metadata.get("for") or "unknown"),
        agenda_item_nbr=Decimal(str(agenda_item_value)),
        agenda_item_nbr=parse_agenda_item_nbr(agenda_item_value),
        agenda_item_text=str(metadata.get("agenda_item_text") or "Unknown"),
        status=metadata.get("status"),
        is_revision_of=metadata.get("is_revision_of"),
+20 −2
Original line number Diff line number Diff line
@@ -4,10 +4,10 @@ from __future__ import annotations

from collections.abc import Iterable
from datetime import date, datetime
from decimal import Decimal
from enum import StrEnum

import requests
from packaging.version import Version
from pydantic import BaseModel, Field, field_serializer, field_validator

from tdoc_crawler.logging import get_logger
@@ -20,6 +20,7 @@ from tdoc_crawler.models.crawl_limits import CrawlLimits, _new_crawl_limits
from tdoc_crawler.models.working_groups import WorkingGroup
from tdoc_crawler.tdocs.utils import normalize_tdoc_ids
from tdoc_crawler.utils.misc import utc_now
from tdoc_crawler.utils.parse import AgendaItemNumber, parse_agenda_item_nbr, parse_agenda_item_version

_logger = get_logger(__name__)

@@ -92,10 +93,15 @@ class TDocMetadata(BaseModel):

    tdoc_type: str = Field("unknown", description="TDoc classification as reported by the portal")
    for_purpose: str = Field("unknown", description="Purpose of the contribution (agreement, information, etc.)")
    agenda_item_nbr: Decimal = Field(..., max_digits=3, decimal_places=1, description="Associated agenda item (numerical identifier)")
    agenda_item_nbr: AgendaItemNumber = Field(..., description="Associated agenda item number (hierarchical identifier, e.g., 1.2.3)")
    agenda_item_text: str = Field("Unknown", description="Associated agenda item (text identifier)")
    status: str | None = Field(None, description="Document status as reported by the portal")

    @property
    def agenda_item_version(self) -> Version:
        """Expose ``agenda_item_nbr`` as a ``Version`` for comparisons.

        Returns:
            The result of ``parse_agenda_item_version`` applied to this
            record's ``agenda_item_nbr``.
        """
        raw_number = self.agenda_item_nbr
        return parse_agenda_item_version(raw_number)

    @property
    def is_valid(self) -> bool:
        """Check if TDoc metadata is valid and downloadable.
@@ -165,6 +171,18 @@ class TDocMetadata(BaseModel):

        return is_accessible

    # NOTE: ``@field_serializer`` must be the outermost decorator. Pydantic
    # discovers serializers by looking for its own decorator proxy in the class
    # namespace; with ``@classmethod`` on top the attribute is a plain
    # ``classmethod`` object and the serializer is never registered.
    @field_serializer("agenda_item_nbr")
    @classmethod
    def _serialize_agenda_item_nbr(cls, value: AgendaItemNumber) -> str:
        """Serialize the agenda item number for storage and JSON output.

        Args:
            value: The validated ``agenda_item_nbr`` field value.

        Returns:
            The agenda item number rendered as a plain string.
        """
        # Coerce explicitly so the declared ``str`` return type holds.
        return str(value)

    @field_validator("agenda_item_nbr", mode="before")
    @classmethod
    def _validate_agenda_item_nbr(cls, value: Version | AgendaItemNumber | int | float | None) -> AgendaItemNumber:
        """Coerce raw agenda item input before pydantic validates the field.

        Runs in ``before`` mode, so it receives the unvalidated input and
        delegates the conversion entirely to ``parse_agenda_item_nbr``.

        Args:
            value: Raw agenda item number as supplied by the caller.

        Returns:
            Whatever ``parse_agenda_item_nbr`` produces for ``value``.
        """
        normalized = parse_agenda_item_nbr(value)
        return normalized

    @classmethod
    @field_serializer("status")
    def _serialize_status(cls, value: TDocStatus | str | None) -> str | None:
Loading