Commit 1d66e0da authored by Jan Reimes
Browse files

refactor(models): streamline working group normalization logic

* Update MeetingCrawlConfig and TDocCrawlConfig to use normalize_working_group_list for consistency.
* Refactor _normalize_working_groups method in MeetingQueryConfig and TDocQueryConfig.
* Change convert_for_wiki to always return Path instead of Path | None.
* Simplify download_to_file by removing the close_session parameter and the returned session; it now returns None.
parent f98c6cb3
Loading
Loading
Loading
Loading
+5 −8
Original line number Diff line number Diff line
@@ -5,7 +5,6 @@ from __future__ import annotations
import json
import time
from pathlib import Path
from typing import Any

import typer

@@ -24,6 +23,7 @@ from tdoc_crawler.cli.args import (
    WorkspaceProcessForceOption,
)
from tdoc_crawler.config import resolve_cache_manager
from tdoc_crawler.config.workspace_registry import WorkspaceMember
from tdoc_crawler.extraction.convert import ConversionError, DoclingConfig, convert_for_wiki
from tdoc_crawler.extraction.profiles import DEFAULT_EXTRACTION_PROFILE, ExtractionProfile
from tdoc_crawler.logging import DEFAULT_LEVEL as DEFAULT_VERBOSITY
@@ -125,7 +125,7 @@ def _read_page_count(json_dir: Path) -> int:


def _process_member(
    member: Any,
    member: WorkspaceMember,
    wiki_source_dir_base: Path,
    extraction_profile: ExtractionProfile,
    force: bool,
@@ -154,12 +154,9 @@ def _process_member(
            docx_direct=docx_direct,
            extract_media=extract_media,
        )
        if result_path:
        suffix = result_path.suffix.lstrip(".")
        logger.debug("%s [%s] → %s", source_id, extraction_profile.value, suffix)
        return source_id, True, False, _read_page_count(wiki_source_dir)
        logger.debug("No output for %s", source_id)
        return source_id, False, False, 0
    except (ConversionError, FileNotFoundError) as e:
        console.print(f"[red]  Failed {source_id}: {e}[/red]")
        logger.error("Failed to process %s: %s", source_id, e)
+5 −3
Original line number Diff line number Diff line
@@ -357,7 +357,7 @@ def convert_for_wiki(
    docling_config: DoclingConfig | None = None,
    docx_direct: bool = False,
    extract_media: bool = False,
) -> Path | None:
) -> Path:
    """Convert a document for wiki ingestion using the specified profile.

    For **markdown-only** the pipeline is:
@@ -384,8 +384,10 @@ def convert_for_wiki(
            markdown (markdown-only profile).

    Returns:
        Path to the primary output file (PDF for pdf-only, MD for others),
        or ``None`` if conversion fails.
        Path to the primary output file (PDF for pdf-only, MD for others).

    Raises:
        ConversionError: If no document files are found for the given document_id.
    """
    if profile is None:
        profile = DEFAULT_EXTRACTION_PROFILE
+3 −10
Original line number Diff line number Diff line
@@ -161,20 +161,18 @@ def download_to_file(
    url: str,
    target_file: Path,
    session: requests.Session | None = None,
    close_session: bool = True,
    http_cache_file: Path | None = None,
    http_cache_enabled: bool | None = None,
    pool_config: PoolConfig | None = None,
    verify: bool | str | None = None,
    http_config: HttpConfig | None = None,
) -> requests.Session | None:
) -> None:
    """Download a file from URL to destination path.

    Args:
        url: Source URL
        target_file: Destination file path
        session: Optional requests.Session to reuse. If None, a temporary cached session is created.
        close_session: Whether to close the session after download. Only applicable if a temporary session is created.
        session: Optional requests.Session to reuse. If None, a temporary cached session is created and closed.
        http_cache_file: Optional explicit path to the HTTP cache database. Falls back to PathConfig default.
        http_cache_enabled: Whether to enable HTTP caching. If None, defaults to http_config.cache_enabled or True.
        pool_config: Optional connection pool configuration.
@@ -215,13 +213,8 @@ def download_to_file(
                if chunk:
                    target.write(chunk)

        # return session or None if we created a temporary session and are closing it
        if close_session:
            active_session = None
        return active_session

    finally:
        if temp_session and close_session:
        if temp_session:
            temp_session.close()


+4 −4
Original line number Diff line number Diff line
@@ -8,11 +8,12 @@ from datetime import date, datetime
from pydantic import BaseModel, Field, field_validator, model_validator

from tdoc_crawler.config.settings import HttpConfig
from tdoc_crawler.meetings.utils import normalize_subgroup_alias, normalize_working_group_alias
from tdoc_crawler.meetings.utils import normalize_subgroup_alias
from tdoc_crawler.models.base import SortOrder
from tdoc_crawler.models.subworking_groups import SUBTB_INDEX
from tdoc_crawler.models.working_groups import WorkingGroup
from tdoc_crawler.utils.misc import utc_now
from tdoc_crawler.utils.normalization import normalize_working_group_list


class MeetingMetadata(BaseModel):
@@ -88,7 +89,7 @@ class MeetingCrawlConfig(BaseModel):
    @classmethod
    def _normalize_working_groups(cls, value: Iterable[str | WorkingGroup]) -> list[WorkingGroup]:
        """Ensure the working groups list only contains valid enum members."""
        return [normalize_working_group_alias(str(item)) if not isinstance(item, WorkingGroup) else item for item in value]
        return normalize_working_group_list(value)

    @field_validator("subgroups", mode="before")
    @classmethod
@@ -109,7 +110,6 @@ class MeetingQueryConfig(BaseModel):
    limit: int | None = Field(None, ge=1, description="Maximum results")
    order: SortOrder = Field(SortOrder.DESC, description="Sort order applied to start date")
    include_without_files: bool = Field(False, description="Include meetings without associated files URL")
    # Date range filters
    start_date: date | None = Field(None, description="Filter meetings starting from this date")
    end_date: date | None = Field(None, description="Filter meetings ending before this date")

@@ -119,7 +119,7 @@ class MeetingQueryConfig(BaseModel):
        """Ensure the working group list is comprised of enum members."""
        if value is None:
            return None
        return [normalize_working_group_alias(str(item)) if not isinstance(item, WorkingGroup) else item for item in value]
        return normalize_working_group_list(value)

    @field_validator("subgroups", mode="before")
    @classmethod
+5 −7
Original line number Diff line number Diff line
@@ -2,10 +2,8 @@

from __future__ import annotations

from typing import Any

import niquests as requests
from bs4 import BeautifulSoup
from bs4 import BeautifulSoup, Tag

from tdoc_crawler.database.oxyde_models import TDocMetadata
from tdoc_crawler.http_client import create_cached_session
@@ -79,7 +77,7 @@ def _validate_page_content(soup: BeautifulSoup, tdoc_id: str) -> None:
        raise PortalParsingError(msg)


def _get_metadata_table(soup: BeautifulSoup, tdoc_id: str) -> Any:
def _get_metadata_table(soup: BeautifulSoup, tdoc_id: str) -> Tag:
    """Get TDoc metadata table from portal HTML."""
    table = soup.find("table", {"class": "ultimate3gpp", "id": "tableTdocGeneralTabView"})
    if not table:
@@ -89,7 +87,7 @@ def _get_metadata_table(soup: BeautifulSoup, tdoc_id: str) -> Any:
    return table


def _normalize_label(cells: list[Any]) -> str | None:
def _normalize_label(cells: list[Tag]) -> str | None:
    """Extract normalized key from first column label."""
    label_cell = cells[0].get_text(strip=True)
    if not label_cell or not label_cell.endswith(":"):
@@ -98,7 +96,7 @@ def _normalize_label(cells: list[Any]) -> str | None:
    return label.lower().replace(" ", "_")


def _normalize_value(cells: list[Any], label_key: str) -> str | None:
def _normalize_value(cells: list[Tag], label_key: str) -> str | None:
    """Extract normalized value from second column with status cleanup."""
    value = cells[1].get_text(strip=True) if len(cells) > 1 else ""
    value = value.strip() if value else None
@@ -121,7 +119,7 @@ def _store_agenda_fields(metadata: dict[str, str | None], value: str) -> None:
    metadata["agenda_item_nbr"] = value


def _parse_metadata_table(table: Any) -> dict[str, str | None]:
def _parse_metadata_table(table: Tag) -> dict[str, str | None]:
    """Parse metadata rows from portal table."""
    metadata: dict[str, str | None] = {}
    for row in table.find_all("tr"):
Loading