refactor(models): streamline working group normalization logic (1d66e0da) · Commits · Jan Reimes / 3gpp-crawler

src/tdoc_crawler/cli/workspace/process.py

+5 −8

Original line number	Diff line number	Diff line
		@@ -5,7 +5,6 @@ from __future__ import annotations
		import json
		import time
		from pathlib import Path
		from typing import Any

		import typer

		@@ -24,6 +23,7 @@ from tdoc_crawler.cli.args import (
		WorkspaceProcessForceOption,
		)
		from tdoc_crawler.config import resolve_cache_manager
		from tdoc_crawler.config.workspace_registry import WorkspaceMember
		from tdoc_crawler.extraction.convert import ConversionError, DoclingConfig, convert_for_wiki
		from tdoc_crawler.extraction.profiles import DEFAULT_EXTRACTION_PROFILE, ExtractionProfile
		from tdoc_crawler.logging import DEFAULT_LEVEL as DEFAULT_VERBOSITY
		@@ -125,7 +125,7 @@ def _read_page_count(json_dir: Path) -> int:


		def _process_member(
		member: Any,
		member: WorkspaceMember,
		wiki_source_dir_base: Path,
		extraction_profile: ExtractionProfile,
		force: bool,
		@@ -154,12 +154,9 @@ def _process_member(
		docx_direct=docx_direct,
		extract_media=extract_media,
		)
		if result_path:
		suffix = result_path.suffix.lstrip(".")
		logger.debug("%s [%s] → %s", source_id, extraction_profile.value, suffix)
		return source_id, True, False, _read_page_count(wiki_source_dir)
		logger.debug("No output for %s", source_id)
		return source_id, False, False, 0
		except (ConversionError, FileNotFoundError) as e:
		console.print(f"[red] Failed {source_id}: {e}[/red]")
		logger.error("Failed to process %s: %s", source_id, e)

src/tdoc_crawler/extraction/convert.py

+5 −3

Original line number	Diff line number	Diff line
		@@ -357,7 +357,7 @@ def convert_for_wiki(
		docling_config: DoclingConfig | None = None,
		docx_direct: bool = False,
		extract_media: bool = False,
		) -> Path | None:
		) -> Path:
		"""Convert a document for wiki ingestion using the specified profile.

		For markdown-only the pipeline is:
		@@ -384,8 +384,10 @@ def convert_for_wiki(
		markdown (markdown-only profile).

		Returns:
		Path to the primary output file (PDF for pdf-only, MD for others),
		or ``None`` if conversion fails.
		Path to the primary output file (PDF for pdf-only, MD for others).

		Raises:
		ConversionError: If no document files are found for the given document_id.
		"""
		if profile is None:
		profile = DEFAULT_EXTRACTION_PROFILE

src/tdoc_crawler/http_client/session.py

+3 −10

Original line number	Diff line number	Diff line
		@@ -161,20 +161,18 @@ def download_to_file(
		url: str,
		target_file: Path,
		session: requests.Session | None = None,
		close_session: bool = True,
		http_cache_file: Path | None = None,
		http_cache_enabled: bool | None = None,
		pool_config: PoolConfig | None = None,
		verify: bool | str | None = None,
		http_config: HttpConfig | None = None,
		) -> requests.Session | None:
		) -> None:
		"""Download a file from URL to destination path.

		Args:
		url: Source URL
		target_file: Destination file path
		session: Optional requests.Session to reuse. If None, a temporary cached session is created.
		close_session: Whether to close the session after download. Only applicable if a temporary session is created.
		session: Optional requests.Session to reuse. If None, a temporary cached session is created and closed.
		http_cache_file: Optional explicit path to the HTTP cache database. Falls back to PathConfig default.
		http_cache_enabled: Whether to enable HTTP caching. If None, defaults to http_config.cache_enabled or True.
		pool_config: Optional connection pool configuration.
		@@ -215,13 +213,8 @@ def download_to_file(
		if chunk:
		target.write(chunk)

		# return session or None if we created a temporary session and are closing it
		if close_session:
		active_session = None
		return active_session

		finally:
		if temp_session and close_session:
		if temp_session:
		temp_session.close()

src/tdoc_crawler/meetings/models.py

+4 −4

Original line number	Diff line number	Diff line
		@@ -8,11 +8,12 @@ from datetime import date, datetime
		from pydantic import BaseModel, Field, field_validator, model_validator

		from tdoc_crawler.config.settings import HttpConfig
		from tdoc_crawler.meetings.utils import normalize_subgroup_alias, normalize_working_group_alias
		from tdoc_crawler.meetings.utils import normalize_subgroup_alias
		from tdoc_crawler.models.base import SortOrder
		from tdoc_crawler.models.subworking_groups import SUBTB_INDEX
		from tdoc_crawler.models.working_groups import WorkingGroup
		from tdoc_crawler.utils.misc import utc_now
		from tdoc_crawler.utils.normalization import normalize_working_group_list


		class MeetingMetadata(BaseModel):
		@@ -88,7 +89,7 @@ class MeetingCrawlConfig(BaseModel):
		@classmethod
		def _normalize_working_groups(cls, value: Iterable[str | WorkingGroup]) -> list[WorkingGroup]:
		"""Ensure the working groups list only contains valid enum members."""
		return [normalize_working_group_alias(str(item)) if not isinstance(item, WorkingGroup) else item for item in value]
		return normalize_working_group_list(value)

		@field_validator("subgroups", mode="before")
		@classmethod
		@@ -109,7 +110,6 @@ class MeetingQueryConfig(BaseModel):
		limit: int | None = Field(None, ge=1, description="Maximum results")
		order: SortOrder = Field(SortOrder.DESC, description="Sort order applied to start date")
		include_without_files: bool = Field(False, description="Include meetings without associated files URL")
		# Date range filters
		start_date: date | None = Field(None, description="Filter meetings starting from this date")
		end_date: date | None = Field(None, description="Filter meetings ending before this date")

		@@ -119,7 +119,7 @@ class MeetingQueryConfig(BaseModel):
		"""Ensure the working group list is comprised of enum members."""
		if value is None:
		return None
		return [normalize_working_group_alias(str(item)) if not isinstance(item, WorkingGroup) else item for item in value]
		return normalize_working_group_list(value)

		@field_validator("subgroups", mode="before")
		@classmethod

src/tdoc_crawler/parsers/portal.py

+5 −7

Original line number	Diff line number	Diff line
		@@ -2,10 +2,8 @@

		from __future__ import annotations

		from typing import Any

		import niquests as requests
		from bs4 import BeautifulSoup
		from bs4 import BeautifulSoup, Tag

		from tdoc_crawler.database.oxyde_models import TDocMetadata
		from tdoc_crawler.http_client import create_cached_session
		@@ -79,7 +77,7 @@ def _validate_page_content(soup: BeautifulSoup, tdoc_id: str) -> None:
		raise PortalParsingError(msg)


		def _get_metadata_table(soup: BeautifulSoup, tdoc_id: str) -> Any:
		def _get_metadata_table(soup: BeautifulSoup, tdoc_id: str) -> Tag:
		"""Get TDoc metadata table from portal HTML."""
		table = soup.find("table", {"class": "ultimate3gpp", "id": "tableTdocGeneralTabView"})
		if not table:
		@@ -89,7 +87,7 @@ def _get_metadata_table(soup: BeautifulSoup, tdoc_id: str) -> Any:
		return table


		def _normalize_label(cells: list[Any]) -> str | None:
		def _normalize_label(cells: list[Tag]) -> str | None:
		"""Extract normalized key from first column label."""
		label_cell = cells[0].get_text(strip=True)
		if not label_cell or not label_cell.endswith(":"):
		@@ -98,7 +96,7 @@ def _normalize_label(cells: list[Any]) -> str | None:
		return label.lower().replace(" ", "_")


		def _normalize_value(cells: list[Any], label_key: str) -> str | None:
		def _normalize_value(cells: list[Tag], label_key: str) -> str | None:
		"""Extract normalized value from second column with status cleanup."""
		value = cells[1].get_text(strip=True) if len(cells) > 1 else ""
		value = value.strip() if value else None
		@@ -121,7 +119,7 @@ def _store_agenda_fields(metadata: dict[str, str | None], value: str) -> None:
		metadata["agenda_item_nbr"] = value


		def _parse_metadata_table(table: Any) -> dict[str, str | None]:
		def _parse_metadata_table(table: Tag) -> dict[str, str | None]:
		"""Parse metadata rows from portal table."""
		metadata: dict[str, str | None] = {}
		for row in table.find_all("tr"):