Commit ca346eaa authored by Jan Reimes
Browse files

refactor: remove TYPE_CHECKING guards and lazy imports

- Replace TYPE_CHECKING conditionals with direct imports from proper locations
- Remove _parse_agenda_item_nbr and _resolve_meeting_id helpers in favor of utilities
- Import TDocMetadata directly in doclist since it's in the same domain package
- Simplify imports in whatthespec source to eliminate circular import workarounds
- Fix CacheManager register() to accept optional force parameter
- Update cache_manager initialization to use lambda for default factory
parent 5a504eea
Loading
Loading
Loading
Loading
+3 −2
Original line number Diff line number Diff line
@@ -133,7 +133,7 @@ class TDocMetadata(BaseModel):
class TDocCrawlConfig(BaseConfigModel):
    """Configuration for TDoc crawling runs."""

    cache_dir: Path = Field(default_factory=resolve_cache_manager, description="Cache directory path")
    cache_dir: Path = Field(default_factory=lambda: resolve_cache_manager().root, description="Cache directory path")
    working_groups: list[WorkingGroup] = Field(
        default_factory=lambda: [WorkingGroup.RAN, WorkingGroup.SA, WorkingGroup.CT],
        description="Working groups to crawl",
@@ -198,6 +198,7 @@ class TDocCrawlConfig(BaseConfigModel):
        return normalize_tdoc_ids(value)


# TODO: Should be named TDocQueryConfig for consistency?
class QueryConfig(BaseConfigModel):
    """Configuration for querying TDoc metadata."""

@@ -236,7 +237,7 @@ class QueryConfig(BaseConfigModel):
        return normalized


# Legacy alias
# TODO: Legacy alias, kept for backward compatibility with CLI commands that reference CrawlConfig. Should be renamed across the whole project soon for consistency.
CrawlConfig = TDocCrawlConfig

__all__ = [
+3 −3
Original line number Diff line number Diff line
@@ -252,7 +252,7 @@ def checkout_tdocs(
def checkout_meeting_tdocs(
    meetings: list[MeetingMetadata],
    checkout_dir: Path,
    http_cache_dir: Path,
    http_cache_file: Path,
    session: requests.Session | None = None,
    cache_manager_name: str | None = None,
) -> CheckoutResult:
@@ -261,7 +261,7 @@ def checkout_meeting_tdocs(
    Args:
        meetings: List of MeetingMetadata to checkout TDocs from
        checkout_dir: Base checkout directory
        http_cache_dir: Path to HTTP cache database
        http_cache_file: Path to HTTP cache database
        session: Optional requests.Session to reuse for downloads
        cache_manager_name: Optional cache manager name for HTTP caching

@@ -282,7 +282,7 @@ def checkout_meeting_tdocs(
            errors.append(f"{meeting.short_name}: no files URL")
            continue
        try:
            tdocs = fetch_meeting_document_list(meeting.meeting_id, http_cache_dir)
            tdocs = fetch_meeting_document_list(meeting.meeting_id, http_cache_file)
        except DocumentListError as exc:
            errors.append(f"{meeting.short_name}: {exc}")
            continue
+2 −7
Original line number Diff line number Diff line
@@ -12,16 +12,13 @@ import re
from datetime import UTC, datetime
from decimal import Decimal
from pathlib import Path
from typing import TYPE_CHECKING

import pandas as pd

from tdoc_crawler.http_client import create_cached_session
from tdoc_crawler.tdocs.models import TDocMetadata
from tdoc_crawler.tdocs.sources.base import TDocSourceConfig

if TYPE_CHECKING:
    from tdoc_crawler.models.tdocs import TDocMetadata

logger = logging.getLogger(__name__)


@@ -51,8 +48,6 @@ def fetch_meeting_document_list(
    Raises:
        DocumentListError: If document list cannot be fetched or parsed
    """
    from tdoc_crawler.models.tdocs import TDocMetadata  # noqa: F401

    # Construct document list URL
    doclist_url = f"https://portal.3gpp.org/ngppapp/GenerateDocumentList.aspx?meetingId={meeting_id}"
    logger.debug(f"Fetching document list for meeting {meeting_id} from {doclist_url}")
@@ -158,7 +153,7 @@ def convert_excel_row_to_tdoc_metadata(
    Returns:
        TDocMetadata instance or None if conversion fails
    """
    from tdoc_crawler.models.tdocs import TDocMetadata
    from tdoc_crawler.tdocs.models import TDocMetadata

    # Map Excel columns to TDocMetadata fields
    # Try multiple possible column names to handle different Excel formats
+13 −34
Original line number Diff line number Diff line
@@ -7,18 +7,20 @@ from the whatthespec.net community API.
from __future__ import annotations

import logging
from decimal import Decimal, InvalidOperation
from pathlib import Path
from typing import TYPE_CHECKING, Any
from typing import TYPE_CHECKING

import requests

if TYPE_CHECKING:
    from tdoc_crawler.models.tdocs import TDocMetadata
from tdoc_crawler.config import resolve_cache_manager
from tdoc_crawler.database.meetings import MeetingDatabase
from tdoc_crawler.http_client import create_cached_session
from tdoc_crawler.models.base import HttpCacheConfig
from tdoc_crawler.tdocs.sources.base import TDocSourceConfig
from tdoc_crawler.utils.parse import parse_agenda_item_nbr

if TYPE_CHECKING:
    from tdoc_crawler.models.tdocs import TDocMetadata

logger = logging.getLogger(__name__)

@@ -29,32 +31,6 @@ class WhatTheSpecResolutionError(Exception):
    """Raised when WhatTheSpec resolution fails."""


def _parse_agenda_item_nbr(value: Any) -> Decimal:
    """Convert a raw agenda item number into a ``Decimal``.

    Args:
        value: Raw agenda item number; it is stringified before parsing.

    Returns:
        The parsed ``Decimal``. ``Decimal(0)`` is returned when ``value`` is
        ``None`` or when parsing fails (a warning is logged in that case).
    """
    if value is not None:
        try:
            # Stringify inside the try so a failing conversion is handled too.
            parsed = Decimal(str(value))
        except (InvalidOperation, ValueError) as exc:
            logger.warning(f"Invalid agenda item number '{value}': {exc}")
        else:
            return parsed
    return Decimal(0)


def _resolve_meeting_id(db_file: Path, meeting_name: str | None) -> int:
    """Look up the numeric meeting id for ``meeting_name`` in the local database.

    Args:
        db_file: Database file handed to ``TDocDatabase``.
        meeting_name: Meeting name to resolve; may be ``None`` or empty.

    Returns:
        The resolved meeting id, or 0 when no name is given, when the lookup
        raises (a warning is logged), or when the lookup result is falsy.
    """
    if meeting_name:
        try:
            # The import sits inside the try, so an import failure is also
            # logged and mapped to 0.
            from tdoc_crawler.database import TDocDatabase

            with TDocDatabase(db_file) as database:
                meeting_id = database.resolve_meeting_id(meeting_name)
        except Exception as exc:
            logger.warning(f"Failed to resolve meeting '{meeting_name}': {exc}")
        else:
            return meeting_id or 0
    return 0


def resolve_via_whatthespec(
    tdoc_id: str,
    db_file: Path,
@@ -82,7 +58,7 @@ def resolve_via_whatthespec(
    temp_session: requests.Session | None = None
    if session is None:
        temp_session = create_cached_session(
            cache_dir=manager.http_cache_dir,
            cache_dir=manager.http_cache_file,
            ttl=http_cache.ttl,
            refresh_ttl_on_access=http_cache.refresh_ttl_on_access,
            max_retries=3,
@@ -109,12 +85,15 @@ def resolve_via_whatthespec(

    record = payload[0] or {}
    resolved_id = str(record.get("name") or tdoc_id).strip().upper()
    meeting_name = record.get("meeting")
    agenda_item_nbr = _parse_agenda_item_nbr(record.get("ainumber"))
    meeting_name = str(record.get("meeting") or "")
    agenda_item_nbr = parse_agenda_item_nbr(record.get("ainumber"))

    with MeetingDatabase(db_file) as db:
        meeting_id = db.resolve_meeting_id(meeting_name) or 0

    metadata = TDocMetadata(
        tdoc_id=resolved_id,
        meeting_id=_resolve_meeting_id(db_file, meeting_name),
        meeting_id=meeting_id,
        meeting_name=meeting_name,
        title=str(record.get("title") or ""),
        url=str(record.get("link") or ""),