Commit 5ee0f4e2 authored by Jan Reimes
Browse files

♻️ refactor(tdocs): remove circular dependencies by moving imports to tdoc_crawler.tdocs.models

parent 77db5be7
Loading
Loading
Loading
Loading
+0 −9
Original line number Diff line number Diff line
@@ -17,9 +17,6 @@ from .base import (
)
from .crawl_limits import CrawlLimits
from .crawl_log import CrawlLogEntry

# Note: Specification models have been moved to tdoc_crawler.specs.models
# Import from there directly to avoid circular dependencies
from .subworking_groups import (
    CODE_INDEX,
    SUBTB_INDEX,
@@ -44,12 +41,6 @@ __all__ = [
    "OutputFormat",
    "PortalCredentials",
    "SortOrder",
    "SpecQueryFilters",
    "SpecQueryResult",
    "Specification",
    "SpecificationDownload",
    "SpecificationSourceRecord",
    "SpecificationVersion",
    "SubWorkingGroupRecord",
    "WorkingGroup",
    "WorkingGroupRecord",
+5 −6
Original line number Diff line number Diff line
@@ -4,6 +4,7 @@ from __future__ import annotations

import logging
import re
from collections.abc import Callable
from datetime import date
from urllib.parse import urljoin

@@ -11,6 +12,8 @@ from bs4 import BeautifulSoup, Tag

from tdoc_crawler.constants.patterns import DATE_PATTERN
from tdoc_crawler.constants.urls import PORTAL_BASE_URL
from tdoc_crawler.meetings.models import MeetingMetadata
from tdoc_crawler.models.working_groups import WorkingGroup

logger = logging.getLogger(__name__)

@@ -19,7 +22,7 @@ def parse_meeting_page(
    html: str,
    working_group: WorkingGroup,
    subgroup: str | None,
    get_subtb: callable | None = None,
    get_subtb: Callable[[str], int] | None = None,
) -> list[MeetingMetadata]:
    """Parse meeting page HTML into list of MeetingMetadata.

@@ -53,7 +56,7 @@ def parse_meeting_row(
    cells: list[Tag],
    working_group: WorkingGroup,
    subgroup: str | None,
    get_subtb: callable | None = None,
    get_subtb: Callable[[str], int] | None = None,
) -> MeetingMetadata:
    """Parse a single meeting row from the table.

@@ -81,10 +84,6 @@ def parse_meeting_row(
    location = cells[2].get_text(" ", strip=True) if len(cells) > 2 else "TBC"
    files_url = extract_first_link(cells[-3])

    # Get tbid from working group, subtb from callback if subgroup is available
    # Import here to avoid circular dependency
    from tdoc_crawler.models import MeetingMetadata

    tbid = working_group.tbid
    subtb: int | None = None
    if subgroup and get_subtb:
+2 −4
Original line number Diff line number Diff line
@@ -7,6 +7,8 @@ from decimal import Decimal

from bs4 import BeautifulSoup

from tdoc_crawler.tdocs.models import TDocMetadata

logger = logging.getLogger(__name__)


@@ -117,10 +119,6 @@ def parse_tdoc_portal_page(html: str, tdoc_id: str, url: str | None = None) -> T
        logger.warning(error_msg)
        raise PortalParsingError(error_msg)

    # Create and return TDocMetadata instance
    # Import here to avoid circular dependency
    from tdoc_crawler.models.tdocs import TDocMetadata

    return TDocMetadata(
        tdoc_id=tdoc_id,
        meeting_id=0,  # Placeholder - caller must resolve via meeting_name
+4 −8
Original line number Diff line number Diff line
@@ -9,7 +9,6 @@ from __future__ import annotations
import logging
from decimal import Decimal
from enum import Enum
from typing import TYPE_CHECKING

import requests
from pydantic import ValidationError
@@ -17,7 +16,9 @@ from pydantic import ValidationError
from tdoc_crawler.clients.portal import create_portal_client
from tdoc_crawler.config import CacheManager, resolve_cache_manager
from tdoc_crawler.credentials import resolve_credentials
from tdoc_crawler.database import MeetingDatabase, TDocDatabase
from tdoc_crawler.models.base import HttpCacheConfig, PortalCredentials
from tdoc_crawler.tdocs.models import QueryConfig, TDocMetadata
from tdoc_crawler.tdocs.operations.crawl import TDocCrawlResult
from tdoc_crawler.tdocs.sources import (
    DocumentListSource,
@@ -26,10 +27,6 @@ from tdoc_crawler.tdocs.sources import (
    WhatTheSpecSource,
)

if TYPE_CHECKING:
    from tdoc_crawler.database import TDocDatabase
    from tdoc_crawler.models.tdocs import QueryConfig, TDocMetadata

logger = logging.getLogger(__name__)


@@ -170,8 +167,6 @@ def fetch_tdoc(

    # Handle URL-only method separately (doesn't use source abstraction)
    if method == FetchMethod.PORTAL_URL_ONLY:
        # Import here to avoid circular dependency
        from tdoc_crawler.models.tdocs import TDocMetadata

        logger.debug(f"Fetching {tdoc_id} via unauthenticated 3GPP portal")
        client = create_portal_client(cache_dir=manager.root, timeout=min(timeout, 15), session=session)
@@ -272,7 +267,8 @@ def fetch_missing_tdocs_batch(

            # Resolve meeting_id if needed
            if metadata.meeting_name:
                meeting_id = database.resolve_meeting_id(metadata.meeting_name)
                with MeetingDatabase(database.db_file) as meeting_db:
                    meeting_id = meeting_db.resolve_meeting_id(metadata.meeting_name)
                if meeting_id is not None:
                    metadata.meeting_id = meeting_id
                else:
+2 −4
Original line number Diff line number Diff line
@@ -7,12 +7,10 @@ of TDoc metadata (WhatTheSpec, 3GPP portal, meeting document lists, etc.).
from __future__ import annotations

from pathlib import Path
from typing import TYPE_CHECKING, Protocol
from typing import Protocol

from tdoc_crawler.models.base import HttpCacheConfig, PortalCredentials

if TYPE_CHECKING:
    from tdoc_crawler.models.tdocs import TDocMetadata
from tdoc_crawler.tdocs.models import TDocMetadata


class TDocSource(Protocol):
Loading