♻️ refactor(tdocs): remove circular dependencies by moving imports to tdoc_crawler.tdocs.models (5ee0f4e2) · Commits · Jan Reimes / 3gpp-crawler

src/tdoc_crawler/models/__init__.py

+0 −9

Original line number	Diff line number	Diff line
		@@ -17,9 +17,6 @@ from .base import (
		)
		from .crawl_limits import CrawlLimits
		from .crawl_log import CrawlLogEntry

		# Note: Specification models have been moved to tdoc_crawler.specs.models
		# Import from there directly to avoid circular dependencies
		from .subworking_groups import (
		CODE_INDEX,
		SUBTB_INDEX,
		@@ -44,12 +41,6 @@ __all__ = [
		"OutputFormat",
		"PortalCredentials",
		"SortOrder",
		"SpecQueryFilters",
		"SpecQueryResult",
		"Specification",
		"SpecificationDownload",
		"SpecificationSourceRecord",
		"SpecificationVersion",
		"SubWorkingGroupRecord",
		"WorkingGroup",
		"WorkingGroupRecord",

src/tdoc_crawler/parsers/meetings.py

+5 −6

Original line number	Diff line number	Diff line
		@@ -4,6 +4,7 @@ from __future__ import annotations

		import logging
		import re
		from collections.abc import Callable
		from datetime import date
		from urllib.parse import urljoin

		@@ -11,6 +12,8 @@ from bs4 import BeautifulSoup, Tag

		from tdoc_crawler.constants.patterns import DATE_PATTERN
		from tdoc_crawler.constants.urls import PORTAL_BASE_URL
		from tdoc_crawler.meetings.models import MeetingMetadata
		from tdoc_crawler.models.working_groups import WorkingGroup

		logger = logging.getLogger(__name__)

		@@ -19,7 +22,7 @@ def parse_meeting_page(
		html: str,
		working_group: WorkingGroup,
		subgroup: str | None,
		get_subtb: callable | None = None,
		get_subtb: Callable[[str], int] | None = None,
		) -> list[MeetingMetadata]:
		"""Parse meeting page HTML into list of MeetingMetadata.

		@@ -53,7 +56,7 @@ def parse_meeting_row(
		cells: list[Tag],
		working_group: WorkingGroup,
		subgroup: str | None,
		get_subtb: callable | None = None,
		get_subtb: Callable[[str], int] | None = None,
		) -> MeetingMetadata:
		"""Parse a single meeting row from the table.

		@@ -81,10 +84,6 @@ def parse_meeting_row(
		location = cells[2].get_text(" ", strip=True) if len(cells) > 2 else "TBC"
		files_url = extract_first_link(cells[-3])

		# Get tbid from working group, subtb from callback if subgroup is available
		# Import here to avoid circular dependency
		from tdoc_crawler.models import MeetingMetadata

		tbid = working_group.tbid
		subtb: int | None = None
		if subgroup and get_subtb:

src/tdoc_crawler/parsers/portal.py

+2 −4

Original line number	Diff line number	Diff line
		@@ -7,6 +7,8 @@ from decimal import Decimal

		from bs4 import BeautifulSoup

		from tdoc_crawler.tdocs.models import TDocMetadata

		logger = logging.getLogger(__name__)


		@@ -117,10 +119,6 @@ def parse_tdoc_portal_page(html: str, tdoc_id: str, url: str | None = None) -> T
		logger.warning(error_msg)
		raise PortalParsingError(error_msg)

		# Create and return TDocMetadata instance
		# Import here to avoid circular dependency
		from tdoc_crawler.models.tdocs import TDocMetadata

		return TDocMetadata(
		tdoc_id=tdoc_id,
		meeting_id=0, # Placeholder - caller must resolve via meeting_name

src/tdoc_crawler/tdocs/operations/fetch.py

+4 −8

Original line number	Diff line number	Diff line
		@@ -9,7 +9,6 @@ from __future__ import annotations
		import logging
		from decimal import Decimal
		from enum import Enum
		from typing import TYPE_CHECKING

		import requests
		from pydantic import ValidationError
		@@ -17,7 +16,9 @@ from pydantic import ValidationError
		from tdoc_crawler.clients.portal import create_portal_client
		from tdoc_crawler.config import CacheManager, resolve_cache_manager
		from tdoc_crawler.credentials import resolve_credentials
		from tdoc_crawler.database import MeetingDatabase, TDocDatabase
		from tdoc_crawler.models.base import HttpCacheConfig, PortalCredentials
		from tdoc_crawler.tdocs.models import QueryConfig, TDocMetadata
		from tdoc_crawler.tdocs.operations.crawl import TDocCrawlResult
		from tdoc_crawler.tdocs.sources import (
		DocumentListSource,
		@@ -26,10 +27,6 @@ from tdoc_crawler.tdocs.sources import (
		WhatTheSpecSource,
		)

		if TYPE_CHECKING:
		from tdoc_crawler.database import TDocDatabase
		from tdoc_crawler.models.tdocs import QueryConfig, TDocMetadata

		logger = logging.getLogger(__name__)


		@@ -170,8 +167,6 @@ def fetch_tdoc(

		# Handle URL-only method separately (doesn't use source abstraction)
		if method == FetchMethod.PORTAL_URL_ONLY:
		# Import here to avoid circular dependency
		from tdoc_crawler.models.tdocs import TDocMetadata

		logger.debug(f"Fetching {tdoc_id} via unauthenticated 3GPP portal")
		client = create_portal_client(cache_dir=manager.root, timeout=min(timeout, 15), session=session)
		@@ -272,7 +267,8 @@ def fetch_missing_tdocs_batch(

		# Resolve meeting_id if needed
		if metadata.meeting_name:
		meeting_id = database.resolve_meeting_id(metadata.meeting_name)
		with MeetingDatabase(database.db_file) as meeting_db:
		meeting_id = meeting_db.resolve_meeting_id(metadata.meeting_name)
		if meeting_id is not None:
		metadata.meeting_id = meeting_id
		else:

src/tdoc_crawler/tdocs/sources/base.py

+2 −4

Original line number	Diff line number	Diff line
		@@ -7,12 +7,10 @@ of TDoc metadata (WhatTheSpec, 3GPP portal, meeting document lists, etc.).
		from __future__ import annotations

		from pathlib import Path
		from typing import TYPE_CHECKING, Protocol
		from typing import Protocol

		from tdoc_crawler.models.base import HttpCacheConfig, PortalCredentials

		if TYPE_CHECKING:
		from tdoc_crawler.models.tdocs import TDocMetadata
		from tdoc_crawler.tdocs.models import TDocMetadata


		class TDocSource(Protocol):