Loading src/tdoc_crawler/models/__init__.py +0 −9 Original line number Diff line number Diff line Loading @@ -17,9 +17,6 @@ from .base import ( ) from .crawl_limits import CrawlLimits from .crawl_log import CrawlLogEntry # Note: Specification models have been moved to tdoc_crawler.specs.models # Import from there directly to avoid circular dependencies from .subworking_groups import ( CODE_INDEX, SUBTB_INDEX, Loading @@ -44,12 +41,6 @@ __all__ = [ "OutputFormat", "PortalCredentials", "SortOrder", "SpecQueryFilters", "SpecQueryResult", "Specification", "SpecificationDownload", "SpecificationSourceRecord", "SpecificationVersion", "SubWorkingGroupRecord", "WorkingGroup", "WorkingGroupRecord", Loading src/tdoc_crawler/parsers/meetings.py +5 −6 Original line number Diff line number Diff line Loading @@ -4,6 +4,7 @@ from __future__ import annotations import logging import re from collections.abc import Callable from datetime import date from urllib.parse import urljoin Loading @@ -11,6 +12,8 @@ from bs4 import BeautifulSoup, Tag from tdoc_crawler.constants.patterns import DATE_PATTERN from tdoc_crawler.constants.urls import PORTAL_BASE_URL from tdoc_crawler.meetings.models import MeetingMetadata from tdoc_crawler.models.working_groups import WorkingGroup logger = logging.getLogger(__name__) Loading @@ -19,7 +22,7 @@ def parse_meeting_page( html: str, working_group: WorkingGroup, subgroup: str | None, get_subtb: callable | None = None, get_subtb: Callable[[str], int] | None = None, ) -> list[MeetingMetadata]: """Parse meeting page HTML into list of MeetingMetadata. Loading Loading @@ -53,7 +56,7 @@ def parse_meeting_row( cells: list[Tag], working_group: WorkingGroup, subgroup: str | None, get_subtb: callable | None = None, get_subtb: Callable[[str], int] | None = None, ) -> MeetingMetadata: """Parse a single meeting row from the table. Loading Loading @@ -81,10 +84,6 @@ def parse_meeting_row( location = cells[2].get_text(" ", strip=True) if len(cells) > 2 else "TBC" files_url = extract_first_link(cells[-3]) # Get tbid from working group, subtb from callback if subgroup is available # Import here to avoid circular dependency from tdoc_crawler.models import MeetingMetadata tbid = working_group.tbid subtb: int | None = None if subgroup and get_subtb: Loading src/tdoc_crawler/parsers/portal.py +2 −4 Original line number Diff line number Diff line Loading @@ -7,6 +7,8 @@ from decimal import Decimal from bs4 import BeautifulSoup from tdoc_crawler.tdocs.models import TDocMetadata logger = logging.getLogger(__name__) Loading Loading @@ -117,10 +119,6 @@ def parse_tdoc_portal_page(html: str, tdoc_id: str, url: str | None = None) -> T logger.warning(error_msg) raise PortalParsingError(error_msg) # Create and return TDocMetadata instance # Import here to avoid circular dependency from tdoc_crawler.models.tdocs import TDocMetadata return TDocMetadata( tdoc_id=tdoc_id, meeting_id=0, # Placeholder - caller must resolve via meeting_name Loading src/tdoc_crawler/tdocs/operations/fetch.py +4 −8 Original line number Diff line number Diff line Loading @@ -9,7 +9,6 @@ from __future__ import annotations import logging from decimal import Decimal from enum import Enum from typing import TYPE_CHECKING import requests from pydantic import ValidationError Loading @@ -17,7 +16,9 @@ from pydantic import ValidationError from tdoc_crawler.clients.portal import create_portal_client from tdoc_crawler.config import CacheManager, resolve_cache_manager from tdoc_crawler.credentials import resolve_credentials from tdoc_crawler.database import MeetingDatabase, TDocDatabase from tdoc_crawler.models.base import HttpCacheConfig, PortalCredentials from tdoc_crawler.tdocs.models import QueryConfig, TDocMetadata from tdoc_crawler.tdocs.operations.crawl import TDocCrawlResult from tdoc_crawler.tdocs.sources import ( DocumentListSource, Loading @@ -26,10 +27,6 @@ from tdoc_crawler.tdocs.sources import ( WhatTheSpecSource, ) if TYPE_CHECKING: from tdoc_crawler.database import TDocDatabase from tdoc_crawler.models.tdocs import QueryConfig, TDocMetadata logger = logging.getLogger(__name__) Loading Loading @@ -170,8 +167,6 @@ def fetch_tdoc( # Handle URL-only method separately (doesn't use source abstraction) if method == FetchMethod.PORTAL_URL_ONLY: # Import here to avoid circular dependency from tdoc_crawler.models.tdocs import TDocMetadata logger.debug(f"Fetching {tdoc_id} via unauthenticated 3GPP portal") client = create_portal_client(cache_dir=manager.root, timeout=min(timeout, 15), session=session) Loading Loading @@ -272,7 +267,8 @@ def fetch_missing_tdocs_batch( # Resolve meeting_id if needed if metadata.meeting_name: meeting_id = database.resolve_meeting_id(metadata.meeting_name) with MeetingDatabase(database.db_file) as meeting_db: meeting_id = meeting_db.resolve_meeting_id(metadata.meeting_name) if meeting_id is not None: metadata.meeting_id = meeting_id else: Loading src/tdoc_crawler/tdocs/sources/base.py +2 −4 Original line number Diff line number Diff line Loading @@ -7,12 +7,10 @@ of TDoc metadata (WhatTheSpec, 3GPP portal, meeting document lists, etc.). from __future__ import annotations from pathlib import Path from typing import TYPE_CHECKING, Protocol from typing import Protocol from tdoc_crawler.models.base import HttpCacheConfig, PortalCredentials if TYPE_CHECKING: from tdoc_crawler.models.tdocs import TDocMetadata from tdoc_crawler.tdocs.models import TDocMetadata class TDocSource(Protocol): Loading Loading
src/tdoc_crawler/models/__init__.py +0 −9 Original line number Diff line number Diff line Loading @@ -17,9 +17,6 @@ from .base import ( ) from .crawl_limits import CrawlLimits from .crawl_log import CrawlLogEntry # Note: Specification models have been moved to tdoc_crawler.specs.models # Import from there directly to avoid circular dependencies from .subworking_groups import ( CODE_INDEX, SUBTB_INDEX, Loading @@ -44,12 +41,6 @@ __all__ = [ "OutputFormat", "PortalCredentials", "SortOrder", "SpecQueryFilters", "SpecQueryResult", "Specification", "SpecificationDownload", "SpecificationSourceRecord", "SpecificationVersion", "SubWorkingGroupRecord", "WorkingGroup", "WorkingGroupRecord", Loading
src/tdoc_crawler/parsers/meetings.py +5 −6 Original line number Diff line number Diff line Loading @@ -4,6 +4,7 @@ from __future__ import annotations import logging import re from collections.abc import Callable from datetime import date from urllib.parse import urljoin Loading @@ -11,6 +12,8 @@ from bs4 import BeautifulSoup, Tag from tdoc_crawler.constants.patterns import DATE_PATTERN from tdoc_crawler.constants.urls import PORTAL_BASE_URL from tdoc_crawler.meetings.models import MeetingMetadata from tdoc_crawler.models.working_groups import WorkingGroup logger = logging.getLogger(__name__) Loading @@ -19,7 +22,7 @@ def parse_meeting_page( html: str, working_group: WorkingGroup, subgroup: str | None, get_subtb: callable | None = None, get_subtb: Callable[[str], int] | None = None, ) -> list[MeetingMetadata]: """Parse meeting page HTML into list of MeetingMetadata. Loading Loading @@ -53,7 +56,7 @@ def parse_meeting_row( cells: list[Tag], working_group: WorkingGroup, subgroup: str | None, get_subtb: callable | None = None, get_subtb: Callable[[str], int] | None = None, ) -> MeetingMetadata: """Parse a single meeting row from the table. Loading Loading @@ -81,10 +84,6 @@ def parse_meeting_row( location = cells[2].get_text(" ", strip=True) if len(cells) > 2 else "TBC" files_url = extract_first_link(cells[-3]) # Get tbid from working group, subtb from callback if subgroup is available # Import here to avoid circular dependency from tdoc_crawler.models import MeetingMetadata tbid = working_group.tbid subtb: int | None = None if subgroup and get_subtb: Loading
src/tdoc_crawler/parsers/portal.py +2 −4 Original line number Diff line number Diff line Loading @@ -7,6 +7,8 @@ from decimal import Decimal from bs4 import BeautifulSoup from tdoc_crawler.tdocs.models import TDocMetadata logger = logging.getLogger(__name__) Loading Loading @@ -117,10 +119,6 @@ def parse_tdoc_portal_page(html: str, tdoc_id: str, url: str | None = None) -> T logger.warning(error_msg) raise PortalParsingError(error_msg) # Create and return TDocMetadata instance # Import here to avoid circular dependency from tdoc_crawler.models.tdocs import TDocMetadata return TDocMetadata( tdoc_id=tdoc_id, meeting_id=0, # Placeholder - caller must resolve via meeting_name Loading
src/tdoc_crawler/tdocs/operations/fetch.py +4 −8 Original line number Diff line number Diff line Loading @@ -9,7 +9,6 @@ from __future__ import annotations import logging from decimal import Decimal from enum import Enum from typing import TYPE_CHECKING import requests from pydantic import ValidationError Loading @@ -17,7 +16,9 @@ from pydantic import ValidationError from tdoc_crawler.clients.portal import create_portal_client from tdoc_crawler.config import CacheManager, resolve_cache_manager from tdoc_crawler.credentials import resolve_credentials from tdoc_crawler.database import MeetingDatabase, TDocDatabase from tdoc_crawler.models.base import HttpCacheConfig, PortalCredentials from tdoc_crawler.tdocs.models import QueryConfig, TDocMetadata from tdoc_crawler.tdocs.operations.crawl import TDocCrawlResult from tdoc_crawler.tdocs.sources import ( DocumentListSource, Loading @@ -26,10 +27,6 @@ from tdoc_crawler.tdocs.sources import ( WhatTheSpecSource, ) if TYPE_CHECKING: from tdoc_crawler.database import TDocDatabase from tdoc_crawler.models.tdocs import QueryConfig, TDocMetadata logger = logging.getLogger(__name__) Loading Loading @@ -170,8 +167,6 @@ def fetch_tdoc( # Handle URL-only method separately (doesn't use source abstraction) if method == FetchMethod.PORTAL_URL_ONLY: # Import here to avoid circular dependency from tdoc_crawler.models.tdocs import TDocMetadata logger.debug(f"Fetching {tdoc_id} via unauthenticated 3GPP portal") client = create_portal_client(cache_dir=manager.root, timeout=min(timeout, 15), session=session) Loading Loading @@ -272,7 +267,8 @@ def fetch_missing_tdocs_batch( # Resolve meeting_id if needed if metadata.meeting_name: meeting_id = database.resolve_meeting_id(metadata.meeting_name) with MeetingDatabase(database.db_file) as meeting_db: meeting_id = meeting_db.resolve_meeting_id(metadata.meeting_name) if meeting_id is not None: metadata.meeting_id = meeting_id else: Loading
src/tdoc_crawler/tdocs/sources/base.py +2 −4 Original line number Diff line number Diff line Loading @@ -7,12 +7,10 @@ of TDoc metadata (WhatTheSpec, 3GPP portal, meeting document lists, etc.). from __future__ import annotations from pathlib import Path from typing import TYPE_CHECKING, Protocol from typing import Protocol from tdoc_crawler.models.base import HttpCacheConfig, PortalCredentials if TYPE_CHECKING: from tdoc_crawler.models.tdocs import TDocMetadata from tdoc_crawler.tdocs.models import TDocMetadata class TDocSource(Protocol): Loading