Loading src/tdoc_crawler/crawlers/__init__.py +3 −20 Original line number Diff line number Diff line Loading @@ -13,27 +13,10 @@ from .constants import ( TDOC_SUBDIRS_NORMALIZED, ) from .hybrid import HybridCrawlResult, HybridTDocCrawler from .meeting_doclist import ( DocumentListError, convert_excel_row_to_tdoc_metadata, fetch_meeting_document_list, parse_excel_document_list, ) from .meetings import ( MeetingCrawler, MeetingCrawlResult, normalize_subgroup_alias, normalize_working_group_alias, ) from .meeting_doclist import DocumentListError, convert_excel_row_to_tdoc_metadata, fetch_meeting_document_list, parse_excel_document_list from .meetings import MeetingCrawler, MeetingCrawlResult, normalize_subgroup_alias, normalize_working_group_alias from .parallel import fetch_meeting_tdocs from .portal import ( PortalAuthenticationError, PortalParsingError, PortalSession, extract_tdoc_url_from_portal, fetch_tdoc_metadata, parse_tdoc_portal_page, ) from .portal import PortalAuthenticationError, PortalParsingError, PortalSession, extract_tdoc_url_from_portal, fetch_tdoc_metadata, parse_tdoc_portal_page from .tdocs import TDocCrawler, TDocCrawlResult from .whatthespec import WhatTheSpecResolutionError, resolve_via_whatthespec Loading src/tdoc_crawler/database/connection.py +48 −4 Original line number Diff line number Diff line """Database access layer backed by pydantic_sqlite.""" import json from collections import defaultdict from collections.abc import Callable, Iterable from datetime import UTC, datetime Loading Loading @@ -417,10 +418,7 @@ class TDocDatabase: def get_spec_versions(self, spec_number: str) -> list[SpecificationVersion]: """Get all versions for a spec.""" try: cursor = self.connection._db.execute( "SELECT * FROM spec_versions WHERE spec_number = ?", (spec_number,) ) cursor = self.connection._db.execute("SELECT * FROM spec_versions WHERE spec_number = ?", (spec_number,)) columns = [description[0] for description in cursor.description] rows = cursor.fetchall() Loading Loading @@ -451,6 +449,10 @@ class TDocDatabase: from tdoc_crawler.specs.query import SpecQueryResult # noqa: PLC0415 specs = self._spec_table_rows() source_records = self._table_rows("spec_source_records") records_by_spec: dict[str, list[SpecificationSourceRecord]] = defaultdict(list) for record in source_records: records_by_spec[record.spec_number].append(record) if filters.spec_numbers: allowed = {value.strip() for value in filters.spec_numbers} Loading @@ -468,12 +470,41 @@ class TDocDatabase: needle = filters.status.strip().lower() specs = [spec for spec in specs if (spec.status or "").lower() == needle] def build_source_differences(records: list[SpecificationSourceRecord]) -> dict[str, dict[str, str | None]]: if len(records) < 2: return {} fields = ("title", "status", "working_group", "latest_version", "spec_type", "series") differences: dict[str, dict[str, str | None]] = {} for field in fields: values: dict[str, str | None] = {} normalized_values: set[str] = set() for record in records: payload = record.metadata_payload if isinstance(payload, str): try: payload = json.loads(payload) except json.JSONDecodeError: payload = {} if not isinstance(payload, dict): payload = {} raw_value = payload.get(field) value = str(raw_value) if raw_value is not None else None values[record.source_name] = value normalized_values.add((value or "").strip().lower()) if len(normalized_values) > 1: differences[field] = values return differences return [ SpecQueryResult( spec_number=spec.spec_number, title=spec.title, status=spec.status, working_group=spec.working_group, source_differences=build_source_differences(records_by_spec.get(spec.spec_number, [])), ) for spec in specs ] Loading Loading @@ -675,6 +706,19 @@ class TDocDatabase: row_dict[key] = datetime.fromisoformat(value) except (ValueError, AttributeError): pass if table == "spec_source_records": metadata_payload = row_dict.get("metadata_payload") if isinstance(metadata_payload, str): try: row_dict["metadata_payload"] = json.loads(metadata_payload) except json.JSONDecodeError: row_dict["metadata_payload"] = {} versions = row_dict.get("versions") if isinstance(versions, str): try: row_dict["versions"] = json.loads(versions) except json.JSONDecodeError: row_dict["versions"] = [] result.append(model_class(**row_dict)) return result except Exception: Loading src/tdoc_crawler/models/__init__.py +10 −31 Original line number Diff line number Diff line Loading @@ -3,39 +3,18 @@ from __future__ import annotations # Re-export all public symbols from .base import ( DEFAULT_CACHE_DIR, BaseConfigModel, # noqa: F401 HttpCacheConfig, OutputFormat, PortalCredentials, SortOrder, utc_now, ) from .base import BaseConfigModel # noqa: F401 from .base import DEFAULT_CACHE_DIR, HttpCacheConfig, OutputFormat, PortalCredentials, SortOrder, utc_now from .crawl_limits import CrawlLimits # noqa: F401 from .crawl_log import CrawlLogEntry # noqa: F401 from .meetings import ( MeetingCrawlConfig, MeetingMetadata, # noqa: F401 MeetingQueryConfig, ) from .subworking_groups import ( CODE_INDEX, SUBTB_INDEX, # noqa: F401 SUBWORKING_GROUP_RECORDS, SubWorkingGroupRecord, ) from .tdocs import ( CrawlConfig, QueryConfig, TDocCrawlConfig, # noqa: F401 TDocMetadata, ) from .working_groups import ( WORKING_GROUP_RECORDS, WorkingGroup, # noqa: F401 WorkingGroupRecord, ) from .meetings import MeetingMetadata # noqa: F401 from .meetings import MeetingCrawlConfig, MeetingQueryConfig from .subworking_groups import SUBTB_INDEX # noqa: F401 from .subworking_groups import CODE_INDEX, SUBWORKING_GROUP_RECORDS, SubWorkingGroupRecord from .tdocs import TDocCrawlConfig # noqa: F401 from .tdocs import CrawlConfig, QueryConfig, TDocMetadata from .working_groups import WorkingGroup # noqa: F401 from .working_groups import WORKING_GROUP_RECORDS, WorkingGroupRecord __all__ = [ "CODE_INDEX", Loading tests/test_meeting_document_list.py +14 −14 File changed.Contains only whitespace changes. Show changes Loading
src/tdoc_crawler/crawlers/__init__.py +3 −20 Original line number Diff line number Diff line Loading @@ -13,27 +13,10 @@ from .constants import ( TDOC_SUBDIRS_NORMALIZED, ) from .hybrid import HybridCrawlResult, HybridTDocCrawler from .meeting_doclist import ( DocumentListError, convert_excel_row_to_tdoc_metadata, fetch_meeting_document_list, parse_excel_document_list, ) from .meetings import ( MeetingCrawler, MeetingCrawlResult, normalize_subgroup_alias, normalize_working_group_alias, ) from .meeting_doclist import DocumentListError, convert_excel_row_to_tdoc_metadata, fetch_meeting_document_list, parse_excel_document_list from .meetings import MeetingCrawler, MeetingCrawlResult, normalize_subgroup_alias, normalize_working_group_alias from .parallel import fetch_meeting_tdocs from .portal import ( PortalAuthenticationError, PortalParsingError, PortalSession, extract_tdoc_url_from_portal, fetch_tdoc_metadata, parse_tdoc_portal_page, ) from .portal import PortalAuthenticationError, PortalParsingError, PortalSession, extract_tdoc_url_from_portal, fetch_tdoc_metadata, parse_tdoc_portal_page from .tdocs import TDocCrawler, TDocCrawlResult from .whatthespec import WhatTheSpecResolutionError, resolve_via_whatthespec Loading
src/tdoc_crawler/database/connection.py +48 −4 Original line number Diff line number Diff line """Database access layer backed by pydantic_sqlite.""" import json from collections import defaultdict from collections.abc import Callable, Iterable from datetime import UTC, datetime Loading Loading @@ -417,10 +418,7 @@ class TDocDatabase: def get_spec_versions(self, spec_number: str) -> list[SpecificationVersion]: """Get all versions for a spec.""" try: cursor = self.connection._db.execute( "SELECT * FROM spec_versions WHERE spec_number = ?", (spec_number,) ) cursor = self.connection._db.execute("SELECT * FROM spec_versions WHERE spec_number = ?", (spec_number,)) columns = [description[0] for description in cursor.description] rows = cursor.fetchall() Loading Loading @@ -451,6 +449,10 @@ class TDocDatabase: from tdoc_crawler.specs.query import SpecQueryResult # noqa: PLC0415 specs = self._spec_table_rows() source_records = self._table_rows("spec_source_records") records_by_spec: dict[str, list[SpecificationSourceRecord]] = defaultdict(list) for record in source_records: records_by_spec[record.spec_number].append(record) if filters.spec_numbers: allowed = {value.strip() for value in filters.spec_numbers} Loading @@ -468,12 +470,41 @@ class TDocDatabase: needle = filters.status.strip().lower() specs = [spec for spec in specs if (spec.status or "").lower() == needle] def build_source_differences(records: list[SpecificationSourceRecord]) -> dict[str, dict[str, str | None]]: if len(records) < 2: return {} fields = ("title", "status", "working_group", "latest_version", "spec_type", "series") differences: dict[str, dict[str, str | None]] = {} for field in fields: values: dict[str, str | None] = {} normalized_values: set[str] = set() for record in records: payload = record.metadata_payload if isinstance(payload, str): try: payload = json.loads(payload) except json.JSONDecodeError: payload = {} if not isinstance(payload, dict): payload = {} raw_value = payload.get(field) value = str(raw_value) if raw_value is not None else None values[record.source_name] = value normalized_values.add((value or "").strip().lower()) if len(normalized_values) > 1: differences[field] = values return differences return [ SpecQueryResult( spec_number=spec.spec_number, title=spec.title, status=spec.status, working_group=spec.working_group, source_differences=build_source_differences(records_by_spec.get(spec.spec_number, [])), ) for spec in specs ] Loading Loading @@ -675,6 +706,19 @@ class TDocDatabase: row_dict[key] = datetime.fromisoformat(value) except (ValueError, AttributeError): pass if table == "spec_source_records": metadata_payload = row_dict.get("metadata_payload") if isinstance(metadata_payload, str): try: row_dict["metadata_payload"] = json.loads(metadata_payload) except json.JSONDecodeError: row_dict["metadata_payload"] = {} versions = row_dict.get("versions") if isinstance(versions, str): try: row_dict["versions"] = json.loads(versions) except json.JSONDecodeError: row_dict["versions"] = [] result.append(model_class(**row_dict)) return result except Exception: Loading
src/tdoc_crawler/models/__init__.py +10 −31 Original line number Diff line number Diff line Loading @@ -3,39 +3,18 @@ from __future__ import annotations # Re-export all public symbols from .base import ( DEFAULT_CACHE_DIR, BaseConfigModel, # noqa: F401 HttpCacheConfig, OutputFormat, PortalCredentials, SortOrder, utc_now, ) from .base import BaseConfigModel # noqa: F401 from .base import DEFAULT_CACHE_DIR, HttpCacheConfig, OutputFormat, PortalCredentials, SortOrder, utc_now from .crawl_limits import CrawlLimits # noqa: F401 from .crawl_log import CrawlLogEntry # noqa: F401 from .meetings import ( MeetingCrawlConfig, MeetingMetadata, # noqa: F401 MeetingQueryConfig, ) from .subworking_groups import ( CODE_INDEX, SUBTB_INDEX, # noqa: F401 SUBWORKING_GROUP_RECORDS, SubWorkingGroupRecord, ) from .tdocs import ( CrawlConfig, QueryConfig, TDocCrawlConfig, # noqa: F401 TDocMetadata, ) from .working_groups import ( WORKING_GROUP_RECORDS, WorkingGroup, # noqa: F401 WorkingGroupRecord, ) from .meetings import MeetingMetadata # noqa: F401 from .meetings import MeetingCrawlConfig, MeetingQueryConfig from .subworking_groups import SUBTB_INDEX # noqa: F401 from .subworking_groups import CODE_INDEX, SUBWORKING_GROUP_RECORDS, SubWorkingGroupRecord from .tdocs import TDocCrawlConfig # noqa: F401 from .tdocs import CrawlConfig, QueryConfig, TDocMetadata from .working_groups import WorkingGroup # noqa: F401 from .working_groups import WORKING_GROUP_RECORDS, WorkingGroupRecord __all__ = [ "CODE_INDEX", Loading
tests/test_meeting_document_list.py +14 −14 File changed.Contains only whitespace changes. Show changes