Loading src/tdoc_crawler/crawlers/meeting_doclist.py +19 −3 Original line number Diff line number Diff line Loading @@ -8,6 +8,10 @@ import re from datetime import UTC, datetime from decimal import Decimal from pathlib import Path import pandas as pd from tdoc_crawler.http_client import create_cached_session from tdoc_crawler.models.tdocs import TDocMetadata logger = logging.getLogger(__name__) Loading Loading @@ -99,17 +103,29 @@ def parse_excel_document_list( logger.debug(f"Found {len(df)} rows in TDoc_List sheet for meeting {meeting_id}") # Check if the dataframe has any recognizable TDoc ID columns # Look for any column that might contain TDoc IDs tdoc_columns_found = False for col in df.columns: col_str = str(col).lower() if any(keyword in col_str for keyword in ["tdoc", "contribution", "document", "id", "number"]): tdoc_columns_found = True break if not tdoc_columns_found: raise DocumentListError(f"No recognizable TDoc ID columns found in Excel file for meeting {meeting_id}") # Convert DataFrame rows to TDocMetadata instances tdoc_metadata_list = [] for index, row in df.iterrows(): for i, (_idx, row) in enumerate(df.iterrows()): try: tdoc_metadata = convert_excel_row_to_tdoc_metadata(row, meeting_id) if tdoc_metadata: tdoc_metadata_list.append(tdoc_metadata) else: logger.debug(f"Skipping row {index + 1}: missing required TDoc ID") logger.debug(f"Skipping row {i + 1}: missing required TDoc ID") except Exception as exc: logger.warning(f"Failed to parse row {index + 1} for meeting {meeting_id}: {exc}") logger.warning(f"Failed to parse row {i + 1} for meeting {meeting_id}: {exc}") continue logger.info(f"Successfully parsed {len(tdoc_metadata_list)} TDoc metadata entries for meeting {meeting_id}") Loading src/tdoc_crawler/crawlers/parallel.py +271 −261 Original line number Diff line number Diff line Loading @@ -8,7 +8,16 @@ import re from collections.abc import Iterable from datetime import UTC, datetime from pathlib import Path from urllib.parse import urljoin import requests from bs4 import BeautifulSoup from tdoc_crawler.crawlers.constants import (EXCLUDED_DIRS, EXCLUDED_DIRS_NORMALIZED, TDOC_PATTERN_STR, TDOC_SUBDIRS, TDOC_SUBDIRS_NORMALIZED) from tdoc_crawler.http_client import create_cached_session logger = logging.getLogger(__name__) Loading Loading @@ -221,7 +230,8 @@ def fetch_meeting_document_list_subinterpreter( """ try: # Import inside function for subinterpreter context from tdoc_crawler.crawlers.meeting_doclist import fetch_meeting_document_list # noqa: PLC0415 from tdoc_crawler.crawlers.meeting_doclist import \ fetch_meeting_document_list # noqa: PLC0415 # Convert string path to Path object cache_dir_path = Path(cache_dir) Loading Loading
src/tdoc_crawler/crawlers/meeting_doclist.py +19 −3 Original line number Diff line number Diff line Loading @@ -8,6 +8,10 @@ import re from datetime import UTC, datetime from decimal import Decimal from pathlib import Path import pandas as pd from tdoc_crawler.http_client import create_cached_session from tdoc_crawler.models.tdocs import TDocMetadata logger = logging.getLogger(__name__) Loading Loading @@ -99,17 +103,29 @@ def parse_excel_document_list( logger.debug(f"Found {len(df)} rows in TDoc_List sheet for meeting {meeting_id}") # Check if the dataframe has any recognizable TDoc ID columns # Look for any column that might contain TDoc IDs tdoc_columns_found = False for col in df.columns: col_str = str(col).lower() if any(keyword in col_str for keyword in ["tdoc", "contribution", "document", "id", "number"]): tdoc_columns_found = True break if not tdoc_columns_found: raise DocumentListError(f"No recognizable TDoc ID columns found in Excel file for meeting {meeting_id}") # Convert DataFrame rows to TDocMetadata instances tdoc_metadata_list = [] for index, row in df.iterrows(): for i, (_idx, row) in enumerate(df.iterrows()): try: tdoc_metadata = convert_excel_row_to_tdoc_metadata(row, meeting_id) if tdoc_metadata: tdoc_metadata_list.append(tdoc_metadata) else: logger.debug(f"Skipping row {index + 1}: missing required TDoc ID") logger.debug(f"Skipping row {i + 1}: missing required TDoc ID") except Exception as exc: logger.warning(f"Failed to parse row {index + 1} for meeting {meeting_id}: {exc}") logger.warning(f"Failed to parse row {i + 1} for meeting {meeting_id}: {exc}") continue logger.info(f"Successfully parsed {len(tdoc_metadata_list)} TDoc metadata entries for meeting {meeting_id}") Loading
src/tdoc_crawler/crawlers/parallel.py +271 −261 Original line number Diff line number Diff line Loading @@ -8,7 +8,16 @@ import re from collections.abc import Iterable from datetime import UTC, datetime from pathlib import Path from urllib.parse import urljoin import requests from bs4 import BeautifulSoup from tdoc_crawler.crawlers.constants import (EXCLUDED_DIRS, EXCLUDED_DIRS_NORMALIZED, TDOC_PATTERN_STR, TDOC_SUBDIRS, TDOC_SUBDIRS_NORMALIZED) from tdoc_crawler.http_client import create_cached_session logger = logging.getLogger(__name__) Loading Loading @@ -221,7 +230,8 @@ def fetch_meeting_document_list_subinterpreter( """ try: # Import inside function for subinterpreter context from tdoc_crawler.crawlers.meeting_doclist import fetch_meeting_document_list # noqa: PLC0415 from tdoc_crawler.crawlers.meeting_doclist import \ fetch_meeting_document_list # noqa: PLC0415 # Convert string path to Path object cache_dir_path = Path(cache_dir) Loading