Commit 974bdd4e authored by Jan Reimes's avatar Jan Reimes
Browse files

feat(meeting-doclist): implement meeting document list fetching feature

* Add `fetch_meeting_document_list` function to retrieve TDoc metadata from Excel files.
* Implement Excel parsing with `parse_excel_document_list` and related helper functions.
* Introduce error handling for document list fetching and parsing.
* Create a new module `meeting_doclist.py` for document list functionalities.
* Update `parallel.py` to include subinterpreter support for document list fetching.
* Enhance `tdocs.py` to use the new executor adapter for parallel crawling.
* Add configuration options in `tdocs.py` for document list and parallel crawling behavior.
* Create tests for document list fetching and parsing in `test_meeting_document_list.py`.
* Implement tests for the executor adapter in `test_executor_adapter.py`.
parent 19eed79a
Loading
Loading
Loading
Loading
+35 −3
Original line number Diff line number Diff line
@@ -16,7 +16,7 @@ from rich.console import Console
from rich.progress import BarColumn, MofNCompleteColumn, Progress, SpinnerColumn, TextColumn
from rich.table import Table

from tdoc_crawler.crawlers import MeetingCrawler, TDocCrawler
from tdoc_crawler.crawlers import HybridTDocCrawler, MeetingCrawler, TDocCrawler
from tdoc_crawler.database import TDocDatabase
from tdoc_crawler.models import (
    MeetingCrawlConfig,
@@ -67,13 +67,29 @@ def crawl_tdocs(
    timeout: int = typer.Option(30, "--timeout", help="HTTP request timeout seconds"),
    cache_ttl: int | None = typer.Option(None, "--cache-ttl", help="HTTP cache TTL in seconds (default: 7200)"),
    cache_refresh_on_access: bool | None = typer.Option(None, "--cache-refresh/--no-cache-refresh", help="Refresh cache TTL on access (default: True)"),
    # New options for document list vs parallel crawling
    use_document_list: bool = typer.Option(
        True, "--use-document-list/--no-use-document-list", help="Use meeting document list (Excel) for metadata (default: True, no credentials required)"
    ),
    use_parallel_crawling: bool = typer.Option(
        False, "--use-parallel-crawling", help="Force use of parallel directory crawling (requires credentials for validation)"
    ),
    allow_parallel_fallback: bool = typer.Option(
        True, "--allow-parallel-fallback/--no-parallel-fallback", help="Allow fallback to parallel crawling if document list fails (default: True)"
    ),
    verbose: bool = typer.Option(False, "--verbose", "-v", help="Enable verbose logging"),
) -> None:
    """Crawl TDocs from 3GPP FTP directories."""
    """Crawl TDocs from 3GPP FTP directories using hybrid crawling methods."""
    subgroups = parse_subgroups(subgroup)
    working_groups = parse_working_groups(working_group, subgroups)
    limits = build_limits(limit_tdocs, limit_meetings, limit_meetings_per_wg, limit_wgs)
    http_cache = resolve_http_cache_config(cache_ttl, cache_refresh_on_access)

    # Validate conflicting options
    if use_parallel_crawling and use_document_list:
        console.print("[red]Error: Cannot use both --use-parallel-crawling and --use-document-list simultaneously[/red]")
        raise typer.Exit(code=2)

    config = TDocCrawlConfig(
        cache_dir=cache_dir,
        working_groups=working_groups,
@@ -87,6 +103,9 @@ def crawl_tdocs(
        max_retries=max_retries,
        timeout=timeout,
        verbose=verbose,
        use_document_list=use_document_list,
        use_parallel_crawling=use_parallel_crawling,
        allow_parallel_fallback=allow_parallel_fallback,
        limits=limits,
        target_ids=None,
        credentials=None,
@@ -103,6 +122,14 @@ def crawl_tdocs(
        scope_parts.append(f"working groups: {', '.join(wg.value for wg in working_groups)}")
    console.print(f"[cyan]Crawling TDocs ({', '.join(scope_parts)})[/cyan]")

    # Show method being used
    if config.use_parallel_crawling:
        console.print("[yellow]Using parallel crawling method (requires credentials for validation)[/yellow]")
    elif config.use_document_list:
        console.print("[yellow]Using document list method (no credentials required)[/yellow]")
        if config.allow_parallel_fallback:
            console.print("[yellow]Parallel fallback enabled for failed meetings[/yellow]")

    if config.verbose:
        logging.getLogger().setLevel(logging.DEBUG)

@@ -112,7 +139,8 @@ def crawl_tdocs(
            deleted_count = database.clear_tdocs()
            console.print(f"[yellow]Cleared {deleted_count} TDocs from database[/yellow]")

        crawler = TDocCrawler(database)
        # Use hybrid crawler for both methods
        crawler = HybridTDocCrawler(database)
        crawl_id = database.log_crawl_start("tdoc", config.working_groups, config.incremental)

        # Track crawl start time for performance metrics
@@ -153,6 +181,10 @@ def crawl_tdocs(

    console.print(f"[green]Processed {result.processed} TDocs ({throughput:.1f} TDocs/sec)[/green]")
    console.print(f"[green]Inserted {result.inserted}, updated {result.updated}[/green]")
    if hasattr(result, "document_list_meetings"):
        console.print(f"[blue]Document list meetings: {result.document_list_meetings}, Parallel meetings: {result.parallel_meetings}[/blue]")
        if result.fallbacks > 0:
            console.print(f"[blue]Fallbacks applied: {result.fallbacks}[/blue]")
    if result.errors:
        console.print(f"[yellow]{len(result.errors)} issues detected[/yellow]")
        for error in result.errors[:5]:
+20 −4
Original line number Diff line number Diff line
@@ -5,47 +5,63 @@ from __future__ import annotations
from importlib import import_module
from typing import Any

# No direct imports for any symbols listed in __all__; all are dynamically imported via __getattr__

# Public API re-exported lazily via __getattr__ (see _ATTR_MODULES below).
# Fix: the four TDOC_* constants were previously listed twice.
__all__ = [
    "EXCLUDED_DIRS",
    "EXCLUDED_DIRS_NORMALIZED",
    "MEETING_CODE_REGISTRY",
    "TDOC_PATTERN",
    "TDOC_PATTERN_STR",
    "TDOC_SUBDIRS",
    "TDOC_SUBDIRS_NORMALIZED",
    "DocumentListError",
    "HybridCrawlResult",
    "HybridTDocCrawler",
    "MeetingCrawlResult",
    "MeetingCrawler",
    "PortalAuthenticationError",
    "PortalParsingError",
    "PortalSession",
    "Runner",
    "TDocCrawlResult",
    "TDocCrawler",
    "convert_excel_row_to_tdoc_metadata",
    "fetch_meeting_document_list",
    "fetch_meeting_tdocs",
    "fetch_tdoc_metadata",
    "normalize_subgroup_alias",
    "normalize_working_group_alias",
    "parse_excel_document_list",
    "parse_tdoc_portal_page",
]

# Lazy-import table consumed by the package __getattr__: maps each public
# attribute name to the (module path, attribute name) it is loaded from.
_ATTR_MODULES: dict[str, tuple[str, str]] = {
    "DocumentListError": ("tdoc_crawler.crawlers.meeting_doclist", "DocumentListError"),
    "EXCLUDED_DIRS": ("tdoc_crawler.crawlers.constants", "EXCLUDED_DIRS"),
    "EXCLUDED_DIRS_NORMALIZED": ("tdoc_crawler.crawlers.constants", "EXCLUDED_DIRS_NORMALIZED"),
    "HybridCrawlResult": ("tdoc_crawler.crawlers.hybrid", "HybridCrawlResult"),
    "HybridTDocCrawler": ("tdoc_crawler.crawlers.hybrid", "HybridTDocCrawler"),
    "MEETING_CODE_REGISTRY": ("tdoc_crawler.crawlers.constants", "MEETING_CODE_REGISTRY"),
    "MeetingCrawlResult": ("tdoc_crawler.crawlers.meetings", "MeetingCrawlResult"),
    "MeetingCrawler": ("tdoc_crawler.crawlers.meetings", "MeetingCrawler"),
    "PortalAuthenticationError": ("tdoc_crawler.crawlers.portal", "PortalAuthenticationError"),
    "PortalParsingError": ("tdoc_crawler.crawlers.portal", "PortalParsingError"),
    "PortalSession": ("tdoc_crawler.crawlers.portal", "PortalSession"),
    "Runner": ("tdoc_crawler.crawlers.executor_adapter", "Runner"),
    "TDOC_PATTERN": ("tdoc_crawler.crawlers.constants", "TDOC_PATTERN"),
    "TDOC_PATTERN_STR": ("tdoc_crawler.crawlers.constants", "TDOC_PATTERN_STR"),
    "TDOC_SUBDIRS": ("tdoc_crawler.crawlers.constants", "TDOC_SUBDIRS"),
    "TDOC_SUBDIRS_NORMALIZED": ("tdoc_crawler.crawlers.constants", "TDOC_SUBDIRS_NORMALIZED"),
    "TDocCrawlResult": ("tdoc_crawler.crawlers.tdocs", "TDocCrawlResult"),
    "TDocCrawler": ("tdoc_crawler.crawlers.tdocs", "TDocCrawler"),
    "convert_excel_row_to_tdoc_metadata": ("tdoc_crawler.crawlers.meeting_doclist", "convert_excel_row_to_tdoc_metadata"),
    "fetch_meeting_document_list": ("tdoc_crawler.crawlers.meeting_doclist", "fetch_meeting_document_list"),
    "fetch_meeting_tdocs": ("tdoc_crawler.crawlers.parallel", "fetch_meeting_tdocs"),
    "fetch_tdoc_metadata": ("tdoc_crawler.crawlers.portal", "fetch_tdoc_metadata"),
    "normalize_subgroup_alias": ("tdoc_crawler.crawlers.meetings", "normalize_subgroup_alias"),
    "normalize_working_group_alias": ("tdoc_crawler.crawlers.meetings", "normalize_working_group_alias"),
    "parse_excel_document_list": ("tdoc_crawler.crawlers.meeting_doclist", "parse_excel_document_list"),
    "parse_tdoc_portal_page": ("tdoc_crawler.crawlers.portal", "parse_tdoc_portal_page"),
}

+96 −0
Original line number Diff line number Diff line
"""Adapter to provide aiointerpreters-compatible API using pool_executors."""

from __future__ import annotations

import asyncio
from collections.abc import Callable, Generator
from concurrent.futures import Executor
from contextlib import contextmanager
from typing import Any, TypeVar

from pool_executors import ExecutorType, create_executor

T = TypeVar("T")


class _RunnerContextManager:
    """Context manager that owns the executor/event-loop lifecycle of a Runner."""

    def __init__(self, runner: Runner) -> None:
        self.runner = runner
        # True when __enter__ created a fresh event loop; that loop belongs to
        # this context manager and must be closed on exit.
        self._owns_loop = False

    def __enter__(self) -> Runner:
        """Create the executor, bind an event loop, and return the runner."""
        # Create executor using pool_executors factory
        self.runner._executor = create_executor(self.runner.executor_type, max_workers=self.runner.workers)
        try:
            self.runner._loop = asyncio.get_running_loop()
        except RuntimeError:
            # No running loop: create one and install it for this thread.
            self.runner._loop = asyncio.new_event_loop()
            asyncio.set_event_loop(self.runner._loop)
            self._owns_loop = True
        return self.runner

    def __exit__(self, exc_type, exc_val, exc_tb) -> None:
        """Shutdown executor and release (closing, if owned) the event loop."""
        if self.runner._executor:
            self.runner._executor.shutdown(wait=True)
            self.runner._executor = None
        # Fix: close the loop we created in __enter__; previously it was
        # leaked (dropped without close()) when no loop had been running.
        if self._owns_loop and self.runner._loop is not None:
            self.runner._loop.close()
            self._owns_loop = False
        self.runner._loop = None


class Runner:
    """Adapter exposing the ``aiointerpreters.Runner`` API on top of pool_executors.

    The runner itself holds only configuration; the executor and event loop
    are created by the context manager returned from :meth:`start`.
    """

    def __init__(self, workers: int = 4, executor_type: str = "subinterpreter") -> None:
        """Store worker-pool configuration without starting anything.

        Args:
            workers: Number of worker processes/threads.
            executor_type: Type of executor ("subinterpreter", "multiprocessing",
                "threading", "serial").
        """
        self.workers = workers
        self.executor_type = executor_type
        self._executor: Executor | None = None
        self._loop: asyncio.AbstractEventLoop | None = None

    def start(self) -> _RunnerContextManager:
        """Return a context manager that creates the executor and yields this runner."""
        return _RunnerContextManager(self)

    async def run(self, func: Callable[..., T], *args: Any, **kwargs: Any) -> T:
        """Execute ``func(*args, **kwargs)`` in the executor and await the result.

        Args:
            func: Function to run in the executor.
            *args: Positional arguments forwarded to ``func``.
            **kwargs: Keyword arguments forwarded to ``func``.

        Returns:
            Whatever ``func`` returns.

        Raises:
            RuntimeError: If the runner has not been started via ``start()``.
        """
        executor = self._executor
        if executor is None:
            raise RuntimeError("Runner not started. Use with runner.start():")
        loop = self._loop
        if loop is None:
            raise RuntimeError("Event loop not available")

        def _invoke() -> T:
            # Bind args/kwargs here: run_in_executor takes positionals only.
            return func(*args, **kwargs)

        return await loop.run_in_executor(executor, _invoke)

    def __enter__(self) -> Runner:
        """No-op entry kept for API compatibility; prefer ``start()``."""
        return self

    def __exit__(self, exc_type, exc_val, exc_tb) -> None:
        """No-op exit kept for API compatibility; prefer ``start()``."""


__all__ = ["Runner"]
+493 −0

File added.

Preview size limit exceeded, changes collapsed.

+331 −0
Original line number Diff line number Diff line
"""Meeting-based TDoc document list fetcher for credential-free metadata retrieval."""

from __future__ import annotations

import io
import logging
import re
from datetime import UTC, datetime
from decimal import Decimal
from pathlib import Path
from typing import TYPE_CHECKING

import pandas as pd

from tdoc_crawler.http_client import create_cached_session

if TYPE_CHECKING:
    from tdoc_crawler.models.tdocs import TDocMetadata

logger = logging.getLogger(__name__)


class DocumentListError(Exception):
    """Raised when document list fetching or parsing fails.

    Single domain error for this module: lower-level network and Excel
    parsing failures are wrapped into it with the original cause preserved
    via ``raise ... from exc``.
    """


def fetch_meeting_document_list(
    meeting_id: int,
    cache_dir: Path,
    cache_ttl: int = 7200,
    cache_refresh_on_access: bool = True,
    timeout: int = 30,
) -> list[TDocMetadata]:
    """Fetch all TDoc metadata for a meeting via document list Excel file.

    Args:
        meeting_id: 3GPP meeting identifier
        cache_dir: Directory for HTTP cache storage
        cache_ttl: HTTP cache TTL in seconds
        cache_refresh_on_access: Whether to refresh cache TTL on access
        timeout: Request timeout in seconds

    Returns:
        List of TDocMetadata instances for all TDocs in the meeting

    Raises:
        DocumentListError: If document list cannot be fetched or parsed
    """
    # Construct document list URL
    doclist_url = f"https://portal.3gpp.org/ngppapp/GenerateDocumentList.aspx?meetingId={meeting_id}"
    logger.debug(f"Fetching document list for meeting {meeting_id} from {doclist_url}")

    # Create cached session (no credentials required)
    session = create_cached_session(
        cache_dir=cache_dir,
        ttl=cache_ttl,
        refresh_ttl_on_access=cache_refresh_on_access,
        max_retries=3,
    )

    try:
        # Download Excel file
        logger.debug(f"Downloading Excel document list for meeting {meeting_id}")
        response = session.get(doclist_url, timeout=timeout)
        response.raise_for_status()

        # Validate we received an xlsx payload. Some responses omit or mangle
        # the content-type header, so also accept anything carrying the ZIP
        # ("PK") file signature that every xlsx file starts with.
        content_type = response.headers.get("content-type", "").lower()
        if (
            "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" not in content_type
            and not response.content.startswith(b"PK")
        ):
            raise DocumentListError(f"Expected Excel file for meeting {meeting_id}, got content-type: {content_type}")

        # Parse Excel file
        logger.debug(f"Parsing Excel document list for meeting {meeting_id}")
        return parse_excel_document_list(response.content, meeting_id)

    except DocumentListError:
        # Already a domain error with full context; propagate untouched.
        # (Previously this was an isinstance() re-dispatch inside a broad
        # `except Exception` handler.)
        raise
    except Exception as exc:
        logger.warning(f"Failed to fetch document list for meeting {meeting_id}: {exc}")
        raise DocumentListError(f"Failed to fetch document list for meeting {meeting_id}: {exc}") from exc
    finally:
        session.close()


def parse_excel_document_list(
    excel_content: bytes,
    meeting_id: int,
) -> list[TDocMetadata]:
    """Parse an Excel document list into TDocMetadata instances.

    Rows that cannot be converted are logged and skipped rather than
    aborting the whole list.

    Args:
        excel_content: Raw Excel file content
        meeting_id: Meeting ID for reference

    Returns:
        List of TDocMetadata instances

    Raises:
        DocumentListError: If the Excel content cannot be read or parsed.
    """
    try:
        # python-calamine engine: fast, read-only Excel parsing.
        frame = pd.read_excel(
            io.BytesIO(excel_content),
            engine="calamine",
            sheet_name="TDoc_List",
        )

        logger.debug(f"Found {len(frame)} rows in TDoc_List sheet for meeting {meeting_id}")

        parsed: list[TDocMetadata] = []
        for row_index, row in frame.iterrows():
            try:
                metadata = convert_excel_row_to_tdoc_metadata(row, meeting_id)
            except Exception as exc:
                logger.warning(f"Failed to parse row {row_index + 1} for meeting {meeting_id}: {exc}")
                continue
            if metadata is None:
                logger.debug(f"Skipping row {row_index + 1}: missing required TDoc ID")
            else:
                parsed.append(metadata)

        logger.info(f"Successfully parsed {len(parsed)} TDoc metadata entries for meeting {meeting_id}")
        return parsed

    except Exception as exc:
        raise DocumentListError(f"Failed to parse Excel document list for meeting {meeting_id}: {exc}") from exc


def convert_excel_row_to_tdoc_metadata(
    row: pd.Series,
    meeting_id: int,
) -> TDocMetadata | None:
    """Convert a single Excel row to TDocMetadata.

    Column names vary between Excel formats, so every field is looked up
    under several candidate headers.

    Args:
        row: pandas Series representing one Excel row
        meeting_id: Meeting ID for reference

    Returns:
        TDocMetadata instance, or None when the row carries no valid TDoc ID
        or the model cannot be built from the row's values.
    """
    from tdoc_crawler.models.tdocs import TDocMetadata

    tdoc_id = _extract_tdoc_id(row)
    if not tdoc_id:
        return None

    # Fallback column headers per metadata field, in priority order.
    column_candidates = {
        "title": ["Title", "Document Title", "Description", "Subject"],
        "tdoc_type": ["Type", "Document Type", "TDoc Type"],
        "for_purpose": ["For", "Purpose", "For Purpose"],
        "source": ["Source", "Organization", "Company"],
        "contact": ["Contact", "Contact Person", "Author"],
        "agenda_item": ["Agenda Item", "Agenda", "Agenda Ref"],
        "status": ["Status", "Document Status"],
        "is_revision_of": ["Revision of", "Is Revision of", "Based on"],
        "date_created": ["Date", "Created", "Date Created", "Submission Date"],
    }
    fields = {name: _get_column_value(row, names) for name, names in column_candidates.items()}

    # Placeholder URL; the directory crawler validates/updates it later.
    url = f"https://www.3gpp.org/ftp/tsg_ran/.../{tdoc_id.upper()}.zip"

    timestamp = datetime.now(UTC)

    try:
        agenda_nbr, agenda_text = _parse_agenda_item(fields["agenda_item"])
        created = _parse_date(fields["date_created"])

        return TDocMetadata(
            tdoc_id=tdoc_id.upper(),
            meeting_id=meeting_id,
            title=fields["title"] or "Unknown Title",
            url=url,
            source=fields["source"] or "Unknown",
            contact=fields["contact"] or "Unknown",
            tdoc_type=fields["tdoc_type"] or "unknown",
            for_purpose=fields["for_purpose"] or "unknown",
            agenda_item_nbr=agenda_nbr,
            agenda_item_text=agenda_text,
            status=fields["status"],
            meeting_name=None,  # resolved later from the meeting database
            is_revision_of=fields["is_revision_of"],
            file_size=None,  # not present in the Excel list
            date_created=created,
            date_retrieved=timestamp,
            date_updated=timestamp,
            validated=True,  # direct from portal, no validation needed
            validation_failed=False,
        )

    except Exception as exc:
        logger.debug(f"Failed to create TDocMetadata for {tdoc_id}: {exc}")
        return None


def _extract_tdoc_id(row: pd.Series) -> str | None:
    """Extract the TDoc ID from an Excel row.

    Args:
        row: pandas Series representing one Excel row

    Returns:
        TDoc ID string, or None when no candidate column holds a valid ID
    """
    # Candidate headers in priority order; formats differ between meetings.
    candidate_columns = ["TDoc", "Contribution", "Document", "TDoc ID", "Contribution ID", "ID", "Number", "TDoc Number", "Contribution Number"]

    for column in candidate_columns:
        raw = row.get(column)
        if not isinstance(raw, str) or not raw:
            continue
        candidate = raw.strip()
        if _is_valid_tdoc_id(candidate):
            return candidate

    return None


def _is_valid_tdoc_id(tdoc_id: str) -> bool:
    """Check if string is a valid TDoc ID format.

    Args:
        tdoc_id: String to validate

    Returns:
        True if valid TDoc ID format
    """
    import re

    # TDoc ID pattern: [RSC][1-6P] followed by 4-10 chars
    pattern = re.compile(r"^[RSC][1-6P].{4,10}$", re.IGNORECASE)
    return bool(pattern.match(tdoc_id.strip()))


def _get_column_value(row: pd.Series, possible_names: list[str]) -> str | None:
    """Get value from row using possible column names.

    Args:
        row: pandas Series representing one Excel row
        possible_names: List of possible column names

    Returns:
        Column value or None if not found
    """
    for col_name in possible_names:
        value = row.get(col_name)
        if value and isinstance(value, str):
            return value.strip()
        elif value is not None:
            # Handle non-string values (dates, numbers, etc.)
            return str(value).strip()
    return None


def _parse_agenda_item(agenda_item: str | None) -> tuple[Decimal, str]:
    """Parse agenda item into number and text components.

    Args:
        agenda_item: Agenda item string (e.g., "7.1 - Some text" or just "7.1")

    Returns:
        Tuple of (agenda_number, agenda_text)
    """
    if not agenda_item:
        return Decimal("0.0"), "Unknown"

    # Try to split by " - " to get number and text
    parts = agenda_item.split(" - ", 1)
    if len(parts) == 2:
        agenda_nbr_str = parts[0].strip()
        agenda_text = parts[1].strip()
    else:
        # No separator found, treat whole thing as number
        agenda_nbr_str = agenda_item.strip()
        agenda_text = "Unknown"

    # Parse agenda number as Decimal
    try:
        agenda_nbr = Decimal(agenda_nbr_str)
    except Exception:
        agenda_nbr = Decimal("0.0")

    return agenda_nbr, agenda_text


def _parse_date(date_value: str | None) -> datetime | None:
    """Parse date from Excel value.

    Args:
        date_value: Date value from Excel

    Returns:
        datetime object or None if parsing fails
    """
    if not date_value:
        return None

    try:
        # Handle different date formats
        if isinstance(date_value, datetime):
            return date_value
        elif isinstance(date_value, str):
            # Try common date formats
            formats = [
                "%Y-%m-%d",
                "%d/%m/%Y",
                "%m/%d/%Y",
                "%d-%m-%Y",
                "%Y/%m/%d",
            ]
            for fmt in formats:
                try:
                    return datetime.strptime(date_value, fmt)
                except ValueError:
                    continue
        return None

    except Exception:
        return None


# Public surface of this module; these names are also re-exported from
# tdoc_crawler.crawlers.__init__.
__all__ = [
    "DocumentListError",
    "fetch_meeting_document_list",
    "parse_excel_document_list",
    "convert_excel_row_to_tdoc_metadata",
]
Loading