Commit 974bdd4e authored by Jan Reimes's avatar Jan Reimes
Browse files

feat(meeting-doclist): implement meeting document list fetching feature

* Add `fetch_meeting_document_list` function to retrieve TDoc metadata from Excel files.
* Implement Excel parsing with `parse_excel_document_list` and related helper functions.
* Introduce error handling for document list fetching and parsing.
* Create a new module `meeting_doclist.py` for document list functionalities.
* Update `parallel.py` to include subinterpreter support for document list fetching.
* Enhance `tdocs.py` to use the new executor adapter for parallel crawling.
* Add configuration options in `tdocs.py` for document list and parallel crawling behavior.
* Create tests for document list fetching and parsing in `test_meeting_document_list.py`.
* Implement tests for the executor adapter in `test_executor_adapter.py`.
parent 19eed79a
Loading
Loading
Loading
Loading
+35 −3
Original line number Diff line number Diff line
@@ -16,7 +16,7 @@ from rich.console import Console
from rich.progress import BarColumn, MofNCompleteColumn, Progress, SpinnerColumn, TextColumn
from rich.table import Table

from tdoc_crawler.crawlers import MeetingCrawler, TDocCrawler
from tdoc_crawler.crawlers import HybridTDocCrawler, MeetingCrawler, TDocCrawler
from tdoc_crawler.database import TDocDatabase
from tdoc_crawler.models import (
    MeetingCrawlConfig,
@@ -67,13 +67,29 @@ def crawl_tdocs(
    timeout: int = typer.Option(30, "--timeout", help="HTTP request timeout seconds"),
    cache_ttl: int | None = typer.Option(None, "--cache-ttl", help="HTTP cache TTL in seconds (default: 7200)"),
    cache_refresh_on_access: bool | None = typer.Option(None, "--cache-refresh/--no-cache-refresh", help="Refresh cache TTL on access (default: True)"),
    # New options for document list vs parallel crawling
    use_document_list: bool = typer.Option(
        True, "--use-document-list/--no-use-document-list", help="Use meeting document list (Excel) for metadata (default: True, no credentials required)"
    ),
    use_parallel_crawling: bool = typer.Option(
        False, "--use-parallel-crawling", help="Force use of parallel directory crawling (requires credentials for validation)"
    ),
    allow_parallel_fallback: bool = typer.Option(
        True, "--allow-parallel-fallback/--no-parallel-fallback", help="Allow fallback to parallel crawling if document list fails (default: True)"
    ),
    verbose: bool = typer.Option(False, "--verbose", "-v", help="Enable verbose logging"),
) -> None:
    """Crawl TDocs from 3GPP FTP directories."""
    """Crawl TDocs from 3GPP FTP directories using hybrid crawling methods."""
    subgroups = parse_subgroups(subgroup)
    working_groups = parse_working_groups(working_group, subgroups)
    limits = build_limits(limit_tdocs, limit_meetings, limit_meetings_per_wg, limit_wgs)
    http_cache = resolve_http_cache_config(cache_ttl, cache_refresh_on_access)

    # Validate conflicting options
    if use_parallel_crawling and use_document_list:
        console.print("[red]Error: Cannot use both --use-parallel-crawling and --use-document-list simultaneously[/red]")
        raise typer.Exit(code=2)

    config = TDocCrawlConfig(
        cache_dir=cache_dir,
        working_groups=working_groups,
@@ -87,6 +103,9 @@ def crawl_tdocs(
        max_retries=max_retries,
        timeout=timeout,
        verbose=verbose,
        use_document_list=use_document_list,
        use_parallel_crawling=use_parallel_crawling,
        allow_parallel_fallback=allow_parallel_fallback,
        limits=limits,
        target_ids=None,
        credentials=None,
@@ -103,6 +122,14 @@ def crawl_tdocs(
        scope_parts.append(f"working groups: {', '.join(wg.value for wg in working_groups)}")
    console.print(f"[cyan]Crawling TDocs ({', '.join(scope_parts)})[/cyan]")

    # Show method being used
    if config.use_parallel_crawling:
        console.print("[yellow]Using parallel crawling method (requires credentials for validation)[/yellow]")
    elif config.use_document_list:
        console.print("[yellow]Using document list method (no credentials required)[/yellow]")
        if config.allow_parallel_fallback:
            console.print("[yellow]Parallel fallback enabled for failed meetings[/yellow]")

    if config.verbose:
        logging.getLogger().setLevel(logging.DEBUG)

@@ -112,7 +139,8 @@ def crawl_tdocs(
            deleted_count = database.clear_tdocs()
            console.print(f"[yellow]Cleared {deleted_count} TDocs from database[/yellow]")

        crawler = TDocCrawler(database)
        # Use hybrid crawler for both methods
        crawler = HybridTDocCrawler(database)
        crawl_id = database.log_crawl_start("tdoc", config.working_groups, config.incremental)

        # Track crawl start time for performance metrics
@@ -153,6 +181,10 @@ def crawl_tdocs(

    console.print(f"[green]Processed {result.processed} TDocs ({throughput:.1f} TDocs/sec)[/green]")
    console.print(f"[green]Inserted {result.inserted}, updated {result.updated}[/green]")
    if hasattr(result, "document_list_meetings"):
        console.print(f"[blue]Document list meetings: {result.document_list_meetings}, Parallel meetings: {result.parallel_meetings}[/blue]")
        if result.fallbacks > 0:
            console.print(f"[blue]Fallbacks applied: {result.fallbacks}[/blue]")
    if result.errors:
        console.print(f"[yellow]{len(result.errors)} issues detected[/yellow]")
        for error in result.errors[:5]:
+20 −4
Original line number Diff line number Diff line
@@ -5,47 +5,63 @@ from __future__ import annotations
from importlib import import_module
from typing import Any

# No direct imports for any symbols listed in __all__; all are dynamically imported via __getattr__

# Public API re-exported lazily via __getattr__ (see _ATTR_MODULES below).
# Fix: the four TDOC_* constants were previously listed twice.
__all__ = [
    "EXCLUDED_DIRS",
    "EXCLUDED_DIRS_NORMALIZED",
    "MEETING_CODE_REGISTRY",
    "TDOC_PATTERN",
    "TDOC_PATTERN_STR",
    "TDOC_SUBDIRS",
    "TDOC_SUBDIRS_NORMALIZED",
    "DocumentListError",
    "HybridCrawlResult",
    "HybridTDocCrawler",
    "MeetingCrawlResult",
    "MeetingCrawler",
    "PortalAuthenticationError",
    "PortalParsingError",
    "PortalSession",
    "Runner",
    "TDocCrawlResult",
    "TDocCrawler",
    "convert_excel_row_to_tdoc_metadata",
    "fetch_meeting_document_list",
    "fetch_meeting_tdocs",
    "fetch_tdoc_metadata",
    "normalize_subgroup_alias",
    "normalize_working_group_alias",
    "parse_excel_document_list",
    "parse_tdoc_portal_page",
]

# Lazy-import table consumed by the package __getattr__: maps each public
# attribute name to the (module path, attribute name) it is loaded from.
_ATTR_MODULES: dict[str, tuple[str, str]] = {
    "DocumentListError": ("tdoc_crawler.crawlers.meeting_doclist", "DocumentListError"),
    "EXCLUDED_DIRS": ("tdoc_crawler.crawlers.constants", "EXCLUDED_DIRS"),
    "EXCLUDED_DIRS_NORMALIZED": ("tdoc_crawler.crawlers.constants", "EXCLUDED_DIRS_NORMALIZED"),
    "HybridCrawlResult": ("tdoc_crawler.crawlers.hybrid", "HybridCrawlResult"),
    "HybridTDocCrawler": ("tdoc_crawler.crawlers.hybrid", "HybridTDocCrawler"),
    "MEETING_CODE_REGISTRY": ("tdoc_crawler.crawlers.constants", "MEETING_CODE_REGISTRY"),
    "MeetingCrawlResult": ("tdoc_crawler.crawlers.meetings", "MeetingCrawlResult"),
    "MeetingCrawler": ("tdoc_crawler.crawlers.meetings", "MeetingCrawler"),
    "PortalAuthenticationError": ("tdoc_crawler.crawlers.portal", "PortalAuthenticationError"),
    "PortalParsingError": ("tdoc_crawler.crawlers.portal", "PortalParsingError"),
    "PortalSession": ("tdoc_crawler.crawlers.portal", "PortalSession"),
    "Runner": ("tdoc_crawler.crawlers.executor_adapter", "Runner"),
    "TDOC_PATTERN": ("tdoc_crawler.crawlers.constants", "TDOC_PATTERN"),
    "TDOC_PATTERN_STR": ("tdoc_crawler.crawlers.constants", "TDOC_PATTERN_STR"),
    "TDOC_SUBDIRS": ("tdoc_crawler.crawlers.constants", "TDOC_SUBDIRS"),
    "TDOC_SUBDIRS_NORMALIZED": ("tdoc_crawler.crawlers.constants", "TDOC_SUBDIRS_NORMALIZED"),
    "TDocCrawlResult": ("tdoc_crawler.crawlers.tdocs", "TDocCrawlResult"),
    "TDocCrawler": ("tdoc_crawler.crawlers.tdocs", "TDocCrawler"),
    "convert_excel_row_to_tdoc_metadata": ("tdoc_crawler.crawlers.meeting_doclist", "convert_excel_row_to_tdoc_metadata"),
    "fetch_meeting_document_list": ("tdoc_crawler.crawlers.meeting_doclist", "fetch_meeting_document_list"),
    "fetch_meeting_tdocs": ("tdoc_crawler.crawlers.parallel", "fetch_meeting_tdocs"),
    "fetch_tdoc_metadata": ("tdoc_crawler.crawlers.portal", "fetch_tdoc_metadata"),
    "normalize_subgroup_alias": ("tdoc_crawler.crawlers.meetings", "normalize_subgroup_alias"),
    "normalize_working_group_alias": ("tdoc_crawler.crawlers.meetings", "normalize_working_group_alias"),
    "parse_excel_document_list": ("tdoc_crawler.crawlers.meeting_doclist", "parse_excel_document_list"),
    "parse_tdoc_portal_page": ("tdoc_crawler.crawlers.portal", "parse_tdoc_portal_page"),
}

+96 −0
Original line number Diff line number Diff line
"""Adapter to provide aiointerpreters-compatible API using pool_executors."""

from __future__ import annotations

import asyncio
from collections.abc import Callable, Generator
from concurrent.futures import Executor
from contextlib import contextmanager
from typing import Any, TypeVar

from pool_executors import ExecutorType, create_executor

T = TypeVar("T")


class _RunnerContextManager:
    """Context manager that owns the executor/event-loop lifecycle of a Runner."""

    def __init__(self, runner: Runner) -> None:
        self.runner = runner
        # True when __enter__ created a fresh event loop; that loop belongs to
        # this context manager and must be closed on exit.
        self._owns_loop = False

    def __enter__(self) -> Runner:
        """Create the executor, bind an event loop, and return the runner."""
        # Create executor using pool_executors factory
        self.runner._executor = create_executor(self.runner.executor_type, max_workers=self.runner.workers)
        try:
            self.runner._loop = asyncio.get_running_loop()
        except RuntimeError:
            # No running loop: create one and install it for this thread.
            self.runner._loop = asyncio.new_event_loop()
            asyncio.set_event_loop(self.runner._loop)
            self._owns_loop = True
        return self.runner

    def __exit__(self, exc_type, exc_val, exc_tb) -> None:
        """Shutdown executor and release (closing, if owned) the event loop."""
        if self.runner._executor:
            self.runner._executor.shutdown(wait=True)
            self.runner._executor = None
        # Fix: close the loop we created in __enter__; previously it was
        # leaked (dropped without close()) when no loop had been running.
        if self._owns_loop and self.runner._loop is not None:
            self.runner._loop.close()
            self._owns_loop = False
        self.runner._loop = None


class Runner:
    """Adapter exposing the ``aiointerpreters.Runner`` API on top of pool_executors.

    The runner itself holds only configuration; the executor and event loop
    are created by the context manager returned from :meth:`start`.
    """

    def __init__(self, workers: int = 4, executor_type: str = "subinterpreter") -> None:
        """Store worker-pool configuration without starting anything.

        Args:
            workers: Number of worker processes/threads.
            executor_type: Type of executor ("subinterpreter", "multiprocessing",
                "threading", "serial").
        """
        self.workers = workers
        self.executor_type = executor_type
        self._executor: Executor | None = None
        self._loop: asyncio.AbstractEventLoop | None = None

    def start(self) -> _RunnerContextManager:
        """Return a context manager that creates the executor and yields this runner."""
        return _RunnerContextManager(self)

    async def run(self, func: Callable[..., T], *args: Any, **kwargs: Any) -> T:
        """Execute ``func(*args, **kwargs)`` in the executor and await the result.

        Args:
            func: Function to run in the executor.
            *args: Positional arguments forwarded to ``func``.
            **kwargs: Keyword arguments forwarded to ``func``.

        Returns:
            Whatever ``func`` returns.

        Raises:
            RuntimeError: If the runner has not been started via ``start()``.
        """
        executor = self._executor
        if executor is None:
            raise RuntimeError("Runner not started. Use with runner.start():")
        loop = self._loop
        if loop is None:
            raise RuntimeError("Event loop not available")

        def _invoke() -> T:
            # Bind args/kwargs here: run_in_executor takes positionals only.
            return func(*args, **kwargs)

        return await loop.run_in_executor(executor, _invoke)

    def __enter__(self) -> Runner:
        """No-op entry kept for API compatibility; prefer ``start()``."""
        return self

    def __exit__(self, exc_type, exc_val, exc_tb) -> None:
        """No-op exit kept for API compatibility; prefer ``start()``."""


__all__ = ["Runner"]
+493 −0

File added.

Preview size limit exceeded, changes collapsed.

+331 −0
Original line number Diff line number Diff line
"""Meeting-based TDoc document list fetcher for credential-free metadata retrieval."""

from __future__ import annotations

import io
import logging
import re
from datetime import UTC, datetime
from decimal import Decimal
from pathlib import Path
from typing import TYPE_CHECKING

import pandas as pd

from tdoc_crawler.http_client import create_cached_session

if TYPE_CHECKING:
    from tdoc_crawler.models.tdocs import TDocMetadata

logger = logging.getLogger(__name__)


class DocumentListError(Exception):
    """Raised when document list fetching or parsing fails.

    Single domain error for this module: lower-level network and Excel
    parsing failures are wrapped into it with the original cause preserved
    via ``raise ... from exc``.
    """


def fetch_meeting_document_list(
    meeting_id: int,
    cache_dir: Path,
    cache_ttl: int = 7200,
    cache_refresh_on_access: bool = True,
    timeout: int = 30,
) -> list[TDocMetadata]:
    """Fetch all TDoc metadata for a meeting via document list Excel file.

    Args:
        meeting_id: 3GPP meeting identifier
        cache_dir: Directory for HTTP cache storage
        cache_ttl: HTTP cache TTL in seconds
        cache_refresh_on_access: Whether to refresh cache TTL on access
        timeout: Request timeout in seconds

    Returns:
        List of TDocMetadata instances for all TDocs in the meeting

    Raises:
        DocumentListError: If document list cannot be fetched or parsed
    """
    # Construct document list URL
    doclist_url = f"https://portal.3gpp.org/ngppapp/GenerateDocumentList.aspx?meetingId={meeting_id}"
    logger.debug(f"Fetching document list for meeting {meeting_id} from {doclist_url}")

    # Create cached session (no credentials required)
    session = create_cached_session(
        cache_dir=cache_dir,
        ttl=cache_ttl,
        refresh_ttl_on_access=cache_refresh_on_access,
        max_retries=3,
    )

    try:
        # Download Excel file
        logger.debug(f"Downloading Excel document list for meeting {meeting_id}")
        response = session.get(doclist_url, timeout=timeout)
        response.raise_for_status()

        # Validate we received an xlsx payload. Some responses omit or mangle
        # the content-type header, so also accept anything carrying the ZIP
        # ("PK") file signature that every xlsx file starts with.
        content_type = response.headers.get("content-type", "").lower()
        if (
            "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" not in content_type
            and not response.content.startswith(b"PK")
        ):
            raise DocumentListError(f"Expected Excel file for meeting {meeting_id}, got content-type: {content_type}")

        # Parse Excel file
        logger.debug(f"Parsing Excel document list for meeting {meeting_id}")
        return parse_excel_document_list(response.content, meeting_id)

    except DocumentListError:
        # Already a domain error with full context; propagate untouched.
        # (Previously this was an isinstance() re-dispatch inside a broad
        # `except Exception` handler.)
        raise
    except Exception as exc:
        logger.warning(f"Failed to fetch document list for meeting {meeting_id}: {exc}")
        raise DocumentListError(f"Failed to fetch document list for meeting {meeting_id}: {exc}") from exc
    finally:
        session.close()


def parse_excel_document_list(
    excel_content: bytes,
    meeting_id: int,
) -> list[TDocMetadata]:
    """Parse an Excel document list into TDocMetadata instances.

    Rows that cannot be converted are logged and skipped rather than
    aborting the whole list.

    Args:
        excel_content: Raw Excel file content
        meeting_id: Meeting ID for reference

    Returns:
        List of TDocMetadata instances

    Raises:
        DocumentListError: If the Excel content cannot be read or parsed.
    """
    try:
        # python-calamine engine: fast, read-only Excel parsing.
        frame = pd.read_excel(
            io.BytesIO(excel_content),
            engine="calamine",
            sheet_name="TDoc_List",
        )

        logger.debug(f"Found {len(frame)} rows in TDoc_List sheet for meeting {meeting_id}")

        parsed: list[TDocMetadata] = []
        for row_index, row in frame.iterrows():
            try:
                metadata = convert_excel_row_to_tdoc_metadata(row, meeting_id)
            except Exception as exc:
                logger.warning(f"Failed to parse row {row_index + 1} for meeting {meeting_id}: {exc}")
                continue
            if metadata is None:
                logger.debug(f"Skipping row {row_index + 1}: missing required TDoc ID")
            else:
                parsed.append(metadata)

        logger.info(f"Successfully parsed {len(parsed)} TDoc metadata entries for meeting {meeting_id}")
        return parsed

    except Exception as exc:
        raise DocumentListError(f"Failed to parse Excel document list for meeting {meeting_id}: {exc}") from exc


def convert_excel_row_to_tdoc_metadata(
    row: pd.Series,
    meeting_id: int,
) -> TDocMetadata | None:
    """Convert a single Excel row to TDocMetadata.

    Column names vary between Excel formats, so every field is looked up
    under several candidate headers.

    Args:
        row: pandas Series representing one Excel row
        meeting_id: Meeting ID for reference

    Returns:
        TDocMetadata instance, or None when the row carries no valid TDoc ID
        or the model cannot be built from the row's values.
    """
    from tdoc_crawler.models.tdocs import TDocMetadata

    tdoc_id = _extract_tdoc_id(row)
    if not tdoc_id:
        return None

    # Fallback column headers per metadata field, in priority order.
    column_candidates = {
        "title": ["Title", "Document Title", "Description", "Subject"],
        "tdoc_type": ["Type", "Document Type", "TDoc Type"],
        "for_purpose": ["For", "Purpose", "For Purpose"],
        "source": ["Source", "Organization", "Company"],
        "contact": ["Contact", "Contact Person", "Author"],
        "agenda_item": ["Agenda Item", "Agenda", "Agenda Ref"],
        "status": ["Status", "Document Status"],
        "is_revision_of": ["Revision of", "Is Revision of", "Based on"],
        "date_created": ["Date", "Created", "Date Created", "Submission Date"],
    }
    fields = {name: _get_column_value(row, names) for name, names in column_candidates.items()}

    # Placeholder URL; the directory crawler validates/updates it later.
    url = f"https://www.3gpp.org/ftp/tsg_ran/.../{tdoc_id.upper()}.zip"

    timestamp = datetime.now(UTC)

    try:
        agenda_nbr, agenda_text = _parse_agenda_item(fields["agenda_item"])
        created = _parse_date(fields["date_created"])

        return TDocMetadata(
            tdoc_id=tdoc_id.upper(),
            meeting_id=meeting_id,
            title=fields["title"] or "Unknown Title",
            url=url,
            source=fields["source"] or "Unknown",
            contact=fields["contact"] or "Unknown",
            tdoc_type=fields["tdoc_type"] or "unknown",
            for_purpose=fields["for_purpose"] or "unknown",
            agenda_item_nbr=agenda_nbr,
            agenda_item_text=agenda_text,
            status=fields["status"],
            meeting_name=None,  # resolved later from the meeting database
            is_revision_of=fields["is_revision_of"],
            file_size=None,  # not present in the Excel list
            date_created=created,
            date_retrieved=timestamp,
            date_updated=timestamp,
            validated=True,  # direct from portal, no validation needed
            validation_failed=False,
        )

    except Exception as exc:
        logger.debug(f"Failed to create TDocMetadata for {tdoc_id}: {exc}")
        return None


def _extract_tdoc_id(row: pd.Series) -> str | None:
    """Extract the TDoc ID from an Excel row.

    Args:
        row: pandas Series representing one Excel row

    Returns:
        TDoc ID string, or None when no candidate column holds a valid ID
    """
    # Candidate headers in priority order; formats differ between meetings.
    candidate_columns = ["TDoc", "Contribution", "Document", "TDoc ID", "Contribution ID", "ID", "Number", "TDoc Number", "Contribution Number"]

    for column in candidate_columns:
        raw = row.get(column)
        if not isinstance(raw, str) or not raw:
            continue
        candidate = raw.strip()
        if _is_valid_tdoc_id(candidate):
            return candidate

    return None


def _is_valid_tdoc_id(tdoc_id: str) -> bool:
    """Check if string is a valid TDoc ID format.

    Args:
        tdoc_id: String to validate

    Returns:
        True if valid TDoc ID format
    """
    import re

    # TDoc ID pattern: [RSC][1-6P] followed by 4-10 chars
    pattern = re.compile(r"^[RSC][1-6P].{4,10}$", re.IGNORECASE)
    return bool(pattern.match(tdoc_id.strip()))


def _get_column_value(row: pd.Series, possible_names: list[str]) -> str | None:
    """Get value from row using possible column names.

    Args:
        row: pandas Series representing one Excel row
        possible_names: List of possible column names

    Returns:
        Column value or None if not found
    """
    for col_name in possible_names:
        value = row.get(col_name)
        if value and isinstance(value, str):
            return value.strip()
        elif value is not None:
            # Handle non-string values (dates, numbers, etc.)
            return str(value).strip()
    return None


def _parse_agenda_item(agenda_item: str | None) -> tuple[Decimal, str]:
    """Parse agenda item into number and text components.

    Args:
        agenda_item: Agenda item string (e.g., "7.1 - Some text" or just "7.1")

    Returns:
        Tuple of (agenda_number, agenda_text)
    """
    if not agenda_item:
        return Decimal("0.0"), "Unknown"

    # Try to split by " - " to get number and text
    parts = agenda_item.split(" - ", 1)
    if len(parts) == 2:
        agenda_nbr_str = parts[0].strip()
        agenda_text = parts[1].strip()
    else:
        # No separator found, treat whole thing as number
        agenda_nbr_str = agenda_item.strip()
        agenda_text = "Unknown"

    # Parse agenda number as Decimal
    try:
        agenda_nbr = Decimal(agenda_nbr_str)
    except Exception:
        agenda_nbr = Decimal("0.0")

    return agenda_nbr, agenda_text


def _parse_date(date_value: str | None) -> datetime | None:
    """Parse date from Excel value.

    Args:
        date_value: Date value from Excel

    Returns:
        datetime object or None if parsing fails
    """
    if not date_value:
        return None

    try:
        # Handle different date formats
        if isinstance(date_value, datetime):
            return date_value
        elif isinstance(date_value, str):
            # Try common date formats
            formats = [
                "%Y-%m-%d",
                "%d/%m/%Y",
                "%m/%d/%Y",
                "%d-%m-%Y",
                "%Y/%m/%d",
            ]
            for fmt in formats:
                try:
                    return datetime.strptime(date_value, fmt)
                except ValueError:
                    continue
        return None

    except Exception:
        return None


# Public surface of this module; these names are also re-exported from
# tdoc_crawler.crawlers.__init__.
__all__ = [
    "DocumentListError",
    "fetch_meeting_document_list",
    "parse_excel_document_list",
    "convert_excel_row_to_tdoc_metadata",
]
Loading