Commit 593b7001 authored by Jan Reimes's avatar Jan Reimes
Browse files

refactor(crawl): streamline TDoc crawling process and remove hybrid crawler

* Remove HybridTDocCrawler and its associated methods.
* Consolidate crawling logic into TDocCrawler for clarity.
* Update meeting fetching and processing to use document list method.
* Simplify error handling and logging for document list fetching.
* Adjust tests to reflect changes in crawling logic and remove hybrid scenarios.
* Ensure compatibility with existing database operations and configurations.
parent 5cd33bef
Loading
Loading
Loading
Loading
+33 −7
Original line number Diff line number Diff line
@@ -25,7 +25,7 @@ src/tdoc_crawler/
│   ├── operations/     # Spec operations (crawl, checkout, normalize)
│   └── sources/        # Spec data sources (3gpp, whatthespec)
├── clients/            # External API clients (Portal)
├── parsers/            # HTML/data parsers (portal, meetings, directory)
├── parsers/            # HTML/data parsers (portal, meetings)
├── workers/            # Parallel processing workers
├── database/           # Database layer (base, connection)
├── models/             # Shared data models
@@ -40,7 +40,7 @@ src/tdoc_crawler/

```python
# TDoc operations
from tdoc_crawler.tdocs import TDocCrawler, HybridTDocCrawler
from tdoc_crawler.tdocs import TDocCrawler
from tdoc_crawler.tdocs.operations.fetch import fetch_missing_tdocs
from tdoc_crawler.tdocs.operations.checkout import checkout_tdoc
from tdoc_crawler.tdocs.sources.whatthespec import resolve_via_whatthespec
@@ -381,12 +381,38 @@ The project maintains a modular documentation structure:
- `docs/index.md` and related referenced files **MUST** always be up to date and reflect the current state of ALL commands.
- When adding or modifying commands, **BOTH** the history file AND the relevant documentation files must be updated.

## Data Source Guidelines
## TDoc Data Sources

- **WhatTheSpec (whatthespec.net)** is the primary unauthenticated community source for metadata.
- **3GPP Portal (EOL)** is the official authenticated fallback source.
- Credentials (EOL) are only needed for authoritative 3GPP-official data or when WhatTheSpec is unavailable.
- For most users, **WhatTheSpec** is sufficient and preferred as it requires no login.
The project uses **three distinct mechanisms** for fetching TDoc metadata — each suited to a different use case. Do NOT add new crawl mechanisms without understanding why these three exist.

| Source | Module | Auth | Batch | Single | Use Case |
|--------|--------|:----:|:-----:|:------:|----------|
| Excel DocList | `tdocs/sources/doclist.py` | No | ✓ | ✗ | Batch crawl all TDocs per meeting (`crawl-tdocs` command) |
| WhatTheSpec API | `tdocs/sources/whatthespec.py` | No | ✗ | ✓ | Single/few TDoc lookups (`query`, `open` commands) |
| 3GPP Portal | `tdocs/sources/portal.py` | Yes (EOL) | ✗ | ✓ | Authenticated fallback when WhatTheSpec unavailable |

### Excel Document List (batch crawl, no auth)

- **Primary method for `crawl-tdocs`** — downloads the per-meeting Excel spreadsheet from 3GPP FTP and parses it
- Best for batch-crawling all TDocs per meeting, but cannot resolve a single TDoc without knowing its meeting
- Implemented in `TDocCrawler` (`tdocs/operations/crawl.py`) which delegates to `fetch_meeting_document_list_subinterpreter()` in the workers module

### WhatTheSpec API (single/few TDocs, no auth)

- Community-maintained API at `whatthespec.net` — most flexible for individual TDoc lookups
- Primary source for `query` and `open` commands
- No authentication required
- Preferred over Portal for most single-TDoc queries

### 3GPP Portal Authentication (fallback, requires EOL credentials)

- Official authenticated source via 3GPP EOL portal
- Should only be used as a fallback when WhatTheSpec is unavailable or when explicitly requested
- Credentials are only needed for authoritative 3GPP-official data

### Historical Note

A fourth mechanism (FTP/HTTP directory crawling via `parsers/directory.py`) was removed because it only produced placeholder metadata with `title='Pending validation'` — no actual TDoc content was extracted. The Excel document list method fully supersedes it for batch crawling.

## AGENTS.md File Design Guidelines

scripts/check.py

0 → 100644
+4 −0
Original line number Diff line number Diff line
"""Quick debugging helper: list the tables in the crawler's SQLite cache."""

import sqlite3
from contextlib import closing

# Default crawler database location, relative to the repository root.
DEFAULT_DB_PATH = "scripts/cache/tdoc_crawler.db"


def list_tables(db_path: str = DEFAULT_DB_PATH) -> list[tuple[str]]:
    """Return the table names of the SQLite database at *db_path*.

    Args:
        db_path: Filesystem path to the SQLite database file.

    Returns:
        Rows from ``sqlite_master``, each a 1-tuple with the table name.
    """
    # closing() guarantees the connection is released even if the query raises;
    # the original left the connection open.
    with closing(sqlite3.connect(db_path)) as conn:
        # Single quotes: 'table' must be a string literal. Double quotes are
        # treated as an identifier first by SQLite and only fall back to a
        # string literal, which is fragile and non-standard SQL.
        return conn.execute(
            "SELECT name FROM sqlite_master WHERE type='table'"
        ).fetchall()


if __name__ == "__main__":
    print(list_tables())
+35 −73
Original line number Diff line number Diff line
import gc
import shutil
import tempfile
import time
from pathlib import Path

import typer
from typer.testing import CliRunner

from tdoc_crawler.cli import app
from tdoc_crawler.logging import get_console, get_logger
from tdoc_crawler.cli.crawl import crawl_meetings, crawl_tdocs
from tdoc_crawler.cli.query import query_tdocs
from tdoc_crawler.logging import get_console

this_dir = Path(__file__).parent
logger = get_logger(__name__)

# Example data
TDOC1 = "S4-260001"  # docx
@@ -18,90 +15,55 @@ TDOC2 = "S4-260002" # xlsx
TDOC3 = "S4-260003"  # pptx
TDOCS = [TDOC1, TDOC2, TDOC3]

MEETING1 = "SA#123"
MEETING2 = "SA#124"
WORKING_GROUP1 = "RAN#1"
WORKING_GROUP2 = "SA#4"

DATE1 = "2024-01-01"
DATE2 = "2024-02-01"

SPEC1 = "26.130"
SPEC2 = "26.131"
SPEC3 = "26.132"
SPECS = [SPEC1, SPEC2, SPEC3]

runner = CliRunner()
console = get_console()
cache_dir = this_dir / "cache"  # Default cache dir if not using temp dir
common_args = ["--cache-dir", cache_dir, "-v", "debug"]  #

common_args = {
    "cache_dir": cache_dir,
    "verbosity": "debug",
}


def clean_cache() -> None:
    """Delete the shared cache directory so each demo run starts fresh."""
    # ignore_errors: the directory may not exist yet on a first run.
    shutil.rmtree(path=cache_dir, ignore_errors=True)
    # Brief pause so Windows can release file handles before the next step.
    time.sleep(0.5)


def run_command(command: str, args: list[str]) -> None:
    """Invoke a CLI command through the Typer test runner and echo its output."""
    logger.info(f"Running command: {command} with args: {args}")
    result = runner.invoke(app, [command, *args])
    typer.echo(result.output)


def demo_tdocs() -> None:
def demo_tdocs_crawl() -> None:
    """Demo workflow: crawl meetings -> crawl tdocs -> query tdocs."""
    # start with cleaned cache
    clean_cache()

    # 1. Test checkout command (no metadata crawling if not explicitly requested)
    run_command("checkout", TDOCS + common_args)

    # 2. Simply open documents (no metadata crawling)
    for tdoc in TDOCS:
        run_command("open", [tdoc] + common_args)

    # 3. Crawl Meetings run_command("crawl-meetings", TDOCS + common_args)
    # 1. Crawl meetings (last 10 SA4 meetings with files)
    console.print("[cyan]Step 1: Crawling meetings...[/cyan]")
    crawl_meetings(
        subgroup=["SA4"],
        limit_meetings=10,
        cache_dir=cache_dir,
        verbosity="debug",
    )

    # # 4. Crawl TDocs run_command("crawl-tdocs", TDOCS + common_args) # 5. Query TDocs run_command("query-tdocs", ["--tdoc-ids"] + TDOCS + common_args) # 6. Crawl Specs # 7. Query Specs
    # Force garbage collection and delay to release SQLite locks
    gc.collect()
    time.sleep(2)

    # 8. Crawl spec metadata
    # 2. Crawl TDocs (all meetings in database)
    console.print("[cyan]Step 2: Crawling TDocs...[/cyan]")
    crawl_tdocs(workers=1, no_progress=True, **common_args)

    # 9. Query spec metadata
    # Force garbage collection and delay to release SQLite locks
    gc.collect()
    time.sleep(2)


def demo_specs() -> None:
    # checkout specs
    res = runner.invoke(app, ["checkout-spec"] + SPECS + common_args)
    # logger.info(res.output)

    # Simply open specs (no metadata crawling)
    for spec in SPECS:
        logger.info(f"Testing with spec {spec}...")
        res = runner.invoke(app, ["open-spec", spec] + common_args)
        typer.echo(res.output)

        break

    # 2. Test checkout command (no metadata crawling if not explicitly requested)
    res = runner.invoke(app, ["checkout"] + TDOCS + common_args)
    # logger.info(res.output)

    # 3. Crawl Meetings
    res = runner.invoke(app, ["crawl-meetings"] + TDOCS + common_args)
    # logger.info(res.output)

    # 4. Crawl TDocs
    # 5. Query TDocs
    # 3. Query TDocs
    console.print("[cyan]Step 3: Querying TDocs...[/cyan]")
    query_tdocs(tdoc_ids=[TDOC1], **common_args)


def main() -> None:
    demo_tdocs_crawl()

    # tmp_dir_args = {"suffix": "tdoc", "dir": this_dir, "delete": True}  # Set to False to inspect cache contents after run
    #   # Clean up any existing cache dir before run

    # with tempfile.TemporaryDirectory(**tmp_dir_args) as cache_dir:

    # with tempfile.TemporaryDirectory(**tmp_dir_args) as cache_dir:
    # common_args = ["--cache-dir", cache_dir, "-v", "debug"]
    demo_tdocs()
    # demo_specs()

if __name__ == "__main__":
    main()
+3 −5
Original line number Diff line number Diff line
@@ -82,9 +82,9 @@ def crawl_tdocs(
    http_cache_enabled: HttpCacheOption = None,
    verbosity: VerbosityOption = str(DEFAULT_VERBOSITY),
) -> None:
    """Crawl TDocs from 3GPP FTP directories.
    """Crawl TDoc metadata from 3GPP meeting document lists (Excel).

    No credentials needed, crawl-tdocs always resolves meetings first -> parse Excel files that includes metadata.
    No credentials needed. Resolves meetings first, then downloads Excel document lists with full metadata.
    """
    set_verbosity(verbosity)

@@ -109,11 +109,9 @@ def crawl_tdocs(
        workers=workers,
        overall_timeout=overall_timeout,
        timeout=timeout,
        max_retries=max_retries,
        limits=limits,
        target_ids=None,
        use_document_list=True,
        allow_parallel_fallback=True,
        use_parallel_crawling=False,
        http_cache=http_cache,
    )

+0 −164
Original line number Diff line number Diff line
"""Directory HTML parsing utilities for TDoc discovery."""

from __future__ import annotations

import json
import re
from datetime import UTC, datetime
from urllib.parse import urljoin

from bs4 import BeautifulSoup

from tdoc_crawler.constants.patterns import EXCLUDED_DIRS_NORMALIZED, TDOC_SUBDIRS_NORMALIZED
from tdoc_crawler.logging import get_logger

logger = get_logger(__name__)


# Bytes-per-unit for each size prefix; the empty prefix (plain "B") is 1 byte.
_SIZE_MULTIPLIERS = {"": 1, "K": 1024, "M": 1024**2, "G": 1024**3}

# Compiled once at module level; accepts integer and decimal magnitudes.
_SIZE_PATTERN = re.compile(r"(\d+(?:\.\d+)?)\s*([KMG]?)B", re.IGNORECASE)


def parse_file_size(text_content: str) -> int | None:
    """Extract file size in bytes from text fragments.

    Handles both integer and decimal magnitudes (e.g. "123 KB", "4.5 MB").
    The previous integer-only pattern mis-parsed "4.5 MB" as "5 MB".

    Args:
        text_content: Text containing file size (e.g., "123 KB", "4.5 MB")

    Returns:
        File size in bytes (fractions rounded down), or None if not found
    """
    size_match = _SIZE_PATTERN.search(text_content)
    if not size_match:
        return None

    try:
        magnitude = float(size_match.group(1))
    except ValueError:  # pattern guarantees a valid number; kept for safety
        return None

    unit = size_match.group(2).upper()
    return int(magnitude * _SIZE_MULTIPLIERS[unit])


def extract_subdirectories(base_url: str, html: str) -> list[str]:
    """Detect potential TDoc subdirectories (Docs/, Documents/, etc.).

    Args:
        base_url: Base URL of the directory listing
        html: HTML content of the directory page

    Returns:
        List of subdirectory URLs
    """
    found: list[str] = []
    for anchor in BeautifulSoup(html, "html.parser").find_all("a"):
        target = anchor.get("href")
        # Skip missing/non-string hrefs, sort-order links, and parent-dir links.
        if not target or not isinstance(target, str):
            continue
        if target.startswith(("?", "..")):
            continue

        # Keep only links whose final path component is a known TDoc subdir.
        last_component = target.rstrip("/").split("/")[-1]
        if last_component.upper() not in TDOC_SUBDIRS_NORMALIZED:
            continue

        # Resolve relative links against the listing URL and normalize to a
        # trailing slash so callers can append filenames directly.
        url = target if target.startswith("http") else urljoin(base_url, target)
        found.append(url if url.endswith("/") else url + "/")
    return found


def collect_tdocs_from_html(
    html: str,
    directory_url: str,
    meeting_id: int,
    meeting_short_name: str,
    pattern: re.Pattern[str],
    target_ids: set[str] | None,
    seen_ids: set[str],
) -> list[str]:
    """Parse a directory HTML listing for TDoc links.

    Produces placeholder metadata records only (title='Pending validation');
    no document content is inspected. Duplicate IDs across calls are
    suppressed via the shared ``seen_ids`` set.

    Args:
        html: HTML content of the directory page
        directory_url: URL of the directory being parsed
        meeting_id: Meeting identifier
        meeting_short_name: Short name of the meeting (e.g., "RAN1#98")
        pattern: Compiled regex pattern for matching TDoc IDs
        target_ids: Optional set of target TDoc IDs to filter
        seen_ids: Set tracking already-seen TDoc IDs (modified in place)

    Returns:
        List of JSON-serialized TDocMetadata records
    """
    soup = BeautifulSoup(html, "html.parser")
    serialized: list[str] = []

    for link in soup.find_all("a"):
        href_value = link.get("href")
        # bs4 may return None or a list for multi-valued attributes; accept strings only.
        if not href_value or not isinstance(href_value, str):
            continue

        href = href_value.strip()
        # Skip self/parent navigation links.
        if href in ("../", "./", "..", "."):
            continue
        # Skip links into directories known to contain no TDocs.
        if href.rstrip("/").split("/")[-1].upper() in EXCLUDED_DIRS_NORMALIZED:
            continue

        match = pattern.search(href)
        if not match:
            continue

        # Normalize to upper case so dedup/filtering is case-insensitive.
        tdoc_id = match.group(1).upper()
        if tdoc_id in seen_ids:
            continue
        # When a target set is given, keep only requested IDs.
        if target_ids is not None and tdoc_id not in target_ids:
            continue

        # Resolve relative hrefs against the directory URL.
        file_url = href if href.startswith(("http://", "https://")) else urljoin(directory_url, href)

        # Best-effort size extraction from the text surrounding the link
        # (directory listings usually place the size next to the anchor).
        file_size = None
        parent = link.parent
        if parent:
            text_content = parent.get_text(" ")
            file_size = parse_file_size(text_content)

        now = datetime.now(UTC)
        # Placeholder record: most fields are stubs pending later validation.
        # Key order is preserved by json.dumps; keep it stable.
        payload = {
            "tdoc_id": tdoc_id,
            "meeting_id": meeting_id,
            "title": "Pending validation",
            "url": file_url,
            "source": "Unknown",
            "contact": "Unknown",
            "tdoc_type": "unknown",
            "for_purpose": "unknown",
            "agenda_item_nbr": "0.0",
            "agenda_item_text": "Unknown",
            "status": None,
            "meeting_name": meeting_short_name or None,
            "is_revision_of": None,
            "file_size": file_size,
            "date_created": None,
            # ISO-8601 with trailing "Z" instead of "+00:00".
            "date_retrieved": now.isoformat().replace("+00:00", "Z"),
            "date_updated": now.isoformat().replace("+00:00", "Z"),
            "validated": False,
            "validation_failed": False,
        }
        serialized.append(json.dumps(payload))
        seen_ids.add(tdoc_id)

    return serialized


# Public API of this module: directory-listing parsing helpers.
__all__ = [
    "collect_tdocs_from_html",
    "extract_subdirectories",
    "parse_file_size",
]
Loading