Commit 593b7001 authored by Jan Reimes's avatar Jan Reimes
Browse files

refactor(crawl): streamline TDoc crawling process and remove hybrid crawler

* Remove HybridTDocCrawler and its associated methods.
* Consolidate crawling logic into TDocCrawler for clarity.
* Update meeting fetching and processing to use document list method.
* Simplify error handling and logging for document list fetching.
* Adjust tests to reflect changes in crawling logic and remove hybrid scenarios.
* Ensure compatibility with existing database operations and configurations.
parent 5cd33bef
Loading
Loading
Loading
Loading
+33 −7
Original line number Diff line number Diff line
@@ -25,7 +25,7 @@ src/tdoc_crawler/
│   ├── operations/     # Spec operations (crawl, checkout, normalize)
│   └── sources/        # Spec data sources (3gpp, whatthespec)
├── clients/            # External API clients (Portal)
├── parsers/            # HTML/data parsers (portal, meetings, directory)
├── parsers/            # HTML/data parsers (portal, meetings)
├── workers/            # Parallel processing workers
├── database/           # Database layer (base, connection)
├── models/             # Shared data models
@@ -40,7 +40,7 @@ src/tdoc_crawler/

```python
# TDoc operations
from tdoc_crawler.tdocs import TDocCrawler, HybridTDocCrawler
from tdoc_crawler.tdocs import TDocCrawler
from tdoc_crawler.tdocs.operations.fetch import fetch_missing_tdocs
from tdoc_crawler.tdocs.operations.checkout import checkout_tdoc
from tdoc_crawler.tdocs.sources.whatthespec import resolve_via_whatthespec
@@ -381,12 +381,38 @@ The project maintains a modular documentation structure:
- `docs/index.md` and related referenced files **MUST** always be up to date and reflect the current state of ALL commands.
- When adding or modifying commands, **BOTH** the history file AND the relevant documentation files must be updated.

## Data Source Guidelines
## TDoc Data Sources

- **WhatTheSpec (whatthespec.net)** is the primary unauthenticated community source for metadata.
- **3GPP Portal (EOL)** is the official authenticated fallback source.
- Credentials (EOL) are only needed for authoritative 3GPP-official data or when WhatTheSpec is unavailable.
- For most users, **WhatTheSpec** is sufficient and preferred as it requires no login.
The project uses **three distinct mechanisms** for fetching TDoc metadata — each suited to a different use case. Do NOT add new crawl mechanisms without understanding why these three exist.

| Source | Module | Auth | Batch | Single | Use Case |
|--------|--------|:----:|:-----:|:------:|----------|
| Excel DocList | `tdocs/sources/doclist.py` | No | ✓ | ✗ | Batch crawl all TDocs per meeting (`crawl-tdocs` command) |
| WhatTheSpec API | `tdocs/sources/whatthespec.py` | No | ✗ | ✓ | Single/few TDoc lookups (`query`, `open` commands) |
| 3GPP Portal | `tdocs/sources/portal.py` | Yes (EOL) | ✗ | ✓ | Authenticated fallback when WhatTheSpec unavailable |

### Excel Document List (batch crawl, no auth)

- **Primary method for `crawl-tdocs`** — downloads the per-meeting Excel spreadsheet from 3GPP FTP and parses it
- Best for batch-crawling all TDocs per meeting, but cannot resolve a single TDoc without knowing its meeting
- Implemented in `TDocCrawler` (`tdocs/operations/crawl.py`) which delegates to `fetch_meeting_document_list_subinterpreter()` in the workers module

### WhatTheSpec API (single/few TDocs, no auth)

- Community-maintained API at `whatthespec.net` — most flexible for individual TDoc lookups
- Primary source for `query` and `open` commands
- No authentication required
- Preferred over Portal for most single-TDoc queries

### 3GPP Portal Authentication (fallback, requires EOL credentials)

- Official authenticated source via 3GPP EOL portal
- Should only be used as a fallback when WhatTheSpec is unavailable or when explicitly requested
- Credentials are only needed for authoritative 3GPP-official data

### Historical Note

A fourth mechanism (FTP/HTTP directory crawling via `parsers/directory.py`) was removed because it only produced placeholder metadata with `title='Pending validation'` — no actual TDoc content was extracted. The Excel document list method fully supersedes it for batch crawling.

## AGENTS.md File Design Guidelines

scripts/check.py

0 → 100644
+4 −0
Original line number Diff line number Diff line
"""Quick debugging helper: list the tables in the crawler's SQLite cache."""

import sqlite3
from contextlib import closing

# Default crawler database location, relative to the repository root.
DEFAULT_DB_PATH = "scripts/cache/tdoc_crawler.db"


def list_tables(db_path: str = DEFAULT_DB_PATH) -> list[tuple[str]]:
    """Return the table names of the SQLite database at *db_path*.

    Args:
        db_path: Filesystem path to the SQLite database file.

    Returns:
        Rows from ``sqlite_master``, each a 1-tuple with the table name.
    """
    # closing() guarantees the connection is released even if the query raises;
    # the original left the connection open.
    with closing(sqlite3.connect(db_path)) as conn:
        # Single quotes: 'table' must be a string literal. Double quotes are
        # treated as an identifier first by SQLite and only fall back to a
        # string literal, which is fragile and non-standard SQL.
        return conn.execute(
            "SELECT name FROM sqlite_master WHERE type='table'"
        ).fetchall()


if __name__ == "__main__":
    print(list_tables())
+35 −73
Original line number Diff line number Diff line
import gc
import shutil
import tempfile
import time
from pathlib import Path

import typer
from typer.testing import CliRunner

from tdoc_crawler.cli import app
from tdoc_crawler.logging import get_console, get_logger
from tdoc_crawler.cli.crawl import crawl_meetings, crawl_tdocs
from tdoc_crawler.cli.query import query_tdocs
from tdoc_crawler.logging import get_console

this_dir = Path(__file__).parent
logger = get_logger(__name__)

# Example data
TDOC1 = "S4-260001"  # docx
@@ -18,90 +15,55 @@ TDOC2 = "S4-260002" # xlsx
TDOC3 = "S4-260003"  # pptx
TDOCS = [TDOC1, TDOC2, TDOC3]

MEETING1 = "SA#123"
MEETING2 = "SA#124"
WORKING_GROUP1 = "RAN#1"
WORKING_GROUP2 = "SA#4"

DATE1 = "2024-01-01"
DATE2 = "2024-02-01"

SPEC1 = "26.130"
SPEC2 = "26.131"
SPEC3 = "26.132"
SPECS = [SPEC1, SPEC2, SPEC3]

runner = CliRunner()
console = get_console()
cache_dir = this_dir / "cache"  # Default cache dir if not using temp dir
common_args = ["--cache-dir", cache_dir, "-v", "debug"]  #

common_args = {
    "cache_dir": cache_dir,
    "verbosity": "debug",
}


def clean_cache() -> None:
    """Delete the shared cache directory so each demo run starts fresh."""
    # ignore_errors: the directory may not exist yet on a first run.
    shutil.rmtree(path=cache_dir, ignore_errors=True)
    # Brief pause so Windows can release file handles before the next step.
    time.sleep(0.5)


def run_command(command: str, args: list[str]) -> None:
    """Invoke a CLI command through the Typer test runner and echo its output."""
    logger.info(f"Running command: {command} with args: {args}")
    result = runner.invoke(app, [command, *args])
    typer.echo(result.output)


def demo_tdocs() -> None:
def demo_tdocs_crawl() -> None:
    """Demo workflow: crawl meetings -> crawl tdocs -> query tdocs."""
    # start with cleaned cache
    clean_cache()

    # 1. Test checkout command (no metadata crawling if not explicitly requested)
    run_command("checkout", TDOCS + common_args)

    # 2. Simply open documents (no metadata crawling)
    for tdoc in TDOCS:
        run_command("open", [tdoc] + common_args)

    # 3. Crawl Meetings run_command("crawl-meetings", TDOCS + common_args)
    # 1. Crawl meetings (last 10 SA4 meetings with files)
    console.print("[cyan]Step 1: Crawling meetings...[/cyan]")
    crawl_meetings(
        subgroup=["SA4"],
        limit_meetings=10,
        cache_dir=cache_dir,
        verbosity="debug",
    )

    # # 4. Crawl TDocs run_command("crawl-tdocs", TDOCS + common_args) # 5. Query TDocs run_command("query-tdocs", ["--tdoc-ids"] + TDOCS + common_args) # 6. Crawl Specs # 7. Query Specs
    # Force garbage collection and delay to release SQLite locks
    gc.collect()
    time.sleep(2)

    # 8. Crawl spec metadata
    # 2. Crawl TDocs (all meetings in database)
    console.print("[cyan]Step 2: Crawling TDocs...[/cyan]")
    crawl_tdocs(workers=1, no_progress=True, **common_args)

    # 9. Query spec metadata
    # Force garbage collection and delay to release SQLite locks
    gc.collect()
    time.sleep(2)


def demo_specs() -> None:
    # checkout specs
    res = runner.invoke(app, ["checkout-spec"] + SPECS + common_args)
    # logger.info(res.output)

    # Simply open specs (no metadata crawling)
    for spec in SPECS:
        logger.info(f"Testing with spec {spec}...")
        res = runner.invoke(app, ["open-spec", spec] + common_args)
        typer.echo(res.output)

        break

    # 2. Test checkout command (no metadata crawling if not explicitly requested)
    res = runner.invoke(app, ["checkout"] + TDOCS + common_args)
    # logger.info(res.output)

    # 3. Crawl Meetings
    res = runner.invoke(app, ["crawl-meetings"] + TDOCS + common_args)
    # logger.info(res.output)

    # 4. Crawl TDocs
    # 5. Query TDocs
    # 3. Query TDocs
    console.print("[cyan]Step 3: Querying TDocs...[/cyan]")
    query_tdocs(tdoc_ids=[TDOC1], **common_args)


def main() -> None:
    demo_tdocs_crawl()

    # tmp_dir_args = {"suffix": "tdoc", "dir": this_dir, "delete": True}  # Set to False to inspect cache contents after run
    #   # Clean up any existing cache dir before run

    # with tempfile.TemporaryDirectory(**tmp_dir_args) as cache_dir:

    # with tempfile.TemporaryDirectory(**tmp_dir_args) as cache_dir:
    # common_args = ["--cache-dir", cache_dir, "-v", "debug"]
    demo_tdocs()
    # demo_specs()

if __name__ == "__main__":
    main()
+3 −5
Original line number Diff line number Diff line
@@ -82,9 +82,9 @@ def crawl_tdocs(
    http_cache_enabled: HttpCacheOption = None,
    verbosity: VerbosityOption = str(DEFAULT_VERBOSITY),
) -> None:
    """Crawl TDocs from 3GPP FTP directories.
    """Crawl TDoc metadata from 3GPP meeting document lists (Excel).

    No credentials needed, crawl-tdocs always resolves meetings first -> parse Excel files that includes metadata.
    No credentials needed. Resolves meetings first, then downloads Excel document lists with full metadata.
    """
    set_verbosity(verbosity)

@@ -109,11 +109,9 @@ def crawl_tdocs(
        workers=workers,
        overall_timeout=overall_timeout,
        timeout=timeout,
        max_retries=max_retries,
        limits=limits,
        target_ids=None,
        use_document_list=True,
        allow_parallel_fallback=True,
        use_parallel_crawling=False,
        http_cache=http_cache,
    )

+0 −164
Original line number Diff line number Diff line
"""Directory HTML parsing utilities for TDoc discovery."""

from __future__ import annotations

import json
import re
from datetime import UTC, datetime
from urllib.parse import urljoin

from bs4 import BeautifulSoup

from tdoc_crawler.constants.patterns import EXCLUDED_DIRS_NORMALIZED, TDOC_SUBDIRS_NORMALIZED
from tdoc_crawler.logging import get_logger

logger = get_logger(__name__)


# Bytes-per-unit for each size prefix; the empty prefix (plain "B") is 1 byte.
_SIZE_MULTIPLIERS = {"": 1, "K": 1024, "M": 1024**2, "G": 1024**3}

# Compiled once at module level; accepts integer and decimal magnitudes.
_SIZE_PATTERN = re.compile(r"(\d+(?:\.\d+)?)\s*([KMG]?)B", re.IGNORECASE)


def parse_file_size(text_content: str) -> int | None:
    """Extract file size in bytes from text fragments.

    Handles both integer and decimal magnitudes (e.g. "123 KB", "4.5 MB").
    The previous integer-only pattern mis-parsed "4.5 MB" as "5 MB".

    Args:
        text_content: Text containing file size (e.g., "123 KB", "4.5 MB")

    Returns:
        File size in bytes (fractions rounded down), or None if not found
    """
    size_match = _SIZE_PATTERN.search(text_content)
    if not size_match:
        return None

    try:
        magnitude = float(size_match.group(1))
    except ValueError:  # pattern guarantees a valid number; kept for safety
        return None

    unit = size_match.group(2).upper()
    return int(magnitude * _SIZE_MULTIPLIERS[unit])


def extract_subdirectories(base_url: str, html: str) -> list[str]:
    """Detect potential TDoc subdirectories (Docs/, Documents/, etc.).

    Args:
        base_url: Base URL of the directory listing
        html: HTML content of the directory page

    Returns:
        List of subdirectory URLs
    """
    found: list[str] = []
    for anchor in BeautifulSoup(html, "html.parser").find_all("a"):
        target = anchor.get("href")
        # Skip missing/non-string hrefs, sort-order links, and parent-dir links.
        if not target or not isinstance(target, str):
            continue
        if target.startswith(("?", "..")):
            continue

        # Keep only links whose final path component is a known TDoc subdir.
        last_component = target.rstrip("/").split("/")[-1]
        if last_component.upper() not in TDOC_SUBDIRS_NORMALIZED:
            continue

        # Resolve relative links against the listing URL and normalize to a
        # trailing slash so callers can append filenames directly.
        url = target if target.startswith("http") else urljoin(base_url, target)
        found.append(url if url.endswith("/") else url + "/")
    return found


def collect_tdocs_from_html(
    html: str,
    directory_url: str,
    meeting_id: int,
    meeting_short_name: str,
    pattern: re.Pattern[str],
    target_ids: set[str] | None,
    seen_ids: set[str],
) -> list[str]:
    """Parse a directory HTML listing for TDoc links.

    Produces placeholder metadata records only (title='Pending validation');
    no document content is inspected. Duplicate IDs across calls are
    suppressed via the shared ``seen_ids`` set.

    Args:
        html: HTML content of the directory page
        directory_url: URL of the directory being parsed
        meeting_id: Meeting identifier
        meeting_short_name: Short name of the meeting (e.g., "RAN1#98")
        pattern: Compiled regex pattern for matching TDoc IDs
        target_ids: Optional set of target TDoc IDs to filter
        seen_ids: Set tracking already-seen TDoc IDs (modified in place)

    Returns:
        List of JSON-serialized TDocMetadata records
    """
    soup = BeautifulSoup(html, "html.parser")
    serialized: list[str] = []

    for link in soup.find_all("a"):
        href_value = link.get("href")
        # bs4 may return None or a list for multi-valued attributes; accept strings only.
        if not href_value or not isinstance(href_value, str):
            continue

        href = href_value.strip()
        # Skip self/parent navigation links.
        if href in ("../", "./", "..", "."):
            continue
        # Skip links into directories known to contain no TDocs.
        if href.rstrip("/").split("/")[-1].upper() in EXCLUDED_DIRS_NORMALIZED:
            continue

        match = pattern.search(href)
        if not match:
            continue

        # Normalize to upper case so dedup/filtering is case-insensitive.
        tdoc_id = match.group(1).upper()
        if tdoc_id in seen_ids:
            continue
        # When a target set is given, keep only requested IDs.
        if target_ids is not None and tdoc_id not in target_ids:
            continue

        # Resolve relative hrefs against the directory URL.
        file_url = href if href.startswith(("http://", "https://")) else urljoin(directory_url, href)

        # Best-effort size extraction from the text surrounding the link
        # (directory listings usually place the size next to the anchor).
        file_size = None
        parent = link.parent
        if parent:
            text_content = parent.get_text(" ")
            file_size = parse_file_size(text_content)

        now = datetime.now(UTC)
        # Placeholder record: most fields are stubs pending later validation.
        # Key order is preserved by json.dumps; keep it stable.
        payload = {
            "tdoc_id": tdoc_id,
            "meeting_id": meeting_id,
            "title": "Pending validation",
            "url": file_url,
            "source": "Unknown",
            "contact": "Unknown",
            "tdoc_type": "unknown",
            "for_purpose": "unknown",
            "agenda_item_nbr": "0.0",
            "agenda_item_text": "Unknown",
            "status": None,
            "meeting_name": meeting_short_name or None,
            "is_revision_of": None,
            "file_size": file_size,
            "date_created": None,
            # ISO-8601 with trailing "Z" instead of "+00:00".
            "date_retrieved": now.isoformat().replace("+00:00", "Z"),
            "date_updated": now.isoformat().replace("+00:00", "Z"),
            "validated": False,
            "validation_failed": False,
        }
        serialized.append(json.dumps(payload))
        seen_ids.add(tdoc_id)

    return serialized


# Public API of this module: directory-listing parsing helpers.
__all__ = [
    "collect_tdocs_from_html",
    "extract_subdirectories",
    "parse_file_size",
]
Loading