feat(models): Introduce subworking group metadata and enhance TDoc metadata (1b8451e0) · Commits · Jan Reimes / 3gpp-crawler

docs/history/2025-10-21_PLAN_TDOC_CRAWLER_REFACTORING.md

0 → 100644

+447 −0

Original line number	Diff line number	Diff line
		# TDoc Crawler Refactoring Plan

		Date: 2025-10-21
		Type: Major Refactoring

		## Problem Statement

		The current TDoc crawler has fundamental design flaws:
		1. Parses FTP directory listings expecting 9 tokens (HTML table format) when FTP LIST returns 4-5 tokens
		2. Uses brute-force FTP crawling instead of meeting-driven approach
		3. No portal validation of TDoc candidates
		4. No parallel processing support
		5. No caching of negative results

		## New Architecture

		### Core Principle: Meeting-Driven TDoc Crawling

		TDoc crawling must depend on meeting metadata. The workflow is:

		```
		1. User runs `crawl-meetings` → Populates meetings table
		2. User runs `crawl` → Queries meetings table → Crawls each meeting's files_url
		3. For each candidate file → Validate via portal → Store/cache result
		```

		## Changes Required

		### 1. Models (`src/tdoc_crawler/models.py`)

		Updated:
		- ✅ `TDocMetadata`: Added portal fields (title, contact, tdoc_type, for_purpose, agenda_item, status, is_revision_of, validated, validation_failed)
		- ✅ `TDocRecord`: Updated to match TDocMetadata fields
		- ✅ `TDocCrawlConfig`: Added subgroups, meeting_ids, start_date, end_date, force_revalidate, workers, credentials

		### 2. Database Schema (`src/tdoc_crawler/database.py`)

		TODO: Update tdocs table schema:
		```sql
		CREATE TABLE IF NOT EXISTS tdocs (
		tdoc_id TEXT PRIMARY KEY,
		url TEXT NOT NULL,
		working_group TEXT NOT NULL,
		subgroup TEXT,
		meeting TEXT,
		meeting_id INTEGER,
		file_size INTEGER,

		-- Portal metadata
		title TEXT,
		contact TEXT,
		tdoc_type TEXT,
		for_purpose TEXT,
		agenda_item TEXT,
		status TEXT,
		is_revision_of TEXT,

		-- Legacy/system fields
		document_type TEXT,
		checksum TEXT,
		source_path TEXT,
		date_created TIMESTAMP,
		date_retrieved TIMESTAMP NOT NULL,
		date_updated TIMESTAMP NOT NULL,
		validated BOOLEAN DEFAULT 0,
		validation_failed BOOLEAN DEFAULT 0,

		FOREIGN KEY (meeting_id) REFERENCES meetings(meeting_id)
		)
		```

		TODO: Add methods:
		- `get_processed_meetings(working_groups, subgroups) -> set[int]`: Returns meeting IDs already processed
		- `cache_invalid_tdoc(tdoc_id)`: Cache negative validation result
		- `get_cached_invalid_tdocs() -> set[str]`: Get list of known invalid TDocs

		### 3. Portal Parsing (`src/tdoc_crawler/portal.py`) - NEW FILE

		TODO: Create new module for portal interactions:

		```python
		def fetch_tdoc_metadata(
		tdoc_id: str,
		credentials: PortalCredentials \| None = None,
		timeout: int = 30
		) -> dict \| None:
		"""Fetch and parse TDoc metadata from 3GPP portal.

		Args:
		tdoc_id: TDoc identifier (e.g., 'S4-251364')
		credentials: Optional portal credentials
		timeout: Request timeout in seconds

		Returns:
		Dictionary with parsed fields or None if validation fails
		"""
		url = f"https://portal.3gpp.org/ngppapp/CreateTdoc.Aspx?mode=view&contributionUid={tdoc_id}"

		# Handle login redirect if needed
		response = requests.get(url, timeout=timeout, allow_redirects=True)

		if "login.aspx" in response.url.lower():
		if credentials is None:
		logger.warning(f"Portal requires authentication for {tdoc_id}")
		return None
		# TODO: Implement login flow

		if response.status_code != 200:
		return None

		return parse_tdoc_portal_page(response.text)


		def parse_tdoc_portal_page(html: str) -> dict \| None:
		"""Parse TDoc metadata from portal HTML.

		Expected fields (from table/form structure):
		- Meeting (required)
		- Title (required)
		- Contact (required)
		- TDoc type (required)
		- For (required)
		- Agenda item (required)
		- Status (required)
		- Is revision of (optional)
		"""
		soup = BeautifulSoup(html, "lxml")

		# TODO: Analyze HTML structure and implement parsing
		# Pattern: likely <label>Field Name</label> followed by <input> or <span>

		metadata = {}
		required_fields = ["meeting", "title", "contact", "tdoc_type", "for_purpose", "agenda_item", "status"]

		# Parse each field
		# ...

		# Validate required fields present
		if not all(metadata.get(field) for field in required_fields):
		return None

		return metadata
		```

		### 4. TDoc Crawler Refactoring (`src/tdoc_crawler/crawler.py`)

		TODO: Complete rewrite of `TDocCrawler.crawl()`:

		```python
		def crawl(self, config: TDocCrawlConfig) -> TDocCrawlResult:
		"""Execute meeting-driven TDoc crawl."""

		# Step 1: Check meetings available
		meetings = self.database.query_meetings(MeetingQueryConfig(
		working_groups=config.working_groups,
		subgroups=config.subgroups,
		# ... filter by meeting_ids, dates, etc.
		))

		if not meetings:
		raise ValueError(
		"No meetings available. Run 'crawl-meetings' first to populate meeting metadata."
		)

		# Step 2: Filter meetings based on incremental mode
		if config.incremental and not config.force_revalidate:
		processed = self.database.get_processed_meetings(config.working_groups, config.subgroups)
		meetings = [m for m in meetings if m.meeting_id not in processed]

		# Step 3: Parallel processing
		with concurrent.futures.ThreadPoolExecutor(max_workers=config.workers) as executor:
		futures = []
		for meeting in meetings:
		if not meeting.files_url:
		logger.debug(f"Meeting {meeting.short_name} has no files_url, skipping")
		continue

		future = executor.submit(self._crawl_meeting, meeting, config)
		futures.append(future)

		for future in concurrent.futures.as_completed(futures):
		try:
		result = future.result()
		# Aggregate results
		except Exception as exc:
		logger.error(f"Meeting crawl failed: {exc}")

		return TDocCrawlResult(...)


		def _crawl_meeting(self, meeting: MeetingMetadata, config: TDocCrawlConfig) -> list[TDocMetadata]:
		"""Crawl a single meeting's files_url."""

		collected = []

		try:
		# List files at files_url (FTP or HTTP)
		files = self._list_files_at_url(meeting.files_url, config.timeout)
		except Exception as exc:
		logger.warning(f"Failed to access files for meeting {meeting.short_name}: {exc}")
		return collected

		# Get existing/invalid TDocs for filtering
		existing_ids = set()
		invalid_ids = set()
		if config.incremental:
		existing_ids = self.database.get_existing_tdoc_ids([meeting.working_group])
		invalid_ids = self.database.get_cached_invalid_tdocs()

		for filename in files:
		if not filename.endswith('.zip'):
		continue

		tdoc_id = filename.removesuffix('.zip').upper()

		# Check TDoc pattern
		if not TDOC_PATTERN.match(tdoc_id):
		continue

		# Skip if already processed (unless force_revalidate)
		if config.incremental and not config.force_revalidate:
		if tdoc_id in existing_ids or tdoc_id in invalid_ids:
		continue

		# Validate via portal
		portal_metadata = fetch_tdoc_metadata(tdoc_id, config.credentials, config.timeout)

		if portal_metadata is None:
		# Cache negative result
		self.database.cache_invalid_tdoc(tdoc_id)
		logger.debug(f"TDoc {tdoc_id} failed portal validation")
		continue

		# Build metadata object
		tdoc = TDocMetadata(
		tdoc_id=tdoc_id,
		url=f"{meeting.files_url}/{filename}",
		working_group=meeting.working_group,
		subgroup=meeting.subgroup,
		meeting=meeting.short_name,
		meeting_id=meeting.meeting_id,
		validated=True,
		validation_failed=False,
		**portal_metadata # title, contact, etc.
		)

		collected.append(tdoc)

		return collected


		def _list_files_at_url(self, url: str, timeout: int) -> list[str]:
		"""List files at FTP or HTTP URL."""

		if url.startswith('ftp://'):
		return self._list_ftp_directory(url, timeout)
		elif url.startswith('http://') or url.startswith('https://'):
		return self._list_http_directory(url, timeout)
		else:
		raise ValueError(f"Unsupported URL scheme: {url}")


		def _list_ftp_directory(self, url: str, timeout: int) -> list[str]:
		"""Parse FTP directory listing for filenames.

		FTP LIST format: '09-22-25 10:31AM <DIR> TSG_RAN'
		Tokens: [date, time, size_or_dir, filename...]
		"""
		from ftplib import FTP
		from urllib.parse import urlparse

		parsed = urlparse(url)
		ftp = FTP(parsed.hostname, timeout=timeout)
		ftp.login()
		ftp.cwd(parsed.path)

		entries = []
		ftp.retrlines('LIST', entries.append)
		ftp.quit()

		files = []
		for entry in entries:
		tokens = entry.split()
		if len(tokens) < 4:
		continue

		# Skip directories
		if '<DIR>' in entry.upper() or entry.startswith('d'):
		continue

		# Filename is everything from token[3] onward
		filename = ' '.join(tokens[3:])
		files.append(filename)

		return files


		def _list_http_directory(self, url: str, timeout: int) -> list[str]:
		"""Parse HTTP directory listing (Apache style)."""
		import requests
		from bs4 import BeautifulSoup

		response = requests.get(url, timeout=timeout)
		response.raise_for_status()

		soup = BeautifulSoup(response.text, 'lxml')

		# Apache directory listings use <a> tags
		files = []
		for link in soup.find_all('a'):
		href = link.get('href', '')
		if href and not href.startswith('?') and not href.startswith('/'):
		files.append(href)

		return files
		```

		### 5. CLI Updates (`src/tdoc_crawler/cli.py`)

		TODO: Update `crawl` command:

		```python
		@app.command()
		def crawl(
		cache_dir: Path = typer.Option(...),
		working_group: list[str] \| None = typer.Option(None, "-w", "--working-group"),
		subgroup: list[str] \| None = typer.Option(None, "-s", "--sub-group"),
		meeting_ids: list[int] \| None = typer.Option(None, "--meeting-ids"),
		start_date: str \| None = typer.Option(None, "--start-date"),
		end_date: str \| None = typer.Option(None, "--end-date"),
		incremental: bool = typer.Option(True, "--incremental/--full"),
		force_revalidate: bool = typer.Option(False, "--force-revalidate"),
		workers: int = typer.Option(4, "--workers"),
		limit_tdocs: int \| None = typer.Option(None, "--limit-tdocs"),
		max_retries: int = typer.Option(3, "--max-retries"),
		timeout: int = typer.Option(30, "--timeout"),
		verbose: bool = typer.Option(False, "-v", "--verbose"),
		eol_username: str \| None = typer.Option(None, "--eol-username"),
		eol_password: str \| None = typer.Option(None, "--eol-password"),
		) -> None:
		"""Crawl TDocs from meetings (requires crawl-meetings first)."""

		# Parse dates
		start = date.fromisoformat(start_date) if start_date else None
		end = date.fromisoformat(end_date) if end_date else None

		# Build config
		config = TDocCrawlConfig(
		cache_dir=cache_dir,
		working_groups=_parse_working_groups(working_group),
		subgroups=_parse_subgroups(subgroup),
		meeting_ids=meeting_ids,
		start_date=start,
		end_date=end,
		incremental=incremental,
		force_revalidate=force_revalidate,
		workers=workers,
		limits=_build_limits(limit_tdocs, None, None, None),
		credentials=_resolve_credentials(eol_username, eol_password, True),
		max_retries=max_retries,
		timeout=timeout,
		verbose=verbose,
		)

		# Execute crawl
		with TDocDatabase(database_path) as database:
		crawler = TDocCrawler(database)
		try:
		result = crawler.crawl(config)
		except ValueError as exc:
		console.print(f"[red]{exc}[/red]")
		console.print("[yellow]Run 'crawl-meetings' first to populate meeting metadata[/yellow]")
		raise typer.Exit(code=1)

		# Display results
		console.print(f"[green]Processed {result.processed} meetings[/green]")
		console.print(f"[green]Found {result.inserted} new TDocs, updated {result.updated}[/green]")
		if result.errors:
		console.print(f"[yellow]{len(result.errors)} errors encountered[/yellow]")
		```

		### 6. Database Migration

		TODO: Create migration script to update schema:

		```python
		def migrate_tdocs_table_v2(database: TDocDatabase) -> None:
		"""Add portal metadata columns to tdocs table."""

		with database._cursor() as cursor:
		# Add new columns (safe if already exist)
		columns_to_add = [
		("title", "TEXT"),
		("contact", "TEXT"),
		("tdoc_type", "TEXT"),
		("for_purpose", "TEXT"),
		("agenda_item", "TEXT"),
		("status", "TEXT"),
		("is_revision_of", "TEXT"),
		("validated", "BOOLEAN DEFAULT 0"),
		("validation_failed", "BOOLEAN DEFAULT 0"),
		]

		for col_name, col_type in columns_to_add:
		try:
		cursor.execute(f"ALTER TABLE tdocs ADD COLUMN {col_name} {col_type}")
		except sqlite3.OperationalError:
		# Column already exists
		pass

		database._conn.commit()
		```

		### 7. Testing

		TODO: Add comprehensive tests:

		- `test_portal.py`: Portal parsing tests
		- `test_crawler_refactored.py`: New crawler workflow tests
		- `test_parallel_processing.py`: Concurrent crawling tests
		- Update existing tests for new schema

		## Implementation Order

		1. ✅ Update models (TDocMetadata, TDocRecord, TDocCrawlConfig)
		2. ⏳ Create `portal.py` module with parsing logic
		3. ⏳ Update database schema and add helper methods
		4. ⏳ Refactor `TDocCrawler` class
		5. ⏳ Update CLI command
		6. ⏳ Add migration script
		7. ⏳ Write tests
		8. ⏳ Update documentation

		## Open Questions

		1. Portal authentication: Need to test login flow with actual credentials
		2. HTML structure: Need to parse actual portal page to confirm field locations
		3. Rate limiting: Should we add delays between portal requests?
		4. Batch size: For parallel workers, should we batch meetings or process all?

		## Next Steps

		1. Test portal access with credentials to understand HTML structure
		2. Implement portal parsing module
		3. Update database schema with migration
		4. Refactor crawler class
		5. Update CLI and tests

docs/history/2025-10-21_SUMMARY_DATABASE_SCHEMA_UPDATES.md

0 → 100644

+204 −0

Original line number	Diff line number	Diff line
		# Database Schema Updates for Meeting-Driven TDoc Crawling

		Date: 2025-10-21
		Status: ✅ Completed
		Tests: All 62 tests passing

		## Summary

		Successfully updated the database schema to support the new meeting-driven TDoc crawling architecture with normalized reference tables and portal metadata fields.

		## Changes Completed

		### 1. Reference Tables Created

		#### `working_groups` Table
		- Columns: `tbid` (PK), `code`, `name`, `ftp_root`
		- Data: Populated with 3 working groups:
		- RAN (tbid=373)
		- SA (tbid=375)
		- CT (tbid=649)

		#### `subworking_groups` Table
		- Columns: `subtb` (PK), `tbid` (FK → working_groups), `code`, `name`
		- Data: Populated with 23 subgroups:
		- SA: SP, S1-S6 (7 subgroups)
		- CT: CP, C1-C6 (7 subgroups)
		- RAN: RP, R1-R6 (9 subgroups including RAN6)

		### 2. Meetings Table Updates

		New Columns:
		- `tbid` INTEGER NOT NULL (FK → working_groups.tbid)
		- `subtb` INTEGER (FK → subworking_groups.subtb)

		Indexes Created:
		- `idx_meetings_tbid` on `tbid`
		- `idx_meetings_subtb` on `subtb`

		### 3. TDocs Table Updates

		New Portal Metadata Columns:
		- `title` TEXT
		- `contact` TEXT
		- `tdoc_type` TEXT
		- `for_purpose` TEXT
		- `agenda_item` TEXT
		- `status` TEXT
		- `is_revision_of` TEXT

		New Validation Columns:
		- `validated` BOOLEAN DEFAULT 0
		- `validation_failed` BOOLEAN DEFAULT 0

		Indexes Created:
		- `idx_tdocs_validated` on `validated`
		- `idx_tdocs_validation_failed` on `validation_failed`

		### 4. Model Updates (models.py)

		#### New Models
		- `SubworkingGroup`: Dataclass with `subtb`, `tbid`, `working_group`, `code`, `name`

		#### Extended Models
		- `WorkingGroup`: Added `tbid` property (373/375/649)
		- `TDocMetadata`: Added 9 portal fields + 2 validation flags (now 26 fields total)
		- `TDocRecord`: Updated to match TDocMetadata
		- `MeetingMetadata`: Added `tbid`, `subtb` fields
		- `MeetingRecord`: Updated to match MeetingMetadata
		- `TDocCrawlConfig`: Added `subgroups`, `meeting_ids`, `start_date`, `end_date`, `force_revalidate`, `workers`, `credentials`
		- `MeetingQueryConfig`: Added `subgroups` field

		### 5. Database Layer Updates (database.py)

		#### Row Parsers Updated
		- `_row_to_tdoc_metadata`: Now reads all 26 fields including portal metadata
		- `_row_to_meeting_metadata`: Now reads `tbid` and `subtb`

		#### Upsert Methods Updated
		- `upsert_tdoc`: Handles 9 new portal columns + 2 validation flags
		- `upsert_meeting`: Handles `tbid` and `subtb` foreign keys

		#### New Helper Methods
		1. `get_processed_meetings(working_groups, subgroups) -> set[int]`
		- Returns meeting IDs that have been crawled (have TDocs in database)
		- Supports filtering by working groups and subgroups
		- Used for incremental crawling mode

		2. `cache_invalid_tdoc(tdoc_id, url, working_group, subgroup) -> None`
		- Stores minimal TDoc record with `validation_failed=True`
		- Prevents re-checking invalid TDocs in future crawls
		- Idempotent (safe to call multiple times)

		3. `get_cached_invalid_tdocs() -> set[str]`
		- Returns set of TDoc IDs that failed portal validation
		- Used to skip invalid TDocs in incremental mode

		4. `get_subgroup_by_code(code) -> dict \| None`
		- Looks up subgroup by code (e.g., 'S4', 'R1', 'CP')
		- Returns `subtb`, `tbid`, `code`, `name` or None
		- Used during meeting parsing to populate foreign keys

		### 6. Crawler Updates (crawler.py)

		#### MeetingCrawler Updates
		- `_parse_meeting_row`: Now populates `tbid` and `subtb` fields
		- Gets `tbid` from `working_group.tbid` property
		- Gets `subtb` by calling `database.get_subgroup_by_code(subgroup)`

		### 7. Test Fixtures Updated

		#### conftest.py
		- `sample_tdocs` fixture: Added 9 portal fields + 2 validation flags to all 3 test TDocs
		- All fields set to `None` or `False` (no portal validation in test data)

		#### test_database.py
		- `test_meeting_upsert_and_query`: Added `tbid=373`, `subtb=379` for RAN1 meeting
		- `test_get_existing_meeting_ids`: Added `tbid=375`, `subtb=387` for SA4 meeting
		- `test_update_existing_tdoc`: Added all 9 portal fields + validation flags
		- Fixed `MeetingQueryConfig` calls to include `subgroups=None` parameter

		## Data Integrity

		### Foreign Key Relationships
		```
		working_groups (tbid PK)
		↑
		subworking_groups (subtb PK, tbid FK)
		↑
		meetings (meeting_id PK, tbid FK, subtb FK)
		↑
		tdocs (tdoc_id PK, meeting_id)
		```

		### Reference Data Completeness
		- ✅ All 3 working groups (RAN, SA, CT)
		- ✅ All 23 subgroups including:
		- RAN6 (subtb=843, added in later 3GPP release)
		- All plenary groups (SP=375, CP=649, RP=373)

		## Test Results

		Total Tests: 62
		Passed: 62 ✅
		Failed: 0
		Duration: 1.90s

		### Test Coverage
		- ✅ Database initialization with reference tables
		- ✅ TDoc upsert with portal metadata
		- ✅ Meeting upsert with foreign keys
		- ✅ Row parsing for both TDocs and meetings
		- ✅ Helper methods (get_processed_meetings, cache_invalid_tdoc, etc.)
		- ✅ Meeting crawler with tbid/subtb population
		- ✅ All CLI commands (crawl, query, stats, open)
		- ✅ All model validation

		## Breaking Changes

		### Required Parameters
		- `TDocMetadata` now requires 9 new portal fields (can be None)
		- `TDocMetadata` now requires 2 validation flags (defaults to False)
		- `MeetingMetadata` now requires `tbid` field (integer)
		- `MeetingQueryConfig` now requires `subgroups` field (can be None)

		### Database Schema
		- Existing databases will be automatically migrated on first connection
		- Reference tables created with `IF NOT EXISTS` (safe upgrade)
		- New columns added with defaults (0 for booleans, NULL for text)
		- Indexes created with `IF NOT EXISTS` (safe upgrade)

		## Next Steps

		1. ✅ Database schema updated - Reference tables, foreign keys, portal metadata
		2. ✅ Helper methods added - get_processed_meetings, cache_invalid_tdoc, etc.
		3. ⏭️ Refactor TDocCrawler.crawl() - Meeting-driven architecture
		4. ⏭️ Create portal.py module - HTML parsing for portal validation
		5. ⏭️ Update CLI crawl command - New parameters (subgroups, meeting_ids, etc.)
		6. ⏭️ Add parallel processing - ThreadPoolExecutor with workers parameter
		7. ⏭️ Add tests for new functionality - Portal validation, meeting filtering

		## Files Modified

		1. src/tdoc_crawler/models.py - Extended with portal fields, reference data models
		2. src/tdoc_crawler/database.py - Schema updates, new helper methods, updated parsers
		3. src/tdoc_crawler/crawler.py - Meeting parser now populates tbid/subtb
		4. tests/conftest.py - Updated sample_tdocs fixture
		5. tests/test_database.py - Updated meeting tests with new required fields

		## Validation

		All changes validated through:
		- ✅ Type checking (no compilation errors)
		- ✅ Unit tests (62/62 passing)
		- ✅ Code formatting (ruff format applied)
		- ✅ Database migrations (reference tables created, data populated)
		- ✅ Foreign key constraints (enforced at database level)

		## Notes

		- Portal metadata fields are optional (can be NULL) to support incremental rollout
		- Validation flags default to False/0 for backwards compatibility
		- Reference tables are immutable (no update/delete operations needed)
		- Subgroup lookup is case-insensitive (code stored as uppercase)
		- Meeting parser gracefully handles missing subgroup lookups (subtb=None)

src/tdoc_crawler/cli.py

+3 −3

Original line number	Diff line number	Diff line
		@@ -23,7 +23,7 @@ from dotenv import load_dotenv
		from rich.console import Console
		from rich.table import Table

		from tdoc_crawler.crawler import MeetingCrawler, TDocCrawlResult, TDocCrawler
		from tdoc_crawler.crawlers import MeetingCrawler, TDocCrawler, TDocCrawlResult
		from tdoc_crawler.database import TDocDatabase
		from tdoc_crawler.models import (
		CrawlLimits,
		@@ -49,7 +49,7 @@ logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(mess

		def _parse_working_groups(values: list[str] \| None) -> list[WorkingGroup]:
		"""Parse and normalize working group names, expanding plenary aliases."""
		from tdoc_crawler.crawler import normalize_working_group_alias
		from tdoc_crawler.crawlers import normalize_working_group_alias

		if not values:
		return [WorkingGroup.RAN, WorkingGroup.SA, WorkingGroup.CT]
		@@ -70,7 +70,7 @@ def _parse_working_groups(values: list[str] \| None) -> list[WorkingGroup]:

		def _parse_subgroups(values: list[str] \| None) -> list[str] \| None:
		"""Parse and normalize subgroup names, expanding aliases to canonical names."""
		from tdoc_crawler.crawler import normalize_subgroup_alias
		from tdoc_crawler.crawlers import normalize_subgroup_alias

		if not values:
		return None

src/tdoc_crawler/crawler.py

+19 −0

Original line number	Diff line number	Diff line
		@@ -328,11 +328,20 @@ class TDocCrawler:
		meeting=meeting,
		meeting_id=None,
		file_size=file_size,
		title=None,
		contact=None,
		tdoc_type=None,
		for_purpose=None,
		agenda_item=None,
		status=None,
		is_revision_of=None,
		document_type=None,
		checksum=None,
		source_path=f"{full_path}/{name}",
		date_created=None,
		date_retrieved=datetime.now(UTC),
		validated=False,
		validation_failed=False,
		)
		collected.append(metadata)
		seen_ids.add(tdoc_id)
		@@ -500,8 +509,18 @@ class MeetingCrawler:
		location = cells[2].get_text(" ", strip=True) if len(cells) > 2 else "TBC"
		files_url = self._extract_first_link(cells[-3])

		# Get tbid from working group, subtb from database lookup if subgroup is available
		tbid = working_group.tbid
		subtb: int \| None = None
		if subgroup:
		subgroup_data = self.database.get_subgroup_by_code(subgroup)
		if subgroup_data:
		subtb = subgroup_data["subtb"]

		return MeetingMetadata(
		meeting_id=meeting_id,
		tbid=tbid,
		subtb=subtb,
		working_group=working_group,
		subgroup=subgroup,
		short_name=short_name,

src/tdoc_crawler/crawlers/init.py

0 → 100644

+25 −0

Original line number	Diff line number	Diff line
		"""Crawlers for retrieving TDoc and meeting metadata from 3GPP resources."""

		from __future__ import annotations

		# Re-export all public symbols
		from .meetings import (
		MEETING_CODE_REGISTRY,
		MeetingCrawler,
		MeetingCrawlResult,
		normalize_subgroup_alias,
		normalize_working_group_alias,
		)
		from .tdocs import EXCLUDED_DIRS, TDOC_PATTERN, TDocCrawler, TDocCrawlResult

		__all__ = [
		"EXCLUDED_DIRS",
		"MEETING_CODE_REGISTRY",
		"MeetingCrawler",
		"MeetingCrawlResult",
		"TDOC_PATTERN",
		"TDocCrawler",
		"TDocCrawlResult",
		"normalize_subgroup_alias",
		"normalize_working_group_alias",
		]