Commit 5765ff43 authored by Jan Reimes
Browse files

♻️ refactor(database): simplify TDoc model and database interactions

- Removed working_group, subgroup, and meeting fields from TDocMetadata
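- Updated database schema (SCHEMA_VERSION 1 → 2): dropped the working_group, subgroup, and meeting columns and the idx_tdocs_wg index from tdocs; meeting_id is now NOT NULL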
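- Adjusted related database operations and queries to accommodate the new model: working-group filters and statistics now join tdocs to meetings (and working_groups) via meeting_id and tbid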
- Enhanced test cases to ensure proper functionality with updated models
parent ff13ef2f
Loading
Loading
Loading
Loading
+3 −15
Original line number Diff line number Diff line
@@ -9,7 +9,7 @@ from rich.console import Console

from tdoc_crawler.crawlers import TDocCrawlResult, fetch_tdoc_metadata
from tdoc_crawler.database import TDocDatabase
from tdoc_crawler.models import PortalCredentials, QueryConfig, TDocMetadata, WorkingGroup
from tdoc_crawler.models import PortalCredentials, QueryConfig, TDocMetadata

from .helpers import resolve_meeting_id

@@ -60,22 +60,10 @@ def fetch_missing_tdocs(
                if not meeting_id:
                    _logger.warning(f"Could not resolve meeting '{meeting_name}' to meeting_id for {tdoc_id}")

            # Infer working group from TDoc ID
            tdoc_prefix = tdoc_id[0].upper()
            working_group_map = {"R": WorkingGroup.RAN, "S": WorkingGroup.SA, "C": WorkingGroup.CT, "T": WorkingGroup.CT}
            working_group = working_group_map.get(tdoc_prefix, WorkingGroup.RAN)

            # Build TDoc URL (using meeting info if available)
            # For now, use a placeholder URL since we're fetching from portal
            url = f"https://www.3gpp.org/ftp/tsg_{working_group.value.lower()}/.../{tdoc_id}.zip"

            # Create TDocMetadata object (all fields without defaults must be provided)
            # Create TDocMetadata object
            metadata = TDocMetadata(
                tdoc_id=tdoc_id.upper(),
                url=url,
                working_group=working_group,
                subgroup=None,
                meeting=meeting_name,
                url=f"https://www.3gpp.org/ftp/tsg_ran/.../{tdoc_id}.zip",
                meeting_id=meeting_id,
                file_size=None,
                title=portal_data.get("title"),
+14 −9
Original line number Diff line number Diff line
@@ -15,16 +15,21 @@ def tdoc_to_dict(result: TDocMetadata) -> dict[str, object]:
    return {
        "tdoc_id": result.tdoc_id,
        "url": result.url,
        "working_group": result.working_group.value,
        "subgroup": result.subgroup,
        "meeting": result.meeting,
        "meeting_id": result.meeting_id,
        "file_size": result.file_size,
        "title": result.title,
        "contact": result.contact,
        "tdoc_type": result.tdoc_type,
        "for_purpose": result.for_purpose,
        "agenda_item": result.agenda_item,
        "status": result.status,
        "is_revision_of": result.is_revision_of,
        "document_type": result.document_type,
        "checksum": result.checksum,
        "source_path": result.source_path,
        "date_created": result.date_created.isoformat() if result.date_created else None,
        "date_retrieved": result.date_retrieved.isoformat(),
        "validated": result.validated,
    }


@@ -48,18 +53,18 @@ def print_tdoc_table(results: list[TDocMetadata]) -> None:
    """Print TDoc results as formatted table."""
    table = Table(title=f"TDoc results ({len(results)} rows)")
    table.add_column("TDoc", style="cyan")
    table.add_column("WG", style="magenta")
    table.add_column("Subgroup", style="yellow")
    table.add_column("Meeting", style="green")
    table.add_column("Meeting ID", style="magenta")
    table.add_column("Title", style="yellow")
    table.add_column("Type", style="green")
    table.add_column("Size (KB)", justify="right", style="blue")

    for result in results[:100]:
        size_kb = f"{result.file_size // 1024}" if result.file_size else "?"
        table.add_row(
            result.tdoc_id,
            result.working_group.value,
            result.subgroup or "-",
            result.meeting or "-",
            str(result.meeting_id or "-"),
            result.title or "-",
            result.tdoc_type or "-",
            size_kb,
        )

+2 −6
Original line number Diff line number Diff line
@@ -4,7 +4,7 @@ from __future__ import annotations

import sqlite3

SCHEMA_VERSION = 1
SCHEMA_VERSION = 2


def configure_connection(conn: sqlite3.Connection) -> None:
@@ -90,10 +90,7 @@ def initialize_schema(conn: sqlite3.Connection) -> None:
        CREATE TABLE IF NOT EXISTS tdocs (
            tdoc_id TEXT PRIMARY KEY COLLATE NOCASE,
            url TEXT NOT NULL,
            working_group TEXT NOT NULL,
            subgroup TEXT NOT NULL,
            meeting TEXT,
            meeting_id INTEGER,
            meeting_id INTEGER NOT NULL,
            file_size INTEGER,
            title TEXT,
            contact TEXT,
@@ -115,7 +112,6 @@ def initialize_schema(conn: sqlite3.Connection) -> None:
        )
        """
    )
    conn.execute("CREATE INDEX IF NOT EXISTS idx_tdocs_wg ON tdocs(working_group)")
    conn.execute("CREATE INDEX IF NOT EXISTS idx_tdocs_meeting_id ON tdocs(meeting_id)")
    conn.execute("CREATE INDEX IF NOT EXISTS idx_tdocs_date_retrieved ON tdocs(date_retrieved)")
    conn.execute("CREATE INDEX IF NOT EXISTS idx_tdocs_validated ON tdocs(validated)")
+5 −3
Original line number Diff line number Diff line
@@ -54,9 +54,11 @@ def get_statistics(conn: sqlite3.Connection) -> DatabaseStatistics:
        # Breakdown by working group
        cursor.execute(
            """
            SELECT working_group, COUNT(*) as count
            FROM tdocs
            GROUP BY working_group
            SELECT wg.name, COUNT(*) as count
            FROM tdocs t
            LEFT JOIN meetings m ON t.meeting_id = m.meeting_id
            LEFT JOIN working_groups wg ON m.tbid = wg.tbid
            GROUP BY wg.name
            ORDER BY count DESC
            """
        )
+23 −29
Original line number Diff line number Diff line
@@ -49,18 +49,15 @@ def upsert_tdoc(conn: sqlite3.Connection, metadata: TDocMetadata) -> tuple[bool,
            cursor.execute(
                """
                INSERT INTO tdocs (
                    tdoc_id, url, working_group, subgroup, meeting, meeting_id,
                    tdoc_id, url, meeting_id,
                    file_size, title, contact, tdoc_type, for_purpose, agenda_item,
                    status, is_revision_of, document_type, checksum, source_path,
                    date_created, date_retrieved, date_updated, validated, validation_failed
                ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
                ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
                """,
                (
                    record.tdoc_id,
                    record.url,
                    record.working_group.value,
                    record.subgroup,
                    record.meeting,
                    record.meeting_id,
                    record.file_size,
                    record.title,
@@ -87,9 +84,6 @@ def upsert_tdoc(conn: sqlite3.Connection, metadata: TDocMetadata) -> tuple[bool,
        new_date_retrieved = record.date_retrieved.isoformat()
        changed = any((
            existing["url"] != record.url,
            existing["working_group"] != record.working_group.value,
            existing["subgroup"] != record.subgroup,
            existing["meeting"] != record.meeting,
            existing["meeting_id"] != record.meeting_id,
            existing["file_size"] != record.file_size,
            existing["title"] != record.title,
@@ -113,9 +107,6 @@ def upsert_tdoc(conn: sqlite3.Connection, metadata: TDocMetadata) -> tuple[bool,
                """
                UPDATE tdocs
                   SET url = ?,
                       working_group = ?,
                       subgroup = ?,
                       meeting = ?,
                       meeting_id = ?,
                       file_size = ?,
                       title = ?,
@@ -137,9 +128,6 @@ def upsert_tdoc(conn: sqlite3.Connection, metadata: TDocMetadata) -> tuple[bool,
                """,
                (
                    record.url,
                    record.working_group.value,
                    record.subgroup,
                    record.meeting,
                    record.meeting_id,
                    record.file_size,
                    record.title,
@@ -205,13 +193,15 @@ def get_existing_tdoc_ids(conn: sqlite3.Connection, working_groups: Iterable[Wor
        Set of TDoc IDs (uppercase)
    """
    params: list[Any] = []
    query = "SELECT tdoc_id FROM tdocs"
    query = "SELECT t.tdoc_id FROM tdocs t"

    if working_groups:
        working_groups_list = [wg.value for wg in working_groups]
        placeholders = ["?"] * len(working_groups_list)
        query += f" WHERE working_group IN ({','.join(placeholders)})"
        params.extend(working_groups_list)
        # Join with meetings to filter by working group
        query += " LEFT JOIN meetings m ON t.meeting_id = m.meeting_id"
        tbids = [wg.tbid for wg in working_groups]
        placeholders = ["?"] * len(tbids)
        query += f" WHERE m.tbid IN ({','.join(placeholders)})"
        params.extend(tbids)

    cursor = conn.cursor()
    try:
@@ -232,6 +222,9 @@ def query_tdocs(conn: sqlite3.Connection, config: QueryConfig) -> list[TDocMetad

    Returns:
        List of matching TDoc metadata

    Raises:
        DatabaseError: If query fails
    """
    clauses: list[str] = []
    params: list[Any] = []
@@ -241,27 +234,29 @@ def query_tdocs(conn: sqlite3.Connection, config: QueryConfig) -> list[TDocMetad
        clauses.append(f"tdoc_id IN ({placeholders})")
        params.extend(config.tdoc_ids)

    # Filter by working groups via join to meetings table
    if config.working_groups:
        placeholders = ",".join(["?"] * len(config.working_groups))
        clauses.append(f"working_group IN ({placeholders})")
        params.extend([wg.value for wg in config.working_groups])
        clauses.append(f"m.tbid IN ({placeholders})")
        params.extend([wg.tbid for wg in config.working_groups])

    if config.start_date is not None:
        clauses.append("date_retrieved >= ?")
        clauses.append("t.date_retrieved >= ?")
        params.append(config.start_date.isoformat())

    if config.end_date is not None:
        clauses.append("date_retrieved <= ?")
        clauses.append("t.date_retrieved <= ?")
        params.append(config.end_date.isoformat())

    query_parts = ["SELECT * FROM tdocs"]
    # Build query with LEFT JOIN to meetings table
    query_parts = ["SELECT t.* FROM tdocs t LEFT JOIN meetings m ON t.meeting_id = m.meeting_id"]
    if clauses:
        query_parts.append("WHERE " + " AND ".join(clauses))

    if config.order == SortOrder.ASC:
        query_parts.append("ORDER BY date_retrieved ASC, tdoc_id ASC")
        query_parts.append("ORDER BY t.date_retrieved ASC, t.tdoc_id ASC")
    else:
        query_parts.append("ORDER BY date_retrieved DESC, tdoc_id DESC")
        query_parts.append("ORDER BY t.date_retrieved DESC, t.tdoc_id DESC")

    if config.limit is not None:
        query_parts.append("LIMIT ?")
@@ -273,6 +268,8 @@ def query_tdocs(conn: sqlite3.Connection, config: QueryConfig) -> list[TDocMetad
    try:
        cursor.execute(query, params)
        rows = cursor.fetchall()
    except sqlite3.OperationalError as exc:
        raise DatabaseError.parse_failure("tdocs-query", str(exc)) from exc
    finally:
        cursor.close()

@@ -409,9 +406,6 @@ def row_to_tdoc_metadata(row: sqlite3.Row) -> TDocMetadata:
        return TDocMetadata(
            tdoc_id=row["tdoc_id"],
            url=row["url"],
            working_group=WorkingGroup(row["working_group"]),
            subgroup=row["subgroup"],
            meeting=row["meeting"],
            meeting_id=row["meeting_id"],
            file_size=row["file_size"],
            title=row["title"],
Loading