Commit 5da13fe3 authored by Jan Reimes
Browse files

feat(database): migrate to Oxyde ORM for database operations

* Refactor MeetingDatabase to use Oxyde models for meeting metadata.
* Introduce new Oxyde models for CrawlLogEntry, MeetingMetadata, and TDocMetadata.
* Update SpecDatabase to utilize Oxyde for specification records.
* Enhance TDocDatabase to support async operations and Oxyde models.
parent 6e5cfc4b
Loading
Loading
Loading
Loading
+10 −8
Original line number Diff line number Diff line
@@ -5,9 +5,11 @@ from __future__ import annotations
import logging
import os
import shutil
import sqlite3
from pathlib import Path
from typing import Any

import requests
from tdoc_crawler.config import resolve_cache_manager
from tdoc_crawler.database import TDocDatabase
from tdoc_crawler.database.specs import SpecDatabase
@@ -40,7 +42,7 @@ def resolve_spec_release_from_db(spec_number: str, requested_release: str, db_fi
                available_releases = [entry.release for entry in versions if entry.release]
                if available_releases:
                    resolved = resolve_release_to_full_version(requested_release, available_releases)
    except Exception as exc:
    except (OSError, ValueError, sqlite3.Error) as exc:
        _logger.debug("Could not resolve release '%s' for %s: %s", requested_release, spec_number, exc)
    return resolved

@@ -158,7 +160,7 @@ def delete_workspace(workspace: str | None, preserve_artifacts: bool = True) ->
                _logger.info(f"Deleted LightRAG artifacts for '{normalized_workspace}' from {workspace_artifacts_dir}")
            else:
                _logger.debug(f"No LightRAG artifacts found for '{normalized_workspace}'")
        except Exception as e:
        except (OSError, ValueError) as e:
            _logger.warning(f"Failed to delete LightRAG artifacts for '{normalized_workspace}': {e}")

    _logger.info(f"Deleted workspace '{normalized_workspace}' (preserve_artifacts={preserve_artifacts})")
@@ -409,7 +411,7 @@ def _resolve_tdoc_metadata(tdoc_id: str, db_file: Path | None = None) -> TDocMet
                metadata = db._get_tdoc(tdoc_id)
                if metadata and metadata.url:
                    _logger.debug(f"Found TDoc {tdoc_id} in database with URL")
        except Exception as e:
        except (OSError, ValueError, sqlite3.Error) as e:
            _logger.debug(f"Database lookup failed for {tdoc_id}: {e}")

    # Step 2: Try WhatTheSpec API if not found in database
@@ -419,7 +421,7 @@ def _resolve_tdoc_metadata(tdoc_id: str, db_file: Path | None = None) -> TDocMet
            metadata = resolve_via_whatthespec(tdoc_id)
            if metadata:
                _logger.info(f"Resolved TDoc {tdoc_id} via WhatTheSpec API")
        except Exception as e:
        except (OSError, ValueError, requests.RequestException) as e:
            _logger.debug(f"WhatTheSpec lookup failed for {tdoc_id}: {e}")

    # Step 3: Try 3GPP Portal as final fallback (requires credentials)
@@ -431,7 +433,7 @@ def _resolve_tdoc_metadata(tdoc_id: str, db_file: Path | None = None) -> TDocMet
                metadata = portal_source.fetch_by_id(tdoc_id, db_file=db_file)
                if metadata:
                    _logger.info(f"Resolved TDoc {tdoc_id} via 3GPP Portal")
        except Exception as e:
        except (OSError, ValueError, requests.RequestException) as e:
            _logger.debug(f"Portal lookup failed for {tdoc_id}: {e}")

    if metadata is None:
@@ -473,7 +475,7 @@ def _checkout_tdoc_if_needed(tdoc_id: str, metadata: TDocMetadata, checkout_base
    except FileNotFoundError as e:
        _logger.warning(f"TDoc {tdoc_id} not found or withdrawn: {e}")
        return None
    except Exception as e:
    except (OSError, ValueError) as e:
        _logger.warning(f"Error checking out TDoc {tdoc_id}: {e}")
        return None

@@ -554,7 +556,7 @@ def checkout_spec_to_workspace(
                            # Extract version code from version string (e.g., "26.260" from "26.260-h00")
                            version_code = entry.version.split("-", 1)[1]
                            version_codes.append(version_code)
        except Exception as exc:
        except (OSError, ValueError, sqlite3.Error) as exc:
            _logger.debug("Could not resolve version codes for %s: %s", spec_number, exc)

        # Search for spec with matching version code
@@ -592,7 +594,7 @@ def checkout_spec_to_workspace(
        _logger.warning(f"Failed to checkout spec {spec_number} (release {requested_release})")
        return None

    except Exception as e:
    except (OSError, ValueError, sqlite3.Error) as e:
        _logger.warning(f"Error checking out spec {spec_number} (release {requested_release}): {e}")
        return None

+1 −1
Original line number Diff line number Diff line
@@ -23,7 +23,6 @@ dependencies = [
    "packaging>=25.0",
    "pandas>=3.0.0",
    "pydantic>=2.12.2",
    "pydantic-sqlite>=0.4.0",
    "python-calamine>=0.5.3",
    "python-dotenv>=1.1.1",
    "pyyaml>=6.0.3",
@@ -36,6 +35,7 @@ dependencies = [
    "zipinspect>=0.1.2",
    "lxml>=6.0.2",
    "pool-executors",
    "oxyde>=0.4.0",
]

[project.urls]
+78 −68
Original line number Diff line number Diff line
@@ -2,6 +2,7 @@

from __future__ import annotations

import asyncio
import zipfile
from typing import Any, cast

@@ -84,9 +85,10 @@ def open_tdoc(
    normalized_id = tdoc_id.strip().upper()
    config = TDocQueryConfig(tdoc_ids=[normalized_id])

    async def _open_tdoc() -> None:
        with create_cached_session() as session:
        with TDocDatabase(manager.db_file) as database:
            results = database.query_tdocs(config)
            async with TDocDatabase(manager.db_file) as database:
                results = await database.query_tdocs(config)
                result = fetch_missing_tdocs(
                    database,
                    config,
@@ -113,6 +115,8 @@ def open_tdoc(
            console.print(f"[green]Opening {tdoc_file}")
            launch_file(tdoc_file)

    asyncio.run(_open_tdoc())


@app.command("checkout", rich_help_panel=HELP_PANEL_MAIN)
def checkout(
@@ -134,9 +138,10 @@ def checkout(
    normalized_ids = [tid.strip().upper() for tid in tdoc_ids]
    config = TDocQueryConfig(tdoc_ids=normalized_ids)

    async def _checkout() -> None:
        with create_cached_session() as session:
        with TDocDatabase(manager.db_file) as database:
            results = database.query_tdocs(config)
            async with TDocDatabase(manager.db_file) as database:
                results = await database.query_tdocs(config)
                result = fetch_missing_tdocs(
                    database,
                    config,
@@ -173,6 +178,8 @@ def checkout(
            if error_count:
                console.print(f"[red]Failed: {error_count} TDoc(s)[/red]")

    asyncio.run(_checkout())


@app.command("stats", rich_help_panel=HELP_PANEL_MAIN)
def stats(
@@ -187,8 +194,11 @@ def stats(
        console.print(f"[red]Database not found: {db_file}[/red]")
        raise typer.Exit(code=1)

    with MeetingDatabase(db_file) as database:
        stats_dict = cast(dict[str, Any], database.get_statistics())
    async def _stats() -> dict[str, Any]:
        async with MeetingDatabase(db_file) as database:
            return cast(dict[str, Any], await database.get_statistics())

    stats_dict = asyncio.run(_stats())

    table = Table(title="TDoc database statistics")
    table.add_column("Metric", style="cyan")
+120 −100
Original line number Diff line number Diff line
@@ -2,6 +2,7 @@

from __future__ import annotations

import asyncio
import json
from datetime import datetime

@@ -50,12 +51,12 @@ from tdoc_crawler.cli.printing import print_spec_crawl_table, spec_crawl_to_dict
from tdoc_crawler.config import CacheManager
from tdoc_crawler.credentials import set_credentials
from tdoc_crawler.database import MeetingDatabase, TDocDatabase
from tdoc_crawler.database.specs import SpecDatabase
from tdoc_crawler.database.specs import SpecCrawlResult, SpecDatabase
from tdoc_crawler.http_client import create_cached_session
from tdoc_crawler.logging import DEFAULT_LEVEL as DEFAULT_VERBOSITY
from tdoc_crawler.logging import set_verbosity
from tdoc_crawler.meetings.models import MeetingCrawlConfig, MeetingQueryConfig
from tdoc_crawler.meetings.operations.crawl import MeetingCrawler
from tdoc_crawler.meetings.models import MeetingCrawlConfig, MeetingMetadata, MeetingQueryConfig
from tdoc_crawler.meetings.operations.crawl import MeetingCrawler, MeetingCrawlResult
from tdoc_crawler.models.base import HttpCacheConfig, OutputFormat, SortOrder
from tdoc_crawler.models.crawl_limits import CrawlLimits
from tdoc_crawler.models.subworking_groups import SUBTB_INDEX
@@ -63,6 +64,7 @@ from tdoc_crawler.specs.operations.checkout import build_default_spec_sources, c
from tdoc_crawler.tdocs.models import TDocCrawlConfig, TDocQueryConfig
from tdoc_crawler.tdocs.operations import TDocCrawler
from tdoc_crawler.tdocs.operations.checkout import checkout_meeting_tdocs, checkout_tdocs
from tdoc_crawler.tdocs.operations.crawl import TDocCrawlResult
from tdoc_crawler.utils.date_parser import parse_partial_date
from tdoc_crawler.utils.parse import collect_spec_numbers, parse_subgroups, parse_working_groups

@@ -137,7 +139,8 @@ def crawl_tdocs(
    scope_parts = []

    # Query actual meetings from database to show realistic scope
    with MeetingDatabase(db_file) as meeting_db:
    async def fetch_meetings() -> list[MeetingMetadata]:
        async with MeetingDatabase(db_file) as meeting_db:
            query_config = MeetingQueryConfig(
                working_groups=working_groups,
                subgroups=subgroups,
@@ -147,21 +150,9 @@ def crawl_tdocs(
                start_date=parse_partial_date(start_date, is_end=False) if start_date and start_date.strip() else None,
                end_date=parse_partial_date(end_date, is_end=True) if end_date and end_date.strip() else None,
            )
        meetings = meeting_db.query_meetings(query_config)
            return await meeting_db.query_meetings(query_config)

    if meetings:
        # Extract unique subgroups from queried meetings
        unique_subgroups = {SUBTB_INDEX[m.subtb].code for m in meetings if m.subtb and m.subtb in SUBTB_INDEX}
        if unique_subgroups:
            scope_parts.append(f"subgroups: {', '.join(sorted(unique_subgroups))}")
        else:
            scope_parts.append(f"meetings: {len(meetings)} meeting(s)")
    # Fallback to input parameters if no meetings found in DB
    elif subgroups:
        scope_parts.append(f"subgroups: {', '.join(subgroups)}")
    else:
        scope_parts.append(f"working groups: {', '.join(wg.value for wg in working_groups)}")
        meetings = meeting_db.query_meetings(query_config)
    meetings = asyncio.run(fetch_meetings())

    if meetings:
        # Extract unique subgroups from queried meetings
@@ -186,9 +177,14 @@ def crawl_tdocs(
        clear_specs=clear_specs,
    )

    with TDocDatabase(db_file) as database:
    async def run_tdoc_crawl() -> tuple[TDocCrawlResult, float]:
        async with TDocDatabase(db_file) as database:
            crawler = TDocCrawler(database)
        crawl_id = database.log_crawl_start("tdoc", [wg.value for wg in config.working_groups], config.incremental)
            crawl_id = await database.log_crawl_start(
                "tdoc",
                [wg.value for wg in config.working_groups],
                config.incremental,
            )

            crawl_start_time = datetime.now()

@@ -209,7 +205,7 @@ def crawl_tdocs(
            elapsed_seconds = (crawl_end_time - crawl_start_time).total_seconds()
            throughput = result.processed / elapsed_seconds if elapsed_seconds > 0 else 0

        database.log_crawl_end(
            await database.log_crawl_end(
                crawl_id,
                items_added=result.inserted,
                items_updated=result.updated,
@@ -232,7 +228,7 @@ def crawl_tdocs(
                    agenda_pattern=agenda,
                    agenda_pattern_exclude=agenda_ex,
                )
            results = database.query_tdocs(query_config)
                results = await database.query_tdocs(query_config)

                with create_cached_session(http_cache_enabled=http_cache_enabled) as session:
                    checkout_result = checkout_tdocs(results, manager.checkout_dir, force=False, session=session)
@@ -244,6 +240,10 @@ def crawl_tdocs(
                    for error in checkout_result.errors[:5]:
                        console.print(f"  - {error}")

            return result, throughput

    result, throughput = asyncio.run(run_tdoc_crawl())

    console.print(f"[green]Processed {result.processed} TDocs ({throughput:.1f} TDocs/sec)[/green]")
    console.print(f"[green]Inserted {result.inserted}, updated {result.updated}[/green]")
    if result.errors:
@@ -312,8 +312,13 @@ def crawl_meetings(
        clear_db=clear_db,
    )

    with MeetingDatabase(db_file) as database:
        crawl_id = database.log_crawl_start("meeting", [wg.value for wg in config.working_groups], config.incremental)
    async def run_meeting_crawl() -> MeetingCrawlResult:
        async with MeetingDatabase(db_file) as database:
            crawl_id = await database.log_crawl_start(
                "meeting",
                [wg.value for wg in config.working_groups],
                config.incremental,
            )

            crawler = MeetingCrawler(database)

@@ -326,13 +331,17 @@ def crawl_meetings(

                result = crawler.crawl(config, progress_callback=update_progress)

        database.log_crawl_end(
            await database.log_crawl_end(
                crawl_id,
                items_added=result.inserted,
                items_updated=result.updated,
                errors_count=len(result.errors),
            )

            return result

    result = asyncio.run(run_meeting_crawl())

    console.print(f"[green]Processed {result.processed} meetings[/green]")
    console.print(f"[green]Inserted {result.inserted}, updated {result.updated}[/green]")
    if result.errors:
@@ -350,8 +359,12 @@ def crawl_meetings(
            start_date=parse_partial_date(start_date, is_end=False) if start_date and start_date.strip() else None,
            end_date=parse_partial_date(end_date, is_end=True) if end_date and end_date.strip() else None,
        )
        with MeetingDatabase(db_file) as database:
            meetings = database.query_meetings(query_config)

        async def fetch_checkout_meetings() -> list[MeetingMetadata]:
            async with MeetingDatabase(db_file) as database:
                return await database.query_meetings(query_config)

        meetings = asyncio.run(fetch_checkout_meetings())

        with create_cached_session() as session:
            checkout_meeting_tdocs(meetings, manager.checkout_dir, manager.http_cache_file, session=session)
@@ -390,15 +403,20 @@ def crawl_specs(
        clear_specs=clear_specs,
    )

    with SpecDatabase(manager.db_file) as database:
        results = database.crawl_specs(specs, release, sources)
    async def crawl_specs_db() -> list[SpecCrawlResult]:
        async with SpecDatabase(manager.db_file) as database:
            return await database.crawl_specs(specs, release, sources)

    results = asyncio.run(crawl_specs_db())

    if not results:
        console.print("[yellow]No specs crawled[/yellow]")
        return

    if checkout:
        with SpecDatabase(manager.db_file) as database:

        async def checkout_specs_db() -> None:
            async with SpecDatabase(manager.db_file) as database:
                checkout_specs(
                    [result.spec_number for result in results],
                    manager.checkout_dir,
@@ -406,6 +424,8 @@ def crawl_specs(
                    release=release,
                )

        asyncio.run(checkout_specs_db())

    if output is OutputFormat.JSON:
        console.print(json.dumps([spec_crawl_to_dict(result) for result in results], indent=2))
    elif output is OutputFormat.YAML:
+47 −24
Original line number Diff line number Diff line
@@ -2,6 +2,7 @@

from __future__ import annotations

import asyncio
import json
from datetime import UTC, datetime
from typing import Annotated
@@ -143,8 +144,10 @@ def query_tdocs(
        clear_tdocs=clear_tdocs,
        clear_specs=clear_specs,
    )
    with TDocDatabase(db_file) as database:
        results = database.query_tdocs(config)

    async def run_query() -> list:
        async with TDocDatabase(db_file) as database:
            results = await database.query_tdocs(config)
            if not no_fetch:
                with create_cached_session() as session:
                    result = fetch_missing_tdocs(
@@ -155,10 +158,15 @@ def query_tdocs(
                        cache_manager_name=manager.name,
                    )
                    if result.fetch_result and result.fetch_result.errors:
                    console.print(f"[yellow]{len(result.fetch_result.errors)} issues detected during targeted crawl[/yellow]")
                        console.print(
                            f"[yellow]{len(result.fetch_result.errors)} issues detected during targeted crawl[/yellow]",
                        )
                        for error in result.fetch_result.errors[:3]:
                            console.print(f"  - {error}")
                    results = result.refreshed
        return results

    results = asyncio.run(run_query())

    if not results:
        console.print("[yellow]No TDocs found[/yellow]")
@@ -169,8 +177,11 @@ def query_tdocs(
            checkout_tdocs(results, manager.checkout_dir, force=False, session=session)

    # Build meeting map for enriched output
    with MeetingDatabase(db_file) as meeting_db:
        meeting_map = meeting_db._meeting_map()
    async def load_meeting_map() -> dict:
        async with MeetingDatabase(db_file) as meeting_db:
            return await meeting_db._meeting_map()

    meeting_map = asyncio.run(load_meeting_map())

    # If graph-rag flag is set, perform hybrid search
    # Disabled due to logical inconsistency - query_hybrid expects text query, not TDoc ID
@@ -227,8 +238,12 @@ def query_meetings(
        clear_tdocs=clear_tdocs,
        clear_specs=clear_specs,
    )
    with MeetingDatabase(db_file) as database:
        meetings = database.query_meetings(config)

    async def run_query() -> list:
        async with MeetingDatabase(db_file) as database:
            return await database.query_meetings(config)

    meetings = asyncio.run(run_query())

    if not meetings:
        console.print("[yellow]No meetings found[/yellow]")
@@ -292,8 +307,12 @@ def query_specs(
        clear_tdocs=clear_tdocs,
        clear_specs=clear_specs,
    )
    with SpecDatabase(db_file) as database:
        results = database.query_specs(filters)

    async def run_query() -> list:
        async with SpecDatabase(db_file) as database:
            return await database.query_specs(filters)

    results = asyncio.run(run_query())

    if not results:
        console.print("[yellow]No specs found[/yellow]")
@@ -301,9 +320,13 @@ def query_specs(

    if checkout:
        spec_list = [result.spec_number for result in results]
        with SpecDatabase(db_file) as database:

        async def load_specs_for_checkout() -> None:
            async with SpecDatabase(db_file) as database:
                checkout_specs(spec_list, manager.checkout_dir, database, release="latest")

        asyncio.run(load_specs_for_checkout())

    if output is OutputFormat.JSON:
        console.print(json.dumps([spec_query_to_dict(result) for result in results], indent=2))
    elif output is OutputFormat.YAML:
Loading