Commit d701ef0c authored by Jan Reimes's avatar Jan Reimes
Browse files

refactor(database): replace TDocDatabase with SpecDatabase and clean up methods

* Updated database interactions to use SpecDatabase instead of TDocDatabase.
* Removed unused CheckoutDirOption and refactored related methods.
* Enhanced SpecDownloads class for better handling of spec downloads.
* Improved test coverage for SpecDownloads with mock implementations.
parent a110a426
Loading
Loading
Loading
Loading
+3 −4
Original line number Diff line number Diff line
@@ -26,7 +26,6 @@ from tdoc_crawler.checkout import (
from tdoc_crawler.cli.args import (
    DEFAULT_VERBOSITY,
    CacheDirOption,
    CheckoutDirOption,
    CheckoutOption,
    CheckoutTDocIdsArgument,
    ClearDbOption,
@@ -80,7 +79,7 @@ from tdoc_crawler.cli.printing import (
from tdoc_crawler.config import CacheManager
from tdoc_crawler.crawlers import MeetingCrawler, TDocCrawler
from tdoc_crawler.credentials import resolve_credentials, set_credentials
from tdoc_crawler.database import TDocDatabase
from tdoc_crawler.database import SpecDatabase, TDocDatabase
from tdoc_crawler.fetching import fetch_missing_tdocs
from tdoc_crawler.http_client import create_cached_session
from tdoc_crawler.logging import set_verbosity
@@ -159,7 +158,7 @@ def crawl_tdocs(
        scope_parts.append(f"working groups: {', '.join(wg.value for wg in working_groups)}")
    console.print(f"[cyan]Crawling TDocs ({', '.join(scope_parts)})[/cyan]")

    with TDocDatabase(db_file) as database:
    with SpecDatabase(db_file) as database:
        checkout_dir = manager.checkout_dir
        # Clear TDocs if requested
        if clear_tdocs:
@@ -858,7 +857,7 @@ def checkout_spec(
    spec_file: SpecFileOption = None,
    release: ReleaseOption = "latest",
    doc_only: DocOnlyOption = False,
    checkout_dir: CheckoutDirOption = None,
    # checkout_dir: CheckoutDirOption = None,
    cache_dir: CacheDirOption = None,
    verbosity: VerbosityOption = DEFAULT_VERBOSITY,
) -> None:
+323 −263

File changed.

Preview size limit exceeded, changes collapsed.

+0 −1
Original line number Diff line number Diff line
@@ -32,7 +32,6 @@ class TDocMetadata(BaseModel):

    # Mandatory metadata fields from portal
    tdoc_id: str = Field(..., description="Unique TDoc identifier (case-normalized)")
    # TODO: consider using the meeting short name as meeting_id (str), as it is available via portal access?
    meeting_id: int = Field(..., description="Foreign key reference to the meetings table")
    title: str = Field(..., description="Document title as published on the portal")
    url: str | None = Field(None, description="Full URL to the TDoc resource, None if not available")
+29 −41
Original line number Diff line number Diff line
@@ -9,17 +9,13 @@ from zipinspect import HTTPZipReader

from tdoc_crawler.config import resolve_cache_manager
from tdoc_crawler.database import TDocDatabase
from tdoc_crawler.http_client import create_cached_session
from tdoc_crawler.http_client import download_to_path
from tdoc_crawler.specs.catalog import SpecCatalog
from tdoc_crawler.specs.normalization import normalize_spec_number
from tdoc_crawler.specs.sources.base import SpecSource

_logger = logging.getLogger(__name__)

# TODO: Simplify this class; "open" is actually the same as "checkout" with a single file and a trigger to the system's open action.
# TODO: This way we avoid code duplication between "checkout" and "open", as they both need to resolve URLs, download, and extract.
# TODO: Thus, both methods should share as much code/logic as possible


class SpecDownloads:
    """Download and extraction utilities for specs."""
@@ -32,19 +28,16 @@ class SpecDownloads:
        self,
        specs: list[str],
        doc_only: bool,
        checkout_dir: Path,  # Todo: checkout_dir must be derived from cache_dir and relative subdir from URL
        release: str = "latest",
        sources: list[SpecSource] | None = None,
    ) -> list[Path]:
        """Download and extract spec documents to the checkout directory."""
        checkout_dir.mkdir(parents=True, exist_ok=True)
        results: list[Path] = []

        for spec in specs:
            try:
                normalized = normalize_spec_number(spec)
                series = f"{normalized.split('.')[0]}_series"
                # Todo: do not hard-code "Specs/archive" path segment, derive from URL
                target_dir = checkout_dir / "Specs" / "archive" / series / normalized
                target_dir.mkdir(parents=True, exist_ok=True)

@@ -75,14 +68,6 @@ class SpecDownloads:

                if not success:
                    self._download_full_zip(url, target_dir / filename)
                    # Extract zip? Or just keep it?
                    # "Checkout" usually implies extraction or having files ready.
                    # 3GPP archive structure has zips.
                    # But if doc-only, we extract the doc.
                    # If full text, usually we keep the zip or extract it?
                    # Task says "Download, extract, and open".
                    # Existing implementations usually unzip TDocs.
                    # I will extract the zip if full download.
                    self._extract_zip(target_dir / filename, target_dir)

                results.append(target_dir)
@@ -97,7 +82,7 @@ class SpecDownloads:
        self,
        spec: str,
        doc_only: bool,
        checkout_dir: Path,  # Todo: checkout_dir must be derived from cache_dir and relative subdir from URL
        checkout_dir: Path,
        release: str = "latest",
        sources: list[SpecSource] | None = None,
    ) -> Path:
@@ -148,7 +133,7 @@ class SpecDownloads:
                    msg = f"No versions found for spec {normalized} with release {release}"
                    raise ValueError(msg)
            except (ValueError, IndexError) as e:
                msg = f"Invalid release format: {release}. Expected format like '17' or '17.1'"
                msg = f"Invalid release format: {release}. Expected format like '17', '17.1', or '17.1.0'"
                raise ValueError(msg) from e

        target = versions[0]
@@ -162,44 +147,47 @@ class SpecDownloads:
        """Attempt to download only the document file from remote zip."""
        try:
            async with HTTPZipReader(url) as reader:
                await reader.load_entries()
                # Find doc entry
                # HTTPZipReader entries likely have .filename or .name
                # I'll check first entry type
                entries: list[str] = [e.filename for e in reader.entries]
                doc_file = _select_doc_entry(entries, normalized)
                # entries is populated by __aenter__ via load_entries()
                # Find doc entry using .path (not .filename)
                entries = reader.entries
                if entries is None:
                    _logger.info("Doc-only: No entries found in %s", url)
                    return False

                doc_file = _select_doc_entry([e.path for e in entries], normalized)

                if not doc_file:
                    _logger.info("Doc-only: No document found in %s", url)
                    return False

                # Extract
                await reader.extract([e for e in reader.entries if e.filename == doc_file], out_dir=target_dir)
                # Extract matching entries to target_dir
                for entry in entries:
                    if entry.path == doc_file:
                        target_path = target_dir / doc_file.split("/")[-1]
                        with target_path.open("wb") as f:
                            async for _ in reader.extract(entry, f):
                                pass
                        return True
                return False
        except Exception as exc:
            _logger.warning("Doc-only download failed for %s: %s", url, exc)
            return False

    def _download_full_zip(self, url: str, target_path: Path) -> None:
        """Download full zip file."""
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
        }
        session = create_cached_session(cache_dir=self.cache)
        response = session.get(url, stream=True, timeout=60, headers=headers)
        response.raise_for_status()
        with open(target_path, "wb") as f:
            for chunk in response.iter_content(chunk_size=8192):
                f.write(chunk)

    # TODO: should be either staticmethod or moved to utility module, as it is not specific to this class
    def _extract_zip(self, zip_path: Path, extract_dir: Path) -> None:
        download_to_path(url, target_path, cache_manager_name=self._cache_manager.name)

    @staticmethod
    def _extract_zip(zip_file: Path, extract_dir: Path, keep_zip: bool = True) -> None:
        """Extract zip file."""
        try:
            with zipfile.ZipFile(zip_path) as z:
            with zipfile.ZipFile(zip_file) as z:
                z.extractall(extract_dir)

            if not keep_zip:
                zip_file.unlink()
        except Exception as exc:
            _logger.error("Failed to extract %s: %s", zip_path, exc)
            _logger.error("Failed to extract %s: %s", zip_file, exc)


def _select_doc_entry(entries: list[str], normalized: str) -> str | None:
+36 −20
Original line number Diff line number Diff line
from pathlib import Path
from unittest.mock import AsyncMock, MagicMock, patch

import pytest

from tdoc_crawler.config import CacheManager
from tdoc_crawler.specs.downloads import SpecDownloads


@@ -11,33 +13,53 @@ def mock_db() -> MagicMock:


@pytest.fixture  # noqa: ANN201
def downloader(mock_db: MagicMock) -> SpecDownloads:
    return SpecDownloads(mock_db)
def mock_cache_manager(tmp_path: Path) -> CacheManager:
    cache_dir = tmp_path / "cache"
    cache_dir.mkdir(parents=True, exist_ok=True)
    return CacheManager(root_path=cache_dir, name="test")


@pytest.fixture  # noqa: ANN201
def downloader(mock_db: MagicMock, mock_cache_manager: CacheManager) -> SpecDownloads:
    mock_cache_manager.register()
    return SpecDownloads(mock_db, cache_manager_name="test")


class TestSpecDownloads:
    # This test verifies that checkout_specs resolves URL and uses doc-only logic.
    @patch("tdoc_crawler.specs.downloads.HTTPZipReader")
    def test_checkout_specs_doc_only_remote(self, MockHTTPZipReader: object, downloader: SpecDownloads, mock_db: MagicMock, tmp_path: object) -> None:  # noqa: ANN001
    def test_checkout_specs_doc_only_remote(self, MockHTTPZipReader: MagicMock, downloader: SpecDownloads, mock_db: MagicMock, tmp_path: Path) -> None:  # noqa: ANN001
        """Test that checkout_specs resolves URL and uses zipinspect for doc-only."""
        specs = ["26.132"]
        checkout_dir = tmp_path / "checkout"

        # Setup Mock Reader
        mock_reader = MagicMock()
        mock_reader.load_entries = AsyncMock()
        mock_reader.extract = AsyncMock()

        entry = MagicMock()
        entry.filename = "26132.doc"
        entry.path = "26132.doc"  # Use .path (not .filename)
        mock_reader.entries = [entry]

        # Mocking async context manager manually
        mock_cm = MagicMock()
        mock_cm.__aenter__ = AsyncMock(return_value=mock_reader)
        mock_cm.__aexit__ = AsyncMock(return_value=None)
        # extract() is an async generator - track it with a wrapper mock
        extracted_entries: list[MagicMock] = []

        async def mock_extract(info, output):
            extracted_entries.append(info)
            # Simulate extracting by writing placeholder bytes
            output.write(b"test content")
            yield 12  # bytes written

        MockHTTPZipReader.return_value = mock_cm
        mock_reader.extract = mock_extract

        # Set up async context manager on the return value
        async def mock_aenter(self):
            return mock_reader

        async def mock_aexit(self, *args):
            return None

        MockHTTPZipReader.return_value.__aenter__ = mock_aenter
        MockHTTPZipReader.return_value.__aexit__ = mock_aexit

        # We rely on _resolve_spec_url
        with patch.object(downloader, "_resolve_spec_url", return_value=("http://example.com/26132-j00.zip", "26132-j00.zip"), create=True) as mock_resolve:
@@ -50,12 +72,6 @@ class TestSpecDownloads:
            # Verify HTTPZipReader called
            MockHTTPZipReader.assert_called_with("http://example.com/26132-j00.zip")

            # Verify load_entries called
            mock_reader.load_entries.assert_called_once()

            # Verify extract called with correct entries
            assert mock_reader.extract.call_count == 1
            # Check args
            args, kwargs = mock_reader.extract.call_args
            assert args[0][0].filename == "26132.doc"
            assert kwargs["out_dir"] == checkout_dir / "Specs" / "archive" / "26_series" / "26.132"
            # Verify extract called with correct entry
            assert len(extracted_entries) == 1
            assert extracted_entries[0].path == "26132.doc"