Commit 7deb92da authored by Jan Reimes's avatar Jan Reimes
Browse files

Resolve TDoc file sizes from checkout directories at query time

Add resolve_tdoc_file_sizes() that computes file sizes from on-disk
checkout directories when file_size is not already stored in the DB.
Sums all extracted file sizes in the TDoc checkout directory.

Called from both query entry points (cli/query.py and cli/query/tdocs.py)
before table rendering, so the Size (KB) column shows real values when
files have been checked out, and '?' otherwise.
parent d120c7b4
Loading
Loading
Loading
Loading
+4 −1
Original line number Diff line number Diff line
@@ -55,7 +55,7 @@ from tdoc_crawler.models.base import OutputFormat, SortOrder
from tdoc_crawler.specs.models import SpecQueryFilters
from tdoc_crawler.specs.operations.checkout import checkout_specs
from tdoc_crawler.tdocs.models import TDocQueryConfig
from tdoc_crawler.tdocs.operations.checkout import checkout_meeting_tdocs, checkout_tdocs
from tdoc_crawler.tdocs.operations.checkout import checkout_meeting_tdocs, checkout_tdocs, resolve_tdoc_file_sizes
from tdoc_crawler.tdocs.operations.fetch import fetch_missing_tdocs
from tdoc_crawler.utils.date_parser import parse_partial_date
from tdoc_crawler.utils.parse import collect_spec_numbers, parse_subgroups, parse_working_groups
@@ -183,6 +183,9 @@ def query_tdocs(
        with create_cached_session() as session:
            checkout_tdocs(results, path_config.checkout_dir, force=False, session=session)

    # Resolve file sizes from on-disk checkout directories for display
    resolve_tdoc_file_sizes(results, path_config.checkout_dir)

    # Build meeting map for enriched output
    async def load_meeting_map() -> dict:
        async with MeetingDatabase(db_file) as meeting_db:
+4 −1
Original line number Diff line number Diff line
@@ -38,7 +38,7 @@ from tdoc_crawler.logging import DEFAULT_LEVEL as DEFAULT_VERBOSITY
from tdoc_crawler.logging import set_verbosity
from tdoc_crawler.models.base import OutputFormat, SortOrder
from tdoc_crawler.tdocs.models import TDocQueryConfig
from tdoc_crawler.tdocs.operations.checkout import checkout_tdocs
from tdoc_crawler.tdocs.operations.checkout import checkout_tdocs, resolve_tdoc_file_sizes
from tdoc_crawler.tdocs.operations.fetch import fetch_missing_tdocs
from tdoc_crawler.utils.parse import parse_working_groups

@@ -140,6 +140,9 @@ def query_tdocs(
        with create_cached_session() as session:
            checkout_tdocs(results, path_config.checkout_dir, force=False, session=session)

    # Resolve file sizes from on-disk checkout directories for display
    resolve_tdoc_file_sizes(results, path_config.checkout_dir)

    async def load_meeting_map() -> dict:
        async with MeetingDatabase(db_file) as meeting_db:
            return await meeting_db._meeting_map()
+21 −2
Original line number Diff line number Diff line
@@ -18,10 +18,9 @@ from urllib.parse import urljoin, urlparse
import niquests as requests
from bs4 import BeautifulSoup

from tdoc_crawler.database.oxyde_models import TDocMetadata
from tdoc_crawler.database.oxyde_models import MeetingMetadata, TDocMetadata
from tdoc_crawler.http_client import download_to_file
from tdoc_crawler.logging import get_logger
from tdoc_crawler.database.oxyde_models import MeetingMetadata
from tdoc_crawler.tdocs.models import TDocStatus
from tdoc_crawler.tdocs.sources.doclist import DocumentListError, fetch_meeting_document_list
from tdoc_crawler.utils.security import safe_extract_zip
@@ -38,6 +37,26 @@ class CheckoutResult:
    errors: list[str]


def resolve_tdoc_file_sizes(results: list[TDocMetadata], checkout_dir: Path) -> None:
    """Resolve file sizes for TDocs from on-disk checkout directories.

    For each TDoc with ``file_size`` unset and a non-empty URL, computes the
    checkout path via ``get_checkout_path`` and sums the sizes of all regular
    files found in that directory, including files in nested sub-folders
    (extracted archives may contain their own directory structure).
    Updates ``record.file_size`` in-place.

    Records are left untouched when:

    * ``file_size`` is already set or the record has no URL,
    * the checkout path does not exist or is not a directory,
    * the directory cannot be read completely (``OSError``), or
    * the directory contains no data (total size of zero).

    Args:
        results: TDoc records to update in-place.
        checkout_dir: Base directory under which TDocs are checked out.
    """
    for record in results:
        if record.file_size is not None or not record.url:
            continue
        tdoc_dir = get_checkout_path(record, checkout_dir)
        # is_dir() rather than exists(): a stray regular file at the checkout
        # path would otherwise make the directory scan raise.
        if not tdoc_dir.is_dir():
            continue
        total = 0
        try:
            # Walk recursively so files inside extracted sub-folders count too.
            for entry in tdoc_dir.rglob("*"):
                if entry.is_file():
                    total += entry.stat().st_size
        except OSError:
            # Best-effort: this value is only used for display. A file that
            # vanished mid-scan or an unreadable folder must not abort the
            # caller; leave the size unset rather than store a partial sum.
            continue
        if total:
            record.file_size = total


def get_checkout_path(metadata: TDocMetadata, checkout_dir: Path) -> Path:
    """Calculate the checkout path for a TDoc based on its URL.