Commit 21eaa269 authored by Jan Reimes's avatar Jan Reimes
Browse files

feat(logging): add rich logging integration with configurable verbosity

parent a8590985
Loading
Loading
Loading
Loading
+175 −3
Original line number Diff line number Diff line
@@ -7,20 +7,30 @@ directory structure as the server.

from __future__ import annotations

import logging
import posixpath
import shutil
import zipfile
from contextlib import suppress
from dataclasses import dataclass
from pathlib import Path
from typing import TYPE_CHECKING, cast
from urllib.parse import urlparse

import requests

from tdoc_crawler.crawlers.meeting_doclist import DocumentListError, fetch_meeting_document_list
from tdoc_crawler.http_client import download_to_path
from tdoc_crawler.models import TDocMetadata
from tdoc_crawler.logging import get_logger
from tdoc_crawler.models import MeetingMetadata, TDocMetadata
from tdoc_crawler.specs.downloads import SpecDownloads
from tdoc_crawler.specs.sources.base import FunctionSpecSource, SpecSource
from tdoc_crawler.specs.sources.threegpp import fetch_threegpp_metadata
from tdoc_crawler.specs.sources.whatthespec import fetch_whatthespec_metadata

logger = logging.getLogger(__name__)
if TYPE_CHECKING:
    from tdoc_crawler.database import TDocDatabase

logger = get_logger(__name__)


def get_checkout_path(metadata: TDocMetadata, checkout_dir: Path) -> Path:
@@ -173,8 +183,170 @@ def get_checked_out_tdocs(checkout_dir: Path) -> list[str]:
    return tdoc_ids


def clear_checkout_tdocs(checkout_dir: Path) -> int:
    """Clear TDoc checkout entries from the checkout directory.

    Everything directly under the checkout root is deleted except the
    reserved "Specs" subdirectory, which is managed separately by
    clear_checkout_specs().

    Args:
        checkout_dir: Base checkout directory

    Returns:
        Number of entries removed
    """
    if not checkout_dir.exists():
        return 0

    # Snapshot the entries first so we never iterate a directory
    # while deleting from it.
    doomed = [path for path in checkout_dir.iterdir() if path.name != "Specs"]
    for path in doomed:
        if path.is_dir():
            shutil.rmtree(path)
        else:
            path.unlink()
    return len(doomed)


def clear_checkout_specs(checkout_dir: Path) -> int:
    """Clear spec checkout entries from the checkout directory.

    Deletes the entire "Specs" subtree below the checkout root, if any.

    Args:
        checkout_dir: Base checkout directory

    Returns:
        Number of entries removed (always 1 if Specs directory existed)
    """
    specs_dir = checkout_dir / "Specs"
    if specs_dir.exists():
        shutil.rmtree(specs_dir)
        return 1
    return 0


def checkout_specs(
    spec_numbers: list[str],
    checkout_dir: Path,
    database: TDocDatabase,
    release: str = "latest",
    doc_only: bool = False,
) -> list[Path]:
    """Checkout spec documents to the checkout directory.

    Thin convenience wrapper that wires the default spec sources into a
    SpecDownloads instance and delegates the actual work to it.

    Args:
        spec_numbers: List of spec numbers to checkout
        checkout_dir: Base checkout directory
        database: TDocDatabase instance for metadata lookup
        release: Release version to checkout
        doc_only: If True, download only document files instead of full zip

    Returns:
        List of paths to checked out specs
    """
    spec_sources = build_default_spec_sources()
    return SpecDownloads(database).checkout_specs(
        spec_numbers, doc_only, checkout_dir, release, sources=spec_sources
    )


def build_default_spec_sources() -> list[SpecSource]:
    """Build the default list of spec sources.

    Returns:
        List of SpecSource instances for fetching spec metadata
    """
    # (name, fetcher) pairs for each supported metadata backend.
    fetchers = [
        ("3gpp", fetch_threegpp_metadata),
        ("whatthespec", fetch_whatthespec_metadata),
    ]
    return [cast("SpecSource", FunctionSpecSource(name, fetch)) for name, fetch in fetchers]


@dataclass
class CheckoutResult:
    """Result of a checkout operation."""

    # Number of TDocs checked out successfully.
    success_count: int
    # Number of failed TDocs; always equals len(errors).
    error_count: int
    # Human-readable "<tdoc_id>: <reason>" messages, one per failure.
    errors: list[str]


def checkout_tdocs(
    results: list[TDocMetadata],
    checkout_dir: Path,
    force: bool = False,
) -> CheckoutResult:
    """Checkout multiple TDoc files to the checkout directory.

    Args:
        results: List of TDocMetadata to checkout
        checkout_dir: Base checkout directory
        force: If True, re-download even if already exists

    Returns:
        CheckoutResult with success/error counts
    """
    if not results:
        return CheckoutResult(success_count=0, error_count=0, errors=[])

    checkout_dir.mkdir(parents=True, exist_ok=True)
    success_count = 0
    errors: list[str] = []

    for metadata in results:
        try:
            if not metadata.url:
                raise ValueError("missing URL")
            checkout_tdoc(metadata, checkout_dir, force=force)
            success_count += 1
        # NOTE: FileNotFoundError is a subclass of OSError, so listing it
        # separately (as before) was redundant; OSError already covers it.
        except (OSError, ValueError, zipfile.BadZipFile) as exc:
            errors.append(f"{metadata.tdoc_id}: {exc}")

    return CheckoutResult(success_count=success_count, error_count=len(errors), errors=errors)


def checkout_meeting_tdocs(
    meetings: list[MeetingMetadata],
    cache_dir: Path,
) -> CheckoutResult:
    """Checkout TDoc files from a list of meetings.

    Args:
        meetings: List of MeetingMetadata to checkout TDocs from
        cache_dir: Cache directory for document list caching

    Returns:
        CheckoutResult with success/error counts. Meeting-level failures
        (missing files URL, document-list fetch errors) are included in
        the error count and messages alongside per-TDoc download errors.
    """
    if not meetings:
        return CheckoutResult(success_count=0, error_count=0, errors=[])

    checkout_dir = cache_dir / "checkout"
    unique: dict[str, TDocMetadata] = {}
    errors: list[str] = []

    for meeting in meetings:
        if not meeting.files_url:
            errors.append(f"{meeting.short_name}: no files URL")
            continue
        try:
            tdocs = fetch_meeting_document_list(meeting.meeting_id, cache_dir)
        except DocumentListError as exc:
            errors.append(f"{meeting.short_name}: {exc}")
            continue
        for metadata in tdocs:
            # First occurrence wins when the same TDoc appears in multiple meetings.
            unique.setdefault(metadata.tdoc_id, metadata)

    download_result = checkout_tdocs(list(unique.values()), checkout_dir, force=False)
    # BUG FIX: the meeting-level errors collected above were previously
    # discarded; merge them into the returned result so callers see them.
    return CheckoutResult(
        success_count=download_result.success_count,
        error_count=download_result.error_count + len(errors),
        errors=errors + download_result.errors,
    )


__all__ = [
    "CheckoutResult",
    "checkout_meeting_tdocs",
    "checkout_specs",
    "checkout_tdoc",
    "checkout_tdocs",
    "clear_checkout_specs",
    "clear_checkout_tdocs",
    "get_checked_out_tdocs",
    "get_checkout_path",
    "prepare_tdoc_file",
+77 −145

File changed.

Preview size limit exceeded, changes collapsed.

+6 −1
Original line number Diff line number Diff line
@@ -2,11 +2,16 @@

from __future__ import annotations

import logging
from pathlib import Path
from typing import Annotated

import typer

from tdoc_crawler.logging import DEFAULT_LEVEL as LOGGING_DEFAULT_LEVEL

DEFAULT_VERBOSITY = logging.getLevelName(LOGGING_DEFAULT_LEVEL)

CacheDirOption = Annotated[Path, typer.Option("--cache-dir", "-c", help="Cache directory", envvar="TDC_CACHE_DIR")]
WorkingGroupOption = Annotated[list[str] | None, typer.Option("--working-group", "-w", help="Filter by working group", envvar="TDC_WORKING_GROUP")]
SubgroupOption = Annotated[list[str] | None, typer.Option("--sub-group", "-s", help="Filter by sub-working group", envvar="TDC_SUB_GROUP")]
@@ -26,7 +31,7 @@ OverallTimeoutOption = Annotated[
]
MaxRetriesOption = Annotated[int, typer.Option("--max-retries", help="HTTP retry attempts", envvar="TDC_MAX_RETRIES")]
TimeoutOption = Annotated[int, typer.Option("--timeout", help="HTTP timeout seconds", envvar="TDC_TIMEOUT")]
VerboseOption = Annotated[bool, typer.Option("--verbose", "-v", help="Enable verbose logging", envvar="TDC_VERBOSE")]
VerbosityOption = Annotated[str, typer.Option("--verbosity", "-v", help="Logging verbosity level", envvar="TDC_VERBOSITY")]

TDocIdsArgument = Annotated[list[str] | None, typer.Argument(help="TDoc identifiers to query")]
OutputFormatOption = Annotated[str, typer.Option("--output", "-o", help="Output format", envvar="TDC_OUTPUT")]
+4 −7
Original line number Diff line number Diff line
@@ -2,7 +2,6 @@

from __future__ import annotations

import logging
import os
import subprocess
import sys
@@ -15,11 +14,12 @@ import typer
from tdoc_crawler.cli.console import get_console
from tdoc_crawler.crawlers import normalize_subgroup_alias, normalize_working_group_alias
from tdoc_crawler.database import TDocDatabase
from tdoc_crawler.logging import get_logger
from tdoc_crawler.models import CrawlLimits, HttpCacheConfig, MeetingQueryConfig, SortOrder, WorkingGroup
from tdoc_crawler.specs.normalization import normalize_portal_meeting_name
from tdoc_crawler.specs.normalization import expand_spec_ranges_batch, normalize_portal_meeting_name

console = get_console()
_logger = logging.getLogger(__name__)
_logger = get_logger(__name__)

DEFAULT_DATABASE_FILENAME = "tdoc_crawler.db"
DOWNLOAD_TIMEOUT = 60
@@ -130,9 +130,6 @@ def collect_spec_numbers(specs: list[str] | None, spec_file: Path | None) -> lis
    if not collected:
        return []

    # Import here to avoid circular imports
    from tdoc_crawler.specs.normalization import expand_spec_ranges_batch

    try:
        expanded = expand_spec_ranges_batch(collected)
    except ValueError as e:
@@ -239,7 +236,7 @@ def launch_file(path: Path) -> None:
        raise typer.Exit(code=1)
    try:
        if sys.platform.startswith("win"):
            os.startfile(path)  # type: ignore[attr-defined]  # noqa: S606
            os.startfile(path)  # noqa: S606
        elif sys.platform == "darwin":
            open_cmd = Path("/usr/bin/open")
            if open_cmd.exists():
+79 −0
Original line number Diff line number Diff line
"""Logging module with RichHandler integration.

Provides a factory function for creating loggers with RichHandler
configured for consistent console output across the application.
"""

import functools
import logging
from typing import Final

from rich.console import Console
from rich.logging import RichHandler

# Default logging level for all loggers; can be raised/lowered at runtime
# via set_verbosity().
DEFAULT_LEVEL: Final[int] = logging.WARNING

# Module-import side effect: build a single shared Console and RichHandler.
# markup=False so log messages are NOT interpreted as Rich markup tags;
# tracebacks_width=None lets rich tracebacks use the full console width.
_console: Final[Console] = Console()
_rich_handler: RichHandler = RichHandler(
    console=_console,
    show_time=True,
    show_path=False,
    rich_tracebacks=True,
    markup=False,
    tracebacks_width=None,
)

# Make RichHandler the ONLY root handler: clear removes any handlers that
# were installed before this module was imported (e.g. by basicConfig).
logging.root.handlers.clear()
logging.root.addHandler(_rich_handler)
logging.root.setLevel(DEFAULT_LEVEL)


@functools.cache
def get_logger(name: str) -> logging.Logger:
    """Get a logger instance with RichHandler output.

    The RichHandler is installed on the root logger at module import
    time (not on the first call to this function), so every logger
    returned here inherits Rich console output through the root.
    Results are memoized with functools.cache; note that
    logging.getLogger itself already returns the same instance for a
    given name, so the cache only avoids the registry lookup.

    Args:
        name: The logger name, typically __name__ of the calling module.

    Returns:
        A configured logger instance.

    Examples:
        >>> logger = get_logger(__name__)
        >>> logger.info("Starting operation")
        >>> logger.debug("Detailed diagnostic information")
    """
    return logging.getLogger(name)


def set_verbosity(level: int | str) -> None:
    """Set the logging level for all loggers.

    This updates the root logger level, which affects all child loggers
    created via get_logger().

    Args:
        level: The logging level (e.g., logging.DEBUG, logging.INFO, etc.)
                   Accepts integer values (0-50), logging constants, or string names of levels.

    Examples:
        >>> set_verbosity(logging.DEBUG)
        >>> set_verbosity(10)  # DEBUG level
    """
    if isinstance(level, str):
        level = level.upper()
        if level_int := logging._nameToLevel.get(level):
            level = level_int
        elif level.isnumeric():
            level = int(level)
        else:
            raise ValueError(f"Invalid logging level name: {level}")

    logging.root.setLevel(level)
Loading