Commit 90d56ffb authored by Jan Reimes
Browse files

feat(cli, database): enhance date filtering and add glob pattern support

* Introduce start and end date filters for meetings and TDocs.
* Add glob pattern filters for source, title, and agenda fields.
* Update environment variable examples for date formats.
* Implement date parsing utility for flexible date input.
* Adjust CLI commands to support new filtering options.
parent be72f6d8
Loading
Loading
Loading
Loading
+8 −4
Original line number Diff line number Diff line
@@ -50,11 +50,11 @@ TDC_LIMIT_TDOCS=100
# Limit total meetings to crawl (default: None = no limit)
TDC_LIMIT_MEETINGS=10

# Query date range - start date (ISO 8601 timestamp, e.g., 2024-01-01T00:00:00Z)
TDC_START_DATE=2024-01-01T00:00:00Z
# Query date range - start date (YYYY, YYYY-MM, or YYYY-MM-DD format)
TDC_START_DATE=2024-01-01

# Query date range - end date (ISO 8601 timestamp, e.g., 2024-12-31T23:59:59Z)
TDC_END_DATE=2024-12-31T23:59:59Z
# Query date range - end date (YYYY, YYYY-MM, or YYYY-MM-DD format)
TDC_END_DATE=2024-12-31

# Output Configuration

@@ -96,6 +96,10 @@ TDC_AI_LLM_API_BASE=
# See https://huggingface.co/models?library=sentence-transformers for alternatives
TDC_AI_EMBEDDING_MODEL=perplexity-ai/pplx-embed-context-v1-0.6b

# Activate workspace after creation (default: true)
# Set to "true", "1", or "yes" to enable; anything else disables it
TDC_AI_WORKSPACE_ACTIVATE=true

# Chunking
TDC_AI_MAX_CHUNK_SIZE=1000
TDC_AI_CHUNK_OVERLAP=100
+61 −20
Original line number Diff line number Diff line
@@ -3,6 +3,7 @@
from __future__ import annotations

import json
from datetime import datetime
from pathlib import Path
from typing import Annotated

@@ -28,9 +29,20 @@ from tdoc_crawler.ai import (
)
from tdoc_crawler.ai.models import SourceKind
from tdoc_crawler.ai.operations.pipeline import process_all
from tdoc_crawler.cli.args import (
    AgendaPatternExcludeOption,
    AgendaPatternOption,
    EndDateOption,
    SourcePatternExcludeOption,
    SourcePatternOption,
    StartDateOption,
    TitlePatternExcludeOption,
    TitlePatternOption,
)
from tdoc_crawler.config import CacheManager

HELP_PANEL = "AI Commands"
from tdoc_crawler.database import TDocDatabase
from tdoc_crawler.tdocs.models import TDocQueryConfig
from tdoc_crawler.utils.date_parser import parse_partial_date

ai_app = typer.Typer(help="AI document processing commands")
console = Console()
@@ -179,6 +191,9 @@ _workspace_app = typer.Typer(help="Manage GraphRAG workspaces")
def workspace_create(
    name: Annotated[str, typer.Argument(..., help="Workspace name")],
    auto_build: Annotated[bool, typer.Option("--auto-build", help="Automatically process documents added to this workspace")] = False,
    activate: Annotated[
        bool, typer.Option("--activate/--no-activate", help="Activate workspace after creation (default: activate)", envvar="TDC_AI_WORKSPACE_ACTIVATE")
    ] = True,
    json_output: Annotated[bool, typer.Option("--json", help="Output as JSON")] = False,
) -> None:
    """Create a new workspace."""
@@ -187,10 +202,15 @@ def workspace_create(
    storage = AiStorage(AiConfig.from_env(cache_manager_name="default").ai_store_path)  # type: ignore[arg-type]
    workspace = create_workspace(storage, name, auto_build=auto_build)

    if activate:
        set_active_workspace(name)

    if json_output:
        typer.echo(workspace.model_dump_json())
    else:
        console.print(f"[green]Created workspace: {workspace.workspace_name}[/green]")
        if activate:
            console.print("[cyan]Activated as current workspace[/cyan]")
        if auto_build:
            console.print("[cyan]Auto-build: Enabled[/cyan]")

@@ -351,16 +371,29 @@ def workspace_clear(
@_workspace_app.command("add-members")
def workspace_add_members(
    workspace: Annotated[str | None, typer.Option("--workspace", "-w", help="Workspace name")] = None,
    items: Annotated[list[str], typer.Argument(..., help="Source item IDs to add")] = None,  # type: ignore[assignment]
    items: Annotated[list[str] | None, typer.Argument(..., help="Source item IDs to add (optional if filters provided)")] = None,
    kind: Annotated[
        str,
        typer.Option("--kind", help="Source kind (tdoc, spec, other)"),
    ] = "tdoc",
    checkout: Annotated[bool, typer.Option("--checkout/--no-checkout", help="Checkout/download documents if not present")] = True,
    release: Annotated[str | None, typer.Option("--release", help="Spec release version (e.g., 16.3.0, 17.0.0). Only applies to specs.")] = None,
    # NEW: Add filter options
    start_date: StartDateOption = None,
    end_date: EndDateOption = None,
    source: SourcePatternOption = None,
    source_ex: SourcePatternExcludeOption = None,
    title: TitlePatternOption = None,
    title_ex: TitlePatternExcludeOption = None,
    agenda: AgendaPatternOption = None,
    agenda_ex: AgendaPatternExcludeOption = None,
    limit: Annotated[int | None, typer.Option("--limit", help="Maximum items to add")] = None,
    json_output: Annotated[bool, typer.Option("--json", help="Output as JSON")] = False,
) -> None:
    """Add source items to a workspace."""
    """Add source items to a workspace.

    If no items are provided, queries the database using the provided filters.
    """
    workspace = resolve_workspace(workspace)
    manager = CacheManager().register()

@@ -368,12 +401,30 @@ def workspace_add_members(

    source_kind = SourceKind(kind.lower()) if kind.lower() in [e.value for e in SourceKind] else SourceKind.OTHER

    # Build members with actual paths
    members = []
    checkout_base = manager.root / "checkout"
    # Query database if no items provided but filters are present
    if items is None:
        if source_kind == SourceKind.TDOC:
            config = TDocQueryConfig(
                start_date=datetime.combine(parse_partial_date(start_date), datetime.min.time()) if start_date else None,
                end_date=datetime.combine(parse_partial_date(end_date, is_end=True), datetime.max.time()) if end_date else None,
                source_pattern=source,
                source_pattern_exclude=source_ex,
                title_pattern=title,
                title_pattern_exclude=title_ex,
                agenda_pattern=agenda,
                agenda_pattern_exclude=agenda_ex,
                limit=limit,
            )
            with TDocDatabase(manager.db_file) as db:
                results = db.query_tdocs(config)
                items = [tdoc.tdoc_id for tdoc in results]
        else:
            console.print("[red]Error: No items provided and filtering is only supported for TDocs[/red]")
            raise typer.Exit(1)

    for item in items:
        source_path = item
    if not items:
        console.print("[yellow]No items match the provided filters[/yellow]")
        return

    # Build members with actual paths
    members = []
@@ -400,6 +451,7 @@ def workspace_add_members(
                # Ensure .ai subfolder exists
                ensure_ai_subfolder(checkout_path)
        members.append(make_workspace_member(workspace, item, source_path, source_kind))

    # Report skipped items
    if skipped_items:
        console.print("\n[yellow]Warning: Skipped invalid items:[/yellow]")
@@ -408,17 +460,6 @@ def workspace_add_members(
        if source_kind == SourceKind.TDOC:
            console.print("\n[yellow]Hint: For TDocs, ensure the meeting has been crawled with 'tdoc-crawler crawl-tdocs <meeting-id>'[/yellow]")
        console.print()
        if checkout:
            checkout_path = None
            if source_kind == SourceKind.TDOC:
                checkout_path = checkout_tdoc_to_workspace(item, checkout_base, storage, workspace)
            elif source_kind == SourceKind.SPEC:
                checkout_path = checkout_spec_to_workspace(item, checkout_base, workspace, release or "latest")
            if checkout_path:
                source_path = str(checkout_path)
                # Ensure .ai subfolder exists
                ensure_ai_subfolder(checkout_path)
        members.append(make_workspace_member(workspace, item, source_path, source_kind))

    count = storage.add_workspace_members(workspace, members)

+25 −0
Original line number Diff line number Diff line
@@ -68,6 +68,31 @@ IncludeWithoutFilesOption = Annotated[
    typer.Option("--include-without-files", help="Include meetings without files URLs"),
]
FullMetadataOption = Annotated[bool, typer.Option("--full-metadata", help="Fetch full metadata instead of URL only")]
# Glob pattern filters for TDocs.
# Each alias is an Annotated[list[str] | None, typer.Option(...)] reused across
# CLI commands so the flag name, help text, and envvar stay consistent.
# Semantics: patterns are glob expressions (fnmatch-style, e.g. '*huawei*');
# passing a flag multiple times OR's the patterns together. The *Exclude
# variants filter matching rows OUT. Each option can also be supplied via its
# TDC_* environment variable.
SourcePatternOption = Annotated[
    list[str] | None,
    typer.Option("--source", help="Glob pattern for source field (e.g., '*huawei*'). Multiple values are OR'd.", envvar="TDC_SOURCE_PATTERN"),
]
# Exclude counterpart of --source: rows whose source matches are dropped.
SourcePatternExcludeOption = Annotated[
    list[str] | None,
    typer.Option("--source-ex", help="Glob pattern to exclude source field. Multiple values are OR'd.", envvar="TDC_SOURCE_PATTERN_EXCLUDE"),
]
# Include filter on the TDoc title field.
TitlePatternOption = Annotated[
    list[str] | None,
    typer.Option("--title", help="Glob pattern for title field (e.g., '*AI*'). Multiple values are OR'd.", envvar="TDC_TITLE_PATTERN"),
]
# Exclude counterpart of --title.
TitlePatternExcludeOption = Annotated[
    list[str] | None,
    typer.Option("--title-ex", help="Glob pattern to exclude title field. Multiple values are OR'd.", envvar="TDC_TITLE_PATTERN_EXCLUDE"),
]
# Include filter on the agenda_item_text field.
AgendaPatternOption = Annotated[
    list[str] | None,
    typer.Option("--agenda", help="Glob pattern for agenda_item_text field. Multiple values are OR'd.", envvar="TDC_AGENDA_PATTERN"),
]
# Exclude counterpart of --agenda.
AgendaPatternExcludeOption = Annotated[
    list[str] | None,
    typer.Option("--agenda-ex", help="Glob pattern to exclude agenda_item_text field. Multiple values are OR'd.", envvar="TDC_AGENDA_PATTERN_EXCLUDE"),
]


# Options - Specs
+48 −4
Original line number Diff line number Diff line
@@ -11,11 +11,14 @@ from dotenv import load_dotenv

from tdoc_crawler.cli._shared import console, create_progress_bar, handle_clear_options
from tdoc_crawler.cli.args import (
    AgendaPatternExcludeOption,
    AgendaPatternOption,
    CacheDirOption,
    CheckoutOption,
    ClearDbOption,
    ClearSpecsOption,
    ClearTDocsOption,
    EndDateOption,
    EolPasswordOption,
    EolUsernameOption,
    HttpCacheOption,
@@ -28,13 +31,17 @@ from tdoc_crawler.cli.args import (
    MaxRetriesOption,
    NoProgressOption,
    OutputFormatOption,
    OverallTimeoutOption,
    PromptCredentialsOption,
    ReleaseOption,
    SourcePatternExcludeOption,
    SourcePatternOption,
    SpecArgument,
    SpecFileOption,
    StartDateOption,
    SubgroupOption,
    TimeoutOption,
    TitlePatternExcludeOption,
    TitlePatternOption,
    VerbosityOption,
    WorkersOption,
    WorkingGroupOption,
@@ -56,6 +63,7 @@ from tdoc_crawler.specs.operations.checkout import build_default_spec_sources, c
from tdoc_crawler.tdocs.models import TDocCrawlConfig, TDocQueryConfig
from tdoc_crawler.tdocs.operations import TDocCrawler
from tdoc_crawler.tdocs.operations.checkout import checkout_meeting_tdocs, checkout_tdocs
from tdoc_crawler.utils.date_parser import parse_partial_date
from tdoc_crawler.utils.parse import collect_spec_numbers, parse_subgroups, parse_working_groups

load_dotenv()
@@ -77,8 +85,16 @@ def crawl_tdocs(
    workers: WorkersOption = 4,
    timeout: TimeoutOption = 30,
    max_retries: MaxRetriesOption = 3,
    overall_timeout: OverallTimeoutOption = None,
    overall_timeout: int | None = None,
    no_progress: NoProgressOption = False,
    start_date: StartDateOption = None,
    end_date: EndDateOption = None,
    source: SourcePatternOption = None,
    source_ex: SourcePatternExcludeOption = None,
    title: TitlePatternOption = None,
    title_ex: TitlePatternExcludeOption = None,
    agenda: AgendaPatternOption = None,
    agenda_ex: AgendaPatternExcludeOption = None,
    cache_dir: CacheDirOption = None,
    http_cache_enabled: HttpCacheOption = None,
    verbosity: VerbosityOption = str(DEFAULT_VERBOSITY),
@@ -103,8 +119,8 @@ def crawl_tdocs(
        working_groups=working_groups,
        subgroups=subgroups,
        meeting_ids=None,
        start_date=None,
        end_date=None,
        start_date=parse_partial_date(start_date, is_end=False) if start_date and start_date.strip() else None,
        end_date=parse_partial_date(end_date, is_end=True) if end_date and end_date.strip() else None,
        incremental=incremental,
        force_revalidate=False,
        workers=workers,
@@ -128,9 +144,25 @@ def crawl_tdocs(
            limit=None,
            order=SortOrder.ASC,
            include_without_files=False,
            start_date=parse_partial_date(start_date, is_end=False) if start_date and start_date.strip() else None,
            end_date=parse_partial_date(end_date, is_end=True) if end_date and end_date.strip() else None,
        )
        meetings = meeting_db.query_meetings(query_config)

    if meetings:
        # Extract unique subgroups from queried meetings
        unique_subgroups = {SUBTB_INDEX[m.subtb].code for m in meetings if m.subtb and m.subtb in SUBTB_INDEX}
        if unique_subgroups:
            scope_parts.append(f"subgroups: {', '.join(sorted(unique_subgroups))}")
        else:
            scope_parts.append(f"meetings: {len(meetings)} meeting(s)")
    # Fallback to input parameters if no meetings found in DB
    elif subgroups:
        scope_parts.append(f"subgroups: {', '.join(subgroups)}")
    else:
        scope_parts.append(f"working groups: {', '.join(wg.value for wg in working_groups)}")
        meetings = meeting_db.query_meetings(query_config)

    if meetings:
        # Extract unique subgroups from queried meetings
        unique_subgroups = {SUBTB_INDEX[m.subtb].code for m in meetings if m.subtb and m.subtb in SUBTB_INDEX}
@@ -191,6 +223,14 @@ def crawl_tdocs(
                working_groups=working_groups,
                limit=checkout_limit,
                order=SortOrder.DESC,
                start_date=parse_partial_date(start_date, is_end=False) if start_date and start_date.strip() else None,
                end_date=parse_partial_date(end_date, is_end=True) if end_date and end_date.strip() else None,
                source_pattern=source,
                source_pattern_exclude=source_ex,
                title_pattern=title,
                title_pattern_exclude=title_ex,
                agenda_pattern=agenda,
                agenda_pattern_exclude=agenda_ex,
            )
            results = database.query_tdocs(query_config)

@@ -229,6 +269,8 @@ def crawl_meetings(
    eol_username: EolUsernameOption = None,
    eol_password: EolPasswordOption = None,
    prompt_credentials: PromptCredentialsOption = None,
    start_date: StartDateOption = None,
    end_date: EndDateOption = None,
    cache_dir: CacheDirOption = None,
    verbosity: VerbosityOption = str(DEFAULT_VERBOSITY),
) -> None:
@@ -305,6 +347,8 @@ def crawl_meetings(
            limit=limit_meetings if limit_meetings and limit_meetings > 0 else None,
            order=SortOrder.DESC,
            include_without_files=False,
            start_date=parse_partial_date(start_date, is_end=False) if start_date and start_date.strip() else None,
            end_date=parse_partial_date(end_date, is_end=True) if end_date and end_date.strip() else None,
        )
        with MeetingDatabase(db_file) as database:
            meetings = database.query_meetings(query_config)
+22 −2
Original line number Diff line number Diff line
@@ -11,6 +11,8 @@ from dotenv import load_dotenv

from tdoc_crawler.cli._shared import console, handle_clear_options
from tdoc_crawler.cli.args import (
    AgendaPatternExcludeOption,
    AgendaPatternOption,
    CacheDirOption,
    CheckoutOption,
    ClearSpecsOption,
@@ -21,6 +23,8 @@ from tdoc_crawler.cli.args import (
    NoFetchOption,
    OrderOption,
    OutputFormatOption,
    SourcePatternExcludeOption,
    SourcePatternOption,
    SpecArgument,
    SpecFileOption,
    StartDateOption,
@@ -28,6 +32,8 @@ from tdoc_crawler.cli.args import (
    SubgroupOption,
    TDocIdsArgument,
    TitleOption,
    TitlePatternExcludeOption,
    TitlePatternOption,
    VerbosityOption,
    WorkingGroupOption,
)
@@ -52,6 +58,7 @@ from tdoc_crawler.specs.operations.checkout import checkout_specs
from tdoc_crawler.tdocs.models import TDocQueryConfig
from tdoc_crawler.tdocs.operations.checkout import checkout_meeting_tdocs, checkout_tdocs
from tdoc_crawler.tdocs.operations.fetch import fetch_missing_tdocs
from tdoc_crawler.utils.date_parser import parse_partial_date
from tdoc_crawler.utils.parse import collect_spec_numbers, parse_subgroups, parse_working_groups

load_dotenv()
@@ -72,6 +79,13 @@ def query_tdocs(
    clear_tdocs: ClearTDocsOption = False,
    clear_specs: ClearSpecsOption = False,
    cache_dir: CacheDirOption = None,
    # Glob pattern filters
    source: SourcePatternOption = None,
    source_ex: SourcePatternExcludeOption = None,
    title: TitlePatternOption = None,
    title_ex: TitlePatternExcludeOption = None,
    agenda: AgendaPatternOption = None,
    agenda_ex: AgendaPatternExcludeOption = None,
    verbosity: VerbosityOption = str(DEFAULT_VERBOSITY),
) -> None:
    """Query TDoc metadata from database."""
@@ -80,12 +94,12 @@ def query_tdocs(

    working_groups = parse_working_groups(working_group)
    try:
        start = datetime.fromisoformat(start_date) if start_date else None
        start = datetime.combine(parse_partial_date(start_date), datetime.min.time()) if start_date else None
    except ValueError as exc:
        console.print("[red]Invalid start date format; use ISO-8601")
        raise typer.Exit(code=2) from exc
    try:
        end = datetime.fromisoformat(end_date) if end_date else None
        end = datetime.combine(parse_partial_date(end_date, is_end=True), datetime.max.time()) if end_date else None
    except ValueError as exc:
        console.print("[red]Invalid end date format; use ISO-8601")
        raise typer.Exit(code=2) from exc
@@ -111,6 +125,12 @@ def query_tdocs(
        end_date=end,
        limit=limit,
        order=sort_order,
        source_pattern=source,
        source_pattern_exclude=source_ex,
        title_pattern=title,
        title_pattern_exclude=title_ex,
        agenda_pattern=agenda,
        agenda_pattern_exclude=agenda_ex,
    )

    db_file = manager.db_file
Loading