Commit 90d56ffb authored by Jan Reimes
Browse files

feat(cli, database): enhance date filtering and add glob pattern support

* Introduce start and end date filters for meetings and TDocs.
* Add glob pattern filters for source, title, and agenda fields.
* Update environment variable examples for date formats.
* Implement date parsing utility for flexible date input.
* Adjust CLI commands to support new filtering options.
parent be72f6d8
Loading
Loading
Loading
Loading
+8 −4
Original line number Diff line number Diff line
@@ -50,11 +50,11 @@ TDC_LIMIT_TDOCS=100
# Limit total meetings to crawl (default: None = no limit)
TDC_LIMIT_MEETINGS=10

# Query date range - start date (ISO 8601 timestamp, e.g., 2024-01-01T00:00:00Z)
TDC_START_DATE=2024-01-01T00:00:00Z
# Query date range - start date (YYYY, YYYY-MM, or YYYY-MM-DD format)
TDC_START_DATE=2024-01-01

# Query date range - end date (ISO 8601 timestamp, e.g., 2024-12-31T23:59:59Z)
TDC_END_DATE=2024-12-31T23:59:59Z
# Query date range - end date (YYYY, YYYY-MM, or YYYY-MM-DD format)
TDC_END_DATE=2024-12-31

# Output Configuration

@@ -96,6 +96,10 @@ TDC_AI_LLM_API_BASE=
# See https://huggingface.co/models?library=sentence-transformers for alternatives
TDC_AI_EMBEDDING_MODEL=perplexity-ai/pplx-embed-context-v1-0.6b

# Activate workspace after creation (default: true)
# Set to "true", "1", or "yes" to enable; anything else disables it
TDC_AI_WORKSPACE_ACTIVATE=true

# Chunking
TDC_AI_MAX_CHUNK_SIZE=1000
TDC_AI_CHUNK_OVERLAP=100
+61 −20
Original line number Diff line number Diff line
@@ -3,6 +3,7 @@
from __future__ import annotations

import json
from datetime import datetime
from pathlib import Path
from typing import Annotated

@@ -28,9 +29,20 @@ from tdoc_crawler.ai import (
)
from tdoc_crawler.ai.models import SourceKind
from tdoc_crawler.ai.operations.pipeline import process_all
from tdoc_crawler.cli.args import (
    AgendaPatternExcludeOption,
    AgendaPatternOption,
    EndDateOption,
    SourcePatternExcludeOption,
    SourcePatternOption,
    StartDateOption,
    TitlePatternExcludeOption,
    TitlePatternOption,
)
from tdoc_crawler.config import CacheManager

HELP_PANEL = "AI Commands"
from tdoc_crawler.database import TDocDatabase
from tdoc_crawler.tdocs.models import TDocQueryConfig
from tdoc_crawler.utils.date_parser import parse_partial_date

ai_app = typer.Typer(help="AI document processing commands")
console = Console()
@@ -179,6 +191,9 @@ _workspace_app = typer.Typer(help="Manage GraphRAG workspaces")
def workspace_create(
    name: Annotated[str, typer.Argument(..., help="Workspace name")],
    auto_build: Annotated[bool, typer.Option("--auto-build", help="Automatically process documents added to this workspace")] = False,
    activate: Annotated[
        bool, typer.Option("--activate/--no-activate", help="Activate workspace after creation (default: activate)", envvar="TDC_AI_WORKSPACE_ACTIVATE")
    ] = True,
    json_output: Annotated[bool, typer.Option("--json", help="Output as JSON")] = False,
) -> None:
    """Create a new workspace."""
@@ -187,10 +202,15 @@ def workspace_create(
    storage = AiStorage(AiConfig.from_env(cache_manager_name="default").ai_store_path)  # type: ignore[arg-type]
    workspace = create_workspace(storage, name, auto_build=auto_build)

    if activate:
        set_active_workspace(name)

    if json_output:
        typer.echo(workspace.model_dump_json())
    else:
        console.print(f"[green]Created workspace: {workspace.workspace_name}[/green]")
        if activate:
            console.print("[cyan]Activated as current workspace[/cyan]")
        if auto_build:
            console.print("[cyan]Auto-build: Enabled[/cyan]")

@@ -351,16 +371,29 @@ def workspace_clear(
@_workspace_app.command("add-members")
def workspace_add_members(
    workspace: Annotated[str | None, typer.Option("--workspace", "-w", help="Workspace name")] = None,
    items: Annotated[list[str], typer.Argument(..., help="Source item IDs to add")] = None,  # type: ignore[assignment]
    items: Annotated[list[str] | None, typer.Argument(..., help="Source item IDs to add (optional if filters provided)")] = None,
    kind: Annotated[
        str,
        typer.Option("--kind", help="Source kind (tdoc, spec, other)"),
    ] = "tdoc",
    checkout: Annotated[bool, typer.Option("--checkout/--no-checkout", help="Checkout/download documents if not present")] = True,
    release: Annotated[str | None, typer.Option("--release", help="Spec release version (e.g., 16.3.0, 17.0.0). Only applies to specs.")] = None,
    # NEW: Add filter options
    start_date: StartDateOption = None,
    end_date: EndDateOption = None,
    source: SourcePatternOption = None,
    source_ex: SourcePatternExcludeOption = None,
    title: TitlePatternOption = None,
    title_ex: TitlePatternExcludeOption = None,
    agenda: AgendaPatternOption = None,
    agenda_ex: AgendaPatternExcludeOption = None,
    limit: Annotated[int | None, typer.Option("--limit", help="Maximum items to add")] = None,
    json_output: Annotated[bool, typer.Option("--json", help="Output as JSON")] = False,
) -> None:
    """Add source items to a workspace."""
    """Add source items to a workspace.

    If no items are provided, queries the database using the provided filters.
    """
    workspace = resolve_workspace(workspace)
    manager = CacheManager().register()

@@ -368,12 +401,30 @@ def workspace_add_members(

    source_kind = SourceKind(kind.lower()) if kind.lower() in [e.value for e in SourceKind] else SourceKind.OTHER

    # Build members with actual paths
    members = []
    checkout_base = manager.root / "checkout"
    # Query database if no items provided but filters are present
    if items is None:
        if source_kind == SourceKind.TDOC:
            config = TDocQueryConfig(
                start_date=datetime.combine(parse_partial_date(start_date), datetime.min.time()) if start_date else None,
                end_date=datetime.combine(parse_partial_date(end_date, is_end=True), datetime.max.time()) if end_date else None,
                source_pattern=source,
                source_pattern_exclude=source_ex,
                title_pattern=title,
                title_pattern_exclude=title_ex,
                agenda_pattern=agenda,
                agenda_pattern_exclude=agenda_ex,
                limit=limit,
            )
            with TDocDatabase(manager.db_file) as db:
                results = db.query_tdocs(config)
                items = [tdoc.tdoc_id for tdoc in results]
        else:
            console.print("[red]Error: No items provided and filtering is only supported for TDocs[/red]")
            raise typer.Exit(1)

    for item in items:
        source_path = item
    if not items:
        console.print("[yellow]No items match the provided filters[/yellow]")
        return

    # Build members with actual paths
    members = []
@@ -400,6 +451,7 @@ def workspace_add_members(
                # Ensure .ai subfolder exists
                ensure_ai_subfolder(checkout_path)
        members.append(make_workspace_member(workspace, item, source_path, source_kind))

    # Report skipped items
    if skipped_items:
        console.print("\n[yellow]Warning: Skipped invalid items:[/yellow]")
@@ -408,17 +460,6 @@ def workspace_add_members(
        if source_kind == SourceKind.TDOC:
            console.print("\n[yellow]Hint: For TDocs, ensure the meeting has been crawled with 'tdoc-crawler crawl-tdocs <meeting-id>'[/yellow]")
        console.print()
        if checkout:
            checkout_path = None
            if source_kind == SourceKind.TDOC:
                checkout_path = checkout_tdoc_to_workspace(item, checkout_base, storage, workspace)
            elif source_kind == SourceKind.SPEC:
                checkout_path = checkout_spec_to_workspace(item, checkout_base, workspace, release or "latest")
            if checkout_path:
                source_path = str(checkout_path)
                # Ensure .ai subfolder exists
                ensure_ai_subfolder(checkout_path)
        members.append(make_workspace_member(workspace, item, source_path, source_kind))

    count = storage.add_workspace_members(workspace, members)

+25 −0
Original line number Diff line number Diff line
@@ -68,6 +68,31 @@ IncludeWithoutFilesOption = Annotated[
    typer.Option("--include-without-files", help="Include meetings without files URLs"),
]
FullMetadataOption = Annotated[bool, typer.Option("--full-metadata", help="Fetch full metadata instead of URL only")]
# Glob pattern filters for TDocs.
# Each alias is an Annotated[list[str] | None, typer.Option(...)] reused across
# CLI commands so the flag name, help text, and envvar stay consistent.
# Semantics: patterns are glob expressions (fnmatch-style, e.g. '*huawei*');
# passing a flag multiple times OR's the patterns together. The *Exclude
# variants filter matching rows OUT. Each option can also be supplied via its
# TDC_* environment variable.
SourcePatternOption = Annotated[
    list[str] | None,
    typer.Option("--source", help="Glob pattern for source field (e.g., '*huawei*'). Multiple values are OR'd.", envvar="TDC_SOURCE_PATTERN"),
]
# Exclude counterpart of --source: rows whose source matches are dropped.
SourcePatternExcludeOption = Annotated[
    list[str] | None,
    typer.Option("--source-ex", help="Glob pattern to exclude source field. Multiple values are OR'd.", envvar="TDC_SOURCE_PATTERN_EXCLUDE"),
]
# Include filter on the TDoc title field.
TitlePatternOption = Annotated[
    list[str] | None,
    typer.Option("--title", help="Glob pattern for title field (e.g., '*AI*'). Multiple values are OR'd.", envvar="TDC_TITLE_PATTERN"),
]
# Exclude counterpart of --title.
TitlePatternExcludeOption = Annotated[
    list[str] | None,
    typer.Option("--title-ex", help="Glob pattern to exclude title field. Multiple values are OR'd.", envvar="TDC_TITLE_PATTERN_EXCLUDE"),
]
# Include filter on the agenda_item_text field.
AgendaPatternOption = Annotated[
    list[str] | None,
    typer.Option("--agenda", help="Glob pattern for agenda_item_text field. Multiple values are OR'd.", envvar="TDC_AGENDA_PATTERN"),
]
# Exclude counterpart of --agenda.
AgendaPatternExcludeOption = Annotated[
    list[str] | None,
    typer.Option("--agenda-ex", help="Glob pattern to exclude agenda_item_text field. Multiple values are OR'd.", envvar="TDC_AGENDA_PATTERN_EXCLUDE"),
]


# Options - Specs
+48 −4
Original line number Diff line number Diff line
@@ -11,11 +11,14 @@ from dotenv import load_dotenv

from tdoc_crawler.cli._shared import console, create_progress_bar, handle_clear_options
from tdoc_crawler.cli.args import (
    AgendaPatternExcludeOption,
    AgendaPatternOption,
    CacheDirOption,
    CheckoutOption,
    ClearDbOption,
    ClearSpecsOption,
    ClearTDocsOption,
    EndDateOption,
    EolPasswordOption,
    EolUsernameOption,
    HttpCacheOption,
@@ -28,13 +31,17 @@ from tdoc_crawler.cli.args import (
    MaxRetriesOption,
    NoProgressOption,
    OutputFormatOption,
    OverallTimeoutOption,
    PromptCredentialsOption,
    ReleaseOption,
    SourcePatternExcludeOption,
    SourcePatternOption,
    SpecArgument,
    SpecFileOption,
    StartDateOption,
    SubgroupOption,
    TimeoutOption,
    TitlePatternExcludeOption,
    TitlePatternOption,
    VerbosityOption,
    WorkersOption,
    WorkingGroupOption,
@@ -56,6 +63,7 @@ from tdoc_crawler.specs.operations.checkout import build_default_spec_sources, c
from tdoc_crawler.tdocs.models import TDocCrawlConfig, TDocQueryConfig
from tdoc_crawler.tdocs.operations import TDocCrawler
from tdoc_crawler.tdocs.operations.checkout import checkout_meeting_tdocs, checkout_tdocs
from tdoc_crawler.utils.date_parser import parse_partial_date
from tdoc_crawler.utils.parse import collect_spec_numbers, parse_subgroups, parse_working_groups

load_dotenv()
@@ -77,8 +85,16 @@ def crawl_tdocs(
    workers: WorkersOption = 4,
    timeout: TimeoutOption = 30,
    max_retries: MaxRetriesOption = 3,
    overall_timeout: OverallTimeoutOption = None,
    overall_timeout: int | None = None,
    no_progress: NoProgressOption = False,
    start_date: StartDateOption = None,
    end_date: EndDateOption = None,
    source: SourcePatternOption = None,
    source_ex: SourcePatternExcludeOption = None,
    title: TitlePatternOption = None,
    title_ex: TitlePatternExcludeOption = None,
    agenda: AgendaPatternOption = None,
    agenda_ex: AgendaPatternExcludeOption = None,
    cache_dir: CacheDirOption = None,
    http_cache_enabled: HttpCacheOption = None,
    verbosity: VerbosityOption = str(DEFAULT_VERBOSITY),
@@ -103,8 +119,8 @@ def crawl_tdocs(
        working_groups=working_groups,
        subgroups=subgroups,
        meeting_ids=None,
        start_date=None,
        end_date=None,
        start_date=parse_partial_date(start_date, is_end=False) if start_date and start_date.strip() else None,
        end_date=parse_partial_date(end_date, is_end=True) if end_date and end_date.strip() else None,
        incremental=incremental,
        force_revalidate=False,
        workers=workers,
@@ -128,9 +144,25 @@ def crawl_tdocs(
            limit=None,
            order=SortOrder.ASC,
            include_without_files=False,
            start_date=parse_partial_date(start_date, is_end=False) if start_date and start_date.strip() else None,
            end_date=parse_partial_date(end_date, is_end=True) if end_date and end_date.strip() else None,
        )
        meetings = meeting_db.query_meetings(query_config)

    if meetings:
        # Extract unique subgroups from queried meetings
        unique_subgroups = {SUBTB_INDEX[m.subtb].code for m in meetings if m.subtb and m.subtb in SUBTB_INDEX}
        if unique_subgroups:
            scope_parts.append(f"subgroups: {', '.join(sorted(unique_subgroups))}")
        else:
            scope_parts.append(f"meetings: {len(meetings)} meeting(s)")
    # Fallback to input parameters if no meetings found in DB
    elif subgroups:
        scope_parts.append(f"subgroups: {', '.join(subgroups)}")
    else:
        scope_parts.append(f"working groups: {', '.join(wg.value for wg in working_groups)}")
        meetings = meeting_db.query_meetings(query_config)

    if meetings:
        # Extract unique subgroups from queried meetings
        unique_subgroups = {SUBTB_INDEX[m.subtb].code for m in meetings if m.subtb and m.subtb in SUBTB_INDEX}
@@ -191,6 +223,14 @@ def crawl_tdocs(
                working_groups=working_groups,
                limit=checkout_limit,
                order=SortOrder.DESC,
                start_date=parse_partial_date(start_date, is_end=False) if start_date and start_date.strip() else None,
                end_date=parse_partial_date(end_date, is_end=True) if end_date and end_date.strip() else None,
                source_pattern=source,
                source_pattern_exclude=source_ex,
                title_pattern=title,
                title_pattern_exclude=title_ex,
                agenda_pattern=agenda,
                agenda_pattern_exclude=agenda_ex,
            )
            results = database.query_tdocs(query_config)

@@ -229,6 +269,8 @@ def crawl_meetings(
    eol_username: EolUsernameOption = None,
    eol_password: EolPasswordOption = None,
    prompt_credentials: PromptCredentialsOption = None,
    start_date: StartDateOption = None,
    end_date: EndDateOption = None,
    cache_dir: CacheDirOption = None,
    verbosity: VerbosityOption = str(DEFAULT_VERBOSITY),
) -> None:
@@ -305,6 +347,8 @@ def crawl_meetings(
            limit=limit_meetings if limit_meetings and limit_meetings > 0 else None,
            order=SortOrder.DESC,
            include_without_files=False,
            start_date=parse_partial_date(start_date, is_end=False) if start_date and start_date.strip() else None,
            end_date=parse_partial_date(end_date, is_end=True) if end_date and end_date.strip() else None,
        )
        with MeetingDatabase(db_file) as database:
            meetings = database.query_meetings(query_config)
+22 −2
Original line number Diff line number Diff line
@@ -11,6 +11,8 @@ from dotenv import load_dotenv

from tdoc_crawler.cli._shared import console, handle_clear_options
from tdoc_crawler.cli.args import (
    AgendaPatternExcludeOption,
    AgendaPatternOption,
    CacheDirOption,
    CheckoutOption,
    ClearSpecsOption,
@@ -21,6 +23,8 @@ from tdoc_crawler.cli.args import (
    NoFetchOption,
    OrderOption,
    OutputFormatOption,
    SourcePatternExcludeOption,
    SourcePatternOption,
    SpecArgument,
    SpecFileOption,
    StartDateOption,
@@ -28,6 +32,8 @@ from tdoc_crawler.cli.args import (
    SubgroupOption,
    TDocIdsArgument,
    TitleOption,
    TitlePatternExcludeOption,
    TitlePatternOption,
    VerbosityOption,
    WorkingGroupOption,
)
@@ -52,6 +58,7 @@ from tdoc_crawler.specs.operations.checkout import checkout_specs
from tdoc_crawler.tdocs.models import TDocQueryConfig
from tdoc_crawler.tdocs.operations.checkout import checkout_meeting_tdocs, checkout_tdocs
from tdoc_crawler.tdocs.operations.fetch import fetch_missing_tdocs
from tdoc_crawler.utils.date_parser import parse_partial_date
from tdoc_crawler.utils.parse import collect_spec_numbers, parse_subgroups, parse_working_groups

load_dotenv()
@@ -72,6 +79,13 @@ def query_tdocs(
    clear_tdocs: ClearTDocsOption = False,
    clear_specs: ClearSpecsOption = False,
    cache_dir: CacheDirOption = None,
    # Glob pattern filters
    source: SourcePatternOption = None,
    source_ex: SourcePatternExcludeOption = None,
    title: TitlePatternOption = None,
    title_ex: TitlePatternExcludeOption = None,
    agenda: AgendaPatternOption = None,
    agenda_ex: AgendaPatternExcludeOption = None,
    verbosity: VerbosityOption = str(DEFAULT_VERBOSITY),
) -> None:
    """Query TDoc metadata from database."""
@@ -80,12 +94,12 @@ def query_tdocs(

    working_groups = parse_working_groups(working_group)
    try:
        start = datetime.fromisoformat(start_date) if start_date else None
        start = datetime.combine(parse_partial_date(start_date), datetime.min.time()) if start_date else None
    except ValueError as exc:
        console.print("[red]Invalid start date format; use ISO-8601")
        raise typer.Exit(code=2) from exc
    try:
        end = datetime.fromisoformat(end_date) if end_date else None
        end = datetime.combine(parse_partial_date(end_date, is_end=True), datetime.max.time()) if end_date else None
    except ValueError as exc:
        console.print("[red]Invalid end date format; use ISO-8601")
        raise typer.Exit(code=2) from exc
@@ -111,6 +125,12 @@ def query_tdocs(
        end_date=end,
        limit=limit,
        order=sort_order,
        source_pattern=source,
        source_pattern_exclude=source_ex,
        title_pattern=title,
        title_pattern_exclude=title_ex,
        agenda_pattern=agenda,
        agenda_pattern_exclude=agenda_ex,
    )

    db_file = manager.db_file
Loading