src/tdoc_crawler/cli/crawl.py  (+41 −29)

@@ -57,6 +57,7 @@ from tdoc_crawler.meetings.models import MeetingCrawlConfig, MeetingMetadata, Me
 from tdoc_crawler.meetings.operations.crawl import MeetingCrawler, MeetingCrawlResult
 from tdoc_crawler.models.base import OutputFormat, SortOrder
 from tdoc_crawler.models.subworking_groups import SUBTB_INDEX
+from tdoc_crawler.models.working_groups import WorkingGroup
 from tdoc_crawler.specs.operations.checkout import build_default_spec_sources, checkout_specs
 from tdoc_crawler.tdocs.models import TDocCrawlConfig, TDocQueryConfig
 from tdoc_crawler.tdocs.operations import TDocCrawler

@@ -75,6 +76,43 @@ def _parse_date(date_str: str | None, is_end: bool = False) -> date | None:
     return parse_partial_date(date_str, is_end=is_end)
 
 
+def _build_scope_description(
+    meetings: list[MeetingMetadata],
+    subgroups: list[str] | None,
+    working_groups: list[WorkingGroup],
+) -> list[str]:
+    scope_parts = []
+    if meetings:
+        unique_subgroups = {SUBTB_INDEX[m.subtb].code for m in meetings if m.subtb and m.subtb in SUBTB_INDEX}
+        if unique_subgroups:
+            scope_parts.append(f"subgroups: {', '.join(sorted(unique_subgroups))}")
+        else:
+            scope_parts.append(f"meetings: {len(meetings)} meeting(s)")
+    elif subgroups:
+        scope_parts.append(f"subgroups: {', '.join(subgroups)}")
+    else:
+        scope_parts.append(f"working groups: {', '.join(wg.value for wg in working_groups)}")
+    return scope_parts
+
+
+def _print_crawl_result(result: TDocCrawlResult, throughput: float) -> None:
+    console.print(f"[green]Processed {result.processed} TDocs ({throughput:.1f} TDocs/sec)[/green]")
+    console.print(f"[green]Inserted {result.inserted}, updated {result.updated}[/green]")
+    if result.errors:
+        console.print(f"[yellow]{len(result.errors)} issues detected[/yellow]")
+        for error in result.errors[:5]:
+            console.print(f" - {error}")
+
+
+def _print_meeting_crawl_result(result: MeetingCrawlResult) -> None:
+    console.print(f"[green]Processed {result.processed} meetings[/green]")
+    console.print(f"[green]Inserted {result.inserted}, updated {result.updated}[/green]")
+    if result.errors:
+        console.print(f"[yellow]{len(result.errors)} issues detected[/yellow]")
+        for error in result.errors[:5]:
+            console.print(f" - {error}")
+
+
 def crawl_tdocs(
     working_group: WorkingGroupOption = None,
     subgroup: SubgroupOption = None,

@@ -139,9 +177,6 @@ def crawl_tdocs(
     db_file = crawler_config.path.db_file
 
-    scope_parts = []
-
-    # Query actual meetings from database to show realistic scope
     async def fetch_meetings() -> list[MeetingMetadata]:
         async with MeetingDatabase(db_file) as meeting_db:
             query_config = MeetingQueryConfig(

@@ -156,19 +191,7 @@ def crawl_tdocs(
             return await meeting_db.query_meetings(query_config)
 
     meetings = asyncio.run(fetch_meetings())
-    if meetings:
-        # Extract unique subgroups from queried meetings
-        unique_subgroups = {SUBTB_INDEX[m.subtb].code for m in meetings if m.subtb and m.subtb in SUBTB_INDEX}
-        if unique_subgroups:
-            scope_parts.append(f"subgroups: {', '.join(sorted(unique_subgroups))}")
-        else:
-            scope_parts.append(f"meetings: {len(meetings)} meeting(s)")
-    # Fallback to input parameters if no meetings found in DB
-    elif subgroups:
-        scope_parts.append(f"subgroups: {', '.join(subgroups)}")
-    else:
-        scope_parts.append(f"working groups: {', '.join(wg.value for wg in working_groups)}")
+    scope_parts = _build_scope_description(meetings, subgroups, working_groups)
 
     console.print(f"[cyan]Crawling TDocs ({', '.join(scope_parts)})[/cyan]")

@@ -245,13 +268,7 @@ def crawl_tdocs(
         return result, throughput
 
     result, throughput = asyncio.run(run_tdoc_crawl())
-    console.print(f"[green]Processed {result.processed} TDocs ({throughput:.1f} TDocs/sec)[/green]")
-    console.print(f"[green]Inserted {result.inserted}, updated {result.updated}[/green]")
-    if result.errors:
-        console.print(f"[yellow]{len(result.errors)} issues detected[/yellow]")
-        for error in result.errors[:5]:
-            console.print(f" - {error}")
+    _print_crawl_result(result, throughput)
 
 
 def crawl_meetings(

@@ -348,12 +365,7 @@ def crawl_meetings(
     result = asyncio.run(run_meeting_crawl())
 
-    console.print(f"[green]Processed {result.processed} meetings[/green]")
-    console.print(f"[green]Inserted {result.inserted}, updated {result.updated}[/green]")
-    if result.errors:
-        console.print(f"[yellow]{len(result.errors)} issues detected[/yellow]")
-        for error in result.errors[:5]:
-            console.print(f" - {error}")
+    _print_meeting_crawl_result(result)
 
     if checkout:
         query_config = MeetingQueryConfig(
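Reviewer note: the precedence encoded by the new _build_scope_description helper (meetings found in the DB first, then explicitly passed subgroups, then the working-group fallback) can be exercised in isolation. The sketch below is a minimal, self-contained illustration of that precedence only; ScopeMeeting and DemoWorkingGroup are hypothetical stand-ins, not the real MeetingMetadata / WorkingGroup models, and the real helper additionally resolves m.subtb through SUBTB_INDEX.

# Hypothetical stand-ins for illustration only; the real code uses
# MeetingMetadata, WorkingGroup, and SUBTB_INDEX from tdoc_crawler.
from __future__ import annotations

from dataclasses import dataclass
from enum import Enum


class DemoWorkingGroup(Enum):
    RAN1 = "RAN1"
    RAN2 = "RAN2"


@dataclass
class ScopeMeeting:
    subtb_code: str | None  # stands in for SUBTB_INDEX[m.subtb].code


def describe_scope(
    meetings: list[ScopeMeeting],
    subgroups: list[str] | None,
    working_groups: list[DemoWorkingGroup],
) -> list[str]:
    # Same precedence as _build_scope_description: meetings from the DB win,
    # then explicit subgroups, then the working-group fallback.
    if meetings:
        codes = {m.subtb_code for m in meetings if m.subtb_code}
        if codes:
            return [f"subgroups: {', '.join(sorted(codes))}"]
        return [f"meetings: {len(meetings)} meeting(s)"]
    if subgroups:
        return [f"subgroups: {', '.join(subgroups)}"]
    return [f"working groups: {', '.join(wg.value for wg in working_groups)}"]


print(describe_scope([ScopeMeeting("R1"), ScopeMeeting("R2")], None, []))  # subgroup codes from meetings
print(describe_scope([ScopeMeeting(None)], None, []))                      # meeting-count fallback
print(describe_scope([], ["R3"], []))                                      # explicit subgroups
print(describe_scope([], None, list(DemoWorkingGroup)))                    # working-group fallback

Extracting this into a module-level helper also lets crawl_tdocs drop the scope_parts mutation that previously spanned two hunks.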
src/tdoc_crawler/cli/query.py  (+15 −19)

@@ -4,7 +4,6 @@
 from __future__ import annotations
 
 import asyncio
 from datetime import UTC, datetime
-from typing import Annotated
 
 import typer

@@ -64,6 +63,20 @@ from tdoc_crawler.utils.parse import collect_spec_numbers, parse_subgroups, pars
 HELP_PANEL = "Query Commands"
 
 
+def _parse_date_range(start_date: str | None, end_date: str | None) -> tuple[datetime | None, datetime | None]:
+    try:
+        start = datetime.combine(parse_partial_date(start_date), datetime.min.time(), tzinfo=UTC) if start_date else None
+    except ValueError as exc:
+        console.print("[red]Invalid start date format; use ISO-8601")
+        raise typer.Exit(code=2) from exc
+    try:
+        end = datetime.combine(parse_partial_date(end_date, is_end=True), datetime.max.time(), tzinfo=UTC) if end_date else None
+    except ValueError as exc:
+        console.print("[red]Invalid end date format; use ISO-8601")
+        raise typer.Exit(code=2) from exc
+    return start, end
+
+
 def query_tdocs(
     tdoc_ids: TDocIdsArgument = None,
     working_group: WorkingGroupOption = None,

@@ -84,7 +97,6 @@ def query_tdocs(
     title_ex: TitlePatternExcludeOption = None,
     agenda: AgendaPatternOption = None,
     agenda_ex: AgendaPatternExcludeOption = None,
-    graph_rag: Annotated[bool, typer.Option("--graph-rag", help="Use hybrid graph-RAG search")] = False,
     verbosity: VerbosityOption = str(DEFAULT_VERBOSITY),
 ) -> None:
     """Query TDoc metadata from database."""

@@ -92,16 +104,7 @@ def query_tdocs(
     path_config = PathConfig(cache_dir=cache_dir) if cache_dir else PathConfig()
     working_groups = parse_working_groups(working_group)
 
-    try:
-        start = datetime.combine(parse_partial_date(start_date), datetime.min.time(), tzinfo=UTC) if start_date else None
-    except ValueError as exc:
-        console.print("[red]Invalid start date format; use ISO-8601")
-        raise typer.Exit(code=2) from exc
-    try:
-        end = datetime.combine(parse_partial_date(end_date, is_end=True), datetime.max.time(), tzinfo=UTC) if end_date else None
-    except ValueError as exc:
-        console.print("[red]Invalid end date format; use ISO-8601")
-        raise typer.Exit(code=2) from exc
+    start, end = _parse_date_range(start_date, end_date)
 
     try:
         sort_order = SortOrder(order.lower())

@@ -177,13 +180,6 @@ def query_tdocs(
     meeting_map = asyncio.run(load_meeting_map())
 
-    # If graph-rag flag is set, perform hybrid search
-    # Disabled due to logical inconsistency - query_hybrid expects text query, not TDoc ID
-    # The hybrid search function is designed for semantic search + graph expansion,
-    # but was being incorrectly called with a TDoc ID instead of a text query.
-    # This functionality has been disabled as it doesn't make logical sense in CLI context.
-    pass
-
     if config.output_format is OutputFormat.TABLE:
         print_tdoc_table(results, meeting_map)
     else:
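Reviewer note: _parse_date_range widens a partial date string into an inclusive UTC datetime range (start of day for the start date, end of day for the end date). The sketch below illustrates that widening under an assumption about the accepted input shapes; parse_partial_date_stub is a hypothetical stand-in that only handles "YYYY", "YYYY-MM", and "YYYY-MM-DD", and it omits the console error reporting and typer.Exit that the real helper performs.

# Minimal sketch of the date-range widening done by _parse_date_range;
# parse_partial_date_stub is a hypothetical stand-in for the project's
# parse_partial_date and only covers "YYYY", "YYYY-MM", and "YYYY-MM-DD".
from __future__ import annotations

import calendar
from datetime import UTC, date, datetime


def parse_partial_date_stub(value: str, is_end: bool = False) -> date:
    parts = [int(p) for p in value.split("-")]
    if len(parts) == 1:  # "2024" -> first or last day of the year
        return date(parts[0], 12, 31) if is_end else date(parts[0], 1, 1)
    if len(parts) == 2:  # "2024-06" -> first or last day of the month
        year, month = parts
        day = calendar.monthrange(year, month)[1] if is_end else 1
        return date(year, month, day)
    return date(*parts)  # full "YYYY-MM-DD"


def to_range(start_str: str | None, end_str: str | None) -> tuple[datetime | None, datetime | None]:
    # Mirrors _parse_date_range: start at 00:00:00 UTC, end at 23:59:59.999999 UTC.
    start = datetime.combine(parse_partial_date_stub(start_str), datetime.min.time(), tzinfo=UTC) if start_str else None
    end = datetime.combine(parse_partial_date_stub(end_str, is_end=True), datetime.max.time(), tzinfo=UTC) if end_str else None
    return start, end


print(to_range("2024", "2024-06"))
# -> 2024-01-01 00:00:00+00:00, 2024-06-30 23:59:59.999999+00:00

This is the behaviour query_tdocs now reaches through the single call start, end = _parse_date_range(start_date, end_date) instead of the two inline try/except blocks removed above.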