Commit 6f78b4c3 authored by Jan Reimes's avatar Jan Reimes
Browse files

Merge e06a1ea7 into main: resolve conflicts in roadmap, 3gpp-ai CLI, and summarize tests

parents 6e343bf8 e06a1ea7
Loading
Loading
Loading
Loading
+22 −20
Original line number Diff line number Diff line
@@ -16,6 +16,7 @@ ripgrep = "latest"

node = "latest"
bun = "latest"
java = "temurin"

[env]
GITLAB_HOST = "forge.3gpp.org"
@@ -80,44 +81,45 @@ arg "<ai_agent>" help="AI assistant to use: claude, gemini, copilot, opencode, .
shell = "pwsh -NoProfile -Command"
# Register MCP servers with the chosen AI agent via add-mcp (run through npx).
run = [
    'cls',
    'npx -y add-mcp -y -a {{usage.ai_agent}} "grepai mcp-serve"',
    #'npx -y add-mcp -y -a {{usage.ai_agent}} -n docs-mcp-server "grepai mcp-serve"',
    'npx -y add-mcp -y -a {{usage.ai_agent}} -n cytoscnpy-mcp "cytoscnpy mcp-server"',
]

# Install agent skills from public repositories via the `skills` CLI (run through npx).
[tasks.add-skills]
run = [
    "cls",
    # python skills
    "npx -y skills add https://github.com/jr2804/prompts -a universal -y -s python-ultimate",
    "npx -y skills add https://github.com/jiatastic/open-python-skills -a universal -y -s ty-skills -s pydantic",

    # anti-AI hallucination skills
    "npx -y skills add https://github.com/alinaqi/claude-bootstrap -s code-deduplication -a universal -y",
    "npx -y skills add https://github.com/hardikpandya/stop-slop -a universal -y",
    "npx -y skills add https://github.com/glaforge/deslopify -a universal -y",

    # development tool skills
    "npx -y skills add https://github.com/s2005/uv-skill -a universal -y",
    "npx -y skills add https://github.com/netresearch/agent-rules-skill -a universal -y",

    # skills for other tools
    "npx -y skills add https://github.com/nicobailon/visual-explainer -a universal -y",
    "npx -y skills add https://github.com/FlorianBruniaux/claude-code-ultimate-guide/tree/main/examples -s rtk-optimizer -a universal -y",
    "npx -y skills add JuliusBrussee/caveman -a universal -y",
    "npx -y skills add run-llama/llamaparse-agent-skills --skill liteparse -a universal -y",

    # skill for project-specific tools/MCP servers
    "npx -y skills add https://github.com/AlmogBaku/debug-skill -a universal -y", # for dab tool -> AlmogBaku/debug-skill
    "npx -y skills add yoanbernabeu/grepai-skills -a universal -y",
    "npx -y skills add AlmogBaku/debug-skill -a universal -y",
    "npx -y skills add https://github.com/arabold/docs-mcp-server -a universal -y",

    # 3GPP skills (TODO: fix - requires well-known endpoint)
    "npx -y skills add https://forge.3gpp.org/rep/reimes/awesome-3gpp-skills/-/tree/main/skills -a universal -y",
    "npx -y skills add https://github.com/lugasia/3gpp-skill -a universal -y",

    # skill for teddi-mcp/-cli
    "npx -y skills add https://forge.3gpp.org/rep/reimes/teddi-mcp/-/tree/main/skills -a universal -y",
]
+20 −1
Original line number Diff line number Diff line
@@ -109,7 +109,7 @@ TDC_AI_LLM_MODEL=openrouter/openrouter/free
# TDC_AI_LLM_API_KEY=

# Embedding model in format <provider>/<model_name>
# Recommended: ollama/vuongnguyen2212/CodeRankEmbed:latest (CodeRank embedding, self-hosted, no subscription required)
TDC_AI_EMBEDDING_MODEL=ollama/vuongnguyen2212/CodeRankEmbed:latest

# Maximum tokens per chunk (default: 1000)
@@ -139,6 +139,25 @@ TDC_AI_PARALLELISM=4
# Set to "true", "1", or "yes" to enable
# TDC_AI_VLM=false

# ============================================================================
# OPENDATALOADER PDF SETTINGS
# ============================================================================
# OpenDataLoader is used for PDF extraction (replaces previous docling-based pipeline)
# Requires Java 11+ installed on system PATH
# See: https://github.com/opendataloader-project/opendataloader-pdf

# Enable hybrid AI mode for complex PDF pages (default: off)
# Options: off, docling-fast, docling-full
# Requires: pip install "opendataloader-pdf[hybrid]" and opendataloader-pdf-hybrid server running
# TDC_AI_HYBRID_MODE=off

# URL for hybrid AI server when enabled (default: http://localhost:5002)
# TDC_AI_HYBRID_URL=http://localhost:5002

# ============================================================================
# GRAPH QUERY CONFIGURATION
# ============================================================================

# Graph query level: simple|medium|advanced (default: simple)
# simple: Return count and list without synthesis
# medium: Parse query keywords, filter nodes, generate simple text summary
+74 −1
Original line number Diff line number Diff line
# Roadmap — 3GPP Crawler Codebase Improvement

## Phases

- [x] **Phase 01: Normalization & Progress Bars** - Consolidate duplicate normalization logic and fix progress bar document counts (completed 2026-04-12)
- [x] **Phase 02: Checkout, Graph, Deprecation & Config** - Fix checkout paths, datetime errors, deprecated imports, and config drift (completed 2026-04-19)

---

## Phase Details

### Phase 01: Normalization & Progress Bars

**Goal**: Eliminate duplicate normalization code and fix progress bar UX issues so users see document counts during long operations

**Depends on**: Nothing

**Requirements**: NORM-01, NORM-02, PROGRESS-01, PROGRESS-02

**Success Criteria** (what must be TRUE):
1. All normalization functions exist in single location (`src/tdoc_crawler/utils/normalization.py`)
2. `meetings/utils.py` re-exports from normalization.py, no duplicate functions
3. 6+ files importing normalization use single source (verified via grep)
4. Unit tests exist for all normalization functions covering edge cases
5. Progress bar shows "N/N" format (e.g., "5/69") during `workspace process`
6. Progress bar shows "N/N" format during `add_members` command

**Plans**: 2 plans

**Plan list:**
- [x] 01-normalization-PLAN.md — Consolidate normalization logic and add unit tests
- [x] 02-progress-PLAN.md — Fix progress bar display to show document counts

---

### Phase 02: Checkout, Graph, Deprecation & Config

**Goal**: Fix checkout path issues, datetime scope errors in graph building, remove deprecated imports, and align config defaults

**Depends on**: Phase 01

**Requirements**: CHECKOUT-01, GRAPH-01, DEPRECATED-01, CONFIG-01

**Success Criteria** (what must be TRUE):
1. Empty folder detection triggers re-download correctly
2. "No document files found" warning no longer appears for valid TDocs with empty folders
3. Graph building completes without datetime scope errors
4. Errors in graph building are caught and reported with meaningful messages
5. No import errors from deprecated modules (AiStorage, EmbeddingsManager, tdoc_ai.operations.pipeline, lancedb)
6. `.env.example` embedding model matches code defaults in `threegpp_ai/config.py`

**Plans**: 3 plans

**Plan list:**
- [x] 02-01-PLAN.md — Fix checkout empty folder detection and re-download triggers
- [x] 02-02-PLAN.md — Add graph error handling and verify deprecated imports removed
- [x] 02-03-PLAN.md — Align embedding model defaults between .env.example and code

---

## Progress

| Phase | Plans Complete | Status | Completed |
|-------|----------------|--------|-----------|
| 01. Normalization & Progress Bars | 2/2 | Complete    | 2026-04-12 |
| 02. Checkout, Graph, Deprecation & Config | 3/3 | Complete    | 2026-04-19 |

---

*Last updated: 2026-04-19*

---

## Archive

- [x] Milestone v1.0 Advanced PDF Extraction Pipeline (2026-04-17 to 2026-04-18) - 6 phases complete, 12 plans complete, archived at `.planning/milestones/v1.0-ROADMAP.md`

+196 −0
Original line number Diff line number Diff line
---
phase: 02-checkout-graph-deprecation-config
plan: 01
type: execute
wave: 1
depends_on: []
files_modified:
  - packages/3gpp-ai/threegpp_ai/operations/workspaces.py
  - packages/3gpp-ai/threegpp_ai/operations/classify.py
  - tests/test_checkout.py
autonomous: true
requirements:
  - CHECKOUT-01

must_haves:
  truths:
    - "Empty checkout folders trigger re-download automatically"
    - "No 'No document files found' warnings for valid TDocs"
    - "Checkout path matches 3GPP FTP hierarchy structure"
  artifacts:
    - path: "packages/3gpp-ai/threegpp_ai/operations/workspaces.py"
      provides: "Checkout logic with empty folder detection"
      contains: "_checkout_tdoc_if_needed"
    - path: "packages/3gpp-ai/threegpp_ai/operations/classify.py"
      provides: "Classification with proper empty folder handling"
      contains: "classify_document_files"
  key_links:
    - from: "packages/3gpp-ai/threegpp_ai/operations/workspaces.py"
      to: "packages/3gpp-ai/threegpp_ai/operations/classify.py"
      via: "checkout then classify flow"
      pattern: "classify_document_files.*folder_path"
---

<objective>
Fix checkout path issues and empty folder detection to prevent "No document files found" errors during workspace processing.

Purpose: When documents are deleted but workspace metadata remains, the system must re-checkout documents automatically instead of failing with warnings.

Output: Robust checkout logic with proper empty folder detection and re-download triggers.
</objective>

<execution_context>
@$HOME/.config/opencode/get-shit-done/workflows/execute-plan.md
@$HOME/.config/opencode/get-shit-done/templates/summary.md
</execution_context>

<context>
@.planning/PROJECT.md
@.planning/ROADMAP.md
@.planning/STATE.md
@.planning/REQUIREMENTS.md
@.planning/codebase/CONCERNS.md

# Key interfaces from workspaces.py
```python
async def _resolve_tdoc_metadata(tdoc_id: str, db_file: Path | None = None) -> TDocMetadata | None:
    """Resolve TDoc metadata via fallback chain: Database → WhatTheSpec → 3GPP Portal."""

def _checkout_tdoc_if_needed(tdoc_id: str, metadata: TDocMetadata, checkout_base: Path) -> Path | None:
    """Checkout a TDoc if not already checked out."""

def resolve_tdoc_checkout_path(tdoc_id: str, checkout_base: Path) -> Path | None:
    """Find existing TDoc checkout path by scanning for 'Docs' directories."""
```

# Current issue (CONCERNS.md line 96-119)
When running "ai workspace process" after deleting crawled documents, checkout path construction may be incorrect.
Error pattern: "WARNING No document files found in C:\Users\...\checkout\...\S4-250638 for document S4-250638"

# CacheManager pattern (from AGENTS.md)
All paths MUST use CacheManager - NEVER hardcode ~/.3gpp-crawler
</context>

<tasks>

<task type="auto" tdd="true">
  <name>Task 1: Add empty folder detection tests</name>
  <files>tests/test_checkout.py</files>
  <behavior>
    - Test: Empty folder (only .ai subfolder) triggers re-download
    - Test: Folder with actual files does NOT trigger re-download
    - Test: Non-existent folder triggers checkout
    - Test: resolve_tdoc_checkout_path finds correct path in nested Docs structure
  </behavior>
  <action>
    Create test file with pytest fixtures for checkout scenarios:
    1. Mock checkout_base with rglob("Docs") structure
    2. Test _checkout_tdoc_if_needed with empty folder (has .ai/ but no docs)
    3. Test _checkout_tdoc_if_needed with populated folder
    4. Verify resolve_tdoc_checkout_path handles 3GPP FTP hierarchy:
       - TSG_SA/WG4_CODEC/TSGS4_131-bis-e/Docs/S4-250638
       - TSG_RAN/WG1_RH/TSGR1_115/Docs/R1-2300001
    5. Test classification flow when folder is empty vs populated
    
    Use tmp_path for isolated test directories. Mock TDocMetadata with realistic FTP URLs.
  </action>
  <verify>
    <automated>uv run pytest tests/test_checkout.py -v</automated>
  </verify>
  <done>
    All checkout tests pass, covering empty folder detection, re-download triggers, and path resolution edge cases.
  </done>
</task>

<task type="auto" tdd="true">
  <name>Task 2: Fix empty folder detection in workspaces.py</name>
  <files>packages/3gpp-ai/threegpp_ai/operations/workspaces.py</files>
  <behavior>
    - Test: Empty folder (only .ai subfolder) returns False for has_files check
    - Test: Folder with .pdf/.docx/.xlsx files returns True for has_files check
    - Test: _checkout_tdoc_if_needed re-downloads when folder empty
    - Test: Warning message includes full path for debugging
  </behavior>
  <action>
    Update _checkout_tdoc_if_needed() at line 447-483:
    
    1. Improve empty folder detection (line 460-466):
       - Current: any(f.is_file() for f in existing_path.iterdir())
       - Problem: Only checks top-level, misses .ai/ subfolder scenario
       - Fix: Check recursively, exclude .ai/ from "has files" determination
       - Pattern: any(f.is_file() and not f.parent.name == ".ai" for f in existing_path.rglob("*"))
    
    2. Add better logging (line 466):
       - Current: "folder exists but is empty"
       - Add: List what was found (e.g., "found: .ai/ subfolder only")
    
    3. Ensure re-download triggers (line 467-479):
       - Verify checkout_tdoc() is called when folder empty
       - Add error handling for FileNotFoundError with path context
    
    4. Fix resolve_tdoc_checkout_path (line 390-394):
       - Verify rglob("Docs") matches 3GPP FTP structure
       - Add debug logging for path resolution failures
    
    Per DRY principle: Use CacheManager for all path operations if not already.
  </action>
  <verify>
    <automated>uv run pytest tests/test_checkout.py::test_empty_folder_triggers_redownload -v</automated>
  </verify>
  <done>
    Empty folder detection works correctly, re-download triggers automatically, no false "No document files found" warnings.
  </done>
</task>

<task type="auto">
  <name>Task 3: Improve classification error messages</name>
  <files>packages/3gpp-ai/threegpp_ai/operations/classify.py</files>
  <action>
    Update classify_document_files() at line 216-233:
    
    1. Enhance warning message (line 232):
       - Current: "No document files found in {folder_path} for document {document_id}"
       - Add: List expected file types (.pdf, .docx, .xlsx, .pptx)
       - Add: Suggest re-download command if folder empty
       - Example: "No document files found in {path}. Expected: .pdf/.docx/.xlsx. Run 'tdoc-crawler checkout {id}' to re-download."
    
    2. Add folder structure debug (line 230-232):
       - Before returning [], log what files exist (if any)
       - Pattern: logger.debug("Folder contents: %s", list(folder_path.iterdir()))
    
    3. Handle edge case: folder_path is symlink or mount point
       - Add: folder_path.resolve() before checking existence
    
    This improves debugging when checkout issues occur without changing core logic.
  </action>
  <verify>
    <automated>uv run pytest tests/test_checkout.py::test_classification_empty_folder_message -v</automated>
  </verify>
  <done>
    Error messages include actionable debugging information, expected file types, and re-download suggestions.
  </done>
</task>

</tasks>

<verification>
- [ ] All test_checkout.py tests pass
- [ ] Empty folder triggers re-download in manual test
- [ ] No "No document files found" warnings for valid TDocs with files
- [ ] Checkout path matches 3GPP FTP hierarchy (verified with rg --files)
</verification>

<success_criteria>
1. CHECKOUT-01 requirement satisfied: Empty folder detection triggers re-download correctly
2. "No document files found" warning no longer appears for valid TDocs with actual document files
3. Path construction matches 3GPP FTP hierarchy (TSG_SA/WG4_*/TSGS4_*/Docs/TDocID)
4. Tests cover edge cases: empty folder, populated folder, non-existent folder, symlink paths
</success_criteria>

<output>
After completion, create `.planning/phases/02-checkout-graph-deprecation-config/02-01-SUMMARY.md` with:
- Checkout fixes implemented
- Test coverage added
- Before/after error message examples
- Manual verification results
</output>
+230 −0

File added.

Preview size limit exceeded, changes collapsed.

Loading