Commit 7d0e04c4 authored by Jan Reimes
Browse files
# Conflicts:
#	.planning/codebase/ARCHITECTURE.md
#	.planning/codebase/CONCERNS.md
#	.planning/codebase/CONVENTIONS.md
#	.planning/codebase/INTEGRATIONS.md
#	.planning/codebase/STACK.md
#	.planning/codebase/STRUCTURE.md
#	.planning/codebase/TESTING.md
#	demo.bat
#	src/tdoc_crawler/cli/_shared.py
#	src/tdoc_crawler/cli/_workspace_commands.py
#	src/tdoc_crawler/cli/args.py
#	src/tdoc_crawler/cli/crawl.py
#	src/tdoc_crawler/cli/query.py
#	src/tdoc_crawler/cli/tdoc_app.py
#	src/tdoc_crawler/config/cache_manager.py
#	src/tdoc_crawler/config/settings.py
#	src/tdoc_crawler/config/sources.py
#	src/tdoc_crawler/database/meetings.py
#	src/tdoc_crawler/database/specs.py
#	src/tdoc_crawler/extraction/checkout.py
#	src/tdoc_crawler/extraction/conversion.py
#	src/tdoc_crawler/extraction/convert.py
#	src/tdoc_crawler/extraction/fetch_tdoc.py
#	src/tdoc_crawler/extraction/workspace_utils.py
#	src/tdoc_crawler/http_client/session.py
#	src/tdoc_crawler/specs/operations/checkout.py
#	src/tdoc_crawler/specs/sources/threegpp.py
#	src/tdoc_crawler/tdocs/operations/checkout.py
#	src/tdoc_crawler/utils/normalization.py
#	src/tdoc_crawler/utils/parse.py
#	src/tdoc_crawler/workspaces/__init__.py
#	src/tdoc_crawler/workspaces/crud.py
#	src/tdoc_crawler/workspaces/utils.py
parents 625de3c2 18ad66a4
Loading
Loading
Loading
Loading
+1 −1
Original line number Diff line number Diff line
@@ -82,7 +82,7 @@ shell = "pwsh -NoProfile -Command"
run = [
    'cls',
    'npx -y add-mcp -y -a {{usage.ai_agent}} "grepai mcp-serve"',
    #'npx -y add-mcp -y -a {{usage.ai_agent}} -n docs-mcp-server "grepai mcp-serve"',
    'npx -y add-mcp -y -a {{usage.ai_agent}} -n teddi "uvx --from https://forge.3gpp.org/rep/reimes/teddi-mcp.git teddi-mcp"',
    'npx -y add-mcp -y -a {{usage.ai_agent}} -n cytoscnpy-mcp "cytoscnpy mcp-server"'
]

+9 −0
Original line number Diff line number Diff line
@@ -19,6 +19,9 @@
# Checkout directory name (default: checkout)
# TDC_CHECKOUT_DIRNAME=checkout

# Workspaces directory name (default: workspaces)
# TDC_WORKSPACES_DIRNAME=workspaces

# ============================================================================
# ETSI ONLINE (EOL) CREDENTIALS
# ============================================================================
@@ -81,6 +84,12 @@ TDC_MAX_RETRIES=3
# Maximum number of documents to crawl (default: 1000)
# TDC_LIMIT_TDOCS=1000

# Maximum number of meetings to crawl overall (negative = newest N)
# TDC_LIMIT_MEETINGS=

# Maximum number of meetings to crawl per sub-working group (negative = newest N)
# TDC_LIMIT_MEETINGS_PER_SUBWG=

# Number of parallel subinterpreter workers (default: 4)
TDC_WORKERS=4

+3 −1
Original line number Diff line number Diff line
@@ -256,3 +256,5 @@ src/teddi-mcp/uv.lock
#.planning/
/PLAN.md
.opencode/
/.beads/export-state.json
opencode.json
+23 −20
Original line number Diff line number Diff line
@@ -15,7 +15,7 @@ A command-line tool for crawling the 3GPP FTP server, caching 3GPP document meta
- **Case-Insensitive Queries**: Search for TDocs regardless of case
- **Multiple Output Formats**: Export results as table, JSON, or YAML
- **Incremental Updates**: Only fetch new data on subsequent crawls
- **Wiki-First Architecture**: Extraction artifacts organized in ~/.3gpp-crawler/wiki/ for external tool consumption
- **Workspace Architecture**: Extraction artifacts organized in `~/.3gpp-crawler/workspaces/<ws>/sources/<doc>/`
- **Rich CLI**: Beautiful terminal output with progress indicators

## Installation
@@ -38,6 +38,7 @@ uv sync
### Using pip (not recommended)

```bash
# Note: package name may differ from repository name
pip install 3gpp-crawler
```

@@ -57,8 +58,8 @@ cp .env.example .env
# Acts as a "3GPP-compatible fallback". While whatthespec.net is the primary
# data source, it is community-maintained. Credentials allow falling back to
# official 3GPP portal endpoints if the primary source is unavailable.
EOL_USERNAME=your_username
EOL_PASSWORD=your_password
TDC_EOL_USERNAME=your_username
TDC_EOL_PASSWORD=your_password

# HTTP Cache Configuration (optional - uses defaults if not set)
HTTP_CACHE_TTL=7200                      # Cache TTL in seconds (default: 7200 = 2 hours)
@@ -71,11 +72,11 @@ Alternatively, you can:
uvx tdoc-crawler crawl-meetings --eol-username your_username --eol-password your_password

# Configure HTTP caching via CLI:
uvx tdoc-crawler crawl-tdocs --cache-ttl 3600 --cache-refresh
uvx tdoc-crawler crawl --cache-ttl 3600 --cache-refresh

# Or set environment variables directly:
export EOL_USERNAME=your_username
export EOL_PASSWORD=your_password
export TDC_EOL_USERNAME=your_username
export TDC_EOL_PASSWORD=your_password
export HTTP_CACHE_TTL=3600
```

@@ -87,20 +88,22 @@ NOTE: If no credentials are provided, the tool will prompt you interactively

| Command | Alias | Purpose |
|---------|-------|---------|
| **Crawling** | | |
| **tdoc-crawler** | | |
| `crawl` | `ct` | Crawl TDoc metadata from FTP |
| `crawl-meetings` | `cm` | Populate meeting database (Run this first!) |
| `crawl-tdocs` | `ct` | Crawl TDoc metadata from FTP |
| `crawl-specs` | `cs` | Crawl technical specification metadata |
| **Querying** | | |
| `query` | `qt` | Search for TDocs (auto-fetches if missing) |
| `query-meetings` | `qm` | Search and display meeting metadata |
| `query-tdocs` | `qt` | Search for TDocs (auto-fetches if missing) |
| `query-specs` | `qs` | Search technical specifications |
| **Utilities** | | |
| `open` | | Download and open a TDoc |
| `checkout` | | Batch download TDocs to local folder |
| `open-spec` | `os` | Download and open latest spec document |
| `checkout-spec` | `cos` | Batch download technical specifications |
| `stats` | | View database statistics |
| **spec-crawler** | | |
| `crawl` | | Crawl technical specification metadata |
| `query` | | Search technical specifications |
| `open` | | Download and open latest spec document |
| `checkout` | | Batch download technical specifications |
| **3gpp-crawler** | | |
| `config {init,show,validate,docs}` | | Manage configuration |
| `workspace {create,list,...}` | | Manage workspaces and processing |

### 1. Crawl Metadata

@@ -111,10 +114,10 @@ Gather metadata from 3GPP and WhatTheSpec:
tdoc-crawler crawl-meetings

# Crawl TDoc metadata (RAN, SA, CT)
tdoc-crawler crawl-tdocs
tdoc-crawler crawl

# Populate spec catalog
spec-crawler crawl-specs
spec-crawler crawl
```

### 2. Query Metadata
@@ -126,7 +129,7 @@ Search and filter stored information:
tdoc-crawler query R1-2400001

# Query specifications
spec-crawler query-specs 23.501
spec-crawler query 23.501

# List recent meetings
tdoc-crawler query-meetings --limit 10
@@ -141,13 +144,13 @@ Open documents, batch download (checkout), and check database status:
tdoc-crawler open R1-2400001

# Download and open latest version of a spec
spec-crawler open-spec 23.501
spec-crawler open 23.501

# Batch download (checkout) TDocs to local folder
tdoc-crawler checkout R1-2400001 S2-2400567

# Batch checkout specifications
spec-crawler checkout-spec 26130-26140
spec-crawler checkout 26130-26140

# View database statistics
tdoc-crawler stats
+3 −2
Original line number Diff line number Diff line
@@ -29,14 +29,14 @@ Config files are discovered in this order (later files override earlier):

### Path Settings

*File system paths for cache, database, checkout, and AI storage*
*File system paths for cache, database, checkout, and workspaces*

| Field | Type | Default | Description |
|-------|------|---------|-------------|
| `cache_dir` | Path | ~/.3gpp-crawler | Root cache directory for storing downloaded files and metadata |
| `db_filename` | str | "3gpp_crawler.db" | SQLite database filename for storing crawl metadata |
| `checkout_dirname` | str | "checkout" | Subdirectory name for checked-out documents |
| `ai_cache_dirname` | str | "lightrag" | Subdirectory name for AI-related cache (embeddings, graphs) |
| `workspaces_dirname` | str | "workspaces" | Subdirectory name for workspace data (sources, wiki) |

### HTTP Settings

@@ -248,6 +248,7 @@ For backward compatibility, environment variables are still supported:
| Variable | Description |
|----------|-------------|
| `TDC_CACHE_DIR` | Cache directory path |
| `TDC_WORKSPACES_DIRNAME` | Workspaces subdirectory name |
| `TDC_EOL_USERNAME` | ETSI Online username |
| `TDC_EOL_PASSWORD` | ETSI Online password |
| `TDC_TIMEOUT` | HTTP timeout in seconds |
Loading