From 6739182c2049c26c3b0b85c8c2ad74a519a4296c Mon Sep 17 00:00:00 2001 From: Fahad Date: Fri, 13 Jun 2025 15:59:48 +0400 Subject: [PATCH] Differentiate custom from openrouter models Split readme into multiple docs --- README.md | 481 +------------------------------ conf/custom_models.json | 68 ++--- docs/advanced-usage.md | 328 +++++++++++++++++++++ docs/custom_models.md | 57 +++- docs/setup-troubleshooting.md | 175 +++++++++++ providers/custom.py | 48 ++- providers/openai_compatible.py | 8 +- providers/openrouter_registry.py | 1 + 8 files changed, 617 insertions(+), 549 deletions(-) create mode 100644 docs/advanced-usage.md create mode 100644 docs/setup-troubleshooting.md diff --git a/README.md b/README.md index 39d1c6d..f811196 100644 --- a/README.md +++ b/README.md @@ -44,18 +44,13 @@ and review into consideration to aid with its pre-commit review. - [`debug`](#5-debug---expert-debugging-assistant) - Debugging help - [`analyze`](#6-analyze---smart-file-analysis) - File analysis -- **Advanced Topics** - - [Model Configuration](#model-configuration) - Auto mode & multi-provider selection - - [Thinking Modes](#thinking-modes---managing-token-costs--quality) - Control depth vs cost - - [Working with Large Prompts](#working-with-large-prompts) - Bypass MCP's 25K token limit - - [Web Search Integration](#web-search-integration) - Smart search recommendations - - [Collaborative Workflows](#collaborative-workflows) - Multi-tool patterns - - [Tool Parameters](#tool-parameters) - Detailed parameter reference +- **Advanced Usage** + - [Advanced Features](#advanced-features) - AI-to-AI conversations, large prompts, web search + - [Complete Advanced Guide](docs/advanced-usage.md) - Model configuration, thinking modes, workflows, tool parameters -- **Resources** - - [Windows Setup](#windows-setup-guide) - WSL setup instructions for Windows - - [Troubleshooting](#troubleshooting) - Common issues and solutions - - [Testing](#testing) - Running tests +- **Setup & Support** + - [Setup & Troubleshooting Guide](docs/setup-troubleshooting.md) - Testing, troubleshooting, common issues + - [License](#license) - Apache 2.0 ## Why This Server? @@ -426,172 +421,9 @@ Use zen and perform a thorough precommit ensuring there aren't any new regressio "Get zen to show its version" ``` -## Tool Parameters +For detailed tool parameters and configuration options, see the [Advanced Usage Guide](docs/advanced-usage.md). -All tools that work with files support **both individual files and entire directories**. The server automatically expands directories, filters for relevant code files, and manages token limits. 
-### File-Processing Tools - -**`analyze`** - Analyze files or directories -- `files`: List of file paths or directories (required) -- `question`: What to analyze (required) -- `model`: auto|pro|flash|o3|o3-mini (default: server default) -- `analysis_type`: architecture|performance|security|quality|general -- `output_format`: summary|detailed|actionable -- `thinking_mode`: minimal|low|medium|high|max (default: medium, Gemini only) -- `use_websearch`: Enable web search for documentation and best practices - allows model to request Claude perform searches (default: true) - -``` -"Analyze the src/ directory for architectural patterns" (auto mode picks best model) -"Use flash to quickly analyze main.py and tests/ to understand test coverage" -"Use o3 for logical analysis of the algorithm in backend/core.py" -"Use pro for deep analysis of the entire backend/ directory structure" -``` - -**`codereview`** - Review code files or directories -- `files`: List of file paths or directories (required) -- `model`: auto|pro|flash|o3|o3-mini (default: server default) -- `review_type`: full|security|performance|quick -- `focus_on`: Specific aspects to focus on -- `standards`: Coding standards to enforce -- `severity_filter`: critical|high|medium|all -- `thinking_mode`: minimal|low|medium|high|max (default: medium, Gemini only) - -``` -"Review the entire api/ directory for security issues" (auto mode picks best model) -"Use pro to review auth/ for deep security analysis" -"Use o3 to review logic in algorithms/ for correctness" -"Use flash to quickly review src/ with focus on performance, only show critical issues" -``` - -**`debug`** - Debug with file context -- `error_description`: Description of the issue (required) -- `model`: auto|pro|flash|o3|o3-mini (default: server default) -- `error_context`: Stack trace or logs -- `files`: Files or directories related to the issue -- `runtime_info`: Environment details -- `previous_attempts`: What you've tried -- `thinking_mode`: minimal|low|medium|high|max (default: medium, Gemini only) -- `use_websearch`: Enable web search for error messages and solutions - allows model to request Claude perform searches (default: true) - -``` -"Debug this logic error with context from backend/" (auto mode picks best model) -"Use o3 to debug this algorithm correctness issue" -"Use pro to debug this complex architecture problem" -``` - -**`thinkdeep`** - Extended analysis with file context -- `current_analysis`: Your current thinking (required) -- `model`: auto|pro|flash|o3|o3-mini (default: server default) -- `problem_context`: Additional context -- `focus_areas`: Specific aspects to focus on -- `files`: Files or directories for context -- `thinking_mode`: minimal|low|medium|high|max (default: max, Gemini only) -- `use_websearch`: Enable web search for documentation and insights - allows model to request Claude perform searches (default: true) - -``` -"Think deeper about my design with reference to src/models/" (auto mode picks best model) -"Use pro to think deeper about this architecture with extended thinking" -"Use o3 to think deeper about the logical flow in this algorithm" -``` - -## Collaborative Workflows - -### Design → Review → Implement -``` -Think hard about designing and developing a fun calculator app in swift. Review your design plans with o3, taking in -their suggestions but keep the feature-set realistic and doable without adding bloat. 
Begin implementing and in between -implementation, get a codereview done by Gemini Pro and chat with Flash if you need to for creative directions. -``` - -### Code → Review → Fix -``` -Implement a new screen where the locations taken from the database display on a map, with pins falling from -the top and landing with animation. Once done, codereview with gemini pro and o3 both and ask them to critique your -work. Fix medium to critical bugs / concerns / issues and show me the final product -``` - -### Debug → Analyze → Solution → Precommit Check → Publish -``` -Take a look at these log files saved under subfolder/diagnostics.log there's a bug where the user says the app -crashes at launch. Think hard and go over each line, tallying it with corresponding code within the project. After -you've performed initial investigation, ask gemini pro to analyze the log files and the related code where you -suspect lies the bug and then formulate and implement a bare minimal fix. Must not regress. Perform a precommit -with zen in the end using gemini pro to confirm we're okay to publish the fix -``` - -### Tool Selection Guidance -To help choose the right tool for your needs: - -**Decision Flow:** -1. **Have a specific error/exception?** → Use `debug` -2. **Want to find bugs/issues in code?** → Use `codereview` -3. **Want to understand how code works?** → Use `analyze` -4. **Have analysis that needs extension/validation?** → Use `thinkdeep` -5. **Want to brainstorm or discuss?** → Use `chat` - -**Key Distinctions:** -- `analyze` vs `codereview`: analyze explains, codereview prescribes fixes -- `chat` vs `thinkdeep`: chat is open-ended, thinkdeep extends specific analysis -- `debug` vs `codereview`: debug diagnoses runtime errors, review finds static issues - -## Thinking Modes - Managing Token Costs & Quality - -**Claude automatically manages thinking modes based on task complexity**, but you can also manually control Gemini's reasoning depth to balance between response quality and token consumption. Each thinking mode uses a different amount of tokens, directly affecting API costs and response time. - -### Thinking Modes & Token Budgets - -These only apply to models that support customizing token usage for extended thinking, such as Gemini 2.5 Pro. - -| Mode | Token Budget | Use Case | Cost Impact | -|------|-------------|----------|-------------| -| `minimal` | 128 tokens | Simple, straightforward tasks | Lowest cost | -| `low` | 2,048 tokens | Basic reasoning tasks | 16x more than minimal | -| `medium` | 8,192 tokens | **Default** - Most development tasks | 64x more than minimal | -| `high` | 16,384 tokens | Complex problems requiring thorough analysis (default for `thinkdeep`) | 128x more than minimal | -| `max` | 32,768 tokens | Exhaustive reasoning | 256x more than minimal | - -### How to Use Thinking Modes - -**Claude automatically selects appropriate thinking modes**, but you can override this by explicitly requesting a specific mode in your prompts. Remember: higher thinking modes = more tokens = higher cost but better quality: - -#### Optimizing Token Usage & Costs - -**In most cases, let Claude automatically manage thinking modes** for optimal balance of cost and quality. 
Override manually when you have specific requirements: - -**Use lower modes (`minimal`, `low`) to save tokens when:** -- Doing simple formatting or style checks -- Getting quick explanations of basic concepts -- Working with straightforward code -- You need faster responses -- Working within tight token budgets - -**Use higher modes (`high`, `max`) when quality justifies the cost:** -- Debugging complex issues (worth the extra tokens to find root causes) -- Reviewing security-critical code (cost of tokens < cost of vulnerabilities) -- Analyzing system architecture (comprehensive analysis saves development time) -- Finding subtle bugs or edge cases -- Working on performance optimizations - -**Token Cost Examples:** -- `minimal` (128 tokens) vs `max` (32,768 tokens) = 256x difference in thinking tokens -- For a simple formatting check, using `minimal` instead of the default `medium` saves ~8,000 thinking tokens -- For critical security reviews, the extra tokens in `high` or `max` mode are a worthwhile investment - -**Examples by scenario:** -``` -# Quick style check with o3 -"Use flash to review formatting in utils.py" - -# Security audit with o3 -"Get o3 to do a security review of auth/ with thinking mode high" - -# Complex debugging, letting claude pick the best model -"Use zen to debug this race condition with max thinking mode" - -# Architecture analysis with Gemini 2.5 Pro -"Analyze the entire src/ directory architecture with high thinking using pro" -``` ## Advanced Features @@ -655,100 +487,12 @@ This server enables **true AI collaboration** between Claude and multiple AI mod → Flash quickly validates formatting with awareness of all previous fixes ``` -### Working with Large Prompts +For more advanced features like working with large prompts and dynamic context requests, see the [Advanced Usage Guide](docs/advanced-usage.md). -The MCP protocol has a combined request+response limit of approximately 25K tokens. This server intelligently works around this limitation by automatically handling large prompts as files: - -**How it works:** -1. When you send a prompt larger than the configured limit (default: 50K characters ~10-12K tokens), the server detects this -2. It responds with a special status asking Claude to save the prompt to a file named `prompt.txt` -3. Claude saves the prompt and resends the request with the file path instead -4. The server reads the file content directly into Gemini's 1M token context -5. The full MCP token capacity is preserved for the response - -**Example scenario:** -``` -# You have a massive code review request with detailed context -User: "Use gemini to review this code: [50,000+ character detailed analysis]" - -# Server detects the large prompt and responds: -Zen MCP: "The prompt is too large for MCP's token limits (>50,000 characters). -Please save the prompt text to a temporary file named 'prompt.txt' and resend -the request with an empty prompt string and the absolute file path included -in the files parameter, along with any other files you wish to share as context." - -# Claude automatically handles this: -- Saves your prompt to /tmp/prompt.txt -- Resends: "Use gemini to review this code" with files=["/tmp/prompt.txt", "/path/to/code.py"] - -# Server processes the large prompt through Gemini's 1M context -# Returns comprehensive analysis within MCP's response limits -``` - -This feature ensures you can send arbitrarily large prompts to Gemini without hitting MCP's protocol limitations, while maximizing the available space for detailed responses. 
- -### Dynamic Context Requests -Tools can request additional context from Claude during execution. When Gemini needs more information to provide a thorough analysis, it will ask Claude for specific files or clarification, enabling true collaborative problem-solving. - -**Example:** If Gemini is debugging an error but needs to see a configuration file that wasn't initially provided, it can request: -```json -{ - "status": "requires_clarification", - "question": "I need to see the database configuration to understand this connection error", - "files_needed": ["config/database.yml", "src/db_connection.py"] -} -``` - -Claude will then provide the requested files and Gemini can continue with a more complete analysis. - -### Web Search Integration - -**Smart web search recommendations for enhanced analysis** - -Web search is now enabled by default for all tools. Instead of performing searches directly, Gemini intelligently analyzes when additional information from the web would enhance its response and provides specific search recommendations for Claude to execute. - -**How it works:** -1. Gemini analyzes the request and identifies areas where current documentation, API references, or community solutions would be valuable -2. It provides its analysis based on its training data -3. If web searches would strengthen the analysis, Gemini includes a "Recommended Web Searches for Claude" section -4. Claude can then perform these searches and incorporate the findings - -**Example:** -``` -User: "Use gemini to debug this FastAPI async error" - -Gemini's Response: -[... debugging analysis ...] - -**Recommended Web Searches for Claude:** -- "FastAPI async def vs def performance 2024" - to verify current best practices for async endpoints -- "FastAPI BackgroundTasks memory leak" - to check for known issues with the version you're using -- "FastAPI lifespan context manager pattern" - to explore proper resource management patterns - -Claude can then search for these specific topics and provide you with the most current information. -``` - -**Benefits:** -- Always access to latest documentation and best practices -- Gemini focuses on reasoning about what information would help -- Claude maintains control over actual web searches -- More collaborative approach between the two AI assistants -- Reduces hallucination by encouraging verification of assumptions - -**Web search control:** -Web search is enabled by default, allowing models to request Claude perform searches for current documentation and solutions. If you prefer the model to work only with its training data, you can disable web search: -``` -"Use gemini to review this code with use_websearch false" -``` ## Configuration -The server includes several configurable properties that control its behavior: - -### Model Configuration - -**🎯 Auto Mode (Recommended):** -Set `DEFAULT_MODEL=auto` in your .env file and Claude will intelligently select the best model for each task: +**Auto Mode (Recommended):** Set `DEFAULT_MODEL=auto` in your .env file and Claude will intelligently select the best model for each task. 
```env # .env file @@ -759,207 +503,14 @@ GEMINI_API_KEY=your-gemini-key # Enables Gemini Pro & Flash OPENAI_API_KEY=your-openai-key # Enables O3, O3-mini ``` -**How Auto Mode Works:** -- Claude analyzes each request and selects the optimal model -- Model selection is based on task complexity, requirements, and model strengths -- You can always override: "Use flash for quick check" or "Use o3 to debug" +**Available Models:** +- **`pro`** (Gemini 2.5 Pro): Extended thinking, deep analysis +- **`flash`** (Gemini 2.0 Flash): Ultra-fast responses +- **`o3`**: Strong logical reasoning +- **`o3-mini`**: Balanced speed/quality +- **Custom models**: via OpenRouter or local APIs (Ollama, vLLM, etc.) -**Supported Models & When Claude Uses Them:** - -| Model | Provider | Context | Strengths | Auto Mode Usage | -|-------|----------|---------|-----------|------------------| -| **`pro`** (Gemini 2.5 Pro) | Google | 1M tokens | Extended thinking (up to 32K tokens), deep analysis | Complex architecture, security reviews, deep debugging | -| **`flash`** (Gemini 2.0 Flash) | Google | 1M tokens | Ultra-fast responses | Quick checks, formatting, simple analysis | -| **`o3`** | OpenAI | 200K tokens | Strong logical reasoning | Debugging logic errors, systematic analysis | -| **`o3-mini`** | OpenAI | 200K tokens | Balanced speed/quality | Moderate complexity tasks | -| **`llama`** (Llama 3.2) | Custom/Local | 128K tokens | Local inference, privacy | On-device analysis, cost-free processing | -| **Any model** | OpenRouter | Varies | Access to GPT-4, Claude, Llama, etc. | User-specified or based on task requirements | - -**Mix & Match Providers:** Use multiple providers simultaneously! Set both `OPENROUTER_API_KEY` and `CUSTOM_API_URL` to access -cloud models (expensive/powerful) AND local models (free/private) in the same conversation. 
- -**Manual Model Selection:** -You can specify a default model instead of auto mode: - -```env -# Use a specific model by default -DEFAULT_MODEL=gemini-2.5-pro-preview-06-05 # Always use Gemini Pro -DEFAULT_MODEL=flash # Always use Flash -DEFAULT_MODEL=o3 # Always use O3 -``` - -**Per-Request Model Override:** -Regardless of your default setting, you can specify models per request: -- "Use **pro** for deep security analysis of auth.py" -- "Use **flash** to quickly format this code" -- "Use **o3** to debug this logic error" -- "Review with **o3-mini** for balanced analysis" - -**Model Capabilities:** -- **Gemini Models**: Support thinking modes (minimal to max), web search, 1M context -- **O3 Models**: Excellent reasoning, systematic analysis, 200K context - -### Temperature Defaults -Different tools use optimized temperature settings: -- **`TEMPERATURE_ANALYTICAL`**: `0.2` - Used for code review and debugging (focused, deterministic) -- **`TEMPERATURE_BALANCED`**: `0.5` - Used for general chat (balanced creativity/accuracy) -- **`TEMPERATURE_CREATIVE`**: `0.7` - Used for deep thinking and architecture (more creative) - -### Logging Configuration -Control logging verbosity via the `LOG_LEVEL` environment variable: -- **`DEBUG`**: Shows detailed operational messages, tool execution flow, conversation threading -- **`INFO`**: Shows general operational messages (default) -- **`WARNING`**: Shows only warnings and errors -- **`ERROR`**: Shows only errors - -**Set in your .env file:** -```bash -LOG_LEVEL=DEBUG # For troubleshooting -LOG_LEVEL=INFO # For normal operation (default) -``` - -**For Docker:** -```bash -# In .env file -LOG_LEVEL=DEBUG - -# Or set directly when starting -LOG_LEVEL=DEBUG docker compose up -``` - - -## File Path Requirements - -**All file paths must be absolute paths.** - -When using any Gemini tool, always provide absolute paths: -``` -✅ "Use gemini to analyze /Users/you/project/src/main.py" -❌ "Use gemini to analyze ./src/main.py" (will be rejected) -``` - -### Security & File Access - -By default, the server allows access to files within your home directory. This is necessary for the server to work with any file you might want to analyze from Claude. - -**For Docker environments**, the `WORKSPACE_ROOT` environment variable is used to map your local directory to the internal `/workspace` directory, enabling the MCP to translate absolute file references correctly: - -```json -"env": { - "GEMINI_API_KEY": "your-key", - "WORKSPACE_ROOT": "/Users/you/project" // Maps to /workspace inside Docker -} -``` - -This allows Claude to use absolute paths that will be correctly translated between your local filesystem and the Docker container. 
- - -## How System Prompts Work - -The server uses carefully crafted system prompts to give each tool specialized expertise: - -### Prompt Architecture -- **Centralized Prompts**: All system prompts are defined in `prompts/tool_prompts.py` -- **Tool Integration**: Each tool inherits from `BaseTool` and implements `get_system_prompt()` -- **Prompt Flow**: `User Request → Tool Selection → System Prompt + Context → Gemini Response` - -### Specialized Expertise -Each tool has a unique system prompt that defines its role and approach: -- **`thinkdeep`**: Acts as a senior development partner, challenging assumptions and finding edge cases -- **`codereview`**: Expert code reviewer with security/performance focus, uses severity levels -- **`debug`**: Systematic debugger providing root cause analysis and prevention strategies -- **`analyze`**: Code analyst focusing on architecture, patterns, and actionable insights - -### Customization -To modify tool behavior, you can: -1. Edit prompts in `prompts/tool_prompts.py` for global changes -2. Override `get_system_prompt()` in a tool class for tool-specific changes -3. Use the `temperature` parameter to adjust response style (0.2 for focused, 0.7 for creative) - -## Testing - -### Unit Tests (No API Key Required) -The project includes comprehensive unit tests that use mocks and don't require a Gemini API key: - -```bash -# Run all unit tests -python -m pytest tests/ -v - -# Run with coverage -python -m pytest tests/ --cov=. --cov-report=html -``` - -### Simulation Tests (API Key Required) -To test the MCP server with comprehensive end-to-end simulation: - -```bash -# Set your API keys (at least one required) -export GEMINI_API_KEY=your-gemini-api-key-here -export OPENAI_API_KEY=your-openai-api-key-here - -# Run all simulation tests (default: uses existing Docker containers) -python communication_simulator_test.py - -# Run specific tests only -python communication_simulator_test.py --tests basic_conversation content_validation - -# Run with Docker rebuild (if needed) -python communication_simulator_test.py --rebuild-docker - -# List available tests -python communication_simulator_test.py --list-tests -``` - -The simulation tests validate: -- Basic conversation flow with continuation -- File handling and deduplication -- Cross-tool conversation threading -- Redis memory persistence -- Docker container integration - -### GitHub Actions CI/CD -The project includes GitHub Actions workflows that: - -- **✅ Run unit tests automatically** - No API key needed, uses mocks -- **✅ Test on Python 3.10, 3.11, 3.12** - Ensures compatibility -- **✅ Run linting and formatting checks** - Maintains code quality - -The CI pipeline works without any secrets and will pass all tests using mocked responses. Simulation tests require API key secrets (`GEMINI_API_KEY` and/or `OPENAI_API_KEY`) to run the communication simulator. 
- -## Troubleshooting - -### Docker Issues - -**"Connection failed" in Claude Desktop** -- Ensure Docker services are running: `docker compose ps` -- Check if the container name is correct: `docker ps` to see actual container names -- Verify your .env file has at least one valid API key (GEMINI_API_KEY or OPENAI_API_KEY) - -**"API key environment variable is required"** -- Edit your .env file and add at least one API key (Gemini or OpenAI) -- Restart services: `docker compose restart` - -**Container fails to start** -- Check logs: `docker compose logs zen-mcp` -- Ensure Docker has enough resources (memory/disk space) -- Try rebuilding: `docker compose build --no-cache` - -**"spawn ENOENT" or execution issues** -- Verify the container is running: `docker compose ps` -- Check that Docker Desktop is running -- On Windows: Ensure WSL2 is properly configured for Docker - -**Testing your Docker setup:** -```bash -# Check if services are running -docker compose ps - -# Test manual connection -docker exec -i zen-mcp-server echo "Connection test" - -# View logs -docker compose logs -f -``` +For detailed configuration options, see the [Advanced Usage Guide](docs/advanced-usage.md). ## License diff --git a/conf/custom_models.json b/conf/custom_models.json index dc1fb08..611e4f0 100644 --- a/conf/custom_models.json +++ b/conf/custom_models.json @@ -13,6 +13,7 @@ "Aliases are case-insensitive and should be unique across all models", "context_window is the model's total context window size in tokens (input + output)", "Set supports_* flags based on the model's actual capabilities", + "Set is_custom=true for models that should ONLY work with custom endpoints (Ollama, vLLM, etc.)", "Models not listed here will use generic defaults (32K context window, basic features)", "For OpenRouter models: Use official OpenRouter model names (e.g., 'anthropic/claude-3-opus')", "For local/custom models: Use model names as they appear in your API (e.g., 'llama3.2', 'gpt-3.5-turbo')" @@ -24,37 +25,21 @@ "supports_extended_thinking": "Whether the model supports extended reasoning tokens (currently none do via OpenRouter or custom APIs)", "supports_json_mode": "Whether the model can guarantee valid JSON output", "supports_function_calling": "Whether the model supports function/tool calling", + "is_custom": "Set to true for models that should ONLY be used with custom API endpoints (Ollama, vLLM, etc.). False or omitted for OpenRouter/cloud models.", "description": "Human-readable description of the model" }, "example_custom_model": { - "model_name": "vendor/model-name-version", + "model_name": "my-local-model", "aliases": ["shortname", "nickname", "abbrev"], "context_window": 128000, "supports_extended_thinking": false, "supports_json_mode": true, "supports_function_calling": true, - "description": "Brief description of the model" + "is_custom": true, + "description": "Example custom/local model for Ollama, vLLM, etc." 
} }, "models": [ - { - "model_name": "openai/gpt-4o", - "aliases": ["gpt4o", "4o", "gpt-4o"], - "context_window": 128000, - "supports_extended_thinking": false, - "supports_json_mode": true, - "supports_function_calling": true, - "description": "OpenAI's most capable model, GPT-4 Optimized" - }, - { - "model_name": "openai/gpt-4o-mini", - "aliases": ["gpt4o-mini", "4o-mini", "gpt-4o-mini"], - "context_window": 128000, - "supports_extended_thinking": false, - "supports_json_mode": true, - "supports_function_calling": true, - "description": "Smaller, faster version of GPT-4o" - }, { "model_name": "anthropic/claude-3-opus", "aliases": ["opus", "claude-opus", "claude3-opus", "claude-3-opus"], @@ -83,22 +68,22 @@ "description": "Claude 3 Haiku - Fast and efficient" }, { - "model_name": "google/gemini-pro-1.5", + "model_name": "google/gemini-2.5-pro-preview", "aliases": ["pro","gemini-pro", "gemini", "pro-openrouter"], "context_window": 1048576, "supports_extended_thinking": false, "supports_json_mode": true, "supports_function_calling": false, - "description": "Google's Gemini Pro 1.5 via OpenRouter" + "description": "Google's Gemini 2.5 Pro via OpenRouter" }, { - "model_name": "google/gemini-flash-1.5-8b", - "aliases": ["flash","gemini-flash", "flash-openrouter", "flash-8b"], + "model_name": "google/gemini-2.5-flash-preview-05-20", + "aliases": ["flash","gemini-flash", "flash-openrouter", "flash-2.5"], "context_window": 1048576, "supports_extended_thinking": false, "supports_json_mode": true, "supports_function_calling": false, - "description": "Google's Gemini Flash 1.5 8B via OpenRouter" + "description": "Google's Gemini 2.5 Flash via OpenRouter" }, { "model_name": "mistral/mistral-large", @@ -119,22 +104,13 @@ "description": "Meta's Llama 3 70B model" }, { - "model_name": "cohere/command-r-plus", - "aliases": ["command-r-plus", "command-r", "cohere"], - "context_window": 128000, - "supports_extended_thinking": false, - "supports_json_mode": false, - "supports_function_calling": true, - "description": "Cohere's Command R Plus model" - }, - { - "model_name": "deepseek/deepseek-coder", - "aliases": ["deepseek-coder", "deepseek", "coder"], - "context_window": 16384, - "supports_extended_thinking": false, - "supports_json_mode": false, + "model_name": "deepseek/deepseek-r1-0528", + "aliases": ["deepseek-r1", "deepseek", "r1", "deepseek-thinking"], + "context_window": 65536, + "supports_extended_thinking": true, + "supports_json_mode": true, "supports_function_calling": false, - "description": "DeepSeek's coding-focused model" + "description": "DeepSeek R1 with thinking mode - advanced reasoning capabilities" }, { "model_name": "perplexity/llama-3-sonar-large-32k-online", @@ -154,18 +130,9 @@ "supports_function_calling": true, "description": "OpenAI's o3 model - well-rounded and powerful across domains" }, - { - "model_name": "openai/o3-mini", - "aliases": ["o3-mini", "o3mini"], - "context_window": 200000, - "supports_extended_thinking": false, - "supports_json_mode": true, - "supports_function_calling": true, - "description": "OpenAI's o3-mini reasoning model - cost-efficient with STEM performance" - }, { "model_name": "openai/o3-mini-high", - "aliases": ["o3-mini-high", "o3mini-high"], + "aliases": ["o3-mini", "o3mini", "o3-mini-high", "o3mini-high"], "context_window": 200000, "supports_extended_thinking": false, "supports_json_mode": true, @@ -179,6 +146,7 @@ "supports_extended_thinking": false, "supports_json_mode": false, "supports_function_calling": false, + "is_custom": true, 
"description": "Local Llama 3.2 model via custom endpoint (Ollama/vLLM) - 128K context window" } ] diff --git a/docs/advanced-usage.md b/docs/advanced-usage.md new file mode 100644 index 0000000..a45e1fd --- /dev/null +++ b/docs/advanced-usage.md @@ -0,0 +1,328 @@ +# Advanced Usage Guide + +This guide covers advanced features, configuration options, and workflows for power users of the Zen MCP server. + +## Table of Contents + +- [Model Configuration](#model-configuration) +- [Thinking Modes](#thinking-modes) +- [Tool Parameters](#tool-parameters) +- [Collaborative Workflows](#collaborative-workflows) +- [Working with Large Prompts](#working-with-large-prompts) +- [Web Search Integration](#web-search-integration) +- [System Prompts](#system-prompts) + +## Model Configuration + +**Auto Mode (Recommended):** +Set `DEFAULT_MODEL=auto` in your .env file and Claude will intelligently select the best model for each task: + +```env +# .env file +DEFAULT_MODEL=auto # Claude picks the best model automatically + +# API Keys (at least one required) +GEMINI_API_KEY=your-gemini-key # Enables Gemini Pro & Flash +OPENAI_API_KEY=your-openai-key # Enables O3, O3-mini +``` + +**How Auto Mode Works:** +- Claude analyzes each request and selects the optimal model +- Model selection is based on task complexity, requirements, and model strengths +- You can always override: "Use flash for quick check" or "Use o3 to debug" + +**Supported Models & When Claude Uses Them:** + +| Model | Provider | Context | Strengths | Auto Mode Usage | +|-------|----------|---------|-----------|------------------| +| **`pro`** (Gemini 2.5 Pro) | Google | 1M tokens | Extended thinking (up to 32K tokens), deep analysis | Complex architecture, security reviews, deep debugging | +| **`flash`** (Gemini 2.0 Flash) | Google | 1M tokens | Ultra-fast responses | Quick checks, formatting, simple analysis | +| **`o3`** | OpenAI | 200K tokens | Strong logical reasoning | Debugging logic errors, systematic analysis | +| **`o3-mini`** | OpenAI | 200K tokens | Balanced speed/quality | Moderate complexity tasks | +| **`llama`** (Llama 3.2) | Custom/Local | 128K tokens | Local inference, privacy | On-device analysis, cost-free processing | +| **Any model** | OpenRouter | Varies | Access to GPT-4, Claude, Llama, etc. | User-specified or based on task requirements | + +**Mix & Match Providers:** Use multiple providers simultaneously! Set both `OPENROUTER_API_KEY` and `CUSTOM_API_URL` to access +cloud models (expensive/powerful) AND local models (free/private) in the same conversation. + +**Manual Model Selection:** +You can specify a default model instead of auto mode: + +```env +# Use a specific model by default +DEFAULT_MODEL=gemini-2.5-pro-preview-06-05 # Always use Gemini Pro +DEFAULT_MODEL=flash # Always use Flash +DEFAULT_MODEL=o3 # Always use O3 +``` + +**Per-Request Model Override:** +Regardless of your default setting, you can specify models per request: +- "Use **pro** for deep security analysis of auth.py" +- "Use **flash** to quickly format this code" +- "Use **o3** to debug this logic error" +- "Review with **o3-mini** for balanced analysis" + +**Model Capabilities:** +- **Gemini Models**: Support thinking modes (minimal to max), web search, 1M context +- **O3 Models**: Excellent reasoning, systematic analysis, 200K context + +## Thinking Modes + +**Claude automatically manages thinking modes based on task complexity**, but you can also manually control Gemini's reasoning depth to balance between response quality and token consumption. 
Each thinking mode uses a different amount of tokens, directly affecting API costs and response time.
+
+### Thinking Modes & Token Budgets
+
+These only apply to models that support customizing token usage for extended thinking, such as Gemini 2.5 Pro.
+
+| Mode | Token Budget | Use Case | Cost Impact |
+|------|-------------|----------|-------------|
+| `minimal` | 128 tokens | Simple, straightforward tasks | Lowest cost |
+| `low` | 2,048 tokens | Basic reasoning tasks | 16x more than minimal |
+| `medium` | 8,192 tokens | **Default** - Most development tasks | 64x more than minimal |
+| `high` | 16,384 tokens | Complex problems requiring thorough analysis (default for `thinkdeep`) | 128x more than minimal |
+| `max` | 32,768 tokens | Exhaustive reasoning | 256x more than minimal |
+
+### How to Use Thinking Modes
+
+**Claude automatically selects appropriate thinking modes**, but you can override this by explicitly requesting a specific mode in your prompts. Remember: higher thinking modes = more tokens = higher cost but better quality:
+
+#### Optimizing Token Usage & Costs
+
+**In most cases, let Claude automatically manage thinking modes** for optimal balance of cost and quality. Override manually when you have specific requirements:
+
+**Use lower modes (`minimal`, `low`) to save tokens when:**
+- Doing simple formatting or style checks
+- Getting quick explanations of basic concepts
+- Working with straightforward code
+- You need faster responses
+- Working within tight token budgets
+
+**Use higher modes (`high`, `max`) when quality justifies the cost:**
+- Debugging complex issues (worth the extra tokens to find root causes)
+- Reviewing security-critical code (cost of tokens < cost of vulnerabilities)
+- Analyzing system architecture (comprehensive analysis saves development time)
+- Finding subtle bugs or edge cases
+- Working on performance optimizations
+
+**Token Cost Examples:**
+- `minimal` (128 tokens) vs `max` (32,768 tokens) = 256x difference in thinking tokens
+- For a simple formatting check, using `minimal` instead of the default `medium` saves ~8,000 thinking tokens
+- For critical security reviews, the extra tokens in `high` or `max` mode are a worthwhile investment
+
+**Examples by scenario:**
+```
+# Quick style check with flash
+"Use flash to review formatting in utils.py"
+
+# Security audit with o3
+"Get o3 to do a security review of auth/ with thinking mode high"
+
+# Complex debugging, letting Claude pick the best model
+"Use zen to debug this race condition with max thinking mode"
+
+# Architecture analysis with Gemini 2.5 Pro
+"Analyze the entire src/ directory architecture with high thinking using pro"
+```
+
+## Tool Parameters
+
+All tools that work with files support **both individual files and entire directories**. The server automatically expands directories, filters for relevant code files, and manages token limits.
+ +### File-Processing Tools + +**`analyze`** - Analyze files or directories +- `files`: List of file paths or directories (required) +- `question`: What to analyze (required) +- `model`: auto|pro|flash|o3|o3-mini (default: server default) +- `analysis_type`: architecture|performance|security|quality|general +- `output_format`: summary|detailed|actionable +- `thinking_mode`: minimal|low|medium|high|max (default: medium, Gemini only) +- `use_websearch`: Enable web search for documentation and best practices - allows model to request Claude perform searches (default: true) + +``` +"Analyze the src/ directory for architectural patterns" (auto mode picks best model) +"Use flash to quickly analyze main.py and tests/ to understand test coverage" +"Use o3 for logical analysis of the algorithm in backend/core.py" +"Use pro for deep analysis of the entire backend/ directory structure" +``` + +**`codereview`** - Review code files or directories +- `files`: List of file paths or directories (required) +- `model`: auto|pro|flash|o3|o3-mini (default: server default) +- `review_type`: full|security|performance|quick +- `focus_on`: Specific aspects to focus on +- `standards`: Coding standards to enforce +- `severity_filter`: critical|high|medium|all +- `thinking_mode`: minimal|low|medium|high|max (default: medium, Gemini only) + +``` +"Review the entire api/ directory for security issues" (auto mode picks best model) +"Use pro to review auth/ for deep security analysis" +"Use o3 to review logic in algorithms/ for correctness" +"Use flash to quickly review src/ with focus on performance, only show critical issues" +``` + +**`debug`** - Debug with file context +- `error_description`: Description of the issue (required) +- `model`: auto|pro|flash|o3|o3-mini (default: server default) +- `error_context`: Stack trace or logs +- `files`: Files or directories related to the issue +- `runtime_info`: Environment details +- `previous_attempts`: What you've tried +- `thinking_mode`: minimal|low|medium|high|max (default: medium, Gemini only) +- `use_websearch`: Enable web search for error messages and solutions - allows model to request Claude perform searches (default: true) + +``` +"Debug this logic error with context from backend/" (auto mode picks best model) +"Use o3 to debug this algorithm correctness issue" +"Use pro to debug this complex architecture problem" +``` + +**`thinkdeep`** - Extended analysis with file context +- `current_analysis`: Your current thinking (required) +- `model`: auto|pro|flash|o3|o3-mini (default: server default) +- `problem_context`: Additional context +- `focus_areas`: Specific aspects to focus on +- `files`: Files or directories for context +- `thinking_mode`: minimal|low|medium|high|max (default: max, Gemini only) +- `use_websearch`: Enable web search for documentation and insights - allows model to request Claude perform searches (default: true) + +``` +"Think deeper about my design with reference to src/models/" (auto mode picks best model) +"Use pro to think deeper about this architecture with extended thinking" +"Use o3 to think deeper about the logical flow in this algorithm" +``` + +## Collaborative Workflows + +### Design → Review → Implement +``` +Think hard about designing and developing a fun calculator app in swift. Review your design plans with o3, taking in +their suggestions but keep the feature-set realistic and doable without adding bloat. 
Begin implementing and in between +implementation, get a codereview done by Gemini Pro and chat with Flash if you need to for creative directions. +``` + +### Code → Review → Fix +``` +Implement a new screen where the locations taken from the database display on a map, with pins falling from +the top and landing with animation. Once done, codereview with gemini pro and o3 both and ask them to critique your +work. Fix medium to critical bugs / concerns / issues and show me the final product +``` + +### Debug → Analyze → Solution → Precommit Check → Publish +``` +Take a look at these log files saved under subfolder/diagnostics.log there's a bug where the user says the app +crashes at launch. Think hard and go over each line, tallying it with corresponding code within the project. After +you've performed initial investigation, ask gemini pro to analyze the log files and the related code where you +suspect lies the bug and then formulate and implement a bare minimal fix. Must not regress. Perform a precommit +with zen in the end using gemini pro to confirm we're okay to publish the fix +``` + +### Tool Selection Guidance +To help choose the right tool for your needs: + +**Decision Flow:** +1. **Have a specific error/exception?** → Use `debug` +2. **Want to find bugs/issues in code?** → Use `codereview` +3. **Want to understand how code works?** → Use `analyze` +4. **Have analysis that needs extension/validation?** → Use `thinkdeep` +5. **Want to brainstorm or discuss?** → Use `chat` + +**Key Distinctions:** +- `analyze` vs `codereview`: analyze explains, codereview prescribes fixes +- `chat` vs `thinkdeep`: chat is open-ended, thinkdeep extends specific analysis +- `debug` vs `codereview`: debug diagnoses runtime errors, review finds static issues + +## Working with Large Prompts + +The MCP protocol has a combined request+response limit of approximately 25K tokens. This server intelligently works around this limitation by automatically handling large prompts as files: + +**How it works:** +1. When you send a prompt larger than the configured limit (default: 50K characters ~10-12K tokens), the server detects this +2. It responds with a special status asking Claude to save the prompt to a file named `prompt.txt` +3. Claude saves the prompt and resends the request with the file path instead +4. The server reads the file content directly into Gemini's 1M token context +5. The full MCP token capacity is preserved for the response + +**Example scenario:** +``` +# You have a massive code review request with detailed context +User: "Use gemini to review this code: [50,000+ character detailed analysis]" + +# Server detects the large prompt and responds: +Zen MCP: "The prompt is too large for MCP's token limits (>50,000 characters). +Please save the prompt text to a temporary file named 'prompt.txt' and resend +the request with an empty prompt string and the absolute file path included +in the files parameter, along with any other files you wish to share as context." + +# Claude automatically handles this: +- Saves your prompt to /tmp/prompt.txt +- Resends: "Use gemini to review this code" with files=["/tmp/prompt.txt", "/path/to/code.py"] + +# Server processes the large prompt through Gemini's 1M context +# Returns comprehensive analysis within MCP's response limits +``` + +This feature ensures you can send arbitrarily large prompts to Gemini without hitting MCP's protocol limitations, while maximizing the available space for detailed responses. 
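+
+## Dynamic Context Requests
+
+Tools can request additional context from Claude during execution. When Gemini needs more information to provide a thorough analysis, it will ask Claude for specific files or clarification, enabling true collaborative problem-solving.
+
+**Example:** If Gemini is debugging an error but needs to see a configuration file that wasn't initially provided, it can request:
+```json
+{
+  "status": "requires_clarification",
+  "question": "I need to see the database configuration to understand this connection error",
+  "files_needed": ["config/database.yml", "src/db_connection.py"]
+}
+```
+
+Claude will then provide the requested files and Gemini can continue with a more complete analysis.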
+ +## Web Search Integration + +**Smart web search recommendations for enhanced analysis** + +Web search is now enabled by default for all tools. Instead of performing searches directly, Gemini intelligently analyzes when additional information from the web would enhance its response and provides specific search recommendations for Claude to execute. + +**How it works:** +1. Gemini analyzes the request and identifies areas where current documentation, API references, or community solutions would be valuable +2. It provides its analysis based on its training data +3. If web searches would strengthen the analysis, Gemini includes a "Recommended Web Searches for Claude" section +4. Claude can then perform these searches and incorporate the findings + +**Example:** +``` +User: "Use gemini to debug this FastAPI async error" + +Gemini's Response: +[... debugging analysis ...] + +**Recommended Web Searches for Claude:** +- "FastAPI async def vs def performance 2024" - to verify current best practices for async endpoints +- "FastAPI BackgroundTasks memory leak" - to check for known issues with the version you're using +- "FastAPI lifespan context manager pattern" - to explore proper resource management patterns + +Claude can then search for these specific topics and provide you with the most current information. +``` + +**Benefits:** +- Always access to latest documentation and best practices +- Gemini focuses on reasoning about what information would help +- Claude maintains control over actual web searches +- More collaborative approach between the two AI assistants +- Reduces hallucination by encouraging verification of assumptions + +**Web search control:** +Web search is enabled by default, allowing models to request Claude perform searches for current documentation and solutions. If you prefer the model to work only with its training data, you can disable web search: +``` +"Use gemini to review this code with use_websearch false" +``` + +## System Prompts + +The server uses carefully crafted system prompts to give each tool specialized expertise: + +### Prompt Architecture +- **Centralized Prompts**: All system prompts are defined in `prompts/tool_prompts.py` +- **Tool Integration**: Each tool inherits from `BaseTool` and implements `get_system_prompt()` +- **Prompt Flow**: `User Request → Tool Selection → System Prompt + Context → Gemini Response` + +### Specialized Expertise +Each tool has a unique system prompt that defines its role and approach: +- **`thinkdeep`**: Acts as a senior development partner, challenging assumptions and finding edge cases +- **`codereview`**: Expert code reviewer with security/performance focus, uses severity levels +- **`debug`**: Systematic debugger providing root cause analysis and prevention strategies +- **`analyze`**: Code analyst focusing on architecture, patterns, and actionable insights + +### Customization +To modify tool behavior, you can: +1. Edit prompts in `prompts/tool_prompts.py` for global changes +2. Override `get_system_prompt()` in a tool class for tool-specific changes +3. 
Use the `temperature` parameter to adjust response style (0.2 for focused, 0.7 for creative) \ No newline at end of file diff --git a/docs/custom_models.md b/docs/custom_models.md index 9f0f55a..aa4c9c1 100644 --- a/docs/custom_models.md +++ b/docs/custom_models.md @@ -35,7 +35,9 @@ This guide covers setting up multiple AI model providers including OpenRouter, c ## Model Aliases -The server uses `conf/custom_models.json` to map convenient aliases to both OpenRouter and custom model names. Some popular aliases: +The server uses `conf/custom_models.json` to map convenient aliases to both OpenRouter and custom model names. This unified registry supports both cloud models (via OpenRouter) and local models (via custom endpoints). + +### OpenRouter Models (Cloud) | Alias | Maps to OpenRouter Model | |-------|-------------------------| @@ -44,12 +46,18 @@ The server uses `conf/custom_models.json` to map convenient aliases to both Open | `haiku` | `anthropic/claude-3-haiku` | | `gpt4o`, `4o` | `openai/gpt-4o` | | `gpt4o-mini`, `4o-mini` | `openai/gpt-4o-mini` | -| `gemini`, `pro-openrouter` | `google/gemini-pro-1.5` | -| `flash-openrouter` | `google/gemini-flash-1.5-8b` | +| `pro`, `gemini` | `google/gemini-pro-1.5` | +| `flash` | `google/gemini-flash-1.5-8b` | | `mistral` | `mistral/mistral-large` | | `deepseek`, `coder` | `deepseek/deepseek-coder` | | `perplexity` | `perplexity/llama-3-sonar-large-32k-online` | +### Custom/Local Models + +| Alias | Maps to Local Model | Note | +|-------|-------------------|------| +| `local-llama`, `local` | `llama3.2` | Requires `CUSTOM_API_URL` configured | + View the full list in [`conf/custom_models.json`](conf/custom_models.json). **Note:** While you can use any OpenRouter model by its full name, models not in the config file will use generic capabilities (32K context window, no extended thinking, etc.) which may not match the model's actual capabilities. For best results, add new models to the config file with their proper specifications. @@ -143,11 +151,12 @@ CUSTOM_MODEL_NAME=your-loaded-model ## Using Models -**Using model aliases (from conf/openrouter_models.json):** +**Using model aliases (from conf/custom_models.json):** ``` # OpenRouter models: "Use opus for deep analysis" # → anthropic/claude-3-opus "Use sonnet to review this code" # → anthropic/claude-3-sonnet +"Use pro via zen to analyze this" # → google/gemini-pro-1.5 "Use gpt4o via zen to analyze this" # → openai/gpt-4o "Use mistral via zen to optimize" # → mistral/mistral-large @@ -171,6 +180,21 @@ CUSTOM_MODEL_NAME=your-loaded-model **For OpenRouter:** Check current model pricing at [openrouter.ai/models](https://openrouter.ai/models). **For Local models:** Context window and capabilities are defined in `conf/custom_models.json`. +## Model Provider Selection + +The system automatically routes models to the appropriate provider: + +1. **Models with `is_custom: true`** → Always routed to Custom API (requires `CUSTOM_API_URL`) +2. **Models with `is_custom: false` or omitted** → Routed to OpenRouter (requires `OPENROUTER_API_KEY`) +3. **Unknown models** → Fallback logic based on model name patterns + +**Provider Priority Order:** +1. Native APIs (Google, OpenAI) - if API keys are available +2. Custom endpoints - for models marked with `is_custom: true` +3. OpenRouter - catch-all for cloud models + +This ensures clean separation between local and cloud models while maintaining flexibility for unknown models. 
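+
+For example, with both providers configured at once, the same conversation can mix cloud and local models. The endpoint URL below is illustrative; point it at wherever your Ollama or vLLM server is listening:
+
+```env
+# .env: enable both providers side by side
+OPENROUTER_API_KEY=your-openrouter-key    # cloud models (opus, sonnet, mistral, ...)
+CUSTOM_API_URL=http://localhost:11434/v1  # local models flagged is_custom (local-llama, ...)
+```
+
+With this setup, "Use local-llama to summarize" is routed to your custom endpoint because the `llama3.2` entry sets `is_custom: true`, while "Use opus for deep analysis" goes to OpenRouter.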
+ ## Model Configuration The server uses `conf/custom_models.json` to define model aliases and capabilities. You can: @@ -181,7 +205,9 @@ The server uses `conf/custom_models.json` to define model aliases and capabiliti ### Adding Custom Models -Edit `conf/custom_models.json` to add new models: +Edit `conf/custom_models.json` to add new models. The configuration supports both OpenRouter (cloud) and custom endpoint (local) models. + +#### Adding an OpenRouter Model ```json { @@ -195,11 +221,32 @@ Edit `conf/custom_models.json` to add new models: } ``` +#### Adding a Custom/Local Model + +```json +{ + "model_name": "my-local-model", + "aliases": ["local-model", "custom"], + "context_window": 128000, + "supports_extended_thinking": false, + "supports_json_mode": false, + "supports_function_calling": false, + "is_custom": true, + "description": "My custom Ollama/vLLM model" +} +``` + **Field explanations:** +- `model_name`: The model identifier (OpenRouter format like `vendor/model` or local name like `llama3.2`) +- `aliases`: Array of short names users can type instead of the full model name - `context_window`: Total tokens the model can process (input + output combined) - `supports_extended_thinking`: Whether the model has extended reasoning capabilities - `supports_json_mode`: Whether the model can guarantee valid JSON output - `supports_function_calling`: Whether the model supports function/tool calling +- `is_custom`: **Set to `true` for models that should ONLY work with custom endpoints** (Ollama, vLLM, etc.) +- `description`: Human-readable description of the model + +**Important:** Always set `is_custom: true` for local models. This ensures they're only used when `CUSTOM_API_URL` is configured and prevents conflicts with OpenRouter. ## Available Models diff --git a/docs/setup-troubleshooting.md b/docs/setup-troubleshooting.md new file mode 100644 index 0000000..1fa56cf --- /dev/null +++ b/docs/setup-troubleshooting.md @@ -0,0 +1,175 @@ +# Setup and Troubleshooting Guide + +This guide covers platform-specific setup instructions, file path requirements, testing procedures, and troubleshooting common issues. + +## Table of Contents + +- [File Path Requirements](#file-path-requirements) +- [Testing](#testing) +- [Troubleshooting](#troubleshooting) + +## Windows Users + +**Windows users must use WSL2** - Install WSL2 with Ubuntu, then follow the same setup as Linux/macOS. All commands should be run in your WSL2 terminal. + +```powershell +# Install WSL2 (run as Administrator in PowerShell) +wsl --install -d Ubuntu +``` + +Once WSL2 is installed, the setup process is identical to Linux/macOS. + +## File Path Requirements + +**All file paths must be absolute paths.** + +When using any tool, always provide absolute paths: +``` +✅ "Use zen to analyze /Users/you/project/src/main.py" +❌ "Use zen to analyze ./src/main.py" (will be rejected) +``` + +### Security & File Access + +By default, the server allows access to files within your home directory. This is necessary for the server to work with any file you might want to analyze from Claude. 
+ +**For Docker environments**, the `WORKSPACE_ROOT` environment variable is used to map your local directory to the internal `/workspace` directory, enabling the MCP to translate absolute file references correctly: + +```json +"env": { + "GEMINI_API_KEY": "your-key", + "WORKSPACE_ROOT": "/Users/you/project" // Maps to /workspace inside Docker +} +``` + +This allows Claude to use absolute paths that will be correctly translated between your local filesystem and the Docker container. + +## Testing + +### Unit Tests (No API Key Required) +The project includes comprehensive unit tests that use mocks and don't require a Gemini API key: + +```bash +# Run all unit tests +python -m pytest tests/ -v + +# Run with coverage +python -m pytest tests/ --cov=. --cov-report=html +``` + +### Simulation Tests (API Key Required) +To test the MCP server with comprehensive end-to-end simulation: + +```bash +# Set your API keys (at least one required) +export GEMINI_API_KEY=your-gemini-api-key-here +export OPENAI_API_KEY=your-openai-api-key-here + +# Run all simulation tests (default: uses existing Docker containers) +python communication_simulator_test.py + +# Run specific tests only +python communication_simulator_test.py --tests basic_conversation content_validation + +# Run with Docker rebuild (if needed) +python communication_simulator_test.py --rebuild-docker + +# List available tests +python communication_simulator_test.py --list-tests +``` + +The simulation tests validate: +- Basic conversation flow with continuation +- File handling and deduplication +- Cross-tool conversation threading +- Redis memory persistence +- Docker container integration + +### GitHub Actions CI/CD +The project includes GitHub Actions workflows that: + +- **✅ Run unit tests automatically** - No API key needed, uses mocks +- **✅ Test on Python 3.10, 3.11, 3.12** - Ensures compatibility +- **✅ Run linting and formatting checks** - Maintains code quality + +The CI pipeline works without any secrets and will pass all tests using mocked responses. Simulation tests require API key secrets (`GEMINI_API_KEY` and/or `OPENAI_API_KEY`) to run the communication simulator. 
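+
+## Logging Configuration
+
+Control logging verbosity via the `LOG_LEVEL` environment variable:
+- **`DEBUG`**: Shows detailed operational messages, tool execution flow, conversation threading
+- **`INFO`**: Shows general operational messages (default)
+- **`WARNING`**: Shows only warnings and errors
+- **`ERROR`**: Shows only errors
+
+**Set in your .env file:**
+```bash
+LOG_LEVEL=DEBUG  # For troubleshooting
+LOG_LEVEL=INFO   # For normal operation (default)
+```
+
+**For Docker:**
+```bash
+# In .env file
+LOG_LEVEL=DEBUG
+
+# Or set directly when starting
+LOG_LEVEL=DEBUG docker compose up
+```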
+ +## Troubleshooting + +### Docker Issues + +**"Connection failed" in Claude Desktop** +- Ensure Docker services are running: `docker compose ps` +- Check if the container name is correct: `docker ps` to see actual container names +- Verify your .env file has at least one valid API key (GEMINI_API_KEY or OPENAI_API_KEY) + +**"API key environment variable is required"** +- Edit your .env file and add at least one API key (Gemini or OpenAI) +- Restart services: `docker compose restart` + +**Container fails to start** +- Check logs: `docker compose logs zen-mcp` +- Ensure Docker has enough resources (memory/disk space) +- Try rebuilding: `docker compose build --no-cache` + +**"spawn ENOENT" or execution issues** +- Verify the container is running: `docker compose ps` +- Check that Docker Desktop is running +- Ensure WSL2 integration is enabled in Docker Desktop (Windows users) + +**Testing your Docker setup:** +```bash +# Check if services are running +docker compose ps + +# Test manual connection +docker exec -i zen-mcp-server echo "Connection test" + +# View logs +docker compose logs -f +``` + +### Common Setup Issues + +**File permission issues** +- Use `sudo chmod +x setup-docker.sh` if the script isn't executable +- Ensure your user is in the docker group: `sudo usermod -aG docker $USER` + +**WSL2 issues (Windows users)** +- Ensure you're running Windows 10 version 2004+ or Windows 11 +- Enable Docker Desktop WSL2 integration in settings +- Always run commands in WSL2 terminal, not Windows Command Prompt + +### API Key Issues + +**Invalid API key errors** +- Double-check your API keys are correct +- Ensure there are no extra spaces or characters in your .env file +- For Gemini: Verify your key works at [Google AI Studio](https://makersuite.google.com/app/apikey) +- For OpenAI: Verify your key works at [OpenAI Platform](https://platform.openai.com/api-keys) + +**Rate limiting** +- Gemini free tier has limited access to latest models +- Consider upgrading to a paid API plan for better performance +- OpenAI O3 requires sufficient credits in your account + +### Performance Issues + +**Slow responses** +- Check your internet connection +- Try using a different model (e.g., Flash instead of Pro for faster responses) +- Use lower thinking modes to save tokens and reduce response time + +**High token usage** +- Review the [thinking modes section](advanced-usage.md#thinking-modes) to optimize costs +- Use `minimal` or `low` thinking modes for simple tasks +- Consider the auto mode to let Claude choose appropriate models + +### Getting Help + +If you encounter issues not covered here: + +1. **Check the logs**: `docker compose logs -f` +2. **Verify your setup**: Run through the quickstart guide again +3. **Test with simple commands**: Start with basic functionality before complex workflows +4. 
**Report bugs**: Create an issue at the project repository with detailed error messages and your setup information \ No newline at end of file diff --git a/providers/custom.py b/providers/custom.py index b13c545..7d2feab 100644 --- a/providers/custom.py +++ b/providers/custom.py @@ -176,52 +176,50 @@ class CustomProvider(OpenAICompatibleProvider): """ logging.debug(f"Custom provider validating model: '{model_name}'") + # If OpenRouter is available and this looks like a cloud model, defer to OpenRouter + openrouter_available = os.getenv("OPENROUTER_API_KEY") is not None + # Try to resolve through registry first config = self._registry.resolve(model_name) if config: model_id = config.model_name - # Only accept models that are clearly local/custom based on the resolved name - # Local models should not have vendor/ prefix (except for special cases) - is_local_model = ( - "/" not in model_id # Simple names like "llama3.2" - or "local" in model_id.lower() # Explicit local indicator - or - # Check if any of the aliases contain local indicators - any("local" in alias.lower() or "ollama" in alias.lower() for alias in config.aliases) - if hasattr(config, "aliases") - else False - ) - - if is_local_model: - logging.debug(f"Model '{model_name}' -> '{model_id}' validated via registry (local model)") + # Use explicit is_custom flag for clean validation + if config.is_custom: + logging.debug(f"Model '{model_name}' -> '{model_id}' validated via registry (custom model)") return True else: - # This is a cloud/OpenRouter model - reject it for custom provider - logging.debug(f"Model '{model_name}' -> '{model_id}' rejected (cloud model for OpenRouter)") + # This is a cloud/OpenRouter model - if OpenRouter is available, defer to it + if openrouter_available: + logging.debug(f"Model '{model_name}' -> '{model_id}' deferred to OpenRouter (cloud model)") + else: + logging.debug(f"Model '{model_name}' -> '{model_id}' rejected (cloud model, no OpenRouter)") return False - # Strip :latest suffix and try validation again (it's just a version tag) + # Handle version tags for unknown models (e.g., "my-model:latest") clean_model_name = model_name - if model_name.endswith(":latest"): - clean_model_name = model_name[:-7] # Remove ":latest" - logging.debug(f"Stripped :latest from '{model_name}' -> '{clean_model_name}'") + if ":" in model_name: + clean_model_name = model_name.split(":")[0] + logging.debug(f"Stripped version tag from '{model_name}' -> '{clean_model_name}'") # Try to resolve the clean name config = self._registry.resolve(clean_model_name) if config: return self.validate_model_name(clean_model_name) # Recursively validate clean name + # For unknown models (not in registry), only accept if they look like local models + # This maintains backward compatibility for custom models not yet in the registry + # Accept models with explicit local indicators in the name if any(indicator in clean_model_name.lower() for indicator in ["local", "ollama", "vllm", "lmstudio"]): logging.debug(f"Model '{clean_model_name}' validated via local indicators") return True - # Accept simple model names without vendor prefix ONLY if they're not in registry - # This allows for unknown local models like custom fine-tunes - if "/" not in clean_model_name and ":" not in clean_model_name and not config: - logging.debug(f"Model '{clean_model_name}' validated via simple name pattern (unknown local model)") + # Accept simple model names without vendor prefix (likely local/custom models) + if "/" not in clean_model_name: + logging.debug(f"Model 
'{clean_model_name}' validated as potential local model (no vendor prefix)") return True - logging.debug(f"Model '{model_name}' NOT validated by custom provider") + # Reject everything else (likely cloud models not in registry) + logging.debug(f"Model '{model_name}' rejected by custom provider (appears to be cloud model)") return False def generate_content( diff --git a/providers/openai_compatible.py b/providers/openai_compatible.py index 547d146..cdff57c 100644 --- a/providers/openai_compatible.py +++ b/providers/openai_compatible.py @@ -75,11 +75,11 @@ class OpenAICompatibleProvider(ModelProvider): logging.info(f"Configured allowed models for {self.FRIENDLY_NAME}: {sorted(models)}") return models - # Log warning if no allow-list configured for proxy providers + # Log info if no allow-list configured for proxy providers if self.get_provider_type() not in [ProviderType.GOOGLE, ProviderType.OPENAI]: - logging.warning( - f"No model allow-list configured for {self.FRIENDLY_NAME}. " - f"Set {env_var} to restrict model access and control costs." + logging.info( + f"Model allow-list not configured for {self.FRIENDLY_NAME} - all models permitted. " + f"To restrict access, set {env_var} with comma-separated model names." ) return None diff --git a/providers/openrouter_registry.py b/providers/openrouter_registry.py index 032e411..71536bd 100644 --- a/providers/openrouter_registry.py +++ b/providers/openrouter_registry.py @@ -24,6 +24,7 @@ class OpenRouterModelConfig: supports_streaming: bool = True supports_function_calling: bool = False supports_json_mode: bool = False + is_custom: bool = False # True for models that should only be used with custom endpoints description: str = "" def to_capabilities(self) -> ModelCapabilities: