OptiLLM Proxy Plugin Config
```yaml
providers:
  # Local server 1 - fastest local machine, gets first priority
  - name: gemma_server_1
    base_url: http://192.168.10.145:11111/v1
    api_key: dummy
    weight: 100          # Highest weight - receives requests first
    max_concurrent: 1    # This server can only handle 1 request at a time

  # Local server 2 - slower local machine, gets second priority
  - name: gemma_server_2
    base_url: http://192.168.10.84:11111/v1
    api_key: dummy
    weight: 50           # Second-highest weight
    max_concurrent: 1    # This server can only handle 1 request at a time

  # Google AI - only used when the local servers are busy
  - name: google_ai_server
    base_url: https://generativelanguage.googleapis.com/v1beta/openai/
    api_key: ${GEMMA_API_KEY}  # Environment variable interpolation
    weight: 1            # Lowest weight - overflow only, when locals are busy
    max_concurrent: 16   # Google AI can handle 16 parallel requests
    model_map:
      # Map the local model name to an available Google AI model
      "gemma-3-4b-it": "gemini-flash-lite-latest"
      "default": "gemini-flash-lite-latest"

routing:
  strategy: weighted     # Weighted routing for optimal throughput
  health_check:
    enabled: true
    interval: 30         # Check health every 30 seconds
    timeout: 5           # Health-check timeout (seconds)
  disable_fallback: true # Disable OpenAI fallback to prevent auth errors

timeouts:
  request: 600           # 10 minutes for long-running, high-token generations
  connect: 10            # Connection timeout (seconds)

queue:
  max_concurrent: 18     # Global limit = sum of provider limits (16 + 1 + 1)
  timeout: 600           # 10-minute queue timeout to match the request timeout

monitoring:
  log_level: INFO
  track_latency: true
  track_errors: true
```
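With this config in place, clients talk to the proxy exactly as they would to any OpenAI-compatible endpoint: the proxy picks a backend by weight and spills over to Google AI only once both local servers are at their `max_concurrent` limit of 1. Below is a minimal Python client sketch under stated assumptions; the proxy address (`http://localhost:8000/v1`) and the use of the official `openai` package are assumptions, not part of this gist, so adjust them to match how you actually run optillm.

```python
# Minimal client sketch (assumed setup, not part of this gist).
# Assumes the optillm proxy is listening on localhost:8000.
import os
from openai import OpenAI

client = OpenAI(
    base_url="http://localhost:8000/v1",                 # assumed proxy address
    api_key=os.environ.get("OPENAI_API_KEY", "dummy"),   # local backends above accept a dummy key
)

# Request the local model name; if the request overflows to Google AI,
# the provider's model_map rewrites it to gemini-flash-lite-latest.
response = client.chat.completions.create(
    model="gemma-3-4b-it",
    messages=[{"role": "user", "content": "Say hello in one sentence."}],
)
print(response.choices[0].message.content)
```

Note the design choice the weights encode: 100/50/1 is not a load-balancing ratio so much as a priority order, since each local server caps out at one in-flight request and everything beyond two concurrent requests queues or overflows to Google AI.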