OptiLLM Proxy Plugin Config
```yaml
providers:
  # Local server 1 - fastest local machine, gets first priority
  - name: gemma_server_1
    base_url: http://192.168.10.145:11111/v1
    api_key: dummy
    weight: 100          # Highest weight - receives requests first
    max_concurrent: 1    # This server can only handle 1 request at a time

  # Local server 2 - slower local machine, gets second priority
  - name: gemma_server_2
    base_url: http://192.168.10.84:11111/v1
    api_key: dummy
    weight: 50           # Second-highest weight
    max_concurrent: 1    # This server can only handle 1 request at a time

  # Google AI - only used when the local servers are busy
  - name: google_ai_server
    base_url: https://generativelanguage.googleapis.com/v1beta/openai/
    api_key: ${GEMMA_API_KEY}  # Environment variable interpolation
    weight: 1            # Lowest weight - overflow only, when locals are busy
    max_concurrent: 16   # Google AI can handle 16 parallel requests
    model_map:
      # Map the local model name to an available Google AI model
      "gemma-3-4b-it": "gemini-flash-lite-latest"
      "default": "gemini-flash-lite-latest"

routing:
  strategy: weighted     # Weighted routing for optimal throughput
  health_check:
    enabled: true
    interval: 30         # Check health every 30 seconds
    timeout: 5           # Health-check timeout (seconds)
  disable_fallback: true # Disable OpenAI fallback to prevent auth errors

timeouts:
  request: 600           # 10 minutes for long-running, high-token generations
  connect: 10            # Connection timeout (seconds)

queue:
  max_concurrent: 18     # Global limit = sum of provider limits (16 + 1 + 1)
  timeout: 600           # 10-minute queue timeout to match the request timeout

monitoring:
  log_level: INFO
  track_latency: true
  track_errors: true
```
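With this config in place, clients talk to the proxy exactly as they would to any OpenAI-compatible endpoint: the proxy picks a backend by weight and spills over to Google AI only once both local servers are at their `max_concurrent` limit of 1. Below is a minimal Python client sketch under stated assumptions; the proxy address (`http://localhost:8000/v1`) and the use of the official `openai` package are assumptions, not part of this gist, so adjust them to match how you actually run optillm.

```python
# Minimal client sketch (assumed setup, not part of this gist).
# Assumes the optillm proxy is listening on localhost:8000.
import os
from openai import OpenAI

client = OpenAI(
    base_url="http://localhost:8000/v1",                 # assumed proxy address
    api_key=os.environ.get("OPENAI_API_KEY", "dummy"),   # local backends above accept a dummy key
)

# Request the local model name; if the request overflows to Google AI,
# the provider's model_map rewrites it to gemini-flash-lite-latest.
response = client.chat.completions.create(
    model="gemma-3-4b-it",
    messages=[{"role": "user", "content": "Say hello in one sentence."}],
)
print(response.choices[0].message.content)
```

Note the design choice the weights encode: 100/50/1 is not a load-balancing ratio so much as a priority order, since each local server caps out at one in-flight request and everything beyond two concurrent requests queues or overflows to Google AI.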