- Ubuntu 22.04+ (or similar Linux)
- GTX 1060 6GB + i3-10100 + 16GB RAM (or similar)
- NVIDIA driver installed
llama-swap is a proxy server for llama.cpp that provides:
- Hot-swap models without restarting
- OpenAI-compatible API with model routing
- Automatic model loading/unloading based on requests
- Multi-model support from a single endpoint
# Install build dependencies (Go is needed later to build llama-swap)
sudo apt update && sudo apt install -y \
  build-essential \
  git \
  cmake \
  libcurl4-openssl-dev \
  curl \
  libgomp1 \
  wget \
  golang-go

# Check if CUDA is installed (driver loaded and GPU visible)
nvidia-smi
# If not, install CUDA 12.x from NVIDIA's Ubuntu 22.04 repository
wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb
sudo dpkg -i cuda-keyring_1.1-1_all.deb
sudo apt update
sudo apt install -y cuda-toolkit-12-4

# Add CUDA to PATH for future shells, then reload for this one
echo 'export PATH=/usr/local/cuda/bin:$PATH' >> ~/.bashrc
echo 'export LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH' >> ~/.bashrc
source ~/.bashrc

# Verify the CUDA compiler is on PATH
nvcc --version
cd ~
git clone https://github.com/ikawrakow/ik_llama.cpp.git
cd ik_llama.cpp

# Build with CUDA support
cmake -B build -DGGML_NATIVE=ON -DGGML_CUDA=ON
cmake --build build --config Release -j"$(nproc)"

# Verify the server binary was produced
ls build/bin/llama-server

# Create models directory
mkdir -p ~/models
cd ~/models

# Qwen3.5-9B IQ4_XS (main model, paired with a draft model for speculative decoding)
wget -O Qwen3.5-9B-IQ4_XS.gguf \
  "https://huggingface.co/bartowski/Qwen3.5-9B-GGUF/resolve/main/Qwen3.5-9B-IQ4_XS.gguf"

# Qwen3.5-0.8B Q8_0 (draft model)
wget -O Qwen3.5-0.8B-Q8_0.gguf \
  "https://huggingface.co/bartowski/Qwen3.5-0.8B-GGUF/resolve/main/Qwen3.5-0.8B-Q8_0.gguf"

# Optional: additional models to add later
# wget -O Qwen3.5-3B-Q8_0.gguf \
#   "https://huggingface.co/bartowski/Qwen3.5-3B-GGUF/resolve/main/Qwen3.5-3B-Q8_0.gguf"
cd ~
git clone https://github.com/mostlygeek/llama-swap.git
cd llama-swap

# Build the proxy binary
# NOTE(review): entry point assumed at ./cmd/llama-swap — confirm against the
# repo layout; a plain `go build` from the module root may be sufficient.
go build -o llama-swap ./cmd/llama-swap

# Verify
./llama-swap --version

# Create the config directory used in the next step
mkdir -p ~/.config/llama-swap
# Write the llama-swap configuration.
# The heredoc delimiter is quoted ('EOF') so $-expansion does not occur and
# the YAML is written verbatim; the closing EOF line is required to terminate it.
cat > ~/.config/llama-swap/config.yaml << 'EOF'
# llama-swap configuration

# Path to ik_llama.cpp server binary
# NOTE: '~' is written literally into the file (quoted heredoc); confirm
# llama-swap expands it, otherwise replace with an absolute path.
exec: ~/ik_llama.cpp/build/bin/llama-server

# Host and port for the llama-swap proxy
host: 0.0.0.0
port: 8080

# Models directory
models_dir: ~/models

# Default model (used when no model is specified in the request)
default_model: qwen3.5-9b-spec

# Model timeout in seconds (unload after inactivity)
timeout: 300

# Models configuration
models:
  # Qwen3.5-9B with speculative decoding (primary model)
  qwen3.5-9b-spec:
    model: Qwen3.5-9B-IQ4_XS.gguf
    model_draft: Qwen3.5-0.8B-Q8_0.gguf
    ngl: 99
    ngl_draft: 99
    draft_max_tokens: 8
    ctx_size: 4096
    threads: 4
    description: "Qwen3.5-9B with speculative decoding - fastest option"

  # Qwen3.5-9B without speculative decoding (if you want to compare)
  qwen3.5-9b:
    model: Qwen3.5-9B-IQ4_XS.gguf
    ngl: 99
    ctx_size: 4096
    threads: 4
    description: "Qwen3.5-9B without speculative decoding"

  # Qwen3.5-0.8B (fastest, smallest)
  qwen3.5-0.8b:
    model: Qwen3.5-0.8B-Q8_0.gguf
    ngl: 99
    ctx_size: 4096
    threads: 4
    description: "Qwen3.5-0.8B - ultra fast, lower quality"

  # Add more models here as you download them
  # qwen3.5-3b:
  #   model: Qwen3.5-3B-Q8_0.gguf
  #   ngl: 99
  #   ctx_size: 4096
  #   threads: 4
  #   description: "Qwen3.5-3B - balanced option"
EOF