# ai_server.md

# Step 1: download the model

```bash
hf download Qwen/Qwen3-14B-FP8 --local-dir /srv/models/qwen3-14b-fp8
```
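
The later sections assume their models are already under `/srv/models` as well. A sketch of the matching downloads; the exact Hugging Face repo IDs are my assumption, so verify them on the Hub before pulling:

```bash
# Assumed repo IDs; double-check each one on huggingface.co
hf download Qwen/Qwen3-235B-A22B-Thinking-2507-FP8 --local-dir /srv/models/qwen3-235b-fp8
hf download allenai/olmOCR-2-7B-1025-FP8 --local-dir /srv/models/olmOCR-2-7B-1025-FP8
hf download Qwen/Qwen3-Embedding-8B --local-dir /srv/models/qwen3-embedding-8B
```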

# Big model, multiple GPUs

Qwen3-235B (FP8) with 4-way tensor parallelism across GPUs 0-3:

```bash
docker run -d --name vllm-qwen-235b-thinking \
  --gpus '"device=0,1,2,3"' \
  --ipc=host \
  -p 8000:8000 \
  -v /srv/models:/models \
  vllm/vllm-openai:latest \
  /models/qwen3-235b-fp8 \
  --port 8000 \
  --tensor-parallel-size 4 \
  --max-model-len 65536 \
  --enable-auto-tool-choice \
  --tool-call-parser hermes \
  --enable-chunked-prefill \
  --reasoning-parser deepseek_r1
```
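
Once the container is healthy, the server speaks the OpenAI API on port 8000. A minimal smoke test; by default vLLM serves the model under the path it was loaded from, so that goes in the `model` field (adjust if you set `--served-model-name`):

```bash
curl -s http://localhost:8000/v1/chat/completions \
  -H 'Content-Type: application/json' \
  -d '{
        "model": "/models/qwen3-235b-fp8",
        "messages": [{"role": "user", "content": "Reply with one short sentence."}],
        "max_tokens": 1024
      }'
```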

# Medium model, long context

Qwen3-14B (FP8) with its native 32,768-token window stretched to 98,304 tokens via YaRN (scaling factor 3.0, and 3.0 × 32768 = 98304, matching `--max-model-len`):

```bash
docker run -d --name vllm-qwen-14b \
  -p 8001:8001 \
  -v /srv/models:/models \
  --gpus '"device=4"' \
  -e VLLM_ALLOW_LONG_MAX_MODEL_LEN=1 \
  vllm/vllm-openai:latest \
  /models/qwen3-14b-fp8 \
  --host 0.0.0.0 \
  --port 8001 \
  --max-model-len 98304 \
  --hf-overrides '{"max_position_embeddings": 98304, "rope_scaling": {"type": "yarn", "factor": 3.0, "original_max_position_embeddings": 32768}}' \
  --enable-chunked-prefill \
  --enable-auto-tool-choice \
  --tool-call-parser hermes
```
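
To confirm the extended window took effect, list the served models; recent vLLM builds report `max_model_len` in this response (an assumption worth checking on your version):

```bash
curl -s http://localhost:8001/v1/models | python3 -m json.tool
```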

# Multimodal (OCR)

olmOCR-2-7B (FP8) for document OCR. It shares GPU 5 with the embedding server below, which is why both are capped at `--gpu-memory-utilization 0.45`:

```bash
docker run -d --name olmOCR-2-7B \
  -p 8002:8002 \
  -v /srv/models:/models \
  --gpus '"device=5"' \
  vllm/vllm-openai:latest \
  /models/olmOCR-2-7B-1025-FP8 \
  --host 0.0.0.0 \
  --port 8002 \
  --max-model-len 16384 \
  --gpu-memory-utilization 0.45
```
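
vLLM accepts OpenAI-style `image_url` content for multimodal models, so a quick connectivity check can send a base64-encoded page. `page.png` is a hypothetical input, and olmOCR's own toolkit builds a more elaborate prompt than this sketch:

```bash
IMG=$(base64 -w0 page.png)  # hypothetical input image
curl -s http://localhost:8002/v1/chat/completions \
  -H 'Content-Type: application/json' \
  -d @- <<EOF
{
  "model": "/models/olmOCR-2-7B-1025-FP8",
  "messages": [{
    "role": "user",
    "content": [
      {"type": "text", "text": "Transcribe all text on this page."},
      {"type": "image_url", "image_url": {"url": "data:image/png;base64,${IMG}"}}
    ]
  }],
  "max_tokens": 2048
}
EOF
```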

# Embedding

Qwen3-Embedding-8B on the other half of GPU 5:

```bash
docker run -d --name vllm-qwen-embed \
  -p 8003:8003 \
  -v /srv/models:/models \
  --gpus '"device=5"' \
  vllm/vllm-openai:latest \
  --model /models/qwen3-embedding-8B \
  --port 8003 \
  --max-model-len 32768 \
  --gpu-memory-utilization 0.45
```
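
Embedding models are queried through the OpenAI embeddings route; a minimal sketch:

```bash
curl -s http://localhost:8003/v1/embeddings \
  -H 'Content-Type: application/json' \
  -d '{"model": "/models/qwen3-embedding-8B", "input": ["hello world"]}'
```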

# Xeon CPU (build the image first)

Qwen3-4B with INT8 W8A8 quantization served from CPU by SGLang. The `sglang-cpu:xeon` image is not on a registry and has to be built locally.
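
A sketch of that build, assuming SGLang's Xeon Dockerfile lives at `docker/Dockerfile.xeon` in the upstream repo (the path and tag are my assumptions; check your checkout):

```bash
git clone https://github.com/sgl-project/sglang.git
cd sglang
docker build -f docker/Dockerfile.xeon -t sglang-cpu:xeon .
```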

```bash
docker run -d --name qwen3-4b \
  --privileged --ipc=host --network=host \
  --memory=32g --memory-swap=32g --shm-size=2g \
  --cpuset-cpus "0-85,172-257" \
  -v /srv/models:/models \
  -e SGLANG_USE_CPU_ENGINE=1 \
  -e OMP_NUM_THREADS=64 \
  -e MKL_NUM_THREADS=64 \
  sglang-cpu:xeon \
  /opt/.venv/bin/python3 -m sglang.launch_server \
    --model-path /models/qwen3-4b-w8a8 \
    --device cpu \
    --quantization w8a8_int8 \
    --context-length 8192 \
    --mem-fraction-static 0.02 \
    --max-total-tokens 8192 \
    --host 0.0.0.0 \
    --port 8004
```
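
SGLang also exposes an OpenAI-compatible API plus a plain health route; since the container runs with `--network=host`, port 8004 is reachable directly:

```bash
curl -s http://localhost:8004/health
curl -s http://localhost:8004/v1/chat/completions \
  -H 'Content-Type: application/json' \
  -d '{"model": "/models/qwen3-4b-w8a8", "messages": [{"role": "user", "content": "ping"}], "max_tokens": 16}'
```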