@chawasit
Last active February 20, 2026 21:30
Docker vLLM serving GLM 4.7 Flash on dual RTX 3090, driver 580, CUDA 13
# REF: https://www.reddit.com/r/LocalLLM/comments/1qk1tfh/any_success_with_glm_flash_47_on_vllm_014/
# REF: https://github.com/vllm-project/vllm/issues/32373
docker run --name vllm-glm47-flash \
--gpus all \
--ipc=host \
-p 11434:8000 \
-v "/home/chawasit/.cache/huggingface:/root/.cache/huggingface" \
-e LD_LIBRARY_PATH=/lib/x86_64-linux-gnu:/usr/local/cuda/lib64 \
-e VLLM_USE_DEEP_GEMM=0 \
-e VLLM_SLEEP_WHEN_IDLE=1 \
-e VLLM_USE_FLASHINFER_MOE_FP16=1 \
-e VLLM_USE_FLASHINFER_SAMPLER=0 \
-e OMP_NUM_THREADS=4 \
-e PYTORCH_CUDA_ALLOC_CONF=expandable_segments:False \
--entrypoint bash \
vllm/vllm-openai:nightly \
-c "pip install https://github.com/huggingface/transformers/archive/main.tar.gz && \
vllm serve QuantTrio/GLM-4.7-Flash-AWQ \
  --served-model-name zai-org/glm-4.7-flash \
  --tensor-parallel-size 2 \
  --max-model-len 120000 \
  --max-num-seqs 2 \
  --gpu-memory-utilization 0.9 \
  --enable-expert-parallel \
  --tool-call-parser glm47 \
  --reasoning-parser glm45 \
  --dtype bfloat16 \
  --enable-auto-tool-choice \
  --host 0.0.0.0 \
  --port 8000"
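Once the container is up, vLLM exposes an OpenAI-compatible API on host port 11434 (per the `-p 11434:8000` mapping above). A minimal client sketch, assuming the server is reachable at `localhost:11434` and using the `--served-model-name` from the command — the prompt text and `max_tokens` value are illustrative:

```python
# Build and send a chat-completions request to the vLLM OpenAI-compatible
# endpoint. Only the payload construction runs unconditionally; the actual
# HTTP call is commented out since it requires the running container.
import json
import urllib.request

payload = {
    "model": "zai-org/glm-4.7-flash",  # matches --served-model-name above
    "messages": [{"role": "user", "content": "Hello"}],
    "max_tokens": 64,
}
body = json.dumps(payload).encode("utf-8")
req = urllib.request.Request(
    "http://localhost:11434/v1/chat/completions",
    data=body,
    headers={"Content-Type": "application/json"},
)

# Uncomment once the server has finished loading the model:
# with urllib.request.urlopen(req) as resp:
#     print(json.load(resp)["choices"][0]["message"]["content"])
print(body.decode("utf-8"))
```

The same endpoint works with any OpenAI-compatible client by pointing its base URL at `http://localhost:11434/v1`.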