@SteelPh0enix · Last active August 27, 2025 19:04
llama.cpp shell utils
#!/bin/zsh
# Collection of variables, aliases and functions to work w/ llama.cpp
# Source to activate.
# HARDCODED VALUES - MAKE SURE TO TUNE THEM FOR YOUR SYSTEM!
# These settings are for RX 7900 XT & latest Arch Linux
export USE_ROCM=1
export HIP_PLATFORM="amd"
export GPU_ARCHS="gfx1100"
export HSA_OVERRIDE_GFX_VERSION="11.0.0"
export USE_SYMENGINE=1
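# Example (optional sanity check): the GPU architecture string above can be compared
# against what the ROCm runtime reports on your machine:
#   rocminfo | grep gfx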
# llama.cpp-related variables (tweak if necessary)
export LLAMA_CPP_PATH="${HOME}/.llama.cpp"
export LLAMA_CPP_INSTALL_PATH="${HOME}/.llama.cpp.install"
export LLAMA_CPP_VENV_PATH="${HOME}/.llama.cpp.venv"
export LLAMA_CPP_CMAKE_ARGS_FOR_ROCM=(
    "-DCMAKE_INSTALL_PREFIX=${LLAMA_CPP_INSTALL_PATH}"
    "-DCMAKE_C_COMPILER=${ROCM_PATH}/llvm/bin/clang"
    "-DCMAKE_CXX_COMPILER=${ROCM_PATH}/llvm/bin/clang++"
    "-DLLAMA_BUILD_TESTS=OFF"
    "-DLLAMA_BUILD_EXAMPLES=ON"
    "-DLLAMA_BUILD_SERVER=ON"
    "-DLLAMA_STANDALONE=ON"
    "-DLLAMA_CURL=OFF"
    "-DLLAMA_SERVER_SSL=ON"
    "-DGGML_CCACHE=ON"
    "-DGGML_NATIVE=ON"
    "-DGGML_OPENMP=ON"
    "-DGGML_LTO=ON"
    "-DGGML_RPC=ON"
    # CPU acceleration
    "-DGGML_CPU=ON"
    "-DGGML_AVX=ON"
    "-DGGML_AVX2=ON"
    # GPU acceleration
    "-DGPU_TARGETS=${GPU_ARCHS}"
    "-DGGML_HIP=ON"
    "-DGGML_HIP_ROCWMMA_FATTN=ON"
    "-DGGML_CUDA_FA_ALL_QUANTS=ON"
    "-DGGML_CUDA_GRAPHS=ON"
)
export LLAMA_CPP_CMAKE_ARGS_FOR_VULKAN=(
    "-DCMAKE_INSTALL_PREFIX=${LLAMA_CPP_INSTALL_PATH}"
    "-DLLAMA_BUILD_TESTS=OFF"
    "-DLLAMA_BUILD_EXAMPLES=ON"
    "-DLLAMA_BUILD_SERVER=ON"
    "-DLLAMA_STANDALONE=ON"
    "-DLLAMA_CURL=OFF"
    "-DLLAMA_SERVER_SSL=ON"
    "-DGGML_CCACHE=ON"
    "-DGGML_NATIVE=ON"
    "-DGGML_OPENMP=ON"
    "-DGGML_LTO=ON"
    "-DGGML_RPC=ON"
    # CPU acceleration
    "-DGGML_CPU=ON"
    "-DGGML_AVX=ON"
    "-DGGML_AVX2=ON"
    # GPU acceleration
    "-DGGML_VULKAN=ON"
)
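# Example: these arrays are consumed by llama-cpp-build below, but they can also be
# handed to a manual CMake invocation (ROCm variant shown):
#   cmake -S "$LLAMA_CPP_PATH" -B "$LLAMA_CPP_PATH/build" -G Ninja -DCMAKE_BUILD_TYPE=Release "${LLAMA_CPP_CMAKE_ARGS_FOR_ROCM[@]}"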
# llama.cpp server default settings
export LLAMA_ARG_HOST="0.0.0.0"
export LLAMA_ARG_PORT="51536"
export LLAMA_ARG_BATCH=2048
export LLAMA_ARG_UBATCH=2048
export LLAMA_ARG_SWA_FULL=false
export LLAMA_ARG_KV_SPLIT=false
export LLAMA_SET_ROWS=1 # for ARG_KV_SPLIT=false to work
export LLAMA_ARG_FLASH_ATTN=true
export LLAMA_ARG_MLOCK=true
export LLAMA_ARG_NO_MMAP=false
export LLAMA_ARG_N_GPU_LAYERS=999
export LLAMA_OFFLINE=false
export LLAMA_ARG_ENDPOINT_SLOTS=true
export LLAMA_ARG_ENDPOINT_PROPS=true
export LLAMA_API_KEY="dummy-api-key"
export OPENAI_API_KEY="${LLAMA_API_KEY}"
export LLAMA_CPP_SERVER_URL="${LLAMA_ARG_HOST}:${LLAMA_ARG_PORT}"
export OPENAI_BASE_URL="${LLAMA_CPP_SERVER_URL}/v1"
export OPENAI_MODEL=""
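# The LLAMA_ARG_* variables above are read by llama-server as defaults for the
# corresponding CLI flags. Example smoke test against a running server
# (assumes the host/port above are reachable from where curl is run):
#   curl "http://${LLAMA_CPP_SERVER_URL}/health"
#   curl -H "Authorization: Bearer ${LLAMA_API_KEY}" "http://${LLAMA_CPP_SERVER_URL}/v1/models"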
# System-related variables
export PATH="${PATH}:${LLAMA_CPP_INSTALL_PATH}/bin"
export LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:${LLAMA_CPP_INSTALL_PATH}/lib"
export PYTHONPATH="${PYTHONPATH}:${LLAMA_CPP_PATH}/gguf-py"
export UV_TORCH_BACKEND="auto"
# generic llm-related functions
function serve-llm() {
    local model_gguf_path=$1
    local model_name=${1:t:r}
    if [[ -z "$model_gguf_path" ]]; then
        echo "Error: Model path not provided" >&2
        echo "Usage: $0 <path-to-model.gguf> <context-length> [additional arguments for llama-server]" >&2
        return 1
    fi
    if [[ ! -f "$model_gguf_path" ]]; then
        echo "Error: Model file not found at: $model_gguf_path" >&2
        return 2
    fi
    local context_length=$2
    if [[ -z "$context_length" ]]; then
        echo "Error: Context length not provided" >&2
        return 3
    fi
    if (( context_length < 0 )); then
        echo "Error: Context length must be >= 0" >&2
        return 4
    fi
    if (( context_length > 0 )); then
        echo "Serving $model_name with $context_length tokens context, additional flags: ${@[3,-1]}"
    else
        echo "Serving $model_name with maximum available context, additional flags: ${@[3,-1]}"
    fi
    llama-server \
        --ctx-size ${context_length} \
        --model ${model_gguf_path} \
        --alias ${model_name} \
        "${@[3,-1]}"
}
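# Example usage (illustrative path; any extra flags are passed straight to llama-server):
#   serve-llm ~/LLMs/some-model.gguf 8192 --no-mmap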
function serve-llm-jinja() {
    serve-llm $1 $2 \
        --jinja \
        "${@[3,-1]}"
}
function serve-llm-jinja-ext() {
    local template_path=$3
    if [[ -z "$template_path" ]]; then
        echo "Error: Chat template path not provided" >&2
        echo "Usage: $0 <path-to-model.gguf> <context-length> <chat-template.jinja> [additional arguments for llama-server]" >&2
        return 1
    fi
    if [[ ! -f "$template_path" ]]; then
        echo "Error: Chat template file not found at: $template_path" >&2
        return 10
    fi
    serve-llm-jinja $1 $2 \
        --chat-template-file $template_path \
        "${@[4,-1]}"
}
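# Example usage (illustrative paths): serve with an out-of-tree chat template:
#   serve-llm-jinja-ext ~/LLMs/some-model.gguf 16384 ~/LLMs/repos/some-model/chat-template.jinja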
function serve-llm-qwen() {
    serve-llm-jinja $1 $2 \
        --temp 0.6 \
        --top-p 0.95 \
        --top-k 20 \
        --min-p 0 \
        --presence-penalty 1.5 \
        "${@[3,-1]}"
}
function serve-llm-qwen-ext() {
    serve-llm-jinja-ext $1 $2 $3 \
        --temp 0.6 \
        --top-p 0.95 \
        --top-k 20 \
        --min-p 0 \
        --presence-penalty 1.5 \
        "${@[4,-1]}"
}
function serve-llm-qwen-coder() {
    export OPENAI_MODEL="Qwen3-Coder-30B-A3B-Instruct-UD-Q3_K_XL"
    serve-llm-jinja-ext "$HOME/LLMs/Qwen3-Coder-30B-A3B-Instruct-UD-Q3_K_XL.gguf" \
        81920 \
        "$HOME/LLMs/repos/Qwen3-Coder-30B-A3B-Instruct/chat-template.jinja" \
        --n-cpu-moe 3 \
        --cache-type-k q8_0 \
        --cache-type-v q8_0 \
        --temp 0.7 \
        --top-p 0.8 \
        --top-k 20 \
        --repeat-penalty 1.05
}
function serve-llm-gpt-oss() {
    serve-llm-jinja \
        "$HOME/LLMs/gpt-oss-20b.auto.gguf" \
        0 \
        --temp 1.0 \
        --top-p 1.0 \
        --top-k 100 \
        --min-p 0.01
}
function serve-llm-mistral() {
    serve-llm-jinja \
        "$HOME/LLMs/Mistral-Small-3.2-24B-Instruct-2506-UD-Q4_K_XL.gguf" \
        32768 \
        --temp 0.15 \
        --cache-type-k q8_0 \
        --cache-type-v q8_0
}
function llama-model-to-gguf() {
    local base_model_dir=$1
    local output_quantization=${2:-auto}
    local output_gguf_dir=${3:-.}
    # base_model_dir should point to a repository, so the dir's name should be the model's name
    local model_name=$(basename $base_model_dir)
    if [ ! -d "$base_model_dir" ]; then
        echo "Error: Model directory '$base_model_dir' does not exist."
        return 1
    fi
    llama-venv-activate
    # Run the conversion command
    python $LLAMA_CPP_PATH/convert_hf_to_gguf.py \
        --outtype $output_quantization \
        --outfile $output_gguf_dir/$model_name.$output_quantization.gguf \
        $base_model_dir
    # Check if the conversion was successful
    if [ $? -eq 0 ]; then
        echo "Model '$model_name' successfully quantized to $output_quantization format and saved as $output_gguf_dir/$model_name.$output_quantization.gguf"
    else
        echo "Error: Failed to quantize model '$base_model_dir'."
    fi
}
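# Example usage (illustrative paths; --outtype takes the values accepted by
# convert_hf_to_gguf.py, e.g. auto, f16, bf16, q8_0):
#   llama-model-to-gguf ~/LLMs/repos/SomeModel q8_0 ~/LLMs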
function llama-venv-activate() {
    echo "Activating python virtualenv for LLMs..."
    source $LLAMA_CPP_VENV_PATH/bin/activate
}
function llama-venv-update() {
    llama-venv-activate
    echo "Updating python virtualenv for LLMs..."
    uv pip install --upgrade pip setuptools wheel cmake "huggingface_hub[cli,hf_xet]" mistral_common
    uv pip install --upgrade $LLAMA_CPP_PATH
    echo "Updated!"
}
function llama-venv-initialize() {
    echo "Initializing python virtualenv for LLMs..."
    uv venv --no-project --no-config $LLAMA_CPP_VENV_PATH
    echo "Virtual env initialized!"
}
function llama-venv-remove() {
    echo "Removing python virtualenv for LLMs..."
    rm -rf $LLAMA_CPP_VENV_PATH
    echo "Deleted!"
}
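# Example first-time setup (run once, in this order):
#   llama-venv-initialize   # create the venv with uv
#   llama-venv-update       # install/upgrade the Python tooling into it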
# llama.cpp management functions
function llama-cpp-clone() {
    echo "Pulling llama.cpp repository to ${LLAMA_CPP_PATH}"
    git clone git@github.com:ggerganov/llama.cpp.git $LLAMA_CPP_PATH
    pushd $LLAMA_CPP_PATH
    git submodule update --init --recursive
    git lfs pull
    popd
}
function llama-cpp-update() {
    echo "Pulling latest llama.cpp commit..."
    pushd $LLAMA_CPP_PATH
    git pull
    git lfs pull
    popd
}
function llama-cpp-clean() {
    echo "Clearing llama.cpp repository from any build artifacts and junk..."
    pushd $LLAMA_CPP_PATH
    git clean -xddf
    popd
}
function llama-cpp-build() {
    local arch=${1:-rocm}
    local -a cmake_arguments
    case $arch in
        vulkan)
            cmake_arguments=("${LLAMA_CPP_CMAKE_ARGS_FOR_VULKAN[@]}")
            ;;
        rocm|*)
            cmake_arguments=("${LLAMA_CPP_CMAKE_ARGS_FOR_ROCM[@]}")
            ;;
    esac
    pushd $LLAMA_CPP_PATH
    echo "Generating build files (backend: $arch, CMake arguments: $cmake_arguments)"
    cmake -S . -B build -G Ninja -DCMAKE_BUILD_TYPE=Release "${cmake_arguments[@]}"
    echo "Building llama.cpp..."
    cmake --build build --config Release -j 24
    cmake --install build --config Release
    popd
}
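# Example: build & install the Vulkan variant instead of the default ROCm one:
#   llama-cpp-build vulkan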
function llama-cpp-clean-update() {
    llama-cpp-clean
    llama-cpp-update
}
function llama-cpp-clean-build() {
    local arch=${1:-rocm}
    llama-cpp-clean
    llama-cpp-build $arch
}
function llama-cpp-clean-update-build() {
    local arch=${1:-rocm}
    llama-cpp-clean-update
    llama-cpp-build $arch
}
alias llamamup=llama-cpp-clean-update-build
alias llamasrv=serve-llm
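# Example end-to-end bootstrap on a fresh machine (illustrative order and path):
#   llama-cpp-clone
#   llama-venv-initialize && llama-venv-update
#   llama-cpp-build              # or: llama-cpp-build vulkan
#   serve-llm ~/LLMs/some-model.gguf 8192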
# openwebui+ollama management functions
export OPENWEBUI_DOCKER_DIR="$HOME/LLMs/"
function llm-start() {
    pushd $OPENWEBUI_DOCKER_DIR
    docker compose up -d
    popd
}
function llm-stop() {
    pushd $OPENWEBUI_DOCKER_DIR
    docker compose stop
    popd
}
function llm-pull() {
    pushd $OPENWEBUI_DOCKER_DIR
    docker compose pull
    popd
}
alias llm-restart="llm-stop && llm-start"
alias ollama="docker exec ollama ollama"
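# Example: query a model served by the serve-* helpers through the OpenAI-compatible
# endpoint (the "model" field should match the --alias / OPENAI_MODEL of the running server):
#   curl "http://${LLAMA_CPP_SERVER_URL}/v1/chat/completions" \
#     -H "Authorization: Bearer ${LLAMA_API_KEY}" \
#     -H "Content-Type: application/json" \
#     -d "{\"model\": \"${OPENAI_MODEL}\", \"messages\": [{\"role\": \"user\", \"content\": \"Hello!\"}]}"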