#!/bin/zsh
# Collection of variables, aliases and functions to work with llama.cpp
# Source this file to activate it.
# HARDCODED VALUES - MAKE SURE TO TUNE THEM FOR YOUR SYSTEM!
# These settings are for RX 7900 XT & latest Arch Linux
export USE_ROCM=1
export HIP_PLATFORM="amd"
export GPU_ARCHS="gfx1100"
export HSA_OVERRIDE_GFX_VERSION="11.0.0"
export USE_SYMENGINE=1
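# One way to look up your GPU's arch for GPU_ARCHS (assumes ROCm's rocminfo tool is installed):
#   rocminfo | grep -o 'gfx[0-9a-f]*' | sort -u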
# llama.cpp-related variables (tweak if necessary)
export LLAMA_CPP_PATH="${HOME}/.llama.cpp"
export LLAMA_CPP_INSTALL_PATH="${HOME}/.llama.cpp.install"
export LLAMA_CPP_VENV_PATH="${HOME}/.llama.cpp.venv"
export LLAMA_CPP_CMAKE_ARGS_FOR_ROCM=(
    "-DCMAKE_INSTALL_PREFIX=${LLAMA_CPP_INSTALL_PATH}"
    "-DCMAKE_C_COMPILER=${ROCM_PATH}/llvm/bin/clang"
    "-DCMAKE_CXX_COMPILER=${ROCM_PATH}/llvm/bin/clang++"
    "-DLLAMA_BUILD_TESTS=OFF"
    "-DLLAMA_BUILD_EXAMPLES=ON"
    "-DLLAMA_BUILD_SERVER=ON"
    "-DLLAMA_STANDALONE=ON"
    "-DLLAMA_CURL=OFF"
    "-DLLAMA_SERVER_SSL=ON"
    "-DGGML_CCACHE=ON"
    "-DGGML_NATIVE=ON"
    "-DGGML_OPENMP=ON"
    "-DGGML_LTO=ON"
    "-DGGML_RPC=ON"
    # CPU acceleration
    "-DGGML_CPU=ON"
    "-DGGML_AVX=ON"
    "-DGGML_AVX2=ON"
    # GPU acceleration
    "-DGPU_TARGETS=${GPU_ARCHS}"
    "-DGGML_HIP=ON"
    "-DGGML_HIP_ROCWMMA_FATTN=ON"
    "-DGGML_CUDA_FA_ALL_QUANTS=ON"
    "-DGGML_CUDA_GRAPHS=ON"
)
export LLAMA_CPP_CMAKE_ARGS_FOR_VULKAN=(
    "-DCMAKE_INSTALL_PREFIX=${LLAMA_CPP_INSTALL_PATH}"
    "-DLLAMA_BUILD_TESTS=OFF"
    "-DLLAMA_BUILD_EXAMPLES=ON"
    "-DLLAMA_BUILD_SERVER=ON"
    "-DLLAMA_STANDALONE=ON"
    "-DLLAMA_CURL=OFF"
    "-DLLAMA_SERVER_SSL=ON"
    "-DGGML_CCACHE=ON"
    "-DGGML_NATIVE=ON"
    "-DGGML_OPENMP=ON"
    "-DGGML_LTO=ON"
    "-DGGML_RPC=ON"
    # CPU acceleration
    "-DGGML_CPU=ON"
    "-DGGML_AVX=ON"
    "-DGGML_AVX2=ON"
    # GPU acceleration
    "-DGGML_VULKAN=ON"
)
# llama.cpp server default settings
export LLAMA_ARG_HOST="0.0.0.0"
export LLAMA_ARG_PORT="51536"
export LLAMA_ARG_BATCH=2048
export LLAMA_ARG_UBATCH=2048
export LLAMA_ARG_SWA_FULL=false
export LLAMA_ARG_KV_SPLIT=false
export LLAMA_SET_ROWS=1 # for ARG_KV_SPLIT=false to work
export LLAMA_ARG_FLASH_ATTN=true
export LLAMA_ARG_MLOCK=true
export LLAMA_ARG_NO_MMAP=false
export LLAMA_ARG_N_GPU_LAYERS=999
export LLAMA_OFFLINE=false
export LLAMA_ARG_ENDPOINT_SLOTS=true
export LLAMA_ARG_ENDPOINT_PROPS=true
export LLAMA_API_KEY="dummy-api-key"
export OPENAI_API_KEY="${LLAMA_API_KEY}"
export LLAMA_CPP_SERVER_URL="http://${LLAMA_ARG_HOST}:${LLAMA_ARG_PORT}"
export OPENAI_BASE_URL="${LLAMA_CPP_SERVER_URL}/v1"
export OPENAI_MODEL=""
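# Quick sanity checks against a running server, using the defaults above (adjust host/port if you change them):
#   curl -H "Authorization: Bearer ${LLAMA_API_KEY}" "http://127.0.0.1:${LLAMA_ARG_PORT}/health"
#   curl -H "Authorization: Bearer ${LLAMA_API_KEY}" "http://127.0.0.1:${LLAMA_ARG_PORT}/v1/models"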
# System-related variables
export PATH="${PATH}:${LLAMA_CPP_INSTALL_PATH}/bin"
export LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:${LLAMA_CPP_INSTALL_PATH}/lib"
export PYTHONPATH="${PYTHONPATH}:${LLAMA_CPP_PATH}/gguf-py"
export UV_TORCH_BACKEND="auto"
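# Quick check that gguf-py is importable via PYTHONPATH (assumes the venv from the llama-venv-*
# functions below is active, since gguf-py needs its own dependencies):
#   python -c 'import gguf; print(gguf.__file__)'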
# generic llm-related functions
function serve-llm() {
    local model_gguf_path=$1
    local model_name=${1:t:r}
    if [[ -z "$model_gguf_path" ]]; then
        echo "Error: Model path not provided" >&2
        echo "Usage: $0 <path-to-model.gguf> <context-length> [additional arguments for llama-server]" >&2
        return 1
    fi
    if [[ ! -f "$model_gguf_path" ]]; then
        echo "Error: Model file not found at: $model_gguf_path" >&2
        return 2
    fi
    local context_length=$2
    if [[ -z "$context_length" ]]; then
        echo "Error: Context length not provided" >&2
        return 3
    fi
    if (( context_length < 0 )); then
        echo "Error: Context length must be >= 0" >&2
        return 4
    fi
    if (( context_length > 0 )); then
        echo "Serving $model_name with $context_length tokens of context, additional flags: ${@[3,-1]}"
    else
        echo "Serving $model_name with maximum available context, additional flags: ${@[3,-1]}"
    fi
    llama-server \
        --ctx-size ${context_length} \
        --model ${model_gguf_path} \
        --alias ${model_name} \
        "${@[3,-1]}"
}
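# Example usage (hypothetical model path; everything after the context length is forwarded to llama-server,
# and a context length of 0 uses the model's full trained context):
#   serve-llm ~/LLMs/SomeModel-Q4_K_M.gguf 16384 --no-webui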
function serve-llm-jinja() {
    serve-llm $1 $2 \
        --jinja \
        "${@[3,-1]}"
}
function serve-llm-jinja-ext() {
    local template_path=$3
    if [[ -z "$template_path" ]]; then
        echo "Error: Chat template path not provided" >&2
        echo "Usage: $0 <path-to-model.gguf> <context-length> <chat-template.jinja> [additional arguments for llama-server]" >&2
        return 1
    fi
    if [[ ! -f "$template_path" ]]; then
        echo "Error: Chat template file not found at: $template_path" >&2
        return 10
    fi
    serve-llm-jinja $1 $2 \
        --chat-template-file $template_path \
        "${@[4,-1]}"
}
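# Example usage (hypothetical paths):
#   serve-llm-jinja ~/LLMs/SomeModel-Q4_K_M.gguf 16384
#   serve-llm-jinja-ext ~/LLMs/SomeModel-Q4_K_M.gguf 16384 ~/LLMs/repos/SomeModel/chat-template.jinja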
function serve-llm-qwen() {
    serve-llm-jinja $1 $2 \
        --temp 0.6 \
        --top-p 0.95 \
        --top-k 20 \
        --min-p 0 \
        --presence-penalty 1.5 \
        "${@[3,-1]}"
}
function serve-llm-qwen-ext() {
    serve-llm-jinja-ext $1 $2 $3 \
        --temp 0.6 \
        --top-p 0.95 \
        --top-k 20 \
        --min-p 0 \
        --presence-penalty 1.5 \
        "${@[4,-1]}"
}
function serve-llm-qwen-coder() {
    export OPENAI_MODEL="Qwen3-Coder-30B-A3B-Instruct-UD-Q3_K_XL"
    serve-llm-jinja-ext "$HOME/LLMs/Qwen3-Coder-30B-A3B-Instruct-UD-Q3_K_XL.gguf" \
        81920 \
        "$HOME/LLMs/repos/Qwen3-Coder-30B-A3B-Instruct/chat-template.jinja" \
        --n-cpu-moe 3 \
        --cache-type-k q8_0 \
        --cache-type-v q8_0 \
        --temp 0.7 \
        --top-p 0.8 \
        --top-k 20 \
        --repeat-penalty 1.05
}
function serve-llm-gpt-oss() {
    serve-llm-jinja \
        "$HOME/LLMs/gpt-oss-20b.auto.gguf" \
        0 \
        --temp 1.0 \
        --top-p 1.0 \
        --top-k 100 \
        --min-p 0.01
}
function serve-llm-mistral() {
    serve-llm-jinja \
        "$HOME/LLMs/Mistral-Small-3.2-24B-Instruct-2506-UD-Q4_K_XL.gguf" \
        32768 \
        --temp 0.15 \
        --cache-type-k q8_0 \
        --cache-type-v q8_0
}
function llama-model-to-gguf() {
    local base_model_dir=$1
    local output_quantization=${2:-auto}
    local output_gguf_dir=${3:-.}
    # base_model_dir should point to a repository, so the dir's name should be the model's name
    local model_name=$(basename $base_model_dir)
    if [ ! -d "$base_model_dir" ]; then
        echo "Error: Model directory '$base_model_dir' does not exist." >&2
        return 1
    fi
    llama-venv-activate
    # Run the conversion command
    python $LLAMA_CPP_PATH/convert_hf_to_gguf.py \
        --outtype $output_quantization \
        --outfile $output_gguf_dir/$model_name.$output_quantization.gguf \
        $base_model_dir
    # Check if the conversion was successful
    if [ $? -eq 0 ]; then
        echo "Model '$model_name' successfully quantized to $output_quantization format and saved as $output_gguf_dir/$model_name.$output_quantization.gguf"
    else
        echo "Error: Failed to quantize model '$base_model_dir'." >&2
    fi
}
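# Example usage (hypothetical paths; --outtype accepts e.g. f32, f16, bf16, q8_0 or auto):
#   llama-model-to-gguf ~/LLMs/repos/SomeModel q8_0 ~/LLMs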
function llama-venv-activate() {
    echo "Activating python virtualenv for LLMs..."
    source $LLAMA_CPP_VENV_PATH/bin/activate
}
function llama-venv-update() {
    llama-venv-activate
    echo "Updating python virtualenv for LLMs..."
    uv pip install --upgrade pip setuptools wheel cmake "huggingface_hub[cli,hf_xet]" mistral_common
    uv pip install --upgrade $LLAMA_CPP_PATH
    echo "Updated!"
}
function llama-venv-initialize() {
    echo "Initializing python virtualenv for LLMs..."
    uv venv --no-project --no-config $LLAMA_CPP_VENV_PATH
    echo "Virtual env initialized!"
}
function llama-venv-remove() {
    echo "Removing python virtualenv for LLMs..."
    rm -rf $LLAMA_CPP_VENV_PATH
    echo "Deleted!"
}
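# Typical bootstrap on a fresh machine: create the venv once, then install/update the Python tooling:
#   llama-venv-initialize && llama-venv-update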
# llama.cpp management functions
function llama-cpp-clone() {
    echo "Pulling llama.cpp repository to ${LLAMA_CPP_PATH}"
    git clone git@github.com:ggerganov/llama.cpp.git $LLAMA_CPP_PATH
    pushd $LLAMA_CPP_PATH
    git submodule update --init --recursive
    git lfs pull
    popd
}
function llama-cpp-update() {
    echo "Pulling latest llama.cpp commit..."
    pushd $LLAMA_CPP_PATH
    git pull
    git lfs pull
    popd
}
function llama-cpp-clean() {
    echo "Removing build artifacts and junk from the llama.cpp repository..."
    pushd $LLAMA_CPP_PATH
    git clean -xddf
    popd
}
function llama-cpp-build() {
    local arch=${1:-rocm}
    local -a cmake_arguments
    case $arch in
        vulkan)
            cmake_arguments=("${LLAMA_CPP_CMAKE_ARGS_FOR_VULKAN[@]}")
            ;;
        rocm|*)
            cmake_arguments=("${LLAMA_CPP_CMAKE_ARGS_FOR_ROCM[@]}")
            ;;
    esac
    pushd $LLAMA_CPP_PATH
    echo "Generating build files (backend: $arch, CMake arguments: $cmake_arguments)"
    cmake -S . -B build -G Ninja -DCMAKE_BUILD_TYPE=Release $cmake_arguments
    echo "Building llama.cpp..."
    cmake --build build --config Release -j 24
    cmake --install build --config Release
    popd
}
function llama-cpp-clean-update() {
    llama-cpp-clean
    llama-cpp-update
}
function llama-cpp-clean-build() {
    local arch=${1:-rocm}
    llama-cpp-clean
    llama-cpp-build $arch
}
function llama-cpp-clean-update-build() {
    local arch=${1:-rocm}
    llama-cpp-clean-update
    llama-cpp-build $arch
}
alias llamamup=llama-cpp-clean-update-build
alias llamasrv=serve-llm
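# Typical first-time setup with the functions above (ROCm is the default backend; pass "vulkan" to
# llama-cpp-build for the Vulkan backend instead):
#   llama-cpp-clone && llama-cpp-build
# Afterwards, "llamamup" cleans the tree, pulls the latest commits and rebuilds in one go.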
# openwebui+ollama management functions
export OPENWEBUI_DOCKER_DIR="$HOME/LLMs/"
function llm-start() {
    pushd $OPENWEBUI_DOCKER_DIR
    docker compose up -d
    popd
}
function llm-stop() {
    pushd $OPENWEBUI_DOCKER_DIR
    docker compose stop
    popd
}
function llm-pull() {
    pushd $OPENWEBUI_DOCKER_DIR
    docker compose pull
    popd
}
alias llm-restart="llm-stop && llm-start"
alias ollama="docker exec ollama ollama"
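# Example: bring the openwebui+ollama stack up (expects a docker compose file in OPENWEBUI_DOCKER_DIR),
# then talk to the containerized ollama CLI via the alias above:
#   llm-start
#   ollama list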