Random llama.cpp notes, config and scripts I use on my Strix Halo setup.

models.ini — router-mode model presets for llama-server:

version = 1
# WIP config where I tweak my model settings when I use router mode
# Global
[*]
# Offload all layers to the GPU VRAM
n-gpu-layers = 999
# Use all available CPU threads (-1 = auto)
threads = -1
# Not sure if this should be on or off for my setup (default on)
fit = on
# Disable memory-mapping the model since everything is loaded into VRAM
no-mmap = 1
# Use direct-io
# direct-io = 1
# Unsloth suggests --ctx-size 16384 (16k), which seems low
# Suggested --ctx-size 32768 (32k) for fast coding
# Suggested --ctx-size 65536 (64k) for multi-file work or big refactors
# Suggested --ctx-size 131072 (128k) for one-shot analysis (project dump)
# Some models support --ctx-size 204800 (200k), which is HUGE
ctx-size = 131072
# GLM-4.7-Flash
[glm-4.7-flash]
# hf-repo = unsloth/GLM-4.7-Flash-GGUF:Q8_0
# hf-repo = unsloth/GLM-4.7-Flash-GGUF:Q6_K_XL
hf-repo = unsloth/GLM-4.7-Flash-GGUF:Q8_K_XL
seed = 3407
temp = 1.0
top-p = 0.95
min-p = 0.01
repeat-penalty = 1.0
ctx-size = 131072
# [glm-4.7-flash-16k]
# hf-repo = unsloth/GLM-4.7-Flash-GGUF:Q8_0
# hf-repo = unsloth/GLM-4.7-Flash-GGUF:Q6_K_XL
# hf-repo = unsloth/GLM-4.7-Flash-GGUF:Q8_K_XL
# seed = 3407
# temp = 1.0
# top-p = 0.95
# min-p = 0.01
# repeat-penalty = 1.0
# ctx-size = 16384
# [glm-4.7-flash-32k]
# hf-repo = unsloth/GLM-4.7-Flash-GGUF:Q8_0
# hf-repo = unsloth/GLM-4.7-Flash-GGUF:Q6_K_XL
# hf-repo = unsloth/GLM-4.7-Flash-GGUF:Q8_K_XL
# seed = 3407
# temp = 1.0
# top-p = 0.95
# min-p = 0.01
# repeat-penalty = 1.0
# ctx-size = 32768
# [glm-4.7-flash-64k]
# hf-repo = unsloth/GLM-4.7-Flash-GGUF:Q8_0
# hf-repo = unsloth/GLM-4.7-Flash-GGUF:Q6_K_XL
# hf-repo = unsloth/GLM-4.7-Flash-GGUF:Q8_K_XL
# seed = 3407
# temp = 1.0
# top-p = 0.95
# min-p = 0.01
# repeat-penalty = 1.0
# ctx-size = 65536
# gpt-oss-120b-GGUF
[gpt-oss-120b]
# hf-repo = ggml-org/gpt-oss-120b-GGUF
# hf-repo = unsloth/gpt-oss-120b-GGUF:Q8_0
hf-repo = unsloth/gpt-oss-120b-GGUF:Q8_K_XL
temp = 1.0
min-p = 0.0
top-p = 1.0
top-k = 0
ctx-size = 131072
# gpt-oss-20b-GGUF
[gpt-oss-20b]
# hf-repo = ggml-org/gpt-oss-20b-GGUF
# hf-repo = unsloth/gpt-oss-20b-GGUF:Q6_K_XL
# hf-repo = unsloth/gpt-oss-20b-GGUF:Q8_0
hf-repo = unsloth/gpt-oss-20b-GGUF:Q8_K_XL
temp = 1.0
min-p = 0.0
top-p = 1.0
top-k = 0
# Qwen3.5-122B-A10B
[qwen3.5-122b-coder]
hf-repo = unsloth/Qwen3.5-122B-A10B-GGUF:UD-Q4_K_XL
# ctx-size = 16384
temp = 0.6
top-p = 0.95
top-k = 20
min-p = 0.00
repeat-penalty = 1.0
presence-penalty = 0.00
[qwen3.5-122b-tasks]
hf-repo = unsloth/Qwen3.5-122B-A10B-GGUF:UD-Q4_K_XL
# ctx-size = 16384
temp = 1.0
top-p = 0.95
top-k = 20
min-p = 0.00
# Qwen3.5-35B-A3B
[qwen3.5-35b-a3b-coder]
hf-repo = unsloth/Qwen3.5-35B-A3B-GGUF:UD-Q8_K_XL
temp = 0.6
top-p = 0.95
top-k = 20
min-p = 0.00
repeat-penalty = 1.0
presence-penalty = 0.00
[qwen3.5-35b-a3b-tasks]
hf-repo = unsloth/Qwen3.5-35B-A3B-GGUF:UD-Q8_K_XL
temp = 1.0
top-p = 0.95
top-k = 20
min-p = 0.00
# Qwen3-Coder-30B-A3B-Instruct
[qwen3-coder-30b]
# hf-repo = unsloth/Qwen3-Coder-30B-A3B-Instruct-GGUF:Q6_K_XL
# hf-repo = unsloth/Qwen3-Coder-30B-A3B-Instruct-GGUF:Q8_0
hf-repo = unsloth/Qwen3-Coder-30B-A3B-Instruct-GGUF:Q8_K_XL
# hf-repo = ggml-org/Qwen3-Coder-30B-A3B-Instruct-Q8_0-GGUF
ngl = 99
temp = 0.7
min-p = 0.0
top-p = 0.80
top-k = 20
repeat-penalty = 1.05
ctx-size = 131072
# Qwen3-Coder-Next
[qwen3-coder-next]
hf-repo = unsloth/Qwen3-Coder-Next-GGUF:Q8_K_XL
seed = 3407
temp = 1.0
top-p = 0.95
min-p = 0.01
top-k = 40
[qwen3-coder-next-q6-k]
hf-repo = unsloth/Qwen3-Coder-Next-GGUF:Q6_K
seed = 3407
temp = 1.0
top-p = 0.95
min-p = 0.01
top-k = 40
# [qwen3-coder-next-q8-0]
# hf-repo = unsloth/Qwen3-Coder-Next-GGUF:Q8_0
# seed = 3407
# temp = 1.0
# top-p = 0.95
# min-p = 0.01
# top-k = 40
# Qwen3-Next-80B-A3B-Instruct
[qwen3-next-instruct]
hf-repo = unsloth/Qwen3-Next-80B-A3B-Instruct-GGUF:Q8_K_XL
ngl = 99
temp = 0.7
min-p = 0.0
top-p = 0.80
top-k = 20
presence-penalty = 1.0
[qwen3-next-instruct-q6-k]
hf-repo = unsloth/Qwen3-Next-80B-A3B-Instruct-GGUF:Q6_K
ngl = 99
temp = 0.7
min-p = 0.0
top-p = 0.80
top-k = 20
presence-penalty = 1.0
[qwen3-next-instruct-q8-0]
hf-repo = unsloth/Qwen3-Next-80B-A3B-Instruct-GGUF:Q8_0
ngl = 99
temp = 0.7
min-p = 0.0
top-p = 0.80
top-k = 20
presence-penalty = 1.0
# Gemma 3
[gemma-3]
# hf-repo = unsloth/gemma-3-27b-it-GGUF:Q6_K_XL
hf-repo = unsloth/gemma-3-27b-it-GGUF:Q8_K_XL
seed = 3407
prio = 2
temp = 1.0
repeat-penalty = 1.0
min-p = 0.01
top-k = 64
top-p = 0.95
[gemma-3-12b]
# hf-repo = unsloth/gemma-3-27b-it-GGUF:Q6_K_XL
hf-repo = unsloth/gemma-3-12b-it-GGUF:UD-Q8_K_XL
seed = 3407
prio = 2
temp = 1.0
repeat-penalty = 1.0
min-p = 0.01
top-k = 64
top-p = 0.95
# [gemma-3-q6]
# hf-repo = unsloth/gemma-3-27b-it-GGUF:Q6_K_XL
# seed = 3407
# prio = 2
# temp = 1.0
# repeat-penalty = 1.0
# min-p = 0.01
# top-k = 64
# top-p = 0.95
#
[devstral-small-2-24b]
# hf-repo = ggml-org/Devstral-Small-2-24B-Instruct-2512-GGUF
hf-repo = unsloth/Devstral-Small-2-24B-Instruct-2512-GGUF:Q8_K_XL
seed = 3407
prio = 3
temp = 0.15
min-p = 0.01
#
[devstral-2-123b]
hf-repo = unsloth/Devstral-2-123B-Instruct-2512-GGUF:Q4_K_XL
# hf-repo = unsloth/Devstral-2-123B-Instruct-2512-GGUF:Q5_K_XL
# hf-repo = unsloth/Devstral-2-123B-Instruct-2512-GGUF:Q6_K
# hf-repo = unsloth/Devstral-2-123B-Instruct-2512-GGUF:Q6_K_XL
seed = 3407
prio = 3
temp = 0.15
min-p = 0.01
run-build.sh — configure and build llama.cpp with the ROCm/HIP backend:

#!/usr/bin/env sh
echo "Setting up..."
cmake -S . -B build \
-DCMAKE_HIP_FLAGS="-mllvm --amdgpu-unroll-threshold-local=600" \
-DGGML_HIP=ON \
-DHIP_PLATFORM=amd \
-DGGML_HIPBLAS=ON \
-DGGML_HIP_ROCWMMA_FATTN=ON \
-DGPU_TARGETS=gfx1151 \
-DCMAKE_BUILD_TYPE=Release \
-DLLAMA_OPENSSL=ON \
-DLLAMA_BUILD_EXAMPLES=OFF \
--fresh
echo "Building..."
cmake --build build \
--clean-first \
--config Release -- -j$(nproc)
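
After a successful build, a quick sanity check that the binary runs (--version prints llama.cpp build info and exits):

./build/bin/llama-server --version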
Server launch script (router mode, serving the presets from models.ini):

#!/usr/bin/env sh
./build/bin/llama-server \
--models-preset models.ini \
--models-max 2 \
--log-file llama.log \
--metrics \
--host 0.0.0.0 \
--port 9090
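
Once the server is up, a preset is selected per request via the "model" field of the OpenAI-compatible API. A minimal sketch, assuming the server is reachable on localhost:9090 and using the gpt-oss-20b preset name from models.ini:

curl http://localhost:9090/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{"model": "gpt-oss-20b", "messages": [{"role": "user", "content": "Hello"}]}'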

Setup llama.cpp

Clone the repo with git clone git@github.com:ggml-org/llama.cpp.git, then cd into llama.cpp and run the following:

uv venv --python 3.12
uv pip install --index-url https://repo.amd.com/rocm/whl/gfx1151/ "rocm[libraries,devel]"
source ./.venv/bin/activate
rocm-sdk init
deactivate

Then export the following variables, or add them to a .envrc file in the root of the llama.cpp directory and use direnv:

export ROCM_PATH=$(uv run rocm-sdk path --root)
export HIP_DEVICE_LIB_PATH=$(uv run rocm-sdk path --root)/lib/llvm/amdgcn/bitcode
export HIPCXX="$(uv run rocm-sdk path --root)/llvm/bin/clang"
export HIP_PATH="$(uv run rocm-sdk path --root)"
export HIP_PLATFORM=amd
export CMAKE_PREFIX_PATH="$(uv run rocm-sdk path --root):$CMAKE_PREFIX_PATH"
export GGML_CUDA_ENABLE_UNIFIED_MEMORY=ON
export LD_LIBRARY_PATH="$(uv run rocm-sdk path --root)/lib"
export HF_TOKEN=<token> # optional
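
If you go the .envrc route, allow it once so direnv loads these variables automatically whenever you cd into the directory:

direnv allow .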

Then create/download run-build.sh (above) and run it to build llama.cpp with ROCm support.
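
Putting it together, the whole flow is roughly as follows (run-server.sh is a hypothetical name for the unnamed launch script above; call it whatever you like):

sh run-build.sh
sh run-server.sh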
