Skip to content

Instantly share code, notes, and snippets.

@theely
Created December 5, 2025 10:48
Show Gist options
  • Select an option

  • Save theely/2ce2797531c1d4dd48c5424279e4c146 to your computer and use it in GitHub Desktop.

Select an option

Save theely/2ce2797531c1d4dd48c5424279e4c146 to your computer and use it in GitHub Desktop.
#!/bin/bash
# Slurm batch job that runs the vetnode RCCL evaluation inside a container.
#
# The script writes a container environment file (env.toml) and a vetnode
# configuration (config.yaml), copies both to /tmp on the allocated nodes with
# sbcast, and then uses srun to start 4 tasks per node that install vetnode and
# run its "setup" and "diagnose" commands.
#
# Resources requested below: 2 nodes, a 15 minute time limit
# (days-hours:minutes:seconds), account "a-csstaff", partition "mi300".
# Submit with: sbatch <this file>
#SBATCH --nodes=2
#SBATCH --time=0-00:15:00
#SBATCH --account=a-csstaff
#SBATCH --partition=mi300
# Generate env.toml in the current directory: the container environment
# description later handed to `srun --environment`. It names the container
# image, the host paths mounted into the container (including the OFI network
# plugin library mounted over /usr/lib/librccl-net-ofi.so) and, under [env],
# the NCCL / libfabric / HSA variables exported inside the container.
# The delimiter is quoted so the text is written out literally.
cat <<'EOF' > env.toml
image = "/capstor/scratch/cscs/palmee/rocm-7.0.2-dev.sqsh"
mounts = [
"/capstor",
"/iopsstor",
"/tmp",
"/users/palmee/ofi-ncc-rccl-debug/libnccl-net.so.master-rocm7:/usr/lib/librccl-net-ofi.so"
]
[env]
NCCL_DEBUG = "WARN"
NCCL_NET_PLUGIN = "ofi"
NCCL_NET = "AWS Libfabric"
NCCL_CROSS_NIC = "1"
NCCL_NET_GDR_LEVEL = "PHB"
NCCL_SOCKET_IFNAME = "hsn"
NCCL_PROTO = "^LL128"
FI_CXI_COMPAT = "0"
FI_MR_CACHE_MONITOR = "userfaultfd"
FI_CXI_DISABLE_HOST_REGISTER = "1"
OFI_NCCL_DISABLE_DMABUF = "1"
HSA_NO_SCRATCH_RECLAIM = "1"
FI_CXI_RDZV_EAGER_SIZE = "0"
FI_CXI_RDZV_GET_MIN = "0"
FI_CXI_RDZV_THRESHOLD = "0"
EOF
cat > config.yaml <<- EOF
name: ML Vetting
evals:
- name: RCCL-Low-Level
type: vetnode.evaluations.rccl_lib_eval.RcclLibEval
scheduler: slurm
payload: 8 GB
method: allreduce
min_bandwidth: 15 GB/s
warmup:
payload: 256 MB
runs: 5
requirements:
- ['hip-python~=7.0.2','--index-url','https://test.pypi.org/simple']
- numpy
EOF
# Stage both generated files in /tmp on every allocated node, so that srun can
# read the environment file and every task can read the vetnode configuration.
# --force replaces a copy left behind by an earlier job on the same node
# (without it sbcast refuses to overwrite and a stale file would be used).
# A failed broadcast aborts the job instead of being silently ignored.
sbcast --force config.yaml /tmp/config.yaml \
  || { echo "sbcast of config.yaml failed" >&2; exit 1; }
sbcast --force env.toml /tmp/env.toml \
  || { echo "sbcast of env.toml failed" >&2; exit 1; }

# Start 4 tasks per node inside the container described by /tmp/env.toml.
# On each node the task with SLURM_LOCALID 0 clones vetnode, installs its
# requirements and runs "vetnode setup"; the remaining tasks wait for that to
# finish. Afterwards every task runs "vetnode diagnose".
#
# The inline script is single-quoted so that all expansions happen inside the
# tasks; it must therefore not contain any single quote characters.
srun -N "${SLURM_JOB_NUM_NODES}" --tasks-per-node=4 -u \
  --environment=/tmp/env.toml --container-writable bash -c '
# Marker files used to signal the outcome of the set-up to the other tasks on
# the same node. Assumes /tmp is shared by all tasks of a node (it is mounted
# into the container and already used for config.yaml). The job and step ids
# keep the names unique so that leftovers of earlier runs are never picked up.
step_tag="${SLURM_JOB_ID:-nojob}.${SLURM_STEP_ID:-0}"
ready_marker="/tmp/vetnode-setup.${step_tag}.ready"
failed_marker="/tmp/vetnode-setup.${step_tag}.failed"
# Upper bound, in seconds, that a task waits for the set-up of its node.
setup_timeout=600

if [[ "${SLURM_LOCALID}" == 0 ]]; then
  echo "[vetnode] Install env..."
  # A failing clone is tolerated when the checkout already exists, e.g. when
  # the working directory is shared and another task created it; every other
  # failing step aborts the set-up.
  if { git clone https://github.com/theely/vetnode.git || [[ -d vetnode ]]; } \
    && cd vetnode \
    && pip install --no-cache-dir -r ./requirements.txt \
    && cd src \
    && echo "[vetnode] set-up" \
    && python3.10 -m vetnode setup /tmp/config.yaml; then
    touch "${ready_marker}"
  else
    # Tell the waiting tasks to give up instead of running into the timeout.
    touch "${failed_marker}"
    echo "[vetnode] set-up failed on ${HOSTNAME}" >&2
    exit 1
  fi
else
  # Wait until local task 0 reports completion rather than sleeping for a
  # fixed time, which is too short whenever the installation takes longer.
  waited=0
  until [[ -e "${ready_marker}" ]]; do
    if [[ -e "${failed_marker}" ]] || (( waited >= setup_timeout )); then
      echo "[vetnode] set-up did not complete on ${HOSTNAME}" >&2
      exit 1
    fi
    sleep 1
    waited=$(( waited + 1 ))
  done
  cd vetnode/src || { echo "[vetnode] cannot enter vetnode/src" >&2; exit 1; }
fi
echo "[vetnode] diagnose"
python3.10 -m vetnode diagnose /tmp/config.yaml
'
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment