@bio-punk
Created December 19, 2025 08:01
Multi-node Megatron run #slurm #megatron
#!/bin/bash
#SBATCH -J MEGATRON_LLAMA
#SBATCH -N 2
#SBATCH -p gpu
#SBATCH --qos=gpugpu
#SBATCH --gres=gpu:8
#SBATCH -o logs/slurm-%j.log
#SBATCH -e logs/slurm-%j.log
echo "设置自定义参数"
CONDA_ENV_NAME=py312-t29
DATA_PATH=
TOKENIZER_MODEL=
TP=1
PP=4
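# Optional guard (a minimal sketch): DATA_PATH and TOKENIZER_MODEL above are left
# blank and must be filled in before submitting; failing fast here is clearer than
# the error pretrain_gpt.py would raise later.
if [[ -z "$DATA_PATH" || -z "$TOKENIZER_MODEL" ]]; then
    echo "Please set DATA_PATH and TOKENIZER_MODEL at the top of this script." >&2
    exit 1
fi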
echo "-------------------------------------------------"
echo "CONDA_ENV_NAME: $CONDA_ENV_NAME"
echo "DATA_PATH: $DATA_PATH"
echo "TOKENIZER_MODEL: $TOKENIZER_MODEL"
echo "TP: $TP"
echo "PP: $PP"
echo "-------------------------------------------------"
echo "enviroment modules配置"
echo "-------------------------------------------------"
module purge
module load cuda/12.8 nccl/2.27.6-1_cuda12.8
# module load /data/run01/scvj012/mbbq/cudnn/9.8.0.87_cuda12
module list
echo "-------------------------------------------------"
echo "conda环境激活"
source /data/apps/miniforge/25.3.0-3/etc/profile.d/conda.sh
conda activate $CONDA_ENV_NAME
echo "-------------------------------------------------"
echo "conda environment: $CONDA_PREFIX"
echo "python: $(which python)"
echo "python version: $(python --version)"
echo "-------------------------------------------------"
echo "分布式参数配置"
GPUS_PER_NODE=`nvidia-smi -L|wc -l`
export MASTER_ADDR=$(scontrol show hostnames | head -n 1)
export MASTER_PORT=$((RANDOM % 101 + 20000))
export NCCL_DEBUG=INFO
export NCCL_IB_DISABLE=0
export NCCL_IB_HCA=mlx5_2:1
export NCCL_SOCKET_IFNAME=bond0
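# Note: NCCL_IB_HCA and NCCL_SOCKET_IFNAME are cluster-specific; check the local
# devices with `ibstat` / `ip link` before reusing this script on another cluster.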
echo "检查主节点端口${MASTER_PORT}是否被占用"
lsof -i:${MASTER_PORT}
if [ $? -eq 0 ]; then
echo "Port $MASTER_PORT is already in use. Please choose a different port."
exit 1
fi
echo "-------------------------------------------------"
echo "MASTER_ADDR: $MASTER_ADDR"
echo "MASTER_PORT: $MASTER_PORT"
echo "GPUS_PER_NODE: $GPUS_PER_NODE"
echo "NCCL_SOCKET_IFNAME: $NCCL_SOCKET_IFNAME"
echo "NCCL_IB_HCA: $NCCL_IB_HCA"
echo "NCCL_IB_DISABLE: $NCCL_IB_DISABLE"
echo "NCCL_DEBUG: $NCCL_DEBUG"
echo "-------------------------------------------------"
echo "设置预训练参数"
GPT_ARGS="\
--tensor-model-parallel-size ${TP} \
--pipeline-model-parallel-size ${PP} \
--use-mcore-models \
--micro-batch-size 8 \
--global-batch-size 64 \
--sequence-parallel \
--use-flash-attn \
--use-rotary-position-embeddings \
--rope-scaling-factor 32.0 \
--tokenizer-type HuggingFaceTokenizer \
--tokenizer-model ${TOKENIZER_MODEL} \
--num-layers 28 \
--hidden-size 3072 \
--ffn-hidden-size 8192 \
--num-attention-heads 24 \
--group-query-attention \
--num-query-groups 8 \
--seq-length 8192 \
--max-position-embeddings 8192 \
--make-vocab-size-divisible-by 1 \
--disable-bias-linear \
--attention-dropout 0.0 \
--init-method-std 0.01 \
--hidden-dropout 0.0 \
--position-embedding-type rope \
--rotary-base 500000 \
--normalization RMSNorm \
--norm-epsilon 1e-5 \
--swiglu \
--no-masked-softmax-fusion \
--attention-softmax-in-fp32 \
--lr 1.25e-6 \
--train-iters 10 \
--lr-decay-style cosine \
--min-lr 1.25e-7 \
--weight-decay 1e-1 \
--lr-warmup-fraction 0.01 \
--clip-grad 1.0 \
--adam-beta1 0.9 \
--adam-beta2 0.95 \
--initial-loss-scale 4096 \
--use-distributed-optimizer \
--no-gradient-accumulation-fusion \
--no-load-optim \
--no-load-rng \
--bf16 "
DATA_ARGS="\
--data-path $DATA_PATH \
--split 10,0,0 "
OUTPUT_ARGS="\
--log-interval 1 \
--save-interval 10 \
--eval-interval 10 \
--eval-iters 0 "
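# Note: no --save / --load paths are passed, so despite --save-interval this run
# should not write checkpoints; add --save <output-dir> as well if checkpoints are
# needed (<output-dir> is a placeholder of your choice).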
echo "-------------------------------------------------"
echo "GPT_ARGS: $GPT_ARGS"
echo "DATA_ARGS: $DATA_ARGS"
echo "OUTPUT_ARGS: $OUTPUT_ARGS"
echo "-------------------------------------------------"
echo "检查环境变量"
echo "-------------------------------------------------"
env
export
echo "-------------------------------------------------"
echo "在每个节点上运行命令"
let node_rank=0
for node in $(scontrol show hostnames)
do
echo "Running on $node"
echo "生成分布式参数"
DISTRIBUTED_ARGS="\
--nproc_per_node $GPUS_PER_NODE \
--nnodes $SLURM_NNODES \
--node_rank $node_rank \
--master_addr $MASTER_ADDR \
--master_port $MASTER_PORT "
echo "DISTRIBUTED_ARGS: $DISTRIBUTED_ARGS"
CMD="\
torchrun \
$DISTRIBUTED_ARGS \
pretrain_gpt.py \
$GPT_ARGS \
$DATA_ARGS \
$OUTPUT_ARGS \
--distributed-backend nccl "
echo "CMD:"
echo $CMD
echo "日志保存到logs/slurm-${SLURM_JOB_ID}-rank_${node_rank}-node_${node}.log"
LOG_FILE="logs/slurm-${SLURM_JOB_ID}-rank_${node_rank}-node_${node}.log"
srun -N 1 --gres=gpu:"${GPUS_PER_NODE}" -w "${node}" env
if [[ $node != $(scontrol show hostnames|tail -n 1) ]]; then
srun -N 1 --gres=gpu:"${GPUS_PER_NODE}" -w "${node}" \
$CMD 1>$LOG_FILE 2>&1 &
else
srun -N 1 --gres=gpu:"${GPUS_PER_NODE}" -w "${node}" \
$CMD 1>$LOG_FILE 2>&1
fi
let node_rank=node_rank+1
done
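# Wait for the srun processes started in the background above; otherwise the batch
# job could exit (and have its remaining steps cleaned up) as soon as the last
# node's foreground launch returns.
wait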