Created December 19, 2025 08:01
multinode run Megatron #slurm #megatron
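A Slurm batch script for multi-node Megatron-LM pretraining: it activates a conda environment, configures NCCL for InfiniBand, and launches torchrun on every allocated node. Before submitting, fill in DATA_PATH and TOKENIZER_MODEL and make sure the logs/ directory exists. A minimal submission sketch (the file name pretrain_llama.slurm is an assumption, not from the gist):

mkdir -p logs
sbatch pretrain_llama.slurm
squeue -u $USER                  # check that the job started
tail -f logs/slurm-<jobid>.log   # follow the combined job log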
#!/bin/bash
#SBATCH -J MEGATRON_LLAMA
#SBATCH -N 2
#SBATCH -p gpu
#SBATCH --qos=gpugpu
#SBATCH --gres=gpu:8
#SBATCH -o logs/slurm-%j.log
#SBATCH -e logs/slurm-%j.log
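# The directives above request 2 nodes with 8 GPUs each on the "gpu" partition
# and merge stdout/stderr into logs/slurm-<jobid>.log; the logs/ directory must
# already exist, otherwise Slurm cannot open the output file.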
| echo "设置自定义参数" | |
| CONDA_ENV_NAME=py312-t29 | |
| DATA_PATH= | |
| TOKENIZER_MODEL= | |
| TP=1 | |
| PP=4 | |
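# DATA_PATH and TOKENIZER_MODEL are left blank in the gist; point them at your
# preprocessed Megatron dataset prefix and HuggingFace tokenizer before running.
# With TP=1 and PP=4 on 2 nodes x 8 GPUs = 16 GPUs, each model replica spans
# 4 GPUs, giving a data-parallel size of 16 / (TP * PP) = 4.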
| echo "-------------------------------------------------" | |
| echo "CONDA_ENV_NAME: $CONDA_ENV_NAME" | |
| echo "DATA_PATH: $DATA_PATH" | |
| echo "TOKENIZER_MODEL: $TOKENIZER_MODEL" | |
| echo "TP: $TP" | |
| echo "PP: $PP" | |
| echo "-------------------------------------------------" | |
| echo "enviroment modules配置" | |
| echo "-------------------------------------------------" | |
| module purge | |
| module load cuda/12.8 nccl/2.27.6-1_cuda12.8 | |
| # module load /data/run01/scvj012/mbbq/cudnn/9.8.0.87_cuda12 | |
| module list | |
| echo "-------------------------------------------------" | |
| echo "conda环境激活" | |
source /data/apps/miniforge/25.3.0-3/etc/profile.d/conda.sh
conda activate $CONDA_ENV_NAME
echo "-------------------------------------------------"
echo "conda environment: $CONDA_PREFIX"
echo "python: $(which python)"
echo "python version: $(python --version)"
echo "-------------------------------------------------"
| echo "分布式参数配置" | |
| GPUS_PER_NODE=`nvidia-smi -L|wc -l` | |
| export MASTER_ADDR=$(scontrol show hostnames | head -n 1) | |
| export MASTER_PORT=$((RANDOM % 101 + 20000)) | |
| export NCCL_DEBUG=INFO | |
| export NCCL_IB_DISABLE=0 | |
| export NCCL_IB_HCA=mlx5_2:1 | |
| export NCCL_SOCKET_IFNAME=bond0 | |
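# NCCL tuning: keep InfiniBand enabled (NCCL_IB_DISABLE=0), pin traffic to HCA
# mlx5_2 port 1, use the bond0 interface for NCCL's bootstrap/socket traffic,
# and print verbose NCCL logs. Adjust the HCA and interface names for your cluster.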
| echo "检查主节点端口${MASTER_PORT}是否被占用" | |
| lsof -i:${MASTER_PORT} | |
| if [ $? -eq 0 ]; then | |
| echo "Port $MASTER_PORT is already in use. Please choose a different port." | |
| exit 1 | |
| fi | |
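# lsof exits 0 when something is already listening on the port, so the job aborts
# in that case; since MASTER_PORT is drawn randomly from 20000-20100, resubmitting
# will usually pick a free port.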
| echo "-------------------------------------------------" | |
| echo "MASTER_ADDR: $MASTER_ADDR" | |
| echo "MASTER_PORT: $MASTER_PORT" | |
| echo "GPUS_PER_NODE: $GPUS_PER_NODE" | |
| echo "NCCL_SOCKET_IFNAME: $NCCL_SOCKET_IFNAME" | |
| echo "NCCL_IB_HCA: $NCCL_IB_HCA" | |
| echo "NCCL_IB_DISABLE: $NCCL_IB_DISABLE" | |
| echo "NCCL_DEBUG: $NCCL_DEBUG" | |
| echo "-------------------------------------------------" | |
| echo "设置预训练参数" | |
| GPT_ARGS="\ | |
| --tensor-model-parallel-size ${TP} \ | |
| --pipeline-model-parallel-size ${PP} \ | |
| --use-mcore-models \ | |
| --micro-batch-size 8 \ | |
| --global-batch-size 64 \ | |
| --sequence-parallel \ | |
| --use-flash-attn \ | |
| --use-rotary-position-embeddings \ | |
| --rope-scaling-factor 32.0 \ | |
| --tokenizer-type HuggingFaceTokenizer \ | |
| --tokenizer-model ${TOKENIZER_MODEL} \ | |
| --num-layers 28 \ | |
| --hidden-size 3072 \ | |
| --ffn-hidden-size 8192 \ | |
| --num-attention-heads 24 \ | |
| --group-query-attention \ | |
| --num-query-groups 8 \ | |
| --seq-length 8192 \ | |
| --max-position-embeddings 8192 \ | |
| --make-vocab-size-divisible-by 1 \ | |
| --disable-bias-linear \ | |
| --attention-dropout 0.0 \ | |
| --init-method-std 0.01 \ | |
| --hidden-dropout 0.0 \ | |
| --position-embedding-type rope \ | |
| --rotary-base 500000 \ | |
| --normalization RMSNorm \ | |
| --norm-epsilon 1e-5 \ | |
| --swiglu \ | |
| --no-masked-softmax-fusion \ | |
| --attention-softmax-in-fp32 \ | |
| --lr 1.25e-6 \ | |
| --train-iters 10 \ | |
| --lr-decay-style cosine \ | |
| --min-lr 1.25e-7 \ | |
| --weight-decay 1e-1 \ | |
| --lr-warmup-fraction 0.01 \ | |
| --clip-grad 1.0 \ | |
| --adam-beta1 0.9 \ | |
| --adam-beta2 0.95 \ | |
| --initial-loss-scale 4096 \ | |
| --use-distributed-optimizer \ | |
| --no-gradient-accumulation-fusion \ | |
| --no-load-optim \ | |
| --no-load-rng \ | |
| --bf16 " | |
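# These hyperparameters (28 layers, hidden size 3072, 24 heads with 8 KV groups,
# RoPE base 500000, SwiGLU, RMSNorm) look like a Llama-3.2-3B-class model; treat
# that as an assumption and keep them in sync with your checkpoint and tokenizer.
# Batch math: global batch 64 = micro batch 8 x data-parallel size 4 x 2 gradient
# accumulation steps.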
| DATA_ARGS="\ | |
| --data-path $DATA_PATH \ | |
| --split 10,0,0 " | |
| OUTPUT_ARGS="\ | |
| --log-interval 1 \ | |
| --save-interval 10 \ | |
| --eval-interval 10 \ | |
| --eval-iters 0 " | |
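# --split 10,0,0 sends all data to training (no validation/test split), and with
# --train-iters 10 this is effectively a short smoke-test run; raise train-iters,
# save-interval, and the eval settings for a real pretraining job.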
| echo "-------------------------------------------------" | |
| echo "GPT_ARGS: $GPT_ARGS" | |
| echo "DATA_ARGS: $DATA_ARGS" | |
| echo "OUTPUT_ARGS: $OUTPUT_ARGS" | |
| echo "-------------------------------------------------" | |
| echo "检查环境变量" | |
| echo "-------------------------------------------------" | |
| env | |
| export | |
| echo "-------------------------------------------------" | |
| echo "在每个节点上运行命令" | |
| let node_rank=0 | |
| for node in $(scontrol show hostnames) | |
| do | |
| echo "Running on $node" | |
| echo "生成分布式参数" | |
| DISTRIBUTED_ARGS="\ | |
| --nproc_per_node $GPUS_PER_NODE \ | |
| --nnodes $SLURM_NNODES \ | |
| --node_rank $node_rank \ | |
| --master_addr $MASTER_ADDR \ | |
| --master_port $MASTER_PORT " | |
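    # torchrun starts GPUS_PER_NODE worker processes on this node; all nodes
    # rendezvous at MASTER_ADDR:MASTER_PORT, and node_rank identifies each node.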
| echo "DISTRIBUTED_ARGS: $DISTRIBUTED_ARGS" | |
| CMD="\ | |
| torchrun \ | |
| $DISTRIBUTED_ARGS \ | |
| pretrain_gpt.py \ | |
| $GPT_ARGS \ | |
| $DATA_ARGS \ | |
| $OUTPUT_ARGS \ | |
| --distributed-backend nccl " | |
| echo "CMD:" | |
| echo $CMD | |
| echo "日志保存到logs/slurm-${SLURM_JOB_ID}-rank_${node_rank}-node_${node}.log" | |
| LOG_FILE="logs/slurm-${SLURM_JOB_ID}-rank_${node_rank}-node_${node}.log" | |
    srun -N 1 --gres=gpu:"${GPUS_PER_NODE}" -w "${node}" env
    if [[ $node != $(scontrol show hostnames | tail -n 1) ]]; then
        srun -N 1 --gres=gpu:"${GPUS_PER_NODE}" -w "${node}" \
            $CMD 1>$LOG_FILE 2>&1 &
    else
        srun -N 1 --gres=gpu:"${GPUS_PER_NODE}" -w "${node}" \
            $CMD 1>$LOG_FILE 2>&1
    fi
    let node_rank=node_rank+1
done
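The loop above launches torchrun once per node via srun -w, backgrounding every launch except the last so the batch script blocks until the final node finishes. An equivalent single-step launch is sketched below (an alternative, not part of the original script); it lets Slurm place one task per node and derives the node rank from SLURM_NODEID:

# Sketch of a single-srun launch; all variables are baked into the command string
# at submission time except \$SLURM_NODEID, which each node resolves itself.
srun --ntasks-per-node=1 bash -c "torchrun \
    --nproc_per_node $GPUS_PER_NODE --nnodes $SLURM_NNODES --node_rank \$SLURM_NODEID \
    --master_addr $MASTER_ADDR --master_port $MASTER_PORT \
    pretrain_gpt.py $GPT_ARGS $DATA_ARGS $OUTPUT_ARGS --distributed-backend nccl"
# Depending on cluster configuration, the job step may also need --gres=gpu:$GPUS_PER_NODE.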