Skip to content

Instantly share code, notes, and snippets.

@ASKabalan
Last active February 24, 2026 13:21
Show Gist options
  • Select an option

  • Save ASKabalan/721209322df82dc1ea2dd2d25af3b7ea to your computer and use it in GitHub Desktop.

Select an option

Save ASKabalan/721209322df82dc1ea2dd2d25af3b7ea to your computer and use it in GitHub Desktop.
Slurm Scripting and Profiling
#!/bin/bash
# Submit one kmeans-model job per galactic mask (GAL020, GAL040, GAL060), then
# two r_analysis jobs (validate and cache) that start once all of them ended.
#
# Usage:    bash validate_fg.sh <noise> <sky-tag>
# Example:  bash validate_fg.sh 1.0 c1d1s1
# Requires: SLURM_SCRIPT in the environment (the batch script handed to
#           sbatch); it is not defined anywhere in this file.

usage='usage: validate_fg.sh <noise> <sky-tag>'
noise=${1:?$usage}
SKY=${2:?$usage}
: "${SLURM_SCRIPT:?SLURM_SCRIPT must name the batch script to submit}"
td_sp=20.0

# Options shared by every submission. Kept as an array so each option stays a
# separate word without relying on unquoted word splitting.
SBATCH_ARGS=(--account=XXX@YYY --nodes=1 --gres=gpu:1 --tasks-per-node=1 -C XXX)

# Collect all kmeans job IDs here
job_ids=()

# ---------- GAL020 / GAL040 / GAL060 ----------
for pct in 20 40 60; do
  # --parsable makes sbatch print just the job ID (optionally ";cluster"),
  # so the capture does not depend on directives inside the batch script.
  # NOTE: the name uses ${noise}_ with braces; a bare $noise_ would expand the
  # (unset) variable "noise_" and drop the noise value from the job name.
  jid=$(sbatch --parsable "${SBATCH_ARGS[@]}" \
    --job-name="FX_${pct}_${noise}_${SKY}" \
    "$SLURM_SCRIPT" LOGS kmeans-model -n 64 -ns 1 -nr "$noise" \
    -pc 10000 500 500 -tag "$SKY" -m "GAL0${pct}" -i LiteBIRD \
    -sp 1.54 "$td_sp" -3.0 -mi 1000 -o RESULTS -cond) || {
    echo "sbatch failed for GAL0${pct}; not submitting the analysis jobs" >&2
    exit 1
  }
  # Strip an optional ";cluster" suffix so only the numeric ID is kept.
  job_ids+=("${jid%%;*}")
done

# ---------- Final analysis jobs depending on ALL previous jobs ----------
# Build colon-separated list of job IDs: id1:id2:id3:...
deps=$(IFS=:; echo "${job_ids[*]}")

# Any strictly positive noise value is labelled "Noisy" / 100 percent.
noise_text=Noiseless
noise_percent=0
if (( $(echo "$noise > 0.0" | bc -l) )); then
  noise_text=Noisy
  noise_percent=100
fi

sbatch --dependency="afterany:${deps}" \
  "${SBATCH_ARGS[@]}" \
  --job-name="FX_validate_${noise}_${SKY}" \
  "$SLURM_SCRIPT" LOGS r_analysis validate \
  -r "kmeans_noise${noise_percent}_${SKY}" -t "FURAX $noise_text $SKY" \
  -ird RESULTS --noise-ratio "$noise" --no-tex --scales 1e-2 1e-3

sbatch --dependency="afterany:${deps}" \
  "${SBATCH_ARGS[@]}" \
  --job-name="FX_cache_${noise}_${SKY}" \
  "$SLURM_SCRIPT" LOGS r_analysis cache -r kmeans -ird RESULTS -mi 2000 --no-tex
#!/bin/bash
##############################################################################################################################
# USAGE: sbatch myscript.sh <RUN_NAME> python <script.py> [args...]
# EXAMPLE: sbatch myscript.sh my_experiment_v1 python train.py --lr 0.01
##############################################################################################################################
#SBATCH --job-name=Likelihoods
#SBATCH --cpus-per-task=3
#SBATCH --nodes=1
#SBATCH --tasks-per-node=1
#SBATCH --hint=nomultithread
#SBATCH --time=02:00:00
#SBATCH --output=%x_%N.out
#SBATCH --error=%x_%N.err
#SBATCH --parsable
# Allocation size as reported by Slurm. The tasks-per-node value is used as
# the number of GPUs per node when computing the total.
num_nodes=$SLURM_JOB_NUM_NODES
num_gpu_per_node=$SLURM_NTASKS_PER_NODE
nb_gpus=$(( num_nodes * num_gpu_per_node ))

# Drop modules that were loaded interactively and inherited by default.
module purge

# Pick the architecture module and environment matching the partition.
echo "Job partition: $SLURM_JOB_PARTITION"
if [[ "$SLURM_JOB_PARTITION" == "gpu_p5" ]]; then
  module load arch/a100
  source SOME_ENV
  gpu_name=a100
elif [[ "$SLURM_JOB_PARTITION" == "gpu_p6" ]]; then
  # Modules were already purged above, so only the arch module is loaded here.
  module load arch/h100
  source SOME_ENV
  gpu_name=h100
else
  # Any other partition falls back to the default (v100) environment.
  source SOME_ENV
  gpu_name=v100
fi

echo "The number of nodes allocated for this job is: $num_nodes"
echo "The number of GPUs allocated for this job is: $nb_gpus"

# slaunch only sends its completion e-mail when this is 1 (and EMAIL is set).
export SEND_EMAIL=1

# JAX persistent compilation cache settings; the cache lives under $WORK.
export JAX_COMPILATION_CACHE_DIR=$WORK/jax_cache
export JAX_PERSISTENT_CACHE_ENABLE_XLA_CACHES=xla_gpu_per_fusion_autotune_cache_dir
export JAX_PERSISTENT_CACHE_MIN_ENTRY_SIZE_BYTES=-1
export JAX_PERSISTENT_CACHE_MIN_COMPILE_TIME_SECS=0

module load texlive
# Print the base name of the program being launched, without a ".py" suffix.
# When the first word is the interpreter ("python" or "python3") the second
# word is used instead, so "python main.py" yields "main".
get_script_name() {
  local target
  case "$1" in
    python | python3) target=$2 ;;
    *) target=$1 ;;
  esac
  basename "$target" .py
}
# Build a filename-friendly label from the given arguments.
# Spaces, dots, slashes and dashes become underscores, and runs of
# underscores are collapsed. Labels of up to 100 characters are printed
# unchanged; longer ones are cut to their first 50 characters followed by "_"
# and the first 8 hex digits of the MD5 of the space-joined arguments.
get_args_slug() {
  local joined="$*"
  local slug digest
  slug=$(echo "$joined" | tr -s ' ./-' '_')
  if (( ${#slug} > 100 )); then
    # Too long for a file name: keep a readable prefix plus a short hash.
    digest=$(echo "$joined" | md5sum | cut -c1-8)
    printf '%s\n' "${slug:0:50}_${digest}"
    return
  fi
  printf '%s\n' "$slug"
}
# Run a command under "srun nsys profile" and store its logs and report.
# Arguments: $1  - run name, used as the top-level output directory
#            $2… - command to profile and its arguments
# Outputs:   <run>/prof_traces/<slug>.out  command line header + stdout
#            <run>/prof_traces/<slug>.err  stderr (appended)
#            <run>/out_prof/report_<slug>_rank…  nsys report, one per rank
# Returns:   1 on usage error; otherwise 0, because failures of the profiled
#            command are deliberately ignored (|| true).
function plaunch() {
  if [ $# -lt 2 ]; then
    echo "Usage: plaunch <RUN_NAME> <command> [arguments...]"
    return 1
  fi

  # 1. Extract the run name (first arg); the rest is the command line.
  local run_name=$1
  shift

  # 2. Derive a file-name slug from the command line.
  local args_slug
  args_slug=$(get_args_slug "$@")

  # 3. Define directories (RUN_NAME is the folder).
  local output_dir="$run_name/prof_traces"
  local report_dir="$run_name/out_prof"
  mkdir -p "$output_dir" "$report_dir"

  local out_file="$output_dir/${args_slug}.out"
  local err_file="$output_dir/${args_slug}.err"
  # %q{SLURM_PROCID} is left literal for nsys, which substitutes the value of
  # that environment variable in the output file name.
  local report_file="$report_dir/report_${args_slug}_rank%q{SLURM_PROCID}"

  # Log the full command for traceability since the file name may be truncated.
  echo "Full Command: $*" > "$out_file"
  echo "----------------------------------------" >> "$out_file"

  # 4. Run NSYS.
  srun nsys profile -t cuda,nvtx,osrt,mpi -o "$report_file" "$@" \
    >> "$out_file" 2>> "$err_file" || true
}
# Run a command under srun, keep its logs, and optionally e-mail them.
# Globals:   SEND_EMAIL (read) - mail is sent only when this equals 1
#            EMAIL (read)      - recipient; no mail when empty or unset
#            SLURM_JOB_ID, SLURM_JOB_NAME (read) - used in the mail text
# Arguments: $1  - run name, used as the top-level output directory
#            $2… - command to run and its arguments
# Outputs:   <run>/traces/<slug>.out  command line header + stdout
#            <run>/traces/<slug>.err  stderr (appended)
# Returns:   1 on usage error, otherwise the exit status of srun.
function slaunch() {
  if [ $# -lt 2 ]; then
    echo "Usage: slaunch <RUN_NAME> <command> [arguments...]"
    return 1
  fi

  # 1. Extract the run name (first arg); the rest is the command line.
  local run_name=$1
  shift

  # 2. Identify the script name and create a slug from the arguments.
  local script_name args_slug
  script_name=$(get_script_name "$@")
  args_slug=$(get_args_slug "$@")

  # 3. Define directories (RUN_NAME is the folder).
  local output_dir="$run_name/traces"
  mkdir -p "$output_dir"
  local out_file="$output_dir/${args_slug}.out"
  local err_file="$output_dir/${args_slug}.err"

  # Log the full command for traceability since the file name may be truncated.
  echo "Full Command: $*" > "$out_file"
  echo "----------------------------------------" >> "$out_file"

  # 4. Run the job, remembering its exit status for the mail and the caller.
  local rc=0
  srun "$@" >> "$out_file" 2>> "$err_file" || rc=$?

  # ---- conditional email ----
  # An unset or non-numeric SEND_EMAIL simply means "do not send".
  if [ "${SEND_EMAIL:-0}" -eq 1 ] 2> /dev/null && [ -n "${EMAIL:-}" ]; then
    {
      echo "SLURM job $SLURM_JOB_ID finished."
      echo "Job name: $SLURM_JOB_NAME"
      echo "Run label: $run_name"
      echo "Args: $*"
      echo "Exit code: $rc"
      echo "Output directory: $output_dir"
      echo
      echo "Attaching:"
      echo " $out_file"
      echo " $err_file"
      echo
      # Encode the files that were actually written (named after the slug);
      # the script name is only the attachment's display name.
      uuencode "$out_file" "${script_name}.out"
      uuencode "$err_file" "${script_name}.err"
    } | mail -s "[JEAN-ZAY] Job finished: [$SLURM_JOB_NAME] $run_name (rc=$rc)" "$EMAIL"
  fi

  # Propagate the command's status so a failed run is not reported as success.
  return "$rc"
}
# Trace every command as it is executed
set -x
# Hand the script's arguments (run name first, then the command) to slaunch
slaunch "$@"
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment