Last active
November 18, 2025 03:17
-
-
Save tomsing1/2a12738a5317ce8a397bbd1fe9f02443 to your computer and use it in GitHub Desktop.
Experimenting with the clustermq R package on a local SLURM cluster set up via docker
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| ##-------- Setting up slurm-docker-cluster | |
| # clone slurm-docker-cluster repository | |
| git clone https://github.com/giovtorres/slurm-docker-cluster.git | |
| cd slurm-docker-cluster | |
| # build docker images and start containers | |
| make up | |
| make status | |
| make test | |
| make down | |
| ##-------- Adding R and clustermq | |
| # create a second Dockerfile that installs R and clustermq | |
| cat > Dockerfile.r << 'EOF' | |
| ARG SLURM_VERSION | |
| FROM slurm-docker-cluster:${SLURM_VERSION} | |
| USER root | |
| RUN dnf -y install epel-release \ | |
| && dnf -y install R-base zeromq-devel \ | |
| && dnf clean all | |
| RUN cat > /usr/lib64/R/etc/Rprofile.site <<'REOF' | |
| options(repos = c(CRAN = sprintf("https://packagemanager.posit.co/cran/latest/bin/linux/rhel9-%s/%s", | |
| R.version["arch"], substr(getRversion(), 1, 3)))) | |
| REOF | |
| RUN R -q -e 'install.packages(c("clustermq", "callr"))' | |
| EOF | |
| # create a docker-compose.override.r.yml file that points the three node | |
| # services at images built from Dockerfile.r | |
| # NOTE: the indentation inside the heredoc is significant. It nests the build | |
| # settings under the x-node-build anchor and the nodes under services. | |
| cat > docker-compose.override.r.yml << 'EOF' | |
| x-node-build: &node-build | |
|   context: . | |
|   dockerfile: Dockerfile.r | |
|   args: | |
|     SLURM_VERSION: ${SLURM_VERSION:-25.05.3} | |
|     BASE_IMAGE: slurm-docker-cluster:${SLURM_VERSION:-25.05.3} | |
| services: | |
|   slurmctld: | |
|     image: slurmctld-r:${SLURM_VERSION:-25.05.3} | |
|     build: *node-build | |
|   c1: | |
|     image: c1-r:${SLURM_VERSION:-25.05.3} | |
|     build: *node-build | |
|   c2: | |
|     image: c2-r:${SLURM_VERSION:-25.05.3} | |
|     build: *node-build | |
| EOF | |
| # build the new docker images defined by the override | |
| docker compose \ | |
| -f docker-compose.yml \ | |
| -f docker-compose.override.r.yml \ | |
| build slurmctld c1 c2 | |
| docker images | |
| # start the cluster, using the new images | |
| docker compose \ | |
| -f docker-compose.yml \ | |
| -f docker-compose.override.r.yml \ | |
| up -d | |
| # check that the installation was successful on all 3 nodes by printing the | |
| # clustermq version that R reports inside each container | |
| # ($NODE is quoted so the expansion is never word-split or globbed) | |
| for NODE in slurmctld c1 c2 | |
| do | |
|   echo ">>> Node $NODE" | |
|   docker exec -it "$NODE" R --vanilla -s -e \ | |
|     "paste('clustermq', installed.packages()['clustermq', 'Version'])" | |
| done | |
| ##--------- Interactively submitting jobs with clustermq | |
| make shell | |
| # create the SLURM template file for clustermq | |
| # see: https://mschubert.github.io/clustermq/articles/userguide.html#slurm | |
| cat > /data/slurm.tmpl << 'EOF' | |
| #!/bin/sh | |
| #SBATCH --job-name={{ job_name }} | |
| #SBATCH --partition=normal | |
| #SBATCH --output={{ log_file | /dev/null }} | |
| #SBATCH --error={{ log_file | /dev/null }} | |
| #SBATCH --mem-per-cpu={{ memory | 4096 }} | |
| #SBATCH --array=1-{{ n_jobs }} | |
| #SBATCH --cpus-per-task={{ cores | 1 }} | |
| ulimit -v $(( 1024 * {{ memory | 4096 }} )) | |
| CMQ_AUTH={{ auth }} R --no-save --no-restore -e 'clustermq:::worker("{{ master }}")' | |
| EOF | |
| # start an interactive R session | |
| R --vanilla | |
| # attach the clustermq package | |
| library(clustermq) | |
| # first, using the multiprocess backend (i.e. not using SLURM) | |
| options( | |
| clustermq.scheduler = "multiprocess" | |
| ) | |
| res <- Q( | |
| fun = function(i) paste(Sys.info()[["nodename"]], i), | |
| i = 1:10, | |
| n_jobs = 2, | |
| timeout = 60 | |
| ) | |
| print(res) | |
| # second, using the SLURM backend | |
| options( | |
| clustermq.scheduler = "slurm", | |
| clustermq.template = "/data/slurm.tmpl" | |
| ) | |
| # Sys.sleep() stretches each call to about a second, which leaves enough time | |
| # to run `make jobs` in a separate shell and watch the jobs appear in the | |
| # execution queue | |
| test_fun <- function(i) { | |
|   Sys.sleep(1) | |
|   node_name <- Sys.info()[["nodename"]] | |
|   paste(node_name, i) | |
| } | |
| res <- Q( | |
| fun = test_fun, | |
| i = 1:10, | |
| n_jobs = 2, # maps to SLURM array size | |
| memory = 1000, # up to 1000 MB per CPU | |
| timeout = 60 | |
| ) | |
| print(res) | |
| q() | |
| exit | |
| ##--------- cleanup | |
| make down | |
| make clean | |
| docker images | |
| docker rmi slurm-docker-cluster:25.05.3 c1-r:25.05.3 c2-r:25.05.3 slurmctld-r:25.05.3 | |
| docker buildx prune --all --force |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment