@ConnorBaker, created April 11, 2025
Various Azure RDMA NFS scripts
terraform {
  required_providers {
    azurerm = {
      source  = "hashicorp/azurerm"
      version = "~> 4.4"
    }
  }
}

provider "azurerm" {
  subscription_id = "REMOVED"
  use_cli         = true
  features {}
}

resource "azurerm_resource_group" "nixos-rg" {
  location = "East US"
  name     = "nixos-rg"
}
# NOTE: It is not enough for machines to be in the same proximity placement group for RDMA to work;
# they must also be in the same scale set or availability set.
# https://learn.microsoft.com/en-us/answers/questions/513090/azure-connectivity-problems-with-infiniband-rdma-h
resource "azurerm_proximity_placement_group" "nixos-ppg" {
location = azurerm_resource_group.nixos-rg.location
name = "nixos-ppg"
resource_group_name = azurerm_resource_group.nixos-rg.name
}
resource "azurerm_availability_set" "nixos-availability-set" {
location = azurerm_resource_group.nixos-rg.location
name = "nixos-availability-set"
resource_group_name = azurerm_resource_group.nixos-rg.name
proximity_placement_group_id = azurerm_proximity_placement_group.nixos-ppg.id
}
resource "azurerm_virtual_network" "nixos-network" {
address_space = ["10.0.0.0/16"]
location = azurerm_resource_group.nixos-rg.location
name = "nixos-network"
resource_group_name = azurerm_resource_group.nixos-rg.name
}
resource "azurerm_subnet" "nixos-subnet" {
address_prefixes = ["10.0.2.0/24"]
name = "nixos-subnet"
resource_group_name = azurerm_resource_group.nixos-rg.name
virtual_network_name = azurerm_virtual_network.nixos-network.name
}
resource "azurerm_network_security_group" "nixos-network-sg" {
location = azurerm_resource_group.nixos-rg.location
name = "nixos-network-sg"
resource_group_name = azurerm_resource_group.nixos-rg.name
security_rule {
name = "allow-inbound"
priority = 1001
direction = "Inbound"
access = "Allow"
protocol = "*"
source_port_range = "*"
destination_port_range = "*"
source_address_prefix = "*"
destination_address_prefix = "*"
}
}
resource "azurerm_public_ip_prefix" "nixos-public-ip-prefix" {
name = "nixos-pip"
location = azurerm_resource_group.nixos-rg.location
resource_group_name = azurerm_resource_group.nixos-rg.name
ip_version = "IPv4"
}
# Create the machines.
# One machine, nixos-vm-little, is set up to perform little builds (max-jobs = num cores, cores per job = 2, oversubscribed).
# One machine, nixos-vm-big, is set up to perform big builds (max-jobs = 4, cores per job = num cores / 3, oversubscribed).
# One machine, nixos-vm-store, is set up as the remote store: its Nix store lives in memory and is accessed over NFS by the other machines.
# Both nixos-vm-little and nixos-vm-big are set up as remote builders for nixos-vm-store, and both mount nixos-vm-store's Nix store over NFS over RDMA.
# SSH into nixos-vm-store and run nixpkgs-review with max-jobs set to zero, forcing it to use the remote builders for everything.
resource "azurerm_linux_virtual_machine_scale_set" "nixos-vmss" {
admin_username = "azureuser"
location = azurerm_resource_group.nixos-rg.location
name = "nixos-vmss"
resource_group_name = azurerm_resource_group.nixos-rg.name
sku = "Standard_HB120rs_v3"
instances = 2
eviction_policy = "Delete"
priority = "Spot"
proximity_placement_group_id = azurerm_proximity_placement_group.nixos-ppg.id
  admin_ssh_key {
    username = "azureuser"
    # NOTE: Terraform's file() does not expand "~", so expand the path explicitly.
    public_key = file(pathexpand("~/.ssh/azure_ed25519.pub"))
  }
  network_interface {
    name    = "nixos-vmss-nic"
    primary = true

    # TODO: Is it accelerated networking which is causing the RDMA to fail with NFS?
    # NOPE! Still doesn't work. Maybe I need to use an older version of Ubuntu?
    enable_accelerated_networking = false
    network_security_group_id     = azurerm_network_security_group.nixos-network-sg.id

    ip_configuration {
      name      = "nixos-vmss-nic-ip"
      primary   = true
      subnet_id = azurerm_subnet.nixos-subnet.id

      public_ip_address {
        name                = "nixos-vmss-nic-public-ip"
        public_ip_prefix_id = azurerm_public_ip_prefix.nixos-public-ip-prefix.id
      }
    }
  }

  os_disk {
    caching              = "ReadOnly"
    storage_account_type = "Premium_LRS"

    # Use an ephemeral disk for the OS disk
    diff_disk_settings {
      option    = "Local"
      placement = "CacheDisk"
    }
  }

  source_image_reference {
    offer     = "ubuntu-24_04-lts"
    publisher = "Canonical"
    sku       = "server"
    version   = "latest"
  }
}
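
Once the plan is applied, the public addresses handed out from the IP prefix above can be listed for SSH access with the Azure CLI; a minimal sketch, assuming the resource names used in the Terraform above:

az vmss list-instance-public-ips \
    --resource-group nixos-rg \
    --name nixos-vmss \
    --query "[].ipAddress" \
    --output tsv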
#!/usr/bin/env bash
# General script used to set up an Azure HPC instance with Infiniband to use RDMA.
# I do not remember how well this works.
# Set up the UDEV rule for the Mellanox device
# NOTE: Assumes only a single IB interface exists.
echo 'SUBSYSTEM=="net", ACTION=="add", ATTR{dev_id}=="0x0", ATTR{type}=="32", NAME="ib0"' \
| sudo tee /etc/udev/rules.d/99-rename-ib.rules \
&& sudo udevadm control --reload-rules
# Install the Mellanox OFED drivers
# On names for the mellanox interfaces:
# https://techcommunity.microsoft.com/t5/azure-compute-blog/accelerated-networking-on-hb-hc-hbv2-hbv3-and-ndv2/ba-p/2067965
# For more recent releases of the installer, check:
# https://network.nvidia.com/products/infiniband-drivers/linux/mlnx_ofed/
# NOTE: Have not found nfsrdma driver to be available via the DOCA installer
# NOTE: Cannot use --with-nvmf on Ubuntu 24.04 -- install fails.
wget "https://content.mellanox.com/ofed/MLNX_OFED-24.07-0.6.1.0/MLNX_OFED_LINUX-24.07-0.6.1.0-ubuntu24.04-x86_64.tgz" \
&& tar -xvf MLNX_OFED_LINUX-24.07-0.6.1.0-ubuntu24.04-x86_64.tgz \
&& pushd MLNX_OFED_LINUX-24.07-0.6.1.0-ubuntu24.04-x86_64 \
&& sudo ./mlnxofedinstall --with-nfsrdma --force \
&& popd
# Load the drivers and restart Mellanox Software Tools drivers
sudo /etc/init.d/openibd restart \
&& sudo mst restart
# Assign an IP address to the IB interface and bring it up
# NOTE: 10.1.0.4 to match azure-store address on 10.0.2.4
sudo ip addr add 10.1.0.5/16 dev ib0 \
&& sudo ip link set ib0 up
# Install nfs for server
sudo apt install -y nfs-kernel-server
# Kernel modules we need loaded for NFS over RDMA.
# TODO: keep only the server module (svcrdma) or the client module (xprtrdma) depending on the machine's role.
cat <<EOF | sudo tee /etc/modules-load.d/rdma.conf
svcrdma
xprtrdma
EOF
# Reload the modules
sudo systemctl restart systemd-modules-load.service
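# Sanity check: the RDMA transport modules should now be loaded
# (recent kernels expose svcrdma/xprtrdma via the rpcrdma module).
lsmod | grep -E 'rpcrdma|svcrdma|xprtrdma'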
# Format and mount nvme0n1 and nvme1n1 as EXT4
sudo mkfs.ext4 /dev/nvme0n1
sudo mkfs.ext4 /dev/nvme1n1
sudo mkdir -p /drive0 /drive1 /drive0_nfs
sudo mount -t ext4 -o sync,rw /dev/nvme0n1 /drive0
sudo mount -t ext4 -o sync,rw /dev/nvme1n1 /drive1
# Create the NFS export directory
sudo mkdir -p /nix
sudo mount -t tmpfs -o size=300G tmpfs /nix
# Expose /nix via NFS
# https://docs.redhat.com/en/documentation/red_hat_enterprise_linux/9/html/configuring_and_using_network_file_services/deploying-an-nfs-server_configuring-and-using-network-file-services#services-required-on-an-nfs-server_deploying-an-nfs-server
# https://infohub.delltechnologies.com/en-us/l/dell-technologies-powerscale-onefs-best-practices-for-davinci-resolve/linux-settings/
echo "/drive0 10.1.0.5(fsid=1,rw,sync,no_wdelay,insecure,no_root_squash,no_subtree_check)" | sudo tee -a /etc/exports
# https://wiki.debian.org/NFSServerSetup
cat <<EOF | sudo tee /etc/default/nfs-kernel-server
RPCNFSDOPTS="-N 2 -N 3 -U --rdma"
RPCMOUNTDOPTS="--manage-gids -N 2 -N 3"
EOF
# https://wiki.debian.org/NFSServerSetup
cat <<EOF | sudo tee /etc/default/nfs-common
NEED_STATD=no
NEED_IDMAPD=yes
NEED_GSSD=no
EOF
# Enable RDMA for NFS
# https://dzone.com/articles/optimizing-infiniband-bandwidth-utilization
# Settings for NFSv4 only:
# https://docs.redhat.com/en/documentation/red_hat_enterprise_linux/7/html/storage_administration_guide/nfs-serverconfig#nfs4-only
cat <<EOF | sudo tee /etc/nfs.conf.d/rdma.conf
[mountd]
threads=64
[nfsd]
threads=64
udp=n
tcp=y
vers3=n
vers4=y
vers4.0=n
vers4.1=n
vers4.2=y
rdma=y
rdma-port=20049
EOF
# Restart the NFS server
sudo systemctl restart nfs-server
sudo exportfs -ra
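# Sanity check: the export should be listed and nfsd should have an RDMA listener on port 20049.
sudo exportfs -v
sudo cat /proc/fs/nfsd/portlist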
# On consumers:
# TODO: UDEV rules for consistent naming of the IB interface
sudo ip addr add 10.1.0.5/16 dev ib0
# Bring the interface up
sudo ip link set ib0 up
# Install nfs for client
sudo apt install -y nfs-common
# TODO:
# https://learn.microsoft.com/en-us/azure/azure-netapp-files/performance-linux-concurrency-session-slots#nfsv41
# See "Can increasing session slots increase overall performance?" in the following document:
# https://www.netapp.com/media/10720-tr-4067.pdf
echo "options nfs max_session_slots=180" | sudo tee /etc/modprobe.d/nfsclient.conf
# Mount the NFS share
sudo mkdir -p /nix
# TODO: Optimize parameters
# TODO: nconnect not supported -- is the VAST NFS client required for nconnect option with RDMA?
# TODO: Can't set namelen=1023, not a recognized option?
# Default settings when rsize and wsize are not set (the maximum value, 1MB, is used):
sudo mount -t nfs4 -o rw,sync,noatime,vers=4.2,rsize=1048576,wsize=1048576,namlen=255,acregmin=0,acregmax=0,acdirmin=0,acdirmax=0,hard,noac,proto=rdma,port=20049,nconnect=16,timeo=600,retrans=2,sec=sys,lookupcache=none,local_lock=none 10.1.0.4:/nix /mounted
sudo mount -t nfs4 -o rw,sync,noatime,vers=4.2,rsize=1048576,wsize=1048576,namlen=255,acregmin=0,acregmax=0,acdirmin=0,acdirmax=0,hard,noac,proto=rdma,port=20049,nconnect=16,timeo=600,retrans=2,sec=sys,lookupcache=none,local_lock=none 10.1.0.4:/drive0 /drive0_nfs
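# Sanity check: confirm the mounts actually negotiated proto=rdma and that the
# session-slot override took effect (the nfs module parameter is only visible
# once the module is loaded, i.e. after the first mount).
nfsstat -m
cat /sys/module/nfs/parameters/max_session_slots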

Notes

Setup

az account set --subscription "Azure subscription 1"
az configure --defaults group=simpleLinuxTestVMResourceGroup location=eastus
az group create --resource-group simpleLinuxTestVMResourceGroup --location eastus

Provision

az deployment group create \
    --name simpleLinuxTestVMDeployment \
    --template-file main.bicep \
    --parameters \
        adminUsername=azure \
        sshPublicKey=@~/.ssh/azure_ed25519.pub \
    --output json \
    --verbose

Cleanup

TODO: Was unable to delete by name -- try again to see if it works when the deployment actually uses that name.

TODO: Why didn't the deployment show up when I used az deployment group list?

az deployment group delete \
    --name simpleLinuxTestVMDeployment \
    --output json \
    --verbose
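
As a fallback, deleting the resource group removes everything the deployment created regardless of the deployment name (destructive to anything else in the group):

az group delete \
    --name simpleLinuxTestVMResourceGroup \
    --yes \
    --no-wait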

TODO: big.LITTLE setup -- given the cost per instance and the overhead of the remote builder protocol's default implementation, it makes more sense to have two nodes, each an HB120rs_v3.

LITTLE will have:

  • a block on building things requiring the big-parallel system feature
  • cores = 4
  • max-jobs = 40 (33% over-provisioning) to try to account for serial portions of jobs

big will have:

  • big-parallel system feature
  • cores = 80
  • max-jobs = 2 (33% over-provisioning) to try to account for serial portions of jobs

Not all jobs which should be marked as requiring big-parallel are currently marked as such. This is a TODO.
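
A minimal sketch of how this plan could map onto the store machine's builder list; the hostnames, user, and key path are assumptions, and the columns are URI, system, SSH key, max-jobs, speed factor, supported features, mandatory features, public host key:

    # /etc/nix/machines on the store machine (hostnames and key path are hypothetical)
    ssh://azureuser@nixos-vm-little x86_64-linux /home/azureuser/.ssh/azure_ed25519 40 1 - - -
    ssh://azureuser@nixos-vm-big    x86_64-linux /home/azureuser/.ssh/azure_ed25519 2  1 big-parallel - -

On the store itself, max-jobs = 0 in nix.conf forces every build onto these entries; cores = 4 and cores = 80 would be set locally on LITTLE and big respectively.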

#!/usr/bin/env bash
# A wrapper around the Bicep deployment (main.bicep).
set -euo pipefail

declare -r AZURE_INSTANCE_USER="azure"
declare -r AZURE_INSTANCE_SIZE="Standard_HB120rs_v3"
declare -r AZURE_SSH_PUBLIC_KEY_PATH="$HOME/.ssh/azure_ed25519.pub"

azureSetup() {
  az account set --subscription "Azure subscription 1"
  az configure --defaults group=simpleLinuxTestVMResourceGroup location=eastus
  az group create --resource-group simpleLinuxTestVMResourceGroup --location eastus
}

provisionInstances() {
  az deployment group create \
    --name simpleLinuxTestVMDeployment \
    --template-file main.bicep \
    --parameters \
      adminUsername="$AZURE_INSTANCE_USER" \
      sshPublicKey=@"$AZURE_SSH_PUBLIC_KEY_PATH" \
      vmSize="$AZURE_INSTANCE_SIZE" \
    --output json \
    --verbose |
    jq -crS .
}

main() {
  azureSetup
  provisionInstances
}

main
#!/usr/bin/env bash
# General script used to set up HB120rs_v3 instances; I don't remember whether it conflicts with mellanox.sh.
set -euo pipefail

declare -r USER="azureuser"
declare -r USER_HOME="/home/$USER"
declare -r BTRFS_BLOCK_DEVICE="/dev/nvme0n1"
declare -r BTRFS_MOUNT="/mnt/fs"

_log() {
  if (($# != 2)); then
    echo "_log: missing function name and message" >&2
    exit 1
  fi
  echo "[$(date)][${1:?}] ${2:?}"
}
installPrerequisites() {
  log() { _log "installPrerequisites" "$@"; }
  local -ar packages=(
    # Btrfs
    "btrfs-progs"
    "gdisk"
    # bpftune
    # See https://github.com/oracle/bpftune?tab=readme-ov-file#getting-started
    "make"
    "libbpf1"
    "libbpf-dev"
    "libcap-dev"
    "linux-tools-common" # Provides bpftool
    "libnl-route-3-dev"  # TODO: This wasn't one of the dependencies listed in the README
    "libnl-3-dev"
    "clang"
    "llvm"
    "python3-docutils"
    # ZRAM
    "linux-modules-extra-azure"
    "zstd"
    # Generally required
    "git"
    "gpg"
    "inetutils-ping"
  )
  log "Updating apt"
  sudo apt-get update
  log "Installing packages: ${packages[*]}"
  sudo apt-get install -y "${packages[@]}"
}
setupZramSwap() {
  log() { _log "setupZramSwap" "$@"; }
  local -r swapSize="1TB"
  log "Enabling zram module"
  sudo modprobe zram num_devices=1
  log "Creating zram0 device"
  sudo zramctl --find --size "$swapSize" --algorithm zstd
  log "Enabling zram0 device"
  sleep 2
  sudo mkswap /dev/zram0
  sleep 2
  sudo swapon --priority -2 /dev/zram0
}
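# To confirm the zram swap afterwards (read-only checks to run by hand):
#   zramctl
#   swapon --show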
setupBtrfsMntFsVolume() {
  log() { _log "setupBtrfsMntFsVolume" "$@"; }
  local -ar disks=(
    "/dev/nvme0n1"
    "/dev/nvme1n1"
  )
  log "Creating Btrfs volume"
  for disk in "${disks[@]}"; do
    log "Processing $disk"
    log "Wiping disk"
    sudo sgdisk --zap-all "$disk"
    log "Creating GPT"
    sudo parted --script "$disk" mklabel gpt mkpart primary 0% 100%
  done
  log "Waiting for device nodes to appear"
  sudo udevadm settle
  log "Formatting disks"
  sudo mkfs.btrfs --force --label fs --data raid0 "${disks[@]}"
  log "Mounting disks"
  sudo mkdir -p "$BTRFS_MOUNT"
  sudo mount -t btrfs -o defaults,noatime "$BTRFS_BLOCK_DEVICE" "$BTRFS_MOUNT"
}

createBtrfsMntFsSubvolume() {
  log() { _log "createBtrfsMntFsSubvolume" "$@"; }
  if (($# != 2)); then
    log "!!! missing subvolume name and path !!!" >&2
    exit 1
  fi
  local -r name="$1"
  local -r mountPoint="$2"
  log "Creating subvolume $name"
  sudo btrfs subvolume create "$BTRFS_MOUNT/$name"
  log "Mounting subvolume $name"
  sudo mkdir -p "$mountPoint" "$BTRFS_MOUNT/$name"
  sudo mount -t btrfs -o defaults,noatime,subvol="$name" "$BTRFS_BLOCK_DEVICE" "$mountPoint"
}

setupBtrfsMntFsSubvolumes() {
  log() { _log "setupBtrfsMntFsSubvolumes" "$@"; }
  local -ar subvolumeNames=(
    "nix"
    "tmp"
    "working"
  )
  log "Creating Btrfs subvolumes"
  for name in "${subvolumeNames[@]}"; do
    createBtrfsMntFsSubvolume "$name" "/$name"
  done
  log "Fixing permissions on /tmp"
  sudo chmod -R 1777 "/tmp"
  log "Fixing permissions on /working"
  sudo chown -R "$USER:$USER" "/working"
  log "Setting up an OverlayFS mount for $USER_HOME on /working"
  local -r lowerDir="/home/.$USER"
  local -r upperDir="/working/$USER"
  local -r workDir="/working/.$USER"
  sudo mv "$USER_HOME" "$lowerDir"
  sudo mkdir -p "$upperDir" "$workDir" "$USER_HOME"
  sudo mount -t overlay overlay -o lowerdir="$lowerDir",upperdir="$upperDir",workdir="$workDir" "$USER_HOME"
  log "Setting permissions on $USER_HOME"
  sudo chown -R "$USER:$USER" "$USER_HOME"
}
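# To inspect the resulting layout afterwards (read-only checks to run by hand):
#   sudo btrfs filesystem usage /mnt/fs
#   sudo btrfs subvolume list /mnt/fs
#   findmnt -t btrfs,overlay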
setupNix() {
  log() { _log "setupNix" "$@"; }
  local -r NIX_CONFIG="/etc/nix/nix.conf"
  local -r NIX_INSTALLER="https://install.determinate.systems/nix"
  local -r NIX_VERSION="2.25.3"
  local -r NIX_PLATFORM="x86_64-linux"
  local -r NIX_PACKAGE_URL="https://releases.nixos.org/nix/nix-$NIX_VERSION/nix-$NIX_VERSION-$NIX_PLATFORM.tar.xz"
  local -ra extraConfig=(
    "accept-flake-config = true"
    "allow-import-from-derivation = false"
    "auto-allocate-uids = true"
    "builders-use-substitutes = true"
    "connect-timeout = 10"
    "experimental-features = auto-allocate-uids cgroups flakes mounted-ssh-store nix-command"
    "fsync-metadata = false"
    "http-connections = 128"
    "log-lines = 100"
    "max-substitution-jobs = 64"
    "narinfo-cache-negative-ttl = 0"
    "sandbox-fallback = false"
    "substituters = https://cache.nixos.org"
    "trusted-public-keys = cache.nixos.org-1:6NCHdD59X431o0gWypbMrAURkbJ16ZPMQFGspcDShjY="
    "trusted-users = root runner @wheel"
    "use-cgroups = true"
    "use-xdg-base-directories = true"
    "warn-dirty = false"
  )
  log "Installing Nix $NIX_VERSION for $NIX_PLATFORM"
  curl --proto '=https' --tlsv1.2 -sSf -L "$NIX_INSTALLER" |
    sh -s -- install --no-confirm --nix-package-url "$NIX_PACKAGE_URL"
  log "Overriding defaults in Nix configuration"
  sudo sed \
    -e '/auto-optimise-store =/d' \
    -e '/experimental-features =/d' \
    -e '/upgrade-nix-store-path-url =/d' \
    -i "$NIX_CONFIG"
  log "Adding extra Nix configuration"
  for line in "${extraConfig[@]}"; do
    echo "$line" | sudo tee -a "$NIX_CONFIG"
  done
  log "Reloading Nix configuration"
  sudo systemctl restart nix-daemon
}
setupKernelVmParameters() {
  log() { _log "setupKernelVmParameters" "$@"; }
  # Taken from: https://github.com/ConnorBaker/nixos-configs/blob/e6d3e54ed9d257bd148a5bfb57dc476570b5d9f0/modules/zram.nix
  local -ra vmParameters=(
    # https://wiki.archlinux.org/title/Zram#Optimizing_swap_on_zram
    "vm.watermark_boost_factor=0"
    "vm.watermark_scale_factor=125"
    "vm.page-cluster=0"
    # https://github.com/pop-os/default-settings/blob/master_noble/etc/sysctl.d/10-pop-default-settings.conf
    "vm.swappiness=190" # Strong preference for ZRAM
    "vm.max_map_count=2147483642"
    # Higher values since these machines are used mostly as remote builders
    "vm.dirty_ratio=80"
    "vm.dirty_background_ratio=50"
  )
  log "Setting up kernel VM parameters"
  for param in "${vmParameters[@]}"; do
    sudo sysctl -w "$param"
  done
}

setupKernelNetParameters() {
  log() { _log "setupKernelNetParameters" "$@"; }
  # Taken from: https://github.com/ConnorBaker/nixos-configs/blob/e6d3e54ed9d257bd148a5bfb57dc476570b5d9f0/modules/networking.nix
  local -ri KB=1024
  local -ri MB=$((KB * KB))
  # Memory settings
  local -ri memMin=$((8 * KB))
  local -ri rmemDefault=$((128 * KB))
  local -ri wmemDefault=$((16 * KB))
  local -ri memMax=$((16 * MB))
  local -ra netParameters=(
    # Enable BPF JIT for better performance
    "net.core.bpf_jit_enable=1"
    "net.core.bpf_jit_harden=0"
    # Change the default queueing discipline to cake and the congestion control algorithm to BBR
    "net.core.default_qdisc=cake"
    "net.ipv4.tcp_congestion_control=bbr"
    # Largely taken from https://wiki.archlinux.org/title/sysctl and
    # https://github.com/redhat-performance/tuned/blob/master/profiles/network-throughput/tuned.conf#L10
    "net.core.somaxconn=$((8 * KB))"
    "net.core.netdev_max_backlog=$((16 * KB))"
    "net.core.optmem_max=$((64 * KB))"
    # RMEM
    "net.core.rmem_default=$rmemDefault"
    "net.core.rmem_max=$memMax"
    "net.ipv4.tcp_rmem=$memMin $rmemDefault $memMax"
    "net.ipv4.udp_rmem_min=$memMin"
    # WMEM
    "net.core.wmem_default=$wmemDefault"
    "net.core.wmem_max=$memMax"
    "net.ipv4.tcp_wmem=$memMin $wmemDefault $memMax"
    "net.ipv4.udp_wmem_min=$memMin"
    # General TCP
    "net.ipv4.tcp_fastopen=3"
    "net.ipv4.tcp_fin_timeout=10"
    "net.ipv4.tcp_keepalive_intvl=10"
    "net.ipv4.tcp_keepalive_probes=6"
    "net.ipv4.tcp_keepalive_time=60"
    "net.ipv4.tcp_max_syn_backlog=$((8 * KB))"
    "net.ipv4.tcp_max_tw_buckets=2000000"
    "net.ipv4.tcp_mtu_probing=1"
    "net.ipv4.tcp_slow_start_after_idle=0"
    "net.ipv4.tcp_tw_reuse=1"
  )
  log "Setting up kernel network parameters"
  for param in "${netParameters[@]}"; do
    sudo sysctl -w "$param"
  done
}
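# NOTE: sysctl -w only applies until the next reboot; to persist these settings one could write
# the same key=value pairs to a drop-in such as /etc/sysctl.d/90-builders.conf (hypothetical path)
# and run `sudo sysctl --system`.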
setupBpftune() {
  log() { _log "setupBpftune" "$@"; }
  local -r BASE_URL="https://github.com/oracle/bpftune/archive"
  local -r REV="0e6bca2e5880fcbaac6478c4042f5f9314e61463"
  local -r TARBALL_NAME="bpftune-$REV.tar.gz"
  local -r BPFTUNE_DIR="$USER_HOME/bpftune"
  log "Creating directory for bpftune"
  mkdir -p "$BPFTUNE_DIR"
  log "Entering directory for bpftune"
  pushd "$BPFTUNE_DIR"
  log "Downloading bpftune tarball"
  curl --location "$BASE_URL/$REV.tar.gz" --output "$TARBALL_NAME"
  log "Extracting bpftune tarball"
  tar xzf "$TARBALL_NAME" --strip-components=1
  log "Removing downloaded archive"
  rm -f "$TARBALL_NAME"
  log "Building bpftune"
  make -j
  log "Installing bpftune"
  sudo make install
  log "Starting bpftune"
  sudo systemctl enable bpftune
  sudo systemctl start bpftune
  log "Exiting directory for bpftune"
  popd
}
main() {
  # Software
  installPrerequisites
  # Memory
  setupZramSwap
  setupKernelVmParameters # Values chosen for ZRAM
  # Disks
  setupBtrfsMntFsVolume
  setupBtrfsMntFsSubvolumes
  # Nix
  setupNix
  # Network
  setupKernelNetParameters
  # setupBpftune
}

main