Skip to content

Instantly share code, notes, and snippets.

@sozercan
Last active March 13, 2026 03:43
Show Gist options
  • Select an option

  • Save sozercan/34db2de801fb9fa34baad5cc0e0c50ca to your computer and use it in GitHub Desktop.

Select an option

Save sozercan/34db2de801fb9fa34baad5cc0e0c50ca to your computer and use it in GitHub Desktop.
Enable Azure Managed Lustre File System (AMLFS) on an existing AKS cluster with dynamic provisioning
#!/usr/bin/env bash
#
# setup-amlfs-aks.sh
#
# Enable Azure Managed Lustre File System (AMLFS) on an existing AKS cluster
# with dynamic provisioning via the Azure Lustre CSI driver.
#
# IMPORTANT: AMLFS requires a dedicated subnet separate from the AKS node subnet.
# This script automatically creates one if it doesn't exist. Using the AKS node
# subnet causes extremely slow provisioning (90+ min vs ~15 min with a dedicated subnet).
#
# Prerequisites:
#   - Azure CLI (az) installed and logged in
#   - kubectl configured to target your AKS cluster
#   - Sufficient Azure subscription quota for AMLFS
#
# Usage:
#   export RESOURCE_GROUP="my-resource-group"
#   export CLUSTER_NAME="my-aks-cluster"
#   ./setup-amlfs-aks.sh
#
# Optional environment variables:
#   AMLFS_SKU           - AMLFS SKU (default: AMLFS-Durable-Premium-125)
#   AMLFS_STORAGE_SIZE  - Storage size (default: 4Ti)
#   AMLFS_ZONE          - Availability zone (default: 1)
#   AMLFS_SUBNET_NAME   - Dedicated subnet name for AMLFS (default: amlfs-subnet)
#   AMLFS_SUBNET_PREFIX - CIDR for AMLFS subnet (default: auto-calculated)
#   PVC_NAME            - PVC name (default: shared-amlfs-storage)
#   STORAGECLASS_NAME   - StorageClass name (default: amlfs-lustre)
set -euo pipefail

# Fail fast with a clear message if a required CLI is missing, instead of
# dying mid-run with "command not found" after partial changes were made.
for tool in az kubectl; do
  if ! command -v "${tool}" >/dev/null 2>&1; then
    echo "❌ Required tool '${tool}' is not installed or not on PATH." >&2
    exit 1
  fi
done

# Required variables — ':?' aborts with the message if unset or empty.
: "${RESOURCE_GROUP:?❌ RESOURCE_GROUP must be set (e.g., export RESOURCE_GROUP=my-rg)}"
: "${CLUSTER_NAME:?❌ CLUSTER_NAME must be set (e.g., export CLUSTER_NAME=my-aks)}"

# Optional variables — ':=' assigns the default if unset or empty.
: "${AMLFS_SKU:=AMLFS-Durable-Premium-125}"
: "${AMLFS_STORAGE_SIZE:=4Ti}"
: "${AMLFS_ZONE:=1}"
: "${AMLFS_SUBNET_NAME:=amlfs-subnet}"
: "${AMLFS_SUBNET_PREFIX:=}"
: "${PVC_NAME:=shared-amlfs-storage}"
: "${STORAGECLASS_NAME:=amlfs-lustre}"
# Banner: show the effective configuration before doing any work.
printf '%s\n' \
  "============================================" \
  " AMLFS Setup for Existing AKS Cluster" \
  "============================================" \
  "" \
  "Cluster: ${CLUSTER_NAME}" \
  "Resource Group: ${RESOURCE_GROUP}" \
  "AMLFS SKU: ${AMLFS_SKU}" \
  "Storage Size: ${AMLFS_STORAGE_SIZE}" \
  "Zone: ${AMLFS_ZONE}" \
  "AMLFS Subnet: ${AMLFS_SUBNET_NAME}" \
  "PVC Name: ${PVC_NAME}" \
  ""
# ------------------------------------------------------------------
# Step 1: Register Microsoft.StorageCache resource provider
# ------------------------------------------------------------------
echo "πŸ“¦ Step 1: Registering Microsoft.StorageCache resource provider..."

# A failed lookup (provider never queried on this subscription) is
# treated the same as an unregistered provider.
provider_state=$(az provider show -n Microsoft.StorageCache --query registrationState -o tsv 2>/dev/null || echo "NotRegistered")

if [[ "${provider_state}" != "Registered" ]]; then
  az provider register --namespace Microsoft.StorageCache
  echo "⏳ Waiting for Microsoft.StorageCache to be registered..."
  # Poll until Azure reports the registration as complete.
  while :; do
    current_state=$(az provider show -n Microsoft.StorageCache --query registrationState -o tsv)
    if [[ "${current_state}" == "Registered" ]]; then
      echo "βœ… Microsoft.StorageCache registered successfully."
      break
    fi
    echo " Current state: ${current_state}. Waiting..."
    sleep 10
  done
else
  echo "βœ… Microsoft.StorageCache is already registered."
fi
# ------------------------------------------------------------------
# Step 2: Install the Azure Lustre CSI driver
# ------------------------------------------------------------------
echo ""
echo "πŸ”§ Step 2: Installing Azure Lustre CSI driver..."
if kubectl get -n kube-system deployment csi-azurelustre-controller >/dev/null 2>&1; then
  echo "βœ… Azure Lustre CSI driver is already installed, skipping."
else
  # Dropped curl's -k (insecure) flag: the installer is piped straight into
  # bash, so TLS verification must stay on. 'set -o pipefail' (set at the
  # top of the script) aborts the run if the download itself fails.
  curl -sSL https://raw.githubusercontent.com/kubernetes-sigs/azurelustre-csi-driver/main/deploy/install-driver.sh | bash

  echo "⏳ Waiting for CSI controller pods to be ready..."
  while true; do
    # jsonpath prints an empty string while the status field is still
    # unset; '|| echo 0' only covers kubectl failures, so default
    # empty results to 0 explicitly.
    READY=$(kubectl get -n kube-system deployment csi-azurelustre-controller -o jsonpath='{.status.readyReplicas}' 2>/dev/null || echo "0")
    DESIRED=$(kubectl get -n kube-system deployment csi-azurelustre-controller -o jsonpath='{.status.replicas}' 2>/dev/null || echo "0")
    READY=${READY:-0}
    DESIRED=${DESIRED:-0}
    if [[ "${READY}" -gt 0 && "${READY}" == "${DESIRED}" ]]; then
      break
    fi
    echo " Controller pods: ${READY}/${DESIRED} ready..."
    sleep 5
  done

  echo "⏳ Waiting for CSI node pods to be ready..."
  while true; do
    READY=$(kubectl get -n kube-system daemonset csi-azurelustre-node -o jsonpath='{.status.numberReady}' 2>/dev/null || echo "0")
    DESIRED=$(kubectl get -n kube-system daemonset csi-azurelustre-node -o jsonpath='{.status.desiredNumberScheduled}' 2>/dev/null || echo "0")
    READY=${READY:-0}
    DESIRED=${DESIRED:-0}
    if [[ "${READY}" -gt 0 && "${READY}" == "${DESIRED}" ]]; then
      break
    fi
    echo " Node pods: ${READY}/${DESIRED} ready..."
    sleep 5
  done
  echo "βœ… Azure Lustre CSI driver installed successfully."
fi
# ------------------------------------------------------------------
# Step 3: Assign RBAC roles to kubelet identity
# ------------------------------------------------------------------
echo ""
echo "πŸ”‘ Step 3: Assigning RBAC roles to kubelet identity..."

SUBSCRIPTION_ID=$(az account show --query id -o tsv)
OBJECT_ID=$(az aks show \
  --name "${CLUSTER_NAME}" \
  --resource-group "${RESOURCE_GROUP}" \
  --query identityProfile.kubeletidentity.objectId \
  -o tsv)
NODE_RESOURCE_GROUP=$(az aks show \
  --name "${CLUSTER_NAME}" \
  --resource-group "${RESOURCE_GROUP}" \
  --query nodeResourceGroup \
  -o tsv)

echo " Kubelet Identity: ${OBJECT_ID}"
echo " Node RG: ${NODE_RESOURCE_GROUP}"

# ensure_role ROLE SCOPE LABEL
# Idempotently grant ROLE to the kubelet identity (OBJECT_ID) at SCOPE.
# LABEL is the human-readable scope name used in the log messages.
ensure_role() {
  local role=$1 scope=$2 label=$3
  local existing
  existing=$(az role assignment list \
    --assignee "${OBJECT_ID}" \
    --role "${role}" \
    --scope "${scope}" \
    --query "[].roleDefinitionName" -o tsv)
  if [[ -n "${existing}" ]]; then
    echo "βœ… ${role} role already assigned on ${label}."
  else
    echo " Assigning ${role} role on ${label}..."
    az role assignment create \
      --assignee-object-id "${OBJECT_ID}" \
      --assignee-principal-type ServicePrincipal \
      --role "${role}" \
      --scope "${scope}" >/dev/null
    echo "βœ… ${role} role assigned."
  fi
}

# Contributor on the node resource group; Reader on the subscription.
ensure_role "Contributor" "/subscriptions/${SUBSCRIPTION_ID}/resourceGroups/${NODE_RESOURCE_GROUP}" "node resource group"
ensure_role "Reader" "/subscriptions/${SUBSCRIPTION_ID}" "subscription"
# ------------------------------------------------------------------
# Step 4: Create dedicated AMLFS subnet
# ------------------------------------------------------------------
echo ""
echo "🌐 Step 4: Creating dedicated subnet for AMLFS..."
echo " ⚠️ AMLFS must NOT share the AKS node subnet β€” this causes"
echo " extremely slow provisioning (90+ min vs ~15 min with a dedicated subnet)."

# Discover the AKS VNet name and its first address prefix in one call.
# tsv output of a [a, b] projection is a single tab-separated row, which
# 'read' splits into the two variables — this removes the previous hard
# dependency on jq, which was never listed in the prerequisites.
read -r VNET_NAME VNET_PREFIX < <(az network vnet list \
  --resource-group "${NODE_RESOURCE_GROUP}" \
  --query "[0].[name, addressSpace.addressPrefixes[0]]" \
  -o tsv)
echo " VNet: ${VNET_NAME}"

# Check if AMLFS subnet already exists
if az network vnet subnet show \
  --resource-group "${NODE_RESOURCE_GROUP}" \
  --vnet-name "${VNET_NAME}" \
  --name "${AMLFS_SUBNET_NAME}" &>/dev/null; then
  echo "βœ… Subnet '${AMLFS_SUBNET_NAME}' already exists, reusing it."
else
  # Auto-calculate subnet prefix if not provided
  if [[ -z "${AMLFS_SUBNET_PREFIX}" ]]; then
    # Existing subnet CIDRs, one per line.
    EXISTING_PREFIXES=$(az network vnet subnet list \
      --resource-group "${NODE_RESOURCE_GROUP}" \
      --vnet-name "${VNET_NAME}" \
      --query "[].addressPrefix" -o tsv)
    # First octet of the VNet address space (e.g. 10.224.0.0/12 -> 10).
    VNET_BASE=${VNET_PREFIX%%.*}
    # Probe second octets 225..237 for one with no existing subnet.
    for SECOND_OCTET in {225..237}; do
      # Anchored, dot-escaped match: an existing prefix must START with
      # e.g. "10.225.". The old unanchored pattern treated the dots as
      # regex wildcards and matched substrings, so it could falsely mark
      # a free range as taken.
      if ! grep -q "^${VNET_BASE}\.${SECOND_OCTET}\." <<<"${EXISTING_PREFIXES}"; then
        AMLFS_SUBNET_PREFIX="${VNET_BASE}.${SECOND_OCTET}.0.0/24"
        break
      fi
    done
    if [[ -z "${AMLFS_SUBNET_PREFIX}" ]]; then
      echo "❌ Could not auto-calculate a free /24 subnet. Please set AMLFS_SUBNET_PREFIX manually."
      exit 1
    fi
  fi
  echo " Creating subnet '${AMLFS_SUBNET_NAME}' with prefix '${AMLFS_SUBNET_PREFIX}'..."
  az network vnet subnet create \
    --resource-group "${NODE_RESOURCE_GROUP}" \
    --vnet-name "${VNET_NAME}" \
    --name "${AMLFS_SUBNET_NAME}" \
    --address-prefixes "${AMLFS_SUBNET_PREFIX}" >/dev/null
  echo "βœ… Subnet '${AMLFS_SUBNET_NAME}' created (${AMLFS_SUBNET_PREFIX})."
fi
# ------------------------------------------------------------------
# Step 5: Deploy StorageClass and PVC with dedicated subnet
# ------------------------------------------------------------------
echo ""
echo "πŸ“€ Step 5: Deploying StorageClass and PVC with dedicated AMLFS subnet..."
# NOTE(review): manifest indentation reconstructed — the flattened heredoc
# in the previous revision was not valid YAML and kubectl would reject it.
cat <<EOF | kubectl apply -f -
apiVersion: storage.k8s.io/v1
kind: StorageClass
metadata:
  name: ${STORAGECLASS_NAME}
provisioner: azurelustre.csi.azure.com
parameters:
  sku-name: "${AMLFS_SKU}"
  zones: "${AMLFS_ZONE}"
  maintenance-day-of-week: "Sunday"
  maintenance-time-of-day-utc: "02:00"
  subnet-name: "${AMLFS_SUBNET_NAME}"
reclaimPolicy: Delete
volumeBindingMode: Immediate
mountOptions:
  - "noatime"
  - "flock"
---
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: ${PVC_NAME}
spec:
  accessModes:
    - ReadWriteMany
  storageClassName: ${STORAGECLASS_NAME}
  resources:
    requests:
      storage: ${AMLFS_STORAGE_SIZE}
EOF
echo "βœ… StorageClass '${STORAGECLASS_NAME}' and PVC '${PVC_NAME}' created."
# ------------------------------------------------------------------
# Step 6: Wait for PVC to be bound
# ------------------------------------------------------------------
echo ""
echo "⏳ Step 6: Waiting for PVC '${PVC_NAME}' to be bound..."
echo " (AMLFS provisioning typically takes 15-20 minutes with a dedicated subnet)"
echo ""
SECONDS_WAITED=0
MAX_WAIT=2400 # 40 minutes
while true; do
  PVC_STATUS=$(kubectl get pvc "${PVC_NAME}" -o jsonpath='{.status.phase}' 2>/dev/null || echo "Unknown")
  if [[ "${PVC_STATUS}" == "Bound" ]]; then
    echo ""
    echo "βœ… PVC '${PVC_NAME}' is Bound!"
    kubectl get pvc "${PVC_NAME}"
    break
  fi
  # Surface non-timeout provisioning errors (DeadlineExceeded is expected
  # while the driver retries). The trailing '|| true' is essential: under
  # 'set -euo pipefail', a grep with no match returns 1, which would make
  # this assignment abort the whole script on every healthy poll.
  LATEST_ERROR=$(kubectl describe pvc "${PVC_NAME}" 2>/dev/null | grep "ProvisioningFailed" | grep -v "DeadlineExceeded" | tail -1 || true)
  if [[ -n "${LATEST_ERROR}" ]]; then
    echo " ⚠️ Provisioning error: ${LATEST_ERROR}"
  fi
  if [[ ${SECONDS_WAITED} -ge ${MAX_WAIT} ]]; then
    echo ""
    echo "❌ Timed out after $((MAX_WAIT / 60)) minutes. PVC is still ${PVC_STATUS}."
    echo " The CSI driver will continue retrying in the background."
    echo " Monitor with: kubectl describe pvc ${PVC_NAME}"
    echo " Check Azure: az resource list --resource-group ${NODE_RESOURCE_GROUP} --resource-type Microsoft.StorageCache/amlFileSystems -o table"
    exit 1
  fi
  echo " [$(date +%H:%M:%S)] PVC status: ${PVC_STATUS} (${SECONDS_WAITED}s elapsed)"
  sleep 30
  SECONDS_WAITED=$((SECONDS_WAITED + 30))
done
# ------------------------------------------------------------------
# Step 7: Verify with a test pod
# ------------------------------------------------------------------
echo ""
echo "πŸ§ͺ Step 7: Deploying test pod to verify Lustre storage..."
kubectl delete pod lustre-test --ignore-not-found=true --wait=false >/dev/null 2>&1 || true
sleep 2
# NOTE(review): pod manifest indentation reconstructed — the flattened
# heredoc in the previous revision was not valid YAML.
cat <<EOF | kubectl apply -f -
apiVersion: v1
kind: Pod
metadata:
  name: lustre-test
spec:
  containers:
    - name: test
      image: busybox
      command: ["sh", "-c", "echo 'Hello from AMLFS!' > /mnt/lustre/test.txt && cat /mnt/lustre/test.txt && df -h /mnt/lustre && echo 'SUCCESS' && sleep 30"]
      volumeMounts:
        - mountPath: /mnt/lustre
          name: lustre-data
  restartPolicy: Never
  volumes:
    - name: lustre-data
      persistentVolumeClaim:
        claimName: ${PVC_NAME}
EOF
echo "⏳ Waiting for test pod to complete..."
kubectl wait --for=condition=Ready pod/lustre-test --timeout=120s 2>/dev/null || true
sleep 5
echo ""
echo "πŸ“‹ Test pod logs:"
# '|| true': if the pod never became Ready (the wait above is already
# best-effort), a failing logs call must not let 'set -e' kill the script
# before cleanup and the summary are printed.
kubectl logs lustre-test || true
echo ""
echo "🧹 Cleaning up test pod..."
kubectl delete pod lustre-test --wait=false >/dev/null 2>&1 || true
echo ""
echo "============================================"
echo " βœ… AMLFS Setup Complete!"
echo "============================================"
echo ""
echo "Summary:"
echo " - CSI Driver: azurelustre.csi.azure.com (installed)"
echo " - StorageClass: ${STORAGECLASS_NAME}"
echo " - PVC: ${PVC_NAME} (Bound)"
echo " - Storage: ${AMLFS_STORAGE_SIZE} (${AMLFS_SKU})"
echo " - AMLFS Subnet: ${AMLFS_SUBNET_NAME}"
echo ""
# The usage example is indented so it is valid, copy-pasteable YAML.
echo "Usage in your deployments:"
echo "  volumes:"
echo "    - name: lustre-data"
echo "      persistentVolumeClaim:"
echo "        claimName: ${PVC_NAME}"
echo ""
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment