Last active
March 13, 2026 03:43
-
-
Save sozercan/34db2de801fb9fa34baad5cc0e0c50ca to your computer and use it in GitHub Desktop.
Enable Azure Managed Lustre File System (AMLFS) on an existing AKS cluster with dynamic provisioning
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env bash
#
# setup-amlfs-aks.sh
#
# Enable Azure Managed Lustre File System (AMLFS) on an existing AKS cluster
# with dynamic provisioning via the Azure Lustre CSI driver.
#
# IMPORTANT: AMLFS requires a dedicated subnet separate from the AKS node subnet.
# This script automatically creates one if it doesn't exist. Using the AKS node
# subnet causes extremely slow provisioning (90+ min vs ~15 min with a dedicated subnet).
#
# Prerequisites:
#   - Azure CLI (az) installed and logged in
#   - kubectl configured to target your AKS cluster
#   - Sufficient Azure subscription quota for AMLFS
#
# Usage:
#   export RESOURCE_GROUP="my-resource-group"
#   export CLUSTER_NAME="my-aks-cluster"
#   ./setup-amlfs-aks.sh
#
# Optional environment variables:
#   AMLFS_SKU           - AMLFS SKU (default: AMLFS-Durable-Premium-125)
#   AMLFS_STORAGE_SIZE  - Storage size (default: 4Ti)
#   AMLFS_ZONE          - Availability zone (default: 1)
#   AMLFS_SUBNET_NAME   - Dedicated subnet name for AMLFS (default: amlfs-subnet)
#   AMLFS_SUBNET_PREFIX - CIDR for AMLFS subnet (default: auto-calculated)
#   PVC_NAME            - PVC name (default: shared-amlfs-storage)
#   STORAGECLASS_NAME   - StorageClass name (default: amlfs-lustre)

set -euo pipefail

# Required variables -- fail fast with an actionable message if unset/empty.
: "${RESOURCE_GROUP:?❌ RESOURCE_GROUP must be set (e.g., export RESOURCE_GROUP=my-rg)}"
: "${CLUSTER_NAME:?❌ CLUSTER_NAME must be set (e.g., export CLUSTER_NAME=my-aks)}"

# Optional variables with defaults (':=' assigns, so later code can rely on them).
: "${AMLFS_SKU:=AMLFS-Durable-Premium-125}"
: "${AMLFS_STORAGE_SIZE:=4Ti}"
: "${AMLFS_ZONE:=1}"
: "${AMLFS_SUBNET_NAME:=amlfs-subnet}"
: "${AMLFS_SUBNET_PREFIX:=}"
: "${PVC_NAME:=shared-amlfs-storage}"
: "${STORAGECLASS_NAME:=amlfs-lustre}"
# Print a summary of the effective configuration before doing any work,
# so the operator can abort early if a value looks wrong.
cat <<BANNER
============================================
 AMLFS Setup for Existing AKS Cluster
============================================

Cluster: ${CLUSTER_NAME}
Resource Group: ${RESOURCE_GROUP}
AMLFS SKU: ${AMLFS_SKU}
Storage Size: ${AMLFS_STORAGE_SIZE}
Zone: ${AMLFS_ZONE}
AMLFS Subnet: ${AMLFS_SUBNET_NAME}
PVC Name: ${PVC_NAME}

BANNER
# ------------------------------------------------------------------
# Step 1: Register Microsoft.StorageCache resource provider
# ------------------------------------------------------------------
# AMLFS file systems live under the Microsoft.StorageCache namespace; the
# subscription must have it registered before any AMLFS resource can be created.
echo "📦 Step 1: Registering Microsoft.StorageCache resource provider..."

PROVIDER_STATE=$(az provider show -n Microsoft.StorageCache --query registrationState -o tsv 2>/dev/null || echo "NotRegistered")
if [[ "${PROVIDER_STATE}" == "Registered" ]]; then
  echo "✅ Microsoft.StorageCache is already registered."
else
  az provider register --namespace Microsoft.StorageCache
  echo "⏳ Waiting for Microsoft.StorageCache to be registered..."
  # Registration is asynchronous; poll with an upper bound rather than forever.
  REG_WAITED=0
  REG_MAX_WAIT=1800 # 30 minutes
  while true; do
    # '|| echo' keeps a transient az failure from killing the script via set -e.
    STATE=$(az provider show -n Microsoft.StorageCache --query registrationState -o tsv 2>/dev/null || echo "Unknown")
    if [[ "${STATE}" == "Registered" ]]; then
      echo "✅ Microsoft.StorageCache registered successfully."
      break
    fi
    if (( REG_WAITED >= REG_MAX_WAIT )); then
      echo "❌ Timed out after $((REG_MAX_WAIT / 60)) minutes waiting for provider registration (state: ${STATE})." >&2
      exit 1
    fi
    echo " Current state: ${STATE}. Waiting..."
    sleep 10
    REG_WAITED=$((REG_WAITED + 10))
  done
fi
# ------------------------------------------------------------------
# Step 2: Install the Azure Lustre CSI driver
# ------------------------------------------------------------------
echo ""
echo "🔧 Step 2: Installing Azure Lustre CSI driver..."

if kubectl get -n kube-system deployment csi-azurelustre-controller >/dev/null 2>&1; then
  echo "✅ Azure Lustre CSI driver is already installed, skipping."
else
  # -f makes curl fail on HTTP errors so pipefail aborts instead of piping an
  # error page into bash. TLS verification stays ON (never use -k when piping
  # a remote script into a shell).
  curl -fsSL https://raw.githubusercontent.com/kubernetes-sigs/azurelustre-csi-driver/main/deploy/install-driver.sh | bash

  echo "⏳ Waiting for CSI controller pods to be ready..."
  while true; do
    READY=$(kubectl get -n kube-system deployment csi-azurelustre-controller -o jsonpath='{.status.readyReplicas}' 2>/dev/null || echo "0")
    DESIRED=$(kubectl get -n kube-system deployment csi-azurelustre-controller -o jsonpath='{.status.replicas}' 2>/dev/null || echo "0")
    # jsonpath prints an empty string while the status field is unset; normalize to 0.
    READY=${READY:-0}
    DESIRED=${DESIRED:-0}
    if [[ "${READY}" -gt 0 && "${READY}" == "${DESIRED}" ]]; then
      break
    fi
    echo " Controller pods: ${READY}/${DESIRED} ready..."
    sleep 5
  done

  echo "⏳ Waiting for CSI node pods to be ready..."
  while true; do
    READY=$(kubectl get -n kube-system daemonset csi-azurelustre-node -o jsonpath='{.status.numberReady}' 2>/dev/null || echo "0")
    DESIRED=$(kubectl get -n kube-system daemonset csi-azurelustre-node -o jsonpath='{.status.desiredNumberScheduled}' 2>/dev/null || echo "0")
    READY=${READY:-0}
    DESIRED=${DESIRED:-0}
    if [[ "${READY}" -gt 0 && "${READY}" == "${DESIRED}" ]]; then
      break
    fi
    echo " Node pods: ${READY}/${DESIRED} ready..."
    sleep 5
  done
  echo "✅ Azure Lustre CSI driver installed successfully."
fi
# ------------------------------------------------------------------
# Step 3: Assign RBAC roles to kubelet identity
# ------------------------------------------------------------------
# The CSI driver runs under the cluster's kubelet (managed) identity and needs:
#   - Contributor on the node resource group, to create AMLFS resources there
#   - Reader on the subscription, to resolve networking/resource metadata
echo ""
echo "🔐 Step 3: Assigning RBAC roles to kubelet identity..."

SUBSCRIPTION_ID=$(az account show --query id -o tsv)
OBJECT_ID=$(az aks show \
  --name "${CLUSTER_NAME}" \
  --resource-group "${RESOURCE_GROUP}" \
  --query identityProfile.kubeletidentity.objectId \
  -o tsv)
NODE_RESOURCE_GROUP=$(az aks show \
  --name "${CLUSTER_NAME}" \
  --resource-group "${RESOURCE_GROUP}" \
  --query nodeResourceGroup \
  -o tsv)
echo " Kubelet Identity: ${OBJECT_ID}"
echo " Node RG: ${NODE_RESOURCE_GROUP}"

# Contributor on the node resource group (idempotent: check before creating).
EXISTING_CONTRIBUTOR=$(az role assignment list \
  --assignee "${OBJECT_ID}" \
  --role "Contributor" \
  --scope "/subscriptions/${SUBSCRIPTION_ID}/resourceGroups/${NODE_RESOURCE_GROUP}" \
  --query "[].roleDefinitionName" -o tsv)
if [[ -n "${EXISTING_CONTRIBUTOR}" ]]; then
  echo "✅ Contributor role already assigned on node resource group."
else
  echo " Assigning Contributor role on node resource group..."
  az role assignment create \
    --assignee-object-id "${OBJECT_ID}" \
    --assignee-principal-type ServicePrincipal \
    --role "Contributor" \
    --scope "/subscriptions/${SUBSCRIPTION_ID}/resourceGroups/${NODE_RESOURCE_GROUP}" >/dev/null
  echo "✅ Contributor role assigned."
fi

# Reader on the subscription (idempotent: check before creating).
EXISTING_READER=$(az role assignment list \
  --assignee "${OBJECT_ID}" \
  --role "Reader" \
  --scope "/subscriptions/${SUBSCRIPTION_ID}" \
  --query "[].roleDefinitionName" -o tsv)
if [[ -n "${EXISTING_READER}" ]]; then
  echo "✅ Reader role already assigned on subscription."
else
  echo " Assigning Reader role on subscription..."
  az role assignment create \
    --assignee-object-id "${OBJECT_ID}" \
    --assignee-principal-type ServicePrincipal \
    --role "Reader" \
    --scope "/subscriptions/${SUBSCRIPTION_ID}" >/dev/null
  echo "✅ Reader role assigned."
fi
# ------------------------------------------------------------------
# Step 4: Create dedicated AMLFS subnet
# ------------------------------------------------------------------
echo ""
echo "🌐 Step 4: Creating dedicated subnet for AMLFS..."
echo " ⚠️ AMLFS must NOT share the AKS node subnet — this causes"
echo " extremely slow provisioning (90+ min vs ~15 min with a dedicated subnet)."

# Discover the VNet the AKS nodes live in (first VNet in the node resource group).
# Queried via 'az --query' directly so the script does not require jq, which is
# not listed in the prerequisites.
VNET_NAME=$(az network vnet list \
  --resource-group "${NODE_RESOURCE_GROUP}" \
  --query "[0].name" -o tsv)
echo " VNet: ${VNET_NAME}"

# Check if the AMLFS subnet already exists (re-runs are idempotent).
if az network vnet subnet show \
  --resource-group "${NODE_RESOURCE_GROUP}" \
  --vnet-name "${VNET_NAME}" \
  --name "${AMLFS_SUBNET_NAME}" &>/dev/null; then
  echo "✅ Subnet '${AMLFS_SUBNET_NAME}' already exists, reusing it."
else
  # Auto-calculate a subnet prefix if the caller did not provide one.
  if [[ -z "${AMLFS_SUBNET_PREFIX}" ]]; then
    EXISTING_PREFIXES=$(az network vnet subnet list \
      --resource-group "${NODE_RESOURCE_GROUP}" \
      --vnet-name "${VNET_NAME}" \
      --query "[].addressPrefix" -o tsv)
    VNET_PREFIX=$(az network vnet list \
      --resource-group "${NODE_RESOURCE_GROUP}" \
      --query "[0].addressSpace.addressPrefixes[0]" -o tsv)
    # First octet of the VNet address space (e.g. 10.224.0.0/12 -> 10).
    VNET_BASE=${VNET_PREFIX%%.*}
    # Try second octets 225-237, which fall inside the default AKS VNet range.
    # NOTE(review): this assumes the default AKS address space (10.224.0.0/12);
    # clusters with custom VNets should set AMLFS_SUBNET_PREFIX explicitly.
    # NB: this is a fixed-string substring check, not a true CIDR-overlap test.
    for SECOND_OCTET in {225..237}; do
      CANDIDATE="${VNET_BASE}.${SECOND_OCTET}.0.0/24"
      if ! printf '%s\n' "${EXISTING_PREFIXES}" | grep -qF "${VNET_BASE}.${SECOND_OCTET}."; then
        AMLFS_SUBNET_PREFIX="${CANDIDATE}"
        break
      fi
    done
    if [[ -z "${AMLFS_SUBNET_PREFIX}" ]]; then
      echo "❌ Could not auto-calculate a free /24 subnet. Please set AMLFS_SUBNET_PREFIX manually." >&2
      exit 1
    fi
  fi

  echo " Creating subnet '${AMLFS_SUBNET_NAME}' with prefix '${AMLFS_SUBNET_PREFIX}'..."
  az network vnet subnet create \
    --resource-group "${NODE_RESOURCE_GROUP}" \
    --vnet-name "${VNET_NAME}" \
    --name "${AMLFS_SUBNET_NAME}" \
    --address-prefixes "${AMLFS_SUBNET_PREFIX}" >/dev/null
  echo "✅ Subnet '${AMLFS_SUBNET_NAME}' created (${AMLFS_SUBNET_PREFIX})."
fi
# ------------------------------------------------------------------
# Step 5: Deploy StorageClass and PVC with dedicated subnet
# ------------------------------------------------------------------
echo ""
echo "🚀 Step 5: Deploying StorageClass and PVC with dedicated AMLFS subnet..."

# Unquoted heredoc delimiter is intentional: shell variables are expanded into
# the manifest before it is piped to kubectl. 'subnet-name' is what steers the
# driver to the dedicated subnet created in Step 4.
cat <<EOF | kubectl apply -f -
apiVersion: storage.k8s.io/v1
kind: StorageClass
metadata:
  name: ${STORAGECLASS_NAME}
provisioner: azurelustre.csi.azure.com
parameters:
  sku-name: "${AMLFS_SKU}"
  zones: "${AMLFS_ZONE}"
  maintenance-day-of-week: "Sunday"
  maintenance-time-of-day-utc: "02:00"
  subnet-name: "${AMLFS_SUBNET_NAME}"
reclaimPolicy: Delete
volumeBindingMode: Immediate
mountOptions:
  - "noatime"
  - "flock"
---
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: ${PVC_NAME}
spec:
  accessModes:
    - ReadWriteMany
  storageClassName: ${STORAGECLASS_NAME}
  resources:
    requests:
      storage: ${AMLFS_STORAGE_SIZE}
EOF

echo "✅ StorageClass '${STORAGECLASS_NAME}' and PVC '${PVC_NAME}' created."
# ------------------------------------------------------------------
# Step 6: Wait for PVC to be bound
# ------------------------------------------------------------------
echo ""
echo "⏳ Step 6: Waiting for PVC '${PVC_NAME}' to be bound..."
echo " (AMLFS provisioning typically takes 15-20 minutes with a dedicated subnet)"
echo ""

SECONDS_WAITED=0
MAX_WAIT=2400 # 40 minutes
while true; do
  PVC_STATUS=$(kubectl get pvc "${PVC_NAME}" -o jsonpath='{.status.phase}' 2>/dev/null || echo "Unknown")
  if [[ "${PVC_STATUS}" == "Bound" ]]; then
    echo ""
    echo "✅ PVC '${PVC_NAME}' is Bound!"
    kubectl get pvc "${PVC_NAME}"
    break
  fi

  # Surface non-timeout provisioning errors (DeadlineExceeded is routine: the
  # provisioner keeps retrying while AMLFS creation is in flight).
  # The trailing '|| true' is load-bearing: with pipefail, a no-match grep
  # exits 1 and would otherwise kill the whole script via set -e on every
  # poll where no error events exist yet.
  LATEST_ERROR=$(kubectl describe pvc "${PVC_NAME}" 2>/dev/null | grep "ProvisioningFailed" | grep -v "DeadlineExceeded" | tail -1 || true)
  if [[ -n "${LATEST_ERROR}" ]]; then
    echo " ⚠️ Provisioning error: ${LATEST_ERROR}"
  fi

  if [[ ${SECONDS_WAITED} -ge ${MAX_WAIT} ]]; then
    echo ""
    echo "❌ Timed out after $((MAX_WAIT / 60)) minutes. PVC is still ${PVC_STATUS}."
    echo " The CSI driver will continue retrying in the background."
    echo " Monitor with: kubectl describe pvc ${PVC_NAME}"
    echo " Check Azure: az resource list --resource-group ${NODE_RESOURCE_GROUP} --resource-type Microsoft.StorageCache/amlFileSystems -o table"
    exit 1
  fi

  echo " [$(date +%H:%M:%S)] PVC status: ${PVC_STATUS} (${SECONDS_WAITED}s elapsed)"
  sleep 30
  SECONDS_WAITED=$((SECONDS_WAITED + 30))
done
# ------------------------------------------------------------------
# Step 7: Verify with a test pod
# ------------------------------------------------------------------
echo ""
echo "🧪 Step 7: Deploying test pod to verify Lustre storage..."

# Remove any leftover pod from a previous run; failures here are harmless.
kubectl delete pod lustre-test --ignore-not-found=true --wait=false >/dev/null 2>&1 || true
sleep 2

cat <<EOF | kubectl apply -f -
apiVersion: v1
kind: Pod
metadata:
  name: lustre-test
spec:
  containers:
  - name: test
    image: busybox
    command: ["sh", "-c", "echo 'Hello from AMLFS!' > /mnt/lustre/test.txt && cat /mnt/lustre/test.txt && df -h /mnt/lustre && echo 'SUCCESS' && sleep 30"]
    volumeMounts:
    - mountPath: /mnt/lustre
      name: lustre-data
  restartPolicy: Never
  volumes:
  - name: lustre-data
    persistentVolumeClaim:
      claimName: ${PVC_NAME}
EOF

echo "⏳ Waiting for test pod to complete..."
kubectl wait --for=condition=Ready pod/lustre-test --timeout=120s 2>/dev/null || true
sleep 5

echo ""
echo "📋 Test pod logs:"
# Guard the log fetch: if the pod never started, an unguarded 'kubectl logs'
# would abort the script here (set -e) and leak the test pod before cleanup.
kubectl logs lustre-test || echo " ⚠️ Could not fetch logs from test pod (check: kubectl describe pod lustre-test)"

echo ""
echo "🧹 Cleaning up test pod..."
kubectl delete pod lustre-test --wait=false >/dev/null 2>&1 || true

echo ""
echo "============================================"
echo " ✅ AMLFS Setup Complete!"
echo "============================================"
echo ""
echo "Summary:"
echo " - CSI Driver: azurelustre.csi.azure.com (installed)"
echo " - StorageClass: ${STORAGECLASS_NAME}"
echo " - PVC: ${PVC_NAME} (Bound)"
echo " - Storage: ${AMLFS_STORAGE_SIZE} (${AMLFS_SKU})"
echo " - AMLFS Subnet: ${AMLFS_SUBNET_NAME}"
echo ""
echo "Usage in your deployments:"
echo " volumes:"
echo " - name: lustre-data"
echo " persistentVolumeClaim:"
echo " claimName: ${PVC_NAME}"
echo ""
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment