# Gist by @oleksiyp, created October 22, 2025 21:13
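# Ansible playbook: deploy a native Ceph cluster on a k3s cluster via the Rook operator.
# Assumptions carried throughout the tasks below: kubectl and helm are available on the
# bootstrap master, the kubeconfig lives at /etc/rancher/k3s/k3s.yaml, the inventory
# defines the groups bootstrapMaster/masters/workers, the nodes are named
# blade001..blade005, and each node has a dedicated NVMe partition /dev/nvme0n1p3
# reserved for Ceph OSDs.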
---
- name: Install Rook Ceph operator via Helm
  become: true
  hosts: bootstrapMaster
  tasks:
    - name: Ping host
      ping:

    - name: Add Rook Helm repo
      shell: |-
        set -euxo pipefail
        export KUBECONFIG=/etc/rancher/k3s/k3s.yaml
        helm repo add rook-release https://charts.rook.io/release
        helm repo update

    - name: Install rook-ceph operator
      shell: |-
        set -euxo pipefail
        export KUBECONFIG=/etc/rancher/k3s/k3s.yaml
        helm upgrade --install rook rook-release/rook-ceph --namespace rook-ceph --create-namespace \
          --set crds.enabled=true
        kubectl wait --for=condition=Available --timeout=5m -n rook-ceph deployment/rook-ceph-operator
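
# Note: the shell tasks in this playbook rely on bash features (`set -o pipefail`,
# `{1..30}` brace expansion), so they assume /bin/sh on the target hosts is
# bash-compatible; if it is not, add `args: { executable: /bin/bash }` to the shell
# tasks. A quick manual check that the operator came up (hypothetical one-liner,
# not part of the playbook):
#   KUBECONFIG=/etc/rancher/k3s/k3s.yaml kubectl -n rook-ceph get pods -l app=rook-ceph-operator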
- name: Prepare Ceph storage devices on all nodes (partitioned)
  become: true
  vars:
    ceph_partition: "/dev/nvme0n1p3"
  hosts: [bootstrapMaster, masters, workers]
  tasks:
    - name: Check NVMe Ceph partition exists
      stat:
        path: "{{ ceph_partition }}"
      register: nvme_ceph_partition

    - name: Fail if Ceph partition not found
      fail:
        msg: "Ceph partition {{ ceph_partition }} not found. Run nvme-partitioning playbook first."
      when: not nvme_ceph_partition.stat.exists

    - name: Verify Ceph partition is clean
      shell: |
        echo "=== Ceph partition status ==="
        lsblk {{ ceph_partition }}
        echo ""
        echo "=== Filesystem check ==="
        blkid {{ ceph_partition }} || echo "No filesystem detected (good for Ceph)"
      register: partition_status

    - name: Show partition status
      debug:
        msg: "{{ partition_status.stdout_lines }}"
- name: Deploy native Ceph cluster via Rook
  become: true
  vars:
    ceph_partition: "/dev/nvme0n1p3"
  hosts: bootstrapMaster
  tasks:
    - name: Create native CephCluster
      shell: |-
        set -euxo pipefail
        export KUBECONFIG=/etc/rancher/k3s/k3s.yaml
        cat <<EOF | kubectl apply -f -
        apiVersion: ceph.rook.io/v1
        kind: CephCluster
        metadata:
          name: rook-ceph
          namespace: rook-ceph
        spec:
          cephVersion:
            image: quay.io/ceph/ceph:v19.2.2
          dataDirHostPath: /var/lib/rook
          skipUpgradeChecks: false
          continueUpgradeAfterChecksEvenIfNotHealthy: false
          waitTimeoutForHealthyOSDInMinutes: 10
          mon:
            count: 3
            allowMultiplePerNode: false
          mgr:
            count: 2
            allowMultiplePerNode: false
            modules:
              - name: pg_autoscaler
                enabled: true
          dashboard:
            enabled: true
            port: 8443
            ssl: true
          monitoring:
            enabled: false
          network:
            requireMsgr2: false
          crashCollector:
            disable: false
          logCollector:
            enabled: true
            periodicity: daily
            maxLogSize: 500M
          cleanupPolicy:
            confirmation: ""
            sanitizeDisks:
              method: quick
              dataSource: zero
              iteration: 1
            allowUninstallWithVolumes: false
          placement:
            all:
              nodeAffinity:
                requiredDuringSchedulingIgnoredDuringExecution:
                  nodeSelectorTerms:
                    - matchExpressions:
                        - key: kubernetes.io/hostname
                          operator: In
                          values:
                            - blade001
                            - blade002
                            - blade003
                            - blade004
                            - blade005
              tolerations:
                - effect: NoSchedule
                  key: node.kubernetes.io/unschedulable
                  operator: Exists
                - effect: NoSchedule
                  key: node.cloudprovider.kubernetes.io/uninitialized
                  operator: Exists
          annotations:
          labels:
          resources:
          removeOSDsIfOutAndSafeToRemove: false
          storage:
            useAllNodes: false
            useAllDevices: false
            config:
              osdsPerDevice: "1"
              encryptedDevice: "false"
              databaseSizeMB: "1024"
              walSizeMB: "1024"
            nodes:
              - name: "blade001"
                devices:
                  - name: "{{ ceph_partition }}"
                    config:
                      osdsPerDevice: "1"
              - name: "blade002"
                devices:
                  - name: "{{ ceph_partition }}"
                    config:
                      osdsPerDevice: "1"
              - name: "blade003"
                devices:
                  - name: "{{ ceph_partition }}"
                    config:
                      osdsPerDevice: "1"
              - name: "blade004"
                devices:
                  - name: "{{ ceph_partition }}"
                    config:
                      osdsPerDevice: "1"
              - name: "blade005"
                devices:
                  - name: "{{ ceph_partition }}"
                    config:
                      osdsPerDevice: "1"
          disruptionManagement:
            managePodBudgets: false
            osdMaintenanceTimeout: 30
            pgHealthCheckTimeout: 0
        EOF
        echo "⏳ Waiting for CephCluster to be ready..."
        kubectl wait --for=condition=Ready --timeout=10m -n rook-ceph cephcluster/rook-ceph
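
    # If the wait above times out, the usual first stops are the operator log and the
    # CephCluster status (example commands, not part of the playbook):
    #   kubectl -n rook-ceph logs deploy/rook-ceph-operator
    #   kubectl -n rook-ceph get cephcluster rook-ceph -o jsonpath='{.status.phase}{"\n"}'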
    - name: Verify Ceph OSDs are created
      shell: |-
        set -euxo pipefail
        export KUBECONFIG=/etc/rancher/k3s/k3s.yaml
        echo "🔍 Checking Ceph cluster status..."
        # Wait for OSDs to be created
        echo "⏳ Waiting for OSDs to be ready..."
        for i in {1..30}; do
          OSD_COUNT=$(kubectl get pods -n rook-ceph -l app=rook-ceph-osd --no-headers 2>/dev/null | wc -l)
          if [ "$OSD_COUNT" -ge 5 ]; then
            echo "✅ Found $OSD_COUNT OSDs"
            break
          fi
          echo "  Waiting for OSDs... ($OSD_COUNT/5 found)"
          sleep 10
        done
        # Show OSD status
        kubectl get pods -n rook-ceph -l app=rook-ceph-osd
      ignore_errors: true
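
    # Each storage node runs an osd-prepare job before its OSD pod appears; if the
    # count above stays below 5, those job logs usually explain why a device was
    # rejected (example command, not part of the playbook):
    #   kubectl -n rook-ceph logs -l app=rook-ceph-osd-prepare --tail=50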
    - name: Create RBD StorageClass and set as default
      shell: |-
        set -euxo pipefail
        export KUBECONFIG=/etc/rancher/k3s/k3s.yaml
        # Wait for Ceph cluster to be ready first
        echo "⏳ Waiting for Ceph cluster to be healthy..."
        kubectl wait --for=condition=Ready --timeout=20m -n rook-ceph cephcluster/rook-ceph || true
        # Create RBD pool
        cat <<EOF | kubectl apply -f -
        apiVersion: ceph.rook.io/v1
        kind: CephBlockPool
        metadata:
          name: replicapool
          namespace: rook-ceph
        spec:
          failureDomain: host
          replicated:
            size: 3
        EOF
        # Wait for pool to be ready
        kubectl wait --for=jsonpath='{.status.phase}'=Ready --timeout=15m -n rook-ceph cephblockpool/replicapool
        # Remove default annotation from local-path StorageClass
        kubectl patch storageclass local-path -p '{"metadata": {"annotations":{"storageclass.kubernetes.io/is-default-class":"false"}}}' || true
        # Create RBD StorageClass and set as default
        cat <<EOF | kubectl apply -f -
        apiVersion: storage.k8s.io/v1
        kind: StorageClass
        metadata:
          name: ceph-rbd
          annotations:
            storageclass.kubernetes.io/is-default-class: "true"
        provisioner: rook-ceph.rbd.csi.ceph.com
        parameters:
          clusterID: rook-ceph
          pool: replicapool
          imageFormat: "2"
          imageFeatures: layering
          csi.storage.k8s.io/provisioner-secret-name: rook-csi-rbd-provisioner
          csi.storage.k8s.io/provisioner-secret-namespace: rook-ceph
          csi.storage.k8s.io/controller-expand-secret-name: rook-csi-rbd-provisioner
          csi.storage.k8s.io/controller-expand-secret-namespace: rook-ceph
          csi.storage.k8s.io/node-stage-secret-name: rook-csi-rbd-node
          csi.storage.k8s.io/node-stage-secret-namespace: rook-ceph
          csi.storage.k8s.io/fstype: ext4
        allowVolumeExpansion: true
        reclaimPolicy: Delete
        EOF
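
    # The CSI secret names above (rook-csi-rbd-provisioner / rook-csi-rbd-node) are the
    # defaults the operator generates in the rook-ceph namespace, so they only match when
    # the operator and cluster are installed as in this playbook. Because ceph-rbd is now
    # the default StorageClass, a PVC that omits storageClassName will also land on Ceph.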
    - name: Create test PVC to validate Ceph integration
      shell: |-
        set -euxo pipefail
        export KUBECONFIG=/etc/rancher/k3s/k3s.yaml
        # Create test PVC
        cat <<EOF | kubectl apply -f -
        apiVersion: v1
        kind: PersistentVolumeClaim
        metadata:
          name: test-ceph-pvc
          namespace: default
        spec:
          accessModes:
            - ReadWriteOnce
          resources:
            requests:
              storage: 1Gi
          storageClassName: ceph-rbd
        EOF
        # Wait for PVC to be bound (PVCs report a phase, not a Bound condition)
        echo "Waiting for test PVC to be bound..."
        kubectl wait --for=jsonpath='{.status.phase}'=Bound pvc/test-ceph-pvc -n default --timeout=5m
        # Show PVC status
        kubectl get pvc test-ceph-pvc -n default
        # Show PV details
        PV_NAME=$(kubectl get pvc test-ceph-pvc -n default -o jsonpath='{.spec.volumeName}')
        kubectl get pv "$PV_NAME"
        echo "✅ Native Ceph integration test successful!"
      ignore_errors: true
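
    # The PVC binds without a consuming pod because the StorageClass uses the default
    # Immediate volumeBindingMode; a fuller end-to-end check would mount the volume from
    # a pod and write to it, which this playbook does not do.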
    - name: Clean up test PVC
      shell: |-
        set -euxo pipefail
        export KUBECONFIG=/etc/rancher/k3s/k3s.yaml
        # Delete test PVC after successful validation
        kubectl delete pvc test-ceph-pvc -n default --ignore-not-found=true
        echo "🧹 Test PVC cleaned up"
      ignore_errors: true
    - name: Display Ceph cluster summary
      debug:
        msg:
          - "=================================================="
          - "🎉 Native Ceph Cluster Deployed Successfully"
          - "=================================================="
          - ""
          - "✅ Rook operator installed and configured"
          - "✅ Native Ceph cluster deployed on NVMe drives"
          - "✅ StorageClass 'ceph-rbd' created and set as default"
          - "✅ local-path StorageClass no longer default"
          - "✅ Test PVC validation completed"
          - ""
          - "📋 Cluster Details:"
          - "  • Monitors: 3 (scheduled across blade001-blade005)"
          - "  • Managers: 2"
          - "  • OSDs: 5 (NVMe partition 3 on all nodes)"
          - "  • Dashboard: Enabled on port 8443"
          - ""
          - "📊 Storage Configuration:"
          - "  • Pool: replicapool (3 replicas)"
          - "  • Features: layering"
          - "  • Reclaim Policy: Delete"
          - "  • Volume Expansion: Enabled"
          - "  • Default StorageClass: ceph-rbd"
          - ""
          - "🔧 Management Commands:"
          - "  • Cluster status: kubectl get cephcluster -n rook-ceph"
          - "  • Ceph status: kubectl exec -n rook-ceph deployment/rook-ceph-tools -- ceph status"
          - "  • Dashboard: kubectl get svc -n rook-ceph rook-ceph-mgr-dashboard"
    - name: Final validation of Ceph cluster
      shell: |-
        set -euxo pipefail
        export KUBECONFIG=/etc/rancher/k3s/k3s.yaml
        echo "🔍 Performing final validation..."
        # Check Rook operator status
        OPERATOR_READY=$(kubectl get deployment rook-ceph-operator -n rook-ceph -o jsonpath='{.status.readyReplicas}')
        echo "✅ Rook operator ready replicas: ${OPERATOR_READY:-0}"
        # Check CephCluster status
        CLUSTER_PHASE=$(kubectl get cephcluster rook-ceph -n rook-ceph -o jsonpath='{.status.phase}' 2>/dev/null || echo "Unknown")
        echo "✅ CephCluster phase: ${CLUSTER_PHASE}"
        # Check StorageClass
        DEFAULT_SC=$(kubectl get storageclass -o jsonpath='{.items[?(@.metadata.annotations.storageclass\.kubernetes\.io/is-default-class=="true")].metadata.name}')
        echo "✅ Default StorageClass: ${DEFAULT_SC}"
        # Check CSI driver pods
        CSI_PODS=$(kubectl get pods -n rook-ceph -l app=csi-rbdplugin --no-headers 2>/dev/null | wc -l)
        echo "✅ CSI RBD driver pods: ${CSI_PODS}"
        # Check OSDs
        OSD_PODS=$(kubectl get pods -n rook-ceph -l app=rook-ceph-osd --no-headers 2>/dev/null | wc -l)
        echo "✅ Ceph OSD pods: ${OSD_PODS}"
        echo "🎉 Native Ceph cluster validation completed!"
      ignore_errors: true
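
    # The summary above points at `deployment/rook-ceph-tools`, which nothing in this
    # playbook creates. A minimal sketch of an extra task that installs the toolbox,
    # assuming the upstream example manifest still lives at deploy/examples/toolbox.yaml
    # in the Rook repository:
    #
    # - name: Deploy Rook toolbox (optional)
    #   shell: |-
    #     set -euxo pipefail
    #     export KUBECONFIG=/etc/rancher/k3s/k3s.yaml
    #     kubectl apply -f https://raw.githubusercontent.com/rook/rook/master/deploy/examples/toolbox.yaml
    #     kubectl wait --for=condition=Available --timeout=5m -n rook-ceph deployment/rook-ceph-tools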