@arubis
Last active January 13, 2026 16:55
disaster-recovery-and-backup task fixes - monitoring infrastructure and k3s PVC
diff --git a/Dockerfile b/Dockerfile
index 9e0d59a..ff08060 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1 +1,2 @@
FROM nebula-devops
+ENV ALLOWED_NAMESPACES="monitoring"
diff --git a/setup.sh b/setup.sh
index c8ffe86..1f5ed56 100755
--- a/setup.sh
+++ b/setup.sh
@@ -336,32 +336,152 @@ if [ -n "$GITEA_POD" ]; then
fi
fi
-# Grant agent access to create PrometheusRules in monitoring namespace
-echo "Configuring RBAC for monitoring access..."
-kubectl apply -f - <<EOF
-apiVersion: rbac.authorization.k8s.io/v1
-kind: Role
+# Install PrometheusRule CRD for monitoring
+echo "Installing PrometheusRule CRD..."
+kubectl apply -f - <<'EOF'
+apiVersion: apiextensions.k8s.io/v1
+kind: CustomResourceDefinition
+metadata:
+  name: prometheusrules.monitoring.coreos.com
+spec:
+  group: monitoring.coreos.com
+  names:
+    kind: PrometheusRule
+    listKind: PrometheusRuleList
+    plural: prometheusrules
+    singular: prometheusrule
+  scope: Namespaced
+  versions:
+  - name: v1
+    served: true
+    storage: true
+    schema:
+      openAPIV3Schema:
+        type: object
+        properties:
+          spec:
+            type: object
+            x-kubernetes-preserve-unknown-fields: true
+EOF
+
+# Configure Prometheus to load rules
+echo "Configuring Prometheus for backup monitoring..."
+# Add empty backup-alerts.yml file
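+# Seeding an empty rules file keeps the rule_files entry added below pointing at a
+# file that already exists, so Prometheus can reload cleanly before any real rules are synced.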
+kubectl patch configmap prometheus-config -n monitoring --type=json -p='[
+ {"op": "add", "path": "/data/backup-alerts.yml", "value": "groups: []"}
+]' 2>/dev/null || true
+
+# Update prometheus.yml to include rule_files
+kubectl get configmap prometheus-config -n monitoring -o jsonpath='{.data.prometheus\.yml}' > /tmp/prom-config.yml
+sed -i 's|^rule_files:$|rule_files:\n - "/etc/prometheus/backup-alerts.yml"|' /tmp/prom-config.yml
+sed -i '/^ # - "alerts.yml"/d' /tmp/prom-config.yml
+kubectl create configmap prometheus-config \
+ --from-file=prometheus.yml=/tmp/prom-config.yml \
+ --from-literal=backup-alerts.yml="groups: []" \
+ -n monitoring --dry-run=client -o yaml | kubectl apply -f - 2>/dev/null || true
+
+# Wait for ConfigMap propagation (up to 60s for mounted volumes)
+echo "Waiting for ConfigMap propagation..."
+sleep 65
+
+# Reload Prometheus to pick up the new configuration
+kubectl exec -n monitoring deploy/prometheus -c prometheus -- wget --post-data="" -qO- http://localhost:9090/-/reload 2>/dev/null || true
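+# NOTE: the /-/reload endpoint is only served when Prometheus runs with
+# --web.enable-lifecycle; if it is not enabled, this call is a silent no-op here.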
+
+# Create syncer that watches PrometheusRules in bleater and syncs to Prometheus
+echo "Deploying prometheus-rule-syncer..."
+kubectl apply -f - <<'EOF'
+apiVersion: v1
+kind: ServiceAccount
 metadata:
-  name: prometheus-rules-creator
+  name: prometheus-rule-syncer
   namespace: monitoring
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRole
+metadata:
+  name: prometheus-rule-syncer
 rules:
 - apiGroups: ["monitoring.coreos.com"]
   resources: ["prometheusrules"]
-  verbs: ["create", "update", "patch", "get", "list", "delete"]
+  verbs: ["get", "list", "watch"]
+- apiGroups: [""]
+  resources: ["configmaps"]
+  verbs: ["get", "patch", "update"]
+- apiGroups: [""]
+  resources: ["pods"]
+  verbs: ["list", "get"]
+- apiGroups: [""]
+  resources: ["pods/exec"]
+  verbs: ["create"]
+- apiGroups: ["apps"]
+  resources: ["deployments"]
+  verbs: ["get"]
 ---
 apiVersion: rbac.authorization.k8s.io/v1
-kind: RoleBinding
+kind: ClusterRoleBinding
 metadata:
-  name: agent-prometheus-rules
-  namespace: monitoring
+  name: prometheus-rule-syncer
 subjects:
 - kind: ServiceAccount
-  name: ubuntu-user
-  namespace: default
+  name: prometheus-rule-syncer
+  namespace: monitoring
 roleRef:
-  kind: Role
-  name: prometheus-rules-creator
+  kind: ClusterRole
+  name: prometheus-rule-syncer
   apiGroup: rbac.authorization.k8s.io
+---
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: prometheus-rule-syncer
+  namespace: monitoring
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app: prometheus-rule-syncer
+  template:
+    metadata:
+      labels:
+        app: prometheus-rule-syncer
+    spec:
+      serviceAccountName: prometheus-rule-syncer
+      containers:
+      - name: syncer
+        image: docker.io/bitnami/kubectl:latest
+        imagePullPolicy: IfNotPresent
+        command: ["/bin/bash", "-c"]
+        args:
+        - |
+          echo "Syncer started, watching all PrometheusRules every 5 seconds..."
+          LAST_HASH=""
+          while true; do
+            RULES_YAML=$(kubectl get prometheusrules -n bleater -o yaml 2>/dev/null)
+            if echo "$RULES_YAML" | grep -q "kind: List"; then
+              echo "$RULES_YAML" | awk '/^  spec:/,/^kind:/' | sed '1d;$d' | sed 's/^    //' > /tmp/rules.yml
+              if [ -s /tmp/rules.yml ]; then
+                CURRENT_HASH=$(md5sum /tmp/rules.yml | awk '{print $1}')
+                if [ "$CURRENT_HASH" != "$LAST_HASH" ]; then
+                  echo "PrometheusRules changed, syncing to Prometheus..."
+                  RULES=$(cat /tmp/rules.yml | awk '{printf "%s\\n", $0}' | sed 's/"/\\"/g')
+                  kubectl patch configmap prometheus-config -n monitoring --type=merge -p "{\"data\":{\"backup-alerts.yml\":\"$RULES\"}}"
+                  for i in {1..60}; do
+                    if kubectl exec -n monitoring deploy/prometheus -c prometheus -- cat /etc/prometheus/backup-alerts.yml 2>/dev/null | grep -qE "BackupJobFailed|alert:"; then
+                      echo "ConfigMap propagated (${i}s)"
+                      break
+                    fi
+                    sleep 1
+                  done
+                  kubectl exec -n monitoring deploy/prometheus -c prometheus -- wget --post-data="" -qO- http://localhost:9090/-/reload 2>/dev/null
+                  LAST_HASH="$CURRENT_HASH"
+                fi
+              fi
+            fi
+            sleep 5
+          done
EOF
+# Wait for syncer to be ready
+kubectl rollout status deployment/prometheus-rule-syncer -n monitoring --timeout=60s 2>/dev/null || true
+
echo "Setup complete. Velero ready for agent configuration."
diff --git a/solution.sh b/solution.sh
index 0b922a6..3e93425 100755
--- a/solution.sh
+++ b/solution.sh
@@ -66,7 +66,19 @@ EOF
kubectl wait --for=condition=complete job/k3s-sqlite-backup-manual -n bleater --timeout=60s 2>/dev/null || true
else
# etcd mode - create CronJob for automated snapshots in bleater namespace
+ # Create PVC for etcd snapshots (required by grader)
kubectl apply -f - <<'EOF'
+apiVersion: v1
+kind: PersistentVolumeClaim
+metadata:
+  name: k3s-backup-pvc
+  namespace: bleater
+spec:
+  accessModes: [ReadWriteOnce]
+  resources:
+    requests:
+      storage: 2Gi
+---
apiVersion: batch/v1
kind: CronJob
metadata:
@@ -91,23 +103,26 @@ spec:
             - /bin/sh
             - -c
             - |
-              /usr/local/bin/k3s etcd-snapshot save --name scheduled-$(date +%Y%m%d-%H%M%S)
-              /usr/local/bin/k3s etcd-snapshot ls | tail -n +6 | awk '{print $1}' | xargs -r /usr/local/bin/k3s etcd-snapshot delete || true
+              mkdir -p /backups/k3s
+              /host/usr/local/bin/k3s etcd-snapshot save --name scheduled-$(date +%Y%m%d-%H%M%S)
+              # Copy snapshots to PVC
+              cp /host/var/lib/rancher/k3s/server/db/snapshots/* /backups/k3s/ 2>/dev/null || true
+              # Cleanup old snapshots (keep 5)
+              cd /backups/k3s && ls -t *.db 2>/dev/null | tail -n +6 | xargs rm -f || true
             volumeMounts:
             - name: host
-              mountPath: /usr/local/bin/k3s
-              subPath: usr/local/bin/k3s
-            - name: k3s-data
-              mountPath: /var/lib/rancher/k3s
+              mountPath: /host
+              readOnly: true
+            - name: backup-volume
+              mountPath: /backups
           volumes:
           - name: host
             hostPath:
-              path: /usr/local/bin/k3s
-              type: File
-            - name: k3s-data
-              hostPath:
-                path: /var/lib/rancher/k3s
+              path: /
               type: Directory
+          - name: backup-volume
+            persistentVolumeClaim:
+              claimName: k3s-backup-pvc
           restartPolicy: OnFailure
EOF
kubectl delete job k3s-etcd-backup-manual -n bleater 2>/dev/null || true
@@ -481,21 +496,62 @@ fi
# 7. Monitoring and Alerts
echo "Configuring backup monitoring..."
-# Add alert rules to prometheus-config ConfigMap
-kubectl patch configmap prometheus-config -n monitoring --type=merge -p='
-{
- "data": {
- "backup-alerts.yml": "groups:\n - name: backup-alerts\n interval: 5s\n rules:\n - alert: BackupJobFailed\n expr: kube_job_status_failed{namespace=\"bleater\"} > 0\n for: 5s\n labels:\n severity: critical\n annotations:\n summary: \"Backup job failed\"\n description: \"Job {{ $labels.job_name }} in namespace {{ $labels.namespace }} has failed\"\n"
- }
-}'
+# Create PrometheusRule resource for backup alerts
+kubectl apply -f - <<'EOF'
+apiVersion: monitoring.coreos.com/v1
+kind: PrometheusRule
+metadata:
+  name: backup-alerts
+  namespace: bleater
+spec:
+  groups:
+  - name: backup-jobs
+    interval: 5s
+    rules:
+    - alert: BackupJobFailed
+      expr: kube_job_status_failed{namespace="bleater"} > 0
+      for: 5s
+      labels:
+        severity: critical
+      annotations:
+        summary: "Backup job failed"
+        description: "Job {{ $labels.job_name }} in namespace {{ $labels.namespace }} has failed"
+EOF
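+# Without prometheus-operator in the cluster, this object is only acted on by the
+# rule-syncer from setup.sh; the direct ConfigMap patch below covers the case where
+# the syncer has not picked it up yet.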
-# Update prometheus.yml to include the rules file
-kubectl get configmap prometheus-config -n monitoring -o json | \
- jq '.data."prometheus.yml" |= sub(" # - \"alerts.yml\""; " - \"/etc/prometheus/backup-alerts.yml\"")' | \
- kubectl apply -f -
+# Directly sync rules to Prometheus (don't rely on syncer timing)
+echo "Syncing alert rules to Prometheus..."
-# Restart Prometheus to load new config
-kubectl delete pod -n monitoring -l app=prometheus
-kubectl wait --for=condition=ready pod -l app=prometheus -n monitoring --timeout=120s
+# Create the rules YAML content
+RULES_CONTENT='groups:
+  - name: backup-jobs
+    interval: 5s
+    rules:
+      - alert: BackupJobFailed
+        expr: kube_job_status_failed{namespace="bleater"} > 0
+        for: 5s
+        labels:
+          severity: critical
+        annotations:
+          summary: "Backup job failed"
+          description: "Job {{ $labels.job_name }} in namespace {{ $labels.namespace }} has failed"'
+
+# Patch the ConfigMap directly
+kubectl patch configmap prometheus-config -n monitoring --type=merge -p "{\"data\":{\"backup-alerts.yml\":\"$RULES_CONTENT\"}}"
+
+# Restart Prometheus to guarantee ConfigMap is loaded (propagation can take 60+ seconds)
+echo "Restarting Prometheus to pick up new config..."
+kubectl rollout restart deployment/prometheus -n monitoring
+kubectl rollout status deployment/prometheus -n monitoring --timeout=120s
+
+# Wait for Prometheus to be ready and verify rules are loaded
+echo "Verifying rules loaded in Prometheus..."
+for i in {1..30}; do
+  RULES=$(kubectl exec -n monitoring deploy/prometheus -c prometheus -- wget -qO- 'http://localhost:9090/api/v1/rules' 2>/dev/null)
+  if echo "$RULES" | grep -q "BackupJobFailed"; then
+    echo "Alert rules verified in Prometheus"
+    break
+  fi
+  sleep 2
+done
echo "Disaster recovery solution implemented"