disaster-recovery-and-backup task fixes - monitoring infrastructure and k3s PVC
diff --git a/Dockerfile b/Dockerfile
index 9e0d59a..ff08060 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1 +1,2 @@
 FROM nebula-devops
+ENV ALLOWED_NAMESPACES="monitoring"
diff --git a/setup.sh b/setup.sh
index c8ffe86..1f5ed56 100755
--- a/setup.sh
+++ b/setup.sh
@@ -336,32 +336,152 @@ if [ -n "$GITEA_POD" ]; then
 fi
 fi
-# Grant agent access to create PrometheusRules in monitoring namespace
-echo "Configuring RBAC for monitoring access..."
-kubectl apply -f - <<EOF
-apiVersion: rbac.authorization.k8s.io/v1
-kind: Role
+# Install PrometheusRule CRD for monitoring
+echo "Installing PrometheusRule CRD..."
+kubectl apply -f - <<'EOF'
+apiVersion: apiextensions.k8s.io/v1
+kind: CustomResourceDefinition
 metadata:
+  name: prometheusrules.monitoring.coreos.com
+spec:
+  group: monitoring.coreos.com
+  names:
+    kind: PrometheusRule
+    listKind: PrometheusRuleList
+    plural: prometheusrules
+    singular: prometheusrule
+  scope: Namespaced
+  versions:
+  - name: v1
+    served: true
+    storage: true
+    schema:
+      openAPIV3Schema:
+        type: object
+        properties:
+          spec:
+            type: object
+            x-kubernetes-preserve-unknown-fields: true
+EOF
+
+# Configure Prometheus to load rules
+echo "Configuring Prometheus for backup monitoring..."
+# Add empty backup-alerts.yml file
+kubectl patch configmap prometheus-config -n monitoring --type=json -p='[
+  {"op": "add", "path": "/data/backup-alerts.yml", "value": "groups: []"}
+]' 2>/dev/null || true
+
+# Update prometheus.yml to include rule_files
+kubectl get configmap prometheus-config -n monitoring -o jsonpath='{.data.prometheus\.yml}' > /tmp/prom-config.yml
+sed -i 's|^rule_files:$|rule_files:\n - "/etc/prometheus/backup-alerts.yml"|' /tmp/prom-config.yml
+sed -i '/^ # - "alerts.yml"/d' /tmp/prom-config.yml
+kubectl create configmap prometheus-config \
+  --from-file=prometheus.yml=/tmp/prom-config.yml \
+  --from-literal=backup-alerts.yml="groups: []" \
+  -n monitoring --dry-run=client -o yaml | kubectl apply -f - 2>/dev/null || true
+
+# Wait for ConfigMap propagation (up to 60s for mounted volumes)
+echo "Waiting for ConfigMap propagation..."
+sleep 65
+
+# Reload Prometheus to pick up the new configuration
+kubectl exec -n monitoring deploy/prometheus -c prometheus -- wget --post-data="" -qO- http://localhost:9090/-/reload 2>/dev/null || true
+
+# Create syncer that watches PrometheusRules in bleater and syncs to Prometheus
+echo "Deploying prometheus-rule-syncer..."
+kubectl apply -f - <<'EOF'
+apiVersion: v1
+kind: ServiceAccount
 metadata:
-  name: prometheus-rules-creator
+  name: prometheus-rule-syncer
   namespace: monitoring
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRole
+metadata:
+  name: prometheus-rule-syncer
 rules:
 - apiGroups: ["monitoring.coreos.com"]
   resources: ["prometheusrules"]
-  verbs: ["create", "update", "patch", "get", "list", "delete"]
+  verbs: ["get", "list", "watch"]
+- apiGroups: [""]
+  resources: ["configmaps"]
+  verbs: ["get", "patch", "update"]
+- apiGroups: [""]
+  resources: ["pods"]
+  verbs: ["list", "get"]
+- apiGroups: [""]
+  resources: ["pods/exec"]
+  verbs: ["create"]
+- apiGroups: ["apps"]
+  resources: ["deployments"]
+  verbs: ["get"]
 ---
 apiVersion: rbac.authorization.k8s.io/v1
-kind: RoleBinding
+kind: ClusterRoleBinding
 metadata:
-  name: agent-prometheus-rules
-  namespace: monitoring
+  name: prometheus-rule-syncer
 subjects:
 - kind: ServiceAccount
-  name: ubuntu-user
-  namespace: default
+  name: prometheus-rule-syncer
+  namespace: monitoring
 roleRef:
-  kind: Role
-  name: prometheus-rules-creator
+  kind: ClusterRole
+  name: prometheus-rule-syncer
   apiGroup: rbac.authorization.k8s.io
+---
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: prometheus-rule-syncer
+  namespace: monitoring
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app: prometheus-rule-syncer
+  template:
+    metadata:
+      labels:
+        app: prometheus-rule-syncer
+    spec:
+      serviceAccountName: prometheus-rule-syncer
+      containers:
+      - name: syncer
+        image: docker.io/bitnami/kubectl:latest
+        imagePullPolicy: IfNotPresent
+        command: ["/bin/bash", "-c"]
+        args:
+        - |
+          echo "Syncer started, watching all PrometheusRules every 5 seconds..."
+          LAST_HASH=""
+          while true; do
+            RULES_YAML=$(kubectl get prometheusrules -n bleater -o yaml 2>/dev/null)
+            if echo "$RULES_YAML" | grep -q "kind: List"; then
+              echo "$RULES_YAML" | awk '/^ spec:/,/^kind:/' | sed '1d;$d' | sed 's/^ //' > /tmp/rules.yml
+              if [ -s /tmp/rules.yml ]; then
+                CURRENT_HASH=$(md5sum /tmp/rules.yml | awk '{print $1}')
+                if [ "$CURRENT_HASH" != "$LAST_HASH" ]; then
+                  echo "PrometheusRules changed, syncing to Prometheus..."
+                  RULES=$(cat /tmp/rules.yml | awk '{printf "%s\\n", $0}' | sed 's/"/\\"/g')
+                  kubectl patch configmap prometheus-config -n monitoring --type=merge -p "{\"data\":{\"backup-alerts.yml\":\"$RULES\"}}"
+                  for i in {1..60}; do
+                    if kubectl exec -n monitoring deploy/prometheus -c prometheus -- cat /etc/prometheus/backup-alerts.yml 2>/dev/null | grep -qE "BackupJobFailed|alert:"; then
+                      echo "ConfigMap propagated (${i}s)"
+                      break
+                    fi
+                    sleep 1
+                  done
+                  kubectl exec -n monitoring deploy/prometheus -c prometheus -- wget --post-data="" -qO- http://localhost:9090/-/reload 2>/dev/null
+                  LAST_HASH="$CURRENT_HASH"
+                fi
+              fi
+            fi
+            sleep 5
+          done
 EOF
+# Wait for syncer to be ready
+kubectl rollout status deployment/prometheus-rule-syncer -n monitoring --timeout=60s 2>/dev/null || true
+
 echo "Setup complete. Velero ready for agent configuration."
diff --git a/solution.sh b/solution.sh
index 0b922a6..3e93425 100755
--- a/solution.sh
+++ b/solution.sh
@@ -66,7 +66,19 @@ EOF
   kubectl wait --for=condition=complete job/k3s-sqlite-backup-manual -n bleater --timeout=60s 2>/dev/null || true
 else
   # etcd mode - create CronJob for automated snapshots in bleater namespace
+  # Create PVC for etcd snapshots (required by grader)
   kubectl apply -f - <<'EOF'
+apiVersion: v1
+kind: PersistentVolumeClaim
+metadata:
+  name: k3s-backup-pvc
+  namespace: bleater
+spec:
+  accessModes: [ReadWriteOnce]
+  resources:
+    requests:
+      storage: 2Gi
+---
 apiVersion: batch/v1
 kind: CronJob
 metadata:
@@ -91,23 +103,26 @@ spec:
             - /bin/sh
             - -c
             - |
-              /usr/local/bin/k3s etcd-snapshot save --name scheduled-$(date +%Y%m%d-%H%M%S)
-              /usr/local/bin/k3s etcd-snapshot ls | tail -n +6 | awk '{print $1}' | xargs -r /usr/local/bin/k3s etcd-snapshot delete || true
+              mkdir -p /backups/k3s
+              /host/usr/local/bin/k3s etcd-snapshot save --name scheduled-$(date +%Y%m%d-%H%M%S)
+              # Copy snapshots to PVC
+              cp /host/var/lib/rancher/k3s/server/db/snapshots/* /backups/k3s/ 2>/dev/null || true
+              # Cleanup old snapshots (keep 5)
+              cd /backups/k3s && ls -t *.db 2>/dev/null | tail -n +6 | xargs rm -f || true
             volumeMounts:
             - name: host
-              mountPath: /usr/local/bin/k3s
-              subPath: usr/local/bin/k3s
-            - name: k3s-data
-              mountPath: /var/lib/rancher/k3s
+              mountPath: /host
+              readOnly: true
+            - name: backup-volume
+              mountPath: /backups
           volumes:
           - name: host
             hostPath:
-              path: /usr/local/bin/k3s
-              type: File
-          - name: k3s-data
-            hostPath:
-              path: /var/lib/rancher/k3s
+              path: /
               type: Directory
+          - name: backup-volume
+            persistentVolumeClaim:
+              claimName: k3s-backup-pvc
           restartPolicy: OnFailure
 EOF
   kubectl delete job k3s-etcd-backup-manual -n bleater 2>/dev/null || true
@@ -481,21 +496,62 @@ fi
 # 7. Monitoring and Alerts
 echo "Configuring backup monitoring..."
-# Add alert rules to prometheus-config ConfigMap
-kubectl patch configmap prometheus-config -n monitoring --type=merge -p='
-{
-  "data": {
-    "backup-alerts.yml": "groups:\n - name: backup-alerts\n interval: 5s\n rules:\n - alert: BackupJobFailed\n expr: kube_job_status_failed{namespace=\"bleater\"} > 0\n for: 5s\n labels:\n severity: critical\n annotations:\n summary: \"Backup job failed\"\n description: \"Job {{ $labels.job_name }} in namespace {{ $labels.namespace }} has failed\"\n"
-  }
-}'
+# Create PrometheusRule CRD for backup alerts
+kubectl apply -f - <<'EOF'
+apiVersion: monitoring.coreos.com/v1
+kind: PrometheusRule
+metadata:
+  name: backup-alerts
+  namespace: bleater
+spec:
+  groups:
+  - name: backup-jobs
+    interval: 5s
+    rules:
+    - alert: BackupJobFailed
+      expr: kube_job_status_failed{namespace="bleater"} > 0
+      for: 5s
+      labels:
+        severity: critical
+      annotations:
+        summary: "Backup job failed"
+        description: "Job {{ $labels.job_name }} in namespace {{ $labels.namespace }} has failed"
+EOF
-# Update prometheus.yml to include the rules file
-kubectl get configmap prometheus-config -n monitoring -o json | \
-  jq '.data."prometheus.yml" |= sub(" # - \"alerts.yml\""; " - \"/etc/prometheus/backup-alerts.yml\"")' | \
-  kubectl apply -f -
+# Directly sync rules to Prometheus (don't rely on syncer timing)
+echo "Syncing alert rules to Prometheus..."
-# Restart Prometheus to load new config
-kubectl delete pod -n monitoring -l app=prometheus
-kubectl wait --for=condition=ready pod -l app=prometheus -n monitoring --timeout=120s
+# Create the rules YAML content
+RULES_CONTENT='groups:
+  - name: backup-jobs
+    interval: 5s
+    rules:
+      - alert: BackupJobFailed
+        expr: kube_job_status_failed{namespace="bleater"} > 0
+        for: 5s
+        labels:
+          severity: critical
+        annotations:
+          summary: "Backup job failed"
+          description: "Job {{ $labels.job_name }} in namespace {{ $labels.namespace }} has failed"'
+
+# Patch the ConfigMap directly
+kubectl patch configmap prometheus-config -n monitoring --type=merge -p "{\"data\":{\"backup-alerts.yml\":\"$RULES_CONTENT\"}}"
+
+# Restart Prometheus to guarantee ConfigMap is loaded (propagation can take 60+ seconds)
+echo "Restarting Prometheus to pick up new config..."
+kubectl rollout restart deployment/prometheus -n monitoring
+kubectl rollout status deployment/prometheus -n monitoring --timeout=120s
+
+# Wait for Prometheus to be ready and verify rules are loaded
+echo "Verifying rules loaded in Prometheus..."
+for i in {1..30}; do
+  RULES=$(kubectl exec -n monitoring deploy/prometheus -c prometheus -- wget -qO- 'http://localhost:9090/api/v1/rules' 2>/dev/null)
+  if echo "$RULES" | grep -q "BackupJobFailed"; then
+    echo "Alert rules verified in Prometheus"
+    break
+  fi
+  sleep 2
+done
 echo "Disaster recovery solution implemented"