Ceph Prometheus alert rules
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  labels:
    prometheus: k8s
    role: alert-rules
  name: prometheus-ceph-rules
  namespace: openshift-storage
spec:
  groups:
  # Recording rules that join Ceph OSD/disk placement (reported by the Rook
  # mgr exporter) with Kubernetes node status and node-exporter disk latency.
  - name: ceph.rules
    rules:
    - expr: |
        kube_node_status_condition{condition="Ready",job="kube-state-metrics",status="true"} * on (node) group_right() max(label_replace(ceph_disk_occupation{job="rook-ceph-mgr"},"node","$1","exported_instance","(.*)")) by (node)
      record: cluster:ceph_node_down:join_kube
    - expr: |
        (sum((max(label_replace(ceph_disk_occupation{job="rook-ceph-mgr"},"node","$1","exported_instance","(.*)")) by (node)) * on (node) group_right() (label_replace(max by(pod_ip,node) (kube_pod_info{pod=~"node-exporter.*"}), "instance", "$1:9100", "pod_ip", "(.*)")) * on (instance) group_right() (irate(node_disk_read_time_seconds_total[1m]) + irate(node_disk_write_time_seconds_total[1m]) / (irate(node_disk_reads_completed_total[1m]) + irate(node_disk_writes_completed_total[1m]))>0)))
      record: cluster:ceph_disk_latency:join_ceph_node_disk_irate1m
  # Aggregate counts and throughput totals recorded for telemetry reporting.
  - name: telemeter.rules
    rules:
    - expr: |
        count(ceph_osd_metadata{job="rook-ceph-mgr"})
      record: job:ceph_osd_metadata:count
    - expr: |
        count(kube_persistentvolume_info)
      record: job:kube_pv:count
    - expr: |
        sum(ceph_pool_rd{job="rook-ceph-mgr"} + ceph_pool_wr{job="rook-ceph-mgr"})
      record: job:ceph_pools_iops:total
    - expr: |
        sum(ceph_pool_rd_bytes{job="rook-ceph-mgr"} + ceph_pool_wr_bytes{job="rook-ceph-mgr"})
      record: job:ceph_pools_iops_bytes:total
    - expr: |
        count(count(ceph_mon_metadata{job="rook-ceph-mgr"} or ceph_osd_metadata{job="rook-ceph-mgr"} or ceph_rgw_metadata{job="rook-ceph-mgr"} or ceph_mds_metadata{job="rook-ceph-mgr"} or ceph_mgr_metadata{job="rook-ceph-mgr"}) by(ceph_version))
      record: job:ceph_versions_running:count
  - name: ceph-mgr-status
    rules:
    - alert: CephMgrIsAbsent
      annotations:
        description: Ceph Manager has disappeared from Prometheus target discovery.
        message: Storage metrics collector service is not available anymore.
        severity_level: warning
        storage_type: ceph
      expr: |
        absent(up{job="rook-ceph-mgr"} == 1)
      for: 5m
      labels:
        severity: warning
    - alert: CephMgrIsMissingReplicas
      annotations:
        description: Ceph Manager is missing replicas.
        message: Storage metrics collector service doesn't have the required number of replicas.
        severity_level: warning
        storage_type: ceph
      expr: |
        sum(up{job="rook-ceph-mgr"}) < 1
      for: 5m
      labels:
        severity: warning
  - name: ceph-mds-status
    rules:
    - alert: CephMdsMissingReplicas
      annotations:
        description: Minimum required replicas for the storage metadata service are not available. This might affect the working of the storage cluster.
        message: Insufficient replicas for storage metadata service.
        severity_level: warning
        storage_type: ceph
      expr: |
        sum(ceph_mds_metadata{job="rook-ceph-mgr"} == 1) < 2
      for: 5m
      labels:
        severity: warning
  - name: quorum-alert.rules
    rules:
    - alert: CephMonQuorumAtRisk
      annotations:
        description: Storage cluster quorum is low. Contact Support.
        message: Storage quorum at risk
        severity_level: error
        storage_type: ceph
      expr: |
        count(ceph_mon_quorum_status{job="rook-ceph-mgr"} == 1) <= ((count(ceph_mon_metadata{job="rook-ceph-mgr"}) % 2) + 1)
      for: 15m
      labels:
        severity: critical
    - alert: CephMonHighNumberOfLeaderChanges
      annotations:
        description: 'Ceph Monitor "{{ $labels.job }}": instance {{ $labels.instance }} has seen {{ $value }} leader changes per minute recently.'
        message: Storage Cluster has seen many leader changes recently.
        severity_level: warning
        storage_type: ceph
      expr: |
        rate(ceph_mon_num_elections{job="rook-ceph-mgr"}[5m]) * 60 > 0.95
      for: 5m
      labels:
        severity: warning
  - name: ceph-node-alert.rules
    rules:
    - alert: CephNodeDown
      annotations:
        description: Storage node {{ $labels.node }} went down. Please check the node immediately.
        message: Storage node {{ $labels.node }} went down
        severity_level: error
        storage_type: ceph
      expr: |
        cluster:ceph_node_down:join_kube == 0
      for: 30s
      labels:
        severity: critical
  - name: osd-alert.rules
    rules:
    - alert: CephOSDDiskNotResponding
      annotations:
        description: Disk device {{ $labels.device }} is not responding on host {{ $labels.host }}.
        message: Disk not responding
        severity_level: error
        storage_type: ceph
      expr: |
        label_replace((ceph_osd_in == 1 and ceph_osd_up == 0),"disk","$1","ceph_daemon","osd.(.*)") + on(ceph_daemon) group_left(host, device) label_replace(ceph_disk_occupation,"host","$1","exported_instance","(.*)")
      for: 1m
      labels:
        severity: critical
    - alert: CephOSDDiskUnavailable
      annotations:
        description: Disk device {{ $labels.device }} is not accessible on host {{ $labels.host }}.
        message: Disk not accessible
        severity_level: error
        storage_type: ceph
      expr: |
        label_replace((ceph_osd_in == 0 and ceph_osd_up == 0),"disk","$1","ceph_daemon","osd.(.*)") + on(ceph_daemon) group_left(host, device) label_replace(ceph_disk_occupation,"host","$1","exported_instance","(.*)")
      for: 1m
      labels:
        severity: critical
    - alert: CephDataRecoveryTakingTooLong
      annotations:
        description: Data recovery has been active for more than 2h. Contact Support.
        message: Data recovery is slow
        severity_level: warning
        storage_type: ceph
      expr: |
        ceph_pg_undersized > 0
      for: 2h
      labels:
        severity: warning
    - alert: CephPGRepairTakingTooLong
      annotations:
        description: Self-heal operations are taking too long. Contact Support.
        message: Self-heal problems detected
        severity_level: warning
        storage_type: ceph
      expr: |
        ceph_pg_inconsistent > 0
      for: 1h
      labels:
        severity: warning
  - name: cluster-state-alert.rules
    rules:
    - alert: CephClusterErrorState
      annotations:
        description: Storage cluster has been in an error state for more than 10m.
        message: Storage cluster is in error state
        severity_level: error
        storage_type: ceph
      expr: |
        ceph_health_status{job="rook-ceph-mgr"} > 1
      for: 10m
      labels:
        severity: critical
    - alert: CephClusterWarningState
      annotations:
        description: Storage cluster has been in a warning state for more than 10m.
        message: Storage cluster is in degraded state
        severity_level: warning
        storage_type: ceph
      expr: |
        ceph_health_status{job="rook-ceph-mgr"} == 1
      for: 10m
      labels:
        severity: warning
    - alert: CephOSDVersionMismatch
      annotations:
        description: There are {{ $value }} different versions of Ceph OSD components running.
        message: There are multiple versions of storage services running.
        severity_level: warning
        storage_type: ceph
      expr: |
        count(count(ceph_osd_metadata{job="rook-ceph-mgr"}) by (ceph_version)) > 1
      for: 10m
      labels:
        severity: warning
    - alert: CephMonVersionMismatch
      annotations:
        description: There are {{ $value }} different versions of Ceph Mon components running.
        message: There are multiple versions of storage services running.
        severity_level: warning
        storage_type: ceph
      expr: |
        count(count(ceph_mon_metadata{job="rook-ceph-mgr"}) by (ceph_version)) > 1
      for: 10m
      labels:
        severity: warning
  - name: cluster-utilization-alert.rules
    rules:
    - alert: CephClusterNearFull
      annotations:
        description: Storage cluster utilization has crossed 85%.
        message: Storage cluster is nearing full. Expansion is required.
        severity_level: warning
        storage_type: ceph
      expr: |
        sum(ceph_osd_stat_bytes_used) / sum(ceph_osd_stat_bytes) > 0.85
      for: 5m
      labels:
        severity: warning
    - alert: CephClusterCriticallyFull
      annotations:
        description: Storage cluster utilization has crossed 95%.
        message: Storage cluster is critically full and needs immediate expansion.
        severity_level: error
        storage_type: ceph
      expr: |
        sum(ceph_osd_stat_bytes_used) / sum(ceph_osd_stat_bytes) > 0.95
      for: 5m
      labels:
        severity: critical
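
The manifest above is a PrometheusRule custom resource, so it is loaded by the Prometheus Operator rather than by Prometheus directly, and the PromQL lives under spec.groups instead of in a plain rules file. Below is a minimal sanity-check sketch, not part of the gist: the local file name prometheus-ceph-rules.yaml and the PROM_URL endpoint are assumptions, promtool (shipped with Prometheus) is assumed to be on the PATH, and a real cluster will likely need auth headers on the API calls.

```python
"""Sanity-check the PrometheusRule manifest before and after applying it.

A minimal sketch, assuming:
  * the manifest is saved locally as prometheus-ceph-rules.yaml (assumed name),
  * promtool is on the PATH,
  * PROM_URL points at a reachable Prometheus endpoint (assumed URL; add auth
    headers if your deployment requires them),
  * third-party packages PyYAML and requests are installed.
"""
import subprocess
import tempfile

import requests
import yaml

PROM_URL = "http://prometheus-k8s.openshift-monitoring.svc:9090"  # assumption

# promtool understands plain rule files, not the PrometheusRule CRD wrapper,
# so extract spec.groups into a temporary file before checking the syntax.
with open("prometheus-ceph-rules.yaml") as f:
    manifest = yaml.safe_load(f)

tmp = tempfile.NamedTemporaryFile("w", suffix=".yaml", delete=False)
yaml.safe_dump({"groups": manifest["spec"]["groups"]}, tmp)
tmp.close()
subprocess.run(["promtool", "check", "rules", tmp.name], check=True)


def instant_query(expr: str):
    """Run an instant query against the Prometheus HTTP API."""
    resp = requests.get(f"{PROM_URL}/api/v1/query",
                        params={"query": expr}, timeout=10)
    resp.raise_for_status()
    return resp.json()["data"]["result"]


# Once the operator has picked up the rules, the recording rules from the
# telemeter.rules group should start returning series...
for series in ("job:ceph_osd_metadata:count", "job:ceph_pools_iops:total"):
    result = instant_query(series)
    print(series, "->", result[0]["value"][1] if result else "no data yet")

# ...and any firing Ceph alerts defined above appear on the ALERTS series.
for alert in instant_query('ALERTS{alertname=~"Ceph.*",alertstate="firing"}'):
    print("firing:", alert["metric"]["alertname"])
```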