Created
November 26, 2025 00:36
-
-
Save agh/3bd1ff0a458a2ae25450017d50e99459 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env bash
#
# check-nvme-health.sh
#
# Health check for NVMe drives only. Data comes from nvme-cli, results are
# printed as a table, and the error-log counter of each device is compared
# with the value saved by the previous run.

set -u -o pipefail

# Colour escapes are stored as literal backslash text; they only become real
# escape sequences when printed through `echo -e` or printf's %b.
NC='\033[0m'
GREEN='\033[0;32m'
YELLOW='\033[0;33m'
RED='\033[0;31m'

# Directory holding the per-device error-log counters between runs.
STATE_DIR='/var/lib/nvme-health'

# Worst result seen so far, also used as the exit code:
# 0 = all OK, 1 = WARN, 2 = FAIL
STATUS=0
#######################################
# Print a message on stdout, followed by a newline.
# Backslash escapes in the message (e.g. the \033 colour codes) are expanded.
# Arguments: the message; multiple arguments are joined with single spaces.
# Outputs:   the rendered message on stdout.
#######################################
log() {
  # printf is used instead of `echo -e` so that a message that happens to
  # look like an echo option ("-n", "-e", ...) is printed rather than eaten.
  printf '%b\n' "$*"
}
#######################################
# Print the report banner: an empty line, then the title framed by two
# rules of '=' characters.
# Outputs: four lines on stdout.
#######################################
header_block() {
  local rule='============================================================'
  printf '\n%s\n%s\n%s\n' "$rule" 'NVMe HEALTH (via nvme-cli)' "$rule"
}
#######################################
# Make sure the `nvme` command is available.
# Globals: RED, NC (read)
# Outputs: a [FAIL] message via log when the command is missing.
# Returns: 0 when `nvme` is found; otherwise exits the script with status 2.
#######################################
need_nvme() {
  command -v nvme >/dev/null 2>&1 && return 0

  log "${RED}[FAIL] nvme-cli not installed (command 'nvme' not found)${NC}"
  exit 2
}
#######################################
# Pull one field from `nvme smart-log` text output.
# Each line is split on ':'; the text before the first colon is the label
# and the text between the first and second colon is the value.
# Arguments:
#   $1 - full smart-log text
#   $2 - field label to look up (e.g. media_errors)
# Outputs: the value of the first line whose label equals $2, with
#          surrounding whitespace removed; nothing if no line matches.
#######################################
parse_smart_field() {
  local smart="$1"
  local key="$2"
  printf '%s\n' "$smart" | awk -F: -v key="$key" '
    NF >= 2 {
      label = $1
      gsub(/^[ \t]+|[ \t]+$/, "", label)
      # Compare as strings and require the whole label to match, so that
      # "media_errors" does not also select e.g. "non_media_errors" and
      # regex metacharacters in the key have no special meaning.
      if ((label "") == (key "")) {
        value = $2
        gsub(/^[ \t]+|[ \t]+$/, "", value)
        print value
        exit
      }
    }
  '
}
#######################################
# Extract the temperature in Celsius from `nvme smart-log` text output.
# The composite "temperature" line is preferred; if it carries no Celsius
# reading, the first "Temperature Sensor N" line that has one is used.
# Both "40 °C (313 K, 104 °F)" and the plain "40 C" form are understood;
# Kelvin and Fahrenheit figures are ignored, and a leading minus sign is kept.
# Arguments: $1 - full smart-log text
# Outputs:   the reading as an integer on stdout, or nothing when no
#            Celsius reading is present.
#######################################
parse_temp_celsius() {
  local smart="$1"
  printf '%s\n' "$smart" | awk '
    # Return the Celsius figure found in s, or "" when s has none.
    function celsius(s,    tok) {
      if (!match(s, /-?[0-9]+[ \t]*(°)?C/)) {
        return ""
      }
      tok = substr(s, RSTART, RLENGTH)
      sub(/[^0-9-].*$/, "", tok)   # keep only the leading signed integer
      return tok
    }

    # Split every line at its first colon into label and value.
    {
      sep = index($0, ":")
      if (sep == 0) {
        next
      }
      label = tolower(substr($0, 1, sep - 1))
      value = substr($0, sep + 1)
    }

    # Composite temperature: first choice, stop as soon as it is found.
    label ~ /^[ \t]*temperature[ \t]*$/ {
      reading = celsius(value)
      if (reading != "") {
        print reading
        found = 1
        exit
      }
    }

    # Remember the first usable sensor reading as a fallback.
    label ~ /temperature sensor/ && sensor == "" {
      sensor = celsius(value)
    }

    END {
      if (!found && sensor != "") {
        print sensor
      }
    }
  '
}
#######################################
# Print the two header lines of the device table: the column titles and a
# row of dashes underneath them.
# Column order: STATUS DEV MODEL SERIAL FW CRIT MEDIA ERRLOG DELTA USED TEMP
# Outputs: two lines on stdout.
#######################################
print_table_header() {
  local -a titles=(
    "STATUS" "DEVICE" "MODEL" "SERIAL" "FW"
    "CWRN" "MEDIA" "ERRLOG" "DELTA" "USED" "TEMP"
  )
  local -a underline=(
    "------" "----------" "----------------------------"
    "--------------------" "--------"
    "----" "-------" "--------" "--------" "-----" "-----"
  )

  # printf re-applies the format until all arguments are consumed, so one
  # call with 22 arguments emits both lines.
  printf '%-6s %-10s %-28s %-20s %-10s %4s %7s %8s %8s %5s %5s\n' \
    "${titles[@]}" "${underline[@]}"
}
# Print $1 reduced to its decimal digits as a base-10 integer (0 if none).
# Drops thousands separators ("1,234"), "%" suffixes and stray whitespace,
# and avoids octal interpretation of leading zeros.
_nvme_uint() {
  local digits=${1//[^0-9]/}
  printf '%s\n' "$(( 10#${digits:-0} ))"
}

# Print the value of one `nvme id-ctrl` field ($2, e.g. mn/sn/fr) from the
# id-ctrl text in $1. The label must match exactly; the value is everything
# after the first colon, trimmed.
_nvme_id_field() {
  local text=$1 name=$2
  printf '%s\n' "$text" | awk -v name="$name" '
    {
      sep = index($0, ":")
      if (sep == 0) {
        next
      }
      label = substr($0, 1, sep - 1)
      gsub(/^[ \t]+|[ \t]+$/, "", label)
      if ((label "") != (name "")) {
        next
      }
      value = substr($0, sep + 1)
      gsub(/^[ \t]+|[ \t]+$/, "", value)
      print value
      exit
    }
  '
}

# Print one table row. $1 = colour escape, $2 = status label, remaining ten
# arguments = DEVICE MODEL SERIAL FW CWRN MEDIA ERRLOG DELTA USED TEMP.
# The colour codes go through %b so they are rendered, not printed literally.
_nvme_print_row() {
  local color=$1 label=$2
  shift 2
  printf '%b%-6s%b %-10s %-28s %-20s %-10s %4s %7s %8s %8s %5s %5s\n' \
    "$color" "$label" "$NC" "$@"
}

#######################################
# Inspect every NVMe controller reported by `nvme list` and print one table
# row per controller with its status (OK / WARN / FAIL).
#
# FAIL: critical_warning != 0, media_errors > 0, percentage_used >= 100,
#       or temperature >= 80.
# WARN: percentage_used >= 90, temperature >= 70, error-log counter grew by
#       more than 100 since the previous run, or id-ctrl / smart-log could
#       not be read.
#
# Globals:
#   STATE_DIR (read)  - directory with one "<device>.errlog" counter file
#                       per device
#   STATUS (written)  - raised to 1 on WARN and 2 on FAIL, never lowered
#   RED, YELLOW, GREEN, NC (read)
# Outputs: banner, table header and rows on stdout.
#######################################
check_nvme_devices() {
  header_block

  # Without a writable state directory the counters cannot be saved; say so
  # once instead of failing on every device.
  local persist=1
  if ! mkdir -p "$STATE_DIR" 2>/dev/null || [[ ! -w "$STATE_DIR" ]]; then
    persist=0
    log "${YELLOW}[WARN] Cannot write to ${STATE_DIR}; error-log counters will not be saved${NC}"
  fi

  local -a nvme_nodes=()
  mapfile -t nvme_nodes < <(nvme list 2>/dev/null | awk '/^\/dev\/nvme/ {print $1}')
  if (( ${#nvme_nodes[@]} == 0 )); then
    log "${YELLOW}[WARN] No NVMe devices found by 'nvme list'${NC}"
    return
  fi

  print_table_header

  local -A seen=()
  local node ctrl dev id_out smart_out model serial fwrev
  local critical_warning cw_dec media_errors num_err_log_entries
  local percentage_used temp_c state_file prev_errlog delta_errlog
  local bad warn label color

  for node in "${nvme_nodes[@]}"; do
    # /dev/nvme0n1 -> /dev/nvme0. The shortest-suffix form (%) is required:
    # %%n* would cut at the first "n" of "nvme" and leave "/dev/".
    ctrl=${node%n*}
    [[ -c "$ctrl" ]] || ctrl=$node

    # Several namespaces can share a controller; report each one once.
    if [[ -n "${seen[$ctrl]:-}" ]]; then
      continue
    fi
    seen[$ctrl]=1
    dev=${ctrl##*/}

    id_out=$(nvme id-ctrl "$ctrl" 2>/dev/null)
    if [[ -z "$id_out" ]]; then
      _nvme_print_row "$YELLOW" "WARN" "$dev" "N/A" "N/A" "N/A" \
        "-" "-" "-" "-" "-" "-"
      (( STATUS < 1 )) && STATUS=1
      continue
    fi

    model=$(_nvme_id_field "$id_out" "mn")
    serial=$(_nvme_id_field "$id_out" "sn")
    fwrev=$(_nvme_id_field "$id_out" "fr")

    smart_out=$(nvme smart-log "$ctrl" 2>/dev/null)
    if [[ -z "$smart_out" ]]; then
      _nvme_print_row "$YELLOW" "WARN" "$dev" \
        "${model:0:28}" "${serial:0:20}" "${fwrev:0:10}" \
        "-" "-" "-" "-" "-" "-"
      (( STATUS < 1 )) && STATUS=1
      continue
    fi

    # critical_warning may be printed as hex (0x..) or decimal.
    critical_warning=$(parse_smart_field "$smart_out" "critical_warning")
    critical_warning=${critical_warning//[[:space:]]/}
    if [[ "$critical_warning" =~ ^0[xX][0-9A-Fa-f]+$ ]]; then
      cw_dec=$(( 16#${critical_warning:2} ))
    else
      cw_dec=$(_nvme_uint "$critical_warning")
    fi

    # Counters are normalised to plain integers: a value such as "1,234"
    # would otherwise be read by shell arithmetic as the comma operator.
    media_errors=$(_nvme_uint "$(parse_smart_field "$smart_out" "media_errors")")
    num_err_log_entries=$(_nvme_uint "$(parse_smart_field "$smart_out" "num_err_log_entries")")
    percentage_used=$(_nvme_uint "$(parse_smart_field "$smart_out" "percentage_used")")

    # Accept only a (signed) integer temperature; anything else means N/A.
    temp_c=$(parse_temp_celsius "$smart_out")
    if [[ "$temp_c" =~ ^(-?)0*([0-9]+)$ ]]; then
      temp_c="${BASH_REMATCH[1]}${BASH_REMATCH[2]}"
    else
      temp_c=""
    fi

    # Delta tracking for the error log. With no saved counter (first run)
    # the current value is the baseline, so the delta is 0 rather than the
    # lifetime total.
    state_file="${STATE_DIR}/${dev}.errlog"
    prev_errlog=$num_err_log_entries
    if [[ -r "$state_file" ]]; then
      prev_errlog=$(_nvme_uint "$(<"$state_file")")
    fi
    delta_errlog=$(( num_err_log_entries - prev_errlog ))
    if (( delta_errlog < 0 )); then
      delta_errlog=0
    fi
    if (( persist )); then
      printf '%s\n' "$num_err_log_entries" >"$state_file"
    fi

    bad=0
    warn=0

    # Critical warnings and media errors
    if (( cw_dec > 0 || media_errors > 0 )); then
      bad=1
    fi

    # Wear-out
    if (( percentage_used >= 100 )); then
      bad=1
    elif (( percentage_used >= 90 )); then
      warn=1
    fi

    # Temperature thresholds
    if [[ -n "$temp_c" ]]; then
      if (( temp_c >= 80 )); then
        bad=1
      elif (( temp_c >= 70 )); then
        warn=1
      fi
    fi

    # Error log delta threshold
    if (( delta_errlog > 100 )); then
      warn=1
    fi

    # Build status label + color
    if (( bad )); then
      label="FAIL"
      color="$RED"
      STATUS=2
    elif (( warn )); then
      label="WARN"
      color="$YELLOW"
      (( STATUS < 1 )) && STATUS=1
    else
      label="OK"
      color="$GREEN"
    fi

    # Fields are truncated to their column width to keep the table sane.
    _nvme_print_row "$color" "$label" "$dev" \
      "${model:0:28}" "${serial:0:20}" "${fwrev:0:10}" \
      "$cw_dec" "$media_errors" "$num_err_log_entries" "$delta_errlog" \
      "${percentage_used}%" "${temp_c:-N/A}"
  done
}
#######################################
# Run the health check and finish with a one-line summary.
# Globals: STATUS (read), RED, YELLOW, GREEN, NC (read)
# Outputs: the device report, an empty line, then the summary for
#          STATUS 0, 1 or 2 (no summary for any other value).
# Returns: never; exits the script with STATUS as the exit code.
#######################################
main() {
  need_nvme
  check_nvme_devices
  echo

  local summary=""
  if [[ "$STATUS" == "0" ]]; then
    summary="${GREEN}[SUMMARY] All NVMe devices OK${NC}"
  elif [[ "$STATUS" == "1" ]]; then
    summary="${YELLOW}[SUMMARY] One or more NVMe devices reported warnings (exit 1)${NC}"
  elif [[ "$STATUS" == "2" ]]; then
    summary="${RED}[SUMMARY] One or more NVMe devices reported failures (exit 2)${NC}"
  fi

  if [[ -n "$summary" ]]; then
    log "$summary"
  fi

  exit "$STATUS"
}
# Script entry point: run main with all command-line arguments forwarded unchanged.
| main "$@" |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment