Skip to content

Instantly share code, notes, and snippets.

@agh
Created November 26, 2025 00:36
Show Gist options
  • Select an option

  • Save agh/3bd1ff0a458a2ae25450017d50e99459 to your computer and use it in GitHub Desktop.

Select an option

Save agh/3bd1ff0a458a2ae25450017d50e99459 to your computer and use it in GitHub Desktop.
#!/usr/bin/env bash
#
# check-nvme-health.sh
#
# NVMe-only health check using nvme-cli with delta error-log tracking,
# tabular output.

# Abort on unset variables and let a pipeline fail when any stage fails.
# (errexit is not enabled.)
set -u -o pipefail

# ANSI colour escapes; stored unexpanded and interpreted at print time.
RED='\033[0;31m'
YELLOW='\033[0;33m'
GREEN='\033[0;32m'
NC='\033[0m'

# Overall result, also used as the exit code:
# 0 = all OK, 1 = WARN, 2 = FAIL
STATUS=0

# Directory holding the per-device error-log counters between runs.
STATE_DIR='/var/lib/nvme-health'
#######################################
# Print a message followed by a newline, interpreting backslash escapes
# (this is how the colour variables such as "\033[0;31m" become real escapes).
# Arguments: the message words; they are joined with single spaces.
# Outputs:   the message on stdout.
#######################################
log() {
  # printf instead of `echo -e`: echo would treat a message that is exactly
  # "-n", "-e" or "-E" as one of its own options and print nothing.
  printf '%b\n' "$*"
}
#######################################
# Print the report banner: an empty line, then the title framed by two
# rules of '=' characters.
# Outputs: four lines on stdout.
#######################################
header_block() {
  local rule="============================================================"
  printf '%s\n' "" "$rule" "NVMe HEALTH (via nvme-cli)" "$rule"
}
#######################################
# Guard: make sure the `nvme` command can be resolved.
# Outputs: a red [FAIL] line via log when it cannot.
# Returns: 0 when `nvme` is available; otherwise exits the script with 2.
#######################################
need_nvme() {
  # Happy path first: nothing to report when the command resolves.
  command -v nvme >/dev/null 2>&1 && return 0

  log "${RED}[FAIL] nvme-cli not installed (command 'nvme' not found)${NC}"
  exit 2
}
#######################################
# Pull a field from "key : value" text such as `nvme smart-log` output.
# Arguments: $1 - the full text to search
#            $2 - the field name, compared literally against the text
#                 before the first ':' of each line (surrounding
#                 whitespace ignored)
# Outputs:   the value of the first matching line with leading and
#            trailing whitespace removed; nothing when no line matches.
#######################################
parse_smart_field() {
  local smart="$1"
  local key="$2"
  # The key is compared as a plain string, not as a regex: a substring match
  # would let e.g. "temp" pick up "warning_temp_time".
  printf '%s\n' "$smart" | awk -v key="$key" '
    {
      sep = index($0, ":")
      if (sep == 0) next

      name = substr($0, 1, sep - 1)
      gsub(/^[ \t]+|[ \t]+$/, "", name)
      if (name != key) next

      # Everything after the first colon is the value, later colons included.
      value = substr($0, sep + 1)
      gsub(/^[ \t]+|[ \t]+$/, "", value)
      print value
      exit
    }
  '
}
#######################################
# Extract the temperature in Celsius from nvme smart-log text output.
#
# The composite "temperature" line is preferred; when it yields no reading
# the first "Temperature Sensor N" line that has one is used instead.
# A reading is a number directly followed by a Celsius unit, written either
# as "°C" (e.g. "40 °C (313 K, 104 °F)", "35°C (308 Kelvin)") or as a bare
# "C" (e.g. "38 C"). Kelvin and Fahrenheit figures are ignored.
#
# Arguments: $1 - the smart-log text
# Outputs:   the reading as bare digits, or an empty line when none is found.
#######################################
parse_temp_celsius() {
  local smart="$1"
  printf '%s\n' "$smart" | awk '
    # Digits of the first Celsius reading in text, or "" when there is none.
    # The degree sign is grouped so that it is optional as a whole even when
    # awk treats it as two separate bytes.
    function celsius(text,    reading) {
      if (!match(text, /[0-9]+[ \t]*(°)?C/)) return ""
      reading = substr(text, RSTART, RLENGTH)
      gsub(/[^0-9]/, "", reading)
      return reading
    }
    {
      sep = index($0, ":")
      if (sep == 0) next

      name = tolower(substr($0, 1, sep - 1))
      gsub(/^[ \t]+|[ \t]+$/, "", name)
      value = substr($0, sep + 1)

      if (name == "temperature") {
        if (composite == "") composite = celsius(value)
      } else if (index(name, "temperature sensor") > 0) {
        if (sensor == "") sensor = celsius(value)
      }
    }
    END { print (composite != "" ? composite : sensor) }
  '
}
#######################################
# Print the two heading lines of the device table: the column titles and a
# dashed underline, both laid out with the same column widths.
# Columns: STATUS DEV MODEL SERIAL FW CRIT MEDIA ERRLOG DELTA USED TEMP
# Outputs: two lines on stdout.
#######################################
print_table_header() {
  local row_fmt="%-6s %-10s %-28s %-20s %-10s %4s %7s %8s %8s %5s %5s\n"
  local -a titles=(
    "STATUS" "DEVICE" "MODEL" "SERIAL" "FW"
    "CWRN" "MEDIA" "ERRLOG" "DELTA" "USED" "TEMP"
  )
  local -a underline=(
    "------" "----------" "----------------------------" "--------------------" "--------"
    "----" "-------" "--------" "--------" "-----" "-----"
  )
  # printf re-applies the format until every argument is consumed, so a
  # single call emits both rows.
  printf "$row_fmt" "${titles[@]}" "${underline[@]}"
}
# Print the trimmed value of the first "key : value" line of $1 whose key is
# exactly $2; prints nothing when no such line exists.
_nvme_field() {
  printf '%s\n' "$1" | awk -v want="$2" '
    {
      sep = index($0, ":")
      if (sep == 0) next
      name = substr($0, 1, sep - 1)
      gsub(/^[ \t]+|[ \t]+$/, "", name)
      if (name != want) next
      value = substr($0, sep + 1)
      gsub(/^[ \t]+|[ \t]+$/, "", value)
      print value
      exit
    }
  '
}

# Reduce $1 to a non-negative base-10 integer: every non-digit (thousands
# separators, "%", whitespace) is dropped, empty input becomes 0, and leading
# zeros are not read as octal.
_nvme_uint() {
  local digits="${1//[^0-9]/}"
  printf '%s\n' "$(( 10#${digits:-0} ))"
}

# Print one table row. $1 = colour escape, $2 = status label, remaining
# arguments = the ten data columns (device ... temperature).
_nvme_row() {
  local color="$1" label="$2"
  shift 2
  # %b expands the colour escapes; the padded label sits between them so the
  # escape bytes do not disturb the column alignment.
  printf '%b%-6s%b %-10s %-28s %-20s %-10s %4s %7s %8s %8s %5s %5s\n' \
    "$color" "$label" "$NC" "$@"
}

#######################################
# Check every NVMe controller reported by `nvme list` and print one table
# row per controller.
#
# Row status:
#   FAIL  critical_warning > 0, media_errors > 0, percentage_used >= 100,
#         or temperature >= 80
#   WARN  percentage_used >= 90, temperature >= 70, more than 100 new
#         error-log entries since the previous run, or id-ctrl / smart-log
#         returned no output
#   OK    otherwise
#
# The error-log counter of each device is stored in
# "$STATE_DIR/<device>.errlog"; the DELTA column is the growth since the
# stored value. A device without a stored value is treated as a fresh
# baseline (delta 0) so that its lifetime total is not reported as new.
#
# Globals:   STATE_DIR (read), STATUS (raised to 1 or 2, never lowered),
#            RED / YELLOW / GREEN / NC (read)
# Outputs:   banner, table header and rows on stdout
# Returns:   0; when `nvme list` shows no device a [WARN] line is printed
#            and STATUS is left unchanged.
#######################################
check_nvme_devices() {
  header_block

  local persist=1
  if ! mkdir -p -- "$STATE_DIR"; then
    persist=0
    log "${YELLOW}[WARN] Cannot create ${STATE_DIR}; error-log deltas will not be tracked${NC}"
  fi

  local -a nvme_nodes=()
  mapfile -t nvme_nodes < <(nvme list 2>/dev/null | awk '/^\/dev\/nvme/ {print $1}')
  if (( ${#nvme_nodes[@]} == 0 )); then
    log "${YELLOW}[WARN] No NVMe devices found by 'nvme list'${NC}"
    return
  fi

  print_table_header

  local -A seen=()
  local node ctrl dev id_out smart_out model serial fwrev
  local critical_warning cw_dec media_errors num_err_log_entries
  local percentage_used temp_c state_file prev_errlog delta_errlog
  local bad warn label color

  for node in "${nvme_nodes[@]}"; do
    # Map the namespace node (/dev/nvme0n1) to its controller (/dev/nvme0)
    # when that character device exists; otherwise query the node itself.
    ctrl="$node"
    if [[ "$node" =~ ^(/dev/nvme[0-9]+)n[0-9]+ ]] && [[ -c "${BASH_REMATCH[1]}" ]]; then
      ctrl="${BASH_REMATCH[1]}"
    fi

    # Several namespaces can share one controller; report it once.
    if [[ -n "${seen[$ctrl]:-}" ]]; then
      continue
    fi
    seen["$ctrl"]=1
    dev="${ctrl##*/}"

    id_out=$(nvme id-ctrl "$ctrl" 2>/dev/null)
    if [[ -z "$id_out" ]]; then
      _nvme_row "$YELLOW" "WARN" "$dev" "N/A" "N/A" "N/A" "-" "-" "-" "-" "-" "-"
      (( STATUS < 1 )) && STATUS=1
      continue
    fi

    model=$(_nvme_field "$id_out" "mn")
    serial=$(_nvme_field "$id_out" "sn")
    fwrev=$(_nvme_field "$id_out" "fr")

    smart_out=$(nvme smart-log "$ctrl" 2>/dev/null)
    if [[ -z "$smart_out" ]]; then
      _nvme_row "$YELLOW" "WARN" "$dev" "${model:0:28}" "${serial:0:20}" "${fwrev:0:10}" \
        "-" "-" "-" "-" "-" "-"
      (( STATUS < 1 )) && STATUS=1
      continue
    fi

    # critical_warning may be printed as hex ("0x4") or as a plain number.
    critical_warning=$(parse_smart_field "$smart_out" "critical_warning")
    critical_warning="${critical_warning//[[:space:]]/}"
    if [[ "$critical_warning" =~ ^0[xX]([0-9A-Fa-f]+)$ ]]; then
      cw_dec=$(( 16#${BASH_REMATCH[1]} ))
    else
      cw_dec=$(_nvme_uint "$critical_warning")
    fi

    # Counters can carry thousands separators ("1,234"); fed to arithmetic
    # unchanged, the comma operator would silently yield 234.
    media_errors=$(_nvme_uint "$(parse_smart_field "$smart_out" "media_errors")")
    num_err_log_entries=$(_nvme_uint "$(parse_smart_field "$smart_out" "num_err_log_entries")")
    percentage_used=$(_nvme_uint "$(parse_smart_field "$smart_out" "percentage_used")")

    temp_c=$(parse_temp_celsius "$smart_out")
    temp_c="${temp_c//[^0-9]/}"
    if [[ -n "$temp_c" ]]; then
      temp_c=$(( 10#$temp_c ))
    fi

    # Delta tracking for the error log.
    state_file="${STATE_DIR}/${dev}.errlog"
    delta_errlog=0
    if [[ -f "$state_file" && -r "$state_file" ]]; then
      prev_errlog=$(_nvme_uint "$(<"$state_file")")
      delta_errlog=$(( num_err_log_entries - prev_errlog ))
      # A counter lower than the stored one is not counted as new errors.
      if (( delta_errlog < 0 )); then
        delta_errlog=0
      fi
    fi
    if (( persist )); then
      printf '%s\n' "$num_err_log_entries" >"$state_file"
    fi

    bad=0
    warn=0
    # Critical warnings and media errors
    if (( cw_dec > 0 || media_errors > 0 )); then
      bad=1
    fi
    # Wear-out
    if (( percentage_used >= 100 )); then
      bad=1
    elif (( percentage_used >= 90 )); then
      warn=1
    fi
    # Temperature thresholds (skipped when no reading is available)
    if [[ -n "$temp_c" ]]; then
      if (( temp_c >= 80 )); then
        bad=1
      elif (( temp_c >= 70 )); then
        warn=1
      fi
    fi
    # Error log delta threshold
    if (( delta_errlog > 100 )); then
      warn=1
    fi

    if (( bad )); then
      label="FAIL"
      color="$RED"
      STATUS=2
    elif (( warn )); then
      label="WARN"
      color="$YELLOW"
      (( STATUS < 1 )) && STATUS=1
    else
      label="OK"
      color="$GREEN"
    fi

    # Text fields are truncated to their column width to keep the table sane.
    _nvme_row "$color" "$label" "$dev" "${model:0:28}" "${serial:0:20}" "${fwrev:0:10}" \
      "$cw_dec" "$media_errors" "$num_err_log_entries" "$delta_errlog" \
      "${percentage_used}%" "${temp_c:-N/A}"
  done
}
#######################################
# Run the whole check: verify nvme-cli is present, print the device table,
# then print a one-line summary that matches STATUS.
# Globals: STATUS (read), GREEN / YELLOW / RED / NC (read)
# Outputs: an empty line and the summary on stdout (after the table)
# Returns: never returns; exits with STATUS (0 = OK, 1 = WARN, 2 = FAIL).
#######################################
main() {
  need_nvme
  check_nvme_devices
  echo

  # Pick the summary text for the known states; any other value prints none.
  local summary=""
  if [[ "$STATUS" == "0" ]]; then
    summary="${GREEN}[SUMMARY] All NVMe devices OK${NC}"
  elif [[ "$STATUS" == "1" ]]; then
    summary="${YELLOW}[SUMMARY] One or more NVMe devices reported warnings (exit 1)${NC}"
  elif [[ "$STATUS" == "2" ]]; then
    summary="${RED}[SUMMARY] One or more NVMe devices reported failures (exit 2)${NC}"
  fi

  if [[ -n "$summary" ]]; then
    log "$summary"
  fi

  exit "$STATUS"
}
# Script entry point: run main, forwarding the command-line arguments
# unchanged (main itself does not read them).
main "$@"
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment