Skip to content

Instantly share code, notes, and snippets.

@ergo70
Last active March 13, 2026 19:09
Show Gist options
  • Select an option

  • Save ergo70/e611b225ed081b6d2bbda3487dd3423c to your computer and use it in GitHub Desktop.

Select an option

Save ergo70/e611b225ed081b6d2bbda3487dd3423c to your computer and use it in GitHub Desktop.
Report NVIDIA GPU statistics for Icinga/Nagios monitoring using bash+nvidia-smi
#!/usr/bin/bash
###########################################################################################################################
#
# Report NVIDIA GPU statistics for Icinga/Nagios monitoring with bash + nvidia-smi.
#
# Fan info is only available on some models, power info on newer models. If not available, -1 is reported for those values.
# Fan info is known to be unreliable. Positive values do not mean that the fans are running, only that they should be.
# ECC might not be available or disabled. If not available, -1 is reported for those values.
# Possible sensors: https://gist.github.com/sansmoraxz/8a98d987f12d7edc983d611b8326fc67
#
# Author: Ernst-Georg Schmid
# License: MIT License (https://opensource.org/licenses/MIT)
#
# Changelog (sorted newest -> oldest):
# - 13-03-2026: RC3
# - 11-03-2026: RC2
# - 03-03-2026: RC1
# - 28-02-2026: First draft
#
###########################################################################################################################
# Constants and defaults shared by the functions below.
readonly VERSION=1.0.0RC3
# Exit codes for Nagios/Icinga
readonly EXIT_OK=0
# EXIT_WARNING=1
# EXIT_CRITICAL=2
readonly EXIT_UNKNOWN=3
# Determine some basic information, used later in the script
this_path=$(readlink -f "$0") ## Path of this file including filename
# dir_name=$(dirname "${this_path}") ## Dir where this file is
# Strip everything up to the last slash instead of calling basename with an
# unquoted argument, so script paths containing whitespace are handled.
myname=${this_path##*/} ## file name of this script
gpu_index=-1 ## Default GPU index to monitor. If -1, all GPUs are monitored.
#######################################
# Print the usage text and terminate the plugin.
# Globals:
#   VERSION      (read) plugin version shown in the banner
#   myname       (read) script name shown in the usage line
#   EXIT_UNKNOWN (read) exit status used to terminate
# Outputs:
#   A blank line followed by three usage lines on stdout
# Returns:
#   Never returns; exits with EXIT_UNKNOWN
#######################################
help() {
  local banner="NVIDIA GPU monitoring plugin for Nagios/Icinga - ${VERSION}"
  local synopsis="usage: ${myname} [existing GPU Index]"
  local details="GPU to monitor by index (Default: Enumerate all GPUs)"

  printf '\n%s\n%s\n%s\n' "$banner" "$synopsis" "$details"
  exit ${EXIT_UNKNOWN}
}
#######################################
# Validate the command line and find out how many GPUs nvidia-smi lists.
# Globals:
#   gpu_count    (written) number of non-empty lines printed by `nvidia-smi -L`
#   gpu_index    (read/written) replaced by $1 when an argument is given
#   EXIT_UNKNOWN (read) exit status used when the GPU listing fails
# Arguments:
#   $1 - optional GPU index; -1 (the default) means "all GPUs"
# Outputs:
#   Error text on stdout when validation fails
# Returns:
#   0 when everything is valid; otherwise exits through help() or with
#   EXIT_UNKNOWN
#######################################
check_args() {
  local gpu_list line
  local -r integer_re='^-?[0-9]+$'

  if [ "$#" -gt 1 ]; then # Too many arguments: show the usage info.
    echo "ERROR: Too many arguments provided."
    help
  fi

  # Capture the listing first and test the command itself. Checking $? after
  # `nvidia-smi -L | wc -l` only ever sees the status of wc, so a missing or
  # failing nvidia-smi went unnoticed.
  if ! gpu_list=$(nvidia-smi -L) || [[ -z "$gpu_list" ]]; then
    echo "ERROR: Failed to query GPU count. Is an NVIDIA GPU present and nvidia-smi installed?"
    exit "$EXIT_UNKNOWN"
  fi

  # Count the non-empty lines of the listing.
  gpu_count=0
  while IFS= read -r line; do
    if [[ -n "$line" ]]; then
      gpu_count=$((gpu_count + 1))
    fi
  done <<<"$gpu_list"

  if [ "$#" -eq 1 ]; then gpu_index=$1; fi # Use the index given on the command line.

  if ! [[ "$gpu_index" =~ $integer_re ]]; then
    echo "ERROR: GPU index argument must be an integer."
    help
  fi

  # -1 is the "all GPUs" sentinel; anything below it, or beyond the last GPU,
  # is rejected here instead of failing later inside nvidia-smi.
  if [ "$gpu_index" -lt -1 ] || [ "$gpu_index" -gt "$((gpu_count - 1))" ]; then
    echo "ERROR: Invalid GPU index: $gpu_index [0, $((gpu_count - 1))]"
    help
  fi
}
#######################################
# Express one number as a percentage of another using bc (scale=1).
# Arguments:
#   $1 - numerator
#   $2 - denominator
# Outputs:
#   Whatever bc prints for "$1*100.0 / $2" on stdout
# Returns:
#   Exit status of bc
#######################################
to_percent() {
  local numerator=$1
  local denominator=$2
  bc -l <<<"scale=1; ${numerator}*100.0 / ${denominator}"
}
#######################################
# Normalise one CSV field taken from nvidia-smi output.
# Surrounding whitespace is removed and the "not available" placeholders
# [N/A], N/A and [Not Supported] are replaced by -1, so the value can be used
# in the status text and in the perfdata.
# Arguments:
#   $1 - raw field text
# Outputs:
#   The cleaned value on stdout
#######################################
prettify() {
  local value=$1

  # Trim all leading and trailing whitespace, not just one leading character.
  value="${value#"${value%%[![:space:]]*}"}"
  value="${value%"${value##*[![:space:]]}"}"

  # Replace the first occurrence of each placeholder; the bracketed forms go
  # first so no stray brackets are left behind.
  value="${value/\[N\/A\]/-1}"
  value="${value/\[Not Supported\]/-1}"
  value="${value/N\/A/-1}"

  printf '%s\n' "$value"
}
#######################################
# Query one GPU through nvidia-smi and append its readings to the plugin
# output being assembled.
# Globals:
#   gpu_index    (read)     index passed to `nvidia-smi -i`
#   status_part  (appended) human readable status text
#   perf_part    (appended) perfdata text
#   EXIT_UNKNOWN (read)     exit status used when the query fails
# Outputs:
#   An UNKNOWN message on stdout when the query fails or returns nothing
# Returns:
#   0 on success; exits with EXIT_UNKNOWN on failure
#######################################
monitor() {
  local smi_out idx gpu_usage mem_usage power_draw power_limit
  local ecc_err_total gpu_temperature mem_temperature
  local -r query_fields=index,utilization.gpu,utilization.memory,power.draw,enforced.power.limit,ecc.errors.uncorrected.aggregate.total,temperature.gpu,temperature.memory

  # stderr is discarded on purpose; a failure is reported through the plugin
  # output below instead of exiting without any text.
  if ! smi_out=$(nvidia-smi -i "$gpu_index" --query-gpu="$query_fields" --format=csv,noheader,nounits 2>/dev/null) ||
    [[ -z "$smi_out" ]]; then
    echo "UNKNOWN - nvidia-smi query failed for GPU ${gpu_index}"
    exit "$EXIT_UNKNOWN"
  fi

  # Split the CSV line into its fields, in the same order as query_fields.
  IFS=',' read -r idx gpu_usage mem_usage power_draw power_limit \
    ecc_err_total gpu_temperature mem_temperature <<<"$smi_out"

  idx=$(prettify "$idx")
  gpu_usage=$(prettify "$gpu_usage")
  mem_usage=$(prettify "$mem_usage")
  #fan=$(prettify "$fan")
  power_draw=$(prettify "$power_draw")
  power_limit=$(prettify "$power_limit")
  ecc_err_total=$(prettify "$ecc_err_total")
  gpu_temperature=$(prettify "$gpu_temperature")
  mem_temperature=$(prettify "$mem_temperature")
  # power_limit=$(printf "%.0f" "$power_limit") # Round power limit to integer, as it is expected to be an integer value.

  status_part+="gpu${idx} gpu_load ${gpu_usage}%, vram_load ${mem_usage}%, power_draw ${power_draw}W, power_limit ${power_limit}W, ecc_errors_uncorrected ${ecc_err_total}, gpu_temp ${gpu_temperature}°C, vram_temp ${mem_temperature}°C "
  perf_part+=" gpu${idx}.gpu_load=${gpu_usage}%;;;0;100 gpu${idx}.vram_load=${mem_usage}%;;;0;100 gpu${idx}.power_draw=${power_draw}W;;;0;${power_limit} gpu${idx}.ecc_errors_uncorrected=${ecc_err_total};;1;; gpu${idx}.gpu_temp=${gpu_temperature}C;;;; gpu${idx}.vram_temp=${mem_temperature}C;;;;"
}
#######################################
# Entry point: validate the arguments, collect readings for one GPU or for
# all of them, print the plugin output line and exit.
# Globals:
#   gpu_index   (read/written) -1 means all GPUs; otherwise the GPU to report
#   gpu_count   (read)         set by check_args
#   status_part (written)      status text, extended by monitor
#   perf_part   (written)      perfdata text, extended by monitor
#   EXIT_OK     (read)
# Arguments:
#   "$@" - the script's command-line arguments
# Outputs:
#   "<status>|<perfdata>" on stdout
# Returns:
#   Never returns; exits with EXIT_OK
#######################################
main() {
  check_args "$@"

  # Start from an empty perfdata string so a perf_part value inherited from
  # the environment cannot end up in the output.
  perf_part=""

  if [ "$gpu_index" -ne -1 ]; then
    # A specific GPU index was requested: report only that GPU.
    status_part="OK - 1 GPU: "
    monitor
  else
    # No index given: walk over every GPU from 0 to gpu_count - 1.
    status_part="OK - ${gpu_count} GPUs: "
    for ((gpu_index = 0; gpu_index < gpu_count; gpu_index++)); do
      monitor
    done
  fi

  echo "${status_part}|${perf_part}"
  exit "$EXIT_OK"
}
# Bootstrap the script: pass every command-line argument on to main(), which
# prints the plugin output line and exits.
main "$@"
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment