Last active
March 13, 2026 19:09
-
-
Save ergo70/e611b225ed081b6d2bbda3487dd3423c to your computer and use it in GitHub Desktop.
Report NVIDIA GPU statistics for Icinga/Nagios monitoring using bash+nvidia-smi
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/bash | |
| ########################################################################################################################### | |
| # | |
| # Report NVIDIA GPU statistics for Icinga/Nagios monitoring with bash + nvidia-smi. | |
| # | |
| # Fan info is only available on some models, power info on newer models. If not available, -1 is reported for those values. | |
| # Fan info is known to be unreliable. Positive values do not mean that the fans are running, only that they should be. | |
| # ECC might be unavailable or disabled. If not available, -1 is reported for those values. | |
| # Possible sensors: https://gist.github.com/sansmoraxz/8a98d987f12d7edc983d611b8326fc67 | |
| # | |
| # Author: Ernst-Georg Schmid | |
| # License: MIT License (https://opensource.org/licenses/MIT) | |
| # | |
| # Changelog (sorted newest -> oldest): | |
| # - 13-03-2026: RC3 | |
| # - 11-03-2026: RC2 | |
| # - 03-03-2026: RC1 | |
| # - 28-02-2026: First draft | |
| # | |
| ########################################################################################################################### | |
# Script-wide constants and defaults used by the functions further down.
VERSION=1.0.0RC3

# Exit codes for Nagios/Icinga
EXIT_OK=0
# EXIT_WARNING=1
# EXIT_CRITICAL=2
EXIT_UNKNOWN=3

# Basic information about this script, used later in the usage text.
# All expansions are quoted so a script path containing spaces or glob
# characters is not word-split.
this_path=$(readlink -f -- "$0")        ## Path of this file including filename
# dir_name=$(dirname -- "${this_path}") ## Dir where this file is
myname=$(basename -- "${this_path}")    ## File name of this script
gpu_index=-1                            ## Default GPU index to monitor. If -1, all GPUs are monitored.
#######################################
# Display usage to the user and terminate the script.
# Globals:   VERSION, myname, EXIT_UNKNOWN (all read)
# Arguments: none
# Outputs:   the usage text on stdout
# Returns:   never returns; exits with EXIT_UNKNOWN, or 3 when that
#            variable is unset or empty
#######################################
function help() {
  echo "
NVIDIA GPU monitoring plugin for Nagios/Icinga - ${VERSION}
usage: ${myname} [existing GPU Index]
GPU to monitor by index (Default: Enumerate all GPUs)"
  # A bare `exit` would reuse echo's status (0 = OK); fall back to 3 so a
  # usage error can never be reported as OK.
  exit "${EXIT_UNKNOWN:-3}"
}
#######################################
# Validate the command line and determine how many GPUs are present.
# Globals:   gpu_count (written), gpu_index (read; overwritten when an
#            argument is given), EXIT_UNKNOWN (read)
# Arguments: $1 - optional GPU index; -1 keeps the "all GPUs" default
# Outputs:   an "ERROR: ..." line on stdout for every rejected input
# Returns:   0 when the arguments are valid. Otherwise the script is
#            terminated, either directly with EXIT_UNKNOWN or through help().
#######################################
function check_args() {
  local smi_list
  local smi_rc=0

  if [ $# -gt 1 ]; then # If too many arguments are passed, show the usage info.
    echo "ERROR: Too many arguments provided."
    help
  fi

  # Capture the listing before counting it: in `nvidia-smi -L | wc -l` the
  # exit status belongs to wc, so a failing nvidia-smi would go unnoticed.
  # stderr is dropped on purpose; the plugin prints its own error line.
  smi_list=$(nvidia-smi -L 2>/dev/null) || smi_rc=$?
  # One non-empty line per listed device. grep -c prints 0 when nothing matches.
  gpu_count=$(grep -c '[^[:space:]]' <<<"${smi_list}")
  if [ "${smi_rc}" -ne 0 ] || [ "${gpu_count}" -lt 1 ]; then
    echo "ERROR: Failed to query GPU count. Is an NVIDIA GPU present and nvidia-smi installed?"
    exit "${EXIT_UNKNOWN:-3}"
  fi

  if [ $# -eq 1 ]; then gpu_index=$1; fi # If a GPU index is provided as an argument, use it.

  # Optional minus sign followed by digits. The length cap keeps the value
  # inside the range the integer comparisons below can handle.
  if ! [[ "${gpu_index}" =~ ^-?[0-9]{1,18}$ ]]; then
    echo "ERROR: GPU index argument must be an integer."
    help
  fi

  # Valid values are -1 (all GPUs) and 0 .. gpu_count-1.
  if [ "${gpu_index}" -lt -1 ] || [ "${gpu_index}" -gt $((gpu_count - 1)) ]; then
    echo "ERROR: Invalid GPU index: $gpu_index [0, $((gpu_count - 1))]"
    help
  fi
}
#######################################
# Express $1 as a percentage of $2, computed by bc with scale=1.
# Arguments: $1 - numerator (plain decimal number)
#            $2 - denominator (plain decimal number, not zero)
# Outputs:   the percentage on stdout; diagnostics on stderr
# Returns:   1 when the arguments are missing, not plain decimal numbers or
#            the denominator is zero; otherwise the exit status of bc
#######################################
function to_percent() {
  local -r number_re='^-?([0-9]+\.?[0-9]*|\.[0-9]+)$'

  # The arguments are spliced into a bc program, so accept nothing but
  # plain decimal numbers.
  if [ $# -ne 2 ] || ! [[ "$1" =~ $number_re && "$2" =~ $number_re ]]; then
    printf 'to_percent: expected two decimal numbers, got: %s\n' "$*" >&2
    return 1
  fi

  # A denominator made only of zeros and a dot (0, 0.0, .0, ...) is zero.
  if [[ "$2" =~ ^-?[0.]+$ ]]; then
    printf 'to_percent: division by zero\n' >&2
    return 1
  fi

  bc -l <<<"scale=1; $1*100.0 / $2"
}
#######################################
# Normalise one field of the nvidia-smi CSV output.
# Surrounding whitespace is removed and the first "[N/A]" (or bare "N/A")
# is replaced by -1.
# Arguments: $1 - raw field value
# Outputs:   the cleaned value on stdout
#######################################
function prettify() {
  local value="$1"

  # Strip ALL leading and trailing whitespace, not just one leading character.
  value="${value#"${value%%[![:space:]]*}"}"
  value="${value%"${value##*[![:space:]]}"}"

  # Bracketed form first, so the brackets do not survive the bare replacement.
  value="${value/\[N\/A\]/-1}"
  value="${value/N\/A/-1}"

  # printf instead of echo: a value such as "-n" must be printed literally.
  printf '%s\n' "${value}"
}
#######################################
# Monitor a single GPU and append its readings to the plugin output.
# Globals:   gpu_index (read when no argument is given), EXIT_UNKNOWN (read),
#            status_part and perf_part (appended to)
# Arguments: $1 - optional GPU index; defaults to the global gpu_index
# Outputs:   nothing on success; an "UNKNOWN - ..." line on stdout on failure
# Returns:   0 on success; exits the script with EXIT_UNKNOWN when the
#            nvidia-smi query fails or returns no data
#######################################
function monitor() {
  local target="${1:-${gpu_index}}"
  local -r query="index,utilization.gpu,utilization.memory,power.draw,enforced.power.limit,ecc.errors.uncorrected.aggregate.total,temperature.gpu,temperature.memory"
  local smi_out
  local idx gpu_usage mem_usage power_draw power_limit
  local ecc_errors_uncorrected_total gpu_temp mem_temp
  local ecc_err_total gpu_temperature mem_temperature

  # stderr is dropped on purpose; a failure is reported through the plugin's
  # own status line so the monitoring system has something to display.
  if ! smi_out=$(nvidia-smi -i "${target}" --query-gpu="${query}" --format=csv,noheader,nounits 2>/dev/null) ||
    [[ -z "${smi_out}" ]]; then
    echo "UNKNOWN - nvidia-smi query failed for GPU ${target}"
    exit "${EXIT_UNKNOWN:-3}"
  fi

  # One CSV record, fields in the order of the query above.
  IFS=',' read -r idx gpu_usage mem_usage power_draw power_limit \
    ecc_errors_uncorrected_total gpu_temp mem_temp <<<"${smi_out}"

  idx=$(prettify "${idx}")
  gpu_usage=$(prettify "${gpu_usage}")
  mem_usage=$(prettify "${mem_usage}")
  #fan=$(prettify "$fan")
  power_draw=$(prettify "${power_draw}")
  power_limit=$(prettify "${power_limit}")
  ecc_err_total=$(prettify "${ecc_errors_uncorrected_total}")
  gpu_temperature=$(prettify "${gpu_temp}")
  mem_temperature=$(prettify "${mem_temp}")
  # power_limit=$(printf "%.0f" "$power_limit") # Round power limit to integer, as it is expected to be an integer value.

  status_part+="gpu${idx} gpu_load ${gpu_usage}%, vram_load ${mem_usage}%, power_draw ${power_draw}W, power_limit ${power_limit}W, ecc_errors_uncorrected ${ecc_err_total}, gpu_temp ${gpu_temperature}°C, vram_temp ${mem_temperature}°C "
  perf_part+=" gpu${idx}.gpu_load=${gpu_usage}%;;;0;100 gpu${idx}.vram_load=${mem_usage}%;;;0;100 gpu${idx}.power_draw=${power_draw}W;;;0;${power_limit} gpu${idx}.ecc_errors_uncorrected=${ecc_err_total};;1;; gpu${idx}.gpu_temp=${gpu_temperature}C;;;; gpu${idx}.vram_temp=${mem_temperature}C;;;;"
}
#######################################
# Main code: validate the arguments, query the selected GPU(s) and print
# the plugin result line.
# Globals:   gpu_index (read; reused as loop variable because monitor reads
#            it), gpu_count (read), status_part and perf_part (written),
#            EXIT_OK (read)
# Arguments: "$@" - the script's command-line arguments
# Outputs:   one "<status>|<perfdata>" line on stdout
# Returns:   never returns; exits with EXIT_OK
#######################################
function main() {
  check_args "$@"

  # Start from an empty string so a perf_part inherited from the environment
  # cannot end up in the perfdata.
  perf_part=""

  if [ "${gpu_index}" -ne -1 ]; then
    # A specific GPU index was provided: only monitor that GPU.
    status_part="OK - 1 GPU: "
    monitor
  else
    # No index given: iterate over all GPU indexes 0 .. gpu_count-1.
    status_part="OK - ${gpu_count} GPUs: "
    for ((gpu_index = 0; gpu_index < gpu_count; gpu_index++)); do
      monitor
    done
  fi

  echo "${status_part}|${perf_part}"
  exit "${EXIT_OK:-0}"
}
| # Bootstrap the script: pass all command-line arguments on to main(), which ends the script via exit. | |
| main "$@" |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment