Skip to content

Instantly share code, notes, and snippets.

@michabbb
Created January 22, 2026 01:53
Show Gist options
  • Select an option

  • Save michabbb/a7d01b29feef01f7d1bc4d9b789dc5ce to your computer and use it in GitHub Desktop.

Select an option

Save michabbb/a7d01b29feef01f7d1bc4d9b789dc5ce to your computer and use it in GitHub Desktop.
Nagios/Icinga plugin to monitor Proxmox VE (vzdump) backup status via the Proxmox API. Supports checking specific VMs, all running VMs, or any recent backup. Works with both single-VM and batch backup jobs.
#!/bin/bash
#
# check_pve_backup - Nagios/Icinga plugin for Proxmox VE backup monitoring
#
# Monitors vzdump backup status via the Proxmox API (pvesh).
# Supports checking specific VMs, all running VMs, or any recent backup.
#
# Requirements:
# - Must run on a Proxmox VE host (needs pvesh command)
# - jq for JSON parsing
# - For NRPE: sudo access to run this script as root
#
# Modes:
# Default: Check if ANY backup ran successfully
# -v VMID: Check backup for specific VM (considers both VM-specific AND batch backups)
# -a: Check that ALL running VMs have a recent backup
#
# License: MIT
#
# Nagios exit codes.
# These values are fixed by the plugin convention, so they are marked
# read-only to guard against accidental reassignment later in the script.
readonly STATE_OK=0
readonly STATE_WARNING=1
readonly STATE_CRITICAL=2
readonly STATE_UNKNOWN=3

# Default values (each one can be overridden by a command-line option)
MAX_AGE_HOURS=26 # -h: maximum backup age in hours (daily backup + buffer)
VMID=""          # -v: optional specific VM ID to check (empty = not requested)
CHECK_ALL=0      # -a: set to 1 to check all running VMs
# Print the help text on stdout and terminate the script with the
# UNKNOWN state. This function never returns to its caller.
usage() {
  # Script name as invoked; shown in the synopsis and the examples.
  local prog="$0"

  cat <<HELP
Usage: ${prog} [-h MAX_HOURS] [-v VMID] [-a]
Nagios/Icinga plugin to monitor Proxmox VE (vzdump) backup status.
Options:
-h MAX_HOURS Maximum age of last backup in hours (default: 26)
-v VMID Check backup for specific VM ID (uses newest backup)
-a Check ALL running VMs have a backup
-? Show this help message
Examples:
${prog} # Check if any backup ran within last 26 hours
${prog} -v 100 # Check if VM 100 was backed up within 26h
${prog} -h 30 -v 100 # Check VM 100 with 30 hour threshold
${prog} -a # Check ALL running VMs have recent backups
${prog} -a -h 48 # Check all VMs with 48 hour threshold
Exit codes:
0 = OK - Backup found within threshold, status OK
1 = WARNING - (not currently used)
2 = CRITICAL - No backup found, backup too old, or backup failed
3 = UNKNOWN - Configuration error or API unavailable
NRPE configuration example:
command[check_vzdump]=sudo /usr/lib/nagios/plugins/check_pve_backup -v 100 -h 26
command[check_vzdump_all]=sudo /usr/lib/nagios/plugins/check_pve_backup -a
Note: When using -v VMID, the script checks both VM-specific backups AND
batch backups (where all VMs are backed up together). It uses whichever
is more recent, so it works correctly with "all VMs" backup jobs.
HELP

  exit $STATE_UNKNOWN
}
# Parse and validate the command-line options.
#
# Globals written: MAX_AGE_HOURS (-h), VMID (-v), CHECK_ALL (-a)
# Arguments:       the script's positional parameters ("$@")
# Exits with STATE_UNKNOWN when an option value is invalid; calls usage
# (which also exits) for -? and for any unrecognised option.
parse_args() {
  local opt OPTARG
  local OPTIND=1

  # "?" is not a legal getopts option character, so it is not listed in the
  # option string: getopts already reports "-?" (and every other unknown
  # option) as "?", which is matched literally below.
  while getopts "h:v:a" opt; do
    case "$opt" in
      h)
        if ! [[ "$OPTARG" =~ ^[0-9]+$ ]]; then
          echo "UNKNOWN - Invalid value for -h: $OPTARG (must be a number)"
          exit $STATE_UNKNOWN
        fi
        # Force base 10: values such as "08" or "09" would otherwise be
        # rejected as invalid octal in later arithmetic expansions.
        MAX_AGE_HOURS=$((10#$OPTARG))
        ;;
      v)
        # The VM ID is interpolated into API paths, so only digits are allowed.
        if ! [[ "$OPTARG" =~ ^[0-9]+$ ]]; then
          echo "UNKNOWN - Invalid value for -v: $OPTARG (must be a numeric VM ID)"
          exit $STATE_UNKNOWN
        fi
        VMID="$OPTARG"
        ;;
      a) CHECK_ALL=1 ;;
      \?) usage ;;
      *) usage ;;
    esac
  done
}
parse_args "$@"
# Get the node name for the pvesh API paths.
# NOTE(review): assumes the Proxmox node name is the first label of the
# system hostname, so a hostname configured as an FQDN is shortened here
# (e.g. "pve1.example.com" -> "pve1") - confirm against your cluster.
NODE=$(hostname)
NODE=${NODE%%.*}
if [ -z "$NODE" ]; then
  echo "UNKNOWN - Could not determine hostname"
  exit $STATE_UNKNOWN
fi

# Both external tools are mandatory; report UNKNOWN if either is missing.
if ! command -v pvesh > /dev/null 2>&1; then
  echo "UNKNOWN - pvesh command not found (is this a Proxmox VE host?)"
  exit $STATE_UNKNOWN
fi
if ! command -v jq > /dev/null 2>&1; then
  echo "UNKNOWN - jq command not found (apt install jq)"
  exit $STATE_UNKNOWN
fi

# Calculate threshold timestamp: backups that ended before it are too old.
# The "10#" prefix forces decimal so a value with leading zeros (e.g. "08")
# is not parsed as an invalid octal number.
NOW=$(date +%s)
THRESHOLD=$((NOW - 10#${MAX_AGE_HOURS} * 3600))
#########################################################################
# Helper: Get guest name from VMID
#
# Looks the ID up as a QEMU VM first and, if that yields nothing, as an
# LXC container (vzdump backs up both guest types).
#
# Globals:   NODE (read)
# Arguments: $1 - numeric guest ID
# Outputs:   the guest's name (QEMU "name" / LXC "hostname") on stdout;
#            nothing if the guest is not found or has no name configured
# Returns:   always 0; callers test the output for emptiness
#########################################################################
get_vm_name() {
  local vmid="$1"
  local guest_type config name

  for guest_type in qemu lxc; do
    # A failing API call means the ID does not exist for this guest type.
    config=$(pvesh get "/nodes/${NODE}/${guest_type}/${vmid}/config" --output-format json 2>/dev/null) || continue
    name=$(jq -r '.name // .hostname // empty' <<< "$config" 2>/dev/null)
    if [ -n "$name" ]; then
      printf '%s\n' "$name"
      return 0
    fi
  done
  return 0
}
#########################################################################
# Mode: Check specific VM (considers both VM-specific AND batch backups)
#
# Picks the most recently finished vzdump task that is either specific to
# $VMID or a batch task (empty task id), then reports on its status/age.
# On an endtime tie the VM-specific task wins.
#
# Globals read: VMID, NODE, NOW, THRESHOLD, MAX_AGE_HOURS, STATE_*
# Exits: OK       - newest backup has status "OK" and is within threshold
#        CRITICAL - no backup found, backup failed, or backup too old
#        UNKNOWN  - VM not found, or tasks API failed / returned non-JSON
#########################################################################
check_single_vm() {
  local vm_name tasks vm_task batch_task latest backup_type
  local vm_end=0 batch_end=0
  local status endtime starttime backup_date
  local age_seconds age_hours age_minutes duration_min
  # Newest finished task whose id equals $id (jq prints nothing if none).
  local newest_filter='[.[] | select(.id == $id and .endtime != null)] | sort_by(.endtime) | last // empty'

  # Verify VM exists
  vm_name=$(get_vm_name "$VMID")
  if [ -z "$vm_name" ]; then
    echo "UNKNOWN - VM $VMID not found on this node"
    exit $STATE_UNKNOWN
  fi

  # Get backup tasks
  if ! tasks=$(pvesh get "/nodes/${NODE}/tasks" --typefilter vzdump --limit 50 --output-format json 2>/dev/null) \
    || [ -z "$tasks" ]; then
    echo "UNKNOWN - Failed to query Proxmox tasks API"
    exit $STATE_UNKNOWN
  fi
  # Anything other than a JSON array would make every lookup below come back
  # empty and be misreported as "no backup found"; treat it as an API error.
  if ! jq -e 'type == "array"' <<< "$tasks" > /dev/null 2>&1; then
    echo "UNKNOWN - Unexpected response from Proxmox tasks API"
    exit $STATE_UNKNOWN
  fi

  # Latest VM-specific backup and latest batch backup (id="")
  vm_task=$(jq -c --arg id "$VMID" "$newest_filter" <<< "$tasks")
  batch_task=$(jq -c --arg id "" "$newest_filter" <<< "$tasks")
  if [ -n "$vm_task" ]; then
    vm_end=$(jq -r '.endtime // 0' <<< "$vm_task")
  fi
  if [ -n "$batch_task" ]; then
    batch_end=$(jq -r '.endtime // 0' <<< "$batch_task")
  fi

  # Use the newest backup (VM-specific or batch)
  if [ "$vm_end" -gt 0 ] && [ "$vm_end" -ge "$batch_end" ]; then
    latest="$vm_task"
    backup_type="VM"
  elif [ "$batch_end" -gt 0 ]; then
    latest="$batch_task"
    backup_type="batch"
  else
    echo "CRITICAL - No backup found for VM $VMID ($vm_name)"
    exit $STATE_CRITICAL
  fi

  # Extract values
  status=$(jq -r '.status // "unknown"' <<< "$latest")
  endtime=$(jq -r '.endtime // 0' <<< "$latest")
  starttime=$(jq -r '.starttime // 0' <<< "$latest")
  age_seconds=$((NOW - endtime))
  age_hours=$((age_seconds / 3600))
  age_minutes=$(((age_seconds % 3600) / 60))
  backup_date=$(date -d "@$endtime" "+%Y-%m-%d %H:%M" 2>/dev/null)
  duration_min=$(((endtime - starttime) / 60))

  # Check status: anything other than the literal "OK" counts as a failure
  if [ "$status" != "OK" ]; then
    echo "CRITICAL - $vm_name backup failed: $status (at $backup_date)"
    exit $STATE_CRITICAL
  fi

  # Check age
  if [ "$endtime" -lt "$THRESHOLD" ]; then
    echo "CRITICAL - $vm_name backup too old: ${age_hours}h ${age_minutes}m (threshold: ${MAX_AGE_HOURS}h)"
    exit $STATE_CRITICAL
  fi

  echo "OK - $vm_name backup ($backup_type): ${age_hours}h ${age_minutes}m ago ($backup_date, ${duration_min}min)"
  exit $STATE_OK
}

if [ -n "$VMID" ]; then
  check_single_vm
fi
#########################################################################
# Mode: Check ALL running VMs
#
# Lists the QEMU VMs of this node whose status is "running" and verifies
# that each one is covered by a successful backup newer than THRESHOLD:
# either one recent batch task (empty task id) or a recent task per VM.
#
# Globals read: NODE, NOW, THRESHOLD, STATE_*
# Exits: OK       - nothing to back up, or every running VM is covered
#        CRITICAL - at least one running VM has no recent successful backup
#        UNKNOWN  - the VM list or the tasks API could not be queried
#########################################################################
check_all_running_vms() {
  local vm_list running_vms tasks batch_task latest
  local endtime age_h vm_count vm_names backup_date
  local vmid vmname
  local missing="" ok_list=""
  local -a vm_lines=()

  # Get list of VMs. The exit status of pvesh itself is checked here: when it
  # is piped straight into jq, an API failure is indistinguishable from "no
  # running VMs" and would be reported as OK.
  if ! vm_list=$(pvesh get "/nodes/${NODE}/qemu" --output-format json 2>/dev/null); then
    echo "UNKNOWN - Failed to query Proxmox VM list"
    exit $STATE_UNKNOWN
  fi

  # One "vmid:name" line per running VM; VMs without a configured name are
  # shown as "vm<vmid>" instead of "null".
  if ! running_vms=$(jq -r '
      .[] | select(.status == "running")
      | [(.vmid | tostring), (.name // "vm\(.vmid)")] | join(":")
    ' <<< "$vm_list" 2>/dev/null); then
    echo "UNKNOWN - Unexpected response from Proxmox VM list API"
    exit $STATE_UNKNOWN
  fi

  if [ -z "$running_vms" ]; then
    if [ "$(jq -r 'length' <<< "$vm_list" 2>/dev/null)" = "0" ]; then
      echo "OK - No VMs on this node"
    else
      echo "OK - No running VMs to backup"
    fi
    exit $STATE_OK
  fi

  mapfile -t vm_lines <<< "$running_vms"
  vm_count=${#vm_lines[@]}

  # Get all backup tasks
  if ! tasks=$(pvesh get "/nodes/${NODE}/tasks" --typefilter vzdump --limit 100 --output-format json 2>/dev/null) \
    || [ -z "$tasks" ]; then
    echo "UNKNOWN - Failed to query Proxmox tasks API"
    exit $STATE_UNKNOWN
  fi

  # Check for batch backup first (most efficient): one recent successful
  # batch task is taken to cover every running VM.
  batch_task=$(jq -c --argjson threshold "$THRESHOLD" '
    [.[] | select(.id == "" and .status == "OK" and .endtime >= $threshold)]
    | sort_by(.endtime) | last // empty
  ' <<< "$tasks")
  if [ -n "$batch_task" ] && [ "$batch_task" != "null" ]; then
    endtime=$(jq -r '.endtime' <<< "$batch_task")
    age_h=$(((NOW - endtime) / 3600))
    vm_names=$(cut -d: -f2 <<< "$running_vms" | paste -sd, -)
    backup_date=$(date -d "@$endtime" "+%Y-%m-%d %H:%M" 2>/dev/null)
    echo "OK - Batch backup for $vm_count VMs ($vm_names): ${age_h}h ago ($backup_date) | vms=$vm_count"
    exit $STATE_OK
  fi

  # No recent batch backup - check individual VMs
  while IFS=: read -r vmid vmname; do
    latest=$(jq -c --arg vmid "$vmid" '
      [.[] | select(.id == $vmid and .status == "OK" and .endtime != null)]
      | sort_by(.endtime) | last // empty
    ' <<< "$tasks")
    if [ -z "$latest" ] || [ "$latest" = "null" ]; then
      missing="$missing $vmname"
      continue
    fi
    endtime=$(jq -r '.endtime' <<< "$latest")
    if [ "$endtime" -lt "$THRESHOLD" ]; then
      age_h=$(((NOW - endtime) / 3600))
      missing="$missing $vmname(${age_h}h)"
    else
      ok_list="$ok_list $vmname"
    fi
  done <<< "$running_vms"

  if [ -n "$missing" ]; then
    echo "CRITICAL - No recent backup for:$missing"
    exit $STATE_CRITICAL
  fi

  echo "OK - All $vm_count VMs backed up:$ok_list | vms=$vm_count"
  exit $STATE_OK
}

if [ "$CHECK_ALL" -eq 1 ]; then
  check_all_running_vms
fi
#########################################################################
# Mode: Check ANY backup (default)
#
# Requests at most 10 vzdump tasks from the API, takes the one with the
# greatest endtime and reports on its status and age. Reached only when
# neither of the earlier modes has already exited.
#########################################################################
if ! TASKS=$(pvesh get "/nodes/${NODE}/tasks" --typefilter vzdump --limit 10 --output-format json 2>/dev/null) \
  || [ -z "$TASKS" ]; then
  echo "UNKNOWN - Failed to query Proxmox tasks API"
  exit $STATE_UNKNOWN
fi

# Task with the highest endtime; jq prints nothing when the list is empty.
LATEST=$(jq -r 'sort_by(.endtime) | last // empty' <<< "$TASKS")
case "$LATEST" in
  "" | null)
    echo "CRITICAL - No backup tasks found"
    exit $STATE_CRITICAL
    ;;
esac

# Pull the individual fields out of the selected task.
STATUS=$(jq -r '.status // "unknown"' <<< "$LATEST")
ENDTIME=$(jq -r '.endtime // 0' <<< "$LATEST")
STARTTIME=$(jq -r '.starttime // 0' <<< "$LATEST")
TASK_VMID=$(jq -r '.id // ""' <<< "$LATEST")

if [ "$ENDTIME" -eq 0 ]; then
  echo "UNKNOWN - Could not parse backup endtime"
  exit $STATE_UNKNOWN
fi

# Message prefix: an empty task id marks a batch backup; otherwise use the
# guest's name when it can be resolved.
MSG_PREFIX="Batch backup"
if [ -n "$TASK_VMID" ]; then
  VM_NAME=$(get_vm_name "$TASK_VMID")
  if [ -n "$VM_NAME" ]; then
    MSG_PREFIX="$VM_NAME backup"
  else
    MSG_PREFIX="Backup"
  fi
fi

# Age of the backup and how long it ran.
AGE_SECONDS=$((NOW - ENDTIME))
AGE_HOURS=$((AGE_SECONDS / 3600))
AGE_MINUTES=$((AGE_SECONDS % 3600 / 60))
DURATION=$((ENDTIME - STARTTIME))
DURATION_MIN=$((DURATION / 60))
BACKUP_DATE=$(date -d "@$ENDTIME" "+%Y-%m-%d %H:%M" 2>/dev/null)

# A status other than "OK" is reported before the age is considered.
if [ "$STATUS" != "OK" ]; then
  echo "CRITICAL - $MSG_PREFIX failed: $STATUS (at $BACKUP_DATE)"
  exit $STATE_CRITICAL
fi

if [ "$ENDTIME" -lt "$THRESHOLD" ]; then
  echo "CRITICAL - $MSG_PREFIX too old: ${AGE_HOURS}h ${AGE_MINUTES}m (threshold: ${MAX_AGE_HOURS}h)"
  exit $STATE_CRITICAL
fi

echo "OK - $MSG_PREFIX: ${AGE_HOURS}h ${AGE_MINUTES}m ago ($BACKUP_DATE, ${DURATION_MIN}min)"
exit $STATE_OK
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment