Created
January 22, 2026 01:53
-
-
Save michabbb/a7d01b29feef01f7d1bc4d9b789dc5ce to your computer and use it in GitHub Desktop.
Nagios/Icinga plugin to monitor Proxmox VE (vzdump) backup status via the Proxmox API. Supports checking specific VMs, all running VMs, or any recent backup. Works with both single-VM and batch backup jobs.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/bin/bash | |
| # | |
| # check_pve_backup - Nagios/Icinga plugin for Proxmox VE backup monitoring | |
| # | |
| # Monitors vzdump backup status via the Proxmox API (pvesh). | |
| # Supports checking specific VMs, all running VMs, or any recent backup. | |
| # | |
| # Requirements: | |
| # - Must run on a Proxmox VE host (needs pvesh command) | |
| # - jq for JSON parsing | |
| # - For NRPE: sudo access to run this script as root | |
| # | |
| # Modes: | |
| # Default: Check if ANY backup ran successfully | |
| # -v VMID: Check backup for specific VM (considers both VM-specific AND batch backups) | |
| # -a: Check that ALL running VMs have a recent backup | |
| # | |
| # License: MIT | |
| # | |
# Nagios exit codes. These values are dictated by the plugin API, so they are
# marked read-only to keep later code from clobbering them by accident.
readonly STATE_OK=0
readonly STATE_WARNING=1
readonly STATE_CRITICAL=2
readonly STATE_UNKNOWN=3

# Default values (deliberately writable: option parsing overrides them)
MAX_AGE_HOURS=26   # Default: 26 hours (daily backup + buffer)
VMID=""            # Optional: specific VM ID to check
CHECK_ALL=0        # Check all running VMs
# Print the help text to stdout and terminate the plugin.
# Globals: STATE_UNKNOWN (read) - used as the exit status.
# Always exits; never returns to the caller.
usage() {
    local self=$0
    local -a help_text=(
        "Usage: ${self} [-h MAX_HOURS] [-v VMID] [-a]"
        'Nagios/Icinga plugin to monitor Proxmox VE (vzdump) backup status.'
        'Options:'
        '-h MAX_HOURS Maximum age of last backup in hours (default: 26)'
        '-v VMID Check backup for specific VM ID (uses newest backup)'
        '-a Check ALL running VMs have a backup'
        '-? Show this help message'
        'Examples:'
        "${self} # Check if any backup ran within last 26 hours"
        "${self} -v 100 # Check if VM 100 was backed up within 26h"
        "${self} -h 30 -v 100 # Check VM 100 with 30 hour threshold"
        "${self} -a # Check ALL running VMs have recent backups"
        "${self} -a -h 48 # Check all VMs with 48 hour threshold"
        'Exit codes:'
        '0 = OK - Backup found within threshold, status OK'
        '1 = WARNING - (not currently used)'
        '2 = CRITICAL - No backup found, backup too old, or backup failed'
        '3 = UNKNOWN - Configuration error or API unavailable'
        'NRPE configuration example:'
        'command[check_vzdump]=sudo /usr/lib/nagios/plugins/check_pve_backup -v 100 -h 26'
        'command[check_vzdump_all]=sudo /usr/lib/nagios/plugins/check_pve_backup -a'
        'Note: When using -v VMID, the script checks both VM-specific backups AND'
        'batch backups (where all VMs are backed up together). It uses whichever'
        'is more recent, so it works correctly with "all VMs" backup jobs.'
    )
    # One array element per output line
    printf '%s\n' "${help_text[@]}"
    exit "$STATE_UNKNOWN"
}
# Parse arguments.
# The optstring has no leading ":" so getopts stays in its default mode: for
# an unknown option or a missing argument it prints its own diagnostic and
# sets opt to a literal "?". "?" is also listed as an option so that an
# explicit -? reaches the same arm.
while getopts "h:v:a?" opt; do
    case "$opt" in
        h) MAX_AGE_HOURS="$OPTARG" ;;
        v) VMID="$OPTARG" ;;
        a) CHECK_ALL=1 ;;
        # Escaped so it matches only a literal "?"; unescaped it is a glob
        # that matches any single character and hides the default arm.
        \?) usage ;;
        *) usage ;;
    esac
done
# Validate MAX_AGE_HOURS is a number
if ! [[ "$MAX_AGE_HOURS" =~ ^[0-9]+$ ]]; then
    echo "UNKNOWN - Invalid value for -h: $MAX_AGE_HOURS (must be a number)"
    exit "$STATE_UNKNOWN"
fi
# VMID is interpolated into a pvesh API path later on, so accept digits only.
# An empty VMID means "no specific VM requested" and is left alone.
if [[ -n "$VMID" && ! "$VMID" =~ ^[0-9]+$ ]]; then
    echo "UNKNOWN - Invalid value for -v: $VMID (must be a numeric VM ID)"
    exit "$STATE_UNKNOWN"
fi
# Get node name for the pvesh /nodes/<node>/... API paths
NODE=$(hostname)
# hostname may report a fully qualified name; keep only the part before the
# first dot so the API paths are built from the bare host name.
NODE=${NODE%%.*}
if [ -z "$NODE" ]; then
    echo "UNKNOWN - Could not determine hostname"
    exit "$STATE_UNKNOWN"
fi
# Required external tools: bail out with UNKNOWN as soon as one is missing.

# pvesh - command-line client used for every Proxmox API query below
command -v pvesh > /dev/null 2>&1 || {
    echo "UNKNOWN - pvesh command not found (is this a Proxmox VE host?)"
    exit $STATE_UNKNOWN
}

# jq - used to pick fields out of the JSON that pvesh returns
command -v jq > /dev/null 2>&1 || {
    echo "UNKNOWN - jq command not found (apt install jq)"
    exit $STATE_UNKNOWN
}
# Calculate threshold timestamp (epoch seconds): backups that finished before
# this moment are considered too old.
NOW=$(date +%s)
# 10# forces a decimal reading. Without it a zero-padded value such as "08"
# is rejected as invalid octal and "010" would silently mean 8 hours.
THRESHOLD=$((NOW - 10#$MAX_AGE_HOURS * 3600))
#########################################################################
# Helper: Get guest name from VMID
#
# Looks the ID up as a QEMU VM first and then as an LXC container.
# Globals:   NODE (read)
# Arguments: $1 - numeric guest ID
# Outputs:   the guest's name on stdout ("name" for VMs, "hostname" for
#            containers); "VM <id>" / "CT <id>" when the guest exists but
#            has no name configured; nothing when no config could be read
# Returns:   0 if a config was found, 1 otherwise
#########################################################################
get_vm_name() {
    local vmid="$1"
    local guest_type config name fallback

    for guest_type in qemu lxc; do
        # A failing or empty query just means "not a guest of this type"
        config=$(pvesh get /nodes/"$NODE"/"$guest_type"/"$vmid"/config --output-format json 2>/dev/null) || continue
        [ -n "$config" ] || continue
        name=$(jq -r '.name // .hostname // empty' <<< "$config" 2>/dev/null) || continue

        if [ "$guest_type" = "lxc" ]; then
            fallback="CT $vmid"
        else
            fallback="VM $vmid"
        fi
        # Callers treat empty output as "guest not found", so an existing
        # but unnamed guest must still print something.
        printf '%s\n' "${name:-$fallback}"
        return 0
    done
    return 1
}
#########################################################################
# Mode: Check specific VM (considers both VM-specific AND batch backups)
#########################################################################

# Print the finished task with the greatest endtime whose "id" equals $2.
# Arguments: $1 - JSON array of tasks, $2 - task id ("" selects batch jobs)
# Outputs:   the matching task object as JSON, or nothing if none matches.
# Tasks without an endtime are ignored.
_latest_finished_task() {
    jq -r --arg id "$2" '
        [.[] | select(.id == $id and .endtime != null)] |
        sort_by(.endtime) |
        last // empty
    ' <<< "$1"
}

# Report the state of the newest backup that covers $VMID and exit.
# Globals: VMID, NODE, NOW, THRESHOLD, MAX_AGE_HOURS, STATE_* (all read)
# Outputs: one Nagios status line on stdout
# Exits:   STATE_OK       - newest backup succeeded and is recent enough
#          STATE_CRITICAL - no backup, backup failed, or backup too old
#          STATE_UNKNOWN  - VM not found or tasks API query failed
check_single_vm() {
    local vm_name tasks vm_backup batch_backup latest backup_type
    local vm_endtime=0 batch_endtime=0
    local status endtime starttime age_seconds age_hours age_minutes
    local backup_date duration_min

    # Verify VM exists
    vm_name=$(get_vm_name "$VMID")
    if [ -z "$vm_name" ]; then
        echo "UNKNOWN - VM $VMID not found on this node"
        exit "$STATE_UNKNOWN"
    fi

    # Get backup tasks
    if ! tasks=$(pvesh get /nodes/"$NODE"/tasks --typefilter vzdump --limit 50 --output-format json 2>/dev/null) \
        || [ -z "$tasks" ]; then
        echo "UNKNOWN - Failed to query Proxmox tasks API"
        exit "$STATE_UNKNOWN"
    fi

    # Find latest VM-specific backup
    vm_backup=$(_latest_finished_task "$tasks" "$VMID")
    if [ -n "$vm_backup" ]; then
        vm_endtime=$(jq -r '.endtime // 0' <<< "$vm_backup")
    fi

    # Find latest batch backup (id="")
    batch_backup=$(_latest_finished_task "$tasks" "")
    if [ -n "$batch_backup" ]; then
        batch_endtime=$(jq -r '.endtime // 0' <<< "$batch_backup")
    fi

    # Use the newest backup (VM-specific or batch); on a tie the VM-specific
    # task wins.
    if [ "$vm_endtime" -ge "$batch_endtime" ] && [ "$vm_endtime" -gt 0 ]; then
        latest="$vm_backup"
        backup_type="VM"
    elif [ "$batch_endtime" -gt 0 ]; then
        latest="$batch_backup"
        backup_type="batch"
    else
        echo "CRITICAL - No backup found for VM $VMID ($vm_name)"
        exit "$STATE_CRITICAL"
    fi

    # Extract values
    status=$(jq -r '.status // "unknown"' <<< "$latest")
    endtime=$(jq -r '.endtime // 0' <<< "$latest")
    starttime=$(jq -r '.starttime // 0' <<< "$latest")
    age_seconds=$((NOW - endtime))
    age_hours=$((age_seconds / 3600))
    age_minutes=$(((age_seconds % 3600) / 60))
    backup_date=$(date -d "@$endtime" "+%Y-%m-%d %H:%M" 2>/dev/null)
    duration_min=$(((endtime - starttime) / 60))

    # Check status (reported before age: a failed run is critical regardless)
    if [ "$status" != "OK" ]; then
        echo "CRITICAL - $vm_name backup failed: $status (at $backup_date)"
        exit "$STATE_CRITICAL"
    fi

    # Check age
    if [ "$endtime" -lt "$THRESHOLD" ]; then
        echo "CRITICAL - $vm_name backup too old: ${age_hours}h ${age_minutes}m (threshold: ${MAX_AGE_HOURS}h)"
        exit "$STATE_CRITICAL"
    fi

    echo "OK - $vm_name backup ($backup_type): ${age_hours}h ${age_minutes}m ago ($backup_date, ${duration_min}min)"
    exit "$STATE_OK"
}

if [ -n "$VMID" ]; then
    check_single_vm
fi
#########################################################################
# Mode: Check ALL running VMs
#########################################################################

# Verify that every running QEMU VM on this node is covered by a recent
# successful backup, then exit.
# Globals: NODE, NOW, THRESHOLD, STATE_* (all read)
# Outputs: one Nagios status line on stdout (with "vms=" perfdata on OK)
# Exits:   STATE_OK       - nothing to back up, a recent OK batch job exists,
#                           or every running VM has its own recent OK backup
#          STATE_CRITICAL - at least one running VM lacks a recent OK backup
#          STATE_UNKNOWN  - the VM list or the task list could not be read
check_all_running_vms() {
    local vm_list running_vms tasks batch_backup latest
    local endtime age_h vm_count vm_names backup_date
    local vmid vmname missing="" ok_list=""

    # Fetch the VM list on its own so a failing API call is detected here.
    # Piping pvesh straight into jq would only expose jq's exit status and
    # make an API outage look like "no running VMs".
    if ! vm_list=$(pvesh get /nodes/"$NODE"/qemu --output-format json 2>/dev/null) \
        || [ -z "$vm_list" ]; then
        echo "UNKNOWN - Failed to query Proxmox VM list API"
        exit "$STATE_UNKNOWN"
    fi

    # Get list of running VMs, one "vmid:name" pair per line
    if ! running_vms=$(jq -r '.[] | select(.status == "running") | "\(.vmid):\(.name)"' <<< "$vm_list" 2>/dev/null); then
        echo "UNKNOWN - Could not parse Proxmox VM list"
        exit "$STATE_UNKNOWN"
    fi

    if [ -z "$running_vms" ]; then
        if [ "$(jq -r 'length' <<< "$vm_list")" = "0" ]; then
            echo "OK - No VMs on this node"
        else
            echo "OK - No running VMs to backup"
        fi
        exit "$STATE_OK"
    fi

    # Get all backup tasks
    if ! tasks=$(pvesh get /nodes/"$NODE"/tasks --typefilter vzdump --limit 100 --output-format json 2>/dev/null) \
        || [ -z "$tasks" ]; then
        echo "UNKNOWN - Failed to query Proxmox tasks API"
        exit "$STATE_UNKNOWN"
    fi

    vm_count=$(wc -l <<< "$running_vms")

    # Check for batch backup first (most efficient): a recent successful
    # batch job (id="") is taken to cover every running VM.
    batch_backup=$(jq -r --argjson threshold "$THRESHOLD" '
        [.[] | select(.id == "" and .status == "OK" and .endtime >= $threshold)] |
        sort_by(.endtime) |
        last // empty
    ' <<< "$tasks")
    if [ -n "$batch_backup" ]; then
        endtime=$(jq -r '.endtime' <<< "$batch_backup")
        age_h=$(((NOW - endtime) / 3600))
        vm_names=$(cut -d: -f2 <<< "$running_vms" | tr '\n' ',' | sed 's/,$//')
        backup_date=$(date -d "@$endtime" "+%Y-%m-%d %H:%M" 2>/dev/null)
        echo "OK - Batch backup for $vm_count VMs ($vm_names): ${age_h}h ago ($backup_date) | vms=$vm_count"
        exit "$STATE_OK"
    fi

    # No recent batch backup - check individual VMs
    while IFS=: read -r vmid vmname; do
        latest=$(jq -r --arg vmid "$vmid" '
            [.[] | select(.id == $vmid and .status == "OK" and .endtime != null)] |
            sort_by(.endtime) |
            last // empty
        ' <<< "$tasks")
        if [ -z "$latest" ]; then
            missing="$missing $vmname"
            continue
        fi
        endtime=$(jq -r '.endtime' <<< "$latest")
        if [ "$endtime" -lt "$THRESHOLD" ]; then
            age_h=$(((NOW - endtime) / 3600))
            missing="$missing $vmname(${age_h}h)"
        else
            ok_list="$ok_list $vmname"
        fi
    done <<< "$running_vms"

    if [ -n "$missing" ]; then
        echo "CRITICAL - No recent backup for:$missing"
        exit "$STATE_CRITICAL"
    fi

    echo "OK - All $vm_count VMs backed up:$ok_list | vms=$vm_count"
    exit "$STATE_OK"
}

if [ "${CHECK_ALL:-0}" -eq 1 ]; then
    check_all_running_vms
fi
#########################################################################
# Mode: Check ANY backup (default)
#
# Reached only when neither -v nor -a was given. Looks at up to 10 vzdump
# tasks, takes the one with the greatest endtime and judges its status
# first, then its age.
#########################################################################
if ! recent_tasks=$(pvesh get /nodes/"$NODE"/tasks --typefilter vzdump --limit 10 --output-format json 2>/dev/null) \
    || [ -z "$recent_tasks" ]; then
    echo "UNKNOWN - Failed to query Proxmox tasks API"
    exit $STATE_UNKNOWN
fi

# Last element of the ascending sort == task with the greatest endtime
newest=$(jq -r 'sort_by(.endtime) | last // empty' <<< "$recent_tasks")
if [ -z "$newest" ] || [ "$newest" = "null" ]; then
    echo "CRITICAL - No backup tasks found"
    exit $STATE_CRITICAL
fi

# Pull out the fields needed for the verdict; missing ones get a default
task_status=$(jq -r '.status // "unknown"' <<< "$newest")
finished_at=$(jq -r '.endtime // 0' <<< "$newest")
started_at=$(jq -r '.starttime // 0' <<< "$newest")
task_vmid=$(jq -r '.id // ""' <<< "$newest")

if [ "$finished_at" -eq 0 ]; then
    echo "UNKNOWN - Could not parse backup endtime"
    exit $STATE_UNKNOWN
fi

elapsed=$((NOW - finished_at))
elapsed_h=$((elapsed / 3600))
elapsed_m=$(((elapsed % 3600) / 60))
finished_date=$(date -d "@$finished_at" "+%Y-%m-%d %H:%M" 2>/dev/null)
runtime_min=$(((finished_at - started_at) / 60))

# Label for the status line: an empty task id marks a batch job, otherwise
# use the guest's name when it can be resolved.
label="Batch backup"
if [ -n "$task_vmid" ]; then
    guest_name=$(get_vm_name "$task_vmid")
    if [ -n "$guest_name" ]; then
        label="$guest_name backup"
    else
        label="Backup"
    fi
fi

if [ "$task_status" != "OK" ]; then
    echo "CRITICAL - $label failed: $task_status (at $finished_date)"
    exit $STATE_CRITICAL
fi

if [ "$finished_at" -lt "$THRESHOLD" ]; then
    echo "CRITICAL - $label too old: ${elapsed_h}h ${elapsed_m}m (threshold: ${MAX_AGE_HOURS}h)"
    exit $STATE_CRITICAL
fi

echo "OK - $label: ${elapsed_h}h ${elapsed_m}m ago ($finished_date, ${runtime_min}min)"
exit $STATE_OK
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment