Skip to content

Instantly share code, notes, and snippets.

@jp1337
Created January 7, 2026 12:32
Show Gist options
  • Select an option

  • Save jp1337/1d5e4c885fe3a1820d3cf810c670f06c to your computer and use it in GitHub Desktop.

Select an option

Save jp1337/1d5e4c885fe3a1820d3cf810c670f06c to your computer and use it in GitHub Desktop.
Talos Linux Upgrade Script
#!/bin/bash
################################################################################
# Talos Linux Cluster Update Script (Generic Version)
################################################################################
# This script performs a safe, rolling update of Talos Linux nodes with:
# - Automatic node discovery via kubectl (control plane & worker nodes)
# - Automatic etcd backups before updates
# - Health checks and node readiness validation
# - Graceful upgrade strategy with configuration preservation
# - Automatic detection of the latest Talos Linux version
# - Uses official Metal installer images (suitable for bare metal/VM deployments)
#
# Usage:
# ./update-talos-nodes.sh [OPTIONS]
#
# Options:
# --version VERSION Specify a specific version (e.g., v1.9.2)
# --skip-backup Skip etcd backup creation (NOT recommended)
# --dry-run Show what would be done without executing
# --help Show this help message
#
# Emergency Recovery:
# Factory reset a node: talosctl -n xx.xx.xx.xx reset --graceful=false --reboot --system-labels-to-wipe=EPHEMERAL
#
# Factory Images:
# For custom images with extensions, create them at: https://factory.talos.dev/
################################################################################
# Exit on error, undefined variables, and pipe failures
set -euo pipefail

# --- Script configuration -----------------------------------------------
# Assignment is kept separate from 'readonly' for command substitutions so
# a failing command is not masked by the readonly builtin's exit status.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
readonly SCRIPT_DIR
TIMESTAMP="$(date +%Y%m%d-%H%M%S)"
readonly TIMESTAMP

readonly BACKUP_DIR="${SCRIPT_DIR}/talos-backups"
readonly LOG_DIR="${SCRIPT_DIR}/talos-logs"
readonly LOG_FILE="${LOG_DIR}/talos-update-${TIMESTAMP}.log"

# --- Talos configuration -------------------------------------------------
readonly REPO="siderolabs/talos"

# Node lists, populated later by get_cluster_nodes().
CONTROL_PLANE_NODES=()
WORKER_NODES=()
CONTROL_PLANE_NODE=""

# --- Command-line option defaults ----------------------------------------
SKIP_BACKUP=false
DRY_RUN=false
VERSION=""

# --- ANSI color codes for terminal output ---------------------------------
readonly RED='\033[0;31m'
readonly GREEN='\033[0;32m'
readonly YELLOW='\033[1;33m'
readonly BLUE='\033[0;34m'
readonly NC='\033[0m' # No Color
################################################################################
# Helper Functions
################################################################################
# Print formatted messages
# All four public loggers share one implementation: a colored level tag,
# a timestamp, and the message, echoed to stdout and appended to LOG_FILE.
# Globals read: LOG_FILE, NC, and the color passed in.
# $1 - ANSI color escape, $2 - level label, $3 - message
_log() {
  local color="$1" level="$2" msg="$3"
  echo -e "${color}[${level}]${NC} $(date '+%Y-%m-%d %H:%M:%S') - ${msg}" | tee -a "${LOG_FILE}"
}
log_info()    { _log "${BLUE}"   "INFO"    "$1"; }
log_success() { _log "${GREEN}"  "SUCCESS" "$1"; }
log_warning() { _log "${YELLOW}" "WARNING" "$1"; }
log_error()   { _log "${RED}"    "ERROR"   "$1"; }
# Print an 80-column '=' rule to stdout and append it to the log file.
print_separator() {
  local rule
  rule="$(printf '=%.0s' {1..80})"
  printf '%s\n' "$rule" | tee -a "${LOG_FILE}"
}
# Show usage information
# Prints the usage text (with BACKUP_DIR/LOG_DIR already resolved, since the
# heredoc delimiter is unquoted and therefore expands variables) and exits.
# NOTE(review): always exits 0, even when reached from the unknown-option
# branch of argument parsing — an invalid invocation still reports success.
show_help() {
cat << EOF
Talos Linux Cluster Update Script (Generic Version)
This script automatically updates your Talos Linux cluster to the latest version
using official Metal installer images.
Usage: $0 [OPTIONS]
Options:
--version VERSION Specify a specific version (e.g., v1.9.2)
--skip-backup Skip etcd backup creation (NOT recommended)
--dry-run Show what would be done without executing
--help Show this help message
Examples:
$0 # Update to latest version
$0 --version v1.9.2 # Update to specific version
$0 --dry-run # Preview updates without executing
Notes:
- This script uses official Talos Metal installer images
- For custom factory images with extensions, modify the INSTALLER_IMAGE variable
- Backups are saved to: ${BACKUP_DIR}
- Logs are saved to: ${LOG_DIR}
EOF
exit 0
}
# Get cluster nodes from kubectl
# Discovers node INTERNAL-IPs (column 6 of 'kubectl get nodes -o wide') and
# populates the globals CONTROL_PLANE_NODES, WORKER_NODES and
# CONTROL_PLANE_NODE (first control-plane IP, used for talosctl operations).
# Exits the script when the cluster is unreachable or has no control plane.
get_cluster_nodes() {
  log_info "Discovering nodes from cluster..."
  if ! kubectl cluster-info &> /dev/null; then
    log_error "Cannot connect to Kubernetes cluster"
    exit 1
  fi

  # Control plane nodes: rows whose ROLES column contains "control-plane".
  # The trailing '|| true' keeps 'set -o pipefail' from aborting the whole
  # script when grep matches nothing; the empty result is reported below.
  local cp_nodes
  cp_nodes=$(kubectl get nodes -o wide --no-headers | grep -i "control-plane" | awk '{print $6}' || true)
  if [ -z "$cp_nodes" ]; then
    log_error "No control plane nodes found in cluster"
    exit 1
  fi
  while IFS= read -r node_ip; do
    if [ -n "$node_ip" ]; then
      CONTROL_PLANE_NODES+=("$node_ip")
    fi
  done <<< "$cp_nodes"

  # First control plane node is used for etcd/upgrade-k8s operations.
  CONTROL_PLANE_NODE="${CONTROL_PLANE_NODES[0]}"
  log_success "Found ${#CONTROL_PLANE_NODES[@]} control plane node(s): ${CONTROL_PLANE_NODES[*]}"

  # Worker nodes: everything without the control-plane role. On a cluster
  # with no workers 'grep -v' exits 1, which previously killed the script
  # under 'set -euo pipefail'; '|| true' makes the worker-less case valid.
  local worker_nodes
  worker_nodes=$(kubectl get nodes -o wide --no-headers | grep -v "control-plane" | awk '{print $6}' || true)
  while IFS= read -r node_ip; do
    if [ -n "$node_ip" ]; then
      WORKER_NODES+=("$node_ip")
    fi
  done <<< "$worker_nodes"

  if [ ${#WORKER_NODES[@]} -gt 0 ]; then
    log_success "Found ${#WORKER_NODES[@]} worker node(s): ${WORKER_NODES[*]}"
  else
    log_info "No worker nodes found in cluster"
  fi
  log_info "Using control plane node for operations: ${CONTROL_PLANE_NODE}"
}
# Check if required tools are installed
# Verifies that talosctl, kubectl and curl are on PATH; exits the script
# with a list of everything missing otherwise.
check_dependencies() {
  log_info "Checking required dependencies..."
  local tool
  local missing_deps=()
  # Check in a fixed order so the reported list is deterministic.
  for tool in talosctl kubectl curl; do
    if ! command -v "$tool" &> /dev/null; then
      missing_deps+=("$tool")
    fi
  done
  if [ ${#missing_deps[@]} -ne 0 ]; then
    log_error "Missing required dependencies: ${missing_deps[*]}"
    log_error "Please install: ${missing_deps[*]}"
    exit 1
  fi
  log_success "All dependencies are installed"
}
# Get the latest Talos version from GitHub
# Fetches the latest release tag (e.g. "v1.9.2") from the GitHub API and
# prints it on stdout; exits the script with an error message on failure.
get_latest_version() {
  local version
  # 'curl --fail' returns non-zero on HTTP errors instead of emitting the
  # error body; the sed extraction is portable (the previous 'grep -P' is
  # GNU-only and unavailable on macOS/BSD). The trailing '|| true' keeps
  # 'set -o pipefail' from exiting before the explicit error check below.
  version=$(curl --silent --fail --location "https://api.github.com/repos/${REPO}/releases/latest" \
    | sed -n 's/.*"tag_name": *"\([^"]*\)".*/\1/p' | head -n 1 || true)
  if [ -z "$version" ]; then
    log_error "Failed to fetch latest version from GitHub"
    exit 1
  fi
  echo "$version"
}
# Get node status from Kubernetes
# Prints the STATUS column (e.g. "Ready"/"NotReady") for the node whose
# INTERNAL-IP exactly equals $1; prints nothing when no node matches.
get_node_status() {
  local node_ip="$1"
  # Exact field comparison: the previous 'grep "$node_ip"' was a substring
  # regex match, so 10.0.0.1 also matched 10.0.0.10 (and '.' matched any
  # character). awk returning 0 on no match also avoids a pipefail abort.
  kubectl get nodes -o wide --no-headers | awk -v ip="$node_ip" '$6 == ip {print $2}'
}
# Get node version from Kubernetes
# Prints the Talos version for the node whose INTERNAL-IP exactly equals $1.
# $9 is the second word of the OS-IMAGE column, e.g. "(v1.9.2)" from
# "Talos (v1.9.2)"; the parentheses are stripped to leave the bare tag.
get_node_version() {
  local node_ip="$1"
  # Exact field match fixes the substring bug where e.g. 10.0.0.1 also
  # matched 10.0.0.10 and returned the wrong node's version.
  kubectl get nodes -o wide --no-headers | awk -v ip="$node_ip" '$6 == ip {print $9}' | sed 's/[()]//g'
}
# Wait for node to become ready
# Polls get_node_status every 10s until the node reports Ready or the
# timeout elapses.
# $1 - node IP; $2 - timeout in seconds (default 300)
# Returns 0 once Ready, 1 on timeout.
wait_for_node_ready() {
  local node_ip="$1"
  local max_wait="${2:-300}" # Default 5 minutes
  local poll_interval=10
  local elapsed=0
  log_info "Waiting for node ${node_ip} to become ready (timeout: ${max_wait}s)..."
  while [ "$elapsed" -lt "$max_wait" ]; do
    if [ "$(get_node_status "$node_ip")" = "Ready" ]; then
      log_success "Node ${node_ip} is ready after ${elapsed}s"
      return 0
    fi
    # Progress dot between polls, mirrored into the log file.
    echo -n "." | tee -a "${LOG_FILE}"
    sleep "$poll_interval"
    elapsed=$((elapsed + poll_interval))
  done
  echo "" | tee -a "${LOG_FILE}"
  log_error "Node ${node_ip} did not become ready within ${max_wait}s"
  return 1
}
# Check cluster health
check_cluster_health() {
log_info "Checking cluster health..."
if ! kubectl cluster-info &> /dev/null; then
log_error "Cannot connect to Kubernetes cluster"
return 1
fi
local not_ready
not_ready=$(kubectl get nodes --no-headers | grep -c "NotReady" || true)
if [ "$not_ready" -gt 0 ]; then
log_warning "${not_ready} node(s) are not ready"
kubectl get nodes | tee -a "${LOG_FILE}"
return 1
fi
log_success "All nodes are ready"
return 0
}
# Create etcd backup
# Takes two backups before any node is touched:
#   1. a snapshot via 'talosctl etcd snapshot' (authoritative; verified for
#      non-zero size below — its failure aborts with return 1), and
#   2. a raw copy of the live etcd database file (best-effort; failure only
#      warns, since the snapshot above is the recovery artifact).
# Globals read: SKIP_BACKUP, BACKUP_DIR, TIMESTAMP, CONTROL_PLANE_NODE.
# Returns 0 on success or when skipped via --skip-backup, 1 on failure.
create_etcd_backup() {
if [ "$SKIP_BACKUP" = true ]; then
log_warning "Skipping etcd backup (--skip-backup flag set)"
return 0
fi
print_separator
log_info "Creating etcd backups for disaster recovery..."
# Create backup directory if it doesn't exist
mkdir -p "${BACKUP_DIR}"
local backup_file="${BACKUP_DIR}/etcd-snapshot-${TIMESTAMP}.db"
local backup_file_raw="${BACKUP_DIR}/etcd-db-${TIMESTAMP}.db"
# Create snapshot using talosctl
log_info "Creating etcd snapshot via talosctl..."
if talosctl -n "${CONTROL_PLANE_NODE}" etcd snapshot "${backup_file}"; then
log_success "Snapshot saved to: ${backup_file}"
else
log_error "Failed to create etcd snapshot"
return 1
fi
# Create second backup by copying the database directly
# NOTE(review): copies the db file of a live etcd member — presumably only
# useful as a last-resort artifact; the snapshot above is the real backup.
log_info "Creating raw etcd database backup..."
if talosctl -n "${CONTROL_PLANE_NODE}" cp /var/lib/etcd/member/snap/db "${backup_file_raw}"; then
log_success "Raw backup saved to: ${backup_file_raw}"
else
log_warning "Failed to create raw etcd backup (non-critical)"
fi
# Verify backups exist and have non-zero size
# Only the snapshot is verified; the raw copy is best-effort by design.
if [ ! -s "${backup_file}" ]; then
log_error "Backup file is empty or doesn't exist: ${backup_file}"
return 1
fi
local backup_size
backup_size=$(du -h "${backup_file}" | cut -f1)
log_success "Backup verification passed (size: ${backup_size})"
print_separator
return 0
}
# Update a single node
# Upgrades one Talos node to the global target VERSION.
# $1 - node IP, $2 - installer image reference, $3 - node type label (logs)
# Behavior:
#   - no-op when the node already reports the target version
#   - a NotReady node is power-cycled first and must recover within 180s
#   - the upgrade runs with --preserve (keep node data) and --stage, then
#     waits up to 600s for the node to return to Ready
#   - after each real upgrade, pauses for interactive operator confirmation
#     (blocks on stdin — not suitable for unattended runs)
# Returns 0 on success or dry-run, 1 on any failure.
update_node() {
local node_ip="$1"
local image="$2"
local node_type="$3"
print_separator
log_info "Processing ${node_type} node: ${node_ip}"
local status
status=$(get_node_status "$node_ip")
local running_version
running_version=$(get_node_version "$node_ip")
log_info "Current status: ${status}"
log_info "Current version: ${running_version}"
log_info "Target version: ${VERSION}"
# Check if already on target version
if [ "$running_version" = "$VERSION" ]; then
log_success "Node ${node_ip} is already on version ${VERSION}"
print_separator
return 0
fi
# Handle NotReady nodes
# An unhealthy node must be recovered before upgrading, so a failed upgrade
# is not confused with a pre-existing failure.
if [ "$status" = "NotReady" ]; then
log_warning "Node ${node_ip} is not ready, attempting reboot..."
if [ "$DRY_RUN" = false ]; then
if ! talosctl reboot -n "$node_ip" --mode "powercycle" --timeout 120s; then
log_error "Failed to reboot node ${node_ip}"
return 1
fi
if ! wait_for_node_ready "$node_ip" 180; then
log_error "Node ${node_ip} did not become ready after reboot"
return 1
fi
else
log_info "[DRY-RUN] Would reboot node ${node_ip}"
fi
fi
# Perform the upgrade
log_info "Upgrading node ${node_ip} to ${VERSION}..."
if [ "$DRY_RUN" = false ]; then
if ! talosctl upgrade --nodes "$node_ip" --image "$image" --preserve --stage; then
log_error "Upgrade failed for node ${node_ip}"
return 1
fi
log_success "Upgrade command sent successfully"
# Wait for node to complete upgrade and become ready
log_info "Waiting for node to complete upgrade and become ready..."
if ! wait_for_node_ready "$node_ip" 600; then
log_error "Node ${node_ip} did not become ready after upgrade"
return 1
fi
# Verify new version
# NOTE(review): a mismatch here only warns — the caller still treats the
# node as updated and proceeds to the next one.
local new_version
new_version=$(get_node_version "$node_ip")
log_info "Node version after upgrade: ${new_version}"
if [ "$new_version" = "$VERSION" ]; then
log_success "Node ${node_ip} successfully upgraded to ${VERSION}"
else
log_warning "Node version (${new_version}) doesn't match target (${VERSION})"
fi
# Manual verification pause
echo ""
log_info "Please verify the node status manually:"
kubectl get nodes -o wide | grep "$node_ip" | tee -a "${LOG_FILE}"
echo ""
read -p "Press [ENTER] to continue with the next node, or Ctrl+C to abort... " -r
else
log_info "[DRY-RUN] Would upgrade node ${node_ip} to ${image}"
fi
print_separator
return 0
}
# Update all nodes of a specific type
# Rolls the upgrade across every node in the named array, one at a time.
# $1 - NAME of the array variable holding node IPs (bound via nameref)
# $2 - installer image reference
# $3 - human-readable node type label for logging
# Stops at the first failed node, or at the first failed post-update
# cluster health check. Returns 0 when every node succeeded.
update_nodes() {
  local -n node_list=$1
  local image="$2"
  local node_type="$3"
  local node_ip

  if (( ${#node_list[@]} == 0 )); then
    log_info "No ${node_type} nodes to update"
    return 0
  fi

  log_info "Starting update of ${#node_list[@]} ${node_type} node(s)..."
  for node_ip in "${node_list[@]}"; do
    if ! update_node "$node_ip" "$image" "$node_type"; then
      log_error "Failed to update ${node_type} node ${node_ip}"
      log_error "Aborting further updates for safety"
      return 1
    fi
    # Confirm the cluster recovered before touching the next node.
    if [ "$DRY_RUN" = false ] && ! check_cluster_health; then
      log_error "Cluster health check failed after updating ${node_ip}"
      log_error "Please investigate before continuing"
      return 1
    fi
  done
  log_success "All ${node_type} nodes updated successfully"
  return 0
}
# Update Kubernetes version
# Triggers a Kubernetes upgrade via talosctl against the primary control
# plane node. A failure is logged as a warning only, since talosctl also
# exits non-zero when no newer Kubernetes version is available.
update_kubernetes() {
  print_separator
  log_info "Updating Kubernetes version..."
  if [ "$DRY_RUN" = true ]; then
    log_info "[DRY-RUN] Would upgrade Kubernetes via: talosctl upgrade-k8s -n ${CONTROL_PLANE_NODE}"
  else
    if talosctl upgrade-k8s -n "${CONTROL_PLANE_NODE}"; then
      log_success "Kubernetes upgrade initiated"
    else
      log_warning "Kubernetes upgrade failed or no upgrade available"
    fi
  fi
  print_separator
}
################################################################################
# Main Script
################################################################################
# Parse command line arguments
while [[ $# -gt 0 ]]; do
  case $1 in
    --version)
      # Require an explicit value: with a bare "--version", "$2" would
      # previously trip 'set -u' ("unbound variable") and 'shift 2' would
      # fail under 'set -e' — both with unhelpful diagnostics.
      if [[ $# -lt 2 ]]; then
        echo "Error: --version requires a value (e.g. --version v1.9.2)" >&2
        exit 1
      fi
      VERSION="$2"
      shift 2
      ;;
    --skip-backup)
      SKIP_BACKUP=true
      shift
      ;;
    --dry-run)
      DRY_RUN=true
      shift
      ;;
    --help)
      show_help
      ;;
    *)
      # Diagnostics belong on stderr; show_help then prints usage and exits.
      echo "Unknown option: $1" >&2
      show_help
      ;;
  esac
done
# Initialize logging
# (must happen before the first log_* call, which appends to LOG_FILE)
mkdir -p "${LOG_DIR}"
touch "${LOG_FILE}"
# Print header
print_separator
log_info "Talos Linux Cluster Update Script (Generic Version)"
log_info "Started at: $(date '+%Y-%m-%d %H:%M:%S')"
log_info "Log file: ${LOG_FILE}"
print_separator
# Check dependencies
check_dependencies
# Discover cluster nodes
get_cluster_nodes
# Get version if not specified
if [ -z "$VERSION" ]; then
log_info "Fetching latest Talos version from GitHub..."
VERSION=$(get_latest_version)
log_info "Target version: ${VERSION} (latest)"
else
log_info "Target version: ${VERSION} (specified)"
fi
# Build installer image URL (using official Metal installer)
# For factory images with system extensions, this is the variable to change
# (see https://factory.talos.dev/ note in the file header).
INSTALLER_IMAGE="ghcr.io/siderolabs/installer:${VERSION}"
log_info "Installer image: ${INSTALLER_IMAGE}"
# Show dry-run notice if applicable
if [ "$DRY_RUN" = true ]; then
log_warning "DRY-RUN MODE: No changes will be made"
fi
# Initial cluster health check
# Refuse to start a rolling upgrade on an already-degraded cluster.
if ! check_cluster_health; then
log_error "Cluster is not healthy. Please fix issues before updating."
exit 1
fi
# Create etcd backups
# On backup failure the operator may explicitly accept the risk and
# continue without one (interactive prompt — blocks on stdin).
if ! create_etcd_backup; then
log_error "Failed to create etcd backup"
read -p "Continue without backup? [y/N] " -n 1 -r
echo
if [[ ! $REPLY =~ ^[Yy]$ ]]; then
log_error "Update aborted by user"
exit 1
fi
fi
# Update control plane nodes first
# (array is passed by NAME — update_nodes binds it via nameref)
if ! update_nodes CONTROL_PLANE_NODES "$INSTALLER_IMAGE" "Control Plane"; then
log_error "Failed to update control plane nodes"
exit 1
fi
# Update worker nodes
if ! update_nodes WORKER_NODES "$INSTALLER_IMAGE" "Worker"; then
log_error "Failed to update worker nodes"
exit 1
fi
# Update Kubernetes
update_kubernetes
# Final health check
# NOTE(review): a failed final check only warns — the summary below still
# reports success; operators should read the log, not just the exit code.
if [ "$DRY_RUN" = false ]; then
log_info "Performing final cluster health check..."
if check_cluster_health; then
log_success "Cluster is healthy after updates"
else
log_warning "Cluster health check failed after updates"
fi
fi
# Print summary
print_separator
log_success "Cluster update completed successfully!"
log_info "All nodes have been updated to version ${VERSION}"
log_info "Backup location: ${BACKUP_DIR}"
log_info "Log file: ${LOG_FILE}"
print_separator
exit 0
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment