Created
January 7, 2026 12:32
-
-
Save jp1337/1d5e4c885fe3a1820d3cf810c670f06c to your computer and use it in GitHub Desktop.
Talos Linux Upgrade Script
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/bin/bash | |
| ################################################################################ | |
| # Talos Linux Cluster Update Script (Generic Version) | |
| ################################################################################ | |
| # This script performs a safe, rolling update of Talos Linux nodes with: | |
| # - Automatic node discovery via kubectl (control plane & worker nodes) | |
| # - Automatic etcd backups before updates | |
| # - Health checks and node readiness validation | |
| # - Graceful upgrade strategy with configuration preservation | |
| # - Automatic detection of the latest Talos Linux version | |
| # - Uses official Metal installer images (suitable for bare metal/VM deployments) | |
| # | |
| # Usage: | |
| # ./update-talos-nodes.sh [OPTIONS] | |
| # | |
| # Options: | |
| # --version VERSION Specify a specific version (e.g., v1.9.2) | |
| # --skip-backup Skip etcd backup creation (NOT recommended) | |
| # --dry-run Show what would be done without executing | |
| # --help Show this help message | |
| # | |
| # Emergency Recovery: | |
| # Factory reset a node: talosctl -n xx.xx.xx.xx reset --graceful=false --reboot --system-labels-to-wipe=EPHEMERAL | |
| # | |
| # Factory Images: | |
| # For custom images with extensions, create them at: https://factory.talos.dev/ | |
| ################################################################################ | |
# Exit on error, undefined variables, and pipe failures
set -euo pipefail

# Script configuration.
# Each value is assigned first and marked readonly on the following line:
# `readonly VAR=$(cmd)` would hide the command's exit status from `set -e`
# (ShellCheck SC2155), so declaration and assignment are kept separate.

# Absolute directory containing this script (resolves relative invocation).
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
readonly SCRIPT_DIR
# Where etcd snapshots are written before any node is touched.
BACKUP_DIR="${SCRIPT_DIR}/talos-backups"
readonly BACKUP_DIR
# Single timestamp shared by this run's backup and log filenames.
TIMESTAMP=$(date +%Y%m%d-%H%M%S)
readonly TIMESTAMP
LOG_DIR="${SCRIPT_DIR}/talos-logs"
readonly LOG_DIR
LOG_FILE="${LOG_DIR}/talos-update-${TIMESTAMP}.log"
readonly LOG_FILE

# Talos configuration: GitHub repository queried for the latest release tag.
readonly REPO="siderolabs/talos"

# Node arrays - will be populated by get_cluster_nodes()
CONTROL_PLANE_NODES=()
WORKER_NODES=()
CONTROL_PLANE_NODE=""   # first control plane IP; target for etcd/upgrade-k8s calls

# Script options (set by the command-line parser below)
SKIP_BACKUP=false
DRY_RUN=false
VERSION=""              # empty = resolve the latest release at runtime

# Colors for output (ANSI escape sequences, consumed by `echo -e`)
readonly RED='\033[0;31m'
readonly GREEN='\033[0;32m'
readonly YELLOW='\033[1;33m'
readonly BLUE='\033[0;34m'
readonly NC='\033[0m' # No Color
| ################################################################################ | |
| # Helper Functions | |
| ################################################################################ | |
# Print formatted messages.
# The four level-specific loggers shared an identical body differing only in
# color and label; that duplication is factored into one internal helper so a
# future format change happens in exactly one place.

# Internal: emit a colored, timestamped message to stdout and append it to the
# log file.
#   $1 - ANSI color escape for the level tag
#   $2 - level label (INFO/SUCCESS/WARNING/ERROR)
#   $3 - message text
_log() {
    local color="$1"
    local level="$2"
    local msg="$3"
    echo -e "${color}[${level}]${NC} $(date '+%Y-%m-%d %H:%M:%S') - ${msg}" | tee -a "${LOG_FILE}"
}

log_info()    { _log "${BLUE}"   "INFO"    "$1"; }
log_success() { _log "${GREEN}"  "SUCCESS" "$1"; }
log_warning() { _log "${YELLOW}" "WARNING" "$1"; }
log_error()   { _log "${RED}"    "ERROR"   "$1"; }
# Print an 80-column divider to stdout and mirror it into the log file.
print_separator() {
    local divider='================================================================================'
    printf '%s\n' "$divider" | tee -a "${LOG_FILE}"
}
# Show usage information.
# Prints the help text and exits 0. The heredoc delimiter is deliberately
# unquoted so $0, ${BACKUP_DIR} and ${LOG_DIR} expand inside the text.
show_help() {
    cat << EOF
Talos Linux Cluster Update Script (Generic Version)
This script automatically updates your Talos Linux cluster to the latest version
using official Metal installer images.
Usage: $0 [OPTIONS]
Options:
  --version VERSION    Specify a specific version (e.g., v1.9.2)
  --skip-backup        Skip etcd backup creation (NOT recommended)
  --dry-run            Show what would be done without executing
  --help               Show this help message
Examples:
  $0                     # Update to latest version
  $0 --version v1.9.2    # Update to specific version
  $0 --dry-run           # Preview updates without executing
Notes:
  - This script uses official Talos Metal installer images
  - For custom factory images with extensions, modify the INSTALLER_IMAGE variable
  - Backups are saved to: ${BACKUP_DIR}
  - Logs are saved to: ${LOG_DIR}
EOF
    exit 0
}
# Get cluster nodes from kubectl.
# Populates CONTROL_PLANE_NODES and WORKER_NODES with INTERNAL-IP addresses
# (column 6 of `kubectl get nodes -o wide`) and sets CONTROL_PLANE_NODE to the
# first control plane IP. Exits 1 if the cluster is unreachable or has no
# control plane nodes.
#
# Fix: every grep below ends a command-substitution pipeline; grep exits 1
# when it matches nothing, and under `set -euo pipefail` that used to kill the
# script silently — e.g. on any cluster without worker nodes, or before the
# "no control plane nodes" message could be printed. `|| true` makes an empty
# match a handled condition instead.
get_cluster_nodes() {
    log_info "Discovering nodes from cluster..."
    if ! kubectl cluster-info &> /dev/null; then
        log_error "Cannot connect to Kubernetes cluster"
        exit 1
    fi
    # Get control plane nodes (ROLES column contains "control-plane")
    local cp_nodes
    cp_nodes=$(kubectl get nodes -o wide --no-headers | grep -i "control-plane" | awk '{print $6}' || true)
    if [ -z "$cp_nodes" ]; then
        log_error "No control plane nodes found in cluster"
        exit 1
    fi
    # Store control plane nodes in CONTROL_PLANE_NODES array
    while IFS= read -r node_ip; do
        if [ -n "$node_ip" ]; then
            CONTROL_PLANE_NODES+=("$node_ip")
        fi
    done <<< "$cp_nodes"
    # Set first control plane node as CONTROL_PLANE_NODE for operations
    CONTROL_PLANE_NODE="${CONTROL_PLANE_NODES[0]}"
    log_success "Found ${#CONTROL_PLANE_NODES[@]} control plane node(s): ${CONTROL_PLANE_NODES[*]}"
    # Get worker nodes (rows without the control-plane role)
    local worker_nodes
    worker_nodes=$(kubectl get nodes -o wide --no-headers | grep -v "control-plane" | awk '{print $6}' || true)
    # Store worker nodes in WORKER_NODES array
    while IFS= read -r node_ip; do
        if [ -n "$node_ip" ]; then
            WORKER_NODES+=("$node_ip")
        fi
    done <<< "$worker_nodes"
    if [ ${#WORKER_NODES[@]} -gt 0 ]; then
        log_success "Found ${#WORKER_NODES[@]} worker node(s): ${WORKER_NODES[*]}"
    else
        log_info "No worker nodes found in cluster"
    fi
    log_info "Using control plane node for operations: ${CONTROL_PLANE_NODE}"
}
# Verify that every external tool this script shells out to is installed.
# Logs and exits 1 listing the missing tools; returns 0 when all are present.
check_dependencies() {
    log_info "Checking required dependencies..."
    local missing_deps=()
    local tool
    for tool in talosctl kubectl curl; do
        if ! command -v "$tool" &> /dev/null; then
            missing_deps+=("$tool")
        fi
    done
    if [ ${#missing_deps[@]} -ne 0 ]; then
        log_error "Missing required dependencies: ${missing_deps[*]}"
        log_error "Please install: ${missing_deps[*]}"
        exit 1
    fi
    log_success "All dependencies are installed"
}
# Parse the "tag_name" value out of GitHub release JSON supplied on stdin.
# Split out as a helper so the parsing is testable without network access.
_extract_tag_name() {
    sed -n 's/.*"tag_name"[[:space:]]*:[[:space:]]*"\([^"]*\)".*/\1/p' | head -n 1
}

# Get the latest Talos version (e.g. v1.9.2) from the GitHub releases API.
# Prints the tag on stdout; exits 1 with a message if it cannot be determined.
#
# Fix: the original used `grep -Po`, a GNU-only PCRE flag that fails outright
# on macOS/BSD grep; worse, a non-matching grep exits 1, so under
# `set -e`/pipefail the script died before the "Failed to fetch" message could
# run. sed exits 0 on no match, letting the error path actually fire.
get_latest_version() {
    local version
    version=$(curl --silent "https://api.github.com/repos/${REPO}/releases/latest" | _extract_tag_name)
    if [ -z "$version" ]; then
        log_error "Failed to fetch latest version from GitHub"
        exit 1
    fi
    echo "$version"
}
# Print the Kubernetes STATUS (e.g. Ready / NotReady) of the node whose
# INTERNAL-IP (field 6 of `kubectl get nodes -o wide`) exactly equals $1.
#
# Fix: the original `grep "$node_ip"` was a substring match, so looking up
# 10.0.0.1 also matched 10.0.0.11's row and printed multiple statuses; awk's
# exact field comparison returns at most one row (and skips the header).
get_node_status() {
    local node_ip="$1"
    kubectl get nodes -o wide | awk -v ip="$node_ip" '$6 == ip {print $2}'
}
# Print the Talos version of the node whose INTERNAL-IP (field 6) exactly
# equals $1. Field 9 holds the parenthesized version because the OS-IMAGE
# column renders with an embedded space; the sed strips the parentheses.
# NOTE(review): assumes OS-IMAGE reads "Talos (vX.Y.Z)" so the version lands
# in field 9 — confirm against `kubectl get nodes -o wide` on your cluster.
#
# Fix: replaced the substring `grep "$node_ip"` (10.0.0.1 also matched
# 10.0.0.11, yielding multiple rows) with an exact awk field comparison.
get_node_version() {
    local node_ip="$1"
    kubectl get nodes -o wide | awk -v ip="$node_ip" '$6 == ip {print $9}' | sed 's/[()]//g'
}
# Wait for node to become ready.
# Polls get_node_status every 10 seconds until it reports "Ready" or the
# timeout elapses, printing a progress dot per attempt.
#   $1 - node IP
#   $2 - timeout in seconds (optional, default 300)
# Returns 0 once the node is Ready, 1 on timeout.
wait_for_node_ready() {
    local target="$1"
    local timeout="${2:-300}"
    local poll_interval=10
    local elapsed=0
    log_info "Waiting for node ${target} to become ready (timeout: ${timeout}s)..."
    while [ "$elapsed" -lt "$timeout" ]; do
        if [ "$(get_node_status "$target")" = "Ready" ]; then
            log_success "Node ${target} is ready after ${elapsed}s"
            return 0
        fi
        # Progress dot, mirrored into the log file.
        echo -n "." | tee -a "${LOG_FILE}"
        sleep "$poll_interval"
        elapsed=$((elapsed + poll_interval))
    done
    echo "" | tee -a "${LOG_FILE}"
    log_error "Node ${target} did not become ready within ${timeout}s"
    return 1
}
# Check cluster health: the API server must answer and no node may report
# NotReady. Returns 0 when healthy, 1 otherwise (dumping `kubectl get nodes`
# into the log on failure).
check_cluster_health() {
    log_info "Checking cluster health..."
    if ! kubectl cluster-info &> /dev/null; then
        log_error "Cannot connect to Kubernetes cluster"
        return 1
    fi
    # grep -c exits non-zero on zero matches; "|| true" keeps the count at 0
    # without tripping set -e/pipefail.
    local unhealthy
    unhealthy=$(kubectl get nodes --no-headers | grep -c "NotReady" || true)
    if [ "$unhealthy" -eq 0 ]; then
        log_success "All nodes are ready"
        return 0
    fi
    log_warning "${unhealthy} node(s) are not ready"
    kubectl get nodes | tee -a "${LOG_FILE}"
    return 1
}
# Create etcd backups before touching any node: a consistent API snapshot
# (required) plus a raw copy of the database file (best effort). Honors the
# --skip-backup flag. Returns 1 if the primary snapshot cannot be produced or
# verifies as empty.
create_etcd_backup() {
    if [ "$SKIP_BACKUP" = true ]; then
        log_warning "Skipping etcd backup (--skip-backup flag set)"
        return 0
    fi
    print_separator
    log_info "Creating etcd backups for disaster recovery..."
    mkdir -p "${BACKUP_DIR}"
    local snapshot_path="${BACKUP_DIR}/etcd-snapshot-${TIMESTAMP}.db"
    local raw_copy_path="${BACKUP_DIR}/etcd-db-${TIMESTAMP}.db"
    # Primary backup: consistent snapshot through talosctl.
    log_info "Creating etcd snapshot via talosctl..."
    if ! talosctl -n "${CONTROL_PLANE_NODE}" etcd snapshot "${snapshot_path}"; then
        log_error "Failed to create etcd snapshot"
        return 1
    fi
    log_success "Snapshot saved to: ${snapshot_path}"
    # Secondary backup: copy the database file directly (non-critical).
    log_info "Creating raw etcd database backup..."
    if talosctl -n "${CONTROL_PLANE_NODE}" cp /var/lib/etcd/member/snap/db "${raw_copy_path}"; then
        log_success "Raw backup saved to: ${raw_copy_path}"
    else
        log_warning "Failed to create raw etcd backup (non-critical)"
    fi
    # A zero-byte snapshot is as useless as none at all — verify.
    if [ ! -s "${snapshot_path}" ]; then
        log_error "Backup file is empty or doesn't exist: ${snapshot_path}"
        return 1
    fi
    local snapshot_size
    snapshot_size=$(du -h "${snapshot_path}" | cut -f1)
    log_success "Backup verification passed (size: ${snapshot_size})"
    print_separator
    return 0
}
# Update a single node.
# Per-node upgrade flow: report current state, return early if the node is
# already on the target version, reboot a NotReady node first, then issue
# `talosctl upgrade` and wait for the node to rejoin the cluster.
#   $1 - node IP (INTERNAL-IP as reported by kubectl)
#   $2 - installer image reference (e.g. ghcr.io/siderolabs/installer:vX.Y.Z)
#   $3 - human-readable node type label, used only in log messages
# Returns 0 on success or skip, 1 on any failure. When not in dry-run mode,
# pauses for interactive operator confirmation after each upgrade.
update_node() {
    local node_ip="$1"
    local image="$2"
    local node_type="$3"
    print_separator
    log_info "Processing ${node_type} node: ${node_ip}"
    local status
    status=$(get_node_status "$node_ip")
    local running_version
    running_version=$(get_node_version "$node_ip")
    log_info "Current status: ${status}"
    log_info "Current version: ${running_version}"
    log_info "Target version: ${VERSION}"
    # Check if already on target version — makes the script safely re-runnable.
    if [ "$running_version" = "$VERSION" ]; then
        log_success "Node ${node_ip} is already on version ${VERSION}"
        print_separator
        return 0
    fi
    # Handle NotReady nodes: try a powercycle reboot and wait up to 3 minutes
    # before attempting the upgrade on top of an unhealthy node.
    if [ "$status" = "NotReady" ]; then
        log_warning "Node ${node_ip} is not ready, attempting reboot..."
        if [ "$DRY_RUN" = false ]; then
            if ! talosctl reboot -n "$node_ip" --mode "powercycle" --timeout 120s; then
                log_error "Failed to reboot node ${node_ip}"
                return 1
            fi
            if ! wait_for_node_ready "$node_ip" 180; then
                log_error "Node ${node_ip} did not become ready after reboot"
                return 1
            fi
        else
            log_info "[DRY-RUN] Would reboot node ${node_ip}"
        fi
    fi
    # Perform the upgrade.
    # NOTE(review): --preserve retains node data and --stage defers the
    # install to the next reboot — confirm against the talosctl upgrade docs.
    log_info "Upgrading node ${node_ip} to ${VERSION}..."
    if [ "$DRY_RUN" = false ]; then
        if ! talosctl upgrade --nodes "$node_ip" --image "$image" --preserve --stage; then
            log_error "Upgrade failed for node ${node_ip}"
            return 1
        fi
        log_success "Upgrade command sent successfully"
        # Wait for node to complete upgrade and become ready (up to 10 minutes).
        log_info "Waiting for node to complete upgrade and become ready..."
        if ! wait_for_node_ready "$node_ip" 600; then
            log_error "Node ${node_ip} did not become ready after upgrade"
            return 1
        fi
        # Verify new version — a mismatch is only warned about, not fatal.
        local new_version
        new_version=$(get_node_version "$node_ip")
        log_info "Node version after upgrade: ${new_version}"
        if [ "$new_version" = "$VERSION" ]; then
            log_success "Node ${node_ip} successfully upgraded to ${VERSION}"
        else
            log_warning "Node version (${new_version}) doesn't match target (${VERSION})"
        fi
        # Manual verification pause: operator gates progression to the next node.
        echo ""
        log_info "Please verify the node status manually:"
        kubectl get nodes -o wide | grep "$node_ip" | tee -a "${LOG_FILE}"
        echo ""
        read -p "Press [ENTER] to continue with the next node, or Ctrl+C to abort... " -r
    else
        log_info "[DRY-RUN] Would upgrade node ${node_ip} to ${image}"
    fi
    print_separator
    return 0
}
# Roll through every node in the named array, upgrading one node at a time
# and validating cluster health between nodes. Aborts on the first failure so
# a bad upgrade cannot cascade.
#   $1 - NAME of an array variable holding node IPs (passed by reference)
#   $2 - installer image reference
#   $3 - human-readable node type label
update_nodes() {
    local -n node_list=$1
    local installer_image="$2"
    local node_label="$3"
    if [ ${#node_list[@]} -eq 0 ]; then
        log_info "No ${node_label} nodes to update"
        return 0
    fi
    log_info "Starting update of ${#node_list[@]} ${node_label} node(s)..."
    for node_ip in "${node_list[@]}"; do
        if ! update_node "$node_ip" "$installer_image" "$node_label"; then
            log_error "Failed to update ${node_label} node ${node_ip}"
            log_error "Aborting further updates for safety"
            return 1
        fi
        # Health gate between nodes (skipped in dry-run mode).
        if [ "$DRY_RUN" = false ]; then
            if ! check_cluster_health; then
                log_error "Cluster health check failed after updating ${node_ip}"
                log_error "Please investigate before continuing"
                return 1
            fi
        fi
    done
    log_success "All ${node_label} nodes updated successfully"
    return 0
}
# Update the Kubernetes version via `talosctl upgrade-k8s` against the primary
# control plane node. A non-zero exit is only a warning, not fatal — the
# original treats "upgrade failed" and "no upgrade available" the same way.
update_kubernetes() {
    print_separator
    log_info "Updating Kubernetes version..."
    if [ "$DRY_RUN" = false ]; then
        if ! talosctl upgrade-k8s -n "${CONTROL_PLANE_NODE}"; then
            log_warning "Kubernetes upgrade failed or no upgrade available"
        else
            log_success "Kubernetes upgrade initiated"
        fi
    else
        log_info "[DRY-RUN] Would upgrade Kubernetes via: talosctl upgrade-k8s -n ${CONTROL_PLANE_NODE}"
    fi
    print_separator
}
| ################################################################################ | |
| # Main Script | |
| ################################################################################ | |
# Parse command line arguments.
# Fixes: (1) an unknown option used to fall through to show_help, which exits
# 0 — a typo'd flag wrongly signaled success to callers; it now reports on
# stderr and exits 1. (2) `--version` without a value used to touch $2, which
# under `set -u` aborted with an unhelpful "unbound variable" error; the
# argument count is now checked explicitly.
while [[ $# -gt 0 ]]; do
    case $1 in
        --version)
            if [[ $# -lt 2 ]]; then
                echo "Error: --version requires an argument (e.g., v1.9.2)" >&2
                exit 1
            fi
            VERSION="$2"
            shift 2
            ;;
        --skip-backup)
            SKIP_BACKUP=true
            shift
            ;;
        --dry-run)
            DRY_RUN=true
            shift
            ;;
        --help)
            show_help
            ;;
        *)
            echo "Unknown option: $1" >&2
            echo "Run $0 --help for usage." >&2
            exit 1
            ;;
    esac
done
# ------------------------------------------------------------------------
# Main flow: set up logging, discover the cluster, resolve the target
# version, back up etcd, then roll the upgrade through control plane nodes,
# worker nodes, and finally Kubernetes itself.
# ------------------------------------------------------------------------

# Initialize logging (LOG_FILE must exist before the first log_* call tees to it)
mkdir -p "${LOG_DIR}"
touch "${LOG_FILE}"
# Print header
print_separator
log_info "Talos Linux Cluster Update Script (Generic Version)"
log_info "Started at: $(date '+%Y-%m-%d %H:%M:%S')"
log_info "Log file: ${LOG_FILE}"
print_separator
# Check dependencies
check_dependencies
# Discover cluster nodes (populates CONTROL_PLANE_NODES / WORKER_NODES)
get_cluster_nodes
# Get version if not specified on the command line
if [ -z "$VERSION" ]; then
    log_info "Fetching latest Talos version from GitHub..."
    VERSION=$(get_latest_version)
    log_info "Target version: ${VERSION} (latest)"
else
    log_info "Target version: ${VERSION} (specified)"
fi
# Build installer image URL (using official Metal installer)
INSTALLER_IMAGE="ghcr.io/siderolabs/installer:${VERSION}"
log_info "Installer image: ${INSTALLER_IMAGE}"
# Show dry-run notice if applicable
if [ "$DRY_RUN" = true ]; then
    log_warning "DRY-RUN MODE: No changes will be made"
fi
# Initial cluster health check — refuse to upgrade an already-broken cluster
if ! check_cluster_health; then
    log_error "Cluster is not healthy. Please fix issues before updating."
    exit 1
fi
# Create etcd backups; on failure the operator may interactively choose to
# continue without one (single-keystroke prompt)
if ! create_etcd_backup; then
    log_error "Failed to create etcd backup"
    read -p "Continue without backup? [y/N] " -n 1 -r
    echo
    if [[ ! $REPLY =~ ^[Yy]$ ]]; then
        log_error "Update aborted by user"
        exit 1
    fi
fi
# Update control plane nodes first; workers only proceed if this succeeds
if ! update_nodes CONTROL_PLANE_NODES "$INSTALLER_IMAGE" "Control Plane"; then
    log_error "Failed to update control plane nodes"
    exit 1
fi
# Update worker nodes
if ! update_nodes WORKER_NODES "$INSTALLER_IMAGE" "Worker"; then
    log_error "Failed to update worker nodes"
    exit 1
fi
# Update Kubernetes (non-fatal if no upgrade is available)
update_kubernetes
# Final health check — informational only; failure is a warning, not an error
if [ "$DRY_RUN" = false ]; then
    log_info "Performing final cluster health check..."
    if check_cluster_health; then
        log_success "Cluster is healthy after updates"
    else
        log_warning "Cluster health check failed after updates"
    fi
fi
# Print summary
print_separator
log_success "Cluster update completed successfully!"
log_info "All nodes have been updated to version ${VERSION}"
log_info "Backup location: ${BACKUP_DIR}"
log_info "Log file: ${LOG_FILE}"
print_separator
exit 0
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment