Skip to content

Instantly share code, notes, and snippets.

@jp1337
Created January 7, 2026 12:32
Show Gist options
  • Select an option

  • Save jp1337/1d5e4c885fe3a1820d3cf810c670f06c to your computer and use it in GitHub Desktop.

Select an option

Save jp1337/1d5e4c885fe3a1820d3cf810c670f06c to your computer and use it in GitHub Desktop.
Talos Linux Upgrade Script
#!/bin/bash
################################################################################
# Talos Linux Cluster Update Script (Generic Version)
################################################################################
# This script performs a safe, rolling update of Talos Linux nodes with:
# - Automatic node discovery via kubectl (control plane & worker nodes)
# - Automatic etcd backups before updates
# - Health checks and node readiness validation
# - Graceful upgrade strategy with configuration preservation
# - Automatic detection of the latest Talos Linux version
# - Uses official Metal installer images (suitable for bare metal/VM deployments)
#
# Usage:
# ./update-talos-nodes.sh [OPTIONS]
#
# Options:
# --version VERSION Specify a specific version (e.g., v1.9.2)
# --skip-backup Skip etcd backup creation (NOT recommended)
# --dry-run Show what would be done without executing
# --help Show this help message
#
# Emergency Recovery:
# Factory reset a node: talosctl -n xx.xx.xx.xx reset --graceful=false --reboot --system-labels-to-wipe=EPHEMERAL
#
# Factory Images:
# For custom images with extensions, create them at: https://factory.talos.dev/
################################################################################
# Exit on error, undefined variables, and pipe failures
set -euo pipefail

# --- Script configuration -----------------------------------------------
# Assignment is kept separate from 'readonly' for command substitutions so
# a failing command is not masked by the readonly builtin's exit status.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
readonly SCRIPT_DIR
TIMESTAMP="$(date +%Y%m%d-%H%M%S)"
readonly TIMESTAMP

readonly BACKUP_DIR="${SCRIPT_DIR}/talos-backups"
readonly LOG_DIR="${SCRIPT_DIR}/talos-logs"
readonly LOG_FILE="${LOG_DIR}/talos-update-${TIMESTAMP}.log"

# --- Talos configuration -------------------------------------------------
readonly REPO="siderolabs/talos"

# Node lists, populated later by get_cluster_nodes().
CONTROL_PLANE_NODES=()
WORKER_NODES=()
CONTROL_PLANE_NODE=""

# --- Command-line option defaults ----------------------------------------
SKIP_BACKUP=false
DRY_RUN=false
VERSION=""

# --- ANSI color codes for terminal output ---------------------------------
readonly RED='\033[0;31m'
readonly GREEN='\033[0;32m'
readonly YELLOW='\033[1;33m'
readonly BLUE='\033[0;34m'
readonly NC='\033[0m' # No Color
################################################################################
# Helper Functions
################################################################################
# Print formatted messages
# All four public loggers share one implementation: a colored level tag,
# a timestamp, and the message, echoed to stdout and appended to LOG_FILE.
# Globals read: LOG_FILE, NC, and the color passed in.
# $1 - ANSI color escape, $2 - level label, $3 - message
_log() {
  local color="$1" level="$2" msg="$3"
  echo -e "${color}[${level}]${NC} $(date '+%Y-%m-%d %H:%M:%S') - ${msg}" | tee -a "${LOG_FILE}"
}
log_info()    { _log "${BLUE}"   "INFO"    "$1"; }
log_success() { _log "${GREEN}"  "SUCCESS" "$1"; }
log_warning() { _log "${YELLOW}" "WARNING" "$1"; }
log_error()   { _log "${RED}"    "ERROR"   "$1"; }
# Print an 80-column '=' rule to stdout and append it to the log file.
print_separator() {
  local rule
  rule="$(printf '=%.0s' {1..80})"
  printf '%s\n' "$rule" | tee -a "${LOG_FILE}"
}
# Show usage information
# Prints the usage text (with BACKUP_DIR/LOG_DIR already resolved, since the
# heredoc delimiter is unquoted and therefore expands variables) and exits.
# NOTE(review): always exits 0, even when reached from the unknown-option
# branch of argument parsing — an invalid invocation still reports success.
show_help() {
cat << EOF
Talos Linux Cluster Update Script (Generic Version)
This script automatically updates your Talos Linux cluster to the latest version
using official Metal installer images.
Usage: $0 [OPTIONS]
Options:
--version VERSION Specify a specific version (e.g., v1.9.2)
--skip-backup Skip etcd backup creation (NOT recommended)
--dry-run Show what would be done without executing
--help Show this help message
Examples:
$0 # Update to latest version
$0 --version v1.9.2 # Update to specific version
$0 --dry-run # Preview updates without executing
Notes:
- This script uses official Talos Metal installer images
- For custom factory images with extensions, modify the INSTALLER_IMAGE variable
- Backups are saved to: ${BACKUP_DIR}
- Logs are saved to: ${LOG_DIR}
EOF
exit 0
}
# Get cluster nodes from kubectl
# Discovers node INTERNAL-IPs (column 6 of 'kubectl get nodes -o wide') and
# populates the globals CONTROL_PLANE_NODES, WORKER_NODES and
# CONTROL_PLANE_NODE (first control-plane IP, used for talosctl operations).
# Exits the script when the cluster is unreachable or has no control plane.
get_cluster_nodes() {
  log_info "Discovering nodes from cluster..."
  if ! kubectl cluster-info &> /dev/null; then
    log_error "Cannot connect to Kubernetes cluster"
    exit 1
  fi

  # Control plane nodes: rows whose ROLES column contains "control-plane".
  # The trailing '|| true' keeps 'set -o pipefail' from aborting the whole
  # script when grep matches nothing; the empty result is reported below.
  local cp_nodes
  cp_nodes=$(kubectl get nodes -o wide --no-headers | grep -i "control-plane" | awk '{print $6}' || true)
  if [ -z "$cp_nodes" ]; then
    log_error "No control plane nodes found in cluster"
    exit 1
  fi
  while IFS= read -r node_ip; do
    if [ -n "$node_ip" ]; then
      CONTROL_PLANE_NODES+=("$node_ip")
    fi
  done <<< "$cp_nodes"

  # First control plane node is used for etcd/upgrade-k8s operations.
  CONTROL_PLANE_NODE="${CONTROL_PLANE_NODES[0]}"
  log_success "Found ${#CONTROL_PLANE_NODES[@]} control plane node(s): ${CONTROL_PLANE_NODES[*]}"

  # Worker nodes: everything without the control-plane role. On a cluster
  # with no workers 'grep -v' exits 1, which previously killed the script
  # under 'set -euo pipefail'; '|| true' makes the worker-less case valid.
  local worker_nodes
  worker_nodes=$(kubectl get nodes -o wide --no-headers | grep -v "control-plane" | awk '{print $6}' || true)
  while IFS= read -r node_ip; do
    if [ -n "$node_ip" ]; then
      WORKER_NODES+=("$node_ip")
    fi
  done <<< "$worker_nodes"

  if [ ${#WORKER_NODES[@]} -gt 0 ]; then
    log_success "Found ${#WORKER_NODES[@]} worker node(s): ${WORKER_NODES[*]}"
  else
    log_info "No worker nodes found in cluster"
  fi
  log_info "Using control plane node for operations: ${CONTROL_PLANE_NODE}"
}
# Check if required tools are installed
# Verifies that talosctl, kubectl and curl are on PATH; exits the script
# with a list of everything missing otherwise.
check_dependencies() {
  log_info "Checking required dependencies..."
  local tool
  local missing_deps=()
  # Check in a fixed order so the reported list is deterministic.
  for tool in talosctl kubectl curl; do
    if ! command -v "$tool" &> /dev/null; then
      missing_deps+=("$tool")
    fi
  done
  if [ ${#missing_deps[@]} -ne 0 ]; then
    log_error "Missing required dependencies: ${missing_deps[*]}"
    log_error "Please install: ${missing_deps[*]}"
    exit 1
  fi
  log_success "All dependencies are installed"
}
# Get the latest Talos version from GitHub
# Fetches the latest release tag (e.g. "v1.9.2") from the GitHub API and
# prints it on stdout; exits the script with an error message on failure.
get_latest_version() {
  local version
  # 'curl --fail' returns non-zero on HTTP errors instead of emitting the
  # error body; the sed extraction is portable (the previous 'grep -P' is
  # GNU-only and unavailable on macOS/BSD). The trailing '|| true' keeps
  # 'set -o pipefail' from exiting before the explicit error check below.
  version=$(curl --silent --fail --location "https://api.github.com/repos/${REPO}/releases/latest" \
    | sed -n 's/.*"tag_name": *"\([^"]*\)".*/\1/p' | head -n 1 || true)
  if [ -z "$version" ]; then
    log_error "Failed to fetch latest version from GitHub"
    exit 1
  fi
  echo "$version"
}
# Get node status from Kubernetes
# Prints the STATUS column (e.g. "Ready"/"NotReady") for the node whose
# INTERNAL-IP exactly equals $1; prints nothing when no node matches.
get_node_status() {
  local node_ip="$1"
  # Exact field comparison: the previous 'grep "$node_ip"' was a substring
  # regex match, so 10.0.0.1 also matched 10.0.0.10 (and '.' matched any
  # character). awk returning 0 on no match also avoids a pipefail abort.
  kubectl get nodes -o wide --no-headers | awk -v ip="$node_ip" '$6 == ip {print $2}'
}
# Get node version from Kubernetes
# Prints the Talos version for the node whose INTERNAL-IP exactly equals $1.
# $9 is the second word of the OS-IMAGE column, e.g. "(v1.9.2)" from
# "Talos (v1.9.2)"; the parentheses are stripped to leave the bare tag.
get_node_version() {
  local node_ip="$1"
  # Exact field match fixes the substring bug where e.g. 10.0.0.1 also
  # matched 10.0.0.10 and returned the wrong node's version.
  kubectl get nodes -o wide --no-headers | awk -v ip="$node_ip" '$6 == ip {print $9}' | sed 's/[()]//g'
}
# Wait for node to become ready
# Polls get_node_status every 10s until the node reports Ready or the
# timeout elapses.
# $1 - node IP; $2 - timeout in seconds (default 300)
# Returns 0 once Ready, 1 on timeout.
wait_for_node_ready() {
  local node_ip="$1"
  local max_wait="${2:-300}" # Default 5 minutes
  local poll_interval=10
  local elapsed=0
  log_info "Waiting for node ${node_ip} to become ready (timeout: ${max_wait}s)..."
  while [ "$elapsed" -lt "$max_wait" ]; do
    if [ "$(get_node_status "$node_ip")" = "Ready" ]; then
      log_success "Node ${node_ip} is ready after ${elapsed}s"
      return 0
    fi
    # Progress dot between polls, mirrored into the log file.
    echo -n "." | tee -a "${LOG_FILE}"
    sleep "$poll_interval"
    elapsed=$((elapsed + poll_interval))
  done
  echo "" | tee -a "${LOG_FILE}"
  log_error "Node ${node_ip} did not become ready within ${max_wait}s"
  return 1
}
# Check cluster health
check_cluster_health() {
log_info "Checking cluster health..."
if ! kubectl cluster-info &> /dev/null; then
log_error "Cannot connect to Kubernetes cluster"
return 1
fi
local not_ready
not_ready=$(kubectl get nodes --no-headers | grep -c "NotReady" || true)
if [ "$not_ready" -gt 0 ]; then
log_warning "${not_ready} node(s) are not ready"
kubectl get nodes | tee -a "${LOG_FILE}"
return 1
fi
log_success "All nodes are ready"
return 0
}
# Create etcd backup
# Takes two backups before any node is touched:
#   1. a snapshot via 'talosctl etcd snapshot' (authoritative; verified for
#      non-zero size below — its failure aborts with return 1), and
#   2. a raw copy of the live etcd database file (best-effort; failure only
#      warns, since the snapshot above is the recovery artifact).
# Globals read: SKIP_BACKUP, BACKUP_DIR, TIMESTAMP, CONTROL_PLANE_NODE.
# Returns 0 on success or when skipped via --skip-backup, 1 on failure.
create_etcd_backup() {
if [ "$SKIP_BACKUP" = true ]; then
log_warning "Skipping etcd backup (--skip-backup flag set)"
return 0
fi
print_separator
log_info "Creating etcd backups for disaster recovery..."
# Create backup directory if it doesn't exist
mkdir -p "${BACKUP_DIR}"
local backup_file="${BACKUP_DIR}/etcd-snapshot-${TIMESTAMP}.db"
local backup_file_raw="${BACKUP_DIR}/etcd-db-${TIMESTAMP}.db"
# Create snapshot using talosctl
log_info "Creating etcd snapshot via talosctl..."
if talosctl -n "${CONTROL_PLANE_NODE}" etcd snapshot "${backup_file}"; then
log_success "Snapshot saved to: ${backup_file}"
else
log_error "Failed to create etcd snapshot"
return 1
fi
# Create second backup by copying the database directly
# NOTE(review): copies the db file of a live etcd member — presumably only
# useful as a last-resort artifact; the snapshot above is the real backup.
log_info "Creating raw etcd database backup..."
if talosctl -n "${CONTROL_PLANE_NODE}" cp /var/lib/etcd/member/snap/db "${backup_file_raw}"; then
log_success "Raw backup saved to: ${backup_file_raw}"
else
log_warning "Failed to create raw etcd backup (non-critical)"
fi
# Verify backups exist and have non-zero size
# Only the snapshot is verified; the raw copy is best-effort by design.
if [ ! -s "${backup_file}" ]; then
log_error "Backup file is empty or doesn't exist: ${backup_file}"
return 1
fi
local backup_size
backup_size=$(du -h "${backup_file}" | cut -f1)
log_success "Backup verification passed (size: ${backup_size})"
print_separator
return 0
}
# Update a single node
# Upgrades one Talos node to the global target VERSION.
# $1 - node IP, $2 - installer image reference, $3 - node type label (logs)
# Behavior:
#   - no-op when the node already reports the target version
#   - a NotReady node is power-cycled first and must recover within 180s
#   - the upgrade runs with --preserve (keep node data) and --stage, then
#     waits up to 600s for the node to return to Ready
#   - after each real upgrade, pauses for interactive operator confirmation
#     (blocks on stdin — not suitable for unattended runs)
# Returns 0 on success or dry-run, 1 on any failure.
update_node() {
local node_ip="$1"
local image="$2"
local node_type="$3"
print_separator
log_info "Processing ${node_type} node: ${node_ip}"
local status
status=$(get_node_status "$node_ip")
local running_version
running_version=$(get_node_version "$node_ip")
log_info "Current status: ${status}"
log_info "Current version: ${running_version}"
log_info "Target version: ${VERSION}"
# Check if already on target version
if [ "$running_version" = "$VERSION" ]; then
log_success "Node ${node_ip} is already on version ${VERSION}"
print_separator
return 0
fi
# Handle NotReady nodes
# An unhealthy node must be recovered before upgrading, so a failed upgrade
# is not confused with a pre-existing failure.
if [ "$status" = "NotReady" ]; then
log_warning "Node ${node_ip} is not ready, attempting reboot..."
if [ "$DRY_RUN" = false ]; then
if ! talosctl reboot -n "$node_ip" --mode "powercycle" --timeout 120s; then
log_error "Failed to reboot node ${node_ip}"
return 1
fi
if ! wait_for_node_ready "$node_ip" 180; then
log_error "Node ${node_ip} did not become ready after reboot"
return 1
fi
else
log_info "[DRY-RUN] Would reboot node ${node_ip}"
fi
fi
# Perform the upgrade
log_info "Upgrading node ${node_ip} to ${VERSION}..."
if [ "$DRY_RUN" = false ]; then
if ! talosctl upgrade --nodes "$node_ip" --image "$image" --preserve --stage; then
log_error "Upgrade failed for node ${node_ip}"
return 1
fi
log_success "Upgrade command sent successfully"
# Wait for node to complete upgrade and become ready
log_info "Waiting for node to complete upgrade and become ready..."
if ! wait_for_node_ready "$node_ip" 600; then
log_error "Node ${node_ip} did not become ready after upgrade"
return 1
fi
# Verify new version
# NOTE(review): a mismatch here only warns — the caller still treats the
# node as updated and proceeds to the next one.
local new_version
new_version=$(get_node_version "$node_ip")
log_info "Node version after upgrade: ${new_version}"
if [ "$new_version" = "$VERSION" ]; then
log_success "Node ${node_ip} successfully upgraded to ${VERSION}"
else
log_warning "Node version (${new_version}) doesn't match target (${VERSION})"
fi
# Manual verification pause
echo ""
log_info "Please verify the node status manually:"
kubectl get nodes -o wide | grep "$node_ip" | tee -a "${LOG_FILE}"
echo ""
read -p "Press [ENTER] to continue with the next node, or Ctrl+C to abort... " -r
else
log_info "[DRY-RUN] Would upgrade node ${node_ip} to ${image}"
fi
print_separator
return 0
}
# Update all nodes of a specific type
# Rolls the upgrade across every node in the named array, one at a time.
# $1 - NAME of the array variable holding node IPs (bound via nameref)
# $2 - installer image reference
# $3 - human-readable node type label for logging
# Stops at the first failed node, or at the first failed post-update
# cluster health check. Returns 0 when every node succeeded.
update_nodes() {
  local -n node_list=$1
  local image="$2"
  local node_type="$3"
  local node_ip

  if (( ${#node_list[@]} == 0 )); then
    log_info "No ${node_type} nodes to update"
    return 0
  fi

  log_info "Starting update of ${#node_list[@]} ${node_type} node(s)..."
  for node_ip in "${node_list[@]}"; do
    if ! update_node "$node_ip" "$image" "$node_type"; then
      log_error "Failed to update ${node_type} node ${node_ip}"
      log_error "Aborting further updates for safety"
      return 1
    fi
    # Confirm the cluster recovered before touching the next node.
    if [ "$DRY_RUN" = false ] && ! check_cluster_health; then
      log_error "Cluster health check failed after updating ${node_ip}"
      log_error "Please investigate before continuing"
      return 1
    fi
  done
  log_success "All ${node_type} nodes updated successfully"
  return 0
}
# Update Kubernetes version
# Triggers a Kubernetes upgrade via talosctl against the primary control
# plane node. A failure is logged as a warning only, since talosctl also
# exits non-zero when no newer Kubernetes version is available.
update_kubernetes() {
  print_separator
  log_info "Updating Kubernetes version..."
  if [ "$DRY_RUN" = true ]; then
    log_info "[DRY-RUN] Would upgrade Kubernetes via: talosctl upgrade-k8s -n ${CONTROL_PLANE_NODE}"
  else
    if talosctl upgrade-k8s -n "${CONTROL_PLANE_NODE}"; then
      log_success "Kubernetes upgrade initiated"
    else
      log_warning "Kubernetes upgrade failed or no upgrade available"
    fi
  fi
  print_separator
}
################################################################################
# Main Script
################################################################################
# Parse command line arguments
while [[ $# -gt 0 ]]; do
  case $1 in
    --version)
      # Require an explicit value: with a bare "--version", "$2" would
      # previously trip 'set -u' ("unbound variable") and 'shift 2' would
      # fail under 'set -e' — both with unhelpful diagnostics.
      if [[ $# -lt 2 ]]; then
        echo "Error: --version requires a value (e.g. --version v1.9.2)" >&2
        exit 1
      fi
      VERSION="$2"
      shift 2
      ;;
    --skip-backup)
      SKIP_BACKUP=true
      shift
      ;;
    --dry-run)
      DRY_RUN=true
      shift
      ;;
    --help)
      show_help
      ;;
    *)
      # Diagnostics belong on stderr; show_help then prints usage and exits.
      echo "Unknown option: $1" >&2
      show_help
      ;;
  esac
done
# Initialize logging
# (must happen before the first log_* call, which appends to LOG_FILE)
mkdir -p "${LOG_DIR}"
touch "${LOG_FILE}"
# Print header
print_separator
log_info "Talos Linux Cluster Update Script (Generic Version)"
log_info "Started at: $(date '+%Y-%m-%d %H:%M:%S')"
log_info "Log file: ${LOG_FILE}"
print_separator
# Check dependencies
check_dependencies
# Discover cluster nodes
get_cluster_nodes
# Get version if not specified
if [ -z "$VERSION" ]; then
log_info "Fetching latest Talos version from GitHub..."
VERSION=$(get_latest_version)
log_info "Target version: ${VERSION} (latest)"
else
log_info "Target version: ${VERSION} (specified)"
fi
# Build installer image URL (using official Metal installer)
# For factory images with system extensions, this is the variable to change
# (see https://factory.talos.dev/ note in the file header).
INSTALLER_IMAGE="ghcr.io/siderolabs/installer:${VERSION}"
log_info "Installer image: ${INSTALLER_IMAGE}"
# Show dry-run notice if applicable
if [ "$DRY_RUN" = true ]; then
log_warning "DRY-RUN MODE: No changes will be made"
fi
# Initial cluster health check
# Refuse to start a rolling upgrade on an already-degraded cluster.
if ! check_cluster_health; then
log_error "Cluster is not healthy. Please fix issues before updating."
exit 1
fi
# Create etcd backups
# On backup failure the operator may explicitly accept the risk and
# continue without one (interactive prompt — blocks on stdin).
if ! create_etcd_backup; then
log_error "Failed to create etcd backup"
read -p "Continue without backup? [y/N] " -n 1 -r
echo
if [[ ! $REPLY =~ ^[Yy]$ ]]; then
log_error "Update aborted by user"
exit 1
fi
fi
# Update control plane nodes first
# (array is passed by NAME — update_nodes binds it via nameref)
if ! update_nodes CONTROL_PLANE_NODES "$INSTALLER_IMAGE" "Control Plane"; then
log_error "Failed to update control plane nodes"
exit 1
fi
# Update worker nodes
if ! update_nodes WORKER_NODES "$INSTALLER_IMAGE" "Worker"; then
log_error "Failed to update worker nodes"
exit 1
fi
# Update Kubernetes
update_kubernetes
# Final health check
# NOTE(review): a failed final check only warns — the summary below still
# reports success; operators should read the log, not just the exit code.
if [ "$DRY_RUN" = false ]; then
log_info "Performing final cluster health check..."
if check_cluster_health; then
log_success "Cluster is healthy after updates"
else
log_warning "Cluster health check failed after updates"
fi
fi
# Print summary
print_separator
log_success "Cluster update completed successfully!"
log_info "All nodes have been updated to version ${VERSION}"
log_info "Backup location: ${BACKUP_DIR}"
log_info "Log file: ${LOG_FILE}"
print_separator
exit 0
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment