Last active
March 10, 2026 07:10
-
-
Save bachmanity1/128e76d5037cf5a154cd18446c2e089d to your computer and use it in GitHub Desktop.
Dry run: check CRUSH weight only
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
#
# Drain-and-remove helper for Ceph hosts.
# Script-wide settings; the argument parser below may override TIMEOUT,
# HOST_FILE, HOSTS and MODE.
set -euo pipefail

POLL_INTERVAL=10 # seconds slept between drain-progress polls
TIMEOUT=1800     # 30 minutes; max wait per host for drain (--timeout)
HOST_FILE=""     # optional file with one hostname per line (--file)
HOSTS=()         # hostnames collected from the command line and HOST_FILE
MODE="drain"     # drain or check
#######################################
# Print the command-line help and terminate the script.
# Arguments: none
# Outputs:   help text on stdout
# Returns:   never returns; always exits with status 1
#######################################
usage() {
  cat <<EOF
Usage: $0 [--check] [--file <hosts-file>] [hostname ...] [--timeout <seconds>]

Drains all daemons from Ceph host(s), waits until fully drained,
then removes each host and its CRUSH entry.
Exits immediately on any error.

Options:
  --check               Verify hosts are fully removed (no drain/remove)
  --file, -f <path>     File with one hostname per line
  --timeout <seconds>   Max wait time per host for drain (default: 1800)
  --help, -h            Show this help
EOF
  exit 1
}
#######################################
# Report a fatal error and terminate the script.
# Arguments: $* - message words, joined with spaces
# Outputs:   "ERROR: <message>" on stderr
# Returns:   never returns; always exits with status 1
#######################################
die() {
  # printf instead of echo so a message starting with "-n"/"-e" is printed
  # literally rather than being taken as an echo option.
  printf 'ERROR: %s\n' "$*" >&2
  exit 1
}
# ---- Command-line parsing ----
# No arguments at all, or help requested as the first argument: show usage.
if [[ $# -lt 1 ]]; then
  usage
fi
if [[ "$1" == "--help" || "$1" == "-h" ]]; then
  usage
fi

while [[ $# -gt 0 ]]; do
  case "$1" in
    --check)
      MODE="check"
      shift
      ;;
    --file | -f)
      # Without this guard a trailing --file trips `set -u` on "$2" and the
      # user only sees an "unbound variable" message.
      [[ $# -ge 2 ]] || die "$1 requires a <hosts-file> argument"
      HOST_FILE="$2"
      shift 2
      ;;
    --timeout)
      [[ $# -ge 2 ]] || die "--timeout requires a <seconds> argument"
      # TIMEOUT is compared numerically by the drain wait loop, so reject
      # anything that is not a plain non-negative integer up front.
      [[ "$2" =~ ^[0-9]+$ ]] || die "--timeout expects a non-negative integer, got: $2"
      TIMEOUT="$2"
      shift 2
      ;;
    --help | -h)
      usage
      ;;
    -*)
      # Previously any unknown option was silently collected as a hostname.
      die "unknown option: $1"
      ;;
    *)
      HOSTS+=("$1")
      shift
      ;;
  esac
done
#######################################
# Append the hostnames listed in a file to HOSTS.
# Everything from the first '#' on a line is discarded, surrounding
# whitespace is trimmed, and lines that end up empty are skipped.
# Globals:   HOSTS (appended to)
# Arguments: $1 - path to a file with one hostname per line
# Returns:   0 on success; exits via die if the file does not exist
#######################################
_load_hosts_file() {
  local path="$1"
  local line

  [[ -f "$path" ]] || die "file not found: $path"

  # `|| [[ -n "$line" ]]` keeps a final line that has no trailing newline;
  # plain `read` returns non-zero for it and the host would be dropped.
  while IFS= read -r line || [[ -n "$line" ]]; do
    line="${line%%#*}"
    # Trim leading, then trailing, whitespace using parameter expansion only
    # (no echo|xargs round-trip, which chokes on quotes and backslashes).
    line="${line#"${line%%[![:space:]]*}"}"
    line="${line%"${line##*[![:space:]]}"}"
    if [[ -n "$line" ]]; then
      HOSTS+=("$line")
    fi
  done < "$path"
}

if [[ -n "$HOST_FILE" ]]; then
  _load_hosts_file "$HOST_FILE"
fi
# Nothing to do unless at least one hostname was given, either on the
# command line or through the hosts file.
if [[ ${#HOSTS[@]} -eq 0 ]]; then
  die "no hosts specified"
fi
# orch uses FQDN (hostname.n3r), crush uses short hostname

#######################################
# Map a hostname to the name used by the orchestrator commands.
# Accepts either the short name or a name that already ends in ".n3r";
# the suffix is stripped before being re-added so it is never doubled.
# Arguments: $1 - hostname (short or with .n3r suffix)
# Outputs:   "<short-name>.n3r" on stdout
#######################################
orch_name() {
  printf '%s.n3r\n' "${1%.n3r}"
}
#######################################
# Map a hostname to the short name used in the CRUSH map.
# Arguments: $1 - hostname (short or with .n3r suffix)
# Outputs:   the hostname with a trailing ".n3r" removed, on stdout
#######################################
crush_name() {
  printf '%s\n' "${1%.n3r}"
}
#######################################
# Verify that a host is gone from the cluster: no daemons listed for it,
# absent from the orchestrator host list, absent from the CRUSH tree.
# All three checks always run; each prints one OK:/FAIL: line.
# Arguments: $1 - hostname as given by the user
# Outputs:   one status line per check on stdout
# Returns:   0 if every check passed, non-zero otherwise
#######################################
check_host() {
  local hostname="$1"
  local orch_h crush_h
  orch_h=$(orch_name "$hostname")
  crush_h=$(crush_name "$hostname")
  local ok=true

  # Check no daemons running.
  # Count output lines that carry the orch name as a complete
  # whitespace-separated field, so that one hostname being a substring of
  # another is not counted.
  local daemon_count
  daemon_count=$(ceph orch ps "$orch_h" 2>&1 \
    | awk -v h="$orch_h" '
        { for (i = 1; i <= NF; i++) if ($i == h) { n++; break } }
        END { print n + 0 }' \
    || true)
  if [[ "$daemon_count" -gt 0 ]]; then
    echo "FAIL: [$hostname] $daemon_count daemon(s) still running"
    ok=false
  else
    echo "OK: [$hostname] no daemons running"
  fi

  # Check host removed from orch.
  # The listing is captured first so that a failing ceph call is reported
  # as a failure instead of being mistaken for "host not listed".
  local host_ls
  if ! host_ls=$(ceph orch host ls); then
    echo "FAIL: [$hostname] could not run 'ceph orch host ls'"
    ok=false
  elif awk -v h="$orch_h" '$1 == h { found = 1 } END { exit !found }' <<<"$host_ls"; then
    echo "FAIL: [$hostname] still in host list"
    ok=false
  else
    echo "OK: [$hostname] removed from orch"
  fi

  # Check removed from crush map.
  # Match the word "host" followed by exactly the short name; a substring
  # match would let "node1" hit the row of "node10".
  local crush_tree
  if ! crush_tree=$(ceph osd crush tree); then
    echo "FAIL: [$hostname] could not run 'ceph osd crush tree'"
    ok=false
  elif awk -v h="$crush_h" '
         { for (i = 1; i < NF; i++) if ($i == "host" && $(i + 1) == h) found = 1 }
         END { exit !found }' <<<"$crush_tree"; then
    echo "FAIL: [$hostname] still in crush map"
    ok=false
  else
    echo "OK: [$hostname] removed from crush map"
  fi

  [[ "$ok" == true ]]
}
# ---- Check mode ----
# Run check_host for every host, collect the ones that fail, and exit
# without draining or removing anything.
if [[ "$MODE" == "check" ]]; then
  echo "Checking hosts: ${HOSTS[*]}"
  echo ""

  failed=()
  for hostname in "${HOSTS[@]}"; do
    # check_host runs as an `if` condition so that a failing host is
    # recorded instead of aborting the script under `set -e`.
    if ! check_host "$hostname"; then
      failed+=("$hostname")
    fi
    echo ""
  done

  if [[ ${#failed[@]} -gt 0 ]]; then
    die "hosts not fully removed: ${failed[*]}"
  fi

  echo "=== All ${#HOSTS[@]} host(s) verified as removed ==="
  exit 0
fi
# ---- Drain mode ----

#######################################
# Test whether a host appears in the orchestrator host list.
# Arguments: $1 - orch host name, compared to the first column exactly
# Returns:   0 if listed, 1 if not listed, 2 if 'ceph orch host ls' failed
#######################################
_orch_has_host() {
  local listing
  listing=$(ceph orch host ls) || return 2
  awk -v h="$1" '$1 == h { found = 1 } END { exit !found }' <<<"$listing"
}

#######################################
# Print the second column of the CRUSH tree row for a host bucket.
# A row is selected when the word "host" is immediately followed by exactly
# the given name, so "node1" does not select the row of "node10".
# Arguments: $1 - CRUSH (short) host name
# Outputs:   the row's second field on stdout; nothing if no row matches
# Returns:   non-zero if 'ceph osd crush tree' failed
#######################################
_crush_host_weight() {
  local tree
  tree=$(ceph osd crush tree) || return
  awk -v h="$1" '
    {
      for (i = 1; i < NF; i++) {
        if ($i == "host" && $(i + 1) == h) { print $2; exit }
      }
    }' <<<"$tree"
}

echo "Hosts to drain and remove: ${HOSTS[*]}"
echo ""

for hostname in "${HOSTS[@]}"; do
  orch_h=$(orch_name "$hostname")
  crush_h=$(crush_name "$hostname")

  # Validate host exists
  rc=0
  _orch_has_host "$orch_h" || rc=$?
  case "$rc" in
    0) ;;
    1) die "[$hostname] not found in 'ceph orch host ls' (looked for $orch_h)" ;;
    *) die "[$hostname] failed to run 'ceph orch host ls'" ;;
  esac

  # Validate CRUSH weight is zero before draining
  crush_weight=$(_crush_host_weight "$crush_h") \
    || die "[$hostname] failed to run 'ceph osd crush tree'"
  if [[ -z "$crush_weight" ]]; then
    die "[$hostname] not found in crush tree (looked for $crush_h)"
  fi
  if [[ "$crush_weight" != "0" ]]; then
    die "[$hostname] CRUSH weight is $crush_weight — reweight OSDs to 0 first"
  fi
  echo "=== [$hostname] CRUSH weight is 0 ==="

  # NOTE(review): the drain/remove/cleanup steps below are commented out, so
  # this loop currently only performs the validations above. The "Done" and
  # "removed successfully" messages are kept unchanged and therefore
  # overstate what happened until these steps are re-enabled.
  # echo "=== [$hostname] Draining ==="
  # ceph orch host drain "$orch_h"
  # elapsed=0
  # while true; do
  # output=$(ceph orch ps "$orch_h" 2>&1)
  # count=$(echo "$output" | grep -c "$orch_h" || true)
  #
  # if [[ "$count" -eq 0 ]]; then
  # echo "=== [$hostname] All daemons drained ==="
  # break
  # fi
  #
  # [[ "$elapsed" -ge "$TIMEOUT" ]] && die "[$hostname] timeout after ${elapsed}s — $count daemon(s) still running"
  #
  # echo "[$hostname] Waiting... $count daemon(s) remaining (${elapsed}s elapsed)"
  # sleep "$POLL_INTERVAL"
  # elapsed=$((elapsed + POLL_INTERVAL))
  # done
  # echo "=== [$hostname] Removing host ==="
  # ceph orch host rm "$orch_h"
  # echo "=== [$hostname] Removing CRUSH entry ==="
  # ceph osd crush rm "$crush_h"
  # echo "=== [$hostname] Cleanup ==="
  # ./ceph-v2-manage.sh cleanup -l "$orch_h" -p ceph-v2 --skip-check -v

  echo "=== [$hostname] Done ==="
  echo ""
done

echo "=== All ${#HOSTS[@]} host(s) removed successfully ==="
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment