Skip to content

Instantly share code, notes, and snippets.

@bachmanity1
Last active March 10, 2026 07:10
Show Gist options
  • Select an option

  • Save bachmanity1/128e76d5037cf5a154cd18446c2e089d to your computer and use it in GitHub Desktop.

Select an option

Save bachmanity1/128e76d5037cf5a154cd18446c2e089d to your computer and use it in GitHub Desktop.
Dry run: check CRUSH weight only
#!/bin/bash
# Strict mode: abort on command failure, on use of an unset variable, and on
# a failure in any stage of a pipeline.
set -o errexit -o nounset -o pipefail

# Operating mode: "drain" (default) or "check".
MODE="drain"

# Hostnames to process, and an optional file supplying more of them.
declare -a HOSTS=()
HOST_FILE=""

# Timing knobs, both in seconds.
POLL_INTERVAL=10
TIMEOUT=$((30 * 60)) # 30 minutes
# Print the command-line help text to stdout and terminate the script with
# exit status 1.
usage() {
  printf '%s\n' \
    "Usage: $0 [--check] [--file <hosts-file>] [hostname ...] [--timeout <seconds>]" \
    "" \
    "Drains all daemons from Ceph host(s), waits until fully drained," \
    "then removes each host and its CRUSH entry." \
    "Exits immediately on any error." \
    "" \
    "Options:" \
    " --check Verify hosts are fully removed (no drain/remove)" \
    " --file <path> File with one hostname per line" \
    " --timeout <seconds> Max wait time per host for drain (default: 1800)"
  exit 1
}
# Report a fatal error: write "ERROR: <all arguments>" to stderr, then
# terminate the script with exit status 1.
die() {
  printf 'ERROR: %s\n' "$*" >&2
  exit 1
}
# Parse the command line into the globals MODE, HOST_FILE, TIMEOUT and HOSTS.
#
# Arguments: the script's own arguments ("$@").
#   --check              switch MODE to "check"
#   --file|-f <path>     remember a hosts file in HOST_FILE
#   --timeout <seconds>  override TIMEOUT (non-negative integer)
#   --help|-h            print usage and exit
#   anything else        appended to HOSTS as a hostname
#
# Exits via usage() when called with no arguments, and via die() when an
# option is unknown, lacks its value, or has an invalid value.
parse_args() {
  [[ $# -ge 1 ]] || usage

  while [[ $# -gt 0 ]]; do
    case "$1" in
      --check)
        MODE="check"
        shift
        ;;
      --file|-f)
        # Check explicitly: under "set -u" a missing value would otherwise
        # abort with an opaque "unbound variable" error.
        [[ $# -ge 2 ]] || die "option $1 requires a <hosts-file> argument"
        HOST_FILE="$2"
        shift 2
        ;;
      --timeout)
        [[ $# -ge 2 ]] || die "option $1 requires a <seconds> argument"
        [[ "$2" =~ ^[0-9]+$ ]] \
          || die "invalid --timeout value '$2' (expected a non-negative integer)"
        TIMEOUT="$2"
        shift 2
        ;;
      --help|-h)
        usage
        ;;
      -*)
        # A mistyped flag must never be mistaken for a hostname.
        die "unknown option: $1"
        ;;
      *)
        HOSTS+=("$1")
        shift
        ;;
    esac
  done
}

parse_args "$@"
# Append the hostnames listed in file $1 to the global HOSTS array.
#
# File format: one hostname per line. Everything from "#" to end of line is
# a comment; surrounding whitespace (including a CR from CRLF line endings)
# is trimmed; lines that end up empty are skipped.
# Exits via die() when the file does not exist.
load_hosts_file() {
  local path="$1"
  local line

  [[ -f "$path" ]] || die "file not found: $path"

  # "|| [[ -n $line ]]" keeps the final line when the file has no trailing
  # newline; read returns non-zero there but still fills in $line.
  while IFS= read -r line || [[ -n "$line" ]]; do
    line="${line%%#*}"                      # strip comment
    line="${line#"${line%%[![:space:]]*}"}" # trim leading whitespace
    line="${line%"${line##*[![:space:]]}"}" # trim trailing whitespace
    if [[ -n "$line" ]]; then
      HOSTS+=("$line")
    fi
  done < "$path"
}

if [[ -n "$HOST_FILE" ]]; then
  load_hosts_file "$HOST_FILE"
fi
[[ ${#HOSTS[@]} -gt 0 ]] || die "no hosts specified"
# orch uses FQDN (hostname.n3r), crush uses short hostname
# Print the orchestrator (FQDN) name for host $1. An existing ".n3r" suffix
# is stripped before appending, so "node1" and "node1.n3r" both give
# "node1.n3r" rather than "node1.n3r.n3r".
orch_name() { echo "${1%.n3r}.n3r"; }
# Print the short name for host $1: the argument with one trailing ".n3r"
# removed, or the argument unchanged when it has no such suffix.
crush_name() {
  local given="$1"
  echo "${given%.n3r}"
}
# Verify that host $1 is fully removed from the cluster.
#
# Runs three checks and prints one "OK:" or "FAIL:" line for each:
#   1. "ceph orch ps <fqdn>" lists no daemon on the host
#   2. "ceph orch host ls" no longer lists the host
#   3. "ceph osd crush tree" has no "host <short-name>" bucket
#
# Each ceph command's output is captured first and its exit status checked,
# so a CLI failure is reported as FAIL instead of being read as "nothing
# found". Names are compared as whole whitespace-separated fields, so
# "node1" does not match "node10".
#
# Arguments: $1 - hostname (short or FQDN form)
# Returns:   0 when every check passed, non-zero otherwise
check_host() {
  local hostname="$1"
  local orch_h crush_h
  orch_h=$(orch_name "$hostname")
  crush_h=$(crush_name "$hostname")
  local ok=true
  local output daemon_count host_rows crush_rows

  # awk program: count lines having a field exactly equal to h. The ""
  # concatenation forces string (not numeric) comparison.
  local count_field='
    { for (i = 1; i <= NF; i++) if (($i "") == (h "")) { n++; next } }
    END { print n + 0 }'
  # awk program: count lines having the field "host" directly followed by h.
  local count_bucket='
    { for (i = 1; i < NF; i++)
        if ($i == "host" && ($(i + 1) "") == (h "")) { n++; next } }
    END { print n + 0 }'

  # Check no daemons running
  if ! output=$(ceph orch ps "$orch_h"); then
    echo "FAIL: [$hostname] 'ceph orch ps' failed, cannot verify daemons"
    ok=false
  else
    daemon_count=$(awk -v h="$orch_h" "$count_field" <<<"$output")
    if [[ "$daemon_count" -gt 0 ]]; then
      echo "FAIL: [$hostname] $daemon_count daemon(s) still running"
      ok=false
    else
      echo "OK: [$hostname] no daemons running"
    fi
  fi

  # Check host removed from orch
  if ! output=$(ceph orch host ls); then
    echo "FAIL: [$hostname] 'ceph orch host ls' failed, cannot verify host list"
    ok=false
  else
    host_rows=$(awk -v h="$orch_h" "$count_field" <<<"$output")
    if [[ "$host_rows" -gt 0 ]]; then
      echo "FAIL: [$hostname] still in host list"
      ok=false
    else
      echo "OK: [$hostname] removed from orch"
    fi
  fi

  # Check removed from crush map
  if ! output=$(ceph osd crush tree); then
    echo "FAIL: [$hostname] 'ceph osd crush tree' failed, cannot verify crush map"
    ok=false
  else
    crush_rows=$(awk -v h="$crush_h" "$count_bucket" <<<"$output")
    if [[ "$crush_rows" -gt 0 ]]; then
      echo "FAIL: [$hostname] still in crush map"
      ok=false
    else
      echo "OK: [$hostname] removed from crush map"
    fi
  fi

  [[ "$ok" == true ]]
}
# --check mode: run check_host on every requested host, collect the ones
# that fail, and stop the script here. Exits 0 when all hosts verified,
# or via die() with the list of hosts that did not.
if [[ "$MODE" == "check" ]]; then
  printf '%s\n\n' "Checking hosts: ${HOSTS[*]}"

  failed=()
  for hostname in "${HOSTS[@]}"; do
    check_host "$hostname" || failed+=("$hostname")
    echo ""
  done

  (( ${#failed[@]} == 0 )) || die "hosts not fully removed: ${failed[*]}"

  echo "=== All ${#HOSTS[@]} host(s) verified as removed ==="
  exit 0
fi
# Succeed when host $1 appears as a whole field in "ceph orch host ls".
# The output is captured before matching, so the result does not depend on
# pipeline exit codes, and the name is compared exactly instead of as a
# regex or substring.
# Returns: 0 listed, 1 not listed, 2 when the ceph command itself failed.
orch_has_host() {
  local listing
  listing=$(ceph orch host ls) || return 2
  awk -v h="$1" '
    { for (i = 1; i <= NF; i++) if (($i "") == (h "")) found = 1 }
    END { exit !found }
  ' <<<"$listing"
}

# Print the second column (the weight) of every "ceph osd crush tree" line
# that has the field "host" directly followed by exactly $1. Exact matching
# keeps "node1" from also selecting "node10".
# Prints nothing when there is no such line.
# Returns: non-zero when the ceph command itself failed.
crush_host_weight() {
  local tree
  tree=$(ceph osd crush tree) || return 1
  awk -v h="$1" '
    { for (i = 1; i < NF; i++)
        if ($i == "host" && ($(i + 1) "") == (h "")) { print $2; next } }
  ' <<<"$tree"
}

# Main loop. For each host: confirm the orchestrator knows it and that its
# CRUSH weight is 0. The drain / remove / cleanup steps further down are
# commented out, so at present the loop performs these validations only.
echo "Hosts to drain and remove: ${HOSTS[*]}"
echo ""
for hostname in "${HOSTS[@]}"; do
  orch_h=$(orch_name "$hostname")
  crush_h=$(crush_name "$hostname")

  # Validate host exists
  host_ls_rc=0
  orch_has_host "$orch_h" || host_ls_rc=$?
  case "$host_ls_rc" in
    0) ;;
    1) die "[$hostname] not found in 'ceph orch host ls' (looked for $orch_h)" ;;
    *) die "[$hostname] 'ceph orch host ls' failed" ;;
  esac

  # Validate CRUSH weight is zero before draining
  crush_weight=$(crush_host_weight "$crush_h") \
    || die "[$hostname] 'ceph osd crush tree' failed"
  [[ -n "$crush_weight" ]] || die "[$hostname] not found in crush tree (looked for $crush_h)"
  [[ "$crush_weight" == "0" ]] || die "[$hostname] CRUSH weight is $crush_weight — reweight OSDs to 0 first"
  echo "=== [$hostname] CRUSH weight is 0 ==="

  # echo "=== [$hostname] Draining ==="
  # ceph orch host drain "$orch_h"
  # elapsed=0
  # while true; do
  # output=$(ceph orch ps "$orch_h" 2>&1)
  # count=$(echo "$output" | grep -c "$orch_h" || true)
  #
  # if [[ "$count" -eq 0 ]]; then
  # echo "=== [$hostname] All daemons drained ==="
  # break
  # fi
  #
  # [[ "$elapsed" -ge "$TIMEOUT" ]] && die "[$hostname] timeout after ${elapsed}s — $count daemon(s) still running"
  #
  # echo "[$hostname] Waiting... $count daemon(s) remaining (${elapsed}s elapsed)"
  # sleep "$POLL_INTERVAL"
  # elapsed=$((elapsed + POLL_INTERVAL))
  # done
  # echo "=== [$hostname] Removing host ==="
  # ceph orch host rm "$orch_h"
  # echo "=== [$hostname] Removing CRUSH entry ==="
  # ceph osd crush rm "$crush_h"
  # echo "=== [$hostname] Cleanup ==="
  # ./ceph-v2-manage.sh cleanup -l "$orch_h" -p ceph-v2 --skip-check -v
  echo "=== [$hostname] Done ==="
  echo ""
done
echo "=== All ${#HOSTS[@]} host(s) removed successfully ==="
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment