lukluk/rescue.sh

## rescue.sh
#!/bin/bash
# rescue.sh - find Deployments whose pods are stuck in a failing state and
# try to recover them, one Deployment at a time.
#
# Usage: rescue.sh [namespace]        (namespace defaults to "apps")
#
# Steps:
#   1. List the pods in the namespace that have a container waiting on one of
#      FAIL_REASONS and resolve each of them pod -> ReplicaSet -> owner.
#   2. Exit 0 when nothing is failing.
#   3. DELETE every affected Deployment except the first one in sorted order.
#   4. `kubectl rollout restart` each affected Deployment that can still be
#      restarted, then poll its pods every 60s until none is unhealthy.
#
# Requires: kubectl, jq.

# No `set -e`: every critical command below is checked explicitly.
set -uo pipefail

NAMESPACE="${1:-apps}"

# Regex matched against each container's .state.waiting.reason.
readonly FAIL_REASONS='CrashLoopBackOff|Error|ImagePullBackOff|CreateContainerError'

# Print a message to stderr and abort the script.
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

for TOOL in kubectl jq; do
  command -v "$TOOL" >/dev/null || die "❌ Required tool not found: $TOOL"
done

echo "🔍 Scanning pods in namespace: $NAMESPACE ..."

# Step 1: detect pods that are failing (CrashLoopBackOff, Error, etc)
# The pod list is captured first so that a kubectl failure aborts the run
# instead of being reported as "everything is healthy".
PODS_JSON=$(kubectl get pods -n "$NAMESPACE" -o json) ||
  die "❌ Could not list pods in namespace: $NAMESPACE"

# Names of the ReplicaSets owning at least one failing pod. `[]?` tolerates
# pods that have no containerStatuses / ownerReferences yet.
REPLICA_SETS=$(
  jq -r --arg reasons "$FAIL_REASONS" '
    .items[]
    | select(any(.status.containerStatuses[]?;
                 (.state.waiting.reason // "") | test($reasons)))
    | .metadata.ownerReferences[]?
    | select(.kind == "ReplicaSet")
    | .name' <<<"$PODS_JSON" | sort -u
) || die "❌ Could not parse the pod list with jq"

# Resolve every ReplicaSet to the name of its first owner.
# NOTE(review): the owner is assumed to be a Deployment; its kind is not checked.
DEPLOY_NAMES=""
while IFS= read -r RS; do
  [[ -n "$RS" ]] || continue
  # Errors are silenced on purpose: a ReplicaSet that vanished since the scan
  # simply contributes nothing.
  OWNER=$(kubectl get rs "$RS" -n "$NAMESPACE" \
    -o jsonpath='{.metadata.ownerReferences[0].name}' 2>/dev/null </dev/null)
  if [[ -n "$OWNER" ]]; then
    DEPLOY_NAMES+="$OWNER"$'\n'
  fi
done <<<"$REPLICA_SETS"

# De-duplicate and sort; the sort order decides which Deployment is "first".
DEPLOY_LIST=()
while IFS= read -r NAME; do
  [[ -n "$NAME" ]] && DEPLOY_LIST+=("$NAME")
done < <(printf '%s' "$DEPLOY_NAMES" | sort -u)

# Step 2: Handle empty list
if (( ${#DEPLOY_LIST[@]} == 0 )); then
  echo "✅ All deployments appear healthy."
  exit 0
fi

echo "🚨 Problematic deployments detected:"
printf '%s\n' "${DEPLOY_LIST[@]}"

# Step 3: Delete all except the first
# WARNING: destructive - the Deployments are removed from the cluster.
FIRST_DEPLOY=${DEPLOY_LIST[0]}

if (( ${#DEPLOY_LIST[@]} > 1 )); then
  echo "🧹 Deleting deployments (except $FIRST_DEPLOY):"
  for (( I = 1; I < ${#DEPLOY_LIST[@]}; I++ )); do
    kubectl delete deploy "${DEPLOY_LIST[I]}" -n "$NAMESPACE" --ignore-not-found
  done
fi

# Step 4: Sequential restart loop
for DEPLOY in "${DEPLOY_LIST[@]}"; do
  echo "🔁 Restarting deployment: $DEPLOY"
  # A Deployment deleted in step 3 cannot be restarted; skip it rather than
  # "waiting" on zero pods and reporting it as healthy.
  if ! kubectl rollout restart deployment "$DEPLOY" -n "$NAMESPACE"; then
    echo "⚠️ Could not restart $DEPLOY (deleted or unavailable). Skipping." >&2
    continue
  fi

  echo "⏳ Waiting for pods in $DEPLOY to become Running..."
  while true; do
    # Count non-running or crashlooping pods for this deployment.
    # Assumes the pods carry the label app=<deployment name>.
    NOT_OK=$(kubectl get pods -n "$NAMESPACE" -l app="$DEPLOY" -o json |
      jq --arg reasons "$FAIL_REASONS" '[.items[] | select(
        (.status.phase != "Running")
        or any(.status.containerStatuses[]?;
               (.state.waiting.reason // "") | test($reasons))
      )] | length')

    # An empty or non-numeric count means kubectl or jq failed. It must not be
    # mistaken for 0: `[[ "" -eq 0 ]]` is true in bash.
    if [[ ! "$NOT_OK" =~ ^[0-9]+$ ]]; then
      echo "⚠️ Could not read pod status for $DEPLOY. Rechecking in 60s..." >&2
      sleep 60
      continue
    fi

    if (( NOT_OK == 0 )); then
      echo "✅ $DEPLOY pods are all healthy."
      break
    fi

    echo "❌ Still unhealthy pods ($NOT_OK). Rechecking in 60s..."
    sleep 60
  done
done

echo "🎉 All failing deployments have been recovered."
	#!/bin/bash
	# rescue.sh - find Deployments whose pods are stuck in a failing state and
	# try to recover them, one Deployment at a time.
	#
	# Usage: rescue.sh [namespace]        (namespace defaults to "apps")
	#
	# Steps:
	#   1. List the pods in the namespace that have a container waiting on one of
	#      FAIL_REASONS and resolve each of them pod -> ReplicaSet -> owner.
	#   2. Exit 0 when nothing is failing.
	#   3. DELETE every affected Deployment except the first one in sorted order.
	#   4. `kubectl rollout restart` each affected Deployment that can still be
	#      restarted, then poll its pods every 60s until none is unhealthy.
	#
	# Requires: kubectl, jq.

	# No `set -e`: every critical command below is checked explicitly.
	set -uo pipefail

	NAMESPACE="${1:-apps}"

	# Regex matched against each container's .state.waiting.reason.
	readonly FAIL_REASONS='CrashLoopBackOff|Error|ImagePullBackOff|CreateContainerError'

	# Print a message to stderr and abort the script.
	die() {
	  printf '%s\n' "$*" >&2
	  exit 1
	}

	for TOOL in kubectl jq; do
	  command -v "$TOOL" >/dev/null || die "❌ Required tool not found: $TOOL"
	done

	echo "🔍 Scanning pods in namespace: $NAMESPACE ..."

	# Step 1: detect pods that are failing (CrashLoopBackOff, Error, etc)
	# The pod list is captured first so that a kubectl failure aborts the run
	# instead of being reported as "everything is healthy".
	PODS_JSON=$(kubectl get pods -n "$NAMESPACE" -o json) ||
	  die "❌ Could not list pods in namespace: $NAMESPACE"

	# Names of the ReplicaSets owning at least one failing pod. `[]?` tolerates
	# pods that have no containerStatuses / ownerReferences yet.
	REPLICA_SETS=$(
	  jq -r --arg reasons "$FAIL_REASONS" '
	    .items[]
	    | select(any(.status.containerStatuses[]?;
	                 (.state.waiting.reason // "") | test($reasons)))
	    | .metadata.ownerReferences[]?
	    | select(.kind == "ReplicaSet")
	    | .name' <<<"$PODS_JSON" | sort -u
	) || die "❌ Could not parse the pod list with jq"

	# Resolve every ReplicaSet to the name of its first owner.
	# NOTE(review): the owner is assumed to be a Deployment; its kind is not checked.
	DEPLOY_NAMES=""
	while IFS= read -r RS; do
	  [[ -n "$RS" ]] || continue
	  # Errors are silenced on purpose: a ReplicaSet that vanished since the scan
	  # simply contributes nothing.
	  OWNER=$(kubectl get rs "$RS" -n "$NAMESPACE" \
	    -o jsonpath='{.metadata.ownerReferences[0].name}' 2>/dev/null </dev/null)
	  if [[ -n "$OWNER" ]]; then
	    DEPLOY_NAMES+="$OWNER"$'\n'
	  fi
	done <<<"$REPLICA_SETS"

	# De-duplicate and sort; the sort order decides which Deployment is "first".
	DEPLOY_LIST=()
	while IFS= read -r NAME; do
	  [[ -n "$NAME" ]] && DEPLOY_LIST+=("$NAME")
	done < <(printf '%s' "$DEPLOY_NAMES" | sort -u)

	# Step 2: Handle empty list
	if (( ${#DEPLOY_LIST[@]} == 0 )); then
	  echo "✅ All deployments appear healthy."
	  exit 0
	fi

	echo "🚨 Problematic deployments detected:"
	printf '%s\n' "${DEPLOY_LIST[@]}"

	# Step 3: Delete all except the first
	# WARNING: destructive - the Deployments are removed from the cluster.
	FIRST_DEPLOY=${DEPLOY_LIST[0]}

	if (( ${#DEPLOY_LIST[@]} > 1 )); then
	  echo "🧹 Deleting deployments (except $FIRST_DEPLOY):"
	  for (( I = 1; I < ${#DEPLOY_LIST[@]}; I++ )); do
	    kubectl delete deploy "${DEPLOY_LIST[I]}" -n "$NAMESPACE" --ignore-not-found
	  done
	fi

	# Step 4: Sequential restart loop
	for DEPLOY in "${DEPLOY_LIST[@]}"; do
	  echo "🔁 Restarting deployment: $DEPLOY"
	  # A Deployment deleted in step 3 cannot be restarted; skip it rather than
	  # "waiting" on zero pods and reporting it as healthy.
	  if ! kubectl rollout restart deployment "$DEPLOY" -n "$NAMESPACE"; then
	    echo "⚠️ Could not restart $DEPLOY (deleted or unavailable). Skipping." >&2
	    continue
	  fi

	  echo "⏳ Waiting for pods in $DEPLOY to become Running..."
	  while true; do
	    # Count non-running or crashlooping pods for this deployment.
	    # Assumes the pods carry the label app=<deployment name>.
	    NOT_OK=$(kubectl get pods -n "$NAMESPACE" -l app="$DEPLOY" -o json |
	      jq --arg reasons "$FAIL_REASONS" '[.items[] | select(
	        (.status.phase != "Running")
	        or any(.status.containerStatuses[]?;
	               (.state.waiting.reason // "") | test($reasons))
	      )] | length')

	    # An empty or non-numeric count means kubectl or jq failed. It must not be
	    # mistaken for 0: `[[ "" -eq 0 ]]` is true in bash.
	    if [[ ! "$NOT_OK" =~ ^[0-9]+$ ]]; then
	      echo "⚠️ Could not read pod status for $DEPLOY. Rechecking in 60s..." >&2
	      sleep 60
	      continue
	    fi

	    if (( NOT_OK == 0 )); then
	      echo "✅ $DEPLOY pods are all healthy."
	      break
	    fi

	    echo "❌ Still unhealthy pods ($NOT_OK). Rechecking in 60s..."
	    sleep 60
	  done
	done

	echo "🎉 All failing deployments have been recovered."
No results found