Skip to content

Instantly share code, notes, and snippets.

@ruivieira
Created February 17, 2026 08:41
Show Gist options
  • Select an option

  • Save ruivieira/037274e0f17bebb855ba22ba66635498 to your computer and use it in GitHub Desktop.

Select an option

Save ruivieira/037274e0f17bebb855ba22ba66635498 to your computer and use it in GitHub Desktop.
Redeploy EvalHub
#!/usr/bin/env bash
# Redeploy EvalHub: purge cached images from nodes, restart operator, and recreate the CR.
# Run from the root of a trustyai-service-operator clone.
#
# Usage:
#   cd trustyai-service-operator
#   NS=prabhu ./resources/redeploy-evalhub.sh                                  # clean all images
#   NS=prabhu ./resources/redeploy-evalhub.sh --operator --hub                 # clean only operator + hub
#   NS=prabhu ./resources/redeploy-evalhub.sh --operator=quay.io/custom/operator:v2  # custom operator image
#   NS=prabhu ./resources/redeploy-evalhub.sh --operator -y                    # skip confirmation
#
# With no image flags, all images are cleaned from node caches.
# With image flags, only the selected images are cleaned.
# A flag without a value uses the default image for that component.
# Selected images are also written to config/base/params.env before uploading manifests.
# Use -y/--yes to skip the confirmation prompt (e.g. in CI).
set -euo pipefail

# Target namespace for the EvalHub CR (override with NS=<ns>).
NS="${NS:-test}"
readonly NS
# Captured once up front so later steps are unaffected by any directory change.
REPO_ROOT="$(pwd)"
readonly REPO_ROOT
# Set to true by -y/--yes to skip the interactive confirmation prompt.
YES=false

# Default images per component (used when a flag is given without a value).
readonly DEFAULT_HUB="quay.io/evalhub/evalhub:latest"
readonly DEFAULT_OPERATOR="quay.io/trustyai/trustyai-service-operator:latest"
readonly DEFAULT_LMEVAL="quay.io/opendatahub/ta-lmes-job:odh-3.4-ea2"

# Components selected on the CLI, as "component:image" entries.
SELECTED=()
# Parse CLI flags into SELECTED as "component:image" entries.
# A bare flag selects the component's default image; --flag=IMAGE overrides it.
for flag in "$@"; do
  case "$flag" in
    --hub)        SELECTED+=("hub:${DEFAULT_HUB}") ;;
    --hub=*)      SELECTED+=("hub:${flag#--hub=}") ;;
    --operator)   SELECTED+=("operator:${DEFAULT_OPERATOR}") ;;
    --operator=*) SELECTED+=("operator:${flag#--operator=}") ;;
    --lmeval)     SELECTED+=("lmeval:${DEFAULT_LMEVAL}") ;;
    --lmeval=*)   SELECTED+=("lmeval:${flag#--lmeval=}") ;;
    --all)        SELECTED=("all") ;;
    -y|--yes)     YES=true ;;
    *)
      echo "Unknown flag: $flag"
      echo "Flags: --hub, --operator, --lmeval, --all, -y/--yes"
      exit 1
      ;;
  esac
done
# Resolve the final image for each component.
# Empty means "not selected": that component is neither cleaned nor written
# back to params.env later on.
IMG_HUB=""
IMG_OPERATOR=""
IMG_LMEVAL=""

# No selection at all, or an explicit --all, means every component gets its
# default image.
use_defaults=false
if [ "${#SELECTED[@]}" -eq 0 ]; then
  use_defaults=true
else
  for entry in "${SELECTED[@]}"; do
    if [ "$entry" = "all" ]; then
      use_defaults=true
    fi
  done
fi

if [ "$use_defaults" = true ]; then
  IMG_HUB="$DEFAULT_HUB"
  IMG_OPERATOR="$DEFAULT_OPERATOR"
  IMG_LMEVAL="$DEFAULT_LMEVAL"
else
  # Entries look like "component:image"; the image part may itself contain
  # colons (registry tags), so strip only up to the first one.
  for entry in "${SELECTED[@]}"; do
    case "${entry%%:*}" in
      hub)      IMG_HUB="${entry#*:}" ;;
      operator) IMG_OPERATOR="${entry#*:}" ;;
      lmeval)   IMG_LMEVAL="${entry#*:}" ;;
    esac
  done
fi
# Build the list of images to clean from node caches; only selected
# (non-empty) components are included, in hub/operator/lmeval order.
IMAGES=()
for candidate in "$IMG_HUB" "$IMG_OPERATOR" "$IMG_LMEVAL"; do
  if [ -n "$candidate" ]; then
    IMAGES+=("$candidate")
  fi
done
# --- Summary and confirmation ---
# Print a plan of record before touching the cluster, then gate on explicit
# user confirmation unless -y/--yes was given.
echo "EvalHub redeployment"
echo "===================="
echo ""
echo " Namespace: $NS"
echo ""
echo "This will:"
echo " 1. Remove the following images from ALL worker node caches (crictl rmi):"
for img in "${IMAGES[@]}"; do
echo " - $img"
done
echo " 2. Delete the EvalHub CR in namespace '$NS'"
echo " 3. Delete legacy RBAC resources (ClusterRoles, bindings, ServiceAccounts)"
echo " 4. Update config/base/params.env with selected images"
echo " 5. Upload operator manifests and restart the operator"
echo " 6. Reapply the EvalHub CRD and CR"
echo " 7. Delete old evaluation jobs in namespace '$NS'"
echo ""
echo "WARNING: Image removal affects ALL worker nodes in the cluster."
echo " Other workloads using these images will re-pull on next restart."
echo ""
# Accept y/Y or any-case "yes"; anything else (including EOF on stdin,
# which leaves confirm empty) aborts cleanly with status 0.
if [ "$YES" != true ]; then
read -r -p "Proceed? [y/N] " confirm
case "$confirm" in
[yY][eE][sS]|[yY]) ;;
*)
echo "Aborted."
exit 0
;;
esac
echo ""
fi
# --- Clean cached images from all worker nodes ---
# Forces fresh pulls of the selected images by deleting them from every
# worker node's container runtime cache via an `oc debug` pod + crictl.
echo "=== Cleaning cached images from worker nodes ==="
NODES=$(oc get nodes -l node-role.kubernetes.io/worker -o jsonpath='{.items[*].metadata.name}')
if [ -z "$NODES" ]; then
echo "No worker nodes found, skipping image cleanup"
else
# NODES is space-separated jsonpath output; word splitting here is intentional.
for NODE in $NODES; do
echo " Node: $NODE"
# Batch all rmi commands into a single debug session per node; each rmi is
# best-effort since an image may simply not be cached on that node.
CMDS=""
for IMG in "${IMAGES[@]}"; do
CMDS+="crictl rmi ${IMG} 2>/dev/null || true; "
done
# Trailing `|| true` keeps set -e from aborting if the debug pod cannot run.
oc debug "node/$NODE" --quiet -- chroot /host bash -c "$CMDS" 2>/dev/null || true
done
fi
echo "Image cleanup done"
echo ""
# --- Validate repo root before destructive cluster changes ---
# Checked BEFORE deleting the CR so that running from the wrong directory
# fails fast instead of leaving the cluster without an EvalHub instance.
if [ ! -d "$REPO_ROOT/config/crd" ]; then
  echo "ERROR: Must be run from the root of a trustyai-service-operator clone."
  echo " Expected to find config/crd/ in $REPO_ROOT"
  exit 1
fi

# --- Step 2: Delete existing EvalHub CR ---
echo "=== Deleting EvalHub CR ==="
kubectl delete evalhub evalhub -n "$NS" --ignore-not-found=true
echo ""
# --- Cleanup legacy RBAC ---
# Best-effort removal of RBAC objects left behind by older operator versions.
# Every delete uses --ignore-not-found plus error suppression on purpose:
# absent resources or missing permissions must not abort the redeploy (set -e).
echo "=== Cleaning up legacy RBAC ==="
# Legacy ClusterRoles (both bare and operator-prefixed naming schemes)
for CR in \
evalhub-proxy-role \
evalhub-jobs-proxy-role \
trustyai-service-operator-evalhub-proxy-role \
trustyai-service-operator-evalhub-jobs-proxy-role \
evalhub-api-role \
evalhub-jobs-api-role \
evalhub-resource-manager \
trustyai-service-operator-evalhub-api-role \
trustyai-service-operator-evalhub-jobs-api-role \
trustyai-service-operator-evalhub-resource-manager; do
kubectl delete clusterrole "$CR" --ignore-not-found=true 2>/dev/null || true
done
# Legacy monolithic resource-manager ClusterRoleBindings
for CRB in \
evalhub-resource-manager-binding \
trustyai-service-operator-evalhub-resource-manager-binding; do
kubectl delete clusterrolebinding "$CRB" --ignore-not-found=true 2>/dev/null || true
done
# ClusterRoleBindings referencing removed resource-manager ClusterRoles.
# The inline python3 filters bindings whose roleRef points at one of the
# deleted ClusterRoles; any parse/CLI failure yields nothing and the loop skips.
for CRB in $(kubectl get clusterrolebindings -o json 2>/dev/null | \
python3 -c 'import json,sys; d=json.load(sys.stdin); bad={"evalhub-resource-manager","trustyai-service-operator-evalhub-resource-manager"}; \
[print(i["metadata"]["name"]) for i in d.get("items",[]) if i.get("roleRef",{}).get("kind")=="ClusterRole" and i.get("roleRef",{}).get("name") in bad]' 2>/dev/null); do
echo " Deleting CRB referencing removed resource-manager ClusterRole: ${CRB}"
kubectl delete clusterrolebinding "${CRB}" --ignore-not-found=true 2>/dev/null || true
done
# Legacy ClusterRoleBindings (pattern: *-{namespace}-*-proxy-rolebinding)
# NOTE(review): assumes the namespace appears hyphen-delimited inside binding
# names — confirm against the operator's naming convention.
for CRB in $(kubectl get clusterrolebindings -o name 2>/dev/null | grep -- '-proxy-rolebinding$' | grep -- "-${NS}-"); do
echo " Deleting legacy ClusterRoleBinding: ${CRB}"
kubectl delete "${CRB}" --ignore-not-found=true 2>/dev/null || true
done
# Legacy ClusterRoleBindings (pattern: {namespace}-*-jobs-proxy)
for CRB in $(kubectl get clusterrolebindings -o name 2>/dev/null | grep -- '-jobs-proxy$' | grep "^clusterrolebinding.rbac.authorization.k8s.io/${NS}-"); do
echo " Deleting legacy jobs ClusterRoleBinding: ${CRB}"
kubectl delete "${CRB}" --ignore-not-found=true 2>/dev/null || true
done
# Legacy RoleBindings in namespace (pattern: *-jobs-proxy-rolebinding)
for RB in $(kubectl get rolebindings -n "$NS" -o name 2>/dev/null | grep -- '-jobs-proxy-rolebinding$'); do
echo " Deleting legacy jobs proxy RoleBinding: ${RB}"
kubectl delete "${RB}" -n "$NS" --ignore-not-found=true 2>/dev/null || true
done
# Stale cross-namespace RoleBindings (evalhub-jobs-mlflow in opendatahub)
for RB in $(kubectl get rolebindings -n opendatahub -o name 2>/dev/null | grep 'evalhub-jobs-mlflow' || true); do
echo " Deleting stale cross-namespace RoleBinding: ${RB}"
kubectl delete "${RB}" -n opendatahub --ignore-not-found=true 2>/dev/null || true
done
# Legacy proxy ServiceAccounts (anything *-proxy, but kube-rbac-proxy excluded)
for SA in $(kubectl get serviceaccounts -n "$NS" -o name 2>/dev/null | grep -- '-proxy$' | grep -v 'kube-rbac-proxy'); do
echo " Deleting legacy proxy ServiceAccount: ${SA}"
kubectl delete "${SA}" -n "$NS" --ignore-not-found=true 2>/dev/null || true
done
# Legacy MLFlow proxy RoleBindings
for RB in $(kubectl get rolebindings -n "$NS" -o name 2>/dev/null | grep -- '-mlflow-proxy$'); do
echo " Deleting legacy MLFlow proxy RoleBinding: ${RB}"
kubectl delete "${RB}" -n "$NS" --ignore-not-found=true 2>/dev/null || true
done
# Legacy resource-manager RoleBindings
for RB in $(kubectl get rolebindings -n "$NS" -o name 2>/dev/null | grep -- '-resource-manager$'); do
echo " Deleting legacy resource-manager RoleBinding: ${RB}"
kubectl delete "${RB}" -n "$NS" --ignore-not-found=true 2>/dev/null || true
done
echo "Legacy RBAC cleanup done"
echo ""
# --- Update params.env with selected images ---
echo "=== Updating params.env ==="

#######################################
# Rewrite the image keys in a params.env file for each selected component.
# Globals:
#   IMG_OPERATOR, IMG_HUB, IMG_LMEVAL (read) — empty means "leave unchanged"
# Arguments:
#   $1 - path to a params.env file
# Outputs:
#   Progress/warning messages to stdout.
# Returns:
#   0 always (a missing file is a warning, not an error).
#######################################
update_params_env() {
  local file="$1"
  if [ ! -f "$file" ]; then
    echo " WARNING: $file not found, skipping"
    return 0
  fi
  echo " $file"
  # Use if-blocks rather than `[ -n ... ] && sed ...`: with the && form, a
  # trailing failed test makes the function return non-zero, which kills the
  # whole script under `set -e` whenever the last component isn't selected.
  # NOTE(review): image refs are assumed to contain no sed-special chars
  # ('|' delimiter, '&' in replacement) — true for normal registry refs.
  if [ -n "$IMG_OPERATOR" ]; then
    sed -i "s|^trustyaiOperatorImage=.*|trustyaiOperatorImage=${IMG_OPERATOR}|" "$file"
  fi
  if [ -n "$IMG_HUB" ]; then
    sed -i "s|^evalHubImage=.*|evalHubImage=${IMG_HUB}|" "$file"
  fi
  if [ -n "$IMG_LMEVAL" ]; then
    sed -i "s|^lmes-pod-image=.*|lmes-pod-image=${IMG_LMEVAL}|" "$file"
  fi
  return 0
}

update_params_env "$REPO_ROOT/config/base/params.env"
update_params_env "$REPO_ROOT/config/overlays/odh/params.env"

# Echo the values that were applied (if-blocks for the same set -e reason).
if [ -n "$IMG_OPERATOR" ]; then echo " trustyaiOperatorImage=${IMG_OPERATOR}"; fi
if [ -n "$IMG_HUB" ]; then echo " evalHubImage=${IMG_HUB}"; fi
if [ -n "$IMG_LMEVAL" ]; then echo " lmes-pod-image=${IMG_LMEVAL}"; fi
echo ""
# --- Upload operator manifests ---
echo "=== Uploading operator manifests ==="

# Deployment target: the ODH operator living in openshift-operators.
OPERATOR_NS="openshift-operators"
OPERATOR_LABEL="name=opendatahub-operator"
CSV_PATTERN="opendatahub-operator"
COMPONENT="trustyai"
MANIFESTS_DIR="$REPO_ROOT/config"
PVC_NAME="${COMPONENT}-manifests"
MOUNT_PATH="/opt/manifests/${COMPONENT}"
VOLUME_NAME="${COMPONENT}-manifests"

# Create the manifests PVC if it doesn't exist yet (reused across runs).
if oc get pvc "$PVC_NAME" -n "$OPERATOR_NS" &>/dev/null; then
  echo " PVC '$PVC_NAME' already exists, reusing"
else
  echo " Creating PVC '$PVC_NAME'..."
  # The heredoc manifest must be properly nested YAML; a flat/unindented
  # version is rejected by the API server.
  oc apply -f - <<PVCEOF
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: ${PVC_NAME}
  namespace: ${OPERATOR_NS}
spec:
  accessModes:
    - ReadWriteOnce
  resources:
    requests:
      storage: 1Gi
PVCEOF
fi
# Find and patch the CSV.
# The trailing `|| true` is required: under `set -e -o pipefail` a grep miss
# would otherwise abort the script here, making the friendly "No CSV found"
# error below unreachable.
CSV_NAME=$(oc get csv -n "$OPERATOR_NS" -o name 2>/dev/null | grep "$CSV_PATTERN" | head -1 | cut -d/ -f2 || true)
if [ -z "$CSV_NAME" ]; then
  echo "ERROR: No CSV found matching '$CSV_PATTERN' in $OPERATOR_NS"
  exit 1
fi
echo " Found CSV: $CSV_NAME"

# Look up any existing volumeMount/volume indices for our volume so stale
# entries can be removed before re-adding them (keeps the patch idempotent);
# -1 means "not present". Failures fall back to -1 via the `|| echo`.
CSV_JSON=$(oc get csv "$CSV_NAME" -n "$OPERATOR_NS" -o json)
VM_INDEX=$(echo "$CSV_JSON" | python3 -c "
import json,sys; d=json.load(sys.stdin)
vms=d['spec']['install']['spec']['deployments'][0]['spec']['template']['spec'].get('containers',[{}])[0].get('volumeMounts',[])
idx=[i for i,v in enumerate(vms) if v.get('name')=='$VOLUME_NAME']
print(idx[0] if idx else -1)
" 2>/dev/null || echo "-1")
VOL_INDEX=$(echo "$CSV_JSON" | python3 -c "
import json,sys; d=json.load(sys.stdin)
vols=d['spec']['install']['spec']['deployments'][0]['spec']['template']['spec'].get('volumes',[])
idx=[i for i,v in enumerate(vols) if v.get('name')=='$VOLUME_NAME']
print(idx[0] if idx else -1)
" 2>/dev/null || echo "-1")

# Build a JSON-Patch: force a single replica with Recreate strategy
# (presumably so the RWO PVC isn't mounted by two pods at once — verify),
# set fsGroup 65532, then (re)attach the manifests volume and its mount.
PATCH='[
{"op":"replace","path":"/spec/install/spec/deployments/0/spec/replicas","value":1},
{"op":"replace","path":"/spec/install/spec/deployments/0/spec/strategy/type","value":"Recreate"},
{"op":"add","path":"/spec/install/spec/deployments/0/spec/template/spec/securityContext","value":{"fsGroup":65532}}'
[ "$VM_INDEX" != "-1" ] && PATCH+=",{\"op\":\"remove\",\"path\":\"/spec/install/spec/deployments/0/spec/template/spec/containers/0/volumeMounts/$VM_INDEX\"}"
[ "$VOL_INDEX" != "-1" ] && PATCH+=",{\"op\":\"remove\",\"path\":\"/spec/install/spec/deployments/0/spec/template/spec/volumes/$VOL_INDEX\"}"
PATCH+=",{\"op\":\"add\",\"path\":\"/spec/install/spec/deployments/0/spec/template/spec/containers/0/volumeMounts/-\",\"value\":{\"name\":\"$VOLUME_NAME\",\"mountPath\":\"$MOUNT_PATH\"}}"
PATCH+=",{\"op\":\"add\",\"path\":\"/spec/install/spec/deployments/0/spec/template/spec/volumes/-\",\"value\":{\"name\":\"$VOLUME_NAME\",\"persistentVolumeClaim\":{\"claimName\":\"$PVC_NAME\"}}}"
PATCH+=']'
echo " Patching CSV..."
echo "$PATCH" | oc patch csv "$CSV_NAME" -n "$OPERATOR_NS" --type json --patch-file /dev/stdin
# --- Copy the repo's manifests into the operator pod's mounted PVC ---
# Wait for operator pod
echo " Waiting for operator pod..."
oc wait --for=jsonpath='{.status.conditions[?(@.type=="Ready")].status}'=True \
pod -l "$OPERATOR_LABEL" -n "$OPERATOR_NS" --timeout=300s
# Copy manifests into the pod
POD_NAME=$(oc get pod -l "$OPERATOR_LABEL" -n "$OPERATOR_NS" -o jsonpath='{.items[0].metadata.name}')
echo " Clearing existing manifests in pod..."
# Removes regular files and dot-files under the mount; failures (e.g. an
# already-empty directory) are deliberately ignored.
oc exec -n "$OPERATOR_NS" "$POD_NAME" -- sh -c "rm -rf ${MOUNT_PATH}/* ${MOUNT_PATH}/.[!.]* 2>/dev/null || true"
echo " Copying manifests to pod..."
oc cp "${MANIFESTS_DIR}/." "${OPERATOR_NS}/${POD_NAME}:${MOUNT_PATH}"
# Patch manager.yaml to set imagePullPolicy: Always — only when the file
# exists and no imagePullPolicy is present yet (idempotent).
MANAGER_YAML="${MOUNT_PATH}/manager/manager.yaml"
MANAGER_CONTENT=$(oc exec -n "$OPERATOR_NS" "$POD_NAME" -- cat "$MANAGER_YAML" 2>/dev/null || echo "")
if [ -n "$MANAGER_CONTENT" ] && ! echo "$MANAGER_CONTENT" | grep -q 'imagePullPolicy:'; then
echo " Setting imagePullPolicy: Always in manager.yaml..."
# NOTE(review): the inserted line's leading whitespace looks mangled in this
# copy; it must match the container's `image:` indentation for valid YAML —
# verify against the original script.
echo "$MANAGER_CONTENT" | sed 's/^\(\s*image:\s.*\)$/\1\n imagePullPolicy: Always/' | \
oc exec -n "$OPERATOR_NS" "$POD_NAME" -i -- sh -c "cat > $MANAGER_YAML"
fi
echo "Manifests uploaded"
echo ""
# --- Restart operator ---
# With the Recreate strategy patched into the CSV above, the old pod is torn
# down before the new one starts, so the uploaded manifests take effect.
echo "=== Restarting operator ==="
oc rollout restart deployment -n "$OPERATOR_NS" -l "$OPERATOR_LABEL"
oc rollout status deployment -n "$OPERATOR_NS" -l "$OPERATOR_LABEL" --timeout=300s
echo ""
# --- Apply EvalHub CRD ---
# Applied straight from the repo so the CR creation below cannot race a
# missing/stale CRD.
echo "=== Applying EvalHub CRD ==="
kubectl apply -f "$REPO_ROOT/config/crd/bases/trustyai.opendatahub.io_evalhubs.yaml"
echo ""
# --- Step 7: Deploy EvalHub CR ---
echo "=== Deploying EvalHub CR ==="
# Recreate the CR so the operator reconciles a fresh deployment.
# NOTE(review): heredoc nesting reconstructed (the flat form is invalid YAML);
# verify the field layout against the EvalHub CRD schema.
kubectl apply -f - <<EOF
apiVersion: trustyai.opendatahub.io/v1alpha1
kind: EvalHub
metadata:
  name: evalhub
  namespace: ${NS}
spec:
  replicas: 1
  database:
    secret: evalhub-db-credentials
  env:
    - name: MLFLOW_TRACKING_URI
      value: "https://mlflow.opendatahub.svc.cluster.local:8443"
EOF
echo ""

# --- Step 8: Delete old evaluation jobs ---
# Jobs created by the previous deployment used the old image; remove them so
# reruns pick up the new one.
echo "=== Deleting old evaluation jobs ==="
kubectl delete jobs -n "$NS" -l app=evalhub --ignore-not-found=true
echo ""
# --- Step 9: Wait and check status ---
echo "Waiting for EvalHub to be ready..."
# Fixed grace period before the status checks.
# NOTE(review): `kubectl wait --for=condition=...` would be more reliable than
# a flat 10s sleep — confirm whether the EvalHub CR exposes a readiness
# condition before changing this.
sleep 10
echo ""
echo "=== EvalHub Status ==="
# Each check is best-effort: a miss prints an informational message instead of
# failing the script (important because `set -e` is still active here).
kubectl get evalhub evalhub -n "$NS" 2>/dev/null || echo "EvalHub CR not found"
echo ""
kubectl get pods -n "$NS" | grep evalhub || echo "No evalhub pods found"
echo ""
kubectl get svc -n "$NS" | grep evalhub || echo "No evalhub services found"
echo ""
oc get route -n "$NS" | grep evalhub || echo "No evalhub routes found"
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment