Created
February 17, 2026 08:41
-
-
Save ruivieira/037274e0f17bebb855ba22ba66635498 to your computer and use it in GitHub Desktop.
Redeploy EvalHub
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env bash | |
| # Redeploy EvalHub: purge cached images from nodes, restart operator, and recreate the CR. | |
| # Run from the root of a trustyai-service-operator clone. | |
| # | |
| # Usage: | |
| # cd trustyai-service-operator | |
| # NS=prabhu ./resources/redeploy-evalhub.sh # clean all images | |
| # NS=prabhu ./resources/redeploy-evalhub.sh --operator --hub # clean only operator + hub | |
| # NS=prabhu ./resources/redeploy-evalhub.sh --operator=quay.io/custom/operator:v2 # clean custom operator image | |
| # NS=prabhu ./resources/redeploy-evalhub.sh --operator -y # skip confirmation | |
| # | |
| # With no image flags, all images are cleaned from node caches. | |
| # With image flags, only the selected images are cleaned. | |
| # A flag without a value uses the default image for that component. | |
| # Selected images are also written to config/base/params.env and config/overlays/odh/params.env before uploading manifests. | |
| # Use -y/--yes to skip the confirmation prompt (e.g. in CI). | |
set -euo pipefail

# Target namespace for the EvalHub CR; override with NS=<namespace>.
NS="${NS:-test}"
# The script is started from the operator repo root, so the current
# directory is recorded as the repo root.
REPO_ROOT="$(pwd)"
# Switched to true by -y/--yes to skip the confirmation prompt.
YES=false

# Default images per component
DEFAULT_HUB="quay.io/evalhub/evalhub:latest"
DEFAULT_OPERATOR="quay.io/trustyai/trustyai-service-operator:latest"
DEFAULT_LMEVAL="quay.io/opendatahub/ta-lmes-job:odh-3.4-ea2"

# Selections collected from the command line, one "component:image" entry
# per image flag, or the single entry "all" after --all.
SELECTED=()

#######################################
# Parse CLI flags.
# Globals:   SELECTED (appended to / reset by --all), YES (written),
#            DEFAULT_HUB, DEFAULT_OPERATOR, DEFAULT_LMEVAL (read)
# Arguments: the script's command-line arguments
# Outputs:   diagnostics on stderr for invalid flags
# Returns:   0 on success; exits the shell with status 1 on an unknown
#            flag or an image flag with an empty value (e.g. "--hub=")
#######################################
parse_cli_flags() {
  local arg
  for arg in "$@"; do
    case "$arg" in
      # An empty value would select a component with no image at all, so
      # it is rejected instead of being silently dropped later on.
      --hub= | --operator= | --lmeval=)
        echo "Empty image value in flag: $arg" >&2
        echo "Flags: --hub, --operator, --lmeval, --all, -y/--yes" >&2
        exit 1
        ;;
      --hub=*) SELECTED+=("hub:${arg#--hub=}") ;;
      --hub) SELECTED+=("hub:${DEFAULT_HUB}") ;;
      --operator=*) SELECTED+=("operator:${arg#--operator=}") ;;
      --operator) SELECTED+=("operator:${DEFAULT_OPERATOR}") ;;
      --lmeval=*) SELECTED+=("lmeval:${arg#--lmeval=}") ;;
      --lmeval) SELECTED+=("lmeval:${DEFAULT_LMEVAL}") ;;
      --all) SELECTED=("all") ;;
      -y | --yes) YES=true ;;
      *)
        echo "Unknown flag: $arg" >&2
        echo "Flags: --hub, --operator, --lmeval, --all, -y/--yes" >&2
        exit 1
        ;;
    esac
  done
}

# Parse CLI flags
parse_cli_flags "$@"
# Resolve final image values.
# An empty IMG_* value means "component not selected".
IMG_HUB=""
IMG_OPERATOR=""
IMG_LMEVAL=""
if [[ ${#SELECTED[@]} -gt 0 && " ${SELECTED[*]} " != *" all "* ]]; then
  # Explicit selection: each entry is "component:image"; split on the
  # first colon only, because image references contain colons themselves.
  for selection in "${SELECTED[@]}"; do
    case "${selection%%:*}" in
      hub) IMG_HUB="${selection#*:}" ;;
      operator) IMG_OPERATOR="${selection#*:}" ;;
      lmeval) IMG_LMEVAL="${selection#*:}" ;;
    esac
  done
else
  # No image flags, or --all: every component uses its default image.
  IMG_HUB="$DEFAULT_HUB"
  IMG_OPERATOR="$DEFAULT_OPERATOR"
  IMG_LMEVAL="$DEFAULT_LMEVAL"
fi

# Build the list of images to clean from node caches (hub, operator,
# lmeval order; unselected components are left out).
IMAGES=()
for candidate in "$IMG_HUB" "$IMG_OPERATOR" "$IMG_LMEVAL"; do
  if [[ -n "$candidate" ]]; then
    IMAGES+=("$candidate")
  fi
done
# --- Summary and confirmation ---
# Print what is about to happen, one output line per printf argument.
printf '%s\n' \
  "EvalHub redeployment" \
  "====================" \
  "" \
  " Namespace: $NS" \
  "" \
  "This will:" \
  " 1. Remove the following images from ALL worker node caches (crictl rmi):"
for image_ref in "${IMAGES[@]}"; do
  printf '%s\n' " - $image_ref"
done
printf '%s\n' \
  " 2. Delete the EvalHub CR in namespace '$NS'" \
  " 3. Delete legacy RBAC resources (ClusterRoles, bindings, ServiceAccounts)" \
  " 4. Update config/base/params.env with selected images" \
  " 5. Upload operator manifests and restart the operator" \
  " 6. Reapply the EvalHub CRD and CR" \
  " 7. Delete old evaluation jobs in namespace '$NS'" \
  "" \
  "WARNING: Image removal affects ALL worker nodes in the cluster." \
  " Other workloads using these images will re-pull on next restart." \
  ""

# Ask for confirmation unless -y/--yes was given; anything other than
# "y" or "yes" (case-insensitive) aborts with exit status 0.
if [[ "$YES" != true ]]; then
  read -r -p "Proceed? [y/N] " confirm
  if [[ ! "$confirm" =~ ^([yY][eE][sS]|[yY])$ ]]; then
    echo "Aborted."
    exit 0
  fi
  echo ""
fi
# --- Clean cached images from all worker nodes ---

#######################################
# Build the shell snippet that removes cached images on one node.
# Each image reference is shell-quoted with printf %q so that a custom
# reference cannot be word-split or inject commands into the remote
# "bash -c" invocation.
# Arguments: zero or more image references
# Outputs:   "crictl rmi <image> 2>/dev/null || true; " per image, written
#            to stdout without a trailing newline (nothing for no images)
#######################################
build_rmi_commands() {
  local img
  local cmds=""
  for img in "$@"; do
    cmds+="crictl rmi $(printf '%q' "$img") 2>/dev/null || true; "
  done
  printf '%s' "$cmds"
}

echo "=== Cleaning cached images from worker nodes ==="
NODES=$(oc get nodes -l node-role.kubernetes.io/worker -o jsonpath='{.items[*].metadata.name}')
if [ -z "$NODES" ]; then
  echo "No worker nodes found, skipping image cleanup"
else
  # The command list is identical for every node, so build it once.
  # The ${IMAGES[@]+...} form keeps an empty list safe under `set -u`
  # on Bash versions older than 4.4.
  CMDS="$(build_rmi_commands ${IMAGES[@]+"${IMAGES[@]}"})"
  # NODES is a whitespace-separated list of node names; word splitting
  # is intentional here.
  for NODE in $NODES; do
    echo " Node: $NODE"
    # Best-effort: a node that cannot be debugged must not stop the run.
    oc debug "node/$NODE" --quiet -- chroot /host bash -c "$CMDS" 2>/dev/null || true
  done
fi
echo "Image cleanup done"
echo ""
# --- Validate repo root ---
# Checked before the CR is deleted: a wrong working directory then aborts
# the run while the existing EvalHub CR is still in place.
if [ ! -d "$REPO_ROOT/config/crd" ]; then
  echo "ERROR: Must be run from the root of a trustyai-service-operator clone." >&2
  echo " Expected to find config/crd/ in $(pwd)" >&2
  exit 1
fi

# --- Step 2: Delete existing EvalHub CR ---
echo "=== Deleting EvalHub CR ==="
kubectl delete evalhub evalhub -n "$NS" --ignore-not-found=true
echo ""
| # --- Cleanup legacy RBAC (best-effort: every delete below uses --ignore-not-found and ignores errors) --- | |
| echo "=== Cleaning up legacy RBAC ===" | |
| # Legacy ClusterRoles, deleted by exact name | |
| for CR in \ | |
| evalhub-proxy-role \ | |
| evalhub-jobs-proxy-role \ | |
| trustyai-service-operator-evalhub-proxy-role \ | |
| trustyai-service-operator-evalhub-jobs-proxy-role \ | |
| evalhub-api-role \ | |
| evalhub-jobs-api-role \ | |
| evalhub-resource-manager \ | |
| trustyai-service-operator-evalhub-api-role \ | |
| trustyai-service-operator-evalhub-jobs-api-role \ | |
| trustyai-service-operator-evalhub-resource-manager; do | |
| kubectl delete clusterrole "$CR" --ignore-not-found=true 2>/dev/null || true | |
| done | |
| # Legacy monolithic resource-manager ClusterRoleBindings, deleted by exact name | |
| for CRB in \ | |
| evalhub-resource-manager-binding \ | |
| trustyai-service-operator-evalhub-resource-manager-binding; do | |
| kubectl delete clusterrolebinding "$CRB" --ignore-not-found=true 2>/dev/null || true | |
| done | |
| # ClusterRoleBindings referencing removed resource-manager ClusterRoles: python3 filters the JSON listing for roleRef.kind == ClusterRole with roleRef.name in the two removed names | |
| for CRB in $(kubectl get clusterrolebindings -o json 2>/dev/null | \ | |
| python3 -c 'import json,sys; d=json.load(sys.stdin); bad={"evalhub-resource-manager","trustyai-service-operator-evalhub-resource-manager"}; \ | |
| [print(i["metadata"]["name"]) for i in d.get("items",[]) if i.get("roleRef",{}).get("kind")=="ClusterRole" and i.get("roleRef",{}).get("name") in bad]' 2>/dev/null); do | |
| echo " Deleting CRB referencing removed resource-manager ClusterRole: ${CRB}" | |
| kubectl delete clusterrolebinding "${CRB}" --ignore-not-found=true 2>/dev/null || true | |
| done | |
| # Legacy ClusterRoleBindings (pattern: *-{namespace}-*-proxy-rolebinding): name ends in -proxy-rolebinding and contains -$NS- | |
| for CRB in $(kubectl get clusterrolebindings -o name 2>/dev/null | grep -- '-proxy-rolebinding$' | grep -- "-${NS}-"); do | |
| echo " Deleting legacy ClusterRoleBinding: ${CRB}" | |
| kubectl delete "${CRB}" --ignore-not-found=true 2>/dev/null || true | |
| done | |
| # Legacy ClusterRoleBindings (pattern: {namespace}-*-jobs-proxy): name starts with $NS- and ends in -jobs-proxy | |
| for CRB in $(kubectl get clusterrolebindings -o name 2>/dev/null | grep -- '-jobs-proxy$' | grep "^clusterrolebinding.rbac.authorization.k8s.io/${NS}-"); do | |
| echo " Deleting legacy jobs ClusterRoleBinding: ${CRB}" | |
| kubectl delete "${CRB}" --ignore-not-found=true 2>/dev/null || true | |
| done | |
| # Legacy RoleBindings in namespace $NS (pattern: *-jobs-proxy-rolebinding) | |
| for RB in $(kubectl get rolebindings -n "$NS" -o name 2>/dev/null | grep -- '-jobs-proxy-rolebinding$'); do | |
| echo " Deleting legacy jobs proxy RoleBinding: ${RB}" | |
| kubectl delete "${RB}" -n "$NS" --ignore-not-found=true 2>/dev/null || true | |
| done | |
| # Stale cross-namespace RoleBindings (any RoleBinding in opendatahub whose name contains evalhub-jobs-mlflow) | |
| for RB in $(kubectl get rolebindings -n opendatahub -o name 2>/dev/null | grep 'evalhub-jobs-mlflow' || true); do | |
| echo " Deleting stale cross-namespace RoleBinding: ${RB}" | |
| kubectl delete "${RB}" -n opendatahub --ignore-not-found=true 2>/dev/null || true | |
| done | |
| # Legacy proxy ServiceAccounts in $NS (names ending in -proxy, excluding those containing kube-rbac-proxy) | |
| for SA in $(kubectl get serviceaccounts -n "$NS" -o name 2>/dev/null | grep -- '-proxy$' | grep -v 'kube-rbac-proxy'); do | |
| echo " Deleting legacy proxy ServiceAccount: ${SA}" | |
| kubectl delete "${SA}" -n "$NS" --ignore-not-found=true 2>/dev/null || true | |
| done | |
| # Legacy MLFlow proxy RoleBindings in $NS (names ending in -mlflow-proxy) | |
| for RB in $(kubectl get rolebindings -n "$NS" -o name 2>/dev/null | grep -- '-mlflow-proxy$'); do | |
| echo " Deleting legacy MLFlow proxy RoleBinding: ${RB}" | |
| kubectl delete "${RB}" -n "$NS" --ignore-not-found=true 2>/dev/null || true | |
| done | |
| # Legacy resource-manager RoleBindings in $NS (names ending in -resource-manager) | |
| for RB in $(kubectl get rolebindings -n "$NS" -o name 2>/dev/null | grep -- '-resource-manager$'); do | |
| echo " Deleting legacy resource-manager RoleBinding: ${RB}" | |
| kubectl delete "${RB}" -n "$NS" --ignore-not-found=true 2>/dev/null || true | |
| done | |
| echo "Legacy RBAC cleanup done" | |
| echo "" | |
# --- Update params.env with selected images ---
echo "=== Updating params.env ==="

#######################################
# Rewrite the image entries of one params.env file in place.
# Only components whose IMG_* variable is non-empty are touched; the
# other entries are left as they are.
# Globals:   IMG_OPERATOR, IMG_HUB, IMG_LMEVAL (read)
# Arguments: $1 - path to a params.env file
# Outputs:   the file path, or a warning when the file does not exist
# Returns:   0 when the file is missing or when every requested
#            substitution ran; the status of a failing sed otherwise
#######################################
update_params_env() {
  local file="$1"
  if [ ! -f "$file" ]; then
    echo " WARNING: $file not found, skipping"
    return 0
  fi
  echo " $file"
  # Plain `if` blocks instead of `[ -n ... ] && sed ...`: with the short
  # form an unselected component left the function with status 1, which
  # made `set -e` abort the whole script at the call site.
  if [ -n "${IMG_OPERATOR:-}" ]; then
    sed -i "s|^trustyaiOperatorImage=.*|trustyaiOperatorImage=${IMG_OPERATOR}|" "$file"
  fi
  if [ -n "${IMG_HUB:-}" ]; then
    sed -i "s|^evalHubImage=.*|evalHubImage=${IMG_HUB}|" "$file"
  fi
  if [ -n "${IMG_LMEVAL:-}" ]; then
    sed -i "s|^lmes-pod-image=.*|lmes-pod-image=${IMG_LMEVAL}|" "$file"
  fi
  return 0
}

update_params_env "${REPO_ROOT:-}/config/base/params.env"
update_params_env "${REPO_ROOT:-}/config/overlays/odh/params.env"

# Echo the values that were requested for the files above.
if [ -n "${IMG_OPERATOR:-}" ]; then
  echo " trustyaiOperatorImage=${IMG_OPERATOR}"
fi
if [ -n "${IMG_HUB:-}" ]; then
  echo " evalHubImage=${IMG_HUB}"
fi
if [ -n "${IMG_LMEVAL:-}" ]; then
  echo " lmes-pod-image=${IMG_LMEVAL}"
fi
echo ""
| # --- Upload operator manifests: mount a PVC into the operator pod via a CSV patch, then copy config/ into it --- | |
| echo "=== Uploading operator manifests ===" | |
| OPERATOR_NS="openshift-operators" | |
| OPERATOR_LABEL="name=opendatahub-operator" | |
| CSV_PATTERN="opendatahub-operator" | |
| COMPONENT="trustyai" | |
| MANIFESTS_DIR="$REPO_ROOT/config" | |
| PVC_NAME="${COMPONENT}-manifests" | |
| MOUNT_PATH="/opt/manifests/${COMPONENT}" | |
| VOLUME_NAME="${COMPONENT}-manifests" | |
| # Create the PVC (1Gi, ReadWriteOnce, in $OPERATOR_NS) if it doesn't exist | |
| if oc get pvc "$PVC_NAME" -n "$OPERATOR_NS" &>/dev/null; then | |
| echo " PVC '$PVC_NAME' already exists, reusing" | |
| else | |
| echo " Creating PVC '$PVC_NAME'..." | |
| oc apply -f - <<PVCEOF | |
| apiVersion: v1 | |
| kind: PersistentVolumeClaim | |
| metadata: | |
| name: ${PVC_NAME} | |
| namespace: ${OPERATOR_NS} | |
| spec: | |
| accessModes: | |
| - ReadWriteOnce | |
| resources: | |
| requests: | |
| storage: 1Gi | |
| PVCEOF | |
| fi | |
| # Find the first CSV in $OPERATOR_NS whose name matches $CSV_PATTERN; abort if none is found | |
| CSV_NAME=$(oc get csv -n "$OPERATOR_NS" -o name 2>/dev/null | grep "$CSV_PATTERN" | head -1 | cut -d/ -f2) | |
| if [ -z "$CSV_NAME" ]; then | |
| echo "ERROR: No CSV found matching '$CSV_PATTERN' in $OPERATOR_NS" | |
| exit 1 | |
| fi | |
| echo " Found CSV: $CSV_NAME" | |
| # Look up the index of an existing $VOLUME_NAME volumeMount (first container) and volume in the CSV's first deployment; -1 means absent or lookup failed, and the matching remove op below is then skipped | |
| CSV_JSON=$(oc get csv "$CSV_NAME" -n "$OPERATOR_NS" -o json) | |
| VM_INDEX=$(echo "$CSV_JSON" | python3 -c " | |
| import json,sys; d=json.load(sys.stdin) | |
| vms=d['spec']['install']['spec']['deployments'][0]['spec']['template']['spec'].get('containers',[{}])[0].get('volumeMounts',[]) | |
| idx=[i for i,v in enumerate(vms) if v.get('name')=='$VOLUME_NAME'] | |
| print(idx[0] if idx else -1) | |
| " 2>/dev/null || echo "-1") | |
| VOL_INDEX=$(echo "$CSV_JSON" | python3 -c " | |
| import json,sys; d=json.load(sys.stdin) | |
| vols=d['spec']['install']['spec']['deployments'][0]['spec']['template']['spec'].get('volumes',[]) | |
| idx=[i for i,v in enumerate(vols) if v.get('name')=='$VOLUME_NAME'] | |
| print(idx[0] if idx else -1) | |
| " 2>/dev/null || echo "-1") | |
| PATCH='[ | |
| {"op":"replace","path":"/spec/install/spec/deployments/0/spec/replicas","value":1}, | |
| {"op":"replace","path":"/spec/install/spec/deployments/0/spec/strategy/type","value":"Recreate"}, | |
| {"op":"add","path":"/spec/install/spec/deployments/0/spec/template/spec/securityContext","value":{"fsGroup":65532}}' | |
| [ "$VM_INDEX" != "-1" ] && PATCH+=",{\"op\":\"remove\",\"path\":\"/spec/install/spec/deployments/0/spec/template/spec/containers/0/volumeMounts/$VM_INDEX\"}" | |
| [ "$VOL_INDEX" != "-1" ] && PATCH+=",{\"op\":\"remove\",\"path\":\"/spec/install/spec/deployments/0/spec/template/spec/volumes/$VOL_INDEX\"}" | |
| PATCH+=",{\"op\":\"add\",\"path\":\"/spec/install/spec/deployments/0/spec/template/spec/containers/0/volumeMounts/-\",\"value\":{\"name\":\"$VOLUME_NAME\",\"mountPath\":\"$MOUNT_PATH\"}}" | |
| PATCH+=",{\"op\":\"add\",\"path\":\"/spec/install/spec/deployments/0/spec/template/spec/volumes/-\",\"value\":{\"name\":\"$VOLUME_NAME\",\"persistentVolumeClaim\":{\"claimName\":\"$PVC_NAME\"}}}" | |
| PATCH+=']' | |
| echo " Patching CSV..." | |
| echo "$PATCH" | oc patch csv "$CSV_NAME" -n "$OPERATOR_NS" --type json --patch-file /dev/stdin | |
| # Wait up to 300s for pods labelled $OPERATOR_LABEL to report Ready | |
| echo " Waiting for operator pod..." | |
| oc wait --for=jsonpath='{.status.conditions[?(@.type=="Ready")].status}'=True \ | |
| pod -l "$OPERATOR_LABEL" -n "$OPERATOR_NS" --timeout=300s | |
| # Copy manifests into the first listed operator pod, after clearing the mount path (including dotfiles) | |
| POD_NAME=$(oc get pod -l "$OPERATOR_LABEL" -n "$OPERATOR_NS" -o jsonpath='{.items[0].metadata.name}') | |
| echo " Clearing existing manifests in pod..." | |
| oc exec -n "$OPERATOR_NS" "$POD_NAME" -- sh -c "rm -rf ${MOUNT_PATH}/* ${MOUNT_PATH}/.[!.]* 2>/dev/null || true" | |
| echo " Copying manifests to pod..." | |
| oc cp "${MANIFESTS_DIR}/." "${OPERATOR_NS}/${POD_NAME}:${MOUNT_PATH}" | |
| # Patch manager.yaml to set imagePullPolicy: Always, only when the file could be read and contains no imagePullPolicy yet | |
| MANAGER_YAML="${MOUNT_PATH}/manager/manager.yaml" | |
| MANAGER_CONTENT=$(oc exec -n "$OPERATOR_NS" "$POD_NAME" -- cat "$MANAGER_YAML" 2>/dev/null || echo "") | |
| if [ -n "$MANAGER_CONTENT" ] && ! echo "$MANAGER_CONTENT" | grep -q 'imagePullPolicy:'; then | |
| echo " Setting imagePullPolicy: Always in manager.yaml..." | |
| echo "$MANAGER_CONTENT" | sed 's/^\(\s*image:\s.*\)$/\1\n imagePullPolicy: Always/' | \ | |
| oc exec -n "$OPERATOR_NS" "$POD_NAME" -i -- sh -c "cat > $MANAGER_YAML" | |
| fi | |
| echo "Manifests uploaded" | |
| echo "" | |
| # --- Restart operator (summary step 5): restart deployments labelled $OPERATOR_LABEL and wait up to 300s for the rollout --- | |
| echo "=== Restarting operator ===" | |
| oc rollout restart deployment -n "$OPERATOR_NS" -l "$OPERATOR_LABEL" | |
| oc rollout status deployment -n "$OPERATOR_NS" -l "$OPERATOR_LABEL" --timeout=300s | |
| echo "" | |
| # --- Step 6: Apply EvalHub CRD --- | |
| echo "=== Applying EvalHub CRD ===" | |
| kubectl apply -f "$REPO_ROOT/config/crd/bases/trustyai.opendatahub.io_evalhubs.yaml" | |
| echo "" | |
| # --- Step 6 (continued): Deploy EvalHub CR --- | |
| echo "=== Deploying EvalHub CR ===" | |
| kubectl apply -f - <<EOF | |
| apiVersion: trustyai.opendatahub.io/v1alpha1 | |
| kind: EvalHub | |
| metadata: | |
| name: evalhub | |
| namespace: ${NS} | |
| spec: | |
| replicas: 1 | |
| database: | |
| secret: evalhub-db-credentials | |
| env: | |
| - name: MLFLOW_TRACKING_URI | |
| value: "https://mlflow.opendatahub.svc.cluster.local:8443" | |
| EOF | |
| echo "" | |
| # --- Step 7: Delete old evaluation jobs (label app=evalhub) --- | |
| echo "=== Deleting old evaluation jobs ===" | |
| kubectl delete jobs -n "$NS" -l app=evalhub --ignore-not-found=true | |
| echo "" | |
| # --- Final check: pause a fixed 10 seconds (no readiness polling), then print CR, pod, service and route status --- | |
| echo "Waiting for EvalHub to be ready..." | |
| sleep 10 | |
| echo "" | |
| echo "=== EvalHub Status ===" | |
| kubectl get evalhub evalhub -n "$NS" 2>/dev/null || echo "EvalHub CR not found" | |
| echo "" | |
| kubectl get pods -n "$NS" | grep evalhub || echo "No evalhub pods found" | |
| echo "" | |
| kubectl get svc -n "$NS" | grep evalhub || echo "No evalhub services found" | |
| echo "" | |
| oc get route -n "$NS" | grep evalhub || echo "No evalhub routes found" |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment