Skip to content

Instantly share code, notes, and snippets.

@jimangel
Created March 1, 2026 18:45
Show Gist options
  • Select an option

  • Save jimangel/9976fac4fb02e473a2df1fdaf5995512 to your computer and use it in GitHub Desktop.

Select an option

Save jimangel/9976fac4fb02e473a2df1fdaf5995512 to your computer and use it in GitHub Desktop.
1,000 kueue job test

What?

This submits 1,000 "sleep" workloads to kueue into 2 local queues.

Pre Reqs / helpers

  • KinD cluster
cat <<'EOF' > kind-config.yaml
kind: Cluster
apiVersion: kind.x-k8s.io/v1alpha4
nodes:
- role: control-plane
  kubeadmConfigPatches:
  - |
    kind: KubeletConfiguration
    maxPods: 250
- role: worker
- role: worker
- role: worker
EOF

kind create cluster --name kueue-testing --config kind-config.yaml
  • Running the dev / commit version of kueue
make deploy
make kind-image-build
kind load docker-image --name kueue-testing ...
kubectl -n kueue-system patch deployment kueue-controller-manager \
  --type='json' \
  -p='[{"op": "replace", "path": "/spec/template/spec/containers/0/imagePullPolicy", "value": "IfNotPresent"}]'
  • (optional) Ingestion of metrics by prometheus / grafana

I'm shipping from kind to a remote instance, enabled via remote-write:

  prometheus:
    command:
      - '--config.file=/etc/prometheus/prometheus.yml'
      - '--web.external-url=https://prometheus.overcastlab.com'
      - '--web.enable-remote-write-receiver'    # ADD THIS

Add agent RBAC / deployment:

kubectl create ns monitoring

kubectl apply -f - <<'EOF'
---
apiVersion: v1
kind: Namespace
metadata:
  name: monitoring
---
apiVersion: v1
kind: ServiceAccount
metadata:
  name: prometheus-agent
  namespace: monitoring
---
# Permission to scrape metrics via kube-rbac-proxy
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: prometheus-agent-metrics
rules:
  - nonResourceURLs: ["/metrics"]
    verbs: ["get"]
  - apiGroups: [""]
    resources: ["nodes", "nodes/metrics", "services", "endpoints", "pods"]
    verbs: ["get", "list", "watch"]
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: prometheus-agent-metrics
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: prometheus-agent-metrics
subjects:
  - kind: ServiceAccount
    name: prometheus-agent
    namespace: monitoring
---
# Also bind the kueue metrics-reader role if it exists
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: prometheus-agent-kueue-metrics
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: kueue-metrics-reader
subjects:
  - kind: ServiceAccount
    name: prometheus-agent
    namespace: monitoring
---
# Long-lived token secret (so we don't need projected tokens for a dev setup)
apiVersion: v1
kind: Secret
metadata:
  name: prometheus-agent-token
  namespace: monitoring
  annotations:
    kubernetes.io/service-account.name: prometheus-agent
type: kubernetes.io/service-account-token
kubectl apply -f - <<'EOF'
---
apiVersion: v1
kind: ConfigMap
metadata:
  name: prometheus-agent-config
  namespace: monitoring
data:
  prometheus.yml: |
    global:
      scrape_interval: 10s
      external_labels:
        cluster: "local-kind-kueue"
        source: "prometheus-agent"

    scrape_configs:
      - job_name: "kueue-controller"
        scheme: https
        tls_config:
          insecure_skip_verify: true
        authorization:
          credentials_file: /var/run/secrets/kubernetes.io/serviceaccount/token
        # Scrape the kueue metrics service directly via cluster DNS
        static_configs:
          - targets:
              - "kueue-controller-manager-metrics-service.kueue-system.svc.cluster.local:8443"

    remote_write:
      - url: "https://prometheus.overcastlab.com/api/v1/write"
        tls_config:
          insecure_skip_verify: true
---
apiVersion: apps/v1
kind: Deployment
metadata:
  name: prometheus-agent
  namespace: monitoring
  labels:
    app: prometheus-agent
spec:
  replicas: 1
  selector:
    matchLabels:
      app: prometheus-agent
  template:
    metadata:
      labels:
        app: prometheus-agent
    spec:
      serviceAccountName: prometheus-agent
      containers:
        - name: prometheus
          image: prom/prometheus:latest
          args:
            - "--enable-feature=agent"
            - "--config.file=/etc/prometheus/prometheus.yml"
            - "--web.listen-address=:9090"
          ports:
            - containerPort: 9090
          volumeMounts:
            - name: config
              mountPath: /etc/prometheus
              readOnly: true
          resources:
            requests:
              cpu: 50m
              memory: 128Mi
            limits:
              memory: 256Mi
      volumes:
        - name: config
          configMap:
            name: prometheus-agent-config
EOF
  • WaitForPods Ready enabled
kubectl get cm -n kueue-system kueue-manager-config -o json | python3 -c "
import json,sys,re
d=json.load(sys.stdin)
k='controller_manager_config.yaml'
lines=d['data'][k].split('\n')
out=[]
i=0
while i<len(lines):
    l=lines[i]
    if re.match(r'^#waitForPodsReady:',l):
        out.append(l[1:])
        i+=1
        while i<len(lines) and re.match(r'^#[ \t]',lines[i]):
            out.append(lines[i][1:])
            i+=1
    else:
        out.append(l)
        i+=1
d['data'][k]='\n'.join(out)
print(json.dumps(d))
" | kubectl replace -f -

kubectl rollout restart deploy kueue-controller-manager -n kueue-system
  • LQ metrics feature gate enabled
kubectl get cm -n kueue-system kueue-manager-config -o json | python3 -c "
import json,sys,re
d=json.load(sys.stdin)
k='controller_manager_config.yaml'
cfg=d['data'][k]
if 'featureGates:' not in cfg:
    cfg=re.sub(r'(kind: Configuration\n)',r'\1featureGates:\n  LocalQueueMetrics: true\n',cfg)
d['data'][k]=cfg
print(json.dumps(d))
" | kubectl replace -f -

kubectl rollout restart deploy kueue-controller-manager -n kueue-system

Lastly, ensure the queue used has a good amount of nomial resources:

kubectl apply -f ~/kueue/examples/admin/single-clusterqueue-setup.yaml

# edit...
#!/bin/bash
NUM_JOBS=1000
QUEUE_1_NAME="user-queue"
QUEUE_1_NS="default"
QUEUE_2_NAME="user-queue-2" QUEUE_2_NS="team-b"
echo "Ensuring namespace '$QUEUE_2_NS' exists..."
kubectl get namespace "$QUEUE_2_NS" &>/dev/null || kubectl create namespace "$QUEUE_2_NS"
echo "Ensuring LocalQueue '$QUEUE_2_NAME' in namespace '$QUEUE_2_NS' exists..."
cat <<EOF | kubectl apply -f -
apiVersion: kueue.x-k8s.io/v1beta2
kind: LocalQueue
metadata:
namespace: "$QUEUE_2_NS"
name: "$QUEUE_2_NAME"
spec:
clusterQueue: "cluster-queue"
EOF
echo "Submitting $NUM_JOBS mixed-workload jobs split across two LocalQueues..."
for i in $(seq 1 $NUM_JOBS); do
# Randomize CPU between 10m and 100m
CPU=$(( (RANDOM % 10 + 1) * 10 ))m
# Randomize Memory between 50Mi and 250Mi
MEM=10Mi
#MEM=$(( (RANDOM % 21 + 5) * 10 ))Mi
# Randomize sleep duration between 10s and 45s
DURATION=$(( RANDOM % 31 + 120 ))
# Alternate jobs between the two LocalQueues to exercise both metrics
if (( i % 2 == 0 )); then
QUEUE_NAME="$QUEUE_1_NAME"
NAMESPACE="$QUEUE_1_NS"
else
QUEUE_NAME="$QUEUE_2_NAME"
NAMESPACE="$QUEUE_2_NS"
fi
cat <<EOF | kubectl create -f -
apiVersion: batch/v1
kind: Job
metadata:
name: demo-job-$i
namespace: $NAMESPACE
labels:
kueue.x-k8s.io/queue-name: $QUEUE_NAME
app: kueue-demo
spec:
parallelism: 1
completions: 1
template:
spec:
containers:
- name: dummy-workload
image: busybox:1.36
command: ["sleep", "$DURATION"]
resources:
requests:
cpu: $CPU
memory: $MEM
restartPolicy: Never
EOF
sleep 0.1
done
echo "Finished submitting $NUM_JOBS jobs!"
echo ""
echo "To verify metrics, run:"
echo " kubectl port-forward -n kueue-system svc/kueue-controller-manager-metrics-service 8443:8443"
echo " curl -sk https://localhost:8443/metrics | grep -E 'kueue_pods_ready_workloads|kueue_local_queue_pods_ready_workloads'"
{
"annotations": {
"list": [
{
"builtIn": 1,
"datasource": {
"type": "datasource",
"uid": "grafana"
},
"enable": true,
"hide": true,
"iconColor": "rgba(0, 211, 255, 1)",
"name": "Annotations & Alerts",
"target": {
"limit": 100,
"matchAny": false,
"tags": [],
"type": "dashboard"
},
"type": "dashboard"
}
]
},
"editable": true,
"fiscalYearStartMonth": 0,
"graphTooltip": 1,
"links": [],
"panels": [
{
"collapsed": false,
"gridPos": {
"h": 1,
"w": 24,
"x": 0,
"y": 0
},
"id": 100,
"panels": [],
"title": "Overview",
"type": "row"
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "text",
"value": 0
}
]
}
},
"overrides": []
},
"gridPos": {
"h": 3,
"w": 4,
"x": 0,
"y": 1
},
"id": 1,
"options": {
"colorMode": "value",
"graphMode": "none",
"justifyMode": "auto",
"orientation": "auto",
"percentChangeColorMode": "standard",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "/^git_version$/",
"values": false
},
"showPercentChange": false,
"textMode": "value",
"wideLayout": true
},
"pluginVersion": "12.4.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"editorMode": "code",
"expr": "kueue_build_info",
"format": "table",
"instant": true,
"legendFormat": "__auto",
"refId": "A"
}
],
"title": "Kueue Version",
"type": "stat"
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"mappings": [
{
"options": {
"0": {
"color": "red",
"text": "Inactive"
},
"1": {
"color": "green",
"text": "Active"
}
},
"type": "value"
}
],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "red",
"value": 0
},
{
"color": "green",
"value": 1
}
]
}
},
"overrides": []
},
"gridPos": {
"h": 3,
"w": 4,
"x": 4,
"y": 1
},
"id": 2,
"options": {
"colorMode": "background",
"graphMode": "none",
"justifyMode": "auto",
"orientation": "auto",
"percentChangeColorMode": "standard",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"showPercentChange": false,
"textMode": "auto",
"wideLayout": true
},
"pluginVersion": "12.4.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"editorMode": "code",
"expr": "leader_election_master_status",
"instant": true,
"legendFormat": "{{name}}",
"refId": "A"
}
],
"title": "Leader Election",
"type": "stat"
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "blue",
"value": 0
}
]
}
},
"overrides": []
},
"gridPos": {
"h": 3,
"w": 4,
"x": 8,
"y": 1
},
"id": 3,
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "auto",
"orientation": "auto",
"percentChangeColorMode": "standard",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"showPercentChange": false,
"textMode": "auto",
"wideLayout": true
},
"pluginVersion": "12.4.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"editorMode": "code",
"expr": "sum(kueue_admitted_active_workloads{cluster_queue=~\"$cluster_queue\"})",
"legendFormat": "Admitted Active",
"range": true,
"refId": "A"
}
],
"title": "Admitted Active Workloads",
"type": "stat"
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "orange",
"value": 0
}
]
}
},
"overrides": []
},
"gridPos": {
"h": 3,
"w": 4,
"x": 12,
"y": 1
},
"id": 4,
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "auto",
"orientation": "auto",
"percentChangeColorMode": "standard",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"showPercentChange": false,
"textMode": "auto",
"wideLayout": true
},
"pluginVersion": "12.4.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"editorMode": "code",
"expr": "sum(kueue_pending_workloads{cluster_queue=~\"$cluster_queue\", status=\"active\"})",
"legendFormat": "Active Pending",
"range": true,
"refId": "A"
}
],
"title": "Pending Workloads (Active)",
"type": "stat"
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "red",
"value": 0
}
]
}
},
"overrides": []
},
"gridPos": {
"h": 3,
"w": 4,
"x": 16,
"y": 1
},
"id": 5,
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "auto",
"orientation": "auto",
"percentChangeColorMode": "standard",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"showPercentChange": false,
"textMode": "auto",
"wideLayout": true
},
"pluginVersion": "12.4.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"editorMode": "code",
"expr": "sum(kueue_pending_workloads{cluster_queue=~\"$cluster_queue\", status=\"inadmissible\"})",
"legendFormat": "Inadmissible",
"range": true,
"refId": "A"
}
],
"title": "Inadmissible Workloads",
"type": "stat"
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "purple",
"value": 0
}
]
}
},
"overrides": []
},
"gridPos": {
"h": 3,
"w": 4,
"x": 20,
"y": 1
},
"id": 9,
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "auto",
"orientation": "auto",
"percentChangeColorMode": "standard",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"showPercentChange": false,
"textMode": "auto",
"wideLayout": true
},
"pluginVersion": "12.4.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"editorMode": "code",
"expr": "sum(kueue_reserving_active_workloads{cluster_queue=~\"$cluster_queue\"})",
"legendFormat": "Reserving",
"range": true,
"refId": "A"
}
],
"title": "Reserving Active",
"type": "stat"
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"mappings": [
{
"options": {
"0": {
"color": "red",
"text": "Inactive"
},
"1": {
"color": "green",
"text": "Active"
}
},
"type": "value"
}
],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "red",
"value": 0
},
{
"color": "green",
"value": 1
}
]
}
},
"overrides": []
},
"gridPos": {
"h": 4,
"w": 12,
"x": 0,
"y": 4
},
"id": 6,
"options": {
"colorMode": "background",
"graphMode": "none",
"justifyMode": "auto",
"orientation": "horizontal",
"percentChangeColorMode": "standard",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"showPercentChange": false,
"textMode": "auto",
"wideLayout": true
},
"pluginVersion": "12.4.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"editorMode": "code",
"expr": "kueue_cluster_queue_status{status=\"active\", cluster_queue=~\"$cluster_queue\"}",
"instant": true,
"legendFormat": "{{cluster_queue}}",
"refId": "A"
}
],
"title": "ClusterQueue Status",
"type": "stat"
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": 0
}
]
}
},
"overrides": []
},
"gridPos": {
"h": 4,
"w": 6,
"x": 12,
"y": 4
},
"id": 7,
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "auto",
"orientation": "auto",
"percentChangeColorMode": "standard",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"showPercentChange": true,
"textMode": "auto",
"wideLayout": true
},
"pluginVersion": "12.4.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"editorMode": "code",
"expr": "sum(increase(kueue_admitted_workloads_total{cluster_queue=~\"$cluster_queue\"}[$__range]))",
"legendFormat": "Total Admitted",
"range": true,
"refId": "A"
}
],
"title": "Total Admitted",
"type": "stat"
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": 0
}
]
}
},
"overrides": []
},
"gridPos": {
"h": 4,
"w": 6,
"x": 18,
"y": 4
},
"id": 8,
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "auto",
"orientation": "auto",
"percentChangeColorMode": "standard",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"showPercentChange": true,
"textMode": "auto",
"wideLayout": true
},
"pluginVersion": "12.4.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"editorMode": "code",
"expr": "sum(increase(kueue_finished_workloads_total{cluster_queue=~\"$cluster_queue\"}[$__range]))",
"legendFormat": "Total Finished",
"range": true,
"refId": "A"
}
],
"title": "Total Finished",
"type": "stat"
},
{
"collapsed": false,
"gridPos": {
"h": 1,
"w": 24,
"x": 0,
"y": 8
},
"id": 101,
"panels": [],
"title": "Workload Pipeline",
"type": "row"
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"barWidthFactor": 0.6,
"drawStyle": "line",
"fillOpacity": 20,
"gradientMode": "opacity",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"insertNulls": false,
"lineInterpolation": "smooth",
"lineWidth": 2,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "never",
"showValues": false,
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"noValue": "0",
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": 0
}
]
}
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 9
},
"id": 10,
"options": {
"legend": {
"calcs": [
"mean",
"max",
"lastNotNull"
],
"displayMode": "table",
"placement": "bottom",
"showLegend": true
},
"tooltip": {
"hideZeros": false,
"mode": "multi",
"sort": "desc"
}
},
"pluginVersion": "12.4.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"editorMode": "code",
"expr": "sum by (cluster_queue) (kueue_pending_workloads{cluster_queue=~\"$cluster_queue\", status=\"active\"})",
"legendFormat": "{{cluster_queue}} - pending (active)",
"range": true,
"refId": "A"
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"editorMode": "code",
"expr": "sum by (cluster_queue) (kueue_pending_workloads{cluster_queue=~\"$cluster_queue\", status=\"inadmissible\"}) or on() vector(0)",
"legendFormat": "{{cluster_queue}} - pending (inadmissible)",
"range": true,
"refId": "B"
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"editorMode": "code",
"expr": "sum by (cluster_queue) (kueue_admitted_active_workloads{cluster_queue=~\"$cluster_queue\"}) or on() vector(0)",
"legendFormat": "{{cluster_queue}} - admitted active",
"range": true,
"refId": "C"
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"editorMode": "code",
"expr": "sum by (cluster_queue) (kueue_reserving_active_workloads{cluster_queue=~\"$cluster_queue\"}) or on() vector(0)",
"legendFormat": "{{cluster_queue}} - reserving",
"range": true,
"refId": "D"
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"editorMode": "code",
"expr": "sum by(cluster_queue, replica_role) (kueue_pods_ready_workloads{cluster_queue=~\"$cluster_queue\"}) or on() vector(0)",
"legendFormat": "{{cluster_queue}} · pods ready ({{replica_role}})",
"range": true,
"refId": "E"
}
],
"title": "Workload States Over Time",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "workloads/sec",
"axisPlacement": "auto",
"barAlignment": 0,
"barWidthFactor": 0.6,
"drawStyle": "line",
"fillOpacity": 10,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"insertNulls": false,
"lineInterpolation": "smooth",
"lineWidth": 2,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "never",
"showValues": false,
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": 0
}
]
}
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 12,
"x": 12,
"y": 9
},
"id": 11,
"options": {
"legend": {
"calcs": [
"mean",
"max"
],
"displayMode": "table",
"placement": "bottom",
"showLegend": true
},
"tooltip": {
"hideZeros": false,
"mode": "multi",
"sort": "desc"
}
},
"pluginVersion": "12.4.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"editorMode": "code",
"expr": "sum by (cluster_queue) (rate(kueue_admitted_workloads_total{cluster_queue=~\"$cluster_queue\"}[5m]))",
"legendFormat": "{{cluster_queue}} - admission rate",
"range": true,
"refId": "A"
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"editorMode": "code",
"expr": "sum by (cluster_queue) (rate(kueue_finished_workloads_total{cluster_queue=~\"$cluster_queue\"}[5m]))",
"legendFormat": "{{cluster_queue}} - completion rate",
"range": true,
"refId": "B"
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"editorMode": "code",
"expr": "sum by (cluster_queue) (rate(kueue_quota_reserved_workloads_total{cluster_queue=~\"$cluster_queue\"}[5m]))",
"legendFormat": "{{cluster_queue}} - quota reservation rate",
"range": true,
"refId": "C"
}
],
"title": "Workload Throughput (rate/sec)",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"barWidthFactor": 0.6,
"drawStyle": "bars",
"fillOpacity": 80,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"insertNulls": false,
"lineInterpolation": "smooth",
"lineWidth": 2,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "never",
"showValues": false,
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "normal"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": 0
}
]
}
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 17
},
"id": 12,
"options": {
"legend": {
"calcs": [
"sum"
],
"displayMode": "table",
"placement": "bottom",
"showLegend": true
},
"tooltip": {
"hideZeros": true,
"mode": "multi",
"sort": "desc"
}
},
"pluginVersion": "12.4.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"editorMode": "code",
"expr": "sum by (cluster_queue) (increase(kueue_admitted_workloads_total{cluster_queue=~\"$cluster_queue\"}[5m]))",
"legendFormat": "{{cluster_queue}} - admitted",
"range": true,
"refId": "A"
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"editorMode": "code",
"expr": "sum by (cluster_queue) (increase(kueue_finished_workloads_total{cluster_queue=~\"$cluster_queue\"}[5m]))",
"legendFormat": "{{cluster_queue}} - finished",
"range": true,
"refId": "B"
}
],
"title": "Workloads Admitted vs Finished (5m buckets)",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"description": "Current count of finished workloads per cluster_queue (gauge).",
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"barWidthFactor": 0.6,
"drawStyle": "line",
"fillOpacity": 10,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"insertNulls": false,
"lineInterpolation": "smooth",
"lineWidth": 2,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "never",
"showValues": false,
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": 0
}
]
}
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 12,
"x": 12,
"y": 17
},
"id": 13,
"options": {
"legend": {
"calcs": [
"mean",
"max",
"lastNotNull"
],
"displayMode": "table",
"placement": "bottom",
"showLegend": true
},
"tooltip": {
"hideZeros": false,
"mode": "multi",
"sort": "desc"
}
},
"pluginVersion": "12.4.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"editorMode": "code",
"expr": "sum by (cluster_queue) (increase(kueue_finished_workloads{cluster_queue=~\"$cluster_queue\"}[$__range]))",
"legendFormat": "{{cluster_queue}} - finished (gauge)",
"range": true,
"refId": "A"
}
],
"title": "Finished Workloads (gauge snapshot)",
"type": "timeseries"
},
{
"collapsed": false,
"gridPos": {
"h": 1,
"w": 24,
"x": 0,
"y": 25
},
"id": 102,
"panels": [],
"title": "Scheduling Efficiency / Latency",
"type": "row"
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"barWidthFactor": 0.6,
"drawStyle": "line",
"fillOpacity": 10,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"insertNulls": false,
"lineInterpolation": "smooth",
"lineWidth": 2,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "never",
"showValues": false,
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": 0
}
]
},
"unit": "s"
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 26
},
"id": 20,
"options": {
"legend": {
"calcs": [
"mean",
"max",
"lastNotNull"
],
"displayMode": "table",
"placement": "bottom",
"showLegend": true
},
"tooltip": {
"hideZeros": false,
"mode": "multi",
"sort": "desc"
}
},
"pluginVersion": "12.4.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"editorMode": "code",
"expr": "histogram_quantile(0.50, sum by (le, cluster_queue) (rate(kueue_admission_wait_time_seconds_bucket{cluster_queue=~\"$cluster_queue\"}[5m])))",
"legendFormat": "{{cluster_queue}} p50",
"range": true,
"refId": "A"
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"editorMode": "code",
"expr": "histogram_quantile(0.95, sum by (le, cluster_queue) (rate(kueue_admission_wait_time_seconds_bucket{cluster_queue=~\"$cluster_queue\"}[5m])))",
"legendFormat": "{{cluster_queue}} p95",
"range": true,
"refId": "B"
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"editorMode": "code",
"expr": "histogram_quantile(0.99, sum by (le, cluster_queue) (rate(kueue_admission_wait_time_seconds_bucket{cluster_queue=~\"$cluster_queue\"}[5m])))",
"legendFormat": "{{cluster_queue}} p99",
"range": true,
"refId": "C"
}
],
"title": "Admission Wait Time (p50 / p95 / p99)",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"barWidthFactor": 0.6,
"drawStyle": "line",
"fillOpacity": 10,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"insertNulls": false,
"lineInterpolation": "smooth",
"lineWidth": 2,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "never",
"showValues": false,
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": 0
}
]
},
"unit": "s"
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 12,
"x": 12,
"y": 26
},
"id": 21,
"options": {
"legend": {
"calcs": [
"mean",
"max",
"lastNotNull"
],
"displayMode": "table",
"placement": "bottom",
"showLegend": true
},
"tooltip": {
"hideZeros": false,
"mode": "multi",
"sort": "desc"
}
},
"pluginVersion": "12.4.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"editorMode": "code",
"expr": "histogram_quantile(0.50, sum by (le, cluster_queue) (rate(kueue_quota_reserved_wait_time_seconds_bucket{cluster_queue=~\"$cluster_queue\"}[5m])))",
"legendFormat": "{{cluster_queue}} p50",
"range": true,
"refId": "A"
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"editorMode": "code",
"expr": "histogram_quantile(0.95, sum by (le, cluster_queue) (rate(kueue_quota_reserved_wait_time_seconds_bucket{cluster_queue=~\"$cluster_queue\"}[5m])))",
"legendFormat": "{{cluster_queue}} p95",
"range": true,
"refId": "B"
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"editorMode": "code",
"expr": "histogram_quantile(0.99, sum by (le, cluster_queue) (rate(kueue_quota_reserved_wait_time_seconds_bucket{cluster_queue=~\"$cluster_queue\"}[5m])))",
"legendFormat": "{{cluster_queue}} p99",
"range": true,
"refId": "C"
}
],
"title": "Quota Reserved Wait Time (p50 / p95 / p99)",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"description": "Time from admission until pods are actually ready. Requires waitForPodsReady.",
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"barWidthFactor": 0.6,
"drawStyle": "line",
"fillOpacity": 10,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"insertNulls": false,
"lineInterpolation": "smooth",
"lineWidth": 2,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "never",
"showValues": false,
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": 0
}
]
},
"unit": "s"
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 34
},
"id": 22,
"options": {
"legend": {
"calcs": [
"mean",
"max",
"lastNotNull"
],
"displayMode": "table",
"placement": "bottom",
"showLegend": true
},
"tooltip": {
"hideZeros": false,
"mode": "multi",
"sort": "desc"
}
},
"pluginVersion": "12.4.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"editorMode": "code",
"expr": "histogram_quantile(0.50, sum by (le, cluster_queue) (rate(kueue_admitted_until_ready_wait_time_seconds_bucket{cluster_queue=~\"$cluster_queue\"}[5m])))",
"legendFormat": "{{cluster_queue}} p50",
"range": true,
"refId": "A"
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"editorMode": "code",
"expr": "histogram_quantile(0.95, sum by (le, cluster_queue) (rate(kueue_admitted_until_ready_wait_time_seconds_bucket{cluster_queue=~\"$cluster_queue\"}[5m])))",
"legendFormat": "{{cluster_queue}} p95",
"range": true,
"refId": "B"
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"editorMode": "code",
"expr": "histogram_quantile(0.99, sum by (le, cluster_queue) (rate(kueue_admitted_until_ready_wait_time_seconds_bucket{cluster_queue=~\"$cluster_queue\"}[5m])))",
"legendFormat": "{{cluster_queue}} p99",
"range": true,
"refId": "C"
}
],
"title": "Admitted → Ready Wait Time (p50 / p95 / p99)",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"description": "Total time from workload creation/requeue until ready. The full pipeline latency.",
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"barWidthFactor": 0.6,
"drawStyle": "line",
"fillOpacity": 10,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"insertNulls": false,
"lineInterpolation": "smooth",
"lineWidth": 2,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "never",
"showValues": false,
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": 0
}
]
},
"unit": "s"
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 12,
"x": 12,
"y": 34
},
"id": 23,
"options": {
"legend": {
"calcs": [
"mean",
"max",
"lastNotNull"
],
"displayMode": "table",
"placement": "bottom",
"showLegend": true
},
"tooltip": {
"hideZeros": false,
"mode": "multi",
"sort": "desc"
}
},
"pluginVersion": "12.4.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"editorMode": "code",
"expr": "histogram_quantile(0.50, sum by (le, cluster_queue) (rate(kueue_ready_wait_time_seconds_bucket{cluster_queue=~\"$cluster_queue\"}[5m])))",
"legendFormat": "{{cluster_queue}} p50",
"range": true,
"refId": "A"
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"editorMode": "code",
"expr": "histogram_quantile(0.95, sum by (le, cluster_queue) (rate(kueue_ready_wait_time_seconds_bucket{cluster_queue=~\"$cluster_queue\"}[5m])))",
"legendFormat": "{{cluster_queue}} p95",
"range": true,
"refId": "B"
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"editorMode": "code",
"expr": "histogram_quantile(0.99, sum by (le, cluster_queue) (rate(kueue_ready_wait_time_seconds_bucket{cluster_queue=~\"$cluster_queue\"}[5m])))",
"legendFormat": "{{cluster_queue}} p99",
"range": true,
"refId": "C"
}
],
"title": "End-to-End Ready Wait Time (p50 / p95 / p99)",
"type": "timeseries"
},
{
"collapsed": false,
"gridPos": {
"h": 1,
"w": 24,
"x": 0,
"y": 42
},
"id": 103,
"panels": [],
"title": "Admission Scheduler",
"type": "row"
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"description": "How long each scheduling cycle takes, split by success vs inadmissible.",
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"barWidthFactor": 0.6,
"drawStyle": "line",
"fillOpacity": 10,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"insertNulls": false,
"lineInterpolation": "smooth",
"lineWidth": 2,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "never",
"showValues": false,
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": 0
}
]
},
"unit": "s"
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 43
},
"id": 30,
"options": {
"legend": {
"calcs": [
"mean",
"max"
],
"displayMode": "table",
"placement": "bottom",
"showLegend": true
},
"tooltip": {
"hideZeros": false,
"mode": "multi",
"sort": "desc"
}
},
"pluginVersion": "12.4.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"editorMode": "code",
"expr": "histogram_quantile(0.50, sum by (le, result) (rate(kueue_admission_attempt_duration_seconds_bucket[5m])))",
"legendFormat": "{{result}} p50",
"range": true,
"refId": "A"
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"editorMode": "code",
"expr": "histogram_quantile(0.95, sum by (le, result) (rate(kueue_admission_attempt_duration_seconds_bucket[5m])))",
"legendFormat": "{{result}} p95",
"range": true,
"refId": "B"
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"editorMode": "code",
"expr": "histogram_quantile(0.99, sum by (le, result) (rate(kueue_admission_attempt_duration_seconds_bucket[5m])))",
"legendFormat": "{{result}} p99",
"range": true,
"refId": "C"
}
],
"title": "Admission Attempt Duration (p50 / p95 / p99)",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"description": "Rate of scheduling attempts. High inadmissible rate may indicate resource pressure or misconfiguration.",
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "attempts/sec",
"axisPlacement": "auto",
"barAlignment": 0,
"barWidthFactor": 0.6,
"drawStyle": "bars",
"fillOpacity": 80,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"insertNulls": false,
"lineInterpolation": "smooth",
"lineWidth": 2,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "never",
"showValues": false,
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "normal"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": 0
}
]
}
},
"overrides": [
{
"matcher": {
"id": "byName",
"options": "inadmissible"
},
"properties": [
{
"id": "color",
"value": {
"fixedColor": "orange",
"mode": "fixed"
}
}
]
},
{
"matcher": {
"id": "byName",
"options": "success"
},
"properties": [
{
"id": "color",
"value": {
"fixedColor": "green",
"mode": "fixed"
}
}
]
}
]
},
"gridPos": {
"h": 8,
"w": 12,
"x": 12,
"y": 43
},
"id": 31,
"options": {
"legend": {
"calcs": [
"sum"
],
"displayMode": "table",
"placement": "bottom",
"showLegend": true
},
"tooltip": {
"hideZeros": true,
"mode": "multi",
"sort": "desc"
}
},
"pluginVersion": "12.4.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"editorMode": "code",
"expr": "sum by (result) (rate(kueue_admission_attempts_total[5m]))",
"legendFormat": "{{result}}",
"range": true,
"refId": "A"
}
],
"title": "Admission Attempts Rate (success vs inadmissible)",
"type": "timeseries"
},
{
"collapsed": false,
"gridPos": {
"h": 1,
"w": 24,
"x": 0,
"y": 51
},
"id": 104,
"panels": [],
"title": "Controller Health",
"type": "row"
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "reconciliations/sec",
"axisPlacement": "auto",
"barAlignment": 0,
"barWidthFactor": 0.6,
"drawStyle": "line",
"fillOpacity": 10,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"insertNulls": false,
"lineInterpolation": "smooth",
"lineWidth": 1,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "never",
"showValues": false,
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": 0
}
]
}
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 52
},
"id": 40,
"options": {
"legend": {
"calcs": [
"mean",
"max"
],
"displayMode": "table",
"placement": "right",
"showLegend": true
},
"tooltip": {
"hideZeros": true,
"mode": "multi",
"sort": "desc"
}
},
"pluginVersion": "12.4.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"editorMode": "code",
"expr": "sum by (controller) (rate(controller_runtime_reconcile_total{controller=~\"job|workload_controller|clusterqueue_controller|localqueue_controller\"}[5m]))",
"legendFormat": "{{controller}}",
"range": true,
"refId": "A"
}
],
"title": "Reconciliation Rate (key controllers)",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "errors/sec",
"axisPlacement": "auto",
"barAlignment": 0,
"barWidthFactor": 0.6,
"drawStyle": "line",
"fillOpacity": 30,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"insertNulls": false,
"lineInterpolation": "smooth",
"lineWidth": 2,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "never",
"showValues": false,
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": 0
},
{
"color": "red",
"value": 0.1
}
]
}
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 12,
"x": 12,
"y": 52
},
"id": 41,
"options": {
"legend": {
"calcs": [
"sum",
"max"
],
"displayMode": "table",
"placement": "right",
"showLegend": true
},
"tooltip": {
"hideZeros": true,
"mode": "multi",
"sort": "desc"
}
},
"pluginVersion": "12.4.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"editorMode": "code",
"expr": "sum by (controller) (rate(controller_runtime_reconcile_errors_total[5m])) > 0",
"legendFormat": "{{controller}}",
"range": true,
"refId": "A"
}
],
"title": "Reconciliation Errors",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"barWidthFactor": 0.6,
"drawStyle": "line",
"fillOpacity": 10,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"insertNulls": false,
"lineInterpolation": "smooth",
"lineWidth": 2,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "never",
"showValues": false,
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": 0
}
]
},
"unit": "s"
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 60
},
"id": 42,
"options": {
"legend": {
"calcs": [
"mean",
"max"
],
"displayMode": "table",
"placement": "right",
"showLegend": true
},
"tooltip": {
"hideZeros": true,
"mode": "multi",
"sort": "desc"
}
},
"pluginVersion": "12.4.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"editorMode": "code",
"expr": "histogram_quantile(0.99, sum by (le, controller) (rate(controller_runtime_reconcile_time_seconds_bucket{controller=~\"job|workload_controller|clusterqueue_controller|localqueue_controller\"}[5m])))",
"legendFormat": "{{controller}} p99",
"range": true,
"refId": "A"
}
],
"title": "Reconciliation Latency p99 (key controllers)",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"barWidthFactor": 0.6,
"drawStyle": "line",
"fillOpacity": 10,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"insertNulls": false,
"lineInterpolation": "smooth",
"lineWidth": 2,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "never",
"showValues": false,
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": 0
}
]
}
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 12,
"x": 12,
"y": 60
},
"id": 43,
"options": {
"legend": {
"calcs": [
"mean",
"max",
"lastNotNull"
],
"displayMode": "table",
"placement": "right",
"showLegend": true
},
"tooltip": {
"hideZeros": true,
"mode": "multi",
"sort": "desc"
}
},
"pluginVersion": "12.4.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"editorMode": "code",
"expr": "controller_runtime_active_workers{controller=~\"job|workload_controller|clusterqueue_controller|localqueue_controller|v1_pod\"}",
"legendFormat": "{{controller}}",
"range": true,
"refId": "A"
}
],
"title": "Active Workers per Controller",
"type": "timeseries"
},
{
"collapsed": false,
"gridPos": {
"h": 1,
"w": 24,
"x": 0,
"y": 68
},
"id": 105,
"panels": [],
"title": "Work Queue Internals",
"type": "row"
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"description": "Current backlog of items waiting to be processed. Sustained high depth indicates the controller is falling behind.",
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"barWidthFactor": 0.6,
"drawStyle": "line",
"fillOpacity": 20,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"insertNulls": false,
"lineInterpolation": "smooth",
"lineWidth": 2,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "never",
"showValues": false,
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": 0
},
{
"color": "yellow",
"value": 5
},
{
"color": "red",
"value": 20
}
]
}
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 69
},
"id": 50,
"options": {
"legend": {
"calcs": [
"mean",
"max",
"lastNotNull"
],
"displayMode": "table",
"placement": "right",
"showLegend": true
},
"tooltip": {
"hideZeros": true,
"mode": "multi",
"sort": "desc"
}
},
"pluginVersion": "12.4.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"editorMode": "code",
"expr": "sum by (controller) (workqueue_depth{controller=~\"job|workload_controller|clusterqueue_controller|localqueue_controller|v1_pod|multikueue_workload\"})",
"legendFormat": "{{controller}}",
"range": true,
"refId": "A"
}
],
"title": "Work Queue Depth",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "adds/sec",
"axisPlacement": "auto",
"barAlignment": 0,
"barWidthFactor": 0.6,
"drawStyle": "line",
"fillOpacity": 10,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"insertNulls": false,
"lineInterpolation": "smooth",
"lineWidth": 1,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "never",
"showValues": false,
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": 0
}
]
}
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 12,
"x": 12,
"y": 69
},
"id": 51,
"options": {
"legend": {
"calcs": [
"mean",
"max"
],
"displayMode": "table",
"placement": "right",
"showLegend": true
},
"tooltip": {
"hideZeros": true,
"mode": "multi",
"sort": "desc"
}
},
"pluginVersion": "12.4.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"editorMode": "code",
"expr": "sum by (controller) (rate(workqueue_adds_total{controller=~\"job|workload_controller|clusterqueue_controller|localqueue_controller|v1_pod\"}[5m]))",
"legendFormat": "{{controller}}",
"range": true,
"refId": "A"
}
],
"title": "Work Queue Add Rate",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"description": "Rate of retries per controller. Sustained retries may indicate transient API errors or contention.",
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "retries/sec",
"axisPlacement": "auto",
"barAlignment": 0,
"barWidthFactor": 0.6,
"drawStyle": "line",
"fillOpacity": 10,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"insertNulls": false,
"lineInterpolation": "smooth",
"lineWidth": 2,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "never",
"showValues": false,
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": 0
}
]
}
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 77
},
"id": 52,
"options": {
"legend": {
"calcs": [
"mean",
"max"
],
"displayMode": "table",
"placement": "right",
"showLegend": true
},
"tooltip": {
"hideZeros": true,
"mode": "multi",
"sort": "desc"
}
},
"pluginVersion": "12.4.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"editorMode": "code",
"expr": "sum by (controller) (rate(workqueue_retries_total{controller=~\"job|workload_controller|clusterqueue_controller|localqueue_controller\"}[5m])) > 0",
"legendFormat": "{{controller}}",
"range": true,
"refId": "A"
}
],
"title": "Work Queue Retry Rate",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"description": "Growing unfinished_work_seconds indicates stuck reconciliation. longest_running_processor shows if a single item is blocking.",
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"barWidthFactor": 0.6,
"drawStyle": "line",
"fillOpacity": 10,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"insertNulls": false,
"lineInterpolation": "smooth",
"lineWidth": 2,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "never",
"showValues": false,
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": 0
},
{
"color": "red",
"value": 60
}
]
},
"unit": "s"
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 12,
"x": 12,
"y": 77
},
"id": 53,
"options": {
"legend": {
"calcs": [
"mean",
"max",
"lastNotNull"
],
"displayMode": "table",
"placement": "right",
"showLegend": true
},
"tooltip": {
"hideZeros": true,
"mode": "multi",
"sort": "desc"
}
},
"pluginVersion": "12.4.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"editorMode": "code",
"expr": "workqueue_unfinished_work_seconds{controller=~\"job|workload_controller|clusterqueue_controller|localqueue_controller\"} > 0",
"legendFormat": "{{controller}}",
"range": true,
"refId": "A"
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"editorMode": "code",
"expr": "workqueue_longest_running_processor_seconds{controller=~\"job|workload_controller|clusterqueue_controller|localqueue_controller\"} > 0",
"legendFormat": "{{controller}} (longest processor)",
"range": true,
"refId": "B"
}
],
"title": "Unfinished Work / Stuck Processors",
"type": "timeseries"
},
{
"collapsed": false,
"gridPos": {
"h": 1,
"w": 24,
"x": 0,
"y": 85
},
"id": 106,
"panels": [],
"title": "Webhook Performance",
"type": "row"
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"barWidthFactor": 0.6,
"drawStyle": "line",
"fillOpacity": 10,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"insertNulls": false,
"lineInterpolation": "smooth",
"lineWidth": 2,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "never",
"showValues": false,
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": 0
}
]
},
"unit": "s"
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 86
},
"id": 60,
"options": {
"legend": {
"calcs": [
"mean",
"max"
],
"displayMode": "table",
"placement": "bottom",
"showLegend": true
},
"tooltip": {
"hideZeros": true,
"mode": "multi",
"sort": "desc"
}
},
"pluginVersion": "12.4.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"editorMode": "code",
"expr": "histogram_quantile(0.99, sum by (le, webhook) (rate(controller_runtime_webhook_latency_seconds_bucket{webhook=~\"/mutate.*|/validate.*\"}[5m])))",
"legendFormat": "{{webhook}} p99",
"range": true,
"refId": "A"
}
],
"title": "Webhook Latency p99",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "req/sec",
"axisPlacement": "auto",
"barAlignment": 0,
"barWidthFactor": 0.6,
"drawStyle": "line",
"fillOpacity": 10,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"insertNulls": false,
"lineInterpolation": "smooth",
"lineWidth": 1,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "never",
"showValues": false,
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": 0
}
]
}
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 12,
"x": 12,
"y": 86
},
"id": 61,
"options": {
"legend": {
"calcs": [
"mean",
"sum"
],
"displayMode": "table",
"placement": "bottom",
"showLegend": true
},
"tooltip": {
"hideZeros": true,
"mode": "multi",
"sort": "desc"
}
},
"pluginVersion": "12.4.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"editorMode": "code",
"expr": "sum by (webhook, code) (rate(controller_runtime_webhook_requests_total{webhook=~\"/mutate.*|/validate.*\"}[5m])) > 0",
"legendFormat": "{{webhook}} [{{code}}]",
"range": true,
"refId": "A"
}
],
"title": "Webhook Request Rate by Status Code",
"type": "timeseries"
},
{
"collapsed": false,
"gridPos": {
"h": 1,
"w": 24,
"x": 0,
"y": 94
},
"id": 107,
"panels": [],
"title": "Process / Runtime",
"type": "row"
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"barWidthFactor": 0.6,
"drawStyle": "line",
"fillOpacity": 10,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"insertNulls": false,
"lineInterpolation": "smooth",
"lineWidth": 2,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "never",
"showValues": false,
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": 0
}
]
},
"unit": "bytes"
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 8,
"x": 0,
"y": 95
},
"id": 70,
"options": {
"legend": {
"calcs": [
"mean",
"max",
"lastNotNull"
],
"displayMode": "table",
"placement": "bottom",
"showLegend": true
},
"tooltip": {
"hideZeros": false,
"mode": "multi",
"sort": "desc"
}
},
"pluginVersion": "12.4.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"editorMode": "code",
"expr": "process_resident_memory_bytes{job=~\".*kueue.*\"}",
"legendFormat": "RSS",
"range": true,
"refId": "A"
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"editorMode": "code",
"expr": "go_memstats_heap_inuse_bytes{job=~\".*kueue.*\"}",
"legendFormat": "Heap In-Use",
"range": true,
"refId": "B"
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"editorMode": "code",
"expr": "go_memstats_stack_inuse_bytes{job=~\".*kueue.*\"}",
"legendFormat": "Stack",
"range": true,
"refId": "C"
}
],
"title": "Memory Usage",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"barWidthFactor": 0.6,
"drawStyle": "line",
"fillOpacity": 10,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"insertNulls": false,
"lineInterpolation": "smooth",
"lineWidth": 2,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "never",
"showValues": false,
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": 0
}
]
}
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 8,
"x": 8,
"y": 95
},
"id": 71,
"options": {
"legend": {
"calcs": [
"mean",
"max",
"lastNotNull"
],
"displayMode": "table",
"placement": "bottom",
"showLegend": true
},
"tooltip": {
"hideZeros": false,
"mode": "multi",
"sort": "desc"
}
},
"pluginVersion": "12.4.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"editorMode": "code",
"expr": "rate(process_cpu_seconds_total{job=~\".*kueue.*\"}[5m])",
"legendFormat": "CPU cores used",
"range": true,
"refId": "A"
}
],
"title": "CPU Usage (cores)",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"barWidthFactor": 0.6,
"drawStyle": "line",
"fillOpacity": 10,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"insertNulls": false,
"lineInterpolation": "smooth",
"lineWidth": 2,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "never",
"showValues": false,
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": 0
}
]
}
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 8,
"x": 16,
"y": 95
},
"id": 72,
"options": {
"legend": {
"calcs": [
"mean",
"max",
"lastNotNull"
],
"displayMode": "table",
"placement": "bottom",
"showLegend": true
},
"tooltip": {
"hideZeros": false,
"mode": "multi",
"sort": "desc"
}
},
"pluginVersion": "12.4.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"editorMode": "code",
"expr": "go_goroutines{job=~\".*kueue.*\"}",
"legendFormat": "Goroutines",
"range": true,
"refId": "A"
}
],
"title": "Goroutines",
"type": "timeseries"
},
{
"collapsed": false,
"gridPos": {
"h": 1,
"w": 24,
"x": 0,
"y": 103
},
"id": 108,
"panels": [],
"title": "Pods Ready by Role",
"type": "row"
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"description": "Snapshot: pods-ready count per replica_role across selected ClusterQueues (kueue_pods_ready_workloads).",
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "blue",
"value": 0
}
]
}
},
"overrides": []
},
"gridPos": {
"h": 4,
"w": 12,
"x": 0,
"y": 104
},
"id": 80,
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "auto",
"orientation": "horizontal",
"percentChangeColorMode": "standard",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"showPercentChange": false,
"textMode": "auto",
"wideLayout": true
},
"pluginVersion": "12.4.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"editorMode": "code",
"expr": "sum by (replica_role) (kueue_pods_ready_workloads{cluster_queue=~\"$cluster_queue\"})",
"instant": true,
"legendFormat": "{{replica_role}}",
"refId": "A"
}
],
"title": "Pods Ready – ClusterQueue (by role, now)",
"type": "stat"
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"description": "Snapshot: pods-ready count per replica_role across all LocalQueues (kueue_local_queue_pods_ready_workloads).",
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "purple",
"value": 0
}
]
}
},
"overrides": []
},
"gridPos": {
"h": 4,
"w": 12,
"x": 12,
"y": 104
},
"id": 81,
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "auto",
"orientation": "horizontal",
"percentChangeColorMode": "standard",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"showPercentChange": false,
"textMode": "auto",
"wideLayout": true
},
"pluginVersion": "12.4.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"editorMode": "code",
"expr": "sum by (replica_role) (kueue_local_queue_pods_ready_workloads)",
"instant": true,
"legendFormat": "{{replica_role}}",
"refId": "A"
}
],
"title": "Pods Ready – LocalQueue (by role, now)",
"type": "stat"
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"description": "kueue_pods_ready_workloads broken out by ClusterQueue × replica_role over time. Spot leader/worker imbalance or sudden readiness drops per queue.",
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"barWidthFactor": 0.6,
"drawStyle": "line",
"fillOpacity": 15,
"gradientMode": "opacity",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"insertNulls": false,
"lineInterpolation": "smooth",
"lineWidth": 2,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "never",
"showValues": false,
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": 0
}
]
}
},
"overrides": []
},
"gridPos": {
"h": 9,
"w": 12,
"x": 0,
"y": 108
},
"id": 82,
"options": {
"legend": {
"calcs": [
"mean",
"max",
"lastNotNull"
],
"displayMode": "table",
"placement": "bottom",
"showLegend": true
},
"tooltip": {
"hideZeros": true,
"mode": "multi",
"sort": "desc"
}
},
"pluginVersion": "12.4.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"editorMode": "code",
"expr": "sum by (cluster_queue, replica_role) (kueue_pods_ready_workloads{cluster_queue=~\"$cluster_queue\", replica_role=~\"$replica_role\"})",
"legendFormat": "{{cluster_queue}} · {{replica_role}}",
"range": true,
"refId": "A"
}
],
"title": "ClusterQueue – Pods Ready by Role (over time)",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"description": "kueue_local_queue_pods_ready_workloads broken out by namespace, name, and replica_role. Per-team / per-namespace readiness that CQ-scope metrics can't provide.",
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"barWidthFactor": 0.6,
"drawStyle": "line",
"fillOpacity": 15,
"gradientMode": "opacity",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"insertNulls": false,
"lineInterpolation": "smooth",
"lineWidth": 2,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "never",
"showValues": false,
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": 0
}
]
}
},
"overrides": []
},
"gridPos": {
"h": 9,
"w": 12,
"x": 12,
"y": 108
},
"id": 83,
"options": {
"legend": {
"calcs": [
"mean",
"max",
"lastNotNull"
],
"displayMode": "table",
"placement": "bottom",
"showLegend": true
},
"tooltip": {
"hideZeros": true,
"mode": "multi",
"sort": "desc"
}
},
"pluginVersion": "12.4.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"editorMode": "code",
"expr": "sum by (namespace, name, replica_role) (kueue_local_queue_pods_ready_workloads{replica_role=~\"$replica_role\"})",
"legendFormat": "{{namespace}}/{{name}} · {{replica_role}}",
"range": true,
"refId": "A"
}
],
"title": "LocalQueue – Pods Ready by Role (over time)",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"description": "Stacked area: total pods-ready across all queues split by replica_role. Shows the aggregate leader/worker ratio at a glance. Both CQ and LQ series are included.",
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"barWidthFactor": 0.6,
"drawStyle": "line",
"fillOpacity": 60,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"insertNulls": false,
"lineInterpolation": "smooth",
"lineWidth": 2,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "never",
"showValues": false,
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "normal"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": 0
}
]
}
},
"overrides": []
},
"gridPos": {
"h": 9,
"w": 24,
"x": 0,
"y": 117
},
"id": 84,
"options": {
"legend": {
"calcs": [
"mean",
"max",
"lastNotNull"
],
"displayMode": "table",
"placement": "right",
"showLegend": true
},
"tooltip": {
"hideZeros": true,
"mode": "multi",
"sort": "desc"
}
},
"pluginVersion": "12.4.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"editorMode": "code",
"expr": "sum by (replica_role) (kueue_pods_ready_workloads{cluster_queue=~\"$cluster_queue\", replica_role=~\"$replica_role\"})",
"legendFormat": "CQ · {{replica_role}}",
"range": true,
"refId": "A"
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"editorMode": "code",
"expr": "sum by (replica_role) (kueue_local_queue_pods_ready_workloads{replica_role=~\"$replica_role\"})",
"legendFormat": "LQ · {{replica_role}}",
"range": true,
"refId": "B"
}
],
"title": "Total Pods Ready by Role – CQ + LQ stacked",
"type": "timeseries"
}
],
"preload": false,
"refresh": "5s",
"schemaVersion": 42,
"tags": [
"kueue",
"kubernetes",
"batch",
"scheduling"
],
"templating": {
"list": [
{
"current": {
"text": "prometheus",
"value": "P1809F7CD0C75ACF3"
},
"includeAll": false,
"label": "Prometheus",
"name": "DS_PROMETHEUS",
"options": [],
"query": "prometheus",
"refresh": 1,
"regex": "",
"type": "datasource"
},
{
"current": {
"text": "All",
"value": [
"$__all"
]
},
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"definition": "label_values(kueue_cluster_queue_status, cluster_queue)",
"includeAll": true,
"label": "Cluster Queue",
"multi": true,
"name": "cluster_queue",
"options": [],
"query": {
"query": "label_values(kueue_cluster_queue_status, cluster_queue)",
"refId": "StandardVariableQuery"
},
"refresh": 2,
"regex": "",
"regexApplyTo": "value",
"sort": 1,
"type": "query"
},
{
"current": {
"text": "All",
"value": [
"$__all"
]
},
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"definition": "label_values(kueue_pods_ready_workloads, replica_role)",
"includeAll": true,
"label": "Replica Role",
"multi": true,
"name": "replica_role",
"options": [],
"query": {
"query": "label_values(kueue_pods_ready_workloads, replica_role)",
"refId": "StandardVariableQuery"
},
"refresh": 2,
"regex": "",
"regexApplyTo": "value",
"sort": 1,
"type": "query"
}
]
},
"time": {
"from": "now-15m",
"to": "now"
},
"timepicker": {},
"timezone": "browser",
"title": "Kueue Workload Scheduler Dashboard-2",
"uid": "kueue-workload-scheduler-2",
"version": 8,
"weekStart": ""
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment