Skip to content

Instantly share code, notes, and snippets.

@arubis
Created February 19, 2026 21:25
Show Gist options
  • Select an option

  • Save arubis/9df733c3c96ea3ae47a26517fafb784b to your computer and use it in GitHub Desktop.

Select an option

Save arubis/9df733c3c96ea3ae47a26517fafb784b to your computer and use it in GitHub Desktop.
Synthetic Endpoint Monitoring — Reconciled Task Files (review-patched v46)
# Base image: internal Nebula DevOps toolchain image.
FROM us-central1-docker.pkg.dev/bespokelabs/nebula-devops-registry/nebula-devops:1.0.2
# World-writable work/data dirs so any task user can write artifacts there.
RUN mkdir -p /workdir /data && chmod -R 777 /workdir /data
# Install crane (container image tool) from the go-containerregistry release tarball.
RUN curl -sL https://github.com/google/go-containerregistry/releases/download/v0.19.0/go-containerregistry_Linux_x86_64.tar.gz \
| tar -xzf - -C /usr/local/bin crane
# Namespace the task is scoped to.
ENV ALLOWED_NAMESPACES="observability"
# Pre-pull the exact image versions the task expects, saved as tarballs in /workdir.
RUN crane pull prom/prometheus:v3.8.1 /workdir/prometheus-v3.8.1.tar
RUN crane pull prom/blackbox-exporter:v0.25.0 /workdir/blackbox-exporter-v0.25.0.tar
RUN crane pull grafana/grafana:12.2 /workdir/grafana-12.2.tar
# Virtual display settings for the computer-use harness.
ENV DISPLAY_NUM=1
ENV COMPUTER_HEIGHT_PX=768
ENV COMPUTER_WIDTH_PX=1024
#!/usr/bin/env python3
from __future__ import annotations
import subprocess
import json
import time
from apex_arena._types import GradingResult
import re
from datetime import timedelta
# Cached result of wait_for_grafana_api() (None = not yet probed) so repeated
# Grafana checks don't each pay the readiness wait.
_grafana_api_ready = None
# (name, namespace, local_port) keys of port-forwards already established in
# this process; lets port_forward() skip killing/re-creating an active tunnel.
_active_port_forwards: set = set()
def sh(cmd):
    """Run *cmd* through the shell; return (returncode, stdout, stderr), stripped."""
    proc = subprocess.run(cmd, shell=True, capture_output=True, text=True)
    stdout = proc.stdout.strip()
    stderr = proc.stderr.strip()
    return proc.returncode, stdout, stderr
def normalize_image(img):
    """Strip docker.io registry prefixes so image references compare equal."""
    # Longest prefix first so "docker.io/library/x" is not left as "library/x".
    prefixes = ("docker.io/library/", "docker.io/")
    matched = next((p for p in prefixes if img.startswith(p)), None)
    return img[len(matched):] if matched else img
def port_forward(kind, name, namespace, local_port, remote_port):
    """Start a background `kubectl port-forward` for kind/name, once per process.

    Side effects: kills any existing port-forward matching local_port, then
    launches a new one detached (output discarded) and sleeps 5s so the
    tunnel can bind before callers curl it. Subsequent calls with the same
    (name, namespace, local_port) key are no-ops via _active_port_forwards.
    """
    key = (name, namespace, local_port)
    if key in _active_port_forwards:
        # Already forwarded earlier in this run — reuse the live tunnel.
        return
    # Best-effort cleanup of stale forwards bound to the same local port.
    sh(f"pkill -f 'kubectl port-forward.*{local_port}' || true")
    sh(
        f"kubectl port-forward -n {namespace} {kind}/{name} "
        f"{local_port}:{remote_port} >/dev/null 2>&1 &"
    )
    time.sleep(5)  # give the tunnel time to establish
    _active_port_forwards.add(key)
def parse_duration(duration_str):
    """Parse a Prometheus duration string ("30s", "5m", "1h") into a timedelta.

    Only the leading <digits><unit> pair is honored (e.g. "5m30s" -> 5m);
    anything unparseable yields timedelta(0).
    """
    m = re.match(r'(\d+)(s|m|h)', str(duration_str))
    if m is None:
        return timedelta(0)
    unit_to_kwarg = {'s': 'seconds', 'm': 'minutes', 'h': 'hours'}
    return timedelta(**{unit_to_kwarg[m.group(2)]: int(m.group(1))})
# Waiting-state reasons a pod will never recover from without a spec change.
_TERMINAL_POD_STATES = {"ImagePullBackOff", "ErrImagePull", "InvalidImageName"}


def check_deployment_health(name, namespace):
    """Quick check for terminal pod errors. Returns (healthy, message).

    Fast-fails when any pod selected by the deployment is stuck in a terminal
    image-pull state so callers can skip long retry loops. All kubectl or
    parse failures after the initial existence check are treated as "assume
    healthy" — this is a best-effort guard, not a correctness check.
    """
    code, out, _ = sh(
        f"kubectl get deployment {name} -n {namespace} -o json"
    )
    if code != 0:
        return False, f"Deployment '{name}' not found in {namespace}"
    try:
        data = json.loads(out)
        selector = data["spec"]["selector"]["matchLabels"]
        # Build a kubectl label selector string: k1=v1,k2=v2
        label_str = ",".join(f"{k}={v}" for k, v in selector.items())
    except (json.JSONDecodeError, KeyError):
        return True, "Could not parse selector, assuming healthy"
    code, out, _ = sh(
        f"kubectl get pods -n {namespace} -l {label_str} -o json"
    )
    if code != 0:
        return True, "Could not list pods, assuming healthy"
    try:
        pods = json.loads(out).get("items", [])
    except json.JSONDecodeError:
        return True, "Could not parse pods, assuming healthy"
    for pod in pods:
        statuses = pod.get("status", {}).get("containerStatuses", [])
        for cs in statuses:
            # A container waiting with a terminal reason will not recover;
            # report it immediately instead of letting callers poll.
            waiting = cs.get("state", {}).get("waiting", {})
            reason = waiting.get("reason", "")
            if reason in _TERMINAL_POD_STATES:
                pod_name = pod.get("metadata", {}).get("name", "unknown")
                return False, (
                    f"Pod '{pod_name}' in {namespace} is stuck in {reason} — "
                    f"deployment '{name}' will not recover"
                )
    return True, f"Deployment '{name}' pods are not in terminal error state"
# -------------------------------------------------------------------
# RESOURCE CHECKS
# -------------------------------------------------------------------
def deployment_exists(name, namespace):
    """Gate: the named Deployment can be fetched with kubectl."""
    rc, _, _ = sh(f"kubectl get deployment {name} -n {namespace}")
    if rc == 0:
        return True, f"Deployment '{name}' exists in {namespace}"
    return False, f"Deployment '{name}' not found in {namespace}"
def configmap_contains(name, namespace, required_strings):
    """Check that the ConfigMap's YAML dump contains every required substring."""
    rc, body, _ = sh(
        f"kubectl get configmap {name} -n {namespace} -o yaml"
    )
    if rc != 0:
        return False, f"ConfigMap '{name}' not readable"
    missing = [needle for needle in required_strings if needle not in body]
    if missing:
        return False, f"ConfigMap '{name}' missing: {missing}"
    return True, f"ConfigMap '{name}' contains required content"
def service_exists(name, namespace, port):
    """Gate: the Service exists and exposes *port* in spec.ports.

    Returns (ok, message). Fix: JSON decoding is now guarded so malformed
    kubectl output yields a readable failure instead of an uncaught
    exception — consistent with check_deployment_health's parsing style.
    """
    code, out, _ = sh(
        f"kubectl get svc {name} -n {namespace} -o json"
    )
    if code != 0:
        return False, f"Service '{name}' not found in {namespace}"
    try:
        data = json.loads(out)
    except json.JSONDecodeError:
        return False, f"Service '{name}' returned unparseable JSON"
    ports = [p.get("port") for p in data.get("spec", {}).get("ports", [])]
    if port in ports:
        return True, f"Service '{name}' exposes port {port}"
    return False, f"Service '{name}' does not expose port {port}"
def wait_for_grafana_api():
    """Wait until Grafana's /api/health answers inside the pod; cache the result.

    Returns True/False. The module-level _grafana_api_ready cache means the
    wait (up to 60s pod wait + 6 x 5s polls) is paid at most once per run.
    """
    global _grafana_api_ready
    if _grafana_api_ready is not None:
        return _grafana_api_ready
    # Best-effort wait for the pod itself; exit code deliberately ignored.
    sh(
        "kubectl wait --for=condition=ready pod -l app=grafana "
        "-n observability --timeout=60s"
    )
    for _ in range(6):
        code, _, _ = sh(
            "kubectl exec -n observability deploy/grafana -- "
            "wget -qO- http://localhost:3000/api/health"
        )
        if code == 0:
            _grafana_api_ready = True
            return True
        time.sleep(5)
    _grafana_api_ready = False
    return False
def configmap_exists(name, namespace):
    """Gate: the named ConfigMap can be fetched with kubectl."""
    rc, _, _ = sh(f"kubectl get configmap {name} -n {namespace}")
    if rc == 0:
        return True, f"ConfigMap '{name}' exists in {namespace}"
    return False, f"ConfigMap '{name}' not found in {namespace}"
def pvc_exists(name, namespace):
    """Gate: the named PersistentVolumeClaim can be fetched with kubectl."""
    rc, _, _ = sh(f"kubectl get pvc {name} -n {namespace}")
    if rc == 0:
        return True, f"PVC '{name}' exists in {namespace}"
    return False, f"PVC '{name}' not found in {namespace}"
# -------------------------------------------------------------------
# BLACKBOX FUNCTIONAL CHECKS
# -------------------------------------------------------------------
def blackbox_metrics_exposed():
    """Gate: the blackbox exporter /metrics endpoint responds via port-forward.

    Fast-fails if the deployment is in a terminal pod state, then curls the
    forwarded service and looks for a metric name the exporter emits.
    """
    healthy, msg = check_deployment_health("blackbox-exporter", "observability")
    if not healthy:
        return False, msg
    port_forward(
        "svc",
        "blackbox-exporter",
        "observability",
        9115,
        9115,
    )
    code, out, _ = sh("curl -s http://localhost:9115/metrics")
    if code == 0 and "blackbox_exporter_config_last_reload_successful" in out:
        return True, "Blackbox exporter metrics endpoint is reachable"
    return False, "Blackbox exporter metrics endpoint not responding correctly"
def blackbox_config_has_required_modules():
    """Gate: blackbox-config defines both http_2xx and tcp_connect modules."""
    rc, body, _ = sh(
        "kubectl get configmap blackbox-config "
        "-n observability -o jsonpath='{.data}'"
    )
    if rc != 0:
        return False, "blackbox-config not readable"
    missing = [mod for mod in ("http_2xx", "tcp_connect") if mod not in body]
    if missing:
        return False, f"blackbox-config missing required modules: {missing}"
    return True, "blackbox-config contains all required probe modules"
def kubernetes_api_tcp_probe_configured():
    """Gate: Prometheus probes kubernetes.default via the plain tcp_connect module.

    Rejects configs using tcp_connect_tls — the task requires tcp_connect.
    Fix: removed the redundant function-local `import re` (the module is
    already imported at file scope).
    """
    code, out, _ = sh(
        "kubectl get configmap prometheus-config "
        "-n observability -o jsonpath='{.data.prometheus\\.yml}'"
    )
    if code != 0:
        return False, "Prometheus config not readable"
    if "kubernetes.default" not in out:
        return False, "Kubernetes API server not configured as probe target"
    # Tolerates list/quote YAML forms: module: tcp_connect / [tcp_connect] / "tcp_connect"
    tcp_module_pattern = r'module:\s*\[?\s*["\']?tcp_connect["\']?\s*\]?'
    if not re.search(tcp_module_pattern, out) or "tcp_connect_tls" in out:
        return False, "Must use 'tcp_connect' module (not tcp_connect_tls) for Kubernetes API TCP probe"
    return True, "Kubernetes API TCP probe is configured with tcp_connect module"
def prometheus_has_probe_metrics():
    """Gate: Prometheus answers a probe_success instant query (6 tries, 5s apart)."""
    healthy, msg = check_deployment_health("prometheus", "observability")
    if not healthy:
        return False, msg
    port_forward("svc", "prometheus", "observability", 9090, 9090)
    for _ in range(6):
        code, out, _ = sh(
            "curl -s "
            "'http://localhost:9090/api/v1/query?query=probe_success'"
        )
        # Any JSON body containing "result" means the query API responded.
        if code == 0 and '"result"' in out:
            return True, "Prometheus is collecting probe metrics"
        time.sleep(5)
    return False, "Prometheus not returning probe metrics after retries"
def check_slo_burn_rate_alerts():
    """Verify alerts implement proper multi-window SLO burn rate logic.

    Requires structural evidence (keyword matching is too easy to game):
      * at least 2 distinct avg_over_time(...) window sizes, and
      * at least 2 distinct non-zero 'for:' durations across alert rules
        (fast-burn vs slow-burn detection windows).
    """
    code, out, _ = sh(
        "kubectl get configmap prometheus-config "
        "-n observability -o yaml"
    )
    if code != 0:
        return False, "Prometheus config not readable"
    # Must have multiple time windows in recording rules or alert expressions
    windows = re.findall(
        r"avg_over_time\([^)]*\[(\d+[mh])\]\)",
        out
    )
    if len(set(windows)) < 2:
        return False, (
            "Burn rate alerts must use multiple time windows "
            "(e.g., 5m and 1h)"
        )
    # Must have at least 2 distinct 'for:' durations across alert rules
    # (evidence of fast-burn vs slow-burn detection windows)
    for_durations = re.findall(r"for:\s*(\d+[smh])", out)
    # Compare as timedeltas so equivalent spellings (e.g. "60s"/"1m") dedupe.
    unique_durations = {parse_duration(d) for d in for_durations}
    unique_durations.discard(timedelta(0))
    if len(unique_durations) < 2:
        return False, (
            "SLO burn rate alerting requires multiple detection windows "
            "(e.g., a fast-burn alert with 'for: 2m' and a slow-burn "
            "alert with 'for: 1h')"
        )
    return True, "Valid multi-window SLO burn rate alerts detected"
def prometheus_scrape_interval_valid():
    """Gate: the global scrape_interval in prometheus.yml is 10s or 15s.

    Fix: the original ignored the kubectl exit code, so an unreadable
    ConfigMap produced the misleading "missing global section" message.
    """
    code, out, _ = sh(
        "kubectl get configmap prometheus-config "
        "-n observability -o jsonpath='{.data.prometheus\\.yml}'"
    )
    if code != 0:
        return False, "Prometheus config not readable"
    if "global:" not in out:
        return False, "Prometheus config missing global section"
    # Only inspect text before scrape_configs so per-job intervals don't pass.
    global_section = out.split("scrape_configs")[0] if "scrape_configs" in out else out
    if "scrape_interval: 15s" not in global_section and "scrape_interval: 10s" not in global_section:
        return False, "Global scrape_interval must be 10s or 15s"
    return True, "Scrape interval is appropriately configured"
def check_alert_for_duration():
    """Verify alerts have appropriate 'for' duration for timely detection.

    The SyntheticProbeFailure alert must wait between 30s and 2m before
    firing: shorter flaps on transient blips, longer misses the 2m
    detection requirement.
    """
    code, out, _ = sh(
        "kubectl get configmap prometheus-config "
        "-n observability -o yaml"
    )
    if code != 0:
        return False, "Prometheus config not readable"
    # First 'for:' after the alert name; DOTALL lets it be on a later line.
    for_match = re.search(
        r'alert:\s*SyntheticProbeFailure.*?for:\s*(\d+[smh])',
        out,
        re.DOTALL
    )
    if not for_match:
        if 'SyntheticProbeFailure' not in out:
            return False, "SyntheticProbeFailure alert not found"
        return False, "Alert missing 'for' duration"
    for_duration = for_match.group(1)
    duration = parse_duration(for_duration)
    if duration > timedelta(minutes=2):
        return False, f"Alert 'for' duration {for_duration} exceeds 2m detection requirement"
    if duration < timedelta(seconds=30):
        return False, f"Alert 'for' duration {for_duration} too short, will cause flapping"
    return True, f"Alert 'for' duration {for_duration} is appropriate"
def check_alert_annotations():
    """Verify alerts have required annotations for operational use."""
    rc, cfg, _ = sh(
        "kubectl get configmap prometheus-config "
        "-n observability -o yaml"
    )
    if rc != 0:
        return False, "Prometheus config not readable"
    if 'SyntheticProbeFailure' not in cfg:
        return False, "SyntheticProbeFailure alert not found"
    # Only examine the text from the alert definition onward.
    tail = cfg[cfg.find('SyntheticProbeFailure'):]
    if 'annotations:' not in tail:
        return False, "Alert missing annotations section"
    has_text = 'description' in tail or 'summary' in tail
    if not has_text:
        return False, "Alert missing description/summary annotation"
    return True, "Alert has required annotations"
def check_recording_rules():
    """Verify recording rules exist AND are used in alert expressions.

    Requires >=2 named `record:` rules and >=2 of those names appearing in
    the text from the first `alert:` onward — evidence that alerts query
    the pre-computed series rather than raw PromQL.
    """
    code, out, _ = sh(
        "kubectl get configmap prometheus-config "
        "-n observability -o yaml"
    )
    if code != 0:
        return False, "Prometheus config not readable"
    # Must define recording rules
    if "record:" not in out:
        return False, "Prometheus should define recording rules"
    # Extract recording rule names (Prometheus metric-name charset incl. ':')
    record_names = re.findall(
        r"record:\s*([a-zA-Z_:][a-zA-Z0-9_:]*)",
        out
    )
    if not record_names:
        return False, "No valid recording rule names found"
    # Multi-window availability requires at least 2 recording rules
    if len(record_names) < 2:
        return False, (
            "Multiple recording rules needed for multi-window "
            "availability signals (e.g., 5m and 1h windows)"
        )
    # At least 2 recording rules must be referenced in alert expressions
    alert_section = out[out.find("alert:"):] if "alert:" in out else out
    used_count = sum(1 for name in record_names if name in alert_section)
    if used_count < 2:
        return False, (
            "At least 2 recording rules should be referenced in alert "
            "expressions for multi-window burn rate detection"
        )
    return True, "Recording rules exist and are used in alerts"
def check_blackbox_modules():
    """Verify correct Blackbox modules used for each protocol.

    Heuristic: inspect a text window around each known target (500 chars
    before, 200 after) for the expected module name. NOTE(review): the
    window sizes are arbitrary — a module declared outside that window
    would be missed; confirm against real configs.
    """
    code, out, _ = sh(
        "kubectl get configmap prometheus-config "
        "-n observability -o jsonpath='{.data.prometheus\\.yml}'"
    )
    if code != 0:
        return False, "Prometheus config not readable"
    out_lower = out.lower()
    # --- Kubernetes API must use tcp_connect ---
    if 'kubernetes.default' in out:
        kube_pos = out.find('kubernetes.default')
        kube_section = out[max(0, kube_pos - 500):kube_pos + 200]
        if 'tcp_connect' not in kube_section:
            return False, "Kubernetes API target should use tcp_connect module"
    # --- HTTP endpoints must use http_2xx ---
    if 'argocd' in out_lower:
        argocd_pos = out_lower.find('argocd')
        argocd_section = out[max(0, argocd_pos - 500):argocd_pos + 200]
        if 'http_2xx' not in argocd_section and 'http' in argocd_section:
            return False, "HTTP targets should use http_2xx module"
    return True, "Blackbox modules correctly matched to target protocols"
def check_alert_severity_labels():
    """Verify alerts define severity labels."""
    rc, cfg, _ = sh(
        "kubectl get configmap prometheus-config "
        "-n observability -o yaml"
    )
    if rc != 0:
        return False, "Prometheus config not readable"
    if "severity:" in cfg:
        return True, "Alert severity labels present"
    return False, (
        "Alerts must define severity labels "
        "(critical or warning)"
    )
def check_dashboard_uses_recording_rules():
    """Dashboard should reference recording rules instead of raw PromQL.

    Scored. Passes when a recording-rule name extracted from
    prometheus-config appears in the dashboard ConfigMap, or — fallback —
    any name matching the probe:*:* recording-rule naming convention is
    present in the dashboard.
    """
    code, dash_out, _ = sh(
        "kubectl get configmap grafana-dashboards "
        "-n observability -o yaml"
    )
    if code != 0:
        return False, "grafana-dashboards ConfigMap not readable"
    # Extract actual recording rule names from prometheus-config
    code, prom_out, _ = sh(
        "kubectl get configmap prometheus-config "
        "-n observability -o yaml"
    )
    if code == 0:
        record_names = re.findall(
            r"record:\s*([a-zA-Z_:][a-zA-Z0-9_:]*)",
            prom_out
        )
        if record_names and any(name in dash_out for name in record_names):
            return True, "Dashboard references recording rules"
    # Fallback: accept any probe:*:* pattern (recording rule convention)
    if re.search(r"probe:[a-zA-Z_]+:[a-zA-Z0-9_]+", dash_out):
        return True, "Dashboard references recording rules"
    return False, (
        "Dashboard should reference pre-computed recording rules "
        "(e.g., probe:availability:5m) instead of raw PromQL"
    )
def argocd_probe_success():
    """Gate: a live blackbox probe of the Argo CD endpoint reports success.

    Port-forwards the exporter, drives a /probe request at the Argo CD
    /api/version URL with the http_2xx module, and greps for an exact
    `probe_success 1` metric line.
    """
    port_forward(
        "svc",
        "blackbox-exporter",
        "observability",
        9115,
        9115,
    )
    cmd = (
        "curl -s "
        "'http://localhost:9115/probe?"
        "target=http://argocd.devops.local:80/api/version&module=http_2xx' | "
        "grep '^probe_success 1'"
    )
    code, out, _ = sh(cmd)
    # grep exits 0 only when the success line matched; out holds the line.
    if code == 0 and out:
        return True, "Synthetic probe reports Argo CD endpoint as available"
    return False, "Synthetic probe did not report Argo CD as available"
def deployment_uses_image(name, namespace, expected_image):
    """Gate: one of the Deployment's containers runs *expected_image*.

    Images are compared after normalize_image() strips docker.io prefixes.
    Fix: JSON decoding and spec traversal are now guarded so malformed
    kubectl output yields a (False, message) instead of an exception.
    """
    code, out, _ = sh(
        f"kubectl get deployment {name} -n {namespace} -o json"
    )
    if code != 0:
        return False, f"Deployment '{name}' not found"
    try:
        data = json.loads(out)
        containers = data["spec"]["template"]["spec"]["containers"]
    except (json.JSONDecodeError, KeyError, TypeError):
        return False, f"Deployment '{name}' spec could not be parsed"
    images = [c.get("image", "") for c in containers]
    actual = [normalize_image(i) for i in images]
    expected = normalize_image(expected_image)
    if expected in actual:
        return True, f"Deployment '{name}' uses image '{expected_image}'"
    return False, f"Expected {expected_image}, found {images}"
def prometheus_blackbox_relabeling_present():
    """Gate: prometheus.yml routes probe scrapes through the blackbox exporter."""
    rc, cfg, _ = sh(
        "kubectl get configmap prometheus-config "
        "-n observability -o jsonpath='{.data.prometheus\\.yml}'"
    )
    if rc != 0:
        return False, "Prometheus config not readable"
    missing = [
        snippet
        for snippet in ("metrics_path: /probe", "__param_target")
        if snippet not in cfg
    ]
    exporter_addr_present = (
        "blackbox-exporter:9115" in cfg
        or "blackbox-exporter.observability" in cfg
    )
    if not exporter_addr_present:
        missing.append("blackbox-exporter address")
    if missing:
        return False, f"Missing blackbox relabeling elements: {missing}"
    return True, "Prometheus blackbox relabeling is correctly configured"
def prometheus_alert_fires_for_failing_probe():
    """Scored: SyntheticProbeFailure appears in /api/v1/alerts for the bad target.

    Polls up to 18 times, 8s apart (~2.5 min total) — the failing probe must
    persist past the alert's 'for:' duration before Prometheus reports it.
    """
    for dep in ("prometheus", "blackbox-exporter"):
        healthy, msg = check_deployment_health(dep, "observability")
        if not healthy:
            return False, msg
    port_forward("svc", "prometheus", "observability", 9090, 9090)
    for _ in range(18):
        code, out, _ = sh(
            "curl -s http://localhost:9090/api/v1/alerts"
        )
        if (
            code == 0
            and "SyntheticProbeFailure" in out
            and "does-not-exist.devops.local" in out
        ):
            return True, "SyntheticProbeFailure alert is firing"
        time.sleep(8)
    return False, "SyntheticProbeFailure alert did not fire"
def grafana_has_prometheus_datasource():
    """Gate: Grafana's datasource API lists a Prometheus datasource.

    Queries the API from inside the pod with basic auth
    (YWRtaW46YWRtaW4= is base64 for admin:admin); 6 tries, 5s apart.
    """
    healthy, msg = check_deployment_health("grafana", "observability")
    if not healthy:
        return False, msg
    if not wait_for_grafana_api():
        return False, "Grafana API not reachable"
    for _ in range(6):
        code, out, _ = sh(
            "kubectl exec -n observability deploy/grafana -- "
            "wget -qO- --header='Authorization: Basic YWRtaW46YWRtaW4=' "
            "http://localhost:3000/api/datasources"
        )
        if code == 0 and "Prometheus" in out:
            return True, "Grafana Prometheus datasource configured"
        time.sleep(5)
    return False, "Grafana Prometheus datasource missing"
def grafana_has_blackbox_dashboard():
    """Scored: Grafana's search API returns a synthetic-monitoring dashboard."""
    healthy, msg = check_deployment_health("grafana", "observability")
    if not healthy:
        return False, msg
    if not wait_for_grafana_api():
        return False, "Grafana API not reachable"
    keywords = ("Synthetic", "Blackbox", "Probe", "Endpoint")
    for _attempt in range(6):
        rc, body, _ = sh(
            "kubectl exec -n observability deploy/grafana -- "
            "wget -qO- --header='Authorization: Basic YWRtaW46YWRtaW4=' "
            "http://localhost:3000/api/search"
        )
        if rc == 0 and any(kw in body for kw in keywords):
            return True, "Grafana dashboard for synthetic probes exists"
        time.sleep(5)
    return False, "Grafana dashboard missing"
def prometheus_uses_pvc():
    """Gate: Prometheus mounts a PersistentVolumeClaim at /prometheus.

    Fix: guards JSON decoding and the containers[0] access so malformed or
    container-less deployment specs yield a message instead of an uncaught
    exception.
    """
    code, out, _ = sh(
        "kubectl get deployment prometheus "
        "-n observability -o json"
    )
    if code != 0:
        return False, "Prometheus deployment not found"
    try:
        spec = json.loads(out)["spec"]["template"]["spec"]
        volumes = spec.get("volumes", [])
        containers = spec["containers"]
        mounts = containers[0].get("volumeMounts", []) if containers else []
    except (json.JSONDecodeError, KeyError, TypeError):
        return False, "Prometheus deployment spec could not be parsed"
    pvc_used = any(v.get("persistentVolumeClaim") for v in volumes)
    mounted = any(m.get("mountPath") == "/prometheus" for m in mounts)
    if pvc_used and mounted:
        return True, "Prometheus is using persistent storage"
    return False, "Prometheus PVC is not mounted at /prometheus"
def alert_rule_identifies_endpoint():
    """Verify alert annotations reference the failing endpoint."""
    rc, cfg, _ = sh(
        "kubectl get configmap prometheus-config "
        "-n observability -o yaml"
    )
    if rc != 0:
        return False, "Prometheus config not readable"
    # Operators must be able to tell WHICH endpoint failed, so the alert
    # must template the instance/target label into its text.
    if re.search(r'\{\{\s*\$labels\.(instance|target)\s*\}\}', cfg):
        return True, "Alert annotations identify the failing endpoint"
    return False, (
        "Alert annotations must reference the failing endpoint "
        "(e.g., {{ $labels.instance }}) for operational use"
    )
def alert_has_minimum_duration():
    """Verify alert rule has for: 2m or greater duration"""
    rc, cfg, _ = sh(
        "kubectl get configmap prometheus-config "
        "-n observability -o yaml"
    )
    if rc != 0:
        return False, "Prometheus config not readable"
    # Matches 'for: 2m'..'for: 9m' or any minute value of 10 or more.
    if re.search(r'for:\s*([2-9]|[1-9]\d+)m', cfg) is not None:
        return True, "Alert rule has correct minimum duration (>=2m)"
    return False, "Alert rule must fire 'for: 2m' or longer (not immediate)"
def prometheus_alert_is_per_endpoint():
    """Scored: SyntheticProbeFailure alerts carry per-endpoint instance labels.

    Polls the Prometheus alerts API from inside the pod (no port-forward)
    up to 18 times. Passes only when every SyntheticProbeFailure alert has
    a non-empty instance label AND the known-bad target is in the firing set.
    """
    for dep in ("prometheus", "blackbox-exporter"):
        healthy, msg = check_deployment_health(dep, "observability")
        if not healthy:
            return False, msg
    for _ in range(18):
        code, out, _ = sh(
            "kubectl exec -n observability deploy/prometheus -- "
            "wget -qO- http://localhost:9090/api/v1/alerts"
        )
        if code != 0 or "SyntheticProbeFailure" not in out:
            time.sleep(8)
            continue
        try:
            data = json.loads(out)
            alerts = data.get("data", {}).get("alerts", [])
            synthetic = [
                a for a in alerts
                if a.get("labels", {}).get("alertname") == "SyntheticProbeFailure"
            ]
            if len(synthetic) < 1:
                time.sleep(8)
                continue
            instances = {
                a.get("labels", {}).get("instance", "") for a in synthetic
            }
            # Any empty instance label means alerting is not scoped per target.
            if not all(instances):
                return False, (
                    "SyntheticProbeFailure alerts lack instance labels — "
                    "alerting is not scoped per endpoint"
                )
            firing = {
                a.get("labels", {}).get("instance", "")
                for a in synthetic if a.get("state") == "firing"
            }
            has_failing = any("does-not-exist" in i for i in firing)
            if has_failing:
                return True, (
                    "Alerts fire per endpoint (failing endpoint alerts "
                    "independently with instance labels)"
                )
            time.sleep(8)
        except (json.JSONDecodeError, KeyError):
            time.sleep(10)
            continue
    return False, "Could not verify per-endpoint alert scoping"
def prometheus_does_not_use_up_metric():
    """Gate: alerting must key off probe results, not the exporter 'up' metric.

    NOTE(review): the substring scan (" up ", "up==", "up ==") over the
    whole ConfigMap YAML can false-positive on comments or annotation
    prose containing " up " — confirm against real agent configs.
    """
    code, out, _ = sh(
        "kubectl get configmap prometheus-config "
        "-n observability -o yaml"
    )
    if code != 0:
        return False, "Prometheus config not readable"
    if " up " in out or "up==" in out or "up ==" in out:
        return False, "Alerting incorrectly uses exporter 'up' metric"
    return True, "Alerting correctly avoids exporter 'up' metric"
def check_endpoint_count():
    """Verify at least 3 probe targets are configured."""
    rc, cfg, _ = sh(
        "kubectl get configmap prometheus-config "
        "-n observability -o jsonpath='{.data.prometheus\\.yml}'"
    )
    if rc != 0:
        return False, "Prometheus config not readable"
    lowered = cfg.lower()
    required_targets = {
        'argocd': 'argocd' in lowered,
        'kubernetes_api': 'kubernetes.default' in lowered,
        'test_endpoint': 'does-not-exist' in lowered,
    }
    missing = [key for key, present in required_targets.items() if not present]
    if missing:
        return False, f"Missing probe targets: {missing}"
    return True, "All required endpoints configured"
def check_grafana_dashboard_semantics():
    """Verify Grafana dashboard uses correct semantic patterns for synthetic monitoring.

    Scored, four sub-checks over the raw dashboard ConfigMap text:
      1. references availability metrics (probe_success or a recording rule),
         and if raw probe_success is used, it is time-aggregated;
      2. breaks results down per endpoint (by(instance)/target or legend vars);
      3. presents availability as a normalized/percentage measure, not binary;
      4. includes a response-time metric.
    All failing sub-checks are reported together in one message.
    """
    code, out, _ = sh(
        "kubectl get configmap grafana-dashboards "
        "-n observability -o yaml"
    )
    if code != 0:
        return False, "grafana-dashboards ConfigMap not readable"
    issues = []
    # ------------------------------------------------------------------
    # Check 1: dashboard must show availability data (not raw binary)
    # ------------------------------------------------------------------
    has_availability_metric = (
        "probe_success" in out
        or re.search(r"probe:[a-zA-Z_]*availab", out)
        or re.search(r"probe:[a-zA-Z_]*success", out)
    )
    if has_availability_metric:
        if "probe_success" in out:
            has_time_agg = any(fn in out for fn in [
                "avg_over_time",
                "min_over_time",
                "max_over_time",
                "sum_over_time",
            ])
            if not has_time_agg:
                issues.append(
                    "Dashboard uses raw probe_success without time "
                    "aggregation (expected avg_over_time or similar)"
                )
    else:
        issues.append(
            "Dashboard does not reference probe availability metrics "
            "(probe_success or a recording rule like probe:availability)"
        )
    # ------------------------------------------------------------------
    # Check 2: per-endpoint breakdown (instance / target)
    # ------------------------------------------------------------------
    has_grouping = any(x in out for x in [
        "by (instance)",
        "by (target)",
        "$labels.instance",
        "$labels.target",
        "{{ instance }}",
        "{{instance}}",
        "{{ target }}",
        "{{target}}",
    ])
    if not has_grouping:
        issues.append(
            "Dashboard does not show per-endpoint breakdown "
            "(missing by(instance), legendFormat with {{ instance }}, "
            "or target label usage)"
        )
    # ------------------------------------------------------------------
    # Check 3: availability not shown as raw binary signal
    # ------------------------------------------------------------------
    has_normalized = (
        # Percentage form (e.g., * 100)
        any(x in out for x in ["* 100", "*100", "100 *"])
        # Or [0,1] normalized via time aggregation (avg_over_time already
        # produces a continuous availability ratio, not binary)
        or any(fn in out for fn in [
            "avg_over_time", "min_over_time", "max_over_time",
        ])
        # Or uses a recording rule that pre-computes availability
        or re.search(r"probe:[a-zA-Z_]+:", out)
    )
    if not has_normalized:
        issues.append(
            "Dashboard should represent availability as a normalized "
            "measure (e.g., avg_over_time for ratio or * 100 for percentage)"
        )
    # ------------------------------------------------------------------
    # Check 4: response-time / latency metrics present
    # ------------------------------------------------------------------
    latency_metrics = [
        "probe_duration_seconds",
        "probe_http_duration_seconds",
        "probe_tcp_connection_duration_seconds",
        "probe_dns_lookup_time_seconds",
    ]
    has_latency = any(metric in out for metric in latency_metrics)
    if not has_latency:
        issues.append(
            "Dashboard should include response-time metrics "
            "(e.g. probe_duration_seconds or protocol-specific durations)"
        )
    # ------------------------------------------------------------------
    # Final result
    # ------------------------------------------------------------------
    if issues:
        return False, "; ".join(issues)
    return True, (
        "Dashboard uses time-aggregated probe metrics, per-endpoint breakdown, "
        "availability percentage, and latency visualization"
    )
def grade(transcript: str) -> GradingResult:
    """Run all checks and return a GradingResult.

    Gate checks contribute feedback only; the 8 scored checks are equally
    weighted (1/8 each). *transcript* is accepted for interface
    compatibility but is not inspected by any check.
    """
    feedback = []
    # ------------------------
    # Gate checks (non-scored)
    # ------------------------
    gate_checks = [
        # Resource existence
        lambda: deployment_exists("blackbox-exporter", "observability"),
        lambda: service_exists("blackbox-exporter", "observability", 9115),
        lambda: configmap_exists("blackbox-config", "observability"),
        lambda: deployment_exists("prometheus", "observability"),
        lambda: configmap_exists("prometheus-config", "observability"),
        lambda: pvc_exists("prometheus-data", "observability"),
        blackbox_config_has_required_modules,
        prometheus_scrape_interval_valid,
        # Image correctness
        lambda: deployment_uses_image(
            "blackbox-exporter", "observability",
            "prom/blackbox-exporter:v0.25.0",
        ),
        lambda: deployment_uses_image(
            "prometheus", "observability",
            "prom/prometheus:v3.8.1",
        ),
        lambda: deployment_uses_image(
            "grafana", "observability",
            "grafana/grafana:12.2",
        ),
        # Core operational
        blackbox_metrics_exposed,
        prometheus_has_probe_metrics,
        prometheus_blackbox_relabeling_present,
        argocd_probe_success,
        kubernetes_api_tcp_probe_configured,
        prometheus_uses_pvc,
        # Basic config quality
        grafana_has_prometheus_datasource,
        check_alert_severity_labels,
        check_alert_annotations,
        check_alert_for_duration,
        alert_has_minimum_duration,
        prometheus_does_not_use_up_metric,
        check_slo_burn_rate_alerts,
        check_blackbox_modules,
    ]
    for fn in gate_checks:
        # A crashing check is reported as a failure; grading never aborts.
        try:
            ok, msg = fn()
        except Exception as e:
            ok = False
            msg = str(e)
        feedback.append(("✓ " if ok else "✗ ") + msg)
    # ------------------------
    # Scored checks (partial)
    # ------------------------
    scored_checks = {
        "grafana_dashboard_present": grafana_has_blackbox_dashboard,
        "endpoint_count": check_endpoint_count,
        "grafana_dashboard_semantics": check_grafana_dashboard_semantics,
        "failing_in_alert": prometheus_alert_fires_for_failing_probe,
        "alert_identifies_endpoint": alert_rule_identifies_endpoint,
        "per_endpoint": prometheus_alert_is_per_endpoint,
        "records": check_recording_rules,
        "recording_rules": check_dashboard_uses_recording_rules,
    }
    subscores = {}
    for key, fn in scored_checks.items():
        try:
            ok, msg = fn()
        except Exception as e:
            ok = False
            msg = str(e)
        subscores[key] = 1.0 if ok else 0.0
        feedback.append(("✓ " if ok else "✗ ") + msg)
    #
    # Equal weighting across the scored checks.
    total_checks = len(scored_checks)
    weight = 1.0 / total_checks
    weights = {k: weight for k in scored_checks}
    score = sum(subscores[k] * weights[k] for k in subscores)
    return GradingResult(
        score=round(score, 4),
        subscores=subscores,
        weights=weights,
        feedback=" | ".join(feedback),
    )

Synthetic Endpoint Monitoring — Reconciled Task Files

Base version: v46 (a6b6b25b-fbdf-4830-bd13-258c6bfd9948) Status: Review patch applied — ready for solution test and eval Approach: Minimal changes on v46; author's implementations preserved where both versions address the same issue satisfactorily

These are the complete, ready-to-upload task files with review patches applied on top of the author's v46 submission. A companion gist contains the raw patch and detailed per-change rationale.


What changed from v46

Blocking fixes

  1. Wiki URL corrected — task.yaml pointed to gitea.devops.local:3000/root/bleater-app/wiki but setup.sh creates the wiki at gitea.devops.local/root/platform-docs/wiki/. The agent gets a 404 and can't discover probe targets.

  2. Grader restructured: 25 scored → 25 gates + 8 scored — v46's 25 equal-weight scored checks (0.04 each) let an agent score 0.32 by passing trivial existence checks without deploying anything functional. Now gates provide diagnostic feedback while 8 substantive scored checks (0.125 each) require real working functionality to earn points.

  3. SyntheticProbeFailure alert name added to task.yaml — The grader checks for this exact string in 4+ places but v46's task description never names it. An agent choosing "ProbeFailure" or "EndpointDown" fails through no fault of their own.

Reliability fixes

  1. Wiki creation via git-commit (setup.sh) — Replaces flaky Gitea POST .../wiki/new API with direct git commit to the bare wiki repo. Eliminates timing-dependent 404s.

  2. Image tag aliasing (setup.sh) — ctr images tag docker.io/X X ensures both prefixed and unprefixed image references resolve. Prevents ErrImagePull when agents write image: prom/prometheus:v3.8.1 but k3s only has docker.io/prom/prometheus:v3.8.1.

  3. Distractor image cleanup (setup.sh) — Removes old versions (prometheus v2.54.1, grafana 11.3.0) from the Nebula base image so agents can't discover and use wrong versions.

  4. Image version table in wiki (setup.sh) — v46's wiki told agents to "discover available versions using standard container tooling" but ctr images list output is ambiguous in air-gapped environments. Explicit table is the fair approach.

Grader robustness

  1. normalize_image() — Strips docker.io/ prefix when comparing deployed images against expected versions.

  2. check_deployment_health() — Fast-fails on terminal pod states (ImagePullBackOff, ErrImagePull) instead of retrying for 2+ minutes when a deployment will never recover.

  3. Grafana API caching — the wait_for_grafana_api() result is cached to avoid redundant 30s waits across multiple Grafana checks.

  4. Port-forward deduplication — Prevents killing and re-establishing active port-forwards when multiple checks use the same port.

Check quality

  1. check_slo_burn_rate_alerts tightened — v46 used keyword matching ("burn", "14.4", "error budget") which passes if an agent writes # burn rate in a comment. Now requires structural evidence: ≥2 unique avg_over_time time windows AND ≥2 unique for: durations.

  2. check_recording_rules tightened — v46 passed if record: keyword existed + either "probe:" or "availability" appeared anywhere. Now requires ≥2 recording rules defined AND ≥2 referenced in alert expressions.

  3. alert_rule_identifies_endpoint tightened — v46 accepted by (instance) in PromQL expressions as sufficient. Now requires {{ $labels.instance }} in alert annotations — proving the alert message identifies the endpoint for operators, not just that the expression preserves the label.

  4. check_grafana_dashboard_semantics loosened — v46 required literal probe_success in the dashboard. Now also accepts recording rules like probe:availability. Agents using recording rules (better practice) shouldn't be penalized.

Cleanup

  1. Removed duplicate grafana_dashboard_uses_time_aggregation — Defined twice in v46 (lines 574 and 732); second shadows the first. Functionality already covered by check_grafana_dashboard_semantics.

  2. Removed HTTPS TLS check from check_blackbox_modules — v46 required explicit tls_config for HTTPS targets, but the default blackbox exporter behavior already verifies TLS. Task says "preserve default TLS verification behavior."

  3. Removed remote_write from solution.sh — Dummy URL https://url/insert/0/prometheus/api/v1/write causes Prometheus connection timeouts. Agents would copy it.

  4. Burn rate alerts use recording rules in solution.sh — probe:availability:5m instead of raw avg_over_time(probe_success[5m]). Required by scored check_recording_rules and aligns with task requirement about efficient querying.

  5. Removed duplicate paragraphs from task.yaml — scrape_interval was stated twice and TCP monitoring was mentioned three times.

  6. Fixed double-slash typo in setup.sh — "//workdir/grafana-12.2.tar" → "/workdir/grafana-12.2.tar".


Grader architecture

graph TD
    subgraph Gates["Gate Checks (25) — feedback only, no score"]
        G1["6 resource existence"]
        G2["3 image correctness"]
        G3["6 operational checks"]
        G4["10 config quality checks"]
    end

    subgraph Scored["Scored Checks (8 × 0.125)"]
        S1["grafana_dashboard_present<br/><i>Grafana up + dashboard visible</i>"]
        S2["endpoint_count<br/><i>3 targets: argocd, k8s API, test</i>"]
        S3["grafana_dashboard_semantics<br/><i>time agg + per-endpoint + % + latency</i>"]
        S4["failing_in_alert<br/><i>SyntheticProbeFailure fires</i>"]
        S5["alert_identifies_endpoint<br/><i>annotations: {{ $labels.instance }}</i>"]
        S6["per_endpoint<br/><i>alerts fire independently per target</i>"]
        S7["records<br/><i>≥2 recording rules used in alerts</i>"]
        S8["recording_rules<br/><i>dashboard references recording rules</i>"]
    end

    Gates --> Scored
Loading

What was kept from v46

v46 addition Disposition Notes
prometheus_scrape_interval_valid() Kept as gate Validates 10s/15s specifically — stricter and more aligned with task than our generic ≤30s check
check_alert_for_duration() Kept as gate Validates for: 30s–2m on SyntheticProbeFailure. Solution uses for: 1m, passes.
deployment_uses_any_image() Removed normalize_image() in deployment_uses_image() handles docker.io/ prefix more cleanly
check_grafana_dashboard_semantics() Kept as scored Multi-part dashboard quality check — good design
check_endpoint_count() Kept as scored Validates all 3 required targets

Files in this gist

File Lines Notes
grader.py 1008 25 gates + 8 scored checks
task.yaml 113 Wiki URL fixed, alert name added, deduped
setup.sh 255 Git-commit wiki, image tagging, distractor cleanup
solution.sh 473 Recording rules in alerts/dashboard, no remote_write
Dockerfile 20 Unchanged from v46
#!/bin/bash
set -e
# ---------------------- [DONOT CHANGE ANYTHING BELOW] ---------------------------------- #
# Start supervisord if not already running (manages k3s, dockerd, dnsmasq)
if ! supervisorctl status &>/dev/null; then
    echo "Starting supervisord..."
    /usr/bin/supervisord -c /etc/supervisor/supervisord.conf
    sleep 5
fi
# Set kubeconfig for k3s
export KUBECONFIG=/etc/rancher/k3s/k3s.yaml
# Wait for k3s to be ready (k3s can take 30-60 seconds to start)
echo "Waiting for k3s to be ready..."
MAX_WAIT=180
ELAPSED=0
# Poll the API server every 2s until it answers, or bail at MAX_WAIT.
until kubectl get nodes &>/dev/null; do
    if [ $ELAPSED -ge $MAX_WAIT ]; then
        echo "Error: k3s is not ready after ${MAX_WAIT} seconds"
        exit 1
    fi
    echo "Waiting for k3s... (${ELAPSED}s elapsed)"
    sleep 2
    ELAPSED=$((ELAPSED + 2))
done
echo "k3s is ready!"
# ---------------------- [DONOT CHANGE ANYTHING ABOVE] ---------------------------------- #
echo "Granting Minimal Required Permission"
# RBAC: scope the `ubuntu-user` ServiceAccount to ConfigMap management inside
# the `observability` namespace only (get/list/watch/create/update/patch —
# note that the `delete` verb is deliberately absent).
# The heredoc is unquoted but the YAML contains no shell expansions, so it is
# applied verbatim.
kubectl apply -f - <<EOF
apiVersion: rbac.authorization.k8s.io/v1
kind: Role
metadata:
name: ubuntu-user-configmap-editor
namespace: observability
rules:
- apiGroups: [""]
resources: ["configmaps"]
verbs: ["get", "list", "watch", "create", "update", "patch"]
---
apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
name: ubuntu-user-configmap-editor-binding
namespace: observability
subjects:
- kind: ServiceAccount
name: ubuntu-user
namespace: observability
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: Role
name: ubuntu-user-configmap-editor
EOF
# ------------------------------------------------------------------------------
# Create Gitea repo
# ------------------------------------------------------------------------------
# Creates the public `platform-docs` repository via Gitea's REST API, executed
# inside the Gitea pod so the call can reach localhost:3000 directly using the
# seeded root credentials.
kubectl exec -n gitea deploy/gitea -- sh -c '
curl -s -X POST \
-u "root:Admin@123456" \
-H "Content-Type: application/json" \
-d "{
\"name\": \"platform-docs\",
\"private\": false,
\"auto_init\": true
}" \
http://localhost:3000/api/v1/user/repos
'
# ------------------------------------------------------------------------------
# Wiki creation — commit directly to bare git repo (bypasses API timing issues)
# ------------------------------------------------------------------------------
# Writes the wiki page straight into the bare wiki repository on disk via a
# temp clone + force-push to master, then restores git:git ownership. Gitea
# serves wiki pages from this bare repo, so no API write is required.
kubectl exec -n gitea deploy/gitea -- sh -c '
set -e
WIKI_REPO=/data/git/repositories/root/platform-docs.wiki.git
# Ensure bare wiki repo exists (Gitea does NOT auto-create it)
if [ ! -d "$WIKI_REPO" ]; then
git init --bare "$WIKI_REPO" >/dev/null 2>&1
fi
# Commit wiki content directly into the bare repo via a temp clone
TMPDIR=$(mktemp -d)
cd "$TMPDIR"
git init >/dev/null 2>&1
git config user.email "root@gitea.local"
git config user.name "root"
cat > "Platform-Services---Monitoring-Requirements.md" <<WIKIEOF
# Platform Services - Monitoring Requirements
## Critical Platform Services
### ArgoCD GitOps Controller
| Property | Value |
|----------|-------|
| Ingress | \`argocd.devops.local\` |
| Protocol | HTTP |
| Criticality | High |
### Kubernetes API Server
| Property | Value |
|----------|-------|
| Access | \`kubernetes.default.svc:443\` |
| Protocol | TCP |
| Criticality | Critical |
## Alerting Validation
Configure a probe against a known-unreachable endpoint to verify alerting:
http://does-not-exist.devops.local
Alerts should identify the specific failing endpoint (not aggregate into one global alert).
## Preloaded Container Images
This cluster operates in air-gapped mode. The following images are available:
| Component | Image |
|-----------|-------|
| Prometheus | \`prom/prometheus:v3.8.1\` |
| Blackbox Exporter | \`prom/blackbox-exporter:v0.25.0\` |
| Grafana | \`grafana/grafana:12.2\` |
Use these exact image references in your deployments. No other versions are available.
WIKIEOF
git add . >/dev/null 2>&1
git commit -m "Initial wiki content" >/dev/null 2>&1
git remote add origin "$WIKI_REPO"
git push -f origin master >/dev/null 2>&1
cd /
rm -rf "$TMPDIR"
chown -R git:git "$WIKI_REPO"
' && echo "[SETUP] Platform documentation created" \
|| echo "[SETUP] WARN: Wiki creation failed"
# Poke the Gitea wiki API to force indexing of the git-committed content.
# Best-effort: up to three attempts; any failure is swallowed by the trailing
# `|| true` since the wiki content is already committed either way.
kubectl exec -n gitea deploy/gitea -- sh -c '
for i in 1 2 3; do
HTTP_CODE=$(curl -s -o /dev/null -w "%{http_code}" \
-u "root:Admin@123456" \
"http://localhost:3000/api/v1/repos/root/platform-docs/wiki/pages")
if [ "$HTTP_CODE" = "200" ]; then
break
fi
sleep 2
done
' 2>/dev/null || true
echo "[SETUP] Importing pre-cached container images..."
# The tarballs were produced at image-build time with `crane pull` (see the
# Dockerfile) and are imported into k3s containerd under the k8s.io namespace
# so the air-gapped cluster can schedule pods without any registry access.
PROMETHEUS_TAR="/workdir/prometheus-v3.8.1.tar"
BLACKBOX_TAR="/workdir/blackbox-exporter-v0.25.0.tar"
GRAFANA_TAR="/workdir/grafana-12.2.tar"
if [ ! -f "$GRAFANA_TAR" ]; then
echo "Error: Pre-cached Grafana image not found at $GRAFANA_TAR"
exit 1
fi
ctr --address /run/k3s/containerd/containerd.sock \
--namespace k8s.io \
images import --no-unpack "$GRAFANA_TAR"
ctr --address /run/k3s/containerd/containerd.sock \
--namespace k8s.io \
images list | grep -q "grafana/grafana:12.2" || {
echo "Error: Grafana image was not imported correctly"
exit 1
}
if [ ! -f "$PROMETHEUS_TAR" ]; then
echo "Error: Pre-cached Prometheus image not found at $PROMETHEUS_TAR"
exit 1
fi
# NOTE(review): Prometheus is imported WITHOUT --no-unpack while Grafana and
# Blackbox use --no-unpack — confirm whether this asymmetry is intentional.
ctr --address /run/k3s/containerd/containerd.sock \
--namespace k8s.io \
images import "$PROMETHEUS_TAR"
if [ ! -f "$BLACKBOX_TAR" ]; then
echo "Error: Pre-cached Blackbox image not found at $BLACKBOX_TAR"
exit 1
fi
ctr --address /run/k3s/containerd/containerd.sock \
--namespace k8s.io \
images import --no-unpack "$BLACKBOX_TAR"
ctr --address /run/k3s/containerd/containerd.sock \
--namespace k8s.io \
images list | grep -q "prom/prometheus:v3.8.1" || {
echo "Error: Prometheus image was not imported correctly"
exit 1
}
ctr --address /run/k3s/containerd/containerd.sock \
--namespace k8s.io \
images list | grep -q "prom/blackbox-exporter:v0.25.0" || {
echo "Error: Blackbox image was not imported correctly"
exit 1
}
CTR="ctr --address /run/k3s/containerd/containerd.sock --namespace k8s.io"
# Re-tag under registry-less names so manifests may reference either the short
# `prom/prometheus:v3.8.1` form or the fully-qualified `docker.io/...` form.
$CTR images tag docker.io/prom/prometheus:v3.8.1 prom/prometheus:v3.8.1
$CTR images tag docker.io/prom/blackbox-exporter:v0.25.0 prom/blackbox-exporter:v0.25.0
$CTR images tag docker.io/grafana/grafana:12.2 grafana/grafana:12.2
# Whitelist of image references that must survive the distractor cleanup below.
ALLOWED_IMAGES=(
"prom/prometheus:v3.8.1"
"docker.io/prom/prometheus:v3.8.1"
"prom/blackbox-exporter:v0.25.0"
"docker.io/prom/blackbox-exporter:v0.25.0"
"grafana/grafana:12.2"
"docker.io/grafana/grafana:12.2"
)
# Return 0 (success) iff the given image reference exactly matches one of the
# entries in the ALLOWED_IMAGES whitelist defined above; non-zero otherwise.
is_allowed() {
local candidate="$1"
local ref
for ref in "${ALLOWED_IMAGES[@]}"; do
if [[ "$candidate" == "$ref" ]]; then
return 0
fi
done
return 1
}
# Distractor cleanup: remove any prometheus/blackbox/grafana image that is not
# on the whitelist, so only the documented versions remain available.
# NOTE: the pipeline runs the while-loop in a subshell; that is fine here
# because the loop only performs `ctr images remove` side effects.
$CTR images list -q | while read -r image; do
case "$image" in
*prometheus*|*blackbox-exporter*|*grafana*)
is_allowed "$image" || $CTR images remove "$image" 2>/dev/null || true
;;
esac
done
ctr --address /run/k3s/containerd/containerd.sock \
--namespace k8s.io \
images remove docker.io/prom/prometheus:v2.54.1 2>/dev/null || true
ctr --address /run/k3s/containerd/containerd.sock \
--namespace k8s.io \
images remove docker.io/grafana/grafana:11.3.0 2>/dev/null || true
# Delete the import tarballs so other image versions cannot be re-imported
# later (the glob also covers the Grafana tarball).
rm -f "$PROMETHEUS_TAR" "$BLACKBOX_TAR"
rm -rf /workdir/*.tar 2>/dev/null || true
echo "[SETUP] Prometheus and Blackbox images imported successfully"
#!/bin/bash
set -euo pipefail
echo ""
echo ">>> Deploying Blackbox Exporter and Prometheus (Observability Stack)"
echo ""
export KUBECONFIG=/etc/rancher/k3s/k3s.yaml
NAMESPACE="observability"
# Ensure namespace exists
kubectl get namespace ${NAMESPACE} >/dev/null 2>&1 || kubectl create namespace ${NAMESPACE}
echo ""
echo ">>> Applying Blackbox Exporter configuration"
echo ""
# blackbox.yml defines two probe modules: http_2xx (application-layer GET;
# empty valid_status_codes falls back to the exporter default of 2xx, and no
# tls_config override means default TLS verification is preserved) and
# tcp_connect (transport-layer reachability only). Heredoc is unquoted so
# ${NAMESPACE} expands; the YAML body contains no other shell expansions.
cat <<EOF | kubectl apply -f -
apiVersion: v1
kind: ConfigMap
metadata:
name: blackbox-config
namespace: ${NAMESPACE}
data:
blackbox.yml: |
modules:
http_2xx:
prober: http
timeout: 5s
http:
valid_http_versions: ["HTTP/1.1", "HTTP/2"]
valid_status_codes: []
method: GET
tcp_connect:
prober: tcp
timeout: 5s
EOF
# Exporter Deployment: single replica, runs unprivileged (runAsNonRoot,
# uid 1000), with blackbox.yml projected from the blackbox-config ConfigMap
# into /etc/blackbox. The probe endpoint listens on 9115.
cat <<EOF | kubectl apply -f -
apiVersion: apps/v1
kind: Deployment
metadata:
name: blackbox-exporter
namespace: ${NAMESPACE}
labels:
app: blackbox-exporter
spec:
replicas: 1
selector:
matchLabels:
app: blackbox-exporter
template:
metadata:
labels:
app: blackbox-exporter
spec:
containers:
- name: blackbox-exporter
image: prom/blackbox-exporter:v0.25.0
ports:
- containerPort: 9115
args:
- "--config.file=/etc/blackbox/blackbox.yml"
securityContext:
runAsNonRoot: true
runAsUser: 1000
volumeMounts:
- name: config-volume
mountPath: /etc/blackbox
volumes:
- name: config-volume
configMap:
name: blackbox-config
items:
- key: blackbox.yml
path: blackbox.yml
EOF
# ClusterIP Service fronting the exporter; Prometheus relabeling rewrites
# __address__ to blackbox-exporter:9115, i.e. this Service.
cat <<EOF | kubectl apply -f -
apiVersion: v1
kind: Service
metadata:
name: blackbox-exporter
namespace: ${NAMESPACE}
spec:
selector:
app: blackbox-exporter
ports:
- name: http
port: 9115
targetPort: 9115
EOF
echo ""
echo ">>> Applying Prometheus storage"
echo ""
# 2Gi ReadWriteOnce PersistentVolumeClaim for the Prometheus TSDB; no
# storageClassName is set, so the cluster's default StorageClass is used.
cat <<EOF | kubectl apply -f -
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
name: prometheus-data
namespace: ${NAMESPACE}
spec:
accessModes:
- ReadWriteOnce
resources:
requests:
storage: 2Gi
EOF
echo ""
echo ">>> Applying Prometheus configuration"
echo ""
echo "Applying Prometheus config..."
# prometheus-config carries both the scrape config (prometheus.yml) and the
# rule file (rules.yml). The heredoc delimiter is quoted ('EOF') so the
# {{ $labels.instance }} template annotations pass through unexpanded.
cat <<'EOF' | kubectl apply -f -
apiVersion: v1
kind: ConfigMap
metadata:
  name: prometheus-config
  namespace: observability
data:
  prometheus.yml: |
    global:
      scrape_interval: 15s
    rule_files:
      - /etc/prometheus/rules/*.yml
    scrape_configs:
      # HTTP probes routed through the blackbox exporter; `instance` is
      # rewritten to the probed URL so alerts/dashboards are per-endpoint.
      - job_name: blackbox
        metrics_path: /probe
        params:
          module: [http_2xx]
        static_configs:
          - targets:
              - http://argocd.devops.local
              - http://does-not-exist.devops.local
        relabel_configs:
          - source_labels: [__address__]
            target_label: __param_target
          - source_labels: [__param_target]
            target_label: instance
          - target_label: __address__
            replacement: blackbox-exporter:9115
      # TCP reachability probe for the in-cluster Kubernetes API endpoint.
      - job_name: blackbox-kubernetes-api
        metrics_path: /probe
        params:
          module: [tcp_connect]
        static_configs:
          - targets:
              - kubernetes.default.svc:443
        relabel_configs:
          - source_labels: [__address__]
            target_label: __param_target
          - source_labels: [__param_target]
            target_label: instance
          - target_label: __address__
            replacement: blackbox-exporter:9115
      # Self-scrapes: exporter process health and Prometheus itself.
      - job_name: blackbox-exporter
        static_configs:
          - targets:
              - blackbox-exporter:9115
      - job_name: prometheus
        static_configs:
          - targets:
              - localhost:9090
  rules.yml: |
    groups:
      # --------------------------------------------------
      # Recording rules
      # --------------------------------------------------
      - name: synthetic-recording
        rules:
          - record: probe:availability:5m
            expr: avg_over_time(probe_success[5m])
          - record: probe:availability:1h
            expr: avg_over_time(probe_success[1h])
          # FIX: blackbox_exporter publishes probe_duration_seconds as a
          # gauge — there is no probe_duration_seconds_bucket histogram
          # series — so the previous histogram_quantile(...) expression
          # could never return data. quantile_over_time() computes the p99
          # of the gauge over the window per series, and the instance label
          # is preserved for per-endpoint queries.
          - record: probe:latency_p99:5m
            expr: quantile_over_time(0.99, probe_duration_seconds[5m])
      # --------------------------------------------------
      # Compatibility alert (legacy graders depend on this)
      # --------------------------------------------------
      - name: synthetic-compat
        rules:
          - alert: SyntheticProbeFailure
            expr: probe_success == 0
            for: 1m
            labels:
              severity: critical
            annotations:
              summary: "Synthetic probe failed"
              description: "Endpoint {{ $labels.instance }} is unreachable"
      # --------------------------------------------------
      # SLO burn-rate alerts (99% availability target)
      # --------------------------------------------------
      - name: synthetic-slo
        rules:
          - alert: SyntheticProbeHighBurnRate
            expr: |
              (1 - probe:availability:5m{job="blackbox"}) / (1 - 0.99) > 14.4
            for: 2m
            labels:
              severity: critical
            annotations:
              summary: "High synthetic availability burn rate"
              description: "High error budget burn rate for {{ $labels.instance }}"
          - alert: SyntheticProbeLowBurnRate
            expr: |
              (1 - probe:availability:1h{job="blackbox"}) / (1 - 0.99) > 1
            for: 1h
            labels:
              severity: warning
            annotations:
              summary: "Sustained synthetic availability degradation"
              description: "Sustained error budget burn rate for {{ $labels.instance }}"
EOF
echo ""
echo ">>> Deploying Prometheus"
echo ""
# Single-replica Prometheus. Notes:
#  - prometheus.yml is mounted via subPath, so later ConfigMap edits are NOT
#    propagated into the running pod; a restart/rollout is required.
#  - rules-volume projects only the rules.yml key of the same ConfigMap into
#    /etc/prometheus/rules/, matching the rule_files glob in prometheus.yml.
#  - args replace the image's default command line; the TSDB path falls back
#    to the image default — presumably under /prometheus where the PVC is
#    mounted. TODO(review): confirm data actually lands on the PVC.
#  - fsGroup 65534 gives the pod group write access to the PVC mount.
cat <<EOF | kubectl apply -f -
apiVersion: apps/v1
kind: Deployment
metadata:
name: prometheus
namespace: ${NAMESPACE}
spec:
replicas: 1
selector:
matchLabels:
app: prometheus
template:
metadata:
labels:
app: prometheus
spec:
securityContext:
fsGroup: 65534
runAsNonRoot: true
runAsUser: 1000
containers:
- name: prometheus
image: prom/prometheus:v3.8.1
args:
- "--config.file=/etc/prometheus/prometheus.yml"
ports:
- containerPort: 9090
volumeMounts:
- name: config-volume
mountPath: /etc/prometheus/prometheus.yml
subPath: prometheus.yml
- name: data-volume
mountPath: /prometheus
- name: rules-volume
mountPath: /etc/prometheus/rules
volumes:
- name: config-volume
configMap:
name: prometheus-config
- name: data-volume
persistentVolumeClaim:
claimName: prometheus-data
- name: rules-volume
configMap:
name: prometheus-config
items:
- key: rules.yml
path: rules.yml
EOF
# ClusterIP Service exposing Prometheus on port 9090 (required name: prometheus).
cat <<EOF | kubectl apply -f -
apiVersion: v1
kind: Service
metadata:
name: prometheus
namespace: ${NAMESPACE}
spec:
selector:
app: prometheus
ports:
- name: web
port: 9090
targetPort: 9090
EOF
# Declarative Grafana datasource provisioning: sets the in-namespace
# Prometheus Service as the default datasource, so no manual configuration
# through the web interface is required.
cat <<EOF | kubectl apply -f -
apiVersion: v1
kind: ConfigMap
metadata:
name: grafana-datasources
namespace: ${NAMESPACE}
data:
datasources.yml: |
apiVersion: 1
datasources:
- name: Prometheus
type: prometheus
access: proxy
url: http://prometheus:9090
isDefault: true
EOF
# Dashboard provider config: tells Grafana's file provider to load any JSON
# dashboards found under /var/lib/grafana/dashboards (where the
# grafana-dashboards ConfigMap is mounted by the Grafana Deployment).
cat <<EOF | kubectl apply -f -
apiVersion: v1
kind: ConfigMap
metadata:
name: grafana-dashboard-provider
namespace: ${NAMESPACE}
data:
dashboards.yml: |
apiVersion: 1
providers:
- name: default
folder: ''
type: file
options:
path: /var/lib/grafana/dashboards
EOF
# Dashboard JSON provisioned as a ConfigMap and consumed by Grafana's file
# provider. FIX: the latency panel's PromQL was invalid —
# `avg_over_time(...) by (instance)` attaches a grouping clause to a function,
# but `by` is only valid on aggregation operators (avg/sum/...). Rewritten as
# an explicit aggregation so the panel actually renders per-endpoint series.
# No comments inside the heredoc: the payload is strict JSON.
cat <<EOF | kubectl apply -f -
apiVersion: v1
kind: ConfigMap
metadata:
  name: grafana-dashboards
  namespace: observability
data:
  blackbox-dashboard.json: |
    {
      "title": "Synthetic Endpoint Availability",
      "schemaVersion": 38,
      "panels": [
        {
          "type": "timeseries",
          "title": "Probe Availability (%)",
          "targets": [
            {
              "expr": "probe:availability:5m * 100",
              "legendFormat": "{{ instance }}",
              "refId": "A"
            }
          ]
        },
        {
          "type": "timeseries",
          "title": "Probe Latency (seconds)",
          "targets": [
            {
              "expr": "avg by (instance) (avg_over_time(probe_duration_seconds[5m]))",
              "legendFormat": "{{ instance }}",
              "refId": "B"
            }
          ]
        }
      ]
    }
EOF
# Grafana Deployment + Service in one manifest (separated by `---`). All
# provisioning (datasource, dashboard provider, dashboard JSON) is mounted
# from ConfigMaps, so Grafana starts fully configured with no UI steps.
cat <<EOF | kubectl apply -f -
apiVersion: apps/v1
kind: Deployment
metadata:
name: grafana
namespace: ${NAMESPACE}
spec:
replicas: 1
selector:
matchLabels:
app: grafana
template:
metadata:
labels:
app: grafana
spec:
containers:
- name: grafana
image: grafana/grafana:12.2
ports:
- containerPort: 3000
volumeMounts:
- name: datasources
mountPath: /etc/grafana/provisioning/datasources
# Provider config
- name: dashboard-provider
mountPath: /etc/grafana/provisioning/dashboards
# Actual JSON dashboards
- name: dashboard-json
mountPath: /var/lib/grafana/dashboards
volumes:
- name: datasources
configMap:
name: grafana-datasources
- name: dashboard-provider
configMap:
name: grafana-dashboard-provider
- name: dashboard-json
configMap:
name: grafana-dashboards
---
apiVersion: v1
kind: Service
metadata:
name: grafana
namespace: ${NAMESPACE}
spec:
selector:
app: grafana
ports:
- name: web
port: 3000
targetPort: 3000
EOF
echo ""
echo ">>> Waiting for deployments to become ready"
echo ""
# Wait (indefinitely, no --timeout set) for each rollout to report Ready;
# under `set -euo pipefail` a failed rollout aborts the script.
kubectl rollout status deployment/grafana -n ${NAMESPACE}
kubectl rollout status deployment/blackbox-exporter -n ${NAMESPACE}
kubectl rollout status deployment/prometheus -n ${NAMESPACE}
echo ""
echo ">>> Solution deployment completed successfully"
echo ""
# Task definition: synthetic endpoint monitoring in an air-gapped k3s cluster.
# The `prompt` block scalar below is the literal text shown to the solver, so
# no comments may be placed inside it (they would become prompt content).
id: synthetic-endpoint-monitoring
metadata:
category: observability
difficulty: advanced
prompt: |
[Context]
The Nebula platform hosts multiple internal services exposed through Kubernetes ingress.
While application-level metrics exist, the platform team lacks visibility into endpoint
availability from a real client perspective.
Synthetic probing is required to validate ingress reachability and detect outages
independent of application instrumentation.
[Environment Notes]
The cluster operates in air-gapped mode.
Required container images are already preloaded into the runtime environment and must
be used as-is without pulling from external registries. Available image versions
are documented in the platform wiki.
[Task]
Implement synthetic endpoint monitoring in the `observability` namespace.
The monitoring stack must include the following Kubernetes resources with the
specified names:
- A deployment named `blackbox-exporter`
- A service named `blackbox-exporter` exposing the probe endpoint
- A configuration ConfigMap named `blackbox-config`
- A monitoring deployment named `prometheus`
- A configuration ConfigMap named `prometheus-config`
- A persistent volume claim named `prometheus-data`
Prometheus must be exposed via a Kubernetes Service named `prometheus`
on port 9090.
The Nebula internal developer wiki documents commonly exposed platform services
and their ingress hostnames:
http://gitea.devops.local/root/platform-docs/wiki/
The solution must actively probe internal service endpoints that represent
real client access patterns. Check the Gitea wiki for information about
critical platform services that require monitoring.
Probe results must be exposed as metrics and be consumable by the monitoring system.
[Blackbox Exporter Requirements]
The blackbox exporter must support multiple probe types:
- HTTP probes for application-layer availability
- TCP probes for transport-layer connectivity
Probe behavior must follow protocol-appropriate validation:
- Application-layer probes must validate protocol correctness
- Transport-layer probes must validate connectivity only
- HTTP probe modules must preserve default TLS verification behavior
The Kubernetes API server health must be monitored using TCP connectivity probes
against its standard in-cluster service name. This check must validate only
basic transport-layer reachability (no TLS or HTTP validation).
Exporter-level metrics (e.g., `up`) must not be used as a substitute for
synthetic probe result metrics.
[Prometheus Requirements]
The global `scrape_interval` must be set to **15s or 10s**.
Prometheus must be configured to scrape blackbox probe targets using appropriate
relabeling. Configuration should support efficient querying and reuse of commonly
evaluated availability signals. Repeated or computationally expensive expressions
should not be evaluated directly at query time.
[Alerting Requirements]
Prometheus must define alerting rules based on synthetic probe result metrics.
At least one alert must be named **SyntheticProbeFailure** and represent
endpoint-level availability failure detected via synthetic probes.
Alerts detecting sustained availability degradation should be based on
SLO-style burn rate concepts (e.g., evaluating error budget consumption
over time rather than fixed thresholds).
Alerts must:
- Detect rapid availability loss
- Detect sustained availability degradation over longer periods
- Distinguish failures on a per-endpoint basis
- Avoid relying solely on instantaneous probe failures or fixed thresholds
Alerting rules must be derived from probe result metrics and must distinguish
failures on a per-endpoint basis.
[Visualization Requirements]
The monitoring stack must include a visualization layer for synthetic probe results.
Required resource:
- ConfigMap `grafana-dashboards` for dashboard definitions
A visualization service must be deployed in the `observability` namespace
and configured to consume metrics directly from Prometheus without manual
configuration through a web interface.
The visualization must include at least one dashboard that presents the
availability status of synthetic probes on a per-endpoint basis.
Dashboards must present synthetic probe results in a form suitable for
service-level assessment rather than raw signal inspection.
Visualizations must:
- Represent availability as a normalized measure over time
- Allow comparison across individual endpoints
- Include at least one indicator related to request or probe responsiveness
- Not rely solely on binary success/failure signals
Dashboard and data source configuration must be reproducible and stored
declaratively as Kubernetes resources.
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment