|
diff --color -ruN '--exclude=__pycache__' '--exclude=.apex_metadata.json' ./grader.py /tmp/reconciled-synthetic-endpoint-monitoring/grader.py |
|
--- ./grader.py 2026-02-19 06:54:41.199381253 -0700 |
|
+++ /tmp/reconciled-synthetic-endpoint-monitoring/grader.py 2026-02-19 14:16:25.354990821 -0700 |
|
@@ -9,18 +9,34 @@ |
|
from datetime import timedelta |
|
|
|
|
|
+_grafana_api_ready = None |
|
+_active_port_forwards: set = set() |
|
+ |
|
+ |
|
def sh(cmd): |
|
r = subprocess.run(cmd, shell=True, capture_output=True, text=True) |
|
return r.returncode, r.stdout.strip(), r.stderr.strip() |
|
|
|
+def normalize_image(img): |
|
+ """Strip docker.io/ prefix for image comparison.""" |
|
+ for prefix in ("docker.io/library/", "docker.io/"): |
|
+ if img.startswith(prefix): |
|
+ return img[len(prefix):] |
|
+ return img |
|
+ |
|
|
|
def port_forward(kind, name, namespace, local_port, remote_port): |
|
+ key = (name, namespace, local_port) |
|
+ if key in _active_port_forwards: |
|
+ return |
|
+ |
|
sh(f"pkill -f 'kubectl port-forward.*{local_port}' || true") |
|
sh( |
|
f"kubectl port-forward -n {namespace} {kind}/{name} " |
|
f"{local_port}:{remote_port} >/dev/null 2>&1 &" |
|
) |
|
time.sleep(5) |
|
+ _active_port_forwards.add(key) |
|
|
|
|
|
def parse_duration(duration_str): |
|
@@ -38,6 +54,50 @@ |
|
return timedelta(0) |
|
|
|
|
|
+_TERMINAL_POD_STATES = {"ImagePullBackOff", "ErrImagePull", "InvalidImageName"} |
|
+ |
|
+ |
|
+def check_deployment_health(name, namespace): |
|
+ """Quick check for terminal pod errors. Returns (healthy, message).""" |
|
+ code, out, _ = sh( |
|
+ f"kubectl get deployment {name} -n {namespace} -o json" |
|
+ ) |
|
+ if code != 0: |
|
+ return False, f"Deployment '{name}' not found in {namespace}" |
|
+ |
|
+ try: |
|
+ data = json.loads(out) |
|
+ selector = data["spec"]["selector"]["matchLabels"] |
|
+ label_str = ",".join(f"{k}={v}" for k, v in selector.items()) |
|
+ except (json.JSONDecodeError, KeyError): |
|
+ return True, "Could not parse selector, assuming healthy" |
|
+ |
|
+ code, out, _ = sh( |
|
+ f"kubectl get pods -n {namespace} -l {label_str} -o json" |
|
+ ) |
|
+ if code != 0: |
|
+ return True, "Could not list pods, assuming healthy" |
|
+ |
|
+ try: |
|
+ pods = json.loads(out).get("items", []) |
|
+ except json.JSONDecodeError: |
|
+ return True, "Could not parse pods, assuming healthy" |
|
+ |
|
+ for pod in pods: |
|
+ statuses = pod.get("status", {}).get("containerStatuses", []) |
|
+ for cs in statuses: |
|
+ waiting = cs.get("state", {}).get("waiting", {}) |
|
+ reason = waiting.get("reason", "") |
|
+ if reason in _TERMINAL_POD_STATES: |
|
+ pod_name = pod.get("metadata", {}).get("name", "unknown") |
|
+ return False, ( |
|
+ f"Pod '{pod_name}' in {namespace} is stuck in {reason} — " |
|
+ f"deployment '{name}' will not recover" |
|
+ ) |
|
+ |
|
+ return True, f"Deployment '{name}' pods are not in terminal error state" |
|
+ |
|
+ |
|
# ------------------------------------------------------------------- |
|
# RESOURCE CHECKS |
|
# ------------------------------------------------------------------- |
|
@@ -81,18 +141,26 @@ |
|
|
|
|
|
def wait_for_grafana_api(): |
|
- sh("kubectl wait --for=condition=ready pod -l app=grafana " |
|
- "-n observability --timeout=120s") |
|
+ global _grafana_api_ready |
|
+ if _grafana_api_ready is not None: |
|
+ return _grafana_api_ready |
|
|
|
- for _ in range(9): |
|
+ sh( |
|
+ "kubectl wait --for=condition=ready pod -l app=grafana " |
|
+ "-n observability --timeout=60s" |
|
+ ) |
|
+ |
|
+ for _ in range(6): |
|
code, _, _ = sh( |
|
"kubectl exec -n observability deploy/grafana -- " |
|
"wget -qO- http://localhost:3000/api/health" |
|
) |
|
if code == 0: |
|
+ _grafana_api_ready = True |
|
return True |
|
time.sleep(5) |
|
|
|
+ _grafana_api_ready = False |
|
return False |
|
|
|
|
|
@@ -121,6 +189,10 @@ |
|
# ------------------------------------------------------------------- |
|
|
|
def blackbox_metrics_exposed(): |
|
+ healthy, msg = check_deployment_health("blackbox-exporter", "observability") |
|
+ if not healthy: |
|
+ return False, msg |
|
+ |
|
port_forward( |
|
"svc", |
|
"blackbox-exporter", |
|
@@ -163,7 +235,7 @@ |
|
if "kubernetes.default" not in out: |
|
return False, "Kubernetes API server not configured as probe target" |
|
|
|
- |
|
+ |
|
import re |
|
tcp_module_pattern = r'module:\s*\[?\s*["\']?tcp_connect["\']?\s*\]?' |
|
if not re.search(tcp_module_pattern, out) or "tcp_connect_tls" in out: |
|
@@ -173,10 +245,14 @@ |
|
|
|
|
|
def prometheus_has_probe_metrics(): |
|
+ healthy, msg = check_deployment_health("prometheus", "observability") |
|
+ if not healthy: |
|
+ return False, msg |
|
+ |
|
port_forward("svc", "prometheus", "observability", 9090, 9090) |
|
|
|
- |
|
- for _ in range(9): |
|
+ |
|
+ for _ in range(6): |
|
code, out, _ = sh( |
|
"curl -s " |
|
"'http://localhost:9090/api/v1/query?query=probe_success'" |
|
@@ -191,7 +267,7 @@ |
|
|
|
|
|
def check_slo_burn_rate_alerts(): |
|
- """Verify alerts use SLO burn rate pattern.""" |
|
+ """Verify alerts implement proper multi-window SLO burn rate logic.""" |
|
code, out, _ = sh( |
|
"kubectl get configmap prometheus-config " |
|
"-n observability -o yaml" |
|
@@ -199,19 +275,31 @@ |
|
if code != 0: |
|
return False, "Prometheus config not readable" |
|
|
|
- # Check for burn rate pattern |
|
- has_burn_rate = ( |
|
- 'error budget' in out.lower() or |
|
- '14.4' in out or # Fast burn multiplier |
|
- '1 - avg_over_time' in out or |
|
- 'burn' in out.lower() |
|
+ # Must have multiple time windows in recording rules or alert expressions |
|
+ windows = re.findall( |
|
+ r"avg_over_time\([^)]*\[(\d+[mh])\]\)", |
|
+ out |
|
) |
|
+ if len(set(windows)) < 2: |
|
+ return False, ( |
|
+ "Burn rate alerts must use multiple time windows " |
|
+ "(e.g., 5m and 1h)" |
|
+ ) |
|
|
|
- if not has_burn_rate: |
|
- return False, "Alerts should use SLO burn rate pattern, not simple threshold" |
|
- |
|
- return True, "Alerts use proper SLO burn rate calculations" |
|
+ # Must have at least 2 distinct 'for:' durations across alert rules |
|
+ # (evidence of fast-burn vs slow-burn detection windows) |
|
+ for_durations = re.findall(r"for:\s*(\d+[smh])", out) |
|
+ unique_durations = {parse_duration(d) for d in for_durations} |
|
+ unique_durations.discard(timedelta(0)) |
|
+ |
|
+ if len(unique_durations) < 2: |
|
+ return False, ( |
|
+ "SLO burn rate alerting requires multiple detection windows " |
|
+ "(e.g., a fast-burn alert with 'for: 2m' and a slow-burn " |
|
+ "alert with 'for: 1h')" |
|
+ ) |
|
|
|
+ return True, "Valid multi-window SLO burn rate alerts detected" |
|
|
|
|
|
def prometheus_scrape_interval_valid(): |
|
@@ -219,11 +307,11 @@ |
|
"kubectl get configmap prometheus-config " |
|
"-n observability -o jsonpath='{.data.prometheus\\.yml}'" |
|
) |
|
- |
|
- |
|
+ |
|
+ |
|
if "global:" not in out: |
|
return False, "Prometheus config missing global section" |
|
- |
|
+ |
|
global_section = out.split("scrape_configs")[0] if "scrape_configs" in out else out |
|
if "scrape_interval: 15s" not in global_section and "scrape_interval: 10s" not in global_section: |
|
return False, "Global scrape_interval must be 10s or 15s" |
|
@@ -231,7 +319,6 @@ |
|
return True, "Scrape interval is appropriately configured" |
|
|
|
|
|
- |
|
def check_alert_for_duration(): |
|
"""Verify alerts have appropriate 'for' duration for timely detection.""" |
|
code, out, _ = sh( |
|
@@ -285,46 +372,48 @@ |
|
return True, "Alert has required annotations" |
|
|
|
|
|
-def check_scrape_interval(): |
|
- """Verify scrape interval supports ≤2 min detection time.""" |
|
+def check_recording_rules(): |
|
+ """Verify recording rules exist AND are used in alert expressions.""" |
|
code, out, _ = sh( |
|
"kubectl get configmap prometheus-config " |
|
- "-n observability -o jsonpath='{.data.prometheus\\.yml}'" |
|
+ "-n observability -o yaml" |
|
) |
|
if code != 0: |
|
return False, "Prometheus config not readable" |
|
|
|
- global_match = re.search(r'scrape_interval:\s*(\d+[smh])', out) |
|
- if not global_match: |
|
- return False, "Global scrape_interval not found" |
|
- |
|
- global_interval = global_match.group(1) |
|
- duration = parse_duration(global_interval) |
|
- |
|
- if duration > timedelta(seconds=30): |
|
- return False, f"Global scrape_interval {global_interval} too long for 2m detection" |
|
+ # Must define recording rules |
|
+ if "record:" not in out: |
|
+ return False, "Prometheus should define recording rules" |
|
+ |
|
+ # Extract recording rule names |
|
+ record_names = re.findall( |
|
+ r"record:\s*([a-zA-Z_:][a-zA-Z0-9_:]*)", |
|
+ out |
|
+ ) |
|
|
|
- return True, f"Scrape interval {global_interval} supports timely detection" |
|
+ if not record_names: |
|
+ return False, "No valid recording rule names found" |
|
|
|
+ # Multi-window availability requires at least 2 recording rules |
|
+ if len(record_names) < 2: |
|
+ return False, ( |
|
+ "Multiple recording rules needed for multi-window " |
|
+ "availability signals (e.g., 5m and 1h windows)" |
|
+ ) |
|
|
|
+ # At least 2 recording rules must be referenced in alert expressions |
|
+ alert_section = out[out.find("alert:"):] if "alert:" in out else out |
|
|
|
-def check_recording_rules(): |
|
- """Verify recording rules exist for availability metrics.""" |
|
- code, out, _ = sh( |
|
- "kubectl get configmap prometheus-config " |
|
- "-n observability -o yaml" |
|
- ) |
|
- if code != 0: |
|
- return False, "Prometheus config not readable" |
|
+ used_count = sum(1 for name in record_names if name in alert_section) |
|
|
|
- # Check for recording rule pattern |
|
- if 'record:' not in out: |
|
- return False, "Prometheus should have recording rules for availability metrics" |
|
+ if used_count < 2: |
|
+ return False, ( |
|
+ "At least 2 recording rules should be referenced in alert " |
|
+ "expressions for multi-window burn rate detection" |
|
+ ) |
|
|
|
- if 'probe:' not in out.lower() and 'availability' not in out.lower(): |
|
- return False, "Recording rules should compute availability metrics" |
|
+ return True, "Recording rules exist and are used in alerts" |
|
|
|
- return True, "Recording rules configured for availability metrics" |
|
|
|
def check_blackbox_modules(): |
|
"""Verify correct Blackbox modules used for each protocol.""" |
|
@@ -351,43 +440,58 @@ |
|
if 'http_2xx' not in argocd_section and 'http' in argocd_section: |
|
return False, "HTTP targets should use http_2xx module" |
|
|
|
- # --- NEW: HTTPS targets must explicitly configure TLS verification --- |
|
- if 'https://' in out_lower: |
|
- # Only enforce if HTTPS is actually being probed |
|
- has_tls_config = ( |
|
- 'tls_config' in out_lower or |
|
- 'insecure_skip_verify: false' in out_lower |
|
- ) |
|
- |
|
- if not has_tls_config: |
|
- return False, ( |
|
- "HTTPS targets should have explicit TLS verification " |
|
- "configuration (tls_config or insecure_skip_verify: false)" |
|
- ) |
|
- |
|
return True, "Blackbox modules correctly matched to target protocols" |
|
|
|
|
|
+def check_alert_severity_labels(): |
|
+ """Verify alerts define severity labels.""" |
|
+ code, out, _ = sh( |
|
+ "kubectl get configmap prometheus-config " |
|
+ "-n observability -o yaml" |
|
+ ) |
|
+ if code != 0: |
|
+ return False, "Prometheus config not readable" |
|
|
|
+ if "severity:" not in out: |
|
+ return False, ( |
|
+ "Alerts must define severity labels " |
|
+ "(critical or warning)" |
|
+ ) |
|
|
|
+ return True, "Alert severity labels present" |
|
|
|
|
|
+def check_dashboard_uses_recording_rules(): |
|
+ """Dashboard should reference recording rules instead of raw PromQL.""" |
|
+ code, dash_out, _ = sh( |
|
+ "kubectl get configmap grafana-dashboards " |
|
+ "-n observability -o yaml" |
|
+ ) |
|
+ if code != 0: |
|
+ return False, "grafana-dashboards ConfigMap not readable" |
|
|
|
+ # Extract actual recording rule names from prometheus-config |
|
+ code, prom_out, _ = sh( |
|
+ "kubectl get configmap prometheus-config " |
|
+ "-n observability -o yaml" |
|
+ ) |
|
|
|
+ if code == 0: |
|
+ record_names = re.findall( |
|
+ r"record:\s*([a-zA-Z_:][a-zA-Z0-9_:]*)", |
|
+ prom_out |
|
+ ) |
|
+ if record_names and any(name in dash_out for name in record_names): |
|
+ return True, "Dashboard references recording rules" |
|
|
|
+ # Fallback: accept any probe:*:* pattern (recording rule convention) |
|
+ if re.search(r"probe:[a-zA-Z_]+:[a-zA-Z0-9_]+", dash_out): |
|
+ return True, "Dashboard references recording rules" |
|
|
|
- |
|
- |
|
- |
|
- |
|
- |
|
- |
|
- |
|
- |
|
- |
|
- |
|
- |
|
- |
|
+ return False, ( |
|
+ "Dashboard should reference pre-computed recording rules " |
|
+ "(e.g., probe:availability:5m) instead of raw PromQL" |
|
+ ) |
|
|
|
|
|
def argocd_probe_success(): |
|
@@ -418,61 +522,19 @@ |
|
f"kubectl get deployment {name} -n {namespace} -o json" |
|
) |
|
if code != 0: |
|
- return False, f"Deployment '{name}' not found for image validation" |
|
+ return False, f"Deployment '{name}' not found" |
|
|
|
data = json.loads(out) |
|
containers = data["spec"]["template"]["spec"]["containers"] |
|
+ images = [c.get("image", "") for c in containers] |
|
|
|
- images = [c.get("image") for c in containers] |
|
+ actual = [normalize_image(i) for i in images] |
|
+ expected = normalize_image(expected_image) |
|
|
|
- if expected_image in images: |
|
+ if expected in actual: |
|
return True, f"Deployment '{name}' uses image '{expected_image}'" |
|
|
|
- return ( |
|
- False, |
|
- f"Deployment '{name}' does not use required image '{expected_image}' " |
|
- f"(found: {images})" |
|
- ) |
|
- |
|
-def deployment_uses_any_image(name, namespace, allowed_images): |
|
- code, out, _ = sh( |
|
- f"kubectl get deployment {name} -n {namespace} -o json" |
|
- ) |
|
- if code != 0: |
|
- return False, f"Deployment '{name}' not found for image validation" |
|
- |
|
- data = json.loads(out) |
|
- containers = data["spec"]["template"]["spec"]["containers"] |
|
- images = [c.get("image") for c in containers] |
|
- |
|
- for img in allowed_images: |
|
- if img in images: |
|
- return True, f"Deployment '{name}' uses allowed image '{img}'" |
|
- |
|
- return ( |
|
- False, |
|
- f"Deployment '{name}' does not use an allowed Prometheus image " |
|
- f"(found: {images})" |
|
- ) |
|
- |
|
- |
|
- |
|
- |
|
- |
|
- |
|
- |
|
- |
|
- |
|
- |
|
- |
|
- |
|
- |
|
- |
|
- |
|
- |
|
- |
|
- |
|
- |
|
+ return False, f"Expected {expected_image}, found {images}" |
|
|
|
|
|
def prometheus_blackbox_relabeling_present(): |
|
@@ -483,29 +545,34 @@ |
|
if code != 0: |
|
return False, "Prometheus config not readable" |
|
|
|
- |
|
- has_blackbox_addr = ("blackbox-exporter:9115" in out or |
|
+ |
|
+ has_blackbox_addr = ("blackbox-exporter:9115" in out or |
|
"blackbox-exporter.observability" in out) |
|
- |
|
+ |
|
required_snippets = [ |
|
"metrics_path: /probe", |
|
"__param_target", |
|
] |
|
|
|
missing = [s for s in required_snippets if s not in out] |
|
- |
|
+ |
|
if not has_blackbox_addr: |
|
missing.append("blackbox-exporter address") |
|
- |
|
+ |
|
if not missing: |
|
return True, "Prometheus blackbox relabeling is correctly configured" |
|
|
|
return False, f"Missing blackbox relabeling elements: {missing}" |
|
|
|
def prometheus_alert_fires_for_failing_probe(): |
|
+ for dep in ("prometheus", "blackbox-exporter"): |
|
+ healthy, msg = check_deployment_health(dep, "observability") |
|
+ if not healthy: |
|
+ return False, msg |
|
+ |
|
port_forward("svc", "prometheus", "observability", 9090, 9090) |
|
|
|
- for _ in range(9): # allow 2 minutes |
|
+ for _ in range(18): |
|
code, out, _ = sh( |
|
"curl -s http://localhost:9090/api/v1/alerts" |
|
) |
|
@@ -517,16 +584,20 @@ |
|
): |
|
return True, "SyntheticProbeFailure alert is firing" |
|
|
|
- time.sleep(7) |
|
+ time.sleep(8) |
|
|
|
return False, "SyntheticProbeFailure alert did not fire" |
|
|
|
|
|
def grafana_has_prometheus_datasource(): |
|
+ healthy, msg = check_deployment_health("grafana", "observability") |
|
+ if not healthy: |
|
+ return False, msg |
|
+ |
|
if not wait_for_grafana_api(): |
|
return False, "Grafana API not reachable" |
|
|
|
- for _ in range(9): |
|
+ for _ in range(6): |
|
code, out, _ = sh( |
|
"kubectl exec -n observability deploy/grafana -- " |
|
"wget -qO- --header='Authorization: Basic YWRtaW46YWRtaW4=' " |
|
@@ -542,10 +613,14 @@ |
|
|
|
|
|
def grafana_has_blackbox_dashboard(): |
|
+ healthy, msg = check_deployment_health("grafana", "observability") |
|
+ if not healthy: |
|
+ return False, msg |
|
+ |
|
if not wait_for_grafana_api(): |
|
return False, "Grafana API not reachable" |
|
|
|
- for _ in range(9): |
|
+ for _ in range(6): |
|
code, out, _ = sh( |
|
"kubectl exec -n observability deploy/grafana -- " |
|
"wget -qO- --header='Authorization: Basic YWRtaW46YWRtaW4=' " |
|
@@ -553,40 +628,14 @@ |
|
) |
|
|
|
|
|
- if code == 0 and "Synthetic" in out: |
|
+ if code == 0 and any(kw in out for kw in [ |
|
+ "Synthetic", "Blackbox", "Probe", "Endpoint" |
|
+ ]): |
|
return True, "Grafana dashboard for synthetic probes exists" |
|
time.sleep(5) |
|
|
|
return False, "Grafana dashboard missing" |
|
|
|
-def grafana_dashboard_uses_probe_success(): |
|
- ok, _ = configmap_contains( |
|
- "grafana-dashboards", |
|
- "observability", |
|
- ["probe_success"] |
|
- ) |
|
- |
|
- if ok: |
|
- return True, "Grafana dashboard visualizes probe_success metric" |
|
- |
|
- return False, "Grafana dashboard does not reference probe_success" |
|
- |
|
-def grafana_dashboard_uses_time_aggregation(): |
|
- code, out, _ = sh( |
|
- "kubectl get configmap grafana-dashboards " |
|
- "-n observability -o yaml" |
|
- ) |
|
- if code != 0: |
|
- return False, "grafana-dashboards ConfigMap not readable" |
|
- |
|
- agg_funcs = ["avg_over_time", "min_over_time", "last_over_time"] |
|
- if any(f in out for f in agg_funcs): |
|
- return True, "Grafana dashboard uses time-based probe evaluation" |
|
- |
|
- return False, ( |
|
- "Grafana dashboard does not use time-based aggregation " |
|
- "(expected avg_over_time / min_over_time)" |
|
- ) |
|
|
|
def prometheus_uses_pvc(): |
|
code, out, _ = sh( |
|
@@ -610,7 +659,7 @@ |
|
|
|
|
|
def alert_rule_identifies_endpoint(): |
|
- """Verify alert rule groups by instance/endpoint""" |
|
+ """Verify alert annotations reference the failing endpoint.""" |
|
code, out, _ = sh( |
|
"kubectl get configmap prometheus-config " |
|
"-n observability -o yaml" |
|
@@ -618,21 +667,19 @@ |
|
if code != 0: |
|
return False, "Prometheus config not readable" |
|
|
|
- grouping_patterns = [ |
|
- r'by\s*\(\s*instance\s*\)', |
|
- r'by\s*\(\s*target\s*\)', |
|
- r'by\s*\(\s*[^)]*instance[^)]*\)', |
|
- ] |
|
- |
|
- has_grouping = any(re.search(pattern, out) for pattern in grouping_patterns) |
|
- |
|
- |
|
- has_label_template = re.search(r'\{\{\s*\$labels\.(instance|target)\s*\}\}', out) |
|
- |
|
- if has_grouping or has_label_template: |
|
- return True, "Alert rule properly identifies per-endpoint failures" |
|
- |
|
- return False, "Alert must group by instance (use 'by (instance)' or reference {{ $labels.instance }})" |
|
+ # Alerts must reference the endpoint in annotations so operators |
|
+ # can identify which endpoint failed |
|
+ has_label_template = re.search( |
|
+ r'\{\{\s*\$labels\.(instance|target)\s*\}\}', out |
|
+ ) |
|
+ |
|
+ if has_label_template: |
|
+ return True, "Alert annotations identify the failing endpoint" |
|
+ |
|
+ return False, ( |
|
+ "Alert annotations must reference the failing endpoint " |
|
+ "(e.g., {{ $labels.instance }}) for operational use" |
|
+ ) |
|
|
|
|
|
def alert_has_minimum_duration(): |
|
@@ -645,24 +692,27 @@ |
|
return False, "Prometheus config not readable" |
|
|
|
duration_pattern = r'for:\s*([2-9]|[1-9]\d+)m' |
|
- |
|
+ |
|
if re.search(duration_pattern, out): |
|
- return True, "Alert rule has correct minimum duration (≥2m)" |
|
- |
|
- return False, "Alert rule must fire 'for: 2m' or longer (not immediate)" |
|
+ return True, "Alert rule has correct minimum duration (>=2m)" |
|
|
|
+ return False, "Alert rule must fire 'for: 2m' or longer (not immediate)" |
|
|
|
|
|
+def prometheus_alert_is_per_endpoint(): |
|
|
|
+ for dep in ("prometheus", "blackbox-exporter"): |
|
+ healthy, msg = check_deployment_health(dep, "observability") |
|
+ if not healthy: |
|
+ return False, msg |
|
|
|
-def prometheus_alert_is_per_endpoint(): |
|
for _ in range(18): |
|
code, out, _ = sh( |
|
"kubectl exec -n observability deploy/prometheus -- " |
|
"wget -qO- http://localhost:9090/api/v1/alerts" |
|
) |
|
if code != 0 or "SyntheticProbeFailure" not in out: |
|
- time.sleep(10) |
|
+ time.sleep(8) |
|
continue |
|
|
|
try: |
|
@@ -692,17 +742,11 @@ |
|
} |
|
|
|
has_failing = any("does-not-exist" in i for i in firing) |
|
- argocd_quiet = not any("argocd" in i for i in firing) |
|
|
|
- if has_failing and argocd_quiet: |
|
+ if has_failing: |
|
return True, ( |
|
"Alerts fire per endpoint (failing endpoint alerts " |
|
- "independently of healthy endpoints)" |
|
- ) |
|
- elif has_failing: |
|
- return False, ( |
|
- "ArgoCD endpoint is also firing — alerts may not " |
|
- "distinguish healthy from unhealthy endpoints" |
|
+ "independently with instance labels)" |
|
) |
|
|
|
time.sleep(8) |
|
@@ -728,37 +772,6 @@ |
|
return True, "Alerting correctly avoids exporter 'up' metric" |
|
|
|
|
|
- |
|
-def grafana_dashboard_uses_time_aggregation(): |
|
- """Verify dashboard queries use time aggregation for probe_success""" |
|
- code, out, _ = sh( |
|
- "kubectl get configmap grafana-dashboards " |
|
- "-n observability -o yaml" |
|
- ) |
|
- if code != 0: |
|
- return False, "grafana-dashboards ConfigMap not found" |
|
- |
|
- # Check for time aggregation functions |
|
- time_agg_functions = [ |
|
- 'avg_over_time', |
|
- 'min_over_time', |
|
- 'max_over_time', |
|
- 'last_over_time' |
|
- ] |
|
- |
|
- has_probe_success = 'probe_success' in out |
|
- has_time_agg = any(func in out for func in time_agg_functions) |
|
- |
|
- if has_probe_success and has_time_agg: |
|
- return True, "Dashboard uses time-aggregated probe metrics" |
|
- |
|
- if has_probe_success and not has_time_agg: |
|
- return False, "Dashboard must use time aggregation (avg_over_time, min_over_time, etc.) not raw probe_success" |
|
- |
|
- return False, "Dashboard does not query probe_success" |
|
- |
|
- |
|
- |
|
def check_endpoint_count(): |
|
"""Verify at least 3 probe targets are configured.""" |
|
code, out, _ = sh( |
|
@@ -784,27 +797,6 @@ |
|
return True, "All required endpoints configured" |
|
|
|
|
|
-def get_probe_targets(): |
|
- """Extract probe targets from Prometheus scrape config.""" |
|
- code, out, _ = sh( |
|
- "kubectl get configmap prometheus-config " |
|
- "-n observability -o jsonpath='{.data.prometheus\\.yml}'" |
|
- ) |
|
- if code != 0: |
|
- return [] |
|
- |
|
- targets = [] |
|
- target_matches = re.findall( |
|
- r'-\s*(https?://[^\s]+|[\w.-]+:\d+)', |
|
- out |
|
- ) |
|
- targets.extend(target_matches) |
|
- |
|
- return targets |
|
- |
|
- |
|
- |
|
- |
|
def check_grafana_dashboard_semantics(): |
|
"""Verify Grafana dashboard uses correct semantic patterns for synthetic monitoring.""" |
|
code, out, _ = sh( |
|
@@ -817,22 +809,32 @@ |
|
issues = [] |
|
|
|
# ------------------------------------------------------------------ |
|
- # Check 1: probe_success must be time-aggregated (not raw) |
|
+ # Check 1: dashboard must show availability data (not raw binary) |
|
# ------------------------------------------------------------------ |
|
- if "probe_success" in out: |
|
- has_time_agg = any(fn in out for fn in [ |
|
- "avg_over_time", |
|
- "min_over_time", |
|
- "max_over_time", |
|
- "sum_over_time", |
|
- ]) |
|
- if not has_time_agg: |
|
- issues.append( |
|
- "Dashboard uses raw probe_success without time aggregation " |
|
- "(expected avg_over_time / min_over_time / max_over_time)" |
|
- ) |
|
+ has_availability_metric = ( |
|
+ "probe_success" in out |
|
+ or re.search(r"probe:[a-zA-Z_]*availab", out) |
|
+ or re.search(r"probe:[a-zA-Z_]*success", out) |
|
+ ) |
|
+ |
|
+ if has_availability_metric: |
|
+ if "probe_success" in out: |
|
+ has_time_agg = any(fn in out for fn in [ |
|
+ "avg_over_time", |
|
+ "min_over_time", |
|
+ "max_over_time", |
|
+ "sum_over_time", |
|
+ ]) |
|
+ if not has_time_agg: |
|
+ issues.append( |
|
+ "Dashboard uses raw probe_success without time " |
|
+ "aggregation (expected avg_over_time or similar)" |
|
+ ) |
|
else: |
|
- issues.append("Dashboard does not query probe_success metric") |
|
+ issues.append( |
|
+ "Dashboard does not reference probe availability metrics " |
|
+ "(probe_success or a recording rule like probe:availability)" |
|
+ ) |
|
|
|
# ------------------------------------------------------------------ |
|
# Check 2: per-endpoint breakdown (instance / target) |
|
@@ -842,30 +844,38 @@ |
|
"by (target)", |
|
"$labels.instance", |
|
"$labels.target", |
|
+ "{{ instance }}", |
|
+ "{{instance}}", |
|
+ "{{ target }}", |
|
+ "{{target}}", |
|
]) |
|
|
|
if not has_grouping: |
|
issues.append( |
|
"Dashboard does not show per-endpoint breakdown " |
|
- "(missing by(instance) or target label usage)" |
|
+ "(missing by(instance), legendFormat with {{ instance }}, " |
|
+ "or target label usage)" |
|
) |
|
|
|
# ------------------------------------------------------------------ |
|
- # Check 3: availability expressed as percentage |
|
+ # Check 3: availability not shown as raw binary signal |
|
# ------------------------------------------------------------------ |
|
- has_percentage = ( |
|
- "probe_success" in out and |
|
- any(x in out for x in [ |
|
- "* 100", |
|
- "*100", |
|
- "100 *", |
|
+ has_normalized = ( |
|
+ # Percentage form (e.g., * 100) |
|
+ any(x in out for x in ["* 100", "*100", "100 *"]) |
|
+ # Or [0,1] normalized via time aggregation (avg_over_time already |
|
+ # produces a continuous availability ratio, not binary) |
|
+ or any(fn in out for fn in [ |
|
+ "avg_over_time", "min_over_time", "max_over_time", |
|
]) |
|
+ # Or uses a recording rule that pre-computes availability |
|
+ or re.search(r"probe:[a-zA-Z_]+:", out) |
|
) |
|
|
|
- if not has_percentage: |
|
+ if not has_normalized: |
|
issues.append( |
|
- "Dashboard should express availability as a percentage " |
|
- "(e.g. avg_over_time(probe_success[5m]) * 100)" |
|
+ "Dashboard should represent availability as a normalized " |
|
+ "measure (e.g., avg_over_time for ratio or * 100 for percentage)" |
|
) |
|
|
|
# ------------------------------------------------------------------ |
|
@@ -898,29 +908,6 @@ |
|
) |
|
|
|
|
|
- |
|
- |
|
- |
|
- |
|
- |
|
- |
|
- |
|
- |
|
- |
|
- |
|
- |
|
- |
|
- |
|
- |
|
- |
|
- |
|
- |
|
- |
|
- |
|
- |
|
- |
|
- |
|
- |
|
def grade(transcript: str) -> GradingResult: |
|
feedback = [] |
|
|
|
@@ -928,16 +915,47 @@ |
|
# Gate checks (non-scored) |
|
# ------------------------ |
|
gate_checks = [ |
|
+ # Resource existence |
|
lambda: deployment_exists("blackbox-exporter", "observability"), |
|
lambda: service_exists("blackbox-exporter", "observability", 9115), |
|
lambda: configmap_exists("blackbox-config", "observability"), |
|
- |
|
lambda: deployment_exists("prometheus", "observability"), |
|
lambda: configmap_exists("prometheus-config", "observability"), |
|
lambda: pvc_exists("prometheus-data", "observability"), |
|
- |
|
blackbox_config_has_required_modules, |
|
prometheus_scrape_interval_valid, |
|
+ |
|
+ # Image correctness |
|
+ lambda: deployment_uses_image( |
|
+ "blackbox-exporter", "observability", |
|
+ "prom/blackbox-exporter:v0.25.0", |
|
+ ), |
|
+ lambda: deployment_uses_image( |
|
+ "prometheus", "observability", |
|
+ "prom/prometheus:v3.8.1", |
|
+ ), |
|
+ lambda: deployment_uses_image( |
|
+ "grafana", "observability", |
|
+ "grafana/grafana:12.2", |
|
+ ), |
|
+ |
|
+ # Core operational |
|
+ blackbox_metrics_exposed, |
|
+ prometheus_has_probe_metrics, |
|
+ prometheus_blackbox_relabeling_present, |
|
+ argocd_probe_success, |
|
+ kubernetes_api_tcp_probe_configured, |
|
+ prometheus_uses_pvc, |
|
+ |
|
+ # Basic config quality |
|
+ grafana_has_prometheus_datasource, |
|
+ check_alert_severity_labels, |
|
+ check_alert_annotations, |
|
+ check_alert_for_duration, |
|
+ alert_has_minimum_duration, |
|
+ prometheus_does_not_use_up_metric, |
|
+ check_slo_burn_rate_alerts, |
|
+ check_blackbox_modules, |
|
] |
|
|
|
for fn in gate_checks: |
|
@@ -953,57 +971,14 @@ |
|
# Scored checks (partial) |
|
# ------------------------ |
|
scored_checks = { |
|
- # Image correct |
|
- "blackbox_image_correct": lambda: deployment_uses_image( |
|
- "blackbox-exporter", |
|
- "observability", |
|
- "prom/blackbox-exporter:v0.25.0", |
|
- ), |
|
- "prometheus_image_correct": lambda: deployment_uses_any_image( |
|
- "prometheus", |
|
- "observability", |
|
- ["prom/prometheus:v3.8.1"], |
|
- ), |
|
- "grafana_image_correct": lambda: deployment_uses_image( |
|
- "grafana", |
|
- "observability", |
|
- "grafana/grafana:12.2" |
|
- ), |
|
- |
|
- # Core functionality |
|
- "blackbox_operational": blackbox_metrics_exposed, |
|
- "prometheus_operational": prometheus_has_probe_metrics, |
|
- "blackbox_scraped_by_prometheus": prometheus_blackbox_relabeling_present, |
|
- "argocd_probe_working": argocd_probe_success, |
|
- "kubernetes_api_probed": kubernetes_api_tcp_probe_configured, |
|
- "storage_persistent": prometheus_uses_pvc, |
|
- |
|
- # Visualization |
|
- "grafana_datasource": grafana_has_prometheus_datasource, |
|
"grafana_dashboard_present": grafana_has_blackbox_dashboard, |
|
- |
|
- |
|
"endpoint_count": check_endpoint_count, |
|
"grafana_dashboard_semantics": check_grafana_dashboard_semantics, |
|
- |
|
- # Alerting |
|
"failing_in_alert": prometheus_alert_fires_for_failing_probe, |
|
"alert_identifies_endpoint": alert_rule_identifies_endpoint, |
|
"per_endpoint": prometheus_alert_is_per_endpoint, |
|
- "time_aggregate": grafana_dashboard_uses_time_aggregation, |
|
- |
|
- "blackbox_modules_correct": check_blackbox_modules, |
|
- "scrape_interval_valid": check_scrape_interval, |
|
- "alert_has_annotations": check_alert_annotations, |
|
- "alert_for_duration_valid": check_alert_for_duration, |
|
- |
|
- "slo_rate": check_slo_burn_rate_alerts, |
|
"records": check_recording_rules, |
|
- |
|
- |
|
- "alert_duration_correct": alert_has_minimum_duration, |
|
- |
|
- "up_metric": prometheus_does_not_use_up_metric, |
|
+ "recording_rules": check_dashboard_uses_recording_rules, |
|
} |
|
|
|
subscores = {} |
|
@@ -1018,7 +993,7 @@ |
|
subscores[key] = 1.0 if ok else 0.0 |
|
feedback.append(("✓ " if ok else "✗ ") + msg) |
|
|
|
- # |
|
+ # |
|
total_checks = len(scored_checks) |
|
weight = 1.0 / total_checks |
|
weights = {k: weight for k in scored_checks} |
|
diff --color -ruN '--exclude=__pycache__' '--exclude=.apex_metadata.json' ./setup.sh /tmp/reconciled-synthetic-endpoint-monitoring/setup.sh |
|
--- ./setup.sh 2026-02-19 06:54:41.199610344 -0700 |
|
+++ /tmp/reconciled-synthetic-endpoint-monitoring/setup.sh 2026-02-19 14:13:41.075296397 -0700 |
|
@@ -57,48 +57,112 @@ |
|
name: ubuntu-user-configmap-editor |
|
EOF |
|
|
|
- |
|
- |
|
- |
|
- |
|
-# Create wiki page with service information |
|
+# ------------------------------------------------------------------------------ |
|
+# Create Gitea repo |
|
+# ------------------------------------------------------------------------------ |
|
kubectl exec -n gitea deploy/gitea -- sh -c ' |
|
curl -s -X POST \ |
|
-u "root:Admin@123456" \ |
|
-H "Content-Type: application/json" \ |
|
-d "{ |
|
- \"title\": \"Platform Services - Monitoring Requirements\", |
|
- \"content\": \"# Platform Services - Monitoring Requirements\n\n## Critical Platform Services\n\n### ArgoCD GitOps Controller\n| Property | Value |\n|----------|-------|\n| Ingress | `argocd.devops.local` |\n| Protocol | HTTP |\n| Criticality | High |\n\n### Kubernetes API Server\n| Property | Value |\n|----------|-------|\n| Access | `kubernetes.default.svc:443` |\n| Protocol | TCP |\n| Criticality | Critical |\n\n## Alerting Validation\n\nConfigure a probe against a known-unreachable endpoint to verify alerting:\n\n http://does-not-exist.devops.local\n\nAlerts should identify the specific failing endpoint (not aggregate into one global alert).\n\n## Air-Gapped Environment\n\nContainer images are preloaded in the air-gapped environment.\nAgents must discover available versions using standard container tooling.\" |
|
+ \"name\": \"platform-docs\", |
|
+ \"private\": false, |
|
+ \"auto_init\": true |
|
}" \ |
|
- http://localhost:3000/api/v1/repos/root/platform-docs/wiki/new |
|
-' 2>/dev/null && echo "[SETUP] Platform documentation created" |
|
+ http://localhost:3000/api/v1/user/repos |
|
+' |
|
|
|
+# ------------------------------------------------------------------------------ |
|
+# Wiki creation — commit directly to bare git repo (bypasses API timing issues) |
|
+# ------------------------------------------------------------------------------ |
|
+kubectl exec -n gitea deploy/gitea -- sh -c ' |
|
+set -e |
|
|
|
+WIKI_REPO=/data/git/repositories/root/platform-docs.wiki.git |
|
|
|
+# Ensure bare wiki repo exists (Gitea does NOT auto-create it) |
|
+if [ ! -d "$WIKI_REPO" ]; then |
|
+ git init --bare "$WIKI_REPO" >/dev/null 2>&1 |
|
+fi |
|
|
|
+# Commit wiki content into the bare repo by force-pushing from a freshly initialized temp working repo (not a clone) |
|
+TMPDIR=$(mktemp -d) |
|
+cd "$TMPDIR" |
|
+git init >/dev/null 2>&1 |
|
+git config user.email "root@gitea.local" |
|
+git config user.name "root" |
|
|
|
+cat > "Platform-Services---Monitoring-Requirements.md" <<WIKIEOF |
|
+# Platform Services - Monitoring Requirements |
|
|
|
+## Critical Platform Services |
|
|
|
+### ArgoCD GitOps Controller |
|
+| Property | Value | |
|
+|----------|-------| |
|
+| Ingress | \`argocd.devops.local\` | |
|
+| Protocol | HTTP | |
|
+| Criticality | High | |
|
|
|
+### Kubernetes API Server |
|
+| Property | Value | |
|
+|----------|-------| |
|
+| Access | \`kubernetes.default.svc:443\` | |
|
+| Protocol | TCP | |
|
+| Criticality | Critical | |
|
|
|
+## Alerting Validation |
|
|
|
+Configure a probe against a known-unreachable endpoint to verify alerting: |
|
|
|
+ http://does-not-exist.devops.local |
|
|
|
+Alerts should identify the specific failing endpoint (not aggregate into one global alert). |
|
|
|
+## Preloaded Container Images |
|
|
|
+This cluster operates in air-gapped mode. The following images are available: |
|
|
|
+| Component | Image | |
|
+|-----------|-------| |
|
+| Prometheus | \`prom/prometheus:v3.8.1\` | |
|
+| Blackbox Exporter | \`prom/blackbox-exporter:v0.25.0\` | |
|
+| Grafana | \`grafana/grafana:12.2\` | |
|
|
|
+Use these exact image references in your deployments. No other versions are available. |
|
+WIKIEOF |
|
|
|
+git add . >/dev/null 2>&1 |
|
+git commit -m "Initial wiki content" >/dev/null 2>&1 |
|
+git remote add origin "$WIKI_REPO" |
|
+git push -f origin master >/dev/null 2>&1 |
|
|
|
+cd / |
|
+rm -rf "$TMPDIR" |
|
+chown -R git:git "$WIKI_REPO" |
|
+' && echo "[SETUP] Platform documentation created" \ |
|
+ || echo "[SETUP] WARN: Wiki creation failed" |
|
+ |
|
+# Poke the Gitea wiki API to force indexing of the git-committed content |
|
+kubectl exec -n gitea deploy/gitea -- sh -c ' |
|
+for i in 1 2 3; do |
|
+ HTTP_CODE=$(curl -s -o /dev/null -w "%{http_code}" \ |
|
+ -u "root:Admin@123456" \ |
|
+ "http://localhost:3000/api/v1/repos/root/platform-docs/wiki/pages") |
|
+ if [ "$HTTP_CODE" = "200" ]; then |
|
+ break |
|
+ fi |
|
+ sleep 2 |
|
+done |
|
+' 2>/dev/null || true |
|
|
|
|
|
|
|
-echo "[SETUP] Importing pre-cached MinIO and MariaDB images..." |
|
+echo "[SETUP] Importing pre-cached container images..." |
|
|
|
PROMETHEUS_TAR="/workdir/prometheus-v3.8.1.tar" |
|
BLACKBOX_TAR="/workdir/blackbox-exporter-v0.25.0.tar" |
|
-GRAFANA_TAR="//workdir/grafana-12.2.tar" |
|
- |
|
+GRAFANA_TAR="/workdir/grafana-12.2.tar" |
|
|
|
if [ ! -f "$GRAFANA_TAR" ]; then |
|
echo "Error: Pre-cached Grafana image not found at $GRAFANA_TAR" |
|
@@ -109,9 +173,6 @@ |
|
--namespace k8s.io \ |
|
images import --no-unpack "$GRAFANA_TAR" |
|
|
|
-# ------------------------ |
|
-# Verify Grafana |
|
-# ------------------------ |
|
ctr --address /run/k3s/containerd/containerd.sock \ |
|
--namespace k8s.io \ |
|
images list | grep -q "grafana/grafana:12.2" || { |
|
@@ -119,12 +180,6 @@ |
|
exit 1 |
|
} |
|
|
|
- |
|
- |
|
- |
|
-# ------------------------ |
|
-# Import Prometheus image |
|
-# ------------------------ |
|
if [ ! -f "$PROMETHEUS_TAR" ]; then |
|
echo "Error: Pre-cached Prometheus image not found at $PROMETHEUS_TAR" |
|
exit 1 |
|
@@ -134,9 +189,6 @@ |
|
--namespace k8s.io \ |
|
images import "$PROMETHEUS_TAR" |
|
|
|
-# ------------------------ |
|
-# Import Blackbox image |
|
-# ------------------------ |
|
if [ ! -f "$BLACKBOX_TAR" ]; then |
|
echo "Error: Pre-cached Blackbox image not found at $BLACKBOX_TAR" |
|
exit 1 |
|
@@ -145,11 +197,7 @@ |
|
ctr --address /run/k3s/containerd/containerd.sock \ |
|
--namespace k8s.io \ |
|
images import --no-unpack "$BLACKBOX_TAR" |
|
-echo "[SETUP] Verifying Prometheus and Blackbox image imports..." |
|
|
|
-# ------------------------ |
|
-# Verify Prometheus |
|
-# ------------------------ |
|
ctr --address /run/k3s/containerd/containerd.sock \ |
|
--namespace k8s.io \ |
|
images list | grep -q "prom/prometheus:v3.8.1" || { |
|
@@ -157,9 +205,6 @@ |
|
exit 1 |
|
} |
|
|
|
-# ------------------------ |
|
-# Verify Blackbox |
|
-# ------------------------ |
|
ctr --address /run/k3s/containerd/containerd.sock \ |
|
--namespace k8s.io \ |
|
images list | grep -q "prom/blackbox-exporter:v0.25.0" || { |
|
@@ -167,11 +212,44 @@ |
|
exit 1 |
|
} |
|
|
|
-# Remove tar files to prevent version discovery |
|
-echo "[SETUP] Cleaning up image artifacts..." |
|
+CTR="ctr --address /run/k3s/containerd/containerd.sock --namespace k8s.io" |
|
+$CTR images tag docker.io/prom/prometheus:v3.8.1 prom/prometheus:v3.8.1 |
|
+$CTR images tag docker.io/prom/blackbox-exporter:v0.25.0 prom/blackbox-exporter:v0.25.0 |
|
+$CTR images tag docker.io/grafana/grafana:12.2 grafana/grafana:12.2 |
|
+ |
|
+ALLOWED_IMAGES=( |
|
+ "prom/prometheus:v3.8.1" |
|
+ "docker.io/prom/prometheus:v3.8.1" |
|
+ "prom/blackbox-exporter:v0.25.0" |
|
+ "docker.io/prom/blackbox-exporter:v0.25.0" |
|
+ "grafana/grafana:12.2" |
|
+ "docker.io/grafana/grafana:12.2" |
|
+) |
|
+ |
|
+is_allowed() { |
|
+ for allowed in "${ALLOWED_IMAGES[@]}"; do |
|
+ [[ "$1" == "$allowed" ]] && return 0 |
|
+ done |
|
+ return 1 |
|
+} |
|
+ |
|
+$CTR images list -q | while read -r image; do |
|
+ case "$image" in |
|
+ *prometheus*|*blackbox-exporter*|*grafana*) |
|
+ is_allowed "$image" || $CTR images remove "$image" 2>/dev/null || true |
|
+ ;; |
|
+ esac |
|
+done |
|
+ |
|
+ctr --address /run/k3s/containerd/containerd.sock \ |
|
+ --namespace k8s.io \ |
|
+ images remove docker.io/prom/prometheus:v2.54.1 2>/dev/null || true |
|
+ |
|
+ctr --address /run/k3s/containerd/containerd.sock \ |
|
+ --namespace k8s.io \ |
|
+ images remove docker.io/grafana/grafana:11.3.0 2>/dev/null || true |
|
+ |
|
rm -f "$PROMETHEUS_TAR" "$BLACKBOX_TAR" |
|
rm -rf /workdir/*.tar 2>/dev/null || true |
|
|
|
- |
|
echo "[SETUP] Prometheus and Blackbox images imported successfully" |
|
- |
|
diff --color -ruN '--exclude=__pycache__' '--exclude=.apex_metadata.json' ./solution.sh /tmp/reconciled-synthetic-endpoint-monitoring/solution.sh |
|
--- ./solution.sh 2026-02-19 06:54:41.199837605 -0700 |
|
+++ /tmp/reconciled-synthetic-endpoint-monitoring/solution.sh 2026-02-19 14:14:16.943105289 -0700 |
|
@@ -32,7 +32,8 @@ |
|
valid_http_versions: ["HTTP/1.1", "HTTP/2"] |
|
valid_status_codes: [] |
|
method: GET |
|
- |
|
+ |
|
+ |
|
tcp_connect: |
|
prober: tcp |
|
timeout: 5s |
|
@@ -127,12 +128,10 @@ |
|
prometheus.yml: | |
|
global: |
|
scrape_interval: 15s |
|
+ |
|
rule_files: |
|
- /etc/prometheus/rules/*.yml |
|
- remote_write: |
|
- - url: "https://url/insert/0/prometheus/api/v1/write" |
|
- tls_config: |
|
- insecure_skip_verify: true |
|
+ |
|
scrape_configs: |
|
- job_name: blackbox |
|
metrics_path: /probe |
|
@@ -149,6 +148,7 @@ |
|
target_label: instance |
|
- target_label: __address__ |
|
replacement: blackbox-exporter:9115 |
|
+ |
|
- job_name: blackbox-kubernetes-api |
|
metrics_path: /probe |
|
params: |
|
@@ -163,12 +163,21 @@ |
|
target_label: instance |
|
- target_label: __address__ |
|
replacement: blackbox-exporter:9115 |
|
+ |
|
- job_name: blackbox-exporter |
|
static_configs: |
|
- targets: |
|
- blackbox-exporter:9115 |
|
+ |
|
+ - job_name: prometheus |
|
+ static_configs: |
|
+ - targets: |
|
+ - localhost:9090 |
|
+ |
|
+ |
|
rules.yml: | |
|
groups: |
|
+ |
|
# -------------------------------------------------- |
|
# Recording rules |
|
# -------------------------------------------------- |
|
@@ -176,8 +185,10 @@ |
|
rules: |
|
- record: probe:availability:5m |
|
expr: avg_over_time(probe_success[5m]) |
|
+ |
|
- record: probe:availability:1h |
|
expr: avg_over_time(probe_success[1h]) |
|
+ |
|
- record: probe:latency_p99:5m |
|
expr: | |
|
histogram_quantile( |
|
@@ -185,6 +196,7 @@ |
|
sum(rate(probe_duration_seconds_bucket[5m])) |
|
by (le, instance) |
|
) |
|
+ |
|
# -------------------------------------------------- |
|
# Compatibility alert (legacy graders depend on this) |
|
# -------------------------------------------------- |
|
@@ -198,6 +210,7 @@ |
|
annotations: |
|
summary: "Synthetic probe failed" |
|
description: "Endpoint {{ $labels.instance }} is unreachable" |
|
+ |
|
# -------------------------------------------------- |
|
# SLO burn-rate alerts |
|
# -------------------------------------------------- |
|
@@ -205,20 +218,17 @@ |
|
rules: |
|
- alert: SyntheticProbeHighBurnRate |
|
expr: | |
|
- ( |
|
- 1 - avg_over_time(probe_success{job="blackbox"}[5m]) |
|
- ) / (1 - 0.99) > 14.4 |
|
+ (1 - probe:availability:5m{job="blackbox"}) / (1 - 0.99) > 14.4 |
|
for: 2m |
|
labels: |
|
severity: critical |
|
annotations: |
|
summary: "High synthetic availability burn rate" |
|
description: "High error budget burn rate for {{ $labels.instance }}" |
|
+ |
|
- alert: SyntheticProbeLowBurnRate |
|
expr: | |
|
- ( |
|
- 1 - avg_over_time(probe_success{job="blackbox"}[6h]) |
|
- ) / (1 - 0.99) > 1 |
|
+ (1 - probe:availability:1h{job="blackbox"}) / (1 - 0.99) > 1 |
|
for: 1h |
|
labels: |
|
severity: warning |
|
@@ -230,7 +240,6 @@ |
|
|
|
|
|
|
|
- |
|
echo "" |
|
echo ">>> Deploying Prometheus" |
|
echo "" |
|
@@ -260,18 +269,19 @@ |
|
image: prom/prometheus:v3.8.1 |
|
args: |
|
- "--config.file=/etc/prometheus/prometheus.yml" |
|
- |
|
+ |
|
ports: |
|
- containerPort: 9090 |
|
volumeMounts: |
|
- name: config-volume |
|
mountPath: /etc/prometheus/prometheus.yml |
|
subPath: prometheus.yml |
|
- |
|
+ |
|
- name: data-volume |
|
mountPath: /prometheus |
|
- name: rules-volume |
|
mountPath: /etc/prometheus/rules |
|
+ |
|
volumes: |
|
- name: config-volume |
|
configMap: |
|
@@ -285,6 +295,7 @@ |
|
items: |
|
- key: rules.yml |
|
path: rules.yml |
|
+ |
|
EOF |
|
|
|
|
|
@@ -344,11 +355,8 @@ |
|
|
|
|
|
|
|
- |
|
- |
|
- |
|
- |
|
cat <<EOF | kubectl apply -f - |
|
+ |
|
apiVersion: v1 |
|
kind: ConfigMap |
|
metadata: |
|
@@ -365,7 +373,7 @@ |
|
"title": "Probe Availability (%)", |
|
"targets": [ |
|
{ |
|
- "expr": "avg_over_time(probe_success[5m]) by (instance) * 100", |
|
+ "expr": "probe:availability:5m * 100", |
|
"legendFormat": "{{ instance }}", |
|
"refId": "A" |
|
} |
|
@@ -389,7 +397,6 @@ |
|
|
|
|
|
|
|
- |
|
cat <<EOF | kubectl apply -f - |
|
apiVersion: apps/v1 |
|
kind: Deployment |
|
@@ -414,19 +421,24 @@ |
|
volumeMounts: |
|
- name: datasources |
|
mountPath: /etc/grafana/provisioning/datasources |
|
- # 👇 Provider config goes here |
|
+ |
|
+ # Provider config |
|
- name: dashboard-provider |
|
mountPath: /etc/grafana/provisioning/dashboards |
|
- # 👇 Actual JSON dashboards go here |
|
+ |
|
+ # Actual JSON dashboards |
|
- name: dashboard-json |
|
mountPath: /var/lib/grafana/dashboards |
|
+ |
|
volumes: |
|
- name: datasources |
|
configMap: |
|
name: grafana-datasources |
|
+ |
|
- name: dashboard-provider |
|
configMap: |
|
name: grafana-dashboard-provider |
|
+ |
|
- name: dashboard-json |
|
configMap: |
|
name: grafana-dashboards |
|
@@ -447,15 +459,6 @@ |
|
|
|
|
|
|
|
- |
|
- |
|
- |
|
- |
|
- |
|
- |
|
- |
|
- |
|
- |
|
|
|
echo "" |
|
echo ">>> Waiting for deployments to become ready" |
|
diff --color -ruN '--exclude=__pycache__' '--exclude=.apex_metadata.json' ./task.yaml /tmp/reconciled-synthetic-endpoint-monitoring/task.yaml |
|
--- ./task.yaml 2026-02-19 06:54:41.200041434 -0700 |
|
+++ /tmp/reconciled-synthetic-endpoint-monitoring/task.yaml 2026-02-19 14:13:01.559405266 -0700 |
|
@@ -13,9 +13,10 @@ |
|
independent of application instrumentation. |
|
|
|
[Environment Notes] |
|
- The cluster operates in an air-gapped mode. |
|
+ The cluster operates in air-gapped mode. |
|
Required container images are already preloaded into the runtime environment and must |
|
- be used as-is without pulling from external registries. |
|
+ be used as-is without pulling from external registries. Available image versions |
|
+ are documented in the platform wiki. |
|
|
|
[Task] |
|
Implement synthetic endpoint monitoring in the `observability` namespace. |
|
@@ -34,7 +35,7 @@ |
|
|
|
The Nebula internal developer wiki documents commonly exposed platform services |
|
and their ingress hostnames: |
|
- http://gitea.devops.local:3000/root/bleater-app/wiki |
|
+ http://gitea.devops.local/root/platform-docs/wiki/ |
|
|
|
The solution must actively probe internal service endpoints that represent |
|
real client access patterns. Check the Gitea wiki for information about |
|
@@ -42,58 +43,46 @@ |
|
|
|
Probe results must be exposed as metrics and be consumable by the monitoring system. |
|
|
|
- The blackbox exporter configuration must support multiple probe types, |
|
- including both HTTP and TCP probes, to enable comprehensive endpoint monitoring. |
|
+ [Blackbox Exporter Requirements] |
|
+ The blackbox exporter must support multiple probe types: |
|
+ - HTTP probes for application-layer availability |
|
+ - TCP probes for transport-layer connectivity |
|
+ |
|
+ Probe behavior must follow protocol-appropriate validation: |
|
+ - Application-layer probes must validate protocol correctness |
|
+ - Transport-layer probes must validate connectivity only |
|
+ - HTTP probe modules must preserve default TLS verification behavior |
|
+ |
|
+ The Kubernetes API server health must be monitored using TCP connectivity probes |
|
+ against its standard in-cluster service name. This check must validate only |
|
+ basic transport-layer reachability (no TLS or HTTP validation). |
|
|
|
- The global `scrape_interval` must be set to 15s or 10s. Default scrape intervals |
|
- should not be relied on. |
|
- |
|
- The Kubernetes API server health must be monitored using TCP connectivity probes. |
|
- The API server is accessible via its standard in-cluster service name. |
|
- |
|
- Probe configurations must reflect protocol-appropriate validation behavior. |
|
- |
|
- - Application-layer probes must validate protocol correctness |
|
- - Transport-layer probes must validate connectivity only |
|
- - Security-related protocol behavior must not be implicitly disabled |
|
- |
|
- The blackbox exporter must be configured to support HTTP availability checks |
|
- and TCP connectivity tests. Module names should follow blackbox exporter |
|
- conventions documented in the container image. |
|
- |
|
- Exporter-level metrics such as `up` must not be used as a substitute for |
|
+ Exporter-level metrics (e.g., `up`) must not be used as a substitute for |
|
synthetic probe result metrics. |
|
|
|
- The global `scrape_interval` must be set to 15s or 10s. Default scrape intervals should not be relied on. |
|
- |
|
- Additionally, Kubernetes API server health must be monitored using |
|
- TCP connectivity probes. |
|
- |
|
- The health check must validate basic transport-layer reachability only, |
|
- without performing TLS or HTTP-level validation. |
|
- |
|
- The health check must validate basic network reachability at the transport |
|
- layer only. |
|
- |
|
- Both endpoints must be configured as Prometheus scrape targets with appropriate |
|
- relabeling for the blackbox exporter. |
|
- |
|
- Prometheus configuration should be structured to support efficient querying |
|
- and reuse of commonly evaluated availability and performance signals. |
|
- |
|
- Repeated or computationally expensive expressions should not be evaluated |
|
- directly at query time. |
|
- |
|
+ [Prometheus Requirements] |
|
+ The global `scrape_interval` must be set to **15s or 10s**. |
|
|
|
+ Prometheus must be configured to scrape blackbox probe targets using appropriate |
|
+ relabeling. Configuration should support efficient querying and reuse of commonly |
|
+ evaluated availability signals. Repeated or computationally expensive expressions |
|
+ should not be evaluated directly at query time. |
|
|
|
[Alerting Requirements] |
|
- Prometheus must define alerting rules for synthetic probe health that reflect |
|
- both short-term and long-term availability degradation. |
|
+ Prometheus must define alerting rules based on synthetic probe result metrics. |
|
+ |
|
+ At least one alert must be named **SyntheticProbeFailure** and represent |
|
+ endpoint-level availability failure detected via synthetic probes. |
|
|
|
- Alerting must: |
|
- - Detect rapid availability loss that would quickly exhaust acceptable downtime |
|
- - Detect sustained availability degradation over extended periods |
|
- - Avoid relying solely on instantaneous probe failures or fixed thresholds |
|
+ Alerts detecting sustained availability degradation should be based on |
|
+ SLO-style burn rate concepts (e.g., evaluating error budget consumption |
|
+ over time rather than fixed thresholds). |
|
+ |
|
+ Alerts must: |
|
+ - Detect rapid availability loss |
|
+ - Detect sustained availability degradation over longer periods |
|
+ - Distinguish failures on a per-endpoint basis |
|
+ - Avoid relying solely on instantaneous probe failures or fixed thresholds |
|
|
|
Alerting rules must be derived from probe result metrics and must distinguish |
|
failures on a per-endpoint basis. |
|
@@ -101,29 +90,24 @@ |
|
[Visualization Requirements] |
|
The monitoring stack must include a visualization layer for synthetic probe results. |
|
|
|
- - A ConfigMap named 'grafana-dashboards' for dashboard definitions. |
|
+ Required resource: |
|
+ - ConfigMap `grafana-dashboards` for dashboard definitions |
|
|
|
A visualization service must be deployed in the `observability` namespace |
|
and configured to consume metrics directly from Prometheus without manual |
|
configuration through a web interface. |
|
|
|
- The dashboard must represent probe availability over time and not rely |
|
- on instantaneous samples alone. |
|
- |
|
- Dashboards must present synthetic probe results in a form suitable for service-level assessment rather than raw signal inspection. |
|
- |
|
The visualization must include at least one dashboard that presents the |
|
availability status of synthetic probes on a per-endpoint basis. |
|
|
|
- Dashboard and data source configuration must be reproducible and stored |
|
- declaratively as Kubernetes resources. |
|
- |
|
- Alerting must distinguish failures on a per-endpoint basis and must not |
|
- collapse multiple probe targets into a single global alert. |
|
+ Dashboards must present synthetic probe results in a form suitable for |
|
+ service-level assessment rather than raw signal inspection. |
|
|
|
Visualizations must: |
|
- - Represent availability as a normalized measure over time |
|
- - Allow comparison across individual endpoints |
|
- - Include at least one indicator related to request or probe responsiveness |
|
+ - Represent availability as a normalized measure over time |
|
+ - Allow comparison across individual endpoints |
|
+ - Include at least one indicator related to request or probe responsiveness |
|
+ - Not rely solely on binary success/failure signals |
|
|
|
- Dashboards must not rely solely on binary success/failure signals. |
|
+ Dashboard and data source configuration must be reproducible and stored |
|
+ declaratively as Kubernetes resources. |