Last active
February 19, 2026 20:39
-
-
Save arubis/8184d51d1ae05205933d686d43af323a to your computer and use it in GitHub Desktop.
synthetic-endpoint-monitoring task (local review version, post-v44 patches)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Base image: internal DevOps toolbox (kubectl, k3s, docker tooling preinstalled).
FROM us-central1-docker.pkg.dev/bespokelabs/nebula-devops-registry/nebula-devops:1.0.2
# World-writable work/data dirs so unprivileged task processes can write.
RUN mkdir -p /workdir /data && chmod -R 777 /workdir /data
# Install crane (image copy tool) straight from the release tarball.
RUN curl -sL https://github.com/google/go-containerregistry/releases/download/v0.19.0/go-containerregistry_Linux_x86_64.tar.gz \
    | tar -xzf - -C /usr/local/bin crane
ENV ALLOWED_NAMESPACES="observability"
# Pre-pull the monitoring-stack images as tarballs so the task environment
# does not need registry access at runtime.
RUN crane pull prom/prometheus:v3.8.1 /workdir/prometheus-v3.8.1.tar
RUN crane pull prom/blackbox-exporter:v0.25.0 /workdir/blackbox-exporter-v0.25.0.tar
RUN crane pull grafana/grafana:12.2 /workdir/grafana-12.2.tar
# Virtual display settings for the computer-use harness.
ENV DISPLAY_NUM=1
ENV COMPUTER_HEIGHT_PX=768
ENV COMPUTER_WIDTH_PX=1024
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env python3 | |
| from __future__ import annotations | |
| import subprocess | |
| import json | |
| import time | |
| from apex_arena._types import GradingResult | |
| import re | |
| from datetime import timedelta | |
# Cached result of wait_for_grafana_api(): None = not probed yet,
# True/False = outcome of the first probe (reused by later checks).
_grafana_api_ready = None
# (name, namespace, local_port) keys of port-forwards already started,
# so repeated checks reuse the existing tunnel instead of respawning it.
_active_port_forwards: set = set()
def sh(cmd):
    """Run *cmd* through the shell and return (returncode, stdout, stderr).

    Both output streams are captured as text and stripped of surrounding
    whitespace before being returned.
    """
    proc = subprocess.run(cmd, shell=True, capture_output=True, text=True)
    stdout = proc.stdout.strip()
    stderr = proc.stderr.strip()
    return proc.returncode, stdout, stderr
def normalize_image(img):
    """Return *img* with any leading docker.io registry prefix removed.

    Longest prefix wins: "docker.io/library/" is tried before "docker.io/".
    """
    if img.startswith("docker.io/library/"):
        return img[len("docker.io/library/"):]
    if img.startswith("docker.io/"):
        return img[len("docker.io/"):]
    return img
def port_forward(kind, name, namespace, local_port, remote_port):
    """Start a background `kubectl port-forward` to kind/name in namespace.

    Idempotent per (name, namespace, local_port): if a forward with the same
    key was already started by this process, return immediately.
    NOTE(review): the key does not include `kind` or `remote_port`, so a
    second call with a different kind/remote port is silently skipped —
    confirm this is intended.
    """
    key = (name, namespace, local_port)
    if key in _active_port_forwards:
        return
    # Best-effort: kill any stale forward occupying the same local port.
    sh(f"pkill -f 'kubectl port-forward.*{local_port}' || true")
    # Detach into the background; output is discarded.
    sh(
        f"kubectl port-forward -n {namespace} {kind}/{name} "
        f"{local_port}:{remote_port} >/dev/null 2>&1 &"
    )
    # Give the tunnel time to establish before callers curl through it.
    time.sleep(5)
    _active_port_forwards.add(key)
def parse_duration(duration_str):
    """Convert a Prometheus-style duration ('30s', '5m', '1h') to a timedelta.

    Unrecognized input yields timedelta(0). Only the leading number+unit is
    considered; seconds, minutes and hours are supported.
    """
    parsed = re.match(r'(\d+)(s|m|h)', str(duration_str))
    if parsed is None:
        return timedelta(0)
    amount = int(parsed.group(1))
    seconds_per_unit = {'s': 1, 'm': 60, 'h': 3600}
    return timedelta(seconds=amount * seconds_per_unit[parsed.group(2)])
# Container "waiting" reasons a deployment cannot recover from on its own
# (broken image reference); consulted by check_deployment_health().
_TERMINAL_POD_STATES = {"ImagePullBackOff", "ErrImagePull", "InvalidImageName"}
def check_deployment_health(name, namespace):
    """Quick check for terminal pod errors. Returns (healthy, message).

    Deliberately lenient: any failure to read or parse cluster state is
    reported as healthy so transient kubectl hiccups do not fail grading.
    Only unambiguous terminal image-pull states count as broken.
    """
    code, out, _ = sh(
        f"kubectl get deployment {name} -n {namespace} -o json"
    )
    if code != 0:
        return False, f"Deployment '{name}' not found in {namespace}"
    try:
        data = json.loads(out)
        selector = data["spec"]["selector"]["matchLabels"]
        label_str = ",".join(f"{k}={v}" for k, v in selector.items())
    # TypeError/AttributeError added: a null or non-dict matchLabels would
    # otherwise crash the whole gate check instead of being treated leniently.
    except (json.JSONDecodeError, KeyError, TypeError, AttributeError):
        return True, "Could not parse selector, assuming healthy"
    code, out, _ = sh(
        f"kubectl get pods -n {namespace} -l {label_str} -o json"
    )
    if code != 0:
        return True, "Could not list pods, assuming healthy"
    try:
        pods = json.loads(out).get("items", [])
    except json.JSONDecodeError:
        return True, "Could not parse pods, assuming healthy"
    for pod in pods:
        statuses = pod.get("status", {}).get("containerStatuses", [])
        for cs in statuses:
            waiting = cs.get("state", {}).get("waiting", {})
            reason = waiting.get("reason", "")
            if reason in _TERMINAL_POD_STATES:
                pod_name = pod.get("metadata", {}).get("name", "unknown")
                # "—" fixes a mojibake character in the original message.
                return False, (
                    f"Pod '{pod_name}' in {namespace} is stuck in {reason} — "
                    f"deployment '{name}' will not recover"
                )
    return True, f"Deployment '{name}' pods are not in terminal error state"
| # ------------------------------------------------------------------- | |
| # RESOURCE CHECKS | |
| # ------------------------------------------------------------------- | |
def deployment_exists(name, namespace):
    """Return (ok, message) saying whether the deployment exists."""
    exit_code, _, _ = sh(f"kubectl get deployment {name} -n {namespace}")
    if exit_code == 0:
        return True, f"Deployment '{name}' exists in {namespace}"
    return False, f"Deployment '{name}' not found in {namespace}"
def configmap_contains(name, namespace, required_strings):
    """Return (ok, message): does the ConfigMap YAML contain every string?"""
    exit_code, yaml_text, _ = sh(
        f"kubectl get configmap {name} -n {namespace} -o yaml"
    )
    if exit_code != 0:
        return False, f"ConfigMap '{name}' not readable"
    missing = []
    for needle in required_strings:
        if needle not in yaml_text:
            missing.append(needle)
    if missing:
        return False, f"ConfigMap '{name}' missing: {missing}"
    return True, f"ConfigMap '{name}' contains required content"
def service_exists(name, namespace, port):
    """Check that Service *name* in *namespace* exposes *port*.

    Returns (ok, message). A malformed kubectl JSON payload is reported as
    a failure instead of raising (the original let json.loads propagate,
    which would abort the calling check loop's message collection).
    """
    code, out, _ = sh(
        f"kubectl get svc {name} -n {namespace} -o json"
    )
    if code != 0:
        return False, f"Service '{name}' not found in {namespace}"
    try:
        data = json.loads(out)
    except json.JSONDecodeError:
        return False, f"Service '{name}' output not parseable"
    ports = [p.get("port") for p in data.get("spec", {}).get("ports", [])]
    if port in ports:
        return True, f"Service '{name}' exposes port {port}"
    return False, f"Service '{name}' does not expose port {port}"
def wait_for_grafana_api():
    """Block until Grafana's /api/health answers inside the pod.

    Result is memoized in the module-level _grafana_api_ready flag so that
    multiple Grafana checks pay the wait cost only once per grading run.
    Returns True if the API responded, False after ~30s of retries.
    """
    global _grafana_api_ready
    if _grafana_api_ready is not None:
        return _grafana_api_ready
    # Best effort: wait for the pod itself before hammering the HTTP API.
    sh(
        "kubectl wait --for=condition=ready pod -l app=grafana "
        "-n observability --timeout=60s"
    )
    # Up to 6 attempts, 5s apart, via wget inside the Grafana container.
    for _ in range(6):
        code, _, _ = sh(
            "kubectl exec -n observability deploy/grafana -- "
            "wget -qO- http://localhost:3000/api/health"
        )
        if code == 0:
            _grafana_api_ready = True
            return True
        time.sleep(5)
    _grafana_api_ready = False
    return False
def configmap_exists(name, namespace):
    """Return (ok, message) saying whether the ConfigMap exists."""
    exit_code, _, _ = sh(f"kubectl get configmap {name} -n {namespace}")
    if exit_code == 0:
        return True, f"ConfigMap '{name}' exists in {namespace}"
    return False, f"ConfigMap '{name}' not found in {namespace}"
def pvc_exists(name, namespace):
    """Return (ok, message) saying whether the PVC exists."""
    exit_code, _, _ = sh(f"kubectl get pvc {name} -n {namespace}")
    if exit_code == 0:
        return True, f"PVC '{name}' exists in {namespace}"
    return False, f"PVC '{name}' not found in {namespace}"
| # ------------------------------------------------------------------- | |
| # BLACKBOX FUNCTIONAL CHECKS | |
| # ------------------------------------------------------------------- | |
def blackbox_metrics_exposed():
    """Curl the blackbox-exporter /metrics endpoint through a port-forward."""
    healthy, message = check_deployment_health("blackbox-exporter", "observability")
    if not healthy:
        return False, message
    port_forward("svc", "blackbox-exporter", "observability", 9115, 9115)
    exit_code, body, _ = sh("curl -s http://localhost:9115/metrics")
    reachable = (
        exit_code == 0
        and "blackbox_exporter_config_last_reload_successful" in body
    )
    if reachable:
        return True, "Blackbox exporter metrics endpoint is reachable"
    return False, "Blackbox exporter metrics endpoint not responding correctly"
def blackbox_config_has_required_modules():
    """Ensure blackbox-config defines the http_2xx and tcp_connect modules."""
    exit_code, data, _ = sh(
        "kubectl get configmap blackbox-config "
        "-n observability -o jsonpath='{.data}'"
    )
    if exit_code != 0:
        return False, "blackbox-config not readable"
    missing = [mod for mod in ("http_2xx", "tcp_connect") if mod not in data]
    if missing:
        return False, f"blackbox-config missing required modules: {missing}"
    return True, "blackbox-config contains all required probe modules"
def kubernetes_api_tcp_probe_configured():
    """Confirm the K8s API target is probed with the plain tcp_connect module."""
    exit_code, config, _ = sh(
        "kubectl get configmap prometheus-config "
        "-n observability -o jsonpath='{.data.prometheus\\.yml}'"
    )
    if exit_code != 0:
        return False, "Prometheus config not readable"
    if "kubernetes.default" not in config:
        return False, "Kubernetes API server not configured as probe target"
    # Accept bare, quoted, or single-element-list spellings of the module.
    uses_tcp_connect = re.search(
        r'module:\s*\[?\s*["\']?tcp_connect["\']?\s*\]?', config
    )
    if not uses_tcp_connect or "tcp_connect_tls" in config:
        return False, "Must use 'tcp_connect' module (not tcp_connect_tls) for Kubernetes API TCP probe"
    return True, "Kubernetes API TCP probe is configured with tcp_connect module"
def prometheus_has_probe_metrics():
    """Poll the Prometheus query API until probe_success data appears."""
    healthy, message = check_deployment_health("prometheus", "observability")
    if not healthy:
        return False, message
    port_forward("svc", "prometheus", "observability", 9090, 9090)
    attempts = 6
    while attempts > 0:
        exit_code, body, _ = sh(
            "curl -s "
            "'http://localhost:9090/api/v1/query?query=probe_success'"
        )
        if exit_code == 0 and '"result"' in body:
            return True, "Prometheus is collecting probe metrics"
        time.sleep(5)
        attempts -= 1
    return False, "Prometheus not returning probe metrics after retries"
def check_slo_burn_rate_alerts():
    """Verify alerts implement proper multi-window SLO burn rate logic.

    Heuristic, text-based check against the raw ConfigMap YAML:
    (1) at least two distinct avg_over_time windows must appear, and
    (2) at least two distinct non-zero 'for:' durations must appear.
    Returns (ok, message).
    """
    code, out, _ = sh(
        "kubectl get configmap prometheus-config "
        "-n observability -o yaml"
    )
    if code != 0:
        return False, "Prometheus config not readable"
    # Must have multiple time windows in recording rules or alert expressions
    windows = re.findall(
        r"avg_over_time\([^)]*\[(\d+[mh])\]\)",
        out
    )
    if len(set(windows)) < 2:
        return False, (
            "Burn rate alerts must use multiple time windows "
            "(e.g., 5m and 1h)"
        )
    # Must have at least 2 distinct 'for:' durations across alert rules
    # (evidence of fast-burn vs slow-burn detection windows)
    for_durations = re.findall(r"for:\s*(\d+[smh])", out)
    # parse_duration normalizes different spellings of the same duration
    # (e.g. '60s' and '1m') before the distinct-count test.
    unique_durations = {parse_duration(d) for d in for_durations}
    unique_durations.discard(timedelta(0))
    if len(unique_durations) < 2:
        return False, (
            "SLO burn rate alerting requires multiple detection windows "
            "(e.g., a fast-burn alert with 'for: 2m' and a slow-burn "
            "alert with 'for: 1h')"
        )
    return True, "Valid multi-window SLO burn rate alerts detected"
def check_alert_annotations():
    """Check the SyntheticProbeFailure alert carries operator annotations."""
    exit_code, config, _ = sh(
        "kubectl get configmap prometheus-config "
        "-n observability -o yaml"
    )
    if exit_code != 0:
        return False, "Prometheus config not readable"
    marker = config.find('SyntheticProbeFailure')
    if marker < 0:
        return False, "SyntheticProbeFailure alert not found"
    # Only inspect text from the alert name onward.
    tail = config[marker:]
    if 'annotations:' not in tail:
        return False, "Alert missing annotations section"
    if 'description' not in tail and 'summary' not in tail:
        return False, "Alert missing description/summary annotation"
    return True, "Alert has required annotations"
def check_scrape_interval():
    """Verify the global scrape interval supports <=2 min detection time.

    Reads prometheus.yml from the ConfigMap and requires the first
    'scrape_interval:' found to parse to at most 30 seconds.
    NOTE(review): the regex matches the first occurrence anywhere in the
    file, which is presumably the global section — confirm job-level
    overrides cannot appear first.
    """
    code, out, _ = sh(
        "kubectl get configmap prometheus-config "
        "-n observability -o jsonpath='{.data.prometheus\\.yml}'"
    )
    if code != 0:
        return False, "Prometheus config not readable"
    global_match = re.search(r'scrape_interval:\s*(\d+[smh])', out)
    if not global_match:
        return False, "Global scrape_interval not found"
    global_interval = global_match.group(1)
    duration = parse_duration(global_interval)
    # 30s leaves headroom for a 'for: 2m' alert to fire within ~2 minutes.
    if duration > timedelta(seconds=30):
        return False, f"Global scrape_interval {global_interval} too long for 2m detection"
    return True, f"Scrape interval {global_interval} supports timely detection"
def check_recording_rules():
    """Verify recording rules exist AND are used in alert expressions.

    Text-based heuristic on the ConfigMap YAML: requires at least two
    'record:' names, and at least two of those names to appear again in
    the text after the first 'alert:' key. Returns (ok, message).
    """
    code, out, _ = sh(
        "kubectl get configmap prometheus-config "
        "-n observability -o yaml"
    )
    if code != 0:
        return False, "Prometheus config not readable"
    # Must define recording rules
    if "record:" not in out:
        return False, "Prometheus should define recording rules"
    # Extract recording rule names (Prometheus metric-name charset).
    record_names = re.findall(
        r"record:\s*([a-zA-Z_:][a-zA-Z0-9_:]*)",
        out
    )
    if not record_names:
        return False, "No valid recording rule names found"
    # Multi-window availability requires at least 2 recording rules
    if len(record_names) < 2:
        return False, (
            "Multiple recording rules needed for multi-window "
            "availability signals (e.g., 5m and 1h windows)"
        )
    # At least 2 recording rules must be referenced in alert expressions.
    # Everything after the first 'alert:' is treated as the alert section;
    # if no alerts exist the whole document is searched instead.
    alert_section = out[out.find("alert:"):] if "alert:" in out else out
    used_count = sum(1 for name in record_names if name in alert_section)
    if used_count < 2:
        return False, (
            "At least 2 recording rules should be referenced in alert "
            "expressions for multi-window burn rate detection"
        )
    return True, "Recording rules exist and are used in alerts"
def check_blackbox_modules():
    """Verify correct Blackbox modules used for each protocol.

    Inspects a window of text around each known target (500 chars before,
    200 after) for the expected module name. Returns (ok, message).
    NOTE(review): the fixed window sizes assume each scrape job's module
    line is near its target list — confirm against the actual config layout.
    """
    code, out, _ = sh(
        "kubectl get configmap prometheus-config "
        "-n observability -o jsonpath='{.data.prometheus\\.yml}'"
    )
    if code != 0:
        return False, "Prometheus config not readable"
    out_lower = out.lower()
    # --- Kubernetes API must use tcp_connect ---
    if 'kubernetes.default' in out:
        kube_pos = out.find('kubernetes.default')
        kube_section = out[max(0, kube_pos - 500):kube_pos + 200]
        if 'tcp_connect' not in kube_section:
            return False, "Kubernetes API target should use tcp_connect module"
    # --- HTTP endpoints must use http_2xx ---
    if 'argocd' in out_lower:
        argocd_pos = out_lower.find('argocd')
        argocd_section = out[max(0, argocd_pos - 500):argocd_pos + 200]
        # Only flag when the window mentions http at all but not http_2xx.
        if 'http_2xx' not in argocd_section and 'http' in argocd_section:
            return False, "HTTP targets should use http_2xx module"
    return True, "Blackbox modules correctly matched to target protocols"
def check_alert_severity_labels():
    """Check that some alert rule defines a severity label."""
    exit_code, config, _ = sh(
        "kubectl get configmap prometheus-config "
        "-n observability -o yaml"
    )
    if exit_code != 0:
        return False, "Prometheus config not readable"
    if "severity:" in config:
        return True, "Alert severity labels present"
    return False, (
        "Alerts must define severity labels "
        "(critical or warning)"
    )
def check_dashboard_uses_recording_rules():
    """Dashboard should reference recording rules instead of raw PromQL.

    Two acceptance paths: (1) any recording-rule name extracted from
    prometheus-config appears in the dashboards ConfigMap, or (2) the
    dashboards text matches the probe:*:* naming convention directly.
    Returns (ok, message).
    """
    code, dash_out, _ = sh(
        "kubectl get configmap grafana-dashboards "
        "-n observability -o yaml"
    )
    if code != 0:
        return False, "grafana-dashboards ConfigMap not readable"
    # Extract actual recording rule names from prometheus-config
    code, prom_out, _ = sh(
        "kubectl get configmap prometheus-config "
        "-n observability -o yaml"
    )
    if code == 0:
        record_names = re.findall(
            r"record:\s*([a-zA-Z_:][a-zA-Z0-9_:]*)",
            prom_out
        )
        if record_names and any(name in dash_out for name in record_names):
            return True, "Dashboard references recording rules"
    # Fallback: accept any probe:*:* pattern (recording rule convention)
    if re.search(r"probe:[a-zA-Z_]+:[a-zA-Z0-9_]+", dash_out):
        return True, "Dashboard references recording rules"
    return False, (
        "Dashboard should reference pre-computed recording rules "
        "(e.g., probe:availability:5m) instead of raw PromQL"
    )
def check_prometheus_self_scrape():
    """Verify Prometheus scrapes its own metrics.

    Accepts unquoted, single-quoted, or double-quoted job names — all
    valid YAML spellings. The original substring test missed
    `job_name: "prometheus"`.
    """
    code, out, _ = sh(
        "kubectl get configmap prometheus-config "
        "-n observability -o jsonpath='{.data.prometheus\\.yml}'"
    )
    if code != 0:
        return False, "Prometheus config not readable"
    if not re.search(r'''job_name:\s*['"]?prometheus['"]?''', out):
        return False, "Prometheus should scrape its own metrics (job_name: prometheus)"
    return True, "Prometheus self-monitoring configured"
def argocd_probe_success():
    """Ask the blackbox exporter directly whether the Argo CD probe passes.

    Port-forwards the exporter service, then hits /probe for the Argo CD
    API endpoint with the http_2xx module and greps the returned metrics
    for a literal 'probe_success 1' line. Returns (ok, message).
    """
    port_forward(
        "svc",
        "blackbox-exporter",
        "observability",
        9115,
        9115,
    )
    # grep's exit status (via sh) is the success signal: 0 only when the
    # probe_success metric equals 1.
    cmd = (
        "curl -s "
        "'http://localhost:9115/probe?"
        "target=http://argocd.devops.local:80/api/version&module=http_2xx' | "
        "grep '^probe_success 1'"
    )
    code, out, _ = sh(cmd)
    if code == 0 and out:
        return True, "Synthetic probe reports Argo CD endpoint as available"
    return False, "Synthetic probe did not report Argo CD as available"
def deployment_uses_image(name, namespace, expected_image):
    """Check that the deployment's pod template uses *expected_image*.

    Images are compared after normalize_image() strips docker.io prefixes,
    so 'docker.io/prom/prometheus:x' matches 'prom/prometheus:x'.
    A malformed kubectl payload is reported as a failure instead of
    raising (the original let json.loads/KeyError propagate).
    """
    code, out, _ = sh(
        f"kubectl get deployment {name} -n {namespace} -o json"
    )
    if code != 0:
        return False, f"Deployment '{name}' not found"
    try:
        data = json.loads(out)
        containers = data["spec"]["template"]["spec"]["containers"]
    except (json.JSONDecodeError, KeyError, TypeError):
        return False, f"Deployment '{name}' spec not parseable"
    images = [c.get("image", "") for c in containers]
    actual = [normalize_image(i) for i in images]
    expected = normalize_image(expected_image)
    if expected in actual:
        return True, f"Deployment '{name}' uses image '{expected_image}'"
    return False, f"Expected {expected_image}, found {images}"
def prometheus_blackbox_relabeling_present():
    """Check the scrape config routes probe targets through the exporter."""
    exit_code, config, _ = sh(
        "kubectl get configmap prometheus-config "
        "-n observability -o jsonpath='{.data.prometheus\\.yml}'"
    )
    if exit_code != 0:
        return False, "Prometheus config not readable"
    missing = [
        snippet
        for snippet in ("metrics_path: /probe", "__param_target")
        if snippet not in config
    ]
    # Either in-namespace short name or fully qualified service DNS is fine.
    addr_present = (
        "blackbox-exporter:9115" in config
        or "blackbox-exporter.observability" in config
    )
    if not addr_present:
        missing.append("blackbox-exporter address")
    if missing:
        return False, f"Missing blackbox relabeling elements: {missing}"
    return True, "Prometheus blackbox relabeling is correctly configured"
def prometheus_alert_fires_for_failing_probe():
    """Wait for the SyntheticProbeFailure alert on the seeded dead endpoint.

    Polls the Prometheus alerts API (up to 18 tries, 8s apart) for an alert
    payload mentioning both the alert name and the deliberately failing
    does-not-exist.devops.local target. Returns (ok, message).
    """
    # Bail out early if either deployment is in a terminal image-pull state.
    for dep in ("prometheus", "blackbox-exporter"):
        healthy, msg = check_deployment_health(dep, "observability")
        if not healthy:
            return False, msg
    port_forward("svc", "prometheus", "observability", 9090, 9090)
    for _ in range(18):  # ~2.5 min with overhead
        code, out, _ = sh(
            "curl -s http://localhost:9090/api/v1/alerts"
        )
        if (
            code == 0
            and "SyntheticProbeFailure" in out
            and "does-not-exist.devops.local" in out
        ):
            return True, "SyntheticProbeFailure alert is firing"
        time.sleep(8)
    return False, "SyntheticProbeFailure alert did not fire"
def grafana_has_prometheus_datasource():
    """Query Grafana's datasource API for a Prometheus entry."""
    healthy, message = check_deployment_health("grafana", "observability")
    if not healthy:
        return False, message
    if not wait_for_grafana_api():
        return False, "Grafana API not reachable"
    for _attempt in range(6):
        exit_code, body, _ = sh(
            "kubectl exec -n observability deploy/grafana -- "
            "wget -qO- --header='Authorization: Basic YWRtaW46YWRtaW4=' "
            "http://localhost:3000/api/datasources"
        )
        if exit_code == 0 and "Prometheus" in body:
            return True, "Grafana Prometheus datasource configured"
        time.sleep(5)
    return False, "Grafana Prometheus datasource missing"
def grafana_has_blackbox_dashboard():
    """Search Grafana for a dashboard related to synthetic probing."""
    healthy, message = check_deployment_health("grafana", "observability")
    if not healthy:
        return False, message
    if not wait_for_grafana_api():
        return False, "Grafana API not reachable"
    keywords = ("Synthetic", "Blackbox", "Probe", "Endpoint")
    for _attempt in range(6):
        exit_code, body, _ = sh(
            "kubectl exec -n observability deploy/grafana -- "
            "wget -qO- --header='Authorization: Basic YWRtaW46YWRtaW4=' "
            "http://localhost:3000/api/search"
        )
        if exit_code == 0 and any(word in body for word in keywords):
            return True, "Grafana dashboard for synthetic probes exists"
        time.sleep(5)
    return False, "Grafana dashboard missing"
def grafana_dashboard_uses_probe_success():
    """Check the dashboards ConfigMap references the probe_success metric."""
    found, _ = configmap_contains(
        "grafana-dashboards", "observability", ["probe_success"]
    )
    if not found:
        return False, "Grafana dashboard does not reference probe_success"
    return True, "Grafana dashboard visualizes probe_success metric"
def prometheus_uses_pvc():
    """Verify the Prometheus deployment mounts a PVC at /prometheus.

    Improvements over the original: the mount check now inspects every
    container (not just containers[0], which missed sidecar layouts), and
    a malformed kubectl payload is reported as a failure instead of
    raising an unhandled JSONDecodeError/KeyError.
    """
    code, out, _ = sh(
        "kubectl get deployment prometheus "
        "-n observability -o json"
    )
    if code != 0:
        return False, "Prometheus deployment not found"
    try:
        pod_spec = json.loads(out)["spec"]["template"]["spec"]
        containers = pod_spec["containers"]
    except (json.JSONDecodeError, KeyError, TypeError):
        return False, "Prometheus deployment spec not parseable"
    volumes = pod_spec.get("volumes", [])
    pvc_used = any(v.get("persistentVolumeClaim") for v in volumes)
    mounted = any(
        m.get("mountPath") == "/prometheus"
        for c in containers
        for m in c.get("volumeMounts", [])
    )
    if pvc_used and mounted:
        return True, "Prometheus is using persistent storage"
    return False, "Prometheus PVC is not mounted at /prometheus"
def alert_rule_identifies_endpoint():
    """Verify alert annotations reference the failing endpoint."""
    exit_code, config, _ = sh(
        "kubectl get configmap prometheus-config "
        "-n observability -o yaml"
    )
    if exit_code != 0:
        return False, "Prometheus config not readable"
    # Operators must see WHICH endpoint failed, so annotations have to
    # template the instance/target label rather than just grouping in expr.
    templated = re.search(
        r'\{\{\s*\$labels\.(instance|target)\s*\}\}', config
    )
    if templated:
        return True, "Alert annotations identify the failing endpoint"
    return False, (
        "Alert annotations must reference the failing endpoint "
        "(e.g., {{ $labels.instance }}) for operational use"
    )
def alert_has_minimum_duration():
    """Verify some alert rule has a 'for:' duration of 2 minutes or more.

    The original regex only recognized durations written in minutes, so
    valid configs using 'for: 1h' or 'for: 120s' were wrongly rejected.
    Durations are now parsed via parse_duration() and compared as
    timedeltas. (Also fixes a mojibake '>=' in the success message.)
    """
    code, out, _ = sh(
        "kubectl get configmap prometheus-config "
        "-n observability -o yaml"
    )
    if code != 0:
        return False, "Prometheus config not readable"
    durations = re.findall(r"for:\s*(\d+[smh])", out)
    if any(parse_duration(d) >= timedelta(minutes=2) for d in durations):
        return True, "Alert rule has correct minimum duration (>=2m)"
    return False, "Alert rule must fire 'for: 2m' or longer (not immediate)"
def prometheus_alert_is_per_endpoint():
    """Verify SyntheticProbeFailure alerts are scoped per endpoint.

    Polls the alerts API via kubectl exec (no port-forward needed) and
    requires: every SyntheticProbeFailure alert carries a non-empty
    instance label, and at least one firing instance contains
    'does-not-exist' (the seeded failing target). Returns (ok, message).
    """
    for dep in ("prometheus", "blackbox-exporter"):
        healthy, msg = check_deployment_health(dep, "observability")
        if not healthy:
            return False, msg
    for _ in range(18):  # ~2.5 min with overhead
        code, out, _ = sh(
            "kubectl exec -n observability deploy/prometheus -- "
            "wget -qO- http://localhost:9090/api/v1/alerts"
        )
        if code != 0 or "SyntheticProbeFailure" not in out:
            time.sleep(8)
            continue
        try:
            data = json.loads(out)
            alerts = data.get("data", {}).get("alerts", [])
            synthetic = [
                a for a in alerts
                if a.get("labels", {}).get("alertname") == "SyntheticProbeFailure"
            ]
            if len(synthetic) < 1:
                time.sleep(8)
                continue
            instances = {
                a.get("labels", {}).get("instance", "") for a in synthetic
            }
            # An empty-string instance means the alert is not per-endpoint:
            # fail immediately, this cannot fix itself by waiting.
            if not all(instances):
                return False, (
                    "SyntheticProbeFailure alerts lack instance labels โ "
                    "alerting is not scoped per endpoint"
                )
            firing = {
                a.get("labels", {}).get("instance", "")
                for a in synthetic if a.get("state") == "firing"
            }
            has_failing = any("does-not-exist" in i for i in firing)
            if has_failing:
                return True, (
                    "Alerts fire per endpoint (failing endpoint alerts "
                    "independently with instance labels)"
                )
            # Alert exists but the failing endpoint is not firing yet; retry.
            time.sleep(8)
        except (json.JSONDecodeError, KeyError):
            time.sleep(10)
            continue
    return False, "Could not verify per-endpoint alert scoping"
def prometheus_does_not_use_up_metric():
    """Ensure alerting keys off probe_success, not the exporter 'up' metric."""
    exit_code, config, _ = sh(
        "kubectl get configmap prometheus-config "
        "-n observability -o yaml"
    )
    if exit_code != 0:
        return False, "Prometheus config not readable"
    suspicious_forms = (" up ", "up==", "up ==")
    if any(form in config for form in suspicious_forms):
        return False, "Alerting incorrectly uses exporter 'up' metric"
    return True, "Alerting correctly avoids exporter 'up' metric"
def check_endpoint_count():
    """Verify at least 3 probe targets are configured."""
    exit_code, config, _ = sh(
        "kubectl get configmap prometheus-config "
        "-n observability -o jsonpath='{.data.prometheus\\.yml}'"
    )
    if exit_code != 0:
        return False, "Prometheus config not readable"
    haystack = config.lower()
    expectations = (
        ('argocd', 'argocd'),
        ('kubernetes_api', 'kubernetes.default'),
        ('test_endpoint', 'does-not-exist'),
    )
    missing = [label for label, needle in expectations if needle not in haystack]
    if missing:
        return False, f"Missing probe targets: {missing}"
    return True, "All required endpoints configured"
def get_probe_targets():
    """Extract probe target addresses from the Prometheus scrape config.

    Returns a list of URL or host:port strings; empty when the config
    cannot be read.
    """
    exit_code, config, _ = sh(
        "kubectl get configmap prometheus-config "
        "-n observability -o jsonpath='{.data.prometheus\\.yml}'"
    )
    if exit_code != 0:
        return []
    # YAML list items: either full URLs or bare host:port entries.
    pattern = r'-\s*(https?://[^\s]+|[\w.-]+:\d+)'
    return list(re.findall(pattern, config))
def check_grafana_dashboard_semantics():
    """Verify Grafana dashboard uses correct semantic patterns for synthetic monitoring.

    Four text-based checks against the dashboards ConfigMap YAML; every
    failed check appends to *issues*, and all issues are reported together
    in one '; '-joined message. Returns (ok, message).
    """
    code, out, _ = sh(
        "kubectl get configmap grafana-dashboards "
        "-n observability -o yaml"
    )
    if code != 0:
        return False, "grafana-dashboards ConfigMap not readable"
    issues = []
    # ------------------------------------------------------------------
    # Check 1: dashboard must show availability data (not raw binary)
    # ------------------------------------------------------------------
    has_availability_metric = (
        "probe_success" in out
        or re.search(r"probe:[a-zA-Z_]*availab", out)
        or re.search(r"probe:[a-zA-Z_]*success", out)
    )
    if has_availability_metric:
        # Raw probe_success is a 0/1 signal; require a *_over_time wrapper
        # somewhere in the document if it is used directly.
        if "probe_success" in out:
            has_time_agg = any(fn in out for fn in [
                "avg_over_time",
                "min_over_time",
                "max_over_time",
                "sum_over_time",
            ])
            if not has_time_agg:
                issues.append(
                    "Dashboard uses raw probe_success without time "
                    "aggregation (expected avg_over_time or similar)"
                )
    else:
        issues.append(
            "Dashboard does not reference probe availability metrics "
            "(probe_success or a recording rule like probe:availability)"
        )
    # ------------------------------------------------------------------
    # Check 2: per-endpoint breakdown (instance / target)
    # ------------------------------------------------------------------
    # Accepts either PromQL grouping or legendFormat label templates.
    has_grouping = any(x in out for x in [
        "by (instance)",
        "by (target)",
        "$labels.instance",
        "$labels.target",
        "{{ instance }}",
        "{{instance}}",
        "{{ target }}",
        "{{target}}",
    ])
    if not has_grouping:
        issues.append(
            "Dashboard does not show per-endpoint breakdown "
            "(missing by(instance), legendFormat with {{ instance }}, "
            "or target label usage)"
        )
    # ------------------------------------------------------------------
    # Check 3: availability not shown as raw binary signal
    # ------------------------------------------------------------------
    has_normalized = (
        # Percentage form (e.g., * 100)
        any(x in out for x in ["* 100", "*100", "100 *"])
        # Or [0,1] normalized via time aggregation (avg_over_time already
        # produces a continuous availability ratio, not binary)
        or any(fn in out for fn in [
            "avg_over_time", "min_over_time", "max_over_time",
        ])
        # Or uses a recording rule that pre-computes availability
        or re.search(r"probe:[a-zA-Z_]+:", out)
    )
    if not has_normalized:
        issues.append(
            "Dashboard should represent availability as a normalized "
            "measure (e.g., avg_over_time for ratio or * 100 for percentage)"
        )
    # ------------------------------------------------------------------
    # Check 4: response-time / latency metrics present
    # ------------------------------------------------------------------
    latency_metrics = [
        "probe_duration_seconds",
        "probe_http_duration_seconds",
        "probe_tcp_connection_duration_seconds",
        "probe_dns_lookup_time_seconds",
    ]
    has_latency = any(metric in out for metric in latency_metrics)
    if not has_latency:
        issues.append(
            "Dashboard should include response-time metrics "
            "(e.g. probe_duration_seconds or protocol-specific durations)"
        )
    # ------------------------------------------------------------------
    # Final result
    # ------------------------------------------------------------------
    if issues:
        return False, "; ".join(issues)
    return True, (
        "Dashboard uses time-aggregated probe metrics, per-endpoint breakdown, "
        "availability percentage, and latency visualization"
    )
def grade(transcript: str) -> GradingResult:
    """Run all checks and return a GradingResult.

    Two tiers of checks:
    - gate checks contribute feedback lines only — their pass/fail does
      NOT enter the numeric score (score is computed purely from
      subscores below);
    - scored checks each contribute an equal-weight 0/1 subscore.
    Every check is wrapped so one raising check cannot abort grading;
    its exception text becomes the feedback line instead.
    """
    feedback = []
    # ------------------------
    # Gate checks (non-scored)
    # ------------------------
    gate_checks = [
        # Resource existence
        lambda: deployment_exists("blackbox-exporter", "observability"),
        lambda: service_exists("blackbox-exporter", "observability", 9115),
        lambda: configmap_exists("blackbox-config", "observability"),
        lambda: deployment_exists("prometheus", "observability"),
        lambda: configmap_exists("prometheus-config", "observability"),
        lambda: pvc_exists("prometheus-data", "observability"),
        blackbox_config_has_required_modules,
        # Image correctness
        lambda: deployment_uses_image(
            "blackbox-exporter", "observability",
            "prom/blackbox-exporter:v0.25.0",
        ),
        lambda: deployment_uses_image(
            "prometheus", "observability",
            "prom/prometheus:v3.8.1",
        ),
        lambda: deployment_uses_image(
            "grafana", "observability",
            "grafana/grafana:12.2",
        ),
        # Core operational
        blackbox_metrics_exposed,
        prometheus_has_probe_metrics,
        prometheus_blackbox_relabeling_present,
        argocd_probe_success,
        kubernetes_api_tcp_probe_configured,
        prometheus_uses_pvc,
        # Basic config quality
        grafana_has_prometheus_datasource,
        check_alert_severity_labels,
        check_scrape_interval,
        check_alert_annotations,
        alert_has_minimum_duration,
        prometheus_does_not_use_up_metric,
        check_slo_burn_rate_alerts,
        check_blackbox_modules,
    ]
    for fn in gate_checks:
        try:
            ok, msg = fn()
        except Exception as e:
            ok = False
            msg = str(e)
        feedback.append(("โ " if ok else "โ ") + msg)
    # ------------------------
    # Scored checks (partial)
    # ------------------------
    scored_checks = {
        "grafana_dashboard_present": grafana_has_blackbox_dashboard,
        "endpoint_count": check_endpoint_count,
        "grafana_dashboard_semantics": check_grafana_dashboard_semantics,
        "failing_in_alert": prometheus_alert_fires_for_failing_probe,
        "alert_identifies_endpoint": alert_rule_identifies_endpoint,
        "per_endpoint": prometheus_alert_is_per_endpoint,
        "records": check_recording_rules,
        "self_scrape": check_prometheus_self_scrape,
        "recording_rules": check_dashboard_uses_recording_rules,
    }
    subscores = {}
    for key, fn in scored_checks.items():
        try:
            ok, msg = fn()
        except Exception as e:
            ok = False
            msg = str(e)
        subscores[key] = 1.0 if ok else 0.0
        feedback.append(("โ " if ok else "โ ") + msg)
    #
    # Equal weighting: each scored check is worth 1/N of the total.
    total_checks = len(scored_checks)
    weight = 1.0 / total_checks
    weights = {k: weight for k in scored_checks}
    score = sum(subscores[k] * weights[k] for k in subscores)
    return GradingResult(
        score=round(score, 4),
        subscores=subscores,
        weights=weights,
        feedback=" | ".join(feedback),
    )
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
set -e
# Environment bootstrap for the synthetic-endpoint-monitoring task.
# The section between the two markers below is shared sandbox boilerplate
# (supervisord brings up k3s/dockerd/dnsmasq, then we poll until the k3s
# API answers) and must be left exactly as-is.
# ---------------------- [DONOT CHANGE ANYTHING BELOW] ---------------------------------- #
# Start supervisord if not already running (manages k3s, dockerd, dnsmasq)
if ! supervisorctl status &>/dev/null; then
    echo "Starting supervisord..."
    /usr/bin/supervisord -c /etc/supervisor/supervisord.conf
    sleep 5
fi
# Set kubeconfig for k3s
export KUBECONFIG=/etc/rancher/k3s/k3s.yaml
# Wait for k3s to be ready (k3s can take 30-60 seconds to start)
echo "Waiting for k3s to be ready..."
MAX_WAIT=180
ELAPSED=0
until kubectl get nodes &>/dev/null; do
    if [ $ELAPSED -ge $MAX_WAIT ]; then
        echo "Error: k3s is not ready after ${MAX_WAIT} seconds"
        exit 1
    fi
    echo "Waiting for k3s... (${ELAPSED}s elapsed)"
    sleep 2
    ELAPSED=$((ELAPSED + 2))
done
echo "k3s is ready!"
# ---------------------- [DONOT CHANGE ANYTHING ABOVE] ---------------------------------- #
| echo "Granting Minimal Required Permission" | |
| kubectl apply -f - <<EOF | |
| apiVersion: rbac.authorization.k8s.io/v1 | |
| kind: Role | |
| metadata: | |
| name: ubuntu-user-configmap-editor | |
| namespace: observability | |
| rules: | |
| - apiGroups: [""] | |
| resources: ["configmaps"] | |
| verbs: ["get", "list", "watch", "create", "update", "patch"] | |
| --- | |
| apiVersion: rbac.authorization.k8s.io/v1 | |
| kind: RoleBinding | |
| metadata: | |
| name: ubuntu-user-configmap-editor-binding | |
| namespace: observability | |
| subjects: | |
| - kind: ServiceAccount | |
| name: ubuntu-user | |
| namespace: observability | |
| roleRef: | |
| apiGroup: rbac.authorization.k8s.io | |
| kind: Role | |
| name: ubuntu-user-configmap-editor | |
| EOF | |
| # ------------------------------------------------------------------------------ | |
| # Create Gitea repo (unchanged) | |
| # ------------------------------------------------------------------------------ | |
| kubectl exec -n gitea deploy/gitea -- sh -c ' | |
| curl -s -X POST \ | |
| -u "root:Admin@123456" \ | |
| -H "Content-Type: application/json" \ | |
| -d "{ | |
| \"name\": \"platform-docs\", | |
| \"private\": false, | |
| \"auto_init\": true | |
| }" \ | |
| http://localhost:3000/api/v1/user/repos | |
| ' | |
# ------------------------------------------------------------------------------
# Wiki creation -- commit directly to the bare git repo (bypasses API timing
# issues). Everything below runs inside the Gitea pod.
# ------------------------------------------------------------------------------
kubectl exec -n gitea deploy/gitea -- sh -c '
set -e
WIKI_REPO=/data/git/repositories/root/platform-docs.wiki.git
# Ensure bare wiki repo exists (Gitea does NOT auto-create it)
if [ ! -d "$WIKI_REPO" ]; then
  git init --bare "$WIKI_REPO" >/dev/null 2>&1
fi
# Commit wiki content directly into the bare repo via a temp clone
TMPDIR=$(mktemp -d)
cd "$TMPDIR"
git init >/dev/null 2>&1
git config user.email "root@gitea.local"
git config user.name "root"
cat > "Platform-Services---Monitoring-Requirements.md" <<WIKIEOF
# Platform Services - Monitoring Requirements
## Critical Platform Services
### ArgoCD GitOps Controller
| Property | Value |
|----------|-------|
| Ingress | \`argocd.devops.local\` |
| Protocol | HTTP |
| Criticality | High |
### Kubernetes API Server
| Property | Value |
|----------|-------|
| Access | \`kubernetes.default.svc:443\` |
| Protocol | TCP |
| Criticality | Critical |
## Alerting Validation
Configure a probe against a known-unreachable endpoint to verify alerting:
http://does-not-exist.devops.local
Alerts should identify the specific failing endpoint (not aggregate into one global alert).
## Preloaded Container Images
This cluster operates in air-gapped mode. The following images are available:
| Component | Image |
|-----------|-------|
| Prometheus | \`prom/prometheus:v3.8.1\` |
| Blackbox Exporter | \`prom/blackbox-exporter:v0.25.0\` |
| Grafana | \`grafana/grafana:12.2\` |
Use these exact image references in your deployments. No other versions are available.
WIKIEOF
git add . >/dev/null 2>&1
git commit -m "Initial wiki content" >/dev/null 2>&1
git remote add origin "$WIKI_REPO"
git push -f origin master >/dev/null 2>&1
cd /
rm -rf "$TMPDIR"
chown -R git:git "$WIKI_REPO"
' && echo "[SETUP] Platform documentation created" \
  || echo "[SETUP] WARN: Wiki creation failed"
# Poke the Gitea wiki API to force indexing of the git-committed content.
# Best-effort: up to three attempts, then give up silently ("|| true").
kubectl exec -n gitea deploy/gitea -- sh -c '
for i in 1 2 3; do
  HTTP_CODE=$(curl -s -o /dev/null -w "%{http_code}" \
    -u "root:Admin@123456" \
    "http://localhost:3000/api/v1/repos/root/platform-docs/wiki/pages")
  if [ "$HTTP_CODE" = "200" ]; then
    break
  fi
  sleep 2
done
' 2>/dev/null || true
| echo "[SETUP] Importing pre-cached MinIO and MariaDB images..." | |
| PROMETHEUS_TAR="/workdir/prometheus-v3.8.1.tar" | |
| BLACKBOX_TAR="/workdir/blackbox-exporter-v0.25.0.tar" | |
| GRAFANA_TAR="/workdir/grafana-12.2.tar" | |
| if [ ! -f "$GRAFANA_TAR" ]; then | |
| echo "Error: Pre-cached Grafana image not found at $GRAFANA_TAR" | |
| exit 1 | |
| fi | |
| ctr --address /run/k3s/containerd/containerd.sock \ | |
| --namespace k8s.io \ | |
| images import --no-unpack "$GRAFANA_TAR" | |
| ctr --address /run/k3s/containerd/containerd.sock \ | |
| --namespace k8s.io \ | |
| images list | grep -q "grafana/grafana:12.2" || { | |
| echo "Error: Grafana image was not imported correctly" | |
| exit 1 | |
| } | |
| if [ ! -f "$PROMETHEUS_TAR" ]; then | |
| echo "Error: Pre-cached Prometheus image not found at $PROMETHEUS_TAR" | |
| exit 1 | |
| fi | |
| ctr --address /run/k3s/containerd/containerd.sock \ | |
| --namespace k8s.io \ | |
| images import "$PROMETHEUS_TAR" | |
| if [ ! -f "$BLACKBOX_TAR" ]; then | |
| echo "Error: Pre-cached Blackbox image not found at $BLACKBOX_TAR" | |
| exit 1 | |
| fi | |
| ctr --address /run/k3s/containerd/containerd.sock \ | |
| --namespace k8s.io \ | |
| images import --no-unpack "$BLACKBOX_TAR" | |
| ctr --address /run/k3s/containerd/containerd.sock \ | |
| --namespace k8s.io \ | |
| images list | grep -q "prom/prometheus:v3.8.1" || { | |
| echo "Error: Prometheus image was not imported correctly" | |
| exit 1 | |
| } | |
| ctr --address /run/k3s/containerd/containerd.sock \ | |
| --namespace k8s.io \ | |
| images list | grep -q "prom/blackbox-exporter:v0.25.0" || { | |
| echo "Error: Blackbox image was not imported correctly" | |
| exit 1 | |
| } | |
| CTR="ctr --address /run/k3s/containerd/containerd.sock --namespace k8s.io" | |
| $CTR images tag docker.io/prom/prometheus:v3.8.1 prom/prometheus:v3.8.1 | |
| $CTR images tag docker.io/prom/blackbox-exporter:v0.25.0 prom/blackbox-exporter:v0.25.0 | |
| $CTR images tag docker.io/grafana/grafana:12.2 grafana/grafana:12.2 | |
| ALLOWED_IMAGES=( | |
| "prom/prometheus:v3.8.1" | |
| "docker.io/prom/prometheus:v3.8.1" | |
| "prom/blackbox-exporter:v0.25.0" | |
| "docker.io/prom/blackbox-exporter:v0.25.0" | |
| "grafana/grafana:12.2" | |
| "docker.io/grafana/grafana:12.2" | |
| ) | |
| is_allowed() { | |
| for allowed in "${ALLOWED_IMAGES[@]}"; do | |
| [[ "$1" == "$allowed" ]] && return 0 | |
| done | |
| return 1 | |
| } | |
| $CTR images list -q | while read -r image; do | |
| case "$image" in | |
| *prometheus*|*blackbox-exporter*|*grafana*) | |
| is_allowed "$image" || $CTR images remove "$image" 2>/dev/null || true | |
| ;; | |
| esac | |
| done | |
| ctr --address /run/k3s/containerd/containerd.sock \ | |
| --namespace k8s.io \ | |
| images remove docker.io/prom/prometheus:v2.54.1 2>/dev/null || true | |
| ctr --address /run/k3s/containerd/containerd.sock \ | |
| --namespace k8s.io \ | |
| images remove docker.io/grafana/grafana:11.3.0 2>/dev/null || true | |
| rm -f "$PROMETHEUS_TAR" "$BLACKBOX_TAR" | |
| rm -rf /workdir/*.tar 2>/dev/null || true | |
| echo "[SETUP] Prometheus and Blackbox images imported successfully" |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
# Reference solution: deploys the full synthetic monitoring stack
# (blackbox-exporter, Prometheus, Grafana) into the observability namespace.
set -euo pipefail
echo ""
echo ">>> Deploying Blackbox Exporter and Prometheus (Observability Stack)"
echo ""
# All kubectl calls below talk to the local k3s cluster.
export KUBECONFIG=/etc/rancher/k3s/k3s.yaml
NAMESPACE="observability"
# Ensure namespace exists
kubectl get namespace ${NAMESPACE} >/dev/null 2>&1 || kubectl create namespace ${NAMESPACE}
echo ""
echo ">>> Applying Blackbox Exporter configuration"
echo ""
# Probe module definitions: http_2xx validates application-layer correctness
# (default TLS verification preserved); tcp_connect checks transport
# reachability only — used for the Kubernetes API server probe.
cat <<EOF | kubectl apply -f -
apiVersion: v1
kind: ConfigMap
metadata:
  name: blackbox-config
  namespace: ${NAMESPACE}
data:
  blackbox.yml: |
    modules:
      http_2xx:
        prober: http
        timeout: 5s
        http:
          valid_http_versions: ["HTTP/1.1", "HTTP/2"]
          valid_status_codes: []
          method: GET
      tcp_connect:
        prober: tcp
        timeout: 5s
EOF
# Exporter deployment: single replica, non-root, config mounted from the
# blackbox-config ConfigMap above.
cat <<EOF | kubectl apply -f -
apiVersion: apps/v1
kind: Deployment
metadata:
  name: blackbox-exporter
  namespace: ${NAMESPACE}
  labels:
    app: blackbox-exporter
spec:
  replicas: 1
  selector:
    matchLabels:
      app: blackbox-exporter
  template:
    metadata:
      labels:
        app: blackbox-exporter
    spec:
      containers:
        - name: blackbox-exporter
          image: prom/blackbox-exporter:v0.25.0
          ports:
            - containerPort: 9115
          args:
            - "--config.file=/etc/blackbox/blackbox.yml"
          securityContext:
            runAsNonRoot: true
            runAsUser: 1000
          volumeMounts:
            - name: config-volume
              mountPath: /etc/blackbox
      volumes:
        - name: config-volume
          configMap:
            name: blackbox-config
            items:
              - key: blackbox.yml
                path: blackbox.yml
EOF
# In-cluster service used by Prometheus as the relabeled __address__ target.
cat <<EOF | kubectl apply -f -
apiVersion: v1
kind: Service
metadata:
  name: blackbox-exporter
  namespace: ${NAMESPACE}
spec:
  selector:
    app: blackbox-exporter
  ports:
    - name: http
      port: 9115
      targetPort: 9115
EOF
echo ""
echo ">>> Applying Prometheus storage"
echo ""
# Persistent storage so probe history survives pod restarts.
cat <<EOF | kubectl apply -f -
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: prometheus-data
  namespace: ${NAMESPACE}
spec:
  accessModes:
    - ReadWriteOnce
  resources:
    requests:
      storage: 2Gi
EOF
echo ""
echo ">>> Applying Prometheus configuration"
echo ""
echo "Applying Prometheus config..."
# Quoted heredoc ('EOF') — no shell expansion; content is taken literally.
# Carries both the scrape config (blackbox multi-target relabeling pattern)
# and the rule file (recording rules + SyntheticProbeFailure + burn-rate
# SLO alerts).
cat <<'EOF' | kubectl apply -f -
apiVersion: v1
kind: ConfigMap
metadata:
  name: prometheus-config
  namespace: observability
data:
  prometheus.yml: |
    global:
      scrape_interval: 15s
    rule_files:
      - /etc/prometheus/rules/*.yml
    scrape_configs:
      - job_name: blackbox
        metrics_path: /probe
        params:
          module: [http_2xx]
        static_configs:
          - targets:
              - http://argocd.devops.local
              - http://does-not-exist.devops.local
        relabel_configs:
          - source_labels: [__address__]
            target_label: __param_target
          - source_labels: [__param_target]
            target_label: instance
          - target_label: __address__
            replacement: blackbox-exporter:9115
      - job_name: blackbox-kubernetes-api
        metrics_path: /probe
        params:
          module: [tcp_connect]
        static_configs:
          - targets:
              - kubernetes.default.svc:443
        relabel_configs:
          - source_labels: [__address__]
            target_label: __param_target
          - source_labels: [__param_target]
            target_label: instance
          - target_label: __address__
            replacement: blackbox-exporter:9115
      - job_name: blackbox-exporter
        static_configs:
          - targets:
              - blackbox-exporter:9115
      - job_name: prometheus
        static_configs:
          - targets:
              - localhost:9090
  rules.yml: |
    groups:
      # --------------------------------------------------
      # Recording rules
      # --------------------------------------------------
      - name: synthetic-recording
        rules:
          - record: probe:availability:5m
            expr: avg_over_time(probe_success[5m])
          - record: probe:availability:1h
            expr: avg_over_time(probe_success[1h])
          - record: probe:latency_p99:5m
            expr: |
              histogram_quantile(
                0.99,
                sum(rate(probe_duration_seconds_bucket[5m]))
                by (le, instance)
              )
      # --------------------------------------------------
      # Compatibility alert (legacy graders depend on this)
      # --------------------------------------------------
      - name: synthetic-compat
        rules:
          - alert: SyntheticProbeFailure
            expr: probe_success == 0
            for: 1m
            labels:
              severity: critical
            annotations:
              summary: "Synthetic probe failed"
              description: "Endpoint {{ $labels.instance }} is unreachable"
      # --------------------------------------------------
      # SLO burn-rate alerts
      # --------------------------------------------------
      - name: synthetic-slo
        rules:
          - alert: SyntheticProbeHighBurnRate
            expr: |
              (1 - probe:availability:5m{job="blackbox"}) / (1 - 0.99) > 14.4
            for: 2m
            labels:
              severity: critical
            annotations:
              summary: "High synthetic availability burn rate"
              description: "High error budget burn rate for {{ $labels.instance }}"
          - alert: SyntheticProbeLowBurnRate
            expr: |
              (1 - probe:availability:1h{job="blackbox"}) / (1 - 0.99) > 1
            for: 1h
            labels:
              severity: warning
            annotations:
              summary: "Sustained synthetic availability degradation"
              description: "Sustained error budget burn rate for {{ $labels.instance }}"
EOF
echo ""
echo ">>> Deploying Prometheus"
echo ""
# Prometheus server: prometheus.yml mounted via subPath, rules.yml projected
# from the SAME ConfigMap into /etc/prometheus/rules (matching rule_files),
# TSDB on the prometheus-data PVC.
cat <<EOF | kubectl apply -f -
apiVersion: apps/v1
kind: Deployment
metadata:
  name: prometheus
  namespace: ${NAMESPACE}
spec:
  replicas: 1
  selector:
    matchLabels:
      app: prometheus
  template:
    metadata:
      labels:
        app: prometheus
    spec:
      securityContext:
        fsGroup: 65534
        runAsNonRoot: true
        runAsUser: 1000
      containers:
        - name: prometheus
          image: prom/prometheus:v3.8.1
          args:
            - "--config.file=/etc/prometheus/prometheus.yml"
          ports:
            - containerPort: 9090
          volumeMounts:
            - name: config-volume
              mountPath: /etc/prometheus/prometheus.yml
              subPath: prometheus.yml
            - name: data-volume
              mountPath: /prometheus
            - name: rules-volume
              mountPath: /etc/prometheus/rules
      volumes:
        - name: config-volume
          configMap:
            name: prometheus-config
        - name: data-volume
          persistentVolumeClaim:
            claimName: prometheus-data
        - name: rules-volume
          configMap:
            name: prometheus-config
            items:
              - key: rules.yml
                path: rules.yml
EOF
# Service exposing the Prometheus web/API port required by the task spec.
cat <<EOF | kubectl apply -f -
apiVersion: v1
kind: Service
metadata:
  name: prometheus
  namespace: ${NAMESPACE}
spec:
  selector:
    app: prometheus
  ports:
    - name: web
      port: 9090
      targetPort: 9090
EOF
# Grafana datasource provisioning: declaratively points Grafana at the
# in-cluster Prometheus service so no manual setup is required.
cat <<EOF | kubectl apply -f -
apiVersion: v1
kind: ConfigMap
metadata:
  name: grafana-datasources
  namespace: ${NAMESPACE}
data:
  datasources.yml: |
    apiVersion: 1
    datasources:
      - name: Prometheus
        type: prometheus
        access: proxy
        url: http://prometheus:9090
        isDefault: true
EOF
# Dashboard provider: instructs Grafana to load any JSON dashboards found
# under /var/lib/grafana/dashboards (populated from grafana-dashboards).
cat <<EOF | kubectl apply -f -
apiVersion: v1
kind: ConfigMap
metadata:
  name: grafana-dashboard-provider
  namespace: ${NAMESPACE}
data:
  dashboards.yml: |
    apiVersion: 1
    providers:
      - name: default
        folder: ''
        type: file
        options:
          path: /var/lib/grafana/dashboards
EOF
# Dashboard definitions provisioned as a ConfigMap (mounted at
# /var/lib/grafana/dashboards by the Grafana deployment).
# FIX: the latency panel used `avg_over_time(...) by (instance)`, which is
# not valid PromQL — grouping modifiers attach only to aggregation operators,
# not to range-vector functions — so the panel would fail to render. The
# average is now wrapped in `avg by (instance) (...)`.
cat <<EOF | kubectl apply -f -
apiVersion: v1
kind: ConfigMap
metadata:
  name: grafana-dashboards
  namespace: observability
data:
  blackbox-dashboard.json: |
    {
      "title": "Synthetic Endpoint Availability",
      "schemaVersion": 38,
      "panels": [
        {
          "type": "timeseries",
          "title": "Probe Availability (%)",
          "targets": [
            {
              "expr": "probe:availability:5m * 100",
              "legendFormat": "{{ instance }}",
              "refId": "A"
            }
          ]
        },
        {
          "type": "timeseries",
          "title": "Probe Latency (seconds)",
          "targets": [
            {
              "expr": "avg by (instance) (avg_over_time(probe_duration_seconds[5m]))",
              "legendFormat": "{{ instance }}",
              "refId": "B"
            }
          ]
        }
      ]
    }
EOF
# Grafana deployment: three mounts wire together the provisioning chain —
# datasource config, dashboard provider config, and the dashboard JSON.
# (The two inline YAML notes below had mojibake emoji; repaired.)
cat <<EOF | kubectl apply -f -
apiVersion: apps/v1
kind: Deployment
metadata:
  name: grafana
  namespace: ${NAMESPACE}
spec:
  replicas: 1
  selector:
    matchLabels:
      app: grafana
  template:
    metadata:
      labels:
        app: grafana
    spec:
      containers:
        - name: grafana
          image: grafana/grafana:12.2
          ports:
            - containerPort: 3000
          volumeMounts:
            - name: datasources
              mountPath: /etc/grafana/provisioning/datasources
            # Provider config goes here
            - name: dashboard-provider
              mountPath: /etc/grafana/provisioning/dashboards
            # Actual JSON dashboards go here
            - name: dashboard-json
              mountPath: /var/lib/grafana/dashboards
      volumes:
        - name: datasources
          configMap:
            name: grafana-datasources
        - name: dashboard-provider
          configMap:
            name: grafana-dashboard-provider
        - name: dashboard-json
          configMap:
            name: grafana-dashboards
---
apiVersion: v1
kind: Service
metadata:
  name: grafana
  namespace: ${NAMESPACE}
spec:
  selector:
    app: grafana
  ports:
    - name: web
      port: 3000
      targetPort: 3000
EOF
echo ""
echo ">>> Waiting for deployments to become ready"
echo ""
# Block until each workload reports a successful rollout (same order as the
# original: grafana, blackbox-exporter, prometheus).
for deploy in grafana blackbox-exporter prometheus; do
    kubectl rollout status "deployment/${deploy}" -n ${NAMESPACE}
done
echo ""
echo ">>> Solution deployment completed successfully"
echo ""
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Task definition consumed by the evaluation harness.
id: synthetic-endpoint-monitoring
# Classification used for task routing/reporting.
metadata:
  category: observability
  difficulty: advanced
# Full prompt shown to the agent (literal block — content must match the
# grader's expectations exactly).
prompt: |
  [Context]
  The Nebula platform hosts multiple internal services exposed through Kubernetes ingress.
  While application-level metrics exist, the platform team lacks visibility into endpoint
  availability from a real client perspective.
  Synthetic probing is required to validate ingress reachability and detect outages
  independently of application instrumentation.
  [Environment Notes]
  The cluster operates in air-gapped mode.
  Required container images are already preloaded into the runtime environment and must
  be used as-is without pulling from external registries. Available image versions
  are documented in the platform wiki.
  [Task]
  Implement synthetic endpoint monitoring in the `observability` namespace.
  The monitoring stack must include the following Kubernetes resources:
  - Deployment `blackbox-exporter`
  - Service `blackbox-exporter`
  - ConfigMap `blackbox-config`
  - Deployment `prometheus`
  - Service `prometheus` exposing port 9090
  - ConfigMap `prometheus-config`
  - PersistentVolumeClaim `prometheus-data`
  Observability components must use the most recent preloaded Prometheus and Grafana container images available in the environment.
  The Nebula internal developer wiki documents commonly exposed platform services
  and their ingress hostnames:
  http://gitea.devops.local/root/platform-docs/wiki/
  Synthetic probes must target internal services representing real client access patterns.
  Probe results must be exposed as metrics consumable by the monitoring system.
  [Blackbox Exporter Requirements]
  The blackbox exporter must support multiple probe types:
  - HTTP probes for application-layer availability
  - TCP probes for transport-layer connectivity
  Probe behavior must follow protocol-appropriate validation:
  - Application-layer probes must validate protocol correctness
  - Transport-layer probes must validate connectivity only
  - HTTP probe modules must preserve default TLS verification behavior
  The Kubernetes API server health must be monitored using TCP connectivity probes
  against its standard in-cluster service name. This check must validate only
  basic transport-layer reachability (no TLS or HTTP validation).
  Exporter-level metrics (e.g., `up`) must not be used as a substitute for
  synthetic probe result metrics.
  [Prometheus Requirements]
  The global `scrape_interval` must be set to **15s or 10s**.
  Prometheus must be configured to scrape blackbox probe targets using appropriate
  relabeling. Configuration should support efficient querying and reuse of commonly
  evaluated availability signals. Repeated or computationally expensive expressions
  should not be evaluated directly at query time.
  [Alerting Requirements]
  Prometheus must define alerting rules based on synthetic probe result metrics.
  At least one alert must be named **SyntheticProbeFailure** and represent
  endpoint-level availability failure detected via synthetic probes.
  Alerts detecting sustained availability degradation should be based on
  SLO-style burn rate concepts (e.g., evaluating error budget consumption
  over time rather than fixed thresholds).
  Alerts must:
  - Detect rapid availability loss
  - Detect sustained availability degradation over longer periods
  - Distinguish failures on a per-endpoint basis
  - Avoid relying solely on instantaneous probe failures or fixed thresholds
  [Visualization Requirements]
  A visualization layer must be deployed in the `observability` namespace.
  Required resource:
  - ConfigMap `grafana-dashboards` for dashboard definitions
  The visualization system must:
  - Consume metrics directly from Prometheus via declarative configuration
  - Include at least one dashboard showing per-endpoint probe availability over time
  - Represent availability as a normalized measure over time
  - Allow comparison across endpoints
  - Include at least one responsiveness-related indicator
  - Not rely solely on binary success/failure signals
  Dashboard and data source configuration must be fully reproducible and stored
  as Kubernetes resources.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment