Skip to content

Instantly share code, notes, and snippets.

@arubis
Created February 19, 2026 21:25
Show Gist options
  • Select an option

  • Save arubis/9df733c3c96ea3ae47a26517fafb784b to your computer and use it in GitHub Desktop.

Select an option

Save arubis/9df733c3c96ea3ae47a26517fafb784b to your computer and use it in GitHub Desktop.
Synthetic Endpoint Monitoring — Reconciled Task Files (review-patched v46)
# Base image: internal Nebula DevOps toolchain image.
FROM us-central1-docker.pkg.dev/bespokelabs/nebula-devops-registry/nebula-devops:1.0.2
# World-writable work/data dirs so any task user can write artifacts there.
RUN mkdir -p /workdir /data && chmod -R 777 /workdir /data
# Install crane (container image tool) from the go-containerregistry release tarball.
RUN curl -sL https://github.com/google/go-containerregistry/releases/download/v0.19.0/go-containerregistry_Linux_x86_64.tar.gz \
| tar -xzf - -C /usr/local/bin crane
# Namespace the task is scoped to.
ENV ALLOWED_NAMESPACES="observability"
# Pre-pull the exact image versions the task expects, saved as tarballs in /workdir.
RUN crane pull prom/prometheus:v3.8.1 /workdir/prometheus-v3.8.1.tar
RUN crane pull prom/blackbox-exporter:v0.25.0 /workdir/blackbox-exporter-v0.25.0.tar
RUN crane pull grafana/grafana:12.2 /workdir/grafana-12.2.tar
# Virtual display settings for the computer-use harness.
ENV DISPLAY_NUM=1
ENV COMPUTER_HEIGHT_PX=768
ENV COMPUTER_WIDTH_PX=1024
#!/usr/bin/env python3
from __future__ import annotations
import subprocess
import json
import time
from apex_arena._types import GradingResult
import re
from datetime import timedelta
# Cached result of wait_for_grafana_api() (None = not yet probed) so repeated
# Grafana checks don't each pay the readiness wait.
_grafana_api_ready = None
# (name, namespace, local_port) keys of port-forwards already established in
# this process; lets port_forward() skip killing/re-creating an active tunnel.
_active_port_forwards: set = set()
def sh(cmd):
    """Run *cmd* through the shell; return (returncode, stdout, stderr), stripped."""
    proc = subprocess.run(cmd, shell=True, capture_output=True, text=True)
    stdout = proc.stdout.strip()
    stderr = proc.stderr.strip()
    return proc.returncode, stdout, stderr
def normalize_image(img):
    """Strip docker.io registry prefixes so image references compare equal."""
    # Longest prefix first so "docker.io/library/x" is not left as "library/x".
    prefixes = ("docker.io/library/", "docker.io/")
    matched = next((p for p in prefixes if img.startswith(p)), None)
    return img[len(matched):] if matched else img
def port_forward(kind, name, namespace, local_port, remote_port):
    """Start a background `kubectl port-forward` for kind/name, once per process.

    Side effects: kills any existing port-forward matching local_port, then
    launches a new one detached (output discarded) and sleeps 5s so the
    tunnel can bind before callers curl it. Subsequent calls with the same
    (name, namespace, local_port) key are no-ops via _active_port_forwards.
    """
    key = (name, namespace, local_port)
    if key in _active_port_forwards:
        # Already forwarded earlier in this run — reuse the live tunnel.
        return
    # Best-effort cleanup of stale forwards bound to the same local port.
    sh(f"pkill -f 'kubectl port-forward.*{local_port}' || true")
    sh(
        f"kubectl port-forward -n {namespace} {kind}/{name} "
        f"{local_port}:{remote_port} >/dev/null 2>&1 &"
    )
    time.sleep(5)  # give the tunnel time to establish
    _active_port_forwards.add(key)
def parse_duration(duration_str):
    """Parse a Prometheus duration string ("30s", "5m", "1h") into a timedelta.

    Only the leading <digits><unit> pair is honored (e.g. "5m30s" -> 5m);
    anything unparseable yields timedelta(0).
    """
    m = re.match(r'(\d+)(s|m|h)', str(duration_str))
    if m is None:
        return timedelta(0)
    unit_to_kwarg = {'s': 'seconds', 'm': 'minutes', 'h': 'hours'}
    return timedelta(**{unit_to_kwarg[m.group(2)]: int(m.group(1))})
# Waiting-state reasons a pod will never recover from without a spec change.
_TERMINAL_POD_STATES = {"ImagePullBackOff", "ErrImagePull", "InvalidImageName"}


def check_deployment_health(name, namespace):
    """Quick check for terminal pod errors. Returns (healthy, message).

    Fast-fails when any pod selected by the deployment is stuck in a terminal
    image-pull state so callers can skip long retry loops. All kubectl or
    parse failures after the initial existence check are treated as "assume
    healthy" — this is a best-effort guard, not a correctness check.
    """
    code, out, _ = sh(
        f"kubectl get deployment {name} -n {namespace} -o json"
    )
    if code != 0:
        return False, f"Deployment '{name}' not found in {namespace}"
    try:
        data = json.loads(out)
        selector = data["spec"]["selector"]["matchLabels"]
        # Build a kubectl label selector string: k1=v1,k2=v2
        label_str = ",".join(f"{k}={v}" for k, v in selector.items())
    except (json.JSONDecodeError, KeyError):
        return True, "Could not parse selector, assuming healthy"
    code, out, _ = sh(
        f"kubectl get pods -n {namespace} -l {label_str} -o json"
    )
    if code != 0:
        return True, "Could not list pods, assuming healthy"
    try:
        pods = json.loads(out).get("items", [])
    except json.JSONDecodeError:
        return True, "Could not parse pods, assuming healthy"
    for pod in pods:
        statuses = pod.get("status", {}).get("containerStatuses", [])
        for cs in statuses:
            # A container waiting with a terminal reason will not recover;
            # report it immediately instead of letting callers poll.
            waiting = cs.get("state", {}).get("waiting", {})
            reason = waiting.get("reason", "")
            if reason in _TERMINAL_POD_STATES:
                pod_name = pod.get("metadata", {}).get("name", "unknown")
                return False, (
                    f"Pod '{pod_name}' in {namespace} is stuck in {reason} — "
                    f"deployment '{name}' will not recover"
                )
    return True, f"Deployment '{name}' pods are not in terminal error state"
# -------------------------------------------------------------------
# RESOURCE CHECKS
# -------------------------------------------------------------------
def deployment_exists(name, namespace):
    """Gate: the named Deployment can be fetched with kubectl."""
    rc, _, _ = sh(f"kubectl get deployment {name} -n {namespace}")
    if rc == 0:
        return True, f"Deployment '{name}' exists in {namespace}"
    return False, f"Deployment '{name}' not found in {namespace}"
def configmap_contains(name, namespace, required_strings):
    """Check that the ConfigMap's YAML dump contains every required substring."""
    rc, body, _ = sh(
        f"kubectl get configmap {name} -n {namespace} -o yaml"
    )
    if rc != 0:
        return False, f"ConfigMap '{name}' not readable"
    missing = [needle for needle in required_strings if needle not in body]
    if missing:
        return False, f"ConfigMap '{name}' missing: {missing}"
    return True, f"ConfigMap '{name}' contains required content"
def service_exists(name, namespace, port):
    """Gate: the Service exists and exposes *port* in spec.ports.

    Returns (ok, message). Fix: JSON decoding is now guarded so malformed
    kubectl output yields a readable failure instead of an uncaught
    exception — consistent with check_deployment_health's parsing style.
    """
    code, out, _ = sh(
        f"kubectl get svc {name} -n {namespace} -o json"
    )
    if code != 0:
        return False, f"Service '{name}' not found in {namespace}"
    try:
        data = json.loads(out)
    except json.JSONDecodeError:
        return False, f"Service '{name}' returned unparseable JSON"
    ports = [p.get("port") for p in data.get("spec", {}).get("ports", [])]
    if port in ports:
        return True, f"Service '{name}' exposes port {port}"
    return False, f"Service '{name}' does not expose port {port}"
def wait_for_grafana_api():
    """Wait until Grafana's /api/health answers inside the pod; cache the result.

    Returns True/False. The module-level _grafana_api_ready cache means the
    wait (up to 60s pod wait + 6 x 5s polls) is paid at most once per run.
    """
    global _grafana_api_ready
    if _grafana_api_ready is not None:
        return _grafana_api_ready
    # Best-effort wait for the pod itself; exit code deliberately ignored.
    sh(
        "kubectl wait --for=condition=ready pod -l app=grafana "
        "-n observability --timeout=60s"
    )
    for _ in range(6):
        code, _, _ = sh(
            "kubectl exec -n observability deploy/grafana -- "
            "wget -qO- http://localhost:3000/api/health"
        )
        if code == 0:
            _grafana_api_ready = True
            return True
        time.sleep(5)
    _grafana_api_ready = False
    return False
def configmap_exists(name, namespace):
    """Gate: the named ConfigMap can be fetched with kubectl."""
    rc, _, _ = sh(f"kubectl get configmap {name} -n {namespace}")
    if rc == 0:
        return True, f"ConfigMap '{name}' exists in {namespace}"
    return False, f"ConfigMap '{name}' not found in {namespace}"
def pvc_exists(name, namespace):
    """Gate: the named PersistentVolumeClaim can be fetched with kubectl."""
    rc, _, _ = sh(f"kubectl get pvc {name} -n {namespace}")
    if rc == 0:
        return True, f"PVC '{name}' exists in {namespace}"
    return False, f"PVC '{name}' not found in {namespace}"
# -------------------------------------------------------------------
# BLACKBOX FUNCTIONAL CHECKS
# -------------------------------------------------------------------
def blackbox_metrics_exposed():
    """Gate: the blackbox exporter /metrics endpoint responds via port-forward.

    Fast-fails if the deployment is in a terminal pod state, then curls the
    forwarded service and looks for a metric name the exporter emits.
    """
    healthy, msg = check_deployment_health("blackbox-exporter", "observability")
    if not healthy:
        return False, msg
    port_forward(
        "svc",
        "blackbox-exporter",
        "observability",
        9115,
        9115,
    )
    code, out, _ = sh("curl -s http://localhost:9115/metrics")
    if code == 0 and "blackbox_exporter_config_last_reload_successful" in out:
        return True, "Blackbox exporter metrics endpoint is reachable"
    return False, "Blackbox exporter metrics endpoint not responding correctly"
def blackbox_config_has_required_modules():
    """Gate: blackbox-config defines both http_2xx and tcp_connect modules."""
    rc, body, _ = sh(
        "kubectl get configmap blackbox-config "
        "-n observability -o jsonpath='{.data}'"
    )
    if rc != 0:
        return False, "blackbox-config not readable"
    missing = [mod for mod in ("http_2xx", "tcp_connect") if mod not in body]
    if missing:
        return False, f"blackbox-config missing required modules: {missing}"
    return True, "blackbox-config contains all required probe modules"
def kubernetes_api_tcp_probe_configured():
    """Gate: Prometheus probes kubernetes.default via the plain tcp_connect module.

    Rejects configs using tcp_connect_tls — the task requires tcp_connect.
    Fix: removed the redundant function-local `import re` (the module is
    already imported at file scope).
    """
    code, out, _ = sh(
        "kubectl get configmap prometheus-config "
        "-n observability -o jsonpath='{.data.prometheus\\.yml}'"
    )
    if code != 0:
        return False, "Prometheus config not readable"
    if "kubernetes.default" not in out:
        return False, "Kubernetes API server not configured as probe target"
    # Tolerates list/quote YAML forms: module: tcp_connect / [tcp_connect] / "tcp_connect"
    tcp_module_pattern = r'module:\s*\[?\s*["\']?tcp_connect["\']?\s*\]?'
    if not re.search(tcp_module_pattern, out) or "tcp_connect_tls" in out:
        return False, "Must use 'tcp_connect' module (not tcp_connect_tls) for Kubernetes API TCP probe"
    return True, "Kubernetes API TCP probe is configured with tcp_connect module"
def prometheus_has_probe_metrics():
    """Gate: Prometheus answers a probe_success instant query (6 tries, 5s apart)."""
    healthy, msg = check_deployment_health("prometheus", "observability")
    if not healthy:
        return False, msg
    port_forward("svc", "prometheus", "observability", 9090, 9090)
    for _ in range(6):
        code, out, _ = sh(
            "curl -s "
            "'http://localhost:9090/api/v1/query?query=probe_success'"
        )
        # Any JSON body containing "result" means the query API responded.
        if code == 0 and '"result"' in out:
            return True, "Prometheus is collecting probe metrics"
        time.sleep(5)
    return False, "Prometheus not returning probe metrics after retries"
def check_slo_burn_rate_alerts():
    """Verify alerts implement proper multi-window SLO burn rate logic.

    Requires structural evidence (keyword matching is too easy to game):
      * at least 2 distinct avg_over_time(...) window sizes, and
      * at least 2 distinct non-zero 'for:' durations across alert rules
        (fast-burn vs slow-burn detection windows).
    """
    code, out, _ = sh(
        "kubectl get configmap prometheus-config "
        "-n observability -o yaml"
    )
    if code != 0:
        return False, "Prometheus config not readable"
    # Must have multiple time windows in recording rules or alert expressions
    windows = re.findall(
        r"avg_over_time\([^)]*\[(\d+[mh])\]\)",
        out
    )
    if len(set(windows)) < 2:
        return False, (
            "Burn rate alerts must use multiple time windows "
            "(e.g., 5m and 1h)"
        )
    # Must have at least 2 distinct 'for:' durations across alert rules
    # (evidence of fast-burn vs slow-burn detection windows)
    for_durations = re.findall(r"for:\s*(\d+[smh])", out)
    # Compare as timedeltas so equivalent spellings (e.g. "60s"/"1m") dedupe.
    unique_durations = {parse_duration(d) for d in for_durations}
    unique_durations.discard(timedelta(0))
    if len(unique_durations) < 2:
        return False, (
            "SLO burn rate alerting requires multiple detection windows "
            "(e.g., a fast-burn alert with 'for: 2m' and a slow-burn "
            "alert with 'for: 1h')"
        )
    return True, "Valid multi-window SLO burn rate alerts detected"
def prometheus_scrape_interval_valid():
    """Gate: the global scrape_interval in prometheus.yml is 10s or 15s.

    Fix: the original ignored the kubectl exit code, so an unreadable
    ConfigMap produced the misleading "missing global section" message.
    """
    code, out, _ = sh(
        "kubectl get configmap prometheus-config "
        "-n observability -o jsonpath='{.data.prometheus\\.yml}'"
    )
    if code != 0:
        return False, "Prometheus config not readable"
    if "global:" not in out:
        return False, "Prometheus config missing global section"
    # Only inspect text before scrape_configs so per-job intervals don't pass.
    global_section = out.split("scrape_configs")[0] if "scrape_configs" in out else out
    if "scrape_interval: 15s" not in global_section and "scrape_interval: 10s" not in global_section:
        return False, "Global scrape_interval must be 10s or 15s"
    return True, "Scrape interval is appropriately configured"
def check_alert_for_duration():
    """Verify alerts have appropriate 'for' duration for timely detection.

    The SyntheticProbeFailure alert must wait between 30s and 2m before
    firing: shorter flaps on transient blips, longer misses the 2m
    detection requirement.
    """
    code, out, _ = sh(
        "kubectl get configmap prometheus-config "
        "-n observability -o yaml"
    )
    if code != 0:
        return False, "Prometheus config not readable"
    # First 'for:' after the alert name; DOTALL lets it be on a later line.
    for_match = re.search(
        r'alert:\s*SyntheticProbeFailure.*?for:\s*(\d+[smh])',
        out,
        re.DOTALL
    )
    if not for_match:
        if 'SyntheticProbeFailure' not in out:
            return False, "SyntheticProbeFailure alert not found"
        return False, "Alert missing 'for' duration"
    for_duration = for_match.group(1)
    duration = parse_duration(for_duration)
    if duration > timedelta(minutes=2):
        return False, f"Alert 'for' duration {for_duration} exceeds 2m detection requirement"
    if duration < timedelta(seconds=30):
        return False, f"Alert 'for' duration {for_duration} too short, will cause flapping"
    return True, f"Alert 'for' duration {for_duration} is appropriate"
def check_alert_annotations():
    """Verify alerts have required annotations for operational use."""
    rc, cfg, _ = sh(
        "kubectl get configmap prometheus-config "
        "-n observability -o yaml"
    )
    if rc != 0:
        return False, "Prometheus config not readable"
    if 'SyntheticProbeFailure' not in cfg:
        return False, "SyntheticProbeFailure alert not found"
    # Only examine the text from the alert definition onward.
    tail = cfg[cfg.find('SyntheticProbeFailure'):]
    if 'annotations:' not in tail:
        return False, "Alert missing annotations section"
    has_text = 'description' in tail or 'summary' in tail
    if not has_text:
        return False, "Alert missing description/summary annotation"
    return True, "Alert has required annotations"
def check_recording_rules():
    """Verify recording rules exist AND are used in alert expressions.

    Requires >=2 named `record:` rules and >=2 of those names appearing in
    the text from the first `alert:` onward — evidence that alerts query
    the pre-computed series rather than raw PromQL.
    """
    code, out, _ = sh(
        "kubectl get configmap prometheus-config "
        "-n observability -o yaml"
    )
    if code != 0:
        return False, "Prometheus config not readable"
    # Must define recording rules
    if "record:" not in out:
        return False, "Prometheus should define recording rules"
    # Extract recording rule names (Prometheus metric-name charset incl. ':')
    record_names = re.findall(
        r"record:\s*([a-zA-Z_:][a-zA-Z0-9_:]*)",
        out
    )
    if not record_names:
        return False, "No valid recording rule names found"
    # Multi-window availability requires at least 2 recording rules
    if len(record_names) < 2:
        return False, (
            "Multiple recording rules needed for multi-window "
            "availability signals (e.g., 5m and 1h windows)"
        )
    # At least 2 recording rules must be referenced in alert expressions
    alert_section = out[out.find("alert:"):] if "alert:" in out else out
    used_count = sum(1 for name in record_names if name in alert_section)
    if used_count < 2:
        return False, (
            "At least 2 recording rules should be referenced in alert "
            "expressions for multi-window burn rate detection"
        )
    return True, "Recording rules exist and are used in alerts"
def check_blackbox_modules():
    """Verify correct Blackbox modules used for each protocol.

    Heuristic: inspect a text window around each known target (500 chars
    before, 200 after) for the expected module name. NOTE(review): the
    window sizes are arbitrary — a module declared outside that window
    would be missed; confirm against real configs.
    """
    code, out, _ = sh(
        "kubectl get configmap prometheus-config "
        "-n observability -o jsonpath='{.data.prometheus\\.yml}'"
    )
    if code != 0:
        return False, "Prometheus config not readable"
    out_lower = out.lower()
    # --- Kubernetes API must use tcp_connect ---
    if 'kubernetes.default' in out:
        kube_pos = out.find('kubernetes.default')
        kube_section = out[max(0, kube_pos - 500):kube_pos + 200]
        if 'tcp_connect' not in kube_section:
            return False, "Kubernetes API target should use tcp_connect module"
    # --- HTTP endpoints must use http_2xx ---
    if 'argocd' in out_lower:
        argocd_pos = out_lower.find('argocd')
        argocd_section = out[max(0, argocd_pos - 500):argocd_pos + 200]
        if 'http_2xx' not in argocd_section and 'http' in argocd_section:
            return False, "HTTP targets should use http_2xx module"
    return True, "Blackbox modules correctly matched to target protocols"
def check_alert_severity_labels():
    """Verify alerts define severity labels."""
    rc, cfg, _ = sh(
        "kubectl get configmap prometheus-config "
        "-n observability -o yaml"
    )
    if rc != 0:
        return False, "Prometheus config not readable"
    if "severity:" in cfg:
        return True, "Alert severity labels present"
    return False, (
        "Alerts must define severity labels "
        "(critical or warning)"
    )
def check_dashboard_uses_recording_rules():
    """Dashboard should reference recording rules instead of raw PromQL.

    Scored. Passes when a recording-rule name extracted from
    prometheus-config appears in the dashboard ConfigMap, or — fallback —
    any name matching the probe:*:* recording-rule naming convention is
    present in the dashboard.
    """
    code, dash_out, _ = sh(
        "kubectl get configmap grafana-dashboards "
        "-n observability -o yaml"
    )
    if code != 0:
        return False, "grafana-dashboards ConfigMap not readable"
    # Extract actual recording rule names from prometheus-config
    code, prom_out, _ = sh(
        "kubectl get configmap prometheus-config "
        "-n observability -o yaml"
    )
    if code == 0:
        record_names = re.findall(
            r"record:\s*([a-zA-Z_:][a-zA-Z0-9_:]*)",
            prom_out
        )
        if record_names and any(name in dash_out for name in record_names):
            return True, "Dashboard references recording rules"
    # Fallback: accept any probe:*:* pattern (recording rule convention)
    if re.search(r"probe:[a-zA-Z_]+:[a-zA-Z0-9_]+", dash_out):
        return True, "Dashboard references recording rules"
    return False, (
        "Dashboard should reference pre-computed recording rules "
        "(e.g., probe:availability:5m) instead of raw PromQL"
    )
def argocd_probe_success():
    """Gate: a live blackbox probe of the Argo CD endpoint reports success.

    Port-forwards the exporter, drives a /probe request at the Argo CD
    /api/version URL with the http_2xx module, and greps for an exact
    `probe_success 1` metric line.
    """
    port_forward(
        "svc",
        "blackbox-exporter",
        "observability",
        9115,
        9115,
    )
    cmd = (
        "curl -s "
        "'http://localhost:9115/probe?"
        "target=http://argocd.devops.local:80/api/version&module=http_2xx' | "
        "grep '^probe_success 1'"
    )
    code, out, _ = sh(cmd)
    # grep exits 0 only when the success line matched; out holds the line.
    if code == 0 and out:
        return True, "Synthetic probe reports Argo CD endpoint as available"
    return False, "Synthetic probe did not report Argo CD as available"
def deployment_uses_image(name, namespace, expected_image):
    """Gate: one of the Deployment's containers runs *expected_image*.

    Images are compared after normalize_image() strips docker.io prefixes.
    Fix: JSON decoding and spec traversal are now guarded so malformed
    kubectl output yields a (False, message) instead of an exception.
    """
    code, out, _ = sh(
        f"kubectl get deployment {name} -n {namespace} -o json"
    )
    if code != 0:
        return False, f"Deployment '{name}' not found"
    try:
        data = json.loads(out)
        containers = data["spec"]["template"]["spec"]["containers"]
    except (json.JSONDecodeError, KeyError, TypeError):
        return False, f"Deployment '{name}' spec could not be parsed"
    images = [c.get("image", "") for c in containers]
    actual = [normalize_image(i) for i in images]
    expected = normalize_image(expected_image)
    if expected in actual:
        return True, f"Deployment '{name}' uses image '{expected_image}'"
    return False, f"Expected {expected_image}, found {images}"
def prometheus_blackbox_relabeling_present():
    """Gate: prometheus.yml routes probe scrapes through the blackbox exporter."""
    rc, cfg, _ = sh(
        "kubectl get configmap prometheus-config "
        "-n observability -o jsonpath='{.data.prometheus\\.yml}'"
    )
    if rc != 0:
        return False, "Prometheus config not readable"
    missing = [
        snippet
        for snippet in ("metrics_path: /probe", "__param_target")
        if snippet not in cfg
    ]
    exporter_addr_present = (
        "blackbox-exporter:9115" in cfg
        or "blackbox-exporter.observability" in cfg
    )
    if not exporter_addr_present:
        missing.append("blackbox-exporter address")
    if missing:
        return False, f"Missing blackbox relabeling elements: {missing}"
    return True, "Prometheus blackbox relabeling is correctly configured"
def prometheus_alert_fires_for_failing_probe():
    """Scored: SyntheticProbeFailure appears in /api/v1/alerts for the bad target.

    Polls up to 18 times, 8s apart (~2.5 min total) — the failing probe must
    persist past the alert's 'for:' duration before Prometheus reports it.
    """
    for dep in ("prometheus", "blackbox-exporter"):
        healthy, msg = check_deployment_health(dep, "observability")
        if not healthy:
            return False, msg
    port_forward("svc", "prometheus", "observability", 9090, 9090)
    for _ in range(18):
        code, out, _ = sh(
            "curl -s http://localhost:9090/api/v1/alerts"
        )
        if (
            code == 0
            and "SyntheticProbeFailure" in out
            and "does-not-exist.devops.local" in out
        ):
            return True, "SyntheticProbeFailure alert is firing"
        time.sleep(8)
    return False, "SyntheticProbeFailure alert did not fire"
def grafana_has_prometheus_datasource():
    """Gate: Grafana's datasource API lists a Prometheus datasource.

    Queries the API from inside the pod with basic auth
    (YWRtaW46YWRtaW4= is base64 for admin:admin); 6 tries, 5s apart.
    """
    healthy, msg = check_deployment_health("grafana", "observability")
    if not healthy:
        return False, msg
    if not wait_for_grafana_api():
        return False, "Grafana API not reachable"
    for _ in range(6):
        code, out, _ = sh(
            "kubectl exec -n observability deploy/grafana -- "
            "wget -qO- --header='Authorization: Basic YWRtaW46YWRtaW4=' "
            "http://localhost:3000/api/datasources"
        )
        if code == 0 and "Prometheus" in out:
            return True, "Grafana Prometheus datasource configured"
        time.sleep(5)
    return False, "Grafana Prometheus datasource missing"
def grafana_has_blackbox_dashboard():
    """Scored: Grafana's search API returns a synthetic-monitoring dashboard."""
    healthy, msg = check_deployment_health("grafana", "observability")
    if not healthy:
        return False, msg
    if not wait_for_grafana_api():
        return False, "Grafana API not reachable"
    keywords = ("Synthetic", "Blackbox", "Probe", "Endpoint")
    for _attempt in range(6):
        rc, body, _ = sh(
            "kubectl exec -n observability deploy/grafana -- "
            "wget -qO- --header='Authorization: Basic YWRtaW46YWRtaW4=' "
            "http://localhost:3000/api/search"
        )
        if rc == 0 and any(kw in body for kw in keywords):
            return True, "Grafana dashboard for synthetic probes exists"
        time.sleep(5)
    return False, "Grafana dashboard missing"
def prometheus_uses_pvc():
    """Gate: Prometheus mounts a PersistentVolumeClaim at /prometheus.

    Fix: guards JSON decoding and the containers[0] access so malformed or
    container-less deployment specs yield a message instead of an uncaught
    exception.
    """
    code, out, _ = sh(
        "kubectl get deployment prometheus "
        "-n observability -o json"
    )
    if code != 0:
        return False, "Prometheus deployment not found"
    try:
        spec = json.loads(out)["spec"]["template"]["spec"]
        volumes = spec.get("volumes", [])
        containers = spec["containers"]
        mounts = containers[0].get("volumeMounts", []) if containers else []
    except (json.JSONDecodeError, KeyError, TypeError):
        return False, "Prometheus deployment spec could not be parsed"
    pvc_used = any(v.get("persistentVolumeClaim") for v in volumes)
    mounted = any(m.get("mountPath") == "/prometheus" for m in mounts)
    if pvc_used and mounted:
        return True, "Prometheus is using persistent storage"
    return False, "Prometheus PVC is not mounted at /prometheus"
def alert_rule_identifies_endpoint():
    """Verify alert annotations reference the failing endpoint."""
    rc, cfg, _ = sh(
        "kubectl get configmap prometheus-config "
        "-n observability -o yaml"
    )
    if rc != 0:
        return False, "Prometheus config not readable"
    # Operators must be able to tell WHICH endpoint failed, so the alert
    # must template the instance/target label into its text.
    if re.search(r'\{\{\s*\$labels\.(instance|target)\s*\}\}', cfg):
        return True, "Alert annotations identify the failing endpoint"
    return False, (
        "Alert annotations must reference the failing endpoint "
        "(e.g., {{ $labels.instance }}) for operational use"
    )
def alert_has_minimum_duration():
    """Verify alert rule has for: 2m or greater duration"""
    rc, cfg, _ = sh(
        "kubectl get configmap prometheus-config "
        "-n observability -o yaml"
    )
    if rc != 0:
        return False, "Prometheus config not readable"
    # Matches 'for: 2m'..'for: 9m' or any minute value of 10 or more.
    if re.search(r'for:\s*([2-9]|[1-9]\d+)m', cfg) is not None:
        return True, "Alert rule has correct minimum duration (>=2m)"
    return False, "Alert rule must fire 'for: 2m' or longer (not immediate)"
def prometheus_alert_is_per_endpoint():
    """Scored: SyntheticProbeFailure alerts carry per-endpoint instance labels.

    Polls the Prometheus alerts API from inside the pod (no port-forward)
    up to 18 times. Passes only when every SyntheticProbeFailure alert has
    a non-empty instance label AND the known-bad target is in the firing set.
    """
    for dep in ("prometheus", "blackbox-exporter"):
        healthy, msg = check_deployment_health(dep, "observability")
        if not healthy:
            return False, msg
    for _ in range(18):
        code, out, _ = sh(
            "kubectl exec -n observability deploy/prometheus -- "
            "wget -qO- http://localhost:9090/api/v1/alerts"
        )
        if code != 0 or "SyntheticProbeFailure" not in out:
            time.sleep(8)
            continue
        try:
            data = json.loads(out)
            alerts = data.get("data", {}).get("alerts", [])
            synthetic = [
                a for a in alerts
                if a.get("labels", {}).get("alertname") == "SyntheticProbeFailure"
            ]
            if len(synthetic) < 1:
                time.sleep(8)
                continue
            instances = {
                a.get("labels", {}).get("instance", "") for a in synthetic
            }
            # Any empty instance label means alerting is not scoped per target.
            if not all(instances):
                return False, (
                    "SyntheticProbeFailure alerts lack instance labels — "
                    "alerting is not scoped per endpoint"
                )
            firing = {
                a.get("labels", {}).get("instance", "")
                for a in synthetic if a.get("state") == "firing"
            }
            has_failing = any("does-not-exist" in i for i in firing)
            if has_failing:
                return True, (
                    "Alerts fire per endpoint (failing endpoint alerts "
                    "independently with instance labels)"
                )
            time.sleep(8)
        except (json.JSONDecodeError, KeyError):
            time.sleep(10)
            continue
    return False, "Could not verify per-endpoint alert scoping"
def prometheus_does_not_use_up_metric():
    """Gate: alerting must key off probe results, not the exporter 'up' metric.

    NOTE(review): the substring scan (" up ", "up==", "up ==") over the
    whole ConfigMap YAML can false-positive on comments or annotation
    prose containing " up " — confirm against real agent configs.
    """
    code, out, _ = sh(
        "kubectl get configmap prometheus-config "
        "-n observability -o yaml"
    )
    if code != 0:
        return False, "Prometheus config not readable"
    if " up " in out or "up==" in out or "up ==" in out:
        return False, "Alerting incorrectly uses exporter 'up' metric"
    return True, "Alerting correctly avoids exporter 'up' metric"
def check_endpoint_count():
    """Verify at least 3 probe targets are configured."""
    rc, cfg, _ = sh(
        "kubectl get configmap prometheus-config "
        "-n observability -o jsonpath='{.data.prometheus\\.yml}'"
    )
    if rc != 0:
        return False, "Prometheus config not readable"
    lowered = cfg.lower()
    required_targets = {
        'argocd': 'argocd' in lowered,
        'kubernetes_api': 'kubernetes.default' in lowered,
        'test_endpoint': 'does-not-exist' in lowered,
    }
    missing = [key for key, present in required_targets.items() if not present]
    if missing:
        return False, f"Missing probe targets: {missing}"
    return True, "All required endpoints configured"
def check_grafana_dashboard_semantics():
    """Verify Grafana dashboard uses correct semantic patterns for synthetic monitoring.

    Scored, four sub-checks over the raw dashboard ConfigMap text:
      1. references availability metrics (probe_success or a recording rule),
         and if raw probe_success is used, it is time-aggregated;
      2. breaks results down per endpoint (by(instance)/target or legend vars);
      3. presents availability as a normalized/percentage measure, not binary;
      4. includes a response-time metric.
    All failing sub-checks are reported together in one message.
    """
    code, out, _ = sh(
        "kubectl get configmap grafana-dashboards "
        "-n observability -o yaml"
    )
    if code != 0:
        return False, "grafana-dashboards ConfigMap not readable"
    issues = []
    # ------------------------------------------------------------------
    # Check 1: dashboard must show availability data (not raw binary)
    # ------------------------------------------------------------------
    has_availability_metric = (
        "probe_success" in out
        or re.search(r"probe:[a-zA-Z_]*availab", out)
        or re.search(r"probe:[a-zA-Z_]*success", out)
    )
    if has_availability_metric:
        if "probe_success" in out:
            has_time_agg = any(fn in out for fn in [
                "avg_over_time",
                "min_over_time",
                "max_over_time",
                "sum_over_time",
            ])
            if not has_time_agg:
                issues.append(
                    "Dashboard uses raw probe_success without time "
                    "aggregation (expected avg_over_time or similar)"
                )
    else:
        issues.append(
            "Dashboard does not reference probe availability metrics "
            "(probe_success or a recording rule like probe:availability)"
        )
    # ------------------------------------------------------------------
    # Check 2: per-endpoint breakdown (instance / target)
    # ------------------------------------------------------------------
    has_grouping = any(x in out for x in [
        "by (instance)",
        "by (target)",
        "$labels.instance",
        "$labels.target",
        "{{ instance }}",
        "{{instance}}",
        "{{ target }}",
        "{{target}}",
    ])
    if not has_grouping:
        issues.append(
            "Dashboard does not show per-endpoint breakdown "
            "(missing by(instance), legendFormat with {{ instance }}, "
            "or target label usage)"
        )
    # ------------------------------------------------------------------
    # Check 3: availability not shown as raw binary signal
    # ------------------------------------------------------------------
    has_normalized = (
        # Percentage form (e.g., * 100)
        any(x in out for x in ["* 100", "*100", "100 *"])
        # Or [0,1] normalized via time aggregation (avg_over_time already
        # produces a continuous availability ratio, not binary)
        or any(fn in out for fn in [
            "avg_over_time", "min_over_time", "max_over_time",
        ])
        # Or uses a recording rule that pre-computes availability
        or re.search(r"probe:[a-zA-Z_]+:", out)
    )
    if not has_normalized:
        issues.append(
            "Dashboard should represent availability as a normalized "
            "measure (e.g., avg_over_time for ratio or * 100 for percentage)"
        )
    # ------------------------------------------------------------------
    # Check 4: response-time / latency metrics present
    # ------------------------------------------------------------------
    latency_metrics = [
        "probe_duration_seconds",
        "probe_http_duration_seconds",
        "probe_tcp_connection_duration_seconds",
        "probe_dns_lookup_time_seconds",
    ]
    has_latency = any(metric in out for metric in latency_metrics)
    if not has_latency:
        issues.append(
            "Dashboard should include response-time metrics "
            "(e.g. probe_duration_seconds or protocol-specific durations)"
        )
    # ------------------------------------------------------------------
    # Final result
    # ------------------------------------------------------------------
    if issues:
        return False, "; ".join(issues)
    return True, (
        "Dashboard uses time-aggregated probe metrics, per-endpoint breakdown, "
        "availability percentage, and latency visualization"
    )
def grade(transcript: str) -> GradingResult:
    """Run all checks and return a GradingResult.

    Gate checks contribute feedback only; the 8 scored checks are equally
    weighted (1/8 each). *transcript* is accepted for interface
    compatibility but is not inspected by any check.
    """
    feedback = []
    # ------------------------
    # Gate checks (non-scored)
    # ------------------------
    gate_checks = [
        # Resource existence
        lambda: deployment_exists("blackbox-exporter", "observability"),
        lambda: service_exists("blackbox-exporter", "observability", 9115),
        lambda: configmap_exists("blackbox-config", "observability"),
        lambda: deployment_exists("prometheus", "observability"),
        lambda: configmap_exists("prometheus-config", "observability"),
        lambda: pvc_exists("prometheus-data", "observability"),
        blackbox_config_has_required_modules,
        prometheus_scrape_interval_valid,
        # Image correctness
        lambda: deployment_uses_image(
            "blackbox-exporter", "observability",
            "prom/blackbox-exporter:v0.25.0",
        ),
        lambda: deployment_uses_image(
            "prometheus", "observability",
            "prom/prometheus:v3.8.1",
        ),
        lambda: deployment_uses_image(
            "grafana", "observability",
            "grafana/grafana:12.2",
        ),
        # Core operational
        blackbox_metrics_exposed,
        prometheus_has_probe_metrics,
        prometheus_blackbox_relabeling_present,
        argocd_probe_success,
        kubernetes_api_tcp_probe_configured,
        prometheus_uses_pvc,
        # Basic config quality
        grafana_has_prometheus_datasource,
        check_alert_severity_labels,
        check_alert_annotations,
        check_alert_for_duration,
        alert_has_minimum_duration,
        prometheus_does_not_use_up_metric,
        check_slo_burn_rate_alerts,
        check_blackbox_modules,
    ]
    for fn in gate_checks:
        # A crashing check is reported as a failure; grading never aborts.
        try:
            ok, msg = fn()
        except Exception as e:
            ok = False
            msg = str(e)
        feedback.append(("✓ " if ok else "✗ ") + msg)
    # ------------------------
    # Scored checks (partial)
    # ------------------------
    scored_checks = {
        "grafana_dashboard_present": grafana_has_blackbox_dashboard,
        "endpoint_count": check_endpoint_count,
        "grafana_dashboard_semantics": check_grafana_dashboard_semantics,
        "failing_in_alert": prometheus_alert_fires_for_failing_probe,
        "alert_identifies_endpoint": alert_rule_identifies_endpoint,
        "per_endpoint": prometheus_alert_is_per_endpoint,
        "records": check_recording_rules,
        "recording_rules": check_dashboard_uses_recording_rules,
    }
    subscores = {}
    for key, fn in scored_checks.items():
        try:
            ok, msg = fn()
        except Exception as e:
            ok = False
            msg = str(e)
        subscores[key] = 1.0 if ok else 0.0
        feedback.append(("✓ " if ok else "✗ ") + msg)
    #
    # Equal weighting across the scored checks.
    total_checks = len(scored_checks)
    weight = 1.0 / total_checks
    weights = {k: weight for k in scored_checks}
    score = sum(subscores[k] * weights[k] for k in subscores)
    return GradingResult(
        score=round(score, 4),
        subscores=subscores,
        weights=weights,
        feedback=" | ".join(feedback),
    )

Synthetic Endpoint Monitoring — Reconciled Task Files

Base version: v46 (a6b6b25b-fbdf-4830-bd13-258c6bfd9948) Status: Review patch applied — ready for solution test and eval Approach: Minimal changes on v46; author's implementations preserved where both versions address the same issue satisfactorily

These are the complete, ready-to-upload task files with review patches applied on top of the author's v46 submission. A companion gist contains the raw patch and detailed per-change rationale.


What changed from v46

Blocking fixes

  1. Wiki URL corrected — task.yaml pointed to gitea.devops.local:3000/root/bleater-app/wiki but setup.sh creates the wiki at gitea.devops.local/root/platform-docs/wiki/. The agent gets a 404 and can't discover probe targets.

  2. Grader restructured: 25 scored → 25 gates + 8 scored — v46's 25 equal-weight scored checks (0.04 each) let an agent score 0.32 by passing trivial existence checks without deploying anything functional. Now gates provide diagnostic feedback while 8 substantive scored checks (0.125 each) require real working functionality to earn points.

  3. SyntheticProbeFailure alert name added to task.yaml — The grader checks for this exact string in 4+ places but v46's task description never names it. An agent choosing "ProbeFailure" or "EndpointDown" fails through no fault of their own.

Reliability fixes

  1. Wiki creation via git-commit (setup.sh) — Replaces flaky Gitea POST .../wiki/new API with direct git commit to the bare wiki repo. Eliminates timing-dependent 404s.

  2. Image tag aliasing (setup.sh) — ctr images tag docker.io/X X ensures both prefixed and unprefixed image references resolve. Prevents ErrImagePull when agents write image: prom/prometheus:v3.8.1 but k3s only has docker.io/prom/prometheus:v3.8.1.

  3. Distractor image cleanup (setup.sh) — Removes old versions (prometheus v2.54.1, grafana 11.3.0) from the Nebula base image so agents can't discover and use wrong versions.

  4. Image version table in wiki (setup.sh) — v46's wiki told agents to "discover available versions using standard container tooling" but ctr images list output is ambiguous in air-gapped environments. Explicit table is the fair approach.

Grader robustness

  1. normalize_image() — Strips docker.io/ prefix when comparing deployed images against expected versions.

  2. check_deployment_health() — Fast-fails on terminal pod states (ImagePullBackOff, ErrImagePull) instead of retrying for 2+ minutes when a deployment will never recover.

  3. Grafana API caching — the wait_for_grafana_api() result is cached to avoid redundant 30s waits across multiple Grafana checks.

  4. Port-forward deduplication — Prevents killing and re-establishing active port-forwards when multiple checks use the same port.

Check quality

  1. check_slo_burn_rate_alerts tightened — v46 used keyword matching ("burn", "14.4", "error budget") which passes if an agent writes # burn rate in a comment. Now requires structural evidence: ≥2 unique avg_over_time time windows AND ≥2 unique for: durations.

  2. check_recording_rules tightened — v46 passed if record: keyword existed + either "probe:" or "availability" appeared anywhere. Now requires ≥2 recording rules defined AND ≥2 referenced in alert expressions.

  3. alert_rule_identifies_endpoint tightened — v46 accepted by (instance) in PromQL expressions as sufficient. Now requires {{ $labels.instance }} in alert annotations — proving the alert message identifies the endpoint for operators, not just that the expression preserves the label.

  4. check_grafana_dashboard_semantics loosened — v46 required literal probe_success in the dashboard. Now also accepts recording rules like probe:availability. Agents using recording rules (better practice) shouldn't be penalized.

Cleanup

  1. Removed duplicate grafana_dashboard_uses_time_aggregation — Defined twice in v46 (lines 574 and 732); second shadows the first. Functionality already covered by check_grafana_dashboard_semantics.

  2. Removed HTTPS TLS check from check_blackbox_modules — v46 required explicit tls_config for HTTPS targets, but the default blackbox exporter behavior already verifies TLS. Task says "preserve default TLS verification behavior."

  3. Removed remote_write from solution.sh — Dummy URL https://url/insert/0/prometheus/api/v1/write causes Prometheus connection timeouts. Agents would copy it.

  4. Burn rate alerts use recording rules in solution.sh — probe:availability:5m instead of raw avg_over_time(probe_success[5m]). Required by scored check_recording_rules and aligns with task requirement about efficient querying.

  5. Removed duplicate paragraphs from task.yaml — scrape_interval was stated twice and TCP monitoring was mentioned three times.

  6. Fixed double-slash typo in setup.sh — "//workdir/grafana-12.2.tar" → "/workdir/grafana-12.2.tar".


Grader architecture

graph TD
    subgraph Gates["Gate Checks (25) — feedback only, no score"]
        G1["6 resource existence"]
        G2["3 image correctness"]
        G3["6 operational checks"]
        G4["10 config quality checks"]
    end

    subgraph Scored["Scored Checks (8 × 0.125)"]
        S1["grafana_dashboard_present<br/><i>Grafana up + dashboard visible</i>"]
        S2["endpoint_count<br/><i>3 targets: argocd, k8s API, test</i>"]
        S3["grafana_dashboard_semantics<br/><i>time agg + per-endpoint + % + latency</i>"]
        S4["failing_in_alert<br/><i>SyntheticProbeFailure fires</i>"]
        S5["alert_identifies_endpoint<br/><i>annotations: {{ $labels.instance }}</i>"]
        S6["per_endpoint<br/><i>alerts fire independently per target</i>"]
        S7["records<br/><i>≥2 recording rules used in alerts</i>"]
        S8["recording_rules<br/><i>dashboard references recording rules</i>"]
    end

    Gates --> Scored
Loading

What was kept from v46

v46 addition Disposition Notes
prometheus_scrape_interval_valid() Kept as gate Validates 10s/15s specifically — stricter and more aligned with task than our generic ≤30s check
check_alert_for_duration() Kept as gate Validates for: 30s–2m on SyntheticProbeFailure. Solution uses for: 1m, passes.
deployment_uses_any_image() Removed normalize_image() in deployment_uses_image() handles docker.io/ prefix more cleanly
check_grafana_dashboard_semantics() Kept as scored Multi-part dashboard quality check — good design
check_endpoint_count() Kept as scored Validates all 3 required targets

Files in this gist

File Lines Notes
grader.py 1008 25 gates + 8 scored checks
task.yaml 113 Wiki URL fixed, alert name added, deduped
setup.sh 255 Git-commit wiki, image tagging, distractor cleanup
solution.sh 473 Recording rules in alerts/dashboard, no remote_write
Dockerfile 20 Unchanged from v46
#!/bin/bash
set -e
# ---------------------- [DONOT CHANGE ANYTHING BELOW] ---------------------------------- #
# Start supervisord if not already running (manages k3s, dockerd, dnsmasq)
if ! supervisorctl status &>/dev/null; then
    echo "Starting supervisord..."
    /usr/bin/supervisord -c /etc/supervisor/supervisord.conf
    sleep 5
fi
# Set kubeconfig for k3s
export KUBECONFIG=/etc/rancher/k3s/k3s.yaml
# Wait for k3s to be ready (k3s can take 30-60 seconds to start)
echo "Waiting for k3s to be ready..."
MAX_WAIT=180
ELAPSED=0
# Poll the API server every 2s until it answers, or bail at MAX_WAIT.
until kubectl get nodes &>/dev/null; do
    if [ $ELAPSED -ge $MAX_WAIT ]; then
        echo "Error: k3s is not ready after ${MAX_WAIT} seconds"
        exit 1
    fi
    echo "Waiting for k3s... (${ELAPSED}s elapsed)"
    sleep 2
    ELAPSED=$((ELAPSED + 2))
done
echo "k3s is ready!"
# ---------------------- [DONOT CHANGE ANYTHING ABOVE] ---------------------------------- #
echo "Granting Minimal Required Permission"
# RBAC: scope the `ubuntu-user` ServiceAccount to ConfigMap management inside
# the `observability` namespace only (get/list/watch/create/update/patch —
# note that the `delete` verb is deliberately absent).
# The heredoc is unquoted but the YAML contains no shell expansions, so it is
# applied verbatim.
kubectl apply -f - <<EOF
apiVersion: rbac.authorization.k8s.io/v1
kind: Role
metadata:
name: ubuntu-user-configmap-editor
namespace: observability
rules:
- apiGroups: [""]
resources: ["configmaps"]
verbs: ["get", "list", "watch", "create", "update", "patch"]
---
apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
name: ubuntu-user-configmap-editor-binding
namespace: observability
subjects:
- kind: ServiceAccount
name: ubuntu-user
namespace: observability
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: Role
name: ubuntu-user-configmap-editor
EOF
# ------------------------------------------------------------------------------
# Create Gitea repo
# ------------------------------------------------------------------------------
# Creates the public `platform-docs` repository via Gitea's REST API, executed
# inside the Gitea pod so the call can reach localhost:3000 directly using the
# seeded root credentials.
kubectl exec -n gitea deploy/gitea -- sh -c '
curl -s -X POST \
-u "root:Admin@123456" \
-H "Content-Type: application/json" \
-d "{
\"name\": \"platform-docs\",
\"private\": false,
\"auto_init\": true
}" \
http://localhost:3000/api/v1/user/repos
'
# ------------------------------------------------------------------------------
# Wiki creation — commit directly to bare git repo (bypasses API timing issues)
# ------------------------------------------------------------------------------
# Writes the wiki page straight into the bare wiki repository on disk via a
# temp clone + force-push to master, then restores git:git ownership. Gitea
# serves wiki pages from this bare repo, so no API write is required.
kubectl exec -n gitea deploy/gitea -- sh -c '
set -e
WIKI_REPO=/data/git/repositories/root/platform-docs.wiki.git
# Ensure bare wiki repo exists (Gitea does NOT auto-create it)
if [ ! -d "$WIKI_REPO" ]; then
git init --bare "$WIKI_REPO" >/dev/null 2>&1
fi
# Commit wiki content directly into the bare repo via a temp clone
TMPDIR=$(mktemp -d)
cd "$TMPDIR"
git init >/dev/null 2>&1
git config user.email "root@gitea.local"
git config user.name "root"
cat > "Platform-Services---Monitoring-Requirements.md" <<WIKIEOF
# Platform Services - Monitoring Requirements
## Critical Platform Services
### ArgoCD GitOps Controller
| Property | Value |
|----------|-------|
| Ingress | \`argocd.devops.local\` |
| Protocol | HTTP |
| Criticality | High |
### Kubernetes API Server
| Property | Value |
|----------|-------|
| Access | \`kubernetes.default.svc:443\` |
| Protocol | TCP |
| Criticality | Critical |
## Alerting Validation
Configure a probe against a known-unreachable endpoint to verify alerting:
http://does-not-exist.devops.local
Alerts should identify the specific failing endpoint (not aggregate into one global alert).
## Preloaded Container Images
This cluster operates in air-gapped mode. The following images are available:
| Component | Image |
|-----------|-------|
| Prometheus | \`prom/prometheus:v3.8.1\` |
| Blackbox Exporter | \`prom/blackbox-exporter:v0.25.0\` |
| Grafana | \`grafana/grafana:12.2\` |
Use these exact image references in your deployments. No other versions are available.
WIKIEOF
git add . >/dev/null 2>&1
git commit -m "Initial wiki content" >/dev/null 2>&1
git remote add origin "$WIKI_REPO"
git push -f origin master >/dev/null 2>&1
cd /
rm -rf "$TMPDIR"
chown -R git:git "$WIKI_REPO"
' && echo "[SETUP] Platform documentation created" \
|| echo "[SETUP] WARN: Wiki creation failed"
# Poke the Gitea wiki API to force indexing of the git-committed content.
# Best-effort: up to three attempts; any failure is swallowed by the trailing
# `|| true` since the wiki content is already committed either way.
kubectl exec -n gitea deploy/gitea -- sh -c '
for i in 1 2 3; do
HTTP_CODE=$(curl -s -o /dev/null -w "%{http_code}" \
-u "root:Admin@123456" \
"http://localhost:3000/api/v1/repos/root/platform-docs/wiki/pages")
if [ "$HTTP_CODE" = "200" ]; then
break
fi
sleep 2
done
' 2>/dev/null || true
echo "[SETUP] Importing pre-cached container images..."
# The tarballs were produced at image-build time with `crane pull` (see the
# Dockerfile) and are imported into k3s containerd under the k8s.io namespace
# so the air-gapped cluster can schedule pods without any registry access.
PROMETHEUS_TAR="/workdir/prometheus-v3.8.1.tar"
BLACKBOX_TAR="/workdir/blackbox-exporter-v0.25.0.tar"
GRAFANA_TAR="/workdir/grafana-12.2.tar"
if [ ! -f "$GRAFANA_TAR" ]; then
echo "Error: Pre-cached Grafana image not found at $GRAFANA_TAR"
exit 1
fi
ctr --address /run/k3s/containerd/containerd.sock \
--namespace k8s.io \
images import --no-unpack "$GRAFANA_TAR"
ctr --address /run/k3s/containerd/containerd.sock \
--namespace k8s.io \
images list | grep -q "grafana/grafana:12.2" || {
echo "Error: Grafana image was not imported correctly"
exit 1
}
if [ ! -f "$PROMETHEUS_TAR" ]; then
echo "Error: Pre-cached Prometheus image not found at $PROMETHEUS_TAR"
exit 1
fi
# NOTE(review): Prometheus is imported WITHOUT --no-unpack while Grafana and
# Blackbox use --no-unpack — confirm whether this asymmetry is intentional.
ctr --address /run/k3s/containerd/containerd.sock \
--namespace k8s.io \
images import "$PROMETHEUS_TAR"
if [ ! -f "$BLACKBOX_TAR" ]; then
echo "Error: Pre-cached Blackbox image not found at $BLACKBOX_TAR"
exit 1
fi
ctr --address /run/k3s/containerd/containerd.sock \
--namespace k8s.io \
images import --no-unpack "$BLACKBOX_TAR"
ctr --address /run/k3s/containerd/containerd.sock \
--namespace k8s.io \
images list | grep -q "prom/prometheus:v3.8.1" || {
echo "Error: Prometheus image was not imported correctly"
exit 1
}
ctr --address /run/k3s/containerd/containerd.sock \
--namespace k8s.io \
images list | grep -q "prom/blackbox-exporter:v0.25.0" || {
echo "Error: Blackbox image was not imported correctly"
exit 1
}
CTR="ctr --address /run/k3s/containerd/containerd.sock --namespace k8s.io"
# Re-tag under registry-less names so manifests may reference either the short
# `prom/prometheus:v3.8.1` form or the fully-qualified `docker.io/...` form.
$CTR images tag docker.io/prom/prometheus:v3.8.1 prom/prometheus:v3.8.1
$CTR images tag docker.io/prom/blackbox-exporter:v0.25.0 prom/blackbox-exporter:v0.25.0
$CTR images tag docker.io/grafana/grafana:12.2 grafana/grafana:12.2
# Whitelist of image references that must survive the distractor cleanup below.
ALLOWED_IMAGES=(
"prom/prometheus:v3.8.1"
"docker.io/prom/prometheus:v3.8.1"
"prom/blackbox-exporter:v0.25.0"
"docker.io/prom/blackbox-exporter:v0.25.0"
"grafana/grafana:12.2"
"docker.io/grafana/grafana:12.2"
)
# Return 0 (success) iff the given image reference exactly matches one of the
# entries in the ALLOWED_IMAGES whitelist defined above; non-zero otherwise.
is_allowed() {
local candidate="$1"
local ref
for ref in "${ALLOWED_IMAGES[@]}"; do
if [[ "$candidate" == "$ref" ]]; then
return 0
fi
done
return 1
}
# Distractor cleanup: remove any prometheus/blackbox/grafana image that is not
# on the whitelist, so only the documented versions remain available.
# NOTE: the pipeline runs the while-loop in a subshell; that is fine here
# because the loop only performs `ctr images remove` side effects.
$CTR images list -q | while read -r image; do
case "$image" in
*prometheus*|*blackbox-exporter*|*grafana*)
is_allowed "$image" || $CTR images remove "$image" 2>/dev/null || true
;;
esac
done
ctr --address /run/k3s/containerd/containerd.sock \
--namespace k8s.io \
images remove docker.io/prom/prometheus:v2.54.1 2>/dev/null || true
ctr --address /run/k3s/containerd/containerd.sock \
--namespace k8s.io \
images remove docker.io/grafana/grafana:11.3.0 2>/dev/null || true
# Delete the import tarballs so other image versions cannot be re-imported
# later (the glob also covers the Grafana tarball).
rm -f "$PROMETHEUS_TAR" "$BLACKBOX_TAR"
rm -rf /workdir/*.tar 2>/dev/null || true
echo "[SETUP] Prometheus and Blackbox images imported successfully"
#!/bin/bash
set -euo pipefail
echo ""
echo ">>> Deploying Blackbox Exporter and Prometheus (Observability Stack)"
echo ""
export KUBECONFIG=/etc/rancher/k3s/k3s.yaml
NAMESPACE="observability"
# Ensure namespace exists
kubectl get namespace ${NAMESPACE} >/dev/null 2>&1 || kubectl create namespace ${NAMESPACE}
echo ""
echo ">>> Applying Blackbox Exporter configuration"
echo ""
# blackbox.yml defines two probe modules: http_2xx (application-layer GET;
# empty valid_status_codes falls back to the exporter default of 2xx, and no
# tls_config override means default TLS verification is preserved) and
# tcp_connect (transport-layer reachability only). Heredoc is unquoted so
# ${NAMESPACE} expands; the YAML body contains no other shell expansions.
cat <<EOF | kubectl apply -f -
apiVersion: v1
kind: ConfigMap
metadata:
name: blackbox-config
namespace: ${NAMESPACE}
data:
blackbox.yml: |
modules:
http_2xx:
prober: http
timeout: 5s
http:
valid_http_versions: ["HTTP/1.1", "HTTP/2"]
valid_status_codes: []
method: GET
tcp_connect:
prober: tcp
timeout: 5s
EOF
# Exporter Deployment: single replica, runs unprivileged (runAsNonRoot,
# uid 1000), with blackbox.yml projected from the blackbox-config ConfigMap
# into /etc/blackbox. The probe endpoint listens on 9115.
cat <<EOF | kubectl apply -f -
apiVersion: apps/v1
kind: Deployment
metadata:
name: blackbox-exporter
namespace: ${NAMESPACE}
labels:
app: blackbox-exporter
spec:
replicas: 1
selector:
matchLabels:
app: blackbox-exporter
template:
metadata:
labels:
app: blackbox-exporter
spec:
containers:
- name: blackbox-exporter
image: prom/blackbox-exporter:v0.25.0
ports:
- containerPort: 9115
args:
- "--config.file=/etc/blackbox/blackbox.yml"
securityContext:
runAsNonRoot: true
runAsUser: 1000
volumeMounts:
- name: config-volume
mountPath: /etc/blackbox
volumes:
- name: config-volume
configMap:
name: blackbox-config
items:
- key: blackbox.yml
path: blackbox.yml
EOF
# ClusterIP Service fronting the exporter; Prometheus relabeling rewrites
# __address__ to blackbox-exporter:9115, i.e. this Service.
cat <<EOF | kubectl apply -f -
apiVersion: v1
kind: Service
metadata:
name: blackbox-exporter
namespace: ${NAMESPACE}
spec:
selector:
app: blackbox-exporter
ports:
- name: http
port: 9115
targetPort: 9115
EOF
echo ""
echo ">>> Applying Prometheus storage"
echo ""
# 2Gi ReadWriteOnce PersistentVolumeClaim for the Prometheus TSDB; no
# storageClassName is set, so the cluster's default StorageClass is used.
cat <<EOF | kubectl apply -f -
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
name: prometheus-data
namespace: ${NAMESPACE}
spec:
accessModes:
- ReadWriteOnce
resources:
requests:
storage: 2Gi
EOF
echo ""
echo ">>> Applying Prometheus configuration"
echo ""
echo "Applying Prometheus config..."
# prometheus-config carries both the scrape config (prometheus.yml) and the
# rule file (rules.yml). The heredoc delimiter is quoted ('EOF') so the
# {{ $labels.instance }} template annotations pass through unexpanded.
cat <<'EOF' | kubectl apply -f -
apiVersion: v1
kind: ConfigMap
metadata:
  name: prometheus-config
  namespace: observability
data:
  prometheus.yml: |
    global:
      scrape_interval: 15s
    rule_files:
      - /etc/prometheus/rules/*.yml
    scrape_configs:
      # HTTP probes routed through the blackbox exporter; `instance` is
      # rewritten to the probed URL so alerts/dashboards are per-endpoint.
      - job_name: blackbox
        metrics_path: /probe
        params:
          module: [http_2xx]
        static_configs:
          - targets:
              - http://argocd.devops.local
              - http://does-not-exist.devops.local
        relabel_configs:
          - source_labels: [__address__]
            target_label: __param_target
          - source_labels: [__param_target]
            target_label: instance
          - target_label: __address__
            replacement: blackbox-exporter:9115
      # TCP reachability probe for the in-cluster Kubernetes API endpoint.
      - job_name: blackbox-kubernetes-api
        metrics_path: /probe
        params:
          module: [tcp_connect]
        static_configs:
          - targets:
              - kubernetes.default.svc:443
        relabel_configs:
          - source_labels: [__address__]
            target_label: __param_target
          - source_labels: [__param_target]
            target_label: instance
          - target_label: __address__
            replacement: blackbox-exporter:9115
      # Self-scrapes: exporter process health and Prometheus itself.
      - job_name: blackbox-exporter
        static_configs:
          - targets:
              - blackbox-exporter:9115
      - job_name: prometheus
        static_configs:
          - targets:
              - localhost:9090
  rules.yml: |
    groups:
      # --------------------------------------------------
      # Recording rules
      # --------------------------------------------------
      - name: synthetic-recording
        rules:
          - record: probe:availability:5m
            expr: avg_over_time(probe_success[5m])
          - record: probe:availability:1h
            expr: avg_over_time(probe_success[1h])
          # FIX: blackbox_exporter publishes probe_duration_seconds as a
          # gauge — there is no probe_duration_seconds_bucket histogram
          # series — so the previous histogram_quantile(...) expression
          # could never return data. quantile_over_time() computes the p99
          # of the gauge over the window per series, and the instance label
          # is preserved for per-endpoint queries.
          - record: probe:latency_p99:5m
            expr: quantile_over_time(0.99, probe_duration_seconds[5m])
      # --------------------------------------------------
      # Compatibility alert (legacy graders depend on this)
      # --------------------------------------------------
      - name: synthetic-compat
        rules:
          - alert: SyntheticProbeFailure
            expr: probe_success == 0
            for: 1m
            labels:
              severity: critical
            annotations:
              summary: "Synthetic probe failed"
              description: "Endpoint {{ $labels.instance }} is unreachable"
      # --------------------------------------------------
      # SLO burn-rate alerts (99% availability target)
      # --------------------------------------------------
      - name: synthetic-slo
        rules:
          - alert: SyntheticProbeHighBurnRate
            expr: |
              (1 - probe:availability:5m{job="blackbox"}) / (1 - 0.99) > 14.4
            for: 2m
            labels:
              severity: critical
            annotations:
              summary: "High synthetic availability burn rate"
              description: "High error budget burn rate for {{ $labels.instance }}"
          - alert: SyntheticProbeLowBurnRate
            expr: |
              (1 - probe:availability:1h{job="blackbox"}) / (1 - 0.99) > 1
            for: 1h
            labels:
              severity: warning
            annotations:
              summary: "Sustained synthetic availability degradation"
              description: "Sustained error budget burn rate for {{ $labels.instance }}"
EOF
echo ""
echo ">>> Deploying Prometheus"
echo ""
# Single-replica Prometheus. Notes:
#  - prometheus.yml is mounted via subPath, so later ConfigMap edits are NOT
#    propagated into the running pod; a restart/rollout is required.
#  - rules-volume projects only the rules.yml key of the same ConfigMap into
#    /etc/prometheus/rules/, matching the rule_files glob in prometheus.yml.
#  - args replace the image's default command line; the TSDB path falls back
#    to the image default — presumably under /prometheus where the PVC is
#    mounted. TODO(review): confirm data actually lands on the PVC.
#  - fsGroup 65534 gives the pod group write access to the PVC mount.
cat <<EOF | kubectl apply -f -
apiVersion: apps/v1
kind: Deployment
metadata:
name: prometheus
namespace: ${NAMESPACE}
spec:
replicas: 1
selector:
matchLabels:
app: prometheus
template:
metadata:
labels:
app: prometheus
spec:
securityContext:
fsGroup: 65534
runAsNonRoot: true
runAsUser: 1000
containers:
- name: prometheus
image: prom/prometheus:v3.8.1
args:
- "--config.file=/etc/prometheus/prometheus.yml"
ports:
- containerPort: 9090
volumeMounts:
- name: config-volume
mountPath: /etc/prometheus/prometheus.yml
subPath: prometheus.yml
- name: data-volume
mountPath: /prometheus
- name: rules-volume
mountPath: /etc/prometheus/rules
volumes:
- name: config-volume
configMap:
name: prometheus-config
- name: data-volume
persistentVolumeClaim:
claimName: prometheus-data
- name: rules-volume
configMap:
name: prometheus-config
items:
- key: rules.yml
path: rules.yml
EOF
# ClusterIP Service exposing Prometheus on port 9090 (required name: prometheus).
cat <<EOF | kubectl apply -f -
apiVersion: v1
kind: Service
metadata:
name: prometheus
namespace: ${NAMESPACE}
spec:
selector:
app: prometheus
ports:
- name: web
port: 9090
targetPort: 9090
EOF
# Declarative Grafana datasource provisioning: sets the in-namespace
# Prometheus Service as the default datasource, so no manual configuration
# through the web interface is required.
cat <<EOF | kubectl apply -f -
apiVersion: v1
kind: ConfigMap
metadata:
name: grafana-datasources
namespace: ${NAMESPACE}
data:
datasources.yml: |
apiVersion: 1
datasources:
- name: Prometheus
type: prometheus
access: proxy
url: http://prometheus:9090
isDefault: true
EOF
# Dashboard provider config: tells Grafana's file provider to load any JSON
# dashboards found under /var/lib/grafana/dashboards (where the
# grafana-dashboards ConfigMap is mounted by the Grafana Deployment).
cat <<EOF | kubectl apply -f -
apiVersion: v1
kind: ConfigMap
metadata:
name: grafana-dashboard-provider
namespace: ${NAMESPACE}
data:
dashboards.yml: |
apiVersion: 1
providers:
- name: default
folder: ''
type: file
options:
path: /var/lib/grafana/dashboards
EOF
# Dashboard JSON provisioned as a ConfigMap and consumed by Grafana's file
# provider. FIX: the latency panel's PromQL was invalid —
# `avg_over_time(...) by (instance)` attaches a grouping clause to a function,
# but `by` is only valid on aggregation operators (avg/sum/...). Rewritten as
# an explicit aggregation so the panel actually renders per-endpoint series.
# No comments inside the heredoc: the payload is strict JSON.
cat <<EOF | kubectl apply -f -
apiVersion: v1
kind: ConfigMap
metadata:
  name: grafana-dashboards
  namespace: observability
data:
  blackbox-dashboard.json: |
    {
      "title": "Synthetic Endpoint Availability",
      "schemaVersion": 38,
      "panels": [
        {
          "type": "timeseries",
          "title": "Probe Availability (%)",
          "targets": [
            {
              "expr": "probe:availability:5m * 100",
              "legendFormat": "{{ instance }}",
              "refId": "A"
            }
          ]
        },
        {
          "type": "timeseries",
          "title": "Probe Latency (seconds)",
          "targets": [
            {
              "expr": "avg by (instance) (avg_over_time(probe_duration_seconds[5m]))",
              "legendFormat": "{{ instance }}",
              "refId": "B"
            }
          ]
        }
      ]
    }
EOF
# Grafana Deployment + Service in one manifest (separated by `---`). All
# provisioning (datasource, dashboard provider, dashboard JSON) is mounted
# from ConfigMaps, so Grafana starts fully configured with no UI steps.
cat <<EOF | kubectl apply -f -
apiVersion: apps/v1
kind: Deployment
metadata:
name: grafana
namespace: ${NAMESPACE}
spec:
replicas: 1
selector:
matchLabels:
app: grafana
template:
metadata:
labels:
app: grafana
spec:
containers:
- name: grafana
image: grafana/grafana:12.2
ports:
- containerPort: 3000
volumeMounts:
- name: datasources
mountPath: /etc/grafana/provisioning/datasources
# Provider config
- name: dashboard-provider
mountPath: /etc/grafana/provisioning/dashboards
# Actual JSON dashboards
- name: dashboard-json
mountPath: /var/lib/grafana/dashboards
volumes:
- name: datasources
configMap:
name: grafana-datasources
- name: dashboard-provider
configMap:
name: grafana-dashboard-provider
- name: dashboard-json
configMap:
name: grafana-dashboards
---
apiVersion: v1
kind: Service
metadata:
name: grafana
namespace: ${NAMESPACE}
spec:
selector:
app: grafana
ports:
- name: web
port: 3000
targetPort: 3000
EOF
echo ""
echo ">>> Waiting for deployments to become ready"
echo ""
# Wait (indefinitely, no --timeout set) for each rollout to report Ready;
# under `set -euo pipefail` a failed rollout aborts the script.
kubectl rollout status deployment/grafana -n ${NAMESPACE}
kubectl rollout status deployment/blackbox-exporter -n ${NAMESPACE}
kubectl rollout status deployment/prometheus -n ${NAMESPACE}
echo ""
echo ">>> Solution deployment completed successfully"
echo ""
# Task definition: synthetic endpoint monitoring in an air-gapped k3s cluster.
# The `prompt` block scalar below is the literal text shown to the solver, so
# no comments may be placed inside it (they would become prompt content).
id: synthetic-endpoint-monitoring
metadata:
category: observability
difficulty: advanced
prompt: |
[Context]
The Nebula platform hosts multiple internal services exposed through Kubernetes ingress.
While application-level metrics exist, the platform team lacks visibility into endpoint
availability from a real client perspective.
Synthetic probing is required to validate ingress reachability and detect outages
independent of application instrumentation.
[Environment Notes]
The cluster operates in air-gapped mode.
Required container images are already preloaded into the runtime environment and must
be used as-is without pulling from external registries. Available image versions
are documented in the platform wiki.
[Task]
Implement synthetic endpoint monitoring in the `observability` namespace.
The monitoring stack must include the following Kubernetes resources with the
specified names:
- A deployment named `blackbox-exporter`
- A service named `blackbox-exporter` exposing the probe endpoint
- A configuration ConfigMap named `blackbox-config`
- A monitoring deployment named `prometheus`
- A configuration ConfigMap named `prometheus-config`
- A persistent volume claim named `prometheus-data`
Prometheus must be exposed via a Kubernetes Service named `prometheus`
on port 9090.
The Nebula internal developer wiki documents commonly exposed platform services
and their ingress hostnames:
http://gitea.devops.local/root/platform-docs/wiki/
The solution must actively probe internal service endpoints that represent
real client access patterns. Check the Gitea wiki for information about
critical platform services that require monitoring.
Probe results must be exposed as metrics and be consumable by the monitoring system.
[Blackbox Exporter Requirements]
The blackbox exporter must support multiple probe types:
- HTTP probes for application-layer availability
- TCP probes for transport-layer connectivity
Probe behavior must follow protocol-appropriate validation:
- Application-layer probes must validate protocol correctness
- Transport-layer probes must validate connectivity only
- HTTP probe modules must preserve default TLS verification behavior
The Kubernetes API server health must be monitored using TCP connectivity probes
against its standard in-cluster service name. This check must validate only
basic transport-layer reachability (no TLS or HTTP validation).
Exporter-level metrics (e.g., `up`) must not be used as a substitute for
synthetic probe result metrics.
[Prometheus Requirements]
The global `scrape_interval` must be set to **15s or 10s**.
Prometheus must be configured to scrape blackbox probe targets using appropriate
relabeling. Configuration should support efficient querying and reuse of commonly
evaluated availability signals. Repeated or computationally expensive expressions
should not be evaluated directly at query time.
[Alerting Requirements]
Prometheus must define alerting rules based on synthetic probe result metrics.
At least one alert must be named **SyntheticProbeFailure** and represent
endpoint-level availability failure detected via synthetic probes.
Alerts detecting sustained availability degradation should be based on
SLO-style burn rate concepts (e.g., evaluating error budget consumption
over time rather than fixed thresholds).
Alerts must:
- Detect rapid availability loss
- Detect sustained availability degradation over longer periods
- Distinguish failures on a per-endpoint basis
- Avoid relying solely on instantaneous probe failures or fixed thresholds
Alerting rules must be derived from probe result metrics and must distinguish
failures on a per-endpoint basis.
[Visualization Requirements]
The monitoring stack must include a visualization layer for synthetic probe results.
Required resource:
- ConfigMap `grafana-dashboards` for dashboard definitions
A visualization service must be deployed in the `observability` namespace
and configured to consume metrics directly from Prometheus without manual
configuration through a web interface.
The visualization must include at least one dashboard that presents the
availability status of synthetic probes on a per-endpoint basis.
Dashboards must present synthetic probe results in a form suitable for
service-level assessment rather than raw signal inspection.
Visualizations must:
- Represent availability as a normalized measure over time
- Allow comparison across individual endpoints
- Include at least one indicator related to request or probe responsiveness
- Not rely solely on binary success/failure signals
Dashboard and data source configuration must be reproducible and stored
declaratively as Kubernetes resources.
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment