Last active
February 19, 2026 20:39
-
-
Save arubis/8184d51d1ae05205933d686d43af323a to your computer and use it in GitHub Desktop.
synthetic-endpoint-monitoring task (local review version, post-v44 patches)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Base image: internal DevOps toolbox (kubectl, k3s, docker tooling preinstalled).
FROM us-central1-docker.pkg.dev/bespokelabs/nebula-devops-registry/nebula-devops:1.0.2
# World-writable work/data dirs so unprivileged task processes can write.
RUN mkdir -p /workdir /data && chmod -R 777 /workdir /data
# Install crane (image copy tool) straight from the release tarball.
RUN curl -sL https://github.com/google/go-containerregistry/releases/download/v0.19.0/go-containerregistry_Linux_x86_64.tar.gz \
    | tar -xzf - -C /usr/local/bin crane
ENV ALLOWED_NAMESPACES="observability"
# Pre-pull the monitoring-stack images as tarballs so the task environment
# does not need registry access at runtime.
RUN crane pull prom/prometheus:v3.8.1 /workdir/prometheus-v3.8.1.tar
RUN crane pull prom/blackbox-exporter:v0.25.0 /workdir/blackbox-exporter-v0.25.0.tar
RUN crane pull grafana/grafana:12.2 /workdir/grafana-12.2.tar
# Virtual display settings for the computer-use harness.
ENV DISPLAY_NUM=1
ENV COMPUTER_HEIGHT_PX=768
ENV COMPUTER_WIDTH_PX=1024
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env python3 | |
| from __future__ import annotations | |
| import subprocess | |
| import json | |
| import time | |
| from apex_arena._types import GradingResult | |
| import re | |
| from datetime import timedelta | |
# Cached result of wait_for_grafana_api(): None = not probed yet,
# True/False = outcome of the first probe (reused by later checks).
_grafana_api_ready = None
# (name, namespace, local_port) keys of port-forwards already started,
# so repeated checks reuse the existing tunnel instead of respawning it.
_active_port_forwards: set = set()
def sh(cmd):
    """Run *cmd* through the shell and return (returncode, stdout, stderr).

    Both output streams are captured as text and stripped of surrounding
    whitespace before being returned.
    """
    proc = subprocess.run(cmd, shell=True, capture_output=True, text=True)
    stdout = proc.stdout.strip()
    stderr = proc.stderr.strip()
    return proc.returncode, stdout, stderr
def normalize_image(img):
    """Return *img* with any leading docker.io registry prefix removed.

    Longest prefix wins: "docker.io/library/" is tried before "docker.io/".
    """
    if img.startswith("docker.io/library/"):
        return img[len("docker.io/library/"):]
    if img.startswith("docker.io/"):
        return img[len("docker.io/"):]
    return img
def port_forward(kind, name, namespace, local_port, remote_port):
    """Start a background `kubectl port-forward` to kind/name in namespace.

    Idempotent per (name, namespace, local_port): if a forward with the same
    key was already started by this process, return immediately.
    NOTE(review): the key does not include `kind` or `remote_port`, so a
    second call with a different kind/remote port is silently skipped —
    confirm this is intended.
    """
    key = (name, namespace, local_port)
    if key in _active_port_forwards:
        return
    # Best-effort: kill any stale forward occupying the same local port.
    sh(f"pkill -f 'kubectl port-forward.*{local_port}' || true")
    # Detach into the background; output is discarded.
    sh(
        f"kubectl port-forward -n {namespace} {kind}/{name} "
        f"{local_port}:{remote_port} >/dev/null 2>&1 &"
    )
    # Give the tunnel time to establish before callers curl through it.
    time.sleep(5)
    _active_port_forwards.add(key)
def parse_duration(duration_str):
    """Convert a Prometheus-style duration ('30s', '5m', '1h') to a timedelta.

    Unrecognized input yields timedelta(0). Only the leading number+unit is
    considered; seconds, minutes and hours are supported.
    """
    parsed = re.match(r'(\d+)(s|m|h)', str(duration_str))
    if parsed is None:
        return timedelta(0)
    amount = int(parsed.group(1))
    seconds_per_unit = {'s': 1, 'm': 60, 'h': 3600}
    return timedelta(seconds=amount * seconds_per_unit[parsed.group(2)])
# Container "waiting" reasons a deployment cannot recover from on its own
# (broken image reference); consulted by check_deployment_health().
_TERMINAL_POD_STATES = {"ImagePullBackOff", "ErrImagePull", "InvalidImageName"}
def check_deployment_health(name, namespace):
    """Quick check for terminal pod errors. Returns (healthy, message).

    Deliberately lenient: any failure to read or parse cluster state is
    reported as healthy so transient kubectl hiccups do not fail grading.
    Only unambiguous terminal image-pull states count as broken.
    """
    code, out, _ = sh(
        f"kubectl get deployment {name} -n {namespace} -o json"
    )
    if code != 0:
        return False, f"Deployment '{name}' not found in {namespace}"
    try:
        data = json.loads(out)
        selector = data["spec"]["selector"]["matchLabels"]
        label_str = ",".join(f"{k}={v}" for k, v in selector.items())
    # TypeError/AttributeError added: a null or non-dict matchLabels would
    # otherwise crash the whole gate check instead of being treated leniently.
    except (json.JSONDecodeError, KeyError, TypeError, AttributeError):
        return True, "Could not parse selector, assuming healthy"
    code, out, _ = sh(
        f"kubectl get pods -n {namespace} -l {label_str} -o json"
    )
    if code != 0:
        return True, "Could not list pods, assuming healthy"
    try:
        pods = json.loads(out).get("items", [])
    except json.JSONDecodeError:
        return True, "Could not parse pods, assuming healthy"
    for pod in pods:
        statuses = pod.get("status", {}).get("containerStatuses", [])
        for cs in statuses:
            waiting = cs.get("state", {}).get("waiting", {})
            reason = waiting.get("reason", "")
            if reason in _TERMINAL_POD_STATES:
                pod_name = pod.get("metadata", {}).get("name", "unknown")
                # "—" fixes a mojibake character in the original message.
                return False, (
                    f"Pod '{pod_name}' in {namespace} is stuck in {reason} — "
                    f"deployment '{name}' will not recover"
                )
    return True, f"Deployment '{name}' pods are not in terminal error state"
| # ------------------------------------------------------------------- | |
| # RESOURCE CHECKS | |
| # ------------------------------------------------------------------- | |
def deployment_exists(name, namespace):
    """Return (ok, message) saying whether the deployment exists."""
    exit_code, _, _ = sh(f"kubectl get deployment {name} -n {namespace}")
    if exit_code == 0:
        return True, f"Deployment '{name}' exists in {namespace}"
    return False, f"Deployment '{name}' not found in {namespace}"
def configmap_contains(name, namespace, required_strings):
    """Return (ok, message): does the ConfigMap YAML contain every string?"""
    exit_code, yaml_text, _ = sh(
        f"kubectl get configmap {name} -n {namespace} -o yaml"
    )
    if exit_code != 0:
        return False, f"ConfigMap '{name}' not readable"
    missing = []
    for needle in required_strings:
        if needle not in yaml_text:
            missing.append(needle)
    if missing:
        return False, f"ConfigMap '{name}' missing: {missing}"
    return True, f"ConfigMap '{name}' contains required content"
def service_exists(name, namespace, port):
    """Check that Service *name* in *namespace* exposes *port*.

    Returns (ok, message). A malformed kubectl JSON payload is reported as
    a failure instead of raising (the original let json.loads propagate,
    which would abort the calling check loop's message collection).
    """
    code, out, _ = sh(
        f"kubectl get svc {name} -n {namespace} -o json"
    )
    if code != 0:
        return False, f"Service '{name}' not found in {namespace}"
    try:
        data = json.loads(out)
    except json.JSONDecodeError:
        return False, f"Service '{name}' output not parseable"
    ports = [p.get("port") for p in data.get("spec", {}).get("ports", [])]
    if port in ports:
        return True, f"Service '{name}' exposes port {port}"
    return False, f"Service '{name}' does not expose port {port}"
def wait_for_grafana_api():
    """Block until Grafana's /api/health answers inside the pod.

    Result is memoized in the module-level _grafana_api_ready flag so that
    multiple Grafana checks pay the wait cost only once per grading run.
    Returns True if the API responded, False after ~30s of retries.
    """
    global _grafana_api_ready
    if _grafana_api_ready is not None:
        return _grafana_api_ready
    # Best effort: wait for the pod itself before hammering the HTTP API.
    sh(
        "kubectl wait --for=condition=ready pod -l app=grafana "
        "-n observability --timeout=60s"
    )
    # Up to 6 attempts, 5s apart, via wget inside the Grafana container.
    for _ in range(6):
        code, _, _ = sh(
            "kubectl exec -n observability deploy/grafana -- "
            "wget -qO- http://localhost:3000/api/health"
        )
        if code == 0:
            _grafana_api_ready = True
            return True
        time.sleep(5)
    _grafana_api_ready = False
    return False
def configmap_exists(name, namespace):
    """Return (ok, message) saying whether the ConfigMap exists."""
    exit_code, _, _ = sh(f"kubectl get configmap {name} -n {namespace}")
    if exit_code == 0:
        return True, f"ConfigMap '{name}' exists in {namespace}"
    return False, f"ConfigMap '{name}' not found in {namespace}"
def pvc_exists(name, namespace):
    """Return (ok, message) saying whether the PVC exists."""
    exit_code, _, _ = sh(f"kubectl get pvc {name} -n {namespace}")
    if exit_code == 0:
        return True, f"PVC '{name}' exists in {namespace}"
    return False, f"PVC '{name}' not found in {namespace}"
| # ------------------------------------------------------------------- | |
| # BLACKBOX FUNCTIONAL CHECKS | |
| # ------------------------------------------------------------------- | |
def blackbox_metrics_exposed():
    """Curl the blackbox-exporter /metrics endpoint through a port-forward."""
    healthy, message = check_deployment_health("blackbox-exporter", "observability")
    if not healthy:
        return False, message
    port_forward("svc", "blackbox-exporter", "observability", 9115, 9115)
    exit_code, body, _ = sh("curl -s http://localhost:9115/metrics")
    reachable = (
        exit_code == 0
        and "blackbox_exporter_config_last_reload_successful" in body
    )
    if reachable:
        return True, "Blackbox exporter metrics endpoint is reachable"
    return False, "Blackbox exporter metrics endpoint not responding correctly"
def blackbox_config_has_required_modules():
    """Ensure blackbox-config defines the http_2xx and tcp_connect modules."""
    exit_code, data, _ = sh(
        "kubectl get configmap blackbox-config "
        "-n observability -o jsonpath='{.data}'"
    )
    if exit_code != 0:
        return False, "blackbox-config not readable"
    missing = [mod for mod in ("http_2xx", "tcp_connect") if mod not in data]
    if missing:
        return False, f"blackbox-config missing required modules: {missing}"
    return True, "blackbox-config contains all required probe modules"
def kubernetes_api_tcp_probe_configured():
    """Confirm the K8s API target is probed with the plain tcp_connect module."""
    exit_code, config, _ = sh(
        "kubectl get configmap prometheus-config "
        "-n observability -o jsonpath='{.data.prometheus\\.yml}'"
    )
    if exit_code != 0:
        return False, "Prometheus config not readable"
    if "kubernetes.default" not in config:
        return False, "Kubernetes API server not configured as probe target"
    # Accept bare, quoted, or single-element-list spellings of the module.
    uses_tcp_connect = re.search(
        r'module:\s*\[?\s*["\']?tcp_connect["\']?\s*\]?', config
    )
    if not uses_tcp_connect or "tcp_connect_tls" in config:
        return False, "Must use 'tcp_connect' module (not tcp_connect_tls) for Kubernetes API TCP probe"
    return True, "Kubernetes API TCP probe is configured with tcp_connect module"
def prometheus_has_probe_metrics():
    """Poll the Prometheus query API until probe_success data appears."""
    healthy, message = check_deployment_health("prometheus", "observability")
    if not healthy:
        return False, message
    port_forward("svc", "prometheus", "observability", 9090, 9090)
    attempts = 6
    while attempts > 0:
        exit_code, body, _ = sh(
            "curl -s "
            "'http://localhost:9090/api/v1/query?query=probe_success'"
        )
        if exit_code == 0 and '"result"' in body:
            return True, "Prometheus is collecting probe metrics"
        time.sleep(5)
        attempts -= 1
    return False, "Prometheus not returning probe metrics after retries"
def check_slo_burn_rate_alerts():
    """Verify alerts implement proper multi-window SLO burn rate logic.

    Heuristic, text-based check against the raw ConfigMap YAML:
    (1) at least two distinct avg_over_time windows must appear, and
    (2) at least two distinct non-zero 'for:' durations must appear.
    Returns (ok, message).
    """
    code, out, _ = sh(
        "kubectl get configmap prometheus-config "
        "-n observability -o yaml"
    )
    if code != 0:
        return False, "Prometheus config not readable"
    # Must have multiple time windows in recording rules or alert expressions
    windows = re.findall(
        r"avg_over_time\([^)]*\[(\d+[mh])\]\)",
        out
    )
    if len(set(windows)) < 2:
        return False, (
            "Burn rate alerts must use multiple time windows "
            "(e.g., 5m and 1h)"
        )
    # Must have at least 2 distinct 'for:' durations across alert rules
    # (evidence of fast-burn vs slow-burn detection windows)
    for_durations = re.findall(r"for:\s*(\d+[smh])", out)
    # parse_duration normalizes different spellings of the same duration
    # (e.g. '60s' and '1m') before the distinct-count test.
    unique_durations = {parse_duration(d) for d in for_durations}
    unique_durations.discard(timedelta(0))
    if len(unique_durations) < 2:
        return False, (
            "SLO burn rate alerting requires multiple detection windows "
            "(e.g., a fast-burn alert with 'for: 2m' and a slow-burn "
            "alert with 'for: 1h')"
        )
    return True, "Valid multi-window SLO burn rate alerts detected"
def check_alert_annotations():
    """Check the SyntheticProbeFailure alert carries operator annotations."""
    exit_code, config, _ = sh(
        "kubectl get configmap prometheus-config "
        "-n observability -o yaml"
    )
    if exit_code != 0:
        return False, "Prometheus config not readable"
    marker = config.find('SyntheticProbeFailure')
    if marker < 0:
        return False, "SyntheticProbeFailure alert not found"
    # Only inspect text from the alert name onward.
    tail = config[marker:]
    if 'annotations:' not in tail:
        return False, "Alert missing annotations section"
    if 'description' not in tail and 'summary' not in tail:
        return False, "Alert missing description/summary annotation"
    return True, "Alert has required annotations"
def check_scrape_interval():
    """Verify the global scrape interval supports <=2 min detection time.

    Reads prometheus.yml from the ConfigMap and requires the first
    'scrape_interval:' found to parse to at most 30 seconds.
    NOTE(review): the regex matches the first occurrence anywhere in the
    file, which is presumably the global section — confirm job-level
    overrides cannot appear first.
    """
    code, out, _ = sh(
        "kubectl get configmap prometheus-config "
        "-n observability -o jsonpath='{.data.prometheus\\.yml}'"
    )
    if code != 0:
        return False, "Prometheus config not readable"
    global_match = re.search(r'scrape_interval:\s*(\d+[smh])', out)
    if not global_match:
        return False, "Global scrape_interval not found"
    global_interval = global_match.group(1)
    duration = parse_duration(global_interval)
    # 30s leaves headroom for a 'for: 2m' alert to fire within ~2 minutes.
    if duration > timedelta(seconds=30):
        return False, f"Global scrape_interval {global_interval} too long for 2m detection"
    return True, f"Scrape interval {global_interval} supports timely detection"
def check_recording_rules():
    """Verify recording rules exist AND are used in alert expressions.

    Text-based heuristic on the ConfigMap YAML: requires at least two
    'record:' names, and at least two of those names to appear again in
    the text after the first 'alert:' key. Returns (ok, message).
    """
    code, out, _ = sh(
        "kubectl get configmap prometheus-config "
        "-n observability -o yaml"
    )
    if code != 0:
        return False, "Prometheus config not readable"
    # Must define recording rules
    if "record:" not in out:
        return False, "Prometheus should define recording rules"
    # Extract recording rule names (Prometheus metric-name charset).
    record_names = re.findall(
        r"record:\s*([a-zA-Z_:][a-zA-Z0-9_:]*)",
        out
    )
    if not record_names:
        return False, "No valid recording rule names found"
    # Multi-window availability requires at least 2 recording rules
    if len(record_names) < 2:
        return False, (
            "Multiple recording rules needed for multi-window "
            "availability signals (e.g., 5m and 1h windows)"
        )
    # At least 2 recording rules must be referenced in alert expressions.
    # Everything after the first 'alert:' is treated as the alert section;
    # if no alerts exist the whole document is searched instead.
    alert_section = out[out.find("alert:"):] if "alert:" in out else out
    used_count = sum(1 for name in record_names if name in alert_section)
    if used_count < 2:
        return False, (
            "At least 2 recording rules should be referenced in alert "
            "expressions for multi-window burn rate detection"
        )
    return True, "Recording rules exist and are used in alerts"
def check_blackbox_modules():
    """Verify correct Blackbox modules used for each protocol.

    Inspects a window of text around each known target (500 chars before,
    200 after) for the expected module name. Returns (ok, message).
    NOTE(review): the fixed window sizes assume each scrape job's module
    line is near its target list — confirm against the actual config layout.
    """
    code, out, _ = sh(
        "kubectl get configmap prometheus-config "
        "-n observability -o jsonpath='{.data.prometheus\\.yml}'"
    )
    if code != 0:
        return False, "Prometheus config not readable"
    out_lower = out.lower()
    # --- Kubernetes API must use tcp_connect ---
    if 'kubernetes.default' in out:
        kube_pos = out.find('kubernetes.default')
        kube_section = out[max(0, kube_pos - 500):kube_pos + 200]
        if 'tcp_connect' not in kube_section:
            return False, "Kubernetes API target should use tcp_connect module"
    # --- HTTP endpoints must use http_2xx ---
    if 'argocd' in out_lower:
        argocd_pos = out_lower.find('argocd')
        argocd_section = out[max(0, argocd_pos - 500):argocd_pos + 200]
        # Only flag when the window mentions http at all but not http_2xx.
        if 'http_2xx' not in argocd_section and 'http' in argocd_section:
            return False, "HTTP targets should use http_2xx module"
    return True, "Blackbox modules correctly matched to target protocols"
def check_alert_severity_labels():
    """Check that some alert rule defines a severity label."""
    exit_code, config, _ = sh(
        "kubectl get configmap prometheus-config "
        "-n observability -o yaml"
    )
    if exit_code != 0:
        return False, "Prometheus config not readable"
    if "severity:" in config:
        return True, "Alert severity labels present"
    return False, (
        "Alerts must define severity labels "
        "(critical or warning)"
    )
def check_dashboard_uses_recording_rules():
    """Dashboard should reference recording rules instead of raw PromQL.

    Two acceptance paths: (1) any recording-rule name extracted from
    prometheus-config appears in the dashboards ConfigMap, or (2) the
    dashboards text matches the probe:*:* naming convention directly.
    Returns (ok, message).
    """
    code, dash_out, _ = sh(
        "kubectl get configmap grafana-dashboards "
        "-n observability -o yaml"
    )
    if code != 0:
        return False, "grafana-dashboards ConfigMap not readable"
    # Extract actual recording rule names from prometheus-config
    code, prom_out, _ = sh(
        "kubectl get configmap prometheus-config "
        "-n observability -o yaml"
    )
    if code == 0:
        record_names = re.findall(
            r"record:\s*([a-zA-Z_:][a-zA-Z0-9_:]*)",
            prom_out
        )
        if record_names and any(name in dash_out for name in record_names):
            return True, "Dashboard references recording rules"
    # Fallback: accept any probe:*:* pattern (recording rule convention)
    if re.search(r"probe:[a-zA-Z_]+:[a-zA-Z0-9_]+", dash_out):
        return True, "Dashboard references recording rules"
    return False, (
        "Dashboard should reference pre-computed recording rules "
        "(e.g., probe:availability:5m) instead of raw PromQL"
    )
def check_prometheus_self_scrape():
    """Verify Prometheus scrapes its own metrics.

    Accepts unquoted, single-quoted, or double-quoted job names — all
    valid YAML spellings. The original substring test missed
    `job_name: "prometheus"`.
    """
    code, out, _ = sh(
        "kubectl get configmap prometheus-config "
        "-n observability -o jsonpath='{.data.prometheus\\.yml}'"
    )
    if code != 0:
        return False, "Prometheus config not readable"
    if not re.search(r'''job_name:\s*['"]?prometheus['"]?''', out):
        return False, "Prometheus should scrape its own metrics (job_name: prometheus)"
    return True, "Prometheus self-monitoring configured"
def argocd_probe_success():
    """Ask the blackbox exporter directly whether the Argo CD probe passes.

    Port-forwards the exporter service, then hits /probe for the Argo CD
    API endpoint with the http_2xx module and greps the returned metrics
    for a literal 'probe_success 1' line. Returns (ok, message).
    """
    port_forward(
        "svc",
        "blackbox-exporter",
        "observability",
        9115,
        9115,
    )
    # grep's exit status (via sh) is the success signal: 0 only when the
    # probe_success metric equals 1.
    cmd = (
        "curl -s "
        "'http://localhost:9115/probe?"
        "target=http://argocd.devops.local:80/api/version&module=http_2xx' | "
        "grep '^probe_success 1'"
    )
    code, out, _ = sh(cmd)
    if code == 0 and out:
        return True, "Synthetic probe reports Argo CD endpoint as available"
    return False, "Synthetic probe did not report Argo CD as available"
def deployment_uses_image(name, namespace, expected_image):
    """Check that the deployment's pod template uses *expected_image*.

    Images are compared after normalize_image() strips docker.io prefixes,
    so 'docker.io/prom/prometheus:x' matches 'prom/prometheus:x'.
    A malformed kubectl payload is reported as a failure instead of
    raising (the original let json.loads/KeyError propagate).
    """
    code, out, _ = sh(
        f"kubectl get deployment {name} -n {namespace} -o json"
    )
    if code != 0:
        return False, f"Deployment '{name}' not found"
    try:
        data = json.loads(out)
        containers = data["spec"]["template"]["spec"]["containers"]
    except (json.JSONDecodeError, KeyError, TypeError):
        return False, f"Deployment '{name}' spec not parseable"
    images = [c.get("image", "") for c in containers]
    actual = [normalize_image(i) for i in images]
    expected = normalize_image(expected_image)
    if expected in actual:
        return True, f"Deployment '{name}' uses image '{expected_image}'"
    return False, f"Expected {expected_image}, found {images}"
def prometheus_blackbox_relabeling_present():
    """Check the scrape config routes probe targets through the exporter."""
    exit_code, config, _ = sh(
        "kubectl get configmap prometheus-config "
        "-n observability -o jsonpath='{.data.prometheus\\.yml}'"
    )
    if exit_code != 0:
        return False, "Prometheus config not readable"
    missing = [
        snippet
        for snippet in ("metrics_path: /probe", "__param_target")
        if snippet not in config
    ]
    # Either in-namespace short name or fully qualified service DNS is fine.
    addr_present = (
        "blackbox-exporter:9115" in config
        or "blackbox-exporter.observability" in config
    )
    if not addr_present:
        missing.append("blackbox-exporter address")
    if missing:
        return False, f"Missing blackbox relabeling elements: {missing}"
    return True, "Prometheus blackbox relabeling is correctly configured"
def prometheus_alert_fires_for_failing_probe():
    """Wait for the SyntheticProbeFailure alert on the seeded dead endpoint.

    Polls the Prometheus alerts API (up to 18 tries, 8s apart) for an alert
    payload mentioning both the alert name and the deliberately failing
    does-not-exist.devops.local target. Returns (ok, message).
    """
    # Bail out early if either deployment is in a terminal image-pull state.
    for dep in ("prometheus", "blackbox-exporter"):
        healthy, msg = check_deployment_health(dep, "observability")
        if not healthy:
            return False, msg
    port_forward("svc", "prometheus", "observability", 9090, 9090)
    for _ in range(18):  # ~2.5 min with overhead
        code, out, _ = sh(
            "curl -s http://localhost:9090/api/v1/alerts"
        )
        if (
            code == 0
            and "SyntheticProbeFailure" in out
            and "does-not-exist.devops.local" in out
        ):
            return True, "SyntheticProbeFailure alert is firing"
        time.sleep(8)
    return False, "SyntheticProbeFailure alert did not fire"
def grafana_has_prometheus_datasource():
    """Query Grafana's datasource API for a Prometheus entry."""
    healthy, message = check_deployment_health("grafana", "observability")
    if not healthy:
        return False, message
    if not wait_for_grafana_api():
        return False, "Grafana API not reachable"
    for _attempt in range(6):
        exit_code, body, _ = sh(
            "kubectl exec -n observability deploy/grafana -- "
            "wget -qO- --header='Authorization: Basic YWRtaW46YWRtaW4=' "
            "http://localhost:3000/api/datasources"
        )
        if exit_code == 0 and "Prometheus" in body:
            return True, "Grafana Prometheus datasource configured"
        time.sleep(5)
    return False, "Grafana Prometheus datasource missing"
def grafana_has_blackbox_dashboard():
    """Search Grafana for a dashboard related to synthetic probing."""
    healthy, message = check_deployment_health("grafana", "observability")
    if not healthy:
        return False, message
    if not wait_for_grafana_api():
        return False, "Grafana API not reachable"
    keywords = ("Synthetic", "Blackbox", "Probe", "Endpoint")
    for _attempt in range(6):
        exit_code, body, _ = sh(
            "kubectl exec -n observability deploy/grafana -- "
            "wget -qO- --header='Authorization: Basic YWRtaW46YWRtaW4=' "
            "http://localhost:3000/api/search"
        )
        if exit_code == 0 and any(word in body for word in keywords):
            return True, "Grafana dashboard for synthetic probes exists"
        time.sleep(5)
    return False, "Grafana dashboard missing"
def grafana_dashboard_uses_probe_success():
    """Check the dashboards ConfigMap references the probe_success metric."""
    found, _ = configmap_contains(
        "grafana-dashboards", "observability", ["probe_success"]
    )
    if not found:
        return False, "Grafana dashboard does not reference probe_success"
    return True, "Grafana dashboard visualizes probe_success metric"
def prometheus_uses_pvc():
    """Verify the Prometheus deployment mounts a PVC at /prometheus.

    Improvements over the original: the mount check now inspects every
    container (not just containers[0], which missed sidecar layouts), and
    a malformed kubectl payload is reported as a failure instead of
    raising an unhandled JSONDecodeError/KeyError.
    """
    code, out, _ = sh(
        "kubectl get deployment prometheus "
        "-n observability -o json"
    )
    if code != 0:
        return False, "Prometheus deployment not found"
    try:
        pod_spec = json.loads(out)["spec"]["template"]["spec"]
        containers = pod_spec["containers"]
    except (json.JSONDecodeError, KeyError, TypeError):
        return False, "Prometheus deployment spec not parseable"
    volumes = pod_spec.get("volumes", [])
    pvc_used = any(v.get("persistentVolumeClaim") for v in volumes)
    mounted = any(
        m.get("mountPath") == "/prometheus"
        for c in containers
        for m in c.get("volumeMounts", [])
    )
    if pvc_used and mounted:
        return True, "Prometheus is using persistent storage"
    return False, "Prometheus PVC is not mounted at /prometheus"
def alert_rule_identifies_endpoint():
    """Verify alert annotations reference the failing endpoint."""
    exit_code, config, _ = sh(
        "kubectl get configmap prometheus-config "
        "-n observability -o yaml"
    )
    if exit_code != 0:
        return False, "Prometheus config not readable"
    # Operators must see WHICH endpoint failed, so annotations have to
    # template the instance/target label rather than just grouping in expr.
    templated = re.search(
        r'\{\{\s*\$labels\.(instance|target)\s*\}\}', config
    )
    if templated:
        return True, "Alert annotations identify the failing endpoint"
    return False, (
        "Alert annotations must reference the failing endpoint "
        "(e.g., {{ $labels.instance }}) for operational use"
    )
def alert_has_minimum_duration():
    """Verify some alert rule has a 'for:' duration of 2 minutes or more.

    The original regex only recognized durations written in minutes, so
    valid configs using 'for: 1h' or 'for: 120s' were wrongly rejected.
    Durations are now parsed via parse_duration() and compared as
    timedeltas. (Also fixes a mojibake '>=' in the success message.)
    """
    code, out, _ = sh(
        "kubectl get configmap prometheus-config "
        "-n observability -o yaml"
    )
    if code != 0:
        return False, "Prometheus config not readable"
    durations = re.findall(r"for:\s*(\d+[smh])", out)
    if any(parse_duration(d) >= timedelta(minutes=2) for d in durations):
        return True, "Alert rule has correct minimum duration (>=2m)"
    return False, "Alert rule must fire 'for: 2m' or longer (not immediate)"
def prometheus_alert_is_per_endpoint():
    """Verify SyntheticProbeFailure alerts are scoped per endpoint.

    Polls the alerts API via kubectl exec (no port-forward needed) and
    requires: every SyntheticProbeFailure alert carries a non-empty
    instance label, and at least one firing instance contains
    'does-not-exist' (the seeded failing target). Returns (ok, message).
    """
    for dep in ("prometheus", "blackbox-exporter"):
        healthy, msg = check_deployment_health(dep, "observability")
        if not healthy:
            return False, msg
    for _ in range(18):  # ~2.5 min with overhead
        code, out, _ = sh(
            "kubectl exec -n observability deploy/prometheus -- "
            "wget -qO- http://localhost:9090/api/v1/alerts"
        )
        if code != 0 or "SyntheticProbeFailure" not in out:
            time.sleep(8)
            continue
        try:
            data = json.loads(out)
            alerts = data.get("data", {}).get("alerts", [])
            synthetic = [
                a for a in alerts
                if a.get("labels", {}).get("alertname") == "SyntheticProbeFailure"
            ]
            if len(synthetic) < 1:
                time.sleep(8)
                continue
            instances = {
                a.get("labels", {}).get("instance", "") for a in synthetic
            }
            # An empty-string instance means the alert is not per-endpoint:
            # fail immediately, this cannot fix itself by waiting.
            if not all(instances):
                return False, (
                    "SyntheticProbeFailure alerts lack instance labels โ "
                    "alerting is not scoped per endpoint"
                )
            firing = {
                a.get("labels", {}).get("instance", "")
                for a in synthetic if a.get("state") == "firing"
            }
            has_failing = any("does-not-exist" in i for i in firing)
            if has_failing:
                return True, (
                    "Alerts fire per endpoint (failing endpoint alerts "
                    "independently with instance labels)"
                )
            # Alert exists but the failing endpoint is not firing yet; retry.
            time.sleep(8)
        except (json.JSONDecodeError, KeyError):
            time.sleep(10)
            continue
    return False, "Could not verify per-endpoint alert scoping"
def prometheus_does_not_use_up_metric():
    """Ensure alerting keys off probe_success, not the exporter 'up' metric."""
    exit_code, config, _ = sh(
        "kubectl get configmap prometheus-config "
        "-n observability -o yaml"
    )
    if exit_code != 0:
        return False, "Prometheus config not readable"
    suspicious_forms = (" up ", "up==", "up ==")
    if any(form in config for form in suspicious_forms):
        return False, "Alerting incorrectly uses exporter 'up' metric"
    return True, "Alerting correctly avoids exporter 'up' metric"
def check_endpoint_count():
    """Verify at least 3 probe targets are configured."""
    exit_code, config, _ = sh(
        "kubectl get configmap prometheus-config "
        "-n observability -o jsonpath='{.data.prometheus\\.yml}'"
    )
    if exit_code != 0:
        return False, "Prometheus config not readable"
    haystack = config.lower()
    expectations = (
        ('argocd', 'argocd'),
        ('kubernetes_api', 'kubernetes.default'),
        ('test_endpoint', 'does-not-exist'),
    )
    missing = [label for label, needle in expectations if needle not in haystack]
    if missing:
        return False, f"Missing probe targets: {missing}"
    return True, "All required endpoints configured"
def get_probe_targets():
    """Extract probe target addresses from the Prometheus scrape config.

    Returns a list of URL or host:port strings; empty when the config
    cannot be read.
    """
    exit_code, config, _ = sh(
        "kubectl get configmap prometheus-config "
        "-n observability -o jsonpath='{.data.prometheus\\.yml}'"
    )
    if exit_code != 0:
        return []
    # YAML list items: either full URLs or bare host:port entries.
    pattern = r'-\s*(https?://[^\s]+|[\w.-]+:\d+)'
    return list(re.findall(pattern, config))
def check_grafana_dashboard_semantics():
    """Verify Grafana dashboard uses correct semantic patterns for synthetic monitoring.

    Four text-based checks against the dashboards ConfigMap YAML; every
    failed check appends to *issues*, and all issues are reported together
    in one '; '-joined message. Returns (ok, message).
    """
    code, out, _ = sh(
        "kubectl get configmap grafana-dashboards "
        "-n observability -o yaml"
    )
    if code != 0:
        return False, "grafana-dashboards ConfigMap not readable"
    issues = []
    # ------------------------------------------------------------------
    # Check 1: dashboard must show availability data (not raw binary)
    # ------------------------------------------------------------------
    has_availability_metric = (
        "probe_success" in out
        or re.search(r"probe:[a-zA-Z_]*availab", out)
        or re.search(r"probe:[a-zA-Z_]*success", out)
    )
    if has_availability_metric:
        # Raw probe_success is a 0/1 signal; require a *_over_time wrapper
        # somewhere in the document if it is used directly.
        if "probe_success" in out:
            has_time_agg = any(fn in out for fn in [
                "avg_over_time",
                "min_over_time",
                "max_over_time",
                "sum_over_time",
            ])
            if not has_time_agg:
                issues.append(
                    "Dashboard uses raw probe_success without time "
                    "aggregation (expected avg_over_time or similar)"
                )
    else:
        issues.append(
            "Dashboard does not reference probe availability metrics "
            "(probe_success or a recording rule like probe:availability)"
        )
    # ------------------------------------------------------------------
    # Check 2: per-endpoint breakdown (instance / target)
    # ------------------------------------------------------------------
    # Accepts either PromQL grouping or legendFormat label templates.
    has_grouping = any(x in out for x in [
        "by (instance)",
        "by (target)",
        "$labels.instance",
        "$labels.target",
        "{{ instance }}",
        "{{instance}}",
        "{{ target }}",
        "{{target}}",
    ])
    if not has_grouping:
        issues.append(
            "Dashboard does not show per-endpoint breakdown "
            "(missing by(instance), legendFormat with {{ instance }}, "
            "or target label usage)"
        )
    # ------------------------------------------------------------------
    # Check 3: availability not shown as raw binary signal
    # ------------------------------------------------------------------
    has_normalized = (
        # Percentage form (e.g., * 100)
        any(x in out for x in ["* 100", "*100", "100 *"])
        # Or [0,1] normalized via time aggregation (avg_over_time already
        # produces a continuous availability ratio, not binary)
        or any(fn in out for fn in [
            "avg_over_time", "min_over_time", "max_over_time",
        ])
        # Or uses a recording rule that pre-computes availability
        or re.search(r"probe:[a-zA-Z_]+:", out)
    )
    if not has_normalized:
        issues.append(
            "Dashboard should represent availability as a normalized "
            "measure (e.g., avg_over_time for ratio or * 100 for percentage)"
        )
    # ------------------------------------------------------------------
    # Check 4: response-time / latency metrics present
    # ------------------------------------------------------------------
    latency_metrics = [
        "probe_duration_seconds",
        "probe_http_duration_seconds",
        "probe_tcp_connection_duration_seconds",
        "probe_dns_lookup_time_seconds",
    ]
    has_latency = any(metric in out for metric in latency_metrics)
    if not has_latency:
        issues.append(
            "Dashboard should include response-time metrics "
            "(e.g. probe_duration_seconds or protocol-specific durations)"
        )
    # ------------------------------------------------------------------
    # Final result
    # ------------------------------------------------------------------
    if issues:
        return False, "; ".join(issues)
    return True, (
        "Dashboard uses time-aggregated probe metrics, per-endpoint breakdown, "
        "availability percentage, and latency visualization"
    )
def grade(transcript: str) -> GradingResult:
    """Run all checks and return a GradingResult.

    Two tiers of checks:
    - gate checks contribute feedback lines only — their pass/fail does
      NOT enter the numeric score (score is computed purely from
      subscores below);
    - scored checks each contribute an equal-weight 0/1 subscore.
    Every check is wrapped so one raising check cannot abort grading;
    its exception text becomes the feedback line instead.
    """
    feedback = []
    # ------------------------
    # Gate checks (non-scored)
    # ------------------------
    gate_checks = [
        # Resource existence
        lambda: deployment_exists("blackbox-exporter", "observability"),
        lambda: service_exists("blackbox-exporter", "observability", 9115),
        lambda: configmap_exists("blackbox-config", "observability"),
        lambda: deployment_exists("prometheus", "observability"),
        lambda: configmap_exists("prometheus-config", "observability"),
        lambda: pvc_exists("prometheus-data", "observability"),
        blackbox_config_has_required_modules,
        # Image correctness
        lambda: deployment_uses_image(
            "blackbox-exporter", "observability",
            "prom/blackbox-exporter:v0.25.0",
        ),
        lambda: deployment_uses_image(
            "prometheus", "observability",
            "prom/prometheus:v3.8.1",
        ),
        lambda: deployment_uses_image(
            "grafana", "observability",
            "grafana/grafana:12.2",
        ),
        # Core operational
        blackbox_metrics_exposed,
        prometheus_has_probe_metrics,
        prometheus_blackbox_relabeling_present,
        argocd_probe_success,
        kubernetes_api_tcp_probe_configured,
        prometheus_uses_pvc,
        # Basic config quality
        grafana_has_prometheus_datasource,
        check_alert_severity_labels,
        check_scrape_interval,
        check_alert_annotations,
        alert_has_minimum_duration,
        prometheus_does_not_use_up_metric,
        check_slo_burn_rate_alerts,
        check_blackbox_modules,
    ]
    for fn in gate_checks:
        try:
            ok, msg = fn()
        except Exception as e:
            ok = False
            msg = str(e)
        feedback.append(("โ " if ok else "โ ") + msg)
    # ------------------------
    # Scored checks (partial)
    # ------------------------
    scored_checks = {
        "grafana_dashboard_present": grafana_has_blackbox_dashboard,
        "endpoint_count": check_endpoint_count,
        "grafana_dashboard_semantics": check_grafana_dashboard_semantics,
        "failing_in_alert": prometheus_alert_fires_for_failing_probe,
        "alert_identifies_endpoint": alert_rule_identifies_endpoint,
        "per_endpoint": prometheus_alert_is_per_endpoint,
        "records": check_recording_rules,
        "self_scrape": check_prometheus_self_scrape,
        "recording_rules": check_dashboard_uses_recording_rules,
    }
    subscores = {}
    for key, fn in scored_checks.items():
        try:
            ok, msg = fn()
        except Exception as e:
            ok = False
            msg = str(e)
        subscores[key] = 1.0 if ok else 0.0
        feedback.append(("โ " if ok else "โ ") + msg)
    #
    # Equal weighting: each scored check is worth 1/N of the total.
    total_checks = len(scored_checks)
    weight = 1.0 / total_checks
    weights = {k: weight for k in scored_checks}
    score = sum(subscores[k] * weights[k] for k in subscores)
    return GradingResult(
        score=round(score, 4),
        subscores=subscores,
        weights=weights,
        feedback=" | ".join(feedback),
    )
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
set -e
# Environment bootstrap for the synthetic-endpoint-monitoring task.
# The section between the two markers below is shared sandbox boilerplate
# (supervisord brings up k3s/dockerd/dnsmasq, then we poll until the k3s
# API answers) and must be left exactly as-is.
# ---------------------- [DONOT CHANGE ANYTHING BELOW] ---------------------------------- #
# Start supervisord if not already running (manages k3s, dockerd, dnsmasq)
if ! supervisorctl status &>/dev/null; then
    echo "Starting supervisord..."
    /usr/bin/supervisord -c /etc/supervisor/supervisord.conf
    sleep 5
fi
# Set kubeconfig for k3s
export KUBECONFIG=/etc/rancher/k3s/k3s.yaml
# Wait for k3s to be ready (k3s can take 30-60 seconds to start)
echo "Waiting for k3s to be ready..."
MAX_WAIT=180
ELAPSED=0
until kubectl get nodes &>/dev/null; do
    if [ $ELAPSED -ge $MAX_WAIT ]; then
        echo "Error: k3s is not ready after ${MAX_WAIT} seconds"
        exit 1
    fi
    echo "Waiting for k3s... (${ELAPSED}s elapsed)"
    sleep 2
    ELAPSED=$((ELAPSED + 2))
done
echo "k3s is ready!"
# ---------------------- [DONOT CHANGE ANYTHING ABOVE] ---------------------------------- #
| echo "Granting Minimal Required Permission" | |
| kubectl apply -f - <<EOF | |
| apiVersion: rbac.authorization.k8s.io/v1 | |
| kind: Role | |
| metadata: | |
| name: ubuntu-user-configmap-editor | |
| namespace: observability | |
| rules: | |
| - apiGroups: [""] | |
| resources: ["configmaps"] | |
| verbs: ["get", "list", "watch", "create", "update", "patch"] | |
| --- | |
| apiVersion: rbac.authorization.k8s.io/v1 | |
| kind: RoleBinding | |
| metadata: | |
| name: ubuntu-user-configmap-editor-binding | |
| namespace: observability | |
| subjects: | |
| - kind: ServiceAccount | |
| name: ubuntu-user | |
| namespace: observability | |
| roleRef: | |
| apiGroup: rbac.authorization.k8s.io | |
| kind: Role | |
| name: ubuntu-user-configmap-editor | |
| EOF | |
| # ------------------------------------------------------------------------------ | |
| # Create Gitea repo (unchanged) | |
| # ------------------------------------------------------------------------------ | |
| kubectl exec -n gitea deploy/gitea -- sh -c ' | |
| curl -s -X POST \ | |
| -u "root:Admin@123456" \ | |
| -H "Content-Type: application/json" \ | |
| -d "{ | |
| \"name\": \"platform-docs\", | |
| \"private\": false, | |
| \"auto_init\": true | |
| }" \ | |
| http://localhost:3000/api/v1/user/repos | |
| ' | |
# ------------------------------------------------------------------------------
# Wiki creation -- commit directly to the bare git repo (bypasses API timing
# issues). Everything below runs inside the Gitea pod.
# ------------------------------------------------------------------------------
kubectl exec -n gitea deploy/gitea -- sh -c '
set -e
WIKI_REPO=/data/git/repositories/root/platform-docs.wiki.git
# Ensure bare wiki repo exists (Gitea does NOT auto-create it)
if [ ! -d "$WIKI_REPO" ]; then
  git init --bare "$WIKI_REPO" >/dev/null 2>&1
fi
# Commit wiki content directly into the bare repo via a temp clone
TMPDIR=$(mktemp -d)
cd "$TMPDIR"
git init >/dev/null 2>&1
git config user.email "root@gitea.local"
git config user.name "root"
cat > "Platform-Services---Monitoring-Requirements.md" <<WIKIEOF
# Platform Services - Monitoring Requirements
## Critical Platform Services
### ArgoCD GitOps Controller
| Property | Value |
|----------|-------|
| Ingress | \`argocd.devops.local\` |
| Protocol | HTTP |
| Criticality | High |
### Kubernetes API Server
| Property | Value |
|----------|-------|
| Access | \`kubernetes.default.svc:443\` |
| Protocol | TCP |
| Criticality | Critical |
## Alerting Validation
Configure a probe against a known-unreachable endpoint to verify alerting:
http://does-not-exist.devops.local
Alerts should identify the specific failing endpoint (not aggregate into one global alert).
## Preloaded Container Images
This cluster operates in air-gapped mode. The following images are available:
| Component | Image |
|-----------|-------|
| Prometheus | \`prom/prometheus:v3.8.1\` |
| Blackbox Exporter | \`prom/blackbox-exporter:v0.25.0\` |
| Grafana | \`grafana/grafana:12.2\` |
Use these exact image references in your deployments. No other versions are available.
WIKIEOF
git add . >/dev/null 2>&1
git commit -m "Initial wiki content" >/dev/null 2>&1
git remote add origin "$WIKI_REPO"
git push -f origin master >/dev/null 2>&1
cd /
rm -rf "$TMPDIR"
chown -R git:git "$WIKI_REPO"
' && echo "[SETUP] Platform documentation created" \
  || echo "[SETUP] WARN: Wiki creation failed"
# Poke the Gitea wiki API to force indexing of the git-committed content.
# Best-effort: up to three attempts, then give up silently ("|| true").
kubectl exec -n gitea deploy/gitea -- sh -c '
for i in 1 2 3; do
  HTTP_CODE=$(curl -s -o /dev/null -w "%{http_code}" \
    -u "root:Admin@123456" \
    "http://localhost:3000/api/v1/repos/root/platform-docs/wiki/pages")
  if [ "$HTTP_CODE" = "200" ]; then
    break
  fi
  sleep 2
done
' 2>/dev/null || true
| echo "[SETUP] Importing pre-cached MinIO and MariaDB images..." | |
| PROMETHEUS_TAR="/workdir/prometheus-v3.8.1.tar" | |
| BLACKBOX_TAR="/workdir/blackbox-exporter-v0.25.0.tar" | |
| GRAFANA_TAR="/workdir/grafana-12.2.tar" | |
| if [ ! -f "$GRAFANA_TAR" ]; then | |
| echo "Error: Pre-cached Grafana image not found at $GRAFANA_TAR" | |
| exit 1 | |
| fi | |
| ctr --address /run/k3s/containerd/containerd.sock \ | |
| --namespace k8s.io \ | |
| images import --no-unpack "$GRAFANA_TAR" | |
| ctr --address /run/k3s/containerd/containerd.sock \ | |
| --namespace k8s.io \ | |
| images list | grep -q "grafana/grafana:12.2" || { | |
| echo "Error: Grafana image was not imported correctly" | |
| exit 1 | |
| } | |
| if [ ! -f "$PROMETHEUS_TAR" ]; then | |
| echo "Error: Pre-cached Prometheus image not found at $PROMETHEUS_TAR" | |
| exit 1 | |
| fi | |
| ctr --address /run/k3s/containerd/containerd.sock \ | |
| --namespace k8s.io \ | |
| images import "$PROMETHEUS_TAR" | |
| if [ ! -f "$BLACKBOX_TAR" ]; then | |
| echo "Error: Pre-cached Blackbox image not found at $BLACKBOX_TAR" | |
| exit 1 | |
| fi | |
| ctr --address /run/k3s/containerd/containerd.sock \ | |
| --namespace k8s.io \ | |
| images import --no-unpack "$BLACKBOX_TAR" | |
| ctr --address /run/k3s/containerd/containerd.sock \ | |
| --namespace k8s.io \ | |
| images list | grep -q "prom/prometheus:v3.8.1" || { | |
| echo "Error: Prometheus image was not imported correctly" | |
| exit 1 | |
| } | |
| ctr --address /run/k3s/containerd/containerd.sock \ | |
| --namespace k8s.io \ | |
| images list | grep -q "prom/blackbox-exporter:v0.25.0" || { | |
| echo "Error: Blackbox image was not imported correctly" | |
| exit 1 | |
| } | |
| CTR="ctr --address /run/k3s/containerd/containerd.sock --namespace k8s.io" | |
| $CTR images tag docker.io/prom/prometheus:v3.8.1 prom/prometheus:v3.8.1 | |
| $CTR images tag docker.io/prom/blackbox-exporter:v0.25.0 prom/blackbox-exporter:v0.25.0 | |
| $CTR images tag docker.io/grafana/grafana:12.2 grafana/grafana:12.2 | |
| ALLOWED_IMAGES=( | |
| "prom/prometheus:v3.8.1" | |
| "docker.io/prom/prometheus:v3.8.1" | |
| "prom/blackbox-exporter:v0.25.0" | |
| "docker.io/prom/blackbox-exporter:v0.25.0" | |
| "grafana/grafana:12.2" | |
| "docker.io/grafana/grafana:12.2" | |
| ) | |
| is_allowed() { | |
| for allowed in "${ALLOWED_IMAGES[@]}"; do | |
| [[ "$1" == "$allowed" ]] && return 0 | |
| done | |
| return 1 | |
| } | |
| $CTR images list -q | while read -r image; do | |
| case "$image" in | |
| *prometheus*|*blackbox-exporter*|*grafana*) | |
| is_allowed "$image" || $CTR images remove "$image" 2>/dev/null || true | |
| ;; | |
| esac | |
| done | |
| ctr --address /run/k3s/containerd/containerd.sock \ | |
| --namespace k8s.io \ | |
| images remove docker.io/prom/prometheus:v2.54.1 2>/dev/null || true | |
| ctr --address /run/k3s/containerd/containerd.sock \ | |
| --namespace k8s.io \ | |
| images remove docker.io/grafana/grafana:11.3.0 2>/dev/null || true | |
| rm -f "$PROMETHEUS_TAR" "$BLACKBOX_TAR" | |
| rm -rf /workdir/*.tar 2>/dev/null || true | |
| echo "[SETUP] Prometheus and Blackbox images imported successfully" |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
# Reference solution: deploys the full synthetic monitoring stack
# (blackbox-exporter, Prometheus, Grafana) into the observability namespace.
set -euo pipefail
echo ""
echo ">>> Deploying Blackbox Exporter and Prometheus (Observability Stack)"
echo ""
# All kubectl calls below talk to the local k3s cluster.
export KUBECONFIG=/etc/rancher/k3s/k3s.yaml
NAMESPACE="observability"
# Ensure namespace exists
kubectl get namespace ${NAMESPACE} >/dev/null 2>&1 || kubectl create namespace ${NAMESPACE}
echo ""
echo ">>> Applying Blackbox Exporter configuration"
echo ""
# Probe module definitions: http_2xx validates application-layer correctness
# (default TLS verification preserved); tcp_connect checks transport
# reachability only — used for the Kubernetes API server probe.
cat <<EOF | kubectl apply -f -
apiVersion: v1
kind: ConfigMap
metadata:
  name: blackbox-config
  namespace: ${NAMESPACE}
data:
  blackbox.yml: |
    modules:
      http_2xx:
        prober: http
        timeout: 5s
        http:
          valid_http_versions: ["HTTP/1.1", "HTTP/2"]
          valid_status_codes: []
          method: GET
      tcp_connect:
        prober: tcp
        timeout: 5s
EOF
# Exporter deployment: single replica, non-root, config mounted from the
# blackbox-config ConfigMap above.
cat <<EOF | kubectl apply -f -
apiVersion: apps/v1
kind: Deployment
metadata:
  name: blackbox-exporter
  namespace: ${NAMESPACE}
  labels:
    app: blackbox-exporter
spec:
  replicas: 1
  selector:
    matchLabels:
      app: blackbox-exporter
  template:
    metadata:
      labels:
        app: blackbox-exporter
    spec:
      containers:
        - name: blackbox-exporter
          image: prom/blackbox-exporter:v0.25.0
          ports:
            - containerPort: 9115
          args:
            - "--config.file=/etc/blackbox/blackbox.yml"
          securityContext:
            runAsNonRoot: true
            runAsUser: 1000
          volumeMounts:
            - name: config-volume
              mountPath: /etc/blackbox
      volumes:
        - name: config-volume
          configMap:
            name: blackbox-config
            items:
              - key: blackbox.yml
                path: blackbox.yml
EOF
# In-cluster service used by Prometheus as the relabeled __address__ target.
cat <<EOF | kubectl apply -f -
apiVersion: v1
kind: Service
metadata:
  name: blackbox-exporter
  namespace: ${NAMESPACE}
spec:
  selector:
    app: blackbox-exporter
  ports:
    - name: http
      port: 9115
      targetPort: 9115
EOF
echo ""
echo ">>> Applying Prometheus storage"
echo ""
# Persistent storage so probe history survives pod restarts.
cat <<EOF | kubectl apply -f -
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: prometheus-data
  namespace: ${NAMESPACE}
spec:
  accessModes:
    - ReadWriteOnce
  resources:
    requests:
      storage: 2Gi
EOF
echo ""
echo ">>> Applying Prometheus configuration"
echo ""
echo "Applying Prometheus config..."
# Quoted heredoc ('EOF') — no shell expansion; content is taken literally.
# Carries both the scrape config (blackbox multi-target relabeling pattern)
# and the rule file (recording rules + SyntheticProbeFailure + burn-rate
# SLO alerts).
cat <<'EOF' | kubectl apply -f -
apiVersion: v1
kind: ConfigMap
metadata:
  name: prometheus-config
  namespace: observability
data:
  prometheus.yml: |
    global:
      scrape_interval: 15s
    rule_files:
      - /etc/prometheus/rules/*.yml
    scrape_configs:
      - job_name: blackbox
        metrics_path: /probe
        params:
          module: [http_2xx]
        static_configs:
          - targets:
              - http://argocd.devops.local
              - http://does-not-exist.devops.local
        relabel_configs:
          - source_labels: [__address__]
            target_label: __param_target
          - source_labels: [__param_target]
            target_label: instance
          - target_label: __address__
            replacement: blackbox-exporter:9115
      - job_name: blackbox-kubernetes-api
        metrics_path: /probe
        params:
          module: [tcp_connect]
        static_configs:
          - targets:
              - kubernetes.default.svc:443
        relabel_configs:
          - source_labels: [__address__]
            target_label: __param_target
          - source_labels: [__param_target]
            target_label: instance
          - target_label: __address__
            replacement: blackbox-exporter:9115
      - job_name: blackbox-exporter
        static_configs:
          - targets:
              - blackbox-exporter:9115
      - job_name: prometheus
        static_configs:
          - targets:
              - localhost:9090
  rules.yml: |
    groups:
      # --------------------------------------------------
      # Recording rules
      # --------------------------------------------------
      - name: synthetic-recording
        rules:
          - record: probe:availability:5m
            expr: avg_over_time(probe_success[5m])
          - record: probe:availability:1h
            expr: avg_over_time(probe_success[1h])
          - record: probe:latency_p99:5m
            expr: |
              histogram_quantile(
                0.99,
                sum(rate(probe_duration_seconds_bucket[5m]))
                by (le, instance)
              )
      # --------------------------------------------------
      # Compatibility alert (legacy graders depend on this)
      # --------------------------------------------------
      - name: synthetic-compat
        rules:
          - alert: SyntheticProbeFailure
            expr: probe_success == 0
            for: 1m
            labels:
              severity: critical
            annotations:
              summary: "Synthetic probe failed"
              description: "Endpoint {{ $labels.instance }} is unreachable"
      # --------------------------------------------------
      # SLO burn-rate alerts
      # --------------------------------------------------
      - name: synthetic-slo
        rules:
          - alert: SyntheticProbeHighBurnRate
            expr: |
              (1 - probe:availability:5m{job="blackbox"}) / (1 - 0.99) > 14.4
            for: 2m
            labels:
              severity: critical
            annotations:
              summary: "High synthetic availability burn rate"
              description: "High error budget burn rate for {{ $labels.instance }}"
          - alert: SyntheticProbeLowBurnRate
            expr: |
              (1 - probe:availability:1h{job="blackbox"}) / (1 - 0.99) > 1
            for: 1h
            labels:
              severity: warning
            annotations:
              summary: "Sustained synthetic availability degradation"
              description: "Sustained error budget burn rate for {{ $labels.instance }}"
EOF
echo ""
echo ">>> Deploying Prometheus"
echo ""
# Prometheus server: prometheus.yml mounted via subPath, rules.yml projected
# from the SAME ConfigMap into /etc/prometheus/rules (matching rule_files),
# TSDB on the prometheus-data PVC.
cat <<EOF | kubectl apply -f -
apiVersion: apps/v1
kind: Deployment
metadata:
  name: prometheus
  namespace: ${NAMESPACE}
spec:
  replicas: 1
  selector:
    matchLabels:
      app: prometheus
  template:
    metadata:
      labels:
        app: prometheus
    spec:
      securityContext:
        fsGroup: 65534
        runAsNonRoot: true
        runAsUser: 1000
      containers:
        - name: prometheus
          image: prom/prometheus:v3.8.1
          args:
            - "--config.file=/etc/prometheus/prometheus.yml"
          ports:
            - containerPort: 9090
          volumeMounts:
            - name: config-volume
              mountPath: /etc/prometheus/prometheus.yml
              subPath: prometheus.yml
            - name: data-volume
              mountPath: /prometheus
            - name: rules-volume
              mountPath: /etc/prometheus/rules
      volumes:
        - name: config-volume
          configMap:
            name: prometheus-config
        - name: data-volume
          persistentVolumeClaim:
            claimName: prometheus-data
        - name: rules-volume
          configMap:
            name: prometheus-config
            items:
              - key: rules.yml
                path: rules.yml
EOF
# Service exposing the Prometheus web/API port required by the task spec.
cat <<EOF | kubectl apply -f -
apiVersion: v1
kind: Service
metadata:
  name: prometheus
  namespace: ${NAMESPACE}
spec:
  selector:
    app: prometheus
  ports:
    - name: web
      port: 9090
      targetPort: 9090
EOF
# Grafana datasource provisioning: declaratively points Grafana at the
# in-cluster Prometheus service so no manual setup is required.
cat <<EOF | kubectl apply -f -
apiVersion: v1
kind: ConfigMap
metadata:
  name: grafana-datasources
  namespace: ${NAMESPACE}
data:
  datasources.yml: |
    apiVersion: 1
    datasources:
      - name: Prometheus
        type: prometheus
        access: proxy
        url: http://prometheus:9090
        isDefault: true
EOF
# Dashboard provider: instructs Grafana to load any JSON dashboards found
# under /var/lib/grafana/dashboards (populated from grafana-dashboards).
cat <<EOF | kubectl apply -f -
apiVersion: v1
kind: ConfigMap
metadata:
  name: grafana-dashboard-provider
  namespace: ${NAMESPACE}
data:
  dashboards.yml: |
    apiVersion: 1
    providers:
      - name: default
        folder: ''
        type: file
        options:
          path: /var/lib/grafana/dashboards
EOF
# Dashboard definitions provisioned as a ConfigMap (mounted at
# /var/lib/grafana/dashboards by the Grafana deployment).
# FIX: the latency panel used `avg_over_time(...) by (instance)`, which is
# not valid PromQL — grouping modifiers attach only to aggregation operators,
# not to range-vector functions — so the panel would fail to render. The
# average is now wrapped in `avg by (instance) (...)`.
cat <<EOF | kubectl apply -f -
apiVersion: v1
kind: ConfigMap
metadata:
  name: grafana-dashboards
  namespace: observability
data:
  blackbox-dashboard.json: |
    {
      "title": "Synthetic Endpoint Availability",
      "schemaVersion": 38,
      "panels": [
        {
          "type": "timeseries",
          "title": "Probe Availability (%)",
          "targets": [
            {
              "expr": "probe:availability:5m * 100",
              "legendFormat": "{{ instance }}",
              "refId": "A"
            }
          ]
        },
        {
          "type": "timeseries",
          "title": "Probe Latency (seconds)",
          "targets": [
            {
              "expr": "avg by (instance) (avg_over_time(probe_duration_seconds[5m]))",
              "legendFormat": "{{ instance }}",
              "refId": "B"
            }
          ]
        }
      ]
    }
EOF
# Grafana deployment: three mounts wire together the provisioning chain —
# datasource config, dashboard provider config, and the dashboard JSON.
# (The two inline YAML notes below had mojibake emoji; repaired.)
cat <<EOF | kubectl apply -f -
apiVersion: apps/v1
kind: Deployment
metadata:
  name: grafana
  namespace: ${NAMESPACE}
spec:
  replicas: 1
  selector:
    matchLabels:
      app: grafana
  template:
    metadata:
      labels:
        app: grafana
    spec:
      containers:
        - name: grafana
          image: grafana/grafana:12.2
          ports:
            - containerPort: 3000
          volumeMounts:
            - name: datasources
              mountPath: /etc/grafana/provisioning/datasources
            # Provider config goes here
            - name: dashboard-provider
              mountPath: /etc/grafana/provisioning/dashboards
            # Actual JSON dashboards go here
            - name: dashboard-json
              mountPath: /var/lib/grafana/dashboards
      volumes:
        - name: datasources
          configMap:
            name: grafana-datasources
        - name: dashboard-provider
          configMap:
            name: grafana-dashboard-provider
        - name: dashboard-json
          configMap:
            name: grafana-dashboards
---
apiVersion: v1
kind: Service
metadata:
  name: grafana
  namespace: ${NAMESPACE}
spec:
  selector:
    app: grafana
  ports:
    - name: web
      port: 3000
      targetPort: 3000
EOF
echo ""
echo ">>> Waiting for deployments to become ready"
echo ""
# Block until each workload reports a successful rollout (same order as the
# original: grafana, blackbox-exporter, prometheus).
for deploy in grafana blackbox-exporter prometheus; do
    kubectl rollout status "deployment/${deploy}" -n ${NAMESPACE}
done
echo ""
echo ">>> Solution deployment completed successfully"
echo ""
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Task definition consumed by the evaluation harness.
id: synthetic-endpoint-monitoring
# Classification used for task routing/reporting.
metadata:
  category: observability
  difficulty: advanced
# Full prompt shown to the agent (literal block — content must match the
# grader's expectations exactly).
prompt: |
  [Context]
  The Nebula platform hosts multiple internal services exposed through Kubernetes ingress.
  While application-level metrics exist, the platform team lacks visibility into endpoint
  availability from a real client perspective.
  Synthetic probing is required to validate ingress reachability and detect outages
  independently of application instrumentation.
  [Environment Notes]
  The cluster operates in air-gapped mode.
  Required container images are already preloaded into the runtime environment and must
  be used as-is without pulling from external registries. Available image versions
  are documented in the platform wiki.
  [Task]
  Implement synthetic endpoint monitoring in the `observability` namespace.
  The monitoring stack must include the following Kubernetes resources:
  - Deployment `blackbox-exporter`
  - Service `blackbox-exporter`
  - ConfigMap `blackbox-config`
  - Deployment `prometheus`
  - Service `prometheus` exposing port 9090
  - ConfigMap `prometheus-config`
  - PersistentVolumeClaim `prometheus-data`
  Observability components must use the most recent preloaded Prometheus and Grafana container images available in the environment.
  The Nebula internal developer wiki documents commonly exposed platform services
  and their ingress hostnames:
  http://gitea.devops.local/root/platform-docs/wiki/
  Synthetic probes must target internal services representing real client access patterns.
  Probe results must be exposed as metrics consumable by the monitoring system.
  [Blackbox Exporter Requirements]
  The blackbox exporter must support multiple probe types:
  - HTTP probes for application-layer availability
  - TCP probes for transport-layer connectivity
  Probe behavior must follow protocol-appropriate validation:
  - Application-layer probes must validate protocol correctness
  - Transport-layer probes must validate connectivity only
  - HTTP probe modules must preserve default TLS verification behavior
  The Kubernetes API server health must be monitored using TCP connectivity probes
  against its standard in-cluster service name. This check must validate only
  basic transport-layer reachability (no TLS or HTTP validation).
  Exporter-level metrics (e.g., `up`) must not be used as a substitute for
  synthetic probe result metrics.
  [Prometheus Requirements]
  The global `scrape_interval` must be set to **15s or 10s**.
  Prometheus must be configured to scrape blackbox probe targets using appropriate
  relabeling. Configuration should support efficient querying and reuse of commonly
  evaluated availability signals. Repeated or computationally expensive expressions
  should not be evaluated directly at query time.
  [Alerting Requirements]
  Prometheus must define alerting rules based on synthetic probe result metrics.
  At least one alert must be named **SyntheticProbeFailure** and represent
  endpoint-level availability failure detected via synthetic probes.
  Alerts detecting sustained availability degradation should be based on
  SLO-style burn rate concepts (e.g., evaluating error budget consumption
  over time rather than fixed thresholds).
  Alerts must:
  - Detect rapid availability loss
  - Detect sustained availability degradation over longer periods
  - Distinguish failures on a per-endpoint basis
  - Avoid relying solely on instantaneous probe failures or fixed thresholds
  [Visualization Requirements]
  A visualization layer must be deployed in the `observability` namespace.
  Required resource:
  - ConfigMap `grafana-dashboards` for dashboard definitions
  The visualization system must:
  - Consume metrics directly from Prometheus via declarative configuration
  - Include at least one dashboard showing per-endpoint probe availability over time
  - Represent availability as a normalized measure over time
  - Allow comparison across endpoints
  - Include at least one responsiveness-related indicator
  - Not rely solely on binary success/failure signals
  Dashboard and data source configuration must be fully reproducible and stored
  as Kubernetes resources.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment