|
--- a/task.yaml |
|
+++ b/task.yaml |
|
@@ -15,10 +15,8 @@ |
|
[Environment Notes] |
|
The cluster operates in air-gapped mode. |
|
Required container images are already preloaded into the runtime environment and must |
|
- be used as-is without pulling from external registries. |
|
- |
|
- The list of available images can be inspected from the node using the |
|
- container runtime tooling (e.g., `ctr images list`). |
|
+ be used as-is without pulling from external registries. Available image versions |
|
+ are documented in the "Platform Services - Monitoring Requirements" page of the platform-docs wiki. |
|
|
|
|
|
[Task] |
|
@@ -51,7 +49,7 @@ |
|
Probe behavior must follow protocol-appropriate validation: |
|
- Application-layer probes must validate protocol correctness |
|
- Transport-layer probes must validate connectivity only |
|
- - Security-related protocol behavior must not be implicitly disabled |
|
+ - HTTP probe modules must preserve default TLS verification behavior |
|
|
|
The Kubernetes API server health must be monitored using TCP connectivity probes |
|
against its standard in-cluster service name. This check must validate only |
|
--- a/setup.sh |
|
+++ b/setup.sh |
|
@@ -73,7 +73,7 @@ |
|
' |
|
|
|
# ------------------------------------------------------------------------------ |
|
-# Wiki creation (REVIEWER FIX – surgically integrated) |
|
+# Wiki creation — commit directly to bare git repo (bypasses API timing issues) |
|
# ------------------------------------------------------------------------------ |
|
kubectl exec -n gitea deploy/gitea -- sh -c ' |
|
set -e |
|
@@ -83,10 +83,17 @@ |
|
# Ensure bare wiki repo exists (Gitea does NOT auto-create it) |
|
if [ ! -d "$WIKI_REPO" ]; then |
|
git init --bare "$WIKI_REPO" >/dev/null 2>&1 |
|
- chown -R git:git "$WIKI_REPO" |
|
fi |
|
|
|
-WIKI_CONTENT="# Platform Services - Monitoring Requirements |
|
+# Commit wiki content directly into the bare repo via a temp clone |
|
+TMPDIR=$(mktemp -d) |
|
+cd "$TMPDIR" |
|
+git init >/dev/null 2>&1 |
|
+git config user.email "root@gitea.local" |
|
+git config user.name "root" |
|
+ |
|
+cat > "Platform-Services---Monitoring-Requirements.md" <<WIKIEOF |
|
+# Platform Services - Monitoring Requirements |
|
|
|
## Critical Platform Services |
|
|
|
@@ -112,28 +119,42 @@ |
|
|
|
Alerts should identify the specific failing endpoint (not aggregate into one global alert). |
|
|
|
-## Air-Gapped Environment |
|
+## Preloaded Container Images |
|
|
|
-Container images are preloaded in the air-gapped environment. |
|
-Agents must discover available versions using standard container tooling." |
|
+This cluster operates in air-gapped mode. The following images are available: |
|
|
|
-# Create wiki page via API (retry to allow Gitea to index repo) |
|
+| Component | Image | |
|
+|-----------|-------| |
|
+| Prometheus | \`prom/prometheus:v3.8.1\` | |
|
+| Blackbox Exporter | \`prom/blackbox-exporter:v0.25.0\` | |
|
+| Grafana | \`grafana/grafana:12.2\` | |
|
+ |
|
+Use these exact image references in your deployments. No other versions are available. |
|
+WIKIEOF |
|
+ |
|
+git add . >/dev/null 2>&1 |
|
+git commit -m "Initial wiki content" >/dev/null 2>&1 |
|
+git remote add origin "$WIKI_REPO" |
|
+git push -f origin HEAD:master >/dev/null 2>&1 |
|
+ |
|
+cd / |
|
+rm -rf "$TMPDIR" |
|
+chown -R git:git "$WIKI_REPO" |
|
+' && echo "[SETUP] Platform documentation created" \ |
|
+ || echo "[SETUP] WARN: Wiki creation failed" |
|
+ |
|
+# Poke the Gitea wiki API to force indexing of the git-committed content |
|
+kubectl exec -n gitea deploy/gitea -- sh -c ' |
|
for i in 1 2 3; do |
|
- HTTP_CODE=$(curl -s -o /dev/null -w "%{http_code}" -X POST \ |
|
+ HTTP_CODE=$(curl -s -o /dev/null -w "%{http_code}" \ |
|
-u "root:Admin@123456" \ |
|
- -H "Content-Type: application/json" \ |
|
- -d "{\"title\":\"Platform Services - Monitoring Requirements\",\"content_base64\":\"$(printf "%s" "$WIKI_CONTENT" | base64 -w0)\"}" \ |
|
- http://localhost:3000/api/v1/repos/root/platform-docs/wiki/new) |
|
- |
|
- if [ "$HTTP_CODE" = "201" ] || [ "$HTTP_CODE" = "200" ]; then |
|
- exit 0 |
|
+ "http://localhost:3000/api/v1/repos/root/platform-docs/wiki/pages") |
|
+ if [ "$HTTP_CODE" = "200" ]; then |
|
+ break |
|
fi |
|
sleep 2 |
|
done |
|
- |
|
-exit 1 |
|
-' && echo "[SETUP] Platform documentation created" \ |
|
- || echo "[SETUP] WARN: Wiki creation failed" |
|
+' 2>/dev/null || true |
|
|
|
|
|
|
|
--- a/solution.sh |
|
+++ b/solution.sh |
|
@@ -132,11 +132,6 @@ |
|
rule_files: |
|
- /etc/prometheus/rules/*.yml |
|
|
|
- remote_write: |
|
- - url: "https://url/insert/0/prometheus/api/v1/write" |
|
- tls_config: |
|
- insecure_skip_verify: true |
|
- |
|
scrape_configs: |
|
- job_name: blackbox |
|
metrics_path: /probe |
|
@@ -383,7 +378,7 @@ |
|
"title": "Probe Availability (%)", |
|
"targets": [ |
|
{ |
|
- "expr": "probe:availability:5m by (instance) * 100 or avg_over_time(probe_success[5m]) by (instance) * 100", |
|
+ "expr": "probe:availability:5m * 100", |
|
"legendFormat": "{{ instance }}", |
|
"refId": "A" |
|
} |
|
--- a/grader.py |
|
+++ b/grader.py |
|
@@ -241,8 +241,6 @@ |
|
if "kubernetes.default" not in out: |
|
return False, "Kubernetes API server not configured as probe target" |
|
|
|
- |
|
- import re |
|
tcp_module_pattern = r'module:\s*\[?\s*["\']?tcp_connect["\']?\s*\]?' |
|
if not re.search(tcp_module_pattern, out) or "tcp_connect_tls" in out: |
|
return False, "Must use 'tcp_connect' module (not tcp_connect_tls) for Kubernetes API TCP probe" |
|
@@ -283,9 +281,7 @@ |
|
if code != 0: |
|
return False, "Prometheus config not readable" |
|
|
|
- import re |
|
- |
|
- # Must have multiple time windows |
|
+ # Must have multiple time windows in recording rules or alert expressions |
|
windows = re.findall( |
|
r"avg_over_time\([^)]*\[(\d+[mh])\]\)", |
|
out |
|
@@ -296,16 +292,17 @@ |
|
"(e.g., 5m and 1h)" |
|
) |
|
|
|
- # Must compare against known burn rate multipliers |
|
- has_multiplier = bool( |
|
- re.search(r">\s*(14\.4|6|3|1)\b", out) or |
|
- re.search(r"\b(14\.4|6|3|1)\s*<", out) |
|
- ) |
|
+ # Must have at least 2 distinct 'for:' durations across alert rules |
|
+ # (evidence of fast-burn vs slow-burn detection windows) |
|
+ for_durations = re.findall(r"for:\s*(\d+[smh])", out) |
|
+ unique_durations = {int(d[:-1]) * {"s": 1, "m": 60, "h": 3600}[d[-1]] for d in for_durations} |
|
+ unique_durations.discard(0) |
|
|
|
- if not has_multiplier: |
|
+ if len(unique_durations) < 2: |
|
return False, ( |
|
- "Burn rate alerts must compare against error budget " |
|
- "multipliers (e.g., > 14.4)" |
|
+ "SLO burn rate alerting requires multiple detection windows " |
|
+ "(e.g., a fast-burn alert with 'for: 2m' and a slow-burn " |
|
+ "alert with 'for: 1h')" |
|
) |
|
|
|
return True, "Valid multi-window SLO burn rate alerts detected" |
|
@@ -373,8 +370,6 @@ |
|
if "record:" not in out: |
|
return False, "Prometheus should define recording rules" |
|
|
|
- import re |
|
- |
|
# Extract recording rule names |
|
record_names = re.findall( |
|
r"record:\s*([a-zA-Z_:][a-zA-Z0-9_:]*)", |
|
@@ -384,15 +379,22 @@ |
|
if not record_names: |
|
return False, "No valid recording rule names found" |
|
|
|
- # Alerts must reference recording rules |
|
+ # Multi-window availability requires at least 2 recording rules |
|
+ if len(record_names) < 2: |
|
+ return False, ( |
|
+ "Multiple recording rules needed for multi-window " |
|
+ "availability signals (e.g., 5m and 1h windows)" |
|
+ ) |
|
+ |
|
+ # At least 2 recording rules must be referenced in alert expressions |
|
alert_section = out[out.find("alert:"):] if "alert:" in out else out |
|
|
|
- used = any(name in alert_section for name in record_names) |
|
+ used_count = sum(1 for name in record_names if name in alert_section) |
|
|
|
- if not used: |
|
+ if used_count < 2: |
|
return False, ( |
|
- "Recording rules must be referenced in alert expressions " |
|
- "(not raw probe_success)" |
|
+ "At least 2 recording rules should be referenced in alert " |
|
+ "expressions for multi-window burn rate detection" |
|
) |
|
|
|
return True, "Recording rules exist and are used in alerts" |
|
@@ -423,20 +425,6 @@ |
|
if 'http_2xx' not in argocd_section and 'http' in argocd_section: |
|
return False, "HTTP targets should use http_2xx module" |
|
|
|
- # --- NEW: HTTPS targets must explicitly configure TLS verification --- |
|
- if 'https://' in out_lower: |
|
- # Only enforce if HTTPS is actually being probed |
|
- has_tls_config = ( |
|
- 'tls_config' in out_lower or |
|
- 'insecure_skip_verify: false' in out_lower |
|
- ) |
|
- |
|
- if not has_tls_config: |
|
- return False, ( |
|
- "HTTPS targets should have explicit TLS verification " |
|
- "configuration (tls_config or insecure_skip_verify: false)" |
|
- ) |
|
- |
|
return True, "Blackbox modules correctly matched to target protocols" |
|
|
|
|
|
@@ -463,24 +451,35 @@ |
|
|
|
def check_dashboard_uses_recording_rules(): |
|
"""Dashboard should reference recording rules instead of raw PromQL.""" |
|
- code, out, _ = sh( |
|
+ code, dash_out, _ = sh( |
|
"kubectl get configmap grafana-dashboards " |
|
"-n observability -o yaml" |
|
) |
|
if code != 0: |
|
return False, "grafana-dashboards ConfigMap not readable" |
|
|
|
- has_recording_rule_ref = any(x in out for x in [ |
|
- "probe:availability", |
|
- "probe:latency", |
|
- ]) |
|
+ # Extract actual recording rule names from prometheus-config |
|
+ code, prom_out, _ = sh( |
|
+ "kubectl get configmap prometheus-config " |
|
+ "-n observability -o yaml" |
|
+ ) |
|
|
|
- if not has_recording_rule_ref: |
|
- return False, ( |
|
- "Dashboard should reference pre-computed recording rules " |
|
- "(e.g., probe:availability:5m) instead of raw PromQL" |
|
- ) |
|
- return True, "Dashboard references recording rules" |
|
+ if code == 0: |
|
+ record_names = re.findall( |
|
+ r"record:\s*([a-zA-Z_:][a-zA-Z0-9_:]*)", |
|
+ prom_out |
|
+ ) |
|
+ if record_names and any(name in dash_out for name in record_names): |
|
+ return True, "Dashboard references recording rules" |
|
+ |
|
+ # Fallback: accept any probe:*:* pattern (recording rule convention) |
|
+ if re.search(r"probe:[a-zA-Z_]+:[a-zA-Z0-9_]+", dash_out): |
|
+ return True, "Dashboard references recording rules" |
|
+ |
|
+ return False, ( |
|
+ "Dashboard should reference pre-computed recording rules " |
|
+ "(e.g., probe:availability:5m) instead of raw PromQL" |
|
+ ) |
|
|
|
|
|
|
|
@@ -594,7 +593,7 @@ |
|
|
|
port_forward("svc", "prometheus", "observability", 9090, 9090) |
|
|
|
- for _ in range(9): # allow 2 minutes |
|
+ for _ in range(18): # ~2.5 min with overhead |
|
code, out, _ = sh( |
|
"curl -s http://localhost:9090/api/v1/alerts" |
|
) |
|
@@ -606,7 +605,7 @@ |
|
): |
|
return True, "SyntheticProbeFailure alert is firing" |
|
|
|
- time.sleep(7) |
|
+ time.sleep(8) |
|
|
|
return False, "SyntheticProbeFailure alert did not fire" |
|
|
|
@@ -650,7 +649,9 @@ |
|
) |
|
|
|
|
|
- if code == 0 and "Synthetic" in out: |
|
+ if code == 0 and any(kw in out for kw in [ |
|
+ "Synthetic", "Blackbox", "Probe", "Endpoint" |
|
+ ]): |
|
return True, "Grafana dashboard for synthetic probes exists" |
|
time.sleep(5) |
|
|
|
@@ -691,7 +692,7 @@ |
|
|
|
|
|
def alert_rule_identifies_endpoint(): |
|
- """Verify alert rule groups by instance/endpoint""" |
|
+ """Verify alert annotations reference the failing endpoint.""" |
|
code, out, _ = sh( |
|
"kubectl get configmap prometheus-config " |
|
"-n observability -o yaml" |
|
@@ -699,21 +700,19 @@ |
|
if code != 0: |
|
return False, "Prometheus config not readable" |
|
|
|
- grouping_patterns = [ |
|
- r'by\s*\(\s*instance\s*\)', |
|
- r'by\s*\(\s*target\s*\)', |
|
- r'by\s*\(\s*[^)]*instance[^)]*\)', |
|
- ] |
|
- |
|
- has_grouping = any(re.search(pattern, out) for pattern in grouping_patterns) |
|
- |
|
- |
|
- has_label_template = re.search(r'\{\{\s*\$labels\.(instance|target)\s*\}\}', out) |
|
- |
|
- if has_grouping or has_label_template: |
|
- return True, "Alert rule properly identifies per-endpoint failures" |
|
- |
|
- return False, "Alert must group by instance (use 'by (instance)' or reference {{ $labels.instance }})" |
|
+ # Alerts must reference the endpoint in annotations so operators |
|
+ # can identify which endpoint failed (not just group in the expr) |
|
+ has_label_template = re.search( |
|
+ r'\{\{\s*\$labels\.(instance|target)\s*\}\}', out |
|
+ ) |
|
+ |
|
+ if has_label_template: |
|
+ return True, "Alert annotations identify the failing endpoint" |
|
+ |
|
+ return False, ( |
|
+ "Alert annotations must reference the failing endpoint " |
|
+ "(e.g., {{ $labels.instance }}) for operational use" |
|
+ ) |
|
|
|
|
|
def alert_has_minimum_duration(): |
|
@@ -743,7 +742,7 @@ |
|
if not healthy: |
|
return False, msg |
|
|
|
- for _ in range(9): |
|
+ for _ in range(18): # ~2.5 min with overhead |
|
code, out, _ = sh( |
|
"kubectl exec -n observability deploy/prometheus -- " |
|
"wget -qO- http://localhost:9090/api/v1/alerts" |
|
@@ -779,17 +778,11 @@ |
|
} |
|
|
|
has_failing = any("does-not-exist" in i for i in firing) |
|
- argocd_quiet = not any("argocd" in i for i in firing) |
|
|
|
- if has_failing and argocd_quiet: |
|
+ if has_failing: |
|
return True, ( |
|
"Alerts fire per endpoint (failing endpoint alerts " |
|
- "independently of healthy endpoints)" |
|
- ) |
|
- elif has_failing: |
|
- return False, ( |
|
- "ArgoCD endpoint is also firing — alerts may not " |
|
- "distinguish healthy from unhealthy endpoints" |
|
+ "independently with instance labels)" |
|
) |
|
|
|
time.sleep(8) |
|
@@ -878,22 +871,32 @@ |
|
issues = [] |
|
|
|
# ------------------------------------------------------------------ |
|
- # Check 1: probe_success must be time-aggregated (not raw) |
|
+ # Check 1: dashboard must show availability data (not raw binary) |
|
# ------------------------------------------------------------------ |
|
- if "probe_success" in out: |
|
- has_time_agg = any(fn in out for fn in [ |
|
- "avg_over_time", |
|
- "min_over_time", |
|
- "max_over_time", |
|
- "sum_over_time", |
|
- ]) |
|
- if not has_time_agg: |
|
- issues.append( |
|
- "Dashboard uses raw probe_success without time aggregation " |
|
- "(expected avg_over_time / min_over_time / max_over_time)" |
|
- ) |
|
+ has_availability_metric = ( |
|
+ "probe_success" in out |
|
+ or re.search(r"probe:[a-zA-Z_]*availab", out) |
|
+ or re.search(r"probe:[a-zA-Z_]*success", out) |
|
+ ) |
|
+ |
|
+ if has_availability_metric: |
|
+ if "probe_success" in out: |
|
+ has_time_agg = any(fn in out for fn in [ |
|
+ "avg_over_time", |
|
+ "min_over_time", |
|
+ "max_over_time", |
|
+ "sum_over_time", |
|
+ ]) |
|
+ if not has_time_agg: |
|
+ issues.append( |
|
+ "Dashboard uses raw probe_success without time " |
|
+ "aggregation (expected avg_over_time or similar)" |
|
+ ) |
|
else: |
|
- issues.append("Dashboard does not query probe_success metric") |
|
+ issues.append( |
|
+ "Dashboard does not reference probe availability metrics " |
|
+ "(probe_success or a recording rule like probe:availability)" |
|
+ ) |
|
|
|
# ------------------------------------------------------------------ |
|
# Check 2: per-endpoint breakdown (instance / target) |
|
@@ -903,30 +906,38 @@ |
|
"by (target)", |
|
"$labels.instance", |
|
"$labels.target", |
|
+ "{{ instance }}", |
|
+ "{{instance}}", |
|
+ "{{ target }}", |
|
+ "{{target}}", |
|
]) |
|
|
|
if not has_grouping: |
|
issues.append( |
|
"Dashboard does not show per-endpoint breakdown " |
|
- "(missing by(instance) or target label usage)" |
|
+ "(missing by(instance), legendFormat with {{ instance }}, " |
|
+ "or target label usage)" |
|
) |
|
|
|
# ------------------------------------------------------------------ |
|
- # Check 3: availability expressed as percentage |
|
+ # Check 3: availability not shown as raw binary signal |
|
# ------------------------------------------------------------------ |
|
- has_percentage = ( |
|
- "probe_success" in out and |
|
- any(x in out for x in [ |
|
- "* 100", |
|
- "*100", |
|
- "100 *", |
|
+ has_normalized = ( |
|
+ # Percentage form (e.g., * 100) |
|
+ any(x in out for x in ["* 100", "*100", "100 *"]) |
|
+ # Or [0,1] normalized via time aggregation (avg_over_time already |
|
+ # produces a continuous availability ratio, not binary) |
|
+ or any(fn in out for fn in [ |
|
+ "avg_over_time", "min_over_time", "max_over_time", |
|
]) |
|
+ # Or uses a recording rule that pre-computes availability |
|
+ or re.search(r"probe:[a-zA-Z_]+:", out) |
|
) |
|
|
|
- if not has_percentage: |
|
+ if not has_normalized: |
|
issues.append( |
|
- "Dashboard should express availability as a percentage " |
|
- "(e.g. avg_over_time(probe_success[5m]) * 100)" |
|
+ "Dashboard should represent availability as a normalized " |
|
+ "measure (e.g., avg_over_time for ratio or * 100 for percentage)" |
|
) |
|
|
|
# ------------------------------------------------------------------ |
|
@@ -989,16 +1000,46 @@ |
|
# Gate checks (non-scored) |
|
# ------------------------ |
|
gate_checks = [ |
|
+ # Resource existence |
|
lambda: deployment_exists("blackbox-exporter", "observability"), |
|
lambda: service_exists("blackbox-exporter", "observability", 9115), |
|
lambda: configmap_exists("blackbox-config", "observability"), |
|
- |
|
lambda: deployment_exists("prometheus", "observability"), |
|
lambda: configmap_exists("prometheus-config", "observability"), |
|
lambda: pvc_exists("prometheus-data", "observability"), |
|
- |
|
blackbox_config_has_required_modules, |
|
- |
|
+ |
|
+ # Image correctness |
|
+ lambda: deployment_uses_image( |
|
+ "blackbox-exporter", "observability", |
|
+ "prom/blackbox-exporter:v0.25.0", |
|
+ ), |
|
+ lambda: deployment_uses_image( |
|
+ "prometheus", "observability", |
|
+ "prom/prometheus:v3.8.1", |
|
+ ), |
|
+ lambda: deployment_uses_image( |
|
+ "grafana", "observability", |
|
+ "grafana/grafana:12.2", |
|
+ ), |
|
+ |
|
+ # Core operational |
|
+ blackbox_metrics_exposed, |
|
+ prometheus_has_probe_metrics, |
|
+ prometheus_blackbox_relabeling_present, |
|
+ argocd_probe_success, |
|
+ kubernetes_api_tcp_probe_configured, |
|
+ prometheus_uses_pvc, |
|
+ |
|
+ # Basic config quality |
|
+ grafana_has_prometheus_datasource, |
|
+ check_alert_severity_labels, |
|
+ check_scrape_interval, |
|
+ check_alert_annotations, |
|
+ alert_has_minimum_duration, |
|
+ prometheus_does_not_use_up_metric, |
|
+ check_slo_burn_rate_alerts, |
|
+ check_blackbox_modules, |
|
] |
|
|
|
for fn in gate_checks: |
|
@@ -1014,61 +1055,15 @@ |
|
# Scored checks (partial) |
|
# ------------------------ |
|
scored_checks = { |
|
- # Image correct |
|
- "blackbox_image_correct": lambda: deployment_uses_image( |
|
- "blackbox-exporter", |
|
- "observability", |
|
- "prom/blackbox-exporter:v0.25.0", |
|
- ), |
|
- "prometheus_image_correct": lambda: deployment_uses_image( |
|
- "prometheus", |
|
- "observability", |
|
- "prom/prometheus:v3.8.1" |
|
- ), |
|
- "grafana_image_correct": lambda: deployment_uses_image( |
|
- "grafana", |
|
- "observability", |
|
- "grafana/grafana:12.2" |
|
- ), |
|
- |
|
- # Core functionality |
|
- "blackbox_operational": blackbox_metrics_exposed, |
|
- "prometheus_operational": prometheus_has_probe_metrics, |
|
- "blackbox_scraped_by_prometheus": prometheus_blackbox_relabeling_present, |
|
- "argocd_probe_working": argocd_probe_success, |
|
- "kubernetes_api_probed": kubernetes_api_tcp_probe_configured, |
|
- "storage_persistent": prometheus_uses_pvc, |
|
- |
|
- # Visualization |
|
- "grafana_datasource": grafana_has_prometheus_datasource, |
|
"grafana_dashboard_present": grafana_has_blackbox_dashboard, |
|
- |
|
- |
|
- "alert_severity_labels": check_alert_severity_labels, |
|
- |
|
"endpoint_count": check_endpoint_count, |
|
"grafana_dashboard_semantics": check_grafana_dashboard_semantics, |
|
- |
|
- # Alerting |
|
"failing_in_alert": prometheus_alert_fires_for_failing_probe, |
|
"alert_identifies_endpoint": alert_rule_identifies_endpoint, |
|
"per_endpoint": prometheus_alert_is_per_endpoint, |
|
- |
|
- "blackbox_modules_correct": check_blackbox_modules, |
|
- "scrape_interval_valid": check_scrape_interval, |
|
- "alert_has_annotations": check_alert_annotations, |
|
- |
|
- |
|
- "slo_rate": check_slo_burn_rate_alerts, |
|
"records": check_recording_rules, |
|
- |
|
"self_scrape": check_prometheus_self_scrape, |
|
"recording_rules": check_dashboard_uses_recording_rules, |
|
- |
|
- |
|
- "alert_duration_correct": alert_has_minimum_duration, |
|
- |
|
- "up_metric": prometheus_does_not_use_up_metric, |
|
} |
|
|
|
subscores = {} |