|
<?php |
|
/** |
|
* Elasticsearch Cluster Health Check (Nagios-friendly) |
|
* ---------------------------------------------------- |
|
* Drop this single file on any PHP-enabled web server. |
|
* It returns: |
|
* - 200 OK when everything looks healthy |
|
* - 429 Too Many Requests for warnings (degraded but serving) |
|
* - 503 Service Unavailable for critical issues |
|
* - 500 Internal Server Error if the checker itself fails |
|
* |
|
* First line of the body is a concise, Nagios-style summary. |
|
* The rest of the body is JSON with detailed findings. |
|
* |
|
* Configuration (env vars OR query params override): |
|
* ES_URL default: http://127.0.0.1:9200 |
|
* ES_USER / ES_PASS optional basic auth |
|
* ES_API_KEY optional: sends as Authorization: ApiKey <key> |
|
* TIMEOUT_SEC default: 3 |
|
* VERIFY_TLS default: 1 (set 0 to skip verify) |
|
* CACHE_BACKEND file|redis|memcached|none (default: file) |
|
* CACHE_TTL_SEC default: 30 |
|
* CACHE_FILE default: /tmp/es_health_cache.json |
|
* REDIS_URL e.g. redis://127.0.0.1:6379/0 |
|
* MEMCACHED_HOST e.g. 127.0.0.1:11211 |
|
* DISK_WARN default: 0.80 (80%) |
|
* DISK_CRIT default: 0.95 (95%) |
|
 * LOAD_FACTOR_WARN default: 1.5 (5-minute load average >= 1.5 * cpu cores)
|
 * LOAD_FACTOR_CRIT default: 3.0 (5-minute load average >= 3.0 * cpu cores)
|
* MASTER_CHANGE_WINDOW_SEC default: 3600 (1h) |
|
* |
|
* Optional checks you may toggle (1/0): |
|
* CHECK_PENDING_TASKS default: 1 |
|
* CHECK_UNASSIGNED_SHARDS default: 1 |
|
* |
|
* Tested with PHP 7.2+ and ES 7.x/8.x. |
|
*/ |
|
|
|
// --------------------------- Config helpers --------------------------- |
|
/**
 * Resolve a configuration value by name.
 *
 * Lookup order: the query-string parameter $_GET[$key], then the environment
 * variable of the same name, then $default. A value that is missing or an
 * empty string yields $default. An empty query parameter does NOT fall back
 * to the environment; it goes straight to $default.
 *
 * Query parameters written with array syntax (e.g. ?ES_URL[]=x) arrive as
 * arrays. They are ignored so callers always get a scalar and string helpers
 * such as rtrim()/strtolower() are never handed an array.
 *
 * @param string $key     Name of the query parameter / environment variable.
 * @param mixed  $default Value returned when nothing usable is configured.
 * @return mixed The configured string, or $default.
 */
function cfg($key, $default) {
    $val = null;
    if (isset($_GET[$key]) && is_string($_GET[$key])) {
        $val = $_GET[$key];
    } elseif (($env = getenv($key)) !== false) {
        $val = $env;
    }
    return ($val === null || $val === '') ? $default : $val;
}
|
|
|
/**
 * Look up a setting through cfg() and convert the result to a float.
 *
 * @param string $key     Setting name.
 * @param mixed  $default Fallback handed to cfg().
 * @return float
 */
function cfgf($key, $default) {
    $raw = cfg($key, $default);
    return (float) $raw;
}
|
/**
 * Look up a setting through cfg() and convert the result to an integer.
 *
 * @param string $key     Setting name.
 * @param mixed  $default Fallback handed to cfg().
 * @return int
 */
function cfgi($key, $default) {
    $raw = cfg($key, $default);
    return (int) $raw;
}
|
|
|
// ---- Elasticsearch endpoint and credentials.
// The trailing slash is stripped because the request paths appended to it
// already start with "/".
$ES_URL     = rtrim(cfg('ES_URL', 'http://127.0.0.1:9200'), '/');
$ES_USER    = cfg('ES_USER', '');
$ES_PASS    = cfg('ES_PASS', '');
$ES_API_KEY = cfg('ES_API_KEY', '');

// Used as both the connect timeout and the total timeout of each request.
// Clamped to at least one second: a value of 0 (also what a non-numeric
// setting turns into) tells libcurl to wait without a total timeout, which
// would let a monitoring probe hang.
$TIMEOUT    = max(1, cfgi('TIMEOUT_SEC', 3));
$VERIFY_TLS = cfgi('VERIFY_TLS', 1) ? true : false;

// ---- Cache settings.
$CACHE_BACKEND  = strtolower(cfg('CACHE_BACKEND', 'file'));
$CACHE_TTL      = cfgi('CACHE_TTL_SEC', 30);
$CACHE_FILE     = cfg('CACHE_FILE', '/tmp/es_health_cache.json');
$REDIS_URL      = cfg('REDIS_URL', '');
$MEMCACHED_HOST = cfg('MEMCACHED_HOST', '127.0.0.1:11211');

// ---- Thresholds.
// Disk values are fractions of used space (0.80 = 80%); load values are
// multiplied by the node's core count before being compared with its load.
$DISK_WARN  = cfgf('DISK_WARN', 0.80);
$DISK_CRIT  = cfgf('DISK_CRIT', 0.95);
$LOAD_WARN  = cfgf('LOAD_FACTOR_WARN', 1.5);
$LOAD_CRIT  = cfgf('LOAD_FACTOR_CRIT', 3.0);
$MASTER_WIN = cfgi('MASTER_CHANGE_WINDOW_SEC', 3600);

// ---- Optional checks: enabled only when the setting is exactly 1.
$CHECK_PENDING_TASKS     = cfgi('CHECK_PENDING_TASKS', 1) === 1;
$CHECK_UNASSIGNED_SHARDS = cfgi('CHECK_UNASSIGNED_SHARDS', 1) === 1;
|
|
|
// --------------------------- Cache layer --------------------------- |
|
/**
 * Minimal key/value cache contract implemented by the cache backends in this file.
 */
interface CacheLike {
    /**
     * Fetch the value stored under $k.
     *
     * @param string $k Cache key.
     * @return mixed
     */
    public function get($k);

    /**
     * Store $v under $k with a lifetime of $ttl.
     *
     * @param string $k   Cache key.
     * @param mixed  $v   Value to store.
     * @param int    $ttl Lifetime of the entry.
     */
    public function set($k, $v, $ttl);
}
|
|
|
/**
 * Cache backend that keeps every entry in one JSON file.
 *
 * File layout: { "<key>": { "val": <value>, "exp": <unix expiry time> }, ... }.
 * Values go through json_encode()/json_decode(..., true), so they come back
 * as arrays/scalars.
 */
class FileCache implements CacheLike {
    /** @var string Path of the JSON cache file. */
    private $file;

    /**
     * @param string $file Path of the cache file; its directory is created when missing.
     * @throws Exception When the directory cannot be created or is not writable.
     */
    public function __construct($file) {
        $this->file = $file;
        $dir = dirname($file);
        if (!is_dir($dir)) {
            // Try to create the directory if it doesn't exist. The second
            // is_dir() covers the case where mkdir() failed because another
            // request created the directory in the meantime.
            if (!@mkdir($dir, 0775, true) && !is_dir($dir)) {
                throw new Exception("Cache directory not writable/creatable: $dir");
            }
        }
        if (!is_writable($dir)) {
            throw new Exception("Cache directory not writable: $dir");
        }
    }

    /**
     * Return the stored value for $k, or null when the file is missing or
     * unreadable, the key is absent, the entry is malformed, or it has expired.
     *
     * @param string $k Cache key.
     * @return mixed|null
     */
    public function get($k) {
        $data = $this->readAll();
        if (!isset($data[$k])) return null;
        $entry = $data[$k];
        if (!isset($entry['exp']) || !isset($entry['val'])) return null;
        if (time() > intval($entry['exp'])) return null;
        return $entry['val'];
    }

    /**
     * Store $v under $k for at least one second, keeping the other entries of the file.
     *
     * The new content is written to a temporary file and renamed over the
     * cache file so readers never see a half-written file. The temporary name
     * is unique per call: a fixed name would be shared by concurrent requests,
     * letting one request rename a file another is still writing.
     *
     * @param string $k   Cache key.
     * @param mixed  $v   JSON-encodable value.
     * @param int    $ttl Lifetime in seconds (values below 1 are raised to 1).
     * @throws Exception When encoding or writing fails.
     */
    public function set($k, $v, $ttl) {
        $bucket = $this->readAll();
        $bucket[$k] = ['val' => $v, 'exp' => time() + max(1, intval($ttl))];

        $json = json_encode($bucket, JSON_PRETTY_PRINT);
        if ($json === false) {
            throw new Exception('Failed to encode cache JSON');
        }

        $tmp = $this->file . '.' . uniqid((string) getmypid(), true) . '.tmp';
        if (file_put_contents($tmp, $json, LOCK_EX) === false) {
            throw new Exception("Failed writing cache tmp file: $tmp");
        }

        if (!@rename($tmp, $this->file)) {
            // Fallback: drop the temporary file and write the target directly.
            @unlink($tmp);
            if (file_put_contents($this->file, $json, LOCK_EX) === false) {
                throw new Exception("Failed writing cache file: {$this->file}");
            }
        }
    }

    /**
     * Load the whole cache file as an array; any problem (missing, unreadable,
     * empty or non-JSON file) yields an empty array.
     *
     * @return array
     */
    private function readAll() {
        if (!file_exists($this->file) || !is_readable($this->file)) return [];
        $raw = file_get_contents($this->file);
        if ($raw === false || $raw === '') return [];
        $decoded = json_decode($raw, true);
        return is_array($decoded) ? $decoded : [];
    }
}
|
|
|
/**
 * Cache backend on top of the phpredis extension. Values are stored as JSON strings.
 */
class RedisCache implements CacheLike {
    /** @var Redis Connected client. */
    private $r;

    /**
     * @param string $url Connection URL such as redis://:password@127.0.0.1:6379/0.
     *                    Missing parts default to host 127.0.0.1, port 6379, database 0.
     * @throws Exception When the extension is missing or connect/auth/select fails.
     */
    public function __construct($url) {
        if (!class_exists('Redis')) throw new Exception('Redis ext not installed');

        $parts = parse_url($url);
        // parse_url() returns false for seriously malformed URLs; use the defaults then.
        if (!is_array($parts)) $parts = [];

        $host = $parts['host'] ?? '127.0.0.1';
        $port = intval($parts['port'] ?? 6379);
        // "/3" -> 3; a missing, empty or "/" path -> 0.
        $db   = intval(trim($parts['path'] ?? '', '/'));
        $pass = $parts['pass'] ?? null;

        $this->r = new Redis();
        // A false return is turned into an exception so a dead server is
        // reported here instead of on the first get()/set().
        if (!$this->r->connect($host, $port, 1.5)) {
            throw new Exception("Redis connect failed: $host:$port");
        }
        if ($pass !== null && $pass !== '' && !$this->r->auth($pass)) {
            throw new Exception('Redis auth failed');
        }
        if (!$this->r->select($db)) {
            throw new Exception("Redis select failed for database $db");
        }
    }

    /**
     * @param string $k Cache key.
     * @return mixed|null Decoded value, or null when there is no stored string for the key.
     */
    public function get($k) {
        $v = $this->r->get($k);
        // Test for "a non-empty string came back" rather than truthiness, so a
        // stored payload of "0" is decoded instead of being treated as a miss.
        return (is_string($v) && $v !== '') ? json_decode($v, true) : null;
    }

    /**
     * @param string $k   Cache key.
     * @param mixed  $v   JSON-encodable value.
     * @param int    $ttl Lifetime in seconds (values below 1 are raised to 1,
     *                    since SETEX rejects a non-positive expiry).
     * @throws Exception When the value cannot be JSON-encoded.
     */
    public function set($k, $v, $ttl) {
        $json = json_encode($v);
        if ($json === false) {
            throw new Exception('Failed to encode cache JSON');
        }
        $this->r->setex($k, max(1, intval($ttl)), $json);
    }
}
|
|
|
/**
 * Cache backend on top of the Memcached extension.
 */
class MemdCache implements CacheLike {
    /** @var Memcached Client with one server registered. */
    private $m;

    /**
     * @param string $hostport "host" or "host:port"; the port defaults to 11211.
     * @throws Exception When the Memcached extension is not available.
     */
    public function __construct($hostport) {
        if (!class_exists('Memcached')) {
            throw new Exception('Memcached ext not installed');
        }

        $client = new Memcached();

        // Split on the first colon only; everything after it is the port.
        $pieces = explode(':', $hostport, 2);
        $host = $pieces[0];
        $port = isset($pieces[1]) ? $pieces[1] : 11211;

        $client->addServer($host, intval($port));
        $this->m = $client;
    }

    /**
     * @param string $k Cache key.
     * @return mixed|null The stored value, or null when the client returns false.
     */
    public function get($k) {
        $hit = $this->m->get($k);
        if ($hit === false) {
            return null;
        }
        return $hit;
    }

    /**
     * @param string $k   Cache key.
     * @param mixed  $v   Value to store.
     * @param int    $ttl Expiration passed straight to Memcached::set().
     */
    public function set($k, $v, $ttl) {
        $this->m->set($k, $v, $ttl);
    }
}
|
|
|
/**
 * Build the cache backend selected by configuration.
 *
 * Degradation order when something fails:
 *   1. the requested backend (redis / memcached / file),
 *   2. the file backend, if a different backend was requested,
 *   3. no cache at all (null).
 *
 * This function never throws, so a broken cache cannot take the health check
 * down before it starts. Failures are reported through error_log().
 *
 * @param string $backend  'redis', 'memcached', 'none'; anything else means file.
 * @param string $file     Path for the file backend.
 * @param string $redisUrl Connection URL for the redis backend.
 * @param string $memdHost "host[:port]" for the memcached backend.
 * @return CacheLike|null Cache instance, or null when caching is disabled or unavailable.
 */
function make_cache($backend, $file, $redisUrl, $memdHost) {
    if ($backend === 'none') {
        return null;
    }

    try {
        switch ($backend) {
            case 'redis':     return new RedisCache($redisUrl);
            case 'memcached': return new MemdCache($memdHost);
            default:          return new FileCache($file);
        }
    } catch (Throwable $e) {
        error_log('es health check: cache backend "' . $backend . '" unavailable: ' . $e->getMessage());
    }

    // Retrying the file backend only makes sense when it was not the one that just failed.
    if ($backend === 'redis' || $backend === 'memcached') {
        try {
            return new FileCache($file);
        } catch (Throwable $e) {
            error_log('es health check: file cache fallback unavailable: ' . $e->getMessage());
        }
    }

    // Last resort: run uncached rather than abort.
    return null;
}
|
|
|
// Shared cache handle for the checks below; a null handle means "do not cache".
$CACHE = make_cache(
    $CACHE_BACKEND,
    $CACHE_FILE,
    $REDIS_URL,
    $MEMCACHED_HOST
);
|
|
|
// --------------------------- HTTP helper --------------------------- |
|
/**
 * Perform a GET request against Elasticsearch and return the decoded JSON body.
 *
 * @param string $base      Base URL without trailing slash, e.g. http://127.0.0.1:9200.
 * @param string $path      Request path starting with "/", appended to $base.
 * @param int    $timeout   Seconds, used for both the connect and the total timeout.
 * @param bool   $verifyTls Whether to verify the TLS peer certificate and host name.
 * @param string $user      Basic-auth user ('' for none).
 * @param string $pass      Basic-auth password ('' for none).
 * @param string $apiKey    When non-empty, sent as "Authorization: ApiKey <key>".
 * @return array Decoded JSON document.
 * @throws Exception On transport errors, HTTP status >= 400, or a body that
 *                   does not decode to a JSON array/object.
 */
function es_get($base, $path, $timeout, $verifyTls, $user, $pass, $apiKey) {
    $url = $base . $path;

    $ch = curl_init($url);
    if ($ch === false) {
        throw new Exception('curl error: could not initialise a handle for ' . $url);
    }

    $headers = ['Accept: application/json'];
    if ($apiKey) $headers[] = 'Authorization: ApiKey ' . $apiKey;

    $options = [
        CURLOPT_RETURNTRANSFER => true,
        CURLOPT_CONNECTTIMEOUT => $timeout,
        CURLOPT_TIMEOUT        => $timeout,
        CURLOPT_HTTPHEADER     => $headers,
        CURLOPT_SSL_VERIFYPEER => $verifyTls,
        CURLOPT_SSL_VERIFYHOST => $verifyTls ? 2 : 0,
        // The base URL is configurable (in this file even through the query
        // string), so keep libcurl away from file://, gopher://, dict:// etc.
        CURLOPT_PROTOCOLS      => CURLPROTO_HTTP | CURLPROTO_HTTPS,
    ];
    if ($user !== '' || $pass !== '') {
        $options[CURLOPT_USERPWD] = $user . ':' . $pass;
    }
    curl_setopt_array($ch, $options);

    $body   = curl_exec($ch);
    $status = curl_getinfo($ch, CURLINFO_HTTP_CODE);
    $err    = curl_error($ch);
    curl_close($ch);

    if ($body === false) throw new Exception('curl error: ' . $err);
    // Only the first 500 bytes of an error body go into the message.
    if ($status >= 400) throw new Exception('HTTP ' . $status . ' for ' . $url . ' body=' . substr($body, 0, 500));

    $json = json_decode($body, true);
    if (!is_array($json)) throw new Exception('Invalid JSON from ' . $url);
    return $json;
}
|
|
|
/**
 * Short fingerprint of a value: the first 12 hex characters of the SHA-1
 * digest of its JSON encoding.
 *
 * @param mixed $x Value to fingerprint.
 * @return string 12 lowercase hex characters.
 */
function stable_hash($x) {
    $encoded = json_encode($x);
    $digest = sha1($encoded);
    return substr($digest, 0, 12);
}
|
|
|
// --------------------------- Run checks --------------------------- |
|
// Result accumulators: message lists plus the per-topic detail sections that
// end up in the JSON part of the response.
$issues = $warnings = $criticals = [];

$details = [
    'nodes'        => [],
    'versions'     => [],
    'jvm_versions' => [],
    'disk'         => [],
    'loads'        => [],
    'cluster'      => [],
    'master'       => [],
];
|
|
|
// Main flow: collect data from Elasticsearch, evaluate it, print the result, exit.
//   200 = OK, 429 = warnings only, 503 = at least one critical finding,
//   500 = the check itself failed (any exception thrown below).
try {
    // Every request uses the same connection settings; only the path varies.
    $fetch = function ($path) use ($ES_URL, $TIMEOUT, $VERIFY_TLS, $ES_USER, $ES_PASS, $ES_API_KEY) {
        return es_get($ES_URL, $path, $TIMEOUT, $VERIFY_TLS, $ES_USER, $ES_PASS, $ES_API_KEY);
    };

    // ---- Core data: health + node info + master id, cached briefly to reduce ES load
    $cacheKey = 'es_health_core';
    $core = $CACHE ? $CACHE->get($cacheKey) : null;

    // A cached entry is only usable when all three parts are present;
    // anything else (miss, truncated or foreign data) triggers a refetch.
    if (!is_array($core) || !isset($core['health'], $core['nodes'], $core['state'])) {
        $core = [
            'health' => $fetch('/_cluster/health'),
            'nodes'  => $fetch('/_nodes'),
            'state'  => $fetch('/_cluster/state/master_node'),
        ];
        if ($CACHE) $CACHE->set($cacheKey, $core, max(5, $CACHE_TTL));
    }

    $health = $core['health'];
    $nodes  = $core['nodes'];
    $state  = $core['state'];

    // ---- Per-node facts from _nodes (NOT stats): core count, ES version, JVM version
    $nodeInfos = $nodes['nodes'] ?? [];
    if (!is_array($nodeInfos)) $nodeInfos = [];

    $nodeCores   = [];
    $esVersions  = [];
    $jvmVersions = [];
    foreach ($nodeInfos as $nodeId => $n) {
        $cores = $n['os']['allocated_processors']
            ?? $n['os']['available_processors']
            ?? 1;
        $nodeCores[$nodeId] = max(1, intval($cores));

        $esVersions[$n['version'] ?? 'unknown'] = true;
        $jvmVersions[$n['jvm']['version'] ?? ($n['jvm']['vm_version'] ?? 'unknown')] = true;
    }

    // ---- Pending tasks (optional); a failure here must not fail the whole check
    $pending = null;
    if ($CHECK_PENDING_TASKS) {
        try { $pending = $fetch('/_cluster/pending_tasks'); }
        catch (Throwable $e) { $pending = ['error' => $e->getMessage()]; }
    }

    // ---- Node stats for disk + load (a failure here fails the check)
    $stats = $fetch('/_nodes/stats/os,fs');

    // ---- Unassigned shards detail (optional, only fetched when there are any)
    $unassignedCount = intval($health['unassigned_shards'] ?? 0);
    $unassigned = null;
    if ($CHECK_UNASSIGNED_SHARDS && $unassignedCount > 0) {
        try { $unassigned = $fetch('/_cat/shards?state=UNASSIGNED&format=json'); }
        catch (Throwable $e) { $unassigned = ['error' => $e->getMessage()]; }
    }

    // ---- Cluster status
    $details['cluster'] = $health;
    $clusterStatus = is_string($health['status'] ?? null) ? $health['status'] : 'unknown';
    if ($clusterStatus === 'red') {
        $criticals[] = 'cluster status is RED';
    } elseif ($clusterStatus === 'yellow') {
        $warnings[] = 'cluster status is YELLOW';
    } elseif ($clusterStatus !== 'green') {
        // A missing or unexpected status must not be reported as healthy.
        $warnings[] = 'cluster status is ' . strtoupper($clusterStatus);
    }

    if ($unassignedCount > 0) $warnings[] = $unassignedCount . ' unassigned shard(s)';

    // ---- Version skew (ES + JVM)
    $details['versions']     = array_keys($esVersions);
    $details['jvm_versions'] = array_keys($jvmVersions);
    if (count($esVersions) > 1)  $warnings[] = 'mismatched Elasticsearch versions across nodes';
    if (count($jvmVersions) > 1) $warnings[] = 'mismatched JVM versions across nodes';

    // ---- Disk & load per node
    $statsNodes = $stats['nodes'] ?? [];
    if (!is_array($statsNodes)) $statsNodes = [];
    foreach ($statsNodes as $id => $s) {
        $name = $s['name'] ?? $id;

        // Disk: used fraction = (total - available) / total
        $fs = $s['fs']['total'] ?? null;
        if ($fs && isset($fs['total_in_bytes']) && isset($fs['available_in_bytes'])) {
            $total = max(1, $fs['total_in_bytes']);   // guards the division
            $avail = $fs['available_in_bytes'];
            $usedP = ($total - $avail) / $total;
            $details['disk'][$name] = $usedP;
            if ($usedP >= $DISK_CRIT)     $criticals[] = sprintf('disk %.1f%% used on %s', $usedP*100, $name);
            elseif ($usedP >= $DISK_WARN) $warnings[]  = sprintf('disk %.1f%% used on %s', $usedP*100, $name);
        }

        // Load: 5-minute load average compared with (cores * factor).
        // Nodes missing from the _nodes response are treated as single-core.
        $cores = $nodeCores[$id] ?? 1;
        $load5 = floatval($s['os']['cpu']['load_average']['5m'] ?? 0);
        $details['loads'][$name] = ['load5' => $load5, 'cores' => $cores];
        if ($load5 >= $cores * $LOAD_CRIT)     $criticals[] = sprintf('high load %.2f on %s (cores=%d)', $load5, $name, $cores);
        elseif ($load5 >= $cores * $LOAD_WARN) $warnings[]  = sprintf('elevated load %.2f on %s (cores=%d)', $load5, $name, $cores);
    }

    // ---- Master changes
    // The cache holds { master, ts, changed }: the last master seen, when that
    // record was written, and whether it was written because the master
    // differed from the one before. The warning is raised on the request that
    // detects a change and on every later request until MASTER_CHANGE_WINDOW_SEC
    // has passed. Records without "changed" (older format) count as "no change".
    $masterNode = $state['master_node'] ?? null;
    $details['master'] = ['current_master' => $masterNode];

    $mcacheKey = 'es_master_state';
    $prev = $CACHE ? $CACHE->get($mcacheKey) : null;
    $now  = time();
    // Keep the record at least as long as the warning window.
    $masterTtl = max(86400, $MASTER_WIN);

    if (!is_array($prev)) {
        // First observation (or the record expired): nothing to compare against yet.
        if ($CACHE) $CACHE->set($mcacheKey, ['master' => $masterNode, 'ts' => $now, 'changed' => false], $masterTtl);
    } elseif (($prev['master'] ?? null) !== $masterNode) {
        // Master changed now.
        $warnings[] = 'master changed recently';
        $details['master']['previous_master'] = $prev['master'] ?? null;
        $details['master']['changed_at'] = $now;
        if ($CACHE) $CACHE->set($mcacheKey, ['master' => $masterNode, 'ts' => $now, 'changed' => true], $masterTtl);
    } elseif (!empty($prev['changed']) && ($now - intval($prev['ts'] ?? 0)) <= $MASTER_WIN) {
        // Same master as last time, but it took over within the window.
        $warnings[] = 'master changed recently';
        $details['master']['changed_at'] = intval($prev['ts'] ?? 0);
    }

    // ---- Pending tasks (can indicate master pressure)
    if (is_array($pending)) {
        if (isset($pending['tasks']) && is_array($pending['tasks'])) {
            $pt = count($pending['tasks']);
            if ($pt > 0) $warnings[] = $pt . ' pending cluster task(s)';
            $details['cluster']['pending_tasks'] = $pt;
        } elseif (isset($pending['error'])) {
            // Surface the failed lookup instead of dropping it silently.
            $details['cluster']['pending_tasks_error'] = $pending['error'];
        }
    }

    // ---- Unassigned shards detail
    if ($unassigned) $details['cluster']['unassigned_detail'] = $unassigned;

    // ---- Build output
    $statusCode = 200; $overall = 'OK';
    if (!empty($criticals)) { $statusCode = 503; $overall = 'CRIT'; }
    elseif (!empty($warnings) || $clusterStatus === 'yellow') { $statusCode = 429; $overall = 'WARN'; }

    $summaryParts = [];
    $summaryParts[] = 'cluster=' . strtoupper($clusterStatus);
    if (!empty($warnings))  $summaryParts[] = 'warn=' . count($warnings);
    if (!empty($criticals)) $summaryParts[] = 'crit=' . count($criticals);
    $summary = 'ES ' . $overall . ' - ' . implode(' ', $summaryParts);

    $body = [
        'overall'     => $overall,
        'http_status' => $statusCode,
        'summary'     => $summary,
        'warnings'    => $warnings,
        'criticals'   => $criticals,
        'details'     => $details,
        'es_url'      => $ES_URL,
    ];

    http_response_code($statusCode);
    // NOTE(review): the body is a plain-text summary line followed by JSON, so
    // it is not a valid JSON document as a whole although it is labelled as one.
    // Left unchanged because existing consumers may depend on this header.
    header('Content-Type: application/json');
    // Friendly first line for Nagios; then JSON for humans/tools
    echo $summary . "\n";
    echo json_encode($body, JSON_PRETTY_PRINT | JSON_UNESCAPED_SLASHES) . "\n";
    exit;
}
catch (Throwable $e) {
    // Anything that went wrong in the checker itself is reported as HTTP 500.
    // NOTE(review): the exception message is echoed to the client and can
    // contain the request URL; confirm that is acceptable when ES_URL embeds
    // credentials.
    http_response_code(500);
    header('Content-Type: application/json');
    $msg = 'ES CHECK ERROR - ' . $e->getMessage();
    $body = [ 'overall' => 'ERROR', 'http_status' => 500, 'summary' => $msg ];
    echo $msg . "\n";
    echo json_encode($body, JSON_PRETTY_PRINT | JSON_UNESCAPED_SLASHES) . "\n";
    exit;
}