Skip to content

Instantly share code, notes, and snippets.

@luishfonseca
Last active February 25, 2026 16:01
Show Gist options
  • Select an option

  • Save luishfonseca/6d63db9c0663ce5c52a842050777853b to your computer and use it in GitHub Desktop.

Select an option

Save luishfonseca/6d63db9c0663ce5c52a842050777853b to your computer and use it in GitHub Desktop.
#!/usr/bin/env python3
import asyncio
import argparse
import json
import urllib.request
def gtt_pressure(card: str, sysfs_root: str = "/sys/class/drm") -> float:
    """Return the fraction of GTT memory currently in use for a DRM card.

    Reads the integer values stored in ``mem_info_gtt_used`` and
    ``mem_info_gtt_total`` under ``{sysfs_root}/{card}/device``.

    Args:
        card: DRM card directory name, e.g. ``"card1"``.
        sysfs_root: Directory containing the per-card directories. Defaults
            to the path the original script hard-coded.

    Returns:
        ``used / total``, or ``0.0`` when the reported total is not positive.

    Raises:
        OSError: If either file cannot be opened or read.
        ValueError: If either file does not contain an integer.
    """
    device_dir = f"{sysfs_root}/{card}/device"
    with open(f"{device_dir}/mem_info_gtt_used") as f:
        used = int(f.read())
    with open(f"{device_dir}/mem_info_gtt_total") as f:
        total = int(f.read())
    # A zero total would raise ZeroDivisionError; with no GTT reported there
    # is nothing that eviction could relieve, so report no pressure.
    if total <= 0:
        return 0.0
    return used / total
def get_models(base: str) -> list:
    """Fetch ``{base}/models`` and return the ``"data"`` entry of its JSON body.

    Args:
        base: URL prefix of the server, without a trailing slash.

    Returns:
        Whatever the decoded JSON document holds under the ``"data"`` key.
    """
    endpoint = f"{base}/models"
    with urllib.request.urlopen(endpoint) as response:
        raw_body = response.read()
        document = json.loads(raw_body)
        return document["data"]
def unload_model(base: str, model_id: str) -> None:
    """POST ``{"model": model_id}`` as JSON to ``{base}/models/unload``.

    The response body is not read; the connection is simply closed once the
    request has completed.

    Args:
        base: URL prefix of the server, without a trailing slash.
        model_id: Identifier sent under the ``"model"`` key.
    """
    payload = json.dumps({"model": model_id}).encode()
    json_headers = {"Content-Type": "application/json"}
    endpoint = f"{base}/models/unload"
    request = urllib.request.Request(
        endpoint, data=payload, headers=json_headers, method="POST"
    )
    with urllib.request.urlopen(request) as _response:
        # Nothing to consume: only the side effect of the request matters.
        pass
def evict_lru(base: str) -> None:
    """Unload the loaded model with the smallest ``last_used`` value.

    Queries the server for its models, keeps those whose
    ``status["value"]`` equals ``"loaded"``, and unloads the one whose
    ``status["last_used"]`` is lowest. Prints a notice and does nothing
    else when no model is loaded.

    Args:
        base: URL prefix of the server, without a trailing slash.
    """
    candidates = []
    for model in get_models(base):
        if model["status"]["value"] == "loaded":
            candidates.append(model)

    if candidates:
        def last_used(model):
            return model["status"]["last_used"]

        victim = min(candidates, key=last_used)
        stamp = last_used(victim)
        print(f"evicting {victim['id']} (last_used={stamp})")
        unload_model(base, victim["id"])
    else:
        print("pressure high but no loaded models to evict")
async def watchdog(
    base: str,
    card: str,
    threshold: float,
    interval: float,
    cooldown: float = 10.0,
) -> None:
    """Poll GTT pressure forever and evict a model whenever it is too high.

    Each iteration reads the current pressure for ``card``, prints it when
    its value rounded to one decimal place differs from the last printed
    one, and calls ``evict_lru`` when the pressure exceeds ``threshold``.

    Args:
        base: URL prefix of the server, passed through to ``evict_lru``.
        card: DRM card directory name, passed through to ``gtt_pressure``.
        threshold: Pressure (a fraction) above which an eviction is attempted.
        interval: Seconds to sleep between polls.
        cooldown: Extra seconds to sleep after an eviction attempt before the
            regular ``interval`` sleep. Defaults to the previously hard-coded
            10 seconds.
    """
    last_bucket = None
    while True:
        pressure = gtt_pressure(card)
        # Only log when the pressure moves to a different 0.1-wide bucket so
        # that small fluctuations do not flood the output.
        bucket = round(pressure, 1)
        if bucket != last_bucket:
            last_bucket = bucket
            print(f"gtt: {pressure:.1%}")
        if pressure > threshold:
            try:
                evict_lru(base)
            except (OSError, ValueError, KeyError) as err:
                # URL/connection errors are OSError subclasses, malformed JSON
                # is a ValueError, and an unexpected response shape surfaces
                # as KeyError. None of these should stop the watchdog; the
                # next poll simply tries again.
                print(f"eviction failed: {err!r}")
            await asyncio.sleep(cooldown)
        await asyncio.sleep(interval)
if __name__ == "__main__":
    # Command-line entry point: parse options, validate them, run the loop.
    parser = argparse.ArgumentParser(description="llama.cpp GTT watchdog")
    parser.add_argument(
        "--url",
        default="http://localhost:8080",
        metavar="URL",
        help="base URL of the server (default: %(default)s)",
    )
    parser.add_argument(
        "--card",
        default="card1",
        metavar="CARD",
        help="DRM card name under /sys/class/drm (default: %(default)s)",
    )
    parser.add_argument(
        "--threshold",
        default=0.80,
        type=float,
        metavar="0-1",
        help="GTT usage fraction above which a model is evicted "
        "(default: %(default)s)",
    )
    parser.add_argument(
        "--interval",
        default=2.0,
        type=float,
        metavar="SEC",
        help="seconds between polls (default: %(default)s)",
    )
    args = parser.parse_args()

    # Reject values that would make the loop meaningless: a threshold outside
    # 0-1 never (or always) triggers, and a non-positive interval busy-loops.
    if not 0.0 <= args.threshold <= 1.0:
        parser.error("--threshold must be between 0 and 1")
    if not args.interval > 0:
        parser.error("--interval must be greater than 0")

    try:
        asyncio.run(watchdog(args.url, args.card, args.threshold, args.interval))
    except KeyboardInterrupt:
        # Ctrl-C is the normal way to stop the watchdog; exit without a traceback.
        pass
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment