Skip to content

Instantly share code, notes, and snippets.

@rndmcnlly
Created March 11, 2026 02:42
Show Gist options
  • Select an option

  • Save rndmcnlly/6680b63aeb26b5ea9ffbb56dab3030dc to your computer and use it in GitHub Desktop.

Select an option

Save rndmcnlly/6680b63aeb26b5ea9ffbb56dab3030dc to your computer and use it in GitHub Desktop.
Empirical test: OpenRouter 1h cache TTL -- prompt_cache_ttl vs cache_control.ttl
# /// script
# requires-python = ">=3.11"
# dependencies = ["httpx", "python-dotenv"]
# ///
"""
Empirical test: does OpenRouter's 1h cache TTL survive a 6-minute gap?
Tests three approaches against anthropic/claude-sonnet-4 via OpenRouter:
A) prompt_cache_ttl: 3600 (undocumented, used in PR #16850)
B) cache_control.ttl: "1h" (documented per-message approach)
C) baseline (no TTL extension, default 5-min expiry)
Protocol for each approach:
1. Send request with ~5k tokens of system context -> expect cache WRITE
2. Wait 6 minutes (past 5-min default TTL)
3. Send same request again -> expect cache HIT only if TTL > 5 min
Pass criteria:
- A and/or B show cached_tokens > 0 on the second request
- C shows cached_tokens == 0 on the second request (control)
Usage:
echo 'OPENROUTER_API_KEY=sk-or-...' > .env
uv run --script test_cache_ttl.py
Or:
export OPENROUTER_API_KEY=sk-or-...
uv run --script test_cache_ttl.py
Docs: https://openrouter.ai/docs/guides/best-practices/prompt-caching
"""
import hashlib
import os
import sys
import time
from dotenv import load_dotenv
import httpx
# Pull OPENROUTER_API_KEY (and anything else) from a local .env file, if present.
load_dotenv()
# OpenRouter's OpenAI-compatible chat-completions endpoint.
API = "https://openrouter.ai/api/v1/chat/completions"
MODEL = "anthropic/claude-sonnet-4"
DELAY = 6 * 60  # 6 minutes in seconds -- deliberately past the 5-min default cache TTL
RETRIES = 3  # attempts per request before aborting the whole script
TIMEOUT = 120  # per-request timeout, seconds
# ~5k tokens of deterministic filler (well above 2048 minimum for sonnet)
FILLER = ("The quick brown fox jumps over the lazy dog. " * 120 + "\n") * 10
key = os.environ.get("OPENROUTER_API_KEY")
if not key:
    print("error: set OPENROUTER_API_KEY in env or .env file", file=sys.stderr)
    sys.exit(1)
# Shared headers for every request; HTTP-Referer / X-Title are OpenRouter's
# optional app-attribution headers.
headers = {
    "Authorization": f"Bearer {key}",
    "Content-Type": "application/json",
    "HTTP-Referer": "https://github.com/anomalyco/opencode",
    "X-Title": "opencode-cache-ttl-test",
}
def tag(label: str) -> str:
    """Derive a stable 16-hex-char session id unique to this label and process."""
    seed = f"cache-ttl-test-{label}-{os.getpid()}"
    digest = hashlib.sha256(seed.encode()).hexdigest()
    return digest[:16]
def messages_plain():
    """Build the request messages with a plain-string system prompt (no cache_control)."""
    system_text = f"You are a test assistant. Reference material:\n{FILLER}"
    return [
        {"role": "system", "content": system_text},
        {"role": "user", "content": "Say 'ok' and nothing else."},
    ]
def messages_with_cc(ttl: str | None = None):
    """Build the request messages with a structured system part carrying cache_control.

    When a truthy *ttl* (e.g. "1h") is given it is attached to the
    cache_control marker; otherwise a bare {"type": "ephemeral"} marker is sent.
    """
    cache_control: dict = {"type": "ephemeral"}
    if ttl:
        cache_control["ttl"] = ttl
    system_part = {
        "type": "text",
        "text": f"You are a test assistant. Reference material:\n{FILLER}",
        "cache_control": cache_control,
    }
    return [
        {"role": "system", "content": [system_part]},
        {"role": "user", "content": "Say 'ok' and nothing else."},
    ]
def send(label: str, body: dict) -> dict:
    """POST *body* to the OpenRouter API and return cache-usage numbers.

    Retries up to RETRIES times with exponential backoff (2s, 4s, 8s) on
    timeouts and HTTP error statuses, exiting the whole script if every
    attempt fails.

    Returns a dict with keys "prompt", "cached", and "written" taken from
    the response's usage block (missing fields default to 0).
    """
    for attempt in range(1, RETRIES + 1):
        try:
            print(f" [{label}] request (attempt {attempt}/{RETRIES})...", flush=True)
            r = httpx.post(API, headers=headers, json=body, timeout=TIMEOUT)
            r.raise_for_status()
            data = r.json()
            usage = data.get("usage", {})
            details = usage.get("prompt_tokens_details", {})
            cached = details.get("cached_tokens", 0)
            written = details.get("cache_write_tokens", 0)
            prompt = usage.get("prompt_tokens", 0)
            print(f" [{label}] prompt={prompt} cached={cached} written={written}")
            return {"prompt": prompt, "cached": cached, "written": written}
        except (httpx.TimeoutException, httpx.HTTPStatusError) as e:
            # NOTE(review): transport errors other than timeouts (e.g. connection
            # resets) are intentionally not retried here and will propagate.
            print(f" [{label}] attempt {attempt} failed: {e}", flush=True)
            if attempt == RETRIES:
                print(f" [{label}] all {RETRIES} attempts failed, aborting", file=sys.stderr)
                sys.exit(1)
            wait = 2 ** attempt
            print(f" [{label}] retrying in {wait}s...", flush=True)
            time.sleep(wait)
    # Defensive: every loop path returns or exits. The original used
    # `assert False`, which is stripped under `python -O` and would fall
    # through to an implicit `return None`; raise explicitly instead.
    raise RuntimeError("unreachable")
def approach_a(session: str) -> dict:
    """Request body for approach A: top-level prompt_cache_ttl: 3600 (undocumented)."""
    body = {
        "model": MODEL,
        "messages": messages_plain(),
        "max_tokens": 8,
        "prompt_cache_key": session,
    }
    body["prompt_cache_ttl"] = 3600
    return body
def approach_b(session: str) -> dict:
    """Request body for approach B: per-message cache_control ttl '1h' (documented)."""
    return dict(
        model=MODEL,
        messages=messages_with_cc(ttl="1h"),
        max_tokens=8,
        prompt_cache_key=session,
    )
def approach_c(session: str) -> dict:
    """Request body for approach C, the control: no TTL extension at all."""
    payload = {
        "model": MODEL,
        "messages": messages_plain(),
        "max_tokens": 8,
        "prompt_cache_key": session,
    }
    return payload
def run():
    """Run the full experiment: prime caches, wait past the default TTL, re-test.

    Prints a per-approach results table and a verdict on which TTL-extension
    mechanism (if any) kept the cache alive past the 5-minute default.
    Takes roughly 7 minutes end to end (6-minute wait plus 6 requests).
    """
    # Label -> request-body builder; insertion order (A, B, C) is relied on below.
    approaches = {
        "A (prompt_cache_ttl=3600)": approach_a,
        "B (cache_control ttl=1h)": approach_b,
        "C (baseline, no TTL ext)": approach_c,
    }
    # One stable session/cache key per approach so R1 and R2 target the same cache entry.
    sessions = {name: tag(name) for name in approaches}
    results: dict[str, dict] = {}
    # --- Round 1: prime the caches ---
    print("=== Round 1: priming caches ===", flush=True)
    for name, build in approaches.items():
        r1 = send(f"{name} R1", build(sessions[name]))
        results[f"{name}_r1"] = r1
    # --- Wait: count down one second at a time, announcing each whole minute ---
    print(f"\n=== Waiting {DELAY}s ({DELAY // 60}m) to exceed 5-min default TTL ===", flush=True)
    for elapsed in range(DELAY):
        remaining = DELAY - elapsed
        if remaining % 60 == 0:
            print(f" {remaining // 60}m remaining...", flush=True)
        time.sleep(1)
    # --- Round 2: test cache survival ---
    print("\n=== Round 2: testing cache survival after 6-min gap ===", flush=True)
    for name, build in approaches.items():
        r2 = send(f"{name} R2", build(sessions[name]))
        results[f"{name}_r2"] = r2
    # --- Verdict table ---
    print("\n=== Results ===")
    print(f"{'Approach':<35} {'R1 cached':>10} {'R1 written':>11} {'R2 cached':>10} {'R2 written':>11} Verdict")
    print("-" * 100)
    for name in approaches:
        r1 = results[f"{name}_r1"]
        r2 = results[f"{name}_r2"]
        hit = r2["cached"] > 0
        verdict = "CACHE HIT (TTL > 5m)" if hit else "CACHE MISS (TTL <= 5m)"
        print(f"{name:<35} {r1['cached']:>10} {r1['written']:>11} {r2['cached']:>10} {r2['written']:>11} {verdict}")
    print()
    # Derive the per-approach hit flags from the approaches dict itself rather
    # than re-typing the label strings (the original duplicated the literals,
    # which would KeyError after the 6-minute wait if a label ever changed).
    a_hit, b_hit, c_hit = (results[f"{label}_r2"]["cached"] > 0 for label in approaches)
    if c_hit:
        print("WARNING: baseline also got a cache hit -- 6 min may not have been enough,")
        print(" or OpenRouter changed default TTL. Results inconclusive.")
    else:
        if a_hit and b_hit:
            print("Both A (undocumented) and B (documented) survived. Either approach works.")
        elif b_hit and not a_hit:
            print("Only B (documented cache_control ttl) survived. PR should use that approach.")
        elif a_hit and not b_hit:
            print("Only A (undocumented prompt_cache_ttl) survived. Surprising -- needs investigation.")
        else:
            print("Neither A nor B survived. 1h TTL may not be working as expected.")
if __name__ == "__main__":
    # Script entry point: runs the full experiment (~7 minutes end to end).
    run()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment