Skip to content

Instantly share code, notes, and snippets.

@rndmcnlly
Created March 11, 2026 02:42
Show Gist options
  • Select an option

  • Save rndmcnlly/6680b63aeb26b5ea9ffbb56dab3030dc to your computer and use it in GitHub Desktop.

Select an option

Save rndmcnlly/6680b63aeb26b5ea9ffbb56dab3030dc to your computer and use it in GitHub Desktop.
Empirical test: OpenRouter 1h cache TTL -- prompt_cache_ttl vs cache_control.ttl
# /// script
# requires-python = ">=3.11"
# dependencies = ["httpx", "python-dotenv"]
# ///
"""
Empirical test: does OpenRouter's 1h cache TTL survive a 6-minute gap?
Tests three approaches against anthropic/claude-sonnet-4 via OpenRouter:
A) prompt_cache_ttl: 3600 (undocumented, used in PR #16850)
B) cache_control.ttl: "1h" (documented per-message approach)
C) baseline (no TTL extension, default 5-min expiry)
Protocol for each approach:
1. Send request with ~5k tokens of system context -> expect cache WRITE
2. Wait 6 minutes (past 5-min default TTL)
3. Send same request again -> expect cache HIT only if TTL > 5 min
Pass criteria:
- A and/or B show cached_tokens > 0 on the second request
- C shows cached_tokens == 0 on the second request (control)
Usage:
echo 'OPENROUTER_API_KEY=sk-or-...' > .env
uv run --script test_cache_ttl.py
Or:
export OPENROUTER_API_KEY=sk-or-...
uv run --script test_cache_ttl.py
Docs: https://openrouter.ai/docs/guides/best-practices/prompt-caching
"""
import hashlib
import os
import sys
import time
from dotenv import load_dotenv
import httpx
# Pull OPENROUTER_API_KEY (and anything else) from a local .env file, if present.
load_dotenv()
# OpenRouter's OpenAI-compatible chat-completions endpoint.
API = "https://openrouter.ai/api/v1/chat/completions"
MODEL = "anthropic/claude-sonnet-4"
DELAY = 6 * 60  # 6 minutes in seconds -- deliberately past the 5-min default cache TTL
RETRIES = 3  # attempts per request before aborting the whole script
TIMEOUT = 120  # per-request timeout, seconds
# ~5k tokens of deterministic filler (well above 2048 minimum for sonnet)
FILLER = ("The quick brown fox jumps over the lazy dog. " * 120 + "\n") * 10
key = os.environ.get("OPENROUTER_API_KEY")
if not key:
    print("error: set OPENROUTER_API_KEY in env or .env file", file=sys.stderr)
    sys.exit(1)
# Shared headers for every request; HTTP-Referer / X-Title are OpenRouter's
# optional app-attribution headers.
headers = {
    "Authorization": f"Bearer {key}",
    "Content-Type": "application/json",
    "HTTP-Referer": "https://github.com/anomalyco/opencode",
    "X-Title": "opencode-cache-ttl-test",
}
def tag(label: str) -> str:
    """Derive a stable 16-hex-char session id unique to this label and process."""
    seed = f"cache-ttl-test-{label}-{os.getpid()}"
    digest = hashlib.sha256(seed.encode()).hexdigest()
    return digest[:16]
def messages_plain():
    """Build the request messages with a plain-string system prompt (no cache_control)."""
    system_text = f"You are a test assistant. Reference material:\n{FILLER}"
    return [
        {"role": "system", "content": system_text},
        {"role": "user", "content": "Say 'ok' and nothing else."},
    ]
def messages_with_cc(ttl: str | None = None):
    """Build the request messages with a structured system part carrying cache_control.

    When a truthy *ttl* (e.g. "1h") is given it is attached to the
    cache_control marker; otherwise a bare {"type": "ephemeral"} marker is sent.
    """
    cache_control: dict = {"type": "ephemeral"}
    if ttl:
        cache_control["ttl"] = ttl
    system_part = {
        "type": "text",
        "text": f"You are a test assistant. Reference material:\n{FILLER}",
        "cache_control": cache_control,
    }
    return [
        {"role": "system", "content": [system_part]},
        {"role": "user", "content": "Say 'ok' and nothing else."},
    ]
def send(label: str, body: dict) -> dict:
    """POST *body* to the OpenRouter API and return cache-usage numbers.

    Retries up to RETRIES times with exponential backoff (2s, 4s, 8s) on
    timeouts and HTTP error statuses, exiting the whole script if every
    attempt fails.

    Returns a dict with keys "prompt", "cached", and "written" taken from
    the response's usage block (missing fields default to 0).
    """
    for attempt in range(1, RETRIES + 1):
        try:
            print(f" [{label}] request (attempt {attempt}/{RETRIES})...", flush=True)
            r = httpx.post(API, headers=headers, json=body, timeout=TIMEOUT)
            r.raise_for_status()
            data = r.json()
            usage = data.get("usage", {})
            details = usage.get("prompt_tokens_details", {})
            cached = details.get("cached_tokens", 0)
            written = details.get("cache_write_tokens", 0)
            prompt = usage.get("prompt_tokens", 0)
            print(f" [{label}] prompt={prompt} cached={cached} written={written}")
            return {"prompt": prompt, "cached": cached, "written": written}
        except (httpx.TimeoutException, httpx.HTTPStatusError) as e:
            # NOTE(review): transport errors other than timeouts (e.g. connection
            # resets) are intentionally not retried here and will propagate.
            print(f" [{label}] attempt {attempt} failed: {e}", flush=True)
            if attempt == RETRIES:
                print(f" [{label}] all {RETRIES} attempts failed, aborting", file=sys.stderr)
                sys.exit(1)
            wait = 2 ** attempt
            print(f" [{label}] retrying in {wait}s...", flush=True)
            time.sleep(wait)
    # Defensive: every loop path returns or exits. The original used
    # `assert False`, which is stripped under `python -O` and would fall
    # through to an implicit `return None`; raise explicitly instead.
    raise RuntimeError("unreachable")
def approach_a(session: str) -> dict:
    """Request body for approach A: top-level prompt_cache_ttl: 3600 (undocumented)."""
    body = {
        "model": MODEL,
        "messages": messages_plain(),
        "max_tokens": 8,
        "prompt_cache_key": session,
    }
    body["prompt_cache_ttl"] = 3600
    return body
def approach_b(session: str) -> dict:
    """Request body for approach B: per-message cache_control ttl '1h' (documented)."""
    return dict(
        model=MODEL,
        messages=messages_with_cc(ttl="1h"),
        max_tokens=8,
        prompt_cache_key=session,
    )
def approach_c(session: str) -> dict:
    """Request body for approach C, the control: no TTL extension at all."""
    payload = {
        "model": MODEL,
        "messages": messages_plain(),
        "max_tokens": 8,
        "prompt_cache_key": session,
    }
    return payload
def run():
    """Run the full experiment: prime caches, wait past the default TTL, re-test.

    Prints a per-approach results table and a verdict on which TTL-extension
    mechanism (if any) kept the cache alive past the 5-minute default.
    Takes roughly 7 minutes end to end (6-minute wait plus 6 requests).
    """
    # Label -> request-body builder; insertion order (A, B, C) is relied on below.
    approaches = {
        "A (prompt_cache_ttl=3600)": approach_a,
        "B (cache_control ttl=1h)": approach_b,
        "C (baseline, no TTL ext)": approach_c,
    }
    # One stable session/cache key per approach so R1 and R2 target the same cache entry.
    sessions = {name: tag(name) for name in approaches}
    results: dict[str, dict] = {}
    # --- Round 1: prime the caches ---
    print("=== Round 1: priming caches ===", flush=True)
    for name, build in approaches.items():
        r1 = send(f"{name} R1", build(sessions[name]))
        results[f"{name}_r1"] = r1
    # --- Wait: count down one second at a time, announcing each whole minute ---
    print(f"\n=== Waiting {DELAY}s ({DELAY // 60}m) to exceed 5-min default TTL ===", flush=True)
    for elapsed in range(DELAY):
        remaining = DELAY - elapsed
        if remaining % 60 == 0:
            print(f" {remaining // 60}m remaining...", flush=True)
        time.sleep(1)
    # --- Round 2: test cache survival ---
    print("\n=== Round 2: testing cache survival after 6-min gap ===", flush=True)
    for name, build in approaches.items():
        r2 = send(f"{name} R2", build(sessions[name]))
        results[f"{name}_r2"] = r2
    # --- Verdict table ---
    print("\n=== Results ===")
    print(f"{'Approach':<35} {'R1 cached':>10} {'R1 written':>11} {'R2 cached':>10} {'R2 written':>11} Verdict")
    print("-" * 100)
    for name in approaches:
        r1 = results[f"{name}_r1"]
        r2 = results[f"{name}_r2"]
        hit = r2["cached"] > 0
        verdict = "CACHE HIT (TTL > 5m)" if hit else "CACHE MISS (TTL <= 5m)"
        print(f"{name:<35} {r1['cached']:>10} {r1['written']:>11} {r2['cached']:>10} {r2['written']:>11} {verdict}")
    print()
    # Derive the per-approach hit flags from the approaches dict itself rather
    # than re-typing the label strings (the original duplicated the literals,
    # which would KeyError after the 6-minute wait if a label ever changed).
    a_hit, b_hit, c_hit = (results[f"{label}_r2"]["cached"] > 0 for label in approaches)
    if c_hit:
        print("WARNING: baseline also got a cache hit -- 6 min may not have been enough,")
        print(" or OpenRouter changed default TTL. Results inconclusive.")
    else:
        if a_hit and b_hit:
            print("Both A (undocumented) and B (documented) survived. Either approach works.")
        elif b_hit and not a_hit:
            print("Only B (documented cache_control ttl) survived. PR should use that approach.")
        elif a_hit and not b_hit:
            print("Only A (undocumented prompt_cache_ttl) survived. Surprising -- needs investigation.")
        else:
            print("Neither A nor B survived. 1h TTL may not be working as expected.")
if __name__ == "__main__":
    # Script entry point: runs the full experiment (~7 minutes end to end).
    run()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment