Skip to content

Instantly share code, notes, and snippets.

@twitu
Created October 29, 2025 00:10
Show Gist options
  • Select an option

  • Save twitu/3906f716a6d30ad892e06ae7472ecca4 to your computer and use it in GitHub Desktop.

Select an option

Save twitu/3906f716a6d30ad892e06ae7472ecca4 to your computer and use it in GitHub Desktop.
Test Anthropic prompt caching
#!/usr/bin/env python3
"""
Anthropic Prompt Caching Test - Direct API
Tests caching behavior with detailed metrics using Anthropic's direct API.
Run: uv run python scripts/test_anthropic_caching.py
"""
import os
import time
import anthropic
# API key is read from the environment; this is None when unset, and the
# anthropic client call will then fail — export ANTHROPIC_API_KEY first.
ANTHROPIC_API_KEY = os.getenv("ANTHROPIC_API_KEY")
# ============================================================================
# MODEL CONFIGURATION - Uncomment the model you want to test
# ============================================================================
# MODEL = "claude-sonnet-4-5"
# MODEL = "claude-3-5-sonnet-20241022"
# MODEL = "claude-3-7-sonnet"
# Active model under test; move the comment markers to switch models.
MODEL = "claude-haiku-4-5"
# ============================================================================
# TEST CONFIGURATION
# ============================================================================
def create_large_context():
    """Build a system-prompt string large enough to be cacheable (>1024 tokens).

    A fixed analysis-framework text is followed by 1200 random ten-character
    alphanumeric "words" so the tokenizer cannot compress the padding away,
    keeping the prompt above the minimum cacheable length.
    """
    import random
    import string

    alphabet = string.ascii_letters + string.digits
    # Varied random text so the tokenizer doesn't collapse the padding.
    filler = (
        "".join(random.choices(alphabet, k=10)) for _ in range(1200)
    )
    padding = " ".join(filler)

    framework = """
You are a financial analysis assistant with expertise in investment strategies.
# Investment Analysis Framework
## Risk Assessment Criteria
1. Volatility measures (standard deviation, beta)
2. Drawdown analysis (maximum, average)
3. Correlation with market indices
4. Liquidity risk factors
5. Credit risk evaluation
## Return Metrics
1. Absolute returns (CAGR, total return)
2. Risk-adjusted returns (Sharpe ratio, Sortino ratio)
3. Benchmark comparison (alpha, tracking error)
4. Rolling period analysis
## Portfolio Construction
1. Asset allocation strategies
2. Diversification principles
3. Rebalancing methodologies
4. Tax optimization considerations
## Mutual Fund Evaluation
When analyzing mutual funds, consider:
- Expense ratio and its impact on returns
- Fund manager tenure and track record
- AUM size and liquidity implications
- Investment style consistency
- Tax efficiency
## Market Analysis
Evaluate market conditions:
- Economic indicators (GDP, inflation, employment)
- Interest rate environment
- Sector rotation patterns
- Geopolitical factors
- Market sentiment indicators
"""
    return framework + f"\n\nContext padding (random data to prevent tokenizer compression):\n{padding}"
# Two questions against the same cached system prompt: the first call should
# WRITE the cache, the second should READ it (identical prefix, new message).
QUESTIONS = ["What is the Sharpe ratio?", "Explain diversification benefits"]
# ============================================================================
# TEST WITH CACHING
# ============================================================================
# USD per token by model family. Cache writes bill at 1.25x the base input
# rate and cache reads at 0.1x. NOTE(review): confirm these figures against
# Anthropic's current pricing page before trusting absolute dollar amounts.
_PRICING = {
    "sonnet": {"input": 3.00e-6, "output": 15.00e-6, "cache_write": 3.75e-6, "cache_read": 0.30e-6},
    "haiku": {"input": 1.00e-6, "output": 5.00e-6, "cache_write": 1.25e-6, "cache_read": 0.10e-6},
}


def _pricing_for(model):
    """Return the per-token price table matching *model* (Sonnet fallback)."""
    for family, table in _PRICING.items():
        if family in model:
            return table
    return _PRICING["sonnet"]


def _extract_cache_tokens(usage):
    """Return ``(cache_creation, cache_read)`` token counts from *usage*.

    Handles both the new nested ``usage.cache_creation`` object (with
    TTL-specific 5m/1h fields) and the older flat
    ``cache_creation_input_tokens`` field for backward compatibility.
    """
    cache_creation = 0
    nested = getattr(usage, "cache_creation", None)
    if nested:
        # Sum all cache-creation tokens (5m + 1h TTL variants).
        cache_creation = getattr(nested, "ephemeral_5m_input_tokens", 0) + getattr(
            nested, "ephemeral_1h_input_tokens", 0
        )
    if cache_creation == 0:
        # Old flat format (backward compatibility).
        cache_creation = getattr(usage, "cache_creation_input_tokens", 0)
    # Cache reads are still a flat field in both formats.
    cache_read = getattr(usage, "cache_read_input_tokens", 0)
    return cache_creation, cache_read


def test_with_caching():
    """Run QUESTIONS against MODEL with prompt caching and report metrics.

    Sends each question with the large system context marked
    ``cache_control: ephemeral``. The first call is expected to pay the
    cache-write premium; subsequent calls should show cache reads, lower
    latency, and lower cost.

    Returns:
        list[dict]: per-call metrics (latency, token counts, estimated cost).

    Note:
        Costs were previously hard-coded to Sonnet 4.5 rates regardless of
        MODEL; they now come from the per-model ``_PRICING`` table.
    """
    print("\n" + "=" * 80)
    print(f"ANTHROPIC PROMPT CACHING TEST - {MODEL}")
    print("=" * 80)
    # Prompt caching is now GA - no beta header needed
    client = anthropic.Anthropic(api_key=ANTHROPIC_API_KEY)
    context = create_large_context()
    price = _pricing_for(MODEL)
    results = []
    for i, question in enumerate(QUESTIONS, 1):
        print(f"\n{'─' * 80}")
        print(f"Call {i}: {question}")
        print(f"{'─' * 80}")
        start = time.time()
        response = client.messages.create(
            model=MODEL,
            max_tokens=200,
            # cache_control marks the system prompt as the cache breakpoint.
            system=[{"type": "text", "text": context, "cache_control": {"type": "ephemeral"}}],
            messages=[{"role": "user", "content": question}],
        )
        elapsed = time.time() - start
        usage = response.usage
        print(f"DEBUG - Raw response object: {response}")
        cache_creation, cache_read = _extract_cache_tokens(usage)
        input_tokens = usage.input_tokens
        output_tokens = usage.output_tokens
        # Estimated cost from the per-model price table.
        cost_cache_write = cache_creation * price["cache_write"]
        cost_cache_read = cache_read * price["cache_read"]
        cost_input = input_tokens * price["input"]
        cost_output = output_tokens * price["output"]
        total_cost = cost_cache_write + cost_cache_read + cost_input + cost_output
        results.append(
            {
                "call": i,
                "latency": elapsed,
                "cache_creation": cache_creation,
                "cache_read": cache_read,
                "input": input_tokens,
                "output": output_tokens,
                "cost": total_cost,
            }
        )
        # Label the call so the expected write -> read transition is obvious.
        cache_status = ""
        if cache_creation > 0:
            cache_status = " [CACHE WRITE]"
        elif cache_read > 0:
            cache_status = " [CACHE READ]"
        print(f"Latency: {elapsed:.2f}s{cache_status}")
        print(f"Input tokens: {input_tokens:,}")
        if cache_creation > 0:
            print(f"Cache write: {cache_creation:,}")
        if cache_read > 0:
            print(f"Cache read: {cache_read:,}")
        print(f"Output tokens: {output_tokens:,}")
        print(f"Cost: ${total_cost:.6f}")
        print(f"Answer preview: {response.content[0].text[:80]}...")
    # Summary across all calls.
    print(f"\n{'=' * 80}")
    print("SUMMARY")
    print(f"{'=' * 80}")
    total_time = sum(r["latency"] for r in results)
    total_cost = sum(r["cost"] for r in results)
    total_cache_creation = sum(r["cache_creation"] for r in results)
    total_cache_read = sum(r["cache_read"] for r in results)
    total_input = sum(r["input"] for r in results)
    print(f"Total calls: {len(results)}")
    print(f"Total time: {total_time:.2f}s")
    print(f"Total cost: ${total_cost:.6f}")
    print("\nToken breakdown:")
    print(f" Cache writes: {total_cache_creation:,}")
    print(f" Cache reads: {total_cache_read:,}")
    print(f" Regular input: {total_input:,}")
    if len(results) > 1:
        # Compare first call (cache write) vs the average of the rest.
        rest = results[1:]
        first_latency = results[0]["latency"]
        avg_subsequent = sum(r["latency"] for r in rest) / len(rest)
        improvement = ((first_latency - avg_subsequent) / first_latency) * 100
        print("\nLatency:")
        print(f" First call: {first_latency:.2f}s")
        print(f" Avg subsequent: {avg_subsequent:.2f}s")
        print(f" Improvement: {improvement:.1f}%")
        first_cost = results[0]["cost"]
        avg_subsequent_cost = sum(r["cost"] for r in rest) / len(rest)
        cost_savings = ((first_cost - avg_subsequent_cost) / first_cost) * 100
        print("\nCost:")
        print(f" First call: ${first_cost:.6f}")
        print(f" Avg subsequent: ${avg_subsequent_cost:.6f}")
        print(f" Savings: {cost_savings:.1f}%")
    return results
# ============================================================================
# MAIN
# ============================================================================
def main():
    """Entry point: print the header banner, then run the caching test."""
    banner = "=" * 80
    print("\n" + banner)
    print("ANTHROPIC PROMPT CACHING TEST")
    print(banner)
    print(f"Model: {MODEL}\n")
    test_with_caching()


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment