Test Anthropic prompt caching
#!/usr/bin/env python3
"""
Anthropic Prompt Caching Test - Direct API

Tests caching behavior with detailed metrics using Anthropic's direct API.
Run: uv run python scripts/test_anthropic_caching.py
"""
import os
import time

import anthropic

ANTHROPIC_API_KEY = os.getenv("ANTHROPIC_API_KEY")

# ============================================================================
# MODEL CONFIGURATION - Uncomment the model you want to test
# ============================================================================
# MODEL = "claude-sonnet-4-5"
# MODEL = "claude-3-5-sonnet-20241022"
# MODEL = "claude-3-7-sonnet-latest"
MODEL = "claude-haiku-4-5"

# ============================================================================
# TEST CONFIGURATION
# ============================================================================
def create_large_context():
    """Create a cacheable context (well above the model's minimum cacheable prompt length)."""
    import random
    import string

    # Generate random padding (varied text so the tokenizer doesn't compress it)
    padding = " ".join("".join(random.choices(string.ascii_letters + string.digits, k=10)) for _ in range(1200))
    return (
        """
You are a financial analysis assistant with expertise in investment strategies.
# Investment Analysis Framework
## Risk Assessment Criteria
1. Volatility measures (standard deviation, beta)
2. Drawdown analysis (maximum, average)
3. Correlation with market indices
4. Liquidity risk factors
5. Credit risk evaluation
## Return Metrics
1. Absolute returns (CAGR, total return)
2. Risk-adjusted returns (Sharpe ratio, Sortino ratio)
3. Benchmark comparison (alpha, tracking error)
4. Rolling period analysis
## Portfolio Construction
1. Asset allocation strategies
2. Diversification principles
3. Rebalancing methodologies
4. Tax optimization considerations
## Mutual Fund Evaluation
When analyzing mutual funds, consider:
- Expense ratio and its impact on returns
- Fund manager tenure and track record
- AUM size and liquidity implications
- Investment style consistency
- Tax efficiency
## Market Analysis
Evaluate market conditions:
- Economic indicators (GDP, inflation, employment)
- Interest rate environment
- Sector rotation patterns
- Geopolitical factors
- Market sentiment indicators
"""
        + f"\n\nContext padding (random data to prevent tokenizer compression):\n{padding}"
    )


QUESTIONS = ["What is the Sharpe ratio?", "Explain diversification benefits"]
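# Both questions reuse the identical cached system prompt; as long as the
# second call lands within the cache's 5-minute TTL, it should be served as a
# cache read rather than a fresh cache write.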

# ============================================================================
# TEST WITH CACHING
# ============================================================================
def test_with_caching():
    """Test with prompt caching enabled."""
    print("\n" + "=" * 80)
    print(f"ANTHROPIC PROMPT CACHING TEST - {MODEL}")
    print("=" * 80)

    # Prompt caching is now GA - no beta header needed
    client = anthropic.Anthropic(api_key=ANTHROPIC_API_KEY)
    context = create_large_context()
    results = []

    for i, question in enumerate(QUESTIONS, 1):
        print(f"\n{'─' * 80}")
        print(f"Call {i}: {question}")
        print(f"{'─' * 80}")

        start = time.time()
        response = client.messages.create(
            model=MODEL,
            max_tokens=200,
            system=[{"type": "text", "text": context, "cache_control": {"type": "ephemeral"}}],
            messages=[{"role": "user", "content": question}],
        )
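        # The {"type": "ephemeral"} marker above caches everything up to and
        # including this system block with the default 5-minute TTL. Anthropic
        # also documents a 1-hour TTL, requested with
        # {"type": "ephemeral", "ttl": "1h"} (depending on your API version
        # this may still require the extended-cache-ttl beta header).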
        elapsed = time.time() - start
        usage = response.usage
        print(f"DEBUG - Raw response object: {response}")
        # print(f"DEBUG - Raw usage object: {usage}")

        # Extract metrics (handle both new nested and old flat format)
        # New format has cache_creation object with TTL-specific fields
        # Old format has flat cache_creation_input_tokens
        cache_creation = 0
        cache_read = 0

        # Try new nested format first
        if hasattr(usage, "cache_creation") and usage.cache_creation:
            cache_obj = usage.cache_creation
            # Sum all cache creation tokens (5m + 1h variants)
            cache_creation = getattr(cache_obj, "ephemeral_5m_input_tokens", 0) + getattr(
                cache_obj, "ephemeral_1h_input_tokens", 0
            )

        # Try old flat format (backward compatibility); the flat fields can be
        # None on some SDK versions, so coerce to 0
        if cache_creation == 0:
            cache_creation = getattr(usage, "cache_creation_input_tokens", 0) or 0

        # Cache reads (still a flat field, also possibly None)
        cache_read = getattr(usage, "cache_read_input_tokens", 0) or 0
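        # usage.input_tokens counts only the uncached portion of the prompt;
        # cache writes and cache reads are reported separately, so the three
        # buckets together cover the full prompt.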
        input_tokens = usage.input_tokens
        output_tokens = usage.output_tokens

        # Calculate costs (Claude Sonnet 4.5 pricing; the default Haiku 4.5
        # model is cheaper, so read these as relative rather than exact costs)
        cost_cache_write = cache_creation * 0.00000375  # $3.75/MTok
        cost_cache_read = cache_read * 0.0000003  # $0.30/MTok
        cost_input = input_tokens * 0.000003  # $3.00/MTok
        cost_output = output_tokens * 0.000015  # $15.00/MTok
        total_cost = cost_cache_write + cost_cache_read + cost_input + cost_output
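        # The ratios are what matter here: per Anthropic's pricing, a 5-minute
        # cache write costs 1.25x the base input rate and a cache read 0.1x,
        # so a cached prefix is already cheaper after a single reuse.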
        results.append(
            {
                "call": i,
                "latency": elapsed,
                "cache_creation": cache_creation,
                "cache_read": cache_read,
                "input": input_tokens,
                "output": output_tokens,
                "cost": total_cost,
            }
        )

        # Print metrics
        cache_status = ""
        if cache_creation > 0:
            cache_status = " [CACHE WRITE]"
        elif cache_read > 0:
            cache_status = " [CACHE READ]"

        print(f"Latency: {elapsed:.2f}s{cache_status}")
        print(f"Input tokens: {input_tokens:,}")
        if cache_creation > 0:
            print(f"Cache write: {cache_creation:,}")
        if cache_read > 0:
            print(f"Cache read: {cache_read:,}")
        print(f"Output tokens: {output_tokens:,}")
        print(f"Cost: ${total_cost:.6f}")
        print(f"Answer preview: {response.content[0].text[:80]}...")

    # Summary
    print(f"\n{'=' * 80}")
    print("SUMMARY")
    print(f"{'=' * 80}")
    total_time = sum(r["latency"] for r in results)
    total_cost = sum(r["cost"] for r in results)
    total_cache_creation = sum(r["cache_creation"] for r in results)
    total_cache_read = sum(r["cache_read"] for r in results)
    total_input = sum(r["input"] for r in results)

    print(f"Total calls: {len(results)}")
    print(f"Total time: {total_time:.2f}s")
    print(f"Total cost: ${total_cost:.6f}")
    print("\nToken breakdown:")
    print(f" Cache writes: {total_cache_creation:,}")
    print(f" Cache reads: {total_cache_read:,}")
    print(f" Regular input: {total_input:,}")

    if len(results) > 1:
        first_latency = results[0]["latency"]
        avg_subsequent = sum(r["latency"] for r in results[1:]) / len(results[1:])
        improvement = ((first_latency - avg_subsequent) / first_latency) * 100
        print("\nLatency:")
        print(f" First call: {first_latency:.2f}s")
        print(f" Avg subsequent: {avg_subsequent:.2f}s")
        print(f" Improvement: {improvement:.1f}%")

        # Cost comparison
        first_cost = results[0]["cost"]
        avg_subsequent_cost = sum(r["cost"] for r in results[1:]) / len(results[1:])
        cost_savings = ((first_cost - avg_subsequent_cost) / first_cost) * 100
        print("\nCost:")
        print(f" First call: ${first_cost:.6f}")
        print(f" Avg subsequent: ${avg_subsequent_cost:.6f}")
        print(f" Savings: {cost_savings:.1f}%")

    return results

# ============================================================================
# MAIN
# ============================================================================
def main():
    print("\n" + "=" * 80)
    print("ANTHROPIC PROMPT CACHING TEST")
    print("=" * 80)
    print(f"Model: {MODEL}\n")
    test_with_caching()


if __name__ == "__main__":
    main()
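Expected behavior, per Anthropic's prompt caching docs: the first call should report a cache write for the large system prompt, and the second call, made within the 5-minute TTL, should report a cache read with noticeably lower cost and usually lower latency.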