Skip to content

Instantly share code, notes, and snippets.

@twitu
Created October 29, 2025 00:10
Show Gist options
  • Select an option

  • Save twitu/3906f716a6d30ad892e06ae7472ecca4 to your computer and use it in GitHub Desktop.

Select an option

Save twitu/3906f716a6d30ad892e06ae7472ecca4 to your computer and use it in GitHub Desktop.
Test Anthropic prompt caching
#!/usr/bin/env python3
"""
Anthropic Prompt Caching Test - Direct API
Tests caching behavior with detailed metrics using Anthropic's direct API.
Run: uv run python scripts/test_anthropic_caching.py
"""
import os
import time
import anthropic
# API key is read from the environment; this is None when unset, and the
# anthropic client call will then fail — export ANTHROPIC_API_KEY first.
ANTHROPIC_API_KEY = os.getenv("ANTHROPIC_API_KEY")
# ============================================================================
# MODEL CONFIGURATION - Uncomment the model you want to test
# ============================================================================
# MODEL = "claude-sonnet-4-5"
# MODEL = "claude-3-5-sonnet-20241022"
# MODEL = "claude-3-7-sonnet"
# Active model under test; move the comment markers to switch models.
MODEL = "claude-haiku-4-5"
# ============================================================================
# TEST CONFIGURATION
# ============================================================================
def create_large_context():
    """Build a system-prompt string large enough to be cacheable (>1024 tokens).

    A fixed analysis-framework text is followed by 1200 random ten-character
    alphanumeric "words" so the tokenizer cannot compress the padding away,
    keeping the prompt above the minimum cacheable length.
    """
    import random
    import string

    alphabet = string.ascii_letters + string.digits
    # Varied random text so the tokenizer doesn't collapse the padding.
    filler = (
        "".join(random.choices(alphabet, k=10)) for _ in range(1200)
    )
    padding = " ".join(filler)

    framework = """
You are a financial analysis assistant with expertise in investment strategies.
# Investment Analysis Framework
## Risk Assessment Criteria
1. Volatility measures (standard deviation, beta)
2. Drawdown analysis (maximum, average)
3. Correlation with market indices
4. Liquidity risk factors
5. Credit risk evaluation
## Return Metrics
1. Absolute returns (CAGR, total return)
2. Risk-adjusted returns (Sharpe ratio, Sortino ratio)
3. Benchmark comparison (alpha, tracking error)
4. Rolling period analysis
## Portfolio Construction
1. Asset allocation strategies
2. Diversification principles
3. Rebalancing methodologies
4. Tax optimization considerations
## Mutual Fund Evaluation
When analyzing mutual funds, consider:
- Expense ratio and its impact on returns
- Fund manager tenure and track record
- AUM size and liquidity implications
- Investment style consistency
- Tax efficiency
## Market Analysis
Evaluate market conditions:
- Economic indicators (GDP, inflation, employment)
- Interest rate environment
- Sector rotation patterns
- Geopolitical factors
- Market sentiment indicators
"""
    return framework + f"\n\nContext padding (random data to prevent tokenizer compression):\n{padding}"
# Two questions against the same cached system prompt: the first call should
# WRITE the cache, the second should READ it (identical prefix, new message).
QUESTIONS = ["What is the Sharpe ratio?", "Explain diversification benefits"]
# ============================================================================
# TEST WITH CACHING
# ============================================================================
# USD per token by model family. Cache writes bill at 1.25x the base input
# rate and cache reads at 0.1x. NOTE(review): confirm these figures against
# Anthropic's current pricing page before trusting absolute dollar amounts.
_PRICING = {
    "sonnet": {"input": 3.00e-6, "output": 15.00e-6, "cache_write": 3.75e-6, "cache_read": 0.30e-6},
    "haiku": {"input": 1.00e-6, "output": 5.00e-6, "cache_write": 1.25e-6, "cache_read": 0.10e-6},
}


def _pricing_for(model):
    """Return the per-token price table matching *model* (Sonnet fallback)."""
    for family, table in _PRICING.items():
        if family in model:
            return table
    return _PRICING["sonnet"]


def _extract_cache_tokens(usage):
    """Return ``(cache_creation, cache_read)`` token counts from *usage*.

    Handles both the new nested ``usage.cache_creation`` object (with
    TTL-specific 5m/1h fields) and the older flat
    ``cache_creation_input_tokens`` field for backward compatibility.
    """
    cache_creation = 0
    nested = getattr(usage, "cache_creation", None)
    if nested:
        # Sum all cache-creation tokens (5m + 1h TTL variants).
        cache_creation = getattr(nested, "ephemeral_5m_input_tokens", 0) + getattr(
            nested, "ephemeral_1h_input_tokens", 0
        )
    if cache_creation == 0:
        # Old flat format (backward compatibility).
        cache_creation = getattr(usage, "cache_creation_input_tokens", 0)
    # Cache reads are still a flat field in both formats.
    cache_read = getattr(usage, "cache_read_input_tokens", 0)
    return cache_creation, cache_read


def test_with_caching():
    """Run QUESTIONS against MODEL with prompt caching and report metrics.

    Sends each question with the large system context marked
    ``cache_control: ephemeral``. The first call is expected to pay the
    cache-write premium; subsequent calls should show cache reads, lower
    latency, and lower cost.

    Returns:
        list[dict]: per-call metrics (latency, token counts, estimated cost).

    Note:
        Costs were previously hard-coded to Sonnet 4.5 rates regardless of
        MODEL; they now come from the per-model ``_PRICING`` table.
    """
    print("\n" + "=" * 80)
    print(f"ANTHROPIC PROMPT CACHING TEST - {MODEL}")
    print("=" * 80)
    # Prompt caching is now GA - no beta header needed
    client = anthropic.Anthropic(api_key=ANTHROPIC_API_KEY)
    context = create_large_context()
    price = _pricing_for(MODEL)
    results = []
    for i, question in enumerate(QUESTIONS, 1):
        print(f"\n{'─' * 80}")
        print(f"Call {i}: {question}")
        print(f"{'─' * 80}")
        start = time.time()
        response = client.messages.create(
            model=MODEL,
            max_tokens=200,
            # cache_control marks the system prompt as the cache breakpoint.
            system=[{"type": "text", "text": context, "cache_control": {"type": "ephemeral"}}],
            messages=[{"role": "user", "content": question}],
        )
        elapsed = time.time() - start
        usage = response.usage
        print(f"DEBUG - Raw response object: {response}")
        cache_creation, cache_read = _extract_cache_tokens(usage)
        input_tokens = usage.input_tokens
        output_tokens = usage.output_tokens
        # Estimated cost from the per-model price table.
        cost_cache_write = cache_creation * price["cache_write"]
        cost_cache_read = cache_read * price["cache_read"]
        cost_input = input_tokens * price["input"]
        cost_output = output_tokens * price["output"]
        total_cost = cost_cache_write + cost_cache_read + cost_input + cost_output
        results.append(
            {
                "call": i,
                "latency": elapsed,
                "cache_creation": cache_creation,
                "cache_read": cache_read,
                "input": input_tokens,
                "output": output_tokens,
                "cost": total_cost,
            }
        )
        # Label the call so the expected write -> read transition is obvious.
        cache_status = ""
        if cache_creation > 0:
            cache_status = " [CACHE WRITE]"
        elif cache_read > 0:
            cache_status = " [CACHE READ]"
        print(f"Latency: {elapsed:.2f}s{cache_status}")
        print(f"Input tokens: {input_tokens:,}")
        if cache_creation > 0:
            print(f"Cache write: {cache_creation:,}")
        if cache_read > 0:
            print(f"Cache read: {cache_read:,}")
        print(f"Output tokens: {output_tokens:,}")
        print(f"Cost: ${total_cost:.6f}")
        print(f"Answer preview: {response.content[0].text[:80]}...")
    # Summary across all calls.
    print(f"\n{'=' * 80}")
    print("SUMMARY")
    print(f"{'=' * 80}")
    total_time = sum(r["latency"] for r in results)
    total_cost = sum(r["cost"] for r in results)
    total_cache_creation = sum(r["cache_creation"] for r in results)
    total_cache_read = sum(r["cache_read"] for r in results)
    total_input = sum(r["input"] for r in results)
    print(f"Total calls: {len(results)}")
    print(f"Total time: {total_time:.2f}s")
    print(f"Total cost: ${total_cost:.6f}")
    print("\nToken breakdown:")
    print(f" Cache writes: {total_cache_creation:,}")
    print(f" Cache reads: {total_cache_read:,}")
    print(f" Regular input: {total_input:,}")
    if len(results) > 1:
        # Compare first call (cache write) vs the average of the rest.
        rest = results[1:]
        first_latency = results[0]["latency"]
        avg_subsequent = sum(r["latency"] for r in rest) / len(rest)
        improvement = ((first_latency - avg_subsequent) / first_latency) * 100
        print("\nLatency:")
        print(f" First call: {first_latency:.2f}s")
        print(f" Avg subsequent: {avg_subsequent:.2f}s")
        print(f" Improvement: {improvement:.1f}%")
        first_cost = results[0]["cost"]
        avg_subsequent_cost = sum(r["cost"] for r in rest) / len(rest)
        cost_savings = ((first_cost - avg_subsequent_cost) / first_cost) * 100
        print("\nCost:")
        print(f" First call: ${first_cost:.6f}")
        print(f" Avg subsequent: ${avg_subsequent_cost:.6f}")
        print(f" Savings: {cost_savings:.1f}%")
    return results
# ============================================================================
# MAIN
# ============================================================================
def main():
    """Entry point: print the header banner, then run the caching test."""
    banner = "=" * 80
    print("\n" + banner)
    print("ANTHROPIC PROMPT CACHING TEST")
    print(banner)
    print(f"Model: {MODEL}\n")
    test_with_caching()


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment