Skip to content

Instantly share code, notes, and snippets.

@lizthegrey
Last active January 1, 2026 18:28
Show Gist options
  • Select an option

  • Save lizthegrey/eef57b3d40430ecea339604275b4f1d6 to your computer and use it in GitHub Desktop.

Select an option

Save lizthegrey/eef57b3d40430ecea339604275b4f1d6 to your computer and use it in GitHub Desktop.
Interactive LLM Performance vs Cost Pareto Frontier - Compare models across different usage patterns (input:output ratios, cache hit rates, thinking overhead)
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>LLM Performance vs Cost - Interactive Pareto Frontier</title>
<script src="https://d3js.org/d3.v7.min.js"></script>
<style>
body {
font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, Oxygen, Ubuntu, Cantarell, sans-serif;
margin: 0;
padding: 20px;
background: #1a1a1a;
}
.container {
max-width: 1400px;
margin: 0 auto;
background: #2a2a2a;
padding: 30px;
border-radius: 8px;
box-shadow: 0 2px 8px rgba(0,0,0,0.3);
}
h1 {
margin-top: 0;
color: #e0e0e0;
}
.controls {
display: grid;
grid-template-columns: repeat(auto-fit, minmax(250px, 1fr));
gap: 20px;
margin: 20px 0;
padding: 20px;
background: #333;
border-radius: 4px;
}
.control-group {
display: flex;
flex-direction: column;
}
label {
font-weight: 600;
margin-bottom: 8px;
color: #ccc;
font-size: 14px;
}
input[type="range"] {
width: 100%;
}
.value-display {
color: #0066cc;
font-weight: 600;
margin-top: 4px;
font-size: 13px;
}
.presets {
display: flex;
gap: 10px;
flex-wrap: wrap;
}
button {
padding: 8px 16px;
border: 1px solid #555;
background: #3a3a3a;
color: #e0e0e0;
border-radius: 4px;
cursor: pointer;
font-size: 13px;
transition: all 0.2s;
}
button:hover {
background: #4a4a4a;
}
button.active {
background: #0066cc;
color: white;
border-color: #0066cc;
}
.chart-container {
margin-top: 30px;
}
.tooltip {
position: absolute;
padding: 10px;
background: rgba(0, 0, 0, 0.8);
color: white;
border-radius: 4px;
pointer-events: none;
font-size: 12px;
opacity: 0;
transition: opacity 0.2s;
}
.legend {
display: flex;
gap: 20px;
margin-top: 20px;
flex-wrap: wrap;
}
.legend-item {
display: flex;
align-items: center;
gap: 8px;
font-size: 13px;
color: #ccc;
}
.legend-circle {
width: 12px;
height: 12px;
border-radius: 50%;
}
.info {
margin-top: 20px;
padding: 15px;
background: #333;
border-left: 4px solid #0066cc;
border-radius: 4px;
font-size: 14px;
color: #ccc;
}
</style>
</head>
<body>
<div class="container">
<h1>LLM Performance vs Cost: Interactive Pareto Frontier</h1>
<div class="info">
<strong>Why this matters:</strong> Google's Gemini 3 Flash chart assumes a fixed 5:1 output:input ratio with no caching.
Real-world usage varies dramatically—coding agents often run 50:1+ ratios with 80-90% cache hits,
completely changing the cost picture and Pareto frontier.<br><br>
<strong>Note:</strong> Prices shown are for interactive/real-time usage. Batch processing (50% discount) is available
from some providers but adds significant latency, making it unsuitable for the interactive use cases these Elo scores measure.
</div>
<div class="controls">
<div class="control-group">
<label>Benchmark</label>
<div style="display: flex; gap: 10px;">
<label style="display: flex; align-items: center; gap: 6px; font-weight: normal;">
<input type="radio" name="benchmark" value="lmarena" checked style="width: auto; margin: 0;">
General Chat (LMArena)
</label>
<label style="display: flex; align-items: center; gap: 6px; font-weight: normal;">
<input type="radio" name="benchmark" value="coding" style="width: auto; margin: 0;">
Coding (SWE-Bench)
</label>
</div>
</div>
<div class="control-group">
<label for="inputOutputRatio">Input:Output Token Ratio</label>
<input type="range" id="inputOutputRatio" min="0" max="100" value="5" step="1">
<div class="value-display" id="ratioDisplay">1:5</div>
</div>
<div class="control-group">
<label for="cacheHitRate">Cache Hit Rate (%)</label>
<input type="range" id="cacheHitRate" min="0" max="100" value="0" step="5">
<div class="value-display" id="cacheDisplay">0%</div>
</div>
<div class="control-group">
<label for="thinkingOverhead">Thinking Token Overhead (%)</label>
<input type="range" id="thinkingOverhead" min="0" max="300" value="100" step="25">
<div class="value-display" id="thinkingDisplay">100%</div>
</div>
</div>
<div class="presets">
<button onclick="applyPreset('casual')">Casual Chat (1:1, no cache)</button>
<button onclick="applyPreset('coding')">Coding Agent (1:50, 85% cache)</button>
<button onclick="applyPreset('rag')">RAG System (1:10, 70% cache)</button>
<button onclick="applyPreset('docs')">Document Analysis (1:1, 95% cache)</button>
<button onclick="applyPreset('google')">Google's Scenario (1:5, no cache)</button>
</div>
<div class="chart-container">
<svg id="chart"></svg>
</div>
<div class="legend" id="legend"></div>
</div>
<div class="tooltip" id="tooltip"></div>
<script>
// Model catalog: one record per model with benchmark scores and list prices.
// inputPrice / outputPrice / cachePrice are USD per million tokens.
// Pricing from https://ai.google.dev/gemini-api/docs/pricing (text only)
// NOTE(review): that URL presumably covers only the Google rows; the sources for the
// other providers' prices and for the scores are not recorded here - verify before relying on them.
// Fields:
//   lmarenaElo    - LMArena Elo score (general chat)
//   codingScore   - SWE-Bench Verified % or CodeClash Elo; null = no coding score, so the
//                   model is filtered out when the coding benchmark is selected
//   cachePrice    - price for cached input tokens; null = the cost model applies no cache discount
//   batchDiscount - recorded for reference only; the cost calculation below does not read it
//   thinkingMode  - 'full' = full thinking overhead, 'minimal' = reduced overhead, null = no thinking
const models = [
{ name: 'Gemini 3 Pro', provider: 'Google', lmarenaElo: 1490, codingScore: 76.2, inputPrice: 2.0, outputPrice: 12.0, cachePrice: 0.20, batchDiscount: false, thinkingMode: 'full' },
{ name: 'Gemini 3 Flash', provider: 'Google', lmarenaElo: 1480, codingScore: 76.2, inputPrice: 0.5, outputPrice: 3.0, cachePrice: 0.05, batchDiscount: false, thinkingMode: 'full' },
{ name: 'Grok 4.1 (thinking)', provider: 'xAI', lmarenaElo: 1477, codingScore: null, inputPrice: 3.0, outputPrice: 15.0, cachePrice: null, batchDiscount: false, thinkingMode: 'full' },
{ name: 'Opus 4.5 (thinking)', provider: 'Anthropic', lmarenaElo: 1470, codingScore: 80.9, inputPrice: 5.0, outputPrice: 25.0, cachePrice: 0.50, batchDiscount: true, thinkingMode: 'full' },
{ name: 'Opus 4.5', provider: 'Anthropic', lmarenaElo: 1467, codingScore: 74.6, inputPrice: 5.0, outputPrice: 25.0, cachePrice: 0.50, batchDiscount: true, thinkingMode: null },
{ name: 'Grok 4.1', provider: 'xAI', lmarenaElo: 1466, codingScore: null, inputPrice: 3.0, outputPrice: 15.0, cachePrice: null, batchDiscount: false, thinkingMode: null },
{ name: 'Gemini 3 Flash (thinking-min)', provider: 'Google', lmarenaElo: 1464, codingScore: null, inputPrice: 0.5, outputPrice: 3.0, cachePrice: 0.05, batchDiscount: false, thinkingMode: 'minimal' },
{ name: 'GPT-5.1 High', provider: 'OpenAI', lmarenaElo: 1458, codingScore: 76.3, inputPrice: 30.0, outputPrice: 60.0, cachePrice: null, batchDiscount: false, thinkingMode: 'full' },
{ name: 'Gemini 2.5 Pro', provider: 'Google', lmarenaElo: 1451, codingScore: 53.6, inputPrice: 1.25, outputPrice: 10.0, cachePrice: 0.125, batchDiscount: false, thinkingMode: 'full' },
{ name: 'Sonnet 4.5 (thinking)', provider: 'Anthropic', lmarenaElo: 1450, codingScore: 77.2, inputPrice: 3.0, outputPrice: 15.0, cachePrice: 0.30, batchDiscount: true, thinkingMode: 'full' },
{ name: 'Sonnet 4.5', provider: 'Anthropic', lmarenaElo: 1450, codingScore: 77.2, inputPrice: 3.0, outputPrice: 15.0, cachePrice: 0.30, batchDiscount: true, thinkingMode: null },
{ name: 'GPT-4o', provider: 'OpenAI', lmarenaElo: 1440, codingScore: null, inputPrice: 2.5, outputPrice: 10.0, cachePrice: null, batchDiscount: false, thinkingMode: 'full' },
{ name: 'Haiku 4.5', provider: 'Anthropic', lmarenaElo: 1420, codingScore: 73.3, inputPrice: 1.0, outputPrice: 5.0, cachePrice: 0.10, batchDiscount: true, thinkingMode: null },
{ name: 'Gemini 2.5 Flash', provider: 'Google', lmarenaElo: 1408, codingScore: null, inputPrice: 0.30, outputPrice: 2.50, cachePrice: 0.03, batchDiscount: false, thinkingMode: 'full' },
{ name: 'Gemini 2.5 Flash-Lite', provider: 'Google', lmarenaElo: 1378, codingScore: null, inputPrice: 0.10, outputPrice: 0.40, cachePrice: 0.01, batchDiscount: false, thinkingMode: 'minimal' },
{ name: 'Sonnet 3.5', provider: 'Anthropic', lmarenaElo: 1372, codingScore: null, inputPrice: 3.0, outputPrice: 15.0, cachePrice: 0.30, batchDiscount: true, thinkingMode: null },
{ name: 'Gemini 2.0 Flash', provider: 'Google', lmarenaElo: 1360, codingScore: null, inputPrice: 0.10, outputPrice: 0.40, cachePrice: 0.025, batchDiscount: false, thinkingMode: 'full' },
{ name: 'Haiku 3.5', provider: 'Anthropic', lmarenaElo: 1323, codingScore: null, inputPrice: 0.80, outputPrice: 4.0, cachePrice: 0.08, batchDiscount: true, thinkingMode: null },
{ name: 'Opus 3', provider: 'Anthropic', lmarenaElo: 1322, codingScore: null, inputPrice: 15.0, outputPrice: 75.0, cachePrice: 1.50, batchDiscount: true, thinkingMode: null },
{ name: 'Gemini 1.5 Flash', provider: 'Google', lmarenaElo: 1310, codingScore: null, inputPrice: 0.13, outputPrice: 0.38, cachePrice: null, batchDiscount: false, thinkingMode: 'full' },
{ name: 'Sonnet 3', provider: 'Anthropic', lmarenaElo: 1281, codingScore: null, inputPrice: 3.0, outputPrice: 15.0, cachePrice: 0.30, batchDiscount: true, thinkingMode: null },
{ name: 'Haiku 3', provider: 'Anthropic', lmarenaElo: 1261, codingScore: null, inputPrice: 0.25, outputPrice: 1.25, cachePrice: 0.03, batchDiscount: true, thinkingMode: null },
];
// Provider colors: fill color for each provider's circles and legend swatch.
// Keys must match the `provider` strings used in the model records.
const providerColors = {
'Google': '#4285f4',
'Anthropic': '#d4a373',
'xAI': '#1da1f2',
'OpenAI': '#10a37f'
};
// Chart dimensions: width/height are the inner plotting area, i.e. the fixed
// 1200x600 canvas minus the margins reserved for axes and labels.
const margin = { top: 20, right: 120, bottom: 60, left: 60 };
const width = 1200 - margin.left - margin.right;
const height = 600 - margin.top - margin.bottom;
// Create SVG: size the <svg id="chart"> element to the full canvas, then keep a
// reference to an inner <g> shifted by the margins. Note that `svg` is that <g>,
// not the <svg> element, so everything appended to it uses plot-area coordinates.
const svg = d3.select('#chart')
.attr('width', width + margin.left + margin.right)
.attr('height', height + margin.top + margin.bottom)
.append('g')
.attr('transform', `translate(${margin.left},${margin.top})`);
// Scales - log scale INVERTED (expensive on left, cheap on right like Google's chart)
// Only the pixel ranges are fixed here; the domains are set on every updateChart() call.
const xScale = d3.scaleLog().range([width, 0]);
const yScale = d3.scaleLinear().range([height, 0]);
// Axes: empty groups that updateChart() fills in; the x axis sits at the bottom edge.
const xAxis = svg.append('g')
.attr('transform', `translate(0,${height})`);
const yAxis = svg.append('g');
// Axis labels
// The x label is static and centered below the axis.
svg.append('text')
.attr('x', width / 2)
.attr('y', height + 50)
.attr('text-anchor', 'middle')
.style('font-size', '14px')
.style('font-weight', '600')
.style('fill', '#ccc')
.text('$ Price per million tokens (log scale)');
// The y label is kept in a variable because updateChart() rewrites its text when
// the benchmark changes. After rotate(-90) the x attribute moves it along the
// vertical axis, hence the negative half-height.
const yAxisLabel = svg.append('text')
.attr('transform', 'rotate(-90)')
.attr('x', -height / 2)
.attr('y', -45)
.attr('text-anchor', 'middle')
.style('font-size', '14px')
.style('font-weight', '600')
.style('fill', '#ccc')
.text('LMArena Elo Score');
// Pareto frontier line: a dashed, unfilled path whose `d` attribute is set by
// updateChart(). It is appended here, before any circles or labels exist, so it
// comes earlier in document order and is painted beneath them.
const paretoLine = svg.append('path')
.attr('fill', 'none')
.attr('stroke', '#ff6b6b')
.attr('stroke-width', 2)
.attr('stroke-dasharray', '5,5');
// Tooltip: the absolutely positioned <div id="tooltip"> shown while hovering a circle.
const tooltip = d3.select('#tooltip');
/**
 * Estimate the USD cost of a workload for one model.
 *
 * Input tokens are split into cached and uncached portions; output tokens are
 * inflated by the thinking overhead according to the model's thinkingMode.
 * Batch processing (50% discount) is deliberately not modeled: it adds
 * significant latency and is unsuitable for interactive use cases.
 *
 * @param {object} model - Record with inputPrice, outputPrice and cachePrice
 *   (USD per million tokens) plus thinkingMode ('full', 'minimal' or null).
 *   A null or missing cachePrice means no cache discount is applied.
 * @param {number} inputTokens - Number of input tokens in the workload.
 * @param {number} outputTokens - Number of output tokens before thinking overhead.
 * @param {number} cacheHitRate - Percentage (0-100) of input tokens billed at cachePrice.
 * @param {number} thinkingOverhead - Extra output tokens, as a percentage of outputTokens.
 * @returns {number} Cost in USD.
 */
function calculateCost(model, inputTokens, outputTokens, cacheHitRate, thinkingOverhead) {
  const TOKENS_PER_PRICE_UNIT = 1000000; // prices are quoted per million tokens

  // `!= null` (rather than `!== null`) also covers a record that omits cachePrice,
  // which would otherwise multiply by undefined and turn the whole cost into NaN.
  const hasCachePricing = model.cachePrice != null;
  const cachedTokens = hasCachePricing ? inputTokens * (cacheHitRate / 100) : 0;
  const regularInputTokens = inputTokens - cachedTokens;

  // Apply thinking token overhead based on the model's thinkingMode property.
  let effectiveOutputTokens = outputTokens;
  if (model.thinkingMode === 'full') {
    // Full thinking models: full slider overhead
    effectiveOutputTokens = outputTokens * (1 + thinkingOverhead / 100);
  } else if (model.thinkingMode === 'minimal') {
    // Minimal thinking models: 25% of slider overhead
    effectiveOutputTokens = outputTokens * (1 + thinkingOverhead / 400);
  }
  // Non-thinking models (thinkingMode: null): no overhead applied

  let cost = (regularInputTokens * model.inputPrice + effectiveOutputTokens * model.outputPrice) / TOKENS_PER_PRICE_UNIT;
  if (cachedTokens > 0) {
    cost += (cachedTokens * model.cachePrice) / TOKENS_PER_PRICE_UNIT;
  }
  return cost;
}
/**
 * Select the Pareto-optimal points: those for which no other point is both
 * at most as expensive and strictly better scoring.
 *
 * Points are walked from cheapest to most expensive and a point is kept only
 * when it beats the best score seen so far. On the inverted x axis this traces
 * the frontier from lower-right (cheap) to upper-left (expensive).
 *
 * @param {Array<{cost: number, score: number}>} points - Candidates; not mutated.
 * @returns {Array<object>} The frontier points (same object references), ordered by ascending cost.
 */
function calculateParetoFrontier(points) {
  // Sort by cost ascending; on equal cost put the higher score first. Without
  // the tie-break, a lower-scoring point that happens to precede an equally
  // priced better one would be kept even though it is dominated.
  const cheapestFirst = [...points].sort((a, b) => a.cost - b.cost || b.score - a.score);

  const frontier = [];
  let bestScore = -Infinity;
  for (const point of cheapestFirst) {
    if (point.score > bestScore) {
      frontier.push(point);
      bestScore = point.score;
    }
  }
  return frontier;
}
/**
 * Recompute every model's cost from the current control values and redraw the
 * chart: scales, axes, Pareto frontier line, model circles and model labels.
 *
 * Reads the three sliders and the benchmark radio group from the DOM and draws
 * into the module-level `svg` group. Returns nothing.
 */
function updateChart() {
  // Get current settings (range inputs expose their value as a decimal string).
  const inputOutputRatio = Number.parseInt(document.getElementById('inputOutputRatio').value, 10);
  const cacheHitRate = Number.parseInt(document.getElementById('cacheHitRate').value, 10);
  const thinkingOverhead = Number.parseInt(document.getElementById('thinkingOverhead').value, 10);
  const benchmark = document.querySelector('input[name="benchmark"]:checked').value;

  // Split a fixed budget of 1M total tokens into input and output shares.
  // The slider value N means 1 input token for every N output tokens.
  const inputRatio = 1 / (1 + inputOutputRatio);
  const outputRatio = inputOutputRatio / (1 + inputOutputRatio);
  const inputTokens = 1000000 * inputRatio;
  const outputTokens = 1000000 * outputRatio;

  // Attach the computed cost and the score for the selected benchmark to each model.
  const allDataPoints = models.map(model => ({
    ...model,
    cost: calculateCost(model, inputTokens, outputTokens, cacheHitRate, thinkingOverhead),
    score: benchmark === 'lmarena' ? model.lmarenaElo : model.codingScore
  }));
  // Filter out models without scores for selected benchmark
  const dataPoints = allDataPoints.filter(d => d.score !== null);

  // Update Y-axis label based on benchmark
  yAxisLabel.text(benchmark === 'lmarena' ? 'LMArena Elo Score (General Chat)' : 'SWE-Bench Verified Score (%)');

  if (dataPoints.length === 0) {
    // d3.min/d3.max return undefined for an empty array, which would put NaN
    // into both scale domains. Clear the plotted marks instead of drawing garbage.
    paretoLine.attr('d', null);
    svg.selectAll('.model-circle').remove();
    svg.selectAll('.model-label').remove();
    return;
  }

  // Update scales
  const minCost = d3.min(dataPoints, d => d.cost);
  const maxCost = d3.max(dataPoints, d => d.cost);
  const minScore = d3.min(dataPoints, d => d.score);
  const maxScore = d3.max(dataPoints, d => d.score);
  // Log scale needs positive values, add padding
  xScale.domain([Math.max(minCost * 0.8, 0.01), maxCost * 1.2]);
  // Round the Y domain outward: to multiples of 50 for Elo, multiples of 10 for percentages.
  if (benchmark === 'lmarena') {
    yScale.domain([Math.floor(minScore / 50) * 50, Math.ceil(maxScore / 50) * 50]);
  } else {
    yScale.domain([Math.floor(minScore / 10) * 10, Math.ceil(maxScore / 10) * 10]);
  }

  // Update axes - let D3 handle log scale ticks properly
  xAxis.transition().duration(500)
    .call(d3.axisBottom(xScale)
      .ticks(10)
      .tickFormat(d => {
        // Fewer decimals for larger amounts
        if (d >= 1) return `$${d.toFixed(0)}`;
        if (d >= 0.1) return `$${d.toFixed(1)}`;
        return `$${d.toFixed(2)}`;
      }))
    .call(g => g.selectAll('.tick text').style('fill', '#ccc'))
    .call(g => g.selectAll('.tick line').style('stroke', '#555'))
    .call(g => g.select('.domain').style('stroke', '#555'));
  yAxis.transition().duration(500)
    .call(d3.axisLeft(yScale).ticks(10))
    .call(g => g.selectAll('.tick text').style('fill', '#ccc'))
    .call(g => g.selectAll('.tick line').style('stroke', '#555'))
    .call(g => g.select('.domain').style('stroke', '#555'));

  // Calculate Pareto frontier
  const frontierPoints = calculateParetoFrontier(dataPoints);

  // Update Pareto line
  const lineGenerator = d3.line()
    .x(d => xScale(d.cost))
    .y(d => yScale(d.score));
  paretoLine.transition().duration(500)
    .attr('d', lineGenerator(frontierPoints));

  // Update circles; keyed by model name so each circle follows its model across redraws.
  const circles = svg.selectAll('.model-circle')
    .data(dataPoints, d => d.name);
  circles.exit().remove();
  const circlesEnter = circles.enter()
    .append('circle')
    .attr('class', 'model-circle')
    .attr('r', 6)
    .attr('opacity', 0.8)
    .attr('stroke', '#fff')
    .attr('stroke-width', 2)
    .on('mouseover', function(event, d) {
      d3.select(this)
        .attr('r', 8)
        .attr('opacity', 1);
      // Handlers are attached once, when the circle is created, so the benchmark
      // must be re-read at hover time rather than captured from the enclosing call.
      const benchmark = document.querySelector('input[name="benchmark"]:checked').value;
      const scoreLabel = benchmark === 'lmarena' ? 'LMArena Elo' : 'SWE-Bench';
      tooltip
        .style('opacity', 1)
        .html(`
<strong>${d.name}</strong><br>
Provider: ${d.provider}<br>
${scoreLabel}: ${d.score}<br>
Cost: $${d.cost.toFixed(2)}/M tokens<br>
${d.cachePrice !== null ? 'Cache: ✓' : 'Cache: ✗'}
`)
        .style('left', (event.pageX + 10) + 'px')
        .style('top', (event.pageY - 10) + 'px');
    })
    .on('mouseout', function() {
      d3.select(this)
        .attr('r', 6)
        .attr('opacity', 0.8);
      tooltip.style('opacity', 0);
    });
  circles.merge(circlesEnter)
    .transition()
    .duration(500)
    .attr('cx', d => xScale(d.cost))
    .attr('cy', d => yScale(d.score))
    .attr('fill', d => providerColors[d.provider]);

  // Update labels for all models (make frontier models more prominent)
  const frontierSet = new Set(frontierPoints.map(d => d.name));
  const labels = svg.selectAll('.model-label')
    .data(dataPoints, d => d.name);
  labels.exit().remove();
  const labelsEnter = labels.enter()
    .append('text')
    .attr('class', 'model-label')
    .attr('font-size', '11px')
    .style('pointer-events', 'none');
  labels.merge(labelsEnter)
    .transition()
    .duration(500)
    .attr('x', d => xScale(d.cost) + 10)
    .attr('y', d => yScale(d.score) + 4)
    .attr('font-weight', d => frontierSet.has(d.name) ? '600' : '400')
    .attr('fill', d => frontierSet.has(d.name) ? '#e0e0e0' : '#888')
    .attr('opacity', d => frontierSet.has(d.name) ? 1 : 0.6)
    .text(d => d.name);
}
/**
 * Apply a named usage-pattern preset to the sliders and redraw.
 *
 * @param {string} preset - One of 'casual', 'coding', 'rag', 'docs', 'google'.
 *   An unknown name is reported with console.warn and otherwise ignored.
 */
function applyPreset(preset) {
  // ratio = output tokens per input token (the slider's 1:N), cache = hit rate in %.
  // NOTE(review): the slider only models 1:N mixes, so an input-heavy workload
  // cannot be expressed; 'docs' therefore sits at ratio 1.
  const presets = {
    casual: { ratio: 1, cache: 0, batch: false },
    coding: { ratio: 50, cache: 85, batch: false },
    rag: { ratio: 10, cache: 70, batch: false },
    docs: { ratio: 1, cache: 95, batch: false },
    google: { ratio: 5, cache: 0, batch: false }
  };
  if (!Object.prototype.hasOwnProperty.call(presets, preset)) {
    console.warn(`applyPreset: unknown preset "${preset}"`);
    return;
  }
  const config = presets[preset];
  document.getElementById('inputOutputRatio').value = config.ratio;
  document.getElementById('cacheHitRate').value = config.cache;
  // The page has no #batchMode control. Dereferencing the missing element used to
  // throw before the displays and chart were refreshed, so every preset button was
  // broken. Only touch the checkbox when it actually exists.
  const batchToggle = document.getElementById('batchMode');
  if (batchToggle) {
    batchToggle.checked = config.batch;
  }
  updateDisplays();
  updateChart();
}
/**
 * Refresh the text readouts under the three sliders from the sliders' current values.
 * The ratio is shown as `1:N`, the cache hit rate and thinking overhead as `N%`.
 */
function updateDisplays() {
  // Range inputs expose their value as a string; parse explicitly as base 10.
  const readSlider = (id) => Number.parseInt(document.getElementById(id).value, 10);
  const ratio = readSlider('inputOutputRatio');
  const cache = readSlider('cacheHitRate');
  const thinking = readSlider('thinkingOverhead');

  document.getElementById('ratioDisplay').textContent = `1:${ratio}`;
  document.getElementById('cacheDisplay').textContent = `${cache}%`;
  document.getElementById('thinkingDisplay').textContent = `${thinking}%`;
}
// Build the legend: one colored swatch plus provider name for each distinct
// provider, in the order providers first appear in the model list.
function createLegend() {
  const container = document.getElementById('legend');
  const distinctProviders = new Set(models.map(({ provider }) => provider));

  for (const providerName of distinctProviders) {
    const row = document.createElement('div');
    row.className = 'legend-item';

    const swatch = document.createElement('div');
    swatch.className = 'legend-circle';
    swatch.style.backgroundColor = providerColors[providerName];

    const caption = document.createElement('span');
    caption.textContent = providerName;

    row.appendChild(swatch);
    row.appendChild(caption);
    container.appendChild(row);
  }
}
// Wire up the controls: moving any slider refreshes both the text readouts and the chart.
['inputOutputRatio', 'cacheHitRate', 'thinkingOverhead'].forEach(sliderId => {
  document.getElementById(sliderId).addEventListener('input', () => {
    updateDisplays();
    updateChart();
  });
});
// Switching benchmark changes no slider readout, so a redraw alone is enough.
for (const benchmarkRadio of document.querySelectorAll('input[name="benchmark"]')) {
  benchmarkRadio.addEventListener('change', updateChart);
}
// First render: legend once, then readouts and chart from the sliders' initial values.
createLegend();
updateDisplays();
updateChart();
</script>
</body>
</html>
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment