Skip to content

Instantly share code, notes, and snippets.

@dburkhardt
Last active February 16, 2026 15:22
Show Gist options
  • Select an option

  • Save dburkhardt/33fdb764f5089ce6a8ee73879127f98d to your computer and use it in GitHub Desktop.

Select an option

Save dburkhardt/33fdb764f5089ce6a8ee73879127f98d to your computer and use it in GitHub Desktop.
NVIDIA inference API advertises 1M context for Opus 4.6 but backend rejects >200K tokens (both aws/ and us/aws/ model IDs)
#!/usr/bin/env python3
"""Repro: Opus 4.6 advertises 1M context on inference.nvidia.com but rejects >200K.

Sends a single ~950K-token chat completion request to the NVIDIA inference API
and prints either the model's reply (SUCCESS) or the API error message (FAILED).
Requires an API key in NVIDIA_API_KEY or NVIDIA_INFERENCE_KEY.
"""
import json
import os
import sys
import urllib.error
import urllib.request


def main() -> None:
    """Run the oversized-context request and report the outcome."""
    # Accept either env var name; exit with a hint if neither is set.
    api_key = os.environ.get("NVIDIA_API_KEY") or os.environ.get("NVIDIA_INFERENCE_KEY")
    if not api_key:
        sys.exit("Set NVIDIA_API_KEY env var (from https://inference.nvidia.com)")

    # ~950K tokens of filler (well under advertised 1M limit)
    # 3.8M chars / ~4 chars per token ≈ 950K tokens (API reported 928,958 actual)
    filler = ("The quick brown fox jumps over the lazy dog. " * 85_000)[:3_800_000]

    payload = json.dumps({
        "model": "aws/anthropic/bedrock-claude-opus-4-6",
        "messages": [
            {"role": "user", "content": filler + "\n\nReply with 'OK'"},
        ],
        "max_tokens": 10,
    }).encode()

    print(f"Sending ~950K tokens to Opus 4.6 ({len(payload)/1e6:.1f} MB)...")

    req = urllib.request.Request(
        "https://inference-api.nvidia.com/v1/chat/completions",
        data=payload,
        headers={"Authorization": f"Bearer {api_key}", "Content-Type": "application/json"},
    )
    try:
        # Long timeout: the backend may take a while on a multi-MB request body.
        with urllib.request.urlopen(req, timeout=300) as r:
            print("SUCCESS:", json.loads(r.read())["choices"][0]["message"]["content"])
    except urllib.error.HTTPError as e:
        # Error bodies are usually JSON with error.message, but fall back to the
        # raw body so a non-JSON gateway error doesn't crash the repro itself.
        body = e.read().decode("utf-8", "replace")
        try:
            msg = json.loads(body)["error"]["message"]
        except (ValueError, KeyError, TypeError):
            msg = body
        print("FAILED:", msg)


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment