Last active
February 16, 2026 15:22
-
-
Save dburkhardt/33fdb764f5089ce6a8ee73879127f98d to your computer and use it in GitHub Desktop.
NVIDIA inference API advertises 1M context for Opus 4.6 but backend rejects >200K tokens (both aws/ and us/aws/ model IDs)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3
"""Opus 4.6 advertises 1M context on inference.nvidia.com but rejects >200K.

Reproduction script: sends one chat-completion request whose single user
message is padded with filler text (3.8M characters by default) and prints
either the model's reply (``SUCCESS: ...``) or the error the API returned
(``FAILED: ...``).

The API key is read from the ``NVIDIA_API_KEY`` environment variable, falling
back to ``NVIDIA_INFERENCE_KEY``.

Exit status: 0 when the server answered (with a completion or an HTTP error,
both of which are valid outcomes of the reproduction), 1 when the request
could not be completed at the transport level, and a non-zero status with a
usage hint when no API key is set.
"""
import json
import os
import sys
import urllib.error
import urllib.request

API_KEY = os.environ.get("NVIDIA_API_KEY") or os.environ.get("NVIDIA_INFERENCE_KEY")

API_URL = "https://inference-api.nvidia.com/v1/chat/completions"
MODEL_ID = "aws/anthropic/bedrock-claude-opus-4-6"
FILLER_SENTENCE = "The quick brown fox jumps over the lazy dog. "
# ~950K tokens of filler (well under advertised 1M limit)
# 3.8M chars / ~4 chars per token ≈ 950K tokens (API reported 928,958 actual)
FILLER_CHARS = 3_800_000
REQUEST_TIMEOUT_S = 300
# Cap on how much of an unparseable response body is echoed to the terminal.
MAX_RAW_ECHO_CHARS = 500


def make_filler(n_chars, sentence=FILLER_SENTENCE):
    """Return exactly ``n_chars`` characters made of ``sentence`` repeated.

    Args:
        n_chars: Desired length of the result; values <= 0 yield ``""``.
        sentence: Non-empty text to repeat.

    Raises:
        ValueError: If ``sentence`` is empty.
    """
    if not sentence:
        raise ValueError("sentence must be non-empty")
    if n_chars <= 0:
        return ""
    # Ceiling division: the fewest repeats that cover n_chars before slicing.
    repeats = -(-n_chars // len(sentence))
    return (sentence * repeats)[:n_chars]


def build_payload(n_chars=FILLER_CHARS, model=MODEL_ID):
    """Return the UTF-8 encoded JSON request body.

    The body holds a single user message: ``n_chars`` characters of filler
    followed by a short instruction to reply with 'OK'.
    """
    body = {
        "model": model,
        "messages": [
            {"role": "user", "content": make_filler(n_chars) + "\n\nReply with 'OK'"},
        ],
        "max_tokens": 10,
    }
    return json.dumps(body).encode()


def describe_error(raw, fallback):
    """Return a human-readable message for an HTTP error response body.

    Prefers ``error.message`` from a JSON body. If the body is not JSON or
    lacks that field (for example an HTML page from an intermediate proxy),
    returns the start of the raw text instead, or ``fallback`` when the body
    is empty.

    Args:
        raw: Response body as bytes.
        fallback: Text to return when nothing usable is found in ``raw``.
    """
    text = raw.decode("utf-8", errors="replace").strip()
    try:
        parsed = json.loads(text)
    except ValueError:
        return text[:MAX_RAW_ECHO_CHARS] or fallback
    if isinstance(parsed, dict):
        err = parsed.get("error")
        if isinstance(err, dict) and "message" in err:
            return str(err["message"])
        if isinstance(err, str):
            return err
    return text[:MAX_RAW_ECHO_CHARS] or fallback


def extract_reply(raw):
    """Return ``choices[0].message.content`` from a completion response body.

    If the body is not JSON or does not have that structure, returns a note
    containing the start of the raw body rather than raising.
    """
    try:
        return json.loads(raw)["choices"][0]["message"]["content"]
    except (ValueError, LookupError, TypeError):
        snippet = raw.decode("utf-8", errors="replace")[:MAX_RAW_ECHO_CHARS]
        return f"(unexpected response body) {snippet}"


def main():
    """Send the oversized request and print the outcome; return an exit status."""
    if not API_KEY:
        sys.exit("Set NVIDIA_API_KEY env var (from https://inference.nvidia.com)")

    payload = build_payload()
    print(f"Sending ~950K tokens to Opus 4.6 ({len(payload)/1e6:.1f} MB)...")

    req = urllib.request.Request(
        API_URL,
        data=payload,
        headers={"Authorization": f"Bearer {API_KEY}", "Content-Type": "application/json"},
    )
    try:
        with urllib.request.urlopen(req, timeout=REQUEST_TIMEOUT_S) as r:
            body = r.read()
    except urllib.error.HTTPError as e:
        # HTTPError subclasses URLError/OSError, so it must be handled first.
        print("FAILED:", describe_error(e.read(), f"HTTP {e.code} {e.reason}"))
        return 0
    except OSError as e:
        # Covers URLError (DNS, refused connection) and socket timeouts.
        print("FAILED:", getattr(e, "reason", e))
        return 1

    print("SUCCESS:", extract_reply(body))
    return 0


if __name__ == "__main__":
    sys.exit(main())
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment