When calling nvidia/nemotron-parse with tool_choice: detection_only, the API returns bounding boxes but no text content. This causes two problems:
- Bounding box geometry differs from what the NVIDIA Build demo produces on the same image.
- Page text is always empty, breaking any downstream text extraction.
The correct tool is markdown_bbox, which returns both bounding boxes and inline text for every detected element.
Set your NVIDIA Build API key in your environment before running:
export NVIDIA_BUILD_API_KEY="nvapi-..."

Then run:
import base64
import json
import os
import requests
from openai import OpenAI
from PIL import Image, ImageDraw
# OpenAI-compatible client pointed at the NVIDIA Build endpoint.
# Raises KeyError immediately if NVIDIA_BUILD_API_KEY is not exported.
client = OpenAI(
    base_url="https://integrate.api.nvidia.com/v1",
    api_key=os.environ["NVIDIA_BUILD_API_KEY"],
)

# Download the test PDF and render page 1 to PNG (requires pymupdf: pip install pymupdf requests)
import requests
import fitz

pdf_response = requests.get(
    "https://github.com/dburkhardt/nemotron-parse-repro/releases/download/v1.0/cai2024surfaceproteinprofiling.pdf",
    timeout=60,  # requests waits indefinitely when no timeout is given
)
# Fail here on an HTTP error instead of handing an error page to the PDF parser.
pdf_response.raise_for_status()
pdf_bytes = pdf_response.content

# The context manager closes the document once the page has been rendered.
with fitz.open(stream=pdf_bytes, filetype="pdf") as doc:
    # Zoom factor 150/72 on both axes (PDF user space is 72 units per inch, so ~150 DPI).
    pix = doc[0].get_pixmap(matrix=fitz.Matrix(150/72, 150/72))
    pix.save("page.png")

# Embed the rendered page as a base64 data URI for the image_url content part.
with open("page.png", "rb") as f:
    data_uri = "data:image/png;base64," + base64.b64encode(f.read()).decode()

# Single user message carrying only the page image; reused for every request below.
message = {"role": "user", "content": [{"type": "image_url", "image_url": {"url": data_uri}}]}
def draw_boxes(png_path, items, output_path):
    """Draw every item's bounding box and type label onto a copy of an image.

    Args:
        png_path: Path of the image to annotate.
        items: Iterable of dicts. ``item["bbox"]`` is ``[x0, y0, x1, y1]``
            expressed as fractions of the image width/height (each value is
            multiplied by the pixel size before drawing). ``item["type"]``,
            when present, is drawn as the label.
        output_path: Where the annotated image is written.

    Items without a ``bbox`` are skipped, since there is nothing to draw.
    """
    # convert() returns an independent copy, so the source file can be closed
    # as soon as the pixels have been read.
    with Image.open(png_path) as src:
        img = src.convert("RGB")
    draw = ImageDraw.Draw(img)
    w, h = img.size
    for item in items:
        bbox = item.get("bbox")
        if not bbox:
            continue
        x0, y0, x1, y1 = bbox
        # ImageDraw.rectangle requires x1 >= x0 and y1 >= y0; order the corners
        # so an inverted box from the model does not abort the whole drawing.
        left, right = sorted((x0 * w, x1 * w))
        top, bottom = sorted((y0 * h, y1 * h))
        draw.rectangle([left, top, right, bottom], outline="red", width=2)
        label = item.get("type") or ""
        draw.text((left + 2, top + 2), label, fill="red")
    img.save(output_path)
    print(f"Saved {output_path}")
def parse_response(resp):
    """Extract the detected elements from a chat-completion tool call.

    Reads ``resp.choices[0].message.tool_calls[0].function.arguments`` (a JSON
    string) and returns its elements with every ``bbox`` normalized to a
    ``[x0, y0, x1, y1]`` list.

    Accepted payload shapes:
      * a flat list of element dicts;
      * a dict holding the list under ``"elements"`` or ``"items"``;
      * a list of lists of element dicts, flattened one level
        (NOTE(review): presumably one inner list per input image; confirm
        against the API reference).

    Args:
        resp: Response object exposing the attribute path above.

    Returns:
        A list of element dicts. Elements without a ``bbox`` are returned
        unchanged.

    Raises:
        ValueError: If the first choice carries no tool call.
        json.JSONDecodeError: If the tool-call arguments are not valid JSON.
    """
    tool_calls = resp.choices[0].message.tool_calls
    if not tool_calls:
        # Without this guard the failure is an opaque "NoneType is not subscriptable".
        raise ValueError("response contains no tool call to parse")
    args = json.loads(tool_calls[0].function.arguments)
    # API returns either a flat list or a dict with an elements/items key
    raw = args if isinstance(args, list) else args.get("elements", args.get("items", []))

    items = []
    for entry in raw or []:
        if isinstance(entry, list):
            items.extend(entry)
        else:
            items.append(entry)

    # Normalize bbox: API may return {xmin, ymin, xmax, ymax} or [x0, y0, x1, y1]
    for item in items:
        if not isinstance(item, dict):
            continue
        b = item.get("bbox")
        if isinstance(b, dict):
            item["bbox"] = [b["xmin"], b["ymin"], b["xmax"], b["ymax"]]
    return items
# ❌ BROKEN: returns bboxes only, no text, different geometry than Build
# The same spec is sent both as the declared tool and as the forced tool_choice.
detection_tool = {"type": "function", "function": {"name": "detection_only"}}
broken_resp = client.chat.completions.create(
    model="nvidia/nemotron-parse",
    messages=[message],
    tools=[detection_tool],
    tool_choice=detection_tool,
    temperature=0,
)
broken_items = parse_response(broken_resp)
draw_boxes("page.png", broken_items, "out_detection_only.png")

# ✅ FIXED: returns bboxes + inline text, matches Build geometry
markdown_tool = {"type": "function", "function": {"name": "markdown_bbox"}}
fixed_resp = client.chat.completions.create(
    model="nvidia/nemotron-parse",
    messages=[message],
    tools=[markdown_tool],
    tool_choice=markdown_tool,
    temperature=0,
)
fixed_items = parse_response(fixed_resp)
draw_boxes("page.png", fixed_items, "out_markdown_bbox.png")

# Print the first element from each mode for a side-by-side comparison.
print(f"detection_only element 0: {broken_items[0]}")
print(f"markdown_bbox element 0: {fixed_items[0]}")

Compare out_detection_only.png and out_markdown_bbox.png to see the difference visually.
| Tool | bbox ymax | text |
|---|---|---|
| detection_only | 0.0543 | (empty) |
| markdown_bbox | 0.1054 | "Multiomic single-cell..." |
detection_only — each element in tool_calls[0].function.arguments:
{"bbox": [0.075, 0.032, 0.926, 0.054], "type": "Page-header"}

markdown_bbox — each element includes text:
{"bbox": [0.075, 0.032, 0.926, 0.105], "type": "Page-header", "text": "Multiomic single-cell..."}

Use markdown_bbox instead of detection_only. The NVIDIA API documentation uses markdown_bbox in all primary examples. The third available mode, markdown_no_bbox, returns text without coordinates and is not useful for layout analysis.
The PDF used to produce the observations above (cai2024surfaceproteinprofiling.pdf) is available for download: