Skip to content

Instantly share code, notes, and snippets.

@vkryukov
Created January 8, 2026 09:14
Show Gist options
  • Select an option

  • Save vkryukov/d1b9b854abd778c4b6462d0d322f6b6b to your computer and use it in GitHub Desktop.

Select an option

Save vkryukov/d1b9b854abd778c4b6462d0d322f6b6b to your computer and use it in GitHub Desktop.
from __future__ import annotations
import base64
from openai import OpenAI
# Hand-written, minimal single-page PDF used as the attachment for the repro.
# Objects: 1 = catalog, 2 = page tree, 3 = the one 612x792 page,
# 4 = content stream drawing "Hello World" in 24pt text at (100, 700),
# 5 = the Helvetica Type1 font referenced as /F1.
# NOTE(review): with the bytes exactly as written here, object 3's line is
# 126 bytes long, so object 4 would start at byte 241 rather than the 266
# recorded in the xref table (later offsets and startxref shift too).
# Confirm whether whitespace was lost from this literal or the table needs
# regenerating before relying on strict PDF parsers.
PDF_CONTENT = b"""%PDF-1.4
1 0 obj << /Type /Catalog /Pages 2 0 R >> endobj
2 0 obj << /Type /Pages /Kids [3 0 R] /Count 1 >> endobj
3 0 obj << /Type /Page /Parent 2 0 R /MediaBox [0 0 612 792] /Contents 4 0 R /Resources << /Font << /F1 5 0 R >> >> >> endobj
4 0 obj << /Length 44 >> stream
BT /F1 24 Tf 100 700 Td (Hello World) Tj ET
endstream endobj
5 0 obj << /Type /Font /Subtype /Type1 /BaseFont /Helvetica >> endobj
xref
0 6
0000000000 65535 f
0000000009 00000 n
0000000058 00000 n
0000000115 00000 n
0000000266 00000 n
0000000359 00000 n
trailer << /Size 6 /Root 1 0 R >>
startxref
434
%%EOF
"""
def extract_output_text(response) -> str:
    """Return the text carried by a response object as a single string.

    The ``output_text`` attribute is used when it is present and non-empty.
    Otherwise the function walks ``response.output`` and concatenates the
    ``text`` of every part whose type is ``"output_text"`` inside items
    whose type is ``"message"``.

    Args:
        response: Object exposing ``output_text`` and/or ``output``.

    Returns:
        The collected text, or ``""`` when no text could be found.
    """
    text = getattr(response, "output_text", None)
    if text:
        return text

    # Fallback path: every lookup is guarded so that items or parts of other
    # kinds (which may lack ``type``/``content``/``text``) are skipped
    # instead of raising AttributeError.
    text_parts = []
    for item in getattr(response, "output", None) or []:
        if getattr(item, "type", None) != "message":
            continue
        for part in getattr(item, "content", None) or []:
            if getattr(part, "type", None) != "output_text":
                continue
            part_text = getattr(part, "text", None)
            if part_text:  # skip None/empty so "".join never sees a non-str
                text_parts.append(part_text)
    return "".join(text_parts)
def main() -> None:
    """Send the embedded PDF plus a text prompt and print the reply.

    Encodes ``PDF_CONTENT`` as a base64 ``data:`` URL, submits it together
    with a short instruction through ``client.responses.create``, and prints
    the response status, the model name, and the stripped output text.
    """
    encoded_pdf = base64.b64encode(PDF_CONTENT).decode("ascii")
    file_data = "data:application/pdf;base64," + encoded_pdf

    # The user turn carries two content parts: the file, then the request.
    attachment = {
        "type": "input_file",
        "file_data": file_data,
        "filename": "test.pdf",
    }
    prompt = {
        "type": "input_text",
        "text": "Reply with the text from the attached pdf file",
    }
    user_message = {"role": "user", "content": [attachment, prompt]}

    client = OpenAI()
    response = client.responses.create(model="gpt-5.2", input=[user_message])

    print("status:", response.status)
    print("model:", response.model)
    print("output:", extract_output_text(response).strip())


if __name__ == "__main__":
    main()
❯ .venv/bin/python openai_pdf_repro.py
status: completed
model: gpt-5.2-2025-12-11
output: I can do that, but I don’t yet have the PDF attached in this chat.
Please upload the PDF (or paste a link to it), and tell me whether you want:
1) **All text** exactly as-is, or
2) A **cleaned** extraction (no headers/footers/page numbers), or
3) A **summary** instead.
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment