vkryukov/openai_pdf_repro.py

## openai_pdf_repro.py
from __future__ import annotations

import base64

from openai import OpenAI

def _build_pdf_content() -> bytes:
    """Assemble a minimal one-page PDF that draws the text "Hello World".

    The cross-reference offsets, the ``startxref`` pointer and the stream
    ``/Length`` are computed from the bytes actually emitted instead of being
    hard-coded, so the table always agrees with the object positions.

    Returns:
        The complete PDF document as bytes.
    """
    # Content stream: select font F1 at 24pt, move to (100, 700), show text.
    stream = b"BT /F1 24 Tf 100 700 Td (Hello World) Tj ET"

    # Object bodies in object-number order (object 1 first).
    bodies = [
        b"<< /Type /Catalog /Pages 2 0 R >>",
        b"<< /Type /Pages /Kids [3 0 R] /Count 1 >>",
        (
            b"<< /Type /Page /Parent 2 0 R /MediaBox [0 0 612 792] "
            b"/Contents 4 0 R /Resources << /Font << /F1 5 0 R >> >> >>"
        ),
        # /Length excludes the end-of-line marker that precedes "endstream".
        b"<< /Length %d >>\nstream\n%b\nendstream" % (len(stream), stream),
        b"<< /Type /Font /Subtype /Type1 /BaseFont /Helvetica >>",
    ]

    out = bytearray(b"%PDF-1.4\n")
    offsets = []
    for number, body in enumerate(bodies, start=1):
        # Record where each object starts before appending it.
        offsets.append(len(out))
        out += b"%d 0 obj\n%b\nendobj\n" % (number, body)

    xref_offset = len(out)
    size = len(bodies) + 1  # +1 for the mandatory free entry for object 0

    out += b"xref\n0 %d\n" % size
    # Each xref entry must be exactly 20 bytes, hence the space before "\n".
    out += b"0000000000 65535 f \n"
    for offset in offsets:
        out += b"%010d 00000 n \n" % offset

    out += b"trailer\n<< /Size %d /Root 1 0 R >>\n" % size
    out += b"startxref\n%d\n%%%%EOF\n" % xref_offset
    return bytes(out)


# Minimal PDF attached to the request in main().
PDF_CONTENT = _build_pdf_content()


def extract_output_text(response) -> str:
    """Return the text carried by *response*.

    A truthy ``output_text`` attribute is returned as-is. Otherwise the
    ``text`` of every ``output_text`` part found inside ``message`` items of
    ``response.output`` is concatenated in order; the result is ``""`` when
    there are none.
    """
    shortcut = getattr(response, "output_text", None)
    if shortcut:
        return shortcut

    fragments = []
    for entry in response.output or []:
        # Only "message" items carry content parts we care about.
        if entry.type != "message":
            continue
        for piece in entry.content or []:
            if piece.type != "output_text":
                continue
            fragments.append(piece.text)

    return "".join(fragments)


def main() -> None:
    """Send PDF_CONTENT inline to the Responses API and print the reply.

    The PDF is base64-encoded into a ``data:application/pdf`` URL and sent as
    an ``input_file`` part next to a text prompt. The response status, the
    model name and the extracted output text are printed to stdout.
    """
    encoded_pdf = base64.b64encode(PDF_CONTENT).decode("ascii")
    file_data = f"data:application/pdf;base64,{encoded_pdf}"

    attachment = {"type": "input_file", "file_data": file_data, "filename": "test.pdf"}
    prompt = {"type": "input_text", "text": "Reply with the text from the attached pdf file"}
    user_message = {"role": "user", "content": [attachment, prompt]}

    client = OpenAI()
    response = client.responses.create(model="gpt-5.2", input=[user_message])

    print("status:", response.status)
    print("model:", response.model)
    reply = extract_output_text(response).strip()
    print("output:", reply)


# Run the repro only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()

## response
❯ .venv/bin/python openai_pdf_repro.py
status: completed
model: gpt-5.2-2025-12-11
output: I can do that, but I don’t yet have the PDF attached in this chat.

Please upload the PDF (or paste a link to it), and tell me whether you want:
1) **All text** exactly as-is, or
2) A **cleaned** extraction (no headers/footers/page numbers), or
3) A **summary** instead.
	from __future__ import annotations

	import base64

	from openai import OpenAI

	def _build_pdf_content() -> bytes:
	    """Assemble a minimal one-page PDF that draws the text "Hello World".

	    The cross-reference offsets, the ``startxref`` pointer and the stream
	    ``/Length`` are computed from the bytes actually emitted instead of being
	    hard-coded, so the table always agrees with the object positions.

	    Returns:
	        The complete PDF document as bytes.
	    """
	    # Content stream: select font F1 at 24pt, move to (100, 700), show text.
	    stream = b"BT /F1 24 Tf 100 700 Td (Hello World) Tj ET"

	    # Object bodies in object-number order (object 1 first).
	    bodies = [
	        b"<< /Type /Catalog /Pages 2 0 R >>",
	        b"<< /Type /Pages /Kids [3 0 R] /Count 1 >>",
	        (
	            b"<< /Type /Page /Parent 2 0 R /MediaBox [0 0 612 792] "
	            b"/Contents 4 0 R /Resources << /Font << /F1 5 0 R >> >> >>"
	        ),
	        # /Length excludes the end-of-line marker that precedes "endstream".
	        b"<< /Length %d >>\nstream\n%b\nendstream" % (len(stream), stream),
	        b"<< /Type /Font /Subtype /Type1 /BaseFont /Helvetica >>",
	    ]

	    out = bytearray(b"%PDF-1.4\n")
	    offsets = []
	    for number, body in enumerate(bodies, start=1):
	        # Record where each object starts before appending it.
	        offsets.append(len(out))
	        out += b"%d 0 obj\n%b\nendobj\n" % (number, body)

	    xref_offset = len(out)
	    size = len(bodies) + 1  # +1 for the mandatory free entry for object 0

	    out += b"xref\n0 %d\n" % size
	    # Each xref entry must be exactly 20 bytes, hence the space before "\n".
	    out += b"0000000000 65535 f \n"
	    for offset in offsets:
	        out += b"%010d 00000 n \n" % offset

	    out += b"trailer\n<< /Size %d /Root 1 0 R >>\n" % size
	    out += b"startxref\n%d\n%%%%EOF\n" % xref_offset
	    return bytes(out)


	# Minimal PDF attached to the request in main().
	PDF_CONTENT = _build_pdf_content()


	def extract_output_text(response) -> str:
	    """Return the text carried by *response*.

	    A truthy ``output_text`` attribute is returned as-is. Otherwise the
	    ``text`` of every ``output_text`` part found inside ``message`` items of
	    ``response.output`` is concatenated in order; the result is ``""`` when
	    there are none.
	    """
	    text = getattr(response, "output_text", None)
	    if text:
	        return text

	    text_parts = []
	    for item in response.output or []:
	        if item.type == "message":
	            for part in item.content or []:
	                if part.type == "output_text":
	                    text_parts.append(part.text)

	    return "".join(text_parts)


	def main() -> None:
	    """Send PDF_CONTENT inline to the Responses API and print the reply.

	    The PDF is base64-encoded into a ``data:application/pdf`` URL and sent as
	    an ``input_file`` part next to a text prompt. The response status, the
	    model name and the extracted output text are printed to stdout.
	    """
	    file_data = "data:application/pdf;base64," + base64.b64encode(PDF_CONTENT).decode("ascii")

	    client = OpenAI()
	    response = client.responses.create(
	        model="gpt-5.2",
	        input=[
	            {
	                "role": "user",
	                "content": [
	                    {"type": "input_file", "file_data": file_data, "filename": "test.pdf"},
	                    {"type": "input_text", "text": "Reply with the text from the attached pdf file"},
	                ],
	            }
	        ],
	    )

	    print("status:", response.status)
	    print("model:", response.model)
	    print("output:", extract_output_text(response).strip())


	# Run the repro only when this file is executed as a script, not on import.
	if __name__ == "__main__":
	    main()
	❯ .venv/bin/python openai_pdf_repro.py
	status: completed
	model: gpt-5.2-2025-12-11
	output: I can do that, but I don’t yet have the PDF attached in this chat.

	Please upload the PDF (or paste a link to it), and tell me whether you want:
	1) All text exactly as-is, or
	2) A cleaned extraction (no headers/footers/page numbers), or
	3) A summary instead.