Created
January 8, 2026 09:14
-
-
Save vkryukov/d1b9b854abd778c4b6462d0d322f6b6b to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| from __future__ import annotations | |
| import base64 | |
| from openai import OpenAI | |
# Inline PDF payload used as the attachment for the request built in main().
# The document declares a single page (/Count 1) whose content stream draws
# the text "Hello World" with font /F1 (Helvetica) at size 24.
# NOTE(review): the xref table and startxref hold byte offsets into this
# literal, so any whitespace change shifts them. As the literal appears here,
# the offsets listed for objects 1-3 match their positions, but the entries
# for objects 4 and 5 (266, 359) and the startxref value (434) do not appear
# to match the actual byte positions - confirm against the original file,
# since a strict PDF parser may reject or mis-read the document.
| PDF_CONTENT = b"""%PDF-1.4 | |
| 1 0 obj << /Type /Catalog /Pages 2 0 R >> endobj | |
| 2 0 obj << /Type /Pages /Kids [3 0 R] /Count 1 >> endobj | |
| 3 0 obj << /Type /Page /Parent 2 0 R /MediaBox [0 0 612 792] /Contents 4 0 R /Resources << /Font << /F1 5 0 R >> >> >> endobj | |
| 4 0 obj << /Length 44 >> stream | |
| BT /F1 24 Tf 100 700 Td (Hello World) Tj ET | |
| endstream endobj | |
| 5 0 obj << /Type /Font /Subtype /Type1 /BaseFont /Helvetica >> endobj | |
| xref | |
| 0 6 | |
| 0000000000 65535 f | |
| 0000000009 00000 n | |
| 0000000058 00000 n | |
| 0000000115 00000 n | |
| 0000000266 00000 n | |
| 0000000359 00000 n | |
| trailer << /Size 6 /Root 1 0 R >> | |
| startxref | |
| 434 | |
| %%EOF | |
| """ | |
def extract_output_text(response) -> str:
    """Return the text carried by a response object.

    The ``output_text`` attribute is used when it is present and non-empty.
    Otherwise the text is rebuilt by concatenating, in order, the ``text`` of
    every part whose ``type`` is ``"output_text"`` inside every item of
    ``response.output`` whose ``type`` is ``"message"``.

    Args:
        response: Any object that may expose ``output_text`` and/or
            ``output``. Missing or ``None`` attributes are treated as empty
            rather than raising ``AttributeError``.

    Returns:
        The extracted text, or an empty string when no text is found.
    """
    text = getattr(response, "output_text", None)
    if text:
        return text

    # Fallback: walk the structured output. Every attribute is read with
    # getattr so that items without ``type``/``content`` (or a response
    # without ``output``) are skipped instead of aborting the extraction.
    return "".join(
        getattr(part, "text", None) or ""
        for item in getattr(response, "output", None) or []
        if getattr(item, "type", None) == "message"
        for part in getattr(item, "content", None) or []
        if getattr(part, "type", None) == "output_text"
    )
def main() -> None:
    """Send the embedded PDF to the Responses API and print the result.

    The PDF bytes in ``PDF_CONTENT`` are base64-encoded into a
    ``data:application/pdf;base64,...`` URL and sent as an ``input_file``
    content part, followed by an ``input_text`` part asking for the PDF's
    text. The response's status, model and extracted text are printed to
    stdout.
    """
    encoded_pdf = base64.b64encode(PDF_CONTENT).decode("ascii")
    file_data = "data:application/pdf;base64," + encoded_pdf

    # The client is built with no explicit arguments, so any credentials or
    # configuration must come from the library's defaults.
    client = OpenAI()
    response = client.responses.create(
        model="gpt-5.2",
        input=[
            {
                "role": "user",
                "content": [
                    # The file part precedes the instruction that refers to it.
                    {"type": "input_file", "file_data": file_data, "filename": "test.pdf"},
                    {"type": "input_text", "text": "Reply with the text from the attached pdf file"},
                ],
            }
        ],
    )

    print("status:", response.status)
    print("model:", response.model)
    print("output:", extract_output_text(response).strip())
# Run the reproduction only when this file is executed as a script, so that
# importing the module does not trigger the API request.
if __name__ == "__main__":
    main()
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| ❯ .venv/bin/python openai_pdf_repro.py | |
| status: completed | |
| model: gpt-5.2-2025-12-11 | |
| output: I can do that, but I don’t yet have the PDF attached in this chat. | |
| Please upload the PDF (or paste a link to it), and tell me whether you want: | |
| 1) **All text** exactly as-is, or | |
| 2) A **cleaned** extraction (no headers/footers/page numbers), or | |
| 3) A **summary** instead. |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment