Last active
March 8, 2026 03:16
-
-
Save snowfluke/6faa92771ceaf41945c1821829b0a301 to your computer and use it in GitHub Desktop.
Convert a flattened/scanned PDF into a searchable PDF
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| // curl -fsSL https://bun.sh/install | bash | |
| // bun add ppu-pdf ppu-paddle-ocr onnxruntime-node | |
| // Run it: bun run index.ts | |
| import { PaddleOcrService } from "ppu-paddle-ocr"; | |
| import { PdfReader } from "ppu-pdf"; | |
| export const MODEL_BASE_URL = | |
| "https://media.githubusercontent.com/media/PT-Perkasa-Pilar-Utama/ppu-paddle-ocr-models/main"; | |
| export const DICT_BASE_URL = | |
| "https://raw.githubusercontent.com/PT-Perkasa-Pilar-Utama/ppu-paddle-ocr-models/main"; | |
| const pdfReader = new PdfReader({ verbose: false }); | |
| // Tweak the model variant and dictionary to balance the accuracy and performance. | |
| // Note that the dictionary should match the recognition model, otherwise the OCR results will be inaccurate. | |
| const ocr = new PaddleOcrService({ | |
| model: { | |
| detection: `${MODEL_BASE_URL}/detection/PP-OCRv5_mobile_det_infer.onnx`, | |
| recognition: `${MODEL_BASE_URL}/recognition/PP-OCRv5_mobile_rec_infer.onnx`, | |
| charactersDictionary: `${DICT_BASE_URL}/recognition/ppocrv5_dict.txt`, | |
| }, | |
| }); | |
| await ocr.initialize(); | |
| // Download OCR model and warm up cache | |
| console.log("Warming up OCR model..."); | |
| { | |
| const testBuffer = await Bun.file("./assets/opposite-expectation-scan.pdf").arrayBuffer(); | |
| const testDoc = pdfReader.open(testBuffer); | |
| const testCanvas = await pdfReader.renderAll(testDoc); | |
| await pdfReader.getTextsScanned(ocr, testCanvas); | |
| pdfReader.destroy(testDoc); | |
| } | |
| console.log("Warmup complete.\n"); | |
| console.time("Normal inference") | |
| { | |
| // 1. Reading the file from disk | |
| const fileScan = Bun.file("./assets/test_japanese.pdf"); | |
| const bufferScan = await fileScan.arrayBuffer(); | |
| // 2. Open and Render | |
| const pdfScan = pdfReader.open(bufferScan); | |
| const canvasMap = await pdfReader.renderAll(pdfScan); | |
| pdfReader.destroy(pdfScan); | |
| // 3. Extract OCR Texts | |
| const texts = await pdfReader.getTextsScanned(ocr, canvasMap); | |
| // 4. Rebuild Searchable PDF | |
| const pdfForRebuild = pdfReader.open(bufferScan); | |
| const rebuiltPdfBuffer = await pdfReader.rebuild(pdfForRebuild, texts); | |
| pdfReader.destroy(pdfForRebuild); | |
| // 5. Save onto disk | |
| await Bun.write("./test_japanese_searchable.pdf", rebuiltPdfBuffer); | |
| } | |
| console.timeEnd("Normal inference") | |
| // import { bench, group, run } from "mitata"; | |
| // console.log("\nStarting benchmarking") | |
| // group("ppu-pdf e2e processing", () => { | |
| // bench("Extract Texts and Rebuild PDF", async () => { | |
| // const fileScan = Bun.file("./assets/test_japanese.pdf"); | |
| // const bufferScan = await fileScan.arrayBuffer(); | |
| // const pdfScan = pdfReader.open(bufferScan); | |
| // const canvasMap = await pdfReader.renderAll(pdfScan); | |
| // pdfReader.destroy(pdfScan); | |
| // const texts = await pdfReader.getTextsScanned(ocr, canvasMap); | |
| // const pdfForRebuild = pdfReader.open(bufferScan); | |
| // const rebuiltPdfBuffer = await pdfReader.rebuild(pdfForRebuild, texts); | |
| // pdfReader.destroy(pdfForRebuild); | |
| // await Bun.write("./test_japanese_searchable.pdf", rebuiltPdfBuffer); | |
| // }); | |
| // }); | |
| // await run({ | |
| // colors: true, | |
| // }); | |
| await ocr.destroy(); | |
| // BENCHMARK RESULT | |
| // benchmark avg (min … max) p75 / p99 (min … top 1%) | |
| // -------------------------------------------- ------------------------------- | |
| // • ppu-pdf e2e processing | |
| // -------------------------------------------- ------------------------------- | |
| // japan_PP-OCRv3_mobile_rec_infer.onnx + japan_dict.txt | |
| // Extract Texts and Rebuild PDF 798.30 ms/iter 799.05 ms █ █ | |
| // (783.87 ms … 850.33 ms) 817.52 ms █ █ | |
| // (224.00 kb … 18.47 mb) 9.74 mb █▁█▁▁▁█▁██▁▁▁▁▁█▁▁▁▁█ | |
| // PP-OCRv5_mobile_rec_infer.onnx + ppocrv5_dict.txt | |
| // Extract Texts and Rebuild PDF 802.18 ms/iter 803.59 ms █ █ █ | |
| // (792.74 ms … 825.94 ms) 817.62 ms █ █▅ █▅ ▅ ▅ ▅ | |
| // ( 16.00 kb … 15.58 mb) 7.87 mb █▁██▁██▁▁█▁▁▁█▁▁▁▁▁▁█ | |
| // PP-OCRv5_server_rec_infer.onnx + ppocrv5_dict.txt | |
| // Extract Texts and Rebuild PDF 802.84 ms/iter 804.37 ms █ | |
| // (797.71 ms … 819.77 ms) 808.87 ms ▅█▅▅▅▅▅ ▅ ▅ ▅ | |
| // (384.00 kb … 33.72 mb) 11.90 mb ███████▁▁▁▁▁█▁▁▁█▁▁▁█ |
Author
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Hypothetical 180-page run time in the Bun.js runtime