snowfluke/index.ts

## index.ts
// curl -fsSL https://bun.sh/install | bash
// bun add ppu-pdf ppu-paddle-ocr onnxruntime-node
// Run it: bun run index.ts

import { PaddleOcrService } from "ppu-paddle-ocr";
import { PdfReader } from "ppu-pdf";

/** Root URL that the `.onnx` detection and recognition model paths in this script are built on. */
const MODEL_BASE_URL =
  "https://media.githubusercontent.com/media/PT-Perkasa-Pilar-Utama/ppu-paddle-ocr-models/main";

/** Root URL that the character dictionary path in this script is built on. */
const DICT_BASE_URL =
  "https://raw.githubusercontent.com/PT-Perkasa-Pilar-Utama/ppu-paddle-ocr-models/main";

export { MODEL_BASE_URL, DICT_BASE_URL };

// Single PDF reader shared by the warm-up and the timed run; verbose output is off.
const pdfReader = new PdfReader({ verbose: false });

// Model selection. Swap the variant and the dictionary to trade accuracy
// against speed. The dictionary has to belong to the chosen recognition
// model, otherwise the OCR results will be inaccurate.
const detectionModelUrl = `${MODEL_BASE_URL}/detection/PP-OCRv5_mobile_det_infer.onnx`;
const recognitionModelUrl = `${MODEL_BASE_URL}/recognition/PP-OCRv5_mobile_rec_infer.onnx`;
const charactersDictionaryUrl = `${DICT_BASE_URL}/recognition/ppocrv5_dict.txt`;

const ocr = new PaddleOcrService({
  model: {
    detection: detectionModelUrl,
    recognition: recognitionModelUrl,
    charactersDictionary: charactersDictionaryUrl,
  },
});

// Wait for the service to finish initializing before it is used below.
await ocr.initialize();

// Download OCR model and warm up cache.
// A sample scan is pushed through the same render + OCR calls as the timed
// run below, so that one-off setup work is paid here instead of inside the
// measured section.
console.log("Warming up OCR model...");
{
  const testBuffer = await Bun.file("./assets/opposite-expectation-scan.pdf").arrayBuffer();
  const testDoc = pdfReader.open(testBuffer);
  try {
    const testCanvas = await pdfReader.renderAll(testDoc);

    // The recognized text is discarded on purpose; only having run the
    // pipeline once matters here.
    await pdfReader.getTextsScanned(ocr, testCanvas);
  } finally {
    // Destroy the document even when rendering or OCR rejects, so a failed
    // warm-up does not leave it open.
    pdfReader.destroy(testDoc);
  }
}
console.log("Warmup complete.\n");

// Timed end-to-end run: scanned PDF in, OCR, searchable PDF written to disk.
console.time("Normal inference");
{
  // Opens `buffer` as a PDF document, passes it to `use`, and destroys the
  // document afterwards even when `use` rejects, so a failing step cannot
  // leave a document open.
  const withDocument = async <T>(
    buffer: ArrayBuffer,
    use: (doc: ReturnType<typeof pdfReader.open>) => Promise<T>,
  ): Promise<T> => {
    const doc = pdfReader.open(buffer);
    try {
      return await use(doc);
    } finally {
      pdfReader.destroy(doc);
    }
  };

  // 1. Reading the file from disk
  const fileScan = Bun.file("./assets/test_japanese.pdf");
  const bufferScan = await fileScan.arrayBuffer();

  // 2. Open and Render (the document is destroyed before OCR starts)
  const canvasMap = await withDocument(bufferScan, (doc) => pdfReader.renderAll(doc));

  // 3. Extract OCR Texts
  const texts = await pdfReader.getTextsScanned(ocr, canvasMap);

  // 4. Rebuild Searchable PDF from a second open of the same buffer
  const rebuiltPdfBuffer = await withDocument(bufferScan, (doc) => pdfReader.rebuild(doc, texts));

  // 5. Save onto disk
  await Bun.write("./test_japanese_searchable.pdf", rebuiltPdfBuffer);
}
console.timeEnd("Normal inference");

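// Optional benchmark, disabled by default. To run it, `bun add mitata` and
// uncomment everything from the import below through the `run(...)` call.
// import { bench, group, run } from "mitata";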
// console.log("\nStarting benchmarking")
// group("ppu-pdf e2e processing", () => {
//   bench("Extract Texts and Rebuild PDF", async () => {
//     const fileScan = Bun.file("./assets/test_japanese.pdf");
//     const bufferScan = await fileScan.arrayBuffer();

//     const pdfScan = pdfReader.open(bufferScan);
//     const canvasMap = await pdfReader.renderAll(pdfScan);
//     pdfReader.destroy(pdfScan);

//     const texts = await pdfReader.getTextsScanned(ocr, canvasMap);
//     const pdfForRebuild = pdfReader.open(bufferScan);
//     const rebuiltPdfBuffer = await pdfReader.rebuild(pdfForRebuild, texts);
//     pdfReader.destroy(pdfForRebuild);

//     await Bun.write("./test_japanese_searchable.pdf", rebuiltPdfBuffer);
//   });
// });

// await run({
//   colors: true,
// });

// Final step of the script: destroy the OCR service created above.
// NOTE(review): presumably this releases the loaded models — confirm against
// ppu-paddle-ocr. This line is not reached if an earlier top-level await rejects.
await ocr.destroy();

// BENCHMARK RESULT

// benchmark                    avg (min … max) p75 / p99    (min … top 1%)
// -------------------------------------------- -------------------------------
// • ppu-pdf e2e processing
// -------------------------------------------- -------------------------------

// japan_PP-OCRv3_mobile_rec_infer.onnx + japan_dict.txt
// Extract Texts and Rebuild PDF 798.30 ms/iter 799.05 ms █ █
//                      (783.87 ms … 850.33 ms) 817.52 ms █ █
//                      (224.00 kb …  18.47 mb)   9.74 mb █▁█▁▁▁█▁██▁▁▁▁▁█▁▁▁▁█


// PP-OCRv5_mobile_rec_infer.onnx + ppocrv5_dict.txt
// Extract Texts and Rebuild PDF 802.18 ms/iter 803.59 ms █ █  █
//                      (792.74 ms … 825.94 ms) 817.62 ms █ █▅ █▅  ▅   ▅      ▅
//                      ( 16.00 kb …  15.58 mb)   7.87 mb █▁██▁██▁▁█▁▁▁█▁▁▁▁▁▁█


// PP-OCRv5_server_rec_infer.onnx + ppocrv5_dict.txt
// Extract Texts and Rebuild PDF 802.84 ms/iter 804.37 ms  █
//                      (797.71 ms … 819.77 ms) 808.87 ms ▅█▅▅▅▅▅     ▅   ▅   ▅
//                      (384.00 kb …  33.72 mb)  11.90 mb ███████▁▁▁▁▁█▁▁▁█▁▁▁█
	// curl -fsSL https://bun.sh/install | bash
	// bun add ppu-pdf ppu-paddle-ocr onnxruntime-node
	// Run it: bun run index.ts

	import { PaddleOcrService } from "ppu-paddle-ocr";
	import { PdfReader } from "ppu-pdf";

	/** Root URL that the `.onnx` detection and recognition model paths in this script are built on. */
	const MODEL_BASE_URL =
	  "https://media.githubusercontent.com/media/PT-Perkasa-Pilar-Utama/ppu-paddle-ocr-models/main";

	/** Root URL that the character dictionary path in this script is built on. */
	const DICT_BASE_URL =
	  "https://raw.githubusercontent.com/PT-Perkasa-Pilar-Utama/ppu-paddle-ocr-models/main";

	export { MODEL_BASE_URL, DICT_BASE_URL };

	// Single PDF reader shared by the warm-up and the timed run; verbose output is off.
	const pdfReader = new PdfReader({ verbose: false });

	// Model selection. Swap the variant and the dictionary to trade accuracy
	// against speed. The dictionary has to belong to the chosen recognition
	// model, otherwise the OCR results will be inaccurate.
	const detectionModelUrl = `${MODEL_BASE_URL}/detection/PP-OCRv5_mobile_det_infer.onnx`;
	const recognitionModelUrl = `${MODEL_BASE_URL}/recognition/PP-OCRv5_mobile_rec_infer.onnx`;
	const charactersDictionaryUrl = `${DICT_BASE_URL}/recognition/ppocrv5_dict.txt`;

	const ocr = new PaddleOcrService({
	  model: {
	    detection: detectionModelUrl,
	    recognition: recognitionModelUrl,
	    charactersDictionary: charactersDictionaryUrl,
	  },
	});

	// Wait for the service to finish initializing before it is used below.
	await ocr.initialize();

	// Download OCR model and warm up cache.
	// A sample scan is pushed through the same render + OCR calls as the timed
	// run below, so that one-off setup work is paid here instead of inside the
	// measured section.
	console.log("Warming up OCR model...");
	{
	  const testBuffer = await Bun.file("./assets/opposite-expectation-scan.pdf").arrayBuffer();
	  const testDoc = pdfReader.open(testBuffer);
	  try {
	    const testCanvas = await pdfReader.renderAll(testDoc);

	    // The recognized text is discarded on purpose; only having run the
	    // pipeline once matters here.
	    await pdfReader.getTextsScanned(ocr, testCanvas);
	  } finally {
	    // Destroy the document even when rendering or OCR rejects, so a failed
	    // warm-up does not leave it open.
	    pdfReader.destroy(testDoc);
	  }
	}
	console.log("Warmup complete.\n");

	// Timed end-to-end run: scanned PDF in, OCR, searchable PDF written to disk.
	console.time("Normal inference");
	{
	  // Opens `buffer` as a PDF document, passes it to `use`, and destroys the
	  // document afterwards even when `use` rejects, so a failing step cannot
	  // leave a document open.
	  const withDocument = async <T>(
	    buffer: ArrayBuffer,
	    use: (doc: ReturnType<typeof pdfReader.open>) => Promise<T>,
	  ): Promise<T> => {
	    const doc = pdfReader.open(buffer);
	    try {
	      return await use(doc);
	    } finally {
	      pdfReader.destroy(doc);
	    }
	  };

	  // 1. Reading the file from disk
	  const fileScan = Bun.file("./assets/test_japanese.pdf");
	  const bufferScan = await fileScan.arrayBuffer();

	  // 2. Open and Render (the document is destroyed before OCR starts)
	  const canvasMap = await withDocument(bufferScan, (doc) => pdfReader.renderAll(doc));

	  // 3. Extract OCR Texts
	  const texts = await pdfReader.getTextsScanned(ocr, canvasMap);

	  // 4. Rebuild Searchable PDF from a second open of the same buffer
	  const rebuiltPdfBuffer = await withDocument(bufferScan, (doc) => pdfReader.rebuild(doc, texts));

	  // 5. Save onto disk
	  await Bun.write("./test_japanese_searchable.pdf", rebuiltPdfBuffer);
	}
	console.timeEnd("Normal inference");

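	// Optional benchmark, disabled by default. To run it, `bun add mitata` and
	// uncomment everything from the import below through the `run(...)` call.
	// import { bench, group, run } from "mitata";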
	// console.log("\nStarting benchmarking")
	// group("ppu-pdf e2e processing", () => {
	// bench("Extract Texts and Rebuild PDF", async () => {
	// const fileScan = Bun.file("./assets/test_japanese.pdf");
	// const bufferScan = await fileScan.arrayBuffer();

	// const pdfScan = pdfReader.open(bufferScan);
	// const canvasMap = await pdfReader.renderAll(pdfScan);
	// pdfReader.destroy(pdfScan);

	// const texts = await pdfReader.getTextsScanned(ocr, canvasMap);
	// const pdfForRebuild = pdfReader.open(bufferScan);
	// const rebuiltPdfBuffer = await pdfReader.rebuild(pdfForRebuild, texts);
	// pdfReader.destroy(pdfForRebuild);

	// await Bun.write("./test_japanese_searchable.pdf", rebuiltPdfBuffer);
	// });
	// });

	// await run({
	// colors: true,
	// });

	// Final step of the script: destroy the OCR service created above.
	// NOTE(review): presumably this releases the loaded models — confirm against
	// ppu-paddle-ocr. This line is not reached if an earlier top-level await rejects.
	await ocr.destroy();

	// BENCHMARK RESULT

	// benchmark avg (min … max) p75 / p99 (min … top 1%)
	// -------------------------------------------- -------------------------------
	// • ppu-pdf e2e processing
	// -------------------------------------------- -------------------------------

	// japan_PP-OCRv3_mobile_rec_infer.onnx + japan_dict.txt
	// Extract Texts and Rebuild PDF 798.30 ms/iter 799.05 ms █ █
	// (783.87 ms … 850.33 ms) 817.52 ms █ █
	// (224.00 kb … 18.47 mb) 9.74 mb █▁█▁▁▁█▁██▁▁▁▁▁█▁▁▁▁█


	// PP-OCRv5_mobile_rec_infer.onnx + ppocrv5_dict.txt
	// Extract Texts and Rebuild PDF 802.18 ms/iter 803.59 ms █ █ █
	// (792.74 ms … 825.94 ms) 817.62 ms █ █▅ █▅ ▅ ▅ ▅
	// ( 16.00 kb … 15.58 mb) 7.87 mb █▁██▁██▁▁█▁▁▁█▁▁▁▁▁▁█


	// PP-OCRv5_server_rec_infer.onnx + ppocrv5_dict.txt
	// Extract Texts and Rebuild PDF 802.84 ms/iter 804.37 ms █
	// (797.71 ms … 819.77 ms) 808.87 ms ▅█▅▅▅▅▅ ▅ ▅ ▅
	// (384.00 kb … 33.72 mb) 11.90 mb ███████▁▁▁▁▁█▁▁▁█▁▁▁█
Model	Avg	p75	p99	Avg Mem	Max Mem
japan_PP-OCRv3_mobile	798.30 ms	799.05 ms	817.52 ms	9.74 mb	18.47 mb
PP-OCRv5_mobile	802.18 ms	803.59 ms	817.62 ms	7.87 mb	15.58 mb
PP-OCRv5_server	802.84 ms	804.37 ms	808.87 ms	11.90 mb	33.72 mb
Model	Est. Duration	Est. Avg Mem	Est. Max Mem
japan_PP-OCRv3_mobile	~14.4 s	~174 mb	~332 mb
PP-OCRv5_mobile	~14.4 s	~142 mb	~280 mb
PP-OCRv5_server	~14.5 s	~214 mb	~607 mb