Skip to content

Instantly share code, notes, and snippets.

@mshivam019
Created February 16, 2026 14:50
Show Gist options
  • Select an option

  • Save mshivam019/4260f6fcfbe860ace50bd8527751db94 to your computer and use it in GitHub Desktop.

Select an option

Save mshivam019/4260f6fcfbe860ace50bd8527751db94 to your computer and use it in GitHub Desktop.
A script that performs OCR on PDF files on a Node.js server
import { promises as fs } from "node:fs";
import { pdf } from "pdf-to-img";
import path from "path";
import { fileURLToPath } from "url";
import sharp from "sharp";
import Ocr from "@gutenye/ocr-node";
// ES modules do not provide __filename/__dirname, so derive them from the module URL.
const __filename = fileURLToPath(import.meta.url);
const __dirname = path.dirname(__filename);

// Working folders, both located next to this script:
// `inputDir` is scanned for PDFs, `outputBaseDir` receives images and text.
const [inputDir, outputBaseDir] = ["input", "output"].map((folder) =>
  path.join(__dirname, folder),
);
/**
 * Insert a space at every lowercase-to-uppercase boundary, e.g.
 * "EducationDetails" -> "Education Details".
 *
 * Runs of capitals such as "XLRI" or "NASA" are left intact because no
 * lowercase letter precedes any of their characters; a word glued in front
 * of one is still separated ("atXLRI" -> "at XLRI").
 *
 * @param {string} text - Text whose words may have been concatenated.
 * @returns {string} The text with a space inserted at each boundary.
 */
function insertSpaceBetweenWords(text) {
  // The earlier per-match "acronym" test inspected a single character against
  // a two-or-more pattern and could never succeed, so it has been dropped;
  // the result is the same for every input.
  return text.replace(/([a-z])([A-Z])/g, "$1 $2");
}
/**
 * Render a PDF to image files, one per item yielded by the `pdf()` document.
 *
 * Files are written as `page1.png`, `page2.png`, ... inside `outputDir`,
 * which is created first if it does not exist yet.
 *
 * @param {string} pdfPath - Path of the PDF to render.
 * @param {string} outputDir - Directory that receives the page images.
 * @param {number} [scale=3] - Forwarded to `pdf()` as its `scale` option.
 * @returns {Promise<string[]>} Paths of the written images, in page order.
 */
async function convertPdfToImages(pdfPath, outputDir, scale = 3) {
  // Mirror preprocessImages: do not rely on the caller having created the folder.
  await fs.mkdir(outputDir, { recursive: true });

  const document = await pdf(pdfPath, { scale });
  const images = [];
  let page = 0;
  for await (const imageBuffer of document) {
    page += 1;
    const imagePath = path.join(outputDir, `page${page}.png`);
    await fs.writeFile(imagePath, imageBuffer);
    images.push(imagePath);
  }
  return images;
}
/**
 * Run every image through the sharp pipeline used before OCR
 * (grayscale, normalize, resize to a width of 2000) and store the result
 * under the same file name inside `preprocessedDir`.
 *
 * Images are handled one after another, in the order given.
 *
 * @param {Iterable<string>} imagePaths - Source image paths.
 * @param {string} preprocessedDir - Target directory; created if missing.
 * @returns {Promise<string[]>} Paths of the processed images, in input order.
 */
async function preprocessImages(imagePaths, preprocessedDir) {
  await fs.mkdir(preprocessedDir, { recursive: true });

  const written = [];
  for (const source of imagePaths) {
    const target = path.join(preprocessedDir, path.basename(source));
    const pipeline = sharp(source).grayscale().normalize().resize({ width: 2000 });
    await pipeline.toFile(target);
    written.push(target);
  }
  return written;
}
// Heading heuristics, compiled once instead of once per line.
// All caps or uppercase-heavy, at least four characters.
const UPPERCASE_HEADING = /^[A-Z0-9\s:&\-]{4,}$/;
// Capitalized words only, like "Education Details".
const TITLE_CASE_HEADING = /^[A-Z][a-z]+(\s[A-Z][a-z]+)*$/;

/**
 * Reduce one OCR detection to the `{ text, x, y, height }` shape used for layout.
 *
 * Top-level `x` / `y` / `height` are used whenever they are present.
 * NOTE(review): the result shape of `ocr.detect` is not visible in this file.
 * The fallbacks to `frame` ({ top, left, height }) and `box` (corner points
 * as [x, y] pairs) are defensive guesses at what the OCR library may return;
 * confirm against the installed @gutenye/ocr-node version.
 */
function locateDetection(entry) {
  const { frame, box } = entry;
  let boxLeft;
  let boxTop;
  let boxHeight;
  if (Array.isArray(box) && box.length > 0 && box.every(Array.isArray)) {
    const xs = box.map(([px]) => px);
    const ys = box.map(([, py]) => py);
    boxLeft = Math.min(...xs);
    boxTop = Math.min(...ys);
    boxHeight = Math.max(...ys) - boxTop;
  }
  return {
    text: entry.text,
    x: entry.x ?? frame?.left ?? boxLeft,
    y: entry.y ?? frame?.top ?? boxTop,
    height: entry.height ?? frame?.height ?? boxHeight,
  };
}

/**
 * Cluster detections into visual lines (top to bottom), each ordered left to right.
 *
 * A detection joins the first existing line whose anchor `y` lies within 60%
 * of the average detection height; otherwise it starts a new line. Detections
 * without a usable position compare as "equal" when sorting and never match an
 * existing line, so each ends up on a line of its own.
 */
function groupDetectionsIntoLines(detections) {
  // Nothing to average on an empty page; avoids a 0 / 0 threshold.
  if (detections.length === 0) {
    return [];
  }
  // A missing or zero height counts as 10.
  const totalHeight = detections.reduce((sum, d) => sum + (d.height || 10), 0);
  const lineThreshold = (totalHeight / detections.length) * 0.6;

  // `|| 0` turns a NaN difference (missing coordinate) into "keep current order".
  const topToBottom = [...detections].sort((a, b) => (a.y - b.y) || 0);
  const lines = [];
  for (const detection of topToBottom) {
    let line = lines.find((candidate) => Math.abs(candidate.y - detection.y) < lineThreshold);
    if (!line) {
      line = { y: detection.y, chars: [] };
      lines.push(line);
    }
    line.chars.push(detection);
  }
  for (const line of lines) {
    line.chars.sort((a, b) => (a.x - b.x) || 0);
  }
  return lines;
}

/**
 * Turn grouped lines into the text of one page.
 *
 * The pieces of a line are concatenated without separators, passed through
 * `insertSpaceBetweenWords`, and trimmed. A blank line is emitted before every
 * heading and before the line that follows a heading, so a page that starts
 * with a heading starts with an empty line.
 */
function formatPageLines(lines) {
  const output = [];
  let lastWasHeading = false;
  for (const line of lines) {
    const lineText = insertSpaceBetweenWords(line.chars.map((c) => c.text).join("")).trim();
    const isHeading =
      UPPERCASE_HEADING.test(lineText) ||
      lineText.endsWith(":") || // e.g. "Project Title:"
      TITLE_CASE_HEADING.test(lineText);
    if (isHeading || lastWasHeading) {
      output.push(""); // becomes a blank line once joined
    }
    output.push(lineText);
    lastWasHeading = isHeading;
  }
  return output.join("\n");
}

/**
 * OCR every image and return the combined text.
 *
 * One OCR instance is created and reused for all images. Each image is
 * detected, its non-blank text detections are laid out into lines, and the
 * per-image texts are joined with a blank line between them. An image with no
 * detections (or a null/undefined result) contributes an empty string.
 *
 * @param {Iterable<string>} imagePaths - Images to process, in output order.
 * @returns {Promise<string>} Text of all images, separated by "\n\n".
 */
async function runOcrOnImages(imagePaths) {
  const ocr = await Ocr.create();
  const results = [];
  for (const imagePath of imagePaths) {
    console.log(`Processing ${path.basename(imagePath)}...`);
    const raw = await ocr.detect(imagePath);
    const detections = (raw ?? [])
      .filter((e) => typeof e?.text === "string" && e.text.trim().length > 0)
      .map(locateDetection);
    results.push(formatPageLines(groupDetectionsIntoLines(detections)));
  }
  return results.join("\n\n");
}
/**
 * Run the full pipeline for one PDF: render pages to images, preprocess them,
 * OCR them, and save the recognized text.
 *
 * Images go to `<outputBaseDir>/<pdf name>/` (preprocessed copies in its
 * `preprocessed` subfolder); the text goes to `<outputBaseDir>/<pdf name>.txt`.
 *
 * @param {string} pdfFilePath - Path of the PDF to process.
 * @returns {Promise<void>}
 */
async function processPdf(pdfFilePath) {
  const extension = path.extname(pdfFilePath);
  const pdfName = path.basename(pdfFilePath, extension);
  const workDir = path.join(outputBaseDir, pdfName);
  const textTarget = path.join(outputBaseDir, `${pdfName}.txt`);

  await fs.mkdir(workDir, { recursive: true });
  console.log(`\n📄 Processing PDF: ${pdfName}`);

  const pageImages = await convertPdfToImages(pdfFilePath, workDir);
  const cleanedImages = await preprocessImages(pageImages, path.join(workDir, "preprocessed"));
  const recognizedText = await runOcrOnImages(cleanedImages);

  await fs.writeFile(textTarget, recognizedText, "utf8");
  console.log(`✅ Saved OCR result to ${textTarget}`);
}
/**
 * Process every PDF found directly inside `inputDir`, one at a time.
 *
 * A file counts as a PDF when its name ends in ".pdf", ignoring case.
 * When none is found a notice is printed and nothing else happens.
 *
 * @returns {Promise<void>}
 */
async function main() {
  const isPdfName = (name) => name.toLowerCase().endsWith(".pdf");
  const entries = await fs.readdir(inputDir);
  const pdfFiles = entries.filter(isPdfName);

  if (pdfFiles.length > 0) {
    for (const name of pdfFiles) {
      await processPdf(path.join(inputDir, name));
    }
  } else {
    console.log("❌ No PDF files found in the input folder.");
  }
}
// Entry point: report any failure and reflect it in the exit code, so shells
// and CI can tell that the run did not succeed (logging alone exits with 0).
main().catch((error) => {
  console.error(error);
  process.exitCode = 1;
});
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment