Created
February 16, 2026 14:50
-
-
Save mshivam019/4260f6fcfbe860ace50bd8527751db94 to your computer and use it in GitHub Desktop.
This script performs OCR on PDF files on a Node.js server.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import { promises as fs } from "node:fs"; | |
| import { pdf } from "pdf-to-img"; | |
| import path from "path"; | |
| import { fileURLToPath } from "url"; | |
| import sharp from "sharp"; | |
| import Ocr from "@gutenye/ocr-node"; | |
// ES modules do not provide __filename/__dirname, so derive them from
// this module's own URL.
const __filename = fileURLToPath(import.meta.url);
const __dirname = path.dirname(__filename);

// Working directories, both siblings of this script:
//   inputDir      - scanned for PDF files to process
//   outputBaseDir - receives per-PDF image folders and the final .txt files
const [inputDir, outputBaseDir] = ["input", "output"].map((folder) =>
  path.join(__dirname, folder),
);
/**
 * Inserts a space at every lowercase-to-uppercase boundary, restoring the
 * word breaks that are lost when text fragments are concatenated without a
 * separator (e.g. "EducationDetails" -> "Education Details").
 *
 * All-caps runs such as "XLRI" or "NASA" contain no lowercase-to-uppercase
 * boundary, so they are never split and need no special handling. (An earlier
 * acronym guard tested a single character against /^[A-Z]{2,}$/, which can
 * never match, so it was dead code and has been removed.)
 *
 * Only ASCII letters (a-z, A-Z) are considered.
 *
 * @param {string} text - Text to re-space.
 * @returns {string} The text with a space inserted at each boundary.
 */
function insertSpaceBetweenWords(text) {
  return text.replace(/([a-z])([A-Z])/g, "$1 $2");
}
/**
 * Renders each page of a PDF to a PNG file named `page<N>.png` (1-based)
 * inside `outputDir`.
 *
 * @param {string} pdfPath - Path of the PDF to render.
 * @param {string} outputDir - Directory that receives the page images;
 *   created (recursively) if it does not exist yet.
 * @param {number} [scale=3] - Forwarded to pdf-to-img as its `scale` option
 *   (presumably the render scale factor - confirm against the library docs).
 * @returns {Promise<string[]>} Paths of the written images, in page order.
 */
async function convertPdfToImages(pdfPath, outputDir, scale = 3) {
  // Like preprocessImages, do not rely on the caller having created the
  // target directory; writeFile would otherwise fail with ENOENT.
  await fs.mkdir(outputDir, { recursive: true });

  const document = await pdf(pdfPath, { scale });
  const images = [];

  // Pages arrive one at a time from the async iterator and are written
  // sequentially; the page number is derived from how many are stored so far.
  for await (const imageBuffer of document) {
    const imagePath = path.join(outputDir, `page${images.length + 1}.png`);
    await fs.writeFile(imagePath, imageBuffer);
    images.push(imagePath);
  }

  return images;
}
/**
 * Writes an OCR-ready copy of every image into `preprocessedDir`, keeping the
 * original file names. Each image goes through the sharp pipeline
 * grayscale -> normalize -> resize (width 2000). Images are handled one
 * after another, in input order.
 *
 * @param {Iterable<string>} imagePaths - Paths of the source images.
 * @param {string} preprocessedDir - Destination directory (created if missing).
 * @returns {Promise<string[]>} Paths of the preprocessed images, in input order.
 */
async function preprocessImages(imagePaths, preprocessedDir) {
  await fs.mkdir(preprocessedDir, { recursive: true });

  const written = [];
  for (const source of imagePaths) {
    const target = path.join(preprocessedDir, path.basename(source));
    const pipeline = sharp(source).grayscale().normalize().resize({ width: 2000 });
    await pipeline.toFile(target);
    written.push(target);
  }
  return written;
}
// Heading heuristics, hoisted so the patterns are built once instead of once
// per line. Neither uses the `g` flag, so `.test()` carries no state.
const UPPERCASE_HEADING_PATTERN = /^[A-Z0-9\s:&\-]{4,}$/; // e.g. "WORK EXPERIENCE"
const TITLE_CASE_HEADING_PATTERN = /^[A-Z][a-z]+(\s[A-Z][a-z]+)*$/; // e.g. "Education Details"

/**
 * Heuristically decides whether a line of recognised text is a heading:
 * uppercase-heavy, ending in a colon, or made only of capitalised words.
 *
 * @param {string} lineText - Trimmed text of one line.
 * @returns {boolean} True when the line should be set off as a heading.
 */
function isHeadingLine(lineText) {
  return (
    UPPERCASE_HEADING_PATTERN.test(lineText) ||
    lineText.endsWith(":") ||
    TITLE_CASE_HEADING_PATTERN.test(lineText)
  );
}

/**
 * Groups OCR fragments into visual lines by vertical proximity and orders
 * the fragments of each line from left to right.
 *
 * NOTE(review): this reads `x`, `y` and `height` from each fragment. Confirm
 * that the OCR library's detect() result actually exposes these fields; if
 * they are missing, the differences below evaluate to NaN and the grouping
 * and ordering are not meaningful.
 *
 * @param {Array<object>} fragments - OCR results that carry non-blank text.
 * @returns {Array<{y: number, chars: Array<object>}>} Lines, top to bottom.
 */
function groupFragmentsIntoLines(fragments) {
  // Guard the average below against 0 / 0 (NaN) on a page without text.
  if (fragments.length === 0) {
    return [];
  }

  // Two fragments share a line when their y values differ by less than 60%
  // of the average fragment height (a missing/zero height counts as 10).
  const totalHeight = fragments.reduce((sum, f) => sum + (f.height || 10), 0);
  const lineThreshold = (totalHeight / fragments.length) * 0.6;

  const topToBottom = [...fragments].sort((a, b) => a.y - b.y);
  const lines = [];
  for (const fragment of topToBottom) {
    // A line is anchored at the y of the first fragment that opened it.
    let line = lines.find((candidate) => Math.abs(candidate.y - fragment.y) < lineThreshold);
    if (!line) {
      line = { y: fragment.y, chars: [] };
      lines.push(line);
    }
    line.chars.push(fragment);
  }

  for (const line of lines) {
    line.chars.sort((a, b) => a.x - b.x);
  }
  return lines;
}

/**
 * Turns grouped lines into page text, placing an empty line before every
 * heading and before the line that follows a heading.
 *
 * @param {Array<{chars: Array<object>}>} lines - Output of groupFragmentsIntoLines.
 * @returns {string} Page text with lines separated by "\n".
 */
function renderPageText(lines) {
  const output = [];
  let lastWasHeading = false;

  for (const line of lines) {
    // Fragments are concatenated without a separator; insertSpaceBetweenWords
    // then restores breaks at lowercase-to-uppercase boundaries.
    const joined = line.chars.map((c) => c.text).join("");
    const lineText = insertSpaceBetweenWords(joined).trim();
    const isHeading = isHeadingLine(lineText);

    if (isHeading || lastWasHeading) {
      output.push(""); // becomes a blank line once joined with "\n"
    }
    output.push(lineText);
    lastWasHeading = isHeading;
  }

  return output.join("\n");
}

/**
 * Runs OCR over each image in order and returns the combined text.
 * A single OCR instance is created and reused for all images. Pages are
 * separated by a blank line; a page without recognised text contributes
 * an empty string.
 *
 * @param {Iterable<string>} imagePaths - Image files to recognise, in page order.
 * @returns {Promise<string>} Text of all pages joined with "\n\n".
 */
async function runOcrOnImages(imagePaths) {
  const ocr = await Ocr.create();
  const results = [];

  for (const imagePath of imagePaths) {
    console.log(`Processing ${path.basename(imagePath)}...`);
    const raw = await ocr.detect(imagePath);

    // Keep only results whose text is a non-blank string.
    const fragments = raw.filter(
      (e) => typeof e.text === "string" && e.text.trim().length > 0,
    );

    results.push(renderPageText(groupFragmentsIntoLines(fragments)));
  }

  return results.join("\n\n");
}
/**
 * Runs the full pipeline for one PDF: render pages to images, preprocess
 * them, OCR them, and save the text as `<outputBaseDir>/<pdfName>.txt`.
 * Intermediate images are kept under `<outputBaseDir>/<pdfName>/`, with the
 * preprocessed copies in its `preprocessed` subfolder.
 *
 * @param {string} pdfFilePath - Path of the PDF to process.
 * @returns {Promise<void>} Resolves once the text file has been written.
 */
async function processPdf(pdfFilePath) {
  const extension = path.extname(pdfFilePath);
  const pdfName = path.basename(pdfFilePath, extension);
  const workDir = path.join(outputBaseDir, pdfName);

  await fs.mkdir(workDir, { recursive: true });
  console.log(`\n📄 Processing PDF: ${pdfName}`);

  // Each stage consumes the file paths produced by the previous one.
  const pageImages = await convertPdfToImages(pdfFilePath, workDir);
  const cleanedImages = await preprocessImages(
    pageImages,
    path.join(workDir, "preprocessed"),
  );
  const fullText = await runOcrOnImages(cleanedImages);

  // The text file sits next to the per-PDF folder, not inside it.
  const outputTxtPath = path.join(outputBaseDir, `${pdfName}.txt`);
  await fs.writeFile(outputTxtPath, fullText, "utf8");
  console.log(`✅ Saved OCR result to ${outputTxtPath}`);
}
/**
 * Entry point: processes every `.pdf` file (case-insensitive extension)
 * found directly in `inputDir`, one after another. Logs a message and
 * returns when the folder contains no PDFs.
 *
 * @returns {Promise<void>} Rejects if reading the folder or processing a
 *   PDF fails; PDFs after the failing one are not processed.
 */
async function main() {
  const entries = await fs.readdir(inputDir);
  const pdfFiles = entries.filter((name) => name.toLowerCase().endsWith(".pdf"));

  if (pdfFiles.length === 0) {
    console.log("❌ No PDF files found in the input folder.");
    return;
  }

  for (const file of pdfFiles) {
    await processPdf(path.join(inputDir, file));
  }
}

main().catch((err) => {
  console.error(err);
  // Report the failure through the exit status as well, so shells and CI
  // do not treat a crashed run as a success.
  process.exitCode = 1;
});
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment