mshivam019/ocr.js

## ocr.js
import { promises as fs } from "node:fs";
import { pdf } from "pdf-to-img";
import path from "path";
import { fileURLToPath } from "url";
import sharp from "sharp";
import Ocr from "@gutenye/ocr-node";

// ES modules provide no __filename/__dirname globals, so rebuild both from this module's URL.
const __filename = fileURLToPath(import.meta.url);
const __dirname = path.dirname(__filename);

// PDFs are read from "input" and every result is written under "output";
// both folders are resolved relative to this script's own directory.
const [inputDir, outputBaseDir] = ["input", "output"].map((folder) =>
  path.join(__dirname, folder),
);

/**
 * Inserts a space wherever an ASCII lowercase letter is immediately followed by
 * an ASCII uppercase letter, e.g. "helloWorld" -> "hello World".
 *
 * An earlier version wrapped the replacement in an "acronym" guard that tested
 * a one-character string against /^[A-Z]{2,}$/. That test could never succeed,
 * so every boundary was always split. The dead guard is removed here and the
 * output is unchanged. Runs made only of capitals (e.g. "NASA") contain no
 * lowercase-to-uppercase boundary and are therefore left intact.
 *
 * @param {string} text - Text to process.
 * @returns {string} The text with a space at each lowercase-to-uppercase boundary.
 */
function insertSpaceBetweenWords(text) {
  return text.replace(/([a-z])([A-Z])/g, "$1 $2");
}

/**
 * Renders every page of a PDF to a PNG file.
 *
 * Files are named page1.png, page2.png, ... in the order the pages are
 * yielded. This function does not create `outputDir` itself.
 *
 * @param {string} pdfPath - Path of the PDF to render.
 * @param {string} outputDir - Directory that receives the page images.
 * @param {number} [scale=3] - Value passed to `pdf()` as its `scale` option.
 * @returns {Promise<string[]>} Paths of the written images, in page order.
 */
async function convertPdfToImages(pdfPath, outputDir, scale = 3) {
  const pages = await pdf(pdfPath, { scale });
  const writtenPaths = [];

  for await (const pageBuffer of pages) {
    // Page numbers are 1-based: the next number is one past the count written so far.
    const target = path.join(outputDir, `page${writtenPaths.length + 1}.png`);
    await fs.writeFile(target, pageBuffer);
    writtenPaths.push(target);
  }

  return writtenPaths;
}

/**
 * Runs each image through a sharp pipeline (grayscale, normalize, resize to a
 * width of 2000) and saves the result under the same file name in
 * `preprocessedDir`, which is created first if it does not exist.
 *
 * Images are handled one at a time, in the order given.
 *
 * @param {string[]} imagePaths - Paths of the images to preprocess.
 * @param {string} preprocessedDir - Directory that receives the processed copies.
 * @returns {Promise<string[]>} Paths of the processed images, in input order.
 */
async function preprocessImages(imagePaths, preprocessedDir) {
  await fs.mkdir(preprocessedDir, { recursive: true });

  const outputs = [];
  for (const source of imagePaths) {
    const destination = path.join(preprocessedDir, path.basename(source));
    const pipeline = sharp(source)
      .grayscale()
      .normalize()
      .resize({ width: 2000 });

    await pipeline.toFile(destination);
    outputs.push(destination);
  }

  return outputs;
}

// Heading heuristics, hoisted so the patterns are built once instead of once per line.
const UPPERCASE_HEADING = /^[A-Z0-9\s:&\-]{4,}$/; // e.g. "WORK EXPERIENCE"
const TITLE_CASE_HEADING = /^[A-Z][a-z]+(\s[A-Z][a-z]+)*$/; // e.g. "Education Details"

/**
 * Groups OCR detections into visual lines by their `y` value, then orders each
 * line left to right by `x`.
 *
 * NOTE(review): this relies on every detection exposing numeric `x`, `y` and
 * `height` fields. Confirm that against what `ocr.detect()` actually returns.
 *
 * @param {Array<object>} detections - Detections that carry non-blank text.
 * @returns {Array<{ y: number, chars: object[] }>} Lines in top-to-bottom order.
 */
function groupDetectionsIntoLines(detections) {
  // Without this guard the average below is 0 / 0 = NaN for a page with no text.
  if (detections.length === 0) {
    return [];
  }

  // A missing (or zero) height counts as 10 when averaging.
  const totalHeight = detections.reduce((sum, d) => sum + (d.height || 10), 0);
  const lineThreshold = (totalHeight / detections.length) * 0.6;

  const lines = [];
  for (const detection of [...detections].sort((a, b) => a.y - b.y)) {
    // A detection joins the first line whose anchor `y` is within the threshold;
    // the anchor is the `y` of the detection that opened the line.
    let line = lines.find((candidate) => Math.abs(candidate.y - detection.y) < lineThreshold);
    if (!line) {
      line = { y: detection.y, chars: [] };
      lines.push(line);
    }
    line.chars.push(detection);
  }

  for (const line of lines) {
    line.chars.sort((a, b) => a.x - b.x);
  }
  return lines;
}

/**
 * Decides whether a line of text looks like a heading: an upper-case/digit run
 * of at least four characters, a line ending in a colon, or Title Case words.
 *
 * @param {string} lineText - Trimmed text of one line.
 * @returns {boolean} True when the line matches one of the heading heuristics.
 */
function isHeadingLine(lineText) {
  return (
    UPPERCASE_HEADING.test(lineText) ||
    lineText.endsWith(":") ||
    TITLE_CASE_HEADING.test(lineText)
  );
}

/**
 * Joins line texts with newlines, inserting one blank line before every
 * heading and before the line that directly follows a heading.
 *
 * @param {string[]} lineTexts - Line texts in reading order.
 * @returns {string} The page text.
 */
function layoutPageText(lineTexts) {
  const output = [];
  let lastWasHeading = false;

  for (const lineText of lineTexts) {
    const heading = isHeadingLine(lineText);
    if (heading || lastWasHeading) {
      output.push(""); // becomes an empty line once joined with "\n"
    }
    output.push(lineText);
    lastWasHeading = heading;
  }

  return output.join("\n");
}

/**
 * Runs OCR on each image and returns the recognised text of all pages.
 *
 * For every image, the detections with non-blank string text are grouped into
 * lines. Each line's pieces are concatenated without a separator and passed
 * through `insertSpaceBetweenWords`, and the lines are then laid out with blank
 * lines around headings. Pages are separated by a blank line.
 *
 * @param {string[]} imagePaths - Paths of the images to recognise, in page order.
 * @returns {Promise<string>} The text of all pages joined with "\n\n".
 */
async function runOcrOnImages(imagePaths) {
  const ocr = await Ocr.create();
  const pageTexts = [];

  for (const imagePath of imagePaths) {
    console.log(`Processing ${path.basename(imagePath)}...`);

    const raw = await ocr.detect(imagePath);
    const detections = raw.filter(
      (entry) => typeof entry.text === "string" && entry.text.trim().length > 0,
    );

    const lineTexts = groupDetectionsIntoLines(detections).map((line) =>
      insertSpaceBetweenWords(line.chars.map((c) => c.text).join("")).trim(),
    );

    pageTexts.push(layoutPageText(lineTexts));
  }

  return pageTexts.join("\n\n");
}

/**
 * Runs the whole pipeline for one PDF: render its pages to images, preprocess
 * them, OCR the result and save the recognised text.
 *
 * Page images go to `<outputBaseDir>/<pdfName>/`, preprocessed copies to its
 * `preprocessed` subfolder, and the text to `<outputBaseDir>/<pdfName>.txt`,
 * where `pdfName` is the PDF's file name without its extension.
 *
 * @param {string} pdfFilePath - Path of the PDF to process.
 * @returns {Promise<void>} Resolves once the text file has been written.
 */
async function processPdf(pdfFilePath) {
  const extension = path.extname(pdfFilePath);
  const pdfName = path.basename(pdfFilePath, extension);
  const pageImageDir = path.join(outputBaseDir, pdfName);

  await fs.mkdir(pageImageDir, { recursive: true });

  console.log(`\n📄 Processing PDF: ${pdfName}`);

  const pageImages = await convertPdfToImages(pdfFilePath, pageImageDir);
  const cleanedImages = await preprocessImages(
    pageImages,
    path.join(pageImageDir, "preprocessed"),
  );
  const recognizedText = await runOcrOnImages(cleanedImages);

  const textFile = path.join(outputBaseDir, `${pdfName}.txt`);
  await fs.writeFile(textFile, recognizedText, "utf8");

  console.log(`✅ Saved OCR result to ${textFile}`);
}

/**
 * Processes every PDF found directly inside `inputDir`, one after another.
 *
 * Files are matched by a case-insensitive ".pdf" suffix. When none are found
 * a message is logged and nothing else happens.
 *
 * @returns {Promise<void>} Resolves once every PDF has been processed.
 */
async function main() {
  const isPdf = (name) => name.toLowerCase().endsWith(".pdf");
  const pdfNames = (await fs.readdir(inputDir)).filter(isPdf);

  if (pdfNames.length < 1) {
    console.log("❌ No PDF files found in the input folder.");
    return;
  }

  for (const name of pdfNames) {
    await processPdf(path.join(inputDir, name));
  }
}

// Script entry point: run the pipeline and print any rejection to stderr.
main().catch((error) => {
  console.error(error);
});
	import { promises as fs } from "node:fs";
	import { pdf } from "pdf-to-img";
	import path from "path";
	import { fileURLToPath } from "url";
	import sharp from "sharp";
	import Ocr from "@gutenye/ocr-node";

	// Handle ES module paths
	const __filename = fileURLToPath(import.meta.url);
	const __dirname = path.dirname(__filename);

	// Directories
	const inputDir = path.join(__dirname, "input");
	const outputBaseDir = path.join(__dirname, "output");

	function insertSpaceBetweenWords(text) {
	return text.replace(/([a-z])([A-Z])/g, (match, lower, upper) => {
	// Don't split if part of acronym (e.g., XLRI, NASA)
	const isAcronym = /^[A-Z]{2,}$/.test(match.slice(1)); // e.g., LRI from XLRI
	return isAcronym ? match : `${lower} ${upper}`;
	});
	}

	async function convertPdfToImages(pdfPath, outputDir, scale = 3) {
	const images = [];
	const document = await pdf(pdfPath, { scale });

	let page = 1;
	for await (const imageBuffer of document) {
	const imagePath = path.join(outputDir, `page${page}.png`);
	await fs.writeFile(imagePath, imageBuffer);
	images.push(imagePath);
	page++;
	}

	return images;
	}

	async function preprocessImages(imagePaths, preprocessedDir) {
	await fs.mkdir(preprocessedDir, { recursive: true });

	const processedPaths = [];

	for (const imagePath of imagePaths) {
	const filename = path.basename(imagePath);
	const outPath = path.join(preprocessedDir, filename);

	await sharp(imagePath)
	.grayscale()
	.normalize()
	.resize({ width: 2000 })
	.toFile(outPath);

	processedPaths.push(outPath);
	}

	return processedPaths;
	}

	async function runOcrOnImages(imagePaths) {
	const ocr = await Ocr.create();
	const results = [];

	for (const imagePath of imagePaths) {
	console.log(`Processing ${path.basename(imagePath)}...`);

	const raw = await ocr.detect(imagePath);

	const characters = raw.filter(e => typeof e.text === "string" && e.text.trim().length);
	const avgHeight = characters.reduce((sum, c) => sum + (c.height || 10), 0) / characters.length;
	const lineThreshold = avgHeight * 0.6;

	characters.sort((a, b) => a.y - b.y);

	const lines = [];
	for (const char of characters) {
	let line = lines.find(line => Math.abs(line.y - char.y) < lineThreshold);
	if (!line) {
	line = { y: char.y, chars: [] };
	lines.push(line);
	}
	line.chars.push(char);
	}

	lines.forEach(line => line.chars.sort((a, b) => a.x - b.x));
	const processedLines = [];
	let lastWasHeading = false;

	for (const line of lines) {
	const lineText = insertSpaceBetweenWords(line.chars.map(c => c.text).join("")).trim();

	// Define what qualifies as a "heading"
	const isHeading =
	/^[A-Z0-9\s:&\-]{4,}$/.test(lineText) || // All caps or uppercase-heavy
	lineText.endsWith(":") || // Ends in colon (e.g., Project Title:)
	/^[A-Z][a-z]+(\s[A-Z][a-z]+)*$/.test(lineText); // Capitalized like "Education Details"

	// Add an extra newline if this line is a heading or follows one
	if (isHeading || lastWasHeading) {
	processedLines.push(""); // this becomes a `\n\n` when joined
	}

	processedLines.push(lineText);
	lastWasHeading = isHeading;
	}

	const text = processedLines.join("\n");


	results.push(text);
	}

	return results.join("\n\n");
	}

	async function processPdf(pdfFilePath) {
	const pdfName = path.basename(pdfFilePath, path.extname(pdfFilePath));
	const outputDir = path.join(outputBaseDir, pdfName);
	const preprocessedDir = path.join(outputDir, "preprocessed");

	await fs.mkdir(outputDir, { recursive: true });

	console.log(`\n📄 Processing PDF: ${pdfName}`);

	const rawImages = await convertPdfToImages(pdfFilePath, outputDir);
	const processedImages = await preprocessImages(rawImages, preprocessedDir);
	const fullText = await runOcrOnImages(processedImages);

	const outputTxtPath = path.join(outputBaseDir, `${pdfName}.txt`);
	await fs.writeFile(outputTxtPath, fullText, "utf8");

	console.log(`✅ Saved OCR result to ${outputTxtPath}`);
	}

	async function main() {
	const files = await fs.readdir(inputDir);
	const pdfFiles = files.filter(f => f.toLowerCase().endsWith(".pdf"));

	if (pdfFiles.length === 0) {
	console.log("❌ No PDF files found in the input folder.");
	return;
	}

	for (const file of pdfFiles) {
	const fullPath = path.join(inputDir, file);
	await processPdf(fullPath);
	}
	}

	main().catch(console.error);
No results found