Skip to content

Instantly share code, notes, and snippets.

@telugu-boy
Created January 14, 2026 01:05
Show Gist options
  • Select an option

  • Save telugu-boy/ed3e47e061fce41746bfd89d3e2e07ff to your computer and use it in GitHub Desktop.

Select an option

Save telugu-boy/ed3e47e061fce41746bfd89d3e2e07ff to your computer and use it in GitHub Desktop.
Quick OCR pipeline for noisy images
# April 12, 2024
from typing import Dict, Optional, Tuple
from pytesseract import Output, pytesseract
import io
import cv2
import numpy as np
import imutils
from imutils.perspective import four_point_transform
import urllib.request
def image_resize(image, width=None, height=None, inter=cv2.INTER_AREA):
    """Resize ``image`` to a target width or height, keeping its aspect ratio.

    Args:
        image: Array-like image exposing ``.shape`` as ``(rows, cols, ...)``.
        width: Desired output width in pixels. When given, it takes
            precedence and ``height`` is ignored.
        height: Desired output height in pixels; used only when ``width``
            is ``None``.
        inter: Interpolation flag forwarded to ``cv2.resize``.

    Returns:
        The input image itself when neither ``width`` nor ``height`` is
        given, otherwise the result of ``cv2.resize``.
    """
    src_h, src_w = image.shape[:2]

    # Nothing requested: hand the caller back the very same object.
    if width is None and height is None:
        return image

    if width is not None:
        # Width drives the scale; derive the matching height.
        scale = width / float(src_w)
        target_size = (width, int(src_h * scale))
    else:
        # Only a height was supplied; derive the matching width.
        scale = height / float(src_h)
        target_size = (int(src_w * scale), height)

    # cv2.resize expects the size as (width, height).
    return cv2.resize(image, target_size, interpolation=inter)
def preprocess_image(image: cv2.typing.MatLike) -> cv2.typing.MatLike:
    """Preprocess the image for OCR.

    The pipeline, in order:

    1. Per channel: estimate the background (dilate, then median blur),
       subtract it from the channel, and min-max normalise to 0..255.
    2. Upscale by 2x with bicubic interpolation.
    3. Convert to grayscale (skipped if the image is already 2-D).
    4. Binarise with a fixed threshold of 150.
    5. Apply CLAHE (clip limit 2.0, 8x8 tiles).
    6. Invert the result.

    Args:
        image: Image array as produced by OpenCV. Multi-channel input is
            converted to grayscale with ``cv2.COLOR_BGR2GRAY``.

    Returns:
        The processed single-channel image.
    """
    # The structuring element is the same for every channel, so build it once.
    dilate_kernel = np.ones((7, 7), np.uint8)

    normalized_planes = []
    for plane in cv2.split(image):
        # Dilating and then heavily blurring the channel gives the image
        # that is used as the background estimate.
        background = cv2.medianBlur(cv2.dilate(plane, dilate_kernel), 21)
        # Invert the absolute difference so pixels matching the background
        # come out at 255.
        foreground = 255 - cv2.absdiff(plane, background)
        normalized_planes.append(
            cv2.normalize(
                foreground,
                None,
                alpha=0,
                beta=255,
                norm_type=cv2.NORM_MINMAX,
                dtype=cv2.CV_8UC1,
            )
        )
    normalized = cv2.merge(normalized_planes)

    # Upscale 2x before binarising.
    rescaled = cv2.resize(normalized, None, fx=2, fy=2, interpolation=cv2.INTER_CUBIC)

    # A 2-D array is already single-channel; BGR2GRAY would reject it.
    if rescaled.ndim == 3:
        gray = cv2.cvtColor(rescaled, cv2.COLOR_BGR2GRAY)
    else:
        gray = rescaled

    # Fixed global threshold: values above 150 become 255, the rest 0.
    _, binary = cv2.threshold(gray, 150, 255, cv2.THRESH_BINARY)

    # No morphological denoising is applied; CLAHE runs directly on the
    # thresholded image.
    clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
    contrast_adjusted = clahe.apply(binary)

    # Final step flips the polarity of the image.
    return cv2.bitwise_not(contrast_adjusted)
def ocr_image(image_stream: io.BytesIO, return_img = None) -> Tuple[Dict[str, list], Optional[cv2.typing.MatLike]]:
    """Run OCR on encoded image bytes and return Tesseract's word-level data.

    Args:
        image_stream: Stream holding an encoded image. It is rewound to
            the start before being read.
        return_img: When truthy, also return the preprocessed image
            resized to a height of 500 pixels.

    Returns:
        A ``(data, preview)`` tuple. ``data`` is the dictionary produced by
        ``pytesseract.image_to_data`` with ``Output.DICT`` (its ``'text'``
        entry is a list of strings). ``preview`` is the resized
        preprocessed image when ``return_img`` is truthy, otherwise
        ``None``.

    Raises:
        ValueError: If the stream is empty or its contents cannot be
            decoded as an image.
    """
    image_stream.seek(0)
    encoded = np.frombuffer(image_stream.read(), dtype=np.uint8)

    # cv2.imdecode signals failure by returning None rather than raising,
    # so check here instead of letting None fail later inside preprocessing.
    img = cv2.imdecode(encoded, cv2.IMREAD_COLOR) if encoded.size else None
    if img is None:
        raise ValueError("image_stream does not contain a decodable image")

    processed_img = preprocess_image(img)
    data = pytesseract.image_to_data(processed_img, output_type=Output.DICT, config='--psm 4')

    preview = image_resize(processed_img, height=500) if return_img else None
    return data, preview
if __name__ == "__main__":
    # Manual demo: download one image, OCR it, print the recognised words
    # and show the preprocessed image in a window.

    # A browser-like User-Agent is sent with the request (presumably the
    # host rejects urllib's default agent; verify if this is changed).
    headers = {"User-Agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.81 Safari/537.36'}
    # NOTE(review): this looks like a signed CDN link (ex/is/hm query
    # parameters) and has probably expired; substitute a current image URL.
    assim_url = "https://cdn.discordapp.com/attachments/1005603724066029610/1228232883349749800/image.png?ex=662b4c02&is=6618d702&hm=19529ffce15930f392286f13a1f2921d253904af4de019c6bbf0c2c9d938de3d&"
    request = urllib.request.Request(assim_url, headers=headers)

    # Bound the request so a stalled connection cannot hang the script.
    with urllib.request.urlopen(request, timeout=30) as f:
        image_data = f.read()

    data, img = ocr_image(io.BytesIO(image_data), True)
    print(' '.join(data['text']))

    cv2.imshow("img", img)
    # Block until a key is pressed, then close the preview window.
    cv2.waitKey(0)
    cv2.destroyAllWindows()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment