Created
January 14, 2026 01:05
-
-
Save telugu-boy/ed3e47e061fce41746bfd89d3e2e07ff to your computer and use it in GitHub Desktop.
Quick OCR pipeline for noisy images
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # April 12, 2024 | |
| from typing import Dict, Optional, Tuple | |
| from pytesseract import Output, pytesseract | |
| import io | |
| import cv2 | |
| import numpy as np | |
| import imutils | |
| from imutils.perspective import four_point_transform | |
| import urllib.request | |
def image_resize(image, width=None, height=None, inter=cv2.INTER_AREA):
    """Resize ``image`` to a target width or height, keeping its aspect ratio.

    Args:
        image: Image array whose first two ``shape`` entries are
            (height, width).
        width: Target width in pixels. When given it takes precedence and
            ``height`` is ignored.
        height: Target height in pixels; only used when ``width`` is None.
        inter: Interpolation flag forwarded to ``cv2.resize``.

    Returns:
        ``image`` itself (not a copy) when both ``width`` and ``height`` are
        None, otherwise the resized image.
    """
    (h, w) = image.shape[:2]

    # Nothing requested: hand the original back untouched.
    if width is None and height is None:
        return image

    if width is None:
        # Scale by the height ratio and derive the matching width.
        r = height / float(h)
        # Clamp to one pixel: truncation could otherwise yield a zero-sized
        # dimension for very elongated images, which cv2.resize rejects.
        dim = (max(1, int(w * r)), height)
    else:
        # Scale by the width ratio and derive the matching height.
        r = width / float(w)
        dim = (width, max(1, int(h * r)))

    return cv2.resize(image, dim, interpolation=inter)
def preprocess_image(image: cv2.typing.MatLike) -> cv2.typing.MatLike:
    """Clean up a noisy image and return an inverted, thresholded copy for OCR.

    Pipeline, in order: per-channel background flattening, 2x cubic upscale,
    grayscale conversion, fixed threshold at 150, CLAHE, bitwise inversion.

    Args:
        image: Decoded image. Assumed to be 8-bit (the 21-pixel median blur
            and the ``255 - ...`` arithmetic rely on it) -- TODO confirm
            against other callers.

    Returns:
        A single-channel image, upscaled by a factor of two in each
        direction, in which pixels that differed strongly from their local
        background are bright and everything else is dark.
    """
    # The structuring element is loop-invariant, so build it once.
    kernel = np.ones((7, 7), np.uint8)

    normalized_planes = []
    for plane in cv2.split(image):
        # Dilate + large median blur smears out fine detail, leaving an
        # estimate of the slowly varying background of this channel.
        background = cv2.medianBlur(cv2.dilate(plane, kernel), 21)
        # Pixels equal to the background become 255; deviations get darker.
        flattened = 255 - cv2.absdiff(plane, background)
        # Stretch each channel to the full 0..255 range.
        normalized_planes.append(
            cv2.normalize(
                flattened,
                None,
                alpha=0,
                beta=255,
                norm_type=cv2.NORM_MINMAX,
                dtype=cv2.CV_8UC1,
            )
        )
    normalized = cv2.merge(normalized_planes)

    # Upscale 2x before binarising.
    upscaled = cv2.resize(normalized, None, fx=2, fy=2, interpolation=cv2.INTER_CUBIC)

    # A single-channel input has no colour axis to convert; only collapse
    # channels when they are actually present.
    if upscaled.ndim == 3:
        gray = cv2.cvtColor(upscaled, cv2.COLOR_BGR2GRAY)
    else:
        gray = upscaled

    # Fixed global threshold: values above 150 become 255, the rest 0.
    _, binary = cv2.threshold(gray, 150, 255, cv2.THRESH_BINARY)

    # Local contrast equalisation, applied to the thresholded image.
    clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
    contrast_adjusted = clahe.apply(binary)

    # Invert so the formerly dark pixels end up bright.
    return cv2.bitwise_not(contrast_adjusted)
def ocr_image(image_stream: io.BytesIO, return_img: Optional[bool] = None) -> Tuple[Dict[str, list], Optional[cv2.typing.MatLike]]:
    """Run Tesseract OCR on an encoded image held in a byte stream.

    The stream is rewound, decoded as a colour image, passed through
    ``preprocess_image`` and then handed to
    ``pytesseract.image_to_data`` with page segmentation mode 4.

    Args:
        image_stream: Stream containing the encoded image bytes (e.g. PNG).
            It is read from the start regardless of its current position.
        return_img: When truthy, also return the preprocessed image scaled
            to a height of 500 pixels.

    Returns:
        A ``(data, preview)`` tuple. ``data`` is the dictionary produced by
        ``image_to_data`` with ``Output.DICT`` (per-field lists, including a
        ``'text'`` list). ``preview`` is the resized preprocessed image, or
        None when ``return_img`` is falsy.

    Raises:
        ValueError: If the stream is empty or its bytes cannot be decoded
            as an image.
    """
    image_stream.seek(0)
    encoded = np.frombuffer(image_stream.read(), dtype=np.uint8)
    if encoded.size == 0:
        raise ValueError("image_stream is empty")

    img = cv2.imdecode(encoded, cv2.IMREAD_COLOR)
    # imdecode signals failure by returning None rather than raising; fail
    # here with a clear message instead of deep inside the preprocessing.
    if img is None:
        raise ValueError("image_stream does not contain a decodable image")

    processed_img = preprocess_image(img)
    data = pytesseract.image_to_data(processed_img, output_type=Output.DICT, config='--psm 4')

    # Only pay for the resize when the caller asked for the preview.
    preview = image_resize(processed_img, height=500) if return_img else None
    return data, preview
if __name__ == "__main__":
    # Demo: download an image, OCR it, print the recognised words and show
    # the preprocessed image in a window until a key is pressed.
    import sys

    headers = {"User-Agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.81 Safari/537.36'}
    assim_url = "https://cdn.discordapp.com/attachments/1005603724066029610/1228232883349749800/image.png?ex=662b4c02&is=6618d702&hm=19529ffce15930f392286f13a1f2921d253904af4de019c6bbf0c2c9d938de3d&"

    # NOTE(review): the default link appears to carry signed expiry
    # parameters and may no longer resolve, so accept a replacement URL as
    # the first command-line argument.
    url = sys.argv[1] if len(sys.argv) > 1 else assim_url

    request = urllib.request.Request(url, headers=headers)
    # A timeout keeps the script from hanging forever on a stalled server.
    with urllib.request.urlopen(request, timeout=30) as f:
        image_data = f.read()

    data, img = ocr_image(io.BytesIO(image_data), True)

    # Skip blank entries so the output is not padded with runs of spaces.
    print(' '.join(word for word in data['text'] if word.strip()))

    cv2.imshow("img", img)
    cv2.waitKey(0)
    # Release the preview window once the key press has been received.
    cv2.destroyAllWindows()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment