import cv2 | |
import os | |
import pytesseract | |
import gradio as gr | |
from gradio import Interface, Image, Text | |
import numpy as np | |
from PIL import Image | |
from PIL import UnidentifiedImageError | |
def _load_rgb_image(input_image):
    """Return *input_image* as an RGB ``uint8`` NumPy array.

    Args:
        input_image: A ``PIL.Image.Image`` or a filesystem path readable by
            ``cv2.imread``.

    Raises:
        ValueError: If no image was supplied or the image cannot be decoded.
    """
    if input_image is None:
        raise ValueError("No image was provided.")

    if isinstance(input_image, Image.Image):
        # PIL may hand over grayscale, palette or RGBA images; normalise to
        # three RGB channels so every later OpenCV call sees the same layout.
        rgb = np.array(input_image.convert("RGB"))
    else:
        # cv2.imread only accepts a path and returns None (no exception)
        # when the file is missing or cannot be decoded.
        bgr = cv2.imread(input_image)
        if bgr is None:
            raise ValueError("Could not read the image. Please check the image format.")
        # OpenCV decodes to BGR; convert once so the rest of the pipeline,
        # pytesseract and the Gradio output all work in RGB.
        rgb = cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB)

    if rgb.dtype != np.uint8:
        raise ValueError("Could not read the image. Please check the image format.")
    return rgb


def process_image(input_image):
    """Detect text regions in an image, OCR each one and outline them.

    The image is binarised with Otsu's threshold, dilated with an 18x18
    rectangular kernel so neighbouring glyphs merge into blocks, and the
    bounding box of every external contour is passed to Tesseract
    (English + Khmer).

    Args:
        input_image: A ``PIL.Image.Image`` or a path readable by
            ``cv2.imread``.

    Returns:
        A ``(text, image)`` tuple. On success ``text`` holds the recognised
        text of all non-empty regions (top-to-bottom, then left-to-right),
        joined by newlines, and ``image`` is an RGB NumPy array with a green
        box around every detected region. On failure ``text`` is an error
        message and ``image`` is ``None``; exceptions are never propagated,
        so the UI always receives something to display.
    """
    # Loop-invariant Tesseract options: English + Khmer, default OCR engine,
    # page segmentation mode 6 (treat the crop as one uniform block of text).
    tesseract_config = r'-l eng+khm --oem 3 --psm 6'

    try:
        rgb = _load_rgb_image(input_image)

        gray = cv2.cvtColor(rgb, cv2.COLOR_RGB2GRAY)
        _, binary = cv2.threshold(
            gray, 0, 255, cv2.THRESH_OTSU | cv2.THRESH_BINARY_INV
        )
        kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (18, 18))
        dilated = cv2.dilate(binary, kernel, iterations=1)

        # findContours returns (contours, hierarchy) on OpenCV 4.x but
        # (image, contours, hierarchy) on 3.x; [-2] is the contours in both.
        contours = cv2.findContours(
            dilated, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE
        )[-2]

        # Each box is stored as (y_min, y_max, x_min, x_max).
        boxes = []
        for contour in contours:
            x, y, w, h = cv2.boundingRect(contour)
            boxes.append((y, y + h, x, x + w))
        # Reading order: top-to-bottom, ties broken left-to-right.
        boxes.sort(key=lambda box: (box[0], box[2]))

        recognized = []
        for y_min, y_max, x_min, x_max in boxes:
            crop = rgb[y_min:y_max, x_min:x_max]
            text = pytesseract.image_to_string(crop, config=tesseract_config).strip()
            # Regions that yield no text would only add blank lines.
            if text:
                recognized.append(text)
        full_text = "\n".join(recognized)

        # Draw on a copy so the pixels handed to Tesseract stay untouched.
        annotated = rgb.copy()
        for y_min, y_max, x_min, x_max in boxes:
            cv2.rectangle(annotated, (x_min, y_min), (x_max, y_max), (0, 255, 0), 2)

        return full_text, annotated
    except Exception as e:
        # UI boundary: report the failure as text instead of crashing the app.
        return "Could not process the image. Error: " + str(e), None
# Gradio UI: one uploaded image in; the recognised text and the annotated
# image returned by process_image out.
iface = gr.Interface(
    fn=process_image,
    # The input was previously labelled "Processed Image" (copied from the
    # output component), which made the two panels indistinguishable.
    inputs=[gr.Image(type="pil", label="Input Image")],
    outputs=[
        gr.Text(label="Detected Labels"),
        gr.Image(type="pil", label="Processed Image"),
    ],
    title="Bank Statement OCR",
    # description="Upload an image containing text to perform OCR and see the detected text and image."
    flagging_options=["blurry", "incorrect", "other"],
)

# NOTE(review): share=True asks Gradio for a publicly reachable link; since
# the title suggests bank statements are uploaded, confirm this is intended.
iface.launch(debug=True, share=True)