import cv2
import os
import pytesseract
import gradio as gr
from gradio import Interface, Image, Text
import numpy as np
from PIL import Image  # NOTE: intentionally shadows gradio's Image import; PIL's Image is the one used below
from PIL import UnidentifiedImageError


def process_image(input_image):
    """Run line-level OCR (English + Khmer) on an uploaded image.

    Parameters
    ----------
    input_image : PIL.Image.Image or str
        The image to process. Gradio supplies a PIL image; a file path is
        also accepted and read with OpenCV.

    Returns
    -------
    tuple[str, numpy.ndarray | None]
        The recognized text (one detected line per row, joined with
        newlines) and an RGB copy of the image with green bounding boxes
        drawn around each detected text line. On failure, returns an error
        message string and ``None``.
    """
    try:
        if isinstance(input_image, Image.Image):
            # Gradio hands us a PIL image in RGB. Normalize modes such as
            # RGBA or grayscale ("L") to plain RGB first, then swap to BGR,
            # which is OpenCV's working channel order.
            img = cv2.cvtColor(np.array(input_image.convert("RGB")), cv2.COLOR_RGB2BGR)
        else:
            # A file path: cv2.imread already returns BGR, so no channel
            # swap is needed (the original code swapped here too, which
            # corrupted colors for path inputs).
            img = cv2.imread(input_image)

        if img is None or img.dtype != np.uint8:
            raise ValueError("Could not read the image. Please check the image format.")

        # Binarize with Otsu (inverted so text becomes white foreground),
        # then dilate with a large rectangular kernel so characters merge
        # into one connected blob per text line.
        gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
        _, thresh = cv2.threshold(gray, 0, 255, cv2.THRESH_OTSU | cv2.THRESH_BINARY_INV)
        rect_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (18, 18))
        dilation = cv2.dilate(thresh, rect_kernel, iterations=1)

        # Each external contour of the dilated mask approximates one text
        # line; keep its bounding box as (y_min, y_max, x_min, x_max).
        contours, _ = cv2.findContours(dilation, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
        text_lines = []
        for cnt in contours:
            x, y, w, h = cv2.boundingRect(cnt)
            text_lines.append((y, y + h, x, x + w))
        # Read lines top-to-bottom.
        text_lines.sort(key=lambda line: line[0])

        # OCR each line crop individually. -l eng+khm: English + Khmer;
        # --oem 3: default/LSTM engine; --psm 6: assume a single uniform
        # block of text. Hoisted out of the loop (it is constant).
        custom_config = r'-l eng+khm --oem 3 --psm 6'
        recognized_text = []
        for y_min, y_max, x_min, x_max in text_lines:
            cropped_img = img[y_min:y_max, x_min:x_max]
            extracted_text = pytesseract.image_to_string(cropped_img, config=custom_config)
            recognized_text.append(extracted_text.strip())

        full_text = "\n".join(recognized_text)

        # Draw the detected boxes on a copy, then convert BGR -> RGB so the
        # image displays with correct colors in the Gradio component.
        annotated = img.copy()
        for y_min, y_max, x_min, x_max in text_lines:
            cv2.rectangle(annotated, (x_min, y_min), (x_max, y_max), (0, 255, 0), 2)
        result_rgb = cv2.cvtColor(annotated, cv2.COLOR_BGR2RGB)

        return full_text, result_rgb
    except Exception as e:
        # Top-level boundary for the Gradio callback: surface the error in
        # the UI instead of crashing the app.
        return "Could not process the image. Error: " + str(e), None


iface = gr.Interface(
    process_image,
    # Fixed label: the input component was mislabeled "Processed Image"
    # (copy-paste from the output component).
    inputs=[gr.Image(type="pil", label="Input Image")],
    outputs=[
        gr.Text(label="Detected Labels"),
        gr.Image(type="pil", label="Processed Image"),
    ],
    title="Bank Statement OCR",
    # description="Upload an image containing text to perform OCR and see the detected text and image."
    flagging_options=["blurry", "incorrect", "other"],
)

if __name__ == "__main__":
    iface.launch(debug=True, share=True)