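"""Bank Statement OCR: a Gradio app that segments an uploaded image into text
lines with OpenCV and runs Tesseract (English + Khmer) OCR on each line."""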
import cv2
import numpy as np
import pytesseract
import gradio as gr
from PIL import Image
def process_image(input_image):
    try:
        # Convert the input to an OpenCV-style BGR array
        if isinstance(input_image, Image.Image):
            # PIL images are RGB; flip channels for OpenCV processing
            img = cv2.cvtColor(np.array(input_image), cv2.COLOR_RGB2BGR)
        else:
            # Otherwise treat it as a file path; cv2.imread already returns BGR
            img = cv2.imread(input_image)
        # Check that the image was decoded into the expected format
        if img is None or img.dtype != np.uint8:
            raise ValueError("Could not read the image. Please check the image format.")
        # Binarize with Otsu thresholding, inverted so text becomes white on black
        gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
        _, thresh = cv2.threshold(gray, 0, 255, cv2.THRESH_OTSU | cv2.THRESH_BINARY_INV)
        # Dilate with a rectangular kernel so characters merge into line-sized blobs
        rect_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (18, 18))
        dilation = cv2.dilate(thresh, rect_kernel, iterations=1)
        # Each external contour of the dilated image approximates one text line
        text_lines = []
        contours, _ = cv2.findContours(dilation, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
        for cnt in contours:
            x, y, w, h = cv2.boundingRect(cnt)
            text_lines.append((y, y + h, x, x + w))
        # Sort text lines top-to-bottom by their y-coordinates
        text_lines.sort(key=lambda line: line[0])
        # Run Tesseract on each cropped line: English + Khmer, LSTM engine (--oem 3),
        # treating each crop as a single uniform block of text (--psm 6)
        recognized_text = []
        custom_config = r'-l eng+khm --oem 3 --psm 6'
        for y_min, y_max, x_min, x_max in text_lines:
            # Crop the line and flip BGR -> RGB before handing it to Tesseract
            cropped_img = cv2.cvtColor(img[y_min:y_max, x_min:x_max], cv2.COLOR_BGR2RGB)
            extracted_text = pytesseract.image_to_string(cropped_img, config=custom_config)
            recognized_text.append(extracted_text.strip())
        # Combine the recognized lines into a single string
        full_text = "\n".join(recognized_text)
        # Draw bounding boxes on an RGB copy so colors display correctly in Gradio
        result_rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        for y_min, y_max, x_min, x_max in text_lines:
            cv2.rectangle(result_rgb, (x_min, y_min), (x_max, y_max), (0, 255, 0), 2)
        return full_text, result_rgb
    except Exception as e:
        return "Could not process the image. Error: " + str(e), None
iface = gr.Interface(
    fn=process_image,
    inputs=gr.Image(type="pil", label="Input Image"),
    outputs=[
        gr.Text(label="Detected Text"),
        gr.Image(type="numpy", label="Processed Image"),
    ],
    title="Bank Statement OCR",
    description="Upload an image containing text to perform OCR and see the detected text and image.",
    flagging_options=["blurry", "incorrect", "other"],
)
iface.launch(debug=True, share=True)
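
# Note: this app assumes the Tesseract binary and its language data are installed,
# e.g. the Debian packages tesseract-ocr, tesseract-ocr-eng and tesseract-ocr-khm
# (on Hugging Face Spaces these would typically be listed in packages.txt).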