"""Gradio demo that extracts fields from a scanned death certificate.

Pipeline: downscale -> deskew (ONNX CNN) -> crop + denoise (ONNX autoencoder)
-> PaddleOCR. The first three OCR detections are shown as first name, surname
and date of death.
"""
import re

import cv2
import numpy as np
from paddleocr import PaddleOCR
from PIL import Image
import matplotlib.pyplot as plt
import pandas as pd
import onnxruntime
import gradio as gr

# initialize the OCR
ocr = PaddleOCR(lang='sl', enable_mkldnn=True, cls=False, show_log=False)

# initialize the models
model_deskew = onnxruntime.InferenceSession("./models/CNN_deskew_v0.0.2.onnx")
model_denoise = onnxruntime.InferenceSession("./models/autoencoder_denoise_v0.0.2.onnx")

# Class index emitted by the deskew model -> skew angle in degrees.
# The odd ordering is kept exactly as in the original table (it looks like the
# labels were sorted as strings when the model was trained).
_SKEW_CLASS_TO_ANGLE = {
    0: -1, 1: -10, 2: -11, 3: -12, 4: -13, 5: -14, 6: -15,
    7: -2, 8: -3, 9: -4, 10: -5, 11: -6, 12: -7, 13: -8, 14: -9,
    15: 0, 16: 1, 17: 10, 18: 11, 19: 12, 20: 13, 21: 14, 22: 15,
    23: 180, 24: 2, 25: 270, 26: 3, 27: 4, 28: 5, 29: 6, 30: 7,
    31: 8, 32: 9, 33: 90,
}

# Characters stripped from the start/end of a string by the clean_string_* helpers.
_EDGE_NOISE_CHARS = ('!', "'", '[', ']', '*', '|', '.', ':', '\\', '/')

# Characters considered noise inside a date string.
_DATE_NOISE_RE = re.compile(r'[a-zA-Z!\[\|]')

_OCR_COLUMNS = ["Text", "Score", "Boundary Box"]


##### All Functions #####

def preprocess_image(image):
    '''
    Function: downscale the input image so later steps are lighter to run
    Input: image (anything np.array() accepts)
    Output: image as an array, both sides divided by 1.494
    '''
    image = np.array(image)
    scale = 1.494
    # clamp to 1 px so cv2.resize never receives a zero-sized target
    width = max(1, int(image.shape[1] / scale))
    height = max(1, int(image.shape[0] / scale))
    return cv2.resize(image, (width, height), interpolation=cv2.INTER_AREA)


def deskew(image, model):
    '''
    Function: deskew an image
    Input: image as an array, and the ONNX deskew model
    Output: (deskewed image, predicted angle in degrees, confidence in percent)
    '''
    # NOTE(review): COLOR_BGR2GRAY assumes BGR channel order; confirm this
    # matches what the caller supplies and how the model was trained.
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

    # shrink to 20% first, then to the fixed 200x200 model input
    small_width = max(1, int(gray.shape[1] * 0.2))
    small_height = max(1, int(gray.shape[0] * 0.2))
    small = cv2.resize(gray, (small_width, small_height), interpolation=cv2.INTER_AREA)
    model_input = cv2.resize(small, (200, 200))

    # add batch and channel dimensions, then normalize to [0, 1]
    model_input = model_input.astype('float32').reshape(1, 200, 200, 1)
    model_input = model_input / 255

    predictions = model.run(None, {'conv2d_input': model_input})

    # best prediction and its score
    pred = int(predictions[0].argmax())
    angle = _SKEW_CLASS_TO_ANGLE[pred]
    skew_confidence = predictions[0][0][pred] * 100

    # quarter turns are lossless rotations, so they get a dedicated path
    if angle == 90:
        return cv2.rotate(image, cv2.ROTATE_90_COUNTERCLOCKWISE), angle, skew_confidence
    if angle == 270:
        return cv2.rotate(image, cv2.ROTATE_90_CLOCKWISE), angle, skew_confidence

    # every other angle: rotate around the image centre by -angle
    (h, w) = image.shape[:2]
    center = (w // 2, h // 2)
    rotation = cv2.getRotationMatrix2D(center, -angle, 1.0)
    deskewed_image = cv2.warpAffine(image, rotation, (w, h),
                                    flags=cv2.INTER_CUBIC,
                                    borderMode=cv2.BORDER_REPLICATE)
    return deskewed_image, angle, skew_confidence


def prepare_image_to_autoencoder(image):
    '''
    Function: prepare the image to be passed to the autoencoder.
    Input: deskewed image
    Output: cropped, grayscale, [0, 1]-normalized image of shape (600, 600, 1)
    '''
    height, width = image.shape[:2]
    target_height = 600
    target_width = 600

    # keep only a fixed sub-region of the page (ratios are relative to image size)
    image = image[int(height / 3.6): int(height / 1.87),
                  int(width / 3.67): int(width / 1.575)]

    # reshape image to fixed size
    image = cv2.resize(image, (target_width, target_height))
    image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

    # normalize, then add the channel axis expected by the autoencoder
    image = image / 255.0
    return image.reshape(target_height, target_width, 1)


def autoencode_ONNX(image, model):
    '''
    Function: remove noise from image
    Input: image (600 x 600 x 1 values) and autoencoder model
    Output: denoised image as a uint8 array
    '''
    batch = image.astype(np.float32).reshape(1, 600, 600, 1)
    denoised = model.run(None, {'input_2': batch})[0].squeeze()
    # clip before the cast so values outside 0..255 cannot wrap around in uint8
    return np.clip(denoised * 255, 0, 255).astype('uint8')


def extract_detected_entries_pdl(image):
    """
    Run OCR on an image and return the detections as a DataFrame.

    Parameters
    ----------
    image : numpy.ndarray
        The input image to be processed.

    Returns
    -------
    pandas.DataFrame
        One row per detected text entry, with the columns:
        - "Text": The detected text (after cleanString_basic).
        - "Score": The confidence score for the detected text.
        - "Boundary Box": The coordinates of the boundary box for the detected text.
        The frame is empty (same columns) when nothing is detected.
    """
    result = ocr.ocr(image)

    # an empty or None first element means no detections; treat it as such
    detections = result[0] if result and result[0] else []

    txt = [cleanString_basic(entry[-1][0]) for entry in detections]
    scores = [entry[-1][1] for entry in detections]
    boxes = [entry[0] for entry in detections]

    # Build column-wise: stacking the three lists in one NumPy array is ragged
    # (boxes are nested lists) and raises on recent NumPy versions.
    return pd.DataFrame(
        {"Text": txt, "Score": scores, "Boundary Box": boxes},
        columns=_OCR_COLUMNS,
    )


def cleanString_basic(word):
    '''Replace every "$" in word with "s" and return the result.'''
    return word.replace("$", "s")


def clean_string_start(string: str):
    '''
    Function: strip one leading noise character, if present.
    Input: string
    Output: (cleaned string, removed character or "√" when nothing was removed)
    '''
    if string.startswith(_EDGE_NOISE_CHARS):
        return string[1:], string[0]
    return string, "√"


def clean_string_end(string: str):
    '''
    Function: strip one trailing noise character, if present.
    Input: string
    Output: (cleaned string, removed character or "√" when nothing was removed)
    '''
    if string.endswith(_EDGE_NOISE_CHARS):
        return string[:-1], string[-1]
    return string, "√"


def clean_dates(date: str):
    '''
    Function: cleans the fields "datum smrti" and returns the chars removed.
    Input: date (string format)
    Output: (cleaned string, list of removed characters or "Y" when none were found)
    '''
    removed = _DATE_NOISE_RE.findall(date)
    date_flags = removed if removed else "Y"
    return _DATE_NOISE_RE.sub('', date), date_flags


def _field_with_confidence(df, position):
    '''Return (text, "<score>%") for the OCR row at position, or ("", "") if absent.'''
    if position >= len(df):
        return "", ""
    row = df.iloc[position]
    confidence = round(float(row.iloc[1]) * 100, 3)
    return row.iloc[0], f"{confidence}%"


##### Main Function #####

def pdf_extract_gr(image):
    '''
    Function: run the full pipeline on one uploaded image.
    Input: image as delivered by the Gradio image component
    Output: (OCR dataframe, deskewed image, skew angle, skew confidence,
             denoised image, first name, first name confidence, surname,
             surname confidence, date of death, date of death confidence)
    '''
    extractimg = preprocess_image(image)

    # deskew the image
    deskewed_image, angle, skew_confidence = deskew(extractimg, model_deskew)

    # prepare the image for the autoencoder, then clean it
    cleanimg = prepare_image_to_autoencoder(deskewed_image)
    img = autoencode_ONNX(cleanimg, model_denoise)

    # extract the entries from the image
    df = extract_detected_entries_pdl(img)

    # rows 0, 1, 2 are taken as first name, surname and date of death;
    # missing rows yield empty strings instead of raising
    firstname, firstnameconfidence = _field_with_confidence(df, 0)
    surname, surnameconfidence = _field_with_confidence(df, 1)
    dodname, dodconfidence = _field_with_confidence(df, 2)

    return (df, deskewed_image, angle, skew_confidence, img,
            firstname, firstnameconfidence,
            surname, surnameconfidence,
            dodname, dodconfidence)


##### Gradio Style #####

css = """
.run_container {
  display: flex;
  flex-direction: column;
  align-items: center;
  gap: 10px;
}

.run_btn {
  margin: auto;
  width: 50%;
  display: flex;
}

.upload_cell {
  margin: auto;
  display: flex;
}

.results_container {
  display: flex;
  justify-content: space-evenly;
}

.results_cell {
}
"""

##### Gradio Blocks #####

with gr.Blocks(css=css) as demo:
    gr.Markdown("""
    # Death Certificate Extraction
    """, elem_classes="h1")
    gr.Markdown("Upload a PDF, extract data")

    with gr.Box(elem_classes="run_container"):
        # the button text is set through `value`; `label` does not control it
        ExtractButton = gr.Button(value="Extract", elem_classes="run_btn")

    with gr.Row(elem_id="hide"):
        with gr.Column():
            ExtractInput = gr.Image()
        with gr.Column():
            with gr.Row(elem_classes="results_container"):
                FirstNameBox = gr.Textbox(label="First Name", elem_classes="results_cell")
                FirstNameConfidenceBox = gr.Textbox(label="First Name Confidence", elem_classes="results_cell")
            with gr.Row(elem_classes="results_container"):
                SurnameNameBox = gr.Textbox(label="Surname", elem_classes="results_cell")
                SurnameNameConfidenceBox = gr.Textbox(label="Surname Confidence", elem_classes="results_cell")
            with gr.Row(elem_classes="results_container"):
                DODBox = gr.Textbox(label="Date of Death", elem_classes="results_cell")
                DODConfidenceBox = gr.Textbox(label="Date of Death Confidence", elem_classes="results_cell")

    with gr.Accordion("Full Results", open=False):
        ExtractDF = gr.Dataframe(label="Results")

    with gr.Accordion("Clean Image", open=False):
        CleanOutput = gr.Image()

    with gr.Accordion("Deskew", open=False):
        DeskewOutput = gr.Image()
        with gr.Column():
            DeskewAngle = gr.Number(label="Angle")
        with gr.Column():
            DeskewConfidence = gr.Number(label="Confidence")

    # output order must match the tuple returned by pdf_extract_gr
    ExtractButton.click(
        fn=pdf_extract_gr,
        inputs=ExtractInput,
        outputs=[ExtractDF, DeskewOutput, DeskewAngle, DeskewConfidence, CleanOutput,
                 FirstNameBox, FirstNameConfidenceBox,
                 SurnameNameBox, SurnameNameConfidenceBox,
                 DODBox, DODConfidenceBox],
    )

demo.launch(show_api=True, share=False, debug=True)