import spacy from spacy.language import Language from spacy.lang.it import Italian import re from transformers import pipeline from gradio.inputs import File import gradio as gr from pdf2image import convert_from_path import pytesseract import tempfile import os from gradio.inputs import Dropdown import gradio as gr import tempfile import os from pdf2image import convert_from_path import pytesseract import fitz from pdf2image import convert_from_bytes def preprocess_punctuation(text): pattern = r'(?= 5.0) and (float(match) in numeric_list)] ###### remove duplicates if unique_values: numbers = list(set(numbers)) ###### total = 0 sum = 0 total_list = [] # Define a regular expression pattern that will match a number pattern = r'\d+' # Loop through the keywords and search for them in the text found = False for keyword in keywords: # Build a regular expression pattern that looks for the keyword # followed by up to three words, then a number keyword_pattern = f'{keyword}(\\s+\\w+){{0,3}}\\s+({pattern})' match = re.search(keyword_pattern, text, re.IGNORECASE) if match: # If we find a match, print the number and set found to True number = match.group(2) if (number in numbers) and (number in numeric_list): total_list.append(int(number)) print(f"Found a value ({number}) for keyword '{keyword}'.") found = True # If we didn't find a match if not found: for value in numbers: if value in numeric_list: total += value total_list.append(total) #If there is more than one total, it means different lots with many total measures for each house. Calculate the sum of the totals mq for value in total_list: sum += value return numbers, sum def extractor_clean(text, k_words, transformer, question, total_kwords, return_text = False): tex = '' dictionary = get_text_and_values(text, k_words) raw = get_values(dictionary) qa = initialize_qa_transformer(transformer) val = get_answers_unfiltered(dictionary, question = question, qa_pipeline = qa) keywords = ['totale', 'complessivo', 'complessiva'] values = get_total(answered_values= val, raw_values = raw, text = text, keywords = total_kwords, unique_values = True) if return_text: tex = get_useful_text(dictionary) return values, return_text, tex elif return_text == False: return values, return_text def format_output(extracted_values): output = {} values_output = "\n".join([f"mq. {value}" for value in extracted_values[0]]) output["Mq. Values"] = values_output output["Total"] = extracted_values[1] if extracted_values[2]: output["Ref. Text"] = extracted_values[2] return output def pdf_ocr(file, model_t, question): # Convert PDF to image with tempfile.TemporaryDirectory() as path: with open(file, "rb") as f: content = f.read() with fitz.open(stream=content, filetype="pdf") as doc: num_pages = len(doc) # Extract text from the PDF text = "" for page in doc: text += page.get_text() # Perform OCR on the PDF if the extracted text is empty if not text: # Convert PDF pages to images images = convert_from_path(content) for i, img in enumerate(images): text += pytesseract.image_to_string(img, lang='ita') # Clear the image list to free up memory del images ks = ('mq', 'metri quadri', 'm2') quest = "Quanti metri quadri misura la superficie?" totalK = ['totale', 'complessivo', 'complessiva'] extracted_values = extractor_clean(text=text, k_words=ks, transformer=model_t, question=question, total_kwords=totalK, return_text=True) values_output = extracted_values[0][0] total_output = f'{extracted_values[0][1]} Mq' text_output = extracted_values[2] immobile_values = [f'{i + 1}. Immobile : {value} Mq\n' for i, value in enumerate(values_output)] immobile_values = '\n'.join(immobile_values) return immobile_values, total_output, text_output def ocr_interface(pdf_file, model_t, question): # Call the pdf_ocr function values, total, text = pdf_ocr(pdf_file.name, model_t, question) return values, total, text with gr.Blocks(theme=gr.themes.Soft()) as demo: gr.Markdown( ''' # PDF Mq Extractor ''') with gr.Tab("Extractor"): with gr.Row(): pdf_input = gr.components.File(label="PDF File") with gr.Row(): model_input = gr.components.Dropdown(['it5/it5-small-question-answering', 'it5/it5-base-question-answering'], value= 'it5/it5-small-question-answering', label = 'Select model') question_input = gr.components.Dropdown(["Quanti metri quadri misura l'immobile?"], value = "Quanti metri quadri misura l'immobile?", label = 'Question') with gr.Column(): gr.Markdown( ''' # Output values Values extracted from the pdf document ''') with gr.Row(): values_output = gr.components.Textbox(label="Area Values") total_output = gr.components.Textbox(label="Total") text_output = gr.components.Textbox(label="Ref. Text") with gr.Row(): extract_button = gr.Button("Extract") extract_button.click(fn = ocr_interface, inputs=[pdf_input, model_input, question_input], outputs=[values_output, total_output, text_output]) gr.Examples(['Example1.pdf', 'Example2.pdf'], inputs = pdf_input) demo.launch()