import spacy from spacy.language import Language from spacy.lang.it import Italian import re from transformers import pipeline from gradio.inputs import File import gradio as gr from pdf2image import convert_from_path import pytesseract import tempfile import os from gradio.inputs import Dropdown import gradio as gr import tempfile import os from pdf2image import convert_from_path import pytesseract import fitz from pdf2image import convert_from_bytes def preprocess_punctuation(text): pattern = r'(?= 5.0) and (float(match) in numeric_list)] ###### remove duplicates if unique_values: numbers = list(set(numbers)) ###### total = 0 sum = 0 total_list = [] # Define a regular expression pattern that will match a number pattern = r'\d+' # Loop through the keywords and search for them in the text found = False for keyword in keywords: # Build a regular expression pattern that looks for the keyword # followed by up to three words, then a number keyword_pattern = f'{keyword}(\\s+\\w+){{0,3}}\\s+({pattern})' match = re.search(keyword_pattern, text, re.IGNORECASE) if match: # If we find a match, print the number and set found to True number = match.group(2) if (number in numbers) and (number in numeric_list): total_list.append(int(number)) print(f"Found a value ({number}) for keyword '{keyword}'.") found = True # If we didn't find a match if not found: for value in numbers: if value in numeric_list: total += value total_list.append(total) #If there is more than one total, it means different lots with many total measures for each house. 
# NOTE(review): this file reached review whitespace-collapsed, and part of the
# previous physical line was lost. The span replaced here began with the
# closing statements of `get_total`, whose opening half is on that damaged
# line. They are kept verbatim so nothing is lost when that function is
# restored by hand (the first five words finish the comment that precedes
# them):
#     Calculate the sum of the totals mq for value in total_list: sum += value return numbers, sum


def extractor_clean(text, k_words, transformer, question, total_kwords,
                    return_text=False):
    """Run the extraction helpers on ``text`` and return the totals.

    The helpers called here (``get_text_and_values``, ``get_values``,
    ``initialize_qa_transformer``, ``get_answers_unfiltered``, ``get_total``
    and ``get_useful_text``) are defined elsewhere in this file; this function
    only wires them together.

    Args:
        text: Text to analyse.
        k_words: Keywords forwarded to ``get_text_and_values``.
        transformer: Model identifier forwarded to
            ``initialize_qa_transformer``.
        question: Question forwarded to ``get_answers_unfiltered``.
        total_kwords: Keywords forwarded to ``get_total``.
        return_text: When truthy, the result of ``get_useful_text`` is
            appended to the returned tuple.

    Returns:
        ``(values, return_text, useful_text)`` when ``return_text`` is truthy,
        otherwise ``(values, return_text)``. ``values`` is whatever
        ``get_total`` returns.
    """
    dictionary = get_text_and_values(text, k_words)
    raw = get_values(dictionary)
    qa = initialize_qa_transformer(transformer)
    answers = get_answers_unfiltered(dictionary, question=question,
                                     qa_pipeline=qa)
    values = get_total(answered_values=answers, raw_values=raw, text=text,
                       keywords=total_kwords, unique_values=True)

    if return_text:
        return values, return_text, get_useful_text(dictionary)
    # Plain fall-through instead of `elif return_text == False`, so a falsy
    # flag such as None no longer produces an implicit `None` result.
    return values, return_text


def format_output(extracted_values):
    """Build a display dictionary from an extraction result.

    Args:
        extracted_values: Sequence whose first item is an iterable of values,
            whose second item is the total and whose optional third item is
            the reference text.

    Returns:
        A dict with the keys ``"Mq. Values"`` (one ``mq. <value>`` line per
        value) and ``"Total"``, plus ``"Ref. Text"`` when a non-empty third
        item is present.
    """
    output = {
        "Mq. Values": "\n".join(
            f"mq. {value}" for value in extracted_values[0]
        ),
        "Total": extracted_values[1],
    }
    # The third item is optional: extractor_clean returns a 2-tuple when it is
    # called with return_text=False, which used to raise IndexError here.
    if len(extracted_values) > 2 and extracted_values[2]:
        output["Ref. Text"] = extracted_values[2]
    return output


def pdf_ocr(file):
    """Extract the text of a PDF and run the square-metre extraction on it.

    The embedded text layer is read first; OCR (Italian language data) is
    used only when that layer yields no visible text.

    Args:
        file: Filesystem path of the PDF.

    Returns:
        ``(values, total, reference_text)`` taken from the result of
        ``extractor_clean``.
    """
    with open(file, "rb") as f:
        content = f.read()

    with fitz.open(stream=content, filetype="pdf") as doc:
        text = "".join(page.get_text() for page in doc)

    # Whitespace-only output means there is no usable text layer either.
    if not text.strip():
        # convert_from_path needs a path; the original passed the raw PDF
        # bytes, so the OCR fallback could never succeed.
        images = convert_from_path(file)
        text = "".join(
            pytesseract.image_to_string(img, lang='ita') for img in images
        )
        # Drop the page images before the extraction step runs.
        del images

    ks = ('mq', 'metri quadri', 'm2')
    tra = 'it5/it5-base-question-answering'
    quest = "Quanti metri quadri misura la superficie?"
    totalK = ['totale', 'complessivo', 'complessiva']
    extracted_values = extractor_clean(text=text, k_words=ks, transformer=tra,
                                       question=quest, total_kwords=totalK,
                                       return_text=True)

    # extracted_values is (values, return_text flag, reference text) and
    # values is the pair returned by get_total.
    values = extracted_values[0]
    return values[0], values[1], extracted_values[2]


def ocr_interface(pdf_file):
    """Gradio callback: run ``pdf_ocr`` on the uploaded file.

    Args:
        pdf_file: Uploaded-file object exposing the stored path as ``.name``.

    Returns:
        The ``(values, total, reference_text)`` tuple from ``pdf_ocr``.
    """
    return pdf_ocr(pdf_file.name)


def _build_interface():
    """Create the Gradio interface: one PDF input and three text outputs."""
    pdf_input = gr.inputs.File(label="PDF File")
    values_output = gr.outputs.Textbox(label="Mq. Values")
    total_output = gr.outputs.Textbox(label="Total")
    text_output = gr.outputs.Textbox(label="Ref. Text")
    # NOTE(review): `preprocess=format_output` is kept as the author wrote it;
    # confirm that the installed Gradio version actually uses this argument.
    return gr.Interface(fn=ocr_interface, inputs=pdf_input,
                        title="PDF MQ EXTRACTOR",
                        outputs=[values_output, total_output, text_output],
                        preprocess=format_output)


if __name__ == "__main__":
    # Launch only when run as a script, so importing the module has no side
    # effects.
    iface = _build_interface()
    iface.launch()