File size: 10,217 Bytes
004a744
 
 
 
 
 
5c621a1
004a744
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1464729
 
004a744
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e062a30
20f0ac1
004a744
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5568e6f
004a744
 
 
 
 
 
7e9c9f7
5d57f7a
004a744
 
20f0ac1
 
cae91d6
20f0ac1
5d57f7a
004a744
afe2042
20f0ac1
004a744
20f0ac1
 
4ebe9a5
 
004a744
20f0ac1
5d57f7a
5c621a1
 
4ebe9a5
2e44a68
20f0ac1
 
 
 
1464729
20f0ac1
5e22f3f
20f0ac1
5e22f3f
20f0ac1
 
61cf92f
 
1adfa10
 
20f0ac1
 
 
 
 
 
 
 
 
 
1464729
1ab6079
5e22f3f
1464729
20f0ac1
 
 
1adfa10
 
 
 
cae91d6
 
20f0ac1
846b634
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
import spacy
from spacy.language import Language
from spacy.lang.it import Italian
import re
from transformers import pipeline
from gradio.inputs import File
import gradio as gr
from pdf2image import convert_from_path
import pytesseract
import tempfile
import os
from gradio.inputs import Dropdown
import gradio as gr
import tempfile
import os
from pdf2image import convert_from_path
import pytesseract
import fitz
from pdf2image import convert_from_bytes


def preprocess_punctuation(text):
  """Collect the unique dotted abbreviations found in *text*.

  Matches short letter/dot sequences ending in a period that is NOT followed
  by a capitalised word (so genuine sentence-final periods are skipped).
  These are later registered as spaCy tokenizer special cases so they do not
  break sentence segmentation.
  """
  abbreviation_re = r'(?<![a-z])[a-zA-Z\.]{1,4}(?:\.[a-zA-Z\.]{1,4})*\.(?!\s*[A-Z])'
  # Deduplicate while returning a plain list (order is unspecified, as before).
  return list(set(re.findall(abbreviation_re, text)))


def preprocess_text(text):
  """Normalise vertical whitespace: collapse blank lines (including
  whitespace-only lines) down to a single newline."""
  for blank_run in (r'\n\s*\n', r'\n{2,}'):
      text = re.sub(blank_run, '\n', text)
  return text



@Language.component('custom_tokenizer')
def custom_tokenizer(doc):
    """spaCy pipeline component: never start a new sentence after a colon."""
    # Walk every token but the last; the token following ':' stays in-sentence.
    for token in doc[:-1]:
        if token.text != ":":
            continue
        doc[token.i + 1].is_sent_start = False
    return doc



def get_sentences(text, dictionary = None):
  """Split *text* into cleaned sentences with the Italian spaCy pipeline.

  Abbreviations detected by preprocess_punctuation() are registered as
  tokenizer special cases so they do not trigger false sentence breaks.
  Each sentence is stripped of surrounding spaces/newlines and internal runs
  of spaces are collapsed; empty results are dropped.

  NOTE(review): *dictionary* is accepted but never used — kept only for
  backward compatibility with existing callers.
  """
  nlp = spacy.load("it_core_news_lg")  # large Italian model
  nlp.add_pipe("custom_tokenizer", before="parser")

  # Keep dotted abbreviations together as single tokens.
  for abbreviation in preprocess_punctuation(text):
    nlp.tokenizer.add_special_case(
        abbreviation,
        [{spacy.symbols.ORTH: abbreviation, spacy.symbols.NORM: abbreviation}],
    )

  strip_chars = ''.join(set([' ', '\n']))
  cleaned = []
  for sentence in nlp(text).sents:
    normalized = ' '.join(
        filter(None, sentence.text.lstrip(strip_chars).rstrip(strip_chars).split(' '))
    )
    if normalized != '':
      cleaned.append(normalized)
  return cleaned




def extract_numbers(text, given_strings):
    """Extract numbers that appear next to any of *given_strings* in *text*.

    For every word containing one of the given substrings, the word itself and
    its immediate neighbours (one before, one after) are scanned. Contexts
    containing a mathematical operator (+, * or /) are skipped as they are
    likely formulas rather than measurements ('-' is deliberately allowed:
    it occurs in ranges and hyphenated words).

    Returns a list aligned with the scanned context words: a float (decimal
    comma/point) or int where a number was parsed, None otherwise. The None
    entries are intentional — callers use the list's truthiness to keep
    "keyword present but no number" sentences distinguishable.
    """
    words = text.split()
    # Positions of words containing any of the given substrings.
    indices = [i for i, word in enumerate(words)
               if any(s in word for s in given_strings)]
    numbers = []
    for index in indices:
        # One word of context on each side, clamped to the text bounds.
        start = max(index - 1, 0)
        end = min(index + 2, len(words))
        context = words[start:end]
        # Skip formula-like contexts.
        if any(re.match(r'[+*/]', word) for word in context):
            continue
        for word in context:
            # Hoisted: the original recomputed these substitutions per branch.
            decimal = re.sub(r'[^0-9.,]+', '', word).replace(',', '.')
            integral = re.sub(r'[^0-9]+', '', word)
            if decimal.replace('.', '', 1).isdigit():
                numbers.append(float(decimal))
            elif integral.isdigit():
                numbers.append(int(integral))
            else:
                numbers.append(None)
    return numbers



def get_text_and_values(text, key_list):
  """Map each sentence of *text* mentioning a keyword to the numbers near it.

  Returns {sentence: numbers} for every sentence where extract_numbers()
  produced a non-empty result (entries may contain None placeholders).
  """
  info = {}
  for sentence in get_sentences(text):
    found = extract_numbers(text=sentence, given_strings=key_list)
    if found:
      info[sentence] = found
  return info


def get_useful_text(dictionary):
  """Join the dictionary's keys (the matched sentences) with a visual divider."""
  separator = '\n------------------------\n'
  return separator.join(dictionary.keys())

def get_values(dictionary):
  """Return the dictionary's values (the per-sentence number lists) as a list."""
  return [*dictionary.values()]


def initialize_qa_transformer(model):
  """Build a HuggingFace text2text-generation pipeline for the given model name."""
  return pipeline("text2text-generation", model=model)


def get_answers_unfiltered(dictionary, question, qa_pipeline):
  """Run *qa_pipeline* once per sentence (dictionary key), appending *question*.

  The prompt format is '<sentence> Domanda: <question>' (Italian QA style).
  Returns the raw pipeline outputs, one per sentence, in key order.
  """
  return [qa_pipeline(f'{sentence} Domanda: {question}') for sentence in dictionary]


def get_total(answered_values, text, keywords, raw_values, unique_values = False):
    """Combine QA answers with the raw extracted values into (values, total).

    answered_values: nested lists of pipeline output dicts (text answers).
    text:            the full document text, searched for explicit totals.
    keywords:        words ('totale', ...) that may precede an explicit total.
    raw_values:      per-sentence number lists from extract_numbers().
    unique_values:   drop duplicate answer values before totalling.

    Returns (validated numbers, grand total). An answer number is kept only if
    it is >= 5 and actually occurred in the source text (guards against QA
    hallucinations). If no keyword-marked total is found, the numbers are
    summed instead; multiple totals (several lots) are added together.
    """
    # Numbers genuinely seen near a keyword in the source sentences.
    numeric_list = [num for sublist in raw_values for num in sublist
                    if isinstance(num, (int, float))]
    answer_pattern = r'\d+(?:[.,]\d+)?'
    numbers = []
    for answer_group in answered_values:
        for answer in answer_group:
            for generated in answer.values():
                generated = generated.replace(',', '.')  # Italian decimal comma
                numbers += [float(match) for match in re.findall(answer_pattern, generated)
                            if float(match) >= 5.0 and float(match) in numeric_list]
    if unique_values:
        numbers = list(set(numbers))

    total_list = []
    found = False
    int_pattern = r'\d+'
    for keyword in keywords:
        # keyword, then up to three words, then a number.
        keyword_pattern = f'{keyword}(\\s+\\w+){{0,3}}\\s+({int_pattern})'
        match = re.search(keyword_pattern, text, re.IGNORECASE)
        if match:
            number = match.group(2)
            # BUG FIX: the original compared the *string* group against lists
            # of floats with `in`, so this branch could never fire. Compare
            # numerically instead.
            if (float(number) in numbers) and (float(number) in numeric_list):
                total_list.append(int(number))
                print(f"Found a value ({number}) for keyword '{keyword}'.")
                found = True

    # No explicit total found: fall back to summing every validated number.
    # (Avoids shadowing the builtin `sum` as the original did.)
    if not found:
        total_list.append(sum(v for v in numbers if v in numeric_list))

    # More than one total means several lots: their grand total is the answer.
    return numbers, sum(total_list)



def extractor_clean(text, k_words, transformer, question, total_kwords, return_text = False):
  """End-to-end extraction: sentences -> QA answers -> validated totals.

  Returns (values, return_text, reference_text) when return_text is truthy,
  or (values, return_text) when it is False. NOTE(review): the second element
  is the *flag* itself — pdf_ocr() relies on this exact tuple shape.
  """
  dictionary = get_text_and_values(text, k_words)
  raw = get_values(dictionary)
  qa = initialize_qa_transformer(transformer)
  val = get_answers_unfiltered(dictionary, question = question, qa_pipeline = qa)
  values = get_total(answered_values = val, raw_values = raw, text = text,
                     keywords = total_kwords, unique_values = True)
  if return_text:
    return values, return_text, get_useful_text(dictionary)
  elif return_text == False:
    return values, return_text



def pdf_ocr(file, model_t, question):
    """Extract area (mq) values from the PDF at path *file*.

    Reads the embedded text layer with PyMuPDF; if the PDF has no text layer
    (a scanned document), falls back to Tesseract OCR (Italian) on rendered
    pages. Returns (per-property value listing, total string, reference text).

    Fixes vs original: removed an unused TemporaryDirectory wrapper and the
    dead locals `num_pages` and `quest`.
    """
    with open(file, "rb") as f:
        content = f.read()

    # Prefer the embedded text layer: fast and exact.
    with fitz.open(stream=content, filetype="pdf") as doc:
        text = "".join(page.get_text() for page in doc)

    if not text:
        # Scanned PDF: render pages to images and OCR them in Italian.
        images = convert_from_bytes(content)
        text = "".join(pytesseract.image_to_string(img, lang='ita') for img in images)
        del images  # free the page bitmaps promptly

    # Keywords used to locate area mentions in Italian listings.
    ks = ('mq', 'MQ', 'Mq', 'metri quadri', 'm2')
    totalK = ['totale', 'complessivo', 'complessiva']

    extracted = extractor_clean(text=text, k_words=ks, transformer=model_t,
                                question=question, total_kwords=totalK, return_text=True)
    # extractor_clean returns ((numbers, total), flag, reference_text).
    (values_output, total_value), _flag, text_output = extracted
    sor_values = sorted(values_output)
    total_output = f'{total_value}  Mq'

    immobile_values = '\n'.join(
        f'{i + 1}. Immobile :  {value}  Mq\n' for i, value in enumerate(sor_values)
    )
    return immobile_values, total_output, text_output


def ocr_interface(pdf_file, model_t='it5/it5-base-question-answering', question="Quanti metri quadri misura l'immobile?"):
    """Gradio callback: run pdf_ocr on the uploaded file and pass through its
    three outputs (values listing, total, reference text)."""
    return pdf_ocr(pdf_file.name, model_t, question)


# Start the Gradio UI (soft theme). Layout: one "Extractor" tab with a PDF
# upload, model/question dropdowns, three read-only output boxes, and an
# "Extract" button wired to ocr_interface().
with gr.Blocks(theme=gr.themes.Soft()) as demo:

    gr.Markdown(
    '''
    # PDF Mq Extractor
    Demo for ITAL-IA
    ''')
    with gr.Tab("Extractor"):
      with gr.Row():
        # PDF upload widget; ocr_interface reads its .name (temp file path).
        pdf_input = gr.components.File(label="PDF File")
     
      with gr.Row():
          # QA model choice (it5 base vs small) and the fixed question prompt.
          model_input = gr.components.Dropdown(['it5/it5-base-question-answering', 'it5/it5-small-question-answering'],
                                               value='it5/it5-base-question-answering', label = 'Select model')
          question_input = gr.components.Dropdown(["Quanti metri quadri misura l'immobile?"],
                                                  value = "Quanti metri quadri misura l'immobile?", label = 'Question')
      
      with gr.Column():
          gr.Markdown(
          '''
          # Output values
          Values extracted from the pdf document
          ''')
      
      with gr.Row():

          # Outputs: the reference sentences, the sorted per-property areas,
          # and the computed total.
          text_output = gr.components.Textbox(label="Ref. Text")
          values_output = gr.components.Textbox(label="Area Values - sorted by value")
          total_output = gr.components.Textbox(label="Total")
          
      with gr.Row():
          extract_button = gr.Button("Extract")


    # Wire the button: (pdf, model, question) -> (values, total, reference text).
    extract_button.click(fn = ocr_interface,
                         inputs=[pdf_input, model_input, question_input], outputs=[values_output, total_output, text_output])

    # Pre-cached example PDFs; these files must exist next to this script.
    gr.Examples(['Example1(scannedDoc).pdf', 'Example2.pdf', 'Example3Large.pdf'], inputs = pdf_input, 
                cache_examples = True, fn = ocr_interface, outputs = [values_output, total_output, text_output])


demo.launch()