Build error
Build error
Browse files · Added source code for the app
@@ -1,7 +1,265 @@
1 |
import gradio as gr
2 |
3 |
def greet(name):
    """Return a greeting of the form ``"Hello <name>!!"``.

    Args:
        name: The name to greet; must be a ``str`` because it is
            concatenated directly.
    """
    return "Hello " + name + "!!"
import os
import re
import tempfile

import fitz
import gradio as gr
import pytesseract
import spacy
from gradio.inputs import Dropdown
from gradio.inputs import File
from IPython.display import Markdown
from pdf2image import convert_from_bytes
from pdf2image import convert_from_path
from spacy.lang.it import Italian
from spacy.language import Language
from transformers import pipeline


def preprocess_punctuation(text):
    """Return the distinct abbreviation-like dotted tokens found in *text*.

    A token matches when it is made of one or more segments of 1-4
    letters/dots, ends with a period, is not preceded by a lowercase letter
    and is not followed (after optional whitespace) by an uppercase letter.
    Such tokens look like abbreviations ("art.", "mq.") rather than sentence
    endings.

    Args:
        text: The text to scan.

    Returns:
        list[str]: Unique matches, in order of first appearance.
    """
    pattern = r'(?<![a-z])[a-zA-Z\.]{1,4}(?:\.[a-zA-Z\.]{1,4})*\.(?!\s*[A-Z])'
    matches = re.findall(pattern, text)
    # dict.fromkeys de-duplicates while keeping a deterministic order,
    # unlike set(), whose iteration order is arbitrary.
    return list(dict.fromkeys(matches))
def preprocess_text(text):
    """Collapse blank (or whitespace-only) lines in *text*.

    Every run that starts with a newline, contains only whitespace and ends
    with a newline is replaced by a single newline.

    Args:
        text: The raw text.

    Returns:
        str: The text without blank lines.
    """
    # Because ``\s*`` is greedy and also matches newlines, one pass already
    # leaves no two consecutive newlines, so no second substitution is needed.
    return re.sub(r'\n\s*\n', '\n', text)
@Language.component("custom_tokenizer")
def custom_tokenizer(doc):
    """Pipeline component that stops a colon from ending a sentence.

    Registered under the name ``"custom_tokenizer"`` so it can be added to a
    pipeline by name with ``nlp.add_pipe``.

    Args:
        doc: The spaCy ``Doc`` being processed.

    Returns:
        The same ``doc``, with the token after every ":" marked as not
        starting a sentence.
    """
    # The last token is skipped because there is no following token to mark.
    for token in doc[:-1]:
        if token.text == ":":
            doc[token.i + 1].is_sent_start = False
    return doc
def get_sentences(text, dictionary=None):
    """Split *text* into cleaned sentences with the Italian spaCy model.

    The "custom_tokenizer" component is inserted before the parser, and every
    abbreviation returned by ``preprocess_punctuation`` is registered as a
    tokenizer special case so it stays a single token.

    Args:
        text: The text to split.
        dictionary: Unused; kept for interface compatibility.

    Returns:
        list[str]: Non-empty sentences, stripped of leading/trailing spaces
        and newlines, with runs of spaces collapsed to one space.
    """
    nlp = spacy.load("it_core_news_lg")  # Italian model
    nlp.add_pipe("custom_tokenizer", before="parser")

    for punct in preprocess_punctuation(text):
        nlp.tokenizer.add_special_case(
            punct, [{spacy.symbols.ORTH: punct, spacy.symbols.NORM: punct}]
        )

    cl_sentences = []
    for sentence in nlp(text).sents:
        # Splitting on ' ' and dropping empty pieces collapses repeated
        # spaces only; inner newlines and tabs are left untouched.
        cl_sentence = ' '.join(filter(None, sentence.text.strip(' \n').split(' ')))
        if cl_sentence != '':
            cl_sentences.append(cl_sentence)
    return cl_sentences
def _parse_number(word):
    """Return the number embedded in *word*, or ``None`` if there is none.

    Digits, dots and commas are kept (comma read as decimal separator) and
    parsed as ``float``; if that fails, the bare digits are parsed as ``int``.
    """
    decimal = re.sub(r'[^0-9.,]+', '', word).replace(',', '.')
    if decimal.replace('.', '', 1).isdigit():
        return float(decimal)
    digits = re.sub(r'[^0-9]+', '', word)
    if digits.isdigit():
        return int(digits)
    return None


def extract_numbers(text, given_strings):
    """Collect the numbers that appear next to any of *given_strings*.

    The text is split on whitespace. For every word containing one of the
    given strings, the window made of the previous word, the word itself and
    the next word is inspected. Windows in which a word starts with ``+``,
    ``*`` or ``/`` are skipped.

    Because matching is done per whitespace-delimited word, a given string
    that itself contains a space can never match.

    Args:
        text: The sentence to analyse.
        given_strings: Substrings that mark a word as relevant.

    Returns:
        list[float | int]: Numbers found in the accepted windows, in order;
        words without a number contribute nothing.
    """
    words = text.split()
    numbers = []
    for index, word in enumerate(words):
        if not any(s in word for s in given_strings):
            continue
        context = words[max(index - 1, 0):min(index + 2, len(words))]
        # A window containing an arithmetic operator is skipped entirely.
        if any(re.match(r'[+\*/]', w) for w in context):
            continue
        parsed = (_parse_number(w) for w in context)
        numbers.extend(value for value in parsed if value is not None)
    return numbers
def get_text_and_values(text, key_list):
    """Map each sentence of *text* to the numbers found near the keywords.

    Args:
        text: The full document text.
        key_list: Keyword substrings passed to ``extract_numbers``.

    Returns:
        dict[str, list]: Sentence -> result of ``extract_numbers`` for that
        sentence; sentences with an empty result are omitted.
    """
    infoDict = {}
    for sentence in get_sentences(text):
        numbers = extract_numbers(text=sentence, given_strings=key_list)
        if numbers:
            infoDict[sentence] = numbers
    return infoDict
def get_useful_text(dictionary):
    """Return the keys of *dictionary* joined by newlines.

    Args:
        dictionary: Mapping whose keys are strings.

    Returns:
        str: One key per line, in the mapping's iteration order.
    """
    return '\n'.join(dictionary)
def get_values(dictionary):
    """Return the values of *dictionary* as a list, in iteration order."""
    return list(dictionary.values())
def initialize_qa_transformer(model):
    """Build a ``text2text-generation`` pipeline for *model*.

    Args:
        model: Model identifier forwarded to ``transformers.pipeline``.

    Returns:
        The pipeline object.
    """
    return pipeline("text2text-generation", model=model)
def get_answers_unfiltered(dictionary, question, qa_pipeline):
    """Ask *question* about every key of *dictionary*.

    Each key is used as context: the prompt sent to the pipeline is
    ``"<key> Domanda: <question>"``.

    Args:
        dictionary: Mapping whose keys are the context sentences.
        question: The question appended to every context.
        qa_pipeline: Callable taking the prompt and returning the answer.

    Returns:
        list: The raw pipeline outputs, one per key, in key order.
    """
    return [qa_pipeline(f'{kl} Domanda: {question}') for kl in dictionary]
def get_total(answered_values, text, keywords, raw_values, unique_values=False,
              min_value=5.0):
    """Validate the model's answers and compute the overall total.

    Numbers are read from the generated answers and kept only when they are
    at least *min_value* and also occur in *raw_values*. The total is then
    chosen in one of two ways:

    * If *text* contains a keyword followed, within at most three words, by
      an integer that is one of the kept numbers, the total is the sum of
      those keyword-matched numbers (several explicit totals are added up).
    * Otherwise the total is the sum of all kept numbers.

    Args:
        answered_values: Iterable of answer groups; each group is an iterable
            of dicts whose values are the generated strings.
        text: The full document text searched for the keywords.
        keywords: Plain-text keywords announcing a total (matched
            case-insensitively).
        raw_values: Iterable of lists of numbers extracted from the text;
            non-numeric entries are ignored.
        unique_values: When true, duplicate kept numbers are removed
            (first occurrence wins).
        min_value: Smallest number accepted from the answers.

    Returns:
        tuple: ``(numbers, total)``.
    """
    known_values = {
        num for sublist in raw_values for num in sublist
        if isinstance(num, (int, float))
    }

    number_pattern = re.compile(r'\d+(?:[.,]\d+)?')
    numbers = []
    for answer_group in answered_values:
        for answer in answer_group:
            for generated in answer.values():
                # Commas are decimal separators in the generated text.
                normalized = generated.replace(',', '.')
                for token in number_pattern.findall(normalized):
                    value = float(token)
                    if value >= min_value and value in known_values:
                        numbers.append(value)

    if unique_values:
        numbers = list(dict.fromkeys(numbers))

    accepted = set(numbers)
    total_list = []
    seen_spans = set()
    for keyword in keywords:
        # Keyword, up to three intervening words, then an integer.
        keyword_pattern = f'{re.escape(keyword)}(?:\\s+\\w+){{0,3}}\\s+(\\d+)'
        match = re.search(keyword_pattern, text, re.IGNORECASE)
        if not match:
            continue
        number = float(match.group(1))
        if number not in accepted:
            continue
        # Two keywords can lead to the very same number in the text
        # ("complessiva totale 120"); count it once.
        if match.span(1) in seen_spans:
            continue
        seen_spans.add(match.span(1))
        total_list.append(number)
        print(f"Found a value ({number}) for keyword '{keyword}'.")

    if not total_list:
        # No explicit total in the text: add up every validated number.
        total_list.append(sum(numbers))

    return numbers, sum(total_list)
def extractor_clean(text, k_words, transformer, question, total_kwords, return_text=False):
    """Run the full extraction: sentences -> QA answers -> validated total.

    Args:
        text: The document text.
        k_words: Keyword substrings used to select sentences and numbers.
        transformer: Model identifier for the QA pipeline.
        question: Question asked about every selected sentence.
        total_kwords: Keywords that announce an explicit total in the text.
        return_text: When truthy, also return the selected sentences.

    Returns:
        tuple: ``(values, return_text)`` or, when *return_text* is truthy,
        ``(values, return_text, sentences_text)``, where ``values`` is the
        ``(numbers, total)`` pair produced by ``get_total``.
    """
    dictionary = get_text_and_values(text, k_words)
    qa = initialize_qa_transformer(transformer)
    answers = get_answers_unfiltered(dictionary, question=question, qa_pipeline=qa)
    values = get_total(
        answered_values=answers,
        raw_values=get_values(dictionary),
        text=text,
        keywords=total_kwords,
        unique_values=True,
    )
    if return_text:
        return values, return_text, get_useful_text(dictionary)
    # Any falsy flag (not only ``False``) takes the two-element form, so the
    # function never falls through and returns None.
    return values, return_text
def format_output(extracted_values):
    """Render the extraction result as display text.

    Args:
        extracted_values: ``((numbers, total), return_text)`` or
            ``((numbers, total), return_text, reference_text)``.

    Returns:
        str: The "Valori"/"Totale" lines, followed by the reference text
        section when the flag is set and the text is present.
    """
    numbers, total = extracted_values[0][0], extracted_values[0][1]
    output = f"Valori: {numbers}\n"
    output += f"Totale: {total}\n"
    # The reference text is optional: only index it when it is there.
    if len(extracted_values) > 2 and extracted_values[1]:
        output += "-------------------\n"
        output += f"Rif. Testo:\n{extracted_values[2]}"
    return output
def _read_pdf_text(file):
    """Return the text of the PDF at *file*, falling back to OCR if needed."""
    with open(file, "rb") as f:
        content = f.read()

    # First try the embedded text layer.
    with fitz.open(stream=content, filetype="pdf") as doc:
        text = "".join(page.get_text() for page in doc)

    if text.strip():
        return text

    # No usable text layer (e.g. a scanned document): rasterise the pages and
    # OCR them. The PDF is already in memory, so convert from bytes; page
    # images go to a temporary folder that is removed afterwards.
    with tempfile.TemporaryDirectory() as path:
        images = convert_from_bytes(content, output_folder=path)
        return "".join(
            pytesseract.image_to_string(img, lang='ita') for img in images
        )


def pdf_ocr(file):
    """Extract the surface measurements from a PDF and format them.

    Args:
        file: Path of the PDF file.

    Returns:
        str: The formatted values, total and reference sentences.
    """
    text = _read_pdf_text(file)

    ks = ('mq', 'metri quadri', 'm2')
    tra = 'it5/it5-large-question-answering'
    quest = "Quanti metri quadri misura la superficie?"
    totalK = ['totale', 'complessivo', 'complessiva']

    extracted_values = extractor_clean(text=text, k_words=ks, transformer=tra, question=quest, total_kwords=totalK, return_text=True)
    output = format_output(extracted_values=extracted_values)

    return output
def ocr_interface(pdf_file):
    """Gradio callback: run ``pdf_ocr`` on the uploaded PDF.

    Args:
        pdf_file: The uploaded file. Either an object exposing the file path
            as ``.name`` or the path itself.

    Returns:
        str: The text produced by ``pdf_ocr``.
    """
    path = getattr(pdf_file, "name", pdf_file)
    return pdf_ocr(path)
# Gradio UI: one PDF upload in, one text box out.
# NOTE(review): gr.inputs / gr.outputs are the legacy component namespaces
# (removed in Gradio 4) — confirm the pinned Gradio version supports them.
pdf_input = gr.inputs.File(label="PDF File")
output_text = gr.outputs.Textbox(label="Output")
iface = gr.Interface(fn=ocr_interface, inputs=pdf_input, outputs=output_text)

if __name__ == "__main__":
    iface.launch()