MrFeelgoood committed on
Commit
004a744
·
1 Parent(s): 12f5ffc

Update app.py

Browse files

Added source code for the app

Files changed (1) hide show
  1. app.py +261 -3
app.py CHANGED
@@ -1,7 +1,265 @@
 
 
 
 
 
 
1
  import gradio as gr
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
 
3
- def greet(name):
4
- return "Hello " + name + "!!"
5
 
6
- iface = gr.Interface(fn=greet, inputs="text", outputs="text")
 
 
7
  iface.launch()
 
1
+ import spacy
2
+ from spacy.language import Language
3
+ from spacy.lang.it import Italian
4
+ import re
5
+ from transformers import pipeline
6
+ from gradio.inputs import File
7
  import gradio as gr
8
+ from pdf2image import convert_from_path
9
+ import pytesseract
10
+ import tempfile
11
+ import os
12
+ from gradio.inputs import Dropdown
13
+ import gradio as gr
14
+ import tempfile
15
+ import os
16
+ from pdf2image import convert_from_path
17
+ import pytesseract
18
+ from IPython.display import Markdown
19
+ import fitz
20
+ from pdf2image import convert_from_bytes
21
+
22
+
23
def preprocess_punctuation(text):
    """Collect the distinct abbreviation-like tokens found in ``text``.

    A candidate is a run of dot-separated chunks of 1-4 letters/dots that ends
    with a period (for example ``art.`` or ``d.lgs.``), is not preceded by a
    lowercase letter and is not followed by optional whitespace plus an
    uppercase letter.

    Args:
        text: Raw text to scan.

    Returns:
        The unique matches as a list, in order of first appearance.
    """
    pattern = r'(?<![a-z])[a-zA-Z\.]{1,4}(?:\.[a-zA-Z\.]{1,4})*\.(?!\s*[A-Z])'
    # dict.fromkeys removes duplicates while keeping first-seen order. A set
    # would hand the matches back in a different order on every interpreter
    # run because string hashing is randomised.
    return list(dict.fromkeys(re.findall(pattern, text)))
30
+
31
+
32
def preprocess_text(text):
    """Collapse blank or whitespace-only lines in ``text``.

    Two substitutions are applied in order: a newline, any whitespace and
    another newline become one newline; then any run of two or more newlines
    becomes one newline.

    Args:
        text: The text to normalise.

    Returns:
        The text with the blank-line runs collapsed.
    """
    collapsed = text
    for blank_run in (r'\n\s*\n', r'\n{2,}'):
        collapsed = re.sub(blank_run, '\n', collapsed)
    return collapsed
38
+
39
+
40
+
41
@Language.component('custom_tokenizer')
def custom_tokenizer(doc):
    """Pipeline component that stops a colon from ending a sentence.

    For every token whose text is ``":"`` (the last token excluded), the
    following token is flagged as not starting a sentence.

    Args:
        doc: The document being processed by the pipeline.

    Returns:
        The same ``doc``, with the sentence-start flags updated in place.
    """
    for position in range(len(doc) - 1):
        if doc[position].text != ":":
            continue
        doc[position + 1].is_sent_start = False
    return doc
48
+
49
+
50
+
51
def get_sentences(text, dictionary = None):
    """Split ``text`` into cleaned sentences with the Italian spaCy model.

    The ``it_core_news_lg`` model is loaded on every call, the
    ``custom_tokenizer`` component is inserted before the parser, and every
    abbreviation returned by ``preprocess_punctuation`` is registered as a
    tokenizer special case so that it stays a single token.

    Args:
        text: The text to segment.
        dictionary: Unused; kept so existing callers keep working.

    Returns:
        A list of non-empty sentences, each with leading/trailing spaces and
        newlines removed and runs of spaces squeezed to a single space.
    """
    strip_chars = ' \n'

    nlp = spacy.load("it_core_news_lg")  # load the Italian model
    nlp.add_pipe("custom_tokenizer", before="parser")

    for abbreviation in preprocess_punctuation(text):
        special_case = [{spacy.symbols.ORTH: abbreviation, spacy.symbols.NORM: abbreviation}]
        nlp.tokenizer.add_special_case(abbreviation, special_case)

    cleaned = []
    for span in nlp(text).sents:
        # Split on single spaces only, so other whitespace inside the
        # sentence (e.g. newlines) is left untouched.
        pieces = [piece for piece in span.text.strip(strip_chars).split(' ') if piece]
        normalised = ' '.join(pieces)
        if normalised:
            cleaned.append(normalised)
    return cleaned
69
+
70
+
71
+
72
+
73
_NON_NUMERIC_RE = re.compile(r'[^0-9.,]+')
_NON_DIGIT_RE = re.compile(r'[^0-9]+')
_OPERATOR_RE = re.compile(r'[+*/]')
_THOUSANDS_RE = re.compile(r'\d{1,3}(?:\.\d{3})+,\d+')


def _parse_number(word):
    """Return the number embedded in ``word``, or ``None`` if it has none."""
    cleaned = _NON_NUMERIC_RE.sub('', word)

    # Italian notation such as "1.234,56": dots group thousands and the comma
    # is the decimal mark. Without this branch the value would fall through
    # to the integer path below and come out as 123456.
    trimmed = cleaned.strip('.,')
    if _THOUSANDS_RE.fullmatch(trimmed):
        return float(trimmed.replace('.', '').replace(',', '.'))

    decimal = cleaned.replace(',', '.')
    if decimal.replace('.', '', 1).isdigit():
        return float(decimal)

    # Last resort: glue together every digit found in the word.
    digits = _NON_DIGIT_RE.sub('', word)
    if digits.isdigit():
        return int(digits)
    return None


def extract_numbers(text, given_strings):
    """Extract the numbers that sit next to keyword occurrences in ``text``.

    ``text`` is split on whitespace. For every word containing one of
    ``given_strings`` the window made of the previous word, the word itself
    and the next word is inspected. Windows in which any word starts with
    ``+``, ``*`` or ``/`` are skipped.

    Because matching is done per whitespace-separated word, a keyword that
    itself contains a space can never match.

    Args:
        text: The sentence to analyse.
        given_strings: Iterable of keyword substrings to look for.

    Returns:
        A flat list with one entry per inspected window word: a ``float`` or
        ``int`` when the word contains a number, ``None`` otherwise. The
        ``None`` placeholders are kept for backward compatibility.
    """
    words = text.split()
    numbers = []
    for index, word in enumerate(words):
        if not any(s in word for s in given_strings):
            continue
        # Slicing clamps at the end of the list, so only the start needs a guard.
        context = words[max(index - 1, 0):index + 2]
        if any(_OPERATOR_RE.match(candidate) for candidate in context):
            continue
        numbers.extend(_parse_number(candidate) for candidate in context)
    return numbers
102
+
103
+
104
+
105
def get_text_and_values(text, key_list):
    """Map each sentence of ``text`` to the numbers found near a keyword.

    The text is segmented with ``get_sentences`` and each sentence is passed
    to ``extract_numbers``. Sentences for which that call returns an empty
    list are left out.

    Args:
        text: The full document text.
        key_list: Keyword substrings forwarded to ``extract_numbers``.

    Returns:
        A dict ``{sentence: numbers}`` in sentence order.
    """
    sentence_numbers = {}
    for sentence in get_sentences(text):
        numbers = extract_numbers(text=sentence, given_strings=key_list)
        if numbers:
            sentence_numbers[sentence] = numbers
    return sentence_numbers
116
+
117
+
118
def get_useful_text(dictionary):
    """Return the keys of ``dictionary`` joined by newlines, one key per line."""
    return '\n'.join(dictionary.keys())
122
+
123
def get_values(dictionary):
    """Return the values of ``dictionary`` as a new list, in key order."""
    return [*dictionary.values()]
126
+
127
+
128
def initialize_qa_transformer(model):
    """Create a ``text2text-generation`` transformers pipeline for ``model``.

    Args:
        model: Model identifier passed straight to ``pipeline``.

    Returns:
        The pipeline object.
    """
    return pipeline("text2text-generation", model=model)
131
+
132
+
133
def get_answers_unfiltered(dictionary, question, qa_pipeline):
    """Ask ``question`` once for every key of ``dictionary``.

    Each key is used as the context: the prompt sent to ``qa_pipeline`` is
    the key, followed by ``" Domanda: "`` and the question.

    Args:
        dictionary: Mapping whose keys are the context sentences.
        question: The question appended to every context.
        qa_pipeline: Callable taking the prompt string.

    Returns:
        The raw pipeline results, one per key, in key order.
    """
    return [
        qa_pipeline(f'{context} Domanda: {question}')
        for context in dictionary.keys()
    ]
140
+
141
+
142
def get_total(answered_values, text, keywords, raw_values, unique_values = False):
    """Validate the model answers and compute the overall total.

    Numbers are read from every value of every dict in ``answered_values``
    (decimal commas are accepted). A number is kept only if it is at least
    5.0 and also appears in ``raw_values``.

    The total is then chosen as follows: for every keyword, the first place
    in ``text`` where the keyword is followed by up to three words and then
    an integer is looked up; if that integer is one of the kept numbers it
    counts as a declared total. When at least one declared total exists the
    result is their sum; otherwise it is the sum of all kept numbers.

    Args:
        answered_values: Iterable of lists of dicts with string values.
        text: The full document text searched for total keywords.
        keywords: Words that introduce a declared total.
        raw_values: Iterable of lists of numbers (non-numeric entries such as
            ``None`` are ignored).
        unique_values: When true, duplicates are removed from the kept
            numbers, preserving first-seen order.

    Returns:
        A tuple ``(numbers, total)``.
    """
    numeric_list = [
        num
        for sublist in raw_values
        for num in sublist
        if isinstance(num, (int, float))
    ]

    answer_pattern = re.compile(r'\d+(?:[.,]\d+)?')
    numbers = []
    for answer_group in answered_values:
        for answer in answer_group:
            for generated in answer.values():
                # Decimal commas become dots so float() can parse them.
                for match in answer_pattern.findall(generated.replace(',', '.')):
                    value = float(match)
                    if value >= 5.0 and value in numeric_list:
                        numbers.append(value)

    if unique_values:
        # Order-preserving de-duplication (a set would reorder the values).
        numbers = list(dict.fromkeys(numbers))

    declared_totals = []
    for keyword in keywords:
        # The keyword is escaped so that it is always matched literally.
        # NOTE(review): the "up to three words" group is greedy and \w also
        # matches digits, so with several numbers after the keyword the
        # farthest one within range wins - confirm this is the intent.
        keyword_pattern = f'{re.escape(keyword)}(\\s+\\w+){{0,3}}\\s+(\\d+)'
        match = re.search(keyword_pattern, text, re.IGNORECASE)
        if match is None:
            continue
        number = match.group(2)
        # The match is text while ``numbers`` holds floats: convert before
        # testing membership, otherwise the test can never succeed.
        if int(number) in numbers:
            declared_totals.append(int(number))
            print(f"Found a value ({number}) for keyword '{keyword}'.")

    if not declared_totals:
        # No declared total: fall back to adding up every validated value.
        fallback = 0
        for value in numbers:
            fallback += value
        declared_totals.append(fallback)

    # Several declared totals mean several lots; the result is their sum.
    grand_total = 0
    for value in declared_totals:
        grand_total += value
    return numbers, grand_total
188
+
189
+
190
+
191
+
192
def extractor_clean(text, k_words, transformer, question, total_kwords, return_text = False):
    """Run the whole extraction chain on ``text``.

    Steps: keep the sentences that mention a keyword together with their
    nearby numbers, build the QA pipeline, ask ``question`` about each kept
    sentence, then validate the answers and compute the total with
    duplicates removed.

    Args:
        text: The full document text.
        k_words: Keyword substrings used to select sentences.
        transformer: Model identifier for the QA pipeline.
        question: Question asked about every selected sentence.
        total_kwords: Words that introduce a declared total.
        return_text: When truthy, the selected sentences are returned too.

    Returns:
        ``(values, return_text, sentences_text)`` when ``return_text`` is
        truthy, otherwise ``(values, return_text)``; ``values`` is the tuple
        produced by ``get_total``.
    """
    dictionary = get_text_and_values(text, k_words)
    raw = get_values(dictionary)
    qa = initialize_qa_transformer(transformer)
    answers = get_answers_unfiltered(dictionary, question=question, qa_pipeline=qa)
    values = get_total(
        answered_values=answers,
        raw_values=raw,
        text=text,
        keywords=total_kwords,
        unique_values=True,
    )
    if return_text:
        return values, return_text, get_useful_text(dictionary)
    # Plain fall-through instead of ``elif return_text == False`` so that a
    # falsy flag such as None or 0 no longer makes the function return None.
    return values, return_text
206
+
207
+
208
+
209
def format_output(extracted_values):
    """Render the result of ``extractor_clean`` as a text report.

    Args:
        extracted_values: Sequence whose first item is ``(numbers, total)``,
            whose second item is the "text included" flag and, when that flag
            is truthy, whose third item is the reference text.

    Returns:
        A string with the values and the total and, when the flag is truthy,
        a separator followed by the reference text.
    """
    values = extracted_values[0]
    parts = [
        f"Valori: {values[0]}\n",
        f"Totale: {values[1]}\n",
    ]
    # Truthiness (not ``== True``) mirrors how extractor_clean decides
    # whether to attach the text, so the two functions cannot disagree.
    if extracted_values[1]:
        parts.append("-------------------\n")
        parts.append(f"Rif. Testo:\n{extracted_values[2]}")
    return ''.join(parts)
216
+
217
+
218
+
219
+
220
def pdf_ocr(file):
    """Extract the text of a PDF and return the formatted extraction report.

    The embedded text layer is read with PyMuPDF. When the PDF yields no
    text (or only whitespace), every page is rendered to an image and
    recognised with Tesseract using the Italian language data.

    Args:
        file: Path of the PDF file on disk.

    Returns:
        The report string produced by ``format_output``.
    """
    with open(file, "rb") as f:
        content = f.read()

    with fitz.open(stream=content, filetype="pdf") as doc:
        text = "".join(page.get_text() for page in doc)

    # Fall back to OCR when there is no usable text layer.
    if not text.strip():
        with tempfile.TemporaryDirectory() as path:
            # ``content`` holds the PDF bytes, so the bytes-based converter
            # is required; convert_from_path expects a filesystem path.
            # Page images are written to the temporary folder instead of
            # being kept in memory all at once.
            images = convert_from_bytes(content, output_folder=path)
            text = "".join(
                pytesseract.image_to_string(img, lang='ita') for img in images
            )
            # Drop the image references before the folder is removed.
            del images

    # Fixed configuration for the extraction step.
    ks = ('mq', 'metri quadri', 'm2')
    tra = 'it5/it5-large-question-answering'
    quest = "Quanti metri quadri misura la superficie?"
    totalK = ['totale', 'complessivo', 'complessiva']

    extracted_values = extractor_clean(
        text=text,
        k_words=ks,
        transformer=tra,
        question=quest,
        total_kwords=totalK,
        return_text=True,
    )
    return format_output(extracted_values=extracted_values)
254
+
255
+
256
def ocr_interface(pdf_file):
    """Callback for the Gradio interface: process the uploaded PDF.

    Args:
        pdf_file: Uploaded-file object; its ``name`` attribute is passed to
            ``pdf_ocr`` as the path of the PDF.

    Returns:
        The report string returned by ``pdf_ocr``.
    """
    return pdf_ocr(pdf_file.name)
260
 
 
 
261
 
262
# Gradio wiring: a single PDF upload goes in, the text report comes out.
pdf_input = gr.inputs.File(label="PDF File")
output_text = gr.outputs.Textbox(label="Output")
iface = gr.Interface(
    fn=ocr_interface,
    inputs=pdf_input,
    outputs=output_text,
)
iface.launch()