Spaces:
Sleeping
Sleeping
fix page 6-7 extraction issue
Browse files
app.py
CHANGED
@@ -159,6 +159,12 @@ if 'page_count' in st.session_state:
|
|
159 |
else:
|
160 |
st.session_state.num_pages_to_extract = st.slider('Number of pages to extract:', min_value=1, max_value=st.session_state.page_count, value=st.session_state.num_pages_to_extract , key='num_pages_to_extract_slider')
|
161 |
|
|
|
|
|
|
|
|
|
|
|
|
|
162 |
st.write(f"num of pages to extract {st.session_state.num_pages_to_extract}")
|
163 |
if 'run_button' in st.session_state and st.session_state.run_button == True:
|
164 |
st.session_state.running = True
|
@@ -175,16 +181,17 @@ if 'page_count' in st.session_state:
|
|
175 |
pdf_tables_image_list=[]
|
176 |
st.session_state.pdf_text_list=[]
|
177 |
|
178 |
-
for page_number in range(st.session_state.num_pages_to_extract
|
|
|
179 |
image = pdf2image.convert_from_path(st.session_state.uploaded_pdf_path, first_page=page_number+1, last_page=page_number+1)
|
180 |
st.session_state.color_image_list.append(image[0])
|
181 |
-
progress_percentage = (page_number) / (st.session_state.num_pages_to_extract
|
182 |
read_pdf_progress_bar.progress(progress_percentage)
|
183 |
read_pdf_progress_bar.progress(0)
|
184 |
for index, image in enumerate(st.session_state.color_image_list):
|
185 |
image_np = np.array(image)
|
186 |
st.session_state.gray_image_np_list.append(cv2.cvtColor(np.array(image_np), cv2.COLOR_BGR2GRAY))
|
187 |
-
progress_percentage = (index) / (st.session_state.
|
188 |
read_pdf_progress_bar.progress(progress_percentage)
|
189 |
st.session_state.extracted_text = ""
|
190 |
|
@@ -195,34 +202,38 @@ if 'page_count' in st.session_state:
|
|
195 |
|
196 |
|
197 |
figures_image_list,tables_image_list,textbox_image_list,text=utils.gray_pdf_image_np_to_text(index,gray_pdf_image_np, debug=True)
|
198 |
-
|
|
|
199 |
st.session_state.pdf_figures_image_list.append(figures_image_list)
|
200 |
st.session_state.pdf_tables_image_list.append(tables_image_list)
|
201 |
st.session_state.pdf_textbox_image_list.append(textbox_image_list)
|
202 |
if st.session_state.pdf_figures_image_list[index]:
|
203 |
-
|
204 |
-
|
205 |
-
|
206 |
-
|
|
|
207 |
if st.session_state.pdf_tables_image_list:
|
208 |
-
|
209 |
-
|
210 |
-
|
211 |
-
|
|
|
212 |
if st.session_state.pdf_textbox_image_list:
|
213 |
textbox_index = 1
|
214 |
-
|
215 |
-
|
216 |
-
|
217 |
-
|
218 |
-
|
|
|
219 |
|
220 |
|
221 |
-
|
222 |
-
|
223 |
# st.write(text)
|
224 |
# print(text)
|
225 |
-
progress_percentage = (index) / (st.session_state.
|
226 |
read_pdf_progress_bar.progress(progress_percentage)
|
227 |
st.session_state.figure_zip_bytes=zip_directory(temp_figure_dir)
|
228 |
st.session_state.table_zip_bytes = zip_directory(temp_table_dir)
|
|
|
159 |
else:
|
160 |
st.session_state.num_pages_to_extract = st.slider('Number of pages to extract:', min_value=1, max_value=st.session_state.page_count, value=st.session_state.num_pages_to_extract , key='num_pages_to_extract_slider')
|
161 |
|
162 |
+
if 'num_pages_to_extract2'not in st.session_state:
|
163 |
+
st.session_state.num_pages_to_extract2 = st.slider('Number of pages to extract:', min_value=1, max_value=st.session_state.page_count, value=6, key='num_pages_to_extract_slider2')
|
164 |
+
else:
|
165 |
+
st.session_state.num_pages_to_extract2 = st.slider('Number of pages to extract:', min_value=1, max_value=st.session_state.page_count, value=st.session_state.num_pages_to_extract2 , key='num_pages_to_extract_slider2')
|
166 |
+
|
167 |
+
|
168 |
st.write(f"num of pages to extract {st.session_state.num_pages_to_extract}")
|
169 |
if 'run_button' in st.session_state and st.session_state.run_button == True:
|
170 |
st.session_state.running = True
|
|
|
181 |
pdf_tables_image_list=[]
|
182 |
st.session_state.pdf_text_list=[]
|
183 |
|
184 |
+
for page_number in range(st.session_state.num_pages_to_extract,
|
185 |
+
st.session_state.num_pages_to_extract2+1):
|
186 |
image = pdf2image.convert_from_path(st.session_state.uploaded_pdf_path, first_page=page_number+1, last_page=page_number+1)
|
187 |
st.session_state.color_image_list.append(image[0])
|
188 |
+
progress_percentage = (page_number-st.session_state.num_pages_to_extract) / (st.session_state.num_pages_to_extract2-st.session_state.num_pages_to_extract)
|
189 |
read_pdf_progress_bar.progress(progress_percentage)
|
190 |
read_pdf_progress_bar.progress(0)
|
191 |
for index, image in enumerate(st.session_state.color_image_list):
|
192 |
image_np = np.array(image)
|
193 |
st.session_state.gray_image_np_list.append(cv2.cvtColor(np.array(image_np), cv2.COLOR_BGR2GRAY))
|
194 |
+
progress_percentage = (index) / len(st.session_state.color_image_list)
|
195 |
read_pdf_progress_bar.progress(progress_percentage)
|
196 |
st.session_state.extracted_text = ""
|
197 |
|
|
|
202 |
|
203 |
|
204 |
figures_image_list,tables_image_list,textbox_image_list,text=utils.gray_pdf_image_np_to_text(index,gray_pdf_image_np, debug=True)
|
205 |
+
if textbox_image_list:
|
206 |
+
print("index="+str(index)+" txt book " + str(len(textbox_image_list)))
|
207 |
st.session_state.pdf_figures_image_list.append(figures_image_list)
|
208 |
st.session_state.pdf_tables_image_list.append(tables_image_list)
|
209 |
st.session_state.pdf_textbox_image_list.append(textbox_image_list)
|
210 |
if st.session_state.pdf_figures_image_list[index]:
|
211 |
+
if st.session_state.pdf_figures_image_list[index]:
|
212 |
+
for pdf_figure_text_image in st.session_state.pdf_figures_image_list[index]:
|
213 |
+
raw_image_file_name = f"page_{index+1}_{pdf_figure_text_image[0]}.png"
|
214 |
+
cleaned_image_file_name = clean_filename(raw_image_file_name)
|
215 |
+
Image.fromarray(pdf_figure_text_image[1]).save(temp_figure_dir+cleaned_image_file_name)
|
216 |
if st.session_state.pdf_tables_image_list:
|
217 |
+
if st.session_state.pdf_tables_image_list[index]:
|
218 |
+
for pdf_table_text_image in st.session_state.pdf_tables_image_list[index]:
|
219 |
+
raw_image_file_name = f"page_{index+1}_{pdf_table_text_image[0]}.png"
|
220 |
+
cleaned_image_file_name = clean_filename(raw_image_file_name)
|
221 |
+
Image.fromarray(pdf_table_text_image[1]).save(temp_table_dir + cleaned_image_file_name)
|
222 |
if st.session_state.pdf_textbox_image_list:
|
223 |
textbox_index = 1
|
224 |
+
if st.session_state.pdf_textbox_image_list[index]:
|
225 |
+
for pdf_textbox_image in st.session_state.pdf_textbox_image_list[index]:
|
226 |
+
raw_image_file_name = f"page_{index+1}_textbox_{textbox_index}.png"
|
227 |
+
cleaned_image_file_name = clean_filename(raw_image_file_name)
|
228 |
+
Image.fromarray(pdf_textbox_image).save(temp_textbox_dir + cleaned_image_file_name)
|
229 |
+
textbox_index = textbox_index + 1
|
230 |
|
231 |
|
232 |
+
st.session_state.pdf_text_list.append(text)
|
233 |
+
st.session_state.extracted_text=st.session_state.extracted_text+f"<Page {index+1} start>\n" + text + f"\n<Page {index+1} end>\n>"
|
234 |
# st.write(text)
|
235 |
# print(text)
|
236 |
+
progress_percentage = (index) / len(st.session_state.color_image_list)
|
237 |
read_pdf_progress_bar.progress(progress_percentage)
|
238 |
st.session_state.figure_zip_bytes=zip_directory(temp_figure_dir)
|
239 |
st.session_state.table_zip_bytes = zip_directory(temp_table_dir)
|
utils.py
CHANGED
@@ -393,7 +393,7 @@ def gray_pdf_image_np_to_text(image_index,gray_pdf_image_np, debug=False):
|
|
393 |
print("Table of Contents")
|
394 |
# display_image_np(max_height_image)
|
395 |
#print(text)
|
396 |
-
return
|
397 |
else:
|
398 |
print("not Table of Contents")
|
399 |
indeces_stop=get_where_image_np_two_columns_stops(max_height_image,20,10)
|
@@ -420,7 +420,8 @@ def gray_pdf_image_np_to_text(image_index,gray_pdf_image_np, debug=False):
|
|
420 |
print("no left over end")
|
421 |
text=extract_two_columns_text(max_height_image_copy,debug)
|
422 |
if text == "error":
|
423 |
-
return
|
|
|
424 |
else:
|
425 |
return figures_image_list,tables_image_list,text_box_list,text
|
426 |
else:
|
|
|
393 |
print("Table of Contents")
|
394 |
# display_image_np(max_height_image)
|
395 |
#print(text)
|
396 |
+
return None, None, None, "Table of Contents"
|
397 |
else:
|
398 |
print("not Table of Contents")
|
399 |
indeces_stop=get_where_image_np_two_columns_stops(max_height_image,20,10)
|
|
|
420 |
print("no left over end")
|
421 |
text=extract_two_columns_text(max_height_image_copy,debug)
|
422 |
if text == "error":
|
423 |
+
return None, None, None, "error"
|
424 |
+
# return()
|
425 |
else:
|
426 |
return figures_image_list,tables_image_list,text_box_list,text
|
427 |
else:
|