zmbfeng committed on
Commit
c965cb0
1 Parent(s): 7d07000

fix page 6-7 extraction issue

Browse files
Files changed (2) hide show
  1. app.py +31 -20
  2. utils.py +3 -2
app.py CHANGED
@@ -159,6 +159,12 @@ if 'page_count' in st.session_state:
159
  else:
160
  st.session_state.num_pages_to_extract = st.slider('Number of pages to extract:', min_value=1, max_value=st.session_state.page_count, value=st.session_state.num_pages_to_extract , key='num_pages_to_extract_slider')
161
 
 
 
 
 
 
 
162
  st.write(f"num of pages to extract {st.session_state.num_pages_to_extract}")
163
  if 'run_button' in st.session_state and st.session_state.run_button == True:
164
  st.session_state.running = True
@@ -175,16 +181,17 @@ if 'page_count' in st.session_state:
175
  pdf_tables_image_list=[]
176
  st.session_state.pdf_text_list=[]
177
 
178
- for page_number in range(st.session_state.num_pages_to_extract):
 
179
  image = pdf2image.convert_from_path(st.session_state.uploaded_pdf_path, first_page=page_number+1, last_page=page_number+1)
180
  st.session_state.color_image_list.append(image[0])
181
- progress_percentage = (page_number) / (st.session_state.num_pages_to_extract-1)
182
  read_pdf_progress_bar.progress(progress_percentage)
183
  read_pdf_progress_bar.progress(0)
184
  for index, image in enumerate(st.session_state.color_image_list):
185
  image_np = np.array(image)
186
  st.session_state.gray_image_np_list.append(cv2.cvtColor(np.array(image_np), cv2.COLOR_BGR2GRAY))
187
- progress_percentage = (index) / (st.session_state.num_pages_to_extract - 1)
188
  read_pdf_progress_bar.progress(progress_percentage)
189
  st.session_state.extracted_text = ""
190
 
@@ -195,34 +202,38 @@ if 'page_count' in st.session_state:
195
 
196
 
197
  figures_image_list,tables_image_list,textbox_image_list,text=utils.gray_pdf_image_np_to_text(index,gray_pdf_image_np, debug=True)
198
- print("index="+str(index)+" txt book " + str(len(textbox_image_list)))
 
199
  st.session_state.pdf_figures_image_list.append(figures_image_list)
200
  st.session_state.pdf_tables_image_list.append(tables_image_list)
201
  st.session_state.pdf_textbox_image_list.append(textbox_image_list)
202
  if st.session_state.pdf_figures_image_list[index]:
203
- for pdf_figure_text_image in st.session_state.pdf_figures_image_list[index]:
204
- raw_image_file_name = f"page_{index+1}_{pdf_figure_text_image[0]}.png"
205
- cleaned_image_file_name = clean_filename(raw_image_file_name)
206
- Image.fromarray(pdf_figure_text_image[1]).save(temp_figure_dir+cleaned_image_file_name)
 
207
  if st.session_state.pdf_tables_image_list:
208
- for pdf_table_text_image in st.session_state.pdf_tables_image_list[index]:
209
- raw_image_file_name = f"page_{index+1}_{pdf_table_text_image[0]}.png"
210
- cleaned_image_file_name = clean_filename(raw_image_file_name)
211
- Image.fromarray(pdf_table_text_image[1]).save(temp_table_dir + cleaned_image_file_name)
 
212
  if st.session_state.pdf_textbox_image_list:
213
  textbox_index = 1
214
- for pdf_textbox_image in st.session_state.pdf_textbox_image_list[index]:
215
- raw_image_file_name = f"page_{index+1}_textbox_{textbox_index}.png"
216
- cleaned_image_file_name = clean_filename(raw_image_file_name)
217
- Image.fromarray(pdf_textbox_image).save(temp_textbox_dir + cleaned_image_file_name)
218
- textbox_index = textbox_index + 1
 
219
 
220
 
221
- st.session_state.pdf_text_list.append(text)
222
- st.session_state.extracted_text=st.session_state.extracted_text+f"<Page {index+1} start>\n" + text + f"\n<Page {index+1} end>\n>"
223
  # st.write(text)
224
  # print(text)
225
- progress_percentage = (index) / (st.session_state.num_pages_to_extract - 1)
226
  read_pdf_progress_bar.progress(progress_percentage)
227
  st.session_state.figure_zip_bytes=zip_directory(temp_figure_dir)
228
  st.session_state.table_zip_bytes = zip_directory(temp_table_dir)
 
159
  else:
160
  st.session_state.num_pages_to_extract = st.slider('Number of pages to extract:', min_value=1, max_value=st.session_state.page_count, value=st.session_state.num_pages_to_extract , key='num_pages_to_extract_slider')
161
 
162
+ if 'num_pages_to_extract2'not in st.session_state:
163
+ st.session_state.num_pages_to_extract2 = st.slider('Number of pages to extract:', min_value=1, max_value=st.session_state.page_count, value=6, key='num_pages_to_extract_slider2')
164
+ else:
165
+ st.session_state.num_pages_to_extract2 = st.slider('Number of pages to extract:', min_value=1, max_value=st.session_state.page_count, value=st.session_state.num_pages_to_extract2 , key='num_pages_to_extract_slider2')
166
+
167
+
168
  st.write(f"num of pages to extract {st.session_state.num_pages_to_extract}")
169
  if 'run_button' in st.session_state and st.session_state.run_button == True:
170
  st.session_state.running = True
 
181
  pdf_tables_image_list=[]
182
  st.session_state.pdf_text_list=[]
183
 
184
+ for page_number in range(st.session_state.num_pages_to_extract,
185
+ st.session_state.num_pages_to_extract2+1):
186
  image = pdf2image.convert_from_path(st.session_state.uploaded_pdf_path, first_page=page_number+1, last_page=page_number+1)
187
  st.session_state.color_image_list.append(image[0])
188
+ progress_percentage = (page_number-st.session_state.num_pages_to_extract) / (st.session_state.num_pages_to_extract2-st.session_state.num_pages_to_extract)
189
  read_pdf_progress_bar.progress(progress_percentage)
190
  read_pdf_progress_bar.progress(0)
191
  for index, image in enumerate(st.session_state.color_image_list):
192
  image_np = np.array(image)
193
  st.session_state.gray_image_np_list.append(cv2.cvtColor(np.array(image_np), cv2.COLOR_BGR2GRAY))
194
+ progress_percentage = (index) / len(st.session_state.color_image_list)
195
  read_pdf_progress_bar.progress(progress_percentage)
196
  st.session_state.extracted_text = ""
197
 
 
202
 
203
 
204
  figures_image_list,tables_image_list,textbox_image_list,text=utils.gray_pdf_image_np_to_text(index,gray_pdf_image_np, debug=True)
205
+ if textbox_image_list:
206
+ print("index="+str(index)+" txt book " + str(len(textbox_image_list)))
207
  st.session_state.pdf_figures_image_list.append(figures_image_list)
208
  st.session_state.pdf_tables_image_list.append(tables_image_list)
209
  st.session_state.pdf_textbox_image_list.append(textbox_image_list)
210
  if st.session_state.pdf_figures_image_list[index]:
211
+ if st.session_state.pdf_figures_image_list[index]:
212
+ for pdf_figure_text_image in st.session_state.pdf_figures_image_list[index]:
213
+ raw_image_file_name = f"page_{index+1}_{pdf_figure_text_image[0]}.png"
214
+ cleaned_image_file_name = clean_filename(raw_image_file_name)
215
+ Image.fromarray(pdf_figure_text_image[1]).save(temp_figure_dir+cleaned_image_file_name)
216
  if st.session_state.pdf_tables_image_list:
217
+ if st.session_state.pdf_tables_image_list[index]:
218
+ for pdf_table_text_image in st.session_state.pdf_tables_image_list[index]:
219
+ raw_image_file_name = f"page_{index+1}_{pdf_table_text_image[0]}.png"
220
+ cleaned_image_file_name = clean_filename(raw_image_file_name)
221
+ Image.fromarray(pdf_table_text_image[1]).save(temp_table_dir + cleaned_image_file_name)
222
  if st.session_state.pdf_textbox_image_list:
223
  textbox_index = 1
224
+ if st.session_state.pdf_textbox_image_list[index]:
225
+ for pdf_textbox_image in st.session_state.pdf_textbox_image_list[index]:
226
+ raw_image_file_name = f"page_{index+1}_textbox_{textbox_index}.png"
227
+ cleaned_image_file_name = clean_filename(raw_image_file_name)
228
+ Image.fromarray(pdf_textbox_image).save(temp_textbox_dir + cleaned_image_file_name)
229
+ textbox_index = textbox_index + 1
230
 
231
 
232
+ st.session_state.pdf_text_list.append(text)
233
+ st.session_state.extracted_text=st.session_state.extracted_text+f"<Page {index+1} start>\n" + text + f"\n<Page {index+1} end>\n>"
234
  # st.write(text)
235
  # print(text)
236
+ progress_percentage = (index) / len(st.session_state.color_image_list)
237
  read_pdf_progress_bar.progress(progress_percentage)
238
  st.session_state.figure_zip_bytes=zip_directory(temp_figure_dir)
239
  st.session_state.table_zip_bytes = zip_directory(temp_table_dir)
utils.py CHANGED
@@ -393,7 +393,7 @@ def gray_pdf_image_np_to_text(image_index,gray_pdf_image_np, debug=False):
393
  print("Table of Contents")
394
  # display_image_np(max_height_image)
395
  #print(text)
396
- return("Table of Contents")
397
  else:
398
  print("not Table of Contents")
399
  indeces_stop=get_where_image_np_two_columns_stops(max_height_image,20,10)
@@ -420,7 +420,8 @@ def gray_pdf_image_np_to_text(image_index,gray_pdf_image_np, debug=False):
420
  print("no left over end")
421
  text=extract_two_columns_text(max_height_image_copy,debug)
422
  if text == "error":
423
- return("error")
 
424
  else:
425
  return figures_image_list,tables_image_list,text_box_list,text
426
  else:
 
393
  print("Table of Contents")
394
  # display_image_np(max_height_image)
395
  #print(text)
396
+ return None, None, None, "Table of Contents"
397
  else:
398
  print("not Table of Contents")
399
  indeces_stop=get_where_image_np_two_columns_stops(max_height_image,20,10)
 
420
  print("no left over end")
421
  text=extract_two_columns_text(max_height_image_copy,debug)
422
  if text == "error":
423
+ return None, None, None, "error"
424
+ # return()
425
  else:
426
  return figures_image_list,tables_image_list,text_box_list,text
427
  else: