zmbfeng committed on
Commit
bfc7f18
·
1 Parent(s): 3411406

upload to display text working

Browse files
Files changed (2) hide show
  1. app.py +135 -31
  2. utils.py +7 -7
app.py CHANGED
@@ -3,12 +3,33 @@ import pdf2image
3
  import utils
4
  import numpy as np
5
  import cv2
 
 
 
6
  import time
7
  # get https://github.com/oschwartz10612/poppler-windows/releases/tag/v22.01.0-0
8
  # poppler-utils:
9
  # Installed: 22.02.0-2ubuntu0.4
10
  # install https://github.com/UB-Mannheim/tesseract/wiki
11
  #page extraction disabled
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12
  big_text = """
13
  <div style='text-align: center;'>
14
  <h1 style='font-size: 30x;'>Locked PDF Ingestion</h1>
@@ -18,41 +39,124 @@ big_text = """
18
  st.markdown(big_text, unsafe_allow_html=True)
19
 
20
 
 
21
  if 'is_initialized' not in st.session_state:
22
  pdf_path = 'uploaded_pdf/data_sheet.pdf'
23
  st.session_state['is_initialized'] = True
24
- page_count = utils.get_pdf_page_count(pdf_path)
25
- print("page_count=",page_count)
26
- page_count=5
27
- print("new page_count=",page_count)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
28
  read_pdf_progress_bar = st.progress(0)
29
- st.session_state.color_image_list = []
30
- st.session_state.gray_image_np_list = []
31
- for page_number in range(page_count):
32
- image = pdf2image.convert_from_path(pdf_path, first_page=page_number+1, last_page=page_number+1)
33
- st.session_state.color_image_list.append(image[0])
34
- progress_percentage = (page_number) / (page_count-1)
35
- read_pdf_progress_bar.progress(progress_percentage)
36
- gray_pdf_image_np_list = []
37
- read_pdf_progress_bar.progress(0)
38
- for index, image in enumerate(st.session_state.color_image_list):
39
- image_np = np.array(image)
40
- st.session_state.gray_image_np_list.append(cv2.cvtColor(np.array(image_np), cv2.COLOR_BGR2GRAY))
41
- progress_percentage = (index) / (page_count - 1)
42
- read_pdf_progress_bar.progress(progress_percentage)
43
- # cv2.line(st.session_state.gray_image_np_list[37], (174, 227), (174, 1790), 0, 2)
44
- # cv2.line(st.session_state.gray_image_np_list[37], (1550, 227), (1550, 1790), 0, 2)
45
- # cv2.line(st.session_state.gray_image_np_list[38], (226,227),(226,1444), 0,3)
46
- # cv2.line(st.session_state.gray_image_np_list[38], (1601,227),(1601,1444), 0,2)
47
- st.session_state.img_index = 0
48
- st.session_state.stop_button_clicked=False
49
- # st.image(st.session_state.gray_image_np_list[38])
50
-
51
- for index, gray_pdf_image_np in enumerate(st.session_state.gray_image_np_list[0:5], start=0):
52
- print("index="+str(index))
53
-
54
- text=utils.gray_pdf_image_np_to_text(index,gray_pdf_image_np, debug=True)
55
- st.write(text)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
56
  #if 'img_index' not in st.session_state:
57
 
58
  # if st.button("Stop"):
 
3
  import utils
4
  import numpy as np
5
  import cv2
6
+ import os
7
+ import shutil
8
+
9
  import time
10
  # get https://github.com/oschwartz10612/poppler-windows/releases/tag/v22.01.0-0
11
  # poppler-utils:
12
  # Installed: 22.02.0-2ubuntu0.4
13
  # install https://github.com/UB-Mannheim/tesseract/wiki
14
  #page extraction disabled
15
+
16
+ def is_new_pdf_upload(uploaded_file):
17
+ if 'last_pdf_uploaded_file' in st.session_state:
18
+ # Check if the newly uploaded file is different from the last one
19
+ if (uploaded_file.name != st.session_state.last_pdf_uploaded_file['name'] or
20
+ uploaded_file.size != st.session_state.last_pdf_uploaded_file['size']):
21
+ st.session_state.last_pdf_uploaded_file = {'name': uploaded_file.name, 'size': uploaded_file.size}
22
+ # st.write("A new src image file has been uploaded.")
23
+ return True
24
+ else:
25
+ # st.write("The same src image file has been re-uploaded.")
26
+ return False
27
+ else:
28
+ # st.write("This is the first file upload detected.")
29
+ st.session_state.last_pdf_uploaded_file = {'name': uploaded_file.name, 'size': uploaded_file.size}
30
+ return True
31
+ # Store current file details in session state
32
+
33
  big_text = """
34
  <div style='text-align: center;'>
35
  <h1 style='font-size: 30x;'>Locked PDF Ingestion</h1>
 
39
  st.markdown(big_text, unsafe_allow_html=True)
40
 
41
 
42
+
43
  if 'is_initialized' not in st.session_state:
44
  pdf_path = 'uploaded_pdf/data_sheet.pdf'
45
  st.session_state['is_initialized'] = True
46
+ # page_count = utils.get_pdf_page_count(pdf_path)
47
+ # print("page_count=",page_count)
48
+ # page_count=5
49
+ # print("new page_count=",page_count)
50
+ # read_pdf_progress_bar = st.progress(0)
51
+ # st.session_state.color_image_list = []
52
+ # st.session_state.gray_image_np_list = []
53
+ # for page_number in range(page_count):
54
+ # image = pdf2image.convert_from_path(pdf_path, first_page=page_number+1, last_page=page_number+1)
55
+ # st.session_state.color_image_list.append(image[0])
56
+ # progress_percentage = (page_number) / (page_count-1)
57
+ # read_pdf_progress_bar.progress(progress_percentage)
58
+ # gray_pdf_image_np_list = []
59
+ # read_pdf_progress_bar.progress(0)
60
+ # for index, image in enumerate(st.session_state.color_image_list):
61
+ # image_np = np.array(image)
62
+ # st.session_state.gray_image_np_list.append(cv2.cvtColor(np.array(image_np), cv2.COLOR_BGR2GRAY))
63
+ # progress_percentage = (index) / (page_count - 1)
64
+ # read_pdf_progress_bar.progress(progress_percentage)
65
+ # # cv2.line(st.session_state.gray_image_np_list[37], (174, 227), (174, 1790), 0, 2)
66
+ # # cv2.line(st.session_state.gray_image_np_list[37], (1550, 227), (1550, 1790), 0, 2)
67
+ # # cv2.line(st.session_state.gray_image_np_list[38], (226,227),(226,1444), 0,3)
68
+ # # cv2.line(st.session_state.gray_image_np_list[38], (1601,227),(1601,1444), 0,2)
69
+ # st.session_state.img_index = 0
70
+ # st.session_state.stop_button_clicked=False
71
+ # # st.image(st.session_state.gray_image_np_list[38])
72
+
73
+ uploaded_locked_pdf_file = st.file_uploader("Upload a locked pdf",
74
+ type=['pdf'])
75
+ st.markdown(
76
+ f'<a href="https://ikmtechnology.github.io/ikmtechnology/data_sheet.pdf" target="_blank">Sample 1 download and then upload to above</a>',
77
+ unsafe_allow_html=True)
78
+
79
+ if uploaded_locked_pdf_file is not None:
80
+ if is_new_pdf_upload(uploaded_locked_pdf_file):
81
+ # To see details
82
+ # file_details = {"FileName": uploaded_driving_video_file.name, "FileType": uploaded_driving_video_file.type, "FileSize": uploaded_driving_video_file.size}
83
+ # st.write(file_details)
84
+ save_path = './uploaded_videos'
85
+ if not os.path.exists(save_path):
86
+ os.makedirs(save_path)
87
+ with open(os.path.join(save_path, uploaded_locked_pdf_file.name), "wb") as f:
88
+ f.write(uploaded_locked_pdf_file.getbuffer()) # Write the file to the specified location
89
+ st.success(f'Saved file temp_{uploaded_locked_pdf_file.name} in {save_path}')
90
+ st.session_state.uploaded_pdf_path=os.path.join(save_path, uploaded_locked_pdf_file.name)
91
+ st.session_state.page_count = utils.get_pdf_page_count(st.session_state.uploaded_pdf_path)
92
+ print("page_count=",st.session_state.page_count)
93
+
94
+ if 'extracted_text' in st.session_state:
95
+ del st.session_state.extracted_text
96
+ st.rerun()
97
+ if 'page_count' in st.session_state:
98
+ st.write(f"total page count = {st.session_state.page_count}")
99
+ if 'num_pages_to_extract'not in st.session_state:
100
+ st.session_state.num_pages_to_extract = st.slider('Number of pages to extract:', min_value=1, max_value=st.session_state.page_count, value=5, key='num_pages_to_extract_slider')
101
+ else:
102
+ st.session_state.num_pages_to_extract = st.slider('Number of pages to extract:', min_value=1, max_value=st.session_state.page_count, value=st.session_state.num_pages_to_extract , key='num_pages_to_extract_slider')
103
+
104
+ st.write(f"num of pages to extract {st.session_state.num_pages_to_extract}")
105
+ if 'run_button' in st.session_state and st.session_state.run_button == True:
106
+ st.session_state.running = True
107
+ else:
108
+ st.session_state.running = False
109
  read_pdf_progress_bar = st.progress(0)
110
+ if st.button('Extract Pages', disabled=st.session_state.running, key='run_button'):
111
+
112
+ st.session_state.color_image_list = []
113
+ st.session_state.gray_image_np_list = []
114
+ for page_number in range(st.session_state.num_pages_to_extract):
115
+ image = pdf2image.convert_from_path(st.session_state.uploaded_pdf_path, first_page=page_number+1, last_page=page_number+1)
116
+ st.session_state.color_image_list.append(image[0])
117
+ progress_percentage = (page_number) / (st.session_state.num_pages_to_extract-1)
118
+ read_pdf_progress_bar.progress(progress_percentage)
119
+ read_pdf_progress_bar.progress(0)
120
+ for index, image in enumerate(st.session_state.color_image_list):
121
+ image_np = np.array(image)
122
+ st.session_state.gray_image_np_list.append(cv2.cvtColor(np.array(image_np), cv2.COLOR_BGR2GRAY))
123
+ progress_percentage = (index) / (st.session_state.num_pages_to_extract - 1)
124
+ read_pdf_progress_bar.progress(progress_percentage)
125
+ st.session_state.extracted_text = ""
126
+ for index, gray_pdf_image_np in enumerate(st.session_state.gray_image_np_list):
127
+ print("index="+str(index))
128
+
129
+ text=utils.gray_pdf_image_np_to_text(index,gray_pdf_image_np, debug=True)
130
+ st.session_state.extracted_text=st.session_state.extracted_text+f"<Page {index+1} start>\n" + text + f"\n<Page {index+1} end>\n>"
131
+ # st.write(text)
132
+ # print(text)
133
+ progress_percentage = (index) / (st.session_state.num_pages_to_extract - 1)
134
+ read_pdf_progress_bar.progress(progress_percentage)
135
+ #add_animation_to_image()
136
+ #st.session_state['video_generated'] = True
137
+ st.rerun()
138
+ if 'extracted_text' in st.session_state:
139
+ st.write(st.session_state.extracted_text)
140
+ # for index, gray_pdf_image_np in enumerate(st.session_state.gray_image_np_list[0:5], start=0):
141
+ # print("index="+str(index))
142
+ #
143
+ # text=utils.gray_pdf_image_np_to_text(index,gray_pdf_image_np, debug=True)
144
+ # st.write(text)
145
+
146
+
147
+
148
+
149
+
150
+
151
+
152
+
153
+
154
+
155
+
156
+
157
+
158
+
159
+
160
  #if 'img_index' not in st.session_state:
161
 
162
  # if st.button("Stop"):
utils.py CHANGED
@@ -287,11 +287,11 @@ def extract_two_columns_text(image_index,image_np,debug):
287
  if debug:
288
  print("left column image start")
289
  # display(left_column_img)
290
- st.image(Image.fromarray(left_column_array_bgr_image)) # to_be_displayed
291
  print("left column image end")
292
  print("right column image start")
293
  # display(right_column_img)
294
- st.image(Image.fromarray(right_column_array_bgr_image)) # to_be_displayed
295
  print("right column image end")
296
  left_text = pytesseract.image_to_string(left_column_img)
297
  # with open("/content/gdrive/MyDrive/Avatar/demo_pdf_ingestion_steps/page_"+formatted_index_string + "step9_left_column_text.txt", 'w') as file:
@@ -347,7 +347,7 @@ def gray_pdf_image_np_to_text(image_index,gray_pdf_image_np, debug=False):
347
  print(table[0])
348
  # st.write(table[0])#to_be_displayed
349
  # st.image(Image.fromarray(table[1]))#to_be_displayed
350
- st.image(Image.fromarray(cropped_image))#to_be_displayed
351
  found_hor_lines_list = find_hor_lines_in_image_np(1050, 5, cropped_image)
352
  if found_hor_lines_list is not None:
353
  bgr_image = cv2.cvtColor(gray_pdf_image_np, cv2.COLOR_GRAY2BGR)
@@ -371,7 +371,7 @@ def gray_pdf_image_np_to_text(image_index,gray_pdf_image_np, debug=False):
371
  max_height_image = max(min_height_filtered_page_segment_np_list, key=lambda image: image.shape[0])
372
  else:
373
  max_height_image = cropped_image.copy()
374
- st.write("selected segment")
375
  # print("max height image start")
376
  # st.image(Image.fromarray(max_height_image))#to_be_displayed
377
  # print("max height image end")
@@ -403,18 +403,18 @@ def gray_pdf_image_np_to_text(image_index,gray_pdf_image_np, debug=False):
403
  color_tuple=(0, 255, 0)
404
  cv2.rectangle(bgr_image, (0, y_start), (max_height_image.shape[1], max_height_image.shape[0]), color_tuple, thickness=5)
405
  print("still in the middle start")
406
- st.image(Image.fromarray(bgr_image))
407
  print("still in the middle end")
408
  left_over_content =max_height_image[y_start:max_height_image.shape[0], 0:max_height_image.shape[1]]
409
  if debug:
410
  print("left over start")
411
- st.image(Image.fromarray(left_over_content))
412
  print("left over end")
413
  max_height_image_copy=max_height_image.copy()
414
  cv2.rectangle(max_height_image_copy, (0, y_start), (max_height_image.shape[1], max_height_image.shape[0]), (255, 255, 255), cv2.FILLED)
415
  if debug:
416
  print("no left over start")
417
- st.image(Image.fromarray(max_height_image_copy))
418
  print("no left over end")
419
  text=extract_two_columns_text(max_height_image_copy,debug)
420
  if text == "error":
 
287
  if debug:
288
  print("left column image start")
289
  # display(left_column_img)
290
+ # st.image(Image.fromarray(left_column_array_bgr_image)) # to_be_displayed
291
  print("left column image end")
292
  print("right column image start")
293
  # display(right_column_img)
294
+ # st.image(Image.fromarray(right_column_array_bgr_image)) # to_be_displayed
295
  print("right column image end")
296
  left_text = pytesseract.image_to_string(left_column_img)
297
  # with open("/content/gdrive/MyDrive/Avatar/demo_pdf_ingestion_steps/page_"+formatted_index_string + "step9_left_column_text.txt", 'w') as file:
 
347
  print(table[0])
348
  # st.write(table[0])#to_be_displayed
349
  # st.image(Image.fromarray(table[1]))#to_be_displayed
350
+ # st.image(Image.fromarray(cropped_image))#to_be_displayed
351
  found_hor_lines_list = find_hor_lines_in_image_np(1050, 5, cropped_image)
352
  if found_hor_lines_list is not None:
353
  bgr_image = cv2.cvtColor(gray_pdf_image_np, cv2.COLOR_GRAY2BGR)
 
371
  max_height_image = max(min_height_filtered_page_segment_np_list, key=lambda image: image.shape[0])
372
  else:
373
  max_height_image = cropped_image.copy()
374
+ # st.write("selected segment")
375
  # print("max height image start")
376
  # st.image(Image.fromarray(max_height_image))#to_be_displayed
377
  # print("max height image end")
 
403
  color_tuple=(0, 255, 0)
404
  cv2.rectangle(bgr_image, (0, y_start), (max_height_image.shape[1], max_height_image.shape[0]), color_tuple, thickness=5)
405
  print("still in the middle start")
406
+ # st.image(Image.fromarray(bgr_image))
407
  print("still in the middle end")
408
  left_over_content =max_height_image[y_start:max_height_image.shape[0], 0:max_height_image.shape[1]]
409
  if debug:
410
  print("left over start")
411
+ # st.image(Image.fromarray(left_over_content))
412
  print("left over end")
413
  max_height_image_copy=max_height_image.copy()
414
  cv2.rectangle(max_height_image_copy, (0, y_start), (max_height_image.shape[1], max_height_image.shape[0]), (255, 255, 255), cv2.FILLED)
415
  if debug:
416
  print("no left over start")
417
+ # st.image(Image.fromarray(max_height_image_copy))
418
  print("no left over end")
419
  text=extract_two_columns_text(max_height_image_copy,debug)
420
  if text == "error":