Spaces:

zmbfeng
/

locked_pdf_ingestion

Sleeping

App Files Community

zmbfeng committed on May 26, 2024

Commit

bfc7f18

1 Parent(s): 3411406

upload to display text working

Browse files

Files changed (2) hide show

app.py +135 -31
utils.py +7 -7

app.py CHANGED Viewed

@@ -3,12 +3,33 @@ import pdf2image
 import utils
 import numpy as np
 import cv2
 import time
 # Windows: download Poppler from https://github.com/oschwartz10612/poppler-windows/releases/tag/v22.01.0-0
 # Linux (Ubuntu): poppler-utils package
 #   Installed: 22.02.0-2ubuntu0.4
 # Install Tesseract OCR: https://github.com/UB-Mannheim/tesseract/wiki
 #page extraction disabled
 big_text = """
     <div style='text-align: center;'>
         <h1 style='font-size: 30x;'>Locked PDF Ingestion</h1>
@@ -18,41 +39,124 @@ big_text = """
 st.markdown(big_text, unsafe_allow_html=True)
 if 'is_initialized' not in st.session_state:
     pdf_path = 'uploaded_pdf/data_sheet.pdf'
     st.session_state['is_initialized'] = True
-    page_count = utils.get_pdf_page_count(pdf_path)
-    print("page_count=",page_count)
-    page_count=5
-    print("new page_count=",page_count)
     read_pdf_progress_bar = st.progress(0)
-    st.session_state.color_image_list = []
-    st.session_state.gray_image_np_list = []
-    for page_number in range(page_count):
-         image = pdf2image.convert_from_path(pdf_path, first_page=page_number+1, last_page=page_number+1)
-         st.session_state.color_image_list.append(image[0])
-         progress_percentage = (page_number) / (page_count-1)
-         read_pdf_progress_bar.progress(progress_percentage)
-    gray_pdf_image_np_list = []
-    read_pdf_progress_bar.progress(0)
-    for index, image in enumerate(st.session_state.color_image_list):
-        image_np = np.array(image)
-        st.session_state.gray_image_np_list.append(cv2.cvtColor(np.array(image_np), cv2.COLOR_BGR2GRAY))
-        progress_percentage = (index) / (page_count - 1)
-        read_pdf_progress_bar.progress(progress_percentage)
-    # cv2.line(st.session_state.gray_image_np_list[37], (174, 227), (174, 1790), 0, 2)
-    # cv2.line(st.session_state.gray_image_np_list[37], (1550, 227), (1550, 1790), 0, 2)
-    # cv2.line(st.session_state.gray_image_np_list[38], (226,227),(226,1444), 0,3)
-    # cv2.line(st.session_state.gray_image_np_list[38], (1601,227),(1601,1444), 0,2)
-    st.session_state.img_index = 0
-    st.session_state.stop_button_clicked=False
-# st.image(st.session_state.gray_image_np_list[38])
-for index, gray_pdf_image_np in enumerate(st.session_state.gray_image_np_list[0:5], start=0):
-  print("index="+str(index))
-  text=utils.gray_pdf_image_np_to_text(index,gray_pdf_image_np, debug=True)
-  st.write(text)
 #if 'img_index' not in st.session_state:
 # if st.button("Stop"):

 import utils
 import numpy as np
 import cv2
+import os
+import shutil
 import time
 # Windows: download Poppler from https://github.com/oschwartz10612/poppler-windows/releases/tag/v22.01.0-0
 # Linux (Ubuntu): poppler-utils package
 #   Installed: 22.02.0-2ubuntu0.4
 # Install Tesseract OCR: https://github.com/UB-Mannheim/tesseract/wiki
 #page extraction disabled
+def is_new_pdf_upload(uploaded_file):
+    """Report whether `uploaded_file` differs from the last upload seen in this session.
+
+    The file's name and size are compared with the record kept in
+    st.session_state.last_pdf_uploaded_file. On the first upload, or when
+    either value differs, the record is replaced and True is returned.
+    When both match, the record is left untouched and False is returned.
+    """
+    current = {'name': uploaded_file.name, 'size': uploaded_file.size}
+    if 'last_pdf_uploaded_file' in st.session_state:
+        previous = st.session_state.last_pdf_uploaded_file
+        # Same name and same size: treat it as a re-upload of the same file.
+        if previous['name'] == current['name'] and previous['size'] == current['size']:
+            return False
+    # First upload, or a different file: remember it for the next comparison.
+    st.session_state.last_pdf_uploaded_file = current
+    return True
 big_text = """
     <div style='text-align: center;'>
         <h1 style='font-size: 30x;'>Locked PDF Ingestion</h1>
 st.markdown(big_text, unsafe_allow_html=True)
 if 'is_initialized' not in st.session_state:  # runs only while the session has no 'is_initialized' entry yet
     pdf_path = 'uploaded_pdf/data_sheet.pdf'  # NOTE(review): in this view only the commented-out code below reads pdf_path — confirm it is still needed
     st.session_state['is_initialized'] = True  # set the flag so this branch is skipped on later reruns
+#     page_count = utils.get_pdf_page_count(pdf_path)
+#     print("page_count=",page_count)
+#     page_count=5
+#     print("new page_count=",page_count)
+#     read_pdf_progress_bar = st.progress(0)
+#     st.session_state.color_image_list = []
+#     st.session_state.gray_image_np_list = []
+#     for page_number in range(page_count):
+#          image = pdf2image.convert_from_path(pdf_path, first_page=page_number+1, last_page=page_number+1)
+#          st.session_state.color_image_list.append(image[0])
+#          progress_percentage = (page_number) / (page_count-1)
+#          read_pdf_progress_bar.progress(progress_percentage)
+#     gray_pdf_image_np_list = []
+#     read_pdf_progress_bar.progress(0)
+#     for index, image in enumerate(st.session_state.color_image_list):
+#         image_np = np.array(image)
+#         st.session_state.gray_image_np_list.append(cv2.cvtColor(np.array(image_np), cv2.COLOR_BGR2GRAY))
+#         progress_percentage = (index) / (page_count - 1)
+#         read_pdf_progress_bar.progress(progress_percentage)
+#     # cv2.line(st.session_state.gray_image_np_list[37], (174, 227), (174, 1790), 0, 2)
+#     # cv2.line(st.session_state.gray_image_np_list[37], (1550, 227), (1550, 1790), 0, 2)
+#     # cv2.line(st.session_state.gray_image_np_list[38], (226,227),(226,1444), 0,3)
+#     # cv2.line(st.session_state.gray_image_np_list[38], (1601,227),(1601,1444), 0,2)
+#     st.session_state.img_index = 0
+#     st.session_state.stop_button_clicked=False
+# # st.image(st.session_state.gray_image_np_list[38])
+# --- Upload: save a newly uploaded PDF to disk and record its page count ---
+uploaded_locked_pdf_file = st.file_uploader("Upload a locked pdf",
+                                           type=['pdf'])
+st.markdown(
+    '<a href="https://ikmtechnology.github.io/ikmtechnology/data_sheet.pdf" target="_blank">Sample 1 download and then upload to above</a>',
+    unsafe_allow_html=True)
+if uploaded_locked_pdf_file is not None:
+    if is_new_pdf_upload(uploaded_locked_pdf_file):
+        # NOTE(review): the directory name looks like a leftover from a video app;
+        # it is kept unchanged so the on-disk location stays the same.
+        save_path = './uploaded_videos'
+        os.makedirs(save_path, exist_ok=True)
+        # The file name comes from the client: keep only its last path
+        # component so the write cannot land outside save_path.
+        safe_name = os.path.basename(uploaded_locked_pdf_file.name)
+        uploaded_pdf_path = os.path.join(save_path, safe_name)
+        with open(uploaded_pdf_path, "wb") as f:
+            f.write(uploaded_locked_pdf_file.getbuffer())  # Write the file to the specified location
+        # The file is read back only after the `with` block has closed (and
+        # therefore flushed) it, so the page count sees the complete PDF.
+        st.success(f'Saved file {safe_name} in {save_path}')
+        st.session_state.uploaded_pdf_path = uploaded_pdf_path
+        st.session_state.page_count = utils.get_pdf_page_count(uploaded_pdf_path)
+        print("page_count=", st.session_state.page_count)
+        # Text extracted from a previously uploaded PDF no longer applies.
+        if 'extracted_text' in st.session_state:
+            del st.session_state.extracted_text
+        st.rerun()
+# --- Extraction: choose how many pages to process, then convert them to text ---
+if 'page_count' in st.session_state:
+    page_count = st.session_state.page_count
+    st.write(f"total page count = {page_count}")
+    # Start from 5 pages on the first run, otherwise from the previous choice,
+    # and clamp so the default never exceeds the number of pages in this PDF.
+    previous_choice = st.session_state.num_pages_to_extract if 'num_pages_to_extract' in st.session_state else 5
+    default_pages = min(previous_choice, page_count)
+    st.session_state.num_pages_to_extract = st.slider('Number of pages to extract:', min_value=1, max_value=page_count, value=default_pages, key='num_pages_to_extract_slider')
+    num_pages = st.session_state.num_pages_to_extract
+    st.write(f"num of pages to extract {num_pages}")
+    # 'run_button' is the key given to the button below; mirror its stored
+    # state into `running`, which is used to disable the button.
+    st.session_state.running = 'run_button' in st.session_state and bool(st.session_state.run_button)
     read_pdf_progress_bar = st.progress(0)
+    if st.button('Extract Pages', disabled=st.session_state.running, key='run_button'):
+        st.session_state.color_image_list = []
+        st.session_state.gray_image_np_list = []
+        # Pass 1: render each selected page to an image, one page per call.
+        for page_number in range(num_pages):
+            image = pdf2image.convert_from_path(st.session_state.uploaded_pdf_path, first_page=page_number+1, last_page=page_number+1)
+            st.session_state.color_image_list.append(image[0])
+            # Completed pages over total pages: reaches 1.0 on the last page
+            # and cannot divide by zero when a single page is selected.
+            read_pdf_progress_bar.progress((page_number + 1) / num_pages)
+        # Pass 2: convert every rendered page to a grayscale numpy array.
+        read_pdf_progress_bar.progress(0)
+        for index, image in enumerate(st.session_state.color_image_list):
+            # NOTE(review): the array comes straight from pdf2image's image object;
+            # confirm whether COLOR_RGB2GRAY was intended instead of COLOR_BGR2GRAY.
+            st.session_state.gray_image_np_list.append(cv2.cvtColor(np.array(image), cv2.COLOR_BGR2GRAY))
+            read_pdf_progress_bar.progress((index + 1) / num_pages)
+        # Pass 3: extract text page by page, wrapping each page in start/end markers.
+        page_texts = []
+        for index, gray_pdf_image_np in enumerate(st.session_state.gray_image_np_list):
+            print("index="+str(index))
+            text = utils.gray_pdf_image_np_to_text(index, gray_pdf_image_np, debug=True)
+            page_texts.append(f"<Page {index+1} start>\n" + text + f"\n<Page {index+1} end>\n")
+            read_pdf_progress_bar.progress((index + 1) / num_pages)
+        st.session_state.extracted_text = "".join(page_texts)
+        # Rerun so the script restarts with the new text stored in session state.
+        st.rerun()
+    if 'extracted_text' in st.session_state:
+        st.write(st.session_state.extracted_text)
+# for index, gray_pdf_image_np in enumerate(st.session_state.gray_image_np_list[0:5], start=0):
+#   print("index="+str(index))
+#
+#   text=utils.gray_pdf_image_np_to_text(index,gray_pdf_image_np, debug=True)
+#   st.write(text)
 #if 'img_index' not in st.session_state:
 # if st.button("Stop"):

utils.py CHANGED Viewed

@@ -287,11 +287,11 @@ def extract_two_columns_text(image_index,image_np,debug):
       if debug:
           print("left column image start")
           # display(left_column_img)
-          st.image(Image.fromarray(left_column_array_bgr_image))  # to_be_displayed
           print("left column image end")
           print("right column image start")
           # display(right_column_img)
-          st.image(Image.fromarray(right_column_array_bgr_image))  # to_be_displayed
           print("right column image end")
       left_text = pytesseract.image_to_string(left_column_img)
       # with open("/content/gdrive/MyDrive/Avatar/demo_pdf_ingestion_steps/page_"+formatted_index_string + "step9_left_column_text.txt", 'w') as file:
@@ -347,7 +347,7 @@ def gray_pdf_image_np_to_text(image_index,gray_pdf_image_np, debug=False):
             print(table[0])
             # st.write(table[0])#to_be_displayed
             # st.image(Image.fromarray(table[1]))#to_be_displayed
-        st.image(Image.fromarray(cropped_image))#to_be_displayed
     found_hor_lines_list = find_hor_lines_in_image_np(1050, 5, cropped_image)
     if found_hor_lines_list is not None:
         bgr_image = cv2.cvtColor(gray_pdf_image_np, cv2.COLOR_GRAY2BGR)
@@ -371,7 +371,7 @@ def gray_pdf_image_np_to_text(image_index,gray_pdf_image_np, debug=False):
         max_height_image = max(min_height_filtered_page_segment_np_list, key=lambda image: image.shape[0])
     else:
         max_height_image = cropped_image.copy()
-    st.write("selected segment")
     # print("max height image start")
     # st.image(Image.fromarray(max_height_image))#to_be_displayed
     # print("max height image end")
@@ -403,18 +403,18 @@ def gray_pdf_image_np_to_text(image_index,gray_pdf_image_np, debug=False):
             color_tuple=(0, 255, 0)
             cv2.rectangle(bgr_image, (0, y_start), (max_height_image.shape[1], max_height_image.shape[0]), color_tuple, thickness=5)
             print("still in the middle start")
-            st.image(Image.fromarray(bgr_image))
             print("still in the middle end")
           left_over_content =max_height_image[y_start:max_height_image.shape[0], 0:max_height_image.shape[1]]
           if debug:
             print("left over start")
-            st.image(Image.fromarray(left_over_content))
             print("left over end")
           max_height_image_copy=max_height_image.copy()
           cv2.rectangle(max_height_image_copy, (0, y_start), (max_height_image.shape[1], max_height_image.shape[0]), (255, 255, 255), cv2.FILLED)
           if debug:
             print("no left over start")
-            st.image(Image.fromarray(max_height_image_copy))
             print("no left over end")
           text=extract_two_columns_text(max_height_image_copy,debug)
           if text == "error":

       if debug:
           print("left column image start")
           # display(left_column_img)
+          # st.image(Image.fromarray(left_column_array_bgr_image))  # to_be_displayed
           print("left column image end")
           print("right column image start")
           # display(right_column_img)
+          # st.image(Image.fromarray(right_column_array_bgr_image))  # to_be_displayed
           print("right column image end")
       left_text = pytesseract.image_to_string(left_column_img)
       # with open("/content/gdrive/MyDrive/Avatar/demo_pdf_ingestion_steps/page_"+formatted_index_string + "step9_left_column_text.txt", 'w') as file:
             print(table[0])
             # st.write(table[0])#to_be_displayed
             # st.image(Image.fromarray(table[1]))#to_be_displayed
+        # st.image(Image.fromarray(cropped_image))#to_be_displayed
     found_hor_lines_list = find_hor_lines_in_image_np(1050, 5, cropped_image)
     if found_hor_lines_list is not None:
         bgr_image = cv2.cvtColor(gray_pdf_image_np, cv2.COLOR_GRAY2BGR)
         max_height_image = max(min_height_filtered_page_segment_np_list, key=lambda image: image.shape[0])
     else:
         max_height_image = cropped_image.copy()
+    # st.write("selected segment")
     # print("max height image start")
     # st.image(Image.fromarray(max_height_image))#to_be_displayed
     # print("max height image end")
             color_tuple=(0, 255, 0)
             cv2.rectangle(bgr_image, (0, y_start), (max_height_image.shape[1], max_height_image.shape[0]), color_tuple, thickness=5)
             print("still in the middle start")
+            # st.image(Image.fromarray(bgr_image))
             print("still in the middle end")
           left_over_content =max_height_image[y_start:max_height_image.shape[0], 0:max_height_image.shape[1]]
           if debug:
             print("left over start")
+            # st.image(Image.fromarray(left_over_content))
             print("left over end")
           max_height_image_copy=max_height_image.copy()
           cv2.rectangle(max_height_image_copy, (0, y_start), (max_height_image.shape[1], max_height_image.shape[0]), (255, 255, 255), cv2.FILLED)
           if debug:
             print("no left over start")
+            # st.image(Image.fromarray(max_height_image_copy))
             print("no left over end")
           text=extract_two_columns_text(max_height_image_copy,debug)
           if text == "error":