Spaces:

zmbfeng
/

locked_pdf_ingestion

Sleeping

App Files Files Community

zmbfeng committed on May 24

Commit

ad5ca2c

•

1 Parent(s): 27aefdb

slide show implemented

Browse files

Files changed (3) hide show

app.py +61 -9
requirements.txt +1 -0
utils.py +13 -0

app.py CHANGED Viewed

@@ -1,11 +1,12 @@
 import streamlit as st
-from pdf2image import convert_from_path
 # get https://github.com/oschwartz10612/poppler-windows/releases/tag/v22.01.0-0
 # poppler-utils:
 #   Installed: 22.02.0-2ubuntu0.4
 big_text = """
     <div style='text-align: center;'>
         <h1 style='font-size: 30x;'>Locked PDF Ingestion</h1>
@@ -13,11 +14,62 @@ big_text = """
     """
     # Display the styled text
 st.markdown(big_text, unsafe_allow_html=True)
-pdf_path = 'uploaded_pdf/data_sheet.pdf'
-print("start")
-images = convert_from_path(pdf_path, first_page=1, last_page=2)
-print("done")
-#
 # total_pages = 100
 # print(f"total_pages = {total_pages}")
 # st.write(f"total_pages = {total_pages}")

 import streamlit as st
+import pdf2image
+import utils
+import numpy as np
+import cv2
 # get https://github.com/oschwartz10612/poppler-windows/releases/tag/v22.01.0-0
 # poppler-utils:
 #   Installed: 22.02.0-2ubuntu0.4
+#page extraction disabled
 big_text = """
     <div style='text-align: center;'>
         <h1 style='font-size: 30x;'>Locked PDF Ingestion</h1>
     """
     # Display the styled text
 st.markdown(big_text, unsafe_allow_html=True)
+if 'is_initialized' not in st.session_state:
+    pdf_path = 'uploaded_pdf/data_sheet.pdf'
+    st.session_state['is_initialized'] = True
+    page_count = utils.get_pdf_page_count(pdf_path)
+    print("page_count=",page_count)
+    page_count=50
+    print("new page_count=",page_count)
+    read_pdf_progress_bar = st.progress(0)
+    st.session_state.color_image_list = []
+    st.session_state.gray_image_np_list = []
+    for page_number in range(page_count):
+         image = pdf2image.convert_from_path(pdf_path, first_page=page_number+1, last_page=page_number+1)
+         st.session_state.color_image_list.append(image[0])
+         progress_percentage = (page_number) / (page_count-1)
+         read_pdf_progress_bar.progress(progress_percentage)
+    gray_pdf_image_np_list = []
+    read_pdf_progress_bar.progress(0)
+    for index, image in enumerate(st.session_state.color_image_list):
+        image_np = np.array(image)
+        st.session_state.gray_image_np_list.append(cv2.cvtColor(np.array(image_np), cv2.COLOR_BGR2GRAY))
+        progress_percentage = (index) / (page_count - 1)
+        read_pdf_progress_bar.progress(progress_percentage)
+    st.session_state.img_index = 0
+#if 'img_index' not in st.session_state:
+st.write(str(st.session_state.img_index+1) +"/" + str(len(st.session_state.color_image_list)))
+st.image(st.session_state.gray_image_np_list[st.session_state.img_index], use_column_width=True)
+col1, col2 = st.columns(2)
+with col1:
+    if st.button("Previous"):
+        print("Previous pressed")
+        # Decrease index, wrap around if it goes below 0
+        print("st.session_state.img_index =", str(st.session_state.img_index))
+        if st.session_state.img_index > 0:
+            print("case 1 before st.session_state.img_index =",str(st.session_state.img_index))
+            st.session_state.img_index -= 1
+            print("case 2 after  st.session_state.img_index =", str(st.session_state.img_index))
+        else:
+            print("case 2 st.session_state.img_index =", str(st.session_state.img_index))
+            st.session_state.img_index = len(st.session_state.color_image_list) - 1
+with col2:
+    if st.button("Next"):
+        print("Next pressed")
+        # Increase index, wrap around if it goes past the last image
+        if st.session_state.img_index < len(st.session_state.color_image_list) - 1:
+            st.session_state.img_index += 1
+        else:
+            st.session_state.img_index = 0
+# #
 # total_pages = 100
 # print(f"total_pages = {total_pages}")
 # st.write(f"total_pages = {total_pages}")

requirements.txt CHANGED Viewed

@@ -4,3 +4,4 @@ opencv-python
 pytesseract
 pdf2image
 Pillow

 pytesseract
 pdf2image
 Pillow
+numpy

utils.py ADDED Viewed

	@@ -0,0 +1,13 @@

+import subprocess
+def get_pdf_page_count(pdf_path):
+    try:
+        # Running pdfinfo command to get information about the PDF
+        result = subprocess.run(['pdfinfo', pdf_path], stdout=subprocess.PIPE, text=True)
+        # Parsing the output to find the line with the number of pages
+        for line in result.stdout.split('\n'):
+            if 'Pages:' in line:
+                return int(line.split(':')[1].strip())
+    except Exception as e:
+        print(f"An error occurred: {e}")
+        return None