zmbfeng committed on
Commit
ad5ca2c
1 Parent(s): 27aefdb

slide show implemented

Browse files
Files changed (3) hide show
  1. app.py +61 -9
  2. requirements.txt +1 -0
  3. utils.py +13 -0
app.py CHANGED
@@ -1,11 +1,12 @@
1
  import streamlit as st
2
- from pdf2image import convert_from_path
3
-
4
-
 
5
  # get https://github.com/oschwartz10612/poppler-windows/releases/tag/v22.01.0-0
6
  # poppler-utils:
7
  # Installed: 22.02.0-2ubuntu0.4
8
-
9
  big_text = """
10
  <div style='text-align: center;'>
11
  <h1 style='font-size: 30x;'>Locked PDF Ingestion</h1>
@@ -13,11 +14,62 @@ big_text = """
13
  """
14
  # Display the styled text
15
  st.markdown(big_text, unsafe_allow_html=True)
16
- pdf_path = 'uploaded_pdf/data_sheet.pdf'
17
- print("start")
18
- images = convert_from_path(pdf_path, first_page=1, last_page=2)
19
- print("done")
20
- #
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
21
  # total_pages = 100
22
  # print(f"total_pages = {total_pages}")
23
  # st.write(f"total_pages = {total_pages}")
 
1
  import streamlit as st
2
+ import pdf2image
3
+ import utils
4
+ import numpy as np
5
+ import cv2
6
  # get https://github.com/oschwartz10612/poppler-windows/releases/tag/v22.01.0-0
7
  # poppler-utils:
8
  # Installed: 22.02.0-2ubuntu0.4
9
+ #page extraction disabled
10
  big_text = """
11
  <div style='text-align: center;'>
12
  <h1 style='font-size: 30x;'>Locked PDF Ingestion</h1>
 
14
  """
15
  # Display the styled text
16
  st.markdown(big_text, unsafe_allow_html=True)
17
+
18
+
19
# ---------------------------------------------------------------------------
# PDF slide show.
#
# On the first run of a browser session the PDF pages are rendered to images
# and cached in st.session_state; every later rerun only displays the page
# selected by st.session_state.img_index and wires up Previous/Next buttons.
# ---------------------------------------------------------------------------
pdf_path = 'uploaded_pdf/data_sheet.pdf'
# Upper bound on the number of pages rendered into memory (the original
# script hard-coded 50 pages regardless of the real page count).
MAX_PAGES = 50


def _step_slide(offset):
    """Move the current slide by ``offset`` pages, wrapping at both ends."""
    slide_count = len(st.session_state.color_image_list)
    st.session_state.img_index = (st.session_state.img_index + offset) % slide_count


def _show_previous():
    """Button callback: go to the previous page (wraps to the last page)."""
    print("Previous pressed")
    _step_slide(-1)


def _show_next():
    """Button callback: go to the next page (wraps to the first page)."""
    print("Next pressed")
    _step_slide(1)


if 'is_initialized' not in st.session_state:
    page_count = utils.get_pdf_page_count(pdf_path)
    print("page_count=", page_count)
    if not page_count:
        # get_pdf_page_count returns None when pdfinfo fails; nothing to show.
        st.error("Could not determine the page count of " + pdf_path)
        st.stop()
    # Never ask for more pages than the document has: convert_from_path
    # returns an empty list past the last page and indexing [0] would fail.
    page_count = min(page_count, MAX_PAGES)
    print("new page_count=", page_count)

    read_pdf_progress_bar = st.progress(0)
    color_image_list = []
    gray_image_np_list = []
    for page_number in range(1, page_count + 1):
        # Render one page at a time so the progress bar can advance per page.
        pages = pdf2image.convert_from_path(
            pdf_path, first_page=page_number, last_page=page_number
        )
        if not pages:
            break
        color_image = pages[0]
        color_image_list.append(color_image)
        # pdf2image yields PIL images, whose channel order is RGB (not
        # OpenCV's BGR), so the RGB->gray conversion is the correct one.
        gray_image_np_list.append(
            cv2.cvtColor(np.array(color_image), cv2.COLOR_RGB2GRAY)
        )
        # (n / total) reaches exactly 1.0 on the last page and cannot divide
        # by zero for a single-page document.
        read_pdf_progress_bar.progress(page_number / page_count)

    if not color_image_list:
        st.error("No pages could be rendered from " + pdf_path)
        st.stop()

    st.session_state.color_image_list = color_image_list
    st.session_state.gray_image_np_list = gray_image_np_list
    st.session_state.img_index = 0
    # Mark the session initialized only after loading succeeded, so a failed
    # load is retried on the next rerun instead of leaving partial state.
    st.session_state['is_initialized'] = True

st.write(str(st.session_state.img_index + 1) + "/" + str(len(st.session_state.color_image_list)))
st.image(st.session_state.gray_image_np_list[st.session_state.img_index], use_column_width=True)

col1, col2 = st.columns(2)
# The index is updated in on_click callbacks because Streamlit runs callbacks
# before re-executing the script; updating it after st.image (as an
# `if st.button(...)` body would) leaves the displayed page one click behind.
with col1:
    st.button("Previous", on_click=_show_previous)
with col2:
    st.button("Next", on_click=_show_next)
72
+ # #
73
  # total_pages = 100
74
  # print(f"total_pages = {total_pages}")
75
  # st.write(f"total_pages = {total_pages}")
requirements.txt CHANGED
@@ -4,3 +4,4 @@ opencv-python
4
  pytesseract
5
  pdf2image
6
  Pillow
 
 
4
  pytesseract
5
  pdf2image
6
  Pillow
7
+ numpy
utils.py ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import subprocess
2
+
3
def get_pdf_page_count(pdf_path):
    """Return the page count of a PDF as reported by the ``pdfinfo`` tool.

    Runs ``pdfinfo <pdf_path>`` and parses the ``Pages`` field of its output.

    Args:
        pdf_path: Path of the PDF file, passed to ``pdfinfo`` as its argument.

    Returns:
        The number of pages as an ``int``, or ``None`` when ``pdfinfo`` cannot
        be run, exits with an error, or prints no parsable ``Pages`` field.
    """
    try:
        # Argument list (no shell) so the path is never shell-interpreted.
        result = subprocess.run(
            ['pdfinfo', pdf_path],
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
        )
    except (OSError, ValueError, TypeError) as e:
        # OSError: pdfinfo missing / not executable; ValueError/TypeError:
        # unusable path argument or undecodable output.
        print(f"An error occurred: {e}")
        return None

    if result.returncode != 0:
        print(f"An error occurred: {result.stderr.strip()}")
        return None

    for line in result.stdout.splitlines():
        # Match on the field name itself: a substring test for 'Pages:' would
        # also hit other metadata lines whose *value* contains that text.
        field, _, value = line.partition(':')
        if field.strip() == 'Pages':
            try:
                return int(value.strip())
            except ValueError as e:
                print(f"An error occurred: {e}")
                return None
    return None