Add the ability to load a PDF file
Browse files- app.py +47 -22
- requirements.txt +2 -1
app.py
CHANGED
@@ -1,7 +1,8 @@
|
|
1 |
import streamlit as st
|
2 |
import os
|
3 |
import json
|
4 |
-
|
|
|
5 |
from transformers import GPT2Tokenizer, GPT2LMHeadModel, AutoModelForSequenceClassification, BertTokenizer, BertModel,T5Tokenizer, T5ForConditionalGeneration,AutoTokenizer, AutoModelForSeq2SeqLM
|
6 |
|
7 |
import torch
|
@@ -116,8 +117,8 @@ big_text = """
|
|
116 |
# Display the styled text
|
117 |
st.markdown(big_text, unsafe_allow_html=True)
|
118 |
|
119 |
-
|
120 |
-
|
121 |
st.markdown(
|
122 |
f'<a href="https://ikmtechnology.github.io/ikmtechnology/untethered_extracted_paragraphs.json" target="_blank">Sample 1 download and then upload to above</a>',
|
123 |
unsafe_allow_html=True)
|
@@ -126,8 +127,8 @@ st.markdown(
|
|
126 |
f'<a href="https://ikmtechnology.github.io/ikmtechnology/the_business_case_for_ai_extracted_paragraphs.json" target="_blank">Sample 2 download and then upload to above</a>',
|
127 |
unsafe_allow_html=True)
|
128 |
st.markdown("sample queries for above file: <br/> what does nontechnical managers worry about? what if you put all the knowledge, frameworks, and tips from this book to full use? tell me about AI agent",unsafe_allow_html=True)
|
129 |
-
if
|
130 |
-
if is_new_file_upload(
|
131 |
print("is new file uploaded")
|
132 |
if 'prev_query' in st.session_state:
|
133 |
del st.session_state['prev_query']
|
@@ -136,25 +137,49 @@ if uploaded_json_file is not None:
|
|
136 |
save_path = './uploaded_files'
|
137 |
if not os.path.exists(save_path):
|
138 |
os.makedirs(save_path)
|
139 |
-
with open(os.path.join(save_path,
|
140 |
-
f.write(
|
141 |
-
st.success(f'Saved file temp_{
|
142 |
-
st.session_state.uploaded_path=os.path.join(save_path,
|
143 |
# st.session_state.page_count = utils.get_pdf_page_count(st.session_state.uploaded_pdf_path)
|
144 |
# print("page_count=",st.session_state.page_count)
|
145 |
-
|
146 |
-
|
147 |
-
|
148 |
-
|
149 |
-
|
150 |
-
|
151 |
-
|
152 |
-
|
153 |
-
|
154 |
-
|
155 |
-
|
156 |
-
|
157 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
158 |
st.rerun()
|
159 |
|
160 |
if 'paragraph_sentence_encodings' in st.session_state:
|
|
|
1 |
import streamlit as st
|
2 |
import os
|
3 |
import json
|
4 |
+
import fitz
|
5 |
+
import re
|
6 |
from transformers import GPT2Tokenizer, GPT2LMHeadModel, AutoModelForSequenceClassification, BertTokenizer, BertModel,T5Tokenizer, T5ForConditionalGeneration,AutoTokenizer, AutoModelForSeq2SeqLM
|
7 |
|
8 |
import torch
|
|
|
117 |
# Display the styled text
|
118 |
st.markdown(big_text, unsafe_allow_html=True)
|
119 |
|
120 |
+
uploaded_pdf_file = st.file_uploader("Upload a PDF file",
|
121 |
+
type=['pdf'])
|
122 |
st.markdown(
|
123 |
f'<a href="https://ikmtechnology.github.io/ikmtechnology/untethered_extracted_paragraphs.json" target="_blank">Sample 1 download and then upload to above</a>',
|
124 |
unsafe_allow_html=True)
|
|
|
127 |
f'<a href="https://ikmtechnology.github.io/ikmtechnology/the_business_case_for_ai_extracted_paragraphs.json" target="_blank">Sample 2 download and then upload to above</a>',
|
128 |
unsafe_allow_html=True)
|
129 |
st.markdown("sample queries for above file: <br/> what does nontechnical managers worry about? what if you put all the knowledge, frameworks, and tips from this book to full use? tell me about AI agent",unsafe_allow_html=True)
|
130 |
+
if uploaded_pdf_file is not None:
|
131 |
+
if is_new_file_upload(uploaded_pdf_file):
|
132 |
print("is new file uploaded")
|
133 |
if 'prev_query' in st.session_state:
|
134 |
del st.session_state['prev_query']
|
|
|
137 |
# Persist the uploaded PDF under ./uploaded_files and remember where it was
# written in st.session_state.uploaded_path for the extraction step.
save_path = './uploaded_files'
# exist_ok avoids the check-then-create race of os.path.exists + makedirs.
os.makedirs(save_path, exist_ok=True)
# The file name is supplied by the client; basename() drops any directory
# components so the write cannot land outside save_path.
pdf_name = os.path.basename(uploaded_pdf_file.name)
uploaded_path = os.path.join(save_path, pdf_name)
with open(uploaded_path, "wb") as f:
    f.write(uploaded_pdf_file.getbuffer())  # Write the file to the specified location
# Report the name actually used on disk (the file is not saved with a
# "temp_" prefix, so the message must not claim one).
st.success(f'Saved file {pdf_name} in {save_path}')
st.session_state.uploaded_path = uploaded_path
|
144 |
# st.session_state.page_count = utils.get_pdf_page_count(st.session_state.uploaded_pdf_path)
|
145 |
# print("page_count=",st.session_state.page_count)
|
146 |
+
# Extract paragraph records from the uploaded PDF into
# st.session_state.restored_paragraphs. Each record is a dict with the keys:
#   "paragraph"    - one paragraph of text
#   "containsList" - True when the enclosing block looks like it holds a list item
#   "page_num"     - 0-based index of the page the block came from
#   "text"         - the whole cleaned block when it was split on blank lines,
#                    otherwise None
# The record count is stored in st.session_state.list_count.

# Patterns are compiled once here instead of once per block.
list_pattern = re.compile(r'^\s*((?:\d+\.|[a-zA-Z]\.|[*-])\s+.+)', re.MULTILINE)
blank_gap_pattern = re.compile(r'\n\s+\n')
paragraph_break_pattern = re.compile(r'\n{2,}')

# NOTE(review): sentence_endings is not read anywhere in the visible code; it
# is kept because it is a script-level name other parts of the file may use.
sentence_endings = ('.', '!', '?')
start_page = 1  # 1-based number of the first page to process

restored = []
st.session_state.restored_paragraphs = restored
doc = fitz.open(st.session_state.uploaded_path)
try:
    for page_num in range(start_page - 1, len(doc)):  # start_page - 1 to adjust for 0-based index
        page = doc.load_page(page_num)
        for block in page.get_text("blocks"):
            # PyMuPDF block tuples are (x0, y0, x1, y1, text, block_no, block_type).
            _x0, _y0, _x1, _y1, text, _block_no, block_type = block
            if block_type != 0:
                # Non-text (image) blocks carry placeholder metadata, not prose.
                continue
            text = text.strip()
            if not text:
                continue
            # Collapse whitespace-only lines so they count as paragraph breaks.
            text = blank_gap_pattern.sub('\n\n', text)
            containsList = list_pattern.search(text) is not None
            if paragraph_break_pattern.search(text):
                # The block holds several paragraphs: emit one record per
                # non-empty piece and keep the full block text alongside it.
                for substring in paragraph_break_pattern.split(text):
                    if substring.strip():
                        restored.append(
                            {"paragraph": substring, "containsList": containsList,
                             "page_num": page_num, "text": text})
            else:
                restored.append(
                    {"paragraph": text, "containsList": containsList,
                     "page_num": page_num, "text": None})
finally:
    # Release the file handle even if extraction fails part-way.
    doc.close()

# Count the restored_paragraphs of top-level elements
st.session_state.list_count = len(restored)
st.write(f'The number of elements at the top level of the hierarchy: {st.session_state.list_count}')
|
183 |
st.rerun()
|
184 |
|
185 |
if 'paragraph_sentence_encodings' in st.session_state:
|
requirements.txt
CHANGED
@@ -3,4 +3,5 @@ torch
|
|
3 |
scikit-learn
|
4 |
nltk
|
5 |
sentencepiece
|
6 |
-
protobuf==3.20.3
|
|
|
|
3 |
scikit-learn
|
4 |
nltk
|
5 |
sentencepiece
|
6 |
+
protobuf==3.20.3
|
7 |
+
PyMuPDF
|