dtcda

Sleeping

App Files Files Community

zmbfeng committed on Sep 19

Commit

4c2c5b7

•

1 Parent(s): 0738fe9

able to load pdf file

Browse files

Files changed (2) hide show

app.py +47 -22
requirements.txt +2 -1

app.py CHANGED Viewed

@@ -1,7 +1,8 @@
 import streamlit as st
 import os
 import json
 from transformers import GPT2Tokenizer, GPT2LMHeadModel, AutoModelForSequenceClassification, BertTokenizer, BertModel,T5Tokenizer, T5ForConditionalGeneration,AutoTokenizer, AutoModelForSeq2SeqLM
 import torch
@@ -116,8 +117,8 @@ big_text = """
     # Display the styled text
 st.markdown(big_text, unsafe_allow_html=True)
-uploaded_json_file = st.file_uploader("Upload a pre-processed file",
-                                           type=['json'])
 st.markdown(
     f'<a href="https://ikmtechnology.github.io/ikmtechnology/untethered_extracted_paragraphs.json" target="_blank">Sample 1 download and then upload to above</a>',
     unsafe_allow_html=True)
@@ -126,8 +127,8 @@ st.markdown(
 f'<a href="https://ikmtechnology.github.io/ikmtechnology/the_business_case_for_ai_extracted_paragraphs.json" target="_blank">Sample 2 download and then upload to above</a>',
     unsafe_allow_html=True)
 st.markdown("sample queries for above file: <br/> what does nontechnical managers worry about? what if you put all the knowledge, frameworks, and tips from this book to full use? tell me about AI agent",unsafe_allow_html=True)
-if uploaded_json_file is not None:
-    if is_new_file_upload(uploaded_json_file):
         print("is new file uploaded")
         if 'prev_query' in st.session_state:
             del st.session_state['prev_query']
@@ -136,25 +137,49 @@ if uploaded_json_file is not None:
         save_path = './uploaded_files'
         if not os.path.exists(save_path):
             os.makedirs(save_path)
-        with open(os.path.join(save_path, uploaded_json_file.name), "wb") as f:
-            f.write(uploaded_json_file.getbuffer())  # Write the file to the specified location
-            st.success(f'Saved file temp_{uploaded_json_file.name} in {save_path}')
-            st.session_state.uploaded_path=os.path.join(save_path, uploaded_json_file.name)
             # st.session_state.page_count = utils.get_pdf_page_count(st.session_state.uploaded_pdf_path)
             # print("page_count=",st.session_state.page_count)
-        content = uploaded_json_file.read()
-        try:
-            st.session_state.restored_paragraphs = json.loads(content)
-            #print(data)
-            # Check if the parsed data is a list
-            if isinstance(st.session_state.restored_paragraphs, list):
-                # Count the top-level elements in restored_paragraphs
-                st.session_state.list_count  = len(st.session_state.restored_paragraphs)
-                st.write(f'The number of elements at the top level of the hierarchy: {st.session_state.list_count }')
-            else:
-                st.write('The JSON content is not a dictionary.')
-        except json.JSONDecodeError:
-            st.write('Invalid JSON file.')
         st.rerun()
 if 'paragraph_sentence_encodings' in st.session_state:

 import streamlit as st
 import os
 import json
+import fitz
+import re
 from transformers import GPT2Tokenizer, GPT2LMHeadModel, AutoModelForSequenceClassification, BertTokenizer, BertModel,T5Tokenizer, T5ForConditionalGeneration,AutoTokenizer, AutoModelForSeq2SeqLM
 import torch
     # Display the styled text
 st.markdown(big_text, unsafe_allow_html=True)
+uploaded_pdf_file = st.file_uploader("Upload a PDF file",
+                                     type=['pdf'])
 st.markdown(
     f'<a href="https://ikmtechnology.github.io/ikmtechnology/untethered_extracted_paragraphs.json" target="_blank">Sample 1 download and then upload to above</a>',
     unsafe_allow_html=True)
 f'<a href="https://ikmtechnology.github.io/ikmtechnology/the_business_case_for_ai_extracted_paragraphs.json" target="_blank">Sample 2 download and then upload to above</a>',
     unsafe_allow_html=True)
 st.markdown("sample queries for above file: <br/> what does nontechnical managers worry about? what if you put all the knowledge, frameworks, and tips from this book to full use? tell me about AI agent",unsafe_allow_html=True)
+if uploaded_pdf_file is not None:
+    if is_new_file_upload(uploaded_pdf_file):
         print("is new file uploaded")
         if 'prev_query' in st.session_state:
             del st.session_state['prev_query']
         save_path = './uploaded_files'
         if not os.path.exists(save_path):
             os.makedirs(save_path)
+        with open(os.path.join(save_path, uploaded_pdf_file.name), "wb") as f:
+            f.write(uploaded_pdf_file.getbuffer())  # Write the file to the specified location
+            st.success(f'Saved file temp_{uploaded_pdf_file.name} in {save_path}')
+            st.session_state.uploaded_path=os.path.join(save_path, uploaded_pdf_file.name)
             # st.session_state.page_count = utils.get_pdf_page_count(st.session_state.uploaded_pdf_path)
             # print("page_count=",st.session_state.page_count)
+        doc = fitz.open(st.session_state.uploaded_path)
+        sentence_endings = ('.', '!', '?')
+        start_page = 1
+        st.session_state.restored_paragraphs = []
+        for page_num in range(start_page - 1, len(doc)):  # start_page - 1 to adjust for 0-based index
+            page = doc.load_page(page_num)
+            blocks = page.get_text("blocks")
+            block_index = 1
+            for block in blocks:
+                x0, y0, x1, y1, text, block_type, flags = block
+                if text.strip() != "":
+                    text = text.strip()
+                    text = re.sub(r'\n\s+\n', '\n\n', text)
+                    list_pattern = re.compile(r'^\s*((?:\d+\.|[a-zA-Z]\.|[*-])\s+.+)', re.MULTILINE)
+                    match = list_pattern.search(text)
+                    containsList = False
+                    if match:
+                        containsList = True
+                        # print ("list detected")
+                    paragraph = ""
+                    if bool(re.search(r'\n{2,}', text)):
+                        substrings = re.split(r'\n{2,}', text)
+                        for substring in substrings:
+                            if substring.strip() != "":
+                                paragraph = substring
+                                st.session_state.restored_paragraphs.append(
+                                    {"paragraph": paragraph, "containsList": containsList, "page_num": page_num, "text": text});
+                                # print(f"<substring> {substring} </substring>")
+                    else:
+                        paragraph = text
+                        st.session_state.restored_paragraphs.append(
+                            {"paragraph": paragraph, "containsList": containsList, "page_num": page_num, "text": None});
+        if isinstance(st.session_state.restored_paragraphs, list):
+            # Count the top-level elements in restored_paragraphs
+            st.session_state.list_count = len(st.session_state.restored_paragraphs)
+            st.write(f'The number of elements at the top level of the hierarchy: {st.session_state.list_count}')
         st.rerun()
 if 'paragraph_sentence_encodings' in st.session_state:

requirements.txt CHANGED Viewed

@@ -3,4 +3,5 @@ torch
 scikit-learn
 nltk
 sentencepiece
-protobuf==3.20.3

 scikit-learn
 nltk
 sentencepiece
+protobuf==3.20.3
+PyMuPDF