import streamlit as st
import os
import json
from transformers import GPT2Tokenizer, GPT2LMHeadModel, BertTokenizer, BertModel,T5Tokenizer, T5ForConditionalGeneration,AutoTokenizer, AutoModelForSeq2SeqLM
import torch
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import nltk
from nltk.tokenize import sent_tokenize
def is_new_file_upload(uploaded_file):
    """Report whether *uploaded_file* differs from the last upload seen.

    The file's ``name`` and ``size`` are compared with the record kept in
    ``st.session_state.last_uploaded_file``. The record is (re)written
    whenever the file is considered new.

    Args:
        uploaded_file: Object exposing ``name`` and ``size`` attributes.

    Returns:
        True on the first upload of the session or when name or size
        changed; False when both match the stored record.
    """
    # First upload of this session: remember it and report it as new.
    if 'last_uploaded_file' not in st.session_state:
        st.session_state.last_uploaded_file = {'name': uploaded_file.name, 'size': uploaded_file.size}
        return True

    previous = st.session_state.last_uploaded_file
    same_file = (uploaded_file.name == previous['name']
                 and uploaded_file.size == previous['size'])
    if same_file:
        return False

    # A different file: replace the stored record.
    st.session_state.last_uploaded_file = {'name': uploaded_file.name, 'size': uploaded_file.size}
    return True
# Page heading, rendered through st.markdown.
big_text = """
Knowledge Extraction 1
"""
st.markdown(big_text, unsafe_allow_html=True)

# Upload widget restricted to .json files; checked against None further down.
uploaded_json_file = st.file_uploader("Upload a pre-processed file", type=['json'])

# Hint displayed below the uploader.
st.markdown('Sample 1 download and then upload to above', unsafe_allow_html=True)
# Handle a freshly uploaded JSON file: persist it to disk, parse it and
# publish the parsed paragraphs in st.session_state.
if uploaded_json_file is not None:
    # The script is re-executed on every interaction, so a given upload is
    # processed only the first time it is seen.
    if is_new_file_upload(uploaded_json_file):
        print("is new file uploaded")

        # Drop everything derived from a previous upload. Otherwise the old
        # embeddings would be kept (they are only computed when the key is
        # missing) and a failed parse would leave the old paragraphs active.
        for stale_key in ('restored_paragraphs', 'list_count',
                          'paragraph_sentence_encodings'):
            if stale_key in st.session_state:
                del st.session_state[stale_key]

        save_path = './uploaded_files'
        os.makedirs(save_path, exist_ok=True)

        # The file name is supplied by the client; keep only its final
        # component so it cannot point outside save_path.
        safe_name = os.path.basename(uploaded_json_file.name)
        uploaded_path = os.path.join(save_path, safe_name)

        # getvalue() returns the whole payload regardless of the current
        # read position, so the same bytes are written and parsed.
        content = uploaded_json_file.getvalue()
        with open(uploaded_path, "wb") as f:
            f.write(content)
        st.success(f'Saved file {safe_name} in {save_path}')
        st.session_state.uploaded_path = uploaded_path

        try:
            parsed = json.loads(content)
        except (json.JSONDecodeError, UnicodeDecodeError):
            # No rerun here: a rerun would immediately wipe this message.
            st.write('Invalid JSON file.')
        else:
            if isinstance(parsed, list):
                st.session_state.restored_paragraphs = parsed
                # Number of top-level elements in the uploaded list.
                st.session_state.list_count = len(parsed)
                st.write(f'The number of elements at the top level of the hierarchy: {st.session_state.list_count}')
                # Restart the script so the rest of the page runs against
                # the newly stored paragraphs.
                st.rerun()
            else:
                st.write('The JSON content is not a list.')
# One-time, per-session setup: fetch the NLTK sentence tokenizer data and
# load the BERT tokenizer and model into st.session_state.
if 'is_initialized' not in st.session_state:
    nltk.download('punkt')

    # Fall back to the CPU when no CUDA device is available instead of
    # failing on the .to('cuda') call.
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    st.session_state.bert_tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
    st.session_state.bert_model = BertModel.from_pretrained("bert-base-uncased").to(device)

    # Set the flag last: if any load above raises, the next script run
    # retries instead of continuing without a model.
    st.session_state['is_initialized'] = True
# Once paragraphs are loaded, show their count and, if not done yet, compute
# a BERT encoding for every sentence of every paragraph.
if 'list_count' in st.session_state:
    st.write(f'The number of elements at the top level of the hierarchy: {st.session_state.list_count}')
    if 'paragraph_sentence_encodings' not in st.session_state:
        print("start embedding paragarphs")
        read_progress_bar = st.progress(0)

        bert_tokenizer = st.session_state.bert_tokenizer
        bert_model = st.session_state.bert_model
        # Send inputs to whichever device the model actually lives on.
        device = bert_model.device

        paragraphs = st.session_state.restored_paragraphs
        last_index = len(paragraphs) - 1

        # Collect results locally and publish them only after the loop
        # finishes, so an interrupted run does not leave a partial list
        # that later runs would treat as complete.
        paragraph_sentence_encodings = []
        for index, paragraph in enumerate(paragraphs):
            # Fraction of paragraphs started; a single paragraph would
            # otherwise divide by zero.
            progress_percentage = index / last_index if last_index > 0 else 1.0
            print(progress_percentage)
            read_progress_bar.progress(progress_percentage)

            sentence_encodings = []
            for sentence in sent_tokenize(paragraph['text']):
                stripped = sentence.strip()
                # Questions and fragments shorter than 4 characters are not
                # encoded; a None placeholder keeps sentence positions aligned.
                if stripped.endswith('?') or len(stripped) < 4:
                    sentence_encodings.append(None)
                    continue

                sentence_tokens = bert_tokenizer(
                    sentence, return_tensors="pt", padding=True, truncation=True
                ).to(device)
                with torch.no_grad():
                    # Hidden state of the first token, moved to the CPU as a
                    # NumPy array.
                    sentence_encoding = (
                        bert_model(**sentence_tokens).last_hidden_state[:, 0, :].cpu().numpy()
                    )
                sentence_encodings.append([sentence, sentence_encoding])

            paragraph_sentence_encodings.append([paragraph, sentence_encodings])

        st.session_state.paragraph_sentence_encodings = paragraph_sentence_encodings