"""Streamlit app for asking questions about uploaded PDF documents.

Pipeline: extract text from the uploaded PDFs, split it into overlapping
chunks, embed the chunks into a FAISS vector store, and wrap the store in
a conversational retrieval chain that is kept in the Streamlit session.

NOTE(review): the user-facing labels talk about "html" files, but the
extraction step uses ``PdfReader`` and therefore only handles PDFs.
The labels are left untouched; confirm which wording is intended.
"""

import streamlit as st
from dotenv import load_dotenv
from PyPDF2 import PdfReader
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings, HuggingFaceInstructEmbeddings
from langchain.vectorstores import FAISS
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain
from langchain.chat_models import ChatOpenAI


def get_html(html):
    """Return the concatenated text of every page of every uploaded PDF.

    Args:
        html: Iterable of file-like objects readable by ``PdfReader``
            (e.g. the list returned by ``st.file_uploader`` with
            ``accept_multiple_files=True``). ``None`` or an empty
            iterable is accepted.

    Returns:
        A single string with the text of all pages, in upload order.
        Empty when nothing was uploaded or no text could be extracted.
    """
    if not html:
        return ""
    # ``or ""`` guards against pages for which no text comes back
    # (e.g. scanned images), so the join never sees a non-string.
    return "".join(
        page.extract_text() or ""
        for pdf in html
        for page in PdfReader(pdf).pages
    )


def get_chunk_text(raw_text: str):
    """Split ``raw_text`` into newline-separated chunks for embedding.

    Args:
        raw_text: The full document text.

    Returns:
        A list of strings of at most roughly 1000 characters each,
        with 20 characters of overlap between neighbouring chunks.
    """
    text_splitter = CharacterTextSplitter(
        separator="\n",
        chunk_size=1000,
        chunk_overlap=20,
        length_function=len,
    )
    return text_splitter.split_text(raw_text)


def get_vector_store(text_chunks):
    """Embed ``text_chunks`` and index them in an in-memory FAISS store.

    Args:
        text_chunks: List of strings to embed.

    Returns:
        The populated ``FAISS`` vector store.
    """
    # ``OpenAIEmbeddings()`` can be swapped in here to use OpenAI's
    # embeddings instead of the Hugging Face instructor model.
    embeddings = HuggingFaceInstructEmbeddings(model_name="hkunlp/instructor-xl")
    return FAISS.from_texts(texts=text_chunks, embedding=embeddings)


def get_conversation_chain(vector_store):
    """Build a conversational retrieval chain on top of ``vector_store``.

    Args:
        vector_store: A vector store exposing ``as_retriever()``.

    Returns:
        A ``ConversationalRetrievalChain`` that keeps the dialogue in a
        ``ConversationBufferMemory`` under the key ``"chat_history"``.
    """
    llm = ChatOpenAI()
    memory = ConversationBufferMemory(
        memory_key="chat_history",
        return_messages=True,
    )
    return ConversationalRetrievalChain.from_llm(
        llm=llm,
        retriever=vector_store.as_retriever(),
        memory=memory,
    )


def handle_input(user_input: str) -> None:
    """Send ``user_input`` to the session's chain and render the reply.

    If no documents have been processed yet there is no chain to call,
    so a warning is shown instead of raising.

    Args:
        user_input: The question typed by the user.
    """
    conversation = st.session_state.get("conversation")
    if conversation is None:
        st.warning("Upload your documents and click process before asking a question.")
        return
    response = conversation({"question": user_input})
    st.write(response)


def main() -> None:
    """Render the page and wire the upload / question widgets together."""
    # Presumably the LLM credentials live in a local .env file; load it
    # before any client is constructed.
    load_dotenv()
    st.set_page_config(page_title="Reads your html", page_icon=":books:")

    # Streamlit re-runs the script on every interaction; the chain has to
    # live in session state to survive between runs.
    if "conversation" not in st.session_state:
        st.session_state.conversation = None

    st.header("Get your best Element")
    user_input = st.text_input("Pass your Element with its information")
    if user_input:
        handle_input(user_input)

    with st.sidebar:
        st.subheader("your html")
        # accept_multiple_files=True makes the widget return a list, which
        # is what get_html iterates over.
        html_docs = st.file_uploader(
            "upload your html file and click process",
            accept_multiple_files=True,
        )
        if st.button("process"):
            if not html_docs:
                st.warning("Please upload at least one file first.")
                return
            with st.spinner("processing"):
                # Get the PDF text.
                raw_text = get_html(html_docs)

                # Get the text chunks.
                text_chunks = get_chunk_text(raw_text)
                if not text_chunks:
                    # An empty chunk list cannot be embedded/indexed.
                    st.error("No text could be extracted from the uploaded files.")
                    return

                # Create the vector store.
                vector_store = get_vector_store(text_chunks)

                # Create the conversation chain and keep it for later runs.
                st.session_state.conversation = get_conversation_chain(vector_store)


if __name__ == '__main__':
    main()