Spaces:

TarunEnma
/

ReadHtml

Build error

File size: 2,637 Bytes

acf42a8
879d3fd
 
 
56a9d44
879d3fd
56a9d44
 
 
879d3fd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
56a9d44
 
879d3fd
 
56a9d44
 
 
 
 
 
 
 
 
 
879d3fd
56a9d44
 
 
879d3fd
 
515efe8
879d3fd
515efe8
56a9d44
 
 
515efe8
56a9d44
 
 
acf42a8
515efe8
 
879d3fd
 
 
 
 
 
 
 
 
 
 
 
 
56a9d44
 
 
 
 
 
acf42a8
515efe8

import streamlit as st
from dotenv import load_dotenv
from langchain.chains import ConversationalRetrievalChain
# Legacy alias: the chain class was previously imported under this misspelled
# name, which does not exist in langchain.chains and broke the build.
from langchain.chains import ConversationalRetrievalChain as ConversationRetrievalChain
# ChatOpenAI is a chat model; it is exported by langchain.chat_models,
# not langchain.llms.
from langchain.chat_models import ChatOpenAI
from langchain.embeddings import HuggingFaceInstructEmbeddings, OpenAIEmbeddings
from langchain.memory import ConversationBufferMemory
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import FAISS
from PyPDF2 import PdfReader

def get_html(html):
    """Return the concatenated text of every page of the given documents.

    Despite the function name, each document is opened with ``PdfReader``,
    so the inputs are parsed as PDF files.

    Args:
        html: ``None``, a single document (a path string or a binary
            file-like object), or an iterable of such documents.

    Returns:
        The text of all pages joined in order, or ``""`` when no document
        was supplied.
    """
    if html is None:
        return ""

    # A lone path or file-like object must be treated as ONE document;
    # iterating it directly would walk its characters / raw lines instead.
    if isinstance(html, str) or hasattr(html, "read"):
        html = [html]

    page_texts = []
    for pdf in html:
        pdf_reader = PdfReader(pdf)
        for page in pdf_reader.pages:
            # `or ""` keeps the join safe if a page yields no text.
            page_texts.append(page.extract_text() or "")
    return "".join(page_texts)

def get_chunk_text(raw_text):
    """Split ``raw_text`` into chunks for embedding.

    The text is split on newlines with ``CharacterTextSplitter`` using a
    chunk size of 1000 and an overlap of 20, both measured with ``len``.

    Args:
        raw_text: The full text to split.

    Returns:
        The list of chunks produced by ``split_text``.
    """
    text_splitter = CharacterTextSplitter(
        separator="\n",  # was misspelled "seperator", which the constructor rejects
        chunk_size=1000,
        chunk_overlap=20,
        length_function=len,
    )
    # Split the function's argument; the original referenced an undefined `text`.
    return text_splitter.split_text(raw_text)

def get_vector_store(text_chunks):
    """Build a FAISS vector store from ``text_chunks``.

    The chunks are embedded with ``HuggingFaceInstructEmbeddings`` using the
    "hkunlp/instructor-xl" model. ``OpenAIEmbeddings`` is the alternative
    embedding class previously noted at this spot.

    Args:
        text_chunks: The texts to index.

    Returns:
        The store returned by ``FAISS.from_texts``.
    """
    instructor = HuggingFaceInstructEmbeddings(model_name="hkunlp/instructor-xl")
    return FAISS.from_texts(texts=text_chunks, embedding=instructor)

def get_conversation_chain(vector_store):
    """Create a retrieval chain with conversation memory over ``vector_store``.

    Args:
        vector_store: A store exposing ``as_retriever()``; its retriever
            supplies the documents for the chain.

    Returns:
        The chain built by ``ConversationalRetrievalChain.from_llm``, wired
        to a ``ChatOpenAI`` model and a ``ConversationBufferMemory`` that
        stores the history under the "chat_history" key.
    """
    llm = ChatOpenAI()
    memory = ConversationBufferMemory(
        memory_key="chat_history",
        return_messages=True,
    )
    # The class is ConversationalRetrievalChain; the previously used
    # "ConversationRetrievalChain" does not exist in langchain.chains.
    return ConversationalRetrievalChain.from_llm(
        llm=llm,
        retriever=vector_store.as_retriever(),
        memory=memory,
    )
    
def handle_input(user_input):
    """Send ``user_input`` to the stored conversation chain and show the reply.

    The chain is read from ``st.session_state.conversation``. If no chain
    has been stored yet, a warning is shown instead of calling ``None``.

    Args:
        user_input: The question text to pass to the chain.
    """
    conversation = st.session_state.get("conversation")
    if conversation is None:
        # The slot stays None until a chain is stored in session state;
        # calling it would raise TypeError.
        st.warning("Please upload and process a document before asking a question.")
        return

    response = conversation({"question": user_input})
    st.write(response)
            
    
def main():
    """Render the Streamlit page and wire up the document-processing flow.

    Calls ``load_dotenv()``, configures the page, initialises the
    ``conversation`` slot in session state, forwards any typed question to
    ``handle_input``, and, when the sidebar "process" button is pressed,
    builds a conversation chain from the uploaded files.
    """
    load_dotenv()
    st.set_page_config(page_title="Reads your html", page_icon=":books:")

    if "conversation" not in st.session_state:
        st.session_state.conversation = None
    st.header("Get your best Element")
    user_input = st.text_input("Pass your Element with its information")
    if user_input:
        handle_input(user_input)

    with st.sidebar:
        st.subheader("your html")
        # accept_multiple_files=True makes the uploader return a list, which
        # is what get_html iterates over (one entry per document).
        html_docs = st.file_uploader(
            "upload your html file and click process",
            accept_multiple_files=True,
        )
        if st.button("process"):
            if not html_docs:
                st.warning("Please upload at least one file before processing.")
                return

            with st.spinner("processing"):
                # Extract the text of the uploaded documents.
                raw_text = get_html(html_docs)

                # Split the text into chunks.
                text_chunks = get_chunk_text(raw_text)
                if not text_chunks:
                    # Nothing to index; stop before building the store.
                    st.error("No text could be extracted from the uploaded file(s).")
                    return

                # Create the vector store.
                vector_store = get_vector_store(text_chunks)

                # Create the conversation chain and keep it across reruns.
                st.session_state.conversation = get_conversation_chain(vector_store)
                


# Run the app only when this file is executed as the main module.
if __name__ == "__main__":
    main()