"""Streamlit app that extracts text from uploaded PDFs and indexes it in FAISS.

NOTE(review): the UI strings and the ``get_html`` name talk about "html", but
the files are parsed with ``PyPDF2.PdfReader``, i.e. they are read as PDFs.
"""

import streamlit as st
from dotenv import load_dotenv
from PyPDF2 import PdfReader
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import FAISS


def get_html(html) -> str:
    """Return the concatenated text of every page of the given PDF documents.

    Args:
        html: The uploaded document(s). May be ``None`` (nothing uploaded),
            a single path or file-like object, or an iterable of them. Each
            item is handed to ``PdfReader``.

    Returns:
        The text of all pages of all documents joined together, or ``""``
        when nothing was uploaded.
    """
    if html is None:
        return ""
    # A single upload (file-like object or path string) must be wrapped;
    # iterating it directly would walk its lines/characters, not documents.
    if isinstance(html, str) or hasattr(html, "read"):
        html = [html]

    page_texts = []
    for pdf in html:
        pdf_reader = PdfReader(pdf)
        for page in pdf_reader.pages:
            # Guard against a page with no extractable text yielding None.
            page_texts.append(page.extract_text() or "")
    return "".join(page_texts)


def get_chunk_text(raw_text: str) -> list:
    """Split ``raw_text`` into overlapping chunks suitable for embedding.

    Args:
        raw_text: The full text to split.

    Returns:
        A list of text chunks, split on newlines with a target size of 1000
        characters and an overlap of 20 characters.
    """
    text_splitter = CharacterTextSplitter(
        separator="\n",
        chunk_size=1000,
        chunk_overlap=20,
        length_function=len,
    )
    return text_splitter.split_text(raw_text)


def get_vector_store(text_chunks):
    """Embed ``text_chunks`` with OpenAI embeddings and index them in FAISS.

    Args:
        text_chunks: The text chunks to embed.

    Returns:
        The FAISS vector store built from the chunks.
    """
    # The embedding model must be an instance, not the class itself.
    embeddings = OpenAIEmbeddings()
    return FAISS.from_texts(texts=text_chunks, embedding=embeddings)


def main():
    """Render the Streamlit page and process the uploaded documents on demand."""
    # Presumably supplies the OpenAI API key from a .env file - confirm.
    load_dotenv()
    st.set_page_config(page_title="Reads your html", page_icon=":books:")
    st.header("Get your best Element")
    # NOTE(review): the entered value is currently not used anywhere.
    st.text_input("Pass your Element with its information")

    with st.sidebar:
        st.subheader("your html")
        # accept_multiple_files=True makes the uploader return a list, which
        # is what get_html iterates over.
        html_docs = st.file_uploader(
            "upload your html file and click process",
            accept_multiple_files=True,
        )
        if st.button("process"):
            if not html_docs:
                st.warning("Please upload at least one file first.")
                return
            with st.spinner("processing"):
                # get pdf text
                raw_text = get_html(html_docs)
                # get the text chunks
                text_chunks = get_chunk_text(raw_text)
                if not text_chunks:
                    # Nothing to embed; skip building an index from no text.
                    st.warning("No text could be extracted from the upload.")
                    return
                # create vector store
                # TODO(review): the store is built but not yet used or
                # persisted across Streamlit reruns.
                vectors = get_vector_store(text_chunks)


if __name__ == '__main__':
    main()