Spaces:

TarunEnma
/

ReadHtml

Build error

App Files Files Community

TarunEnma committed on Jun 10, 2024

Commit

879d3fd

verified ·

1 Parent(s): bb71bcd

Update app.py

Browse files

Files changed (1) hide show

app.py +41 -2

app.py CHANGED Viewed

@@ -1,13 +1,52 @@
 import streamlit as st
 def main():
     # Pre-change version of the Streamlit entry point: sets the page
     # configuration, renders a header and a text input, and shows an
     # uploader and a button in the sidebar.
     st.set_page_config(page_title="Reads your html",page_icon=":books:")
     st.header("Get your best Element")
     # The text input's return value is not captured; the widget is only drawn.
     st.text_input("Pass your Element with its information")
     with st.sidebar:
         st.subheader("your html")
         # The two widget calls below discard their return values, so neither
         # the uploaded file nor the button click is acted on in this version.
-        st.file_uploader("upload your html file and click process")
-        st.button("process")
 # Run the app only when this file is executed as a script.
 if __name__ == '__main__':
     main()

 import streamlit as st
+from dotenv import load_dotenv
+from PyPDF2 import PdfReader
+from langchain.text_splitter import CharacterTextSplitter
+from langchain.embeddings import OpenAIEmbeddings
+from langchain.vectorstores import FAISS
+def get_html(html):
+    text = ""
+    for pdf in html:
+        pdf_reader= PdfReader(pdf)
+        for page in pdf_reader.pages:
+            text += page.extract_text()
+    return text
+def get_chunk_text(raw_text):
+    text_splitter = CharacterTextSplitter(seperator="\n", chunk_size=1000, chunk_overlap=20,length_function=len)
+    chunks = text_splitter.split_text(text)
+    return chunks
+def get_vector_store(text_chunks):
+    embeddings = OpenAIEmbeddings
+    vector_store = FAISS.from_texts(texts=text_chunks,embedding = embeddings)
+    return vector_store
 def main():
+    load_dotenv()
     st.set_page_config(page_title="Reads your html",page_icon=":books:")
     st.header("Get your best Element")
     st.text_input("Pass your Element with its information")
     with st.sidebar:
         st.subheader("your html")
+        html_docs = st.file_uploader("upload your html file and click process")
+        if st.button("process"):
+            with st.spinner("processing"):
+                #get pdf text
+                raw_text = get_html(html_docs)
+                #get the text chunks
+                text_chunks = get_chunk_text(raw_text)
+                #create vector store
+                vectors = get_vector_store(text_chunks)
 if __name__ == '__main__':
     main()