TarunEnma committed on
Commit
879d3fd
·
verified ·
1 Parent(s): bb71bcd

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +41 -2
app.py CHANGED
@@ -1,13 +1,52 @@
1
  import streamlit as st
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
def main():
    """Render the Streamlit page: a header, a text input and an upload sidebar.

    The values returned by the text input, the file uploader and the button
    are all discarded here, so the widgets are displayed but nothing reacts
    to them yet.
    """
    st.set_page_config(page_title="Reads your html",page_icon=":books:")
    st.header("Get your best Element")
    st.text_input("Pass your Element with its information")

    # Everything created inside this context is placed in the sidebar.
    with st.sidebar:
        st.subheader("your html")
        st.file_uploader("upload your html file and click process")
        st.button("process")
 
 
 
 
 
 
 
 
 
 
 
 
11
 
12
# Build the page only when this file is executed as the main script.
if __name__ == '__main__':
    main()
 
1
  import streamlit as st
2
+ from dotenv import load_dotenv
3
+ from PyPDF2 import PdfReader
4
+ from langchain.text_splitter import CharacterTextSplitter
5
+ from langchain.embeddings import OpenAIEmbeddings
6
+ from langchain.vectorstores import FAISS
7
+
8
def get_html(html):
    """Extract and concatenate the text of every page of the uploaded file(s).

    Despite the parameter name, each document is parsed with ``PdfReader``,
    i.e. it is read as a PDF.

    Args:
        html: ``None``, a single file-like object, or an iterable of file-like
            objects that ``PdfReader`` can open.

    Returns:
        The text of all pages of all documents, concatenated in order. An
        empty string is returned when nothing was supplied.
    """
    if html is None:
        return ""

    # A lone upload (the default result of ``st.file_uploader``) is itself
    # iterable, line by line as bytes; wrap it so it is parsed as ONE document.
    documents = [html] if hasattr(html, "read") else html

    # Guard against a page yielding no text (``None``/empty) so join never fails.
    return "".join(
        page.extract_text() or ""
        for pdf in documents
        for page in PdfReader(pdf).pages
    )
15
+
16
def get_chunk_text(raw_text, *, chunk_size=1000, chunk_overlap=20):
    """Split *raw_text* into newline-delimited, slightly overlapping chunks.

    Args:
        raw_text: The full text to split.
        chunk_size: Maximum length of a chunk, measured with ``len``.
        chunk_overlap: Number of characters shared by consecutive chunks.

    Returns:
        A list of text chunks; an empty list when *raw_text* is empty.
    """
    if not raw_text:
        return []

    text_splitter = CharacterTextSplitter(
        separator="\n",  # was misspelled "seperator", which the splitter rejects
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
    )
    # Split the argument itself; the original referenced an undefined ``text``.
    return text_splitter.split_text(raw_text)
20
+
21
def get_vector_store(text_chunks):
    """Embed *text_chunks* and index them in a FAISS vector store.

    Args:
        text_chunks: Non-empty list of text chunks to embed.

    Returns:
        The ``FAISS`` vector store built from the chunks.

    Raises:
        ValueError: If *text_chunks* is empty, since there is nothing to index.
    """
    if not text_chunks:
        raise ValueError("text_chunks must contain at least one chunk to embed")

    # Instantiate the embedding model; the original passed the class itself.
    # NOTE(review): presumably the OpenAI API key comes from the environment
    # populated by load_dotenv() in main() -- confirm.
    embeddings = OpenAIEmbeddings()
    return FAISS.from_texts(texts=text_chunks, embedding=embeddings)
25
+
26
+
27
+
28
def main():
    """Render the Streamlit page and index uploaded documents on demand.

    Loads environment variables, draws the header and text input, and shows
    an upload sidebar. When "process" is clicked, the uploaded files are
    turned into text, split into chunks and embedded into a vector store that
    is kept in ``st.session_state["vector_store"]``.
    """
    load_dotenv()
    st.set_page_config(page_title="Reads your html", page_icon=":books:")
    st.header("Get your best Element")
    st.text_input("Pass your Element with its information")

    with st.sidebar:
        st.subheader("your html")
        # NOTE(review): the label says HTML, but get_html() parses uploads with
        # PdfReader -- confirm the intended file type.
        # accept_multiple_files=True makes the uploader return a list, which
        # is what get_html() iterates over.
        html_docs = st.file_uploader(
            "upload your html file and click process",
            accept_multiple_files=True,
        )
        if st.button("process"):
            if not html_docs:
                st.warning("Please upload at least one file before processing.")
                return

            with st.spinner("processing"):
                # Get the raw text of the uploaded documents.
                raw_text = get_html(html_docs)

                # Get the text chunks.
                text_chunks = get_chunk_text(raw_text)
                if not text_chunks:
                    st.warning(
                        "No text could be extracted from the uploaded file(s)."
                    )
                    return

                # Create the vector store. Streamlit reruns the script on
                # every interaction, so keep it in session state rather than
                # in a local that would be lost.
                st.session_state["vector_store"] = get_vector_store(text_chunks)
50
 
51
# Build the page only when this file is executed as the main script.
if __name__ == '__main__':
    main()