Manasa1 commited on
Commit
cd50977
·
verified ·
1 Parent(s): 56dc6a5

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +106 -0
app.py ADDED
@@ -0,0 +1,106 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from dotenv import load_dotenv
2
+ import streamlit as st
3
+ from langchain_community.document_loaders import UnstructuredPDFLoader
4
+ from langchain_text_splitters.character import CharacterTextSplitter
5
+ from langchain_community.vectorstores import FAISS
6
+ from langchain_community.embeddings import HuggingFaceEmbeddings
7
+ from langchain_groq import ChatGroq
8
+ from langchain.memory import ConversationBufferMemory
9
+ from langchain.chains import ConversationalRetrievalChain
10
+ import os
11
+ import nltk
12
# NLTK data required by unstructured's PDF partitioning (sentence tokenizer
# and POS tagger); downloaded once into the NLTK data directory.
nltk.download('punkt_tab')
nltk.download('averaged_perceptron_tagger_eng')
# Install Poppler and Tesseract in the runtime environment
# NOTE(review): this runs on every app start and needs root; presumably the
# target is a container runtime (e.g. HF Spaces) — confirm it is intended
# outside such an environment.
os.system("apt-get update && apt-get install -y poppler-utils tesseract-ocr")

# Groq API key; None if the 'Groq_api' environment variable is unset.
secret = os.getenv('Groq_api')

# Directory containing this script; uploaded PDFs are written here.
working_dir = os.path.dirname(os.path.abspath(__file__))
20
+
21
def load_documents(file_path):
    """Load a PDF from disk into LangChain Documents via unstructured.

    Args:
        file_path: Filesystem path to the PDF file.

    Returns:
        The list of Document objects produced by UnstructuredPDFLoader.
    """
    # Fix: `poppler_path` / `tesseract_path` are not parameters of
    # UnstructuredPDFLoader (or of unstructured.partition_pdf, which it
    # forwards kwargs to) — at best they were silently ignored, at worst they
    # raised a TypeError at load time. unstructured locates poppler and
    # tesseract on PATH, where the apt-get install above places them.
    loader = UnstructuredPDFLoader(file_path)
    documents = loader.load()
    return documents
30
+
31
def setup_vectorstore(documents):
    """Split documents into overlapping chunks and index them in FAISS.

    Args:
        documents: Documents returned by load_documents().

    Returns:
        A FAISS vector store over the embedded chunks.
    """
    embeddings = HuggingFaceEmbeddings()
    # Bug fix: the separator was "/n" (a literal slash-n that essentially
    # never occurs in text), so documents were never actually split on
    # newlines and whole pages ended up as single oversized chunks.
    text_splitter = CharacterTextSplitter(
        separator="\n",
        chunk_size=1000,
        chunk_overlap=200
    )
    doc_chunks = text_splitter.split_documents(documents)
    vectorstores = FAISS.from_documents(doc_chunks, embeddings)
    return vectorstores
41
+
42
def create_chain(vectorstores):
    """Build a conversational retrieval chain over the given vector store.

    Args:
        vectorstores: FAISS store produced by setup_vectorstore().

    Returns:
        A ConversationalRetrievalChain backed by a Groq-hosted LLM with
        conversation-buffer memory keyed on "chat_history".
    """
    groq_llm = ChatGroq(
        api_key=secret,
        model="llama-3.1-8b-instant",
        temperature=0,
    )
    chat_memory = ConversationBufferMemory(
        llm=groq_llm,
        output_key="answer",
        memory_key="chat_history",
        return_messages=True,
    )
    return ConversationalRetrievalChain.from_llm(
        llm=groq_llm,
        retriever=vectorstores.as_retriever(),
        memory=chat_memory,
        verbose=True,
    )
62
+
63
# Streamlit page configuration
st.set_page_config(
    page_title="Chat with your documents",
    page_icon="📑",
    layout="centered"
)

st.title("📝Chat With your docs 😎")

# Initialize session states
if "chat_history" not in st.session_state:
    st.session_state.chat_history = []

uploaded_file = st.file_uploader(label="Upload your PDF")

if uploaded_file:
    # Persist the upload next to this script so the loader can read it from disk.
    file_path = f"{working_dir}/{uploaded_file.name}"
    with open(file_path, "wb") as f:
        f.write(uploaded_file.getbuffer())

    # Build the index and chain once per session; Streamlit reruns reuse them.
    # NOTE(review): uploading a *different* PDF in the same session will NOT
    # rebuild the index because of these guards — confirm this is intended.
    if "vectorstores" not in st.session_state:
        st.session_state.vectorstores = setup_vectorstore(load_documents(file_path))

    if "conversation_chain" not in st.session_state:
        st.session_state.conversation_chain = create_chain(st.session_state.vectorstores)

# Display chat history
for message in st.session_state.chat_history:
    with st.chat_message(message["role"]):
        st.markdown(message["content"])

# User input handling
user_input = st.chat_input("Ask any questions relevant to uploaded pdf")

if user_input:
    # Bug fix: asking a question before a PDF was uploaded previously crashed
    # with AttributeError because conversation_chain was never created.
    if "conversation_chain" not in st.session_state:
        st.warning("Please upload a PDF before asking questions.")
    else:
        st.session_state.chat_history.append({"role": "user", "content": user_input})
        with st.chat_message("user"):
            st.markdown(user_input)

        with st.chat_message("assistant"):
            response = st.session_state.conversation_chain({"question": user_input})
            assistant_response = response["answer"]
            st.markdown(assistant_response)
            st.session_state.chat_history.append({"role": "assistant", "content": assistant_response})