techasad committed on
Commit
cae3b1e
1 Parent(s): 7a6e3fa

Upload 3 files

Browse files
Files changed (3) hide show
  1. Readme.md +3 -0
  2. pdfassistant.py +348 -0
  3. requirements.txt +99 -0
Readme.md ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ install all the dependencies with 'pip install -r requirements.txt'
2
+ add a secrets.toml file in the .streamlit folder containing your API key (GOOGLE_API_KEY)
3
+ run the app with streamlit run pdfassistant.py
pdfassistant.py ADDED
@@ -0,0 +1,348 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ from PyPDF2 import PdfReader
3
+ import langchain
4
+ from textwrap import dedent
5
+ import pandas as pd
6
+ from langchain_google_genai import ChatGoogleGenerativeAI
7
+ from langchain_community.callbacks import StreamlitCallbackHandler
8
+ from langchain_openai import ChatOpenAI
9
+ from langchain_community.chat_models import ChatGooglePalm
10
+ from langchain_community.embeddings import HuggingFaceEmbeddings
11
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
12
+ from langchain.vectorstores.faiss import FAISS
13
+ from langchain.prompts import PromptTemplate
14
+ from langchain.memory import ConversationBufferMemory
15
+ import tempfile
16
+ from langchain.document_loaders.csv_loader import CSVLoader
17
+ from langchain.document_loaders.pdf import PyPDFLoader
18
+ from langchain.document_loaders.word_document import UnstructuredWordDocumentLoader
19
+ from langchain.chains.question_answering import load_qa_chain
20
+ from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
21
+ from langchain.agents import load_tools
22
+ import os
23
+ from io import BytesIO
24
+ from langdetect import detect
25
+ from gtts import gTTS
26
+ from langchain.prompts import (
27
+ ChatPromptTemplate
28
+ )
29
+
30
+
31
+
32
+
33
+
34
# --- Credentials ------------------------------------------------------------
# The Google key comes from Streamlit secrets and is also exported as an
# environment variable (presumably so the Google client libraries can find it
# without it being passed explicitly -- confirm against the model wrappers).
google_api_key = st.secrets["GOOGLE_API_KEY"]
#api_key2 = st.secrets["OPENAI_API_KEY"]
os.environ["GOOGLE_API_KEY"] = google_api_key

# --- Page chrome ------------------------------------------------------------
# page_icon must be an emoji or a ":shortcode:"; a bare word such as 'books'
# is interpreted as an image path/URL and yields a broken favicon.
st.set_page_config(page_title='Personal Chatbot', page_icon=':books:')
st.header('Knowledge Query Assistant')
st.write("Upload your file to begin a chat, or ask any general questions you have")
st.sidebar.title('Options')

# --- Engine selection -------------------------------------------------------
# These widget values are read later by choose_llm() and main().
st.sidebar.subheader("Please Choose the AI Engine")
use_google = st.sidebar.checkbox("Use Free AI", value=True)
use_openai = st.sidebar.checkbox("Use OpenAI with your API Key")

openai_api_key = st.sidebar.text_input("Enter your OpenAI API Key:", type="password")
49
+
50
def choose_llm():
    """Build the chat model selected in the sidebar.

    Reads the module-level widget values ``use_google``, ``use_openai`` and
    ``openai_api_key``.

    Returns:
        A ``ChatGooglePalm`` or ``ChatOpenAI`` instance, or ``None`` when no
        single usable engine is available: both boxes ticked, neither ticked,
        OpenAI chosen without a key, or the model constructor raised.
    """
    if use_google and use_openai:
        st.sidebar.warning("Please choose only one AI engine.")
        st.warning("Please choose only one AI engine.")
        return None

    if use_openai and not openai_api_key:
        # Do not build a client with an empty key: it would be reported as
        # "selected" and only fail later on the first request.
        st.sidebar.warning("Please provide your OpenAI API Key.")
        st.warning("Please provide your OpenAI API Key.")
        return None

    if not (use_google or use_openai):
        return None

    try:
        if use_google:
            return ChatGooglePalm(temperature=0.1)
        return ChatOpenAI(api_key=openai_api_key, temperature=0.1)
    except Exception as e:
        # Report only the exception type so a key embedded in the message
        # cannot leak into the UI.
        st.sidebar.error(f"Could not initialise the AI engine ({type(e).__name__}).")
        return None
65
+
66
+
67
# Resolve the model once per script run; main() reads this module-level name.
llm = choose_llm()

# Tell the user in the sidebar whether a usable engine came back.
if not llm:
    st.sidebar.warning("Please choose an AI engine.")
else:
    st.sidebar.success("AI Engine selected")
73
+
74
+
75
+
76
def _load_uploaded_documents(file, file_extension):
    """Load one uploaded file into a list of LangChain documents.

    The upload is written to a temporary file because the loaders take a
    path. The temporary file is removed once loading has finished. For CSV
    uploads the table is additionally previewed in the sidebar.

    Args:
        file: An uploaded-file object exposing ``name`` and ``getvalue()``.
        file_extension: Lower-cased extension including the leading dot.

    Returns:
        The loaded documents, or an empty list for an unsupported extension.
    """
    if file_extension not in (".pdf", ".csv", ".docx"):
        return []

    # delete=False so the loader can reopen the path after this handle closes.
    with tempfile.NamedTemporaryFile(delete=False, suffix=file_extension) as tmp_file:
        tmp_file.write(file.getvalue())
        tmp_file_path = tmp_file.name

    try:
        if file_extension == ".pdf":
            loader = PyPDFLoader(file_path=tmp_file_path)
        elif file_extension == ".csv":
            loader = CSVLoader(file_path=tmp_file_path, encoding="utf-8", csv_args={
                'delimiter': ','})
        else:
            loader = UnstructuredWordDocumentLoader(file_path=tmp_file_path)
        documents = loader.load()

        if file_extension == ".csv":
            st.sidebar.header(f"Data-{file.name}")
            data1 = pd.read_csv(tmp_file_path)
            st.sidebar.dataframe(data1)
        return documents
    finally:
        # Best-effort cleanup: a leftover temp file must not fail the upload.
        try:
            os.remove(tmp_file_path)
        except OSError:
            pass


@st.cache_resource(show_spinner=False)
def processing_csv_pdf_docx(uploaded_file):
    """Embed the uploaded PDF/CSV/DOCX files into a FAISS vector store.

    Args:
        uploaded_file: Iterable of uploaded-file objects (``name`` and
            ``getvalue()`` are used).

    Returns:
        A ``FAISS`` vector store built from 2000-character chunks (overlap 50)
        embedded with ``sentence-transformers/all-MiniLM-L6-v2``.

    Raises:
        ValueError: If no text chunks could be produced from the uploads.
    """
    with st.spinner(text="Embedding Your Files"):
        text_splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=50)

        data = []
        for file in uploaded_file:
            # Compare case-insensitively so "REPORT.PDF" is not silently skipped.
            file_extension = os.path.splitext(file.name)[1].lower()
            documents = _load_uploaded_documents(file, file_extension)
            data += text_splitter.split_documents(documents)

        if not data:
            # An empty list would otherwise fail inside the index builder
            # with an unhelpful error.
            raise ValueError("No text could be extracted from the uploaded file(s).")

        # Embeddings are computed locally with a HuggingFace sentence-transformer.
        embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

        # Create a FAISS index from the chunks and their embeddings.
        vectorstore = FAISS.from_documents(data, embeddings)
        return vectorstore
136
+
137
+
138
+
139
# --- File upload ------------------------------------------------------------
with st.sidebar:
    uploaded_file = st.file_uploader("Upload your files",
                                     help="Multiple Files are Supported",
                                     type=['pdf', 'docx', 'csv'], accept_multiple_files=True)

if not uploaded_file:
    st.warning("Upload your file(s) to start chatting!")

# --- Session state ----------------------------------------------------------
if 'history' not in st.session_state:
    st.session_state['history'] = []

# The button call comes first so the widget is rendered on every run; with
# the membership test first, `or` short-circuits on the very first run and
# the button is not drawn.
if st.sidebar.button("Clear conversation history") or "messages" not in st.session_state:
    st.session_state["messages"] = []

# --- Credits ----------------------------------------------------------------
st.sidebar.subheader('Created by Engr. Muhammad Asadullah')

# Adding links to social accounts
st.sidebar.markdown("[LinkedIn](https://www.linkedin.com/in/asad18/)")
st.sidebar.markdown("[GitHub](https://github.com/TechAsad)")
st.sidebar.markdown("[Fiverr](https://www.fiverr.com/promptengr?source=gig_page&gigs=slug%3Acreate-streamlit-and-gradio-web-apps-for-ai-and-data-analysis%2Cpckg_id%3A1&is_choice=true)")
st.sidebar.markdown("[Website](https://tenlancer.com/)")
165
+ ########--Save PDF--########
166
+
167
+
168
+
169
# Lower-cased phrases that get a canned reply instead of a model call.
_GREETINGS = frozenset({
    'hi', 'hello', 'hey', 'helloo', 'hellooo', 'g morning', 'gmorning', 'good morning', 'morning',
    'good day', 'good afternoon', 'good evening', 'greetings', 'greeting', 'good to see you',
    'its good seeing you', 'how are you', "how're you", 'how are you doing', "how ya doin'", 'how ya doin',
    'how is everything', 'how is everything going', "how's everything going", 'how is you', "how's you",
    'how are things', "how're things", 'how is it going', "how's it going", "how's it goin'", "how's it goin",
    'how is life been treating you', "how's life been treating you", 'how have you been', "how've you been",
    'what is up', "what's up", 'what is cracking', "what's cracking", 'what is good', "what's good",
    'what is happening', "what's happening", 'what is new', "what's new", 'what is neww', "g’day", 'howdy',
})
_COMPLIMENTS = frozenset({
    'thank you', 'thanks', 'thanks a lot', 'thanks a bunch', 'great', 'ok', 'ok thanks', 'okay',
    'awesome', 'nice',
})

# Prompt for document-grounded answers; filled with the retrieved context,
# the running conversation and the current question.
_DOCUMENT_QA_TEMPLATE = dedent(r"""
    You are a helpful assistant to help user find information from his documents.
    talk humbly. Answer the question from the provided context. Do not answer from your own training data.
    Use the following pieces of context to answer the question at the end.
    If you don't know the answer, just say that you don't know. Do not makeup any answer.
    Do not answer hypothetically. Do not answer in more than 100 words.
    Please Do Not say: "Based on the provided context"
    Always use the context to find the answer.

    this is the context from study material:
    ---------
    {context}
    ---------

    Current Conversation:
    ---------
    {chat_history}
    ---------

    Question:
    {question}

    Helpful Answer:
    """)


def _build_memory(messages):
    """Replay stored chat turns into a fresh ConversationBufferMemory.

    Each assistant message is paired with the most recent user message that
    precedes it. User messages that never received an answer (for example
    after a failed turn) are skipped, so later pairs stay aligned.
    """
    memory = ConversationBufferMemory(memory_key="chat_history", input_key="question",
                                      human_prefix="", ai_prefix="")
    pending_user = None
    for msg in messages:
        if msg["role"] == "user":
            pending_user = msg
        elif pending_user is not None:
            user = f'{pending_user["role"]}: {pending_user["content"]}'
            ai = f'{msg["role"]}: {msg["content"]}'
            memory.save_context({"question": user}, {"output": ai})
            pending_user = None
    return memory


def _detect_language(text):
    """Return the language code detected for *text*, or 'en' if detection fails."""
    from langdetect.lang_detect_exception import LangDetectException

    try:
        return detect(text)
    except LangDetectException:
        return 'en'


def _synthesize_speech(text, lang='en'):
    """Return a BytesIO holding the gTTS (MP3) rendering of *text*.

    Falls back to English when gTTS rejects *lang* as unsupported.
    """
    try:
        speech = gTTS(text=text, lang=lang, slow=False)
    except ValueError:
        speech = gTTS(text=text, lang='en', slow=False)
    audio_buffer = BytesIO()
    speech.write_to_fp(audio_buffer)
    audio_buffer.seek(0)
    return audio_buffer


def main():
    """Render the chat: embed uploads, replay history, answer the new prompt.

    Uses the module-level ``use_google``, ``use_openai``, ``openai_api_key``,
    ``uploaded_file`` and ``llm``. Greetings and thanks get a canned reply.
    With uploaded files the answer comes from a "stuff" QA chain over the five
    most similar chunks; otherwise the model is asked directly. Every
    assistant reply is stored in ``st.session_state.messages`` together with
    its synthesized audio. Any exception is caught and reported in the page.
    """
    try:
        if not ((use_openai and openai_api_key) or use_google):
            return

        db = None
        if uploaded_file:
            db = processing_csv_pdf_docx(uploaded_file)
            for file in uploaded_file:
                st.success(f'Your File: {file.name} is Embedded', icon="✅")

        # Replay earlier turns, keeping text and audio in the same bubble.
        for msg in st.session_state.messages:
            with st.chat_message(msg["role"]):
                st.write(msg["content"])
                if msg["role"] == "Assistant" and msg.get("audio_content") is not None:
                    st.audio(msg["audio_content"], format='audio/mp3')

        prompt = st.chat_input(placeholder="Type your question!")
        if not prompt:
            return

        st.session_state.messages.append({"role": "user", "content": prompt})
        st.chat_message("user").write(prompt)
        memory = _build_memory(st.session_state.messages)

        normalized = prompt.strip().lower()
        is_small_talk = normalized in _GREETINGS or normalized in _COMPLIMENTS
        if not is_small_talk and not llm:
            # No usable model (e.g. both engines ticked): only canned replies work.
            st.warning("Please choose an AI engine.")
            return

        with st.chat_message("Assistant"):
            lang = 'en'
            if normalized in _GREETINGS:
                response = 'Hi, how are you? I am here to help you get information from your file. How can I assist you?'
            elif normalized in _COMPLIMENTS:
                response = 'My pleasure! If you have any more questions, feel free to ask.'
            elif uploaded_file:
                with st.spinner('Bot is typing ...'):
                    qa_prompt = PromptTemplate(
                        template=_DOCUMENT_QA_TEMPLATE,
                        input_variables=["context", "question", "chat_history"],
                    )
                    chain = load_qa_chain(llm=llm, verbose=True, prompt=qa_prompt,
                                          memory=memory, chain_type="stuff")
                    docs = db.similarity_search(prompt, k=5, fetch_k=10)
                    response = chain.run(input_documents=docs, question=prompt)
                lang = _detect_language(response)
            else:
                with st.spinner('Bot is typing ...'):
                    prompt_chat = ChatPromptTemplate.from_template("you are a helpful assistant, Answer the question with your knowledge.\n\n current conversation: {chat_history} \n\n Question: {question} \n\n Answer:")
                    chain = prompt_chat | llm
                    # Pass the conversation text, not the memory object itself.
                    chat_history = memory.load_memory_variables({})["chat_history"]
                    response = chain.invoke({"chat_history": chat_history, "question": prompt}).content
                lang = _detect_language(response)

            audio_buffer = _synthesize_speech(response, lang)
            st.session_state.messages.append(
                {"role": "Assistant", "content": response, "audio_content": audio_buffer}
            )

            st.write(response)
            # gTTS produces MP3 data.
            st.audio(audio_buffer, format='audio/mp3')

    except Exception as e:
        # Top-level boundary: keep the app alive and tell the user what to check.
        st.error("Sorry, there was a problem. A corrupted file or;")
        if use_google:
            st.error("Google PaLM AI only take English Data and Questions. Or the AI could not find the answer in your provided document.")
        elif use_openai:
            st.error("Please check your OpenAI API key")
        st.caption(f"Error type: {type(e).__name__}")
331
+
332
+
333
+
334
# CSS injected into the page to hide Streamlit's main menu and footer.
hide_streamlit_style = (
    "\n"
    "<style>\n"
    "#MainMenu {visibility: hidden;}\n"
    "footer {visibility: hidden;}\n"
    "</style>\n"
)
st.markdown(hide_streamlit_style, unsafe_allow_html=True)


# Entry point: run the chat UI when the file is executed as a script.
if __name__ == '__main__':
    main()
345
+
346
+
347
+
348
+
requirements.txt ADDED
@@ -0,0 +1,99 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ aiohttp==3.9.1
2
+ aiosignal==1.3.1
3
+ altair==5.2.0
4
+ annotated-types==0.6.0
5
+ anyio==4.2.0
6
+ attrs==23.2.0
7
+ blinker==1.7.0
8
+ cachetools==5.3.2
9
+ certifi==2023.11.17
10
+ charset-normalizer==3.3.2
11
+ click==8.1.7
12
+ dataclasses-json==0.6.3
13
+ distro==1.9.0
14
+ frozenlist==1.4.1
15
+ gitdb==4.0.11
16
+ GitPython==3.1.41
17
+ google-ai-generativelanguage==0.4.0
18
+ google-api-core==2.15.0
19
+ google-auth==2.27.0
20
+ google-generativeai==0.3.2
21
+ googleapis-common-protos==1.62.0
22
+ greenlet==3.0.3
23
+ grpcio==1.60.0
24
+ grpcio-status==1.60.0
25
+ gtts
26
+ h11==0.14.0
27
+ httpcore==1.0.2
28
+ httpx==0.26.0
29
+ idna==3.6
30
+ importlib-metadata==7.0.1
31
+ Jinja2==3.1.3
32
+ jsonpatch==1.33
33
+ jsonpointer==2.4
34
+ jsonschema==4.21.1
35
+ jsonschema-specifications==2023.12.1
36
+ langchain==0.1.4
37
+ langchain-community==0.0.16
38
+ langchain-core==0.1.16
39
+ langchain-openai==0.0.5
40
+ langdetect
41
+ langsmith==0.0.83
42
+ lxml==5.1.0
43
+ markdown-it-py==3.0.0
44
+ MarkupSafe==2.1.4
45
+ marshmallow==3.20.2
46
+ mdurl==0.1.2
47
+ multidict==6.0.4
48
+ mypy-extensions==1.0.0
49
+ numpy==1.26.3
50
+ openai==1.10.0
51
+ packaging==23.2
52
+ pandas==2.2.0
53
+ pillow==10.2.0
54
+ proto-plus==1.23.0
55
+ protobuf==4.25.2
56
+ pyarrow==15.0.0
57
+ pyasn1==0.5.1
58
+ pyasn1-modules==0.3.0
59
+ pydantic==2.5.3
60
+ pydantic_core==2.14.6
61
+ pydeck==0.8.1b0
62
+ Pygments==2.17.2
63
+ pypdf==4.0.0
64
+ PyPDF2==3.0.1
65
+ python-dateutil==2.8.2
66
+ python-docx==1.1.0
67
+ pytz==2023.3.post1
68
+ PyYAML==6.0.1
69
+ referencing==0.32.1
70
+ regex==2023.12.25
71
+ requests==2.31.0
72
+ rich==13.7.0
73
+ rpds-py==0.17.1
74
+ rsa==4.9
75
+ six==1.16.0
76
+ smmap==5.0.1
77
+ sniffio==1.3.0
78
+ SQLAlchemy==2.0.25
79
+ streamlit==1.30.0
80
+ tenacity==8.2.3
81
+ tiktoken==0.5.2
82
+ toml==0.10.2
83
+ toolz==0.12.1
84
+ tornado==6.4
85
+ tqdm==4.66.1
86
+ typing-inspect==0.9.0
87
+ typing_extensions==4.9.0
88
+ tzdata==2023.4
89
+ tzlocal==5.2
90
+ urllib3==2.1.0
91
+ validators==0.22.0
92
+ yarl==1.9.4
93
+ zipp==3.17.0
94
+ sentence-transformers
95
+ unstructured
96
+ faiss-cpu
97
+ pycryptodome==3.15.0
98
+ unstructured[pdf]
99
+ cryptography>=3.1