sujitb committed on
Commit
1f313e7
1 Parent(s): 0e38166

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +60 -60
app.py CHANGED
@@ -6,6 +6,7 @@ from sentence_transformers import SentenceTransformer, util
6
  from openai import OpenAI
7
  from datetime import datetime
8
  import pandas as pd
 
9
  import os
10
  import time
11
  import json
@@ -33,26 +34,21 @@ index = pc.Index(name=INDEX_NAME)
33
  system_instructions_text='''
34
  Your task is to extract the answer to a question from a body of text provided to you.
35
  The body of text will be enclosed within the delimiter tags <text> and </text>
36
-
37
  For example,
38
  <text> General Preparation Tips for VARC Section:
39
-
40
  You need to develop an incessant habit of speed reading.
41
  Start with reading newspapers, editorials, fiction and nonfiction novels and simple passages.
42
  The more you read, the faster you read. Learn the basic grammar concepts like parts of speech, articles,verbs, adjectives, tenses, auxiliary verbs, modifiers, modals etc.
43
  Revise at least 50 new words every day
44
  </text>
45
-
46
  Question: What are some tips for preparing for VARC?
47
  Here are some tips for preparing for the VARC section:
48
  1. develop an incessant habit of speed reading
49
  2. Start reading newspapers, editorials, fiction and nonfiction novels
50
  3. Learn basic grammar concepts\n
51
  4. Revise at least 50 new words a day
52
-
53
  Question: How many new words are to be learnt in a day?
54
  It is advised that 50 new words are learnt every day
55
-
56
  Your response should be based on the information contained in the provided text and should not include any other sources.
57
  If you are unable to answer the question from the text provided, please respond " Sorry. I do not have enough information to answer this"
58
  Do not repeat the question. Do not make a pointed reference to the text provided. Directly answer the question
@@ -66,7 +62,6 @@ json_instructions='''
66
  ]
67
  You need to check which content is most appropriate to answer the question and prepare
68
  an answer based on the content
69
-
70
  For example,
71
  [
72
  { "id":1 , "content" : "General Preparation Tips for Verbal Section:\n
@@ -80,15 +75,12 @@ json_instructions='''
80
  Learn the basic concepts like arithmetic, geometry, numbers, probability, etc.
81
  Solve at least 50 new problems every day"}
82
  ]
83
-
84
  Question: What are some tips for preparing for Verbal exam?
85
  Here are some tips for preparing for the VARC section:
86
  1. develop an incessant habit of speed reading
87
  2. Start reading newspapers, editorials, fiction and nonfiction novels
88
  3. Learn basic grammar concepts\n
89
  4. Revise at least 50 new words a day
90
-
91
-
92
  Your response should be based on the information contained in the provided content in the json and should not include any other sources.
93
  If you are unable to answer the question from the content provided, please respond " Sorry. I do not have enough information to answer this"
94
  Do not repeat the question. Do not make a pointed reference to the content provided. Directly answer the question
@@ -96,16 +88,15 @@ json_instructions='''
96
 
97
  def get_meta_score(url,question_embedding):
98
  qry = index.fetch(ids=[url], namespace="meta")
99
- try:
100
- emb=qry['vectors'][url]['values']
101
- vector1 = np.array(emb).reshape(1, -1) # Reshape to ensure compatibility with sklearn
102
- vector2 = question_embedding.numpy().reshape(1, -1)
103
-
104
- # Calculate cosine similarity
105
- cosine_scores = util.cos_sim(question_embedding, emb)
106
- return cosine_scores.item()
107
- except:
108
- return 0
109
 
110
  def query_from_pinecone(index,namespace, question_embedding, top_k=3):
111
  # get embedding from THE SAME embedder as the documents
@@ -140,16 +131,21 @@ def write_log(query,url, score, ans, ts):
140
  f.write(buffer)
141
 
142
  return
143
-
144
  logfile='querylog.csv'
145
  try:
146
- df_log=pd.read_csv(logfile, index_col=0)
147
  except:
148
  df_log=pd.DataFrame(columns=['query','url','score','ans', 'ts'])
149
 
150
-
151
-
152
  st.title('CLLM Answering Machine')
 
 
 
 
 
 
 
153
 
154
  # Initialize chat history
155
  if "messages" not in st.session_state:
@@ -175,8 +171,10 @@ if QUESTION:
175
  # Add user message to chat history
176
  st.session_state.messages.append({"role": "user", "content": QUESTION})
177
 
 
178
  question_embedding = bi_encoder.encode(QUESTION, convert_to_tensor=True)
179
 
 
180
  ns='webpages'
181
  ns='full'
182
  resp= query_from_pinecone(index,ns, question_embedding.tolist(), 10)
@@ -191,41 +189,42 @@ if QUESTION:
191
  meta_score= get_meta_score(r['id'],question_embedding)
192
  score=.5* r['score'] + .5*meta_score
193
  d['score']=score
194
- st.write(d['url'], score, r['score'], meta_score)
195
  resplist.append(d)
196
 
197
  if len(resplist)>0:
198
  sorted_indices = sorted(range(len(resplist)), key=lambda i: resplist[i]['score'], reverse=True)
199
-
200
  # Get the elements with the top 2 highest values
201
  top_2 = [resplist[i] for i in sorted_indices[:2]]
202
-
203
  # convert to array
204
-
205
  json_data = json.dumps(top_2)
206
-
207
-
208
- goodmatch=False
209
- if resplist[sorted_indices[0]]['score']>.5:
 
210
  goodmatch=True
211
  mode = "two" # two passages
212
-
213
  client = OpenAI()
214
-
215
  if mode=="one":
216
  instr=system_instructions_text
217
-
218
  out= resplist[sorted_indices[0]]['content']
219
  content="""
220
  <text>
221
  {}
222
  </text>
223
  """.format(out)
224
-
225
  if mode=="two":
226
  instr=json_instructions
227
  content=json_data
228
-
229
  response = client.chat.completions.create(
230
  model="gpt-3.5-turbo",
231
  messages=[
@@ -234,11 +233,11 @@ if QUESTION:
234
  {"role": "user", "content": "Question:"+QUESTION}
235
  ]
236
  )
237
-
238
  ans= response.choices[0].message.content
239
  else:
240
  ans='Weak match to your query. Please try reframing your question'
241
-
242
  #st.write("Matched URL:{} Score:{}".format(url,score))
243
  testing = False
244
  if testing:
@@ -246,27 +245,28 @@ if QUESTION:
246
  st.write("2nd Matched URL:{} Score:{}".format(resp[1]['id'],resp[1]['score']))
247
  if len(resp)>=2:
248
  st.write("3rd Matched URL:{} Score:{}".format(resp[2]['id'],resp[2]['score']))
249
-
250
- with st.chat_message("assistant"):
251
- response = st.write_stream(response_generator(ans))
252
- if goodmatch:
253
- st.write('Resources:'+top_2[0]['url'])
254
- st.write(top_2[1]['url'])
255
- # Add assistant response to chat history
256
- st.session_state.messages.append({"role": "assistant", "content": response})
257
- #st.write(ans)
258
-
259
- #st.write(' ----------------------')
260
- #st.write(out)
261
-
262
- now= str(datetime.utcnow())
263
- url = top_2[0]['url'] + ' ; '+top_2[1]['url']
264
- df_log.loc[len(df_log)]=[QUESTION,url,score,ans,now]
265
- write_log(QUESTION,url, score, ans, now)
266
- #df.to_csv("hf://datasets/sujitb/data/test.csv")
267
- #df_log.to_csv("hf://datasets/sujitb/data/"+logfile)
268
-
269
- else: ## Zero response from pinecone query
 
270
  #st.write("No matches for query")
271
  ans= "No matches for query"
272
  response = st.write_stream(response_generator(ans))
@@ -276,4 +276,4 @@ if QUESTION:
276
  now= str(datetime.utcnow())
277
  df_log.loc[len(df_log)]=[QUESTION,'No match',0,'-',now]
278
  #df_log.to_csv("hf://datasets/sujitb/data/"+logfile)
279
- write_log(QUESTION,url, score, ans, now)
 
6
  from openai import OpenAI
7
  from datetime import datetime
8
  import pandas as pd
9
+ import numpy as np
10
  import os
11
  import time
12
  import json
 
34
  system_instructions_text='''
35
  Your task is to extract the answer to a question from a body of text provided to you.
36
  The body of text will be enclosed within the delimiter tags <text> and </text>
 
37
  For example,
38
  <text> General Preparation Tips for VARC Section:
 
39
  You need to develop an incessant habit of speed reading.
40
  Start with reading newspapers, editorials, fiction and nonfiction novels and simple passages.
41
  The more you read, the faster you read. Learn the basic grammar concepts like parts of speech, articles,verbs, adjectives, tenses, auxiliary verbs, modifiers, modals etc.
42
  Revise at least 50 new words every day
43
  </text>
 
44
  Question: What are some tips for preparing for VARC?
45
  Here are some tips for preparing for the VARC section:
46
  1. develop an incessant habit of speed reading
47
  2. Start reading newspapers, editorials, fiction and nonfiction novels
48
  3. Learn basic grammar concepts\n
49
  4. Revise at least 50 new words a day
 
50
  Question: How many new words are to be learnt in a day?
51
  It is advised that 50 new words are learnt every day
 
52
  Your response should be based on the information contained in the provided text and should not include any other sources.
53
  If you are unable to answer the question from the text provided, please respond " Sorry. I do not have enough information to answer this"
54
  Do not repeat the question. Do not make a pointed reference to the text provided. Directly answer the question
 
62
  ]
63
  You need to check which content is most appropriate to answer the question and prepare
64
  an answer based on the content
 
65
  For example,
66
  [
67
  { "id":1 , "content" : "General Preparation Tips for Verbal Section:\n
 
75
  Learn the basic concepts like arithmetic, geometry, numbers, probability, etc.
76
  Solve at least 50 new problems every day"}
77
  ]
 
78
  Question: What are some tips for preparing for Verbal exam?
79
  Here are some tips for preparing for the VARC section:
80
  1. develop an incessant habit of speed reading
81
  2. Start reading newspapers, editorials, fiction and nonfiction novels
82
  3. Learn basic grammar concepts\n
83
  4. Revise at least 50 new words a day
 
 
84
  Your response should be based on the information contained in the provided content in the json and should not include any other sources.
85
  If you are unable to answer the question from the content provided, please respond " Sorry. I do not have enough information to answer this"
86
  Do not repeat the question. Do not make a pointed reference to the content provided. Directly answer the question
 
88
 
89
def get_meta_score(url, question_embedding):
    """Return the cosine similarity between the stored "meta" embedding for
    *url* and the encoded user question.

    Parameters
    ----------
    url : str
        Pinecone vector id (the page URL) looked up in the "meta" namespace.
    question_embedding :
        Question embedding produced by the bi-encoder (tensor-like, as
        accepted by ``util.cos_sim``).

    Returns
    -------
    float
        Cosine similarity score, or 0 when no meta vector exists for *url*.
    """
    qry = index.fetch(ids=[url], namespace="meta")
    try:
        emb = qry['vectors'][url]['values']
    except KeyError:
        # No stored meta embedding for this url — treat it as "no similarity"
        # instead of letting the whole request fail with a KeyError.
        return 0
    # util.cos_sim accepts the raw list of floats directly, so the old
    # vector1/vector2 numpy reshapes were dead code and are removed.
    cosine_scores = util.cos_sim(question_embedding, emb)
    return cosine_scores.item()
 
100
 
101
  def query_from_pinecone(index,namespace, question_embedding, top_k=3):
102
  # get embedding from THE SAME embedder as the documents
 
131
  f.write(buffer)
132
 
133
  return
134
+
135
  logfile='querylog.csv'
136
  try:
137
+ df_log = pd.read_csv("hf://datasets/sujitb/data/querylog.csv", encoding="utf-8")
138
  except:
139
  df_log=pd.DataFrame(columns=['query','url','score','ans', 'ts'])
140
 
 
 
141
  st.title('CLLM Answering Machine')
142
+ with st.sidebar:
143
+ st.markdown('*Search History*')
144
+ st.write('# Queries', len(df_log))
145
+ qrylist = df_log['query'].tail(10).tolist()
146
+ for q in qrylist[::-1]:
147
+ st.write(q)
148
+
149
 
150
  # Initialize chat history
151
  if "messages" not in st.session_state:
 
171
  # Add user message to chat history
172
  st.session_state.messages.append({"role": "user", "content": QUESTION})
173
 
174
+ st.write('Searching knowledgebase...')
175
  question_embedding = bi_encoder.encode(QUESTION, convert_to_tensor=True)
176
 
177
+ THRESHOLD=.4
178
  ns='webpages'
179
  ns='full'
180
  resp= query_from_pinecone(index,ns, question_embedding.tolist(), 10)
 
189
  meta_score= get_meta_score(r['id'],question_embedding)
190
  score=.5* r['score'] + .5*meta_score
191
  d['score']=score
192
+ #st.write(d['url'], score, r['score'], meta_score)
193
  resplist.append(d)
194
 
195
  if len(resplist)>0:
196
  sorted_indices = sorted(range(len(resplist)), key=lambda i: resplist[i]['score'], reverse=True)
197
+
198
  # Get the elements with the top 2 highest values
199
  top_2 = [resplist[i] for i in sorted_indices[:2]]
200
+
201
  # convert to array
202
+
203
  json_data = json.dumps(top_2)
204
+
205
+
206
+ goodmatch=False
207
+ if resplist[sorted_indices[0]]['score']>=THRESHOLD:
208
+ st.write('Preparing answers...')
209
  goodmatch=True
210
  mode = "two" # two passages
211
+
212
  client = OpenAI()
213
+
214
  if mode=="one":
215
  instr=system_instructions_text
216
+
217
  out= resplist[sorted_indices[0]]['content']
218
  content="""
219
  <text>
220
  {}
221
  </text>
222
  """.format(out)
223
+
224
  if mode=="two":
225
  instr=json_instructions
226
  content=json_data
227
+
228
  response = client.chat.completions.create(
229
  model="gpt-3.5-turbo",
230
  messages=[
 
233
  {"role": "user", "content": "Question:"+QUESTION}
234
  ]
235
  )
236
+
237
  ans= response.choices[0].message.content
238
  else:
239
  ans='Weak match to your query. Please try reframing your question'
240
+
241
  #st.write("Matched URL:{} Score:{}".format(url,score))
242
  testing = False
243
  if testing:
 
245
  st.write("2nd Matched URL:{} Score:{}".format(resp[1]['id'],resp[1]['score']))
246
  if len(resp)>=2:
247
  st.write("3rd Matched URL:{} Score:{}".format(resp[2]['id'],resp[2]['score']))
248
+ ## Send RESPONSE
249
+ with st.chat_message("assistant"):
250
+ response = st.write_stream(response_generator(ans))
251
+ if goodmatch:
252
+ st.write('Resources:')
253
+ st.write(top_2[0]['url'])
254
+ st.write(top_2[1]['url'])
255
+ # Add assistant response to chat history
256
+ st.session_state.messages.append({"role": "assistant", "content": response})
257
+ #st.write(ans)
258
+
259
+ #st.write(' ----------------------')
260
+ #st.write(out)
261
+
262
+ now= str(datetime.utcnow())
263
+ url = top_2[0]['url'] + ' ; '+top_2[1]['url']
264
+ df_log.loc[len(df_log)]=[QUESTION,url,score,ans,now]
265
+ write_log(QUESTION,url, score, ans, now)
266
+ #df.to_csv("hf://datasets/sujitb/data/test.csv")
267
+ #df_log.to_csv("hf://datasets/sujitb/data/"+logfile)
268
+
269
+ else: ## Zero response from pinecone query
270
  #st.write("No matches for query")
271
  ans= "No matches for query"
272
  response = st.write_stream(response_generator(ans))
 
276
  now= str(datetime.utcnow())
277
  df_log.loc[len(df_log)]=[QUESTION,'No match',0,'-',now]
278
  #df_log.to_csv("hf://datasets/sujitb/data/"+logfile)
279
+ write_log(QUESTION,'No match', 0, '-', now)