Update app.py
app.py
CHANGED
@@ -6,6 +6,7 @@ from sentence_transformers import SentenceTransformer, util
 from openai import OpenAI
 from datetime import datetime
 import pandas as pd
+import numpy as np
 import os
 import time
 import json
@@ -33,26 +34,21 @@ index = pc.Index(name=INDEX_NAME)
 system_instructions_text='''
 Your task is to extract the answer to a question from a body of text provided to you.
 The body of text will be enclosed within the delimiter tags <text> and </text>
-
 For example,
 <text> General Preparation Tips for VARC Section:
-
 You need to develop an incessant habit of speed reading.
 Start with reading newspapers, editorials, fiction and nonfiction novels and simple passages.
 The more you read, the faster you read. Learn the basic grammar concepts like parts of speech, articles, verbs, adjectives, tenses, auxiliary verbs, modifiers, modals etc.
 Revise at least 50 new words every day
 </text>
-
 Question: What are some tips for preparing for VARC?
 Here are some tips for preparing for the VARC section:
 1. develop an incessant habit of speed reading
 2. Start reading newspapers, editorials, fiction and nonfiction novels
 3. Learn basic grammar concepts\n
 4. Revise at least 50 new words a day
-
 Question: How many new words are to be learnt in a day?
 It is advised that 50 new words are learnt every day
-
 Your response should be based on the information contained in the provided text and should not include any other sources.
 If you are unable to answer the question from the text provided, please respond " Sorry. I do not have enough information to answer this"
 Do not repeat the question. Do not make a pointed reference to the text provided. Directly answer the question
@@ -66,7 +62,6 @@ json_instructions='''
 ]
 You need to check which content is most appropriate to answer the question and prepare
 an answer based on the content
-
 For example,
 [
 { "id":1 , "content" : "General Preparation Tips for Verbal Section:\n
@@ -80,15 +75,12 @@ json_instructions='''
 Learn the basic concepts like arithmetic, geometry, numbers, probability, etc.
 Solve at least 50 new problems every day"}
 ]
-
 Question: What are some tips for preparing for Verbal exam?
 Here are some tips for preparing for the VARC section:
 1. develop an incessant habit of speed reading
 2. Start reading newspapers, editorials, fiction and nonfiction novels
 3. Learn basic grammar concepts\n
 4. Revise at least 50 new words a day
-
-
 Your response should be based on the information contained in the provided content in the JSON and should not include any other sources.
 If you are unable to answer the question from the content provided, please respond " Sorry. I do not have enough information to answer this"
 Do not repeat the question. Do not make a pointed reference to the content provided. Directly answer the question
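json_instructions expects the user message to be a JSON list of candidate chunks, which is what the app later builds with json.dumps(top_2). Purely for illustration (the URLs, scores and text below are made up), the payload sent in "two passage" mode has roughly this shape:

```python
import json

# Hypothetical example of the serialized payload; in the app each entry comes
# from resplist and carries the blended 'score' plus 'url' and 'content'.
top_2 = [
    {"url": "https://example.com/varc-tips", "score": 0.62,
     "content": "General Preparation Tips for VARC Section: ..."},
    {"url": "https://example.com/quant-tips", "score": 0.48,
     "content": "General Preparation Tips for Quant Section: ..."},
]
json_data = json.dumps(top_2)
```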
@@ -96,16 +88,15 @@ json_instructions='''
 
 def get_meta_score(url,question_embedding):
     qry = index.fetch(ids=[url], namespace="meta")
-
-
-
-
-
-
-
-
-
-    return 0
+
+    emb=qry['vectors'][url]['values']
+    vector1 = np.array(emb).reshape(1, -1)  # Reshape to ensure compatibility with sklearn
+    vector2 = question_embedding.numpy().reshape(1, -1)
+
+    # Calculate cosine similarity
+    cosine_scores = util.cos_sim(question_embedding, emb)
+    return cosine_scores.item()
+
 
 def query_from_pinecone(index,namespace, question_embedding, top_k=3):
     # get embedding from THE SAME embedder as the documents
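The rewritten get_meta_score fetches the stored page-level vector for the matched URL from the "meta" namespace and scores it against the question embedding with sentence-transformers' util.cos_sim (the reshaped vector1/vector2 arrays are computed but not used afterwards). For reference, a plain-NumPy sketch of the same cosine computation, assuming two equal-length 1-D vectors:

```python
import numpy as np

def cosine_similarity(a, b):
    # NumPy-only equivalent of util.cos_sim(question_embedding, emb).item()
    # for two 1-D vectors; illustrative sketch, not part of the commit.
    a = np.asarray(a, dtype=np.float32)
    b = np.asarray(b, dtype=np.float32)
    return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))
```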
@@ -140,16 +131,21 @@ def write_log(query,url, score, ans, ts):
         f.write(buffer)
 
     return
-
+
 logfile='querylog.csv'
 try:
-    df_log=pd.read_csv(
+    df_log = pd.read_csv("hf://datasets/sujitb/data/querylog.csv", encoding="utf-8")
 except:
     df_log=pd.DataFrame(columns=['query','url','score','ans', 'ts'])
 
-
-
 st.title('CLLM Answering Machine')
+with st.sidebar:
+    st.markdown('*Search History*')
+    st.write('# Queries', len(df_log))
+    qrylist = df_log['query'].tail(10).tolist()
+    for q in qrylist[::-1]:
+        st.write(q)
+
 
 # Initialize chat history
 if "messages" not in st.session_state:
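The log bootstrap now reads querylog.csv straight from the sujitb/data dataset and falls back to an empty frame on any failure, and the new sidebar lists the ten most recent queries from it. A minimal sketch of that load-or-initialize step, using load_query_log as a hypothetical helper name (the app does this inline with a bare except):

```python
import pandas as pd

LOG_COLUMNS = ['query', 'url', 'score', 'ans', 'ts']

def load_query_log(path="hf://datasets/sujitb/data/querylog.csv"):
    # hf:// paths are resolved through huggingface_hub's fsspec filesystem,
    # so that package must be installed (and a token set for private datasets).
    try:
        return pd.read_csv(path, encoding="utf-8")
    except Exception:
        # Missing or unreadable file: start with an empty log.
        return pd.DataFrame(columns=LOG_COLUMNS)
```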
@@ -175,8 +171,10 @@ if QUESTION:
     # Add user message to chat history
     st.session_state.messages.append({"role": "user", "content": QUESTION})
 
+    st.write('Searching knowledgebase...')
     question_embedding = bi_encoder.encode(QUESTION, convert_to_tensor=True)
 
+    THRESHOLD=.4
    ns='webpages'
    ns='full'
    resp= query_from_pinecone(index,ns, question_embedding.tolist(), 10)
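The THRESHOLD=.4 introduced here is what the next hunk compares the best blended score against; note that ns='webpages' is immediately overwritten by ns='full', so only the full namespace is queried. query_from_pinecone itself is not changed by this commit; as rough orientation only, a wrapper like it is commonly written against the Pinecone client as:

```python
def query_from_pinecone(index, namespace, question_embedding, top_k=3):
    # Hedged sketch of a typical implementation; the real body is not shown
    # in this diff. The call site passes question_embedding.tolist(), i.e. a
    # plain list of floats.
    results = index.query(
        vector=question_embedding,
        namespace=namespace,
        top_k=top_k,
        include_metadata=True,
    )
    return results.matches
```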
@@ -191,41 +189,42 @@ if QUESTION:
         meta_score= get_meta_score(r['id'],question_embedding)
         score=.5* r['score'] + .5*meta_score
         d['score']=score
-        st.write(d['url'], score, r['score'], meta_score)
+        #st.write(d['url'], score, r['score'], meta_score)
         resplist.append(d)
 
     if len(resplist)>0:
         sorted_indices = sorted(range(len(resplist)), key=lambda i: resplist[i]['score'], reverse=True)
-
+
         # Get the elements with the top 2 highest values
         top_2 = [resplist[i] for i in sorted_indices[:2]]
-
+
         # convert to array
-
+
         json_data = json.dumps(top_2)
-
-
-        goodmatch=False
-        if resplist[sorted_indices[0]]['score']
+
+
+        goodmatch=False
+        if resplist[sorted_indices[0]]['score']>=THRESHOLD:
+            st.write('Preparing answers...')
             goodmatch=True
             mode = "two" # two passages
-
+
             client = OpenAI()
-
+
             if mode=="one":
                 instr=system_instructions_text
-
+
                 out= resplist[sorted_indices[0]]['content']
                 content="""
                 <text>
                 {}
                 </text>
                 """.format(out)
-
+
             if mode=="two":
                 instr=json_instructions
                 content=json_data
-
+
             response = client.chat.completions.create(
                 model="gpt-3.5-turbo",
                 messages=[
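Each hit is now re-scored as an even blend of its Pinecone score and the page-level meta score, the per-hit debug st.write is commented out, and the old hard-coded cut-off becomes a comparison against THRESHOLD. The ranking step amounts to the following sketch, with top_two as a hypothetical helper (the app keeps it inline via sorted_indices):

```python
def top_two(resplist):
    # Each resplist entry carries 'url', 'content' and the blended 'score'
    # (0.5 * pinecone score + 0.5 * meta score) computed in the loop above.
    ranked = sorted(resplist, key=lambda d: d['score'], reverse=True)
    return ranked[:2]
```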
@@ -234,11 +233,11 @@ if QUESTION:
                 {"role": "user", "content": "Question:"+QUESTION}
                 ]
             )
-
+
             ans= response.choices[0].message.content
         else:
             ans='Weak match to your query. Please try reframing your question'
-
+
         #st.write("Matched URL:{} Score:{}".format(url,score))
         testing = False
         if testing:
@@ -246,27 +245,28 @@ if QUESTION:
             st.write("2nd Matched URL:{} Score:{}".format(resp[1]['id'],resp[1]['score']))
             if len(resp)>=2:
                 st.write("3rd Matched URL:{} Score:{}".format(resp[2]['id'],resp[2]['score']))
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        ## Send RESPONSE
+        with st.chat_message("assistant"):
+            response = st.write_stream(response_generator(ans))
+            if goodmatch:
+                st.write('Resources:')
+                st.write(top_2[0]['url'])
+                st.write(top_2[1]['url'])
+        # Add assistant response to chat history
+        st.session_state.messages.append({"role": "assistant", "content": response})
+        #st.write(ans)
+
+        #st.write(' ----------------------')
+        #st.write(out)
+
+        now= str(datetime.utcnow())
+        url = top_2[0]['url'] + ' ; '+top_2[1]['url']
+        df_log.loc[len(df_log)]=[QUESTION,url,score,ans,now]
+        write_log(QUESTION,url, score, ans, now)
+        #df.to_csv("hf://datasets/sujitb/data/test.csv")
+        #df_log.to_csv("hf://datasets/sujitb/data/"+logfile)
+
+    else: ## Zero response from pinecone query
         #st.write("No matches for query")
         ans= "No matches for query"
         response = st.write_stream(response_generator(ans))
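The answer is now streamed inside a st.chat_message("assistant") block via st.write_stream, with the two source URLs shown whenever the match cleared the threshold, and the query is appended to both df_log and the on-disk log. response_generator is referenced but not defined in this diff; a typical word-by-word generator for st.write_stream would look like the following (an assumption about its shape, not the committed code):

```python
import time

def response_generator(answer: str):
    # Yield the answer one word at a time so st.write_stream renders it
    # with a typing effect.
    for word in answer.split():
        yield word + " "
        time.sleep(0.02)
```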
@@ -276,4 +276,4 @@ if QUESTION:
         now= str(datetime.utcnow())
         df_log.loc[len(df_log)]=[QUESTION,'No match',0,'-',now]
         #df_log.to_csv("hf://datasets/sujitb/data/"+logfile)
-        write_log(QUESTION,
+        write_log(QUESTION,'No match', 0, '-', now)
|