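# Streamlit chat app that answers questions about careerlauncher.com content.
# Flow: embed the user question with a SentenceTransformer bi-encoder, retrieve
# candidate passages from Pinecone, re-score them with page-level ("meta")
# embeddings, extract an answer with gpt-3.5-turbo, and fall back to Tavily web
# search on weak matches. Every query/answer pair is logged to a CSV dataset on
# the Hugging Face Hub.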
import streamlit as st
#from transformers import pipeline
from pinecone import Pinecone, ServerlessSpec
from sentence_transformers import SentenceTransformer, util
from openai import OpenAI
from datetime import datetime
import pandas as pd
import numpy as np
import os
import time
import json
from tavily import TavilyClient
from huggingface_hub import HfFileSystem
token = os.getenv('FILE_TOKEN')
fs = HfFileSystem(token=token)
api_key = os.environ["OPENAI_API_KEY"]  # fail fast if unset; OpenAI() reads this env var
bi_encoder = SentenceTransformer('msmarco-distilbert-base-v4')
bi_encoder.max_seq_length = 256 # Truncate long documents to 256 tokens
# Store the index as a variable
#INDEX_NAME = 'cl-search-idx'
INDEX_NAME = 'cl-kb'
pc_api_key = os.environ["clpine"]  # AWS
pc = Pinecone(api_key=pc_api_key)
index = pc.Index(name=INDEX_NAME)
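# The index stores chunk-level embeddings in the 'webpages' namespace and
# page/video-level embeddings in the 'meta' namespace, used for re-scoring below.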
system_instructions_text='''
Your task is to extract the answer to a question from a body of text provided to you.
The body of text will be enclosed within the delimiter tags <text> and </text>
For example,
<text> General Preparation Tips for VARC Section:
You need to develop an incessant habit of speed reading.
Start with reading newspapers, editorials, fiction and nonfiction novels and simple passages.
The more you read, the faster you read. Learn the basic grammar concepts like parts of speech, articles, verbs, adjectives, tenses, auxiliary verbs, modifiers, modals, etc.
Revise at least 50 new words every day
</text>
Question: What are some tips for preparing for VARC?
Here are some tips for preparing for the VARC section:
1. develop an incessant habit of speed reading
2. Start reading newspapers, editorials, fiction and nonfiction novels
3. Learn basic grammar concepts\n
4. Revise at least 50 new words a day
Question: How many new words are to be learnt in a day?
It is advised that 50 new words are learned every day.
Your response should be based on the information contained in the provided text and should not include any other sources.
If you are unable to answer the question from the text provided, please respond "Sorry. I do not have enough information to answer this"
Do not repeat the question. Do not make a pointed reference to the text provided. Directly answer the question.
'''
json_instructions='''
Your task is to extract the answer to a question from a body of text provided to you in a json array.
The json will contain two pieces of content in this format:
[
{"id":1 , "content": " first content"},
{"id":2 , "content": " second content"}
]
You need to check which content is most appropriate to answer the question and prepare an answer based on that content.
For example,
[
{ "id":1 , "content" : "General Preparation Tips for Verbal Section:\n
You need to develop an incessant habit of speed reading.
Start with reading newspapers, editorials, fiction and nonfiction novels and simple passages.
The more you read, the faster you read. Learn the basic grammar concepts like parts of speech, articles, verbs, adjectives, tenses, auxiliary verbs, modifiers, modals, etc.
Revise at least 50 new words every day"},
{ "id":2 , "content" : "General Preparation Tips for Quantitative Section:\n
You need to develop speed in solving math problems.
Start with reading funda books, math text books.
Learn the basic concepts like arithmetic, geometry, numbers, probability, etc.
Solve at least 50 new problems every day"}
]
Question: What are some tips for preparing for Verbal exam?
Here are some tips for preparing for the Verbal section:
1. develop an incessant habit of speed reading
2. Start reading newspapers, editorials, fiction and nonfiction novels
3. Learn basic grammar concepts\n
4. Revise at least 50 new words a day
Do not repeat the question. Do not make a pointed reference to the content provided. Directly answer the question.
Your response should be based on the information contained in the provided content in the json and should not include any other sources.
If you are unable to answer the question from the content provided, please respond "Sorry. I do not have enough information to answer this"
'''
def get_meta_score(url, question_embedding):
    # Fetch the page-level ("meta") embedding stored under this URL
    qry = index.fetch(ids=[url], namespace="meta")
    emb = qry['vectors'][url]['values']
    # Cosine similarity between the question and the page-level embedding
    cosine_scores = util.cos_sim(question_embedding, emb)
    return cosine_scores.item()

def query_from_pinecone(index, namespace, question_embedding, top_k=3):
    # The query embedding must come from THE SAME embedder as the documents
    return index.query(
        vector=question_embedding,
        top_k=top_k,
        namespace=namespace,
        include_metadata=True  # gets the metadata (dates, text, etc)
    ).get('matches')
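# Each match returned above carries 'id', 'score', and the stored 'metadata'
# (page text under 'data', plus 'title' and 'url'), which the main loop below consumes.
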
def response_generator(response):
    # Stream the answer word by word for a typing effect
    for word in response.split():
        yield word + " "
        time.sleep(0.05)

def write_log(query, url, score, ans, ts):
    # Construct the new row; escape embedded double quotes for CSV
    score = str(score)
    ans = ans.replace('"', '""')
    new_row = f'\n"{query}","{url}",{score},"{ans}","{ts}"'
    # Read the existing log, append the new row, and write the whole buffer back
    with fs.open("datasets/sujitb/data/querylog.csv", "r") as f:
        buffer = f.read()
    buffer += new_row
    with fs.open("datasets/sujitb/data/querylog.csv", "w", encoding="utf-8") as f:
        f.write(buffer)
    return
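# NOTE: write_log is kept for reference but currently unused; its call sites
# below are commented out and logging goes through df_log.to_csv instead.
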
logfile='querylog.csv'
qlist=[
'What are the best books for VARC in CAT?',
'What is the XAT exam pattern? How many sections ? How many questions are asked in each section?',
'I want to know about Personalized coaching for IGSE/IB',
'Which IIMs accept admissions under the IPM exam?',
'What topics are covered under CAT exam syllabus?',
'What is the pattern of the IPM exam?',
'Which Central Universities offer courses for CUET exam',
'For CAT preparation which is better - online classes or classroom program?',
'What programs are offered under CUET exam by Central University of Jharkhand?',
'What is the pattern of the IPM exam?',
'When is the CAT 2024 exam going to be held?',
'What are program benefits of the MBA 2024 Online Classes?',
'What topics are covered in CUET General Test?',
'IIM A B C vs FMS - how to select the best bschool?'
]
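# qlist above is a static list of sample queries shown in the sidebar; pulling
# the last 30 real queries from df_log is left commented out in the sidebar block.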
try:
    df_log = pd.read_csv("hf://datasets/sujitb/data/querylog.csv", encoding="utf-8", index_col=0)
except Exception:
    df_log = pd.DataFrame(columns=['query', 'url', 'score', 'ans', 'ts'])
#st.title('CLLM Answering Machine')
st.subheader('CLLM Answering Machine', divider='rainbow')
with st.sidebar:
    dispstr = 'Search History'
    st.markdown('*{}*'.format(dispstr))
    #st.write('Past Queries')
    #qlist = df_log.tail(30)['query'].tolist()
    for q in qlist[::-1]:
        st.write(q)
# Initialize chat history
if "messages" not in st.session_state:
st.session_state.messages = []
# Display chat messages from history on app rerun
for message in st.session_state.messages:
    with st.chat_message(message["role"]):
        st.markdown(message["content"])
#with st.chat_message("user"):
# st.write("Hello 👋 Ask any question related to careerlauncher.com in the text box below")
QUESTION = st.chat_input('Ask a question - e.g. How to prepare for Verbal section for CAT?')
#QUESTION=st.text_area('Ask a question -e.g How to prepare for Verbal section for CAT?') ##' How to prepare for Verbal section ?'
score = 0
testing = True
ext_url = ''
if QUESTION:
    with st.chat_message("user"):
        st.markdown(QUESTION)
    # Add user message to chat history
    st.session_state.messages.append({"role": "user", "content": QUESTION})

    st.write('Searching knowledge base...')
    question_embedding = bi_encoder.encode(QUESTION, convert_to_tensor=True)
    THRESHOLD = .4
    ns = 'webpages'
    #ns='full'
    resp = query_from_pinecone(index, ns, question_embedding.tolist(), 10)
    resplist = []
    id = 0
    for r in resp:
        id += 1
        d = {}
        d['id'] = id
        d['content'] = r['metadata']['data']
        d['title'] = r['metadata']['title']
        d['url'] = r['id']
        if ns == 'webpages':
            d['url'] = r['metadata']['url']
        meta_score = get_meta_score(d['url'], question_embedding)
        score = .5 * r['score'] + .5 * meta_score
        d['score'] = score
        #st.write(d['url'], score, r['score'], meta_score)
        resplist.append(d)
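    # Each chunk hit gets a hybrid score: the average of the chunk-level cosine
    # similarity returned by Pinecone and the page-level ("meta") similarity
    # computed by get_meta_score.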
    # Also check the 'meta' namespace for strong YouTube matches
    respmeta = query_from_pinecone(index, 'meta', question_embedding.tolist(), 5)
    for r in respmeta:
        if 'youtube' in r['id'] and r['score'] >= .8:
            d = {}  # fresh dict; reusing the one from the loop above would mutate an entry already in resplist
            d['id'] = r['id']
            d['content'] = r['metadata']['data']
            d['title'] = r['metadata']['title']
            d['url'] = r['metadata']['url']
            d['score'] = r['score']
            resplist.append(d)
    if len(resplist) > 0:
        sorted_indices = sorted(range(len(resplist)), key=lambda i: resplist[i]['score'], reverse=True)
        # Get the elements with the top 2 highest scores
        top_2 = [resplist[i] for i in sorted_indices[:2]]
        score = top_2[0]['score']  # log the best score below, not the last one computed in the loop
        # Convert to a json array for the LLM prompt
        json_data = json.dumps(top_2)
        goodmatch = False
        if resplist[sorted_indices[0]]['score'] >= THRESHOLD:
            st.write('Preparing answers...')
            goodmatch = True
            mode = "two"  # two passages
            client = OpenAI()
            if mode == "one":
                instr = system_instructions_text
                out = resplist[sorted_indices[0]]['content']
                content = """
                <text>
                {}
                </text>
                """.format(out)
            if mode == "two":
                instr = json_instructions
                content = json_data
            response = client.chat.completions.create(
                model="gpt-3.5-turbo",
                messages=[
                    {"role": "system", "content": instr},
                    {"role": "user", "content": content},
                    {"role": "user", "content": "Question:" + QUESTION}
                ]
            )
            ans = response.choices[0].message.content
        else:
            ans = 'Weak match to your query. Please try reframing your question'
            ## Call Tavily web search as a fallback
            tavily_key = os.environ["TAVILY_KEY"]
            tavily = TavilyClient(api_key=tavily_key)
            attempts = 0
            while attempts < 3:
                attempts += 1
                try:
                    resp = tavily.search(query=QUESTION)
                    # The answer is rendered in the assistant chat bubble below
                    ans = resp['results'][0]['content']
                    ext_url = resp['results'][0]['url']
                    break
                except Exception:
                    pass
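            # Up to 3 Tavily attempts are made above; on success the first result's
            # content replaces the weak-match message and its URL is surfaced as an
            # external link in the response section below.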
#st.write("Matched URL:{} Score:{}".format(url,score))
testing = False
if testing:
if len(resp)>=1:
st.write("2nd Matched URL:{} Score:{}".format(resp[1]['id'],resp[1]['score']))
if len(resp)>=2:
st.write("3rd Matched URL:{} Score:{}".format(resp[2]['id'],resp[2]['score']))
        ## Send RESPONSE
        with st.chat_message("assistant"):
            response = st.write_stream(response_generator(ans))
            if goodmatch:
                st.write('Resources:')
                for k in range(min(2, len(top_2))):
                    disp_title = top_2[k]['title']
                    disp_url = top_2[k]['url']
                    if 'youtube' in disp_url:
                        disp_title = 'Youtube: ' + disp_title
                    if k > 0 and top_2[k]['url'] == top_2[k - 1]['url']:
                        break  # skip a duplicate second link
                    st.write("[" + disp_title + "](" + disp_url + ")")
            else:  # not a good match
                if len(ext_url) > 5:
                    st.write('External Site:', ext_url)
                #st.write(top_2[0]['url'])
        # Add assistant response to chat history
        st.session_state.messages.append({"role": "assistant", "content": response})
        #st.write(ans)
        #st.write(' ----------------------')
        #st.write(out)
        now = str(datetime.utcnow())
        url = ' ; '.join(t['url'] for t in top_2)  # top_2 may contain fewer than 2 entries
        df_log.loc[len(df_log)] = [QUESTION, url, score, ans, now]
        #write_log(QUESTION,url, score, ans, now)
        #df.to_csv("hf://datasets/sujitb/data/test.csv")
        storage_options = {"token": token}
        df_log.to_csv("hf://datasets/sujitb/data/" + logfile, storage_options=storage_options)
    else:  ## Zero responses from the pinecone query
        #st.write("No matches for query")
        ans = "No matches for query"
        response = st.write_stream(response_generator(ans))
        # Add assistant response to chat history
        st.session_state.messages.append({"role": "assistant", "content": response})
        now = str(datetime.utcnow())
        df_log.loc[len(df_log)] = [QUESTION, 'No match', 0, '-', now]
        storage_options = {"token": token}
        df_log.to_csv("hf://datasets/sujitb/data/" + logfile, storage_options=storage_options)
        #write_log(QUESTION,'No match', 0, '-', now)