|
import streamlit as st |
|
|
|
|
|
from pinecone import Pinecone, ServerlessSpec |
|
from sentence_transformers import SentenceTransformer, util |
|
from openai import OpenAI |
|
from datetime import datetime |
|
import pandas as pd |
|
import numpy as np |
|
import os |
|
import time |
|
import json |
|
|
|
from huggingface_hub import HfFileSystem |
|
# Hugging Face filesystem handle used to persist the query log.
# FILE_TOKEN is optional here: os.getenv returns None when it is unset.
token = os.getenv('FILE_TOKEN')
fs = HfFileSystem(token=token)

# Fail fast (KeyError) if the OpenAI key is missing; OpenAI() is constructed
# later without an explicit key.
api_key = os.environ["OPENAI_API_KEY"]


@st.cache_resource
def _load_bi_encoder():
    """Load the sentence encoder once per server process.

    Streamlit re-executes this script on every user interaction; without
    caching, the model would be re-instantiated on each rerun.
    """
    model = SentenceTransformer('msmarco-distilbert-base-v4')
    model.max_seq_length = 256
    return model


bi_encoder = _load_bi_encoder()

INDEX_NAME = 'cl-search-idx'

# Pinecone API key is read from the "clpine" environment variable (required).
pc_api_key = os.environ["clpine"]
pc = Pinecone(api_key=pc_api_key)
index = pc.Index(name=INDEX_NAME)
|
|
|
# System prompt for the single-passage mode: the passage is sent wrapped in
# <text>...</text> and the question follows in a separate message.
# The closing instruction used to read "Do repeat the question", which
# contradicted "Directly answer the question"; it now says "Do not repeat".
system_instructions_text='''
Your task is to extract the answer to a question from a body of text provided to you.
The body of text will be enclosed within the delimiter tags <text> and </text>
For example,
<text> General Preparation Tips for VARC Section:
You need to develop an incessant habit of speed reading.
Start with reading newspapers, editorials, fiction and nonfiction novels and simple passages.
The more you read, the faster you read. Learn the basic grammar concepts like parts of speech, articles,verbs, adjectives, tenses, auxiliary verbs, modifiers, modals etc.
Revise at least 50 new words every day
</text>
Question: What are some tips for preparing for VARC?
Here are some tips for preparing for the VARC section:
1. develop an incessant habit of speed reading
2. Start reading newspapers, editorials, fiction and nonfiction novels
3. Learn basic grammar concepts\n
4. Revise at least 50 new words a day
Question: How many new words are to be learnt in a day?
It is advised that 50 new words are learn every day
Your response should be based on the information contained in the provided text and should not included any other sources.
If you are unable to answer the question from the text provided, please respond " Sorry. I do not have enough information to answer this"
Do not repeat the question. Do not make a pointed reference to the text provided. Directly answer the question
'''
|
# System prompt for the two-passage mode: the candidate passages are sent as
# a JSON array and the question follows in a separate message.
# The closing instruction used to read "Do repeat the question", which
# contradicted "Directly answer the question"; it now says "Do not repeat".
json_instructions='''
Your task is to extract the answer to a question from a body of text provided to you in a json array.
The json will contain two pieces of content in this format:
[
{"id":1 , "content": " first content"},
{"id":2 , "content": " second content"}
]
You need to check which content is most appropriate to answer the question and prepare
an answer based on the content
For example,
[
{ "id":1 , "content" : "General Preparation Tips for Verbal Section:\n
You need to develop an incessant habit of speed reading.
Start with reading newspapers, editorials, fiction and nonfiction novels and simple passages.
The more you read, the faster you read. Learn the basic grammar concepts like parts of speech, articles,verbs, adjectives, tenses, auxiliary verbs, modifiers, modals etc.
Revise at least 50 new words every day"},
{ "id":2 , "content" : "General Preparation Tips for Quantitative Section:\n
You need to develop an speed in solving math problems.
Start with reading funda books, math text books.
Learn the basic concepts like arithmetic, geometry, numbers, probability, etc.
Solve at least 50 new problems every day"}
]
Question: What are some tips for preparing for Verbal exam?
Here are some tips for preparing for the VARC section:
1. develop an incessant habit of speed reading
2. Start reading newspapers, editorials, fiction and nonfiction novels
3. Learn basic grammar concepts\n
4. Revise at least 50 new words a day
Your response should be based on the information contained in the provided content in the json and should not included any other sources.
If you are unable to answer the question from the content provided, please respond " Sorry. I do not have enough information to answer this"
Do not repeat the question. Do not make a pointed reference to the content provided. Directly answer the question
'''
|
|
|
def get_meta_score(url,question_embedding):
    """Return the cosine similarity between the question and a stored "meta" vector.

    Fetches the vector stored under id ``url`` in the ``"meta"`` namespace of
    the module-level Pinecone ``index`` and compares it to
    ``question_embedding`` with ``util.cos_sim``.

    Args:
        url: Vector id to fetch (the caller passes the id of a search match).
        question_embedding: Embedding of the user's question, as produced by
            ``bi_encoder.encode(..., convert_to_tensor=True)``.

    Returns:
        The similarity as a plain Python float.

    Raises:
        KeyError: If no vector with id ``url`` is present in the fetch result.
    """
    qry = index.fetch(ids=[url], namespace="meta")
    emb = qry['vectors'][url]['values']

    # The previous version also built two reshaped NumPy copies that were
    # never used; the `.numpy()` call could itself raise for tensors that do
    # not live on the CPU, so the dead code was removed.
    cosine_scores = util.cos_sim(question_embedding, emb)
    return cosine_scores.item()
|
|
|
|
|
def query_from_pinecone(index,namespace, question_embedding, top_k=3):
    """Run a similarity query and return its ``'matches'`` entry.

    Args:
        index: Object exposing a ``query(...)`` method (a Pinecone index).
        namespace: Namespace to search in.
        question_embedding: Query vector, passed through as ``vector``.
        top_k: Maximum number of matches to request (default 3).

    Returns:
        Whatever ``.get('matches')`` yields on the query result; metadata is
        requested for every match.
    """
    query_args = {
        "vector": question_embedding,
        "top_k": top_k,
        "namespace": namespace,
        "include_metadata": True,
    }
    result = index.query(**query_args)
    return result.get('matches')
|
|
|
def response_generator(response):
    """Yield ``response`` one whitespace-separated word at a time.

    Each yielded chunk is the word followed by a single space, and the
    generator pauses briefly after every chunk so the consumer can render a
    typing effect.
    """
    pause_seconds = 0.05
    for token in response.split():
        yield f"{token} "
        time.sleep(pause_seconds)
|
|
|
def write_log(query,url, score, ans, ts):
    """Append one row to the query log CSV stored on the Hugging Face hub.

    The whole file is read through the module-level ``fs`` handle, the new
    row is appended in memory, and the file is written back.

    Args:
        query: The user's question.
        url: Source URL(s) used for the answer, or a placeholder.
        score: Match score; written unquoted via ``str(score)``.
        ans: The answer text that was shown.
        ts: Timestamp of the interaction.

    Returns:
        None.
    """
    log_path = "datasets/sujitb/data/querylog.csv"

    def _quoted(value):
        # CSV-quote a text field. Previously only `ans` had its double quotes
        # escaped, so a `"` inside the query or URL corrupted the row.
        return '"' + str(value).replace('"', '""') + '"'

    new_row = '\n' + ','.join(
        [_quoted(query), _quoted(url), str(score), _quoted(ans), _quoted(ts)]
    )

    try:
        # Read with the same encoding that is used for writing below.
        with fs.open(log_path, "r", encoding="utf-8") as f:
            buffer = f.read()
    except FileNotFoundError:
        # First write: start the file with the column header that the log
        # reader's empty-frame fallback uses.
        buffer = 'query,url,score,ans,ts'

    buffer += new_row

    with fs.open(log_path, "w", encoding="utf-8") as f:
        f.write(buffer)

    return
|
|
|
logfile='querylog.csv'

# Load the existing query log; fall back to an empty frame with the expected
# columns if it cannot be read (best effort, e.g. file missing or hub
# unreachable).
try:
    df_log = pd.read_csv("hf://datasets/sujitb/data/querylog.csv", encoding="utf-8")
except Exception:
    # `except Exception` rather than a bare `except:` so that BaseException
    # subclasses (KeyboardInterrupt, SystemExit, framework control-flow
    # exceptions) are not swallowed here.
    df_log = pd.DataFrame(columns=['query','url','score','ans', 'ts'])
|
|
|
# Page title, plus a sidebar showing how many queries are logged and the ten
# most recent ones, newest first.
st.title('CLLM Answering Machine')

with st.sidebar:
    st.markdown('*Search History*')
    st.write('# Queries', len(df_log))
    recent_queries = df_log['query'].tail(10).tolist()
    for past_query in reversed(recent_queries):
        st.write(past_query)
|
|
|
|
|
|
|
# Create the per-session chat transcript on first run.
if "messages" not in st.session_state:
    st.session_state.messages = []

# Replay the transcript so earlier turns stay visible after each rerun.
for past_message in st.session_state.messages:
    role = past_message["role"]
    with st.chat_message(role):
        st.markdown(past_message["content"])
|
|
|
|
|
QUESTION = st.chat_input('Ask a question -e.g How to prepare for Verbal section for CAT?')

# Blended score of the best match for the current question (0 until one is found).
score = 0
# Set to True to print the runner-up Pinecone matches while debugging.
testing = False

if QUESTION:
    with st.chat_message("user"):
        st.markdown(QUESTION)
    st.session_state.messages.append({"role": "user", "content": QUESTION})

    st.write('Searching knowledgebase...')
    question_embedding = bi_encoder.encode(QUESTION, convert_to_tensor=True)

    # Minimum blended score the best match needs before the LLM is asked.
    THRESHOLD = .4
    ns = 'full'
    # `or []` guards against a response without a 'matches' entry.
    resp = query_from_pinecone(index, ns, question_embedding.tolist(), 10) or []

    # One candidate per match; the score is the mean of the Pinecone match
    # score and the similarity to the match's "meta" vector.
    resplist = []
    for rank, r in enumerate(resp, start=1):
        meta_score = get_meta_score(r['id'], question_embedding)
        resplist.append({
            'id': rank,
            'content': r['metadata']['data'],
            'url': r['id'],
            'score': .5 * r['score'] + .5 * meta_score,
        })

    if resplist:
        # sorted() is stable, so ties keep their Pinecone order.
        ranked = sorted(resplist, key=lambda d: d['score'], reverse=True)
        top_2 = ranked[:2]  # may hold a single entry when only one match exists
        json_data = json.dumps(top_2)

        # Log and threshold on the best candidate's score. Previously the
        # logged value was whatever the last loop iteration had computed.
        score = top_2[0]['score']

        goodmatch = score >= THRESHOLD
        if goodmatch:
            st.write('Preparing answers...')
            # "two": send the top candidates as JSON; "one": send only the
            # best passage wrapped in <text> tags.
            mode = "two"

            client = OpenAI()

            if mode == "one":
                instr = system_instructions_text
                out = top_2[0]['content']
                content = """
<text>
{}
</text>
""".format(out)
            else:
                instr = json_instructions
                content = json_data

            completion = client.chat.completions.create(
                model="gpt-3.5-turbo",
                messages=[
                    {"role": "system", "content": instr},
                    {"role": "user", "content": content},
                    {"role": "user", "content": "Question:" + QUESTION},
                ],
            )
            ans = completion.choices[0].message.content
        else:
            ans = 'Weak match to your query. Please try reframing your question'

        if testing:
            # Guards now match the indices being read (they were off by one).
            if len(resp) >= 2:
                st.write("2nd Matched URL:{} Score:{}".format(resp[1]['id'], resp[1]['score']))
            if len(resp) >= 3:
                st.write("3rd Matched URL:{} Score:{}".format(resp[2]['id'], resp[2]['score']))

        # Works for one or two candidates; indexing top_2[1] directly raised
        # IndexError when only a single match came back.
        source_urls = [d['url'] for d in top_2]

        with st.chat_message("assistant"):
            response = st.write_stream(response_generator(ans))
            if goodmatch:
                st.write('Resources:')
                for source_url in source_urls:
                    st.write(source_url)

        st.session_state.messages.append({"role": "assistant", "content": response})

        now = str(datetime.utcnow())
        url = ' ; '.join(source_urls)
        df_log.loc[len(df_log)] = [QUESTION, url, score, ans, now]
        write_log(QUESTION, url, score, ans, now)

    else:
        ans = "No matches for query"
        # Render inside the assistant bubble, like the matched path and the
        # transcript replay do.
        with st.chat_message("assistant"):
            response = st.write_stream(response_generator(ans))

        st.session_state.messages.append({"role": "assistant", "content": response})

        now = str(datetime.utcnow())
        df_log.loc[len(df_log)] = [QUESTION, 'No match', 0, '-', now]
        write_log(QUESTION, 'No match', 0, '-', now)
|
|