import os
import time
from datetime import datetime, timezone

import pandas as pd
import streamlit as st
from huggingface_hub import HfFileSystem
from openai import OpenAI
from pinecone import Pinecone
from sentence_transformers import SentenceTransformer

# Credentials come from the environment; API keys should never be hard-coded.
# The Hugging Face token (stored in the 'git' secret) needs write access to the
# datasets/sujitb/data repo used for query logging.
fs = HfFileSystem(token=os.getenv('git'))

# The OpenAI client reads OPENAI_API_KEY from the environment automatically.
# Bi-encoder for query embeddings; must be the same model used to embed the documents.
# msmarco-distilbert-base-v4 produces 768-dimensional vectors, so the Pinecone index
# must have been created with matching dimension.
bi_encoder = SentenceTransformer('msmarco-distilbert-base-v4')
bi_encoder.max_seq_length = 256  # Truncate long documents to 256 tokens

# Pinecone index holding the document embeddings (API key from the environment)
INDEX_NAME = 'cl-search-idx'
pc = Pinecone(api_key=os.getenv('PINECONE_API_KEY'))
index = pc.Index(name=INDEX_NAME)
system_instructions_text='''
Your task is to extract the answer to a question from a body of text provided to you.
The body of text will be enclosed within the delimiter tags <text> and </text>
For example,
<text> General Preparation Tips for VARC Section:
You need to develop an incessant habit of speed reading.
Start with reading newspapers, editorials, fiction and nonfiction novels and simple passages.
The more you read, the faster you read. Learn the basic grammar concepts like parts of speech, articles,verbs, adjectives, tenses, auxiliary verbs, modifiers, modals etc.
Revise at least 50 new words every day
</text>
Question: What are some tips for preparing for VARC?
Here are some tips for preparing for the VARC section:
1. develop an incessant habit of speed reading
2. Start reading newspapers, editorials, fiction and nonfiction novels
3. Learn basic grammar concepts\n
4. Revise at least 50 new words a day
Question: How many new words are to be learnt in a day?
It is advised that 50 new words are learnt every day
Your response should be based on the information contained in the provided text and should not include any other sources.
If you are unable to answer the question from the text provided, please respond "Sorry. I do not have enough information to answer this"
Do not repeat the question. Do not make a pointed reference to the text provided. Directly answer the question.
'''
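
# The system prompt above is few-shot: one passage (the VARC tips) plus two worked
# Q&A pairs show the model the expected extract-and-answer format before the real
# question arrives.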
logfile = 'querylog.csv'
try:
    df_log = pd.read_csv(logfile, index_col=0)
except FileNotFoundError:
    # Start a fresh log if none exists yet
    df_log = pd.DataFrame(columns=['query', 'url', 'score', 'ans', 'ts'])
def query_from_pinecone(index, namespace, question_embedding, top_k=3):
    # The embedding must come from the same encoder used for the indexed documents.
    return index.query(
        vector=question_embedding,
        top_k=top_k,
        namespace=namespace,
        include_metadata=True  # return stored metadata (url, title, text, etc.)
    ).get('matches')
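
# Example (hypothetical query; assumes the 'full' namespace used below is populated):
#   q_emb = bi_encoder.encode('How to prepare for VARC?', convert_to_tensor=True)
#   matches = query_from_pinecone(index, 'full', q_emb.tolist(), top_k=3)
#   matches[0]['id'], matches[0]['score'], matches[0]['metadata']  # best hit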
def response_generator(response):
    # Yield the answer word by word to simulate streaming output.
    for word in response.split():
        yield word + " "
        time.sleep(0.05)
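
# Usage sketch (inside a chat bubble, as done below):
#   with st.chat_message("assistant"):
#       st.write_stream(response_generator("Hello there"))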
def write_log(query, url, score, ans, ts):
    # Append one row to the shared query log stored on the HF dataset repo.
    score = str(score)
    query = query.replace('"', '""')  # escape embedded quotes for CSV
    ans = ans.replace('"', '""')
    new_row = f'\n"{query}","{url}",{score},"{ans}","{ts}"'
    with fs.open("datasets/sujitb/data/querylog.csv", "r") as f:
        buffer = f.read()
    # Append the new row, then rewrite the whole file
    buffer += new_row
    with fs.open("datasets/sujitb/data/querylog.csv", "w", encoding="utf-8") as f:
        f.write(buffer)
    return
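
# Note: this read-modify-write cycle is not atomic, so concurrent sessions could
# drop rows. Acceptable for a low-traffic demo; a proper datastore would be safer.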
st.title('CLLM Answering Machine')

# Initialize chat history
if "messages" not in st.session_state:
    st.session_state.messages = []

# Display chat messages from history on app rerun
for message in st.session_state.messages:
    with st.chat_message(message["role"]):
        st.markdown(message["content"])

QUESTION = st.chat_input('Ask a question - e.g. How to prepare for the Verbal section for CAT?')

score = 0
testing = False  # set True to display the 2nd/3rd matches for debugging
if QUESTION:
    with st.chat_message("user"):
        st.markdown(QUESTION)
    # Add user message to chat history
    st.session_state.messages.append({"role": "user", "content": QUESTION})

    question_embedding = bi_encoder.encode(QUESTION, convert_to_tensor=True)
    ns = 'full'  # use the 'full' namespace (a 'webpages' namespace also exists)
    resp = query_from_pinecone(index, ns, question_embedding.tolist(), 3)

    if len(resp) > 0:
        out = resp[0]['metadata']['data']
        try:
            url = resp[0]['metadata']['url']
        except KeyError:
            # Fall back to the record id when no url is stored in the metadata
            url = resp[0]['id']
        score = resp[0]['score']
        title = resp[0]['metadata']['title']

        if score > 0.5:
            # Strong match: ask the LLM to extract an answer from the retrieved text
            client = OpenAI()
            content = """
<text>
{}
</text>
""".format(out)
            response = client.chat.completions.create(
                model="gpt-3.5-turbo",
                messages=[
                    {"role": "system", "content": system_instructions_text},
                    {"role": "user", "content": content},
                    {"role": "user", "content": "Question:" + QUESTION}
                ]
            )
            ans = response.choices[0].message.content
        else:
            ans = 'Weak match to your query. Please try reframing your question'

        if testing:
            # Show the runner-up matches (guard against fewer than 3 results)
            if len(resp) >= 2:
                st.write("2nd Matched URL:{} Score:{}".format(resp[1]['id'], resp[1]['score']))
            if len(resp) >= 3:
                st.write("3rd Matched URL:{} Score:{}".format(resp[2]['id'], resp[2]['score']))
        with st.chat_message("assistant"):
            response = st.write_stream(response_generator(ans))
        # Add assistant response to chat history
        st.session_state.messages.append({"role": "assistant", "content": response})

        now = str(datetime.now(timezone.utc))
        df_log.loc[len(df_log)] = [QUESTION, url, score, ans, now]
        write_log(QUESTION, url, score, ans, now)
    else:
        ans = "No matches for query"
        with st.chat_message("assistant"):
            response = st.write_stream(response_generator(ans))
        # Add assistant response to chat history
        st.session_state.messages.append({"role": "assistant", "content": response})

        now = str(datetime.now(timezone.utc))
        df_log.loc[len(df_log)] = [QUESTION, 'No match', 0, '-', now]
        write_log(QUESTION, 'No match', 0, ans, now)