|
import streamlit as st |
|
|
|
|
|
from pinecone import Pinecone, ServerlessSpec |
|
from sentence_transformers import SentenceTransformer, util |
|
from openai import OpenAI |
|
from datetime import datetime |
|
import pandas as pd |
|
import os |
|
|
|
from huggingface_hub import HfFileSystem |
|
# Hugging Face credentials for reading/writing the shared query log.
# SECURITY FIX: a HF token was previously hard-coded here (split across a
# string concatenation) and the env token was echoed to the page with
# st.write(token), leaking secrets. Read the token from the environment
# ('HF_TOKEN', falling back to the legacy 'git' variable) and never render it.
token = os.getenv('HF_TOKEN') or os.getenv('git')

# Filesystem handle onto the Hugging Face Hub dataset that stores the log.
fs = HfFileSystem(token=token)
|
|
|
import time |
|
|
|
|
|
# --- API credentials and retrieval backends --------------------------------
# SECURITY FIX: the OpenAI and Pinecone API keys were previously hard-coded
# in this file (committed credentials must be treated as compromised and
# rotated). Both are now read from the environment; OpenAI() picks up
# OPENAI_API_KEY automatically.
api_key = os.getenv("OPENAI_API_KEY", "")
if api_key:
    os.environ["OPENAI_API_KEY"] = api_key

# Bi-encoder used to embed user questions for semantic search.
bi_encoder = SentenceTransformer('msmarco-distilbert-base-v4')
bi_encoder.max_seq_length = 256  # truncate long inputs to the model window

# Pinecone vector index holding the pre-embedded corpus.
INDEX_NAME = 'cl-search-idx'

pc_api_key = os.getenv("PINECONE_API_KEY", "")
pc = Pinecone(api_key=pc_api_key)
index = pc.Index(name=INDEX_NAME)
|
|
|
# Few-shot system prompt: instructs the LLM to answer ONLY from the retrieved
# passage supplied between <text>...</text> delimiters.
# FIXES to the prompt text: "Do repeat the question" inverted the intended
# instruction (now "Do not repeat the question"); "should not included" ->
# "should not include"; "are learn every day" -> "are learnt every day".
system_instructions_text = '''
Your task is to extract the answer to a question from a body of text provided to you.
The body of text will be enclosed within the delimiter tags <text> and </text>

For example,
<text> General Preparation Tips for VARC Section:

You need to develop an incessant habit of speed reading.
Start with reading newspapers, editorials, fiction and nonfiction novels and simple passages.
The more you read, the faster you read. Learn the basic grammar concepts like parts of speech, articles,verbs, adjectives, tenses, auxiliary verbs, modifiers, modals etc.
Revise at least 50 new words every day
</text>

Question: What are some tips for preparing for VARC?
Here are some tips for preparing for the VARC section:
1. develop an incessant habit of speed reading
2. Start reading newspapers, editorials, fiction and nonfiction novels
3. Learn basic grammar concepts\n
4. Revise at least 50 new words a day

Question: How many new words are to be learnt in a day?
It is advised that 50 new words are learnt every day

Your response should be based on the information contained in the provided text and should not include any other sources.
If you are unable to answer the question from the text provided, please respond " Sorry. I do not have enough information to answer this"
Do not repeat the question. Do not make a pointed reference to the text provided. Directly answer the question
'''

# Local CSV used as an in-memory log of queries and answers for this session.
logfile = 'querylog.csv'
try:
    df_log = pd.read_csv(logfile, index_col=0)
except (FileNotFoundError, pd.errors.EmptyDataError):
    # No log yet (first run) or an empty file: start a fresh log frame.
    df_log = pd.DataFrame(columns=['query', 'url', 'score', 'ans', 'ts'])
|
|
|
def query_from_pinecone(index, namespace, question_embedding, top_k=3):
    """Run a nearest-neighbour search against a Pinecone index.

    Queries `namespace` with the given embedding vector, requesting metadata
    for each hit, and returns just the 'matches' list from the response.
    """
    result = index.query(
        vector=question_embedding,
        top_k=top_k,
        namespace=namespace,
        include_metadata=True,
    )
    return result.get('matches')
|
|
|
def response_generator(response):
    """Yield `response` one word at a time (each followed by a space).

    A short pause after each word gives the chat UI a typewriter-style
    streaming effect when consumed by st.write_stream.
    """
    for chunk in response.split():
        yield f"{chunk} "
        time.sleep(0.05)  # pacing delay between words
|
|
|
def write_log(query, url, score, ans, ts):
    """Append one CSV row to the shared query log on the Hugging Face Hub.

    The remote file is rewritten wholesale: read the current contents,
    concatenate the new row, and write everything back through `fs`.
    Embedded double quotes in the answer are CSV-escaped by doubling.
    """
    escaped_ans = ans.replace('"', '""')
    row = f'\n"{query}","{url}",{str(score)},"{escaped_ans}","{ts}"'

    with fs.open("datasets/sujitb/data/querylog.csv", "r") as fh:
        contents = fh.read()

    contents += row

    with fs.open("datasets/sujitb/data/querylog.csv", "w", encoding="utf-8") as fh:
        fh.write(contents)

    return
|
|
|
# Page title for the chat app.
st.title('CLLM Answering Machine')

# Initialise the chat history once per session; Streamlit reruns this script
# on every interaction, so session_state preserves messages across reruns.
if "messages" not in st.session_state:

    st.session_state.messages = []

# Replay the stored conversation so the history stays visible after reruns.
for message in st.session_state.messages:

    with st.chat_message(message["role"]):

        st.markdown(message["content"])

# Chat input box; returns the submitted text, or None until the user submits.
QUESTION = st.chat_input('Ask a question -e.g How to prepare for Verbal section for CAT?')

# Defaults used by the answer/logging flow below.
score=0

# Debug flag for showing runner-up matches (overridden later in the script).
testing=True
|
|
|
|
|
if QUESTION:
    # Echo the user's question in the chat UI and persist it in the history.
    with st.chat_message("user"):
        st.markdown(QUESTION)
    st.session_state.messages.append({"role": "user", "content": QUESTION})

    # Embed the question and retrieve the closest passages from Pinecone.
    question_embedding = bi_encoder.encode(QUESTION, convert_to_tensor=True)

    ns = 'full'  # the 'webpages' namespace was superseded by 'full'
    resp = query_from_pinecone(index, ns, question_embedding.tolist(), 3)

    if len(resp) > 0:
        out = resp[0]['metadata']['data']

        # Prefer the stored source URL; fall back to the vector id.
        # BUG FIX: a stray duplicate `url = resp[0]['id']` after this
        # try/except unconditionally clobbered the metadata URL.
        try:
            url = resp[0]['metadata']['url']
        except KeyError:
            url = resp[0]['id']

        score = resp[0]['score']
        title = resp[0]['metadata']['title']

        if score > .5:
            # Strong match: ask the LLM to answer strictly from the passage.
            client = OpenAI()
            content = """
            <text>
            {}
            </text>
            """.format(out)

            response = client.chat.completions.create(
                model="gpt-3.5-turbo",
                messages=[
                    {"role": "system", "content": system_instructions_text},
                    {"role": "user", "content": content},
                    {"role": "user", "content": "Question:" + QUESTION}
                ]
            )
            ans = response.choices[0].message.content
        else:
            ans = 'Weak match to your query. Please try reframing your question'

        testing = False
        if testing:
            # Debug output: show runner-up matches.
            # BUG FIX: guards were off by one (resp[1] needs len >= 2,
            # resp[2] needs len >= 3) and would raise IndexError.
            if len(resp) >= 2:
                st.write("2nd Matched URL:{} Score:{}".format(resp[1]['id'], resp[1]['score']))
            if len(resp) >= 3:
                st.write("3rd Matched URL:{} Score:{}".format(resp[2]['id'], resp[2]['score']))

        # Stream the answer into the assistant bubble and record it.
        with st.chat_message("assistant"):
            response = st.write_stream(response_generator(ans))
        st.session_state.messages.append({"role": "assistant", "content": response})

        now = str(datetime.utcnow())
        df_log.loc[len(df_log)] = [QUESTION, url, score, ans, now]
        write_log(QUESTION, url, score, ans, now)

    else:
        # No retrieval hits at all.
        ans = "No matches for query"
        with st.chat_message("assistant"):
            response = st.write_stream(response_generator(ans))
        st.session_state.messages.append({"role": "assistant", "content": response})

        now = str(datetime.utcnow())
        df_log.loc[len(df_log)] = [QUESTION, 'No match', 0, '-', now]
        # BUG FIX: this branch previously passed `url` and `score` from the
        # match branch; `url` is undefined here and raised NameError.
        write_log(QUESTION, 'No match', 0, ans, now)
|