File size: 7,024 Bytes
589c9b1
074b93b
b403bb0
074b93b
 
8596e21
c690f92
 
8596e21
976a040
 
d944fdb
 
 
 
976a040
 
 
 
8596e21
074b93b
8596e21
 
 
074b93b
 
 
 
 
 
b403bb0
ed6e9e8
 
074b93b
 
8560ab0
8596e21
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
976a040
8596e21
8560ab0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
976a040
 
 
 
 
 
d944fdb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
976a040
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8560ab0
976a040
 
8560ab0
 
976a040
 
 
 
 
 
8560ab0
8596e21
8560ab0
976a040
8560ab0
 
976a040
 
 
 
 
 
8560ab0
 
976a040
8560ab0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
976a040
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8560ab0
c690f92
8560ab0
d944fdb
976a040
d944fdb
8596e21
b403bb0
976a040
 
 
 
 
 
c690f92
8560ab0
d944fdb
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
import streamlit as st

#from transformers import pipeline
from pinecone import Pinecone, ServerlessSpec
from sentence_transformers import SentenceTransformer, util
from openai import OpenAI
from datetime import datetime
import pandas as pd
import os

from huggingface_hub import HfFileSystem

# Hugging Face access token, read from the environment variable named 'git'
# (None when that variable is not set).
token = os.getenv('git')

# Hugging Face Hub filesystem handle created with that token; write_log()
# uses it to read and rewrite the remote query log.
fs = HfFileSystem(token=token)

import time


# SECURITY: API credentials must not be committed to source control. Both keys
# are read from the environment (e.g. Space/app secrets). Any key that was
# previously hard-coded in this file should be treated as leaked and rotated.
api_key = os.environ.get("OPENAI_API_KEY")
if not api_key:
    raise RuntimeError("OPENAI_API_KEY environment variable is not set")


@st.cache_resource
def _load_bi_encoder():
    """Load the sentence-embedding model once and reuse it across reruns.

    Streamlit re-executes the whole script on every interaction; caching the
    model avoids re-loading it for each question.
    """
    encoder = SentenceTransformer('msmarco-distilbert-base-v4')
    encoder.max_seq_length = 256     # Truncate long documents to 256 tokens
    return encoder


bi_encoder = _load_bi_encoder()

# Store the index as a variable
INDEX_NAME = 'cl-search-idx'

pc_api_key = os.environ.get("PINECONE_API_KEY")
if not pc_api_key:
    raise RuntimeError("PINECONE_API_KEY environment variable is not set")
pc = Pinecone(api_key=pc_api_key)
index = pc.Index(name=INDEX_NAME)

# System prompt for the answer-extraction chat completion: the model must answer
# only from the <text>...</text> passage supplied in the user message.
system_instructions_text='''
          Your task is to extract the answer to a question from a body of text provided to you.
          The body of text will be enclosed within the delimiter tags  <text> and </text>

          For example,
          <text> General Preparation Tips for VARC Section:

          You need to develop an incessant habit of speed reading.
          Start with reading newspapers, editorials, fiction and nonfiction novels and simple passages.
          The more you read, the faster you read. Learn the basic grammar concepts like parts of speech, articles,verbs, adjectives, tenses, auxiliary verbs, modifiers, modals etc.
          Revise at least 50 new words every day
          </text>

          Question: What are some tips for preparing for VARC?
          Here are some tips for preparing for the VARC section:
          1. develop an incessant habit of speed reading
          2. Start reading newspapers, editorials, fiction and nonfiction novels
          3. Learn basic grammar concepts\n
          4. Revise at least 50 new words a day

          Question: How many new words are to be learnt in a day?
          It is advised that 50 new words are learn every day

          Your response should be based on the information contained in the provided text and should not included any other sources.
          If you are unable to answer the question from the text provided, please respond " Sorry. I do not have enough information to answer this"
           Do not repeat the question. Do not make a pointed reference to the text provided. Directly answer the question
          '''
# In-memory copy of the query log, seeded from a local CSV when one exists.
logfile='querylog.csv'
try:
    df_log = pd.read_csv(logfile, index_col=0)
except (OSError, ValueError):
    # OSError: file missing or unreadable. ValueError: empty or malformed CSV
    # (pandas' EmptyDataError and ParserError both derive from ValueError).
    # Either way, start with an empty log instead of failing at startup.
    df_log = pd.DataFrame(columns=['query','url','score','ans', 'ts'])

def query_from_pinecone(index,namespace, question_embedding, top_k=3):
    """Run a vector query against *index* and return its 'matches' entry.

    Args:
        index: object exposing ``query(**kwargs)`` whose result supports ``.get``.
        namespace: namespace passed through to the query.
        question_embedding: query vector; it must come from the same embedder
            that was used for the indexed documents.
        top_k: number of matches to request.

    Returns:
        Whatever the query result holds under 'matches' (None when absent).
    """
    query_kwargs = {
        'vector': question_embedding,
        'top_k': top_k,
        'namespace': namespace,
        'include_metadata': True,   # also return the stored metadata (dates, text, etc)
    }
    result = index.query(**query_kwargs)
    return result.get('matches')

def response_generator(response):
    """Yield *response* in word-sized chunks to produce a typing effect.

    Each chunk is one run of non-whitespace characters together with the
    whitespace that follows it, so line breaks (e.g. between the items of a
    numbered list) survive streaming instead of being collapsed into single
    spaces. An empty or whitespace-only string yields nothing.

    Args:
        response: the full answer text to stream.

    Yields:
        Consecutive chunks of *response*, pausing 0.05 s after each one.
    """
    import re  # local import so this function does not rely on file-level imports

    for chunk in re.findall(r'\S+\s*', response):
        yield chunk
        time.sleep(0.05)

def write_log(query,url, score, ans, ts):
    """Append one row to the query log CSV stored on the Hugging Face Hub.

    The remote file is read in full, the new row is appended, and the whole
    buffer is written back. NOTE: this read-modify-write is not atomic, so
    rows can be lost if two sessions log at the same moment.

    Args:
        query: the user's question.
        url: identifier of the matched document.
        score: match score; written unquoted via ``str(score)``.
        ans: the answer shown to the user.
        ts: timestamp of the query.
    """
    def quote(value):
        # CSV-quote a field: wrap in double quotes and double any embedded
        # quote, so user-supplied text cannot break the row structure.
        return '"' + str(value).replace('"', '""') + '"'

    fields = [quote(query), quote(url), str(score), quote(ans), quote(ts)]
    new_row = '\n' + ','.join(fields)

    log_path = "datasets/sujitb/data/querylog.csv"

    # Read with the same explicit encoding that is used for writing.
    with fs.open(log_path, "r", encoding="utf-8") as f:
        buffer = f.read()

    # Rewrite the file with the new row appended.
    with fs.open(log_path, "w", encoding="utf-8") as f:
        f.write(buffer + new_row)

    return

st.title('CLLM Answering Machine')

# Chat history lives in the session state; create it on the first run.
if "messages" not in st.session_state:
    st.session_state.messages = []

# Replay every stored message so the conversation survives app reruns.
for past_message in st.session_state.messages:
    st.chat_message(past_message["role"]).markdown(past_message["content"])

# Chat input box; holds the submitted text, or a falsy value when nothing was sent.
QUESTION = st.chat_input('Ask a question -e.g How to prepare for Verbal section for CAT?')

# Defaults used by the question handler further down.
score = 0
testing = True


# Handle a submitted question: embed it, fetch the closest passages from
# Pinecone, ask the chat model to answer from the best passage, show the reply
# and log the exchange.
if QUESTION:

    with st.chat_message("user"):
        st.markdown(QUESTION)
    # Add user message to chat history
    st.session_state.messages.append({"role": "user", "content": QUESTION})

    question_embedding = bi_encoder.encode(QUESTION, convert_to_tensor=True)

    # Namespace to search ('webpages' is the other namespace this script has used).
    ns = 'full'
    resp = query_from_pinecone(index, ns, question_embedding.tolist(), 3)
    if resp:  # also covers a missing (None) 'matches' entry
        best = resp[0]
        out = best['metadata']['data']
        # The match id is what is reported and logged as the URL.
        url = best['id']
        score = best['score']
        # Currently unused; .get() so a match without a title cannot crash here.
        title = best['metadata'].get('title')

        # Only ask the model when the best match is reasonably close.
        if score > .5:
            client = OpenAI()
            content="""
            <text>
            {}
            </text>
            """.format(out)

            response = client.chat.completions.create(
              model="gpt-3.5-turbo",
              messages=[
                {"role": "system", "content":system_instructions_text },
                {"role": "user", "content": content},
                {"role": "user", "content": "Question:"+QUESTION}
              ]
            )

            ans = response.choices[0].message.content
        else:
            ans = 'Weak match to your query. Please try reframing your question'

        # Debug output for the runner-up matches; disabled by default.
        testing = False
        if testing:
            # Guard on the number of matches actually returned before indexing.
            if len(resp) >= 2:
                st.write("2nd Matched URL:{}  Score:{}".format(resp[1]['id'],resp[1]['score']))
            if len(resp) >= 3:
                st.write("3rd Matched URL:{}  Score:{}".format(resp[2]['id'],resp[2]['score']))

        with st.chat_message("assistant"):
            response = st.write_stream(response_generator(ans))
            # Add assistant response to chat history
            st.session_state.messages.append({"role": "assistant", "content": response})

        # utcnow() is kept so the timestamp format in the log stays unchanged.
        now = str(datetime.utcnow())
        df_log.loc[len(df_log)] = [QUESTION, url, score, ans, now]
        write_log(QUESTION, url, score, ans, now)

    else:
        ans = "No matches for query"
        # No document matched, so there is no URL to report; use the same
        # placeholder as the in-memory log row below.
        url = 'No match'
        with st.chat_message("assistant"):
            response = st.write_stream(response_generator(ans))
            # Add assistant response to chat history
            st.session_state.messages.append({"role": "assistant", "content": response})

        now = str(datetime.utcnow())
        df_log.loc[len(df_log)] = [QUESTION, 'No match', 0, '-', now]
        write_log(QUESTION, url, score, ans, now)