import streamlit as st

#from transformers import pipeline
from pinecone import Pinecone, ServerlessSpec
from sentence_transformers import SentenceTransformer, util
from openai import OpenAI
from datetime import datetime
import pandas as pd
import numpy as np
import os
import time
import json
from tavily import TavilyClient
from huggingface_hub import HfFileSystem
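
# HfFileSystem gives read/write access to the HF dataset repo (sujitb/data) that stores the query log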
token = os.getenv('FILE_TOKEN')
fs = HfFileSystem(token=token)




# OpenAI() picks up OPENAI_API_KEY from the environment; this lookup just fails fast if it is unset
api_key = os.environ["OPENAI_API_KEY"]


bi_encoder = SentenceTransformer('msmarco-distilbert-base-v4')
bi_encoder.max_seq_length = 256     # Truncate long documents to 256 tokens

# Pinecone index to query
#INDEX_NAME = 'cl-search-idx'
INDEX_NAME = 'cl-kb'

pc_api_key = os.environ["clpine"]  # AWS
pc = Pinecone(api_key=pc_api_key)
index = pc.Index(name=INDEX_NAME)
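# Passage embeddings live in the 'webpages' (or the alternative 'full') namespace;
# page-level embeddings, keyed by URL, live in the 'meta' namespace (see get_meta_score)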

system_instructions_text='''
          Your task is to extract the answer to a question from a body of text provided to you.
          The body of text will be enclosed within the delimiter tags  <text> and </text>
          For example,
          <text> General Preparation Tips for VARC Section:
          You need to develop an incessant habit of speed reading.
          Start with reading newspapers, editorials, fiction and nonfiction novels and simple passages.
          The more you read, the faster you read. Learn the basic grammar concepts like parts of speech, articles, verbs, adjectives, tenses, auxiliary verbs, modifiers, modals etc.
          Revise at least 50 new words every day
          </text>
          Question: What are some tips for preparing for VARC?
          Here are some tips for preparing for the VARC section:
          1. develop an incessant habit of speed reading
          2. Start reading newspapers, editorials, fiction and nonfiction novels
          3. Learn basic grammar concepts\n
          4. Revise at least 50 new words a day
          Question: How many new words are to be learnt in a day?
          It is advised that 50 new words are learnt every day
          Your response should be based on the information contained in the provided text and should not include any other sources.
          If you are unable to answer the question from the text provided, please respond " Sorry. I do not have enough information to answer this"
          Do not repeat the question. Do not make a pointed reference to the text provided. Directly answer the question
          '''
json_instructions='''
          Your task is to extract the answer to a question from a body of text provided to you in a json array.
          The json will contain two pieces of content in this format:
          [
              {"id":1 , "content": " first content"},
              {"id":2 , "content": " second content"}
          ]
          You need to check which content is most appropriate to answer the question and prepare
          an answer based on the content.
          For example,
          [
          { "id":1 , "content" : "General Preparation Tips for Verbal Section:\n
          You need to develop an incessant habit of speed reading.
          Start with reading newspapers, editorials, fiction and nonfiction novels and simple passages.
          The more you read, the faster you read. Learn the basic grammar concepts like parts of speech, articles, verbs, adjectives, tenses, auxiliary verbs, modifiers, modals etc.
          Revise at least 50 new words every day"},
          { "id":2 , "content" : "General Preparation Tips for Quantitative Section:\n
          You need to develop speed in solving math problems.
          Start with reading funda books and math textbooks.
          Learn the basic concepts like arithmetic, geometry, numbers, probability, etc.
          Solve at least 50 new problems every day"}
          ]
          Question: What are some tips for preparing for Verbal exam?
          Here are some tips for preparing for the VARC section:
          1. develop an incessant habit of speed reading
          2. Start reading newspapers, editorials, fiction and nonfiction novels
          3. Learn basic grammar concepts\n
          4. Revise at least 50 new words a day
          Do not repeat the question. Do not make a pointed reference to the content provided. Directly answer the question.
          Your response should be based on the information contained in the provided content in the json and should not include any other sources.
          If you are unable to answer the question from the content provided, please respond " Sorry. I do not have enough information to answer this"
           
          '''

def get_meta_score(url, question_embedding):
    # Fetch the stored page-level embedding, keyed by URL, from the 'meta' namespace
    qry = index.fetch(ids=[url], namespace="meta")
    emb = qry['vectors'][url]['values']

    # Cosine similarity between the query embedding and the page embedding
    cosine_scores = util.cos_sim(question_embedding, emb)
    return cosine_scores.item()


def query_from_pinecone(index, namespace, question_embedding, top_k=3):
    # The embedding must come from the same bi-encoder used to index the documents
    return index.query(
      vector=question_embedding,
      top_k=top_k,
      namespace=namespace,
      include_metadata=True   # include the metadata (title, text, url, etc.)
    ).get('matches')

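# Stream the answer word by word to give a typing effect in the chat window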
def response_generator(response):

    for word in response.split():
        yield word + " "
        time.sleep(0.05)

def write_log(query, url, score, ans, ts):
    # Append one row to the CSV log on the HF dataset (currently unused; the app logs via df_log.to_csv instead)
    # Construct new row
    score = str(score)
    ans = ans.replace('"', '""')
    new_row = f'\n"{query}","{url}",{score},"{ans}","{ts}"'

    with fs.open("datasets/sujitb/data/querylog.csv", "r") as f:
        buffer = f.read()

    # Append the new row to buffer
    buffer += new_row

    # Write the buffer to the file in "W" mode
    with fs.open("datasets/sujitb/data/querylog.csv", "w",encoding="utf-8") as f:
        f.write(buffer)

    return

logfile='querylog.csv'

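# Sample queries shown in the sidebar under "Search History"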
qlist=[
'What are the best books for VARC in CAT?',
'What is the XAT exam pattern? How many sections ? How many questions are asked in each section?',
'I want to know about Personalized coaching for IGSE/IB',
'Which IIMs accept admissions under the IPM exam?',
'What topics are covered under CAT exam syllabus?',
'What is the pattern of the IPM exam?',
'Which Central Universities offer courses for CUET exam',
'For CAT preparation which is better - online classes or classroom program?',
'What programs are offered under CUET exam  by Central University of Jharkhand?',
'What is the pattern of the IPM exam?',
'When is the CAT 2024 exam going to be held?',
'What are program benefits of the MBA 2024 Online Classes?',
'What topics are covered in CUET General Test?',
'IIM A B C vs FMS - how to select the best bschool?'
]

try:
    df_log = pd.read_csv("hf://datasets/sujitb/data/querylog.csv", encoding="utf-8", index_col=0)
except Exception:
    # Start a fresh log if the dataset file is missing or unreadable
    df_log = pd.DataFrame(columns=['query', 'url', 'score', 'ans', 'ts'])

#st.title('CLLM Answering Machine')
st.subheader('CLLM Answering Machine', divider='rainbow')

with st.sidebar:
    st.markdown('*{}*'.format('Search History'))
    #st.write('Past Queries')
    #qlist = df_log.tail(30)['query'].tolist()   # alternative: show recent queries from the log
    for q in qlist[::-1]:
        st.write(q)

# Initialize chat history
if "messages" not in st.session_state:
    st.session_state.messages = []
# Display chat messages from history on app rerun
for message in st.session_state.messages:
    with st.chat_message(message["role"]):
        st.markdown(message["content"])

#with st.chat_message("user"):
#    st.write("Hello 👋  Ask any question related to careerlauncher.com in the text box below")

QUESTION = st.chat_input('Ask a question -e.g How to prepare for Verbal section for CAT?')


#QUESTION=st.text_area('Ask a question -e.g How to prepare for Verbal section for CAT?')  ##' How to prepare for Verbal section ?'
score = 0
testing = False   # set True to display runner-up match URLs for debugging
ext_url = ''

if QUESTION:

    with st.chat_message("user"):
        st.markdown(QUESTION)
    # Add user message to chat history
    st.session_state.messages.append({"role": "user", "content": QUESTION})

    st.write('Searching knowledge base...')
    question_embedding = bi_encoder.encode(QUESTION, convert_to_tensor=True)

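    # Hybrid-score threshold below which the app falls back to an external web search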
    THRESHOLD=.4
    ns='webpages'
    #ns='full'
    resp = query_from_pinecone(index, ns, question_embedding.tolist(), 10)
    resplist = []
    doc_id = 0   # renamed from 'id' to avoid shadowing the builtin
    for r in resp:
        doc_id += 1
        d = {}
        d['id'] = doc_id
        d['content'] = r['metadata']['data']
        d['title'] = r['metadata']['title']
        d['url'] = r['id']
        if ns == 'webpages':
            d['url'] = r['metadata']['url']

        # Blend the passage-level score equally with the page-level ("meta") score
        meta_score = get_meta_score(d['url'], question_embedding)
        score = .5 * r['score'] + .5 * meta_score
        d['score'] = score
        #st.write(d['url'], score, r['score'], meta_score)
        resplist.append(d)

    # Also check the page-level namespace for strong YouTube matches
    respmeta = query_from_pinecone(index, 'meta', question_embedding.tolist(), 5)
    for r in respmeta:
        if 'youtube' in r['id'] and r['score'] >= .8:
            d = {}   # fresh dict; reusing the one from the loop above would alias and corrupt earlier entries
            d['id'] = r['id']
            d['content'] = r['metadata']['data']
            d['title'] = r['metadata']['title']
            d['url'] = r['metadata']['url']
            d['score'] = r['score']
            resplist.append(d)

    if len(resplist)>0:
        sorted_indices = sorted(range(len(resplist)), key=lambda i: resplist[i]['score'], reverse=True)

        # Keep the two highest-scoring passages
        top_2 = [resplist[i] for i in sorted_indices[:2]]

        # Serialize them as the JSON context for the LLM
        json_data = json.dumps(top_2)


        goodmatch=False
        if resplist[sorted_indices[0]]['score']>=THRESHOLD:
            st.write('Preparing answers...')
            goodmatch=True
            mode = "two"  # two passages

            client = OpenAI()

            if mode=="one":
                instr=system_instructions_text

                out= resplist[sorted_indices[0]]['content']
                content="""
                <text>
                {}
                </text>
                """.format(out)

            if mode=="two":
                instr=json_instructions
                content=json_data

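            # Ask the chat model to answer strictly from the supplied context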
            response = client.chat.completions.create(
              model="gpt-3.5-turbo",
              messages=[
                {"role": "system", "content":instr },
                {"role": "user", "content": content},
                {"role": "user", "content": "Question:"+QUESTION}
              ]
            )

            ans= response.choices[0].message.content
        else:
            ans='Weak match to your query. Please try reframing your question'

            ## Fall back to a Tavily web search
            tavily_key = os.environ["TAVILY_KEY"]
            tavily = TavilyClient(api_key=tavily_key)

            attempts = 0
            while attempts < 3:   # retry the web search up to 3 times
                attempts += 1
                try:
                    resp = tavily.search(query=QUESTION)
                    ans = resp['results'][0]['content']
                    ext_url = resp['results'][0]['url']
                    break
                except Exception:
                    pass


            #st.write("Matched URL:{}  Score:{}".format(url,score))
            testing = False
            if testing:
                if len(resp)>=1:
                    st.write("2nd Matched URL:{}  Score:{}".format(resp[1]['id'],resp[1]['score']))
                if len(resp)>=2:
                    st.write("3rd Matched URL:{}  Score:{}".format(resp[2]['id'],resp[2]['score']))

                    
        ##  Send RESPONSE
        with st.chat_message("assistant"):
            response = st.write_stream(response_generator(ans))
            if goodmatch:
                st.write('Resources:')
                for k in range(len(top_2)):
                    disp_title = top_2[k]['title']
                    disp_url = top_2[k]['url']
                    if 'youtube' in disp_url:
                        disp_title = 'Youtube: ' + disp_title

                    # Skip a duplicate second link to the same page
                    if k > 0 and top_2[k]['url'] == top_2[k-1]['url']:
                        break
                    st.write("[" + disp_title + "](" + disp_url + ")")

            else: # not a good match
                
                if len(ext_url)>5:    
                    st.write('External Site:',ext_url)
                #st.write(top_2[0]['url'])                
            # Add assistant response to chat history
            st.session_state.messages.append({"role": "assistant", "content": response})
        #st.write(ans)

        #st.write(' ----------------------')
        #st.write(out)

        now = str(datetime.utcnow())
        url = ' ; '.join(t['url'] for t in top_2)   # join, rather than index, in case only one match was returned

        df_log.loc[len(df_log)] = [QUESTION, url, score, ans, now]
        #write_log(QUESTION,url, score, ans, now)
        storage_options = {"token": token}
        df_log.to_csv("hf://datasets/sujitb/data/" + logfile, storage_options=storage_options)

    else:  ## Zero response from pinecone query
        #st.write("No matches for query")
        ans= "No matches for query"
        response = st.write_stream(response_generator(ans))
        # Add assistant response to chat history
        st.session_state.messages.append({"role": "assistant", "content": response})

        now= str(datetime.utcnow())
        df_log.loc[len(df_log)]=[QUESTION,'No match',0,'-',now]
        storage_options={"token":token}
        df_log.to_csv("hf://datasets/sujitb/data/"+logfile,storage_options= storage_options)
        #write_log(QUESTION,'No match', 0, '-', now)