chat

Sleeping

App Files Files Community

chat / app.py

sujitb

Update app.py

7c3dc0e verified 8 months ago

raw

history blame

11.7 kB

	import streamlit as st

	#from transformers import pipeline
	from pinecone import Pinecone, ServerlessSpec
	from sentence_transformers import SentenceTransformer, util
	from openai import OpenAI
	from datetime import datetime
	import pandas as pd
	import numpy as np
	import os
	import time
	import json

	from huggingface_hub import HfFileSystem
	token = os.getenv('FILE_TOKEN')
	fs = HfFileSystem(token=token)




	api_key=os.environ["OPENAI_API_KEY"]


	bi_encoder = SentenceTransformer('msmarco-distilbert-base-v4')
	bi_encoder.max_seq_length = 256 # Truncate long documents to 256 tokens

	# Store the index as a variable
	INDEX_NAME = 'cl-search-idx'
	INDEX_NAME = 'cl-kb'

	pc_api_key= os.environ["clpine"] #AWS
	pc = Pinecone(api_key=pc_api_key)
	index = pc.Index(name=INDEX_NAME)

	system_instructions_text='''
	Your task is to extract the answer to a question from a body of text provided to you.
	The body of text will be enclosed within the delimiter tags <text> and </text>
	For example,
	<text> General Preparation Tips for VARC Section:
	You need to develop an incessant habit of speed reading.
	Start with reading newspapers, editorials, fiction and nonfiction novels and simple passages.
	The more you read, the faster you read. Learn the basic grammar concepts like parts of speech, articles,verbs, adjectives, tenses, auxiliary verbs, modifiers, modals etc.
	Revise at least 50 new words every day
	</text>
	Question: What are some tips for preparing for VARC?
	Here are some tips for preparing for the VARC section:
	1. develop an incessant habit of speed reading
	2. Start reading newspapers, editorials, fiction and nonfiction novels
	3. Learn basic grammar concepts\n
	4. Revise at least 50 new words a day
	Question: How many new words are to be learnt in a day?
	It is advised that 50 new words are learn every day
	Your response should be based on the information contained in the provided text and should not included any other sources.
	If you are unable to answer the question from the text provided, please respond " Sorry. I do not have enough information to answer this"
	Do repeat the question. Do not make a pointed reference to the text provided. Directly answer the question
	'''
	json_instructions='''
	Your task is to extract the answer to a question from a body of text provided to you in a json array.
	The json will contain two pieces of content in this format:
	[
	{"id":1 , "content": " first content"},
	{"id":2 , "content": " second content"}
	]
	You need to check which content is most appropriate to answer the question and prepare
	an answer based on the content
	For example,
	[
	{ "id":1 , "content" : "General Preparation Tips for Verbal Section:\n
	You need to develop an incessant habit of speed reading.
	Start with reading newspapers, editorials, fiction and nonfiction novels and simple passages.
	The more you read, the faster you read. Learn the basic grammar concepts like parts of speech, articles,verbs, adjectives, tenses, auxiliary verbs, modifiers, modals etc.
	Revise at least 50 new words every day"},
	{ "id":2 , "content" : "General Preparation Tips for Quantitative Section:\n
	You need to develop an speed in solving math problems.
	Start with reading funda books, math text books.
	Learn the basic concepts like arithmetic, geometry, numbers, probability, etc.
	Solve at least 50 new problems every day"}
	]
	Question: What are some tips for preparing for Verbal exam?
	Here are some tips for preparing for the VARC section:
	1. develop an incessant habit of speed reading
	2. Start reading newspapers, editorials, fiction and nonfiction novels
	3. Learn basic grammar concepts\n
	4. Revise at least 50 new words a day
	Your response should be based on the information contained in the provided content in the json and should not included any other sources.
	If you are unable to answer the question from the content provided, please respond " Sorry. I do not have enough information to answer this"
	Do repeat the question. Do not make a pointed reference to the content provided. Directly answer the question
	'''

	def get_meta_score(url,question_embedding):
	qry = index.fetch(ids=[url], namespace="meta")

	emb=qry['vectors'][url]['values']
	vector1 = np.array(emb).reshape(1, -1) # Reshape to ensure compatibility with sklearn
	vector2 = question_embedding.numpy().reshape(1, -1)

	# Calculate cosine similarity
	cosine_scores = util.cos_sim(question_embedding, emb)
	return cosine_scores.item()


	def query_from_pinecone(index,namespace, question_embedding, top_k=3):
	# get embedding from THE SAME embedder as the documents

	return index.query(
	vector=question_embedding,
	top_k=top_k,
	namespace=namespace,
	include_metadata=True # gets the metadata (dates, text, etc)
	).get('matches')

	def response_generator(response):

	for word in response.split():
	yield word + " "
	time.sleep(0.05)

	def write_log(query,url, score, ans, ts):
	# Construct new row
	score = str(score)
	ans = ans.replace('"', '""')
	new_row = f'\n"{query}","{url}",{score},"{ans}","{ts}"'

	with fs.open("datasets/sujitb/data/querylog.csv", "r") as f:
	buffer = f.read()

	# Append the new row to buffer
	buffer += new_row

	# Write the buffer to the file in "W" mode
	with fs.open("datasets/sujitb/data/querylog.csv", "w",encoding="utf-8") as f:
	f.write(buffer)

	return

	logfile='querylog.csv'

	qlist=[
	'What are the best books for VARC in CAT?',
	'What is the XAT exam pattern? How many sections ? How many questions?',
	'I want to know about Personalized coaching for IGSE/IB ',
	'Which IIMs accept admissions under the IPM exam?',
	'What topics are covered under CAT exam syllabus?',
	'For CAT preparation which is better - online classes or classroom program?',
	'What programs are offered under CUET exam by Central University of Jharkhand?',
	'What is the pattern of the IPM exam?',
	'When is the CAT 2024 exam going to be held?',
	'What are program benefits of the MBA 2024 Online Classes?'
	]

	try:
	df_log = pd.read_csv("hf://datasets/sujitb/data/querylog.csv", encoding="utf-8", index_col=0)
	except:
	df_log=pd.DataFrame(columns=['query','url','score','ans', 'ts'])

	#st.title('CLLM Answering Machine')
	st.subheader('CLLM Answering Machine', divider='rainbow')

	with st.sidebar:
	dispstr= 'Search History '+str(len(df_log))
	st.markdown('{}'.format(dispstr))
	#st.write('Past Queries')
	qlist = df_log.tail(30)['query'].tolist()
	for q in qlist[::-1]:
	st.write(q)

	# Initialize chat history
	if "messages" not in st.session_state:
	st.session_state.messages = []
	# Display chat messages from history on app rerun
	for message in st.session_state.messages:
	with st.chat_message(message["role"]):
	st.markdown(message["content"])

	with st.chat_message("user"):
	st.write("Hello 👋")

	QUESTION = st.chat_input('Ask a question -e.g How to prepare for Verbal section for CAT?')


	#QUESTION=st.text_area('Ask a question -e.g How to prepare for Verbal section for CAT?') ##' How to prepare for Verbal section ?'
	score=0
	testing=True


	if QUESTION:

	with st.chat_message("user"):
	st.markdown(QUESTION)
	# Add user message to chat history
	st.session_state.messages.append({"role": "user", "content": QUESTION})

	st.write('Searching knowledge base...')
	question_embedding = bi_encoder.encode(QUESTION, convert_to_tensor=True)

	THRESHOLD=.4
	ns='webpages'
	ns='full'
	resp= query_from_pinecone(index,ns, question_embedding.tolist(), 10)
	resplist=[]
	id=0
	for r in resp:
	id+=1
	d={}
	d['id']=id
	d['content']=r['metadata']['data']
	d['url']=r['id']
	meta_score= get_meta_score(r['id'],question_embedding)
	score=.5* r['score'] + .5*meta_score
	d['score']=score
	#st.write(d['url'], score, r['score'], meta_score)
	resplist.append(d)

	if len(resplist)>0:
	sorted_indices = sorted(range(len(resplist)), key=lambda i: resplist[i]['score'], reverse=True)

	# Get the elements with the top 2 highest values
	top_2 = [resplist[i] for i in sorted_indices[:2]]

	# covert to array

	json_data = json.dumps(top_2)


	goodmatch=False
	if resplist[sorted_indices[0]]['score']>=THRESHOLD:
	st.write('Preparing answers...')
	goodmatch=True
	mode = "two" # two passages

	client = OpenAI()

	if mode=="one":
	instr=system_instructions_text

	out= resplist[sorted_indices[0]]['content']
	content="""
	<text>
	{}
	</text>
	""".format(out)

	if mode=="two":
	instr=json_instructions
	content=json_data

	response = client.chat.completions.create(
	model="gpt-3.5-turbo",
	messages=[
	{"role": "system", "content":instr },
	{"role": "user", "content": content},
	{"role": "user", "content": "Question:"+QUESTION}
	]
	)

	ans= response.choices[0].message.content
	else:
	ans='Weak match to your query. Please try reframing your question'

	#st.write("Matched URL:{} Score:{}".format(url,score))
	testing = False
	if testing:
	if len(resp)>=1:
	st.write("2nd Matched URL:{} Score:{}".format(resp[1]['id'],resp[1]['score']))
	if len(resp)>=2:
	st.write("3rd Matched URL:{} Score:{}".format(resp[2]['id'],resp[2]['score']))
	## Send RESPONSE
	with st.chat_message("assistant"):
	response = st.write_stream(response_generator(ans))
	if goodmatch:
	st.write('Resources:')
	st.write(top_2[0]['url'])
	st.write(top_2[1]['url'])
	# Add assistant response to chat history
	st.session_state.messages.append({"role": "assistant", "content": response})
	#st.write(ans)

	#st.write(' ----------------------')
	#st.write(out)

	now= str(datetime.utcnow())
	url = top_2[0]['url'] + ' ; '+top_2[1]['url']

	df_log.loc[len(df_log)]=[QUESTION,url,score,ans,now]
	#write_log(QUESTION,url, score, ans, now)
	#df.to_csv("hf://datasets/sujitb/data/test.csv")
	storage_options={"token":token}
	df_log.to_csv("hf://datasets/sujitb/data/"+logfile,storage_options= storage_options)

	else: ## Zero response from pinecone query
	#st.write("No matches for query")
	ans= "No matches for query"
	response = st.write_stream(response_generator(ans))
	# Add assistant response to chat history
	st.session_state.messages.append({"role": "assistant", "content": response})

	now= str(datetime.utcnow())
	df_log.loc[len(df_log)]=[QUESTION,'No match',0,'-',now]
	storage_options={"token":token}
	df_log.to_csv("hf://datasets/sujitb/data/"+logfile,storage_options= storage_options)
	#write_log(QUESTION,'No match', 0, '-', now)