"""Streamlit chat app that answers questions from a Pinecone-indexed corpus.

Flow for each question typed into the chat box:
1. Embed the question with a SentenceTransformer bi-encoder.
2. Fetch the top matches from the Pinecone index.
3. If the best match scores above 0.5, ask an OpenAI chat model to extract
   the answer from the matched text; otherwise reply with a "weak match"
   message.
4. Stream the answer into the chat and append a row to a query log stored
   through ``HfFileSystem``.
"""
import os
import time
from datetime import datetime, timezone

import pandas as pd
import streamlit as st
from huggingface_hub import HfFileSystem
from openai import OpenAI
from pinecone import Pinecone, ServerlessSpec
from sentence_transformers import SentenceTransformer, util

# Token for the Hugging Face file system that stores the query log.
# It is a secret: never echo it to the page.
token = os.getenv('FILE_TOKEN')
fs = HfFileSystem(token=token)

api_key = os.environ["OPENAI_API_KEY"]


@st.cache_resource
def _load_bi_encoder():
    """Build the question encoder once and reuse it across script reruns."""
    encoder = SentenceTransformer('msmarco-distilbert-base-v4')
    encoder.max_seq_length = 256  # Truncate long documents to 256 tokens
    return encoder


bi_encoder = _load_bi_encoder()

# Store the index as a variable
INDEX_NAME = 'cl-search-idx'

pc_api_key = os.environ["clpine"]  # AWS
pc = Pinecone(api_key=pc_api_key)
index = pc.Index(name=INDEX_NAME)

# Path (inside the Hugging Face file system) of the persistent query log.
REMOTE_LOG_PATH = "datasets/sujitb/data/querylog.csv"

# NOTE(review): the sentence "enclosed within the delimiter tags and" names no
# tags, and the user message built below wraps the text in nothing but spaces;
# the tag names look like they were lost at some point -- confirm and restore.
# NOTE(review): "Do repeat the question." may have been meant as
# "Do not repeat the question." -- confirm before changing the prompt.
system_instructions_text = '''
Your task is to extract the answer to a question from a body of text provided to you.
The body of text will be enclosed within the delimiter tags and

For example,

General Preparation Tips for VARC Section:
You need to develop an incessant habit of speed reading.
Start with reading newspapers, editorials, fiction and nonfiction novels and simple passages.
The more you read, the faster you read. Learn the basic grammar concepts like
parts of speech, articles,verbs, adjectives, tenses, auxiliary verbs, modifiers, modals etc.
Revise at least 50 new words every day

Question: What are some tips for preparing for VARC?
Here are some tips for preparing for the VARC section:
1. develop an incessant habit of speed reading
2. Start reading newspapers, editorials, fiction and nonfiction novels
3. Learn basic grammar concepts\n
4. Revise at least 50 new words a day

Question: How many new words are to be learnt in a day?
It is advised that 50 new words are learn every day

Your response should be based on the information contained in the provided text and should not included any other sources.
If you are unable to answer the question from the text provided, please respond " Sorry. I do not have enough information to answer this"
Do repeat the question.
Do not make a pointed reference to the text provided.
Directly answer the question
'''

logfile = 'querylog.csv'
try:
    df_log = pd.read_csv(logfile, index_col=0)
except (OSError, ValueError):
    # Missing/unreadable file (OSError) or empty/malformed CSV (pandas raises
    # ValueError subclasses): start with an empty in-memory log instead.
    df_log = pd.DataFrame(columns=['query', 'url', 'score', 'ans', 'ts'])


def query_from_pinecone(index, namespace, question_embedding, top_k=3):
    """Return the ``matches`` entry of a Pinecone query for the embedding.

    Args:
        index: Pinecone index handle to query.
        namespace: Namespace inside the index to search.
        question_embedding: Query vector; it must come from THE SAME embedder
            that was used for the indexed documents.
        top_k: Number of nearest matches to request.

    Returns:
        Whatever the query response holds under ``'matches'`` (``None`` if the
        key is absent).
    """
    return index.query(
        vector=question_embedding,
        top_k=top_k,
        namespace=namespace,
        include_metadata=True,  # gets the metadata (dates, text, etc)
    ).get('matches')


def response_generator(response):
    """Yield *response* one word at a time, pausing 50 ms after each word.

    Used with ``st.write_stream`` to give a typing effect.
    """
    for word in response.split():
        yield word + " "
        time.sleep(0.05)


def _csv_quote(value):
    """Return *value* as a double-quoted CSV field with inner quotes doubled."""
    return '"' + str(value).replace('"', '""') + '"'


def _utc_timestamp():
    """Return the current UTC time as a naive ``YYYY-MM-DD HH:MM:SS.ffffff`` string."""
    # Same text that str(datetime.utcnow()) produced, without the deprecated call.
    return str(datetime.now(timezone.utc).replace(tzinfo=None))


def write_log(query, url, score, ans, ts):
    """Append one row to the remote query log.

    The log is read in full, the new row is appended in memory, and the whole
    buffer is written back.

    Args:
        query: The user's question.
        url: Identifier/URL of the matched document.
        score: Similarity score of the match (written unquoted).
        ans: Answer shown to the user.
        ts: Timestamp string.
    """
    # Quote every text field, not just the answer, so a double quote typed by
    # the user cannot break the row.
    new_row = "\n" + ",".join(
        [_csv_quote(query), _csv_quote(url), str(score), _csv_quote(ans), _csv_quote(ts)]
    )

    with fs.open(REMOTE_LOG_PATH, "r", encoding="utf-8") as f:
        buffer = f.read()

    # Append the new row to buffer
    buffer += new_row

    # Write the buffer to the file in "W" mode
    with fs.open(REMOTE_LOG_PATH, "w", encoding="utf-8") as f:
        f.write(buffer)


st.title('CLLM Answering Machine')

# Initialize chat history
if "messages" not in st.session_state:
    st.session_state.messages = []

# Display chat messages from history on app rerun
for message in st.session_state.messages:
    with st.chat_message(message["role"]):
        st.markdown(message["content"])

QUESTION = st.chat_input('Ask a question -e.g How to prepare for Verbal section for CAT?')

score = 0
# Set to True to also print the 2nd and 3rd matches for debugging.
testing = False

if QUESTION:
    with st.chat_message("user"):
        st.markdown(QUESTION)
    # Add user message to chat history
    st.session_state.messages.append({"role": "user", "content": QUESTION})

    question_embedding = bi_encoder.encode(QUESTION, convert_to_tensor=True)

    ns = 'full'
    resp = query_from_pinecone(index, ns, question_embedding.tolist(), 3)

    if resp:  # also covers a missing 'matches' entry (None)
        best = resp[0]
        out = best['metadata']['data']
        # NOTE(review): an earlier lookup of metadata['url'] was always
        # overwritten by the match id, so only the id is used here.
        url = best['id']
        score = best['score']

        if score > .5:
            client = OpenAI(api_key=api_key)
            content = """ {} """.format(out)

            response = client.chat.completions.create(
                model="gpt-3.5-turbo",
                messages=[
                    {"role": "system", "content": system_instructions_text},
                    {"role": "user", "content": content},
                    {"role": "user", "content": "Question:" + QUESTION},
                ],
            )
            ans = response.choices[0].message.content
        else:
            ans = 'Weak match to your query. Please try reframing your question'

        if testing:
            # Only index into matches that actually exist.
            if len(resp) > 1:
                st.write("2nd Matched URL:{} Score:{}".format(resp[1]['id'], resp[1]['score']))
            if len(resp) > 2:
                st.write("3rd Matched URL:{} Score:{}".format(resp[2]['id'], resp[2]['score']))

        with st.chat_message("assistant"):
            response = st.write_stream(response_generator(ans))
        # Add assistant response to chat history
        st.session_state.messages.append({"role": "assistant", "content": response})

        now = _utc_timestamp()
        df_log.loc[len(df_log)] = [QUESTION, url, score, ans, now]
        write_log(QUESTION, url, score, ans, now)
    else:
        ans = "No matches for query"
        with st.chat_message("assistant"):
            response = st.write_stream(response_generator(ans))
        # Add assistant response to chat history
        st.session_state.messages.append({"role": "assistant", "content": response})

        now = _utc_timestamp()
        df_log.loc[len(df_log)] = [QUESTION, 'No match', 0, '-', now]
        # No document matched, so there is no url to record.
        write_log(QUESTION, 'No match', score, ans, now)