import os

import streamlit as st
#from transformers import pipeline
from pinecone import Pinecone, ServerlessSpec
from sentence_transformers import SentenceTransformer, util


@st.cache_resource
def _load_bi_encoder():
    """Build the sentence encoder once and reuse it across Streamlit reruns.

    Streamlit re-executes this script on every interaction; without caching
    the model would be re-instantiated for every question asked.
    """
    encoder = SentenceTransformer('msmarco-distilbert-base-v4')
    encoder.max_seq_length = 256  # Truncate long documents to 256 tokens
    return encoder


bi_encoder = _load_bi_encoder()

# Store the index as a variable
INDEX_NAME = 'cl-search-idx'

# NOTE(review): an API key is committed in source. The PINECONE_API_KEY
# environment variable now takes precedence; the literal below is kept only as
# a backward-compatible fallback and should be rotated and removed.
pc_api_key = os.environ.get(
    'PINECONE_API_KEY',
    '3f916d01-2a69-457d-85eb-966c5d1849a8',  # AWS
)

pc = Pinecone(api_key=pc_api_key)
index = pc.Index(name=INDEX_NAME)


def query_from_pinecone(index, namespace, question_embedding, top_k=3):
    """Query ``index`` for the vectors closest to ``question_embedding``.

    The embedding must come from THE SAME embedder that was used for the
    indexed documents.

    Args:
        index: Object exposing a ``query`` method (here, the Pinecone index).
        namespace: Namespace within the index to search.
        question_embedding: Query vector, passed through as ``vector``.
        top_k: Maximum number of matches to request.

    Returns:
        Whatever the query response holds under ``'matches'``.
    """
    response = index.query(
        vector=question_embedding,
        top_k=top_k,
        namespace=namespace,
        include_metadata=True,  # gets the metadata (dates, text, etc)
    )
    return response.get('matches')


QUESTION = st.text_area('Ask a question -e.g How to prepare for Verbal section for CAT?')  ##' How to prepare for Verbal section ?'

# Skip empty / whitespace-only input: nothing meaningful to embed or search.
if QUESTION and QUESTION.strip():
    question_embedding = bi_encoder.encode(QUESTION, convert_to_tensor=True)

    ns = 'full'
    resp = query_from_pinecone(index, ns, question_embedding.tolist(), 3)

    # Truthiness check covers both an empty match list and a missing
    # 'matches' entry (None), where len() would raise TypeError.
    if resp:
        best_match = resp[0]
        out = best_match['metadata']['data']
        url = "Matching url " + best_match['id']
        #+ '\n*************\n'+ resp[1]['metadata']['text'] + '\n*************\n'+ resp[2]['metadata']['text']
        st.write(url)
        st.write(out)
    else:
        st.write("No matches for query")