"""Streamlit app: answer a question via semantic search over a Pinecone index.

The question typed by the user is embedded with a sentence-transformers
bi-encoder, the embedding is used to query a Pinecone index, and the text
stored in the metadata of the best match is displayed.

Configuration:
    PINECONE_API_KEY (environment variable): API key used to connect to
        Pinecone. It must never be committed to source control.
"""
import os

import streamlit as st
from transformers import pipeline
from pinecone import Pinecone, ServerlessSpec
from sentence_transformers import SentenceTransformer, util

# Name of the Pinecone index and the namespace that is queried inside it.
INDEX_NAME = 'cl-search-idx'
NAMESPACE = 'webpages'

ENCODER_NAME = 'msmarco-distilbert-base-v4'
MAX_SEQ_LENGTH = 256  # Truncate long documents to 256 tokens
TOP_K = 3  # number of matches requested from Pinecone per question


@st.cache_resource
def _load_encoder():
    """Build the bi-encoder once; Streamlit reruns the script on every interaction."""
    encoder = SentenceTransformer(ENCODER_NAME)
    encoder.max_seq_length = MAX_SEQ_LENGTH
    return encoder


@st.cache_resource
def _connect(api_key):
    """Create the Pinecone client and index handle once per API key."""
    client = Pinecone(api_key=api_key)
    return client, client.Index(name=INDEX_NAME)


def query_from_pinecone(index, namespace, question_embedding, top_k=3):
    """Return the best matches for an embedded question.

    Args:
        index: Pinecone index handle to query.
        namespace: Namespace inside the index to search.
        question_embedding: Query vector as a plain list of floats. It must
            come from THE SAME embedder as the indexed documents.
        top_k: Maximum number of matches to request.

    Returns:
        The ``matches`` entry of the query response (metadata included), or
        an empty list when the response carries no matches.
    """
    response = index.query(
        vector=question_embedding,
        top_k=top_k,
        namespace=namespace,
        include_metadata=True,  # gets the metadata (dates, text, etc)
    )
    # Normalise a missing/None "matches" entry so callers can test emptiness.
    return response.get('matches') or []


# The key is read from the environment instead of being hard-coded in the file.
pc_api_key = os.environ.get('PINECONE_API_KEY')
if not pc_api_key:
    st.error('PINECONE_API_KEY environment variable is not set.')
    st.stop()

bi_encoder = _load_encoder()
pc, index = _connect(pc_api_key)

QUESTION = st.text_area('Ask a question -e.g How to prepare for Verbal section for CAT?')

# The text area is empty on first render; do not embed/query an empty string.
if QUESTION and QUESTION.strip():
    question_embedding = bi_encoder.encode(QUESTION, convert_to_tensor=True)
    resp = query_from_pinecone(index, NAMESPACE, question_embedding.tolist(), TOP_K)
    if resp:
        # Only the text of the top-ranked match is displayed.
        out = resp[0]['metadata']['text']
        st.write(out)
    else:
        st.warning('No matching results found.')