# Streamlit app: semantic search over a Pinecone vector index.
# A question typed by the user is embedded with a sentence-transformers
# bi-encoder and matched against pre-embedded web pages in Pinecone.
import os

import streamlit as st
from pinecone import Pinecone, ServerlessSpec
from sentence_transformers import SentenceTransformer, util
from transformers import pipeline
# Bi-encoder used to embed queries; it MUST be the same model that
# embedded the documents stored in the index, or similarity scores are
# meaningless.
bi_encoder = SentenceTransformer('msmarco-distilbert-base-v4')
bi_encoder.max_seq_length = 256  # Truncate long documents to 256 tokens

# Pinecone index / namespace holding the pre-embedded web pages.
INDEX_NAME = 'cl-search-idx'
NAMESPACE = 'webpages'

# SECURITY FIX: the API key was hard-coded in source (and therefore in
# version control). Read it from Streamlit secrets
# (.streamlit/secrets.toml) or the PINECONE_API_KEY environment variable.
pc_api_key = st.secrets.get('PINECONE_API_KEY', os.environ.get('PINECONE_API_KEY', ''))
pc = Pinecone(api_key=pc_api_key)
index = pc.Index(name=INDEX_NAME)
def query_from_pinecone(index, namespace, question_embedding, top_k=3):
    """Return the top_k nearest matches for a query embedding.

    The embedding must come from the same bi-encoder that produced the
    document vectors. Each match carries its metadata (dates, text, etc.).
    """
    response = index.query(
        vector=question_embedding,
        namespace=namespace,
        top_k=top_k,
        include_metadata=True,
    )
    return response.get('matches')
# --- Streamlit UI -------------------------------------------------------
QUESTION = st.text_area('Ask a question -e.g How to prepare for Verbal section for CAT?')  ##' How to prepare for Verbal section ?'

# Streamlit reruns the whole script on every interaction, so QUESTION is
# empty on first load; skip the query until the user types something.
if QUESTION.strip():
    question_embedding = bi_encoder.encode(QUESTION, convert_to_tensor=True)
    resp = query_from_pinecone(index, NAMESPACE, question_embedding.tolist(), 3)
    # BUG FIX: resp[0] raised IndexError when the index returned no
    # matches; guard before indexing.
    if resp:
        st.write(resp[0]['metadata']['text'])
    else:
        st.write('No matching documents found.')