File size: 1,278 Bytes
589c9b1
074b93b
589c9b1
074b93b
 
 
 
 
 
 
 
 
 
ed6e9e8
 
074b93b
 
ed6e9e8
074b93b
589c9b1
074b93b
 
 
ed6e9e8
074b93b
 
589c9b1
 
074b93b
 
ed6e9e8
074b93b
 
589c9b1
074b93b
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
import os

import streamlit as st
from pinecone import Pinecone, ServerlessSpec
from sentence_transformers import SentenceTransformer, util
from transformers import pipeline


# Bi-encoder used to embed queries; it MUST be the same model that embedded
# the stored documents, or similarity scores are meaningless.
bi_encoder = SentenceTransformer('msmarco-distilbert-base-v4')
bi_encoder.max_seq_length = 256     # Truncate long documents to 256 tokens

# Pinecone index configuration.
INDEX_NAME = 'cl-search-idx'
NAMESPACE = 'webpages'
# SECURITY: read the key from the environment; the literal fallback keeps
# backward compatibility, but this key has been committed to source control
# and should be rotated immediately.
pc_api_key = os.environ.get('PINECONE_API_KEY', '3f916d01-2a69-457d-85eb-966c5d1849a8')  # AWS
pc = Pinecone(api_key=pc_api_key)
index = pc.Index(name=INDEX_NAME)

def query_from_pinecone(index, namespace, question_embedding, top_k=3):
    """Return the ``top_k`` nearest matches for a query embedding.

    ``question_embedding`` must come from the SAME encoder that embedded
    the stored documents, otherwise similarity scores are not comparable.
    Returns the ``matches`` list from the Pinecone response (or ``None``
    if the response has no such key).
    """
    response = index.query(
        vector=question_embedding,
        namespace=namespace,
        top_k=top_k,
        include_metadata=True,  # include stored metadata (dates, text, etc.)
    )
    return response.get('matches')


# --- Streamlit UI: ask a question, embed it, retrieve the best match ---
QUESTION = st.text_area('Ask a question -e.g How to prepare for Verbal section for CAT?')  ##' How to prepare for Verbal section ?'

# Only embed and query once the user has actually typed something; the
# original encoded (and queried with) the empty string on first render.
if QUESTION:
    question_embedding = bi_encoder.encode(QUESTION, convert_to_tensor=True)
    resp = query_from_pinecone(index, NAMESPACE, question_embedding.tolist(), 3)
    if resp:
        # Top-ranked document only; concatenation of the top 3 was left
        # commented out in the original.
        docresult = resp[0]['metadata']['text']
        # BUG FIX: the original called st.json(out) where `out` was never
        # defined, raising NameError at runtime. Render the retrieved text.
        st.json(docresult)
    else:
        # Empty match list would otherwise raise IndexError on resp[0].
        st.warning('No matching documents found.')