File size: 1,440 Bytes
589c9b1
074b93b
b403bb0
074b93b
 
 
 
 
 
 
 
 
b403bb0
ed6e9e8
 
074b93b
 
ed6e9e8
074b93b
589c9b1
074b93b
 
 
ed6e9e8
074b93b
 
589c9b1
074b93b
589c9b1
b403bb0
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
import os

import streamlit as st

#from transformers import pipeline
from pinecone import Pinecone, ServerlessSpec
from sentence_transformers import SentenceTransformer, util


# Bi-encoder used to embed BOTH the query (below) and, presumably, the
# documents stored in the index — retrieval only works when the same model
# produced the stored vectors.
bi_encoder = SentenceTransformer('msmarco-distilbert-base-v4')
bi_encoder.max_seq_length = 256     # Truncate long documents to 256 tokens

# Store the index as a variable
INDEX_NAME = 'cl-search-idx'

# SECURITY: an API key was hard-coded here. Prefer the PINECONE_API_KEY
# environment variable; the old literal is kept only as a fallback so existing
# deployments keep working. TODO: rotate the leaked key and drop the fallback.
pc_api_key = os.environ.get(
    'PINECONE_API_KEY',
    '3f916d01-2a69-457d-85eb-966c5d1849a8',  # AWS
)
pc = Pinecone(api_key=pc_api_key)
index = pc.Index(name=INDEX_NAME)

def query_from_pinecone(index,namespace, question_embedding, top_k=3):
    """Return the top-k nearest matches for an embedding from a Pinecone index.

    The embedding must come from the SAME encoder that produced the stored
    document vectors, otherwise similarity scores are meaningless.
    """
    response = index.query(
        vector=question_embedding,
        top_k=top_k,
        namespace=namespace,
        include_metadata=True,  # include stored metadata (dates, text, etc.)
    )
    return response.get('matches')

# Streamlit UI: free-text question box; the search runs whenever text is present.
QUESTION=st.text_area('Ask a question -e.g How to prepare for Verbal section for CAT?')  ##' How to prepare for Verbal section ?'

if QUESTION:
    # Embed the question with the same bi-encoder used for the documents.
    question_embedding = bi_encoder.encode(QUESTION, convert_to_tensor=True)

    ns='full'
    resp= query_from_pinecone(index,ns, question_embedding.tolist(), 3)
    # FIX: Pinecone's .get('matches') can yield None as well as [] — the
    # previous `len(resp) > 0` raised TypeError on None; truthiness covers both.
    if resp:
        out= resp[0]['metadata']['data']
        url= "Matching url "+resp[0]['id']
        #+ '\n*************\n'+  resp[1]['metadata']['text'] + '\n*************\n'+ resp[2]['metadata']['text']
        st.write(url)
        st.write(out)
    else:
        st.write("No matches for query")