sujitb committed on
Commit
99a1d6d
1 Parent(s): 79fed39
Files changed (1) hide show
  1. clqna.py +35 -0
clqna.py CHANGED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+
3
+ from transformers import pipeline
4
+ from pinecone import Pinecone, ServerlessSpec
5
+ from sentence_transformers import SentenceTransformer, util
6
+
7
+
8
# Bi-encoder used to embed user questions. Queries should be embedded with the
# same model that produced the document vectors stored in the index.
bi_encoder = SentenceTransformer('msmarco-distilbert-base-v4')
bi_encoder.max_seq_length = 256  # Truncate long documents to 256 tokens

# Name of the Pinecone index and the namespace queried by this app.
INDEX_NAME = 'cl-search-idx'
NAMESPACE = 'webpages'

# The client was previously never created, so `pc` raised NameError.
# With no explicit api_key the Pinecone client reads the PINECONE_API_KEY
# environment variable.
pc = Pinecone()
index = pc.Index(name=INDEX_NAME)
16
+
17
def query_from_pinecone(index, question_embedding, top_k=3):
    """Return the closest matches for an already-embedded question.

    Args:
        index: Object exposing a Pinecone-style ``query`` method.
        question_embedding: Query vector; it should come from the same
            embedder that was used for the indexed documents.
        top_k: Maximum number of matches to request.

    Returns:
        Whatever the query response holds under ``'matches'``.
    """
    response = index.query(
        vector=question_embedding,
        top_k=top_k,
        namespace=NAMESPACE,
        include_metadata=True,  # also return stored metadata (dates, text, etc)
    )
    return response.get('matches')
26
+
27
+
28
QUESTION = st.text_area('Ask a question -e.g How to prepare for Verbal section for CAT?')

# Streamlit re-executes the script on every interaction and the text area is
# empty until the user types, so skip the search until there is a question.
if not QUESTION.strip():
    st.stop()

question_embedding = bi_encoder.encode(QUESTION, convert_to_tensor=True)

# Pass the index explicitly: the original call omitted it, which shifted the
# embedding into the `index` parameter and 3 into the vector parameter.
resp = query_from_pinecone(index, question_embedding.tolist(), 3)

# Guard against an empty result so resp[0] cannot raise IndexError.
if not resp:
    st.warning('No matching documents found.')
    st.stop()

# Only the top-ranked match is shown.
docresult = resp[0]['metadata']['text']

# NOTE(review): `out` was never defined in the original (NameError). It was
# presumably meant to hold a QA-pipeline answer - confirm. Until then, show
# the question together with the retrieved passage.
out = {'question': QUESTION, 'context': docresult}
st.json(out)
35
+