Nattyboi committed on
Commit 0e5c934 · verified · 1 Parent(s): 93e1386

Create utils.py

Files changed (1)
  1. utils.py +108 -0
utils.py ADDED
@@ -0,0 +1,108 @@
import requests


def google_search(query, api_key, cx):
    # Call the Google Custom Search JSON API; passing params lets requests URL-encode the query.
    url = "https://www.googleapis.com/customsearch/v1"
    params = {"q": query, "key": api_key, "cx": cx}

    response = requests.get(url, params=params)

    if response.status_code == 200:
        search_results = response.json()
        return search_results
    else:
        print(f"Error: {response.status_code}")
        return None

def generate_embedding_for_user_resume(data, user_id):
    from sentence_transformers import SentenceTransformer
    from pinecone import Vector

    # Load the Nomic embedding model (trust_remote_code is required for its custom architecture).
    model = SentenceTransformer("nomic-ai/nomic-embed-text-v1", trust_remote_code=True)

    def get_embedding(data, precision="float32"):
        return model.encode(data, precision=precision)

    def create_docs_with_vector_embeddings(float32_embeddings, data):
        # Pair each embedding with its source text and tag it with the owning user.
        docs = []
        for i, (embedding, text) in enumerate(zip(float32_embeddings, data)):
            doc = Vector(
                id=f"{i}",
                values=embedding.tolist(),
                metadata={"text": text, "user_id": user_id},
            )
            docs.append(doc)
        return docs

    float32_embeddings = get_embedding(data, "float32")
    docs = create_docs_with_vector_embeddings(float32_embeddings, data)
    return docs

def insert_embeddings_into_pinecone_database(doc, api_key, name_space):
    from pinecone import Pinecone

    pc = Pinecone(api_key=api_key)
    index_name = "resumes"
    index = pc.Index(index_name)
    # `doc` is the list of Vector objects produced by generate_embedding_for_user_resume.
    upsert_response = index.upsert(namespace=name_space, vectors=doc)
    return upsert_response

def query_vector_database(query, api_key, name_space):
    from pinecone import Pinecone
    from sentence_transformers import SentenceTransformer

    model = SentenceTransformer("nomic-ai/nomic-embed-text-v1", trust_remote_code=True)
    ret = []
    pc = Pinecone(api_key=api_key)
    index_name = "resumes"
    index = pc.Index(index_name)

    # Embed the query with the same model and precision used at insert time.
    def get_embedding(data, precision="float32"):
        return model.encode(data, precision=precision)

    query_embedding = get_embedding(query, precision="float32")

    response = index.query(
        namespace=name_space,
        vector=query_embedding.tolist(),
        top_k=3,
        include_metadata=True,
    )

    # Return only the stored text of the top matches.
    for doc in response["matches"]:
        ret.append(doc["metadata"]["text"])
    return ret

def delete_vector_namespace(name_space, api_key):
    from pinecone import Pinecone

    pc = Pinecone(api_key=api_key)
    index_name = "resumes"
    index = pc.Index(index_name)
    response = index.delete(delete_all=True, namespace=name_space)
    return response

def split_text_into_chunks(text, chunk_size=400):
    # Split the text into words using whitespace.
    words = text.split()

    # Group the words into chunks of size 'chunk_size'.
    chunks = [" ".join(words[i:i + chunk_size]) for i in range(0, len(words), chunk_size)]
    return chunks
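For reference, a minimal sketch of how these helpers might be chained for a single resume. The index name "resumes" comes from the functions above; the PINECONE_API_KEY environment variable, the user-123 identifier, the resume_text value, the sample question, and the choice of using the user id as the namespace are all placeholder assumptions, not part of this commit.

# Hypothetical usage sketch: chunk a resume, embed and upsert it, then query it back.
import os

from utils import (
    split_text_into_chunks,
    generate_embedding_for_user_resume,
    insert_embeddings_into_pinecone_database,
    query_vector_database,
)

PINECONE_API_KEY = os.environ["PINECONE_API_KEY"]  # assumed env var, not defined in utils.py
user_id = "user-123"                               # placeholder user identifier
resume_text = "..."                                # raw resume text to index

# 1. Break the resume into ~400-word chunks so each vector covers a manageable span.
chunks = split_text_into_chunks(resume_text, chunk_size=400)

# 2. Embed the chunks and wrap them as Pinecone Vector objects tagged with the user id.
docs = generate_embedding_for_user_resume(chunks, user_id)

# 3. Upsert into the "resumes" index, using the user id as the namespace.
insert_embeddings_into_pinecone_database(docs, PINECONE_API_KEY, name_space=user_id)

# 4. Retrieve the three stored chunks most similar to a natural-language question.
matches = query_vector_database("What cloud platforms has this candidate used?", PINECONE_API_KEY, user_id)
print(matches)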