Update README.md
README.md CHANGED
````diff
@@ -12,30 +12,33 @@ This model focuses on retrieval tasks while also performing well on various task
 from transformers import AutoTokenizer, AutoModel
 import torch
 # Sentences we want sentence embeddings for
+token=""
 sentences = ["this is a test sentence", "this is another test sentence"]

 # Prefixing for retrieval tasks
 instruction = "Represent this sentence for searching relevant passages: "

 # Load model from HuggingFace Hub
-tokenizer = AutoTokenizer.from_pretrained('Marqo/marqo-merged-bge-gist-gte-base')
-model = AutoModel.from_pretrained('Marqo/marqo-merged-bge-gist-gte-base')
+tokenizer = AutoTokenizer.from_pretrained('Marqo/marqo-merged-bge-gist-gte-base', token=token)
+model = AutoModel.from_pretrained('Marqo/marqo-merged-bge-gist-gte-base', token=token)
 model.eval()

 # Tokenize sentences
 encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')
-encoded_input_with_prefixing = tokenizer([instruction + q for q in
+encoded_input_with_prefixing = tokenizer([instruction + q for q in sentences], padding=True, truncation=True, return_tensors='pt')

 # Compute token embeddings
 with torch.no_grad():
     model_output = model(**encoded_input)
     model_output_with_prefixing = model(**encoded_input_with_prefixing)
-
-
-
+    sentence_embeddings = model_output[0][:, 0]
+    sentence_embeddings_with_prefixing = model_output_with_prefixing[0][:, 0]
+
+sentence_embeddings_avg = (sentence_embeddings + sentence_embeddings_with_prefixing) / 2
+
 # normalize embeddings
-
-print("Sentence embeddings:",
+sentence_embeddings_avg = torch.nn.functional.normalize(sentence_embeddings_avg, p=2, dim=1)
+print("Sentence embeddings:", sentence_embeddings_avg)
 ```
 ## Evaluation
 <img src="slerp.png" alt="109M models retrieval benchmarks" width="650" style="margin-left:'auto' margin-right:'auto' display:'block'"/>
````
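For retrieval, the updated snippet can be wrapped in a small helper. The sketch below is ours, not from the model card: the `embed` function and the sample query/passages are illustrative, and it assumes the prefix-then-average recipe shown in the diff is the intended inference path (queries and passages are embedded the same way here). It reuses `tokenizer`, `model`, and `instruction` from the snippet above.

```python
import torch

# Hypothetical convenience wrapper around the updated README snippet.
def embed(texts):
    enc = tokenizer(texts, padding=True, truncation=True, return_tensors='pt')
    enc_prefixed = tokenizer([instruction + t for t in texts],
                             padding=True, truncation=True, return_tensors='pt')
    with torch.no_grad():
        # CLS-token pooling: first token of the last hidden state.
        emb = model(**enc)[0][:, 0]
        emb_prefixed = model(**enc_prefixed)[0][:, 0]
    # Average the unprefixed and prefixed embeddings, then L2-normalize.
    return torch.nn.functional.normalize((emb + emb_prefixed) / 2, p=2, dim=1)

query_emb = embed(["what is vector search?"])
passage_embs = embed(["Vector search ranks documents by embedding similarity.",
                      "Bananas are rich in potassium."])
# With unit-norm vectors, the dot product equals cosine similarity;
# higher scores mean more relevant passages.
print((query_emb @ passage_embs.T).squeeze(0))
```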
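The figure's filename (`slerp.png`) suggests the merged checkpoint was produced with SLERP (spherical linear interpolation) of the parent models' weights. As a reference for that technique only, here is a minimal SLERP sketch; it is our illustration, not the actual merge script, and it assumes a simple two-model, per-tensor merge.

```python
import torch

def slerp(w_a: torch.Tensor, w_b: torch.Tensor, t: float = 0.5,
          eps: float = 1e-8) -> torch.Tensor:
    """Spherical linear interpolation between two weight tensors."""
    a, b = w_a.flatten().float(), w_b.flatten().float()
    # Angle between the two flattened weight vectors.
    cos = torch.clamp(torch.dot(a, b) / (a.norm() * b.norm() + eps), -1.0, 1.0)
    theta = torch.acos(cos)
    if theta.abs() < eps:
        # Nearly parallel weights: fall back to plain linear interpolation.
        out = (1 - t) * a + t * b
    else:
        out = (torch.sin((1 - t) * theta) * a
               + torch.sin(t * theta) * b) / torch.sin(theta)
    return out.view_as(w_a)

# Hypothetical usage: sd_a and sd_b would be the parent models' state dicts.
# merged = {name: slerp(sd_a[name], sd_b[name]) for name in sd_a}
```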