Spaces:
Sleeping
Sleeping
Testing version
Browse files- requirements.txt +2 -1
- sbert_cosine.py +52 -11
requirements.txt
CHANGED
@@ -1 +1,2 @@
|
|
1 |
-
git+https://github.com/huggingface/evaluate@main
|
|
|
|
1 |
+
git+https://github.com/huggingface/evaluate@main
|
2 |
+
torch
|
sbert_cosine.py
CHANGED
@@ -17,6 +17,7 @@ import evaluate
|
|
17 |
import datasets
|
18 |
import torch
|
19 |
import torch.nn as nn
|
|
|
20 |
|
21 |
_CITATION = """\
|
22 |
@article{Reimers2019,
|
@@ -70,15 +71,25 @@ class sbert_cosine(evaluate.Metric):
|
|
70 |
citation=_CITATION,
|
71 |
inputs_description=_KWARGS_DESCRIPTION,
|
72 |
# This defines the format of each prediction and reference
|
73 |
-
features=
|
74 |
-
|
75 |
-
|
76 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
77 |
# Homepage of the module for documentation
|
78 |
-
homepage="http://
|
79 |
# Additional links to the codebase or references
|
80 |
-
codebase_urls=["
|
81 |
-
reference_urls=["
|
82 |
)
|
83 |
|
84 |
def _download_and_prepare(self, dl_manager):
|
@@ -86,10 +97,40 @@ class sbert_cosine(evaluate.Metric):
|
|
86 |
# TODO: Download external resources if needed
|
87 |
pass
|
88 |
|
89 |
-
def _compute(self, predictions, references):
|
90 |
"""Returns the scores"""
|
91 |
-
|
92 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
93 |
return {
|
94 |
-
"
|
95 |
}
|
|
|
17 |
import datasets
|
18 |
import torch
|
19 |
import torch.nn as nn
|
20 |
+
from transformers import AutoTokenizer, BertModel
|
21 |
|
22 |
_CITATION = """\
|
23 |
@article{Reimers2019,
|
|
|
71 |
citation=_CITATION,
|
72 |
inputs_description=_KWARGS_DESCRIPTION,
|
73 |
# This defines the format of each prediction and reference
|
74 |
+
features=[
|
75 |
+
datasets.Features(
|
76 |
+
{
|
77 |
+
"predictions": datasets.Value("string", id="sequence"),
|
78 |
+
"references": datasets.Sequence(datasets.Value("string", id="sequence"), id="references"),
|
79 |
+
}
|
80 |
+
),
|
81 |
+
datasets.Features(
|
82 |
+
{
|
83 |
+
"predictions": datasets.Value("string", id="sequence"),
|
84 |
+
"references": datasets.Value("string", id="sequence"),
|
85 |
+
}
|
86 |
+
),
|
87 |
+
],
|
88 |
# Homepage of the module for documentation
|
89 |
+
homepage="http://sbert.net",
|
90 |
# Additional links to the codebase or references
|
91 |
+
codebase_urls=["https://github.com/UKPLab/sentence-transformers"],
|
92 |
+
reference_urls=["https://github.com/UKPLab/sentence-transformers"]
|
93 |
)
|
94 |
|
95 |
def _download_and_prepare(self, dl_manager):
|
|
|
97 |
# TODO: Download external resources if needed
|
98 |
pass
|
99 |
|
100 |
+
def _compute(self, predictions, references, model_type='sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2'):
    """Return the mean cosine similarity between sentence embeddings.

    Each prediction is paired with the reference at the same position. Both
    sentences are encoded with the ``model_type`` checkpoint, token
    embeddings are mean-pooled (padding positions excluded through the
    attention mask), and the two sentence vectors are compared with cosine
    similarity. The reported score is the arithmetic mean over all pairs.

    Args:
        predictions: Sequence of predicted sentences.
        references: Sequence of reference sentences aligned with
            ``predictions``.
            NOTE(review): every pair is tokenized as two plain strings, so
            a list of several references per prediction is not handled
            here -- confirm against the feature layouts the metric declares.
        model_type: Hugging Face Hub id (or local path) of the encoder and
            its tokenizer.

    Returns:
        dict: ``{"score": float}``. The score is ``nan`` when there are no
        prediction/reference pairs to compare.
    """
    # Function-scope import keeps this method self-contained. AutoModel picks
    # the architecture from the checkpoint config, so `model_type` is not
    # restricted to BERT-based checkpoints.
    from transformers import AutoModel, AutoTokenizer

    def mean_pooling(model_output, attention_mask):
        # First element of model_output contains all token embeddings.
        token_embeddings = model_output[0]
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
        summed = torch.sum(token_embeddings * input_mask_expanded, 1)
        # Clamp the token count so an all-padding row cannot divide by zero.
        counts = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
        return summed / counts

    def batch_to_device(batch, target_device: torch.device):
        """Send every tensor of a tokenizer batch to a device (CPU/GPU)."""
        for key in batch:
            if isinstance(batch[key], torch.Tensor):
                batch[key] = batch[key].to(target_device)
        return batch

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    tokenizer = AutoTokenizer.from_pretrained(model_type)
    model = AutoModel.from_pretrained(model_type)
    model = model.to(device)
    # The two pooled sentence vectors are 1-D, so compare along dim 0
    # (the module default, dim=1, is out of range for 1-D inputs).
    cosine = nn.CosineSimilarity(dim=0)

    def calculate(x: str, y: str) -> float:
        encoded_input = tokenizer([x, y], padding=True, truncation=True, return_tensors='pt')
        encoded_input = batch_to_device(encoded_input, device)
        model_output = model(**encoded_input)
        embeds = mean_pooling(model_output, encoded_input['attention_mask'])
        return cosine(embeds[0, :], embeds[1, :]).item()

    # Inference only: no gradients are needed for scoring.
    with torch.no_grad():
        scores = [calculate(pred, ref) for pred, ref in zip(predictions, references)]

    # `calculate` returns Python floats, so average them directly.
    score = sum(scores) / len(scores) if scores else float("nan")

    return {
        "score": score,
    }
|