transZ committed on
Commit 59083bb · 1 Parent(s): 1167cb4

Testing version

Files changed (2)
  1. requirements.txt +2 -1
  2. sbert_cosine.py +52 -11
requirements.txt CHANGED
@@ -1 +1,2 @@
-git+https://github.com/huggingface/evaluate@main
+git+https://github.com/huggingface/evaluate@main
+torch
sbert_cosine.py CHANGED
@@ -17,6 +17,7 @@ import evaluate
 import datasets
 import torch
 import torch.nn as nn
+from transformers import AutoTokenizer, BertModel
 
 _CITATION = """\
 @article{Reimers2019,
@@ -70,15 +71,25 @@ class sbert_cosine(evaluate.Metric):
             citation=_CITATION,
             inputs_description=_KWARGS_DESCRIPTION,
             # This defines the format of each prediction and reference
-            features=datasets.Features({
-                'predictions': datasets.Value('int64'),
-                'references': datasets.Value('int64'),
-            }),
+            features=[
+                datasets.Features(
+                    {
+                        "predictions": datasets.Value("string", id="sequence"),
+                        "references": datasets.Sequence(datasets.Value("string", id="sequence"), id="references"),
+                    }
+                ),
+                datasets.Features(
+                    {
+                        "predictions": datasets.Value("string", id="sequence"),
+                        "references": datasets.Value("string", id="sequence"),
+                    }
+                ),
+            ],
             # Homepage of the module for documentation
-            homepage="http://module.homepage",
+            homepage="http://sbert.net",
             # Additional links to the codebase or references
-            codebase_urls=["http://github.com/path/to/codebase/of/new_module"],
-            reference_urls=["http://path.to.reference.url/new_module"]
+            codebase_urls=["https://github.com/UKPLab/sentence-transformers"],
+            reference_urls=["https://github.com/UKPLab/sentence-transformers"]
         )
 
     def _download_and_prepare(self, dl_manager):
@@ -86,10 +97,40 @@ class sbert_cosine(evaluate.Metric):
         # TODO: Download external resources if needed
         pass
 
-    def _compute(self, predictions, references):
+    def _compute(self, predictions, references, model_type='sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2'):
         """Returns the scores"""
-        # TODO: Compute the different scores of the module
-        accuracy = sum(i == j for i, j in zip(predictions, references)) / len(predictions)
+        def mean_pooling(model_output, attention_mask):
+            token_embeddings = model_output[0]  # First element of model_output contains all token embeddings
+            input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
+            return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)
+
+        def batch_to_device(batch, target_device: torch.device):
+            """
+            Send a PyTorch batch to a device (CPU/GPU).
+            """
+            for key in batch:
+                if isinstance(batch[key], torch.Tensor):
+                    batch[key] = batch[key].to(target_device)
+            return batch
+
+        device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
+
+        tokenizer = AutoTokenizer.from_pretrained(model_type)
+        model = BertModel.from_pretrained(model_type)
+        model = model.to(device)
+        cosine = nn.CosineSimilarity(dim=0)
+
+        def calculate(x: str, y: str):
+            encoded_input = tokenizer([x, y], padding=True, truncation=True, return_tensors='pt')
+            encoded_input = batch_to_device(encoded_input, device)
+            model_output = model(**encoded_input)
+            embeds = mean_pooling(model_output, encoded_input['attention_mask'])
+            res = cosine(embeds[0, :], embeds[1, :]).item()
+            return res
+
+        with torch.no_grad():
+            score = torch.tensor([calculate(pred, ref) for pred, ref in zip(predictions, references)]).mean().item()
+
         return {
-            "accuracy": accuracy,
+            "score": score,
         }
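
For reference, a minimal usage sketch of the metric this commit introduces. The module id "transZ/sbert_cosine" and the example sentences are assumptions for illustration, not part of the commit; model_type defaults to the multilingual MiniLM checkpoint used in _compute above.

import evaluate

# Assumed module id for illustration; loads the metric script added in this commit.
sbert_cosine = evaluate.load("transZ/sbert_cosine")

# Predictions and references are plain strings, matching the Features defined above.
results = sbert_cosine.compute(
    predictions=["The cat sits on the mat."],
    references=["A cat is sitting on the mat."],
    model_type="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2",
)
print(results)  # {'score': <mean pairwise cosine similarity>}

As implemented in _compute, the metric mean-pools token embeddings over the attention mask and reports the average cosine similarity across prediction/reference pairs as a single "score".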