Spaces:

transZ
/

sbert_cosine

Sleeping

App Files Files Community

transZ committed on Feb 27, 2023

Commit

59083bb

1 Parent(s): 1167cb4

Testing version

Browse files

Files changed (2) hide show

requirements.txt +2 -1
sbert_cosine.py +52 -11

requirements.txt CHANGED Viewed

	@@ -1 +1,2 @@
1	- git+https://github.com/huggingface/evaluate@main


1	+ git+https://github.com/huggingface/evaluate@main
2	+ torch

sbert_cosine.py CHANGED Viewed

@@ -17,6 +17,7 @@ import evaluate
 import datasets
 import torch
 import torch.nn as nn
 _CITATION = """\
 @article{Reimers2019,
@@ -70,15 +71,25 @@ class sbert_cosine(evaluate.Metric):
             citation=_CITATION,
             inputs_description=_KWARGS_DESCRIPTION,
             # This defines the format of each prediction and reference
-            features=datasets.Features({
-                'predictions': datasets.Value('int64'),
-                'references': datasets.Value('int64'),
-            }),
             # Homepage of the module for documentation
-            homepage="http://module.homepage",
             # Additional links to the codebase or references
-            codebase_urls=["http://github.com/path/to/codebase/of/new_module"],
-            reference_urls=["http://path.to.reference.url/new_module"]
         )
     def _download_and_prepare(self, dl_manager):
@@ -86,10 +97,40 @@ class sbert_cosine(evaluate.Metric):
         # TODO: Download external resources if needed
         pass
-    def _compute(self, predictions, references):
         """Returns the scores"""
-        # TODO: Compute the different scores of the module
-        accuracy = sum(i == j for i, j in zip(predictions, references)) / len(predictions)
         return {
-            "accuracy": accuracy,
         }

 import datasets
 import torch
 import torch.nn as nn
+from transformers import AutoTokenizer, BertModel
 _CITATION = """\
 @article{Reimers2019,
             citation=_CITATION,
             inputs_description=_KWARGS_DESCRIPTION,
             # This defines the format of each prediction and reference
+            features=[
+                datasets.Features(
+                    {
+                        "predictions": datasets.Value("string", id="sequence"),
+                        "references": datasets.Sequence(datasets.Value("string", id="sequence"), id="references"),
+                    }
+                ),
+                datasets.Features(
+                    {
+                        "predictions": datasets.Value("string", id="sequence"),
+                        "references": datasets.Value("string", id="sequence"),
+                    }
+                ),
+            ],
             # Homepage of the module for documentation
+            homepage="http://sbert.net",
             # Additional links to the codebase or references
+            codebase_urls=["https://github.com/UKPLab/sentence-transformers"],
+            reference_urls=["https://github.com/UKPLab/sentence-transformers"]
         )
     def _download_and_prepare(self, dl_manager):
         # TODO: Download external resources if needed
         pass
+    def _compute(self, predictions, references, model_type='sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2'):
         """Returns the scores"""
+        def mean_pooling(model_output, attention_mask):
+            token_embeddings = model_output[0] #First element of model_output contains all token embeddings
+            input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
+            return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)
+        def batch_to_device(batch, target_device: device):
+            """
+            send a pytorch batch to a device (CPU/GPU)
+            """
+            for key in batch:
+                if isinstance(batch[key], torch.Tensor):
+                    batch[key] = batch[key].to(target_device)
+            return batch
+        device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
+        tokenizer = AutoTokenizer.from_pretrained(model_type)
+        model = BertModel.from_pretrained(model_type)
+        model = model.to(device)
+        cosine = nn.CosineSimilarity()
+        def calculate(x: str, y: str):
+            encoded_input = tokenizer([x, y], padding=True, truncation=True, return_tensors='pt')
+            encoded_input = batch_to_device(encode_input, device)
+            model_output = model(**encoded_input)
+            embeds = mean_pooling(model_output, encoded_input['attention_mask'])
+            res = cosine(embeds[0, :], embeds[1, :]).item()
+            return res
+        with torch.no_grad():
+            score = torch.mean([calculate(pred, ref) for pred, ref in zip(predictions, references)]).item()
         return {
+            "score": score,
         }