Elron committed on
Commit 1e4984f · verified · 1 Parent(s): b4ab559

Upload metrics.py with huggingface_hub

Files changed (1)
  1. metrics.py +273 -141
metrics.py CHANGED
@@ -1,4 +1,3 @@
1
- import itertools
2
  import re
3
  import string
4
  import uuid
@@ -30,7 +29,7 @@ from .operators import CopyFields
30
  from .random_utils import get_seed
31
  from .settings_utils import get_settings
32
  from .stream import MultiStream, Stream
33
- from .type_utils import isoftype, to_float_or_default
34
 
35
  logger = get_logger()
36
  settings = get_settings()
@@ -75,6 +74,86 @@ class Metric(Artifact):
75
  def main_score(self):
76
  pass
77
 
78
  def consume_stream(self, stream: Stream):
79
  references = []
80
  predictions = []
@@ -335,6 +414,8 @@ class GlobalMetric(SingleStreamOperator, MetricWithConfidenceInterval):
335
  n_resamples: int = OptionalField(
336
  default_factory=lambda: settings.num_resamples_for_global_metrics
337
  )
338
  process_single_instances = True
339
 
340
  def process(self, stream: Stream, stream_name: Optional[str] = None) -> Generator:
@@ -385,6 +466,7 @@ class GlobalMetric(SingleStreamOperator, MetricWithConfidenceInterval):
385
  instance_score[self.main_score] = no_score_value
386
 
387
  instance["score"]["instance"].update(instance_score)
388
 
389
  result = self._compute(references, predictions, task_data)
390
 
@@ -459,7 +541,7 @@ class BulkInstanceMetric(SingleStreamOperator, MetricWithConfidenceInterval):
459
  instance["task_data"] if "task_data" in instance else {}
460
  for instance in stream
461
  ]
462
-
463
  # compute the metric over all refs and preds
464
  instance_scores = self.compute(
465
  references=references,
@@ -724,6 +806,8 @@ class InstanceMetric(SingleStreamOperator, MetricWithConfidenceInterval):
724
 
725
  for instance in stream:
726
  refs, pred = instance["references"], instance["prediction"]
727
  task_data = instance["task_data"] if "task_data" in instance else {}
728
 
729
  instance_score = self.compute(
@@ -837,42 +921,13 @@ class InstanceMetric(SingleStreamOperator, MetricWithConfidenceInterval):
837
  pass
838
 
839
 
840
- class Squad(GlobalMetric):
841
- _metric = None
842
- main_score = "f1"
843
- metric = "squad"
844
-
845
- def prepare(self):
846
- super().prepare()
847
- self._metric = evaluate.load(self.metric)
848
-
849
- def compute(
850
- self,
851
- references: List[List[str]],
852
- predictions: List[str],
853
- task_data: List[Dict],
854
- ) -> dict:
855
- ids = [str(uuid.uuid4()).replace("-", "") for _ in range(len(predictions))]
856
- formatted_predictions = [
857
- {"prediction_text": prediction, "id": ids[i]}
858
- for i, prediction in enumerate(predictions)
859
- ]
860
- formatted_references = [
861
- {"answers": {"answer_start": [-1], "text": reference}, "id": ids[i]}
862
- for i, reference in enumerate(references)
863
- ]
864
-
865
- return self._metric.compute(
866
- predictions=formatted_predictions,
867
- references=formatted_references,
868
- )
869
-
870
-
871
  class Accuracy(InstanceMetric):
872
  reduction_map = {"mean": ["accuracy"]}
873
  main_score = "accuracy"
874
  ci_scores = ["accuracy"]
875
 
876
  def compute(
877
  self, references: List[Any], prediction: Any, task_data: List[Dict]
878
  ) -> dict:
@@ -886,11 +941,28 @@ class Accuracy(InstanceMetric):
886
  return result
887
 
888
 
889
  class StringContainment(InstanceMetric):
890
  reduction_map = {"mean": ["string_containment"]}
891
  main_score = "string_containment"
892
  ci_scores = ["string_containment"]
893
 
894
  def compute(
895
  self, references: List[Any], prediction: Any, task_data: List[Dict]
896
  ) -> dict:
@@ -1005,7 +1077,7 @@ class HuggingfaceMetric(GlobalMetric):
1005
 
1006
  passed_task_data[additional_input_field] = next(iter(values))
1007
 
1008
- # add check that all required fields in self.metrics are in passed_task_data print(passed_task_data)
1009
  result = self.metric.compute(
1010
  predictions=predictions,
1011
  references=references,
@@ -1087,6 +1159,9 @@ class F1(GlobalMetric):
1087
  average = None # Report per class then aggregate by mean
1088
  metric = "f1"
1089
 
1090
  def prepare(self):
1091
  super().prepare()
1092
  self._metric = evaluate.load(self.metric)
@@ -1098,23 +1173,12 @@ class F1(GlobalMetric):
1098
  self.id_to_str[id] = str
1099
  return self.str_to_id[str]
1100
 
1101
- def _labels_match_average_format(
1102
- self, references: List[List[str]], predictions: List[str]
1103
- ):
1104
- return True
1105
-
1106
  def compute(
1107
  self,
1108
  references: List[List[str]],
1109
  predictions: List[str],
1110
  task_data: List[Dict],
1111
  ) -> dict:
1112
- assert all(
1113
- len(reference) == 1 for reference in references
1114
- ), "Only a single reference per prediction is allowed in F1 metric"
1115
- if not self._labels_match_average_format(references, predictions):
1116
- return {self.main_score: np.nan}
1117
-
1118
  self.str_to_id = {}
1119
  self.id_to_str = {}
1120
  formatted_references = [
@@ -1149,27 +1213,29 @@ class F1Micro(F1):
1149
 
1150
 
1151
  class F1Binary(F1):
1152
  process_single_instances = False
1153
  main_score = "f1_binary"
1154
  average = "binary"
1155
  pos_classes = {"1", "1.0", "yes", "true"}
1156
 
1157
  def get_str_id(self, str):
1158
- if str.lower() in self.pos_classes:
1159
- return 1
1160
- return 0
1161
 
1162
- # References and predictions must include up to 2 unique values, one of them in pos_classes
1163
- def _labels_match_average_format(
1164
- self, references: List[List[str]], predictions: List[str]
1165
- ):
1166
- classes = set(predictions + list(itertools.chain(*references)))
1167
- n_classes = len(classes)
1168
- if n_classes > 2:
1169
- return False
1170
- if n_classes == 2 and len(set(classes).difference(self.pos_classes)) == 0:
1171
- return False
1172
- return True
1173
 
1174
 
1175
  class RecallBinary(F1Binary):
@@ -1197,6 +1263,9 @@ class F1MultiLabel(GlobalMetric):
1197
  average = None # Report per class then aggregate by mean
1198
  metric = "f1"
1199
 
1200
  def prepare(self):
1201
  super().prepare()
1202
  self._metric = evaluate.load(self.metric, "multilabel")
@@ -1224,7 +1293,6 @@ class F1MultiLabel(GlobalMetric):
1224
  self.str_to_id = {}
1225
  self.id_to_str = {}
1226
 
1227
- self._validate_references_and_prediction(references, predictions)
1228
  references = [reference[0] for reference in references]
1229
 
1230
  labels = list({label for reference in references for label in reference})
@@ -1267,23 +1335,6 @@ class F1MultiLabel(GlobalMetric):
1267
  final_result = {self.main_score: result[self.metric]}
1268
  return final_result
1269
 
1270
- def _validate_references_and_prediction(self, references, predictions):
1271
- for reference in references:
1272
- if not len(reference) == 1:
1273
- raise ValueError(
1274
- f"Only a single reference per prediction is allowed in F1 multi label metric. Received reference: {reference}"
1275
- )
1276
- if not isoftype(reference[0], List[str]):
1277
- raise ValueError(
1278
- f"Each reference is expected to be a list of strings in F1 multi label metric. Received reference: '{reference[0]}'"
1279
- )
1280
-
1281
- for prediction in predictions:
1282
- if not isoftype(prediction, List[str]):
1283
- raise ValueError(
1284
- f"Each prediction is expected to be a list of strings in F1 multi label metric. Received prediction: '{prediction}'"
1285
- )
1286
-
1287
 
1288
  class PrecisionMacroMultiLabel(F1MultiLabel):
1289
  main_score = "precision_macro"
@@ -1324,6 +1375,9 @@ class Rouge(HuggingfaceMetric):
1324
  main_score = "rougeL"
1325
  scale = 1.0
1326
 
1327
  use_aggregator: bool = True
1328
  rouge_types: List[str] = ["rouge1", "rouge2", "rougeL", "rougeLsum"]
1329
 
@@ -1361,6 +1415,8 @@ class CharEditDistanceAccuracy(InstanceMetric):
1361
  reduction_map = {"mean": ["char_edit_dist_accuracy"]}
1362
  main_score = "char_edit_dist_accuracy"
1363
  ci_scores = ["char_edit_dist_accuracy"]
1364
 
1365
  _requirements_list: List[str] = ["editdistance"]
1366
 
@@ -1371,10 +1427,6 @@ class CharEditDistanceAccuracy(InstanceMetric):
1371
  self.eval = editdistance.eval
1372
 
1373
  def compute(self, references, prediction: str, task_data: List[Dict]) -> dict:
1374
- assert (
1375
- len(references) == 1
1376
- ), f"Expected only one reference , but received: {references}"
1377
-
1378
  formatted_prediction = "".join(prediction.split())
1379
  formatted_reference = "".join(references[0].split())
1380
  max_length = max(len(formatted_reference), len(formatted_prediction))
@@ -1387,6 +1439,8 @@ class CharEditDistanceAccuracy(InstanceMetric):
1387
  class Wer(HuggingfaceMetric):
1388
  hf_metric_name = "wer"
1389
  main_score = "wer"
1390
 
1391
  _requirements_list: List[str] = ["jiwer"]
1392
 
@@ -1396,9 +1450,6 @@ class Wer(HuggingfaceMetric):
1396
  predictions: List[str],
1397
  task_data: List[Dict],
1398
  ) -> dict:
1399
- assert all(
1400
- len(reference) == 1 for reference in references
1401
- ), "Only single reference per prediction is allowed in wer metric"
1402
  formatted_references = [reference[0] for reference in references]
1403
  result = self.metric.compute(
1404
  predictions=predictions, references=formatted_references
@@ -1410,12 +1461,21 @@ class Spearmanr(HuggingfaceMetric):
1410
  hf_metric_name = "spearmanr"
1411
  main_score = "spearmanr"
1412
  process_single_instances = False
1413
 
1414
 
1415
  class KendallTauMetric(GlobalMetric):
1416
  main_score = "kendalltau_b"
1417
  variant = "b"
1418
  process_single_instances = False
1419
 
1420
  _requirements_list: List[str] = ["scipy"]
1421
 
@@ -1448,6 +1508,9 @@ class MatthewsCorrelation(HuggingfaceMetric):
1448
  main_score = "matthews_correlation"
1449
  str_to_id: dict = InternalField(default_factory=dict)
1450
 
1451
  def get_str_id(self, str):
1452
  if str not in self.str_to_id:
1453
  id = len(self.str_to_id)
@@ -1475,6 +1538,8 @@ class RocAuc(GlobalMetric):
1475
  main_score = "roc_auc"
1476
  process_single_instances = False
1477
  _requirements_list: List[str] = ["sklearn"]
 
1479
  def prepare(self):
1480
  from sklearn import metrics
@@ -1502,6 +1567,8 @@ class RocAuc(GlobalMetric):
1502
 
1503
  class CustomF1(GlobalMetric):
1504
  main_score = "f1_micro"
1505
  groups = None
1506
  zero_division = 0.0
1507
 
@@ -1556,6 +1623,8 @@ class CustomF1(GlobalMetric):
1556
  def get_groups(self, elements, task_data):
1557
  groups = set()
1558
  for sublist, additional_input in zip(elements, task_data):
1559
  for e in sublist:
1560
  if self.should_ignore_element(e, additional_input):
1561
  continue
@@ -1568,18 +1637,7 @@ class CustomF1(GlobalMetric):
1568
  predictions: List[Any],
1569
  task_data: List[Dict],
1570
  ) -> dict:
1571
- # in case reference are List[List[List[Any]]] and predictions are List[List[Any]]:
1572
- if (
1573
- isinstance(references[0], list)
1574
- and len(references[0]) > 0
1575
- and isinstance(references[0][0], list)
1576
- ):
1577
- references = [element[0] for element in references]
1578
-
1579
- assert len(references) == len(predictions), (
1580
- f"references size ({len(references)})"
1581
- f" doesn't mach predictions size ({len(references)})."
1582
- )
1583
 
1584
  if self.groups is None:
1585
  groups = self.get_groups(references, task_data)
@@ -1672,6 +1730,8 @@ class CustomF1(GlobalMetric):
1672
 
1673
 
1674
  class NER(CustomF1):
1675
  def get_element_group(self, element, additional_input):
1676
  return element[1]
1677
 
@@ -1702,6 +1762,8 @@ class TokenOverlap(InstanceMetric):
1702
  reduction_map = {"mean": ["f1", "precision", "recall"]}
1703
  main_score = "f1"
1704
  ci_scores = ["f1", "precision", "recall"]
1705
 
1706
  def compute(
1707
  self, references: List[Any], prediction: Any, task_data: List[Dict]
@@ -1836,25 +1898,11 @@ class Reward(BulkInstanceMetric):
1836
 
1837
 
1838
  class LlamaIndexCorrectness(InstanceMetric):
1839
- """LlamaIndex based metric class for evaluating correctness.
1840
-
1841
- Attributes:
1842
- reduction_map (dict): A dictionary specifying the reduction method for the metric.
1843
- main_score (str): The main score used for evaluation.
1844
- _requirements_list (List[str]): A list specifying any additional requirements for the metric.
1845
-
1846
- Methods:
1847
- prepare(self): Initialization method for the metric.
1848
- compute(self, references, predictions, additional_inputs): Method to compute the metric.
1849
-
1850
- Usage:
1851
- metric = LlamaIndexCorrectnessMetric()
1852
- scores = metric.compute(references, prediction, additional_inputs)
1853
- """
1854
 
1855
  model_name: str = ""
1856
  main_score: str = ""
1857
-
1858
  reduction_map: Dict[str, List[str]] = None
1859
  openai_models: List[str] = ["gpt-3.5-turbo"]
1860
  anthropic_models: List[
@@ -1875,9 +1923,16 @@ class LlamaIndexCorrectness(InstanceMetric):
1875
  Returns:
1876
  Tuple[float, str]: A tuple containing the score as a float and the reasoning as a string.
1877
  """
1878
- score_str = eval_response.split("\n")[0]
1879
  reasoning_str = "\n".join(eval_response.split("\n")[1:])
1880
- score = float(score_str)
1881
  reasoning = reasoning_str.lstrip("\n")
1882
  return score, reasoning
1883
 
@@ -1942,7 +1997,10 @@ class LlamaIndexCorrectness(InstanceMetric):
1942
  ), f"Cannot run send data to remote APIs ({self.model_name}) when unitxt.settings.allow_passing_data_to_remote_api=False. Set UNITXT_ALLOW_PASSING_DATA_TO_REMOTE_API environment variable, if you want to allow this."
1943
 
1944
  query = task_data["question"]
1945
- contexts = task_data["contexts"]
1946
 
1947
  per_reference_results = []
1948
  for reference_response in references:
@@ -1968,9 +2026,9 @@ class Perplexity(BulkInstanceMetric):
1968
 
1969
  main_score = "perplexity"
1970
  reduction_map = {"mean": ["perplexity"]}
1971
 
1972
  perplexity_prompt: str
1973
-
1974
  batch_size: int = 32
1975
  model_name: str
1976
 
@@ -2193,6 +2251,22 @@ class Perplexity(BulkInstanceMetric):
2193
  return shifted_logits, shifted_labels
2194
 
2195
 
2196
  class NDCG(GlobalMetric):
2197
  """Normalized Discounted Cumulative Gain: measures the quality of ranking with respect to ground truth ranking scores.
2198
 
@@ -2211,6 +2285,8 @@ class NDCG(GlobalMetric):
2211
  main_score = "nDCG"
2212
 
2213
  _requirements_list: List[str] = ["sklearn"]
2214
 
2215
  def prepare(self):
2216
  from sklearn.metrics import ndcg_score
@@ -2227,6 +2303,7 @@ class NDCG(GlobalMetric):
2227
  from collections import defaultdict
2228
 
2229
  query_to_predictions_and_references = defaultdict(lambda: [[], []])
2230
  for reference, pred, inputs_dict in zip(references, predictions, task_data):
2231
  query = inputs_dict.get("query")
2232
  query_to_predictions_and_references[query][0].append(pred)
@@ -2257,10 +2334,13 @@ class NDCG(GlobalMetric):
2257
 
2258
 
2259
  class RetrievalMetric(InstanceMetric):
2260
  def compute(self, references: List[Any], prediction: Any, task_data: Dict) -> dict:
2261
  # digest input
2262
  pred_ids: List[Any] = prediction
2263
- ref_ids: List[Any] = list(dict.fromkeys(references))
2264
 
2265
  # relevance_at_k: 1-based dictionary of indicators (0/1), telling whether
2266
  # the doc id retrieved at position k (assuming it is 1-based, so k starts
@@ -2408,6 +2488,9 @@ class RetrievalAtK(RetrievalMetric):
2408
 
2409
 
2410
  class KPA(CustomF1):
2411
  def get_element_group(self, element, additional_input):
2412
  return additional_input["keypoint"]
2413
 
@@ -3088,7 +3171,11 @@ class FixedGroupAbsvalNormHedgesGParaphraseStringContainment(StringContainment):
3088
 
3089
 
3090
  class BinaryMaxF1(F1Binary):
3091
  main_score = "max_f1_binary"
3092
 
3093
  def compute(
3094
  self,
@@ -3096,34 +3183,13 @@ class BinaryMaxF1(F1Binary):
3096
  predictions: List[List[str]],
3097
  task_data: List[Dict],
3098
  ) -> dict:
3099
- assert all(
3100
- len(reference) == 1 for reference in references
3101
- ), "Only a single reference per prediction is allowed in F1 metric"
3102
- classes = set(itertools.chain(*references))
3103
- n_clases = len(classes)
3104
- assert len(classes) <= 2, "References of BinaryMaxF1 must be binary"
3105
- pos_classes = classes.intersection(self.pos_classes)
3106
- neg_classes = classes.difference(self.pos_classes)
3107
- n_pos_classes = len(pos_classes)
3108
- if n_clases == 2:
3109
- assert (
3110
- n_pos_classes == 1
3111
- ), "Only one positive class is allowed in BinaryMaxF1"
3112
- pos_class = next(iter(pos_classes)) if n_pos_classes > 0 else "1.0"
3113
- neg_class = next(iter(neg_classes)) if len(neg_classes) > 0 else "0.0"
3114
-
3115
- float_predictions = []
3116
- for prediction in predictions:
3117
- try:
3118
- float_predictions.append(float(prediction))
3119
- except Exception:
3120
- float_predictions.append(0)
3121
 
3122
  best_thr = -1
3123
  best_f1 = -1
3124
  for thr in set(float_predictions):
3125
  new_predictions = [
3126
- pos_class if float_prediction >= thr else neg_class
3127
  for float_prediction in float_predictions
3128
  ]
3129
  f1 = super().compute(references, new_predictions, task_data)[
@@ -3134,3 +3200,69 @@ class BinaryMaxF1(F1Binary):
3134
  best_thr = thr
3135
 
3136
  return {self.main_score: best_f1, "best_thr_maxf1": best_thr}
1
  import re
2
  import string
3
  import uuid
 
29
  from .random_utils import get_seed
30
  from .settings_utils import get_settings
31
  from .stream import MultiStream, Stream
32
+ from .type_utils import isoftype, parse_type_string, to_float_or_default
33
 
34
  logger = get_logger()
35
  settings = get_settings()
 
74
  def main_score(self):
75
  pass
76
 
77
+ # Override 'prediction_type' with the expected type of predictions
78
+ # and references. Example: "List[str]", "List[Dict]", "str".
79
+ # If left with default None, a warning will be displayed.
80
+ # In future versions of unitxt, this will be an error.
81
+ prediction_type: str = None
82
+
83
+ # Standard metrics can receive multiple references per predictions (in a list)
84
+ # Some metrics support only a single reference per prediction (one element in the list)
85
+ single_reference_per_prediction: bool = False
86
+
87
+ # Used to store the parsed prediction type and avoid
88
+ # parsing on every use
89
+ _parsed_prediction_type = None
90
+
91
+ def _validate_references_and_prediction(self, references, predictions):
92
+ if not isoftype(predictions, List[Any]):
93
+ raise ValueError(
94
+ f"Metric {self.get_metric_name()} should receive a list of predictions. Received predictions of type {type(predictions)}: {predictions}"
95
+ )
96
+
97
+ if not isoftype(references, List[Any]):
98
+ raise ValueError(
99
+ f"Metric {self.get_metric_name()} should receive a list of references. Received references of type {type(references)}: {references}"
100
+ )
101
+
102
+ if len(references) != len(predictions):
103
+ raise ValueError(
104
+ f"references size ({len(references)})"
105
+ f" doesn't match predictions size ({len(predictions)})."
106
+ )
107
+
108
+ for reference in references:
109
+ self._validate_reference(reference)
110
+
111
+ for prediction in predictions:
112
+ self._validate_prediction(prediction)
113
+
114
+ def _validate_prediction(self, prediction):
115
+ if not isoftype(prediction, self.get_prediction_type()):
116
+ raise ValueError(
117
+ f"Each prediction is expected to be of type '{self.prediction_type}' in {self.get_metric_name()} metric. Received prediction of type {type(prediction)}: {prediction}"
118
+ )
119
+
120
+ def _validate_reference(self, reference):
121
+ if not isoftype(reference, List[Any]):
122
+ raise ValueError(
123
+ f"Expecting a list of references for each prediction in {self.get_metric_name()} metric. Received reference of type {type(reference)}: {reference}"
124
+ )
125
+ if self.single_reference_per_prediction and not len(reference) == 1:
126
+ raise ValueError(
127
+ f"Expecting a list with a single reference per prediction in {self.get_metric_name()} metric. Received a list with multiple references: {reference}"
128
+ )
129
+ for ref in reference:
130
+ if not isoftype(ref, self.get_prediction_type()):
131
+ raise ValueError(
132
+ f"Each reference is expected to be of type '{self.prediction_type}' in {self.get_metric_name()} metric. Received reference of type {type(ref)}: {ref}"
133
+ )
134
+
135
+ def get_prediction_type(self):
136
+ if self.prediction_type is None:
137
+ logger.warning(
138
+ f"{self.get_metric_name()} metric does not set the 'prediction_type' parameter so input type checking is not performed. Set the prediction type to the expected prediction type (e.g. 'str', 'List[str]', or 'Any'). In future versions of unitxt this will raise an exception."
139
+ )
140
+ self._parsed_prediction_type = Any
141
+ try:
142
+ if self._parsed_prediction_type is not None:
143
+ return self._parsed_prediction_type
144
+
145
+ self._parsed_prediction_type = parse_type_string(self.prediction_type)
146
+ except ValueError:
147
+ raise ValueError(
148
+ f"Could not convert prediction type '{self.prediction_type}' in {self.get_metric_name()} to a known type. To enable type checking for this prediction type, open a unitxt issue with this message. Alternatively, set the metric's prediction_type to 'Any'"
149
+ ) from None
150
+ return self._parsed_prediction_type
151
+
152
+ def get_metric_name(self):
153
+ if self.artifact_identifier is not None:
154
+ return self.artifact_identifier
155
+ return self.__class__.__name__
156
+
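For context, a minimal sketch of how a metric subclass might declare the new prediction_type and single_reference_per_prediction fields; the class name and score key below are hypothetical and not part of this commit:

    class ExactLengthMatch(InstanceMetric):
        reduction_map = {"mean": ["length_match"]}
        main_score = "length_match"
        ci_scores = ["length_match"]
        prediction_type = "str"  # predictions and each reference item are plain strings
        single_reference_per_prediction = True  # exactly one reference per prediction

        def compute(self, references, prediction, task_data):
            # _validate_prediction / _validate_reference have already checked the declared types
            result = {self.main_score: float(len(prediction) == len(references[0]))}
            result["score"] = result[self.main_score]
            result["score_name"] = self.main_score
            return result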
157
  def consume_stream(self, stream: Stream):
158
  references = []
159
  predictions = []
 
414
  n_resamples: int = OptionalField(
415
  default_factory=lambda: settings.num_resamples_for_global_metrics
416
  )
417
+
418
+ # calculate scores for single instances
419
  process_single_instances = True
420
 
421
  def process(self, stream: Stream, stream_name: Optional[str] = None) -> Generator:
 
466
  instance_score[self.main_score] = no_score_value
467
 
468
  instance["score"]["instance"].update(instance_score)
469
+ self._validate_references_and_prediction(references, predictions)
470
 
471
  result = self._compute(references, predictions, task_data)
472
 
 
541
  instance["task_data"] if "task_data" in instance else {}
542
  for instance in stream
543
  ]
544
+ self._validate_references_and_prediction(references, predictions)
545
  # compute the metric over all refs and preds
546
  instance_scores = self.compute(
547
  references=references,
 
806
 
807
  for instance in stream:
808
  refs, pred = instance["references"], instance["prediction"]
809
+ self._validate_prediction(pred)
810
+ self._validate_reference(refs)
811
  task_data = instance["task_data"] if "task_data" in instance else {}
812
 
813
  instance_score = self.compute(
 
921
  pass
922
 
923
 
924
  class Accuracy(InstanceMetric):
925
  reduction_map = {"mean": ["accuracy"]}
926
  main_score = "accuracy"
927
  ci_scores = ["accuracy"]
928
 
929
+ prediction_type = "Any" # string representation is compared
930
+
931
  def compute(
932
  self, references: List[Any], prediction: Any, task_data: List[Dict]
933
  ) -> dict:
 
941
  return result
942
 
943
 
944
+ class UnsortedListExactMatch(InstanceMetric):
945
+ reduction_map = {"mean": ["unsorted_list_exact_match"]}
946
+ main_score = "unsorted_list_exact_match"
947
+ ci_scores = ["unsorted_list_exact_match"]
948
+
949
+ def compute(
950
+ self, references: List[Any], prediction: Any, task_data: List[Dict]
951
+ ) -> dict:
952
+ result = {self.main_score: float(sorted(prediction) == sorted(references[0]))}
953
+ result["score"] = result[self.main_score]
954
+ result["score_name"] = self.main_score
955
+ return result
956
+
957
+
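A quick illustration of what the new UnsortedListExactMatch scores; the inputs below are made up:

    prediction = ["b", "a", "c"]
    references = [["a", "b", "c"]]
    sorted(prediction) == sorted(references[0])  # True, so the instance scores 1.0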
958
  class StringContainment(InstanceMetric):
959
  reduction_map = {"mean": ["string_containment"]}
960
  main_score = "string_containment"
961
  ci_scores = ["string_containment"]
962
 
963
+ prediction_type = "Any" # string representation is compared
964
+ single_reference_per_prediction = False # multiple references allowed
965
+
966
  def compute(
967
  self, references: List[Any], prediction: Any, task_data: List[Dict]
968
  ) -> dict:
 
1077
 
1078
  passed_task_data[additional_input_field] = next(iter(values))
1079
 
1080
+ # add check that all required fields in self.metrics are in passed_task_data
1081
  result = self.metric.compute(
1082
  predictions=predictions,
1083
  references=references,
 
1159
  average = None # Report per class then aggregate by mean
1160
  metric = "f1"
1161
 
1162
+ prediction_type = "str"
1163
+ single_reference_per_prediction = True
1164
+
1165
  def prepare(self):
1166
  super().prepare()
1167
  self._metric = evaluate.load(self.metric)
 
1173
  self.id_to_str[id] = str
1174
  return self.str_to_id[str]
1175
 
1176
  def compute(
1177
  self,
1178
  references: List[List[str]],
1179
  predictions: List[str],
1180
  task_data: List[Dict],
1181
  ) -> dict:
1182
  self.str_to_id = {}
1183
  self.id_to_str = {}
1184
  formatted_references = [
 
1213
 
1214
 
1215
  class F1Binary(F1):
1216
+ """Calculate f1 for a binary task, using 0.5 as the threshold in the case of float predictions."""
1217
+
1218
  process_single_instances = False
1219
  main_score = "f1_binary"
1220
  average = "binary"
1221
  pos_classes = {"1", "1.0", "yes", "true"}
1222
+ threshold = 0.5
1223
 
1224
  def get_str_id(self, str):
1225
+ return int(str)
 
1227
+ def compute(
1228
+ self,
1229
+ references: List[List[str]],
1230
+ predictions: List[str],
1231
+ task_data: List[Dict],
1232
+ ) -> dict:
1233
+ predictions_floats = [to_float_or_default(p) for p in predictions]
1234
+ predictions = [str(int(p > self.threshold)) for p in predictions_floats]
1235
+ references = [
1236
+ ["1"] if r[0].lower() in self.pos_classes else ["0"] for r in references
1237
+ ]
1238
+ return super().compute(references, predictions, task_data)
1239
 
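The rewritten F1Binary binarizes float predictions before delegating to the parent F1 computation. A standalone sketch of that mapping, with made-up values (to_float_or_default falls back to a default when parsing fails):

    threshold = 0.5
    raw_predictions = ["0.87", "0.12", "0.55"]
    float_predictions = [float(p) for p in raw_predictions]
    binarized = [str(int(p > threshold)) for p in float_predictions]
    # binarized == ["1", "0", "1"]; references are likewise mapped to ["1"] or ["0"]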
1240
 
1241
  class RecallBinary(F1Binary):
 
1263
  average = None # Report per class then aggregate by mean
1264
  metric = "f1"
1265
 
1266
+ prediction_type = "List[str]"
1267
+ single_reference_per_prediction = True
1268
+
1269
  def prepare(self):
1270
  super().prepare()
1271
  self._metric = evaluate.load(self.metric, "multilabel")
 
1293
  self.str_to_id = {}
1294
  self.id_to_str = {}
1295
 
 
  references = [reference[0] for reference in references]
1297
 
1298
  labels = list({label for reference in references for label in reference})
 
1335
  final_result = {self.main_score: result[self.metric]}
1336
  return final_result
1337
 
1338
 
1339
  class PrecisionMacroMultiLabel(F1MultiLabel):
1340
  main_score = "precision_macro"
 
1375
  main_score = "rougeL"
1376
  scale = 1.0
1377
 
1378
+ prediction_type = "str"
1379
+ single_reference_per_prediction = False # multiple references allowed
1380
+
1381
  use_aggregator: bool = True
1382
  rouge_types: List[str] = ["rouge1", "rouge2", "rougeL", "rougeLsum"]
1383
 
 
1415
  reduction_map = {"mean": ["char_edit_dist_accuracy"]}
1416
  main_score = "char_edit_dist_accuracy"
1417
  ci_scores = ["char_edit_dist_accuracy"]
1418
+ prediction_type = "str"
1419
+ single_reference_per_prediction = True
1420
 
1421
  _requirements_list: List[str] = ["editdistance"]
1422
 
 
1427
  self.eval = editdistance.eval
1428
 
1429
  def compute(self, references, prediction: str, task_data: List[Dict]) -> dict:
1430
  formatted_prediction = "".join(prediction.split())
1431
  formatted_reference = "".join(references[0].split())
1432
  max_length = max(len(formatted_reference), len(formatted_prediction))
 
1439
  class Wer(HuggingfaceMetric):
1440
  hf_metric_name = "wer"
1441
  main_score = "wer"
1442
+ prediction_type = "str"
1443
+ single_reference_per_prediction = True
1444
 
1445
  _requirements_list: List[str] = ["jiwer"]
1446
 
 
1450
  predictions: List[str],
1451
  task_data: List[Dict],
1452
  ) -> dict:
1453
  formatted_references = [reference[0] for reference in references]
1454
  result = self.metric.compute(
1455
  predictions=predictions, references=formatted_references
 
1461
  hf_metric_name = "spearmanr"
1462
  main_score = "spearmanr"
1463
  process_single_instances = False
1464
+ prediction_type = "float"
1465
+
1466
+ # Spearmanr references are not lists
1467
+ def _validate_reference(self, reference):
1468
+ if not isoftype(reference, self.get_prediction_type()):
1469
+ raise ValueError(
1470
+ f"Each reference is expected to be of type '{self.prediction_type}' in {self.get_metric_name()} metric. Received reference of type {type(reference)}: {reference}"
1471
+ )
1472
 
1473
 
1474
  class KendallTauMetric(GlobalMetric):
1475
  main_score = "kendalltau_b"
1476
  variant = "b"
1477
  process_single_instances = False
1478
+ prediction_type = "str"
1479
 
1480
  _requirements_list: List[str] = ["scipy"]
1481
 
 
1508
  main_score = "matthews_correlation"
1509
  str_to_id: dict = InternalField(default_factory=dict)
1510
 
1511
+ single_reference_per_prediction = True
1512
+ prediction_type = "str"
1513
+
1514
  def get_str_id(self, str):
1515
  if str not in self.str_to_id:
1516
  id = len(self.str_to_id)
 
1538
  main_score = "roc_auc"
1539
  process_single_instances = False
1540
  _requirements_list: List[str] = ["sklearn"]
1541
+ single_reference_per_prediction = True
1542
+ prediction_type = "str"
1543
 
1544
  def prepare(self):
1545
  from sklearn import metrics
 
1567
 
1568
  class CustomF1(GlobalMetric):
1569
  main_score = "f1_micro"
1570
+ prediction_type = "Any"
1571
+ single_reference_per_prediction = True
1572
  groups = None
1573
  zero_division = 0.0
1574
 
 
1623
  def get_groups(self, elements, task_data):
1624
  groups = set()
1625
  for sublist, additional_input in zip(elements, task_data):
1626
+ if not isinstance(sublist, list):
1627
+ sublist = [sublist]
1628
  for e in sublist:
1629
  if self.should_ignore_element(e, additional_input):
1630
  continue
 
1637
  predictions: List[Any],
1638
  task_data: List[Dict],
1639
  ) -> dict:
1640
+ references = [element[0] for element in references]
1641
 
1642
  if self.groups is None:
1643
  groups = self.get_groups(references, task_data)
 
1730
 
1731
 
1732
  class NER(CustomF1):
1733
+ prediction_type = "List[Tuple[str,str]]"
1734
+
1735
  def get_element_group(self, element, additional_input):
1736
  return element[1]
1737
 
 
1762
  reduction_map = {"mean": ["f1", "precision", "recall"]}
1763
  main_score = "f1"
1764
  ci_scores = ["f1", "precision", "recall"]
1765
+ single_reference_per_prediction = False
1766
+ prediction_type = "str"
1767
 
1768
  def compute(
1769
  self, references: List[Any], prediction: Any, task_data: List[Dict]
 
1898
 
1899
 
1900
  class LlamaIndexCorrectness(InstanceMetric):
1901
+ """LlamaIndex based metric class for evaluating correctness."""
1902
 
1903
  model_name: str = ""
1904
  main_score: str = ""
1905
+ prediction_type: str = "str"
1906
  reduction_map: Dict[str, List[str]] = None
1907
  openai_models: List[str] = ["gpt-3.5-turbo"]
1908
  anthropic_models: List[
 
1923
  Returns:
1924
  Tuple[float, str]: A tuple containing the score as a float and the reasoning as a string.
1925
  """
1926
+ import re
1927
+
1928
+ match = re.search(r"\b\d+\.\d+\b|\b\d+\b", eval_response)
1929
+
1930
+ if match:
1931
+ score = float(match.group())
1932
+ else:
1933
+ raise Exception("could not parse judge response")
1934
+
1935
  reasoning_str = "\n".join(eval_response.split("\n")[1:])
1936
  reasoning = reasoning_str.lstrip("\n")
1937
  return score, reasoning
1938
 
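A quick check of the score-parsing change above, using an invented judge response:

    import re

    eval_response = "4.5\nThe answer is mostly correct but omits one supporting detail."
    match = re.search(r"\b\d+\.\d+\b|\b\d+\b", eval_response)
    score = float(match.group()) if match else None
    # score == 4.5; the remaining lines become the reasoning string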
 
1997
  ), f"Cannot run send data to remote APIs ({self.model_name}) when unitxt.settings.allow_passing_data_to_remote_api=False. Set UNITXT_ALLOW_PASSING_DATA_TO_REMOTE_API environment variable, if you want to allow this."
1998
 
1999
  query = task_data["question"]
2000
+
2001
+ contexts = None
2002
+ if "contexts" in task_data:
2003
+ contexts = task_data["contexts"]
2004
 
2005
  per_reference_results = []
2006
  for reference_response in references:
 
2026
 
2027
  main_score = "perplexity"
2028
  reduction_map = {"mean": ["perplexity"]}
2029
+ prediction_type = "str"
2030
 
2031
  perplexity_prompt: str
 
2032
  batch_size: int = 32
2033
  model_name: str
2034
 
 
2251
  return shifted_logits, shifted_labels
2252
 
2253
 
2254
+ class Squad(HuggingfaceMetric):
2255
+ hf_metric_name = "squad"
2256
+ main_score = "f1"
2257
+ scale = 100.0
2258
+ scaled_fields = ["f1", "exact_match"]
2259
+ prediction_type = "Dict[str,Any]"
2260
+
2261
+ # Squad references are not a list but a dict that contains a field called 'answers/text',
2262
+ # which is the list of references
2263
+ def _validate_reference(self, reference):
2264
+ if not isoftype(reference, self.get_prediction_type()):
2265
+ raise ValueError(
2266
+ f"Each reference is expected to be of type '{self.prediction_type}' in {self.get_metric_name()} metric. Received reference of type {type(reference)}: {reference}"
2267
+ )
2268
+
2269
+
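For reference, the relocated Squad metric wraps the Hugging Face "squad" metric, whose instances look roughly like the following; the ids and texts are made up, and the exact fields passed depend on the task definition:

    prediction = {"prediction_text": "Denver Broncos", "id": "56be4db0acb8001400a502ec"}
    reference = {
        "answers": {"answer_start": [177], "text": ["Denver Broncos"]},
        "id": "56be4db0acb8001400a502ec",
    }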
2270
  class NDCG(GlobalMetric):
2271
  """Normalized Discounted Cumulative Gain: measures the quality of ranking with respect to ground truth ranking scores.
2272
 
 
2285
  main_score = "nDCG"
2286
 
2287
  _requirements_list: List[str] = ["sklearn"]
2288
+ single_reference_per_prediction = True
2289
+ prediction_type = "Optional[float]"
2290
 
2291
  def prepare(self):
2292
  from sklearn.metrics import ndcg_score
 
2303
  from collections import defaultdict
2304
 
2305
  query_to_predictions_and_references = defaultdict(lambda: [[], []])
2306
+ references = [reference[0] for reference in references]
2307
  for reference, pred, inputs_dict in zip(references, predictions, task_data):
2308
  query = inputs_dict.get("query")
2309
  query_to_predictions_and_references[query][0].append(pred)
 
2334
 
2335
 
2336
  class RetrievalMetric(InstanceMetric):
2337
+ prediction_type = "List[str]"
2338
+ single_reference_per_prediction = True
2339
+
2340
  def compute(self, references: List[Any], prediction: Any, task_data: Dict) -> dict:
2341
  # digest input
2342
  pred_ids: List[Any] = prediction
2343
+ ref_ids: List[Any] = list(dict.fromkeys(references[0]))
2344
 
2345
  # relevance_at_k: 1-based dictionary of indicators (0/1), telling whether
2346
  # the doc id retrieved at position k (assuming it is 1-based, so k starts
 
2488
 
2489
 
2490
  class KPA(CustomF1):
2491
+ prediction_type = "str"
2492
+ single_reference_per_prediction = True
2493
+
2494
  def get_element_group(self, element, additional_input):
2495
  return additional_input["keypoint"]
2496
 
 
3171
 
3172
 
3173
  class BinaryMaxF1(F1Binary):
3174
+ """Calculate the maximal F1 and the decision threshold that achieves it for a binary task with float predictions."""
3175
+
3176
  main_score = "max_f1_binary"
3177
+ prediction_type = "str"
3178
+ single_reference_per_prediction = True
3179
 
3180
  def compute(
3181
  self,
 
3183
  predictions: List[List[str]],
3184
  task_data: List[Dict],
3185
  ) -> dict:
3186
+ float_predictions = [to_float_or_default(p) for p in predictions]
3187
 
3188
  best_thr = -1
3189
  best_f1 = -1
3190
  for thr in set(float_predictions):
3191
  new_predictions = [
3192
+ "1" if float_prediction >= thr else "0"
3193
  for float_prediction in float_predictions
3194
  ]
3195
  f1 = super().compute(references, new_predictions, task_data)[
 
3200
  best_thr = thr
3201
 
3202
  return {self.main_score: best_f1, "best_thr_maxf1": best_thr}
3203
+
3204
+
3205
+ class BinaryAccuracy(InstanceMetric):
3206
+ """Calculate accuracy for a binary task, using 0.5 as the threshold in the case of float predictions."""
3207
+
3208
+ reduction_map = {"mean": ["accuracy_binary"]}
3209
+ main_score = "accuracy_binary"
3210
+ ci_scores = ["accuracy_binary"]
3211
+ pos_classes = {"1", "1.0", "yes", "true"}
3212
+ threshold = 0.5
3213
+
3214
+ prediction_type = "str"
3215
+ single_reference_per_prediction = True
3216
+
3217
+ def compute(
3218
+ self, references: List[Any], prediction: Any, task_data: List[Dict]
3219
+ ) -> dict:
3220
+ float_prediction = to_float_or_default(prediction)
3221
+ prediction = str(int(float_prediction > self.threshold))
3222
+ references = ["1"] if references[0].lower() in self.pos_classes else ["0"]
3223
+
3224
+ result = {self.main_score: float([prediction] == references)}
3225
+ result["score"] = result[self.main_score]
3226
+ result["score_name"] = self.main_score
3227
+ return result
3228
+
3229
+
3230
+ class BinaryMaxAccuracy(GlobalMetric):
3231
+ """Calculate the maximal accuracy and the decision threshold that achieves it for a binary task with float predictions."""
3232
+
3233
+ process_single_instances = False
3234
+ main_score = "max_accuracy_binary"
3235
+ pos_classes = {"1", "1.0", "yes", "true"}
3236
+
3237
+ prediction_type = "str"
3238
+ single_reference_per_prediction = True
3239
+
3240
+ def compute(
3241
+ self,
3242
+ references: List[List[str]],
3243
+ predictions: List[List[str]],
3244
+ task_data: List[Dict],
3245
+ ) -> dict:
3246
+ float_predictions = [to_float_or_default(p) for p in predictions]
3247
+ references = [
3248
+ ["1"] if r[0].lower() in self.pos_classes else ["0"] for r in references
3249
+ ]
3250
+
3251
+ best_thr = -1
3252
+ best_acc = -1
3253
+ for thr in set(float_predictions):
3254
+ new_predictions = [
3255
+ "1" if float_prediction >= thr else "0"
3256
+ for float_prediction in float_predictions
3257
+ ]
3258
+ acc = np.mean(
3259
+ [
3260
+ [prediction] == reference
3261
+ for prediction, reference in zip(new_predictions, references)
3262
+ ]
3263
+ )
3264
+ if acc > best_acc:
3265
+ best_acc = acc
3266
+ best_thr = thr
3267
+
3268
+ return {self.main_score: best_acc, "best_thr_max_acc": best_thr}
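A standalone sketch of the threshold sweep performed by the new BinaryMaxAccuracy, with made-up inputs:

    import numpy as np

    float_predictions = [0.2, 0.4, 0.9]
    references = [["0"], ["0"], ["1"]]
    best_thr, best_acc = -1, -1
    for thr in set(float_predictions):
        preds = ["1" if p >= thr else "0" for p in float_predictions]
        acc = np.mean([[p] == r for p, r in zip(preds, references)])
        if acc > best_acc:
            best_acc, best_thr = acc, thr
    # best_acc == 1.0 at best_thr == 0.9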