Elron committed on
Commit 1e4984f · verified · 1 Parent(s): b4ab559

Upload metrics.py with huggingface_hub

Files changed (1)
  1. metrics.py +273 -141
metrics.py CHANGED
@@ -1,4 +1,3 @@
1
- import itertools
2
  import re
3
  import string
4
  import uuid
@@ -30,7 +29,7 @@ from .operators import CopyFields
30
  from .random_utils import get_seed
31
  from .settings_utils import get_settings
32
  from .stream import MultiStream, Stream
33
- from .type_utils import isoftype, to_float_or_default
34
 
35
  logger = get_logger()
36
  settings = get_settings()
@@ -75,6 +74,86 @@ class Metric(Artifact):
75
  def main_score(self):
76
  pass
77
 
78
  def consume_stream(self, stream: Stream):
79
  references = []
80
  predictions = []
@@ -335,6 +414,8 @@ class GlobalMetric(SingleStreamOperator, MetricWithConfidenceInterval):
335
  n_resamples: int = OptionalField(
336
  default_factory=lambda: settings.num_resamples_for_global_metrics
337
  )
338
  process_single_instances = True
339
 
340
  def process(self, stream: Stream, stream_name: Optional[str] = None) -> Generator:
@@ -385,6 +466,7 @@ class GlobalMetric(SingleStreamOperator, MetricWithConfidenceInterval):
385
  instance_score[self.main_score] = no_score_value
386
 
387
  instance["score"]["instance"].update(instance_score)
388
 
389
  result = self._compute(references, predictions, task_data)
390
 
@@ -459,7 +541,7 @@ class BulkInstanceMetric(SingleStreamOperator, MetricWithConfidenceInterval):
459
  instance["task_data"] if "task_data" in instance else {}
460
  for instance in stream
461
  ]
462
-
463
  # compute the metric over all refs and preds
464
  instance_scores = self.compute(
465
  references=references,
@@ -724,6 +806,8 @@ class InstanceMetric(SingleStreamOperator, MetricWithConfidenceInterval):
724
 
725
  for instance in stream:
726
  refs, pred = instance["references"], instance["prediction"]
727
  task_data = instance["task_data"] if "task_data" in instance else {}
728
 
729
  instance_score = self.compute(
@@ -837,42 +921,13 @@ class InstanceMetric(SingleStreamOperator, MetricWithConfidenceInterval):
837
  pass
838
 
839
 
840
- class Squad(GlobalMetric):
841
- _metric = None
842
- main_score = "f1"
843
- metric = "squad"
844
-
845
- def prepare(self):
846
- super().prepare()
847
- self._metric = evaluate.load(self.metric)
848
-
849
- def compute(
850
- self,
851
- references: List[List[str]],
852
- predictions: List[str],
853
- task_data: List[Dict],
854
- ) -> dict:
855
- ids = [str(uuid.uuid4()).replace("-", "") for _ in range(len(predictions))]
856
- formatted_predictions = [
857
- {"prediction_text": prediction, "id": ids[i]}
858
- for i, prediction in enumerate(predictions)
859
- ]
860
- formatted_references = [
861
- {"answers": {"answer_start": [-1], "text": reference}, "id": ids[i]}
862
- for i, reference in enumerate(references)
863
- ]
864
-
865
- return self._metric.compute(
866
- predictions=formatted_predictions,
867
- references=formatted_references,
868
- )
869
-
870
-
871
  class Accuracy(InstanceMetric):
872
  reduction_map = {"mean": ["accuracy"]}
873
  main_score = "accuracy"
874
  ci_scores = ["accuracy"]
875
 
876
  def compute(
877
  self, references: List[Any], prediction: Any, task_data: List[Dict]
878
  ) -> dict:
@@ -886,11 +941,28 @@ class Accuracy(InstanceMetric):
886
  return result
887
 
888
 
889
  class StringContainment(InstanceMetric):
890
  reduction_map = {"mean": ["string_containment"]}
891
  main_score = "string_containment"
892
  ci_scores = ["string_containment"]
893
 
894
  def compute(
895
  self, references: List[Any], prediction: Any, task_data: List[Dict]
896
  ) -> dict:
@@ -1005,7 +1077,7 @@ class HuggingfaceMetric(GlobalMetric):
1005
 
1006
  passed_task_data[additional_input_field] = next(iter(values))
1007
 
1008
- # add check that all required fields in self.metrics are in passed_task_data print(passed_task_data)
1009
  result = self.metric.compute(
1010
  predictions=predictions,
1011
  references=references,
@@ -1087,6 +1159,9 @@ class F1(GlobalMetric):
1087
  average = None # Report per class then aggregate by mean
1088
  metric = "f1"
1089
 
1090
  def prepare(self):
1091
  super().prepare()
1092
  self._metric = evaluate.load(self.metric)
@@ -1098,23 +1173,12 @@ class F1(GlobalMetric):
1098
  self.id_to_str[id] = str
1099
  return self.str_to_id[str]
1100
 
1101
- def _labels_match_average_format(
1102
- self, references: List[List[str]], predictions: List[str]
1103
- ):
1104
- return True
1105
-
1106
  def compute(
1107
  self,
1108
  references: List[List[str]],
1109
  predictions: List[str],
1110
  task_data: List[Dict],
1111
  ) -> dict:
1112
- assert all(
1113
- len(reference) == 1 for reference in references
1114
- ), "Only a single reference per prediction is allowed in F1 metric"
1115
- if not self._labels_match_average_format(references, predictions):
1116
- return {self.main_score: np.nan}
1117
-
1118
  self.str_to_id = {}
1119
  self.id_to_str = {}
1120
  formatted_references = [
@@ -1149,27 +1213,29 @@ class F1Micro(F1):
1149
 
1150
 
1151
  class F1Binary(F1):
1152
  process_single_instances = False
1153
  main_score = "f1_binary"
1154
  average = "binary"
1155
  pos_classes = {"1", "1.0", "yes", "true"}
1156
 
1157
  def get_str_id(self, str):
1158
- if str.lower() in self.pos_classes:
1159
- return 1
1160
- return 0
1161
 
1162
- # References and predictions must include up to 2 unique values, one of them in pos_classes
1163
- def _labels_match_average_format(
1164
- self, references: List[List[str]], predictions: List[str]
1165
- ):
1166
- classes = set(predictions + list(itertools.chain(*references)))
1167
- n_classes = len(classes)
1168
- if n_classes > 2:
1169
- return False
1170
- if n_classes == 2 and len(set(classes).difference(self.pos_classes)) == 0:
1171
- return False
1172
- return True
1173
 
1174
 
1175
  class RecallBinary(F1Binary):
@@ -1197,6 +1263,9 @@ class F1MultiLabel(GlobalMetric):
1197
  average = None # Report per class then aggregate by mean
1198
  metric = "f1"
1199
 
1200
  def prepare(self):
1201
  super().prepare()
1202
  self._metric = evaluate.load(self.metric, "multilabel")
@@ -1224,7 +1293,6 @@ class F1MultiLabel(GlobalMetric):
1224
  self.str_to_id = {}
1225
  self.id_to_str = {}
1226
 
1227
- self._validate_references_and_prediction(references, predictions)
1228
  references = [reference[0] for reference in references]
1229
 
1230
  labels = list({label for reference in references for label in reference})
@@ -1267,23 +1335,6 @@ class F1MultiLabel(GlobalMetric):
1267
  final_result = {self.main_score: result[self.metric]}
1268
  return final_result
1269
 
1270
- def _validate_references_and_prediction(self, references, predictions):
1271
- for reference in references:
1272
- if not len(reference) == 1:
1273
- raise ValueError(
1274
- f"Only a single reference per prediction is allowed in F1 multi label metric. Received reference: {reference}"
1275
- )
1276
- if not isoftype(reference[0], List[str]):
1277
- raise ValueError(
1278
- f"Each reference is expected to be a list of strings in F1 multi label metric. Received reference: '{reference[0]}'"
1279
- )
1280
-
1281
- for prediction in predictions:
1282
- if not isoftype(prediction, List[str]):
1283
- raise ValueError(
1284
- f"Each prediction is expected to be a list of strings in F1 multi label metric. Received prediction: '{prediction}'"
1285
- )
1286
-
1287
 
1288
  class PrecisionMacroMultiLabel(F1MultiLabel):
1289
  main_score = "precision_macro"
@@ -1324,6 +1375,9 @@ class Rouge(HuggingfaceMetric):
1324
  main_score = "rougeL"
1325
  scale = 1.0
1326
 
1327
  use_aggregator: bool = True
1328
  rouge_types: List[str] = ["rouge1", "rouge2", "rougeL", "rougeLsum"]
1329
 
@@ -1361,6 +1415,8 @@ class CharEditDistanceAccuracy(InstanceMetric):
1361
  reduction_map = {"mean": ["char_edit_dist_accuracy"]}
1362
  main_score = "char_edit_dist_accuracy"
1363
  ci_scores = ["char_edit_dist_accuracy"]
1364
 
1365
  _requirements_list: List[str] = ["editdistance"]
1366
 
@@ -1371,10 +1427,6 @@ class CharEditDistanceAccuracy(InstanceMetric):
1371
  self.eval = editdistance.eval
1372
 
1373
  def compute(self, references, prediction: str, task_data: List[Dict]) -> dict:
1374
- assert (
1375
- len(references) == 1
1376
- ), f"Expected only one reference , but received: {references}"
1377
-
1378
  formatted_prediction = "".join(prediction.split())
1379
  formatted_reference = "".join(references[0].split())
1380
  max_length = max(len(formatted_reference), len(formatted_prediction))
@@ -1387,6 +1439,8 @@ class CharEditDistanceAccuracy(InstanceMetric):
1387
  class Wer(HuggingfaceMetric):
1388
  hf_metric_name = "wer"
1389
  main_score = "wer"
1390
 
1391
  _requirements_list: List[str] = ["jiwer"]
1392
 
@@ -1396,9 +1450,6 @@ class Wer(HuggingfaceMetric):
1396
  predictions: List[str],
1397
  task_data: List[Dict],
1398
  ) -> dict:
1399
- assert all(
1400
- len(reference) == 1 for reference in references
1401
- ), "Only single reference per prediction is allowed in wer metric"
1402
  formatted_references = [reference[0] for reference in references]
1403
  result = self.metric.compute(
1404
  predictions=predictions, references=formatted_references
@@ -1410,12 +1461,21 @@ class Spearmanr(HuggingfaceMetric):
1410
  hf_metric_name = "spearmanr"
1411
  main_score = "spearmanr"
1412
  process_single_instances = False
1413
 
1414
 
1415
  class KendallTauMetric(GlobalMetric):
1416
  main_score = "kendalltau_b"
1417
  variant = "b"
1418
  process_single_instances = False
1419
 
1420
  _requirements_list: List[str] = ["scipy"]
1421
 
@@ -1448,6 +1508,9 @@ class MatthewsCorrelation(HuggingfaceMetric):
1448
  main_score = "matthews_correlation"
1449
  str_to_id: dict = InternalField(default_factory=dict)
1450
 
1451
  def get_str_id(self, str):
1452
  if str not in self.str_to_id:
1453
  id = len(self.str_to_id)
@@ -1475,6 +1538,8 @@ class RocAuc(GlobalMetric):
1475
  main_score = "roc_auc"
1476
  process_single_instances = False
1477
  _requirements_list: List[str] = ["sklearn"]
 
1479
  def prepare(self):
1480
  from sklearn import metrics
@@ -1502,6 +1567,8 @@ class RocAuc(GlobalMetric):
1502
 
1503
  class CustomF1(GlobalMetric):
1504
  main_score = "f1_micro"
1505
  groups = None
1506
  zero_division = 0.0
1507
 
@@ -1556,6 +1623,8 @@ class CustomF1(GlobalMetric):
1556
  def get_groups(self, elements, task_data):
1557
  groups = set()
1558
  for sublist, additional_input in zip(elements, task_data):
1559
  for e in sublist:
1560
  if self.should_ignore_element(e, additional_input):
1561
  continue
@@ -1568,18 +1637,7 @@ class CustomF1(GlobalMetric):
1568
  predictions: List[Any],
1569
  task_data: List[Dict],
1570
  ) -> dict:
1571
- # in case reference are List[List[List[Any]]] and predictions are List[List[Any]]:
1572
- if (
1573
- isinstance(references[0], list)
1574
- and len(references[0]) > 0
1575
- and isinstance(references[0][0], list)
1576
- ):
1577
- references = [element[0] for element in references]
1578
-
1579
- assert len(references) == len(predictions), (
1580
- f"references size ({len(references)})"
1581
- f" doesn't mach predictions size ({len(references)})."
1582
- )
1583
 
1584
  if self.groups is None:
1585
  groups = self.get_groups(references, task_data)
@@ -1672,6 +1730,8 @@ class CustomF1(GlobalMetric):
1672
 
1673
 
1674
  class NER(CustomF1):
1675
  def get_element_group(self, element, additional_input):
1676
  return element[1]
1677
 
@@ -1702,6 +1762,8 @@ class TokenOverlap(InstanceMetric):
1702
  reduction_map = {"mean": ["f1", "precision", "recall"]}
1703
  main_score = "f1"
1704
  ci_scores = ["f1", "precision", "recall"]
1705
 
1706
  def compute(
1707
  self, references: List[Any], prediction: Any, task_data: List[Dict]
@@ -1836,25 +1898,11 @@ class Reward(BulkInstanceMetric):
1836
 
1837
 
1838
  class LlamaIndexCorrectness(InstanceMetric):
1839
- """LlamaIndex based metric class for evaluating correctness.
1840
-
1841
- Attributes:
1842
- reduction_map (dict): A dictionary specifying the reduction method for the metric.
1843
- main_score (str): The main score used for evaluation.
1844
- _requirements_list (List[str]): A list specifying any additional requirements for the metric.
1845
-
1846
- Methods:
1847
- prepare(self): Initialization method for the metric.
1848
- compute(self, references, predictions, additional_inputs): Method to compute the metric.
1849
-
1850
- Usage:
1851
- metric = LlamaIndexCorrectnessMetric()
1852
- scores = metric.compute(references, prediction, additional_inputs)
1853
- """
1854
 
1855
  model_name: str = ""
1856
  main_score: str = ""
1857
-
1858
  reduction_map: Dict[str, List[str]] = None
1859
  openai_models: List[str] = ["gpt-3.5-turbo"]
1860
  anthropic_models: List[
@@ -1875,9 +1923,16 @@ class LlamaIndexCorrectness(InstanceMetric):
1875
  Returns:
1876
  Tuple[float, str]: A tuple containing the score as a float and the reasoning as a string.
1877
  """
1878
- score_str = eval_response.split("\n")[0]
1879
  reasoning_str = "\n".join(eval_response.split("\n")[1:])
1880
- score = float(score_str)
1881
  reasoning = reasoning_str.lstrip("\n")
1882
  return score, reasoning
1883
 
@@ -1942,7 +1997,10 @@ class LlamaIndexCorrectness(InstanceMetric):
1942
  ), f"Cannot run send data to remote APIs ({self.model_name}) when unitxt.settings.allow_passing_data_to_remote_api=False. Set UNITXT_ALLOW_PASSING_DATA_TO_REMOTE_API environment variable, if you want to allow this."
1943
 
1944
  query = task_data["question"]
1945
- contexts = task_data["contexts"]
1946
 
1947
  per_reference_results = []
1948
  for reference_response in references:
@@ -1968,9 +2026,9 @@ class Perplexity(BulkInstanceMetric):
1968
 
1969
  main_score = "perplexity"
1970
  reduction_map = {"mean": ["perplexity"]}
1971
 
1972
  perplexity_prompt: str
1973
-
1974
  batch_size: int = 32
1975
  model_name: str
1976
 
@@ -2193,6 +2251,22 @@ class Perplexity(BulkInstanceMetric):
2193
  return shifted_logits, shifted_labels
2194
 
2195
 
2196
  class NDCG(GlobalMetric):
2197
  """Normalized Discounted Cumulative Gain: measures the quality of ranking with respect to ground truth ranking scores.
2198
 
@@ -2211,6 +2285,8 @@ class NDCG(GlobalMetric):
2211
  main_score = "nDCG"
2212
 
2213
  _requirements_list: List[str] = ["sklearn"]
2214
 
2215
  def prepare(self):
2216
  from sklearn.metrics import ndcg_score
@@ -2227,6 +2303,7 @@ class NDCG(GlobalMetric):
2227
  from collections import defaultdict
2228
 
2229
  query_to_predictions_and_references = defaultdict(lambda: [[], []])
2230
  for reference, pred, inputs_dict in zip(references, predictions, task_data):
2231
  query = inputs_dict.get("query")
2232
  query_to_predictions_and_references[query][0].append(pred)
@@ -2257,10 +2334,13 @@ class NDCG(GlobalMetric):
2257
 
2258
 
2259
  class RetrievalMetric(InstanceMetric):
2260
  def compute(self, references: List[Any], prediction: Any, task_data: Dict) -> dict:
2261
  # digest input
2262
  pred_ids: List[Any] = prediction
2263
- ref_ids: List[Any] = list(dict.fromkeys(references))
2264
 
2265
  # relevance_at_k: 1-based dictionary of indicators (0/1), telling whether
2266
  # the doc id retrieved at position k (assuming it is 1-based, so k starts
@@ -2408,6 +2488,9 @@ class RetrievalAtK(RetrievalMetric):
2408
 
2409
 
2410
  class KPA(CustomF1):
2411
  def get_element_group(self, element, additional_input):
2412
  return additional_input["keypoint"]
2413
 
@@ -3088,7 +3171,11 @@ class FixedGroupAbsvalNormHedgesGParaphraseStringContainment(StringContainment):
3088
 
3089
 
3090
  class BinaryMaxF1(F1Binary):
3091
  main_score = "max_f1_binary"
3092
 
3093
  def compute(
3094
  self,
@@ -3096,34 +3183,13 @@ class BinaryMaxF1(F1Binary):
3096
  predictions: List[List[str]],
3097
  task_data: List[Dict],
3098
  ) -> dict:
3099
- assert all(
3100
- len(reference) == 1 for reference in references
3101
- ), "Only a single reference per prediction is allowed in F1 metric"
3102
- classes = set(itertools.chain(*references))
3103
- n_clases = len(classes)
3104
- assert len(classes) <= 2, "References of BinaryMaxF1 must be binary"
3105
- pos_classes = classes.intersection(self.pos_classes)
3106
- neg_classes = classes.difference(self.pos_classes)
3107
- n_pos_classes = len(pos_classes)
3108
- if n_clases == 2:
3109
- assert (
3110
- n_pos_classes == 1
3111
- ), "Only one positive class is allowed in BinaryMaxF1"
3112
- pos_class = next(iter(pos_classes)) if n_pos_classes > 0 else "1.0"
3113
- neg_class = next(iter(neg_classes)) if len(neg_classes) > 0 else "0.0"
3114
-
3115
- float_predictions = []
3116
- for prediction in predictions:
3117
- try:
3118
- float_predictions.append(float(prediction))
3119
- except Exception:
3120
- float_predictions.append(0)
3121
 
3122
  best_thr = -1
3123
  best_f1 = -1
3124
  for thr in set(float_predictions):
3125
  new_predictions = [
3126
- pos_class if float_prediction >= thr else neg_class
3127
  for float_prediction in float_predictions
3128
  ]
3129
  f1 = super().compute(references, new_predictions, task_data)[
@@ -3134,3 +3200,69 @@ class BinaryMaxF1(F1Binary):
3134
  best_thr = thr
3135
 
3136
  return {self.main_score: best_f1, "best_thr_maxf1": best_thr}
1
  import re
2
  import string
3
  import uuid
 
29
  from .random_utils import get_seed
30
  from .settings_utils import get_settings
31
  from .stream import MultiStream, Stream
32
+ from .type_utils import isoftype, parse_type_string, to_float_or_default
33
 
34
  logger = get_logger()
35
  settings = get_settings()
 
74
  def main_score(self):
75
  pass
76
 
77
+ # Override 'prediction_type' with the expected type of predictions
78
+ # and references. Example: "List[str]", "List[Dict]", "str".
79
+ # If left with default None, a warning will be displayed.
80
+ # In future versions of unitxt, this will be an error.
81
+ prediction_type: str = None
82
+
83
+ # Standard metrics can receive multiple references per predictions (in a list)
84
+ # Some metrics support only a single reference per prediction (one element in the list)
85
+ single_reference_per_prediction: bool = False
86
+
87
+ # Used to store the parsed prediction type and avoid
88
+ # parsing on every use
89
+ _parsed_prediction_type = None
90
+
91
+ def _validate_references_and_prediction(self, references, predictions):
92
+ if not isoftype(predictions, List[Any]):
93
+ raise ValueError(
94
+ f"Metric {self.get_metric_name()} should receive a list of predictions. Received predictions of type {type(predictions)}: {predictions}"
95
+ )
96
+
97
+ if not isoftype(references, List[Any]):
98
+ raise ValueError(
99
+ f"Metric {self.get_metric_name()} should receive a list of references. Received references of type {type(references)}: {references}"
100
+ )
101
+
102
+ if len(references) != len(predictions):
103
+ raise ValueError(
104
+ f"references size ({len(references)})"
105
+ f" doesn't match predictions size ({len(predictions)})."
106
+ )
107
+
108
+ for reference in references:
109
+ self._validate_reference(reference)
110
+
111
+ for prediction in predictions:
112
+ self._validate_prediction(prediction)
113
+
114
+ def _validate_prediction(self, prediction):
115
+ if not isoftype(prediction, self.get_prediction_type()):
116
+ raise ValueError(
117
+ f"Each prediction is expected to be of type '{self.prediction_type}' in {self.get_metric_name()} metric. Received prediction of type {type(prediction)}: {prediction}"
118
+ )
119
+
120
+ def _validate_reference(self, reference):
121
+ if not isoftype(reference, List[Any]):
122
+ raise ValueError(
123
+ f"Expecting a list of references for each prediction in {self.get_metric_name()} metric. Received reference of type {type(reference)}: {reference}"
124
+ )
125
+ if self.single_reference_per_prediction and not len(reference) == 1:
126
+ raise ValueError(
127
+ f"Expecting a list with a single reference per prediction in {self.get_metric_name()} metric. Received a list with multiple references: {reference}"
128
+ )
129
+ for ref in reference:
130
+ if not isoftype(ref, self.get_prediction_type()):
131
+ raise ValueError(
132
+ f"Each reference is expected to be of type '{self.prediction_type}' in {self.get_metric_name()} metric. Received reference of type {type(ref)}: {ref}"
133
+ )
134
+
135
+ def get_prediction_type(self):
136
+ if self.prediction_type is None:
137
+ logger.warning(
138
+ f"{self.get_metric_name()} metric does not set the 'prediction_type' parameter so input type checking is not performed. Set the prediction type to the expected prediction type (e.g. 'str', 'List[str]', or 'Any'). In future versions of unitxt this will raise an exception."
139
+ )
140
+ self._parsed_prediction_type = Any
141
+ try:
142
+ if self._parsed_prediction_type is not None:
143
+ return self._parsed_prediction_type
144
+
145
+ self._parsed_prediction_type = parse_type_string(self.prediction_type)
146
+ except ValueError:
147
+ raise ValueError(
148
+ f"Could not convert prediction type '{self.prediction_type}' in {self.get_metric_name()} to a known type. To enable type checking for this prediction type, open a unitxt issue with this message. Alternatively, set the metric's prediction_type to 'Any'"
149
+ ) from None
150
+ return self._parsed_prediction_type
151
+
152
+ def get_metric_name(self):
153
+ if self.artifact_identifier is not None:
154
+ return self.artifact_identifier
155
+ return self.__class__.__name__
156
+
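For context, a minimal sketch of how a metric subclass might declare the new prediction_type and single_reference_per_prediction fields; the class name and score key below are hypothetical and not part of this commit:

    class ExactLengthMatch(InstanceMetric):
        reduction_map = {"mean": ["length_match"]}
        main_score = "length_match"
        ci_scores = ["length_match"]
        prediction_type = "str"  # predictions and each reference item are plain strings
        single_reference_per_prediction = True  # exactly one reference per prediction

        def compute(self, references, prediction, task_data):
            # _validate_prediction / _validate_reference have already checked the declared types
            result = {self.main_score: float(len(prediction) == len(references[0]))}
            result["score"] = result[self.main_score]
            result["score_name"] = self.main_score
            return result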
157
  def consume_stream(self, stream: Stream):
158
  references = []
159
  predictions = []
 
414
  n_resamples: int = OptionalField(
415
  default_factory=lambda: settings.num_resamples_for_global_metrics
416
  )
417
+
418
+ # calculate scores for single instances
419
  process_single_instances = True
420
 
421
  def process(self, stream: Stream, stream_name: Optional[str] = None) -> Generator:
 
466
  instance_score[self.main_score] = no_score_value
467
 
468
  instance["score"]["instance"].update(instance_score)
469
+ self._validate_references_and_prediction(references, predictions)
470
 
471
  result = self._compute(references, predictions, task_data)
472
 
 
541
  instance["task_data"] if "task_data" in instance else {}
542
  for instance in stream
543
  ]
544
+ self._validate_references_and_prediction(references, predictions)
545
  # compute the metric over all refs and preds
546
  instance_scores = self.compute(
547
  references=references,
 
806
 
807
  for instance in stream:
808
  refs, pred = instance["references"], instance["prediction"]
809
+ self._validate_prediction(pred)
810
+ self._validate_reference(refs)
811
  task_data = instance["task_data"] if "task_data" in instance else {}
812
 
813
  instance_score = self.compute(
 
921
  pass
922
 
923
 
924
  class Accuracy(InstanceMetric):
925
  reduction_map = {"mean": ["accuracy"]}
926
  main_score = "accuracy"
927
  ci_scores = ["accuracy"]
928
 
929
+ prediction_type = "Any" # string representation is compared
930
+
931
  def compute(
932
  self, references: List[Any], prediction: Any, task_data: List[Dict]
933
  ) -> dict:
 
941
  return result
942
 
943
 
944
+ class UnsortedListExactMatch(InstanceMetric):
945
+ reduction_map = {"mean": ["unsorted_list_exact_match"]}
946
+ main_score = "unsorted_list_exact_match"
947
+ ci_scores = ["unsorted_list_exact_match"]
948
+
949
+ def compute(
950
+ self, references: List[Any], prediction: Any, task_data: List[Dict]
951
+ ) -> dict:
952
+ result = {self.main_score: float(sorted(prediction) == sorted(references[0]))}
953
+ result["score"] = result[self.main_score]
954
+ result["score_name"] = self.main_score
955
+ return result
956
+
957
+
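A quick illustration of what the new UnsortedListExactMatch scores; the inputs below are made up:

    prediction = ["b", "a", "c"]
    references = [["a", "b", "c"]]
    sorted(prediction) == sorted(references[0])  # True, so the instance scores 1.0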
958
  class StringContainment(InstanceMetric):
959
  reduction_map = {"mean": ["string_containment"]}
960
  main_score = "string_containment"
961
  ci_scores = ["string_containment"]
962
 
963
+ prediction_type = "Any" # string representation is compared
964
+ single_reference_per_prediction = False # multiple references allowed
965
+
966
  def compute(
967
  self, references: List[Any], prediction: Any, task_data: List[Dict]
968
  ) -> dict:
 
1077
 
1078
  passed_task_data[additional_input_field] = next(iter(values))
1079
 
1080
+ # add check that all required fields in self.metrics are in passed_task_data
1081
  result = self.metric.compute(
1082
  predictions=predictions,
1083
  references=references,
 
1159
  average = None # Report per class then aggregate by mean
1160
  metric = "f1"
1161
 
1162
+ prediction_type = "str"
1163
+ single_reference_per_prediction = True
1164
+
1165
  def prepare(self):
1166
  super().prepare()
1167
  self._metric = evaluate.load(self.metric)
 
1173
  self.id_to_str[id] = str
1174
  return self.str_to_id[str]
1175
 
1176
  def compute(
1177
  self,
1178
  references: List[List[str]],
1179
  predictions: List[str],
1180
  task_data: List[Dict],
1181
  ) -> dict:
1182
  self.str_to_id = {}
1183
  self.id_to_str = {}
1184
  formatted_references = [
 
1213
 
1214
 
1215
  class F1Binary(F1):
1216
+ """Calculate f1 for a binary task, using 0.5 as the threshold in the case of float predictions."""
1217
+
1218
  process_single_instances = False
1219
  main_score = "f1_binary"
1220
  average = "binary"
1221
  pos_classes = {"1", "1.0", "yes", "true"}
1222
+ threshold = 0.5
1223
 
1224
  def get_str_id(self, str):
1225
+ return int(str)
 
1227
+ def compute(
1228
+ self,
1229
+ references: List[List[str]],
1230
+ predictions: List[str],
1231
+ task_data: List[Dict],
1232
+ ) -> dict:
1233
+ predictions_floats = [to_float_or_default(p) for p in predictions]
1234
+ predictions = [str(int(p > self.threshold)) for p in predictions_floats]
1235
+ references = [
1236
+ ["1"] if r[0].lower() in self.pos_classes else ["0"] for r in references
1237
+ ]
1238
+ return super().compute(references, predictions, task_data)
1239
 
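The rewritten F1Binary binarizes float predictions before delegating to the parent F1 computation. A standalone sketch of that mapping, with made-up values (to_float_or_default falls back to a default when parsing fails):

    threshold = 0.5
    raw_predictions = ["0.87", "0.12", "0.55"]
    float_predictions = [float(p) for p in raw_predictions]
    binarized = [str(int(p > threshold)) for p in float_predictions]
    # binarized == ["1", "0", "1"]; references are likewise mapped to ["1"] or ["0"]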
1240
 
1241
  class RecallBinary(F1Binary):
 
1263
  average = None # Report per class then aggregate by mean
1264
  metric = "f1"
1265
 
1266
+ prediction_type = "List[str]"
1267
+ single_reference_per_prediction = True
1268
+
1269
  def prepare(self):
1270
  super().prepare()
1271
  self._metric = evaluate.load(self.metric, "multilabel")
 
1293
  self.str_to_id = {}
1294
  self.id_to_str = {}
1295
 
 
  references = [reference[0] for reference in references]
1297
 
1298
  labels = list({label for reference in references for label in reference})
 
1335
  final_result = {self.main_score: result[self.metric]}
1336
  return final_result
1337
 
1338
 
1339
  class PrecisionMacroMultiLabel(F1MultiLabel):
1340
  main_score = "precision_macro"
 
1375
  main_score = "rougeL"
1376
  scale = 1.0
1377
 
1378
+ prediction_type = "str"
1379
+ single_reference_per_prediction = False # multiple references allowed
1380
+
1381
  use_aggregator: bool = True
1382
  rouge_types: List[str] = ["rouge1", "rouge2", "rougeL", "rougeLsum"]
1383
 
 
1415
  reduction_map = {"mean": ["char_edit_dist_accuracy"]}
1416
  main_score = "char_edit_dist_accuracy"
1417
  ci_scores = ["char_edit_dist_accuracy"]
1418
+ prediction_type = "str"
1419
+ single_reference_per_prediction = True
1420
 
1421
  _requirements_list: List[str] = ["editdistance"]
1422
 
 
1427
  self.eval = editdistance.eval
1428
 
1429
  def compute(self, references, prediction: str, task_data: List[Dict]) -> dict:
1430
  formatted_prediction = "".join(prediction.split())
1431
  formatted_reference = "".join(references[0].split())
1432
  max_length = max(len(formatted_reference), len(formatted_prediction))
 
1439
  class Wer(HuggingfaceMetric):
1440
  hf_metric_name = "wer"
1441
  main_score = "wer"
1442
+ prediction_type = "str"
1443
+ single_reference_per_prediction = True
1444
 
1445
  _requirements_list: List[str] = ["jiwer"]
1446
 
 
1450
  predictions: List[str],
1451
  task_data: List[Dict],
1452
  ) -> dict:
1453
  formatted_references = [reference[0] for reference in references]
1454
  result = self.metric.compute(
1455
  predictions=predictions, references=formatted_references
 
1461
  hf_metric_name = "spearmanr"
1462
  main_score = "spearmanr"
1463
  process_single_instances = False
1464
+ prediction_type = "float"
1465
+
1466
+ # Spearmanr references are not lists
1467
+ def _validate_reference(self, reference):
1468
+ if not isoftype(reference, self.get_prediction_type()):
1469
+ raise ValueError(
1470
+ f"Each reference is expected to be of type '{self.prediction_type}' in {self.get_metric_name()} metric. Received reference of type {type(reference)}: {reference}"
1471
+ )
1472
 
1473
 
1474
  class KendallTauMetric(GlobalMetric):
1475
  main_score = "kendalltau_b"
1476
  variant = "b"
1477
  process_single_instances = False
1478
+ prediction_type = "str"
1479
 
1480
  _requirements_list: List[str] = ["scipy"]
1481
 
 
1508
  main_score = "matthews_correlation"
1509
  str_to_id: dict = InternalField(default_factory=dict)
1510
 
1511
+ single_reference_per_prediction = True
1512
+ prediction_type = "str"
1513
+
1514
  def get_str_id(self, str):
1515
  if str not in self.str_to_id:
1516
  id = len(self.str_to_id)
 
1538
  main_score = "roc_auc"
1539
  process_single_instances = False
1540
  _requirements_list: List[str] = ["sklearn"]
1541
+ single_reference_per_prediction = True
1542
+ prediction_type = "str"
1543
 
1544
  def prepare(self):
1545
  from sklearn import metrics
 
1567
 
1568
  class CustomF1(GlobalMetric):
1569
  main_score = "f1_micro"
1570
+ prediction_type = "Any"
1571
+ single_reference_per_prediction = True
1572
  groups = None
1573
  zero_division = 0.0
1574
 
 
1623
  def get_groups(self, elements, task_data):
1624
  groups = set()
1625
  for sublist, additional_input in zip(elements, task_data):
1626
+ if not isinstance(sublist, list):
1627
+ sublist = [sublist]
1628
  for e in sublist:
1629
  if self.should_ignore_element(e, additional_input):
1630
  continue
 
1637
  predictions: List[Any],
1638
  task_data: List[Dict],
1639
  ) -> dict:
1640
+ references = [element[0] for element in references]
1641
 
1642
  if self.groups is None:
1643
  groups = self.get_groups(references, task_data)
 
1730
 
1731
 
1732
  class NER(CustomF1):
1733
+ prediction_type = "List[Tuple[str,str]]"
1734
+
1735
  def get_element_group(self, element, additional_input):
1736
  return element[1]
1737
 
 
1762
  reduction_map = {"mean": ["f1", "precision", "recall"]}
1763
  main_score = "f1"
1764
  ci_scores = ["f1", "precision", "recall"]
1765
+ single_reference_per_prediction = False
1766
+ prediction_type = "str"
1767
 
1768
  def compute(
1769
  self, references: List[Any], prediction: Any, task_data: List[Dict]
 
1898
 
1899
 
1900
  class LlamaIndexCorrectness(InstanceMetric):
1901
+ """LlamaIndex based metric class for evaluating correctness."""
1902
 
1903
  model_name: str = ""
1904
  main_score: str = ""
1905
+ prediction_type: str = "str"
1906
  reduction_map: Dict[str, List[str]] = None
1907
  openai_models: List[str] = ["gpt-3.5-turbo"]
1908
  anthropic_models: List[
 
1923
  Returns:
1924
  Tuple[float, str]: A tuple containing the score as a float and the reasoning as a string.
1925
  """
1926
+ import re
1927
+
1928
+ match = re.search(r"\b\d+\.\d+\b|\b\d+\b", eval_response)
1929
+
1930
+ if match:
1931
+ score = float(match.group())
1932
+ else:
1933
+ raise Exception("could not parse judge response")
1934
+
1935
  reasoning_str = "\n".join(eval_response.split("\n")[1:])
1936
  reasoning = reasoning_str.lstrip("\n")
1937
  return score, reasoning
1938
 
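A quick check of the score-parsing change above, using an invented judge response:

    import re

    eval_response = "4.5\nThe answer is mostly correct but omits one supporting detail."
    match = re.search(r"\b\d+\.\d+\b|\b\d+\b", eval_response)
    score = float(match.group()) if match else None
    # score == 4.5; the remaining lines become the reasoning string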
 
1997
  ), f"Cannot run send data to remote APIs ({self.model_name}) when unitxt.settings.allow_passing_data_to_remote_api=False. Set UNITXT_ALLOW_PASSING_DATA_TO_REMOTE_API environment variable, if you want to allow this."
1998
 
1999
  query = task_data["question"]
2000
+
2001
+ contexts = None
2002
+ if "contexts" in task_data:
2003
+ contexts = task_data["contexts"]
2004
 
2005
  per_reference_results = []
2006
  for reference_response in references:
 
2026
 
2027
  main_score = "perplexity"
2028
  reduction_map = {"mean": ["perplexity"]}
2029
+ prediction_type = "str"
2030
 
2031
  perplexity_prompt: str
 
2032
  batch_size: int = 32
2033
  model_name: str
2034
 
 
2251
  return shifted_logits, shifted_labels
2252
 
2253
 
2254
+ class Squad(HuggingfaceMetric):
2255
+ hf_metric_name = "squad"
2256
+ main_score = "f1"
2257
+ scale = 100.0
2258
+ scaled_fields = ["f1", "exact_match"]
2259
+ prediction_type = "Dict[str,Any]"
2260
+
2261
+ # Squad references are not a list but a dict that contains a field called 'answers/text',
2262
+ # which is the list of references
2263
+ def _validate_reference(self, reference):
2264
+ if not isoftype(reference, self.get_prediction_type()):
2265
+ raise ValueError(
2266
+ f"Each reference is expected to be of type '{self.prediction_type}' in {self.get_metric_name()} metric. Received reference of type {type(reference)}: {reference}"
2267
+ )
2268
+
2269
+
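For reference, the relocated Squad metric wraps the Hugging Face "squad" metric, whose instances look roughly like the following; the ids and texts are made up, and the exact fields passed depend on the task definition:

    prediction = {"prediction_text": "Denver Broncos", "id": "56be4db0acb8001400a502ec"}
    reference = {
        "answers": {"answer_start": [177], "text": ["Denver Broncos"]},
        "id": "56be4db0acb8001400a502ec",
    }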
2270
  class NDCG(GlobalMetric):
2271
  """Normalized Discounted Cumulative Gain: measures the quality of ranking with respect to ground truth ranking scores.
2272
 
 
2285
  main_score = "nDCG"
2286
 
2287
  _requirements_list: List[str] = ["sklearn"]
2288
+ single_reference_per_prediction = True
2289
+ prediction_type = "Optional[float]"
2290
 
2291
  def prepare(self):
2292
  from sklearn.metrics import ndcg_score
 
2303
  from collections import defaultdict
2304
 
2305
  query_to_predictions_and_references = defaultdict(lambda: [[], []])
2306
+ references = [reference[0] for reference in references]
2307
  for reference, pred, inputs_dict in zip(references, predictions, task_data):
2308
  query = inputs_dict.get("query")
2309
  query_to_predictions_and_references[query][0].append(pred)
 
2334
 
2335
 
2336
  class RetrievalMetric(InstanceMetric):
2337
+ prediction_type = "List[str]"
2338
+ single_reference_per_prediction = True
2339
+
2340
  def compute(self, references: List[Any], prediction: Any, task_data: Dict) -> dict:
2341
  # digest input
2342
  pred_ids: List[Any] = prediction
2343
+ ref_ids: List[Any] = list(dict.fromkeys(references[0]))
2344
 
2345
  # relevance_at_k: 1-based dictionary of indicators (0/1), telling whether
2346
  # the doc id retrieved at position k (assuming it is 1-based, so k starts
 
2488
 
2489
 
2490
  class KPA(CustomF1):
2491
+ prediction_type = "str"
2492
+ single_reference_per_prediction = True
2493
+
2494
  def get_element_group(self, element, additional_input):
2495
  return additional_input["keypoint"]
2496
 
 
3171
 
3172
 
3173
  class BinaryMaxF1(F1Binary):
3174
+ """Calculate the maximal F1 and the decision threshold that achieves it for a binary task with float predictions."""
3175
+
3176
  main_score = "max_f1_binary"
3177
+ prediction_type = "str"
3178
+ single_reference_per_prediction = True
3179
 
3180
  def compute(
3181
  self,
 
3183
  predictions: List[List[str]],
3184
  task_data: List[Dict],
3185
  ) -> dict:
3186
+ float_predictions = [to_float_or_default(p) for p in predictions]
3187
 
3188
  best_thr = -1
3189
  best_f1 = -1
3190
  for thr in set(float_predictions):
3191
  new_predictions = [
3192
+ "1" if float_prediction >= thr else "0"
3193
  for float_prediction in float_predictions
3194
  ]
3195
  f1 = super().compute(references, new_predictions, task_data)[
 
3200
  best_thr = thr
3201
 
3202
  return {self.main_score: best_f1, "best_thr_maxf1": best_thr}
3203
+
3204
+
3205
+ class BinaryAccuracy(InstanceMetric):
3206
+ """Calculate accuracy for a binary task, using 0.5 as the threshold in the case of float predictions."""
3207
+
3208
+ reduction_map = {"mean": ["accuracy_binary"]}
3209
+ main_score = "accuracy_binary"
3210
+ ci_scores = ["accuracy_binary"]
3211
+ pos_classes = {"1", "1.0", "yes", "true"}
3212
+ threshold = 0.5
3213
+
3214
+ prediction_type = "str"
3215
+ single_reference_per_prediction = True
3216
+
3217
+ def compute(
3218
+ self, references: List[Any], prediction: Any, task_data: List[Dict]
3219
+ ) -> dict:
3220
+ float_prediction = to_float_or_default(prediction)
3221
+ prediction = str(int(float_prediction > self.threshold))
3222
+ references = ["1"] if references[0].lower() in self.pos_classes else ["0"]
3223
+
3224
+ result = {self.main_score: float([prediction] == references)}
3225
+ result["score"] = result[self.main_score]
3226
+ result["score_name"] = self.main_score
3227
+ return result
3228
+
3229
+
3230
+ class BinaryMaxAccuracy(GlobalMetric):
3231
+ """Calculate the maximal accuracy and the decision threshold that achieves it for a binary task with float predictions."""
3232
+
3233
+ process_single_instances = False
3234
+ main_score = "max_accuracy_binary"
3235
+ pos_classes = {"1", "1.0", "yes", "true"}
3236
+
3237
+ prediction_type = "str"
3238
+ single_reference_per_prediction = True
3239
+
3240
+ def compute(
3241
+ self,
3242
+ references: List[List[str]],
3243
+ predictions: List[List[str]],
3244
+ task_data: List[Dict],
3245
+ ) -> dict:
3246
+ float_predictions = [to_float_or_default(p) for p in predictions]
3247
+ references = [
3248
+ ["1"] if r[0].lower() in self.pos_classes else ["0"] for r in references
3249
+ ]
3250
+
3251
+ best_thr = -1
3252
+ best_acc = -1
3253
+ for thr in set(float_predictions):
3254
+ new_predictions = [
3255
+ "1" if float_prediction >= thr else "0"
3256
+ for float_prediction in float_predictions
3257
+ ]
3258
+ acc = np.mean(
3259
+ [
3260
+ [prediction] == reference
3261
+ for prediction, reference in zip(new_predictions, references)
3262
+ ]
3263
+ )
3264
+ if acc > best_acc:
3265
+ best_acc = acc
3266
+ best_thr = thr
3267
+
3268
+ return {self.main_score: best_acc, "best_thr_max_acc": best_thr}
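A standalone sketch of the threshold sweep performed by the new BinaryMaxAccuracy, with made-up inputs:

    import numpy as np

    float_predictions = [0.2, 0.4, 0.9]
    references = [["0"], ["0"], ["1"]]
    best_thr, best_acc = -1, -1
    for thr in set(float_predictions):
        preds = ["1" if p >= thr else "0" for p in float_predictions]
        acc = np.mean([[p] == r for p, r in zip(preds, references)])
        if acc > best_acc:
            best_acc, best_thr = acc, thr
    # best_acc == 1.0 at best_thr == 0.9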