# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Brier score metric for multiclass problems."""

import numpy as np

import evaluate
import datasets


_CITATION = """\
@article{brier1950verification,
  title={Verification of forecasts expressed in terms of probability},
  author={Brier, Glenn W},
  journal={Monthly Weather Review},
  volume={78},
  number={1},
  pages={1--3},
  year={1950}
}
"""

_DESCRIPTION = """\
Measure comparing the true observed labels with the predicted class probabilities in multiclass classification tasks.
"""

_KWARGS_DESCRIPTION = """
Multiclass Brier Score: compares the true observed labels with the predicted class probabilities in multiclass classification tasks.
Args:
    pred_probs: array-like of shape (n_samples, m_classes). Predicted probability of each class for each sample.
    references: array-like of shape (n_samples,). True class index for each sample.
Returns:
    brier_score: float, the Brier score averaged over all samples.
Examples:
    >>> brier_metric = multiclass_brier_score()
    >>> brier_score = brier_metric.compute(pred_probs=[[0.0, 1.0, 0.0]], references=[1])
    >>> print(brier_score)
    {'brier_score': 0.0}

    >>> brier_metric = multiclass_brier_score()
    >>> brier_score = brier_metric.compute(pred_probs=[[0.1, 0.1, 0.8]], references=[2])
    >>> print(round(brier_score['brier_score'], 2))
    0.06

    >>> brier_metric = multiclass_brier_score()
    >>> brier_score = brier_metric.compute(pred_probs=[[0.1, 0.1, 0.8], [0.0, 1.0, 0.0]], references=[2, 1])
    >>> print(round(brier_score['brier_score'], 2))
    0.03
"""


@evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
class multiclass_brier_score(evaluate.Metric):
    """Average squared difference between one-hot encoded references and predicted class probabilities."""

    def _info(self):
        return evaluate.MetricInfo(
module_type="metric", description=_DESCRIPTION, citation=_CITATION, inputs_description=_KWARGS_DESCRIPTION, # This defines the format of each prediction and reference features=datasets.Features({ 'pred_probs': datasets.Sequence(datasets.Value("float")), 'references': datasets.Value('int32'), }), # Additional links to the codebase or references #codebase_urls=["http://github.com/path/to/codebase/of/new_module"], reference_urls=["https://search.r-project.org/CRAN/refmans/mlr3measures/html/mbrier.html"] ) def _compute(self, pred_probs: np.ndarray, references: np.ndarray): """ brier_score = 1/n * sum_{i=1}^n sum_{j=1}^m (y_{ij} - p{ij})^2 Args: pred_probs: numpy array of shape (n, m) where n is the number of samples and m is the number of classes references: numpy array of shape (n,) where n is the number of samples """ assert len(pred_probs) == len(references), "The length of the predictions and references should be the same" pred_probs = np.array(pred_probs) n = len(references) m = pred_probs.shape[1] # generate one-hot encoding for the references references_onehot = np.zeros((n, m)) references_onehot[np.arange(n), references] = 1 # shape: (n, m) brier_score = np.sum((references_onehot - pred_probs)**2) / float(n) return { "brier_score": brier_score, } if __name__ == "__main__": import doctest doctest.testmod()