# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""brier_score metric for multiclass problem.""" | |
import numpy as np | |
import evaluate | |
import datasets | |
_CITATION = """ | |
@article{brier1950verification, | |
title={Verification of forecasts expressed in terms of probability}, | |
author={Brier, Glenn W}, | |
journal={Monthly weather review}, | |
volume={78}, | |
number={1}, | |
pages={1--3}, | |
year={1950} | |
} | |
""" | |
_DESCRIPTION = """ | |
Measure to compare true observed labels with predicted probabilities in multiclass classification tasks. | |
""" | |
_KWARGS_DESCRIPTION = """ | |
Multiclass Brier Score: Measure to compare true observed labels with predicted probabilities in multiclass classification tasks. | |
Args: | |
pred_probs: array-like of shape (n_sample, m_classes). | |
references: array-like array of shape (n_sample,). | |
Returns: | |
brier_score: float, average brier score over all samples. | |
Examples: | |
Examples should be written in doctest format, and should illustrate how | |
to use the function. | |
>>> brier_metric = multiclass_brier_score() | |
>>> brier_score = brier_metric.compute(pred_probs=[[0.0, 1.0, 0.0]], references=[1]) | |
>>> print(brier_score) | |
{'brier_score': 0.0} | |
>>> brier_metric = multiclass_brier_score() | |
>>> brier_score = brier_metric.compute(pred_probs=[[0.1, 0.1, 0.8]], references=[2]) | |
>>> print(round(brier_score['brier_score'], 2)) | |
0.06 | |
>>> brier_metric = multiclass_brier_score() | |
>>> brier_score = brier_metric.compute(pred_probs=[[0.1, 0.1, 0.8], [0.0, 1.0, 0.0]], references=[2, 1]) | |
>>> print(round(brier_score['brier_score'], 2)) | |
0.03 | |
""" | |

class multiclass_brier_score(evaluate.Metric):
    """Brier score metric for multiclass classification problems."""

    def _info(self):
        return evaluate.MetricInfo(
            # This is the description that will appear on the modules page.
            module_type="metric",
            description=_DESCRIPTION,
            citation=_CITATION,
            inputs_description=_KWARGS_DESCRIPTION,
            # This defines the format of each prediction and reference.
            features=datasets.Features({
                "pred_probs": datasets.Sequence(datasets.Value("float")),
                "references": datasets.Value("int32"),
            }),
            # Additional links to references for the metric.
            reference_urls=["https://search.r-project.org/CRAN/refmans/mlr3measures/html/mbrier.html"],
        )

    def _compute(self, pred_probs, references):
        """Compute the multiclass Brier score.

        brier_score = 1/n * sum_{i=1}^n sum_{j=1}^m (y_{ij} - p_{ij})^2

        where y_{ij} is the one-hot encoding of the true label of sample i and
        p_{ij} is the predicted probability of class j for sample i.

        Args:
            pred_probs: array of shape (n, m), where n is the number of samples and m is the number of classes.
            references: array of shape (n,) with integer class labels.
        """
        assert len(pred_probs) == len(references), "pred_probs and references must have the same length"
        pred_probs = np.asarray(pred_probs)
        references = np.asarray(references)
        n = len(references)
        m = pred_probs.shape[1]
        # One-hot encode the references, shape (n, m).
        references_onehot = np.zeros((n, m))
        references_onehot[np.arange(n), references] = 1
        # Sum squared differences over classes, average over samples.
        brier_score = float(np.sum((references_onehot - pred_probs) ** 2) / n)
        return {
            "brier_score": brier_score,
        }
if __name__ == "__main__": | |
import doctest | |
doctest.testmod() |
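
    # Illustrative sanity check, a sketch rather than part of the metric API:
    # compute the Brier score directly with numpy, mirroring the formula in
    # _compute's docstring, and compare it against the metric's own output.
    probs = np.array([[0.1, 0.1, 0.8], [0.0, 1.0, 0.0]])
    labels = np.array([2, 1])
    onehot = np.zeros_like(probs)
    onehot[np.arange(len(labels)), labels] = 1
    manual = float(np.sum((onehot - probs) ** 2) / len(labels))
    result = multiclass_brier_score().compute(pred_probs=probs.tolist(), references=labels.tolist())
    # Loose tolerance: the feature schema stores probabilities as 32-bit floats,
    # so only approximate agreement with the float64 computation is expected.
    assert abs(manual - result["brier_score"]) < 1e-6
    print(f"manual Brier score: {manual:.2f}")  # expected: 0.03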