# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""brier_score metric for multiclass problem."""
import numpy as np
import evaluate
import datasets
_CITATION = """
@article{brier1950verification,
title={Verification of forecasts expressed in terms of probability},
author={Brier, Glenn W},
journal={Monthly weather review},
volume={78},
number={1},
pages={1--3},
year={1950}
}
"""

_DESCRIPTION = """
Measure comparing true observed labels with predicted class probabilities in multiclass classification tasks; lower scores indicate better probability estimates.
"""

_KWARGS_DESCRIPTION = """
Multiclass Brier Score: measure comparing true observed labels with predicted class probabilities in multiclass classification tasks.
Args:
    pred_probs: array-like of shape (n_samples, n_classes). Predicted probability of each class for every sample; each row should sum to 1.
    references: array-like of shape (n_samples,). Ground-truth class indices.
Returns:
    brier_score: float, Brier score averaged over all samples.
Examples:
    >>> brier_metric = multiclass_brier_score()
    >>> brier_score = brier_metric.compute(pred_probs=[[0.0, 1.0, 0.0]], references=[1])
    >>> print(brier_score)
    {'brier_score': 0.0}

    >>> brier_metric = multiclass_brier_score()
    >>> brier_score = brier_metric.compute(pred_probs=[[0.1, 0.1, 0.8]], references=[2])
    >>> print(round(brier_score['brier_score'], 2))
    0.06

    >>> brier_metric = multiclass_brier_score()
    >>> brier_score = brier_metric.compute(pred_probs=[[0.1, 0.1, 0.8], [0.0, 1.0, 0.0]], references=[2, 1])
    >>> print(round(brier_score['brier_score'], 2))
    0.03
"""


@evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
class multiclass_brier_score(evaluate.Metric):
    """Multiclass Brier score metric."""

    def _info(self):
        return evaluate.MetricInfo(
            # This is the description that will appear on the modules page.
            module_type="metric",
            description=_DESCRIPTION,
            citation=_CITATION,
            inputs_description=_KWARGS_DESCRIPTION,
            # This defines the format of each prediction and reference.
            features=datasets.Features({
                'pred_probs': datasets.Sequence(datasets.Value("float")),
                'references': datasets.Value('int32'),
            }),
            reference_urls=["https://search.r-project.org/CRAN/refmans/mlr3measures/html/mbrier.html"],
        )

    def _compute(self, pred_probs: np.ndarray, references: np.ndarray):
        """
        brier_score = 1/n * sum_{i=1}^n sum_{j=1}^m (y_{ij} - p_{ij})^2

        Args:
            pred_probs: numpy array of shape (n, m), where n is the number of samples and m is the number of classes.
            references: numpy array of shape (n,), where n is the number of samples.
        """
        assert len(pred_probs) == len(references), "The length of the predictions and references should be the same"
        pred_probs = np.array(pred_probs)
        references = np.asarray(references)
        n = len(references)
        m = pred_probs.shape[1]
        # Generate the one-hot encoding for the references: y_{ij} = 1 iff sample i has true label j.
        references_onehot = np.zeros((n, m))
        references_onehot[np.arange(n), references] = 1  # shape: (n, m)
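        # e.g. with references = [2, 1] and m = 3 (as in the doctest above),
        # references_onehot == [[0, 0, 1], [0, 1, 0]].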
        # Sum of squared differences over all classes, averaged over the n samples.
        brier_score = np.sum((references_onehot - pred_probs) ** 2) / float(n)
        return {
            "brier_score": brier_score,
        }


if __name__ == "__main__":
    import doctest

    doctest.testmod()
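
    # A minimal usage sketch (an assumption: running this file directly rather
    # than loading it from the Hub via evaluate.load). Direct instantiation
    # mirrors the doctest examples above.
    metric = multiclass_brier_score()
    result = metric.compute(
        pred_probs=[[0.7, 0.2, 0.1], [0.1, 0.8, 0.1]],
        references=[0, 1],
    )
    # ((0.3^2 + 0.2^2 + 0.1^2) + (0.1^2 + 0.2^2 + 0.1^2)) / 2 = (0.14 + 0.06) / 2 = 0.1
    print(result)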