# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""brier_score metric for multiclass problem."""

import numpy as np

import evaluate
import datasets


_CITATION = """
@article{brier1950verification,
  title={Verification of forecasts expressed in terms of probability},
  author={Brier, Glenn W},
  journal={Monthly weather review},
  volume={78},
  number={1},
  pages={1--3},
  year={1950}
}
"""

_DESCRIPTION = """
Measure to compare true observed labels with predicted probabilities in multiclass classification tasks.
"""


_KWARGS_DESCRIPTION = """
Multiclass Brier Score: Measure to compare true observed labels with predicted probabilities in multiclass classification tasks.
Args:
    pred_probs: array-like of shape (n_sample, m_classes).
    references: array-like array of shape (n_sample,).
Returns:
    brier_score: float, average brier score over all samples.
Examples:
    Examples should be written in doctest format, and should illustrate how
    to use the function.

    >>> brier_metric = multiclass_brier_score()
    >>> brier_score = brier_metric.compute(pred_probs=[[0.0, 1.0, 0.0]], references=[1])
    >>> print(brier_score)
    {'brier_score': 0.0}

    >>> brier_metric = multiclass_brier_score()
    >>> brier_score = brier_metric.compute(pred_probs=[[0.1, 0.1, 0.8]], references=[2])
    >>> print(round(brier_score['brier_score'], 2))
    0.06

    >>> brier_metric = multiclass_brier_score()
    >>> brier_score = brier_metric.compute(pred_probs=[[0.1, 0.1, 0.8], [0.0, 1.0, 0.0]], references=[2, 1])
    >>> print(round(brier_score['brier_score'], 2))
    0.03
"""


@evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
class multiclass_brier_score(evaluate.Metric):
    """TODO: Short description of my evaluation module."""

    def _info(self):
        # TODO: Specifies the evaluate.EvaluationModuleInfo object
        return evaluate.MetricInfo(
            # This is the description that will appear on the modules page.
            module_type="metric",
            description=_DESCRIPTION,
            citation=_CITATION,
            inputs_description=_KWARGS_DESCRIPTION,
            # This defines the format of each prediction and reference
            features=datasets.Features({
                'pred_probs': datasets.Sequence(datasets.Value("float")),
                'references': datasets.Value('int32'),
            }),
            # Additional links to the codebase or references
            #codebase_urls=["http://github.com/path/to/codebase/of/new_module"],
            reference_urls=["https://search.r-project.org/CRAN/refmans/mlr3measures/html/mbrier.html"]
        )


    def _compute(self, pred_probs: np.ndarray, references: np.ndarray):
        """
        brier_score = 1/n * sum_{i=1}^n sum_{j=1}^m (y_{ij} - p{ij})^2
        Args:
            pred_probs: numpy array of shape (n, m) where n is the number of samples and m is the number of classes
            references: numpy array of shape (n,) where n is the number of samples
        """
        assert len(pred_probs) == len(references), "The length of the predictions and references should be the same"
        pred_probs = np.array(pred_probs)
        n = len(references)
        m = pred_probs.shape[1]
        # generate one-hot encoding for the references
        references_onehot = np.zeros((n, m))
        references_onehot[np.arange(n), references] = 1   # shape: (n, m)
        brier_score = np.sum((references_onehot - pred_probs)**2) / float(n)
        return {
            "brier_score": brier_score,
        }


if __name__ == "__main__":
    import doctest
    doctest.testmod()