|
from typing import Sequence |
|
|
|
import numpy |
|
from sklearn.metrics import roc_curve, auc |
|
|
|
import datasets |
|
import evaluate |
|
|
|
_DESCRIPTION = """ |
|
MC-AUROC (Multi-class Area Under the Receiver Operating Characteristic Curve) is a performance metric used in multiclass classification tasks. |
|
It evaluates the ability of a model to distinguish each class from the rest across different decision thresholds.
|
The curve is generated by plotting the true positive rate (sensitivity) against the false positive rate (1-specificity) at various threshold settings. |
|
AUROC provides a single scalar value indicating the overall discriminatory power of the model, with higher values suggesting better performance. |
|
""" |
|
|
|
_KWARGS_DESCRIPTION = """ |
|
AUROC is natively a binary-classification metric; here a one-vs-rest (one-vs-all) strategy extends it to multi-class classification problems.
|
The multi-class AUROC is calculated by treating each class as the positive class and the rest as the negative class. |
|
The final score is the average of the AUROC scores for each class. |
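
In symbols, with C classes and AUROC_c the one-vs-rest score obtained by treating class c as positive:

    MC-AUROC = (1 / C) * sum_{c=1..C} AUROC_c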
|
|
|
Args: |
|
    predictions: list-like. Predicted probabilities or decision scores for each class, one sequence per example.
    references: list-like. True labels indicating the actual class memberships (integer class indices, starting from 0).
    CI: bool, optional (default False). If True, also estimate a 95% credible interval for the averaged score via a Beta-posterior simulation of the per-threshold true-positive rates.
|
Returns: |
|
    mc_auroc_score: float. Average of the one-vs-rest AUROC scores over all classes.
    mc_auroc_ci: tuple(float, float) or None. Lower and upper bounds of the 95% credible interval when CI=True, otherwise None.
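
Example (illustrative sketch; the evaluate.load() path below is a placeholder for wherever this module is hosted):

    mc_auroc = evaluate.load("path/to/this/module")
    results = mc_auroc.compute(
        predictions=[[0.7, 0.2, 0.1], [0.1, 0.8, 0.1], [0.2, 0.3, 0.5]],
        references=[0, 1, 2],
    )
    print(results["mc_auroc_score"])  # averaged one-vs-rest AUROC, here 1.0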
|
""" |
|
|
|
|
|
@evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION) |
|
class AVG_MULTICLASS_AUROC(evaluate.Metric): |
|
def _info(self): |
|
return evaluate.MetricInfo( |
|
description=_DESCRIPTION, |
|
inputs_description=_KWARGS_DESCRIPTION, |
|
citation="", |
|
features=[ |
|
datasets.Features( |
|
{ |
|
"predictions": datasets.Sequence(datasets.Value("float")), |
|
"references": datasets.Value("int8") |
|
} |
|
), |
|
], |
|
reference_urls=[ |
|
"https://en.wikipedia.org/wiki/Receiver_operating_characteristic" |
|
], |
|
) |
|
|
|
def _evaluate_statistics(self, variates, coverage): |
|
"""Evaluates the left and right margins for a given M-C distribution |
|
|
|
|
|
Parameters |
|
---------- |
|
|
|
variates : numpy.ndarray |
|
A 1-D array containing the simulated variates |
|
|
|
coverage : float |
|
            A number between 0 and 1 indicating the desired coverage. Typically,
            this number is set to 0.95 (95% coverage).
|
|
|
|
|
Returns |
|
------- |
|
|
|
        stats : (float, float)
            Lower and upper bounds of the equal-tailed credible interval for the
            input simulation
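
        Examples
        --------

        A minimal illustrative sketch (the Beta parameters are arbitrary)::

            variates = numpy.random.beta(50, 10, 1_000_000)
            lower, upper = self._evaluate_statistics(variates, 0.95)

        ``lower`` and ``upper`` then bracket the central 95% of the simulated
        distribution.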
|
|
|
""" |
|
|
|
left_half = (1 - coverage) / 2 |
|
sorted_variates = numpy.sort(variates) |
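        # Equal-tailed interval: the lower/upper bounds sit (1 - coverage) / 2 of the
        # way in from each end of the sorted variates.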
|
|
|
|
|
|
|
|
|
lower_index = int(round(len(variates) * left_half)) |
|
|
|
|
|
upper_index = int(round(len(variates) * (1 - left_half))) |
|
|
|
lower = sorted_variates[lower_index - 1] |
|
upper = sorted_variates[upper_index - 1] |
|
|
|
return lower, upper |
|
|
|
def _compute(self, predictions: Sequence[Sequence[float]], references: Sequence[int], CI=False): |
|
""" |
|
Computes the average AUROC score for multi-class classification problems. |
|
""" |
|
probabilities = predictions |
|
|
|
        n_classes = len(probabilities[0])
        fpr = dict()
        tpr = dict()
        thresholds = dict()
        roc_auc = dict()
        roc_auc_ci_low = dict()
        roc_auc_ci_high = dict()
        for i in range(n_classes):
            fpr[i], tpr[i], thresholds[i] = roc_curve(
                y_true=[1 if x == i else 0 for x in references],
                y_score=[prob[i] for prob in probabilities],
            )
|
|
|
if CI: |
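                # Credible band for the ROC curve: at every threshold, model the TPR as a
                # Beta posterior over the observed TP/FN counts and keep its 2.5%/97.5%
                # quantiles; the AUC of the lower/upper TPR curves gives the CI bounds.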
|
confusion_matrices = self._get_CMs(i, probabilities, references, thresholds) |
|
|
|
low_ci_tpr, high_ci_tpr = [0] * len(thresholds[i]), [0] * len(thresholds[i]) |
|
                λ = 1.0  # uniform Beta(1, 1) prior on the per-threshold true-positive rate
|
for k in range(len(thresholds[i])): |
|
                    variates = numpy.random.beta(confusion_matrices[k]["TP"] + λ, confusion_matrices[k]["FN"] + λ, 1000000)
|
low_ci_tpr[k], high_ci_tpr[k] = self._evaluate_statistics(variates, 0.95) |
|
|
|
roc_auc_ci_low[i] = auc(fpr[i], low_ci_tpr) |
|
roc_auc_ci_high[i] = auc(fpr[i], high_ci_tpr) |
|
|
|
|
|
roc_auc[i] = auc(fpr[i], tpr[i]) |
|
|
|
|
|
            # auc() is NaN when class i has no positive (or no negative) examples in
            # the references; count such classes as 0 so the average stays defined.
            if numpy.isnan(roc_auc[i]):
                roc_auc[i] = 0
|
|
|
|
|
average_auc = numpy.mean(list(roc_auc.values())) |
|
if CI: |
|
average_auc_ci_low = numpy.mean(list(roc_auc_ci_low.values())) |
|
average_auc_ci_high = numpy.mean(list(roc_auc_ci_high.values())) |
|
|
|
return { |
|
"mc_auroc_score": average_auc, |
|
"mc_auroc_ci": (average_auc_ci_low, average_auc_ci_high) if CI else None |
|
} |
|
|
|
def _get_CMs(self, i, probabilities, references, thresholds): |
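        """Builds one confusion matrix per ROC threshold for class ``i``, treating class ``i`` as positive (one-vs-rest)."""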
|
confusion_matrices = [] |
|
for threshold in thresholds[i]: |
|
TP = 0 |
|
FP = 0 |
|
TN = 0 |
|
FN = 0 |
|
for j in range(len(probabilities)): |
|
if probabilities[j][i] >= threshold: |
|
if references[j] == i: |
|
TP += 1 |
|
else: |
|
FP += 1 |
|
else: |
|
if references[j] == i: |
|
FN += 1 |
|
else: |
|
TN += 1 |
|
cm = {"TP": TP, "FP": FP, "TN": TN, "FN": FN, "threshold": threshold, "class": i} |
|
confusion_matrices.append(cm) |
|
|
|
return confusion_matrices |
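

# Minimal usage sketch (illustrative only, not part of the metric). It constructs the
# metric class directly instead of going through evaluate.load(), whose module path
# depends on where this file is published; the toy probabilities and labels are made up.
# With CI=True the Beta-posterior simulation draws 10^6 samples per threshold, so this
# takes a few seconds.
if __name__ == "__main__":
    metric = AVG_MULTICLASS_AUROC()

    demo_predictions = [
        [0.7, 0.2, 0.1],
        [0.1, 0.8, 0.1],
        [0.2, 0.3, 0.5],
        [0.6, 0.3, 0.1],
    ]
    demo_references = [0, 1, 2, 1]

    result = metric.compute(predictions=demo_predictions, references=demo_references, CI=True)
    print(result)  # {"mc_auroc_score": ..., "mc_auroc_ci": (..., ...)}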
|
|