File size: 6,008 Bytes
0c400ac 439a704 0c400ac 439a704 f4e212f 0c400ac 439a704 0c400ac 439a704 0c400ac 439a704 f4e212f 439a704 0c400ac bfab51a 0c400ac f4e212f 0c400ac f4e212f 0c400ac 439a704 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 |
from typing import Sequence
import numpy
from sklearn.metrics import roc_curve, auc
import datasets
import evaluate
# Human-readable summary of the metric; shown as the metric description.
_DESCRIPTION = """
MC-AUROC (Multi-class Area Under the Receiver Operating Characteristic Curve) is a performance metric used in multiclass classification tasks.
It evaluates the ability of a model to distinguish between positive and negative classes across different threshold values.
The curve is generated by plotting the true positive rate (sensitivity) against the false positive rate (1-specificity) at various threshold settings.
AUROC provides a single scalar value indicating the overall discriminatory power of the model, with higher values suggesting better performance.
"""

# Argument / return documentation. The names listed here must match the
# keyword arguments of `_compute` and the keys of the dict it returns.
_KWARGS_DESCRIPTION = """
AUROC metric for multi-class classification predictions. Here we use one-vs-all strategy to calculate the AUROC for multi-class classification problems.
The multi-class AUROC is calculated by treating each class as the positive class and the rest as the negative class.
The final score is the average of the AUROC scores for each class.
Args:
    predictions: list-like. Predicted probabilities or decision scores for each class (one sequence of per-class scores per sample).
    references: list-like. True labels indicating the actual class memberships (must be ordinal, starting from 0).
    CI: bool, optional (default: False). If True, a 95% credible interval for the averaged score is computed as well.
Returns:
    mc_auroc_score: float. Multi-class Area Under the Receiver Operating Characteristic Curve (MC-AUROC) score.
    mc_auroc_ci: tuple (low, high) of floats when CI is True, otherwise None.
"""
# NOTE: no class docstring on purpose -- `add_start_docstrings` builds the
# class documentation from _DESCRIPTION and _KWARGS_DESCRIPTION.
@evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
class AVG_MULTICLASS_AUROC(evaluate.Metric):
    def _info(self):
        """Return the metric metadata and the expected input features.

        Each sample consists of a sequence of floats (``predictions``, one
        score per class) and a single integer label (``references``).
        """
        return evaluate.MetricInfo(
            description=_DESCRIPTION,
            inputs_description=_KWARGS_DESCRIPTION,
            citation="",
            features=[
                datasets.Features(
                    {
                        "predictions": datasets.Sequence(datasets.Value("float")),
                        "references": datasets.Value("int8"),
                    }
                ),
            ],
            reference_urls=[
                "https://en.wikipedia.org/wiki/Receiver_operating_characteristic"
            ],
        )

    def _evaluate_statistics(self, variates, coverage):
        """Evaluates the left and right margins for a given M-C distribution

        Parameters
        ----------
        variates : numpy.ndarray
            A 1-D array containing the simulated variates
        coverage : float
            A number, between 0 and 1 to indicate the desired coverage.
            Typically, this number is set to 0.95 (95% coverage).

        Returns
        -------
        interval : (float, float)
            Lower and upper bound of the equally tailed interval that covers
            ``coverage`` of the simulated variates.
        """
        tail = (1 - coverage) / 2  # size of the excluded area on each side
        ordered = numpy.sort(variates)
        n_variates = len(variates)

        # Rank (1-based) of the score that excludes the left / right tail.
        lower_rank = int(round(n_variates * tail))
        upper_rank = int(round(n_variates * (1 - tail)))

        # Convert ranks to 0-based positions. A rank of 0 (possible for small
        # samples) must not become index -1, which would wrap around to the
        # largest variate; clamp to the first element instead.
        lower = ordered[max(lower_rank - 1, 0)]
        upper = ordered[max(upper_rank - 1, 0)]
        return lower, upper

    def _compute(self, predictions: Sequence[Sequence[float]], references: Sequence[int], CI=False):
        """Compute the average one-vs-rest AUROC over all classes.

        Parameters
        ----------
        predictions : sequence of sequences of float
            One sequence of per-class scores for every sample. The number of
            classes is taken from the length of the first entry.
        references : sequence of int
            True class index of every sample.
        CI : bool, optional
            When true, a credible interval of the averaged score is estimated
            as well: for every ROC threshold the true positive rate is
            modelled as Beta(TP + 1, FN + 1), sampled by Monte-Carlo, and the
            equally tailed 95% bounds are integrated against the false
            positive rate.

        Returns
        -------
        dict
            ``"mc_auroc_score"``: mean of the per-class AUROC values.
            ``"mc_auroc_ci"``: ``(low, high)`` when ``CI`` is true, else
            ``None``.

        Raises
        ------
        ValueError
            If ``predictions`` is empty.
        """
        probabilities = predictions
        if len(probabilities) == 0:
            raise ValueError("predictions must contain at least one sample")

        n_classes = len(probabilities[0])
        prior = 1.0  # pseudo-count added to both Beta shape parameters
        n_draws = 1000000  # Monte-Carlo sample size per threshold
        coverage = 0.95

        thresholds = {}
        roc_auc = {}
        roc_auc_ci_low = {}
        roc_auc_ci_high = {}

        for i in range(n_classes):
            # One-vs-rest: class i is the positive class, all others negative.
            fpr, tpr, thresholds[i] = roc_curve(
                y_true=[1 if label == i else 0 for label in references],
                y_score=[prob[i] for prob in probabilities],
            )

            if CI:
                low_ci_tpr = []
                high_ci_tpr = []
                for cm in self._get_CMs(i, probabilities, references, thresholds):
                    variates = numpy.random.beta(cm["TP"] + prior, cm["FN"] + prior, n_draws)
                    low, high = self._evaluate_statistics(variates, coverage)
                    low_ci_tpr.append(low)
                    high_ci_tpr.append(high)
                roc_auc_ci_low[i] = auc(fpr, low_ci_tpr)
                roc_auc_ci_high[i] = auc(fpr, high_ci_tpr)

            roc_auc[i] = auc(fpr, tpr)
            # if AUC is NaN, set it to 0
            if numpy.isnan(roc_auc[i]):
                roc_auc[i] = 0

        # Compute average AUC
        average_auc = numpy.mean(list(roc_auc.values()))
        interval = None
        if CI:
            interval = (
                numpy.mean(list(roc_auc_ci_low.values())),
                numpy.mean(list(roc_auc_ci_high.values())),
            )

        return {
            "mc_auroc_score": average_auc,
            "mc_auroc_ci": interval,
        }

    def _get_CMs(self, i, probabilities, references, thresholds):
        """Build one confusion matrix per ROC threshold of class ``i``.

        A sample is predicted positive when its score for class ``i`` is
        greater than or equal to the threshold, and is truly positive when
        its reference label equals ``i``.

        Parameters
        ----------
        i : int
            Index of the class treated as positive.
        probabilities : sequence of sequences of float
            Per-class scores of every sample.
        references : sequence of int
            True class index of every sample.
        thresholds : mapping
            ``thresholds[i]`` is the iterable of thresholds to evaluate.

        Returns
        -------
        list of dict
            One dict per threshold with the integer counts ``"TP"``, ``"FP"``,
            ``"TN"``, ``"FN"`` plus the ``"threshold"`` and ``"class"`` used.
        """
        # Vectorise the per-sample comparison; the counts are identical to a
        # sample-by-sample loop but are computed by numpy.
        scores = numpy.asarray([row[i] for row in probabilities])
        is_positive = numpy.asarray(references) == i
        is_negative = ~is_positive

        confusion_matrices = []
        for threshold in thresholds[i]:
            flagged = scores >= threshold
            rejected = ~flagged
            confusion_matrices.append(
                {
                    "TP": int(numpy.count_nonzero(flagged & is_positive)),
                    "FP": int(numpy.count_nonzero(flagged & is_negative)),
                    "TN": int(numpy.count_nonzero(rejected & is_negative)),
                    "FN": int(numpy.count_nonzero(rejected & is_positive)),
                    "threshold": threshold,
                    "class": i,
                }
            )
        return confusion_matrices
|