Commit: "first version dump - to clean over weekend"

Files changed:
- README.md +23 -4
- ece.py +128 -9
- requirements.txt +2 -1

README.md (CHANGED):
```diff
@@ -5,7 +5,7 @@ datasets:
 tags:
 - evaluate
 - metric
-description:
+description: binned estimator of expected calibration error
 sdk: gradio
 sdk_version: 3.0.2
 app_file: app.py
@@ -17,34 +17,53 @@ pinned: false
 ***Module Card Instructions:*** *Fill out the following subsections. Feel free to take a look at existing metric cards if you'd like examples.*
 
 ## Metric Description
+<!---
 *Give a brief overview of this metric, including what task(s) it is usually used for, if any.*
+-->
+`ECE` is a standard metric to evaluate top-1 prediction miscalibration. Generally, the lower the better.
+
 
 ## How to Use
+<!---
 *Give general statement of how to use the metric*
-
 *Provide simplest possible example for using the metric*
+-->
+
 
 ### Inputs
+<!---
 *List all input arguments in the format below*
 - **input_field** *(type): Definition of input, with explanation if necessary. State any default value(s).*
+-->
 
 ### Output Values
-
+<!---
 *Explain what this metric outputs and provide an example of what the metric output looks like. Modules should return a dictionary with one or multiple key-value pairs, e.g. {"bleu" : 6.02}*
 
 *State the range of possible values that the metric's output can take, as well as what in that range is considered good. For example: "This metric can take on any value between 0 and 100, inclusive. Higher scores are better."*
 
 #### Values from Popular Papers
 *Give examples, preferrably with links to leaderboards or publications, to papers that have reported this metric, along with the values they have reported.*
+-->
+
 
 ### Examples
+<!---
 *Give code examples of the metric being used. Try to include examples that clear up any potential ambiguity left from the metric description above. If possible, provide a range of examples that show both typical and atypical results, as well as examples where a variety of input parameters are passed.*
+-->
 
 ## Limitations and Bias
+<!---
 *Note any known limitations or biases that the metric has, with links and references if possible.*
+-->
+See [3],[4] and [5]
 
 ## Citation
-
+[1] Naeini, M.P., Cooper, G. and Hauskrecht, M., 2015, February. Obtaining well calibrated probabilities using bayesian binning. In Twenty-Ninth AAAI Conference on Artificial Intelligence.
+[2] Guo, C., Pleiss, G., Sun, Y. and Weinberger, K.Q., 2017, July. On calibration of modern neural networks. In International Conference on Machine Learning (pp. 1321-1330). PMLR.
+[3] Nixon, J., Dusenberry, M.W., Zhang, L., Jerfel, G. and Tran, D., 2019, June. Measuring Calibration in Deep Learning. In CVPR Workshops (Vol. 2, No. 7).
+[4] Kumar, A., Liang, P.S. and Ma, T., 2019. Verified uncertainty calibration. Advances in Neural Information Processing Systems, 32.
+[5] Vaicenavicius, J., Widmann, D., Andersson, C., Lindsten, F., Roll, J. and Schön, T., 2019, April. Evaluating model calibration in classification. In The 22nd International Conference on Artificial Intelligence and Statistics (pp. 3459-3467). PMLR.
 
 ## Further References
 *Add any useful further references.*
```
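For context on the new `description:` line and the citation block added above: the binned estimator of expected calibration error is the standard one from Naeini et al. [1] and Guo et al. [2]. With B equal-range confidence bins over [0, 1], it is the bin-frequency-weighted gap between accuracy and confidence:

```latex
\mathrm{ECE} = \sum_{b=1}^{B} \frac{n_b}{N} \, \bigl| \, \mathrm{acc}(b) - \mathrm{conf}(b) \, \bigr|
```

where n_b is the number of predictions whose top-1 confidence lands in bin b, N is the total number of predictions, acc(b) is the empirical accuracy inside the bin, and conf(b) is the average confidence inside the bin. Note that the implementation in ece.py below substitutes the upper bin edge for conf(b) (see `calibrated_acc = bins[1:]` in `CE_estimate`), a slight variant of the textbook definition.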

ece.py (CHANGED):
```diff
@@ -15,14 +15,15 @@
 
 import evaluate
 import datasets
+import numpy as np
 
 
 # TODO: Add BibTeX citation
 _CITATION = """\
 @InProceedings{huggingface:module,
-title = {
-authors={
-year={
+title = {Expected Calibration Error},
+authors={Jordy Van Landeghem},
+year={2022}
 }
 """
 
@@ -57,10 +58,109 @@ Examples:
 BAD_WORDS_URL = "http://url/to/external/resource/bad_words.txt"
 
 
+# TODO
+
+def bin_idx_dd(P, bins):
+    oneDbins = np.digitize(P, bins) - 1  # since bins contains extra righmost & leftmost bins
+
+    # Tie-breaking to the left for rightmost bin
+    # Using `digitize`, values that fall on an edge are put in the right bin.
+    # For the rightmost bin, we want values equal to the right
+    # edge to be counted in the last bin, and not as an outlier.
+
+    for k in range(P.shape[-1]):
+        # Find the rounding precision
+        dedges_min = np.diff(bins).min()
+        if dedges_min == 0:
+            raise ValueError('The smallest edge difference is numerically 0.')
+
+        decimal = int(-np.log10(dedges_min)) + 6
+
+        # Find which points are on the rightmost edge.
+        on_edge = np.where(
+            (P[:, k] >= bins[-1]) & (np.around(P[:, k], decimal) == np.around(bins[-1], decimal))
+        )[0]
+        # Shift these points one bin to the left.
+        oneDbins[on_edge, k] -= 1
+
+    return oneDbins
+
+
+def manual_binned_statistic(P, y_correct, bins, statistic="mean"):
+
+    binnumbers = bin_idx_dd(np.expand_dims(P, 0), bins)[0]
+    result = np.empty([len(bins)], float)
+    result.fill(np.nan)
+
+    flatcount = np.bincount(binnumbers, None)
+    a = flatcount.nonzero()
+
+    if statistic == 'mean':
+        flatsum = np.bincount(binnumbers, y_correct)
+        result[a] = flatsum[a] / flatcount[a]
+    return result, bins, binnumbers + 1  # fix for what happens in bin_idx_dd
+
+def CE_estimate(y_correct, P, bins=None, n_bins=10, p=1):
+    """
+    y_correct: binary (N x 1)
+    P: normalized (N x 1) either max or per class
+
+    Summary: weighted average over the accuracy/confidence difference of equal-range bins
+    """
+
+    # defaults:
+    if bins is None:
+        n_bins = n_bins
+        bin_range = [0, 1]
+        bins = np.linspace(bin_range[0], bin_range[1], n_bins + 1)
+        # expected; equal range binning
+    else:
+        n_bins = len(bins) - 1
+        bin_range = [min(bins), max(bins)]
+
+    # average bin probability #55 for bin 50-60; mean per bin
+    calibrated_acc = bins[1:]  # right/upper bin edges
+    # calibrated_acc = bin_centers(bins)
+
+
+    empirical_acc, bin_edges, bin_assignment = manual_binned_statistic(P, y_correct, bins)
+    bin_numbers, weights_ece = np.unique(bin_assignment, return_counts=True)
+    anindices = bin_numbers - 1  # reduce bin counts; left edge; indexes right BY DEFAULT
+
+    # Expected calibration error
+    if p < np.inf:  # Lp-CE
+        CE = np.average(
+            abs(empirical_acc[anindices] - calibrated_acc[anindices]) ** p,
+            weights=weights_ece,  # weighted average 1/binfreq
+        )
+    elif np.isinf(p):  # max-ECE
+        CE = np.max(abs(empirical_acc[anindices] - calibrated_acc[anindices]))
+
+    return CE
+
+def top_CE(Y, P, **kwargs):
+    y_correct = (Y == np.argmax(P, -1)).astype(int)
+    p_max = np.max(P, -1)
+    top_CE = CE_estimate(y_correct, p_max, **kwargs)  # can choose n_bins and norm
+    return top_CE
+
+
 @evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
 class ECE(evaluate.EvaluationModule):
     """TODO: Short description of my evaluation module."""
 
+    """
+    0. create binning scheme [discretization of f]
+    1. build histogram P(f(X))
+    2. build conditional density estimate P(y|f(X))
+    3. average bin probabilities f_B as center/edge of bin
+    4. apply L^p norm distance and weights
+    """
+
+    # have to add to initialization here?
+    # create bins using the params
+    # create proxy
+
     def _info(self):
         # TODO: Specifies the evaluate.EvaluationModuleInfo object
         return evaluate.EvaluationModuleInfo(
```
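To make the helpers introduced in this hunk concrete, here is a small worked example. It is a sketch rather than part of the commit: it assumes `ece.py` (together with its `evaluate`/`datasets` imports) is importable from the working directory, and it calls `top_CE` directly with the defaults of `CE_estimate` (10 equal-range bins on [0, 1], `p=1`).

```python
import numpy as np

from ece import top_CE  # assumes ece.py and its dependencies are on the path

# Three 3-class probability vectors and their integer labels.
predictions = np.array([
    [0.85, 0.10, 0.05],  # confident and correct
    [0.60, 0.30, 0.10],  # fairly confident but wrong
    [0.40, 0.35, 0.25],  # unsure, yet correct
], dtype=np.float32)
references = np.array([0, 1, 0], dtype=np.int64)

# Top-1 expected calibration error with the default binning and L1 norm.
print(top_CE(references, predictions))
```

The top-1 confidences 0.85, 0.60 and 0.40 fall into the bins [0.8, 0.9), [0.6, 0.7) and [0.4, 0.5); with the upper bin edge standing in for the bin confidence, the printed value should come out at roughly (|1 - 0.9| + |0 - 0.7| + |1 - 0.5|) / 3, about 0.43. A confidence of exactly 1.0 would also stay in the last bin rather than spill into an overflow bin, thanks to the tie-breaking loop in `bin_idx_dd`.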
```diff
@@ -71,11 +171,11 @@ class ECE(evaluate.EvaluationModule):
             inputs_description=_KWARGS_DESCRIPTION,
             # This defines the format of each prediction and reference
             features=datasets.Features({
-                'predictions': datasets.Value('int64'),
+                'predictions': datasets.Value('float32'),
                 'references': datasets.Value('int64'),
             }),
             # Homepage of the module for documentation
-            homepage="http://module.homepage",
+            homepage="http://module.homepage",  # https://huggingface.co/spaces/jordyvl/ece
             # Additional links to the codebase or references
             codebase_urls=["http://github.com/path/to/codebase/of/new_module"],
             reference_urls=["http://path.to.reference.url/new_module"]
@@ -88,8 +188,27 @@ class ECE(evaluate.EvaluationModule):
 
     def _compute(self, predictions, references):
         """Returns the scores"""
-
-        accuracy = sum(i == j for i, j in zip(predictions, references)) / len(predictions)
+        ECE = top_CE(references, predictions)
         return {
-            "accuracy": accuracy,
-        }
+            "ECE": ECE,
+        }
+
+
+def test_ECE():
+    N = 10  # 10 instances
+    K = 5  # 5 class problem
+
+    def random_mc_instance(concentration=1):
+        reference = np.argmax(np.random.dirichlet(([concentration for _ in range(K)])), -1)
+        prediction = np.random.dirichlet(([concentration for _ in range(K)]))  # probabilities
+        # OH  # return np.eye(K)[np.argmax(reference, -1)]
+        return reference, prediction
+
+    references, predictions = list(zip(*[random_mc_instance() for i in range(N)]))
+    references = np.array(references, dtype=np.int64)
+    predictions = np.array(predictions, dtype=np.float32)
+    res = ECE()._compute(predictions, references)
+    print(f"ECE: {res['ECE']}")
+
+if __name__ == '__main__':
+    test_ECE()
```
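The `test_ECE()` smoke test added at the bottom runs when the file is executed directly (`python ece.py`). Below is a seeded variant of the same check, again only a sketch under the assumption that `ece.py` is importable. Like the test, it calls `_compute` directly rather than going through `evaluate`'s `add`/`compute` pipeline, since the declared `features` (a single `float32` per prediction) do not yet describe the per-class probability vectors that `top_CE` operates on.

```python
import numpy as np

from ece import ECE  # assumes ece.py and its dependencies are on the path

rng = np.random.default_rng(0)
N, K = 10, 5  # 10 instances, 5 classes, as in test_ECE()

# Dirichlet-distributed class probabilities and uniformly random labels.
predictions = rng.dirichlet(np.ones(K), size=N).astype(np.float32)
references = rng.integers(0, K, size=N).astype(np.int64)

res = ECE()._compute(predictions=predictions, references=references)
print(res)  # a dict with a single "ECE" key
```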

requirements.txt (CHANGED):
```diff
@@ -1,2 +1,3 @@
 evaluate==0.1.0
-datasets~=2.0
+datasets~=2.0
+numpy>=1.19.5
```