Update ISCO-08 Hierarchical Accuracy Metric description and implementation
Browse files- metric_template_1.py +67 -29
metric_template_1.py
CHANGED
@@ -11,7 +11,7 @@
|
|
11 |
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12 |
# See the License for the specific language governing permissions and
|
13 |
# limitations under the License.
|
14 |
-
"""Hierarchical Accuracy
|
15 |
|
16 |
import evaluate
|
17 |
import datasets
|
@@ -20,39 +20,77 @@ import isco
|
|
20 |
|
21 |
|
22 |
# TODO: Add BibTeX citation
|
23 |
-
_CITATION = """
|
24 |
-
@
|
25 |
-
title
|
26 |
-
|
27 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
28 |
}
|
29 |
"""
|
30 |
|
31 |
-
|
32 |
-
|
33 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
34 |
"""
|
35 |
|
36 |
-
|
37 |
-
# TODO: Add description of the arguments of the module here
|
38 |
_KWARGS_DESCRIPTION = """
|
39 |
-
Calculates
|
|
|
40 |
Args:
|
41 |
-
|
42 |
-
|
43 |
-
|
44 |
-
reference should be a string with tokens separated by spaces.
|
45 |
Returns:
|
46 |
-
|
47 |
-
|
|
|
|
|
48 |
Examples:
|
49 |
-
|
50 |
-
to use the function.
|
51 |
|
52 |
-
>>>
|
53 |
-
>>> results =
|
54 |
>>> print(results)
|
55 |
-
{
|
|
|
|
|
|
|
|
|
|
|
56 |
"""
|
57 |
|
58 |
# TODO: Define external resources urls if needed
|
@@ -65,8 +103,8 @@ ILO_ISCO_CSV_URL = (
|
|
65 |
|
66 |
|
67 |
@evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
|
68 |
-
class
|
69 |
-
"""
|
70 |
|
71 |
def _info(self):
|
72 |
# TODO: Specifies the evaluate.EvaluationModuleInfo object
|
@@ -83,15 +121,15 @@ class MetricTemplate1(evaluate.Metric):
|
|
83 |
"references": datasets.Value("string"),
|
84 |
}
|
85 |
),
|
86 |
-
# Homepage of the module for documentation
|
87 |
homepage="http://module.homepage",
|
88 |
-
# Additional links to the codebase or references
|
89 |
codebase_urls=["http://github.com/path/to/codebase/of/new_module"],
|
90 |
reference_urls=["http://path.to.reference.url/new_module"],
|
91 |
)
|
92 |
|
93 |
def _download_and_prepare(self, dl_manager):
|
94 |
-
"""Download external ISCO-08 csv file for creating the hierarchy dictionary."""
|
95 |
isco_csv = dl_manager.download_and_extract(ISCO_CSV_MIRROR_URL)
|
96 |
print(f"ISCO CSV file downloaded")
|
97 |
self.isco_hierarchy = isco.create_hierarchy_dict(isco_csv)
|
@@ -99,7 +137,7 @@ class MetricTemplate1(evaluate.Metric):
|
|
99 |
print(self.isco_hierarchy)
|
100 |
|
101 |
def _compute(self, predictions, references):
|
102 |
-
"""Returns the scores"""
|
103 |
# Convert the inputs to strings
|
104 |
predictions = [str(p) for p in predictions]
|
105 |
references = [str(r) for r in references]
|
|
|
11 |
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12 |
# See the License for the specific language governing permissions and
|
13 |
# limitations under the License.
|
14 |
+
"""ISCO-08 Hierarchical Accuracy Measure."""
|
15 |
|
16 |
import evaluate
|
17 |
import datasets
|
|
|
20 |
|
21 |
|
22 |
# TODO: Add BibTeX citation
|
23 |
+
_CITATION = """
|
24 |
+
@article{scikit-learn,
|
25 |
+
title={Scikit-learn: Machine Learning in {P}ython},
|
26 |
+
author={Pedregosa, F. and Varoquaux, G. and Gramfort, A. and Michel, V.
|
27 |
+
and Thirion, B. and Grisel, O. and Blondel, M. and Prettenhofer, P.
|
28 |
+
and Weiss, R. and Dubourg, V. and Vanderplas, J. and Passos, A. and
|
29 |
+
Cournapeau, D. and Brucher, M. and Perrot, M. and Duchesnay, E.},
|
30 |
+
journal={Journal of Machine Learning Research},
|
31 |
+
volume={12},
|
32 |
+
pages={2825--2830},
|
33 |
+
year={2011}
|
34 |
}
|
35 |
"""
|
36 |
|
37 |
+
_DESCRIPTION = """
|
38 |
+
The ISCO-08 Hierarchical Accuracy Measure is an implementation
|
39 |
+
of the measure described in [Functional Annotation of Genes Using Hierarchical Text Categorization](https://www.researchgate.net/publication/44046343_Functional_Annotation_of_Genes_Using_Hierarchical_Text_Categorization)
|
40 |
+
(Kiritchenko, Svetlana and Famili, Fazel. 2005) with the ISCO-08 taxonomy by the International Labour Organization.\n
|
41 |
+
\n
|
42 |
+
1. The measure gives credit to partially correct classification,
|
43 |
+
e.g. misclassification into node $I$ (ISCO unit group "1120")
|
44 |
+
when the correct category is $G$ (ISCO unit group "1111")
|
45 |
+
should be penalized less than misclassification into node $D$
|
46 |
+
(e.g., ISCO unit group "1211") since $I$ is in the same subgraph (ISCO sub-major group "11")
|
47 |
+
as $G$ and $D$ is not.
|
48 |
+
2. The measure punishes distant errors more heavily:
|
49 |
+
1. the measure gives higher evaluation for correctly classifying one level down compared to staying at the parent node, e.g. classification into node $E$ (ISCO minor group "111") is better than classification into its parent $C$ (ISCO sub-major group "11") since $E$ is closer to the correct category $G$;
|
50 |
+
2. the measure gives lower evaluation for incorrectly classifying one level down compared to staying at the parent node, e.g. classification into node $F$ (ISCO minor group "112") is worse than classification into its parent $C$ since $F$ is farther away from $G$.\n
|
51 |
+
\n
|
52 |
+
The features described are accomplished by pairing hierarchical variants of precision ($hP$) and recall ($hR$) to form a hierarchical F1 ($hF_β$) score where each sample belongs not only to its class (e.g., a unit group level code), but also to all ancestors of the class in a hierarchical graph (i.e., the minor, sub-major, and major group level codes).\n
|
53 |
+
\n
|
54 |
+
Hierarchical precision can be computed with:\n
|
55 |
+
$hP = \\frac{|\\hat{C}_i ∩ \\hat{C}'_i|}{|\\hat{C}'_i|} = \\frac{1}{2}$\n
|
56 |
+
\n
|
57 |
+
Hierarchical recall can be computed with:\n
|
58 |
+
$hR = \\frac{|\\hat{C}_i ∩ \\hat{C}'_i|}{|\\hat{C}_i|} = \\frac{1}{2}$\n
|
59 |
+
\n
|
60 |
+
Combining the two values $hP$ and $hR$ into one hF-measure:\n
|
61 |
+
$hF_β = \\frac{(β^2 + 1) · hP · hR}{(β^2 · hP + hR)}, β ∈ [0, +∞)$\n
|
62 |
+
\n
|
63 |
+
Note:\n
|
64 |
+
**TP**: True positive\n
|
65 |
+
**TN**: True negative\n
|
66 |
+
**FP**: False positive\n
|
67 |
+
**FN**: False negative\n
|
68 |
"""
|
69 |
|
|
|
|
|
70 |
_KWARGS_DESCRIPTION = """
|
71 |
+
Calculates hierarchical precision, hierarchical recall and hierarchical F1 given a list of reference codes and predicted codes from the ISCO-08 taxonomy by the International Labour Organization.
|
72 |
+
|
73 |
Args:
|
74 |
+
- references (List[str]): List of ISCO-08 reference codes. Each reference code should be a single token, 4-digit ISCO-08 code string.
|
75 |
+
- predictions (List[str]): List of machine predicted or human assigned ISCO-08 codes to score. Each prediction should be a single token, 4-digit ISCO-08 code string.
|
76 |
+
|
|
|
77 |
Returns:
|
78 |
+
- hierarchical_precision (`float` or `int`): Hierarchical precision score. Minimum possible value is 0. Maximum possible value is 1.0. A higher score means higher accuracy.
|
79 |
+
- hierarchical_recall (`float` or `int`): Hierarchical recall score. Minimum possible value is 0. Maximum possible value is 1.0. A higher score means higher accuracy.
|
80 |
+
- hierarchical_fmeasure (`float` or `int`): Hierarchical F1 score. Minimum possible value is 0. Maximum possible value is 1.0. A higher score means higher accuracy.
|
81 |
+
|
82 |
Examples:
|
83 |
+
Example 1
|
|
|
84 |
|
85 |
+
>>> hierarchical_accuracy_metric = evaluate.load("ham")
|
86 |
+
>>> results = hierarchical_accuracy_metric.compute(references=["1111", "1112", "1113", "1114"], predictions=["1111", "1113", "1120", "1211"])
|
87 |
>>> print(results)
|
88 |
+
{
|
89 |
+
'accuracy': 0.25,
|
90 |
+
'hierarchical_precision': 0.7142857142857143,
|
91 |
+
'hierarchical_recall': 0.5,
|
92 |
+
'hierarchical_fmeasure': 0.588235294117647
|
93 |
+
}
|
94 |
"""
|
95 |
|
96 |
# TODO: Define external resources urls if needed
|
|
|
103 |
|
104 |
|
105 |
@evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
|
106 |
+
class ISCOHAM(evaluate.Metric):
|
107 |
+
"""The ISCO-08 Hierarchical Accuracy Measure"""
|
108 |
|
109 |
def _info(self):
|
110 |
# TODO: Specifies the evaluate.EvaluationModuleInfo object
|
|
|
121 |
"references": datasets.Value("string"),
|
122 |
}
|
123 |
),
|
124 |
+
# TODO: Homepage of the module for documentation
|
125 |
homepage="http://module.homepage",
|
126 |
+
# TODO: Additional links to the codebase or references
|
127 |
codebase_urls=["http://github.com/path/to/codebase/of/new_module"],
|
128 |
reference_urls=["http://path.to.reference.url/new_module"],
|
129 |
)
|
130 |
|
131 |
def _download_and_prepare(self, dl_manager):
|
132 |
+
"""Download the external ISCO-08 CSV file from the mirror URL and build the hierarchy dictionary from it."""
|
133 |
isco_csv = dl_manager.download_and_extract(ISCO_CSV_MIRROR_URL)
|
134 |
print(f"ISCO CSV file downloaded")
|
135 |
self.isco_hierarchy = isco.create_hierarchy_dict(isco_csv)
|
|
|
137 |
print(self.isco_hierarchy)
|
138 |
|
139 |
def _compute(self, predictions, references):
|
140 |
+
"""Returns the accuracy scores."""
|
141 |
# Convert the inputs to strings
|
142 |
predictions = [str(p) for p in predictions]
|
143 |
references = [str(r) for r in references]
|