danieldux commited on
Commit
475081a
1 Parent(s): d101c72

Update ISCO-08 Hierarchical Accuracy Metric description and implementation

Browse files
Files changed (1) hide show
  1. metric_template_1.py +67 -29
metric_template_1.py CHANGED
@@ -11,7 +11,7 @@
11
  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
  # See the License for the specific language governing permissions and
13
  # limitations under the License.
14
- """Hierarchical Accuracy Metric."""
15
 
16
  import evaluate
17
  import datasets
@@ -20,39 +20,77 @@ import isco
20
 
21
 
22
  # TODO: Add BibTeX citation
23
- _CITATION = """\
24
- @InProceedings{huggingface:module,
25
- title = {A great new module},
26
- authors={huggingface, Inc.},
27
- year={2020}
 
 
 
 
 
 
28
  }
29
  """
30
 
31
- # TODO: Add description of the module here
32
- _DESCRIPTION = """\
33
- This new module is designed to solve this great ML task and is crafted with a lot of care.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
34
  """
35
 
36
-
37
- # TODO: Add description of the arguments of the module here
38
  _KWARGS_DESCRIPTION = """
39
- Calculates how good are predictions given some references, using certain scores
 
40
  Args:
41
- predictions: list of predictions to score. Each predictions
42
- should be a string with tokens separated by spaces.
43
- references: list of reference for each prediction. Each
44
- reference should be a string with tokens separated by spaces.
45
  Returns:
46
- accuracy: description of the first score,
47
- another_score: description of the second score,
 
 
48
  Examples:
49
- Examples should be written in doctest format, and should illustrate how
50
- to use the function.
51
 
52
- >>> my_new_module = evaluate.load("my_new_module")
53
- >>> results = my_new_module.compute(references=[0, 1], predictions=[0, 1])
54
  >>> print(results)
55
- {'accuracy': 1.0}
 
 
 
 
 
56
  """
57
 
58
  # TODO: Define external resources urls if needed
@@ -65,8 +103,8 @@ ILO_ISCO_CSV_URL = (
65
 
66
 
67
  @evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
68
- class MetricTemplate1(evaluate.Metric):
69
- """TODO: Short description of my evaluation module."""
70
 
71
  def _info(self):
72
  # TODO: Specifies the evaluate.EvaluationModuleInfo object
@@ -83,15 +121,15 @@ class MetricTemplate1(evaluate.Metric):
83
  "references": datasets.Value("string"),
84
  }
85
  ),
86
- # Homepage of the module for documentation
87
  homepage="http://module.homepage",
88
- # Additional links to the codebase or references
89
  codebase_urls=["http://github.com/path/to/codebase/of/new_module"],
90
  reference_urls=["http://path.to.reference.url/new_module"],
91
  )
92
 
93
  def _download_and_prepare(self, dl_manager):
94
- """Download external ISCO-08 csv file for creating the hierarchy dictionary."""
95
  isco_csv = dl_manager.download_and_extract(ISCO_CSV_MIRROR_URL)
96
  print(f"ISCO CSV file downloaded")
97
  self.isco_hierarchy = isco.create_hierarchy_dict(isco_csv)
@@ -99,7 +137,7 @@ class MetricTemplate1(evaluate.Metric):
99
  print(self.isco_hierarchy)
100
 
101
  def _compute(self, predictions, references):
102
- """Returns the scores"""
103
  # Convert the inputs to strings
104
  predictions = [str(p) for p in predictions]
105
  references = [str(r) for r in references]
 
11
  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
  # See the License for the specific language governing permissions and
13
  # limitations under the License.
14
+ """ISCO-08 Hierarchical Accuracy Measure."""
15
 
16
  import evaluate
17
  import datasets
 
20
 
21
 
22
  # TODO: Add BibTeX citation
23
+ _CITATION = """
24
+ @article{scikit-learn,
25
+ title={Scikit-learn: Machine Learning in {P}ython},
26
+ author={Pedregosa, F. and Varoquaux, G. and Gramfort, A. and Michel, V.
27
+ and Thirion, B. and Grisel, O. and Blondel, M. and Prettenhofer, P.
28
+ and Weiss, R. and Dubourg, V. and Vanderplas, J. and Passos, A. and
29
+ Cournapeau, D. and Brucher, M. and Perrot, M. and Duchesnay, E.},
30
+ journal={Journal of Machine Learning Research},
31
+ volume={12},
32
+ pages={2825--2830},
33
+ year={2011}
34
  }
35
  """
36
 
37
+ _DESCRIPTION = """
38
+ The ISCO-08 Hierarchical Accuracy Measure is an implementation
39
+ of the measure described in [Functional Annotation of Genes Using Hierarchical Text Categorization](https://www.researchgate.net/publication/44046343_Functional_Annotation_of_Genes_Using_Hierarchical_Text_Categorization)
40
+ (Kiritchenko, Svetlana and Famili, Fazel. 2005) with the ISCO-08 taxonomy by the International Labour Organization.\n
41
+ \n
42
+ 1. The measure gives credit to partially correct classification,
43
+ e.g. misclassification into node $I$ (ISCO unit group "1120")
44
+ when the correct category is $G$ (ISCO unit group "1111")
45
+ should be penalized less than misclassification into node $D$
46
+ (e.g., ISCO unit group "1211") since $I$ is in the same subgraph (ISCO sub-major group "11")
47
+ as $G$ and $D$ is not.
48
+ 2. The measure punishes distant errors more heavily:
49
+ 1. the measure gives higher evaluation for correctly classifying one level down compared to staying at the parent node, e.g. classification into node $E$ (ISCO minor group "111") is better than classification into its parent $C$ (ISCO sub-major group "11") since $E$ is closer to the correct category $G$;
50
+ 2. the measure gives lower evaluation for incorrectly classifying one level down compared to staying at the parent node, e.g. classification into node $F$ (ISCO minor group "112") is worse than classification into its parent $C$ since $F$ is farther away from $G$.\n
51
+ \n
52
+ The features described are accomplished by pairing hierarchical variants of precision ($hP$) and recall ($hR$) to form a hierarchical F1 ($hF_\beta$) score where each sample belongs not only to its class (e.g., a unit group level code), but also to all ancestors of the class in a hierarchical graph (i.e., the minor, sub-major, and major group level codes).\n
53
+ \n
54
+ Hierarchical precision can be computed with:\n
55
+ $hP = \frac{|\hat{C}_i \cap \hat{C}'_i|}{|\hat{C}'_i|}$\n
56
+ \n
57
+ Hierarchical recall can be computed with:\n
58
+ $hR = \frac{|\hat{C}_i \cap \hat{C}'_i|}{|\hat{C}_i|}$\n
59
+ \n
60
+ Combining the two values $hP$ and $hR$ into one hF-measure:\n
61
+ $hF_\beta = \frac{(\beta^2 + 1) \cdot hP \cdot hR}{\beta^2 \cdot hP + hR}, \quad \beta \in [0, +\infty)$\n
62
+ \n
63
+ Note:\n
64
+ **TP**: True positive\n
65
+ **TN**: True negative\n
66
+ **FP**: False positive\n
67
+ **FN**: False negative\n
68
  """
69
 
 
 
70
  _KWARGS_DESCRIPTION = """
71
+ Calculates hierarchical precision, hierarchical recall and hierarchical F1 given a list of reference codes and predicted codes from the ISCO-08 taxonomy by the International Labour Organization.
72
+
73
  Args:
74
+ - references (List[str]): List of ISCO-08 reference codes. Each reference code should be a single token, 4-digit ISCO-08 code string.
75
+ - predictions (List[str]): List of machine predicted or human assigned ISCO-08 codes to score. Each prediction should be a single token, 4-digit ISCO-08 code string.
76
+
 
77
  Returns:
78
+ - hierarchical_precision (`float` or `int`): Hierarchical precision score. Minimum possible value is 0. Maximum possible value is 1.0. A higher score means higher accuracy.
79
+ - hierarchical_recall: Hierarchical recall score. Minimum possible value is 0. Maximum possible value is 1.0. A higher score means higher accuracy.
80
+ - hierarchical_fmeasure: Hierarchical F1 score. Minimum possible value is 0. Maximum possible value is 1.0. A higher score means higher accuracy.
81
+
82
  Examples:
83
+ Example 1
 
84
 
85
+ >>> hierarchical_accuracy_metric = evaluate.load("ham")
86
+ >>> results = hierarchical_accuracy_metric.compute(references=["1111", "1112", "1113", "1114"], predictions=["1111", "1113", "1120", "1211"])
87
  >>> print(results)
88
+ {
89
+ 'accuracy': 0.25,
90
+ 'hierarchical_precision': 0.7142857142857143,
91
+ 'hierarchical_recall': 0.5,
92
+ 'hierarchical_fmeasure': 0.588235294117647
93
+ }
94
  """
95
 
96
  # TODO: Define external resources urls if needed
 
103
 
104
 
105
  @evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
106
+ class ISCOHAM(evaluate.Metric):
107
+ """The ISCO-08 Hierarchical Accuracy Measure"""
108
 
109
  def _info(self):
110
  # TODO: Specifies the evaluate.EvaluationModuleInfo object
 
121
  "references": datasets.Value("string"),
122
  }
123
  ),
124
+ # TODO: Homepage of the module for documentation
125
  homepage="http://module.homepage",
126
+ # TODO: Additional links to the codebase or references
127
  codebase_urls=["http://github.com/path/to/codebase/of/new_module"],
128
  reference_urls=["http://path.to.reference.url/new_module"],
129
  )
130
 
131
  def _download_and_prepare(self, dl_manager):
132
+ """Download external ISCO-08 csv file from the ILO website for creating the hierarchy dictionary."""
133
  isco_csv = dl_manager.download_and_extract(ISCO_CSV_MIRROR_URL)
134
  print(f"ISCO CSV file downloaded")
135
  self.isco_hierarchy = isco.create_hierarchy_dict(isco_csv)
 
137
  print(self.isco_hierarchy)
138
 
139
  def _compute(self, predictions, references):
140
+ """Returns the accuracy scores."""
141
  # Convert the inputs to strings
142
  predictions = [str(p) for p in predictions]
143
  references = [str(r) for r in references]