lixuejing committed on
Commit
4ab9b19
·
1 Parent(s): 05d96a1

remove baseline_row

Browse files
Files changed (1) hide show
  1. src/display/utils.py +0 -25
src/display/utils.py CHANGED
@@ -61,31 +61,6 @@ class EvalQueueColumn: # Queue column
61
 
62
  ## All the model information that we might need
63
 
64
- # Average ⬆️ human baseline is 0.897 (source: averaging human baselines below)
65
- # ARC human baseline is 0.80 (source: https://lab42.global/arc/)
66
- # HellaSwag human baseline is 0.95 (source: https://deepgram.com/learn/hellaswag-llm-benchmark-guide)
67
- # MMLU human baseline is 0.898 (source: https://openreview.net/forum?id=d7KBjmI3GmQ)
68
- # TruthfulQA human baseline is 0.94(source: https://arxiv.org/pdf/2109.07958.pdf)
69
- # Winogrande: https://leaderboard.allenai.org/winogrande/submissions/public
70
- # GSM8K: paper
71
- # Define the human baselines
72
- human_baseline_row = {
73
- AutoEvalColumn.model.name: "<p>Human performance</p>",
74
- AutoEvalColumn.revision.name: "N/A",
75
- AutoEvalColumn.precision.name: None,
76
- AutoEvalColumn.average.name: 92.75,
77
- AutoEvalColumn.merged.name: False,
78
- AutoEvalColumn.arc.name: 80.0,
79
- AutoEvalColumn.hellaswag.name: 95.0,
80
- AutoEvalColumn.mmlu.name: 89.8,
81
- AutoEvalColumn.truthfulqa.name: 94.0,
82
- AutoEvalColumn.winogrande.name: 94.0,
83
- AutoEvalColumn.gsm8k.name: 100,
84
- AutoEvalColumn.c_sem.name: 100,
85
- AutoEvalColumn.dummy.name: "human_baseline",
86
- AutoEvalColumn.model_type.name: "",
87
- AutoEvalColumn.flagged.name: False,
88
- }
89
 
90
  @dataclass
91
  class ModelDetails:
 
61
 
62
  ## All the model information that we might need
63
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
64
 
65
  @dataclass
66
  class ModelDetails: