Update README.md
README.md CHANGED
@@ -114,9 +114,8 @@ The dataset is comprised of a mixture of open datasets large-scale datasets avai
 | Claude 2 | - |RLHF |8.06| 91.36|
 | GPT-4 | -| RLHF |8.99| 95.28|

-## Other
-
-| Metric | Value |
+## Other benchmarks:
+| Task | Value |
 |-----------------------|---------------------------|
 | ARC (25-shot) | 47.0 |
 | HellaSwag (10-shot) | 74.2 |
@@ -124,76 +123,8 @@ The dataset is comprised of a mixture of open datasets large-scale datasets avai
 | TruthfulQA (0-shot) | 46.5 |
 | Winogrande (5-shot) | 65.5 |
 | GSM8K (5-shot) | 42.3 |
-
-
-2. **BigBench**:
-
-   - Average: 35.26
-   - Details:
-
-| Task | Version | Metric | Value | Stderr |
-|-----------------------------------------------------|---------|-------------------------|-------|--------|
-| bigbench_causal_judgement | 0 | multiple_choice_grade | 0.5316| 0.0363 |
-| bigbench_date_understanding | 0 | multiple_choice_grade | 0.4363| 0.0259 |
-| bigbench_disambiguation_qa | 0 | multiple_choice_grade | 0.3217| 0.0291 |
-| bigbench_dyck_languages | 0 | multiple_choice_grade | 0.1450| 0.0111 |
-| bigbench_formal_fallacies_syllogisms_negation | 0 | multiple_choice_grade | 0.4982| 0.0042 |
-| bigbench_geometric_shapes | 0 | multiple_choice_grade | 0.1086| 0.0164 |
-| bigbench_hyperbaton | 0 | exact_str_match | 0.0000| 0.0000 |
-| bigbench_logical_deduction_five_objects | 0 | multiple_choice_grade | 0.5232| 0.0022 |
-| bigbench_logical_deduction_seven_objects | 0 | multiple_choice_grade | 0.2480| 0.0193 |
-| bigbench_logical_deduction_three_objects | 0 | multiple_choice_grade | 0.1814| 0.0146 |
-| bigbench_movie_recommendation | 0 | multiple_choice_grade | 0.4067| 0.0284 |
-| bigbench_navigate | 0 | multiple_choice_grade | 0.2580| 0.0196 |
-| bigbench_reasoning_about_colored_objects | 0 | multiple_choice_grade | 0.5990| 0.0155 |
-| bigbench_ruin_names | 0 | multiple_choice_grade | 0.4370| 0.0111 |
-| bigbench_salient_translation_error_detection | 0 | multiple_choice_grade | 0.3951| 0.0231 |
-| bigbench_snarks | 0 | multiple_choice_grade | 0.2265| 0.0133 |
-| bigbench_sports_understanding | 0 | multiple_choice_grade | 0.6464| 0.0356 |
-| bigbench_temporal_sequences | 0 | multiple_choice_grade | 0.5091| 0.0159 |
-| bigbench_tracking_shuffled_objects_five_objects | 0 | multiple_choice_grade | 0.2680| 0.0140 |
-| bigbench_tracking_shuffled_objects_seven_objects | 0 | multiple_choice_grade | 0.1856| 0.0110 |
-| bigbench_tracking_shuffled_objects_three_objects | 0 | multiple_choice_grade | 0.1269| 0.0080 |
-
-3. **AGI Benchmark**:
-   - Average: 33.23
-   - Details:
-| Task |Version| Metric |Value | |Stderr|
-|------------------------------|------:|--------|-----:|---|-----:|
-|agieval_aqua_rat | 0|acc |0.2126|± |0.0257|
-| | |acc_norm|0.1890|± |0.0246|
-|agieval_gaokao_biology | 0|acc |0.2571|± |0.0302|
-| | |acc_norm|0.3143|± |0.0321|
-|agieval_gaokao_chemistry | 0|acc |0.2464|± |0.0300|
-| | |acc_norm|0.2899|± |0.0316|
-|agieval_gaokao_chinese | 0|acc |0.2927|± |0.0291|
-| | |acc_norm|0.3049|± |0.0294|
-|agieval_gaokao_english | 0|acc |0.6176|± |0.0278|
-| | |acc_norm|0.6438|± |0.0274|
-|agieval_gaokao_geography | 0|acc |0.3015|± |0.0326|
-| | |acc_norm|0.3065|± |0.0328|
-|agieval_gaokao_history | 0|acc |0.3106|± |0.0303|
-| | |acc_norm|0.3319|± |0.0308|
-|agieval_gaokao_mathqa | 0|acc |0.2650|± |0.0236|
-| | |acc_norm|0.2707|± |0.0237|
-|agieval_gaokao_physics | 0|acc |0.3450|± |0.0337|
-| | |acc_norm|0.3550|± |0.0339|
-|agieval_logiqa_en | 0|acc |0.2980|± |0.0179|
-| | |acc_norm|0.3195|± |0.0183|
-|agieval_logiqa_zh | 0|acc |0.2842|± |0.0177|
-| | |acc_norm|0.3318|± |0.0185|
-|agieval_lsat_ar | 0|acc |0.2000|± |0.0264|
-| | |acc_norm|0.2043|± |0.0266|
-|agieval_lsat_lr | 0|acc |0.3176|± |0.0206|
-| | |acc_norm|0.3275|± |0.0208|
-|agieval_lsat_rc | 0|acc |0.4312|± |0.0303|
-| | |acc_norm|0.4201|± |0.0301|
-|agieval_sat_en | 0|acc |0.6117|± |0.0340|
-| | |acc_norm|0.6117|± |0.0340|
-|agieval_sat_en_without_passage| 0|acc |0.3398|± |0.0331|
-| | |acc_norm|0.3495|± |0.0333|
-|agieval_sat_math | 0|acc |0.3182|± |0.0315|
-| | |acc_norm|0.2909|± |0.0307|
+| BigBench (Avg) | 35.26 |
+| AGI Benchmark (Avg) | 33.23 |

 ### Training Infrastructure

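For reference, the two `(Avg)` rows added in this change can be reproduced from the per-task tables removed above. The sketch below is an assumption about how the averages were computed, not something stated in the README: `BigBench (Avg)` appears to be the plain mean of the 20 `multiple_choice_grade` scores (the `exact_str_match` hyperbaton row is left out), and `AGI Benchmark (Avg)` the plain mean of the 17 `acc` scores (ignoring `acc_norm`).

```python
# Sketch (assumption): reproduce the "(Avg)" summary rows added in this diff
# from the per-task values in the tables removed above.

# 20 multiple_choice_grade scores from the removed BigBench table
# (bigbench_hyperbaton's exact_str_match row is excluded).
bigbench_multiple_choice_grade = [
    0.5316, 0.4363, 0.3217, 0.1450, 0.4982, 0.1086, 0.5232, 0.2480,
    0.1814, 0.4067, 0.2580, 0.5990, 0.4370, 0.3951, 0.2265, 0.6464,
    0.5091, 0.2680, 0.1856, 0.1269,
]

# 17 "acc" scores from the removed AGI Benchmark (AGIEval) table.
agieval_acc = [
    0.2126, 0.2571, 0.2464, 0.2927, 0.6176, 0.3015, 0.3106, 0.2650,
    0.3450, 0.2980, 0.2842, 0.2000, 0.3176, 0.4312, 0.6117, 0.3398,
    0.3182,
]

def mean_pct(scores):
    """Average a list of 0-1 scores and report it on a 0-100 scale."""
    return 100 * sum(scores) / len(scores)

print(f"BigBench (Avg):      {mean_pct(bigbench_multiple_choice_grade):.2f}")  # 35.26
print(f"AGI Benchmark (Avg): {mean_pct(agieval_acc):.2f}")                     # 33.23
```

Under these assumptions the script prints 35.26 and 33.23, matching the summary rows in the new table.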