nm-research committed on
Commit
bb96ea6
·
verified ·
1 Parent(s): 0f12672

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +18 -16
README.md CHANGED
@@ -159,6 +159,7 @@ evalplus.evaluate \
159
 
160
  ### Accuracy
161
 
 
162
  <table>
163
  <thead>
164
  <tr>
@@ -173,52 +174,52 @@ evalplus.evaluate \
173
  <tr>
174
  <td rowspan="7"><b>OpenLLM Leaderboard V1</b></td>
175
  <td>ARC-Challenge (Acc-Norm, 25-shot)</td>
176
- <td>55.63</td>
177
  <td>53.50</td>
178
- <td>96.17</td>
179
  </tr>
180
  <tr>
181
  <td>GSM8K (Strict-Match, 5-shot)</td>
182
- <td>60.96</td>
183
  <td>46.10</td>
184
- <td>75.63</td>
185
- </tr>
186
  <tr>
187
  <td>HellaSwag (Acc-Norm, 10-shot)</td>
188
- <td>75.21</td>
189
  <td>77.76</td>
190
- <td>103.39</td>
191
  </tr>
192
  <tr>
193
  <td>MMLU (Acc, 5-shot)</td>
194
- <td>54.38</td>
195
  <td>52.61</td>
196
- <td>96.75</td>
197
  </tr>
198
  <tr>
199
  <td>TruthfulQA (MC2, 0-shot)</td>
200
- <td>55.93</td>
201
  <td>39.84</td>
202
- <td>71.23</td>
203
  </tr>
204
  <tr>
205
  <td>Winogrande (Acc, 5-shot)</td>
206
- <td>69.67</td>
207
  <td>74.43</td>
208
- <td>106.84</td>
 
209
  </tr>
210
  <tr>
211
  <td><b>Average Score</b></td>
212
- <td><b>61.98</b></td>
213
  <td><b>57.37</b></td>
214
- <td><b>92.57</b></td>
215
  </tr>
216
  <tr>
217
  <td rowspan="2"><b>HumanEval</b></td>
218
  <td>HumanEval Pass@1</td>
219
  <td>30.00</td>
220
  <td>30.40</td>
221
- <td><b>101.33</b></td>
222
  </tr>
223
  </tbody>
224
  </table>
@@ -226,3 +227,4 @@ evalplus.evaluate \
226
 
227
 
228
 
 
 
159
 
160
  ### Accuracy
161
 
162
+
163
  <table>
164
  <thead>
165
  <tr>
 
174
  <tr>
175
  <td rowspan="7"><b>OpenLLM Leaderboard V1</b></td>
176
  <td>ARC-Challenge (Acc-Norm, 25-shot)</td>
177
+ <td>53.75</td>
178
  <td>53.50</td>
179
+ <td>99.54</td>
180
  </tr>
181
  <tr>
182
  <td>GSM8K (Strict-Match, 5-shot)</td>
183
+ <td>47.84</td>
184
  <td>46.10</td>
185
+ <td>96.36</td>
186
+ </tr>
187
  <tr>
188
  <td>HellaSwag (Acc-Norm, 10-shot)</td>
189
+ <td>77.94</td>
190
  <td>77.76</td>
191
+ <td>99.77</td>
192
  </tr>
193
  <tr>
194
  <td>MMLU (Acc, 5-shot)</td>
195
+ <td>52.88</td>
196
  <td>52.61</td>
197
+ <td>99.49</td>
198
  </tr>
199
  <tr>
200
  <td>TruthfulQA (MC2, 0-shot)</td>
201
+ <td>39.04</td>
202
  <td>39.84</td>
203
+ <td>102.05</td>
204
  </tr>
205
  <tr>
206
  <td>Winogrande (Acc, 5-shot)</td>
 
207
  <td>74.43</td>
208
+ <td>74.43</td>
209
+ <td>100.00</td>
210
  </tr>
211
  <tr>
212
  <td><b>Average Score</b></td>
213
+ <td><b>57.65</b></td>
214
  <td><b>57.37</b></td>
215
+ <td><b>99.52</b></td>
216
  </tr>
217
  <tr>
218
  <td rowspan="2"><b>HumanEval</b></td>
219
  <td>HumanEval Pass@1</td>
220
  <td>30.00</td>
221
  <td>30.40</td>
222
+ <td>101.33</td>
223
  </tr>
224
  </tbody>
225
  </table>
 
227
 
228
 
229
 
230
+