lvkaokao committed
Commit eef68df · 1 Parent(s): 306fada

update about.

Files changed (1): src/display/about.py +89 -4
src/display/about.py CHANGED
@@ -16,7 +16,7 @@ LLM_BENCHMARKS_TEXT = f"""
 ## ABOUT
 With the plethora of large language models (LLMs) and chatbots being released week upon week, often with grandiose claims of their performance, it can be hard to filter out the genuine progress that is being made by the open-source community and which model is the current state of the art.
 
-Submit a model for automated evaluation on the CPU/GPU on the "Submit" page!
+Submit a model for automated evaluation on the GPU cluster on the "Submit" page!
 The leaderboard's backend runs the great [Eleuther AI Language Model Evaluation Harness](https://github.com/EleutherAI/lm-evaluation-harness) - read more details below!
 
 ### Tasks
@@ -162,7 +162,7 @@ I have another problem, help!
 
 
 EVALUATION_QUEUE_TEXT = f"""
-# Evaluation Queue for the Low-bit Quantized Open LLM Leaderboard
+# Evaluation Queue for the Open LLM Leaderboard
 
 Models added here will be automatically evaluated on the cluster.
 
@@ -191,11 +191,96 @@ This is a leaderboard for Open LLMs, and we'd love for as many people as possibl
 ### 4) Fill up your model card
 When we add extra information about models to the leaderboard, it will be automatically taken from the model card
 
-### 5) Select the compute dtype
-The `compute_dtype` will pass to `lm-eval`, which is the inference precision.
+### 5) Select the correct precision
+Not all models are converted properly from `float16` to `bfloat16`, and selecting the wrong precision can sometimes cause evaluation error (as loading a `bf16` model in `fp16` can sometimes generate NaNs, depending on the weight range).
 
 """
 
 CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
 CITATION_BUTTON_TEXT = r"""
+@software{eval-harness,
+  author       = {Gao, Leo and
+                  Tow, Jonathan and
+                  Biderman, Stella and
+                  Black, Sid and
+                  DiPofi, Anthony and
+                  Foster, Charles and
+                  Golding, Laurence and
+                  Hsu, Jeffrey and
+                  McDonell, Kyle and
+                  Muennighoff, Niklas and
+                  Phang, Jason and
+                  Reynolds, Laria and
+                  Tang, Eric and
+                  Thite, Anish and
+                  Wang, Ben and
+                  Wang, Kevin and
+                  Zou, Andy},
+  title        = {A framework for few-shot language model evaluation},
+  month        = sep,
+  year         = 2021,
+  publisher    = {Zenodo},
+  version      = {v0.0.1},
+  doi          = {10.5281/zenodo.5371628},
+  url          = {https://doi.org/10.5281/zenodo.5371628}
+}
+@misc{clark2018think,
+  title={Think you have Solved Question Answering? Try ARC, the AI2 Reasoning Challenge},
+  author={Peter Clark and Isaac Cowhey and Oren Etzioni and Tushar Khot and Ashish Sabharwal and Carissa Schoenick and Oyvind Tafjord},
+  year={2018},
+  eprint={1803.05457},
+  archivePrefix={arXiv},
+  primaryClass={cs.AI}
+}
+@misc{zellers2019hellaswag,
+  title={HellaSwag: Can a Machine Really Finish Your Sentence?},
+  author={Rowan Zellers and Ari Holtzman and Yonatan Bisk and Ali Farhadi and Yejin Choi},
+  year={2019},
+  eprint={1905.07830},
+  archivePrefix={arXiv},
+  primaryClass={cs.CL}
+}
+@misc{hendrycks2021measuring,
+  title={Measuring Massive Multitask Language Understanding},
+  author={Dan Hendrycks and Collin Burns and Steven Basart and Andy Zou and Mantas Mazeika and Dawn Song and Jacob Steinhardt},
+  year={2021},
+  eprint={2009.03300},
+  archivePrefix={arXiv},
+  primaryClass={cs.CY}
+}
+@misc{lin2022truthfulqa,
+  title={TruthfulQA: Measuring How Models Mimic Human Falsehoods},
+  author={Stephanie Lin and Jacob Hilton and Owain Evans},
+  year={2022},
+  eprint={2109.07958},
+  archivePrefix={arXiv},
+  primaryClass={cs.CL}
+}
+@misc{DBLP:journals/corr/abs-1907-10641,
+  title={{WINOGRANDE:} An Adversarial Winograd Schema Challenge at Scale},
+  author={Keisuke Sakaguchi and Ronan Le Bras and Chandra Bhagavatula and Yejin Choi},
+  year={2019},
+  eprint={1907.10641},
+  archivePrefix={arXiv},
+  primaryClass={cs.CL}
+}
+@misc{DBLP:journals/corr/abs-2110-14168,
+  title={Training Verifiers to Solve Math Word Problems},
+  author={Karl Cobbe and
+          Vineet Kosaraju and
+          Mohammad Bavarian and
+          Mark Chen and
+          Heewoo Jun and
+          Lukasz Kaiser and
+          Matthias Plappert and
+          Jerry Tworek and
+          Jacob Hilton and
+          Reiichiro Nakano and
+          Christopher Hesse and
+          John Schulman},
+  year={2021},
+  eprint={2110.14168},
+  archivePrefix={arXiv},
+  primaryClass={cs.CL}
+}
 """