open-ko-llm-leaderboard season2

#88
by choco9966 - opened
app.py CHANGED
@@ -2,7 +2,7 @@ import gradio as gr
2
  import pandas as pd
3
  from apscheduler.schedulers.background import BackgroundScheduler
4
  from huggingface_hub import snapshot_download
5
- from gradio_space_ci import configure_space_ci # FOR CI
6
 
7
  from src.display.about import (
8
  CITATION_BUTTON_LABEL,
@@ -32,11 +32,6 @@ from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, H4_TOKEN, IS_PU
32
  from src.populate import get_evaluation_queue_df, get_leaderboard_df
33
  from src.submission.submit import add_new_eval
34
  from src.tools.collections import update_collections
35
- from src.tools.plots import (
36
- create_metric_plot_obj,
37
- create_plot_df,
38
- create_scores_df,
39
- )
40
 
41
 
42
  def restart_space():
@@ -63,8 +58,6 @@ if REPO_ID == "upstage/open-ko-llm-leaderboard": # update only when it's from re
63
  update_collections(original_df.copy())
64
  leaderboard_df = original_df.copy()
65
 
66
- plot_df = create_plot_df(create_scores_df(raw_data))
67
-
68
  (
69
  finished_eval_queue_df,
70
  running_eval_queue_df,
@@ -155,7 +148,6 @@ def filter_models(
155
  params_column = pd.to_numeric(df[AutoEvalColumn.params.name], errors="coerce")
156
  mask = params_column.apply(lambda x: any(numeric_interval.contains(x)))
157
  filtered_df = filtered_df.loc[mask]
158
-
159
  return filtered_df
160
 
161
  leaderboard_df = filter_models(leaderboard_df, [t.to_str(" : ") for t in ModelType], list(NUMERIC_INTERVALS.keys()), [i.value.name for i in Precision], False, False, False)
@@ -299,23 +291,7 @@ with demo:
299
  leaderboard_table,
300
  queue=True,
301
  )
302
-
303
- with gr.TabItem("📈 Metrics through time", elem_id="llm-benchmark-tab-table", id=4):
304
- with gr.Row():
305
- with gr.Column():
306
- chart = create_metric_plot_obj(
307
- plot_df,
308
- [AutoEvalColumn.average.name],
309
- title="Average of Top Scores Over Time (from last update)",
310
- )
311
- gr.Plot(value=chart, min_width=500)
312
- with gr.Column():
313
- chart = create_metric_plot_obj(
314
- plot_df,
315
- BENCHMARK_COLS,
316
- title="Top Scores Over Time (from last update)",
317
- )
318
- gr.Plot(value=chart, min_width=500)
319
  with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
320
  gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
321
  gr.Markdown(FAQ_TEXT, elem_classes="markdown-text")
@@ -383,7 +359,7 @@ with demo:
383
  choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown],
384
  label="Model type",
385
  multiselect=False,
386
- value=ModelType.IFT.to_str(" : "),
387
  interactive=True,
388
  )
389
 
 
2
  import pandas as pd
3
  from apscheduler.schedulers.background import BackgroundScheduler
4
  from huggingface_hub import snapshot_download
5
+ from gradio_space_ci.webhook import configure_space_ci
6
 
7
  from src.display.about import (
8
  CITATION_BUTTON_LABEL,
 
32
  from src.populate import get_evaluation_queue_df, get_leaderboard_df
33
  from src.submission.submit import add_new_eval
34
  from src.tools.collections import update_collections
 
 
 
 
 
35
 
36
 
37
  def restart_space():
 
58
  update_collections(original_df.copy())
59
  leaderboard_df = original_df.copy()
60
 
 
 
61
  (
62
  finished_eval_queue_df,
63
  running_eval_queue_df,
 
148
  params_column = pd.to_numeric(df[AutoEvalColumn.params.name], errors="coerce")
149
  mask = params_column.apply(lambda x: any(numeric_interval.contains(x)))
150
  filtered_df = filtered_df.loc[mask]
 
151
  return filtered_df
152
 
153
  leaderboard_df = filter_models(leaderboard_df, [t.to_str(" : ") for t in ModelType], list(NUMERIC_INTERVALS.keys()), [i.value.name for i in Precision], False, False, False)
 
291
  leaderboard_table,
292
  queue=True,
293
  )
294
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
295
  with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
296
  gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
297
  gr.Markdown(FAQ_TEXT, elem_classes="markdown-text")
 
359
  choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown],
360
  label="Model type",
361
  multiselect=False,
362
+ value=ModelType.FT.to_str(" : "),
363
  interactive=True,
364
  )
365
 
requirements.txt CHANGED
@@ -2,17 +2,22 @@ APScheduler==3.10.1
2
  black==23.11.0
3
  click==8.1.3
4
  datasets==2.14.5
5
- gradio==4.19.2
6
- gradio_client==0.10.1
7
  huggingface-hub>=0.18.0
8
- matplotlib==3.7.1
9
- numpy==1.24.2
10
- pandas==2.0.0
11
  plotly==5.14.1
12
  python-dateutil==2.8.2
13
- requests==2.28.2
14
  sentencepiece
15
  tqdm==4.65.0
16
- transformers==4.38.2
17
  tokenizers>=0.15.0
18
- gradio-space-ci @ git+https://huggingface.co/spaces/Wauplin/gradio-space-ci@0.1.2 # CI !!!
 
 
 
 
 
 
 
 
 
2
  black==23.11.0
3
  click==8.1.3
4
  datasets==2.14.5
 
 
5
  huggingface-hub>=0.18.0
6
+ matplotlib==3.8.4
7
+ numpy==1.26.0
8
+ pandas==2.2.2
9
  plotly==5.14.1
10
  python-dateutil==2.8.2
 
11
  sentencepiece
12
  tqdm==4.65.0
13
+ transformers==4.43.1
14
  tokenizers>=0.15.0
15
+ gradio-space-ci @ git+https://huggingface.co/spaces/Wauplin/gradio-space-ci@0.2.3 # CI !!!
16
+ isort
17
+ ruff
18
+ gradio==4.31.0
19
+ gradio[oauth]
20
+ gradio_leaderboard==0.0.11
21
+ requests==2.31.0
22
+ requests-oauthlib== 1.3.1
23
+ schedule == 1.2.2
src/display/about.py CHANGED
@@ -2,62 +2,51 @@ from src.display.utils import ModelType
2
 
3
 
4
  TITLE = """<img src="https://upstage-open-ko-llm-leaderboard-logos.s3.ap-northeast-2.amazonaws.com/header_logo.png" style="width:30%;display:block;margin-left:auto;margin-right:auto">"""
5
- BOTTOM_LOGO = """<img src="https://upstage-open-ko-llm-leaderboard-logos.s3.ap-northeast-2.amazonaws.com/footer_logo_240715.png" style="width:50%;display:block;margin-left:auto;margin-right:auto">"""
6
 
7
  INTRODUCTION_TEXT = f"""
8
- 🚀 The Open Ko-LLM Leaderboard 🇰🇷 objectively evaluates the performance of Korean Large Language Model (LLM).
9
 
10
- When you submit a model on the "Submit here!" page, it is automatically evaluated. The GPU used for evaluation is operated with the support of __[KT](https://cloud.kt.com/)__.
11
- The data used for evaluation consists of datasets to assess reasoning, language understanding, hallucination, and commonsense.
12
- The evaluation dataset is exclusively private and only available for evaluation process.
13
- More detailed information about the benchmark dataset is provided on the “About” page.
14
 
15
- This leaderboard is co-hosted by __[Upstage](https://www.upstage.ai)__, and __[NIA](https://www.nia.or.kr/site/nia_kor/main.do)__ that provides various Korean Data Sets through __[AI-Hub](https://aihub.or.kr)__, and operated by __[Upstage](https://www.upstage.ai)__.
 
 
16
  """
17
 
18
  LLM_BENCHMARKS_TEXT = f"""
19
- # Context
 
20
  While outstanding LLM models are being released competitively, most of them are centered on English and are familiar with the English cultural sphere. We operate the Korean leaderboard, 🚀 Open Ko-LLM, to evaluate models that reflect the characteristics of the Korean language and Korean culture. Through this, we hope that users can conveniently use the leaderboard, participate, and contribute to the advancement of research in Korean.
21
 
22
- ## Icons
23
- {ModelType.PT.to_str(" : ")} model
24
- {ModelType.IFT.to_str(" : ")} model
25
- {ModelType.RL.to_str(" : ")} model
26
- If there is no icon, it indicates that there is insufficient information about the model.
27
- Please provide information about the model through an issue! 🤩
28
 
29
- 🏴‍☠️ : This icon indicates that the model has been selected as a subject of caution by the community, implying that users should exercise restraint when using it. Clicking on the icon will take you to a discussion about that model.
30
- (Models that have used the evaluation set for training to achieve a high leaderboard ranking, among others, are selected as subjects of caution.)
31
 
32
- ## How it works
 
 
 
 
 
 
 
 
33
 
34
- 📈 We evaluate models using the [Eleuther AI Language Model Evaluation Harness](https://github.com/EleutherAI/lm-evaluation-harness), a unified framework to test generative language models on a large number of different evaluation tasks.
35
 
36
- We have set up a benchmark using datasets translated into Korean, and applied variations by human experts, from the six tasks (HellaSwag, MMLU, Arc, Truthful QA, Winogrande, GSM8k) operated by __HuggingFace [Open LLM Leaderboard](https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard)__. We have also added a new dataset prepared from scratch.
37
- - Ko-HellaSwag (provided by __[Upstage](https://www.upstage.ai/)__, machine translation)
38
- - Ko-MMLU (provided by __[Upstage](https://www.upstage.ai/)__, human translation and variation)
39
- - Ko-Arc (provided by __[Upstage](https://www.upstage.ai/)__, human translation and variation)
40
- - Ko-Truthful QA (provided by __[Upstage](https://www.upstage.ai/)__, human translation and variation)
41
- - Ko-Winogrande (provided by __[Flitto](https://www.flitto.com/portal/en)__, human translation and variation)
42
- - Ko-GSM8k (provided by __[Flitto](https://www.flitto.com/portal/en)__, human translation and variation)
43
- - Ko-CommonGen V2 (provided by __[Korea University NLP&AI Lab](http://nlp.korea.ac.kr/)__, created from scratch)
44
- - Ko-EQ Bench (provided by __[Flitto](https://www.flitto.com/portal/en)__, human translation and variation)
45
- - Ko-InstFollow (provided by __[Flitto](https://www.flitto.com/portal/en)__, human translation and variation)
46
- - KorNAT-CKA (provided by __[SELECTSTAR](https://selectstar.ai/ko/)__ and __[KAIST AI](https://gsai.kaist.ac.kr/?lang=ko&ckattempt=1)__, created from scratch)
47
- - KorNAT-SVA (provided by __[SELECTSTAR](https://selectstar.ai/ko/)__ and __[KAIST AI](https://gsai.kaist.ac.kr/?lang=ko&ckattempt=1)__, created from scratch)
48
- - Ko-Harmlessness (provided by __[SELECTSTAR](https://selectstar.ai/ko/)__ and __[KAIST AI](https://gsai.kaist.ac.kr/?lang=ko&ckattempt=1)__, created from scratch)
49
- - Ko-Helpfulness (provided by __[SELECTSTAR](https://selectstar.ai/ko/)__ and __[KAIST AI](https://gsai.kaist.ac.kr/?lang=ko&ckattempt=1)__, created from scratch)
50
 
51
- To provide an evaluation befitting the LLM era, we've selected benchmark datasets suitable for assessing these elements: expertise, inference, hallucination, truthfulness and common sense. The final score is converted to the average score from each evaluation datasets.
52
 
53
- GPUs are provided by __[KT](https://cloud.kt.com/)__ for the evaluations.
54
 
55
- ## Details and Logs
56
- - Detailed numerical results in the `results` Upstage dataset: https://huggingface.co/datasets/open-ko-llm-leaderboard/results
57
- - Community queries and running status in the `requests` Upstage dataset: https://huggingface.co/datasets/open-ko-llm-leaderboard/requests
58
 
59
  ## More resources
60
- If you still have questions, you can check our FAQ [here](https://huggingface.co/spaces/upstage/open-ko-llm-leaderboard/discussions/1)!
 
61
  """
62
 
63
 
@@ -66,38 +55,71 @@ FAQ_TEXT = """
66
 
67
 
68
  EVALUATION_QUEUE_TEXT = f"""
69
- # Evaluation Queue for the 🚀 Open Ko-LLM Leaderboard
70
- Models added here will be automatically evaluated on the KT GPU cluster.
 
 
 
71
 
72
- ## <Some good practices before submitting a model>
73
 
74
- ### 1️⃣ Make sure you can load your model and tokenizer using AutoClasses
75
- ```python
 
 
 
 
 
 
 
 
 
 
76
  from transformers import AutoConfig, AutoModel, AutoTokenizer
77
  config = AutoConfig.from_pretrained("your model name", revision=revision)
78
  model = AutoModel.from_pretrained("your model name", revision=revision)
79
  tokenizer = AutoTokenizer.from_pretrained("your model name", revision=revision)
80
  ```
81
 
82
- If this step fails, follow the error messages to debug your model before submitting it. It's likely your model has been improperly uploaded.
83
 
84
- ⚠️ Make sure your model is public!
 
 
85
 
86
- ⚠️ Maker sure your model runs with [Eleuther AI Language Model Evaluation Harness](https://github.com/EleutherAI/lm-evaluation-harness)
87
 
88
- ⚠️ If your model needs `use_remote_code=True`, we do not support this option yet but we are working on adding it, stay posted!
89
 
90
- ### 2️⃣ Convert your model weights to [safetensors](https://huggingface.co/docs/safetensors/index)
91
- It's a new format for storing weights which is safer and faster to load and use. It will also allow us to add the number of parameters of your model to the `Extended Viewer`!
92
 
93
- ### 3️⃣ Make sure your model has an open license!
94
- This is a leaderboard for 🚀 Open Ko-LLMs, and we'd love for as many people as possible to know they can use your model
 
95
 
96
- ### 4️⃣ Fill up your model card
97
  When we add extra information about models to the leaderboard, it will be automatically taken from the model card
98
 
99
- ## In case of model failure
100
- If your model is displayed in the `FAILED` category, its execution stopped. Make sure you have followed the above steps first. If everything is done, check you can launch the EleutherAIHarness on your model locally, using the above command without modifications (you can add `--limit` to limit the number of examples per task).
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
101
  """
102
 
103
  CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results. Authors of open-ko-llm-leaderboard are ordered alphabetically."
@@ -106,8 +128,10 @@ CITATION_BUTTON_TEXT = r"""
106
  title={Open Ko-LLM Leaderboard: Evaluating Large Language Models in Korean with Ko-H5 Benchmark},
107
  author={Chanjun Park and Hyeonwoo Kim and Dahyun Kim and Seonghwan Cho and Sanghoon Kim and Sukyung Lee and Yungi Kim and Hwalsuk Lee},
108
  year={2024},
109
- booktitle={ACL Main}
110
  }
 
 
111
  @software{eval-harness,
112
  author = {Gao, Leo and
113
  Tow, Jonathan and
@@ -132,40 +156,59 @@ CITATION_BUTTON_TEXT = r"""
132
  publisher = {Zenodo},
133
  version = {v0.0.1},
134
  doi = {10.5281/zenodo.5371628},
135
- url = {https://doi.org/10.5281/zenodo.5371628}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
136
  }
137
- @misc{seo2023kocommongen,
138
- title={Korean Commonsense Reasoning Evaluation for Large Language Models},
139
- author={Jaehyung Seo, Chanjun Park, Hyeonseok Moon, Sugyeong Eo, Aram So, Heuiseok Lim},
140
- year={2023},
141
- affilation={Korea University, NLP&AI},
142
- booktitle={Proceedings of the 35th Annual Conference on Human & Cognitive Language Technology}}
143
- @misc{park2023koarc,
144
- title={Ko-ARC},
145
- original_title={Think you have Solved Question Answering? Try ARC, the AI2 Reasoning Challenge},
146
- author={Hyunbyung Park, Chanjun Park},
147
- original_author={Peter Clark and Isaac Cowhey and Oren Etzioni and Tushar Khot and Ashish Sabharwal and Carissa Schoenick and Oyvind Tafjord},
148
- year={2023}
149
  }
150
- @misc{park2023kohellaswag,
151
- title={Ko-HellaSwag},
152
- original_title={HellaSwag: Can a Machine Really Finish Your Sentence?},
153
- author={Hyunbyung Park, Chanjun Park},
154
- original_author={Rowan Zellers and Ari Holtzman and Yonatan Bisk and Ali Farhadi and Yejin Choi},
155
- year={2023}
156
  }
157
- @misc{park2023kommlu,
158
- title={Ko-MMLU},
159
- original_title={Measuring Massive Multitask Language Understanding},
160
- author={Hyunbyung Park, Chanjun Park},
161
- original_author={Dan Hendrycks and Collin Burns and Steven Basart and Andy Zou and Mantas Mazeika and Dawn Song and Jacob Steinhardt},
162
- year={2023}
 
 
 
 
163
  }
164
- @misc{park2023kotruthfulqa,
165
- title={Ko-TruthfulQA},
166
- original_title={TruthfulQA: Measuring How Models Mimic Human Falsehoods},
167
- author={Hyunbyung Park, Chanjun Park},
168
- original_author={Stephanie Lin and Jacob Hilton and Owain Evans},
169
- year={2023}
170
  }
171
  """
 
2
 
3
 
4
  TITLE = """<img src="https://upstage-open-ko-llm-leaderboard-logos.s3.ap-northeast-2.amazonaws.com/header_logo.png" style="width:30%;display:block;margin-left:auto;margin-right:auto">"""
5
+ BOTTOM_LOGO = """<img src="https://upstage-open-ko-llm-leaderboard-logos.s3.ap-northeast-2.amazonaws.com/footer_logo_240715.png" style="width:75%;display:block;margin-left:auto;margin-right:auto">"""
6
 
7
  INTRODUCTION_TEXT = f"""
8
+ The previous Leaderboard version is live [here](https://huggingface.co/spaces/choco9966/open-ko-llm-leaderboard-old) 📊
9
 
10
+ 🚀 The Open Ko-LLM Leaderboard2 🇰🇷 objectively evaluates the performance of Korean Large Language Models (LLMs). When you submit a model on the "Submit here!" page, it is automatically evaluated.
 
 
 
11
 
12
+ This leaderboard is co-hosted by [Upstage](https://www.upstage.ai/), and [NIA](https://www.nia.or.kr/site/nia_kor/main.do) that provides various Korean Data Sets through [AI-Hub](https://aihub.or.kr/), and operated by [Upstage](https://www.upstage.ai/). The GPU used for evaluation is operated with the support of [KT](https://cloud.kt.com/) and [AICA](https://aica-gj.kr/main.php). If Season 1 focused on evaluating the capabilities of the LLM in terms of reasoning, language understanding, hallucination, and commonsense through academic benchmarks, Season 2 will focus on assessing the LLM's practical abilities and reliability. The datasets for this season are sponsored by [Flitto](https://www.flitto.com/portal/en), [SELECTSTAR](https://selectstar.ai/ko/), and [KAIST AI](https://gsai.kaist.ac.kr/?lang=ko&ckattempt=1). The evaluation dataset is exclusively private and only available for evaluation process. More detailed information about the benchmark dataset is provided on the “About” page.
13
+
14
+ You'll notably find explanations on the evaluations we are using, reproducibility guidelines, best practices on how to submit a model, and our FAQ.
15
  """
16
 
17
  LLM_BENCHMARKS_TEXT = f"""
18
+ # Motivation
19
+
20
  While outstanding LLM models are being released competitively, most of them are centered on English and are familiar with the English cultural sphere. We operate the Korean leaderboard, 🚀 Open Ko-LLM, to evaluate models that reflect the characteristics of the Korean language and Korean culture. Through this, we hope that users can conveniently use the leaderboard, participate, and contribute to the advancement of research in Korean.
21
 
22
+ ## How it works
 
 
 
 
 
23
 
24
+ 📈 We evaluate models on 9 key benchmarks using the [Eleuther AI Language Model Evaluation Harness](https://github.com/EleutherAI/lm-evaluation-harness) , a unified framework to test generative language models on a large number of different evaluation tasks.
 
25
 
26
+ - Ko-GPQA (provided by [Flitto](https://www.flitto.com/portal/en))
27
+ - Ko-WinoGrande (provided by [Flitto](https://www.flitto.com/portal/en))
28
+ - Ko-GSM8K (provided by [Flitto](https://www.flitto.com/portal/en))
29
+ - Ko-EQ-Bench (provided by [Flitto](https://www.flitto.com/portal/en))
30
+ - Ko-IFEval (provided by [Flitto](https://www.flitto.com/portal/en))
31
+ - KorNAT-Knowledge (provided by [SELECTSTAR](https://selectstar.ai/ko/) and [KAIST AI](https://gsai.kaist.ac.kr/?lang=ko&ckattempt=1))
32
+ - KorNAT-Social-Value (provided by [SELECTSTAR](https://selectstar.ai/ko/) and [KAIST AI](https://gsai.kaist.ac.kr/?lang=ko&ckattempt=1))
33
+ - Ko-Harmlessness (provided by [SELECTSTAR](https://selectstar.ai/ko/) and [KAIST AI](https://gsai.kaist.ac.kr/?lang=ko&ckattempt=1))
34
+ - Ko-Helpfulness (provided by [SELECTSTAR](https://selectstar.ai/ko/) and [KAIST AI](https://gsai.kaist.ac.kr/?lang=ko&ckattempt=1))
35
 
36
+ For all these evaluations, a higher score is a better score. We chose these benchmarks as they test a variety of reasoning, harmlessness, helpfulness and general knowledge across a wide variety of fields in 0-shot and few-shot settings.
37
 
38
+ The final score is the average of the scores from each evaluation dataset.
 
 
 
 
 
 
 
 
 
 
 
 
 
39
 
40
+ GPUs are provided by [KT](https://cloud.kt.com/) and [AICA](https://aica-gj.kr/main.php) for the evaluations.
41
 
42
+ ## **Results**
43
 
44
+ - Detailed numerical results in the `results` Upstage dataset: https://huggingface.co/datasets/open-ko-llm-leaderboard/results
45
+ - Community queries and running status in the `requests` Upstage dataset: https://huggingface.co/datasets/open-ko-llm-leaderboard/requests
 
46
 
47
  ## More resources
48
+
49
+ If you still have questions, you can check our FAQ [here](https://huggingface.co/spaces/upstage/open-ko-llm-leaderboard/discussions/1)!
50
  """
51
 
52
 
 
55
 
56
 
57
  EVALUATION_QUEUE_TEXT = f"""
58
+ # Evaluation Queue for the 🤗 Open Ko-LLM Leaderboard
59
+
60
+ Models added here will be automatically evaluated on the 🤗 cluster.
61
+
62
+ ## Submission Disclaimer
63
 
64
+ **By submitting a model, you acknowledge that:**
65
 
66
+ - We store information about who submitted each model in [Requests dataset](https://huggingface.co/datasets/open-ko-llm-leaderboard/requests).
67
+ - This practice helps maintain the integrity of our leaderboard, prevent spam, and ensure responsible submissions.
68
+ - Your submission will be visible to the community and you may be contacted regarding your model.
69
+ - Please submit carefully and responsibly 💛
70
+
71
+ ## First Steps Before Submitting a Model
72
+
73
+ ### 1. Ensure Your Model Loads with AutoClasses
74
+
75
+ Verify that you can load your model and tokenizer using AutoClasses:
76
+
77
+ ```python
78
  from transformers import AutoConfig, AutoModel, AutoTokenizer
79
  config = AutoConfig.from_pretrained("your model name", revision=revision)
80
  model = AutoModel.from_pretrained("your model name", revision=revision)
81
  tokenizer = AutoTokenizer.from_pretrained("your model name", revision=revision)
82
  ```
83
 
84
+ Note:
85
 
86
+ - If this step fails, debug your model before submitting.
87
+ - Ensure your model is public.
88
+ - We are working on adding support for models requiring `trust_remote_code=True`.
89
 
90
+ ### 2. Convert Weights to Safetensors
91
 
92
+ [Safetensors](https://huggingface.co/docs/safetensors/index) is a new format for storing weights which is safer and faster to load and use. It will also allow us to add the number of parameters of your model to the `Extended Viewer`!
93
 
94
+ ### 3. Verify Your Model Open License
 
95
 
96
+ This is a leaderboard for Open LLMs, and we'd love for as many people as possible to know they can use your model 🤗
97
+
98
+ ### 4. Complete Your Model Card
99
 
 
100
  When we add extra information about models to the leaderboard, it will be automatically taken from the model card
101
 
102
+ ### 5. Select Correct Precision
103
+
104
+ Choose the right precision to avoid evaluation errors:
105
+
106
+ - Not all models convert properly from float16 to bfloat16.
107
+ - Incorrect precision can cause issues (e.g., loading a bf16 model in fp16 may generate NaNs).
108
+
109
+ > Important: When submitting, git branches and tags will be strictly tied to the specific commit present at the time of submission to ensure revision consistency.
110
+ >
111
+
112
+ ## Model types
113
+
114
+ - 🟢 : 🟢 pretrained model: new, base models, trained on a given text corpus using masked modelling
115
+ - 🟩 : 🟩 continuously pretrained model: new, base models, continuously trained on further corpus (which may include IFT/chat data) using masked modelling
116
+ - 🔶 : 🔶 fine-tuned on domain-specific datasets model: pretrained models finetuned on more data
117
+ - 💬 : 💬 chat models (RLHF, DPO, IFT, ...) model: chat like fine-tunes, either using IFT (datasets of task instruction), RLHF or DPO (changing the model loss a bit with an added policy), etc
118
+ - 🤝 : 🤝 base merges and moerges model: merges or MoErges, models which have been merged or fused without additional fine-tuning.
119
+
120
+ Please provide information about the model through an issue! 🤩
121
+
122
+ 🏴‍☠️ : 🏴‍☠️ This icon indicates that the model has been selected as a subject of caution by the community, implying that users should exercise restraint when using it. Clicking on the icon will take you to a discussion about that model. (Models that have used the evaluation set for training to achieve a high leaderboard ranking, among others, are selected as subjects of caution.)
123
  """
124
 
125
  CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results. Authors of open-ko-llm-leaderboard are ordered alphabetically."
 
128
  title={Open Ko-LLM Leaderboard: Evaluating Large Language Models in Korean with Ko-H5 Benchmark},
129
  author={Chanjun Park and Hyeonwoo Kim and Dahyun Kim and Seonghwan Cho and Sanghoon Kim and Sukyung Lee and Yungi Kim and Hwalsuk Lee},
130
  year={2024},
131
+ booktitle={The 62nd Annual Meeting of the Association for Computational Linguistics (ACL 2024) }
132
  }
133
+
134
+
135
  @software{eval-harness,
136
  author = {Gao, Leo and
137
  Tow, Jonathan and
 
156
  publisher = {Zenodo},
157
  version = {v0.0.1},
158
  doi = {10.5281/zenodo.5371628},
159
+ url = {https://doi.org/10.5281/zenodo.5371628},
160
+ }
161
+
162
+ @misc{rein2023gpqagraduatelevelgoogleproofqa,
163
+ title={GPQA: A Graduate-Level Google-Proof Q&A Benchmark},
164
+ author={David Rein and Betty Li Hou and Asa Cooper Stickland and Jackson Petty and Richard Yuanzhe Pang and Julien Dirani and Julian Michael and Samuel R. Bowman},
165
+ year={2023},
166
+ eprint={2311.12022},
167
+ archivePrefix={arXiv},
168
+ primaryClass={cs.AI},
169
+ url={https://arxiv.org/abs/2311.12022},
170
+ }
171
+
172
+ @article{sakaguchi2021winogrande,
173
+ title={Winogrande: An adversarial winograd schema challenge at scale},
174
+ author={Sakaguchi, Keisuke and Bras, Ronan Le and Bhagavatula, Chandra and Choi, Yejin},
175
+ journal={Communications of the ACM},
176
+ volume={64},
177
+ number={9},
178
+ pages={99--106},
179
+ year={2021},
180
+ publisher={ACM New York, NY, USA}
181
  }
182
+
183
+ @article{cobbe2021training,
184
+ title={Training verifiers to solve math word problems},
185
+ author={Cobbe, Karl and Kosaraju, Vineet and Bavarian, Mohammad and Chen, Mark and Jun, Heewoo and Kaiser, Lukasz and Plappert, Matthias and Tworek, Jerry and Hilton, Jacob and Nakano, Reiichiro and others},
186
+ journal={arXiv preprint arXiv:2110.14168},
187
+ year={2021}
 
 
 
 
 
 
188
  }
189
+
190
+ @article{paech2023eq,
191
+ title={Eq-bench: An emotional intelligence benchmark for large language models},
192
+ author={Paech, Samuel J},
193
+ journal={arXiv preprint arXiv:2312.06281},
194
+ year={2023}
195
  }
196
+
197
+
198
+ @misc{zhou2023instructionfollowingevaluationlargelanguage,
199
+ title={Instruction-Following Evaluation for Large Language Models},
200
+ author={Jeffrey Zhou and Tianjian Lu and Swaroop Mishra and Siddhartha Brahma and Sujoy Basu and Yi Luan and Denny Zhou and Le Hou},
201
+ year={2023},
202
+ eprint={2311.07911},
203
+ archivePrefix={arXiv},
204
+ primaryClass={cs.CL},
205
+ url={https://arxiv.org/abs/2311.07911},
206
  }
207
+
208
+ @article{lee2024kornat,
209
+ title={KorNAT: LLM Alignment Benchmark for Korean Social Values and Common Knowledge},
210
+ author={Lee, Jiyoung and Kim, Minwoo and Kim, Seungho and Kim, Junghwan and Won, Seunghyun and Lee, Hwaran and Choi, Edward},
211
+ journal={arXiv preprint arXiv:2402.13605},
212
+ year={2024}
213
  }
214
  """
src/display/formatting.py CHANGED
@@ -14,10 +14,9 @@ def model_hyperlink(link, model_name):
14
  def make_clickable_model(model_name):
15
  link = f"https://huggingface.co/{model_name}"
16
 
17
- details_model_name = model_name.replace("/", "__")
18
- details_link = f"https://huggingface.co/datasets/open-ko-llm-leaderboard/details_{details_model_name}"
19
-
20
- return model_hyperlink(link, model_name) + " " + model_hyperlink(details_link, "📑")
21
 
22
 
23
  def styled_error(error):
 
14
  def make_clickable_model(model_name):
15
  link = f"https://huggingface.co/{model_name}"
16
 
17
+ # details_model_name = model_name.replace("/", "__")
18
+ # details_link = f"https://huggingface.co/datasets/open-ko-llm-leaderboard/details_{details_model_name}"
19
+ return model_hyperlink(link, model_name) # + " " + model_hyperlink(details_link, "📑")
 
20
 
21
 
22
  def styled_error(error):
src/display/utils.py CHANGED
@@ -14,19 +14,15 @@ class Task:
14
  col_name: str
15
 
16
  class Tasks(Enum):
17
- arc = Task("ko_arc_challenge", "acc_norm", "Ko-ARC")
18
- hellaswag = Task("ko_hellaswag", "acc_norm", "Ko-HellaSwag")
19
- mmlu = Task("ko_mmlu", "acc", "Ko-MMLU")
20
- truthfulqa = Task("ko_truthfulqa_mc", "mc2", "Ko-TruthfulQA")
21
- winogrande = Task("ko_winogrande", "acc_norm", "Ko-Winogrande")
22
- gsm8k = Task("ko_gsm8k", "acc_norm", "Ko-GSM8k")
23
- commongen_v2 = Task("ko_commongen_v2", "acc_norm", "Ko-CommonGen V2")
24
- eqBench = Task("ko_eq_bench", "acc_norm", "Ko-EQ Bench")
25
- instFollow = Task("ko_inst_follow", "acc_norm", "Ko-InstFollow")
26
- korNatCka = Task("kor_nat_cka", "acc_norm", "KorNAT-CKA")
27
- korNatSva = Task("kor_nat_sva", "acc_norm", "KorNAT-SVA")
28
- harmlessness = Task("ko_harmlessness", "acc_norm", "Ko-Harmlessness")
29
- helpfulness = Task("ko_helpfulness", "acc_norm", "Ko-Helpfulness")
30
 
31
 
32
  # These classes are for user facing column names,
@@ -89,26 +85,30 @@ class ModelDetails:
89
 
90
  class ModelType(Enum):
91
  PT = ModelDetails(name="pretrained", symbol="🟢")
92
- # FT = ModelDetails(name="fine-tuned", symbol="🔶")
93
- IFT = ModelDetails(name="instruction-tuned", symbol="")
94
- RL = ModelDetails(name="RL-tuned", symbol="🟦")
95
- Unknown = ModelDetails(name="", symbol="?")
 
96
 
97
  def to_str(self, separator=" "):
98
  return f"{self.value.symbol}{separator}{self.value.name}"
99
 
100
  @staticmethod
101
- def from_str(type):
102
- # if "fine-tuned" in type or "🔶" in type:
103
- # return ModelType.FT
104
- if "pretrained" in type or "🟢" in type:
 
 
105
  return ModelType.PT
106
- if "RL-tuned" in type or "🟦" in type:
107
- return ModelType.RL
108
- if "instruction-tuned" in type or "" in type:
109
- return ModelType.IFT
110
  return ModelType.Unknown
111
 
 
112
  class WeightType(Enum):
113
  Adapter = ModelDetails("Adapter")
114
  Original = ModelDetails("Original")
@@ -116,12 +116,13 @@ class WeightType(Enum):
116
 
117
  class Precision(Enum):
118
  float16 = ModelDetails("float16")
119
- # bfloat16 = ModelDetails("bfloat16")
120
- # qt_8bit = ModelDetails("8bit")
121
- # qt_4bit = ModelDetails("4bit")
122
- # qt_GPTQ = ModelDetails("GPTQ")
123
  Unknown = ModelDetails("?")
124
 
 
125
  def from_str(precision):
126
  if precision in ["torch.float16", "float16"]:
127
  return Precision.float16
@@ -134,15 +135,10 @@ class Precision(Enum):
134
  if precision in ["GPTQ", "None"]:
135
  return Precision.qt_GPTQ
136
  return Precision.Unknown
137
-
138
-
139
-
140
 
141
  # Column selection
142
- COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden]
143
- TYPES = [c.type for c in fields(AutoEvalColumn) if not c.hidden]
144
- COLS_LITE = [c.name for c in fields(AutoEvalColumn) if c.displayed_by_default and not c.hidden]
145
- TYPES_LITE = [c.type for c in fields(AutoEvalColumn) if c.displayed_by_default and not c.hidden]
146
 
147
  EVAL_COLS = [c.name for c in fields(EvalQueueColumn)]
148
  EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]
@@ -157,4 +153,4 @@ NUMERIC_INTERVALS = {
157
  "13~35B": pd.Interval(13, 35, closed="right"),
158
  "35~60B": pd.Interval(35, 60, closed="right"),
159
  "60B+": pd.Interval(60, 10000, closed="right"),
160
- }
 
14
  col_name: str
15
 
16
  class Tasks(Enum):
17
+ gpqa = Task("ko_gpqa_diamond_zeroshot", "acc_norm,none", "Ko-GPQA")
18
+ winogrande = Task("ko_winogrande", "acc,none", "Ko-Winogrande")
19
+ gsm8k = Task("ko_gsm8k", "exact_match,strict-match", "Ko-GSM8k")
20
+ eqBench = Task("ko_eqbench", "eqbench,none", "Ko-EQ Bench")
21
+ instFollow = Task("ko_ifeval", "strict_acc,none", "Ko-IFEval")
22
+ korNatCka = Task("kornat_common", "acc_norm,none", "KorNAT-CKA")
23
+ korNatSva = Task("kornat_social", "A-SVA,none", "KorNAT-SVA")
24
+ harmlessness = Task("kornat_harmless", "acc_norm,none", "Ko-Harmlessness")
25
+ helpfulness = Task("kornat_helpful", "acc_norm,none", "Ko-Helpfulness")
 
 
 
 
26
 
27
 
28
  # These classes are for user facing column names,
 
85
 
86
  class ModelType(Enum):
87
  PT = ModelDetails(name="pretrained", symbol="🟢")
88
+ CPT = ModelDetails(name="continuously pretrained", symbol="🟩")
89
+ FT = ModelDetails(name="fine-tuned on domain-specific datasets", symbol="🔶")
90
+ chat = ModelDetails(name="chat models (RLHF, DPO, IFT, ...)", symbol="💬")
91
+ merges = ModelDetails(name="base merges and moerges", symbol="🤝")
92
+ Unknown = ModelDetails(name="other", symbol="❓")
93
 
94
  def to_str(self, separator=" "):
95
  return f"{self.value.symbol}{separator}{self.value.name}"
96
 
97
  @staticmethod
98
+ def from_str(m_type):
99
+ if any([k for k in m_type if k in ["fine-tuned","🔶", "finetuned"]]):
100
+ return ModelType.FT
101
+ if "continuously pretrained" in m_type or "🟩" in m_type:
102
+ return ModelType.CPT
103
+ if "pretrained" in m_type or "🟢" in m_type:
104
  return ModelType.PT
105
+ if any([k in m_type for k in ["instruction-tuned", "RL-tuned", "chat", "🟦", "⭕", "💬"]]):
106
+ return ModelType.chat
107
+ if "merge" in m_type or "🤝" in m_type:
108
+ return ModelType.merges
109
  return ModelType.Unknown
110
 
111
+
112
  class WeightType(Enum):
113
  Adapter = ModelDetails("Adapter")
114
  Original = ModelDetails("Original")
 
116
 
117
  class Precision(Enum):
118
  float16 = ModelDetails("float16")
119
+ bfloat16 = ModelDetails("bfloat16")
120
+ qt_8bit = ModelDetails("8bit")
121
+ qt_4bit = ModelDetails("4bit")
122
+ qt_GPTQ = ModelDetails("GPTQ")
123
  Unknown = ModelDetails("?")
124
 
125
+ @staticmethod
126
  def from_str(precision):
127
  if precision in ["torch.float16", "float16"]:
128
  return Precision.float16
 
135
  if precision in ["GPTQ", "None"]:
136
  return Precision.qt_GPTQ
137
  return Precision.Unknown
 
 
 
138
 
139
  # Column selection
140
+ COLS = [c.name for c in fields(AutoEvalColumn)]
141
+ TYPES = [c.type for c in fields(AutoEvalColumn)]
 
 
142
 
143
  EVAL_COLS = [c.name for c in fields(EvalQueueColumn)]
144
  EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]
 
153
  "13~35B": pd.Interval(13, 35, closed="right"),
154
  "35~60B": pd.Interval(35, 60, closed="right"),
155
  "60B+": pd.Interval(60, 10000, closed="right"),
156
+ }
src/leaderboard/read_evals.py CHANGED
@@ -48,7 +48,7 @@ class EvalResult:
48
  precision = Precision.from_str(config.get("model_dtype"))
49
 
50
  # Get model and org
51
- org_and_model = config.get("model_name", config.get("model_args", None))
52
  org_and_model = org_and_model.split("/", 1)
53
 
54
  if len(org_and_model) == 1:
@@ -96,26 +96,19 @@ class EvalResult:
96
  results = {}
97
  for task in Tasks:
98
  task = task.value
99
-
100
- # Some truthfulQA values are NaNs
101
- if task.benchmark == "truthfulqa:mc" and "harness|truthfulqa:mc|0" in data["results"]:
102
- if math.isnan(float(data["results"]["harness|truthfulqa:mc|0"][task.metric])):
103
- results[task.benchmark] = 0.0
104
- continue
105
-
106
- # New tasks have been added, we need to skip them if not exists
107
- if task.benchmark in ["ko_winogrande", "ko_gsm8k", "ko_eq_bench", "ko_inst_follow", "kor_nat_cka", "kor_nat_sva", "ko_harmlessness", "ko_helpfulness"]:
108
- accs = np.array([v.get(task.metric, None) for k, v in data["results"].items() if task.benchmark in k])
109
  if accs.size == 0 or any([acc is None for acc in accs]):
110
- results[task.benchmark] = 0.0
111
  continue
112
-
113
- # We average all scores of a given metric (mostly for mmlu)
114
- accs = np.array([v.get(task.metric, None) for k, v in data["results"].items() if task.benchmark in k])
115
- if accs.size == 0 or any([acc is None for acc in accs]):
116
- continue
117
-
118
- mean_acc = np.mean(accs) * 100.0
119
  results[task.benchmark] = mean_acc
120
 
121
  return self(
@@ -151,27 +144,7 @@ class EvalResult:
151
  def to_dict(self):
152
  """Converts the Eval Result to a dict compatible with our dataframe display"""
153
 
154
- # Skip the new tasks for now
155
- # TODO: safely remove this code when the task results are all added
156
- skip_avg_len = 0
157
- if self.results['ko_winogrande'] == 0.0:
158
- skip_avg_len += 1
159
- if self.results['ko_gsm8k'] == 0.0:
160
- skip_avg_len += 1
161
- if self.results['ko_eq_bench'] == 0.0:
162
- skip_avg_len += 1
163
- if self.results['ko_inst_follow'] == 0.0:
164
- skip_avg_len += 1
165
- if self.results['kor_nat_cka'] == 0.0:
166
- skip_avg_len += 1
167
- if self.results['kor_nat_sva'] == 0.0:
168
- skip_avg_len += 1
169
- if self.results['ko_harmlessness'] == 0.0:
170
- skip_avg_len += 1
171
- if self.results['ko_helpfulness'] == 0.0:
172
- skip_avg_len += 1
173
-
174
- average = sum([v for v in self.results.values() if v is not None]) / (len(Tasks) - skip_avg_len)
175
 
176
  data_dict = {
177
  "eval_name": self.eval_name, # not a column, just a save name,
 
48
  precision = Precision.from_str(config.get("model_dtype"))
49
 
50
  # Get model and org
51
+ org_and_model = config.get("model_name", None)
52
  org_and_model = org_and_model.split("/", 1)
53
 
54
  if len(org_and_model) == 1:
 
96
  results = {}
97
  for task in Tasks:
98
  task = task.value
99
+ if task.benchmark in ["ko_ifeval"]:
100
+ ko_ifeval = data["results"]["ko_ifeval"]
101
+ accs = np.mean([ko_ifeval["prompt_level_strict_acc,none"], ko_ifeval["inst_level_strict_acc,none"]])
102
+ mean_acc = np.mean(accs) * 100.0
103
+ results[task.benchmark] = mean_acc
104
+
105
+ if task.benchmark in ["ko_winogrande", "ko_gsm8k", "ko_eqbench", "kornat_common", "kornat_social", "kornat_harmless", "kornat_helpful", "ko_gpqa_diamond_zeroshot"]:
106
+ accs = np.array([v.get(task.metric, None) for k, v in data["results"].items() if task.benchmark == k])
 
 
107
  if accs.size == 0 or any([acc is None for acc in accs]):
 
108
  continue
109
+
110
+ if task.benchmark not in ["ko_eqbench"]:
111
+ mean_acc = accs[0] * 100.0
 
 
 
 
112
  results[task.benchmark] = mean_acc
113
 
114
  return self(
 
144
  def to_dict(self):
145
  """Converts the Eval Result to a dict compatible with our dataframe display"""
146
 
147
+ average = sum([v for v in self.results.values() if v is not None]) / (len(Tasks))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
148
 
149
  data_dict = {
150
  "eval_name": self.eval_name, # not a column, just a save name,
src/submission/submit.py CHANGED
@@ -1,6 +1,7 @@
1
  import json
2
  import os
3
  from datetime import datetime, timezone
 
4
 
5
  from src.display.formatting import styled_error, styled_message, styled_warning
6
  from src.envs import API, EVAL_REQUESTS_PATH, H4_TOKEN, QUEUE_REPO, RATE_LIMIT_PERIOD, RATE_LIMIT_QUOTA
@@ -12,6 +13,7 @@ from src.submission.check_validity import (
12
  is_model_on_hub,
13
  user_submission_permission,
14
  )
 
15
 
16
  REQUESTED_MODELS = None
17
  USERS_TO_SUBMISSION_DATES = None
@@ -38,10 +40,7 @@ def add_new_eval(
38
 
39
  precision = precision.split(" ")[0]
40
  current_time = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
41
- # 리더보드 종료
42
- if True:
43
- return styled_error("The current Season 1 will conclude on Friday, August 2, and the new season will commence on August 12.")
44
-
45
  if model_type is None or model_type == "":
46
  return styled_error("Please select a model type.")
47
 
@@ -100,6 +99,9 @@ def add_new_eval(
100
 
101
  # Seems good, creating the eval
102
  print("Adding new eval")
 
 
 
103
 
104
  eval_entry = {
105
  "model": model,
@@ -114,6 +116,7 @@ def add_new_eval(
114
  "likes": model_info.likes,
115
  "params": model_size,
116
  "license": license,
 
117
  }
118
 
119
  # Check for duplicate submission
 
1
  import json
2
  import os
3
  from datetime import datetime, timezone
4
+ import pandas as pd
5
 
6
  from src.display.formatting import styled_error, styled_message, styled_warning
7
  from src.envs import API, EVAL_REQUESTS_PATH, H4_TOKEN, QUEUE_REPO, RATE_LIMIT_PERIOD, RATE_LIMIT_QUOTA
 
13
  is_model_on_hub,
14
  user_submission_permission,
15
  )
16
+ from src.populate import get_evaluation_queue_df
17
 
18
  REQUESTED_MODELS = None
19
  USERS_TO_SUBMISSION_DATES = None
 
40
 
41
  precision = precision.split(" ")[0]
42
  current_time = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
43
+
 
 
 
44
  if model_type is None or model_type == "":
45
  return styled_error("Please select a model type.")
46
 
 
99
 
100
  # Seems good, creating the eval
101
  print("Adding new eval")
102
+ # dfs = get_evaluation_queue_df(EVAL_REQUESTS_PATH, cols=["job_id"])
103
+ # dfs = pd.concat(dfs).reset_index(drop=True)
104
+ # max_job_id = max([int(c) for c in dfs["job_id"].values])
105
 
106
  eval_entry = {
107
  "model": model,
 
116
  "likes": model_info.likes,
117
  "params": model_size,
118
  "license": license,
119
+ # "job_id": max_job_id+1
120
  }
121
 
122
  # Check for duplicate submission