Spaces:
Runtime error
Runtime error
Sean Cho
commited on
Commit
·
f73765d
1
Parent(s):
495b288
Initial Korean version
Browse files- app.py +6 -6
- src/assets/text_content.py +41 -43
app.py
CHANGED
@@ -374,7 +374,7 @@ with demo:
|
|
374 |
gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
|
375 |
|
376 |
with gr.Column():
|
377 |
-
with gr.Accordion(f"✅
|
378 |
with gr.Row():
|
379 |
finished_eval_table = gr.components.Dataframe(
|
380 |
value=finished_eval_queue_df,
|
@@ -382,7 +382,7 @@ with demo:
|
|
382 |
datatype=EVAL_TYPES,
|
383 |
max_rows=5,
|
384 |
)
|
385 |
-
with gr.Accordion(f"🔄
|
386 |
with gr.Row():
|
387 |
running_eval_table = gr.components.Dataframe(
|
388 |
value=running_eval_queue_df,
|
@@ -391,7 +391,7 @@ with demo:
|
|
391 |
max_rows=5,
|
392 |
)
|
393 |
|
394 |
-
with gr.Accordion(f"⏳
|
395 |
with gr.Row():
|
396 |
pending_eval_table = gr.components.Dataframe(
|
397 |
value=pending_eval_queue_df,
|
@@ -400,7 +400,7 @@ with demo:
|
|
400 |
max_rows=5,
|
401 |
)
|
402 |
with gr.Row():
|
403 |
-
gr.Markdown("# ✉️✨
|
404 |
|
405 |
with gr.Row():
|
406 |
with gr.Column():
|
@@ -443,7 +443,7 @@ with demo:
|
|
443 |
label="Base model (for delta or adapter weights)"
|
444 |
)
|
445 |
|
446 |
-
submit_button = gr.Button("
|
447 |
submission_result = gr.Markdown()
|
448 |
submit_button.click(
|
449 |
add_new_eval,
|
@@ -460,7 +460,7 @@ with demo:
|
|
460 |
)
|
461 |
|
462 |
with gr.Row():
|
463 |
-
refresh_button = gr.Button("
|
464 |
refresh_button.click(
|
465 |
refresh,
|
466 |
inputs=[],
|
|
|
374 |
gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
|
375 |
|
376 |
with gr.Column():
|
377 |
+
with gr.Accordion(f"✅ 평가 완료 ({len(finished_eval_queue_df)})", open=False):
|
378 |
with gr.Row():
|
379 |
finished_eval_table = gr.components.Dataframe(
|
380 |
value=finished_eval_queue_df,
|
|
|
382 |
datatype=EVAL_TYPES,
|
383 |
max_rows=5,
|
384 |
)
|
385 |
+
with gr.Accordion(f"🔄 평가 진행 대기열 ({len(running_eval_queue_df)})", open=False):
|
386 |
with gr.Row():
|
387 |
running_eval_table = gr.components.Dataframe(
|
388 |
value=running_eval_queue_df,
|
|
|
391 |
max_rows=5,
|
392 |
)
|
393 |
|
394 |
+
with gr.Accordion(f"⏳ 평가 대기 대기열 ({len(pending_eval_queue_df)})", open=False):
|
395 |
with gr.Row():
|
396 |
pending_eval_table = gr.components.Dataframe(
|
397 |
value=pending_eval_queue_df,
|
|
|
400 |
max_rows=5,
|
401 |
)
|
402 |
with gr.Row():
|
403 |
+
gr.Markdown("# ✉️✨ 여기에서 모델을 제출해주세요!", elem_classes="markdown-text")
|
404 |
|
405 |
with gr.Row():
|
406 |
with gr.Column():
|
|
|
443 |
label="Base model (for delta or adapter weights)"
|
444 |
)
|
445 |
|
446 |
+
submit_button = gr.Button("제출하고 평가받기")
|
447 |
submission_result = gr.Markdown()
|
448 |
submit_button.click(
|
449 |
add_new_eval,
|
|
|
460 |
)
|
461 |
|
462 |
with gr.Row():
|
463 |
+
refresh_button = gr.Button("새로고침")
|
464 |
refresh_button.click(
|
465 |
refresh,
|
466 |
inputs=[],
|
src/assets/text_content.py
CHANGED
@@ -56,53 +56,54 @@ CHANGELOG_TEXT = f"""
|
|
56 |
- Release the leaderboard to public
|
57 |
"""
|
58 |
|
59 |
-
TITLE = """<h1 align="center" id="space-title"
|
60 |
|
61 |
INTRODUCTION_TEXT = f"""
|
62 |
-
|
63 |
|
64 |
-
|
|
|
|
|
65 |
|
66 |
-
|
67 |
-
|
68 |
-
Other cool benchmarks for LLMs are developed at HuggingFace: 🙋🤖 [human and GPT4 evals](https://huggingface.co/spaces/HuggingFaceH4/human_eval_llm_leaderboard), 🖥️ [performance benchmarks](https://huggingface.co/spaces/optimum/llm-perf-leaderboard)
|
69 |
-
|
70 |
-
And also in other labs, check out the [AlpacaEval Leaderboard](https://tatsu-lab.github.io/alpaca_eval/) and [MT Bench](https://huggingface.co/spaces/lmsys/chatbot-arena-leaderboard) among other great ressources.
|
71 |
"""
|
72 |
|
73 |
LLM_BENCHMARKS_TEXT = f"""
|
74 |
# Context
|
75 |
-
|
76 |
|
77 |
## Icons
|
78 |
{ModelType.PT.to_str(" : ")} model
|
79 |
{ModelType.FT.to_str(" : ")} model
|
80 |
{ModelType.IFT.to_str(" : ")} model
|
81 |
{ModelType.RL.to_str(" : ")} model
|
82 |
-
|
|
|
83 |
|
84 |
-
|
|
|
85 |
|
86 |
-
|
87 |
|
88 |
-
|
89 |
-
-
|
90 |
-
-
|
91 |
-
-
|
|
|
|
|
|
|
|
|
92 |
|
93 |
-
|
94 |
-
We chose these benchmarks as they test a variety of reasoning and general knowledge across a wide variety of fields in 0-shot and few-shot settings.
|
95 |
|
96 |
## Details and logs
|
97 |
You can find:
|
98 |
-
-
|
99 |
-
-
|
100 |
-
-
|
101 |
|
102 |
## Reproducibility
|
103 |
-
|
104 |
-
`python main.py --model=hf-causal --model_args="pretrained=<your_model>,use_accelerate=True,revision=<your_model_revision>"`
|
105 |
-
` --tasks=<task_list> --num_fewshot=<n_few_shot> --batch_size=2 --output_path=<output_path>`
|
106 |
|
107 |
The total batch size we get for models which fit on one A100 node is 16 (8 GPUs * 2). If you don't use parallelism, adapt your batch size to fit.
|
108 |
*You can expect results to vary slightly for different batch sizes because of padding.*
|
@@ -121,37 +122,34 @@ To get more information about quantization, see:
|
|
121 |
"""
|
122 |
|
123 |
EVALUATION_QUEUE_TEXT = f"""
|
124 |
-
#
|
125 |
-
|
126 |
-
Models added here will be automatically evaluated on the 🤗 cluster.
|
127 |
|
128 |
-
##
|
129 |
|
130 |
-
### 1
|
131 |
-
```
|
132 |
from transformers import AutoConfig, AutoModel, AutoTokenizer
|
133 |
config = AutoConfig.from_pretrained("your model name", revision=revision)
|
134 |
model = AutoModel.from_pretrained("your model name", revision=revision)
|
135 |
tokenizer = AutoTokenizer.from_pretrained("your model name", revision=revision)
|
136 |
```
|
137 |
-
If this step fails, follow the error messages to debug your model before submitting it. It's likely your model has been improperly uploaded.
|
138 |
|
139 |
-
|
140 |
-
|
|
|
141 |
|
142 |
-
### 2
|
143 |
-
|
144 |
|
145 |
-
### 3
|
146 |
-
|
147 |
|
148 |
-
### 4
|
149 |
-
|
150 |
|
151 |
-
##
|
152 |
-
|
153 |
-
Make sure you have followed the above steps first.
|
154 |
-
If everything is done, check you can launch the EleutherAIHarness on your model locally, using the above command without modifications (you can add `--limit` to limit the number of examples per task).
|
155 |
"""
|
156 |
|
157 |
CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
|
|
|
56 |
- Release the leaderboard to public
|
57 |
"""
|
58 |
|
59 |
+
TITLE = """<h1 align="center" id="space-title">🚀 Open Ko-LLM Leaderboard</h1>"""
|
60 |
|
61 |
INTRODUCTION_TEXT = f"""
|
62 |
+
๐ Open Ko-LLM Leaderboard๋ ํ๊ตญ์ด ์ด๊ฑฐ๋ ์ธ์ด๋ชจ๋ธ์ ์ฑ๋ฅ์ ๊ฐ๊ด์ ์ผ๋ก ํ๊ฐํฉ๋๋ค.
|
63 |
|
64 |
+
"์ ์ถ" ํ์ด์ง์์ ๋ชจ๋ธ ์ ์ถ ์ ์๋์ผ๋ก ํ๊ฐ๋ฉ๋๋ค. ํ๊ฐ์ ์ฌ์ฉ๋๋ GPU๋ KT์ ์ง์์ผ๋ก ์ด์๋ฉ๋๋ค.
|
65 |
+
평가에 사용되는 데이터는 전문 지식, 추론 능력, 환각, 윤리, 상식의 다섯가지 요소를 평가하기 위한 데이터셋으로 구성되어 있습니다.
|
66 |
+
벤치마크 데이터셋에 대한 더 자세한 정보는 "정보" 페이지에서 제공되고 있습니다.
|
67 |
|
68 |
+
์
์คํ
์ด์ง์ NIA๊ฐ ๊ณต๋ ์ฃผ์ตํ๋ฉฐ ์
์คํ
์ด์ง๊ฐ ์ด์ํฉ๋๋ค.
|
|
|
|
|
|
|
|
|
69 |
"""
|
70 |
|
71 |
LLM_BENCHMARKS_TEXT = f"""
|
72 |
# Context
|
73 |
+
๋ฐ์ด๋ LLM ๋ชจ๋ธ๋ค์ด ์๋คํฌ์ด ๊ณต๊ฐ๋๊ณ ์์ง๋ง ์ด๋ ๋๋ถ๋ถ ์์ด ์ค์ฌ์, ์์ด ๋ฌธํ๊ถ์ ์ต์ํ ๋ชจ๋ธ์
๋๋ค. ์ ํฌ๋ ํ๊ตญ์ด ๋ฆฌ๋๋ณด๋ ๐ย Open Ko-LLM์ ์ด์ํ์ฌ ํ๊ตญ์ด์ ํ๊ตญ ๋ฌธํ์ ํน์ฑ์ ๋ฐ์ํ ๋ชจ๋ธ์ ํ๊ฐํ๊ณ ์ ํฉ๋๋ค. ์ด๋ฅผ ํตํด ํ๊ตญ์ด ์ฌ์ฉ์๋ค์ด ํธ๋ฆฌํ๊ฒ ๋ฆฌ๋๋ณด๋๋ฅผ ์ด์ฉํ๊ณ ์ฐธ์ฌํ์ฌ ํ๊ตญ์ ์ฐ๊ตฌ ์์ค ํฅ์์ ๊ธฐ์ฌํ ์ ์๊ธฐ๋ฅผ ๋ฐ๋๋๋ค.
|
74 |
|
75 |
## Icons
|
76 |
{ModelType.PT.to_str(" : ")} model
|
77 |
{ModelType.FT.to_str(" : ")} model
|
78 |
{ModelType.IFT.to_str(" : ")} model
|
79 |
{ModelType.RL.to_str(" : ")} model
|
80 |
+
만약 아이콘이 없다면 아직 모델에 대한 정보가 부족함을 나타냅니다.
|
81 |
+
๋ชจ๋ธ์ ๋ํ ์ ๋ณด๋ issue๋ฅผ ํตํด ์ ๋ฌํด์ฃผ์ธ์! ๐คฉ
|
82 |
|
83 |
+
๐ดโโ ๏ธ : ํด๋น ์์ด์ฝ์ ์ด ๋ชจ๋ธ์ด ์ปค๋ฎค๋ํฐ์ ์ํด ์ฃผ์ ๋์์ผ๋ก ์ ์ ๋์์ผ๋ฏ๋ก ์ด์ฉ ์์ ๋ฅผ ๋ฐ๋๋ค๋ ์๋ฏธ์
๋๋ค. ์์ด์ฝ์ ํด๋ฆญ ์ ํด๋น ๋ชจ๋ธ์ ๋ํ discussion์ผ๋ก ์ด๋ํฉ๋๋ค.
|
84 |
+
(높은 리더보드 순위를 위해 평가셋을 학습에 이용한 모델 등이 주의 대상으로 선정됩니다)
|
85 |
|
86 |
+
## How it works
|
87 |
|
88 |
+
๐ HuggingFace OpenLLM์์ ์ด์ํ๋ 4๊ฐ์ ํ์คํฌ(HellaSwag, MMLU, Arc, Truthful QA)์ ๋ฐ์ดํฐ๋ฅผ ํ๊ตญ์ด๋ก ๋ฒ์ญํ ๋ฐ์ดํฐ์
์ ๋น๋กฏํด ์ด 6๊ฐ์ง์ ๋ฐ์ดํฐ๋ก ๋ฒค์น๋งํฌ๋ฅผ ๊ตฌ์ฑํ์ต๋๋ค.
|
89 |
+
- Ko-HellaSwag (업스테이지 제공)
|
90 |
+
- Ko-MMLU (업스테이지 제공)
|
91 |
+
- Ko-Arc (업스테이지 제공)
|
92 |
+
- Ko-Truthful QA (업스테이지 제공)
|
93 |
+
- KoCommongen (NIA 한국지능정보사회진흥원 제공)
|
94 |
+
- 텍스트 윤리검증 데이터 (NIA 한국지능정보사회진흥원 제공)
|
95 |
+
LLM 시대에 걸맞는 평가를 위해 상식, 전문 지식, 추론, 환각, 윤리의 다섯가지 요소를 평가하기에 적합한 데이터셋들을 벤치마크로 선정했습니다. 최종 점수는 6개의 평가 데이터에 대한 평균 점수로 환산합니다.
|
96 |
|
97 |
+
KT๋ก๋ถํฐ ํ๊ฐ์ ์ฌ์ฉ๋๋ GPU๋ฅผ ์ ๊ณต๋ฐ์์ต๋๋ค.
|
|
|
98 |
|
99 |
## Details and logs
|
100 |
You can find:
|
101 |
+
- 좀 더 자세한 수치 정보는: https://huggingface.co/datasets/open-llm-leaderboard/results
|
102 |
+
- 모델의 입출력에 대한 자세한 정보는: https://huggingface.co/datasets/open-llm-leaderboard/details
|
103 |
+
- 모델의 평가 큐와 평가 상태는: https://huggingface.co/datasets/open-llm-leaderboard/requests
|
104 |
|
105 |
## Reproducibility
|
106 |
+
평가 결과를 재현하기 위해서는 [이 버전](https://github.com/EleutherAI/lm-evaluation-harness/tree/b281b0921b636bc36ad05c0b0b0763bd6dd43463)의 데이터셋을 이용하세요. (밑에는 코드 및 평가 환경이라서 일단 skip)
|
|
|
|
|
107 |
|
108 |
The total batch size we get for models which fit on one A100 node is 16 (8 GPUs * 2). If you don't use parallelism, adapt your batch size to fit.
|
109 |
*You can expect results to vary slightly for different batch sizes because of padding.*
|
|
|
122 |
"""
|
123 |
|
124 |
EVALUATION_QUEUE_TEXT = f"""
|
125 |
+
# 🚀 Open-Ko LLM 리더보드의 평가 큐입니다.
|
126 |
+
์ด๊ณณ์ ์ถ๊ฐ๋ ๋ชจ๋ธ๋ค์ ๊ณง ์๋์ ์ผ๋ก KT์ GPU ์์์ ํ๊ฐ๋ ์์ ์
๋๋ค!
|
|
|
127 |
|
128 |
+
## <모델 제출 전 확인하면 좋은 것들>
|
129 |
|
130 |
+
### 1๏ธโฃ ๋ชจ๋ธ๊ณผ ํ ํฌ๋์ด์ ๊ฐ AutoClasses๋ก ๋ถ๋ฌ์ฌ ์ ์๋์?
|
131 |
+
```
|
132 |
from transformers import AutoConfig, AutoModel, AutoTokenizer
|
133 |
config = AutoConfig.from_pretrained("your model name", revision=revision)
|
134 |
model = AutoModel.from_pretrained("your model name", revision=revision)
|
135 |
tokenizer = AutoTokenizer.from_pretrained("your model name", revision=revision)
|
136 |
```
|
|
|
137 |
|
138 |
+
만약 이 단계가 실패한다면 에러 메세지를 따라 모델을 디버깅한 후에 제출해주세요.
|
139 |
+
⚠️ 모델이 public 상태여야 합니다!
|
140 |
+
โ ๏ธ ๋ง์ฝ ๋ชจ๋ธ์ด use_remote_code=True์ฌ์ผ ํ๋ค๋ฉด ์ ์ ๊ธฐ๋ค๋ ค์ฃผ์ธ์. ํ์ฌ๋ก์๋ ์์ง ์ด ์ต์
์ ์ง์ํ์ง ์์ง๋ง ์๋ํ ์ ์๋๋ก ํ๊ณ ์์ต๋๋ค!
|
141 |
|
142 |
+
### 2๏ธโฃ ๋ชจ๋ธ์ weight๋ฅผ safetensors๋ก ๋ฐ๊ฟจ๋์?
|
143 |
+
safetensors๋ weight๋ฅผ ๋ณด๊ดํ๋ ์๋ก์ด ํฌ๋งท์ผ๋ก, ํจ์ฌ ์์ ํ๊ณ ๋น ๋ฅด๊ฒ ์ฌ์ฉํ ์ ์์ต๋๋ค. ๋ํ ๋ชจ๋ธ์ parameter ๊ฐ์๋ฅผ Extended Viewer์ ์ถ๊ฐํ ์ ์์ต๋๋ค
|
144 |
|
145 |
+
### 3️⃣ 모델이 오픈 라이센스를 따르나요?
|
146 |
+
๐ Open-Ko LLM์ Open LLM์ ์ํ ๋ฆฌ๋๋ณด๋๋ก, ๋ง์ ์ฌ๋๋ค์ด ๋ค์ํ ๋ชจ๋ธ์ ์ฌ์ฉํ๊ธฐ๋ฅผ ๋ฐ๋๋๋ค
|
147 |
|
148 |
+
### 4️⃣ 모델 카드를 작성하셨나요?
|
149 |
+
리더보드에 모델에 대한 추가 정보를 업로드할 때 작성하신 모델 카드가 업로드됩니다
|
150 |
|
151 |
+
## 모델이 실패한 경우:
|
152 |
+
๋ง์ฝ ์ ์ถํ ๋ชจ๋ธ์ ์ํ๊ฐ FAILED๊ฐ ๋๋ค๋ฉด ์ด๋ ๋ชจ๋ธ์ด ์คํ ์ค๋จ๋์์์ ์๋ฏธํฉ๋๋ค. ๋จผ์ ์์ ๋ค ๋จ๊ณ๋ฅผ ๋ชจ๋ ๋ฐ๋๋์ง ํ์ธํด๋ณด์ธ์. ๋ชจ๋ ๋จ๊ณ๋ฅผ ๋ฐ๋์์๋ ๋ถ๊ตฌํ๊ณ ์คํ ์ค๋จ๋์์ ๋๋ EleutherAIHarness ๋ฅผ ๋ก์ปฌ์์ ์คํํ ์ ์๋์ง ํ์ธํ๊ธฐ ์ํด ์์ ์ฝ๋๋ฅผ ์์ ์์ด ์คํํ์ธ์. (ํ์คํฌ ๋ณ ์์์ ์๋ฅผ ์ ํํ๊ธฐ ์ํด โlimit ํ๋ผ๋ฏธํฐ๋ฅผ ์ถ๊ฐํ ์ ์์ต๋๋ค.)
|
|
|
|
|
153 |
"""
|
154 |
|
155 |
CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
|