Spaces:
Sleeping
Sleeping
Commit
·
068a50e
0
Parent(s):
initial commit
Browse files- .gitattributes +35 -0
- README.md +78 -0
- app.py +151 -0
- data/README.md +12 -0
- data/code_eval_board.csv +9 -0
- requirements.txt +2 -0
- src/__pycache__/css_html.cpython-310.pyc +0 -0
- src/__pycache__/text_content.cpython-310.pyc +0 -0
- src/__pycache__/utils.cpython-310.pyc +0 -0
- src/css_html.py +79 -0
- src/text_content.py +95 -0
- src/utils.py +86 -0
.gitattributes
ADDED
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
*.7z filter=lfs diff=lfs merge=lfs -text
|
2 |
+
*.arrow filter=lfs diff=lfs merge=lfs -text
|
3 |
+
*.bin filter=lfs diff=lfs merge=lfs -text
|
4 |
+
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
5 |
+
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
6 |
+
*.ftz filter=lfs diff=lfs merge=lfs -text
|
7 |
+
*.gz filter=lfs diff=lfs merge=lfs -text
|
8 |
+
*.h5 filter=lfs diff=lfs merge=lfs -text
|
9 |
+
*.joblib filter=lfs diff=lfs merge=lfs -text
|
10 |
+
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
11 |
+
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
12 |
+
*.model filter=lfs diff=lfs merge=lfs -text
|
13 |
+
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
14 |
+
*.npy filter=lfs diff=lfs merge=lfs -text
|
15 |
+
*.npz filter=lfs diff=lfs merge=lfs -text
|
16 |
+
*.onnx filter=lfs diff=lfs merge=lfs -text
|
17 |
+
*.ot filter=lfs diff=lfs merge=lfs -text
|
18 |
+
*.parquet filter=lfs diff=lfs merge=lfs -text
|
19 |
+
*.pb filter=lfs diff=lfs merge=lfs -text
|
20 |
+
*.pickle filter=lfs diff=lfs merge=lfs -text
|
21 |
+
*.pkl filter=lfs diff=lfs merge=lfs -text
|
22 |
+
*.pt filter=lfs diff=lfs merge=lfs -text
|
23 |
+
*.pth filter=lfs diff=lfs merge=lfs -text
|
24 |
+
*.rar filter=lfs diff=lfs merge=lfs -text
|
25 |
+
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
26 |
+
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
27 |
+
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
28 |
+
*.tar filter=lfs diff=lfs merge=lfs -text
|
29 |
+
*.tflite filter=lfs diff=lfs merge=lfs -text
|
30 |
+
*.tgz filter=lfs diff=lfs merge=lfs -text
|
31 |
+
*.wasm filter=lfs diff=lfs merge=lfs -text
|
32 |
+
*.xz filter=lfs diff=lfs merge=lfs -text
|
33 |
+
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
+
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
+
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
README.md
ADDED
@@ -0,0 +1,78 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
---
|
2 |
+
title: Big Code Models Leaderboard
|
3 |
+
emoji: 📈
|
4 |
+
colorFrom: pink
|
5 |
+
colorTo: blue
|
6 |
+
sdk: gradio
|
7 |
+
sdk_version: 4.36.1
|
8 |
+
app_file: app.py
|
9 |
+
disable_embedding: true
|
10 |
+
pinned: false
|
11 |
+
tags:
|
12 |
+
- leaderboard
|
13 |
+
- eval:code
|
14 |
+
- test:public
|
15 |
+
- judge:auto
|
16 |
+
- submission:semiautomatic
|
17 |
+
models:
|
18 |
+
- WizardLM/WizardCoder-15B-V1.0
|
19 |
+
- bigcode/octocoder
|
20 |
+
- bigcode/octogeex
|
21 |
+
- stabilityai/stablecode-completion-alpha-3b
|
22 |
+
- bigcode/starcoder
|
23 |
+
- bigcode/starcoderbase
|
24 |
+
- bigcode/starcoderbase-7b
|
25 |
+
- bigcode/starcoderbase-3b
|
26 |
+
- bigcode/starcoderbase-1b
|
27 |
+
- bigcode/santacoder
|
28 |
+
- replit/replit-code-v1-3b
|
29 |
+
- THUDM/codegeex2-6b
|
30 |
+
- Salesforce/codegen25-7b-multi
|
31 |
+
- Salesforce/codegen25-7b-mono
|
32 |
+
- Salesforce/codegen-16B-multi
|
33 |
+
- Deci/DeciCoder-1b
|
34 |
+
- codellama/CodeLlama-7b-hf
|
35 |
+
- codellama/CodeLlama-7b-Python-hf
|
36 |
+
- codellama/CodeLlama-7b-Instruct-hf
|
37 |
+
- codellama/CodeLlama-13b-hf
|
38 |
+
- codellama/CodeLlama-13b-Python-hf
|
39 |
+
- codellama/CodeLlama-13b-Instruct-hf
|
40 |
+
- codellama/CodeLlama-34b-hf
|
41 |
+
- codellama/CodeLlama-34b-Python-hf
|
42 |
+
- codellama/CodeLlama-34b-Instruct-hf
|
43 |
+
- phind/Phind-CodeLlama-34B-v2
|
44 |
+
- phind/Phind-CodeLlama-34B-v1
|
45 |
+
- phind/Phind-CodeLlama-34B-Python-v1
|
46 |
+
- WizardLM/WizardCoder-Python-34B-V1.0
|
47 |
+
- WizardLM/WizardCoder-Python-13B-V1.0
|
48 |
+
- WizardLM/WizardCoder-3B-V1.0
|
49 |
+
- WizardLM/WizardCoder-1B-V1.0
|
50 |
+
- tiiuae/falcon-180B
|
51 |
+
- smallcloudai/Refact-1_6B-fim
|
52 |
+
- microsoft/phi-1
|
53 |
+
- WisdomShell/CodeShell-7B
|
54 |
+
- deepseek-ai/deepseek-coder-6.7b-base
|
55 |
+
- deepseek-ai/deepseek-coder-1.3b-base
|
56 |
+
- deepseek-ai/deepseek-coder-33b-base
|
57 |
+
- deepseek-ai/deepseek-coder-6.7b-instruct
|
58 |
+
- deepseek-ai/deepseek-coder-33b-instruct
|
59 |
+
- codefuse-ai/CodeFuse-DeepSeek-33B
|
60 |
+
- codellama/CodeLlama-70b-Instruct-hf
|
61 |
+
- codellama/CodeLlama-70b-hf
|
62 |
+
- codellama/CodeLlama-70b-Python-hf
|
63 |
+
- bigcode/starcoder2-15b
|
64 |
+
- bigcode/starcoder2-7b
|
65 |
+
- bigcode/starcoder2-3b
|
66 |
+
- stabilityai/stable-code-3b
|
67 |
+
- m-a-p/OpenCodeInterpreter-DS-33B
|
68 |
+
- m-a-p/OpenCodeInterpreter-DS-6.7B
|
69 |
+
- google/codegemma-7b
|
70 |
+
- google/codegemma-7b-it
|
71 |
+
- google/codegemma-2b
|
72 |
+
- Qwen/CodeQwen1.5-7B-Chat
|
73 |
+
- Qwen/CodeQwen1.5-7B
|
74 |
+
- NTQAI/Nxcode-CQ-7B-orpo
|
75 |
+
- Artigenz/Artigenz-Coder-DS-6.7B
|
76 |
+
- Qwen/Qwen2.5-Coder-32B
|
77 |
+
- Qwen/Qwen2.5-Coder-32B-Instruct
|
78 |
+
---
|
app.py
ADDED
@@ -0,0 +1,151 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# some code blocks are taken from https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/tree/main
|
2 |
+
import json
|
3 |
+
import os
|
4 |
+
from datetime import datetime, timezone
|
5 |
+
|
6 |
+
import gradio as gr
|
7 |
+
import pandas as pd
|
8 |
+
|
9 |
+
from src.css_html import custom_css
|
10 |
+
from src.text_content import ABOUT_TEXT, SUBMISSION_TEXT_3
|
11 |
+
from src.utils import (
|
12 |
+
AutoEvalColumn,
|
13 |
+
fields,
|
14 |
+
is_model_on_hub,
|
15 |
+
make_clickable_names,
|
16 |
+
)
|
17 |
+
|
18 |
+
# Leaderboard results, loaded once at import time.
df = pd.read_csv("data/code_eval_board.csv")

# Filter the column descriptors once, then derive the name/type lists from them.
_visible_columns = [col for col in fields(AutoEvalColumn) if not col.hidden]
_default_columns = [col for col in _visible_columns if col.displayed_by_default]

# Names and types of every non-hidden column.
COLS = [col.name for col in _visible_columns]
TYPES = [col.type for col in _visible_columns]
# Names and types of the non-hidden columns that are displayed by default.
COLS_LITE = [col.name for col in _default_columns]
TYPES_LITE = [col.type for col in _default_columns]
|
28 |
+
|
29 |
+
|
30 |
+
def select_columns(df, columns):
    """Return ``df`` reduced to the model column plus the requested columns.

    The requested columns are emitted in the order they appear in ``COLS``,
    not in the order of ``columns``. Names that are missing from ``df`` are
    skipped.
    """
    pinned = [AutoEvalColumn.model.name]
    # Walk COLS rather than ``columns`` so the table keeps a stable ordering.
    requested = []
    for col in COLS:
        if col in df.columns and col in columns:
            requested.append(col)
    return df[pinned + requested]
|
39 |
+
|
40 |
+
|
41 |
+
def filter_items(df, leaderboard_table, query):
    """Filter ``df`` on its "T" column and keep the columns of ``leaderboard_table``.

    ``query == "all"`` returns every row. Otherwise only the first character
    of ``query`` is used, and rows whose "T" value contains it are kept
    (rows with a missing "T" value are dropped).
    """
    visible = leaderboard_table.columns
    if query == "all":
        return df[visible]

    # Only the leading character of the query takes part in the match.
    marker = query[0]
    keep = df["T"].str.contains(marker, na=False)
    return df[keep][visible]
|
48 |
+
|
49 |
+
|
50 |
+
def search_table(df, leaderboard_table, query):
    """Return the rows of ``df`` whose "Model" value contains ``query``.

    The match is a case-insensitive, literal substring test. ``query`` comes
    straight from the search box, so it is deliberately not interpreted as a
    regular expression: text such as ``(`` or ``c++`` would otherwise raise
    ``re.error``. Rows with a missing "Model" value never match.

    Only the columns currently present in ``leaderboard_table`` are returned.
    """
    matches = df["Model"].str.contains(query, case=False, regex=False, na=False)
    return df[matches][leaderboard_table.columns]
|
53 |
+
|
54 |
+
|
55 |
+
# Turn the "Model" column into HTML links before the table is handed to Gradio.
df = make_clickable_names(df)

# Top-level Gradio app; the whole layout is declared inside the `with demo:` block.
demo = gr.Blocks(css=custom_css)
with demo:
    with gr.Row():
        # NOTE(review): the header below contains a closing </span> with no
        # matching opening tag.
        gr.Markdown(
            """<div style="text-align: center;"><h1> ESPnet-EZ Leaderboard for LibriSpeech-100h ASR1</span></h1></div>\
<br>\
<p>Users can use <code>reproduce</code> function to reproduce the numbers in ESPnet-EZ!</p>
""",
            elem_classes="markdown-text",
        )

    with gr.Tabs(elem_classes="tab-buttons") as tabs:
        with gr.TabItem("🔍 Evaluation table", id=0):
            with gr.Accordion("➡️ See All Columns", open=False):
                # Column picker: every column except the model column, which
                # select_columns always adds back.
                shown_columns = gr.CheckboxGroup(
                    choices=[
                        c
                        for c in COLS
                        if c
                        not in [
                            # AutoEvalColumn.dummy.name,
                            AutoEvalColumn.model.name,
                        ]
                    ],
                    value=[
                        c
                        for c in COLS_LITE
                        if c
                        not in [
                            # AutoEvalColumn.dummy.name,
                            AutoEvalColumn.model.name,
                        ]
                    ],
                    label="",
                    elem_id="column-select",
                    interactive=True,
                )
            # with gr.Column(min_width=780):
            with gr.Row():
                search_bar = gr.Textbox(
                    placeholder="🔍 Search for your model and press ENTER...",
                    show_label=False,
                    elem_id="search-bar",
                )
            # Visible table: the model column followed by the initially
            # selected columns.
            leaderboard_df = gr.components.Dataframe(
                value=df[
                    [
                        AutoEvalColumn.model.name,
                    ]
                    + shown_columns.value
                ],
                headers=[
                    AutoEvalColumn.model.name,
                ]
                + shown_columns.value,
                datatype=TYPES,
                elem_id="leaderboard-table",
                interactive=False,
            )

            # Invisible copy of the full table; it is the input the search and
            # column-selection callbacks filter from.
            hidden_leaderboard_df = gr.components.Dataframe(
                value=df,
                headers=COLS,
                datatype=["str" for _ in range(len(COLS))],
                visible=False,
            )
            # Submitting the search box re-filters the visible table.
            search_bar.submit(
                search_table,
                [hidden_leaderboard_df, leaderboard_df, search_bar],
                leaderboard_df,
            )
            # Changing the checkbox selection rebuilds the visible table.
            shown_columns.change(
                select_columns,
                [hidden_leaderboard_df, shown_columns],
                leaderboard_df,
            )
            # NOTE(review): these notes mention Win Rate, humaneval-python and a
            # `Submission PR` field, none of which are columns of
            # data/code_eval_board.csv - confirm whether the text is stale.
            gr.Markdown(
                """
**Notes:**
- Win Rate represents how often a model outperforms other models in each language, averaged across all languages.
- The scores of instruction-tuned models might be significantly higher on humaneval-python than other languages. We use the instruction format of HumanEval. For other languages, we use base MultiPL-E prompts.
- For more details check the 📝 About section.
- Models with a 🔴 symbol represent external evaluation submission, this means that we didn't verify the results, you can find the author's submission under `Submission PR` field from `See All Columns` tab.
""",
                elem_classes="markdown-text",
            )

        with gr.TabItem("📝 About", id=2):
            gr.Markdown(ABOUT_TEXT, elem_classes="markdown-text")

        with gr.TabItem("Submit results 🚀", id=3):
            gr.Markdown(SUBMISSION_TEXT_3)


# Runs at import time: there is no `if __name__ == "__main__"` guard.
demo.launch()
|
data/README.md
ADDED
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
---
|
2 |
+
title: Multilingual Code Evals
|
3 |
+
emoji: 📈
|
4 |
+
colorFrom: pink
|
5 |
+
colorTo: blue
|
6 |
+
sdk: gradio
|
7 |
+
sdk_version: 3.38.0
|
8 |
+
app_file: app.py
|
9 |
+
pinned: false
|
10 |
+
---
|
11 |
+
|
12 |
+
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
data/code_eval_board.csv
ADDED
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
Model,Links,Size (M),Training Config,Model Config,espnet version,pytorch version,WER (test-clean),WER (test-other),WER (dev-clean),WER (dev-other),CER (test-clean),CER (test-other),CER (dev-clean),CER (dev-other)
|
2 |
+
asr_whisper_medium_finetune_lr1e-5_adamw_wd1e-2_3epochs,https://huggingface.co/espnet/shihlun_asr_whisper_medium_finetuned_librispeech100,769,asr_whisper_medium_finetune_lr1e-5_adamw_wd1e-2_3epochs/train_config,asr_whisper_medium_finetune_lr1e-5_adamw_wd1e-2_3epochs/model_config,202211,1.12.1,2.7,5.6,2.6,5.3,1,2.4,2.3,1
|
3 |
+
asr_transformer_win400_hop160_ctc0.3_lr2e-3_warmup15k_timemask5_amp_no-deterministic,https://huggingface.co/pyf98/librispeech_100h_transformer,-,pyf98/librispeech_100h_transformer/train_config,pyf98/librispeech_100h_transformer/model_config,0.10.7a1,1.10.1,8.4,20.5,8.1,20.2,10.9,24.6,10.9,24.3
|
4 |
+
asr_conformer_win400_hop160_ctc0.3_lr2e-3_warmup15k_timemask5_amp_no-deterministic,https://huggingface.co/pyf98/librispeech_100h_conformer,-,pyf98/librispeech_100h_conformer,pyf98/librispeech_100h_conformer,0.10.6a1,1.10.1,6.5,17.3,6.3,17.4,2.5,8.4,2.5,8.7
|
5 |
+
pyf98/librispeech_100_transducer_conformer,https://huggingface.co/pyf98/librispeech_100_transducer_conformer,30.53,pyf98/librispeech_100_transducer_conformer,pyf98/librispeech_100_transducer_conformer,202301,1.13.1,6.9,18.1,6.6,17.9,0.6,1.9,0.7,2
|
6 |
+
pyf98/librispeech_100_transducer_e_branchformer,https://huggingface.co/pyf98/librispeech_100_transducer_e_branchformer,30.01,pyf98/librispeech_100_transducer_e_branchformer,pyf98/librispeech_100_transducer_e_branchformer,202301,1.13.1,6.8,18,6.6,17.6,0.7,2,0.7,2
|
7 |
+
pyf98/librispeech_100_ctc_e_branchformer,https://huggingface.co/pyf98/librispeech_100_ctc_e_branchformer,26.43,pyf98/librispeech_100_ctc_e_branchformer,pyf98/librispeech_100_ctc_e_branchformer,202211,1.12.1,9.6,23.1,9.2,22.4,1.2,3,1.1,3
|
8 |
+
pyf98/librispeech_100_e_branchformer,https://huggingface.co/pyf98/librispeech_100_e_branchformer,38.47,pyf98/librispeech_100_e_branchformer,pyf98/librispeech_100_e_branchformer,202209,1.12.1,6.3,17,6.1,16.7,2.5,8.2,2.4,8.3
|
9 |
+
Darshan7575/librispeech_100_multiconvformer_ctcatt_conv_fusion,https://huggingface.co/Darshan7575/librispeech_100_multiconvformer_ctcatt_conv_fusion,-,Darshan7575/librispeech_100_multiconvformer_ctcatt_conv_fusion,Darshan7575/librispeech_100_multiconvformer_ctcatt_conv_fusion,202304,2.1.2+cu118,6.2,17,5.9,16.6,2.4,8.2,2.3,8.4
|
requirements.txt
ADDED
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
1 |
+
transformers==4.32.1
|
2 |
+
huggingface-hub==0.16.4
|
src/__pycache__/css_html.cpython-310.pyc
ADDED
Binary file (1.44 kB). View file
|
|
src/__pycache__/text_content.cpython-310.pyc
ADDED
Binary file (7.97 kB). View file
|
|
src/__pycache__/utils.cpython-310.pyc
ADDED
Binary file (3.68 kB). View file
|
|
src/css_html.py
ADDED
@@ -0,0 +1,79 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# source: https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/blob/main/src/assets/css_html_js.py
|
2 |
+
custom_css = """
|
3 |
+
#changelog-text {
|
4 |
+
font-size: 16px !important;
|
5 |
+
}
|
6 |
+
|
7 |
+
#changelog-text h2 {
|
8 |
+
font-size: 18px !important;
|
9 |
+
}
|
10 |
+
|
11 |
+
.markdown-text {
|
12 |
+
font-size: 16px !important;
|
13 |
+
}
|
14 |
+
|
15 |
+
#models-to-add-text {
|
16 |
+
font-size: 18px !important;
|
17 |
+
}
|
18 |
+
|
19 |
+
#citation-button span {
|
20 |
+
font-size: 16px !important;
|
21 |
+
}
|
22 |
+
|
23 |
+
#citation-button textarea {
|
24 |
+
font-size: 16px !important;
|
25 |
+
}
|
26 |
+
|
27 |
+
#citation-button > label > button {
|
28 |
+
margin: 6px;
|
29 |
+
transform: scale(1.3);
|
30 |
+
}
|
31 |
+
|
32 |
+
#leaderboard-table {
|
33 |
+
margin-top: 15px
|
34 |
+
}
|
35 |
+
|
36 |
+
#leaderboard-table-lite {
|
37 |
+
margin-top: 15px
|
38 |
+
}
|
39 |
+
|
40 |
+
#search-bar-table-box > div:first-child {
|
41 |
+
background: none;
|
42 |
+
border: none;
|
43 |
+
}
|
44 |
+
|
45 |
+
#search-bar {
|
46 |
+
padding: 0px;
|
47 |
+
}
|
48 |
+
|
49 |
+
/* Hides the final AutoEvalColumn */
|
50 |
+
#llm-benchmark-tab-table table td:last-child,
|
51 |
+
#llm-benchmark-tab-table table th:last-child {
|
52 |
+
display: none;
|
53 |
+
}
|
54 |
+
|
55 |
+
/* Limit the width of the first AutoEvalColumn so that names don't expand too much */
|
56 |
+
table td:first-child,
|
57 |
+
table th:first-child {
|
58 |
+
max-width: 400px;
|
59 |
+
overflow: auto;
|
60 |
+
white-space: nowrap;
|
61 |
+
}
|
62 |
+
|
63 |
+
.tab-buttons button {
|
64 |
+
font-size: 20px;
|
65 |
+
}
|
66 |
+
|
67 |
+
#scale-logo {
|
68 |
+
border-style: none !important;
|
69 |
+
box-shadow: none;
|
70 |
+
display: block;
|
71 |
+
margin-left: auto;
|
72 |
+
margin-right: auto;
|
73 |
+
max-width: 600px;
|
74 |
+
}
|
75 |
+
|
76 |
+
#scale-logo .download {
|
77 |
+
display: none;
|
78 |
+
}
|
79 |
+
"""
|
src/text_content.py
ADDED
@@ -0,0 +1,95 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
ABOUT_TEXT = """# Context
|
2 |
+
The growing number of code models released by the community necessitates a comprehensive evaluation to reliably benchmark their capabilities. Similar to the [🤗 Open LLM Leaderboard](https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard), we selected two common benchmarks for evaluating Code LLMs on multiple programming languages:
|
3 |
+
|
4 |
+
- **[HumanEval](https://huggingface.co/datasets/openai_humaneval)** - benchmark for measuring functional correctness for synthesizing programs from docstrings. It consists of 164 Python programming problems.
|
5 |
+
- **[MultiPL-E](https://huggingface.co/datasets/nuprl/MultiPL-E)** - Translation of HumanEval to 18 programming languages.
|
6 |
+
|
7 |
+
- **Throughput Measurement** - In addition to these benchmarks, we also measure model throughput on a batch size of 1 and 50 to compare their inference speed.
|
8 |
+
|
9 |
+
|
10 |
+
### Benchmarks & Prompts
|
11 |
+
- HumanEval-Python reports the pass@1 on HumanEval; the rest is from MultiPL-E benchmark.
|
12 |
+
- For all languages, we use the original benchmark prompts for all models except HumanEval-Python, where we separate base from instruction models. We use the original code completion prompts for HumanEval for all base models, but for Instruction models, we use the Instruction version of HumanEval in [HumanEvalSynthesize](https://huggingface.co/datasets/bigcode/humanevalpack) delimited by the tokens/text recommended by the authors of each model (we also use a max generation length of 2048 instead of 512).
|
13 |
+
|
14 |
+
Figure below shows the example of OctoCoder vs Base HumanEval prompt, you can find the other prompts [here](https://github.com/bigcode-project/bigcode-evaluation-harness/blob/main/bigcode_eval/tasks/humanevalpack.py).
|
15 |
+
|
16 |
+
<img src="https://huggingface.co/datasets/loubnabnl/repo-images/resolve/main/humaneval_instruct.png" alt="OctoCoder vs Base HumanEval prompt" width="800px">
|
17 |
+
- An exception to this is the Phind models. They seem to follow base prompts better than the instruction versions. Therefore, following the authors' recommendation, we use base HumanEval prompts without stripping them of the last newline.
|
18 |
+
- Also note that for WizardCoder-Python-34B-V1.0 & WizardCoder-Python-13B-V1.0 (CodeLLaMa based), we use the HumanEval-Python instruction prompt that the original authors used with their postprocessing (instead of HumanEvalSynthesize), code is available [here](https://github.com/bigcode-project/bigcode-evaluation-harness/pull/133).
|
19 |
+
|
20 |
+
### Evaluation Parameters
|
21 |
+
- All models were evaluated with the [bigcode-evaluation-harness](https://github.com/bigcode-project/bigcode-evaluation-harness/tree/main) with top-p=0.95, temperature=0.2, max_length_generation 512, and n_samples=50.
|
22 |
+
|
23 |
+
### Throughput and Memory Usage
|
24 |
+
- Throughputs and peak memory usage are measured using [Optimum-Benchmark](https://github.com/huggingface/optimum-benchmark/tree/main) which powers [Open LLM-Perf Leaderboard](https://huggingface.co/spaces/optimum/llm-perf-leaderboard). (0 throughput corresponds to OOM).
|
25 |
+
|
26 |
+
|
27 |
+
### Scoring and Rankings
|
28 |
+
- Average score is the average pass@1 over all languages. For Win Rate, we find model rank for each language and compute `num_models - (rank -1)`, then average this result over all languages.
|
29 |
+
|
30 |
+
### Miscellaneous
|
31 |
+
- #Languages column represents the number of programming languages included during the pretraining. UNK means the number of languages is unknown.
|
32 |
+
"""
|
33 |
+
|
34 |
+
SUBMISSION_TEXT = """
|
35 |
+
<h1 align="center">
|
36 |
+
How to submit models/results to the leaderboard?
|
37 |
+
</h1>
|
38 |
+
We welcome the community to submit evaluation results of new models. We also provide an experimental feature for submitting models that our team will evaluate on the 🤗 cluster.
|
39 |
+
|
40 |
+
## Submitting Models (experimental feature)
|
41 |
+
Inspired from the Open LLM Leaderboard, we welcome code models submission from the community that will be automatically evaluated. Please note that this is still an experimental feature.
|
42 |
+
Below are some guidelines to follow before submitting your model:
|
43 |
+
|
44 |
+
#### 1) Make sure you can load your model and tokenizer using AutoClasses:
|
45 |
+
```python
|
46 |
+
from transformers import AutoConfig, AutoModel, AutoTokenizer
|
47 |
+
config = AutoConfig.from_pretrained("your model name", revision=revision)
|
48 |
+
model = AutoModel.from_pretrained("your model name", revision=revision)
|
49 |
+
tokenizer = AutoTokenizer.from_pretrained("your model name", revision=revision)
|
50 |
+
```
|
51 |
+
If this step fails, follow the error messages to debug your model before submitting it. It's likely your model has been improperly uploaded.
|
52 |
+
Note: make sure your model is public!
|
53 |
+
Note: if your model needs `trust_remote_code=True`, we do not support this option yet.
|
54 |
+
#### 2) Convert your model weights to [safetensors](https://huggingface.co/docs/safetensors/index)
|
55 |
+
It's a new format for storing weights which is safer and faster to load and use. It will also allow us to add the number of parameters of your model to the `Extended Viewer`!
|
56 |
+
#### 3) Make sure your model has an open license!
|
57 |
+
This is a leaderboard for Open LLMs, and we'd love for as many people as possible to know they can use your model 🤗
|
58 |
+
#### 4) Fill up your model card
|
59 |
+
When we add extra information about models to the leaderboard, it will be automatically taken from the model card.
|
60 |
+
"""
|
61 |
+
|
62 |
+
SUBMISSION_TEXT_2 = """
|
63 |
+
## Submitting Results
|
64 |
+
You also have the option for running evaluation yourself and submitting results. These results will be added as non-verified, the authors are however required to upload their generations in case other members want to check.
|
65 |
+
|
66 |
+
### 1 - Running Evaluation
|
67 |
+
|
68 |
+
We wrote a detailed guide for running the evaluation on your model. You can find it in [bigcode-evaluation-harness/leaderboard](https://github.com/bigcode-project/bigcode-evaluation-harness/tree/main/leaderboard). This will generate a json file summarizing the results, in addition to the raw generations and metric files.
|
69 |
+
|
70 |
+
### 2- Submitting Results 🚀
|
71 |
+
|
72 |
+
To submit your results create a **Pull Request** in the community tab to add them under the [folder](https://huggingface.co/spaces/bigcode/multilingual-code-evals/tree/main/community_results) `community_results` in this repository:
|
73 |
+
- Create a folder called `ORG_MODELNAME_USERNAME` for example `bigcode_starcoder_loubnabnl`
|
74 |
+
- Put your json file with grouped scores from the guide, in addition generations folder and metrics folder in it.
|
75 |
+
|
76 |
+
The title of the PR should be `[Community Submission] Model: org/model, Username: your_username`, replace org and model with those corresponding to the model you evaluated.
|
77 |
+
"""
|
78 |
+
SUBMISSION_TEXT_3 = """
|
79 |
+
<h1 align="center">
|
80 |
+
How to submit models/results to the leaderboard?
|
81 |
+
</h1>
|
82 |
+
We welcome the community to submit evaluation results of new models. These results will be added as non-verified, the authors are however required to upload their generations in case other members want to check.
|
83 |
+
|
84 |
+
### 1 - Running Evaluation
|
85 |
+
|
86 |
+
We wrote a detailed guide for running the evaluation on your model. You can find it in [bigcode-evaluation-harness/leaderboard](https://github.com/bigcode-project/bigcode-evaluation-harness/tree/main/leaderboard). This will generate a json file summarizing the results, in addition to the raw generations and metric files.
|
87 |
+
|
88 |
+
### 2- Submitting Results 🚀
|
89 |
+
|
90 |
+
To submit your results create a **Pull Request** in the community tab to add them under the [folder](https://huggingface.co/spaces/bigcode/multilingual-code-evals/tree/main/community_results) `community_results` in this repository:
|
91 |
+
- Create a folder called `ORG_MODELNAME_USERNAME` for example `bigcode_starcoder_loubnabnl`
|
92 |
+
- Put your json file with grouped scores from the guide, in addition generations folder and metrics folder in it.
|
93 |
+
|
94 |
+
The title of the PR should be `[Community Submission] Model: org/model, Username: your_username`, replace org and model with those corresponding to the model you evaluated.
|
95 |
+
"""
|
src/utils.py
ADDED
@@ -0,0 +1,86 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# source: https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/blob/main/src/utils_display.py
|
2 |
+
from dataclasses import dataclass
|
3 |
+
from transformers import AutoConfig
|
4 |
+
|
5 |
+
# These classes are for user facing column names, to avoid having to change them
|
6 |
+
# all around the code when a modification is needed
|
7 |
+
@dataclass
class ColumnContent:
    """Describe one leaderboard column: its header, its type and its visibility flags."""

    # Column header text, e.g. "Model".
    name: str
    # Type label for the column; the values used in this file are
    # "markdown", "str" and "number".
    type: str
    # Read by app.py to build COLS_LITE / TYPES_LITE.
    displayed_by_default: bool
    # When True, app.py leaves the column out of COLS / TYPES.
    hidden: bool = False
|
13 |
+
|
14 |
+
|
15 |
+
def fields(raw_class):
    """Return the values of the non-dunder attributes defined on ``raw_class``.

    Only entries of the class's own ``__dict__`` are considered, in the order
    they are stored there. Any name that starts or ends with a double
    underscore is skipped.
    """
    collected = []
    for attr_name, attr_value in raw_class.__dict__.items():
        if attr_name.startswith("__") or attr_name.endswith("__"):
            continue
        collected.append(attr_value)
    return collected
|
19 |
+
|
20 |
+
|
21 |
+
@dataclass(frozen=True)
class AutoEvalColumn:  # Auto evals column
    """Registry of the leaderboard columns, one ``ColumnContent`` per attribute.

    The attributes carry no type annotations, so they are plain class
    attributes rather than dataclass fields. ``fields(AutoEvalColumn)``
    returns them in the order they are declared below.
    """

    # Column types used below: "str", "markdown", "number".
    # ColumnContent(column name, type, displayed_by_default)
    model = ColumnContent("Model", "markdown", True)
    model_size = ColumnContent("Size (M)", "number", True)
    train_config = ColumnContent("Training Config", "str", True)
    model_config = ColumnContent("Model Config", "str", True)
    espnet_version = ColumnContent("espnet version", "str", True)
    pytorch_version = ColumnContent("pytorch version", "str", True)
    # Word error rate on the four evaluation splits.
    wer_test_clean = ColumnContent("WER (test-clean)", "number", True)
    wer_test_other = ColumnContent("WER (test-other)", "number", True)
    wer_dev_clean = ColumnContent("WER (dev-clean)", "number", True)
    wer_dev_other = ColumnContent("WER (dev-other)", "number", True)
    # Character error rate on the same splits.
    cer_test_clean = ColumnContent("CER (test-clean)", "number", True)
    cer_test_other = ColumnContent("CER (test-other)", "number", True)
    cer_dev_clean = ColumnContent("CER (dev-clean)", "number", True)
    cer_dev_other = ColumnContent("CER (dev-other)", "number", True)
|
40 |
+
|
41 |
+
|
42 |
+
def model_hyperlink(link, model_name):
    """Return an HTML anchor that opens ``link`` in a new tab and shows ``model_name``."""
    anchor_style = (
        "color: var(--link-text-color); "
        "text-decoration: underline;"
        "text-decoration-style: dotted;"
    )
    template = '<a target="_blank" href="{}" style="{}">{}</a>'
    return template.format(link, anchor_style, model_name)
|
44 |
+
|
45 |
+
|
46 |
+
def make_clickable_names(df):
    """Replace each "Model" value with an HTML link built from the row's "Links" value.

    The "Model" column of ``df`` is overwritten in place and the same
    DataFrame is returned.
    """

    def _linked_name(row):
        # Each row supplies both the link target and the text to display.
        return model_hyperlink(row["Links"], row["Model"])

    df["Model"] = df.apply(_linked_name, axis=1)
    return df
|
51 |
+
|
52 |
+
|
53 |
+
def styled_error(error):
    """Wrap ``error`` in a centered, red, 20px HTML paragraph."""
    template = "<p style='color: red; font-size: 20px; text-align: center;'>{}</p>"
    return template.format(error)
|
55 |
+
|
56 |
+
|
57 |
+
def styled_warning(warn):
    """Wrap ``warn`` in a centered, orange, 20px HTML paragraph."""
    template = "<p style='color: orange; font-size: 20px; text-align: center;'>{}</p>"
    return template.format(warn)
|
59 |
+
|
60 |
+
|
61 |
+
def styled_message(message):
    """Wrap ``message`` in a centered, green, 20px HTML paragraph."""
    template = "<p style='color: green; font-size: 20px; text-align: center;'>{}</p>"
    return template.format(message)
|
63 |
+
|
64 |
+
|
65 |
+
def has_no_nan_values(df, columns):
    """Return a boolean Series that is True for rows with no missing value in ``columns``."""
    subset = df[columns]
    # A row is complete exactly when none of its selected cells is missing.
    row_has_missing = subset.isna().any(axis=1)
    return ~row_has_missing
|
67 |
+
|
68 |
+
|
69 |
+
def has_nan_values(df, columns):
    """Return a boolean Series that is True for rows with a missing value in ``columns``."""
    missing_cells = df[columns].isna()
    return missing_cells.any(axis="columns")
|
71 |
+
|
72 |
+
|
73 |
+
def is_model_on_hub(model_name: str, revision: str) -> "tuple[bool, str | None]":
    """Check that the config of ``model_name`` loads without remote code.

    Args:
        model_name: Model identifier passed to ``AutoConfig.from_pretrained``.
        revision: Revision passed to ``AutoConfig.from_pretrained``.

    Returns:
        A ``(found, message)`` pair. ``(True, None)`` when the config loads.
        ``(False, reason)`` otherwise, where ``reason`` is a sentence
        fragment describing the failure.
    """
    try:
        AutoConfig.from_pretrained(model_name, revision=revision, trust_remote_code=False)
    except ValueError:
        # A ValueError is reported as the model requiring trust_remote_code.
        return (
            False,
            "needs to be launched with `trust_remote_code=True`. For safety reason, we do not allow these models to be automatically submitted to the leaderboard.",
        )
    except Exception as e:
        # Best-effort check: any other failure is logged and reported as "not found".
        print(f"Could not get the model config from the hub.: {e}")
        return False, "was not found on hub!"
    else:
        return True, None
|