ms180 committed
Commit 068a50e · 0 Parent(s)

initial commit
.gitattributes ADDED
@@ -0,0 +1,35 @@
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.npy filter=lfs diff=lfs merge=lfs -text
+ *.npz filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pickle filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tar filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zst filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,78 @@
+ ---
+ title: Big Code Models Leaderboard
+ emoji: 📈
+ colorFrom: pink
+ colorTo: blue
+ sdk: gradio
+ sdk_version: 4.36.1
+ app_file: app.py
+ disable_embedding: true
+ pinned: false
+ tags:
+ - leaderboard
+ - eval:code
+ - test:public
+ - judge:auto
+ - submission:semiautomatic
+ models:
+ - WizardLM/WizardCoder-15B-V1.0
+ - bigcode/octocoder
+ - bigcode/octogeex
+ - stabilityai/stablecode-completion-alpha-3b
+ - bigcode/starcoder
+ - bigcode/starcoderbase
+ - bigcode/starcoderbase-7b
+ - bigcode/starcoderbase-3b
+ - bigcode/starcoderbase-1b
+ - bigcode/santacoder
+ - replit/replit-code-v1-3b
+ - THUDM/codegeex2-6b
+ - Salesforce/codegen25-7b-multi
+ - Salesforce/codegen25-7b-mono
+ - Salesforce/codegen-16B-multi
+ - Deci/DeciCoder-1b
+ - codellama/CodeLlama-7b-hf
+ - codellama/CodeLlama-7b-Python-hf
+ - codellama/CodeLlama-7b-Instruct-hf
+ - codellama/CodeLlama-13b-hf
+ - codellama/CodeLlama-13b-Python-hf
+ - codellama/CodeLlama-13b-Instruct-hf
+ - codellama/CodeLlama-34b-hf
+ - codellama/CodeLlama-34b-Python-hf
+ - codellama/CodeLlama-34b-Instruct-hf
+ - phind/Phind-CodeLlama-34B-v2
+ - phind/Phind-CodeLlama-34B-v1
+ - phind/Phind-CodeLlama-34B-Python-v1
+ - WizardLM/WizardCoder-Python-34B-V1.0
+ - WizardLM/WizardCoder-Python-13B-V1.0
+ - WizardLM/WizardCoder-3B-V1.0
+ - WizardLM/WizardCoder-1B-V1.0
+ - tiiuae/falcon-180B
+ - smallcloudai/Refact-1_6B-fim
+ - microsoft/phi-1
+ - WisdomShell/CodeShell-7B
+ - deepseek-ai/deepseek-coder-6.7b-base
+ - deepseek-ai/deepseek-coder-1.3b-base
+ - deepseek-ai/deepseek-coder-33b-base
+ - deepseek-ai/deepseek-coder-6.7b-instruct
+ - deepseek-ai/deepseek-coder-33b-instruct
+ - codefuse-ai/CodeFuse-DeepSeek-33B
+ - codellama/CodeLlama-70b-Instruct-hf
+ - codellama/CodeLlama-70b-hf
+ - codellama/CodeLlama-70b-Python-hf
+ - bigcode/starcoder2-15b
+ - bigcode/starcoder2-7b
+ - bigcode/starcoder2-3b
+ - stabilityai/stable-code-3b
+ - m-a-p/OpenCodeInterpreter-DS-33B
+ - m-a-p/OpenCodeInterpreter-DS-6.7B
+ - google/codegemma-7b
+ - google/codegemma-7b-it
+ - google/codegemma-2b
+ - Qwen/CodeQwen1.5-7B-Chat
+ - Qwen/CodeQwen1.5-7B
+ - NTQAI/Nxcode-CQ-7B-orpo
+ - Artigenz/Artigenz-Coder-DS-6.7B
+ - Qwen/Qwen2.5-Coder-32B
+ - Qwen/Qwen2.5-Coder-32B-Instruct
+ ---
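
The front matter above is the Hugging Face Space card for this app: `sdk`/`sdk_version` pin the Gradio runtime, and `models` lists the checkpoints referenced by the Space. As a rough illustration of how that card can be read programmatically (not part of this commit, and assuming PyYAML is available), a minimal sketch:

```python
# Hypothetical helper, not part of this repository: parse the Space card's
# YAML front matter (the text between the first two `---` markers of README.md).
import yaml  # assumes PyYAML is installed

with open("README.md", encoding="utf-8") as f:
    front_matter = yaml.safe_load(f.read().split("---")[1])

print(front_matter["sdk"], front_matter["sdk_version"])      # e.g. gradio 4.36.1
print(len(front_matter.get("models", [])), "models listed")  # entries under `models:`
```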
app.py ADDED
@@ -0,0 +1,151 @@
+ # some code blocks are taken from https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/tree/main
+ import json
+ import os
+ from datetime import datetime, timezone
+
+ import gradio as gr
+ import pandas as pd
+
+ from src.css_html import custom_css
+ from src.text_content import ABOUT_TEXT, SUBMISSION_TEXT_3
+ from src.utils import (
+     AutoEvalColumn,
+     fields,
+     is_model_on_hub,
+     make_clickable_names,
+ )
+
+ df = pd.read_csv("data/code_eval_board.csv")
+
+ COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden]
+ TYPES = [c.type for c in fields(AutoEvalColumn) if not c.hidden]
+ COLS_LITE = [
+     c.name for c in fields(AutoEvalColumn) if c.displayed_by_default and not c.hidden
+ ]
+ TYPES_LITE = [
+     c.type for c in fields(AutoEvalColumn) if c.displayed_by_default and not c.hidden
+ ]
+
+
+ def select_columns(df, columns):
+     always_here_cols = [
+         AutoEvalColumn.model.name,
+     ]
+     # We use COLS to maintain sorting
+     filtered_df = df[
+         always_here_cols + [c for c in COLS if c in df.columns and c in columns]
+     ]
+     return filtered_df
+
+
+ def filter_items(df, leaderboard_table, query):
+     if query == "all":
+         return df[leaderboard_table.columns]
+     else:
+         query = query[0]
+     filtered_df = df[df["T"].str.contains(query, na=False)]
+     return filtered_df[leaderboard_table.columns]
+
+
+ def search_table(df, leaderboard_table, query):
+     filtered_df = df[(df["Model"].str.contains(query, case=False))]
+     return filtered_df[leaderboard_table.columns]
+
+
+ df = make_clickable_names(df)
+
+ demo = gr.Blocks(css=custom_css)
+ with demo:
+     with gr.Row():
+         gr.Markdown(
+             """<div style="text-align: center;"><h1> ESPnet-EZ Leaderboard for LibriSpeech-100h ASR1</h1></div>\
+ <br>\
+ <p>Users can use the <code>reproduce</code> function to reproduce the numbers in ESPnet-EZ!</p>
+ """,
+             elem_classes="markdown-text",
+         )
+
+     with gr.Tabs(elem_classes="tab-buttons") as tabs:
+         with gr.TabItem("🔍 Evaluation table", id=0):
+             with gr.Accordion("➡️ See All Columns", open=False):
+                 shown_columns = gr.CheckboxGroup(
+                     choices=[
+                         c
+                         for c in COLS
+                         if c
+                         not in [
+                             # AutoEvalColumn.dummy.name,
+                             AutoEvalColumn.model.name,
+                         ]
+                     ],
+                     value=[
+                         c
+                         for c in COLS_LITE
+                         if c
+                         not in [
+                             # AutoEvalColumn.dummy.name,
+                             AutoEvalColumn.model.name,
+                         ]
+                     ],
+                     label="",
+                     elem_id="column-select",
+                     interactive=True,
+                 )
+             # with gr.Column(min_width=780):
+             with gr.Row():
+                 search_bar = gr.Textbox(
+                     placeholder="🔍 Search for your model and press ENTER...",
+                     show_label=False,
+                     elem_id="search-bar",
+                 )
+             leaderboard_df = gr.components.Dataframe(
+                 value=df[
+                     [
+                         AutoEvalColumn.model.name,
+                     ]
+                     + shown_columns.value
+                 ],
+                 headers=[
+                     AutoEvalColumn.model.name,
+                 ]
+                 + shown_columns.value,
+                 datatype=TYPES,
+                 elem_id="leaderboard-table",
+                 interactive=False,
+             )
+
+             hidden_leaderboard_df = gr.components.Dataframe(
+                 value=df,
+                 headers=COLS,
+                 datatype=["str" for _ in range(len(COLS))],
+                 visible=False,
+             )
+             search_bar.submit(
+                 search_table,
+                 [hidden_leaderboard_df, leaderboard_df, search_bar],
+                 leaderboard_df,
+             )
+             shown_columns.change(
+                 select_columns,
+                 [hidden_leaderboard_df, shown_columns],
+                 leaderboard_df,
+             )
+             gr.Markdown(
+                 """
+ **Notes:**
+ - Win Rate represents how often a model outperforms other models in each language, averaged across all languages.
+ - The scores of instruction-tuned models might be significantly higher on humaneval-python than on other languages. We use the instruction format of HumanEval. For other languages, we use base MultiPL-E prompts.
+ - For more details, check the 📝 About section.
+ - Models with a 🔴 symbol represent external evaluation submissions; this means that we didn't verify the results. You can find the author's submission under the `Submission PR` field in the `See All Columns` tab.
+ """,
+                 elem_classes="markdown-text",
+             )
+
+         with gr.TabItem("📝 About", id=2):
+             gr.Markdown(ABOUT_TEXT, elem_classes="markdown-text")
+
+         with gr.TabItem("Submit results 🚀", id=3):
+             gr.Markdown(SUBMISSION_TEXT_3)
+
+
+ demo.launch()
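
app.py wires two small pandas helpers (`search_table` and `select_columns`) to the Gradio widgets: the visible table is recomputed from the hidden full DataFrame on every search or checkbox change. A self-contained sketch of that search behaviour, using a made-up two-row frame rather than the real CSV:

```python
# Illustrative only: mirrors the case-insensitive substring match in
# app.py's search_table, without the Gradio components around it.
import pandas as pd

toy = pd.DataFrame(
    {
        "Model": ["librispeech_100h_conformer", "librispeech_100h_transformer"],
        "WER (test-clean)": [6.5, 8.4],
    }
)

def search_table(df: pd.DataFrame, query: str) -> pd.DataFrame:
    # keep rows whose Model name contains the query, ignoring case
    return df[df["Model"].str.contains(query, case=False)]

print(search_table(toy, "conformer"))  # keeps only the conformer row
```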
data/README.md ADDED
@@ -0,0 +1,12 @@
+ ---
+ title: Multilingual Code Evals
+ emoji: 📈
+ colorFrom: pink
+ colorTo: blue
+ sdk: gradio
+ sdk_version: 3.38.0
+ app_file: app.py
+ pinned: false
+ ---
+
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
data/code_eval_board.csv ADDED
@@ -0,0 +1,9 @@
+ Model,Links,Size (M),Training Config,Model Config,espnet version,pytorch version,WER (test-clean),WER (test-other),WER (dev-clean),WER (dev-other),CER (test-clean),CER (test-other),CER (dev-clean),CER (dev-other)
+ asr_whisper_medium_finetune_lr1e-5_adamw_wd1e-2_3epochs,https://huggingface.co/espnet/shihlun_asr_whisper_medium_finetuned_librispeech100,769,asr_whisper_medium_finetune_lr1e-5_adamw_wd1e-2_3epochs/train_config,asr_whisper_medium_finetune_lr1e-5_adamw_wd1e-2_3epochs/model_config,202211,1.12.1,2.7,5.6,2.6,5.3,1,2.4,2.3,1
+ asr_transformer_win400_hop160_ctc0.3_lr2e-3_warmup15k_timemask5_amp_no-deterministic,https://huggingface.co/pyf98/librispeech_100h_transformer,-,pyf98/librispeech_100h_transformer/train_config,pyf98/librispeech_100h_transformer/model_config,0.10.7a1,1.10.1,8.4,20.5,8.1,20.2,10.9,24.6,10.9,24.3
+ asr_conformer_win400_hop160_ctc0.3_lr2e-3_warmup15k_timemask5_amp_no-deterministic,https://huggingface.co/pyf98/librispeech_100h_conformer,-,pyf98/librispeech_100h_conformer,pyf98/librispeech_100h_conformer,0.10.6a1,1.10.1,6.5,17.3,6.3,17.4,2.5,8.4,2.5,8.7
+ pyf98/librispeech_100_transducer_conformer,https://huggingface.co/pyf98/librispeech_100_transducer_conformer,30.53,pyf98/librispeech_100_transducer_conformer,pyf98/librispeech_100_transducer_conformer,202301,1.13.1,6.9,18.1,6.6,17.9,0.6,1.9,0.7,2
+ pyf98/librispeech_100_transducer_e_branchformer,https://huggingface.co/pyf98/librispeech_100_transducer_e_branchformer,30.01,pyf98/librispeech_100_transducer_e_branchformer,pyf98/librispeech_100_transducer_e_branchformer,202301,1.13.1,6.8,18,6.6,17.6,0.7,2,0.7,2
+ pyf98/librispeech_100_ctc_e_branchformer,https://huggingface.co/pyf98/librispeech_100_ctc_e_branchformer,26.43,pyf98/librispeech_100_ctc_e_branchformer,pyf98/librispeech_100_ctc_e_branchformer,202211,1.12.1,9.6,23.1,9.2,22.4,1.2,3,1.1,3
+ pyf98/librispeech_100_e_branchformer,https://huggingface.co/pyf98/librispeech_100_e_branchformer,38.47,pyf98/librispeech_100_e_branchformer,pyf98/librispeech_100_e_branchformer,202209,1.12.1,6.3,17,6.1,16.7,2.5,8.2,2.4,8.3
+ Darshan7575/librispeech_100_multiconvformer_ctcatt_conv_fusion,https://huggingface.co/Darshan7575/librispeech_100_multiconvformer_ctcatt_conv_fusion,-,Darshan7575/librispeech_100_multiconvformer_ctcatt_conv_fusion,Darshan7575/librispeech_100_multiconvformer_ctcatt_conv_fusion,202304,2.1.2+cu118,6.2,17,5.9,16.6,2.4,8.2,2.3,8.4
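
This CSV is the single data source read by app.py (`pd.read_csv("data/code_eval_board.csv")`). A small sketch, assuming it is run from the repository root, that ranks the committed systems by test-clean WER:

```python
# Sketch: load the leaderboard data committed above and sort it by WER.
# Column names are taken from the CSV header row.
import pandas as pd

board = pd.read_csv("data/code_eval_board.csv")
ranked = board.sort_values("WER (test-clean)")
print(ranked[["Model", "WER (test-clean)", "WER (test-other)"]].to_string(index=False))
```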
requirements.txt ADDED
@@ -0,0 +1,2 @@
+ transformers==4.32.1
+ huggingface-hub==0.16.4
src/__pycache__/css_html.cpython-310.pyc ADDED
Binary file (1.44 kB).
src/__pycache__/text_content.cpython-310.pyc ADDED
Binary file (7.97 kB).
src/__pycache__/utils.cpython-310.pyc ADDED
Binary file (3.68 kB).
src/css_html.py ADDED
@@ -0,0 +1,79 @@
+ # source: https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/blob/main/src/assets/css_html_js.py
+ custom_css = """
+ #changelog-text {
+     font-size: 16px !important;
+ }
+
+ #changelog-text h2 {
+     font-size: 18px !important;
+ }
+
+ .markdown-text {
+     font-size: 16px !important;
+ }
+
+ #models-to-add-text {
+     font-size: 18px !important;
+ }
+
+ #citation-button span {
+     font-size: 16px !important;
+ }
+
+ #citation-button textarea {
+     font-size: 16px !important;
+ }
+
+ #citation-button > label > button {
+     margin: 6px;
+     transform: scale(1.3);
+ }
+
+ #leaderboard-table {
+     margin-top: 15px
+ }
+
+ #leaderboard-table-lite {
+     margin-top: 15px
+ }
+
+ #search-bar-table-box > div:first-child {
+     background: none;
+     border: none;
+ }
+
+ #search-bar {
+     padding: 0px;
+ }
+
+ /* Hides the final AutoEvalColumn */
+ #llm-benchmark-tab-table table td:last-child,
+ #llm-benchmark-tab-table table th:last-child {
+     display: none;
+ }
+
+ /* Limit the width of the first AutoEvalColumn so that names don't expand too much */
+ table td:first-child,
+ table th:first-child {
+     max-width: 400px;
+     overflow: auto;
+     white-space: nowrap;
+ }
+
+ .tab-buttons button {
+     font-size: 20px;
+ }
+
+ #scale-logo {
+     border-style: none !important;
+     box-shadow: none;
+     display: block;
+     margin-left: auto;
+     margin-right: auto;
+     max-width: 600px;
+ }
+
+ #scale-logo .download {
+     display: none;
+ }
+ """
src/text_content.py ADDED
@@ -0,0 +1,95 @@
+ ABOUT_TEXT = """# Context
+ The growing number of code models released by the community necessitates a comprehensive evaluation to reliably benchmark their capabilities. Similar to the [🤗 Open LLM Leaderboard](https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard), we selected two common benchmarks for evaluating Code LLMs on multiple programming languages:
+
+ - **[HumanEval](https://huggingface.co/datasets/openai_humaneval)** - benchmark for measuring functional correctness for synthesizing programs from docstrings. It consists of 164 Python programming problems.
+ - **[MultiPL-E](https://huggingface.co/datasets/nuprl/MultiPL-E)** - translation of HumanEval to 18 programming languages.
+
+ - **Throughput Measurement** - In addition to these benchmarks, we also measure model throughput at batch sizes of 1 and 50 to compare inference speed.
+
+
+ ### Benchmarks & Prompts
+ - HumanEval-Python reports the pass@1 on HumanEval; the rest is from the MultiPL-E benchmark.
+ - For all languages, we use the original benchmark prompts for all models except HumanEval-Python, where we separate base from instruction models. We use the original code completion prompts for HumanEval for all base models, but for instruction models, we use the instruction version of HumanEval in [HumanEvalSynthesize](https://huggingface.co/datasets/bigcode/humanevalpack) delimited by the tokens/text recommended by the authors of each model (we also use a max generation length of 2048 instead of 512).
+
+ The figure below shows an example of the OctoCoder vs. base HumanEval prompt; you can find the other prompts [here](https://github.com/bigcode-project/bigcode-evaluation-harness/blob/main/bigcode_eval/tasks/humanevalpack.py).
+
+ <img src="https://huggingface.co/datasets/loubnabnl/repo-images/resolve/main/humaneval_instruct.png" alt="OctoCoder vs Base HumanEval prompt" width="800px">
+ - An exception to this is the Phind models: they seem to follow base prompts better than the instruction versions. Therefore, following the authors' recommendation, we use base HumanEval prompts without stripping them of the last newline.
+ - Also note that for WizardCoder-Python-34B-V1.0 & WizardCoder-Python-13B-V1.0 (CodeLLaMa based), we use the HumanEval-Python instruction prompt that the original authors used with their postprocessing (instead of HumanEvalSynthesize); the code is available [here](https://github.com/bigcode-project/bigcode-evaluation-harness/pull/133).
+
+ ### Evaluation Parameters
+ - All models were evaluated with the [bigcode-evaluation-harness](https://github.com/bigcode-project/bigcode-evaluation-harness/tree/main) with top-p=0.95, temperature=0.2, max_length_generation 512, and n_samples=50.
+
+ ### Throughput and Memory Usage
+ - Throughputs and peak memory usage are measured with [Optimum-Benchmark](https://github.com/huggingface/optimum-benchmark/tree/main), which powers the [Open LLM-Perf Leaderboard](https://huggingface.co/spaces/optimum/llm-perf-leaderboard). (A throughput of 0 corresponds to OOM.)
+
+
+ ### Scoring and Rankings
+ - Average score is the average pass@1 over all languages. For Win Rate, we find the model rank for each language and compute `num_models - (rank - 1)`, then average this result over all languages.
+
+ ### Miscellaneous
+ - The #Languages column represents the number of programming languages included during pretraining. UNK means the number of languages is unknown.
+ """
+
+ SUBMISSION_TEXT = """
+ <h1 align="center">
+ How to submit models/results to the leaderboard?
+ </h1>
+ We welcome the community to submit evaluation results of new models. We also provide an experimental feature for submitting models that our team will evaluate on the 🤗 cluster.
+
+ ## Submitting Models (experimental feature)
+ Inspired by the Open LLM Leaderboard, we welcome code model submissions from the community that will be automatically evaluated. Please note that this is still an experimental feature.
+ Below are some guidelines to follow before submitting your model:
+
+ #### 1) Make sure you can load your model and tokenizer using AutoClasses:
+ ```python
+ from transformers import AutoConfig, AutoModel, AutoTokenizer
+ config = AutoConfig.from_pretrained("your model name", revision=revision)
+ model = AutoModel.from_pretrained("your model name", revision=revision)
+ tokenizer = AutoTokenizer.from_pretrained("your model name", revision=revision)
+ ```
+ If this step fails, follow the error messages to debug your model before submitting it. It's likely your model has been improperly uploaded.
+ Note: make sure your model is public!
+ Note: if your model needs `trust_remote_code=True`, we do not support this option yet.
+ #### 2) Convert your model weights to [safetensors](https://huggingface.co/docs/safetensors/index)
+ It's a new format for storing weights which is safer and faster to load and use. It will also allow us to add the number of parameters of your model to the `Extended Viewer`!
+ #### 3) Make sure your model has an open license!
+ This is a leaderboard for Open LLMs, and we'd love for as many people as possible to know they can use your model 🤗
+ #### 4) Fill up your model card
+ When we add extra information about models to the leaderboard, it will be automatically taken from the model card.
+ """
+
+ SUBMISSION_TEXT_2 = """
+ ## Submitting Results
+ You also have the option of running the evaluation yourself and submitting the results. These results will be added as non-verified; the authors are however required to upload their generations in case other members want to check them.
+
+ ### 1 - Running Evaluation
+
+ We wrote a detailed guide for running the evaluation on your model. You can find it in [bigcode-evaluation-harness/leaderboard](https://github.com/bigcode-project/bigcode-evaluation-harness/tree/main/leaderboard). This will generate a json file summarizing the results, in addition to the raw generations and metric files.
+
+ ### 2 - Submitting Results 🚀
+
+ To submit your results, create a **Pull Request** in the community tab to add them under the [folder](https://huggingface.co/spaces/bigcode/multilingual-code-evals/tree/main/community_results) `community_results` in this repository:
+ - Create a folder called `ORG_MODELNAME_USERNAME`, for example `bigcode_starcoder_loubnabnl`
+ - Put your json file with grouped scores from the guide, in addition to the generations and metrics folders, in it.
+
+ The title of the PR should be `[Community Submission] Model: org/model, Username: your_username`; replace org and model with those corresponding to the model you evaluated.
+ """
+ SUBMISSION_TEXT_3 = """
+ <h1 align="center">
+ How to submit models/results to the leaderboard?
+ </h1>
+ We welcome the community to submit evaluation results of new models. These results will be added as non-verified; the authors are however required to upload their generations in case other members want to check them.
+
+ ### 1 - Running Evaluation
+
+ We wrote a detailed guide for running the evaluation on your model. You can find it in [bigcode-evaluation-harness/leaderboard](https://github.com/bigcode-project/bigcode-evaluation-harness/tree/main/leaderboard). This will generate a json file summarizing the results, in addition to the raw generations and metric files.
+
+ ### 2 - Submitting Results 🚀
+
+ To submit your results, create a **Pull Request** in the community tab to add them under the [folder](https://huggingface.co/spaces/bigcode/multilingual-code-evals/tree/main/community_results) `community_results` in this repository:
+ - Create a folder called `ORG_MODELNAME_USERNAME`, for example `bigcode_starcoder_loubnabnl`
+ - Put your json file with grouped scores from the guide, in addition to the generations and metrics folders, in it.
+
+ The title of the PR should be `[Community Submission] Model: org/model, Username: your_username`; replace org and model with those corresponding to the model you evaluated.
+ """
src/utils.py ADDED
@@ -0,0 +1,86 @@
+ # source: https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/blob/main/src/utils_display.py
+ from dataclasses import dataclass
+ from transformers import AutoConfig
+
+ # These classes are for user-facing column names, to avoid having to change them
+ # all around the code when a modification is needed
+ @dataclass
+ class ColumnContent:
+     name: str
+     type: str
+     displayed_by_default: bool
+     hidden: bool = False
+
+
+ def fields(raw_class):
+     return [
+         v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"
+     ]
+
+
+ @dataclass(frozen=True)
+ class AutoEvalColumn:  # Auto evals column
+     # you can use the following column types:
+     # str, markdown, number
+     # ColumnContent(column name, type, flag if the column is displayed by default)
+     model = ColumnContent("Model", "markdown", True)
+     model_size = ColumnContent("Size (M)", "number", True)
+     train_config = ColumnContent("Training Config", "str", True)
+     model_config = ColumnContent("Model Config", "str", True)
+     espnet_version = ColumnContent("espnet version", "str", True)
+     pytorch_version = ColumnContent("pytorch version", "str", True)
+     wer_test_clean = ColumnContent("WER (test-clean)", "number", True)
+     wer_test_other = ColumnContent("WER (test-other)", "number", True)
+     wer_dev_clean = ColumnContent("WER (dev-clean)", "number", True)
+     wer_dev_other = ColumnContent("WER (dev-other)", "number", True)
+     cer_test_clean = ColumnContent("CER (test-clean)", "number", True)
+     cer_test_other = ColumnContent("CER (test-other)", "number", True)
+     cer_dev_clean = ColumnContent("CER (dev-clean)", "number", True)
+     cer_dev_other = ColumnContent("CER (dev-other)", "number", True)
+
+
+ def model_hyperlink(link, model_name):
+     return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
+
+
+ def make_clickable_names(df):
+     df["Model"] = df.apply(
+         lambda row: model_hyperlink(row["Links"], row["Model"]), axis=1
+     )
+     return df
+
+
+ def styled_error(error):
+     return f"<p style='color: red; font-size: 20px; text-align: center;'>{error}</p>"
+
+
+ def styled_warning(warn):
+     return f"<p style='color: orange; font-size: 20px; text-align: center;'>{warn}</p>"
+
+
+ def styled_message(message):
+     return f"<p style='color: green; font-size: 20px; text-align: center;'>{message}</p>"
+
+
+ def has_no_nan_values(df, columns):
+     return df[columns].notna().all(axis=1)
+
+
+ def has_nan_values(df, columns):
+     return df[columns].isna().any(axis=1)
+
+
+ def is_model_on_hub(model_name: str, revision: str) -> tuple[bool, str | None]:
+     try:
+         AutoConfig.from_pretrained(model_name, revision=revision, trust_remote_code=False)
+         return True, None
+
+     except ValueError:
+         return (
+             False,
+             "needs to be launched with `trust_remote_code=True`. For safety reasons, we do not allow these models to be automatically submitted to the leaderboard.",
+         )
+
+     except Exception as e:
+         print(f"Could not get the model config from the hub: {e}")
+         return False, "was not found on the hub!"
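
src/utils.py supplies the column metadata and HTML helpers that app.py consumes. A short usage sketch, assuming it is run from the repository root so that `src` is importable; the one-row DataFrame is illustrative only:

```python
# Usage sketch for the helpers defined above (not part of the commit).
import pandas as pd
from src.utils import AutoEvalColumn, fields, make_clickable_names

print([c.name for c in fields(AutoEvalColumn)])  # all declared column names

toy = pd.DataFrame(
    {
        "Model": ["librispeech_100h_conformer"],
        "Links": ["https://huggingface.co/pyf98/librispeech_100h_conformer"],
    }
)
print(make_clickable_names(toy)["Model"].iloc[0])  # rendered as an <a> tag in the table
```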