eduardo-alvarez committed
Commit adcaec0 · 1 Parent(s): d2f6680

added inference tested data

Browse files:
- app.py +7 -7
- info/about.py +23 -1
- info/submit.py +5 -2
- info/train_a_model.py +8 -1
- src/processing.py +1 -1
- status/leaderboard_status_041624.csv +15 -15
app.py CHANGED

```diff
@@ -41,8 +41,6 @@ with demo:
     follow the instructions and complete the form in the 🏎️ Submit tab. Models submitted to the leaderboard are evaluated
     on the Intel Developer Cloud ☁️. The evaluation platform consists of Gaudi Accelerators and Xeon CPUs running benchmarks from
     the [Eleuther AI Language Model Evaluation Harness](https://github.com/EleutherAI/lm-evaluation-harness).""")
-    gr.Markdown("""Join 5000+ developers on the [Intel DevHub Discord](https://intel.ly/intelllmleaderboard_discord) to get support with your submission and
-    talk about everything from GenAI, HPC, to Quantum Computing.""")
     gr.Markdown("""A special shout-out to the 🤗 [Open LLM Leaderboard](https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard)
     team for generously sharing their code and best
     practices, ensuring that AI Developers have a valuable and enjoyable tool at their disposal.""")
@@ -166,7 +164,9 @@ with demo:
     label="Model Types",
     elem_id="model_types",
     value=["pretrained","fine-tuned","chat-models","merges/moerges"])
-
+    inbox_text = gr.CheckboxGroup(label = """Inference Tested Column Legend: 🟨 = Gaudi, 🟦 = Xeon, 🟥 = GPU Max, 🟠 = Core Ultra, 🟢 = Arc GPU (Please see "❓About" tab for more info)""")
+
+    # formatting model name and adding links
     color = '#2f82d4'
     def make_clickable(row):
         return f'<a href="https://huggingface.co/{row["Model"]}" target="_blank" style="color: {color}; text-decoration: underline;">{row["Model"]}</a>'
@@ -192,10 +192,10 @@ with demo:
     ["pretrained","fine-tuned","chat-models","merges/moerges"])


-    gradio_df_display = gr.Dataframe(value=initial_filtered_df, headers=["Model","Average",…
-                                     …
-                                     …
-                                     datatype=["html","str","str","str","str","str","str","str","str","str","str","str","str"])
+    gradio_df_display = gr.Dataframe(value=initial_filtered_df, headers=["Inference Tested","Model","Average","ARC","HellaSwag","MMLU",
+                                                                         "TruthfulQA","Winogrande","Training Hardware","Model Type","Precision",
+                                                                         "Size","Infrastructure","Affiliation"],
+                                     datatype=["html","html","str","str","str","str","str","str","str","str","str","str","str","str"])

     filter_hw.change(fn=update_df,
                      inputs=[filter_hw, filter_platform, filter_affiliation, filter_size, filter_precision, filter_type],
```
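To make the pattern above concrete, here is a minimal, self-contained sketch of how the `make_clickable` helper and the `"html"` datatype work together in a `gr.Dataframe`; the one-row frame is a hypothetical stand-in for the app's `initial_filtered_df`.

```python
import pandas as pd

# Minimal sketch of the link-formatting pattern from app.py; the one-row
# frame below is hypothetical -- in the app this runs over the leaderboard data.
color = '#2f82d4'

def make_clickable(row):
    return f'<a href="https://huggingface.co/{row["Model"]}" target="_blank" style="color: {color}; text-decoration: underline;">{row["Model"]}</a>'

df = pd.DataFrame({"Model": ["Intel/neural-chat-7b-v3-3"], "Average": [71.574]})
df["Model"] = df.apply(make_clickable, axis=1)

# Because the "Model" column is declared with datatype "html" in gr.Dataframe,
# the anchor tag renders as a clickable link instead of escaped text.
print(df["Model"].iloc[0])
```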
info/about.py CHANGED

```diff
@@ -18,7 +18,9 @@ domain-specific benchmarks in the future. We utilize the <a href="https://github
 Eleuther AI Language Model Evaluation Harness </a>, a unified framework to test generative language models on a large number of
 different evaluation tasks.
 
-…
+<hr>
+
+## Our current benchmarks include:
 
 - <a href="https://arxiv.org/abs/1803.05457" target="_blank"> AI2 Reasoning Challenge (25-shot)</a> - a set of grade-school science questions.
 - <a href="https://arxiv.org/abs/1905.07830" target="_blank"> HellaSwag (10-shot)</a> - a test of commonsense inference, which is easy for humans (~95%) but challenging for state-of-the-art models.
@@ -30,6 +32,24 @@ For all these evaluations, a higher score is better. We've chosen these benchmar
 We run an adapted version of the benchmark code specifically designed to run the EleutherAI Harness benchmarks on Gaudi processors.
 This adapted evaluation harness is built into the Hugging Face Optimum Habana Library. Review the documentation [here](https://github.com/huggingface/optimum-habana/tree/main/examples/text-generation).
 
+<hr>
+
+## Inference Tested Leaderboard Column:
+
+This column classifies hardware platforms where a model has undergone testing, either directly or indirectly through testing of its base model. For instance, if Intel/neural-chat-7b-v3-3 is tested and works on
+Arc GPUs, then models that are fine-tuned from this model without any architectural or algorithmic changes will be classified as "Inference Tested."
+
+### Legend and Column Interpretation
+Refer to the following emoji key to understand the "Inference Tested" column:
+
+Inference Tested Column Legend:
+🟨 = Gaudi, 🟦 = Xeon, 🟥 = GPU Max, 🟠 = Core Ultra, 🟢 = Arc GPU
+
+For example, if a model has the 🟨🟦 emojis in the "Inference Tested" column, it indicates that the model has been tested and works on both Gaudi and Xeon hardware platforms, without implying any performance claims or full optimization for the indicated platforms.
+
+
+<hr>
+
 ## Support and Community
 
 Join 5000+ developers on the [Intel DevHub Discord](https://discord.gg/yNYNxK2k) to get support with your submission
@@ -41,6 +61,8 @@ This is a fun on-leaderboard LLM chat functionality designed to provide a quick
 As the leaderboard matures and users submit models, we will rotate the available models for chat. Who knows!? You might find
 your model featured here soon! ⭐
 
+<hr>
+
 ### Chat Functionality Notice
 - All the models in this demo run on 4th Generation Intel® Xeon® (Sapphire Rapids) utilizing AMX operations and quantized inference optimizations.
 - Terms of use: By using the chat functionality, users are required to agree to the following terms: The service is a research preview intended for non-commercial
```
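As a quick illustration of how the legend added above could be consumed programmatically, here is a hypothetical helper (not part of this commit) that decodes an "Inference Tested" cell into platform names:

```python
# Hypothetical helper (not part of this commit) that decodes an
# "Inference Tested" cell using the emoji key documented in about.py.
LEGEND = {
    "🟨": "Gaudi",
    "🟦": "Xeon",
    "🟥": "GPU Max",
    "🟠": "Core Ultra",
    "🟢": "Arc GPU",
}

def platforms_tested(cell):
    """Return the platform names indicated by a cell such as '🟨 🟦'."""
    return [name for emoji, name in LEGEND.items() if emoji in cell]

print(platforms_tested("🟨 🟦"))  # ['Gaudi', 'Xeon']
```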
info/submit.py CHANGED

````diff
@@ -4,7 +4,9 @@ SUBMIT_TEXT = f"""
 Models added here will be queued for evaluation on the Intel Developer Cloud ☁️. Depending on the queue, your model may take up to 10 days to show up on the leaderboard.
 We will work to create greater transparency as our leaderboard community grows.
 
-…
+<hr>
+
+## Review these steps before submitting your model
 
 ### 1) Make sure you can load your model and tokenizer using AutoClasses:
 ```python
@@ -19,7 +21,7 @@ Note: Make sure your model is public!
 
 Note: If your model needs `use_remote_code=True`, we do not support this option yet, but we are working on adding it.
 
-### 2) …
+### 2) Consider converting your model weights to [safetensors](https://huggingface.co/docs/safetensors/index)
 It's a new format for storing weights which is safer and faster to load and use. It will also allow us to add the number of parameters of your model to the `Extended Viewer`.
 
 ### 3) Make sure your model has an open license.
@@ -38,4 +40,5 @@ Not all models are converted properly from `float16` to `bfloat16`, and selectin
 ## In case of model failure
 If your model fails evaluation 😔, we will contact you by opening a new discussion in your model repository. Let's work together to get your model the love it deserves ❤️!
 
+<hr>
 """
````
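The two submission steps above (loading through AutoClasses and converting to safetensors) can be sanity-checked locally with a snippet along these lines; `your-username/your-model` is a placeholder repo id, and the sketch assumes a causal LM:

```python
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer

model_name = "your-username/your-model"  # placeholder repo id

# Step 1: confirm the model loads via AutoClasses (assuming a causal LM).
config = AutoConfig.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Step 2: re-save the weights in the safetensors format; recent transformers
# releases default to safe_serialization=True, shown explicitly here.
model.save_pretrained("./converted-model", safe_serialization=True)
tokenizer.save_pretrained("./converted-model")
```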
info/train_a_model.py CHANGED

```diff
@@ -5,16 +5,20 @@ LLM_BENCHMARKS_TEXT = f"""
 Intel offers a variety of platforms that can be used to train LLMs including data center and consumer grade CPUs, GPUs, and ASICs.
 Below, you can find documentation on how to access free and paid resources to train a model on Intel hardware and submit it to the Hugging Face Model Hub.
 
+<hr>
+
 ## Intel Developer Cloud - Quick Start
 The Intel Developer Cloud is one of the best places to access free and paid compute instances for model training. Intel offers Jupyter Notebook instances supported by
 224 Core 4th Generation Xeon Bare Metal nodes with 4x GPU Max Series 1100. To access these resources please follow the instructions below:
 1. Visit the [Intel Developer Cloud](https://bit.ly/inteldevelopercloud) and sign up for the "Standard - Free" tier to get started.
 2. Navigate to the "Training" module under the "Software" section in the left panel.
-3. Under the GenAI Essentials section, select the …
+3. Under the GenAI Essentials section, select the [Gemma Model Fine-tuning using SFT and LoRA](https://console.idcservice.net/training/detail/99deeb99-b0c6-4d02-a1d5-a46d95344ff3) notebook and click "Launch".
 4. Follow the instructions in the notebook to train your model using Intel® Data Center GPU Max 1100.
 5. Upload your model to the Hugging Face Model Hub.
 6. Go to the "Submit" tab on this Leaderboard and follow the instructions to submit your model.
 
+<hr>
+
 ## Training Code Samples
 Below are some resources to get you started on training models on Intel platforms:
 - Intel® Gaudi® Accelerators
@@ -23,8 +27,11 @@ Below are some resources to get you started on training models on Intel platform
   - [Distributed Training of GPT2 LLMs on AWS](https://github.com/intel/intel-cloud-optimizations-aws/tree/main/distributed-training)
   - [Fine-tuning Falcon 7B on Xeon Processors](https://medium.com/@eduand-alvarez/fine-tune-falcon-7-billion-on-xeon-cpus-with-hugging-face-and-oneapi-a25e10803a53)
 - Intel® Data Center GPU Max Series
+  - [Gemma Model Fine-tuning using SFT and LoRA](https://console.idcservice.net/training/detail/99deeb99-b0c6-4d02-a1d5-a46d95344ff3)
   - [LLM Fine-tuning with QLoRA on Max Series GPUs](https://console.idcservice.net/training/detail/159c24e4-5598-3155-a790-2qv973tlm172)
 
+<hr>
+
 ## Submitting your Model to the Hugging Face Model Hub
 Once your model is trained, it is a straightforward process to upload and open source it on the Hugging Face Model Hub. The commands from a Jupyter notebook are given below:
```
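The notebook commands themselves are not visible in this diff view, but a typical upload flow with the `huggingface_hub` and `transformers` APIs looks roughly like the following; the local path and repo id are placeholders:

```python
from huggingface_hub import notebook_login
from transformers import AutoModelForCausalLM, AutoTokenizer

# Authenticate from inside a Jupyter notebook.
notebook_login()

# Load the locally trained artifacts (path and repo id are placeholders).
model = AutoModelForCausalLM.from_pretrained("./my-finetuned-model")
tokenizer = AutoTokenizer.from_pretrained("./my-finetuned-model")

# Upload both model weights and tokenizer files to the Hugging Face Model Hub.
model.push_to_hub("your-username/my-finetuned-model")
tokenizer.push_to_hub("your-username/my-finetuned-model")
```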
src/processing.py CHANGED

```diff
@@ -4,7 +4,7 @@ def filter_benchmarks_table(df, hw_selected, platform_selected,
                             type_selected, affiliation_selected):
 
     filtered_df = df[
-        df['Hardware'].isin(hw_selected) &
+        df['Training Hardware'].isin(hw_selected) &
         df['Infrastructure'].isin(platform_selected) &
         df['Size'].isin(size_selected) &
         df['Precision'].isin(precision_selected) &
```
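The rename matters because each `.isin` mask must reference a real column in the new CSV schema. Here is a minimal sketch of the boolean-mask pattern used by `filter_benchmarks_table`, with hypothetical toy data:

```python
import pandas as pd

# Toy data illustrating the boolean-mask filter in filter_benchmarks_table;
# the rows are hypothetical.
df = pd.DataFrame({
    "Model": ["Intel/neural-chat-7b-v3-3", "FunDialogues/dollygem-2b-LoRA"],
    "Training Hardware": ["Gaudi", "GPU Max"],
    "Infrastructure": ["Intel Developer Cloud", "Intel Developer Cloud"],
})

# Each .isin() call yields a boolean Series; & combines them element-wise,
# so a row survives only if it matches every active filter.
mask = (df["Training Hardware"].isin(["Gaudi"])
        & df["Infrastructure"].isin(["Intel Developer Cloud"]))
print(df[mask])  # keeps only the Gaudi-trained row
```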
status/leaderboard_status_041624.csv CHANGED

```diff
@@ -1,15 +1,15 @@
-Model,Average,Hardware,Model Type,Precision,Size,Infrastructure,…
-Intel/neural-chat-7b-v3-3,71.574,Gaudi,…
-Intel/neural-chat-7b-v3-2,70.858,Gaudi,…
-Intel/neural-chat-7b-v3-1,70.002,Gaudi,…
-Intel/neural-chat-7b-v3,69.906,Gaudi,…
-Intel/neural-chat-7b-v3-1,69.89,Gaudi,…
-Intel/neural-chat-7b-v3-1,69.972,Gaudi,…
-Intel/neural-chat-7b-v3-1,68.256,Gaudi,…
-FunDialogues/dollygem-2b-LoRA,49.368,GPU Max,…
-ThejasElandassery/dallema,49.134,GPU Max,…
-utkarshsingh99/confused-gemma,49.306,GPU Max,…
-myasaswin/Gemma2B-LORAfied,49.566,GPU Max,…
-Aprajita0/Gemma-2b-Lora,49.526,GPU Max,…
-gopalakrishnan-d/gemma-2b-dolly-ds-lora,49.378,GPU Max,…
-SSK0908/gemma-2b-dolly-qa,49.452,GPU Max,…
+Inference Tested,Model,Average,Training Hardware,ARC,HellaSwag,MMLU,TruthfulQA,Winogrande,Model Type,Precision,Size,Infrastructure,Affiliation
+<p>🟨 🟦 🟥 🟠 🟢</p>,Intel/neural-chat-7b-v3-3,71.574,Gaudi,66.89,85.26,63.07,63.01,79.64,fine-tuned,fp16,7,Intel Developer Cloud,Intel Engineering
+<p>🟨 🟦 🟥 🟠 🟢</p>,Intel/neural-chat-7b-v3-2,70.858,Gaudi,67.49,83.92,63.55,59.68,79.65,fine-tuned,fp16,7,Intel Developer Cloud,Intel Engineering
+<p>🟨 🟦 🟥 🟠 🟢</p>,Intel/neural-chat-7b-v3-1,70.002,Gaudi,66.21,83.64,62.37,59.65,78.14,fine-tuned,fp16,7,Intel Developer Cloud,Intel Engineering
+<p>🟨 🟦 🟥 🟠 🟢</p>,Intel/neural-chat-7b-v3,69.906,Gaudi,67.15,83.29,62.26,58.77,78.06,fine-tuned,fp16,7,Intel Developer Cloud,Intel Engineering
+<p>🟨 🟦 🟥 🟠 🟢</p>,Intel/neural-chat-7b-v3-1,69.89,Gaudi,65.7,83.54,62.12,59.48,78.61,fine-tuned,int8,7,Intel Developer Cloud,Intel Engineering
+<p>🟨 🟦 🟥 🟠 🟢</p>,Intel/neural-chat-7b-v3-1,69.972,Gaudi,66.3,83.6,62.44,59.54,77.98,fine-tuned,bf16,7,Intel Developer Cloud,Intel Engineering
+<p>🟨 🟦 🟥 🟠 🟢</p>,Intel/neural-chat-7b-v3-1,68.256,Gaudi,64.25,82.49,60.79,56.4,77.35,fine-tuned,int4,7,Intel Developer Cloud,Intel Engineering
+<p>🟨 🟦 🟥 🟠 🟢</p>,FunDialogues/dollygem-2b-LoRA,49.368,GPU Max,41.55,71.56,35.39,32.59,65.75,fine-tuned,bf16,2,Intel Developer Cloud,No Affiliation
+<p>🟨 🟦 🟥 🟠 🟢</p>,ThejasElandassery/dallema,49.134,GPU Max,41.64,72.01,33.67,33.08,65.27,fine-tuned,bf16,2,Intel Developer Cloud,Student Ambassador
+<p>🟨 🟦 🟥 🟠 🟢</p>,utkarshsingh99/confused-gemma,49.306,GPU Max,42.06,71.86,34.14,33.2,65.27,fine-tuned,bf16,2,Intel Developer Cloud,Student Ambassador
+<p>🟨 🟦 🟥 🟠 🟢</p>,myasaswin/Gemma2B-LORAfied,49.566,GPU Max,42.56,71.56,35.47,32.57,65.67,fine-tuned,bf16,2,Intel Developer Cloud,Student Ambassador
+<p>🟨 🟦 🟥 🟠 🟢</p>,Aprajita0/Gemma-2b-Lora,49.526,GPU Max,42.24,71.88,34.14,33.23,66.14,fine-tuned,bf16,2,Intel Developer Cloud,Student Ambassador
+<p>🟨 🟦 🟥 🟠 🟢</p>,gopalakrishnan-d/gemma-2b-dolly-ds-lora,49.378,GPU Max,42.24,71.88,34.11,33.15,65.51,fine-tuned,bf16,2,Intel Developer Cloud,Student Ambassador
+<p>🟨 🟦 🟥 🟠 🟢</p>,SSK0908/gemma-2b-dolly-qa,49.452,GPU Max,42.15,71.92,33.98,33.15,66.06,fine-tuned,bf16,2,Intel Developer Cloud,Student Ambassador
```
|