Clémentine committed on
Commit
4ccfada
·
1 Parent(s): e4ab31c

fix display

Browse files
Files changed (4) hide show
  1. README.md +1 -1
  2. app.py +1 -1
  3. src/display/about.py +10 -9
  4. src/leaderboard/read_evals.py +3 -0
README.md CHANGED
@@ -17,7 +17,7 @@ space_ci:
17
  - H4_TOKEN
18
  tags:
19
  - leaderboard
20
- short_description: Ranking open LLMs and chat models on their capabilities
21
  ---
22
 
23
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
17
  - H4_TOKEN
18
  tags:
19
  - leaderboard
20
+ short_description: Track, rank and evaluate open LLMs and chatbots
21
  ---
22
 
23
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py CHANGED
@@ -340,7 +340,7 @@ with demo:
340
  with gr.TabItem("❗FAQ", elem_id="llm-benchmark-tab-table", id=4):
341
  gr.Markdown(FAQ_TEXT, elem_classes="markdown-text")
342
 
343
- with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-table", id=5):
344
  with gr.Column():
345
  with gr.Row():
346
  gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
 
340
  with gr.TabItem("❗FAQ", elem_id="llm-benchmark-tab-table", id=4):
341
  gr.Markdown(FAQ_TEXT, elem_classes="markdown-text")
342
 
343
+ with gr.TabItem("🚀 Submit ", elem_id="llm-benchmark-tab-table", id=5):
344
  with gr.Column():
345
  with gr.Row():
346
  gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
src/display/about.py CHANGED
@@ -1,16 +1,8 @@
1
  from src.display.utils import ModelType
2
 
3
- TITLE = """<h1 align="center" id="space-title">🤗 Open LLM Leaderboard</h1>"""
4
 
5
  INTRODUCTION_TEXT = """
6
- 📐 The 🤗 Open LLM Leaderboard aims to track, rank and evaluate open LLMs and chatbots.
7
-
8
- 🤗 Submit a model for automated evaluation on the 🤗 GPU cluster on the "Submit" page!
9
- The leaderboard's backend runs the great [Eleuther AI Language Model Evaluation Harness](https://github.com/EleutherAI/lm-evaluation-harness) - read more details in the "About" page!
10
-
11
- Other cool leaderboards:
12
- - [LLM safety](https://huggingface.co/spaces/AI-Secure/llm-trustworthy-leaderboard)
13
- - [LLM performance](https://huggingface.co/spaces/optimum/llm-perf-leaderboard)
14
  """
15
 
16
  icons = f"""
@@ -24,6 +16,9 @@ LLM_BENCHMARKS_TEXT = f"""
24
  ## ABOUT
25
  With the plethora of large language models (LLMs) and chatbots being released week upon week, often with grandiose claims of their performance, it can be hard to filter out the genuine progress that is being made by the open-source community and which model is the current state of the art.
26
 
 
 
 
27
  ### Tasks
28
  📈 We evaluate models on 6 key benchmarks using the <a href="https://github.com/EleutherAI/lm-evaluation-harness" target="_blank"> Eleuther AI Language Model Evaluation Harness </a>, a unified framework to test generative language models on a large number of different evaluation tasks.
29
 
@@ -88,6 +83,12 @@ To get more information about quantization, see:
88
  ### Useful links
89
  - [Community resources](https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/174)
90
  - [Collection of best models](https://huggingface.co/collections/open-llm-leaderboard/llm-leaderboard-best-models-652d6c7965a4619fb5c27a03)
 
 
 
 
 
 
91
  """
92
 
93
  FAQ_TEXT = """
 
1
  from src.display.utils import ModelType
2
 
3
+ TITLE = """<h1 style="text-align:left;float:left; id="space-title">🤗 Open LLM Leaderboard</h1> Track, rank and evaluate open LLMs and chatbots"""
4
 
5
  INTRODUCTION_TEXT = """
 
 
 
 
 
 
 
 
6
  """
7
 
8
  icons = f"""
 
16
  ## ABOUT
17
  With the plethora of large language models (LLMs) and chatbots being released week upon week, often with grandiose claims of their performance, it can be hard to filter out the genuine progress that is being made by the open-source community and which model is the current state of the art.
18
 
19
+ 🤗 Submit a model for automated evaluation on the 🤗 GPU cluster on the "Submit" page!
20
+ The leaderboard's backend runs the great [Eleuther AI Language Model Evaluation Harness](https://github.com/EleutherAI/lm-evaluation-harness) - read more details below!
21
+
22
  ### Tasks
23
  📈 We evaluate models on 6 key benchmarks using the <a href="https://github.com/EleutherAI/lm-evaluation-harness" target="_blank"> Eleuther AI Language Model Evaluation Harness </a>, a unified framework to test generative language models on a large number of different evaluation tasks.
24
 
 
83
  ### Useful links
84
  - [Community resources](https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/174)
85
  - [Collection of best models](https://huggingface.co/collections/open-llm-leaderboard/llm-leaderboard-best-models-652d6c7965a4619fb5c27a03)
86
+
87
+ ### Other cool leaderboards:
88
+ - [LLM safety](https://huggingface.co/spaces/AI-Secure/llm-trustworthy-leaderboard)
89
+ - [LLM performance](https://huggingface.co/spaces/optimum/llm-perf-leaderboard)
90
+
91
+
92
  """
93
 
94
  FAQ_TEXT = """
src/leaderboard/read_evals.py CHANGED
@@ -204,6 +204,9 @@ def get_raw_eval_results(results_path: str, requests_path: str, dynamic_path: st
204
  eval_result.update_with_request_file(requests_path)
205
  if eval_result.full_model in dynamic_data:
206
  eval_result.update_with_dynamic_file_dict(dynamic_data[eval_result.full_model])
 
 
 
207
 
208
  # Store results of same eval together
209
  eval_name = eval_result.eval_name
 
204
  eval_result.update_with_request_file(requests_path)
205
  if eval_result.full_model in dynamic_data:
206
  eval_result.update_with_dynamic_file_dict(dynamic_data[eval_result.full_model])
207
+ # Hardcoding because of gating problem
208
+ if "meta-llama" in eval_result.full_model:
209
+ eval_result.still_on_hub = True
210
 
211
  # Store results of same eval together
212
  eval_name = eval_result.eval_name