Spaces:
Runtime error
Runtime error
update layout
Browse files
- .gitignore +2 -0
- app.py +4 -0
- data/models.yaml +49 -8
- src/about.py +6 -4
- src/populate.py +7 -2
.gitignore
CHANGED
@@ -14,3 +14,5 @@ logs/
|
|
14 |
envs/
|
15 |
|
16 |
tmp.py
|
|
|
|
|
|
14 |
envs/
|
15 |
|
16 |
tmp.py
|
17 |
+
print.py
|
18 |
+
leaderboard.tex
|
app.py
CHANGED
@@ -45,6 +45,10 @@ with demo:
|
|
45 |
)
|
46 |
|
47 |
with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
|
|
|
|
|
|
|
|
|
48 |
gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
|
49 |
|
50 |
with gr.Row():
|
|
|
45 |
)
|
46 |
|
47 |
with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
|
48 |
+
# with gr.Column(scale=2):
|
49 |
+
# gr.Markdown("""
|
50 |
+
# ![](src/logo.png)
|
51 |
+
# """)
|
52 |
gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
|
53 |
|
54 |
with gr.Row():
|
data/models.yaml
CHANGED
@@ -1,82 +1,123 @@
|
|
1 |
- name: 'tulu-2-dpo-70b'
|
2 |
fdir: 'tulu-2-dpo-70b'
|
|
|
3 |
- name: 'tulu-2-70b'
|
4 |
fdir: 'tulu-2-70b'
|
|
|
5 |
- name: 'llama-2-70b'
|
6 |
fdir: 'llama-2-70b'
|
|
|
7 |
- name: 'tulu-2-dpo-13b'
|
8 |
fdir: 'tulu-2-dpo-13b'
|
|
|
9 |
- name: 'tulu-2-13b'
|
10 |
fdir: 'tulu-2-13b'
|
|
|
11 |
- name: 'llama-2-13b'
|
12 |
fdir: 'llama-2-13b'
|
|
|
13 |
- name: 'tulu-2-dpo-7b'
|
14 |
fdir: 'tulu-2-dpo-7b'
|
|
|
15 |
- name: 'tulu-2-7b'
|
16 |
fdir: 'tulu-2-7b'
|
|
|
17 |
- name: 'llama-2-7b'
|
18 |
fdir: 'llama-2-7b'
|
|
|
19 |
- name: 'gemini-1.0-pro'
|
20 |
fdir: 'gemini-1.0-pro'
|
|
|
21 |
- name: 'gemini-1.5-pro'
|
22 |
fdir: 'gemini-1.5-pro'
|
|
|
23 |
- name: 'gemini-1.5-flash'
|
24 |
fdir: 'gemini-1.5-flash'
|
|
|
25 |
- name: 'llama-3-8b'
|
26 |
fdir: 'llama-3-8b'
|
|
|
|
|
|
|
|
|
27 |
- name: 'gpt-3.5-turbo-0125'
|
28 |
fdir: 'gpt-3.5-turbo-0125'
|
|
|
29 |
- name: 'gpt-4-0314'
|
30 |
fdir: 'gpt-4-0314'
|
|
|
31 |
- name: 'gpt-4-0613'
|
32 |
fdir: 'gpt-4-0613'
|
|
|
33 |
- name: 'gpt-4-1106-preview'
|
34 |
fdir: 'gpt-4-1106-preview'
|
|
|
35 |
- name: 'gpt-4-0125-preview'
|
36 |
fdir: 'gpt-4-0125-preview'
|
|
|
37 |
- name: 'gpt-4-turbo-2024-04-09'
|
38 |
fdir: 'gpt-4-turbo-2024-04-09'
|
|
|
39 |
- name: 'gpt-4o'
|
40 |
fdir: 'gpt-4o'
|
|
|
|
|
|
|
|
|
41 |
- name: 'claude-3-opus'
|
42 |
fdir: 'claude-3-opus-20240229'
|
|
|
43 |
- name: 'claude-3-haiku'
|
44 |
fdir: 'claude-3-haiku-20240307'
|
|
|
45 |
- name: 'claude-3-sonnet'
|
46 |
fdir: 'claude-3-sonnet-20240229'
|
|
|
47 |
- name: 'claude-2.1'
|
48 |
fdir: 'claude-2.1'
|
|
|
49 |
- name: 'claude-instant-1.2'
|
50 |
fdir: 'claude-instant-1.2'
|
|
|
51 |
- name: 'command-r-plus'
|
52 |
fdir: 'command-r-plus'
|
53 |
-
|
54 |
-
|
|
|
|
|
55 |
- name: 'mistral-7b-v0.2'
|
56 |
fdir: 'mistral-7b-v0.2'
|
|
|
57 |
- name: 'mistral-7b-v0.1'
|
58 |
fdir: 'mistral-7b-v0.1'
|
|
|
59 |
- name: 'mixtral-8x7b'
|
60 |
fdir: 'mixtral-8x7b'
|
|
|
|
|
|
|
|
|
61 |
- name: 'yi-1.5-34b'
|
62 |
fdir: 'yi-1.5-34b'
|
|
|
63 |
- name: 'yi-1.5-9b'
|
64 |
fdir: 'yi-1.5-9b'
|
|
|
65 |
- name: 'qwen-1.5-72b'
|
66 |
fdir: 'qwen-1.5-72b'
|
|
|
67 |
- name: 'qwen-1.5-32b'
|
68 |
fdir: 'qwen-1.5-32b'
|
|
|
69 |
- name: 'qwen-2-72b'
|
70 |
fdir: 'qwen-2-72b'
|
|
|
71 |
- name: 'gemma-7b'
|
72 |
fdir: 'gemma-7b'
|
|
|
73 |
- name: 'gemma-2b'
|
74 |
fdir: 'gemma-2b'
|
75 |
-
|
76 |
-
fdir: 'mistral-7b-v0.3'
|
77 |
- name: 'glm-4-9b'
|
78 |
fdir: 'glm-4-9b'
|
79 |
-
|
80 |
-
fdir: 'mistral-large'
|
81 |
-
- name: 'claude-3.5-sonnet'
|
82 |
-
fdir: 'claude3.5-sonnet'
|
|
|
1 |
- name: 'tulu-2-dpo-70b'
|
2 |
fdir: 'tulu-2-dpo-70b'
|
3 |
+
url: 'https://huggingface.co/allenai/tulu-2-dpo-70b'
|
4 |
- name: 'tulu-2-70b'
|
5 |
fdir: 'tulu-2-70b'
|
6 |
+
url: 'https://huggingface.co/allenai/tulu-2-70b'
|
7 |
- name: 'llama-2-70b'
|
8 |
fdir: 'llama-2-70b'
|
9 |
+
url: 'https://huggingface.co/meta-llama/Llama-2-70b-chat-hf'
|
10 |
- name: 'tulu-2-dpo-13b'
|
11 |
fdir: 'tulu-2-dpo-13b'
|
12 |
+
url: 'https://huggingface.co/allenai/tulu-2-dpo-13b'
|
13 |
- name: 'tulu-2-13b'
|
14 |
fdir: 'tulu-2-13b'
|
15 |
+
url: 'https://huggingface.co/allenai/tulu-2-13b'
|
16 |
- name: 'llama-2-13b'
|
17 |
fdir: 'llama-2-13b'
|
18 |
+
url: 'https://huggingface.co/meta-llama/Llama-2-13b-chat-hf'
|
19 |
- name: 'tulu-2-dpo-7b'
|
20 |
fdir: 'tulu-2-dpo-7b'
|
21 |
+
url: 'https://huggingface.co/allenai/tulu-2-dpo-7b'
|
22 |
- name: 'tulu-2-7b'
|
23 |
fdir: 'tulu-2-7b'
|
24 |
+
url: 'https://huggingface.co/allenai/tulu-2-7b'
|
25 |
- name: 'llama-2-7b'
|
26 |
fdir: 'llama-2-7b'
|
27 |
+
url: 'https://huggingface.co/meta-llama/Llama-2-7b-chat-hf'
|
28 |
- name: 'gemini-1.0-pro'
|
29 |
fdir: 'gemini-1.0-pro'
|
30 |
+
url: 'https://deepmind.google/technologies/gemini/pro/'
|
31 |
- name: 'gemini-1.5-pro'
|
32 |
fdir: 'gemini-1.5-pro'
|
33 |
+
url: 'https://deepmind.google/technologies/gemini/pro/'
|
34 |
- name: 'gemini-1.5-flash'
|
35 |
fdir: 'gemini-1.5-flash'
|
36 |
+
url: 'https://deepmind.google/technologies/gemini/flash/'
|
37 |
- name: 'llama-3-8b'
|
38 |
fdir: 'llama-3-8b'
|
39 |
+
url: 'https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct'
|
40 |
+
- name: 'llama-3-70b'
|
41 |
+
fdir: 'llama-3-70b'
|
42 |
+
url: 'https://huggingface.co/meta-llama/Meta-Llama-3-70B-Instruct'
|
43 |
- name: 'gpt-3.5-turbo-0125'
|
44 |
fdir: 'gpt-3.5-turbo-0125'
|
45 |
+
url: 'https://platform.openai.com/docs/models/gpt-3-5-turbo'
|
46 |
- name: 'gpt-4-0314'
|
47 |
fdir: 'gpt-4-0314'
|
48 |
+
url: 'https://platform.openai.com/docs/models/gpt-4-turbo-and-gpt-4'
|
49 |
- name: 'gpt-4-0613'
|
50 |
fdir: 'gpt-4-0613'
|
51 |
+
url: 'https://platform.openai.com/docs/models/gpt-4-turbo-and-gpt-4'
|
52 |
- name: 'gpt-4-1106-preview'
|
53 |
fdir: 'gpt-4-1106-preview'
|
54 |
+
url: 'https://platform.openai.com/docs/models/gpt-4-turbo-and-gpt-4'
|
55 |
- name: 'gpt-4-0125-preview'
|
56 |
fdir: 'gpt-4-0125-preview'
|
57 |
+
url: 'https://platform.openai.com/docs/models/gpt-4-turbo-and-gpt-4'
|
58 |
- name: 'gpt-4-turbo-2024-04-09'
|
59 |
fdir: 'gpt-4-turbo-2024-04-09'
|
60 |
+
url: 'https://platform.openai.com/docs/models/gpt-4-turbo-and-gpt-4'
|
61 |
- name: 'gpt-4o'
|
62 |
fdir: 'gpt-4o'
|
63 |
+
url: 'https://platform.openai.com/docs/models/gpt-4o'
|
64 |
+
- name: 'claude-3.5-sonnet'
|
65 |
+
fdir: 'claude3.5-sonnet'
|
66 |
+
url: 'https://docs.anthropic.com/en/docs/about-claude/models'
|
67 |
- name: 'claude-3-opus'
|
68 |
fdir: 'claude-3-opus-20240229'
|
69 |
+
url: 'https://docs.anthropic.com/en/docs/about-claude/models'
|
70 |
- name: 'claude-3-haiku'
|
71 |
fdir: 'claude-3-haiku-20240307'
|
72 |
+
url: 'https://docs.anthropic.com/en/docs/about-claude/models'
|
73 |
- name: 'claude-3-sonnet'
|
74 |
fdir: 'claude-3-sonnet-20240229'
|
75 |
+
url: 'https://docs.anthropic.com/en/docs/about-claude/models'
|
76 |
- name: 'claude-2.1'
|
77 |
fdir: 'claude-2.1'
|
78 |
+
url: 'https://docs.anthropic.com/en/docs/about-claude/models'
|
79 |
- name: 'claude-instant-1.2'
|
80 |
fdir: 'claude-instant-1.2'
|
81 |
+
url: 'https://docs.anthropic.com/en/docs/about-claude/models'
|
82 |
- name: 'command-r-plus'
|
83 |
fdir: 'command-r-plus'
|
84 |
+
url: 'https://huggingface.co/CohereForAI/c4ai-command-r-plus'
|
85 |
+
- name: 'mistral-7b-v0.3'
|
86 |
+
fdir: 'mistral-7b-v0.3'
|
87 |
+
url: 'https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.3'
|
88 |
- name: 'mistral-7b-v0.2'
|
89 |
fdir: 'mistral-7b-v0.2'
|
90 |
+
url: 'https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2'
|
91 |
- name: 'mistral-7b-v0.1'
|
92 |
fdir: 'mistral-7b-v0.1'
|
93 |
+
url: 'https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1'
|
94 |
- name: 'mixtral-8x7b'
|
95 |
fdir: 'mixtral-8x7b'
|
96 |
+
url: 'https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1'
|
97 |
+
- name: 'mistral-large'
|
98 |
+
fdir: 'mistral-large'
|
99 |
+
url: 'https://mistral.ai/news/mistral-large/'
|
100 |
- name: 'yi-1.5-34b'
|
101 |
fdir: 'yi-1.5-34b'
|
102 |
+
url: 'https://huggingface.co/01-ai/Yi-1.5-34B-Chat'
|
103 |
- name: 'yi-1.5-9b'
|
104 |
fdir: 'yi-1.5-9b'
|
105 |
+
url: 'https://huggingface.co/01-ai/Yi-1.5-9B-Chat'
|
106 |
- name: 'qwen-1.5-72b'
|
107 |
fdir: 'qwen-1.5-72b'
|
108 |
+
url: 'https://huggingface.co/Qwen/Qwen1.5-72B-Chat'
|
109 |
- name: 'qwen-1.5-32b'
|
110 |
fdir: 'qwen-1.5-32b'
|
111 |
+
url: 'https://huggingface.co/Qwen/Qwen1.5-32B-Chat'
|
112 |
- name: 'qwen-2-72b'
|
113 |
fdir: 'qwen-2-72b'
|
114 |
+
url: 'https://huggingface.co/Qwen/Qwen2-72B-Instruct'
|
115 |
- name: 'gemma-7b'
|
116 |
fdir: 'gemma-7b'
|
117 |
+
url: 'https://huggingface.co/google/gemma-7b-it'
|
118 |
- name: 'gemma-2b'
|
119 |
fdir: 'gemma-2b'
|
120 |
+
url: 'https://huggingface.co/google/gemma-2b-it'
|
|
|
121 |
- name: 'glm-4-9b'
|
122 |
fdir: 'glm-4-9b'
|
123 |
+
url: 'https://huggingface.co/THUDM/glm-4-9b-chat'
|
|
|
|
|
|
src/about.py
CHANGED
@@ -20,6 +20,8 @@ INTRODUCTION_TEXT = """
|
|
20 |
LLM_BENCHMARKS_TEXT = f"""
|
21 |
## How it works
|
22 |
|
|
|
|
|
23 |
### Task
|
24 |
The LLMs are evaluated as judges in a pairwise comparison task.
|
25 |
Each judge is presented with two **instruction-controllable** summaries and asked to select the better one.
|
@@ -47,9 +49,9 @@ The [prompt](https://github.com/princeton-nlp/LLMBar/blob/main/LLMEvaluator/eval
|
|
47 |
"""
|
48 |
|
49 |
CITATION_BUTTON_LABEL = "Please cite our paper if you use InstruSum in your work."
|
50 |
-
CITATION_BUTTON_TEXT = r"""@
|
51 |
-
title={Benchmarking
|
52 |
author={Liu, Yixin and Fabbri, Alexander R and Chen, Jiawen and Zhao, Yilun and Han, Simeng and Joty, Shafiq and Liu, Pengfei and Radev, Dragomir and Wu, Chien-Sheng and Cohan, Arman},
|
53 |
-
|
54 |
-
|
55 |
}"""
|
|
|
20 |
LLM_BENCHMARKS_TEXT = f"""
|
21 |
## How it works
|
22 |
|
23 |
+
![](src/logo.png)
|
24 |
+
|
25 |
### Task
|
26 |
The LLMs are evaluated as judges in a pairwise comparison task.
|
27 |
Each judge is presented with two **instruction-controllable** summaries and asked to select the better one.
|
|
|
49 |
"""
|
50 |
|
51 |
CITATION_BUTTON_LABEL = "Please cite our paper if you use InstruSum in your work."
|
52 |
+
CITATION_BUTTON_TEXT = r"""@inproceedings{liu2024benchmarking,
|
53 |
+
title={Benchmarking Generation and Evaluation Capabilities of Large Language Models for Instruction Controllable Summarization},
|
54 |
author={Liu, Yixin and Fabbri, Alexander R and Chen, Jiawen and Zhao, Yilun and Han, Simeng and Joty, Shafiq and Liu, Pengfei and Radev, Dragomir and Wu, Chien-Sheng and Cohan, Arman},
|
55 |
+
booktitle = "Findings of the Association for Computational Linguistics: NAACL 2024",
|
56 |
+
year = "2024",
|
57 |
}"""
|
src/populate.py
CHANGED
@@ -8,7 +8,7 @@ import numpy as np
|
|
8 |
from datasets import load_dataset
|
9 |
from .envs import TOKEN
|
10 |
|
11 |
-
TYPES = ["
|
12 |
|
13 |
|
14 |
def read_json(file_path: str) -> list[dict]:
|
@@ -95,7 +95,12 @@ def load_leaderboard() -> pd.DataFrame:
|
|
95 |
acc, agr, models_acc, models_agr = pairwise_meta_eval(
|
96 |
human_responses, f"./predictions/{fdir}.jsonl", f"./predictions/{fdir}_swap.jsonl"
|
97 |
)
|
98 |
-
predictions["Model"].append(model["name"])
|
|
|
|
|
|
|
|
|
|
|
99 |
predictions["Accuracy"].append(acc)
|
100 |
predictions["Agreement"].append(agr)
|
101 |
predictions["Self-Accuracy"].append(models_acc)
|
|
|
8 |
from datasets import load_dataset
|
9 |
from .envs import TOKEN
|
10 |
|
11 |
+
TYPES = ["number", "html", "number", "number", "number", "number"]
|
12 |
|
13 |
|
14 |
def read_json(file_path: str) -> list[dict]:
|
|
|
95 |
acc, agr, models_acc, models_agr = pairwise_meta_eval(
|
96 |
human_responses, f"./predictions/{fdir}.jsonl", f"./predictions/{fdir}_swap.jsonl"
|
97 |
)
|
98 |
+
# predictions["Model"].append(model["name"])
|
99 |
+
# predictions["Model"].append(f"[{model['name']}]({model['url']})")
|
100 |
+
link = model['url']
|
101 |
+
model_name = model['name']
|
102 |
+
output = f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
|
103 |
+
predictions["Model"].append(output)
|
104 |
predictions["Accuracy"].append(acc)
|
105 |
predictions["Agreement"].append(agr)
|
106 |
predictions["Self-Accuracy"].append(models_acc)
|