henryL7 committed on
Commit
9bd8edd
•
1 Parent(s): 18902c1

update layout

Browse files
Files changed (5) hide show
  1. .gitignore +2 -0
  2. app.py +4 -0
  3. data/models.yaml +49 -8
  4. src/about.py +6 -4
  5. src/populate.py +7 -2
.gitignore CHANGED
@@ -14,3 +14,5 @@ logs/
14
  envs/
15
 
16
  tmp.py
 
 
 
14
  envs/
15
 
16
  tmp.py
17
+ print.py
18
+ leaderboard.tex
app.py CHANGED
@@ -45,6 +45,10 @@ with demo:
45
  )
46
 
47
  with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
 
 
 
 
48
  gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
49
 
50
  with gr.Row():
 
45
  )
46
 
47
  with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
48
+ # with gr.Column(scale=2):
49
+ # gr.Markdown("""
50
+ # ![](src/logo.png)
51
+ # """)
52
  gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
53
 
54
  with gr.Row():
data/models.yaml CHANGED
@@ -1,82 +1,123 @@
1
  - name: 'tulu-2-dpo-70b'
2
  fdir: 'tulu-2-dpo-70b'
 
3
  - name: 'tulu-2-70b'
4
  fdir: 'tulu-2-70b'
 
5
  - name: 'llama-2-70b'
6
  fdir: 'llama-2-70b'
 
7
  - name: 'tulu-2-dpo-13b'
8
  fdir: 'tulu-2-dpo-13b'
 
9
  - name: 'tulu-2-13b'
10
  fdir: 'tulu-2-13b'
 
11
  - name: 'llama-2-13b'
12
  fdir: 'llama-2-13b'
 
13
  - name: 'tulu-2-dpo-7b'
14
  fdir: 'tulu-2-dpo-7b'
 
15
  - name: 'tulu-2-7b'
16
  fdir: 'tulu-2-7b'
 
17
  - name: 'llama-2-7b'
18
  fdir: 'llama-2-7b'
 
19
  - name: 'gemini-1.0-pro'
20
  fdir: 'gemini-1.0-pro'
 
21
  - name: 'gemini-1.5-pro'
22
  fdir: 'gemini-1.5-pro'
 
23
  - name: 'gemini-1.5-flash'
24
  fdir: 'gemini-1.5-flash'
 
25
  - name: 'llama-3-8b'
26
  fdir: 'llama-3-8b'
 
 
 
 
27
  - name: 'gpt-3.5-turbo-0125'
28
  fdir: 'gpt-3.5-turbo-0125'
 
29
  - name: 'gpt-4-0314'
30
  fdir: 'gpt-4-0314'
 
31
  - name: 'gpt-4-0613'
32
  fdir: 'gpt-4-0613'
 
33
  - name: 'gpt-4-1106-preview'
34
  fdir: 'gpt-4-1106-preview'
 
35
  - name: 'gpt-4-0125-preview'
36
  fdir: 'gpt-4-0125-preview'
 
37
  - name: 'gpt-4-turbo-2024-04-09'
38
  fdir: 'gpt-4-turbo-2024-04-09'
 
39
  - name: 'gpt-4o'
40
  fdir: 'gpt-4o'
 
 
 
 
41
  - name: 'claude-3-opus'
42
  fdir: 'claude-3-opus-20240229'
 
43
  - name: 'claude-3-haiku'
44
  fdir: 'claude-3-haiku-20240307'
 
45
  - name: 'claude-3-sonnet'
46
  fdir: 'claude-3-sonnet-20240229'
 
47
  - name: 'claude-2.1'
48
  fdir: 'claude-2.1'
 
49
  - name: 'claude-instant-1.2'
50
  fdir: 'claude-instant-1.2'
 
51
  - name: 'command-r-plus'
52
  fdir: 'command-r-plus'
53
- - name: 'llama-3-70b'
54
- fdir: 'llama-3-70b'
 
 
55
  - name: 'mistral-7b-v0.2'
56
  fdir: 'mistral-7b-v0.2'
 
57
  - name: 'mistral-7b-v0.1'
58
  fdir: 'mistral-7b-v0.1'
 
59
  - name: 'mixtral-8x7b'
60
  fdir: 'mixtral-8x7b'
 
 
 
 
61
  - name: 'yi-1.5-34b'
62
  fdir: 'yi-1.5-34b'
 
63
  - name: 'yi-1.5-9b'
64
  fdir: 'yi-1.5-9b'
 
65
  - name: 'qwen-1.5-72b'
66
  fdir: 'qwen-1.5-72b'
 
67
  - name: 'qwen-1.5-32b'
68
  fdir: 'qwen-1.5-32b'
 
69
  - name: 'qwen-2-72b'
70
  fdir: 'qwen-2-72b'
 
71
  - name: 'gemma-7b'
72
  fdir: 'gemma-7b'
 
73
  - name: 'gemma-2b'
74
  fdir: 'gemma-2b'
75
- - name: 'mistral-7b-v0.3'
76
- fdir: 'mistral-7b-v0.3'
77
  - name: 'glm-4-9b'
78
  fdir: 'glm-4-9b'
79
- - name: 'mistral-large'
80
- fdir: 'mistral-large'
81
- - name: 'claude-3.5-sonnet'
82
- fdir: 'claude3.5-sonnet'
 
1
  - name: 'tulu-2-dpo-70b'
2
  fdir: 'tulu-2-dpo-70b'
3
+ url: 'https://huggingface.co/allenai/tulu-2-dpo-70b'
4
  - name: 'tulu-2-70b'
5
  fdir: 'tulu-2-70b'
6
+ url: 'https://huggingface.co/allenai/tulu-2-70b'
7
  - name: 'llama-2-70b'
8
  fdir: 'llama-2-70b'
9
+ url: 'https://huggingface.co/meta-llama/Llama-2-70b-chat-hf'
10
  - name: 'tulu-2-dpo-13b'
11
  fdir: 'tulu-2-dpo-13b'
12
+ url: 'https://huggingface.co/allenai/tulu-2-dpo-13b'
13
  - name: 'tulu-2-13b'
14
  fdir: 'tulu-2-13b'
15
+ url: 'https://huggingface.co/allenai/tulu-2-13b'
16
  - name: 'llama-2-13b'
17
  fdir: 'llama-2-13b'
18
+ url: 'https://huggingface.co/meta-llama/Llama-2-13b-chat-hf'
19
  - name: 'tulu-2-dpo-7b'
20
  fdir: 'tulu-2-dpo-7b'
21
+ url: 'https://huggingface.co/allenai/tulu-2-dpo-7b'
22
  - name: 'tulu-2-7b'
23
  fdir: 'tulu-2-7b'
24
+ url: 'https://huggingface.co/allenai/tulu-2-7b'
25
  - name: 'llama-2-7b'
26
  fdir: 'llama-2-7b'
27
+ url: 'https://huggingface.co/meta-llama/Llama-2-7b-chat-hf'
28
  - name: 'gemini-1.0-pro'
29
  fdir: 'gemini-1.0-pro'
30
+ url: 'https://deepmind.google/technologies/gemini/pro/'
31
  - name: 'gemini-1.5-pro'
32
  fdir: 'gemini-1.5-pro'
33
+ url: 'https://deepmind.google/technologies/gemini/pro/'
34
  - name: 'gemini-1.5-flash'
35
  fdir: 'gemini-1.5-flash'
36
+ url: 'https://deepmind.google/technologies/gemini/flash/'
37
  - name: 'llama-3-8b'
38
  fdir: 'llama-3-8b'
39
+ url: 'https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct'
40
+ - name: 'llama-3-70b'
41
+ fdir: 'llama-3-70b'
42
+ url: 'https://huggingface.co/meta-llama/Meta-Llama-3-70B-Instruct'
43
  - name: 'gpt-3.5-turbo-0125'
44
  fdir: 'gpt-3.5-turbo-0125'
45
+ url: 'https://platform.openai.com/docs/models/gpt-3-5-turbo'
46
  - name: 'gpt-4-0314'
47
  fdir: 'gpt-4-0314'
48
+ url: 'https://platform.openai.com/docs/models/gpt-4-turbo-and-gpt-4'
49
  - name: 'gpt-4-0613'
50
  fdir: 'gpt-4-0613'
51
+ url: 'https://platform.openai.com/docs/models/gpt-4-turbo-and-gpt-4'
52
  - name: 'gpt-4-1106-preview'
53
  fdir: 'gpt-4-1106-preview'
54
+ url: 'https://platform.openai.com/docs/models/gpt-4-turbo-and-gpt-4'
55
  - name: 'gpt-4-0125-preview'
56
  fdir: 'gpt-4-0125-preview'
57
+ url: 'https://platform.openai.com/docs/models/gpt-4-turbo-and-gpt-4'
58
  - name: 'gpt-4-turbo-2024-04-09'
59
  fdir: 'gpt-4-turbo-2024-04-09'
60
+ url: 'https://platform.openai.com/docs/models/gpt-4-turbo-and-gpt-4'
61
  - name: 'gpt-4o'
62
  fdir: 'gpt-4o'
63
+ url: 'https://platform.openai.com/docs/models/gpt-4o'
64
+ - name: 'claude-3.5-sonnet'
65
+ fdir: 'claude3.5-sonnet'
66
+ url: 'https://docs.anthropic.com/en/docs/about-claude/models'
67
  - name: 'claude-3-opus'
68
  fdir: 'claude-3-opus-20240229'
69
+ url: 'https://docs.anthropic.com/en/docs/about-claude/models'
70
  - name: 'claude-3-haiku'
71
  fdir: 'claude-3-haiku-20240307'
72
+ url: 'https://docs.anthropic.com/en/docs/about-claude/models'
73
  - name: 'claude-3-sonnet'
74
  fdir: 'claude-3-sonnet-20240229'
75
+ url: 'https://docs.anthropic.com/en/docs/about-claude/models'
76
  - name: 'claude-2.1'
77
  fdir: 'claude-2.1'
78
+ url: 'https://docs.anthropic.com/en/docs/about-claude/models'
79
  - name: 'claude-instant-1.2'
80
  fdir: 'claude-instant-1.2'
81
+ url: 'https://docs.anthropic.com/en/docs/about-claude/models'
82
  - name: 'command-r-plus'
83
  fdir: 'command-r-plus'
84
+ url: 'https://huggingface.co/CohereForAI/c4ai-command-r-plus'
85
+ - name: 'mistral-7b-v0.3'
86
+ fdir: 'mistral-7b-v0.3'
87
+ url: 'https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.3'
88
  - name: 'mistral-7b-v0.2'
89
  fdir: 'mistral-7b-v0.2'
90
+ url: 'https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2'
91
  - name: 'mistral-7b-v0.1'
92
  fdir: 'mistral-7b-v0.1'
93
+ url: 'https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1'
94
  - name: 'mixtral-8x7b'
95
  fdir: 'mixtral-8x7b'
96
+ url: 'https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1'
97
+ - name: 'mistral-large'
98
+ fdir: 'mistral-large'
99
+ url: 'https://mistral.ai/news/mistral-large/'
100
  - name: 'yi-1.5-34b'
101
  fdir: 'yi-1.5-34b'
102
+ url: 'https://huggingface.co/01-ai/Yi-1.5-34B-Chat'
103
  - name: 'yi-1.5-9b'
104
  fdir: 'yi-1.5-9b'
105
+ url: 'https://huggingface.co/01-ai/Yi-1.5-9B-Chat'
106
  - name: 'qwen-1.5-72b'
107
  fdir: 'qwen-1.5-72b'
108
+ url: 'https://huggingface.co/Qwen/Qwen1.5-72B-Chat'
109
  - name: 'qwen-1.5-32b'
110
  fdir: 'qwen-1.5-32b'
111
+ url: 'https://huggingface.co/Qwen/Qwen1.5-32B-Chat'
112
  - name: 'qwen-2-72b'
113
  fdir: 'qwen-2-72b'
114
+ url: 'https://huggingface.co/Qwen/Qwen2-72B-Instruct'
115
  - name: 'gemma-7b'
116
  fdir: 'gemma-7b'
117
+ url: 'https://huggingface.co/google/gemma-7b-it'
118
  - name: 'gemma-2b'
119
  fdir: 'gemma-2b'
120
+ url: 'https://huggingface.co/google/gemma-2b-it'
 
121
  - name: 'glm-4-9b'
122
  fdir: 'glm-4-9b'
123
+ url: 'https://huggingface.co/THUDM/glm-4-9b-chat'
 
 
 
src/about.py CHANGED
@@ -20,6 +20,8 @@ INTRODUCTION_TEXT = """
20
  LLM_BENCHMARKS_TEXT = f"""
21
  ## How it works
22
 
 
 
23
  ### Task
24
  The LLMs are evaluated as judges in a pairwise comparison task.
25
  Each judge is presented with two **instruction-controllable** summaries and asked to select the better one.
@@ -47,9 +49,9 @@ The [prompt](https://github.com/princeton-nlp/LLMBar/blob/main/LLMEvaluator/eval
47
  """
48
 
49
  CITATION_BUTTON_LABEL = "Please cite our paper if you use InstruSum in your work."
50
- CITATION_BUTTON_TEXT = r"""@article{liu2023benchmarking,
51
- title={Benchmarking generation and evaluation capabilities of large language models for instruction controllable summarization},
52
  author={Liu, Yixin and Fabbri, Alexander R and Chen, Jiawen and Zhao, Yilun and Han, Simeng and Joty, Shafiq and Liu, Pengfei and Radev, Dragomir and Wu, Chien-Sheng and Cohan, Arman},
53
- journal={arXiv preprint arXiv:2311.09184},
54
- year={2023}
55
  }"""
 
20
  LLM_BENCHMARKS_TEXT = f"""
21
  ## How it works
22
 
23
+ ![](src/logo.png)
24
+
25
  ### Task
26
  The LLMs are evaluated as judges in a pairwise comparison task.
27
  Each judge is presented with two **instruction-controllable** summaries and asked to select the better one.
 
49
  """
50
 
51
  CITATION_BUTTON_LABEL = "Please cite our paper if you use InstruSum in your work."
52
+ CITATION_BUTTON_TEXT = r"""@inproceedings{liu2024benchmarking,
53
+ title={Benchmarking Generation and Evaluation Capabilities of Large Language Models for Instruction Controllable Summarization},
54
  author={Liu, Yixin and Fabbri, Alexander R and Chen, Jiawen and Zhao, Yilun and Han, Simeng and Joty, Shafiq and Liu, Pengfei and Radev, Dragomir and Wu, Chien-Sheng and Cohan, Arman},
55
+ booktitle = "Findings of the Association for Computational Linguistics: NAACL 2024",
56
+ year = "2024",
57
  }"""
src/populate.py CHANGED
@@ -8,7 +8,7 @@ import numpy as np
8
  from datasets import load_dataset
9
  from .envs import TOKEN
10
 
11
- TYPES = ["str", "number", "number", "number", "number", "number"]
12
 
13
 
14
  def read_json(file_path: str) -> list[dict]:
@@ -95,7 +95,12 @@ def load_leaderboard() -> pd.DataFrame:
95
  acc, agr, models_acc, models_agr = pairwise_meta_eval(
96
  human_responses, f"./predictions/{fdir}.jsonl", f"./predictions/{fdir}_swap.jsonl"
97
  )
98
- predictions["Model"].append(model["name"])
 
 
 
 
 
99
  predictions["Accuracy"].append(acc)
100
  predictions["Agreement"].append(agr)
101
  predictions["Self-Accuracy"].append(models_acc)
 
8
  from datasets import load_dataset
9
  from .envs import TOKEN
10
 
11
+ TYPES = ["number", "html", "number", "number", "number", "number"]
12
 
13
 
14
  def read_json(file_path: str) -> list[dict]:
 
95
  acc, agr, models_acc, models_agr = pairwise_meta_eval(
96
  human_responses, f"./predictions/{fdir}.jsonl", f"./predictions/{fdir}_swap.jsonl"
97
  )
98
+ # predictions["Model"].append(model["name"])
99
+ # predictions["Model"].append(f"[{model['name']}]({model['url']})")
100
+ link = model['url']
101
+ model_name = model['name']
102
+ output = f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
103
+ predictions["Model"].append(output)
104
  predictions["Accuracy"].append(acc)
105
  predictions["Agreement"].append(agr)
106
  predictions["Self-Accuracy"].append(models_acc)