TracyMc committed on
Commit
03b1dbc
·
1 Parent(s): c94992d

Add application file

Browse files
Files changed (3) hide show
  1. .gitignore +13 -0
  2. app.py +241 -0
  3. requirements.txt +3 -0
.gitignore ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ auto_evals/
2
+ venv/
3
+ __pycache__/
4
+ .env
5
+ .ipynb_checkpoints
6
+ *ipynb
7
+ .vscode/
8
+
9
+ eval-queue/
10
+ eval-results/
11
+ eval-queue-bk/
12
+ eval-results-bk/
13
+ logs/
app.py ADDED
@@ -0,0 +1,241 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import json
3
+ import pandas as pd
4
+ from collections import defaultdict
5
+ import copy as cp
6
+ from urllib.request import urlopen
7
+ import re
8
+
9
# Constants
# BibTeX entry shown in the "Citation" accordion. It must be a single,
# brace-balanced @misc entry so it can be pasted directly into a .bib file.
CITATION_BUTTON_TEXT = r"""@misc{2023opencompass,
    title={OpenCompass: A Universal Evaluation Platform for Foundation Models},
    author={OpenCompass Contributors},
    howpublished = {\url{https://github.com/open-compass/opencompass}},
    year={2023}
}"""
CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
OPENCOMPASS_README = (
    'https://raw.githubusercontent.com/open-compass/opencompass/main/README.md'
)
GITHUB_REPO = 'https://github.com/open-compass/opencompass'
GITHUB_RAW = 'https://raw.githubusercontent.com/open-compass/opencompass'
GITHUB_BLOB = 'https://github.com/open-compass/opencompass/blob'

# URL for the JSON data
# NOTE(review): this is fetched over plain HTTP; switch to https:// once it is
# confirmed that the bucket serves the same object over TLS.
DATA_URL = "http://opencompass.oss-cn-shanghai.aliyuncs.com/assets/research-rank/research-data.24-12.20241205.json"

# Markdown content
MAIN_LEADERBOARD_TITLE = "# CompassAcademic Leaderboard"
MAIN_LEADERBOARD_DESCRIPTION = """## Main Evaluation Results
The CompassAcademic currently focuses on the comprehensive reasoning abilities of LLMs.
- The datasets selected so far include General Knowledge Reasoning (MMLU-Pro/GPQA-Diamond), Logical Reasoning (BBH), Mathematical Reasoning (MATH-500, AIME), Code Completion (LiveCodeBench, HumanEval), and Instruction Following (IFEval).
- Currently, the evaluation primarily targets chat models, with updates featuring the latest community models at irregular intervals.
- Prompts and reproduction scripts can be found in [**OpenCompass**: A Toolkit for Evaluation of LLMs](https://github.com/open-compass/opencompass)🏆.
"""
36
+
37
+
38
def fix_image_urls(content):
    """Rewrite repository-relative image references to absolute GitHub URLs.

    The README is rendered outside the repository, so relative image paths
    would not resolve. Two rewrites are applied:

    1. Every bare occurrence of the logo path (which may appear inside raw
       HTML such as ``<img src="...">``) is replaced by its absolute URL.
    2. Every markdown image ``![alt](path)`` whose path does not start with
       ``http`` is prefixed with the raw-content base URL. The alt text is
       kept unchanged.

    Args:
        content: Markdown text.

    Returns:
        The markdown text with image references made absolute.
    """
    raw_base = 'https://raw.githubusercontent.com/open-compass/opencompass/main/'
    logo_path = 'docs/en/_static/image/logo.svg'

    # Handle the specific logo.svg path. The lookbehind skips occurrences that
    # are already the tail of a longer path or URL, so an absolute logo URL is
    # not prefixed a second time.
    content = re.sub(
        r'(?<![\w/])' + re.escape(logo_path),
        raw_base + logo_path,
        content,
    )

    # Replace other relative image paths with absolute GitHub URLs.
    # group(1) is the alt text, group(2) the relative path.
    content = re.sub(
        r'!\[([^\]]*)\]\((?!http)([^)]+)\)',
        lambda m: f'![{m.group(1)}]({raw_base}{m.group(2)})',
        content,
    )

    return content
54
+
55
+
56
# Choices offered by the "Model Size" checkbox group; filter_table() matches
# on these exact labels.
MODEL_SIZE = [
    '<10B',
    '10B-70B',
    '>70B',
    'Unknown',
]
# Choices offered by the "Model Type" checkbox group.
MODEL_TYPE = [
    'API',
    'OpenSource',
]
58
+
59
+
60
def load_data(timeout=30):
    """Download and parse the leaderboard JSON published at ``DATA_URL``.

    Args:
        timeout: Seconds to wait on the network before giving up, so an
            unresponsive server cannot hang application start-up forever.

    Returns:
        The decoded JSON document.

    Raises:
        urllib.error.URLError: If the URL cannot be fetched (other ``OSError``
            subclasses are possible, e.g. on timeout).
        json.JSONDecodeError: If the response body is not valid JSON.
    """
    # The context manager closes the HTTP response even if reading or
    # decoding fails.
    with urlopen(DATA_URL, timeout=timeout) as response:
        return json.loads(response.read().decode('utf-8'))
64
+
65
+
66
def build_main_table(data):
    """Turn the raw leaderboard payload into the DataFrame shown in the UI.

    Args:
        data: Mapping with ``data['globalData']['OverallTable']`` (the rows of
            the overall table) and ``data['models']`` (model name -> record
            with a ``'release'`` field).

    Returns:
        A DataFrame restricted to the displayed columns, renamed to their
        display labels, with an extra ``OpenSource`` column holding
        ``'Yes'``/``'No'``.
    """
    overall = pd.DataFrame(data['globalData']['OverallTable'])
    release_info = data['models']

    def open_source_flag(model_name):
        # 'Yes' only when the model's release field is exactly 'OpenSource'.
        if release_info[model_name]['release'] == 'OpenSource':
            return 'Yes'
        return 'No'

    # Add OpenSource column based on models data
    overall['OpenSource'] = overall['model'].apply(open_source_flag)

    # (source column, display label) pairs, in display order.
    display_labels = dict(
        [
            ('model', 'Model'),
            ('org', 'Organization'),
            ('num', 'Parameters'),
            ('OpenSource', 'OpenSource'),
            ('Average', 'Average Score'),
            ('BBH', 'BBH'),
            ('Math-500', 'Math-500'),
            ('AIME', 'AIME'),
            ('MMLU-Pro', 'MMLU-Pro'),
            ('LiveCodeBench', 'LiveCodeBench'),
            ('HumanEval', 'HumanEval'),
            ('GQPA-Diamond', 'GQPA-Diamond'),
            ('IFEval', 'IFEval'),
        ]
    )
    selected = overall[list(display_labels)]
    return selected.rename(columns=display_labels)
92
+
93
+
94
def _parse_size_in_b(param):
    """Parse a parameter-count label such as ``'7B'`` into billions.

    Returns ``None`` for ``'N/A'`` and for any value that cannot be parsed
    (non-string input, or text that is not a number once ``'B'`` is removed).
    """
    if param == 'N/A':
        return None
    try:
        return float(param.replace('B', ''))
    except (AttributeError, TypeError, ValueError):
        # Only the failures that parsing can legitimately produce are treated
        # as "unknown size"; anything else should surface.
        return None


def filter_table(df, size_ranges, model_types):
    """Return the rows of ``df`` matching the selected sizes and model types.

    Args:
        df: Leaderboard table with a ``'Parameters'`` column (labels such as
            ``'7B'`` or ``'N/A'``) and an ``'OpenSource'`` column holding
            ``'Yes'``/``'No'``. It is not modified.
        size_ranges: Selected size labels among ``'<10B'``, ``'10B-70B'``,
            ``'>70B'`` and ``'Unknown'``. Rows matching any selected label are
            kept. An empty or ``None`` selection disables size filtering.
        model_types: Selected type labels among ``'API'`` (``OpenSource ==
            'No'``) and ``'OpenSource'`` (``OpenSource == 'Yes'``). An empty
            or ``None`` selection disables type filtering.

    Returns:
        A new DataFrame with the same columns as ``df``.
    """
    filtered_df = df.copy()

    # Filter by size
    if size_ranges:
        # Sizes are kept in a side Series rather than a temporary column, so
        # nothing has to be dropped from a filtered slice afterwards.
        sizes = pd.Series(
            [_parse_size_in_b(param) for param in filtered_df['Parameters']],
            index=filtered_df.index,
            dtype=float,
        )
        # Comparisons against NaN are False, so unparseable sizes only match
        # the 'Unknown' bucket. A size of exactly 70 falls in '>70B'.
        size_masks = {
            '<10B': sizes < 10,
            '10B-70B': (sizes >= 10) & (sizes < 70),
            '>70B': sizes >= 70,
            'Unknown': sizes.isna(),
        }

        mask = pd.Series(False, index=filtered_df.index)
        for size_range in size_ranges:
            if size_range in size_masks:
                mask |= size_masks[size_range]

        filtered_df = filtered_df[mask]

    # Filter by model type
    if model_types:
        flag_for_type = {'API': 'No', 'OpenSource': 'Yes'}
        wanted_flags = [
            flag_for_type[model_type]
            for model_type in model_types
            if model_type in flag_for_type
        ]
        filtered_df = filtered_df[filtered_df['OpenSource'].isin(wanted_flags)]

    # Return the filtered DataFrame directly
    return filtered_df
142
+
143
+
144
def calculate_column_widths(
    df,
    min_width=160,
    max_width=400,
    header_char_px=10,
    content_char_px=8,
    padding_px=20,
):
    """Estimate a pixel width for each column from its text length.

    For every column the width is the larger of the header estimate
    (``len(header) * header_char_px``) and the content estimate (longest
    stringified cell ``* content_char_px``), plus ``padding_px``, clamped to
    the range ``[min_width, max_width]``.

    Args:
        df: DataFrame whose columns are measured.
        min_width: Lower bound for any column, in pixels.
        max_width: Upper bound for any column, in pixels; prevents extremely
            wide columns.
        header_char_px: Assumed pixels per header character (wider than the
            content estimate so headers are sure to fit).
        content_char_px: Assumed pixels per cell character.
        padding_px: Extra pixels added to every column.

    Returns:
        A list of plain ``int`` widths, one per column, in column order.
    """
    column_widths = []

    for column in df.columns:
        header_length = len(str(column))
        content_lengths = df[column].astype(str).map(len)
        # A column with no rows has no content to measure; treat it as
        # zero-length instead of relying on how NaN compares.
        max_content_length = (
            int(content_lengths.max()) if len(content_lengths) else 0
        )

        width = max(
            header_length * header_char_px,
            max_content_length * content_char_px,
        ) + padding_px

        # Apply the minimum first, then the maximum.
        width = min(max_width, max(min_width, width))

        column_widths.append(int(width))

    return column_widths
168
+
169
+
170
def _load_about_markdown(timeout=30):
    """Fetch the OpenCompass README for the About tab, with a fallback.

    The README is auxiliary content, so a network failure falls back to a
    short pointer to the repository rather than preventing the leaderboard
    from starting.
    """
    try:
        # The context manager closes the HTTP response after reading.
        with urlopen(OPENCOMPASS_README, timeout=timeout) as response:
            readme_content = response.read().decode()
    except OSError:
        # urllib.error.URLError and socket timeouts are OSError subclasses.
        return (
            'The OpenCompass README could not be loaded. '
            f'See [{GITHUB_REPO}]({GITHUB_REPO}).'
        )
    return fix_image_urls(readme_content)


def create_interface():
    """Build the Gradio Blocks app for the CompassAcademic leaderboard.

    Loads the leaderboard data once, then lays out:

    * a "Main Leaderboard" tab with size/type checkbox filters and a
      read-only table sorted by "Average Score" (descending); changing
      either filter re-filters and re-sorts the table;
    * an "About" tab rendering the OpenCompass README;
    * a collapsed "Citation" accordion with the BibTeX snippet.

    Returns:
        The ``gr.Blocks`` instance, not yet launched.
    """
    data = load_data()
    df = build_main_table(data)

    def update_table(size_ranges, model_types):
        # Re-filter from the full table each time so selections never compound.
        filtered_df = filter_table(df, size_ranges, model_types)
        return filtered_df.sort_values("Average Score", ascending=False)

    with gr.Blocks() as demo:
        gr.Markdown(MAIN_LEADERBOARD_TITLE)

        with gr.Tabs():
            with gr.TabItem("🏅 Main Leaderboard", elem_id='main'):
                gr.Markdown(MAIN_LEADERBOARD_DESCRIPTION)

                with gr.Row():
                    with gr.Column():
                        size_filter = gr.CheckboxGroup(
                            choices=MODEL_SIZE,
                            value=MODEL_SIZE,
                            label='Model Size',
                            interactive=True,
                        )
                    with gr.Column():
                        type_filter = gr.CheckboxGroup(
                            choices=MODEL_TYPE,
                            value=MODEL_TYPE,
                            label='Model Type',
                            interactive=True,
                        )

                with gr.Column():
                    table = gr.DataFrame(
                        value=df.sort_values("Average Score", ascending=False),
                        interactive=False,
                        wrap=False,  # disable automatic line wrapping
                        column_widths=calculate_column_widths(df),
                    )

                # Both filters feed the same handler with the current state of
                # both checkbox groups.
                size_filter.change(
                    fn=update_table,
                    inputs=[size_filter, type_filter],
                    outputs=table,
                )

                type_filter.change(
                    fn=update_table,
                    inputs=[size_filter, type_filter],
                    outputs=table,
                )

            with gr.TabItem("🔍 About", elem_id='about'):
                gr.Markdown(_load_about_markdown())

        with gr.Row():
            with gr.Accordion("Citation", open=False):
                gr.Textbox(
                    value=CITATION_BUTTON_TEXT,
                    label=CITATION_BUTTON_LABEL,
                    elem_id='citation-button',
                )

    return demo
237
+
238
+
239
if __name__ == '__main__':
    # Build the UI, then serve it on all network interfaces (0.0.0.0) rather
    # than only on localhost.
    demo = create_interface()
    demo.launch(server_name='0.0.0.0')
requirements.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ gradio==4.15.0
2
+ numpy>=1.23.4
3
+ pandas>=1.5.3