nxphi47 committed on
Commit
5919bed
·
1 Parent(s): 33587db

Create performance_plot.py

Browse files
Files changed (1) hide show
  1. performance_plot.py +253 -0
performance_plot.py ADDED
@@ -0,0 +1,253 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import os
3
+ import gradio as gr
4
+ import json
5
+ import pandas as pd
6
+ import plotly.express as px
7
+ import plotly.graph_objects as go
8
+ from datasets import load_dataset
9
+ from plotly.subplots import make_subplots
10
+
11
+
12
# Benchmark categories, in the order they are passed to the aggregation helpers.
CATEGORIES = ["task-solving", "math-reasoning", "general-instruction", "natural-question", "safety"]

# Runtime settings read from environment variables.
# FORCE_DOWNLOAD must be an integer string ("0" / "1"); anything else raises at import time.
FORCE_DOWNLOAD = bool(int(os.environ.get("FORCE_DOWNLOAD", "0")))
HF_TOKEN = str(os.environ.get("HF_TOKEN", ""))
DATA_SET_REPO_PATH = str(os.environ.get("DATA_SET_REPO_PATH", ""))
PERFORMANCE_FILENAME = str(os.environ.get("PERFORMANCE_FILENAME", "gpt4_single_json.csv"))


# Model id as found in the "model" column of the performance file -> display name.
# Only models listed here are plotted; the key order drives colour assignment.
rename_map = {
    "seallm13b10L6k_a_5a1R1_seaall_sft4x_1_5a1_r2_0_dpo_8_40000s": "SeaLLM-13b",
    # "seallm13b10L4k_a_sft4xdpo_5a": "SeaLLM-13b-10L",
    "polylm": "PolyLM-13b",
    "qwen": "Qwen-14b",
    "gpt-3.5-turbo": "GPT-3.5-turbo",
    "gpt-4-1106-preview": "GPT-4-turbo",
}

# Category id -> label shown on the radar axes.
CATEGORIES_NAMES = {
    "task-solving": 'Task-solving',
    "math-reasoning": 'Math',
    "general-instruction": 'General-instruction',
    "natural-question": 'NaturalQA',
    "safety": 'Safety',
}


# Language codes in plotting order. This is the single source of truth for
# ordering: LANG_NAMES below is a lookup table and is NOT in the same order,
# so always index it with a code from LANGS rather than iterating its values.
LANGS = ['en', 'vi', 'id', 'ms', 'tl', 'th', 'km', 'lo', 'my']
LANG_NAMES = {
    'en': 'eng',
    'vi': 'vie',
    'th': 'tha',
    'id': 'ind',
    'km': 'khm',
    'lo': 'lao',
    'ms': 'msa',
    'my': 'mya',
    'tl': 'tgl',
}


# Placeholder for a cached performance DataFrame. Nothing assigns to it at the
# moment (the caching code in get_model_df is disabled), so it stays None.
MODEL_DFRAME = None
58
+
59
def get_model_df():
    """Download the performance CSV from the Hugging Face Hub and load it.

    The dataset repo, file name, access token and force-download flag are
    taken from the module-level settings ``DATA_SET_REPO_PATH``,
    ``PERFORMANCE_FILENAME``, ``HF_TOKEN`` and ``FORCE_DOWNLOAD``. The file is
    stored under ``./hf_cache``.

    Returns:
        pandas.DataFrame: the parsed contents of the downloaded CSV file.

    Raises:
        ValueError: if ``DATA_SET_REPO_PATH`` or ``HF_TOKEN`` is empty.
    """
    # Explicit raises instead of ``assert``: assertions are stripped under
    # ``python -O`` and would let an empty repo/token reach the Hub client.
    if not DATA_SET_REPO_PATH:
        raise ValueError("DATA_SET_REPO_PATH environment variable must be set")
    if not HF_TOKEN:
        raise ValueError("HF_TOKEN environment variable must be set")

    # Imported lazily so the module can be imported without huggingface_hub
    # until a plot is actually requested.
    from huggingface_hub import hf_hub_download

    file_path = hf_hub_download(
        repo_id=DATA_SET_REPO_PATH,
        filename=PERFORMANCE_FILENAME,
        force_download=FORCE_DOWNLOAD,
        local_dir='./hf_cache',
        repo_type="dataset",
        token=HF_TOKEN,
    )
    print(f'Downloaded file at {file_path} from {DATA_SET_REPO_PATH} / {PERFORMANCE_FILENAME}')
    return pd.read_csv(file_path)
82
+
83
+
84
def aggregate_df(df, model_dict, category_name, categories):
    """Compute the mean score per (model, category) for the selected models.

    Args:
        df: DataFrame with at least the columns ``"model"``, ``"score"`` and
            ``category_name``.
        model_dict: mapping of model id (as stored in ``df["model"]``) to the
            display name used in the result. Only these models are aggregated.
        category_name: name of the column to group by (e.g. ``"category"`` or
            ``"lang"``).
        categories: values of ``category_name`` to report, in output order.

    Returns:
        pandas.DataFrame with columns ``["model", category_name, "score"]``.
        Models appear in the reverse order of ``model_dict``; within a model,
        rows follow ``categories``. ``"model"`` holds display names. A model
        that has no valid rows for a category gets a NaN score; a model that
        does not occur in ``df`` at all is omitted. When none of the models
        occur, an empty frame with the three columns is returned.
    """
    columns = ["model", category_name, "score"]

    # Negative scores mark rows whose score could not be parsed (<1% of cases).
    valid = df[df["score"] >= 0]
    present = set(df["model"].unique())

    records = []
    for model in reversed(list(model_dict)):
        if model not in present:
            continue
        model_rows = valid[valid["model"] == model]
        for cat in categories:
            score = model_rows.loc[model_rows[category_name] == cat, "score"].mean()
            # Rename only the model field. A frame-wide replace would also
            # rewrite any category value that happens to equal a model id.
            records.append({"model": model_dict[model], category_name: cat, "score": score})

    # Passing ``columns`` keeps the schema stable even when ``records`` is empty.
    return pd.DataFrame(records, columns=columns)
107
+
108
+
109
def polar_subplot(fig, dframe, model_names, category_label, category_names, row, col, showlegend=True):
    """Add one closed radar trace per model to the subplot at (row, col).

    Args:
        fig: figure to draw on; traces are added with ``fig.add_trace(trace, row, col)``.
        dframe: DataFrame with columns ``"model"``, ``category_label`` and
            ``"score"``; ``"model"`` is matched against display names.
        model_names: sequence of ``(model_id, display_name)`` pairs. Only the
            display name is used; the position in the sequence selects the
            colour and the legend group.
        category_label: column of ``dframe`` holding the angular categories.
        category_names: mapping from category value to the axis label shown.
        row, col: 1-based subplot position.
        showlegend: whether the traces added here appear in the legend.

    Models without any row in ``dframe`` are skipped instead of raising.
    """
    palette = px.colors.qualitative.Plotly
    for idx, (_, display_name) in enumerate(model_names):
        model_rows = dframe[dframe['model'] == display_name]
        if model_rows.empty:
            # Nothing to plot for this model in this subplot.
            continue

        labels = [category_names[cat] for cat in model_rows[category_label].tolist()]
        scores = model_rows['score'].tolist()

        trace = go.Scatterpolar(
            name=display_name,
            # Repeat the first point at the end so the outline is closed.
            r=scores + scores[:1],
            theta=labels + labels[:1],
            # Same group across subplots so one legend entry toggles a model everywhere.
            legendgroup=f'{idx}',
            # Wrap around so more models than palette entries cannot overflow.
            marker=dict(color=palette[idx % len(palette)]),
            hovertemplate="""Score: %{r:.2f}""",
            showlegend=showlegend,
        )
        fig.add_trace(trace, row, col)
129
+
130
+
131
def plot_agg_fn():
    """Build the aggregated figure: scores by category and by language.

    Loads the performance table with ``get_model_df``, averages it per
    category and per language for the models in ``rename_map``, and draws the
    two results as side-by-side radar charts.

    Returns:
        The plotly figure produced by ``make_subplots`` with all traces added.
    """
    df = get_model_df()
    model_names = list(rename_map.items())

    cat_df = aggregate_df(df, rename_map, "category", CATEGORIES)
    lang_df = aggregate_df(df, rename_map, "lang", LANGS)

    fig = make_subplots(
        rows=1, cols=2,
        specs=[[{'type': 'polar'}] * 2],
        subplot_titles=("By Category", "By Language"),
    )
    # The only annotations at this point are the two subplot titles; lift them
    # slightly so they do not touch the charts.
    for annotation in fig.layout.annotations:
        annotation.y = 1.05

    # Left: per-category scores (these traces own the legend entries).
    polar_subplot(fig, cat_df, model_names, 'category', CATEGORIES_NAMES, 1, 1)
    # Right: per-language scores; legend entries are shared via legendgroup.
    polar_subplot(fig, lang_df, model_names, 'lang', LANG_NAMES, 1, 2, showlegend=False)

    polar_config = dict(
        angularaxis=dict(
            rotation=90,  # start position of angular axis
        ),
        radialaxis=dict(
            range=[0, 10],
        ),
    )

    fig.update_layout(
        polar=polar_config,
        polar2=polar_config,
        title='Sea-Bench (rated by GPT-4)',
    )
    return fig
200
+
201
+
202
def plot_by_lang_fn():
    """Build a grid of radar charts, one per language, of scores by category.

    Loads the performance table with ``get_model_df`` and, for every language
    in ``LANGS``, averages the scores per category for the models in
    ``rename_map`` and draws them in that language's subplot.

    Returns:
        The plotly figure produced by ``make_subplots`` with all traces added.
    """
    df = get_model_df()
    model_names = list(rename_map.items())

    n_cols = 3
    n_rows = max(1, (len(LANGS) + n_cols - 1) // n_cols)

    fig = make_subplots(
        rows=n_rows, cols=n_cols,
        # Build each spec dict separately rather than aliasing one list.
        specs=[[{'type': 'polar'} for _ in range(n_cols)] for _ in range(n_rows)],
        # Titles must follow LANGS, the order in which subplots are filled
        # below; LANG_NAMES is stored in a different order.
        subplot_titles=[LANG_NAMES[lang] for lang in LANGS],
    )
    # Nudge the subplot titles up so they clear the charts.
    for annotation in fig.layout.annotations:
        annotation.y = annotation.y + 0.02

    # Only these languages are plotted with the "safety" axis.
    # NOTE(review): presumably the only ones with safety data -- confirm.
    has_safety = ['vi', 'id', 'th']

    for lang_id, lang in enumerate(LANGS):
        cat_names = CATEGORIES if lang in has_safety else [x for x in CATEGORIES if x != 'safety']
        cat_lang_df = aggregate_df(df[df['lang'] == lang], rename_map, "category", cat_names)
        row = lang_id // n_cols + 1
        col = lang_id % n_cols + 1
        # Only the first subplot contributes legend entries.
        polar_subplot(fig, cat_lang_df, model_names, 'category', CATEGORIES_NAMES, row, col, showlegend=lang_id == 0)

    polar_config = dict(
        angularaxis=dict(
            rotation=90,  # start position of angular axis
        ),
        radialaxis=dict(
            range=[0, 10],
        ),
    )
    # One layout entry per polar subplot in the grid.
    layer_kwargs = {f"polar{i}": polar_config for i in range(1, n_rows * n_cols + 1)}
    fig.update_layout(
        title='Sea-Bench - By language (rated by GPT-4)',
        height=1000,
        **layer_kwargs
    )
    return fig
240
+
241
+
242
+
243
def attach_plot_to_demo(demo):
    """Add a collapsed benchmark accordion with two plots to ``demo``.

    Must be called while the Gradio layout of ``demo`` is being built. Two
    ``gr.Plot`` components are created inside a closed ``gr.Accordion``, and
    ``demo.load`` is used to register ``plot_agg_fn`` and ``plot_by_lang_fn``
    as the functions that fill them.
    """
    with gr.Accordion("Psst... wanna see some performance benchmarks?", open=False):
        aggregated_plot = gr.Plot(label="Aggregated")
        by_language_plot = gr.Plot(label='By language')

    # Register one load handler per plot, in the same order as the components.
    plot_bindings = (
        (plot_agg_fn, aggregated_plot),
        (plot_by_lang_fn, by_language_plot),
    )
    for build_figure, target_plot in plot_bindings:
        demo.load(build_figure, [], target_plot)
252
+
253
+