pminervini committed on
Commit
c0db8b3
·
1 Parent(s): 03b3c51
cli/analysis-cli.py CHANGED
@@ -3,6 +3,7 @@
3
  import os
4
  import sys
5
  import json
 
6
 
7
  import numpy as np
8
 
@@ -27,113 +28,292 @@ def find_json_files(json_path):
27
  return res
28
 
29
 
30
- my_snapshot_download(repo_id=RESULTS_REPO, revision="main", local_dir=EVAL_RESULTS_PATH_BACKEND, repo_type="dataset", max_workers=60)
31
- my_snapshot_download(repo_id=QUEUE_REPO, revision="main", local_dir=EVAL_REQUESTS_PATH_BACKEND, repo_type="dataset", max_workers=60)
 
 
 
 
 
 
 
 
 
 
 
32
 
33
- result_path_lst = find_json_files(EVAL_RESULTS_PATH_BACKEND)
34
- request_path_lst = find_json_files(EVAL_REQUESTS_PATH_BACKEND)
35
 
36
- model_name_to_model_map = {}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
37
 
38
- for path in request_path_lst:
39
- with open(path, 'r') as f:
40
- data = json.load(f)
41
- model_name_to_model_map[data["model"]] = data
42
 
43
- model_dataset_metric_to_result_map = {}
44
- data_map = {}
45
 
46
- for path in result_path_lst:
47
- with open(path, 'r') as f:
48
- data = json.load(f)
49
- model_name = data["config"]["model_name"]
50
- for dataset_name, results_dict in data["results"].items():
51
- for metric_name, value in results_dict.items():
52
 
53
- # print(model_name, dataset_name, metric_name, value)
 
 
 
 
 
54
 
55
- if ',' in metric_name and '_stderr' not in metric_name \
56
- and 'f1' not in metric_name \
57
- and model_name_to_model_map[model_name]["likes"] > 256:
58
 
59
- to_add = True
 
 
60
 
61
- if 'selfcheck' in dataset_name:
62
- if 'max' not in metric_name:
63
- to_add = False
64
 
65
- if 'nq_open' in dataset_name or 'triviaqa' in dataset_name:
66
- to_add = False
67
- # pass
 
 
 
 
 
 
 
68
 
69
- # breakpoint()
70
 
71
- if 'bertscore' in metric_name:
72
- if 'precision' not in metric_name:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
73
  to_add = False
74
 
75
- if 'correctness,' in metric_name or 'em,' in metric_name:
76
- to_add = False
 
 
77
 
78
- if 'rouge' in metric_name:
79
- if 'rougeL' not in metric_name:
80
  to_add = False
81
 
82
- if 'ifeval' in dataset_name:
83
- if 'prompt_level_strict_acc' not in metric_name:
84
  to_add = False
85
 
86
- if 'squad' in dataset_name:
87
- to_add = False
88
 
89
- if 'fever' in dataset_name:
90
- to_add = False
 
 
 
 
91
 
92
- if 'rouge' in metric_name:
93
- value /= 100.0
 
 
94
 
95
- if to_add:
96
- sanitised_metric_name = metric_name.split(',')[0]
97
- model_dataset_metric_to_result_map[(model_name, dataset_name, sanitised_metric_name)] = value
98
 
99
- # if (model_name, dataset_name) not in data_map:
100
- # data_map[(model_name, dataset_name)] = {}
101
- # data_map[(model_name, dataset_name)][metric_name] = value
 
102
 
103
- if model_name not in data_map:
104
- data_map[model_name] = {}
105
- data_map[model_name][(dataset_name, sanitised_metric_name)] = value
106
 
107
- print('model_name', model_name, 'dataset_name', dataset_name, 'metric_name', metric_name, 'value', value)
 
 
108
 
109
- model_name_lst = [m for m in data_map.keys()]
110
- for m in model_name_lst:
111
- if len(data_map[m]) < 8:
112
- del data_map[m]
113
 
114
- df = pd.DataFrame.from_dict(data_map, orient='index')
115
- o_df = df.copy(deep=True)
116
 
117
- print(df)
 
 
118
 
119
- # Check for NaN or infinite values and replace them
120
- df.replace([np.inf, -np.inf], np.nan, inplace=True) # Replace infinities with NaN
121
- df.fillna(0, inplace=True) # Replace NaN with 0 (or use another imputation strategy)
122
 
123
- from sklearn.preprocessing import MinMaxScaler
 
 
124
 
125
- # scaler = MinMaxScaler()
126
- # df = pd.DataFrame(scaler.fit_transform(df), index=df.index, columns=df.columns)
127
 
128
- sns.set_context("notebook", font_scale=1.0)
129
 
130
- # fig = sns.clustermap(df, method='average', metric='cosine', cmap='coolwarm', figsize=(16, 12), annot=True)
131
- fig = sns.clustermap(df, method='ward', metric='euclidean', cmap='coolwarm', figsize=(16, 12), annot=True, mask=o_df.isnull())
132
 
133
- # Adjust the size of the cells (less wide)
134
- plt.setp(fig.ax_heatmap.get_yticklabels(), rotation=0)
135
- plt.setp(fig.ax_heatmap.get_xticklabels(), rotation=90)
136
 
137
- # Save the clustermap to file
138
- fig.savefig('plots/clustermap.pdf')
139
- fig.savefig('plots/clustermap.png')
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3
  import os
4
  import sys
5
  import json
6
+ import pickle
7
 
8
  import numpy as np
9
 
 
28
  return res
29
 
30
 
31
def sanitise_metric(name: str) -> str:
    """Turn a raw metric identifier into a short display label.

    The substitutions are plain substring replacements applied in the order
    listed, each one operating on the output of the previous one.

    Args:
        name: Raw metric name (the script passes the part before the first
            comma of the key found in a results file).

    Returns:
        The label with every known fragment replaced; fragments that are not
        listed are left as they are.
    """
    # Order matters:
    #  * "prompt_level_strict_acc" must be handled before the bare "acc";
    #  * "exact_match" must be handled before the bare "exact";
    #  * the "*_EM" clean-ups rely on "exact" already being "EM".
    replacements = (
        ("prompt_level_strict_acc", "Prompt-Level Accuracy"),
        ("acc", "Accuracy"),
        ("exact_match", "EM"),
        ("avg-selfcheckgpt", "AVG"),
        ("max-selfcheckgpt", "MAX"),
        ("rouge", "ROUGE-"),
        ("bertscore_precision", "BERT-P"),
        ("exact", "EM"),
        ("HasAns_EM", "HasAns"),
        ("NoAns_EM", "NoAns"),
    )

    res = name
    for raw, label in replacements:
        res = res.replace(raw, label)
    return res
44
 
 
 
45
 
46
def sanitise_dataset(name: str) -> str:
    """Turn a raw dataset/task identifier into a display label.

    The substitutions are plain substring replacements applied in the order
    listed, each one operating on the output of the previous one. The final
    step turns every remaining underscore into a space.

    Args:
        name: Raw dataset name as it appears as a key in a results file.

    Returns:
        The label with every known fragment replaced; fragments that are not
        listed are left as they are (apart from the underscore replacement).
    """
    # Order matters: the specific names ("tqa8", "truthfulqa") are rewritten
    # to their capitalised forms before the generic "qa" -> "QA" rule runs, so
    # that rule only touches names that still contain a lowercase "qa"
    # (for instance "halueval_qa").
    replacements = (
        ("tqa8", "TriviaQA"),
        ("nq8", "NQ"),
        ("truthfulqa", "TruthfulQA"),
        ("ifeval", "IFEval"),
        ("selfcheckgpt", "SelfCheckGPT"),
        ("truefalse_cieacf", "True-False"),
        ("mc", "MC"),
        ("race", "RACE"),
        ("squad", "SQuAD"),
        ("memo-trap", "MemoTrap"),
        ("cnndm", "CNN/DM"),
        ("xsum", "XSum"),
        ("qa", "QA"),
        ("summarization", "Summarization"),
        ("dialogue", "Dialog"),
        ("halueval", "HaluEval"),
        ("_", " "),
    )

    res = name
    for raw, label in replacements:
        res = res.replace(raw, label)
    return res
66
 
 
 
 
 
67
 
68
# Pickle file (relative to the working directory) holding the parsed
# data_map, so later runs can skip re-reading the result files.
cache_file = 'data_map_cache.pkl'
 
69
 
 
 
 
 
 
 
70
 
71
def load_data_map_from_cache(cache_file):
    """Load a previously pickled data map, if a usable one exists.

    Args:
        cache_file: Path of the pickle file to read.

    Returns:
        The unpickled object, or ``None`` when the file does not exist or
        cannot be unpickled (empty / truncated / corrupt). Returning ``None``
        lets the caller rebuild the data instead of failing on a bad cache.
    """
    # NOTE(security): pickle can execute arbitrary code while loading. Only
    # point this at a cache file written by this script, never at a file
    # obtained from an untrusted source.
    try:
        with open(cache_file, 'rb') as f:
            return pickle.load(f)
    except FileNotFoundError:
        return None
    except (EOFError, pickle.UnpicklingError):
        # A cache left behind by an interrupted write is treated as a miss.
        return None
77
 
 
 
 
78
 
79
def save_data_map_to_cache(data_map, cache_file):
    """Pickle ``data_map`` to ``cache_file``.

    The data is first written to ``<cache_file>.tmp`` and then moved over the
    destination with ``os.replace``, so an interrupted run does not leave a
    partially written cache file at ``cache_file``.

    Args:
        data_map: Object to serialise (any picklable value).
        cache_file: Destination path of the pickle file.

    Raises:
        OSError: If the temporary file cannot be written or moved.
        pickle.PicklingError: If ``data_map`` cannot be pickled.
    """
    tmp_file = f"{cache_file}.tmp"
    try:
        with open(tmp_file, 'wb') as f:
            pickle.dump(data_map, f)
        os.replace(tmp_file, cache_file)
    finally:
        # After a successful replace the temp file is gone; after a failure
        # this removes the partial file.
        if os.path.exists(tmp_file):
            os.remove(tmp_file)
82
 
 
 
 
83
 
84
# ---------------------------------------------------------------------------
# Script body: build (or reload) data_map, then render one clustermap per
# plot type.
#
# data_map layout:
#     data_map[model_name][(sanitised_dataset_name, sanitised_metric_name)] = value
# ---------------------------------------------------------------------------

# Only models whose request entry has more than this many likes are kept.
MIN_LIKES = 128

# A model must have at least this many (dataset, metric) entries to be plotted.
MIN_ENTRIES_PER_MODEL = 14

# Size of one heatmap cell, in inches.
CELL_HEIGHT = 1.0
CELL_WIDTH = 1.0

# Per plot type: (extra figure width, extra figure height, dendrogram_ratio,
# row_cluster). Plot types that are not listed use the default layout.
_DEFAULT_LAYOUT = (0, 0, (.1, .1), True)
_PLOT_LAYOUT = {
    'detect': (-2, 5.2, (.1, .2), True),
    'instr': (-2, 5.2, (.1, .4), True),
    'qa': (-2, 4, (.1, .2), True),
    'summ': (-2, 2.0, (.1, .1), False),
    'rc': (-2, 5.2, (.1, .4), True),
}


def _keep_result(dataset_name, metric_name):
    """Return True when a raw (dataset, metric) result should be stored."""
    if 'memo-trap_v2' in dataset_name:
        return False
    if 'faithdial' in dataset_name:
        return False
    if 'nq_open' in dataset_name or 'triviaqa' in dataset_name:
        return False
    if 'truthfulqa_gen' in dataset_name:
        return False
    if 'fever' in dataset_name:
        return False
    if 'bertscore' in metric_name and 'precision' not in metric_name:
        return False
    if 'correctness,' in metric_name or 'em,' in metric_name:
        return False
    if 'ifeval' in dataset_name and 'prompt_level_strict_acc' not in metric_name:
        return False
    if 'squad' in dataset_name and 'best_exact' in metric_name:
        return False
    return True


def _keep_for_plot(plot_type, dataset, metric):
    """Return True when a sanitised (dataset, metric) row belongs in a plot."""
    if plot_type == 'all':
        # The overview keeps a single representative metric for these tasks.
        if 'ROUGE' in metric and 'ROUGE-L' not in metric:
            return False
        if 'SQuAD' in dataset and 'EM' not in metric:
            return False
        if 'SelfCheckGPT' in dataset and 'MAX' not in metric:
            return False
        return True
    if plot_type == 'summ':
        return 'CNN' in dataset or 'XSum' in dataset
    if plot_type == 'qa':
        return 'TriviaQA' in dataset or 'NQ' in dataset or 'TruthfulQA' in dataset
    if plot_type == 'instr':
        return 'MemoTrap' in dataset or 'IFEval' in dataset
    if plot_type == 'detect':
        return 'HaluEval' in dataset or 'SelfCheck' in dataset
    if plot_type == 'rc':
        return 'RACE' in dataset or 'SQuAD' in dataset
    raise ValueError(f"Unknown plot type: {plot_type}")


# Try to load the data_map from the cache file
data_map = load_data_map_from_cache(cache_file)

if data_map is None:
    my_snapshot_download(repo_id=RESULTS_REPO, revision="main", local_dir=EVAL_RESULTS_PATH_BACKEND, repo_type="dataset", max_workers=60)
    my_snapshot_download(repo_id=QUEUE_REPO, revision="main", local_dir=EVAL_REQUESTS_PATH_BACKEND, repo_type="dataset", max_workers=60)

    result_path_lst = find_json_files(EVAL_RESULTS_PATH_BACKEND)
    request_path_lst = find_json_files(EVAL_REQUESTS_PATH_BACKEND)

    # Request entries, keyed by model name; read below for the "likes" count.
    model_name_to_model_map = {}
    for path in request_path_lst:
        with open(path, 'r') as f:
            data = json.load(f)
        model_name_to_model_map[data["model"]] = data

    data_map = {}

    for path in result_path_lst:
        with open(path, 'r') as f:
            data = json.load(f)
        model_name = data["config"]["model_name"]

        # A result file without a matching request entry is skipped (treated
        # as zero likes) rather than aborting the whole run with a KeyError.
        likes = model_name_to_model_map.get(model_name, {}).get("likes", 0)
        if not likes > MIN_LIKES:
            continue

        for dataset_name, results_dict in data["results"].items():
            for metric_name, value in results_dict.items():
                # Only comma-qualified metric keys are considered; standard
                # errors and F1 variants are ignored.
                if ',' not in metric_name or '_stderr' in metric_name or 'f1' in metric_name:
                    continue
                if not _keep_result(dataset_name, metric_name):
                    continue

                # ROUGE metrics and all SQuAD metrics are divided by 100.
                if 'rouge' in metric_name:
                    value = value / 100.0
                if 'squad' in dataset_name:
                    value = value / 100.0

                sanitised_metric_name = sanitise_metric(metric_name.split(',')[0])
                sanitised_dataset_name = sanitise_dataset(dataset_name)

                data_map.setdefault(model_name, {})[(sanitised_dataset_name, sanitised_metric_name)] = value

                print('model_name', model_name, 'dataset_name', sanitised_dataset_name, 'metric_name', sanitised_metric_name, 'value', value)

    # The cache stores the unfiltered map; the entry-count filter below is
    # applied on every run.
    save_data_map_to_cache(data_map, cache_file)

# Drop models with too few entries.
data_map = {
    model_name: results
    for model_name, results in data_map.items()
    if len(results) >= MIN_ENTRIES_PER_MODEL
}

plot_type_lst = ['all', 'summ', 'qa', 'instr', 'detect', 'rc']

# The figures are written under plots/; make sure the directory exists.
os.makedirs('plots', exist_ok=True)

for plot_type in plot_type_lst:

    # Transposed view: data_map_v2[(dataset, metric)][model_name] = value,
    # restricted to the rows that belong in this plot.
    data_map_v2 = {}
    for model_name, results in data_map.items():
        for dataset_metric, value in results.items():
            if _keep_for_plot(plot_type, dataset_metric[0], dataset_metric[1]):
                data_map_v2.setdefault(dataset_metric, {})[model_name] = value

    # Rows are (dataset, metric) pairs, columns are models.
    df = pd.DataFrame.from_dict(data_map_v2, orient='index')
    df.index = [', '.join(map(str, idx)) for idx in df.index]

    # Untouched copy: used for the heatmap mask and for the JSON dump.
    o_df = df.copy(deep=True)

    print(df)

    # Replace infinities and missing values with 0 before clustering; the
    # cells that were missing in o_df are masked in the heatmap.
    df = df.replace([np.inf, -np.inf], np.nan).fillna(0)

    # NOTE: min-max scaling of df (sklearn MinMaxScaler) was tried here and
    # left disabled; the unused import was removed with it.

    n_rows = len(df.index)  # Datasets and Metrics
    n_cols = len(df.columns)  # Models

    # Figure size follows the DataFrame shape, with per-plot adjustments.
    extra_width, extra_height, dendrogram_ratio, row_cluster = _PLOT_LAYOUT.get(plot_type, _DEFAULT_LAYOUT)
    fig_width = CELL_WIDTH * n_cols + extra_width
    fig_height = CELL_HEIGHT * n_rows + extra_height
    col_cluster = True

    sns.set_context("notebook", font_scale=1.3)

    print('figsize', (fig_width, fig_height))

    print(f'Generating clustermap for {plot_type}')

    fig = sns.clustermap(df,
                         method='ward',
                         metric='euclidean',
                         cmap='coolwarm',
                         figsize=(fig_width, fig_height),
                         annot=True,
                         mask=o_df.isnull(),
                         dendrogram_ratio=dendrogram_ratio,
                         fmt='.2f',
                         col_cluster=col_cluster,
                         row_cluster=row_cluster)

    # Horizontal row labels, vertical column labels.
    plt.setp(fig.ax_heatmap.get_yticklabels(), rotation=0)
    plt.setp(fig.ax_heatmap.get_xticklabels(), rotation=90)

    # Save the clustermap to file
    fig.savefig(f'plots/clustermap_{plot_type}.pdf')
    fig.savefig(f'plots/clustermap_{plot_type}.png')

    o_df.to_json(f'plots/clustermap_{plot_type}.json', orient='split')
plots/clustermap_all.json CHANGED
@@ -1 +1 @@
1
- {"columns":["TheBloke\/Llama-2-13B-chat-GPTQ","TheBloke\/Llama-2-7B-Chat-GPTQ","TheBloke\/Wizard-Vicuna-13B-Uncensored-GPTQ","teknium\/OpenHermes-2-Mistral-7B","mistralai\/Mistral-7B-Instruct-v0.2","mistralai\/Mistral-7B-Instruct-v0.1","bigscience\/bloom-7b1","bigscience\/bloom-560m","berkeley-nest\/Starling-LM-7B-alpha","EleutherAI\/gpt-neo-125m","EleutherAI\/gpt-neo-2.7B","EleutherAI\/gpt-j-6b","EleutherAI\/gpt-neo-1.3B","Gryphe\/MythoMax-L2-13b","Open-Orca\/Mistral-7B-OpenOrca","pankajmathur\/orca_mini_3b","KoboldAI\/OPT-13B-Erebus","ehartford\/dolphin-2.1-mistral-7b","togethercomputer\/LLaMA-2-7B-32K","togethercomputer\/GPT-JT-6B-v1","togethercomputer\/Llama-2-7B-32K-Instruct","HuggingFaceH4\/zephyr-7b-alpha","HuggingFaceH4\/zephyr-7b-beta","tiiuae\/falcon-7b-instruct","tiiuae\/falcon-7b","ai-forever\/mGPT","NousResearch\/Yarn-Mistral-7b-128k","NousResearch\/Nous-Hermes-Llama2-13b","DiscoResearch\/mixtral-7b-8expert","meta-llama\/Llama-2-7b-chat-hf","meta-llama\/Llama-2-7b-hf","meta-llama\/Llama-2-13b-chat-hf","meta-llama\/Llama-2-13b-hf","upstage\/SOLAR-10.7B-Instruct-v1.0"],"index":["TruthfulQA MC1, Accuracy","SQuADv2, EM","SQuADv2, HasAns","SQuADv2, NoAns","TriviaQA, EM","HaluEval Dialog, Accuracy","XSum, ROUGE-L","XSum, factKB","XSum, BERT-P","MemoTrap, Accuracy","IFEval, Prompt-Level Accuracy","RACE, Accuracy","NQ, EM","TruthfulQA MC2, Accuracy","HaluEval Summarization, Accuracy","True-False, Accuracy","CNN\/DM, ROUGE-L","CNN\/DM, factKB","CNN\/DM, BERT-P","HaluEval QA, Accuracy","SelfCheckGPT, 
AVG"],"data":[[0.2656058752,0.2900856793,0.358629131,0.3390452876,0.5250917993,0.3916768666,0.2239902081,0.2447980416,0.3157894737,0.2582619339,0.2386780906,0.2019583843,0.2313341493,0.364749082,0.3525091799,0.2802937576,0.205630355,0.3904528764,0.2558139535,0.2264381885,0.3096695226,0.4063647491,0.3867809058,0.2876376989,0.2239902081,0.2325581395,0.2741738066,0.3476132191,0.247246022,0.3023255814,0.2521419829,0.2802937576,0.2594859241,null],[0.1329908195,0.1431820096,0.1281900109,null,0.3019455908,null,0.2228585867,0.0799292512,0.3982986608,0.0476711867,0.1752716247,0.2339762486,0.1510991325,0.1514360313,0.3049776805,0.2034026783,0.2296807883,0.3536595637,0.1234734271,0.3176956119,0.139560347,0.3088520172,0.3112103091,0.2263117999,0.2642129201,0.1662595806,null,0.257811842,0.0,0.1199359892,0.1446138297,0.1327381454,0.1591004801,null],[0.0811403509,0.0718623482,0.0558367072,null,0.604757085,null,0.4461875843,0.1589068826,0.7967273954,0.0755735493,0.3500337382,0.467780027,0.3021255061,0.0733805668,0.6108299595,0.407219973,0.4586707152,0.7083333333,0.0632591093,0.6363022942,0.0841767881,0.6185897436,0.6233130904,0.4531039136,0.5291835358,0.000168691,null,0.5163630229,0.0,0.0657894737,0.0566801619,0.0796221323,0.0775978408,null],[0.1846930193,0.2142977292,0.2003364172,null,0.0,null,0.0001682086,0.0011774601,0.0010092515,0.0198486123,0.0010092515,0.0008410429,0.0005046257,0.2292682927,0.0,0.0001682086,0.0013456686,0.0,0.1835155593,0.0,0.1947855341,0.0,0.0,0.0001682086,0.0,0.3318755257,null,0.0,0.0,0.1739276703,0.2322960471,0.1857022708,0.2403700589,null],[0.0947949175,0.0817543469,0.0961324119,0.6805060187,0.6389322336,0.5224030317,0.2633749443,0.0570664289,0.6594962104,0.0155483727,0.2011257245,0.3931676326,0.1415514935,0.0974141774,0.6583816317,0.3750557289,0.3940592956,0.6674654481,0.0815871601,0.4172982613,0.0927329469,0.6591618368,0.6443379403,0.3915514935,0.5866584931,0.1061078912,0.7003455194,0.0984730272,0.0,0.0883303611,0.0921756576,0.0978600089,0.0948506465,0.
4198060633],[0.5928,0.6085,0.6471,null,null,0.6649,0.4625,0.4998,0.6674,0.472,0.4772,0.4984,0.4836,0.7173,0.7699,0.4694,0.3997,0.7963,0.5478,0.4979,0.6043,0.7917,0.7634,0.3878,0.4203,0.0001,0.558,0.7326,0.0712,0.6425,0.4997,0.6548,0.7393,null],[0.0371901778,0.0347891843,0.0428959698,null,null,null,0.1668276326,0.1356427068,0.0504142569,0.1262087692,0.1205109613,0.2136831021,0.1487592997,0.0384377825,null,0.1854881243,0.2370092437,null,0.0432590646,0.2647281333,0.042540476,null,null,0.1822281713,0.1657837663,0.0141983599,null,null,null,0.0340250019,0.0445139517,0.0359765055,0.0413856309,0.0011478244],[0.0401970892,0.0394161696,0.0428647574,null,null,null,0.5674834215,0.2298042738,0.0397678158,0.2459031195,0.3441669812,0.4791825971,0.4144174002,0.0384018147,null,0.5666702185,0.4707080535,null,0.0407451505,0.3412336695,0.0400569668,null,null,0.4089209967,0.3192473228,0.1421243598,null,null,null,0.0379350583,0.0428665575,0.0387781905,0.0411868449,0.1811100146],[0.3949103208,0.3837605404,0.4030390077,null,null,null,0.6528773697,0.6049370656,0.4270770697,0.5845326816,0.4448599865,0.6811507025,0.6106414046,0.3989033275,null,0.6324491786,0.709515113,null,0.4014707047,0.7352085092,0.3991319337,null,null,0.6545591434,0.5731293539,0.4255551427,null,null,null,0.3853941111,0.4050690645,0.3917698265,0.401477077,0.0021477235],[0.641025641,0.7179487179,0.5886752137,0.6079059829,0.5405982906,0.7061965812,0.6826923077,0.860042735,0.6314102564,0.8344017094,0.7756410256,0.7574786325,0.7980769231,0.6463675214,0.6346153846,0.594017094,0.7532051282,0.5897435897,0.561965812,0.6645299145,0.5758547009,0.5352564103,0.5854700855,0.858974359,0.7126068376,0.8643162393,0.5641025641,0.5523504274,0.5737179487,0.733974359,0.6143162393,0.6079059829,0.5758547009,null],[0.2735674677,0.2606284658,0.1626617375,0.2255083179,0.3844731978,0.3049907579,0.0868761553,0.0850277264,0.2310536044,0.1293900185,0.1885397412,0.179297597,0.1423290203,0.2402957486,0.033271719,0.0628465804,null,0.2735674677,0.1497227357
,0.146025878,0.1700554529,0.0609981516,0.0924214418,0.1534195933,0.1164510166,0.1940850277,null,0.1423290203,0.1423290203,0.2865064695,0.1866913124,0.314232902,0.1829944547,null],[0.4574162679,0.4220095694,0.4325358852,0.4392344498,0.4593301435,0.4229665072,0.3655502392,0.3023923445,0.4688995215,0.2755980861,0.35215311,0.376076555,0.3406698565,0.433492823,0.4717703349,0.3674641148,0.3827751196,0.4555023923,0.3770334928,0.3645933014,0.4,0.4641148325,0.4583732057,0.3722488038,0.3741626794,0.2937799043,0.4076555024,0.4593301435,0.2220095694,0.4373205742,0.395215311,0.4612440191,0.404784689,0.419138756],[0.0263157895,0.0263157895,0.0232686981,0.0335180055,0.0313019391,0.0249307479,0.0329639889,0.0102493075,0.0343490305,0.0049861496,0.0542936288,0.0916897507,0.0368421053,0.028531856,0.0293628809,0.0265927978,0.1307479224,0.0293628809,0.0238227147,0.1091412742,0.0271468144,0.0304709141,0.0315789474,0.135734072,0.2207756233,0.0232686981,null,0.0254847645,0.0,0.0252077562,0.0268698061,0.0268698061,0.0274238227,0.1991689751],[0.4167499124,0.4410061226,0.5164091712,0.509255685,0.6681645582,0.5592109222,0.3889466583,0.4243149954,0.4731016618,0.4557936883,0.3986263303,0.3595710074,0.3961377938,0.515367679,0.5225657507,0.423037498,0.352115369,0.558968896,0.3841009802,0.3706818154,0.4572778615,0.5602234073,0.5511952533,0.4407667557,0.3426523695,0.3962402525,0.4225232911,0.5026754146,0.4947679694,0.4531160226,0.389651281,0.4394613382,0.3689257684,null],[0.4645,0.4193,0.4436,0.5464,0.4465,0.4504,0.4652,0.4653,0.5459,0.4651,0.4668,0.4658,0.4457,0.476,0.5147,0.4701,0.4536,0.448,0.4904,0.5224,0.4696,0.5268,0.5238,0.4402,0.448,0.0,0.4588,null,0.0774,0.4906,0.4279,0.4772,null,0.5574],[0.8534100247,0.7733771569,0.8087099425,0.8895645029,0.8905505341,0.8348397699,0.5314708299,0.5041906327,0.8917009039,0.4940016434,0.5059983566,0.5413311422,0.507641742,0.8698438784,0.8793755136,0.5860312243,0.5393590797,0.883483977,0.8059161873,0.6271158587,0.8195562859,0.8854560394,0.8323746919,0.63648315
53,0.5814297453,0.4926869351,0.8376335251,0.8670501233,0.5064913722,0.8023007395,0.7268693509,0.8514379622,0.8315529992,0.8933442892],[0.0132261781,0.0098347434,0.012722468,0.0114292948,null,0.008927303,0.2075387019,0.1227088127,0.0169222534,0.1243996336,0.1958355522,0.2238202254,0.2016505067,0.0135334171,0.0157445278,0.2242330745,0.2130128961,0.0142701427,0.0133793406,0.2408143362,0.012671213,0.0117323751,0.0111061484,0.1882467392,0.1691898288,0.0164863838,0.0116271987,0.0131859896,0.0000716868,0.0103604417,0.0145397085,null,0.0135674045,0.0006113776],[0.1686613542,0.2305715843,0.1762471835,0.1795055867,null,0.1750336135,0.9249732766,0.7988642532,0.2070823135,0.7133009049,0.9058066799,0.9396594147,0.8739379747,0.1648057051,0.2233354576,0.8669051787,0.9215527018,0.1965546069,0.1575508605,0.94858605,0.1617593629,0.1759309876,0.1893680342,0.8267075767,0.901661868,0.7995222351,0.166116994,0.1795224981,0.0805047156,0.1580946885,0.1517425501,null,0.155634531,0.9509124595],[0.3668251897,0.3243336913,0.3548213557,0.3652777882,null,0.3544185533,0.6009727172,0.4646713339,0.3964162779,0.4789093344,0.5761275867,0.617234758,0.5888279105,0.3663381487,0.3898203604,0.6159694987,0.6594958169,0.3785245521,0.3663761784,0.7087181979,0.3645764744,0.3628082484,0.3673437638,0.5926767984,0.5112991874,0.4542439283,0.3637139385,0.3722801591,0.3306992617,0.3566615503,0.367965746,null,0.3640197324,0.0013591523],[0.5454,0.4519,0.5206,0.6605,0.23,0.4917,0.5806,0.4995,0.5969,0.4653,0.4376,0.5139,0.4625,0.3992,0.4502,0.4446,0.3526,0.4208,0.4555,0.5093,0.3233,0.6235,0.5185,0.2968,0.4672,0.0549,0.5699,0.5715,0.0708,0.5231,0.4566,0.5728,0.6879,null],[0.0900088111,0.0378151261,0.1105999164,0.0288242793,0.0767584532,0.0042016807,0.012605042,0.063491636,0.0504201681,0.0865653082,0.0168067227,0.0042016807,0.0885709807,0.0782453898,0.987394958,0.0336183781,0.0797089047,0.0358374659,0.3529411765,0.012605042,0.1020439002,0.0210084034,0.0758733581,0.0291536913,0.0210084034,0.0084033613,null,0.1206660487,0.6
692579906,0.0413645344,0.1043527865,0.0,null,null]]}
 
1
+ {"columns":["TheBloke\/Llama-2-13B-chat-GPTQ","TheBloke\/Llama-2-7B-Chat-GPTQ","TheBloke\/Wizard-Vicuna-13B-Uncensored-GPTQ","teknium\/OpenHermes-2-Mistral-7B","teknium\/OpenHermes-2.5-Mistral-7B","mistralai\/Mistral-7B-Instruct-v0.2","mistralai\/Mistral-7B-Instruct-v0.1","bigscience\/bloom-7b1","bigscience\/bloom-560m","berkeley-nest\/Starling-LM-7B-alpha","EleutherAI\/gpt-neo-125m","EleutherAI\/gpt-neo-2.7B","EleutherAI\/gpt-j-6b","EleutherAI\/gpt-neo-1.3B","Gryphe\/MythoMax-L2-13b","Open-Orca\/Mistral-7B-OpenOrca","pankajmathur\/orca_mini_3b","KoboldAI\/OPT-13B-Erebus","ehartford\/dolphin-2.1-mistral-7b","togethercomputer\/LLaMA-2-7B-32K","togethercomputer\/GPT-JT-6B-v1","togethercomputer\/Llama-2-7B-32K-Instruct","HuggingFaceH4\/zephyr-7b-alpha","HuggingFaceH4\/zephyr-7b-beta","tiiuae\/falcon-7b-instruct","tiiuae\/falcon-7b","ai-forever\/mGPT","NousResearch\/Yarn-Mistral-7b-128k","NousResearch\/Nous-Hermes-Llama2-13b","DiscoResearch\/mixtral-7b-8expert","meta-llama\/Llama-2-7b-chat-hf","meta-llama\/Llama-2-7b-hf","meta-llama\/Llama-2-13b-chat-hf","meta-llama\/Llama-2-13b-hf","upstage\/SOLAR-10.7B-Instruct-v1.0"],"index":["TruthfulQA MC1, Accuracy","SQuADv2, EM","TriviaQA, EM","HaluEval Dialog, Accuracy","XSum, ROUGE-L","XSum, factKB","XSum, BERT-P","MemoTrap, Accuracy","IFEval, Prompt-Level Accuracy","RACE, Accuracy","NQ, EM","TruthfulQA MC2, Accuracy","HaluEval Summarization, Accuracy","True-False, Accuracy","CNN\/DM, ROUGE-L","CNN\/DM, factKB","CNN\/DM, BERT-P","HaluEval QA, Accuracy","SelfCheckGPT, MAX","CNN\/DM v2, ROUGE-L","CNN\/DM v2, factKB","CNN\/DM v2, BERT-P","XSum v2, ROUGE-L","XSum v2, factKB","XSum v2, 
BERT-P"],"data":[[0.2656058752,0.2900856793,0.358629131,0.3390452876,0.3598531212,0.5250917993,0.3916768666,0.2239902081,0.2447980416,0.3157894737,0.2582619339,0.2386780906,0.2019583843,0.2313341493,0.364749082,0.3525091799,0.2802937576,0.205630355,0.3904528764,0.2558139535,0.2264381885,0.3096695226,0.4063647491,0.3867809058,0.2876376989,0.2239902081,0.2325581395,0.2741738066,0.3476132191,0.247246022,0.3023255814,0.2521419829,0.2802937576,0.2594859241,null],[0.1329908195,0.1431820096,0.1281900109,null,0.0681377916,0.3019455908,null,0.2228585867,0.0799292512,0.3982986608,0.0476711867,0.1752716247,0.2339762486,0.1510991325,0.1514360313,0.3049776805,0.2034026783,0.2296807883,0.3536595637,0.1234734271,0.3176956119,0.139560347,0.3088520172,0.3112103091,0.2263117999,0.2642129201,0.1662595806,null,0.257811842,0.0,0.1199359892,0.1446138297,0.1327381454,0.1591004801,null],[0.0947949175,0.0817543469,0.0961324119,0.6805060187,null,0.6389322336,0.5224030317,0.2633749443,0.0570664289,0.6594962104,0.0155483727,0.2011257245,0.3931676326,0.1415514935,0.0974141774,0.6583816317,0.3750557289,0.3940592956,0.6674654481,0.0815871601,0.4172982613,0.0927329469,0.6591618368,0.6443379403,0.3915514935,0.5866584931,0.1061078912,0.7003455194,0.0984730272,0.0,0.0883303611,0.0921756576,0.0978600089,0.0948506465,0.4198060633],[0.5928,0.6085,0.6471,null,0.7826,null,0.6649,0.4625,0.4998,0.6674,0.472,0.4772,0.4984,0.4836,0.7173,0.7699,0.4694,0.3997,0.7963,0.5478,0.4979,0.6043,0.7917,0.7634,0.3878,0.4203,0.0001,0.558,0.7326,0.0712,0.6425,0.4997,0.6548,0.7393,null],[0.0371901778,0.0347891843,0.0428959698,null,null,null,null,0.1668276326,0.1356427068,0.0504142569,0.1262087692,0.1205109613,0.2136831021,0.1487592997,0.0384377825,null,0.1854881243,0.2370092437,null,0.0432590646,0.2647281333,0.042540476,null,null,0.1822281713,0.1657837663,0.0141983599,null,null,null,0.0340250019,0.0445139517,0.0359765055,0.0413856309,0.0011478244],[0.0401970892,0.0394161696,0.0428647574,null,null,null,null,0.5674834215,0.22
98042738,0.0397678158,0.2459031195,0.3441669812,0.4791825971,0.4144174002,0.0384018147,null,0.5666702185,0.4707080535,null,0.0407451505,0.3412336695,0.0400569668,null,null,0.4089209967,0.3192473228,0.1421243598,null,null,null,0.0379350583,0.0428665575,0.0387781905,0.0411868449,0.1811100146],[0.3949103208,0.3837605404,0.4030390077,null,null,null,null,0.6528773697,0.6049370656,0.4270770697,0.5845326816,0.4448599865,0.6811507025,0.6106414046,0.3989033275,null,0.6324491786,0.709515113,null,0.4014707047,0.7352085092,0.3991319337,null,null,0.6545591434,0.5731293539,0.4255551427,null,null,null,0.3853941111,0.4050690645,0.3917698265,0.401477077,0.0021477235],[0.641025641,0.7179487179,0.5886752137,0.6079059829,0.6388888889,0.5405982906,0.7061965812,0.6826923077,0.860042735,0.6314102564,0.8344017094,0.7756410256,0.7574786325,0.7980769231,0.6463675214,0.6346153846,0.594017094,0.7532051282,0.5897435897,0.561965812,0.6645299145,0.5758547009,0.5352564103,0.5854700855,0.858974359,0.7126068376,0.8643162393,0.5641025641,0.5523504274,0.5737179487,0.733974359,0.6143162393,0.6079059829,0.5758547009,null],[0.2735674677,0.2606284658,0.1626617375,0.2255083179,null,0.3844731978,0.3049907579,0.0868761553,0.0850277264,0.2310536044,0.1293900185,0.1885397412,0.179297597,0.1423290203,0.2402957486,0.033271719,0.0628465804,null,0.2735674677,0.1497227357,0.146025878,0.1700554529,0.0609981516,0.0924214418,0.1534195933,0.1164510166,0.1940850277,null,0.1423290203,0.1423290203,0.2865064695,0.1866913124,0.314232902,0.1829944547,null],[0.4574162679,0.4220095694,0.4325358852,0.4392344498,0.44784689,0.4593301435,0.4229665072,0.3655502392,0.3023923445,0.4688995215,0.2755980861,0.35215311,0.376076555,0.3406698565,0.433492823,0.4717703349,0.3674641148,0.3827751196,0.4555023923,0.3770334928,0.3645933014,0.4,0.4641148325,0.4583732057,0.3722488038,0.3741626794,0.2937799043,0.4076555024,0.4593301435,0.2220095694,0.4373205742,0.395215311,0.4612440191,0.404784689,0.419138756],[0.0263157895,0.0263157895,0.023268698
1,0.0335180055,0.0349030471,0.0313019391,0.0249307479,0.0329639889,0.0102493075,0.0343490305,0.0049861496,0.0542936288,0.0916897507,0.0368421053,0.028531856,0.0293628809,0.0265927978,0.1307479224,0.0293628809,0.0238227147,0.1091412742,0.0271468144,0.0304709141,0.0315789474,0.135734072,0.2207756233,0.0232686981,null,0.0254847645,0.0,0.0252077562,0.0268698061,0.0268698061,0.0274238227,0.1991689751],[0.4167499124,0.4410061226,0.5164091712,0.509255685,0.530496424,0.6681645582,0.5592109222,0.3889466583,0.4243149954,0.4731016618,0.4557936883,0.3986263303,0.3595710074,0.3961377938,0.515367679,0.5225657507,0.423037498,0.352115369,0.558968896,0.3841009802,0.3706818154,0.4572778615,0.5602234073,0.5511952533,0.4407667557,0.3426523695,0.3962402525,0.4225232911,0.5026754146,0.4947679694,0.4531160226,0.389651281,0.4394613382,0.3689257684,null],[0.4645,0.4193,0.4436,0.5464,0.5909,0.4465,0.4504,0.4652,0.4653,0.5459,0.4651,0.4668,0.4658,0.4457,0.476,0.5147,0.4701,0.4536,0.448,0.4904,0.5224,0.4696,0.5268,0.5238,0.4402,0.448,0.0,0.4588,null,0.0774,0.4906,0.4279,0.4772,null,0.5574],[0.8534100247,0.7733771569,0.8087099425,0.8895645029,0.8879211175,0.8905505341,0.8348397699,0.5314708299,0.5041906327,0.8917009039,0.4940016434,0.5059983566,0.5413311422,0.507641742,0.8698438784,0.8793755136,0.5860312243,0.5393590797,0.883483977,0.8059161873,0.6271158587,0.8195562859,0.8854560394,0.8323746919,0.6364831553,0.5814297453,0.4926869351,0.8376335251,0.8670501233,0.5064913722,0.8023007395,0.7268693509,0.8514379622,0.8315529992,0.8933442892],[0.0132261781,0.0098347434,0.012722468,0.0114292948,null,null,0.008927303,0.2075387019,0.1227088127,0.0169222534,0.1243996336,0.1958355522,0.2238202254,0.2016505067,0.0135334171,0.0157445278,0.2242330745,0.2130128961,0.0142701427,0.0133793406,0.2408143362,0.012671213,0.0117323751,0.0111061484,0.1882467392,0.1691898288,0.0164863838,0.0116271987,0.0131859896,0.0000716868,0.0103604417,0.0145397085,null,0.0135674045,0.0006113776],[0.1686613542,0.2305715843,0.1762471
835,0.1795055867,null,null,0.1750336135,0.9249732766,0.7988642532,0.2070823135,0.7133009049,0.9058066799,0.9396594147,0.8739379747,0.1648057051,0.2233354576,0.8669051787,0.9215527018,0.1965546069,0.1575508605,0.94858605,0.1617593629,0.1759309876,0.1893680342,0.8267075767,0.901661868,0.7995222351,0.166116994,0.1795224981,0.0805047156,0.1580946885,0.1517425501,null,0.155634531,0.9509124595],[0.3668251897,0.3243336913,0.3548213557,0.3652777882,null,null,0.3544185533,0.6009727172,0.4646713339,0.3964162779,0.4789093344,0.5761275867,0.617234758,0.5888279105,0.3663381487,0.3898203604,0.6159694987,0.6594958169,0.3785245521,0.3663761784,0.7087181979,0.3645764744,0.3628082484,0.3673437638,0.5926767984,0.5112991874,0.4542439283,0.3637139385,0.3722801591,0.3306992617,0.3566615503,0.367965746,null,0.3640197324,0.0013591523],[0.5454,0.4519,0.5206,0.6605,0.5135,0.23,0.4917,0.5806,0.4995,0.5969,0.4653,0.4376,0.5139,0.4625,0.3992,0.4502,0.4446,0.3526,0.4208,0.4555,0.5093,0.3233,0.6235,0.5185,0.2968,0.4672,0.0549,0.5699,0.5715,0.0708,0.5231,0.4566,0.5728,0.6879,null],[0.2447584892,0.281512605,0.2558918005,0.1120163592,0.1324451679,0.2752668233,0.1512605042,0.3277310924,0.2040760759,0.2941176471,0.2339250687,0.2394957983,0.2058823529,0.2829557667,0.2295219446,0.987394958,0.1199278912,0.3158274114,0.1444317716,0.4453781513,0.2352941176,0.2793721332,0.1848739496,0.2734539158,0.076986247,0.1974789916,0.1554621849,null,0.2455076835,0.8670519793,0.151334934,0.2598217688,0.0588235294,null,null],[null,null,null,null,null,null,null,null,null,null,null,0.1559188347,null,0.1900936275,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null],[null,null,null,null,null,null,null,null,null,null,null,0.9662616708,null,0.9704224166,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null],[null,null,null,null,null,null,null,null,null,null,null,0.352680524,null,0.4360266906,null,null,null,null,null,null,nul
l,null,null,null,null,null,null,null,null,null,null,null,null,null,null],[null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,0.1185889375,null,null,null,null,null,null,null,null,null,null],[null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,0.6746885531,null,null,null,null,null,null,null,null,null,null],[null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,0.3937310434,null,null,null,null,null,null,null,null,null,null]]}
plots/clustermap_all.pdf CHANGED
Binary files a/plots/clustermap_all.pdf and b/plots/clustermap_all.pdf differ
 
plots/clustermap_all.png CHANGED

Git LFS Details

  • SHA256: ce9b9ac68fe169a26bc9dd5f447f08104ceb9eace667eb7c1ae42b9047535184
  • Pointer size: 132 Bytes
  • Size of remote file: 1.89 MB

Git LFS Details

  • SHA256: 6468423f1098f6c9f62401b910bee22893e238f66b86e1b399041c85e3ba4de1
  • Pointer size: 132 Bytes
  • Size of remote file: 1.94 MB
plots/clustermap_detect.json CHANGED
@@ -1 +1 @@
1
- {"columns":["TheBloke\/Llama-2-13B-chat-GPTQ","TheBloke\/Llama-2-7B-Chat-GPTQ","TheBloke\/Wizard-Vicuna-13B-Uncensored-GPTQ","mistralai\/Mistral-7B-Instruct-v0.1","bigscience\/bloom-7b1","bigscience\/bloom-560m","berkeley-nest\/Starling-LM-7B-alpha","EleutherAI\/gpt-neo-125m","EleutherAI\/gpt-neo-2.7B","EleutherAI\/gpt-j-6b","EleutherAI\/gpt-neo-1.3B","Gryphe\/MythoMax-L2-13b","Open-Orca\/Mistral-7B-OpenOrca","pankajmathur\/orca_mini_3b","KoboldAI\/OPT-13B-Erebus","ehartford\/dolphin-2.1-mistral-7b","togethercomputer\/LLaMA-2-7B-32K","togethercomputer\/GPT-JT-6B-v1","togethercomputer\/Llama-2-7B-32K-Instruct","HuggingFaceH4\/zephyr-7b-alpha","HuggingFaceH4\/zephyr-7b-beta","tiiuae\/falcon-7b-instruct","tiiuae\/falcon-7b","ai-forever\/mGPT","NousResearch\/Yarn-Mistral-7b-128k","NousResearch\/Nous-Hermes-Llama2-13b","DiscoResearch\/mixtral-7b-8expert","meta-llama\/Llama-2-7b-chat-hf","meta-llama\/Llama-2-7b-hf","meta-llama\/Llama-2-13b-chat-hf","meta-llama\/Llama-2-13b-hf","teknium\/OpenHermes-2-Mistral-7B","mistralai\/Mistral-7B-Instruct-v0.2","upstage\/SOLAR-10.7B-Instruct-v1.0"],"index":["HaluEval Dialog, Accuracy","HaluEval Summarization, Accuracy","HaluEval QA, Accuracy","SelfCheckGPT, 
AVG"],"data":[[0.5928,0.6085,0.6471,0.6649,0.4625,0.4998,0.6674,0.472,0.4772,0.4984,0.4836,0.7173,0.7699,0.4694,0.3997,0.7963,0.5478,0.4979,0.6043,0.7917,0.7634,0.3878,0.4203,0.0001,0.558,0.7326,0.0712,0.6425,0.4997,0.6548,0.7393,null,null,null],[0.4645,0.4193,0.4436,0.4504,0.4652,0.4653,0.5459,0.4651,0.4668,0.4658,0.4457,0.476,0.5147,0.4701,0.4536,0.448,0.4904,0.5224,0.4696,0.5268,0.5238,0.4402,0.448,0.0,0.4588,null,0.0774,0.4906,0.4279,0.4772,null,0.5464,0.4465,0.5574],[0.5454,0.4519,0.5206,0.4917,0.5806,0.4995,0.5969,0.4653,0.4376,0.5139,0.4625,0.3992,0.4502,0.4446,0.3526,0.4208,0.4555,0.5093,0.3233,0.6235,0.5185,0.2968,0.4672,0.0549,0.5699,0.5715,0.0708,0.5231,0.4566,0.5728,0.6879,0.6605,0.23,null],[0.0900088111,0.0378151261,0.1105999164,0.0042016807,0.012605042,0.063491636,0.0504201681,0.0865653082,0.0168067227,0.0042016807,0.0885709807,0.0782453898,0.987394958,0.0336183781,0.0797089047,0.0358374659,0.3529411765,0.012605042,0.1020439002,0.0210084034,0.0758733581,0.0291536913,0.0210084034,0.0084033613,null,0.1206660487,0.6692579906,0.0413645344,0.1043527865,0.0,null,0.0288242793,0.0767584532,null]]}
 
1
+ {"columns":["TheBloke\/Llama-2-13B-chat-GPTQ","TheBloke\/Llama-2-7B-Chat-GPTQ","TheBloke\/Wizard-Vicuna-13B-Uncensored-GPTQ","teknium\/OpenHermes-2.5-Mistral-7B","mistralai\/Mistral-7B-Instruct-v0.1","bigscience\/bloom-7b1","bigscience\/bloom-560m","berkeley-nest\/Starling-LM-7B-alpha","EleutherAI\/gpt-neo-125m","EleutherAI\/gpt-neo-2.7B","EleutherAI\/gpt-j-6b","EleutherAI\/gpt-neo-1.3B","Gryphe\/MythoMax-L2-13b","Open-Orca\/Mistral-7B-OpenOrca","pankajmathur\/orca_mini_3b","KoboldAI\/OPT-13B-Erebus","ehartford\/dolphin-2.1-mistral-7b","togethercomputer\/LLaMA-2-7B-32K","togethercomputer\/GPT-JT-6B-v1","togethercomputer\/Llama-2-7B-32K-Instruct","HuggingFaceH4\/zephyr-7b-alpha","HuggingFaceH4\/zephyr-7b-beta","tiiuae\/falcon-7b-instruct","tiiuae\/falcon-7b","ai-forever\/mGPT","NousResearch\/Yarn-Mistral-7b-128k","NousResearch\/Nous-Hermes-Llama2-13b","DiscoResearch\/mixtral-7b-8expert","meta-llama\/Llama-2-7b-chat-hf","meta-llama\/Llama-2-7b-hf","meta-llama\/Llama-2-13b-chat-hf","meta-llama\/Llama-2-13b-hf","teknium\/OpenHermes-2-Mistral-7B","mistralai\/Mistral-7B-Instruct-v0.2","upstage\/SOLAR-10.7B-Instruct-v1.0"],"index":["HaluEval Dialog, Accuracy","HaluEval Summarization, Accuracy","HaluEval QA, Accuracy","SelfCheckGPT, AVG","SelfCheckGPT, 
MAX"],"data":[[0.5928,0.6085,0.6471,0.7826,0.6649,0.4625,0.4998,0.6674,0.472,0.4772,0.4984,0.4836,0.7173,0.7699,0.4694,0.3997,0.7963,0.5478,0.4979,0.6043,0.7917,0.7634,0.3878,0.4203,0.0001,0.558,0.7326,0.0712,0.6425,0.4997,0.6548,0.7393,null,null,null],[0.4645,0.4193,0.4436,0.5909,0.4504,0.4652,0.4653,0.5459,0.4651,0.4668,0.4658,0.4457,0.476,0.5147,0.4701,0.4536,0.448,0.4904,0.5224,0.4696,0.5268,0.5238,0.4402,0.448,0.0,0.4588,null,0.0774,0.4906,0.4279,0.4772,null,0.5464,0.4465,0.5574],[0.5454,0.4519,0.5206,0.5135,0.4917,0.5806,0.4995,0.5969,0.4653,0.4376,0.5139,0.4625,0.3992,0.4502,0.4446,0.3526,0.4208,0.4555,0.5093,0.3233,0.6235,0.5185,0.2968,0.4672,0.0549,0.5699,0.5715,0.0708,0.5231,0.4566,0.5728,0.6879,0.6605,0.23,null],[0.0900088111,0.0378151261,0.1105999164,0.0401887953,0.0042016807,0.012605042,0.063491636,0.0504201681,0.0865653082,0.0168067227,0.0042016807,0.0885709807,0.0782453898,0.987394958,0.0336183781,0.0797089047,0.0358374659,0.3529411765,0.012605042,0.1020439002,0.0210084034,0.0758733581,0.0291536913,0.0210084034,0.0084033613,null,0.1206660487,0.6692579906,0.0413645344,0.1043527865,0.0,null,0.0288242793,0.0767584532,null],[0.2447584892,0.281512605,0.2558918005,0.1324451679,0.1512605042,0.3277310924,0.2040760759,0.2941176471,0.2339250687,0.2394957983,0.2058823529,0.2829557667,0.2295219446,0.987394958,0.1199278912,0.3158274114,0.1444317716,0.4453781513,0.2352941176,0.2793721332,0.1848739496,0.2734539158,0.076986247,0.1974789916,0.1554621849,null,0.2455076835,0.8670519793,0.151334934,0.2598217688,0.0588235294,null,0.1120163592,0.2752668233,null]]}
plots/clustermap_detect.pdf CHANGED
Binary files a/plots/clustermap_detect.pdf and b/plots/clustermap_detect.pdf differ
 
plots/clustermap_detect.png CHANGED

Git LFS Details

  • SHA256: d6eb557c180c449b1a9c7e32786a2be0d03c57e97d70ce357276512ddf119481
  • Pointer size: 131 Bytes
  • Size of remote file: 810 kB

Git LFS Details

  • SHA256: 9b12edeb2eae60cc87acd7b3e14d47458ba5740f73422b76ee5f52da0bf86d5e
  • Pointer size: 131 Bytes
  • Size of remote file: 915 kB
plots/clustermap_instr.json CHANGED
@@ -1 +1 @@
1
- {"columns":["TheBloke\/Llama-2-13B-chat-GPTQ","TheBloke\/Llama-2-7B-Chat-GPTQ","TheBloke\/Wizard-Vicuna-13B-Uncensored-GPTQ","teknium\/OpenHermes-2-Mistral-7B","mistralai\/Mistral-7B-Instruct-v0.2","mistralai\/Mistral-7B-Instruct-v0.1","bigscience\/bloom-7b1","bigscience\/bloom-560m","berkeley-nest\/Starling-LM-7B-alpha","EleutherAI\/gpt-neo-125m","EleutherAI\/gpt-neo-2.7B","EleutherAI\/gpt-j-6b","EleutherAI\/gpt-neo-1.3B","Gryphe\/MythoMax-L2-13b","Open-Orca\/Mistral-7B-OpenOrca","pankajmathur\/orca_mini_3b","KoboldAI\/OPT-13B-Erebus","ehartford\/dolphin-2.1-mistral-7b","togethercomputer\/LLaMA-2-7B-32K","togethercomputer\/GPT-JT-6B-v1","togethercomputer\/Llama-2-7B-32K-Instruct","HuggingFaceH4\/zephyr-7b-alpha","HuggingFaceH4\/zephyr-7b-beta","tiiuae\/falcon-7b-instruct","tiiuae\/falcon-7b","ai-forever\/mGPT","NousResearch\/Yarn-Mistral-7b-128k","NousResearch\/Nous-Hermes-Llama2-13b","DiscoResearch\/mixtral-7b-8expert","meta-llama\/Llama-2-7b-chat-hf","meta-llama\/Llama-2-7b-hf","meta-llama\/Llama-2-13b-chat-hf","meta-llama\/Llama-2-13b-hf"],"index":["MemoTrap, Accuracy","IFEval, Prompt-Level Accuracy"],"data":[[0.641025641,0.7179487179,0.5886752137,0.6079059829,0.5405982906,0.7061965812,0.6826923077,0.860042735,0.6314102564,0.8344017094,0.7756410256,0.7574786325,0.7980769231,0.6463675214,0.6346153846,0.594017094,0.7532051282,0.5897435897,0.561965812,0.6645299145,0.5758547009,0.5352564103,0.5854700855,0.858974359,0.7126068376,0.8643162393,0.5641025641,0.5523504274,0.5737179487,0.733974359,0.6143162393,0.6079059829,0.5758547009],[0.2735674677,0.2606284658,0.1626617375,0.2255083179,0.3844731978,0.3049907579,0.0868761553,0.0850277264,0.2310536044,0.1293900185,0.1885397412,0.179297597,0.1423290203,0.2402957486,0.033271719,0.0628465804,null,0.2735674677,0.1497227357,0.146025878,0.1700554529,0.0609981516,0.0924214418,0.1534195933,0.1164510166,0.1940850277,null,0.1423290203,0.1423290203,0.2865064695,0.1866913124,0.314232902,0.1829944547]]}
 
1
+ {"columns":["TheBloke\/Llama-2-13B-chat-GPTQ","TheBloke\/Llama-2-7B-Chat-GPTQ","TheBloke\/Wizard-Vicuna-13B-Uncensored-GPTQ","teknium\/OpenHermes-2-Mistral-7B","teknium\/OpenHermes-2.5-Mistral-7B","mistralai\/Mistral-7B-Instruct-v0.2","mistralai\/Mistral-7B-Instruct-v0.1","bigscience\/bloom-7b1","bigscience\/bloom-560m","berkeley-nest\/Starling-LM-7B-alpha","EleutherAI\/gpt-neo-125m","EleutherAI\/gpt-neo-2.7B","EleutherAI\/gpt-j-6b","EleutherAI\/gpt-neo-1.3B","Gryphe\/MythoMax-L2-13b","Open-Orca\/Mistral-7B-OpenOrca","pankajmathur\/orca_mini_3b","KoboldAI\/OPT-13B-Erebus","ehartford\/dolphin-2.1-mistral-7b","togethercomputer\/LLaMA-2-7B-32K","togethercomputer\/GPT-JT-6B-v1","togethercomputer\/Llama-2-7B-32K-Instruct","HuggingFaceH4\/zephyr-7b-alpha","HuggingFaceH4\/zephyr-7b-beta","tiiuae\/falcon-7b-instruct","tiiuae\/falcon-7b","ai-forever\/mGPT","NousResearch\/Yarn-Mistral-7b-128k","NousResearch\/Nous-Hermes-Llama2-13b","DiscoResearch\/mixtral-7b-8expert","meta-llama\/Llama-2-7b-chat-hf","meta-llama\/Llama-2-7b-hf","meta-llama\/Llama-2-13b-chat-hf","meta-llama\/Llama-2-13b-hf"],"index":["MemoTrap, Accuracy","IFEval, Prompt-Level 
Accuracy"],"data":[[0.641025641,0.7179487179,0.5886752137,0.6079059829,0.6388888889,0.5405982906,0.7061965812,0.6826923077,0.860042735,0.6314102564,0.8344017094,0.7756410256,0.7574786325,0.7980769231,0.6463675214,0.6346153846,0.594017094,0.7532051282,0.5897435897,0.561965812,0.6645299145,0.5758547009,0.5352564103,0.5854700855,0.858974359,0.7126068376,0.8643162393,0.5641025641,0.5523504274,0.5737179487,0.733974359,0.6143162393,0.6079059829,0.5758547009],[0.2735674677,0.2606284658,0.1626617375,0.2255083179,null,0.3844731978,0.3049907579,0.0868761553,0.0850277264,0.2310536044,0.1293900185,0.1885397412,0.179297597,0.1423290203,0.2402957486,0.033271719,0.0628465804,null,0.2735674677,0.1497227357,0.146025878,0.1700554529,0.0609981516,0.0924214418,0.1534195933,0.1164510166,0.1940850277,null,0.1423290203,0.1423290203,0.2865064695,0.1866913124,0.314232902,0.1829944547]]}
plots/clustermap_instr.pdf CHANGED
Binary files a/plots/clustermap_instr.pdf and b/plots/clustermap_instr.pdf differ
 
plots/clustermap_instr.png CHANGED

Git LFS Details

  • SHA256: 560f2281712a08e541122860291fc98dd6b548cb031dd09afaf52c4618b68022
  • Pointer size: 131 Bytes
  • Size of remote file: 662 kB

Git LFS Details

  • SHA256: 16d7f109668a7cc2034a7d3bc1e247d3983bc25ff29d166831b63d19e7c203e0
  • Pointer size: 131 Bytes
  • Size of remote file: 678 kB
plots/clustermap_qa.json CHANGED
@@ -1 +1 @@
1
- {"columns":["TheBloke\/Llama-2-13B-chat-GPTQ","TheBloke\/Llama-2-7B-Chat-GPTQ","TheBloke\/Wizard-Vicuna-13B-Uncensored-GPTQ","teknium\/OpenHermes-2-Mistral-7B","mistralai\/Mistral-7B-Instruct-v0.2","mistralai\/Mistral-7B-Instruct-v0.1","bigscience\/bloom-7b1","bigscience\/bloom-560m","berkeley-nest\/Starling-LM-7B-alpha","EleutherAI\/gpt-neo-125m","EleutherAI\/gpt-neo-2.7B","EleutherAI\/gpt-j-6b","EleutherAI\/gpt-neo-1.3B","Gryphe\/MythoMax-L2-13b","Open-Orca\/Mistral-7B-OpenOrca","pankajmathur\/orca_mini_3b","KoboldAI\/OPT-13B-Erebus","ehartford\/dolphin-2.1-mistral-7b","togethercomputer\/LLaMA-2-7B-32K","togethercomputer\/GPT-JT-6B-v1","togethercomputer\/Llama-2-7B-32K-Instruct","HuggingFaceH4\/zephyr-7b-alpha","HuggingFaceH4\/zephyr-7b-beta","tiiuae\/falcon-7b-instruct","tiiuae\/falcon-7b","ai-forever\/mGPT","NousResearch\/Yarn-Mistral-7b-128k","NousResearch\/Nous-Hermes-Llama2-13b","DiscoResearch\/mixtral-7b-8expert","meta-llama\/Llama-2-7b-chat-hf","meta-llama\/Llama-2-7b-hf","meta-llama\/Llama-2-13b-chat-hf","meta-llama\/Llama-2-13b-hf","upstage\/SOLAR-10.7B-Instruct-v1.0"],"index":["TruthfulQA MC1, Accuracy","TriviaQA, EM","NQ, EM","TruthfulQA MC2, 
Accuracy"],"data":[[0.2656058752,0.2900856793,0.358629131,0.3390452876,0.5250917993,0.3916768666,0.2239902081,0.2447980416,0.3157894737,0.2582619339,0.2386780906,0.2019583843,0.2313341493,0.364749082,0.3525091799,0.2802937576,0.205630355,0.3904528764,0.2558139535,0.2264381885,0.3096695226,0.4063647491,0.3867809058,0.2876376989,0.2239902081,0.2325581395,0.2741738066,0.3476132191,0.247246022,0.3023255814,0.2521419829,0.2802937576,0.2594859241,null],[0.0947949175,0.0817543469,0.0961324119,0.6805060187,0.6389322336,0.5224030317,0.2633749443,0.0570664289,0.6594962104,0.0155483727,0.2011257245,0.3931676326,0.1415514935,0.0974141774,0.6583816317,0.3750557289,0.3940592956,0.6674654481,0.0815871601,0.4172982613,0.0927329469,0.6591618368,0.6443379403,0.3915514935,0.5866584931,0.1061078912,0.7003455194,0.0984730272,0.0,0.0883303611,0.0921756576,0.0978600089,0.0948506465,0.4198060633],[0.0263157895,0.0263157895,0.0232686981,0.0335180055,0.0313019391,0.0249307479,0.0329639889,0.0102493075,0.0343490305,0.0049861496,0.0542936288,0.0916897507,0.0368421053,0.028531856,0.0293628809,0.0265927978,0.1307479224,0.0293628809,0.0238227147,0.1091412742,0.0271468144,0.0304709141,0.0315789474,0.135734072,0.2207756233,0.0232686981,null,0.0254847645,0.0,0.0252077562,0.0268698061,0.0268698061,0.0274238227,0.1991689751],[0.4167499124,0.4410061226,0.5164091712,0.509255685,0.6681645582,0.5592109222,0.3889466583,0.4243149954,0.4731016618,0.4557936883,0.3986263303,0.3595710074,0.3961377938,0.515367679,0.5225657507,0.423037498,0.352115369,0.558968896,0.3841009802,0.3706818154,0.4572778615,0.5602234073,0.5511952533,0.4407667557,0.3426523695,0.3962402525,0.4225232911,0.5026754146,0.4947679694,0.4531160226,0.389651281,0.4394613382,0.3689257684,null]]}
 
1
+ {"columns":["TheBloke\/Llama-2-13B-chat-GPTQ","TheBloke\/Llama-2-7B-Chat-GPTQ","TheBloke\/Wizard-Vicuna-13B-Uncensored-GPTQ","teknium\/OpenHermes-2-Mistral-7B","teknium\/OpenHermes-2.5-Mistral-7B","mistralai\/Mistral-7B-Instruct-v0.2","mistralai\/Mistral-7B-Instruct-v0.1","bigscience\/bloom-7b1","bigscience\/bloom-560m","berkeley-nest\/Starling-LM-7B-alpha","EleutherAI\/gpt-neo-125m","EleutherAI\/gpt-neo-2.7B","EleutherAI\/gpt-j-6b","EleutherAI\/gpt-neo-1.3B","Gryphe\/MythoMax-L2-13b","Open-Orca\/Mistral-7B-OpenOrca","pankajmathur\/orca_mini_3b","KoboldAI\/OPT-13B-Erebus","ehartford\/dolphin-2.1-mistral-7b","togethercomputer\/LLaMA-2-7B-32K","togethercomputer\/GPT-JT-6B-v1","togethercomputer\/Llama-2-7B-32K-Instruct","HuggingFaceH4\/zephyr-7b-alpha","HuggingFaceH4\/zephyr-7b-beta","tiiuae\/falcon-7b-instruct","tiiuae\/falcon-7b","ai-forever\/mGPT","NousResearch\/Yarn-Mistral-7b-128k","NousResearch\/Nous-Hermes-Llama2-13b","DiscoResearch\/mixtral-7b-8expert","meta-llama\/Llama-2-7b-chat-hf","meta-llama\/Llama-2-7b-hf","meta-llama\/Llama-2-13b-chat-hf","meta-llama\/Llama-2-13b-hf","upstage\/SOLAR-10.7B-Instruct-v1.0"],"index":["TruthfulQA MC1, Accuracy","TriviaQA, EM","NQ, EM","TruthfulQA MC2, 
Accuracy"],"data":[[0.2656058752,0.2900856793,0.358629131,0.3390452876,0.3598531212,0.5250917993,0.3916768666,0.2239902081,0.2447980416,0.3157894737,0.2582619339,0.2386780906,0.2019583843,0.2313341493,0.364749082,0.3525091799,0.2802937576,0.205630355,0.3904528764,0.2558139535,0.2264381885,0.3096695226,0.4063647491,0.3867809058,0.2876376989,0.2239902081,0.2325581395,0.2741738066,0.3476132191,0.247246022,0.3023255814,0.2521419829,0.2802937576,0.2594859241,null],[0.0947949175,0.0817543469,0.0961324119,0.6805060187,null,0.6389322336,0.5224030317,0.2633749443,0.0570664289,0.6594962104,0.0155483727,0.2011257245,0.3931676326,0.1415514935,0.0974141774,0.6583816317,0.3750557289,0.3940592956,0.6674654481,0.0815871601,0.4172982613,0.0927329469,0.6591618368,0.6443379403,0.3915514935,0.5866584931,0.1061078912,0.7003455194,0.0984730272,0.0,0.0883303611,0.0921756576,0.0978600089,0.0948506465,0.4198060633],[0.0263157895,0.0263157895,0.0232686981,0.0335180055,0.0349030471,0.0313019391,0.0249307479,0.0329639889,0.0102493075,0.0343490305,0.0049861496,0.0542936288,0.0916897507,0.0368421053,0.028531856,0.0293628809,0.0265927978,0.1307479224,0.0293628809,0.0238227147,0.1091412742,0.0271468144,0.0304709141,0.0315789474,0.135734072,0.2207756233,0.0232686981,null,0.0254847645,0.0,0.0252077562,0.0268698061,0.0268698061,0.0274238227,0.1991689751],[0.4167499124,0.4410061226,0.5164091712,0.509255685,0.530496424,0.6681645582,0.5592109222,0.3889466583,0.4243149954,0.4731016618,0.4557936883,0.3986263303,0.3595710074,0.3961377938,0.515367679,0.5225657507,0.423037498,0.352115369,0.558968896,0.3841009802,0.3706818154,0.4572778615,0.5602234073,0.5511952533,0.4407667557,0.3426523695,0.3962402525,0.4225232911,0.5026754146,0.4947679694,0.4531160226,0.389651281,0.4394613382,0.3689257684,null]]}
plots/clustermap_qa.pdf CHANGED
Binary files a/plots/clustermap_qa.pdf and b/plots/clustermap_qa.pdf differ
 
plots/clustermap_qa.png CHANGED

Git LFS Details

  • SHA256: c022da90152d7c4768666c41a867824c1632311d8f1b7825bf47d350eb3facbc
  • Pointer size: 131 Bytes
  • Size of remote file: 789 kB

Git LFS Details

  • SHA256: c7be3692472a9f4dc52d4f73eb59e6b92212ecd54bde3c0d6e29aafc030ed384
  • Pointer size: 131 Bytes
  • Size of remote file: 806 kB
plots/clustermap_rc.json CHANGED
@@ -1 +1 @@
1
- {"columns":["TheBloke\/Llama-2-13B-chat-GPTQ","TheBloke\/Llama-2-7B-Chat-GPTQ","TheBloke\/Wizard-Vicuna-13B-Uncensored-GPTQ","mistralai\/Mistral-7B-Instruct-v0.2","bigscience\/bloom-7b1","bigscience\/bloom-560m","berkeley-nest\/Starling-LM-7B-alpha","EleutherAI\/gpt-neo-125m","EleutherAI\/gpt-neo-2.7B","EleutherAI\/gpt-j-6b","EleutherAI\/gpt-neo-1.3B","Gryphe\/MythoMax-L2-13b","Open-Orca\/Mistral-7B-OpenOrca","pankajmathur\/orca_mini_3b","KoboldAI\/OPT-13B-Erebus","ehartford\/dolphin-2.1-mistral-7b","togethercomputer\/LLaMA-2-7B-32K","togethercomputer\/GPT-JT-6B-v1","togethercomputer\/Llama-2-7B-32K-Instruct","HuggingFaceH4\/zephyr-7b-alpha","HuggingFaceH4\/zephyr-7b-beta","tiiuae\/falcon-7b-instruct","tiiuae\/falcon-7b","ai-forever\/mGPT","NousResearch\/Nous-Hermes-Llama2-13b","DiscoResearch\/mixtral-7b-8expert","meta-llama\/Llama-2-7b-chat-hf","meta-llama\/Llama-2-7b-hf","meta-llama\/Llama-2-13b-chat-hf","meta-llama\/Llama-2-13b-hf","teknium\/OpenHermes-2-Mistral-7B","mistralai\/Mistral-7B-Instruct-v0.1","NousResearch\/Yarn-Mistral-7b-128k","upstage\/SOLAR-10.7B-Instruct-v1.0"],"index":["SQUADv2, EM","SQUADv2, HasAns","SQUADv2, NoAns","RACE, 
Accuracy"],"data":[[0.1329908195,0.1431820096,0.1281900109,0.3019455908,0.2228585867,0.0799292512,0.3982986608,0.0476711867,0.1752716247,0.2339762486,0.1510991325,0.1514360313,0.3049776805,0.2034026783,0.2296807883,0.3536595637,0.1234734271,0.3176956119,0.139560347,0.3088520172,0.3112103091,0.2263117999,0.2642129201,0.1662595806,0.257811842,0.0,0.1199359892,0.1446138297,0.1327381454,0.1591004801,null,null,null,null],[0.0811403509,0.0718623482,0.0558367072,0.604757085,0.4461875843,0.1589068826,0.7967273954,0.0755735493,0.3500337382,0.467780027,0.3021255061,0.0733805668,0.6108299595,0.407219973,0.4586707152,0.7083333333,0.0632591093,0.6363022942,0.0841767881,0.6185897436,0.6233130904,0.4531039136,0.5291835358,0.000168691,0.5163630229,0.0,0.0657894737,0.0566801619,0.0796221323,0.0775978408,null,null,null,null],[0.1846930193,0.2142977292,0.2003364172,0.0,0.0001682086,0.0011774601,0.0010092515,0.0198486123,0.0010092515,0.0008410429,0.0005046257,0.2292682927,0.0,0.0001682086,0.0013456686,0.0,0.1835155593,0.0,0.1947855341,0.0,0.0,0.0001682086,0.0,0.3318755257,0.0,0.0,0.1739276703,0.2322960471,0.1857022708,0.2403700589,null,null,null,null],[0.4574162679,0.4220095694,0.4325358852,0.4593301435,0.3655502392,0.3023923445,0.4688995215,0.2755980861,0.35215311,0.376076555,0.3406698565,0.433492823,0.4717703349,0.3674641148,0.3827751196,0.4555023923,0.3770334928,0.3645933014,0.4,0.4641148325,0.4583732057,0.3722488038,0.3741626794,0.2937799043,0.4593301435,0.2220095694,0.4373205742,0.395215311,0.4612440191,0.404784689,0.4392344498,0.4229665072,0.4076555024,0.419138756]]}
 
1
+ {"columns":["TheBloke\/Llama-2-13B-chat-GPTQ","TheBloke\/Llama-2-7B-Chat-GPTQ","TheBloke\/Wizard-Vicuna-13B-Uncensored-GPTQ","teknium\/OpenHermes-2.5-Mistral-7B","mistralai\/Mistral-7B-Instruct-v0.2","bigscience\/bloom-7b1","bigscience\/bloom-560m","berkeley-nest\/Starling-LM-7B-alpha","EleutherAI\/gpt-neo-125m","EleutherAI\/gpt-neo-2.7B","EleutherAI\/gpt-j-6b","EleutherAI\/gpt-neo-1.3B","Gryphe\/MythoMax-L2-13b","Open-Orca\/Mistral-7B-OpenOrca","pankajmathur\/orca_mini_3b","KoboldAI\/OPT-13B-Erebus","ehartford\/dolphin-2.1-mistral-7b","togethercomputer\/LLaMA-2-7B-32K","togethercomputer\/GPT-JT-6B-v1","togethercomputer\/Llama-2-7B-32K-Instruct","HuggingFaceH4\/zephyr-7b-alpha","HuggingFaceH4\/zephyr-7b-beta","tiiuae\/falcon-7b-instruct","tiiuae\/falcon-7b","ai-forever\/mGPT","NousResearch\/Nous-Hermes-Llama2-13b","DiscoResearch\/mixtral-7b-8expert","meta-llama\/Llama-2-7b-chat-hf","meta-llama\/Llama-2-7b-hf","meta-llama\/Llama-2-13b-chat-hf","meta-llama\/Llama-2-13b-hf","teknium\/OpenHermes-2-Mistral-7B","mistralai\/Mistral-7B-Instruct-v0.1","NousResearch\/Yarn-Mistral-7b-128k","upstage\/SOLAR-10.7B-Instruct-v1.0"],"index":["SQuADv2, EM","SQuADv2, HasAns","SQuADv2, NoAns","RACE, 
Accuracy"],"data":[[0.1329908195,0.1431820096,0.1281900109,0.0681377916,0.3019455908,0.2228585867,0.0799292512,0.3982986608,0.0476711867,0.1752716247,0.2339762486,0.1510991325,0.1514360313,0.3049776805,0.2034026783,0.2296807883,0.3536595637,0.1234734271,0.3176956119,0.139560347,0.3088520172,0.3112103091,0.2263117999,0.2642129201,0.1662595806,0.257811842,0.0,0.1199359892,0.1446138297,0.1327381454,0.1591004801,null,null,null,null],[0.0811403509,0.0718623482,0.0558367072,0.1364709852,0.604757085,0.4461875843,0.1589068826,0.7967273954,0.0755735493,0.3500337382,0.467780027,0.3021255061,0.0733805668,0.6108299595,0.407219973,0.4586707152,0.7083333333,0.0632591093,0.6363022942,0.0841767881,0.6185897436,0.6233130904,0.4531039136,0.5291835358,0.000168691,0.5163630229,0.0,0.0657894737,0.0566801619,0.0796221323,0.0775978408,null,null,null,null],[0.1846930193,0.2142977292,0.2003364172,0.0,0.0,0.0001682086,0.0011774601,0.0010092515,0.0198486123,0.0010092515,0.0008410429,0.0005046257,0.2292682927,0.0,0.0001682086,0.0013456686,0.0,0.1835155593,0.0,0.1947855341,0.0,0.0,0.0001682086,0.0,0.3318755257,0.0,0.0,0.1739276703,0.2322960471,0.1857022708,0.2403700589,null,null,null,null],[0.4574162679,0.4220095694,0.4325358852,0.44784689,0.4593301435,0.3655502392,0.3023923445,0.4688995215,0.2755980861,0.35215311,0.376076555,0.3406698565,0.433492823,0.4717703349,0.3674641148,0.3827751196,0.4555023923,0.3770334928,0.3645933014,0.4,0.4641148325,0.4583732057,0.3722488038,0.3741626794,0.2937799043,0.4593301435,0.2220095694,0.4373205742,0.395215311,0.4612440191,0.404784689,0.4392344498,0.4229665072,0.4076555024,0.419138756]]}
plots/clustermap_rc.pdf CHANGED
Binary files a/plots/clustermap_rc.pdf and b/plots/clustermap_rc.pdf differ
 
plots/clustermap_rc.png CHANGED

Git LFS Details

  • SHA256: e1fad784b2ab132e6e2b6fb558c65a698acda0eeb51d8ef47bd1099bfcd02300
  • Pointer size: 131 Bytes
  • Size of remote file: 777 kB

Git LFS Details

  • SHA256: f72ae88335e4fcb6b8d1614572a6eea6c53068d0d11e10e9c36fb23eb5e4d64f
  • Pointer size: 131 Bytes
  • Size of remote file: 796 kB
plots/clustermap_summ.json CHANGED
@@ -1 +1 @@
1
- {"columns":["TheBloke\/Llama-2-13B-chat-GPTQ","TheBloke\/Llama-2-7B-Chat-GPTQ","TheBloke\/Wizard-Vicuna-13B-Uncensored-GPTQ","bigscience\/bloom-7b1","bigscience\/bloom-560m","berkeley-nest\/Starling-LM-7B-alpha","EleutherAI\/gpt-neo-125m","EleutherAI\/gpt-neo-2.7B","EleutherAI\/gpt-j-6b","EleutherAI\/gpt-neo-1.3B","Gryphe\/MythoMax-L2-13b","pankajmathur\/orca_mini_3b","KoboldAI\/OPT-13B-Erebus","togethercomputer\/LLaMA-2-7B-32K","togethercomputer\/GPT-JT-6B-v1","togethercomputer\/Llama-2-7B-32K-Instruct","tiiuae\/falcon-7b-instruct","tiiuae\/falcon-7b","ai-forever\/mGPT","upstage\/SOLAR-10.7B-Instruct-v1.0","meta-llama\/Llama-2-7b-chat-hf","meta-llama\/Llama-2-7b-hf","meta-llama\/Llama-2-13b-chat-hf","meta-llama\/Llama-2-13b-hf","teknium\/OpenHermes-2-Mistral-7B","mistralai\/Mistral-7B-Instruct-v0.1","Open-Orca\/Mistral-7B-OpenOrca","ehartford\/dolphin-2.1-mistral-7b","HuggingFaceH4\/zephyr-7b-alpha","HuggingFaceH4\/zephyr-7b-beta","NousResearch\/Yarn-Mistral-7b-128k","NousResearch\/Nous-Hermes-Llama2-13b","DiscoResearch\/mixtral-7b-8expert"],"index":["XSum, ROUGE-1","XSum, ROUGE-2","XSum, ROUGE-L","XSum, factKB","XSum, BERT-P","CNN\/DM, ROUGE-1","CNN\/DM, ROUGE-2","CNN\/DM, ROUGE-L","CNN\/DM, factKB","CNN\/DM, 
BERT-P"],"data":[[0.0371901778,0.0347891843,0.0428959698,0.2203326049,0.1749600982,0.0504142569,0.1607960902,0.1560967861,0.2724328188,0.1948865046,0.0384377825,0.2457075618,0.298490147,0.0432590646,0.3344535486,0.042540476,0.2352382212,0.2094982527,0.0153408429,0.001309379,0.0340250019,0.0445139517,0.0359765055,0.0413856309,null,null,null,null,null,null,null,null,null],[0.0,0.0,0.0,0.0475386414,0.0262246323,0.0,0.0214328855,0.0377156872,0.0872967218,0.0385742496,0.0,0.0653846178,0.105823501,0.0,0.1240294423,0.0,0.0614764833,0.0613314645,0.0010001753,0.0007904875,0.0,0.0,0.0,0.0,null,null,null,null,null,null,null,null,null],[0.0371901778,0.0347891843,0.0428959698,0.1668276326,0.1356427068,0.0504142569,0.1262087692,0.1205109613,0.2136831021,0.1487592997,0.0384377825,0.1854881243,0.2370092437,0.0432590646,0.2647281333,0.042540476,0.1822281713,0.1657837663,0.0141983599,0.0011478244,0.0340250019,0.0445139517,0.0359765055,0.0413856309,null,null,null,null,null,null,null,null,null],[0.0401970892,0.0394161696,0.0428647574,0.5674834215,0.2298042738,0.0397678158,0.2459031195,0.3441669812,0.4791825971,0.4144174002,0.0384018147,0.5666702185,0.4707080535,0.0407451505,0.3412336695,0.0400569668,0.4089209967,0.3192473228,0.1421243598,0.1811100146,0.0379350583,0.0428665575,0.0387781905,0.0411868449,null,null,null,null,null,null,null,null,null],[0.3949103208,0.3837605404,0.4030390077,0.6528773697,0.6049370656,0.4270770697,0.5845326816,0.4448599865,0.6811507025,0.6106414046,0.3989033275,0.6324491786,0.709515113,0.4014707047,0.7352085092,0.3991319337,0.6545591434,0.5731293539,0.4255551427,0.0021477235,0.3853941111,0.4050690645,0.3917698265,0.401477077,null,null,null,null,null,null,null,null,null],[0.0132261781,0.0098347434,0.012722468,0.2307383556,0.1357875781,0.0169222534,0.1400837608,0.2215130966,0.2555628211,0.2302678427,0.0135334171,0.2558424993,0.2379621599,0.0133793406,0.2670262038,0.012671213,0.2131071975,0.1893372238,0.0165830818,0.0006702693,0.0103604417,0.0145397085,null,0.01
35674045,0.0114292948,0.008927303,0.0157445278,0.0142701427,0.0117323751,0.0111061484,0.0116271987,0.0131859896,0.0000716868],[0.0,0.0,0.000001892,0.089367922,0.0423122873,0.000001892,0.0429218938,0.0925038287,0.1067787126,0.0938882235,0.0,0.0985488725,0.0919108951,0.0,0.1076334391,0.0,0.0774515089,0.0724591632,0.000524905,0.0003540662,0.0,0.0,null,0.0,0.0,0.0,0.000001892,0.000001892,0.0,0.0,0.0,0.0,0.0],[0.0132261781,0.0098347434,0.012722468,0.2075387019,0.1227088127,0.0169222534,0.1243996336,0.1958355522,0.2238202254,0.2016505067,0.0135334171,0.2242330745,0.2130128961,0.0133793406,0.2408143362,0.012671213,0.1882467392,0.1691898288,0.0164863838,0.0006113776,0.0103604417,0.0145397085,null,0.0135674045,0.0114292948,0.008927303,0.0157445278,0.0142701427,0.0117323751,0.0111061484,0.0116271987,0.0131859896,0.0000716868],[0.1686613542,0.2305715843,0.1762471835,0.9249732766,0.7988642532,0.2070823135,0.7133009049,0.9058066799,0.9396594147,0.8739379747,0.1648057051,0.8669051787,0.9215527018,0.1575508605,0.94858605,0.1617593629,0.8267075767,0.901661868,0.7995222351,0.9509124595,0.1580946885,0.1517425501,null,0.155634531,0.1795055867,0.1750336135,0.2233354576,0.1965546069,0.1759309876,0.1893680342,0.166116994,0.1795224981,0.0805047156],[0.3668251897,0.3243336913,0.3548213557,0.6009727172,0.4646713339,0.3964162779,0.4789093344,0.5761275867,0.617234758,0.5888279105,0.3663381487,0.6159694987,0.6594958169,0.3663761784,0.7087181979,0.3645764744,0.5926767984,0.5112991874,0.4542439283,0.0013591523,0.3566615503,0.367965746,null,0.3640197324,0.3652777882,0.3544185533,0.3898203604,0.3785245521,0.3628082484,0.3673437638,0.3637139385,0.3722801591,0.3306992617]]}
 
1
+ {"columns":["TheBloke\/Llama-2-13B-chat-GPTQ","TheBloke\/Llama-2-7B-Chat-GPTQ","TheBloke\/Wizard-Vicuna-13B-Uncensored-GPTQ","bigscience\/bloom-7b1","bigscience\/bloom-560m","berkeley-nest\/Starling-LM-7B-alpha","EleutherAI\/gpt-neo-125m","EleutherAI\/gpt-neo-2.7B","EleutherAI\/gpt-j-6b","EleutherAI\/gpt-neo-1.3B","Gryphe\/MythoMax-L2-13b","pankajmathur\/orca_mini_3b","KoboldAI\/OPT-13B-Erebus","togethercomputer\/LLaMA-2-7B-32K","togethercomputer\/GPT-JT-6B-v1","togethercomputer\/Llama-2-7B-32K-Instruct","tiiuae\/falcon-7b-instruct","tiiuae\/falcon-7b","ai-forever\/mGPT","upstage\/SOLAR-10.7B-Instruct-v1.0","meta-llama\/Llama-2-7b-chat-hf","meta-llama\/Llama-2-7b-hf","meta-llama\/Llama-2-13b-chat-hf","meta-llama\/Llama-2-13b-hf","teknium\/OpenHermes-2-Mistral-7B","mistralai\/Mistral-7B-Instruct-v0.1","Open-Orca\/Mistral-7B-OpenOrca","ehartford\/dolphin-2.1-mistral-7b","HuggingFaceH4\/zephyr-7b-alpha","HuggingFaceH4\/zephyr-7b-beta","NousResearch\/Yarn-Mistral-7b-128k","NousResearch\/Nous-Hermes-Llama2-13b","DiscoResearch\/mixtral-7b-8expert"],"index":["XSum, ROUGE-1","XSum, ROUGE-2","XSum, ROUGE-L","XSum, factKB","XSum, BERT-P","CNN\/DM, ROUGE-1","CNN\/DM, ROUGE-2","CNN\/DM, ROUGE-L","CNN\/DM, factKB","CNN\/DM, BERT-P","CNN\/DM v2, ROUGE-1","CNN\/DM v2, ROUGE-2","CNN\/DM v2, ROUGE-L","CNN\/DM v2, factKB","CNN\/DM v2, BERT-P","XSum v2, ROUGE-1","XSum v2, ROUGE-2","XSum v2, ROUGE-L","XSum v2, factKB","XSum v2, 
BERT-P"],"data":[[0.0371901778,0.0347891843,0.0428959698,0.2203326049,0.1749600982,0.0504142569,0.1607960902,0.1560967861,0.2724328188,0.1948865046,0.0384377825,0.2457075618,0.298490147,0.0432590646,0.3344535486,0.042540476,0.2352382212,0.2094982527,0.0153408429,0.001309379,0.0340250019,0.0445139517,0.0359765055,0.0413856309,null,null,null,null,null,null,null,null,null],[0.0,0.0,0.0,0.0475386414,0.0262246323,0.0,0.0214328855,0.0377156872,0.0872967218,0.0385742496,0.0,0.0653846178,0.105823501,0.0,0.1240294423,0.0,0.0614764833,0.0613314645,0.0010001753,0.0007904875,0.0,0.0,0.0,0.0,null,null,null,null,null,null,null,null,null],[0.0371901778,0.0347891843,0.0428959698,0.1668276326,0.1356427068,0.0504142569,0.1262087692,0.1205109613,0.2136831021,0.1487592997,0.0384377825,0.1854881243,0.2370092437,0.0432590646,0.2647281333,0.042540476,0.1822281713,0.1657837663,0.0141983599,0.0011478244,0.0340250019,0.0445139517,0.0359765055,0.0413856309,null,null,null,null,null,null,null,null,null],[0.0401970892,0.0394161696,0.0428647574,0.5674834215,0.2298042738,0.0397678158,0.2459031195,0.3441669812,0.4791825971,0.4144174002,0.0384018147,0.5666702185,0.4707080535,0.0407451505,0.3412336695,0.0400569668,0.4089209967,0.3192473228,0.1421243598,0.1811100146,0.0379350583,0.0428665575,0.0387781905,0.0411868449,null,null,null,null,null,null,null,null,null],[0.3949103208,0.3837605404,0.4030390077,0.6528773697,0.6049370656,0.4270770697,0.5845326816,0.4448599865,0.6811507025,0.6106414046,0.3989033275,0.6324491786,0.709515113,0.4014707047,0.7352085092,0.3991319337,0.6545591434,0.5731293539,0.4255551427,0.0021477235,0.3853941111,0.4050690645,0.3917698265,0.401477077,null,null,null,null,null,null,null,null,null],[0.0132261781,0.0098347434,0.012722468,0.2307383556,0.1357875781,0.0169222534,0.1400837608,0.2215130966,0.2555628211,0.2302678427,0.0135334171,0.2558424993,0.2379621599,0.0133793406,0.2670262038,0.012671213,0.2131071975,0.1893372238,0.0165830818,0.0006702693,0.0103604417,0.0145397085,null,0.01
35674045,0.0114292948,0.008927303,0.0157445278,0.0142701427,0.0117323751,0.0111061484,0.0116271987,0.0131859896,0.0000716868],[0.0,0.0,0.000001892,0.089367922,0.0423122873,0.000001892,0.0429218938,0.0925038287,0.1067787126,0.0938882235,0.0,0.0985488725,0.0919108951,0.0,0.1076334391,0.0,0.0774515089,0.0724591632,0.000524905,0.0003540662,0.0,0.0,null,0.0,0.0,0.0,0.000001892,0.000001892,0.0,0.0,0.0,0.0,0.0],[0.0132261781,0.0098347434,0.012722468,0.2075387019,0.1227088127,0.0169222534,0.1243996336,0.1958355522,0.2238202254,0.2016505067,0.0135334171,0.2242330745,0.2130128961,0.0133793406,0.2408143362,0.012671213,0.1882467392,0.1691898288,0.0164863838,0.0006113776,0.0103604417,0.0145397085,null,0.0135674045,0.0114292948,0.008927303,0.0157445278,0.0142701427,0.0117323751,0.0111061484,0.0116271987,0.0131859896,0.0000716868],[0.1686613542,0.2305715843,0.1762471835,0.9249732766,0.7988642532,0.2070823135,0.7133009049,0.9058066799,0.9396594147,0.8739379747,0.1648057051,0.8669051787,0.9215527018,0.1575508605,0.94858605,0.1617593629,0.8267075767,0.901661868,0.7995222351,0.9509124595,0.1580946885,0.1517425501,null,0.155634531,0.1795055867,0.1750336135,0.2233354576,0.1965546069,0.1759309876,0.1893680342,0.166116994,0.1795224981,0.0805047156],[0.3668251897,0.3243336913,0.3548213557,0.6009727172,0.4646713339,0.3964162779,0.4789093344,0.5761275867,0.617234758,0.5888279105,0.3663381487,0.6159694987,0.6594958169,0.3663761784,0.7087181979,0.3645764744,0.5926767984,0.5112991874,0.4542439283,0.0013591523,0.3566615503,0.367965746,null,0.3640197324,0.3652777882,0.3544185533,0.3898203604,0.3785245521,0.3628082484,0.3673437638,0.3637139385,0.3722801591,0.3306992617],[null,null,null,null,null,null,null,0.1819405822,null,0.22221396,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null],[null,null,null,null,null,null,null,0.0842738314,null,0.1031631811,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,
null,null,null,null,null,null],[null,null,null,null,null,null,null,0.1559188347,null,0.1900936275,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null],[null,null,null,null,null,null,null,0.9662616708,null,0.9704224166,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null],[null,null,null,null,null,null,null,0.352680524,null,0.4360266906,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null],[null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,0.1691722959,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null],[null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,0.0370226737,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null],[null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,0.1185889375,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null],[null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,0.6746885531,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null],[null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,0.3937310434,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null]]}
plots/clustermap_summ.pdf CHANGED
Binary files a/plots/clustermap_summ.pdf and b/plots/clustermap_summ.pdf differ
 
plots/clustermap_summ.png CHANGED

Git LFS Details

  • SHA256: 0f80554cccdbc8765019771f37c64fd72b4e83856e785790460c8123397e5fca
  • Pointer size: 131 Bytes
  • Size of remote file: 997 kB

Git LFS Details

  • SHA256: ab89444c49ceb70502af3d2a63e4ff0e95d79be48f2659ca8e2e7010ce2784e5
  • Pointer size: 132 Bytes
  • Size of remote file: 1.19 MB