Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
user name
committed on
Commit
·
5e86dd4
1
Parent(s):
bfbf195
add factuality, faithfulness scores
Browse files
- src/display/utils.py +3 -0
- src/populate.py +10 -1
src/display/utils.py
CHANGED
@@ -74,6 +74,9 @@ auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "ma
|
|
74 |
#Scores
|
75 |
# auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Avg", "number", True)])
|
76 |
|
|
|
|
|
|
|
77 |
for task in Tasks:
|
78 |
auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
|
79 |
|
|
|
74 |
#Scores
|
75 |
# auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Avg", "number", True)])
|
76 |
|
77 |
+
auto_eval_column_dict.append(["Faithfulness", ColumnContent, ColumnContent("Faithfulness", "number", True)])
|
78 |
+
auto_eval_column_dict.append(["Factuality", ColumnContent, ColumnContent("Factuality", "number", True)])
|
79 |
+
|
80 |
for task in Tasks:
|
81 |
auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
|
82 |
|
src/populate.py
CHANGED
@@ -60,11 +60,20 @@ def get_leaderboard_df(results_path: str,
|
|
60 |
# if AutoEvalColumn.average.name in df:
|
61 |
# df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
|
62 |
|
63 |
-
|
|
|
|
|
|
|
64 |
|
65 |
# filter out if any of the benchmarks have not been produced
|
66 |
df = df[has_no_nan_values(df, benchmark_cols)]
|
67 |
|
|
|
|
|
|
|
|
|
|
|
|
|
68 |
return raw_data, df
|
69 |
|
70 |
|
|
|
60 |
# if AutoEvalColumn.average.name in df:
|
61 |
# df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
|
62 |
|
63 |
+
cols_mod = copy.deepcopy(cols)
|
64 |
+
cols_mod.remove('Faithfulness')
|
65 |
+
cols_mod.remove('Factuality')
|
66 |
+
df = df[cols_mod]#.round(decimals=2)
|
67 |
|
68 |
# filter out if any of the benchmarks have not been produced
|
69 |
df = df[has_no_nan_values(df, benchmark_cols)]
|
70 |
|
71 |
+
Factuality_score = df[factuality_tasks].mean(axis=1)
|
72 |
+
Faithfulness_score = df[faithfulness_tasks].mean(axis=1)
|
73 |
+
df.insert(2, 'Factuality', Factuality_score)
|
74 |
+
df.insert(2, 'Faithfulness', Faithfulness_score)
|
75 |
+
df = df.round(decimals=2)
|
76 |
+
|
77 |
return raw_data, df
|
78 |
|
79 |
|