Spaces:
Running
on
Zero
Running
on
Zero
import os | |
import gradio as gr | |
import json | |
import pandas as pd | |
import plotly.express as px | |
import plotly.graph_objects as go | |
from datasets import load_dataset | |
from plotly.subplots import make_subplots | |
# Benchmark axes. Each list is defined exactly once (earlier duplicate
# assignments were shadowed and have been removed). The plotting code iterates
# these lists in order when aggregating scores.
CATEGORIES = [
    "task-solving",
    "math-reasoning",
    "general-instruction",
    "natural-question",
    "safety",
]
LANGS = ['en', 'vi', 'id', 'ms', 'tl', 'th', 'km', 'lo', 'my']

# Runtime configuration, read from the environment once at import time.
# FORCE_DOWNLOAD must be an integer string ("0" / "1"); anything else makes
# int() raise ValueError during import.
FORCE_DOWNLOAD = bool(int(os.environ.get("FORCE_DOWNLOAD", "0")))
HF_TOKEN = str(os.environ.get("HF_TOKEN", ""))
DATA_SET_REPO_PATH = str(os.environ.get("DATA_SET_REPO_PATH", ""))
PERFORMANCE_FILENAME = str(os.environ.get("PERFORMANCE_FILENAME", "gpt4_single_json.csv"))

# Raw model identifier (value of the "model" column in the results file)
# -> display name used in the charts. Only models listed here are plotted.
rename_map = {
    "seallm13b10L6k_a_5a1R1_seaall_sft4x_1_5a1_r2_0_dpo_8_40000s": "SeaLLM-13b",
    "polylm": "PolyLM-13b",
    "qwen": "Qwen-14b",
    "gpt-3.5-turbo": "GPT-3.5-turbo",
}

# Category identifier -> axis label shown on the polar charts.
CATEGORIES_NAMES = {
    "task-solving": 'Task-solving',
    "math-reasoning": 'Math',
    "general-instruction": 'General-instruction',
    "natural-question": 'NaturalQA',
    "safety": 'Safety',
}

# Language code -> axis label / subplot title.
# NOTE: the key order here differs from LANGS; look labels up by language code
# instead of relying on this dict's iteration order.
LANG_NAMES = {
    'en': 'eng',
    'vi': 'vie',
    'th': 'tha',
    'id': 'ind',
    'km': 'khm',
    'lo': 'lao',
    'ms': 'msa',
    'my': 'mya',
    'tl': 'tgl',
}

# Placeholder for a cached results frame. Nothing in the visible code assigns
# it; get_model_df() re-reads the file on every call.
MODEL_DFRAME = None
def get_model_df():
    """Download the benchmark results CSV from the Hugging Face Hub and load it.

    The dataset repository, file name, access token and force-download flag
    are taken from the module-level settings ``DATA_SET_REPO_PATH``,
    ``PERFORMANCE_FILENAME``, ``HF_TOKEN`` and ``FORCE_DOWNLOAD``. The file is
    fetched and parsed on every call; no DataFrame is cached in this module.

    Returns:
        pandas.DataFrame: the parsed contents of the downloaded CSV file.

    Raises:
        ValueError: if ``DATA_SET_REPO_PATH`` or ``HF_TOKEN`` is empty.
    """
    # Imported inside the function so that importing this module does not
    # itself require huggingface_hub.
    from huggingface_hub import hf_hub_download

    # Explicit checks instead of `assert`: assertions are stripped when Python
    # runs with -O, which would let an empty setting reach the Hub client.
    if DATA_SET_REPO_PATH == '':
        raise ValueError("DATA_SET_REPO_PATH must be set to the dataset repository id")
    if HF_TOKEN == '':
        raise ValueError("HF_TOKEN must be set to a Hugging Face access token")

    file_path = hf_hub_download(
        repo_id=DATA_SET_REPO_PATH,
        filename=PERFORMANCE_FILENAME,
        force_download=FORCE_DOWNLOAD,
        local_dir='./hf_cache',
        repo_type="dataset",
        token=HF_TOKEN,
    )
    print(f'Downloaded file at {file_path} from {DATA_SET_REPO_PATH} / {PERFORMANCE_FILENAME}')
    return pd.read_csv(file_path)
def aggregate_df(df, model_dict, category_name, categories):
    """Compute the mean score per (model, category) for the selected models.

    Args:
        df: results frame with at least the columns ``"model"``, ``"score"``
            and ``category_name``.
        model_dict: mapping of raw model identifier -> display name. Only
            models that are keys of this mapping and occur in ``df`` are
            reported.
        category_name: name of the column to group by (e.g. ``"category"`` or
            ``"lang"``).
        categories: values of ``category_name`` to report, in output order.

    Returns:
        pandas.DataFrame with columns ``["model", category_name, "score"]``.
        Models appear in reverse ``model_dict`` order, each with one row per
        entry of ``categories``. ``"model"`` holds the display name. A
        model/category pair with no valid rows gets a NaN score. When none of
        the requested models occur in ``df`` the result is an empty frame
        that still has these three columns.
    """
    columns = ["model", category_name, "score"]

    # Models are matched against everything present in df, even if all of
    # their scores turn out to be invalid (they then yield NaN means).
    present_models = set(df["model"].unique())

    # Negative scores mark score-format errors (<1% of cases) and are
    # excluded from the averages.
    valid = df[df["score"] >= 0]

    rows = []
    for model in reversed(list(model_dict)):
        if model not in present_models:
            continue
        model_rows = valid[valid["model"] == model]
        for cat in categories:
            mean_score = model_rows.loc[model_rows[category_name] == cat, "score"].mean()
            # The display name is written straight into the "model" column, so
            # a category/language value that happens to equal a raw model id
            # is never renamed by accident.
            rows.append({"model": model_dict[model], category_name: cat, "score": mean_score})

    # Passing `columns` keeps the schema stable even when `rows` is empty.
    return pd.DataFrame(rows, columns=columns)
def polar_subplot(fig, dframe, model_names, category_label, category_names, row, col, showlegend=True):
    """Add one closed polar trace per model to a subplot of ``fig``.

    Args:
        fig: figure created with polar-typed subplots; traces are added to it
            in place.
        dframe: aggregated scores with columns ``"model"`` (display name),
            ``category_label`` and ``"score"``.
        model_names: sequence of ``(raw_name, display_name)`` pairs; only the
            display name is used to select rows and name the trace.
        category_label: column of ``dframe`` holding the angular categories.
        category_names: mapping of category value -> axis label.
        row: 1-based subplot row.
        col: 1-based subplot column.
        showlegend: whether the traces added here get legend entries.

    Models that have no rows in ``dframe`` are skipped instead of raising.
    """
    colors = px.colors.qualitative.Plotly
    for i, (_, model_name) in enumerate(model_names):
        model_rows = dframe[dframe['model'] == model_name]
        if model_rows.empty:
            # Nothing to draw for this model in this subplot. `i` still
            # advances so colours and legend groups stay aligned across
            # subplots.
            continue

        labels = [category_names[x] for x in model_rows[category_label].tolist()]
        scores = model_rows['score'].tolist()
        # Repeat the first point at the end so the outline closes.
        labels.append(labels[0])
        scores.append(scores[0])

        polar = go.Scatterpolar(
            name=model_name,
            r=scores,
            theta=labels,
            legendgroup=f'{i}',
            # Wrap around the palette when there are more models than colours.
            marker=dict(color=colors[i % len(colors)]),
            hovertemplate="""Score: %{r:.2f}""",
            showlegend=showlegend,
        )
        fig.add_trace(polar, row, col)
def plot_agg_fn():
    """Build the aggregated Sea-Bench figure: two side-by-side polar charts.

    The left chart shows each model's mean score per task category, the right
    chart its mean score per language. Legend entries are emitted only by the
    left chart; traces of the same model share a legend group across both.

    Returns:
        The figure produced by ``make_subplots`` with all traces and layout
        settings applied.
    """
    df = get_model_df()
    model_names = list(rename_map.items())

    cat_df = aggregate_df(df, rename_map, "category", CATEGORIES)
    lang_df = aggregate_df(df, rename_map, "lang", LANGS)

    fig = make_subplots(
        rows=1, cols=2,
        specs=[[{'type': 'polar'}, {'type': 'polar'}]],
        subplot_titles=("By Category", "By Language"),
    )
    # The two annotations are the subplot titles; lift them above the charts.
    fig.layout.annotations[0].y = 1.05
    fig.layout.annotations[1].y = 1.05

    # Reuse the shared trace builder instead of repeating its loop inline.
    polar_subplot(fig, cat_df, model_names, 'category', CATEGORIES_NAMES, 1, 1, showlegend=True)
    polar_subplot(fig, lang_df, model_names, 'lang', LANG_NAMES, 1, 2, showlegend=False)

    polar_config = dict(
        angularaxis=dict(
            rotation=90,  # start position of angular axis
        ),
        radialaxis=dict(
            range=[0, 10],
        ),
    )
    fig.update_layout(
        polar=polar_config,
        polar2=polar_config,
        title='Sea-Bench (rated by GPT-4)',
    )
    return fig
def plot_by_lang_fn():
    """Build the per-language Sea-Bench figure: one polar chart per language.

    Languages are laid out row by row in ``LANGS`` order on a three-column
    grid. Each chart shows every model's mean score per category for that
    language. The "safety" category is only drawn for the languages listed
    in ``has_safety``. Legend entries are emitted by the first chart only.

    Returns:
        The figure produced by ``make_subplots`` with all traces and layout
        settings applied.
    """
    df = get_model_df()
    model_names = list(rename_map.items())

    n_cols = 3
    n_rows = -(-len(LANGS) // n_cols)  # ceiling division

    fig = make_subplots(
        rows=n_rows, cols=n_cols,
        # A fresh dict per cell, so no spec object is shared between cells.
        specs=[[{'type': 'polar'} for _ in range(n_cols)] for _ in range(n_rows)],
        # Titles must follow LANGS, the order in which the charts are filled
        # below. LANG_NAMES has a different key order, so iterating its values
        # would label most charts with the wrong language.
        subplot_titles=[LANG_NAMES[lang] for lang in LANGS],
    )
    # Nudge every subplot title slightly upwards.
    for annotation in fig.layout.annotations:
        annotation.y = annotation.y + 0.02

    has_safety = ('vi', 'id', 'th')
    categories_without_safety = [x for x in CATEGORIES if x != 'safety']
    for lang_id, lang in enumerate(LANGS):
        cat_names = CATEGORIES if lang in has_safety else categories_without_safety
        cat_lang_df = aggregate_df(df[df['lang'] == lang], rename_map, "category", cat_names)
        row = lang_id // n_cols + 1
        col = lang_id % n_cols + 1
        polar_subplot(
            fig, cat_lang_df, model_names, 'category', CATEGORIES_NAMES,
            row, col, showlegend=lang_id == 0,
        )

    polar_config = dict(
        angularaxis=dict(
            rotation=90,  # start position of angular axis
        ),
        radialaxis=dict(
            range=[0, 10],
        ),
    )
    # Layout keys are "polar", "polar2", "polar3", ... - one per grid cell.
    layer_kwargs = {
        ("polar" if i == 1 else f"polar{i}"): polar_config
        for i in range(1, n_rows * n_cols + 1)
    }
    fig.update_layout(
        title='Sea-Bench - By language (rated by GPT-4)',
        height=1000,
        **layer_kwargs
    )
    return fig
def both_plot():
    """Build both benchmark figures.

    Returns:
        A 2-tuple ``(aggregated_figure, by_language_figure)``, built in that
        order by ``plot_agg_fn`` and ``plot_by_lang_fn``.
    """
    aggregated_fig = plot_agg_fn()
    by_language_fig = plot_by_lang_fn()
    return aggregated_fig, by_language_fig
def attach_plot_to_demo(demo: gr.Blocks):
    """Add a collapsed benchmark section with a button that loads both plots.

    Creates an accordion (closed by default) containing a button and two plot
    components; clicking the button runs ``both_plot`` and shows its two
    figures in the "Aggregated" and "By language" plots.

    Args:
        demo: the Blocks app. It is not referenced by this function's body;
            presumably the caller invokes this inside the app's layout
            context - confirm against the call site.
    """
    with gr.Accordion("Psst... wanna see some performance benchmarks?", open=False):
        load_button = gr.Button("Load benchmark results")
        aggregated_plot = gr.Plot(label="Aggregated")
        by_language_plot = gr.Plot(label='By language')
        # The figures are built only on click, not when the page loads.
        load_button.click(both_plot, [], [aggregated_plot, by_language_plot])