Spaces:
Sleeping
Sleeping
Nathan Habib
committed on
Commit
·
bb3c194
1
Parent(s):
56f8b5d
commit
Browse files- app.py +91 -0
- requirements.txt +1 -0
- utils.py +114 -0
app.py
ADDED
@@ -0,0 +1,91 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import gradio as gr
|
2 |
+
import pandas as pd
|
3 |
+
import os
|
4 |
+
from utils import construct_dataframe, MODELS, get_scores
|
5 |
+
|
6 |
+
# Hub access token read from the environment; None when HF_TOKEN is unset.
HF_TOKEN = os.environ.get("HF_TOKEN")

# All evaluation samples, built once when the module is imported.
DATAFRAME: pd.DataFrame = construct_dataframe()

# Passed as `max_lines` / `lines` to every Textbox in the UI below.
MAX_LINES = 500
MIN_LINES = 10
|
10 |
+
|
11 |
+
|
12 |
+
def _get_turn_details(model, question_id: int, turn: int):
    """Look up the sample stored for one (question, model, turn) triple.

    Args:
        model: Model name as stored in the ``model`` column of DATAFRAME.
        question_id: Value of the DATAFRAME index identifying the question.
        turn: Value of the ``turn`` column to select (0 or 1 in this app).

    Returns:
        A 5-tuple ``(prompt, response, judgement_prompt, judgement, score)``.
        When nothing matches (unknown question id, no model selected, or the
        row is absent) the tuple is ``("", "", "", "", None)`` so the UI is
        cleared instead of raising.
    """
    empty = ("", "", "", "", None)

    if question_id not in DATAFRAME.index:
        return empty

    # Indexing .loc with a list always yields a DataFrame, even when only a
    # single row carries this question id (a scalar key would give a Series
    # and break the column filters below).
    rows = DATAFRAME.loc[[question_id]]
    rows = rows[(rows["turn"] == turn) & (rows["model"] == model)]
    if rows.empty:
        return empty

    record = rows.iloc[0]
    return (
        record["prompt"],
        record["response"],
        # The judgement prompt is stored as a sequence of message dicts; the
        # element at position 1 is the one displayed.
        record["judgement_prompt"][1]["content"],
        record["judgment"],
        record["score"],
    )


def get_from_question_id_turn_2(model, question_id: int):
    """Return the display values for the second turn (``turn == 1``).

    See ``_get_turn_details`` for the shape of the returned tuple.
    """
    return _get_turn_details(model, question_id, turn=1)


def get_from_question_id_turn_1(model, question_id: int):
    """Return the display values for the first turn (``turn == 0``).

    See ``_get_turn_details`` for the shape of the returned tuple.
    """
    return _get_turn_details(model, question_id, turn=0)
|
38 |
+
|
39 |
+
|
40 |
+
# Build the UI: selectors on top, the aggregated score table, then one column
# per conversation turn showing prompt / response / judgement details.
with gr.Blocks() as demo:
    with gr.Row():
        # Display names are the part after "__" in each MODELS entry.
        model_names = [name.split("__")[1] for name in MODELS]
        # De-duplicate while keeping first-appearance order so the dropdown
        # order is deterministic (a plain set has no defined order).
        question_ids = DATAFRAME.index.unique().tolist()

        # Pre-select the first entry of each dropdown so a lookup never runs
        # with an empty selection.
        model = gr.Dropdown(
            model_names,
            label="Model",
            value=model_names[0] if model_names else None,
        )
        index = gr.Dropdown(
            question_ids,
            label="Index",
            value=question_ids[0] if question_ids else None,
        )

    with gr.Row():
        gr.DataFrame(get_scores(DATAFRAME).reset_index(), interactive=False)

    with gr.Row():
        with gr.Column():
            gr.Markdown("## Turn 1")
            score_lighteval = gr.Number(label="Score", interactive=False)
            prompt_lighteval = gr.Textbox(
                label="Prompt", interactive=False, max_lines=MAX_LINES, lines=MIN_LINES
            )
            response_lighteval = gr.Textbox(
                label="Response", interactive=False, max_lines=MAX_LINES, lines=MIN_LINES
            )
            judgement_prompt_lighteval = gr.Textbox(
                label="Judgement Prompt", interactive=False, max_lines=MAX_LINES, lines=MIN_LINES
            )
            judgement_lighteval = gr.Textbox(
                label="Judgement", interactive=False, max_lines=MAX_LINES, lines=MIN_LINES
            )
        with gr.Column():
            gr.Markdown("## Turn 2")
            score_lighteval_2 = gr.Number(label="Score", interactive=False)
            prompt_lighteval_2 = gr.Textbox(
                label="Prompt", interactive=False, max_lines=MAX_LINES, lines=MIN_LINES
            )
            response_lighteval_2 = gr.Textbox(
                label="Response", interactive=False, max_lines=MAX_LINES, lines=MIN_LINES
            )
            judgement_prompt_lighteval_2 = gr.Textbox(
                label="Judgement Prompt", interactive=False, max_lines=MAX_LINES, lines=MIN_LINES
            )
            judgement_lighteval_2 = gr.Textbox(
                label="Judgement", interactive=False, max_lines=MAX_LINES, lines=MIN_LINES
            )

    # Output order matches the tuple returned by the lookup functions:
    # prompt, response, judgement prompt, judgement, score.
    turn_1_outputs = [
        prompt_lighteval,
        response_lighteval,
        judgement_prompt_lighteval,
        judgement_lighteval,
        score_lighteval,
    ]
    turn_2_outputs = [
        prompt_lighteval_2,
        response_lighteval_2,
        judgement_prompt_lighteval_2,
        judgement_lighteval_2,
        score_lighteval_2,
    ]

    # Refresh both columns when the page loads and whenever either selector
    # changes.
    for register in (demo.load, index.change, model.change):
        register(
            fn=get_from_question_id_turn_1,
            inputs=[model, index],
            outputs=turn_1_outputs,
        )
        register(
            fn=get_from_question_id_turn_2,
            inputs=[model, index],
            outputs=turn_2_outputs,
        )

demo.launch()
|
requirements.txt
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
plotly
|
utils.py
ADDED
@@ -0,0 +1,114 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import pandas as pd
|
2 |
+
from datasets import load_dataset
|
3 |
+
import os
|
4 |
+
import json
|
5 |
+
from pprint import pprint
|
6 |
+
# Route DataFrame.plot through plotly.
pd.set_option("plotting.backend", "plotly")

# Models whose evaluation details are loaded. Each entry has the form
# "<org>__<model name>"; commented-out entries are currently disabled.
MODELS = [
    "mistralai__Mistral-7B-Instruct-v0.2",
    # "HuggingFaceH4__zephyr-7b-beta",
    # "meta-llama__Llama-2-7b-chat-hf",
    # "01-ai__Yi-34B-Chat",
]

# Hub access token read from the environment; None when HF_TOKEN is unset.
HF_TOKEN = os.environ.get("HF_TOKEN")

# Maps a turn number to the key of that turn's score in a sample's "metrics".
score_turn = {
    1: "multi_turn",
    0: "single_turn",
}
|
21 |
+
|
22 |
+
def get_dataframe_lighteval() -> pd.DataFrame:
    """Load the evaluation details of every model in MODELS into one dataframe.

    For each model the ``extended_mt_bench_0`` configuration of its details
    dataset is loaded (split ``latest``), and every turn of every sample
    becomes one row.

    Returns:
        A dataframe with the columns ``model``, ``turn``, ``prompt``,
        ``response``, ``judgement_prompt``, ``judgment``, ``score`` and
        ``question_id``.
    """
    samples = []
    for model in MODELS:
        details_lighteval = load_dataset(
            f"SaylorTwift/details_{model}_private",
            "extended_mt_bench_0",
            split="latest",
            token=HF_TOKEN,
        )

        for d in details_lighteval:
            # "judement_prompt" (sic) is the key used by the details dataset.
            judgement_prompts = d["judement_prompt"]
            judgements = d["judgement"]
            predictions = d["predictions"][0]
            prompts = d["full_prompt"]

            for turn, prediction in enumerate(predictions):
                prompt = prompts[turn]
                if turn == 1:
                    # The second-turn prompt is a template that embeds the
                    # model's answer to the previous turn.
                    prompt = prompt.format(model_response=predictions[turn - 1])

                samples.append(
                    {
                        "model": model,
                        "turn": turn,
                        "prompt": prompt,
                        "response": prediction,
                        "judgement_prompt": judgement_prompts[turn],
                        "judgment": judgements[turn],
                        "score": d["metrics"][score_turn[turn]],
                        "question_id": d["specifics"]["id"],
                    }
                )

    return pd.DataFrame(samples)
|
69 |
+
|
70 |
+
|
71 |
+
|
72 |
+
|
73 |
+
def construct_dataframe() -> pd.DataFrame:
    """
    Build the dataframe of evaluation samples indexed by question id.

    The samples come from get_dataframe_lighteval(). The ``model`` column is
    reduced to the part after "__", and rows containing missing values are
    dropped.
    """
    frame = get_dataframe_lighteval()
    frame["model"] = frame["model"].map(lambda name: name.split("__")[1])

    # Round-tripping through a three-level index moves question_id, turn and
    # model to the front of the column order before re-indexing.
    frame = frame.set_index(["question_id", "turn", "model"]).reset_index()

    return frame.set_index("question_id").dropna()
|
84 |
+
|
85 |
+
|
86 |
+
def create_plot(model: str, dataframe: pd.DataFrame):
    """Plot the second-turn scores of one model as grouped bars.

    Args:
        model: Value of the ``model`` column to select.
        dataframe: Frame with at least the columns ``model``, ``turn``,
            ``score_lighteval`` and ``score_mt_bench``.
            NOTE(review): construct_dataframe() in this file only produces a
            single ``score`` column, so its output cannot be passed here
            as-is - confirm the intended input.

    Returns:
        The figure produced by ``DataFrame.plot.bar``. The ``labels`` and
        ``barmode`` keywords are forwarded to the plotting backend, which
        this module sets to plotly.
    """
    rows = dataframe[dataframe["model"] == model].dropna()
    rows = rows[rows["turn"] == 1]

    # Cast on the column selection, which returns a new frame, instead of
    # assigning columns into a filtered slice (that triggers pandas'
    # chained-assignment warning).
    scores = rows[["score_lighteval", "score_mt_bench"]].astype(int)
    scores.index = scores.index.astype(str)

    return scores.plot.bar(
        title="Scores",
        labels={"index": "Index", "value": "Score"},
        barmode="group",
    )
|
97 |
+
|
98 |
+
|
99 |
+
def get_scores(dataframe):
    """Return the mean score per model, averaged over turns.

    Rows with missing values are dropped and scores are cast to int. Scores
    are first averaged per (model, turn), then those per-turn means are
    averaged per model. The input frame is not modified.

    Returns:
        A dataframe indexed by ``model`` with a single ``score`` column.
    """
    cleaned = dataframe.dropna()
    per_row = cleaned[["score", "turn", "model"]].assign(
        score=cleaned["score"].astype(int)
    )
    per_turn = per_row.groupby(["model", "turn"]).mean()
    return per_turn.groupby(["model"]).mean()
|
106 |
+
|
107 |
+
if __name__ == "__main__":
    # Manual smoke check: build the sample dataframe and pretty-print it.
    # pprint is already imported at the top of the module.
    df = construct_dataframe()
    pprint(df)
|