# InstruSumEval/src/populate.py
import json
import pandas as pd
import yaml
from sklearn.metrics import cohen_kappa_score
import numpy as np
from datasets import load_dataset
from .envs import TOKEN
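
# Column datatypes for the leaderboard table: rank (number), Model (HTML link),
# and the four numeric metrics. Presumably consumed elsewhere in the app by a
# table/Dataframe component; this is an inference, not documented here.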
TYPES = ["number", "html", "number", "number", "number", "number"]
def read_json(file_path: str) -> list[dict]:
    """
    Read a JSON or JSONL file and return its contents as a list of dictionaries.

    Parameters:
        file_path (str): The path to the JSON/JSONL file.

    Returns:
        list[dict]: The contents of the file as a list of dictionaries.
    """
    try:
        # Try JSONL first: one JSON object per line.
        with open(file_path) as f:
            return [json.loads(x) for x in f]
    except json.JSONDecodeError:
        # Fall back to a regular JSON file containing a list of objects.
        with open(file_path) as f:
            return json.load(f)
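
# Illustrative usage (hypothetical path): works both for JSONL files with one
# JSON object per line and for plain JSON files containing a list of objects.
#
#   records = read_json("./predictions/some_model.jsonl")
#   print(len(records), records[0].keys())
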
def pairwise_compare(
evaluator1_responses: list[dict],
evaluator2_responses: list[dict],
) -> tuple[float, float]:
"""
Compare pairwise evaluators.
Args:
evaluator1_responses: The responses from the first evaluator.
evaluator2_responses: The responses from the second evaluator.
Returns:
None
"""
assert len(evaluator1_responses) == len(evaluator2_responses)
evaluator1_winners = np.array([response["winner"] for response in evaluator1_responses])
evaluator2_winners = np.array([response["winner"] for response in evaluator2_responses])
acc = (evaluator1_winners == evaluator2_winners).mean().item()
agreement = cohen_kappa_score(evaluator1_winners, evaluator2_winners)
return acc, agreement
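
# Illustrative example (hypothetical data): two evaluators agreeing on two of
# three pairwise comparisons.
#
#   e1 = [{"winner": "A"}, {"winner": "B"}, {"winner": "A"}]
#   e2 = [{"winner": "A"}, {"winner": "B"}, {"winner": "B"}]
#   acc, kappa = pairwise_compare(e1, e2)
#   # acc == 2/3; kappa == 0.4 (Cohen's kappa corrects the raw rate for chance agreement).
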
def pairwise_meta_eval(
    human_responses: list[dict], model_dir: str, model_dir_swap: str
) -> tuple[float, float, float, float]:
    """
    Evaluate a pairwise evaluator against human judgments.

    Args:
        human_responses: The responses from the human evaluators.
        model_dir: Path to the model's responses (JSONL).
        model_dir_swap: Path to the model's responses with the candidate order swapped.

    Returns:
        tuple[float, float, float, float]: Accuracy and agreement against the human
        judgments (averaged over the original and swapped orderings), plus the model's
        self-accuracy and self-agreement across the two orderings.
    """
    model_responses = read_json(model_dir)
    model_responses_swap = read_json(model_dir_swap)
    # Agreement with humans on the original and the swapped candidate order.
    acc, agr = pairwise_compare(human_responses, model_responses)
    swap_acc, swap_agr = pairwise_compare(
        human_responses,
        model_responses_swap,
    )
    # Average over both orderings to control for position bias.
    acc = (acc + swap_acc) / 2
    agr = (agr + swap_agr) / 2
    # Self-consistency: how often the model agrees with itself after the swap.
    models_acc, models_agr = pairwise_compare(
        model_responses,
        model_responses_swap,
    )
    return acc, agr, models_acc, models_agr
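
# Usage sketch (file names are illustrative): the swapped file holds the same
# model's judgments with the candidate order reversed, so the averaged scores
# control for position bias and the self-* scores measure consistency across
# the swap.
#
#   acc, agr, self_acc, self_agr = pairwise_meta_eval(
#       human_responses, "./predictions/model.jsonl", "./predictions/model_swap.jsonl"
#   )
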
def load_leaderboard() -> pd.DataFrame:
    """Load the leaderboard from the file system."""
    with open("./data/models.yaml") as fp:
        models = yaml.safe_load(fp)
    human_responses = load_dataset("salesforce/instrusum", "human_eval_pairwise", token=TOKEN)["data"]
    human_responses = list(human_responses)
    predictions = {k: [] for k in ["Model", "Accuracy", "Agreement", "Self-Accuracy", "Self-Agreement"]}
    for model in models:
        fdir = model["fdir"]
        # Score the model's judgments (original and swapped candidate order) against the humans.
        acc, agr, models_acc, models_agr = pairwise_meta_eval(
            human_responses, f"./predictions/{fdir}.jsonl", f"./predictions/{fdir}_swap.jsonl"
        )
        # Render the model name as an HTML link to its homepage.
        link = model["url"]
        model_name = model["name"]
        output = (
            f'<a target="_blank" href="{link}" style="color: var(--link-text-color); '
            f'text-decoration: underline; text-decoration-style: dotted;">{model_name}</a>'
        )
        predictions["Model"].append(output)
predictions["Accuracy"].append(acc)
predictions["Agreement"].append(agr)
predictions["Self-Accuracy"].append(models_acc)
predictions["Self-Agreement"].append(models_agr)
    df = pd.DataFrame(predictions).sort_values(by="Agreement", ascending=False).round(decimals=3)
    df.reset_index(drop=True, inplace=True)
    # Prepend a 1-based rank column (left unnamed so the header stays blank).
    df[" "] = pd.Series(range(1, len(df) + 1))
    columns = [" "] + [col for col in df.columns if col != " "]
    df = df[columns]
return df
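

# Minimal sketch for local inspection; assumes ./data/models.yaml, the
# ./predictions/*.jsonl files, and a valid TOKEN are available.
if __name__ == "__main__":
    leaderboard = load_leaderboard()
    # Avoid truncating the HTML anchors in the Model column when printing.
    with pd.option_context("display.max_colwidth", None):
        print(leaderboard.to_string(index=False))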