|
import streamlit as st |
|
import pandas as pd |
|
from PIL import Image |
|
import base64 |
|
from io import BytesIO |
|
import random |
|
import plotly.graph_objects as go |
|
|
|
|
|
# Labels a judge can assign to a pairwise comparison of completion "A"
# (shown first) and completion "B" (shown second).
MAJOR_A_WIN = "A>>B"
MINOR_A_WIN = "A>B"
MINOR_B_WIN = "B>A"
MAJOR_B_WIN = "B>>A"
TIE = "A=B"

# Measurement ID used by the gtag.js snippet below; kept in one place so the
# loader URL and the `config` call cannot drift apart.
_GA_MEASUREMENT_ID = "G-EVZ0R7014L"

# gtag.js snippet that `main` writes into the page via `st.markdown`.
# Doubled braces are literal JavaScript braces inside the f-string.
GA_TRACKING_CODE = f"""
<script async src="https://www.googletagmanager.com/gtag/js?id={_GA_MEASUREMENT_ID}"></script>
<script>
window.dataLayer = window.dataLayer || [];
function gtag(){{dataLayer.push(arguments);}}
gtag('js', new Date());

gtag('config', '{_GA_MEASUREMENT_ID}');
</script>
"""
|
|
|
|
|
def is_consistent(rating, reverse_rating):
    """Return True when two position-swapped verdicts agree with each other.

    ``rating`` is the verdict for one ordering of a pair of completions and
    ``reverse_rating`` is the verdict for the same pair with positions
    swapped. Because the "A"/"B" labels follow position, the verdicts agree
    when one names A as the winner and the other names B (in either major or
    minor strength), or when both are ties. Every other combination,
    including a tie on only one side or an unrecognised label, is
    inconsistent.
    """
    a_wins = {MAJOR_A_WIN, MINOR_A_WIN}
    b_wins = {MAJOR_B_WIN, MINOR_B_WIN}
    ties = {TIE}

    # Same underlying winner seen from both orderings.
    winner_flips_with_position = (
        rating in a_wins and reverse_rating in b_wins
    ) or (rating in b_wins and reverse_rating in a_wins)

    tied_both_ways = rating in ties and reverse_rating in ties

    return winner_flips_with_position or tied_both_ways
|
|
|
|
|
|
|
def pil_to_base64(img):
    """Encode ``img`` as PNG in memory and return the bytes as base64 text.

    ``img`` only needs a ``save(file_obj, format=...)`` method; the result is
    a ``str`` suitable for a ``data:image/png;base64,...`` URI.
    """
    png_buffer = BytesIO()
    img.save(png_buffer, format="PNG")
    png_bytes = png_buffer.getvalue()
    return base64.b64encode(png_bytes).decode()
|
|
|
|
|
def _first_row_or_none(frame):
    """Return the first row of ``frame`` as a Series, or None if it has no rows."""
    if frame.empty:
        return None
    return frame.iloc[0]


def _colored_text_box(text, background_color, text_color="black"):
    """Return an HTML ``<div>`` that shows ``text`` in a padded, rounded box."""
    return f"""
<div style="
background-color: {background_color};
color: {text_color};
padding: 10px;
border-radius: 10px;
">
{text}
</div>
"""


def _render_centered_image(image_path, width=1000):
    """Inline the image at ``image_path`` as a centered base64 ``<img>`` tag."""
    # The context manager closes the underlying file once the PNG bytes have
    # been produced.
    with Image.open(image_path) as image:
        img_base64 = pil_to_base64(image)
    centered_image_html = f"""
<div style="text-align: center;">
<img src="data:image/png;base64,{img_base64}" width="{width}"/>
</div>
"""
    st.markdown(centered_image_html, unsafe_allow_html=True)


def _render_response(response_details):
    """Show one model response row, or a notice when no row was found."""
    if response_details is None:
        st.write("No response found for the selected combination.")
        return
    st.markdown(
        _colored_text_box(response_details["response_string"], "#028391", "white"),
        unsafe_allow_html=True,
    )


def _render_judging_details(judging_details):
    """Show one judging row (verdict + rationale), or a notice when missing."""
    if judging_details is None:
        st.write("No judging details found for the selected combination.")
        return
    st.write(f"**Pairwise Choice:** {judging_details['pairwise_choice']}")
    st.markdown(
        _colored_text_box(
            judging_details["judging_response_string"], "#FEAE6F", "black"
        ),
        unsafe_allow_html=True,
    )


def _render_header():
    """Render analytics/CSS, the title block, link buttons, hero image and abstract."""
    center_css = """
<style>
h1, h2, h3, h6{
text-align: center;
}
</style>
"""

    st.markdown(GA_TRACKING_CODE, unsafe_allow_html=True)
    st.markdown(center_css, unsafe_allow_html=True)

    st.title("Language Model Council")
    st.markdown(
        "### Benchmarking Foundation Models on Highly Subjective Tasks by Consensus :classical_building:"
    )
    st.markdown(
        "###### [Justin Zhao](https://www.justinxzhao.com/)¹, [Flor Miriam Plaza-del-Arco](https://fmplaza.github.io/)², [Amanda Cercas Curry](https://amandacurry.github.io/)²"
    )
    st.markdown("###### ¹ Predibase, ² Bocconi University")

    # Four equally sized buttons between two wide spacer columns.
    _, *link_columns, _ = st.columns([0.3, 0.1, 0.1, 0.1, 0.1, 0.3])
    links = [
        ("Data", "https://huggingface.co/datasets/llm-council/emotional_application"),
        ("Paper", "https://arxiv.org/abs/2406.08598"),
        ("Github", "https://github.com/llm-council/llm-council"),
        ("Website", "https://llm-council.com/"),
    ]
    for column, (label, url) in zip(link_columns, links):
        with column:
            st.link_button(label, url, use_container_width=True, type="primary")

    # Explicit encoding so the read does not depend on the platform default.
    with open("img/hero.svg", "r", encoding="utf-8") as file:
        svg_content = file.read()

    _, cent_co, _ = st.columns([0.2, 0.6, 0.2])
    with cent_co:
        st.image(svg_content, use_column_width=True)

    with cent_co.expander("Abstract"):
        st.markdown(
            """The rapid advancement of Large Language Models (LLMs) necessitates robust
and challenging benchmarks. Leaderboards like Chatbot Arena rank LLMs based
on how well their responses align with human preferences. However, many tasks
such as those related to emotional intelligence, creative writing, or persuasiveness,
are highly subjective and often lack majoritarian human agreement. Judges may
have irreconcilable disagreements about what constitutes a better response. To
address the challenge of ranking LLMs on highly subjective tasks, we propose
a novel benchmarking framework, the Language Model Council (LMC). The
LMC operates through a democratic process to: 1) formulate a test set through
equal participation, 2) administer the test among council members, and 3) evaluate
responses as a collective jury. We deploy a council of 20 newest LLMs on an
open-ended emotional intelligence task: responding to interpersonal dilemmas.
Our results show that the LMC produces rankings that are more separable, robust,
and less biased than those from any individual LLM judge, and is more consistent
with a human-established leaderboard compared to other benchmarks."""
        )
    # NOTE(review): placed outside the expander so the summary is always
    # visible; confirm this matches the intended nesting.
    st.markdown(
        "This leaderboard comes from deploying a Council of 20 LLMs on an **open-ended emotional intelligence task: responding to interpersonal dilemmas**."
    )


def _render_leaderboard_tab(df_leaderboard):
    """Render the leaderboard bar chart with error bars, then the raw table."""
    _, mid_column, _ = st.columns([0.2, 0.6, 0.2])
    mid_column.markdown("#### Leaderboard Graph")

    # Each cell is split on spaces into three tokens: the score, a lower
    # offset wrapped in a leading "(" and trailing ",", and an upper offset
    # followed by ")".
    score_column = "Council Arena EI Score (95% CI)"
    df = df_leaderboard.copy()
    df["Score"] = df[score_column].apply(lambda x: float(x.split(" ")[0]))
    df["Lower"] = df[score_column].apply(lambda x: float(x.split(" ")[1][1:-1]))
    df["Upper"] = df[score_column].apply(lambda x: float(x.split(" ")[2][:-1]))

    df = df.sort_values(by="Score", ascending=False)

    fig = go.Figure()

    # One hue per bar, spread evenly around the colour wheel.
    num_bars = len(df)
    colors = [f"hsl({int(360 / num_bars * i)}, 100%, 50%)" for i in range(num_bars)]

    fig.add_trace(
        go.Bar(
            x=df["Score"],
            y=df["LLM"],
            orientation="h",
            error_x=dict(
                type="data",
                array=df["Upper"],
                # "Lower" is negated to give the minus-side bar length.
                arrayminus=-1 * df["Lower"],
                thickness=0.5,
                width=3,
                color="black",
            ),
            marker=dict(color=colors, opacity=0.8),
        )
    )

    fig.update_layout(
        xaxis=dict(title="Council Emotional Intelligence Score", showgrid=True),
        yaxis_title="LLM",
        yaxis=dict(autorange="reversed"),
        template="presentation",
        width=1000,
        height=700,
    )

    mid_column.plotly_chart(fig)

    mid_column.divider()

    mid_column.markdown("#### Leaderboard Table")

    mid_column.dataframe(df_leaderboard, hide_index=True)


def _render_browse_tab(
    df_test_set,
    df_responses,
    df_response_judging,
    scenario_options,
    model_options,
    judge_options,
):
    """Render the scenario / response / judging browser.

    The current picks live in ``st.session_state.selected_scenario``,
    ``selected_model`` and ``selected_judge``. Each select box takes its
    ``index`` from those entries, which is how the "Randomize!" button moves
    the widgets. Lookups that find no matching row show a short notice.
    """
    # Model shown on the left-hand side of every comparison.
    fixed_model = "qwen1.5-32B-Chat"

    for state_key in ("selected_scenario", "selected_model", "selected_judge"):
        if state_key not in st.session_state:
            st.session_state[state_key] = None

    # Callbacks copying each widget's value into the `selected_*` entries.
    def update_scenario():
        st.session_state.selected_scenario = st.session_state.scenario_selector

    def update_model():
        st.session_state.selected_model = st.session_state.model_selector

    def update_judge():
        st.session_state.selected_judge = st.session_state.judge_selector

    def randomize_selection():
        st.session_state.selected_scenario = random.choice(scenario_options)
        st.session_state.selected_model = random.choice(model_options)
        st.session_state.selected_judge = random.choice(judge_options)

    _, mid_column, _ = st.columns([0.4, 0.2, 0.4])
    mid_column.button(
        ":game_die: Randomize!",
        on_click=randomize_selection,
        type="primary",
        use_container_width=True,
    )

    st.markdown("#### 1. Select a scenario.")

    st.session_state.selected_scenario = st.selectbox(
        "Select Scenario",
        scenario_options,
        label_visibility="hidden",
        key="scenario_selector",
        on_change=update_scenario,
        index=(
            scenario_options.index(st.session_state.selected_scenario)
            if st.session_state.selected_scenario
            else 0
        ),
    )

    selected_emobench_id = None
    if st.session_state.selected_scenario:
        # Options are formatted "<emobench_id>: <scenario>".
        selected_emobench_id = int(st.session_state.selected_scenario.split(": ")[0])
        scenario_details = df_test_set[
            df_test_set["emobench_id"] == selected_emobench_id
        ].iloc[0]

        st.markdown(
            _colored_text_box(
                scenario_details["detailed_dilemma"],
                "#01204E",
                "white",
            ),
            unsafe_allow_html=True,
        )
        with st.expander("Additional Information"):
            st.write(
                {
                    "LLM Author": scenario_details["llm_author"],
                    "Problem": scenario_details["problem"],
                    "Relationship": scenario_details["relationship"],
                    "Scenario": scenario_details["scenario"],
                }
            )

    st.divider()

    st.markdown("#### 2. View responses.")

    col1, col2 = st.columns(2)

    with col1:
        st.selectbox(
            "Select Model",
            [fixed_model],
            key="fixed_model",
            label_visibility="hidden",
        )

        if st.session_state.selected_scenario:
            response_details_fixed = _first_row_or_none(
                df_responses[
                    (df_responses["emobench_id"] == selected_emobench_id)
                    & (df_responses["llm_responder"] == fixed_model)
                ]
            )
            _render_response(response_details_fixed)

    with col2:
        st.session_state.selected_model = st.selectbox(
            "Select Model",
            model_options,
            key="model_selector",
            on_change=update_model,
            index=(
                model_options.index(st.session_state.selected_model)
                if st.session_state.selected_model
                else 0
            ),
        )

        if st.session_state.selected_model and st.session_state.selected_scenario:
            response_details_dynamic = _first_row_or_none(
                df_responses[
                    (df_responses["emobench_id"] == selected_emobench_id)
                    & (df_responses["llm_responder"] == st.session_state.selected_model)
                ]
            )
            _render_response(response_details_dynamic)

    st.divider()

    st.markdown("#### 3. Response judging.")
    st.markdown("##### All council members")
    col1, col2 = st.columns(2)

    # NOTE(review): the masks below select on the model pair (and judge) only;
    # the selected scenario is not part of the filter, although the banner
    # text says "on this example". Confirm whether a per-scenario filter was
    # intended.
    fixed_first = (df_response_judging["first_completion_by"] == fixed_model) & (
        df_response_judging["second_completion_by"] == st.session_state.selected_model
    )
    fixed_second = (
        df_response_judging["first_completion_by"] == st.session_state.selected_model
    ) & (df_response_judging["second_completion_by"] == fixed_model)

    with col1:
        st.write(f"**{fixed_model}** vs **{st.session_state.selected_model}**")
        pairwise_counts_left = df_response_judging[fixed_first][
            "pairwise_choice"
        ].value_counts()
        st.bar_chart(pairwise_counts_left)

    with col2:
        st.write(f"**{st.session_state.selected_model}** vs **{fixed_model}**")
        pairwise_counts_right = df_response_judging[fixed_second][
            "pairwise_choice"
        ].value_counts()
        st.bar_chart(pairwise_counts_right)

    st.markdown("##### Individual LLM judges")
    st.session_state.selected_judge = st.selectbox(
        "Select Judge",
        judge_options,
        label_visibility="hidden",
        key="judge_selector",
        on_change=update_judge,
        index=(
            judge_options.index(st.session_state.selected_judge)
            if st.session_state.selected_judge
            else 0
        ),
    )

    if st.session_state.selected_judge and st.session_state.selected_scenario:
        col1, col2 = st.columns(2)

        by_judge = df_response_judging["llm_judge"] == st.session_state.selected_judge
        judging_details_left = _first_row_or_none(
            df_response_judging[by_judge & fixed_first]
        )
        judging_details_right = _first_row_or_none(
            df_response_judging[by_judge & fixed_second]
        )

        # Consistency can only be assessed when both orderings were judged.
        if judging_details_left is not None and judging_details_right is not None:
            if is_consistent(
                judging_details_left["pairwise_choice"],
                judging_details_right["pairwise_choice"],
            ):
                st.success(
                    f"{st.session_state.selected_judge} as a judge was consistent on this example with positions flipped.",
                    icon="✅",
                )
            else:
                st.warning(
                    f"{st.session_state.selected_judge} as a judge was inconsistent on this example with positions flipped.",
                    icon="⚠️",
                )

        with col1:
            _render_judging_details(judging_details_left)

        with col2:
            _render_judging_details(judging_details_right)


def _render_analysis_tab():
    """Render the pre-generated analysis figures from the ``img/`` directory."""
    st.markdown("### Battles (Respondent vs. Respondent)")
    st.markdown("###### Expected win rates based on Terry-Bradley coefficients")
    _render_centered_image("img/llm_vs_llm_win_rates.png")

    st.divider()

    st.markdown("### Affinities (Judge vs. Respondent)")

    st.markdown("###### Raw affinities")
    _render_centered_image("img/raw.png")

    # Empty text elements used as vertical spacing between the two figures.
    st.text("")
    st.text("")
    st.text("")

    st.markdown("###### Council-Normalized")
    _render_centered_image("img/council_normalized.png")

    st.divider()

    st.markdown("### Agreement (Judge vs. Judge)")

    st.markdown("###### Sidewise Cohen's Kappa:")
    _render_centered_image("img/judge_agreement.sidewise_cohen_kappa.png")

    st.write("Check out the paper for more detailed analysis!")


def _render_about_tab():
    """Render the motivation text, collaborator links and citation entry."""
    st.markdown(
        """**Motivation**:

Good LLM evaluations are [really hard](https://www.jasonwei.net/blog/evals), and newly released models often make their own claims about being the best at something, often citing its position on a benchmark or a leaderboard. But what if we let the models themselves decide who's the best?

**Main collaborators**:
- [Justin Zhao](https://x.com/justinxzhao)
- [Flor Plaza](https://x.com/florplaza22)
- [Sam Paech](https://x.com/sam_paech)
- [Federico Bianchi](https://x.com/federicobianchy)
- [Sahand Sabour](https://x.com/SahandSabour)
- [Amanda Cercas Curry](https://x.com/CurriedAmanda)
"""
    )

    # NOTE(review): kept inside the About tab; confirm intended nesting.
    with st.expander("Citation"):
        st.write(
            "Please cite the following paper if you find our leaderboard, dataset, or framework helpful."
        )
        # The `Year` field ends with a comma so the entry parses as BibTeX.
        st.code(
            """@misc{zhao2024council,
Title = {Language Model Council: Benchmarking Foundation Models on Highly Subjective Tasks by Consensus},
Author = {Justin Zhao and Flor Miriam Plaza-del-Arco and Amanda Cercas Curry},
Year = {2024},
Eprint = {arXiv:2406.08598},
}"""
        )


def main():
    """Load the data files and render the Language Model Council page.

    Reads ``data/test_set.jsonl``, ``data/responses.jsonl``,
    ``data/response_judging.jsonl`` and ``data/leaderboard_6_11.csv``,
    configures the page, then renders the header and the four tabs
    (leaderboard, data browser, analysis, about).
    """
    df_test_set = pd.read_json("data/test_set.jsonl", lines=True)
    df_responses = pd.read_json("data/responses.jsonl", lines=True)
    df_response_judging = pd.read_json("data/response_judging.jsonl", lines=True)
    df_leaderboard = (
        pd.read_csv("data/leaderboard_6_11.csv")
        .sort_values("Rank")
        .reset_index(drop=True)
    )
    df_leaderboard = df_leaderboard.rename(
        columns={"EI Score": "Council Arena EI Score (95% CI)"}
    )

    # Select-box label "<emobench_id>: <scenario>"; the id is parsed back out
    # of the chosen label in the browse tab.
    df_test_set["scenario_option"] = (
        df_test_set["emobench_id"].astype(str) + ": " + df_test_set["scenario"]
    )
    scenario_options = df_test_set["scenario_option"].tolist()
    model_options = df_responses["llm_responder"].unique().tolist()
    judge_options = df_response_judging["llm_judge"].unique().tolist()

    st.set_page_config(
        page_title="Language Model Council", page_icon="🏛️", layout="wide"
    )

    _render_header()

    tabs = st.tabs(
        [
            "Leaderboard Results",
            "Browse Data",
            "Analysis",
            "About Us",
        ]
    )

    with tabs[0]:
        _render_leaderboard_tab(df_leaderboard)

    with tabs[1]:
        _render_browse_tab(
            df_test_set,
            df_responses,
            df_response_judging,
            scenario_options,
            model_options,
            judge_options,
        )

    with tabs[2]:
        _render_analysis_tab()

    with tabs[-1]:
        _render_about_tab()
|
|
|
|
|
# Entry point: render the app when this file is executed as a script.
if __name__ == "__main__":
    main()
|
|