|
import gradio as gr |
|
import pandas as pd |
|
import plotly.express as px |
|
|
|
# Columns attached to every plotted point as Plotly custom data. The position
# of a name in this list is the customdata[i] index used by the hover templates
# below, so the order matters.
ATTN_DATA = [
    # Model identity
    "Model π€", "Experiment π§ͺ", "Params (B)", "Architecture ποΈ", "Open LLM Score (%)",
    # Run configuration
    "Backend π", "Quantization ποΈ", "Precision π₯", "Attention ποΈ", "Kernel βοΈ",
    # Raw measurements
    "Prefill (s)", "Decode (tokens/s)",
    # Derived by get_attn_df
    "Prefill Speedup (%)", "Decode Speedup (%)",
]
|
|
|
|
|
def get_attn_df(open_llm_perf_df):
    """Pair each Eager-attention run with its SDPA / FAv2 counterpart.

    The input frame is copied, never mutated. Rows whose attention column is
    "Eager" are inner-joined (pandas' default) with the "SDPA" rows and,
    separately, with the "FAv2" rows, matching on the model column and on a
    combined "Quantization & Kernel" key. Columns coming from the Eager side
    keep their names; columns from the compared side get a " other" suffix.

    Two columns are then added:

    * "Prefill Speedup (%)": ``Prefill (s) / Prefill (s) other`` as a
      percentage change.
    * "Decode Speedup (%)": ``Decode (tokens/s) other / Decode (tokens/s)``
      as a percentage change.

    Args:
        open_llm_perf_df: DataFrame holding at least the model, quantization,
            kernel, attention, "Prefill (s)" and "Decode (tokens/s)" columns.

    Returns:
        The concatenation of the SDPA comparison rows followed by the FAv2
        comparison rows. Index labels are kept from each merge, so they may
        repeat across the two groups.
    """
    copy_df = open_llm_perf_df.copy()
    copy_df["Quantization & Kernel"] = (
        copy_df["Quantization ποΈ"] + " & " + copy_df["Kernel βοΈ"]
    )

    merge_keys = ["Model π€", "Quantization & Kernel"]
    eager_df = copy_df[copy_df["Attention ποΈ"] == "Eager"]

    # Same join for every non-eager implementation; SDPA rows come first.
    compared_frames = []
    for attention in ("SDPA", "FAv2"):
        other_df = copy_df[copy_df["Attention ποΈ"] == attention]
        compared_frames.append(
            pd.merge(eager_df, other_df, on=merge_keys, suffixes=("", " other"))
        )
    attn_df = pd.concat(compared_frames)

    # Subtract the 100 % baseline *before* rounding: rounding first and then
    # subtracting reintroduces float noise (123.45 - 100 -> 23.450000000000003)
    # that would leak into the hover labels.
    attn_df["Prefill Speedup (%)"] = (
        (attn_df["Prefill (s)"] / attn_df["Prefill (s) other"]) * 100 - 100
    ).round(2)
    attn_df["Decode Speedup (%)"] = (
        (attn_df["Decode (tokens/s) other"] / attn_df["Decode (tokens/s)"]) * 100 - 100
    ).round(2)

    return attn_df
|
|
|
|
|
def get_attn_prefill_fig(open_llm_perf_df):
    """Return a box plot of prefill speedup, grouped by architecture.

    The data comes from get_attn_df(open_llm_perf_df). Boxes are colored by
    the attention implementation being compared against Eager, and every
    individual point is drawn with a hover label listing the ATTN_DATA columns.
    """
    speedup_df = get_attn_df(open_llm_perf_df)

    # One hover line per ATTN_DATA column. The doubled braces produce a literal
    # "%{customdata[i]}" placeholder for Plotly to fill in.
    hover_lines = []
    for slot, label in enumerate(ATTN_DATA):
        hover_lines.append(f"<b>{label}:</b> %{{customdata[{slot}]}}")
    hover_template = "<br>".join(hover_lines)

    title_spec = {
        "text": "Prefill Speedup per Architecture, Compared To Eager Attention",
        "x": 0.5,
        "y": 0.95,
        "xanchor": "center",
        "yanchor": "top",
    }

    fig = px.box(
        speedup_df,
        x="Architecture ποΈ",
        y="Prefill Speedup (%)",
        color="Attention ποΈ other",
        color_discrete_sequence=px.colors.qualitative.Light24,
        custom_data=ATTN_DATA,
        points="all",
    )
    fig.update_traces(hovertemplate=hover_template)
    fig.update_layout(
        title=title_spec,
        xaxis_title="LLM Architecture",
        yaxis_title="Prefill Speedup (%)",
        legend_title="Attention",
        width=1200,
        height=600,
    )
    return fig
|
|
|
|
|
def get_attn_decode_fig(open_llm_perf_df):
    """Return a box plot of decode speedup, grouped by architecture.

    The data comes from get_attn_df(open_llm_perf_df). Boxes are colored by
    the attention implementation being compared against Eager, and every
    individual point is drawn with a hover label listing the ATTN_DATA columns.

    Args:
        open_llm_perf_df: DataFrame accepted by get_attn_df.

    Returns:
        The configured Plotly figure (1200x600).
    """
    attn_df = get_attn_df(open_llm_perf_df)

    decode_fig = px.box(
        attn_df,
        x="Architecture ποΈ",
        y="Decode Speedup (%)",
        color_discrete_sequence=px.colors.qualitative.Light24,
        custom_data=ATTN_DATA,
        color="Attention ποΈ other",
        points="all",
    )

    # One hover line per ATTN_DATA column. The doubled braces produce a literal
    # "%{customdata[i]}" placeholder for Plotly to fill in.
    decode_fig.update_traces(
        hovertemplate="<br>".join(
            [
                f"<b>{column}:</b> %{{customdata[{i}]}}"
                for i, column in enumerate(ATTN_DATA)
            ]
        )
    )

    decode_fig.update_layout(
        title={
            "text": "Decode Speedup per Architecture, Compared To Eager Attention",
            "xanchor": "center",
            "yanchor": "top",
            "y": 0.95,
            "x": 0.5,
        },
        yaxis_title="Decode Speedup (%)",
        xaxis_title="LLM Architecture",
        legend_title="Attention",
        width=1200,
        height=600,
    )

    return decode_fig
|
|
|
|
|
def create_attn_plots(open_llm_perf_df):
    """Create the hover hint and the two attention-speedup plot components.

    An HTML hint is created first, then both figures are built, then one
    gr.components.Plot per figure is created (prefill before decode).
    Presumably called inside an active Gradio layout context -- confirm
    against the caller.

    Returns:
        A (prefill_plot, decode_plot) tuple of Plot components.
    """
    gr.HTML("π Hover over the points π for additional information.", elem_id="text")

    # Build both figures before instantiating any Plot component.
    figures = [
        get_attn_prefill_fig(open_llm_perf_df),
        get_attn_decode_fig(open_llm_perf_df),
    ]

    plots = [
        gr.components.Plot(value=figure, elem_id="plot", show_label=False)
        for figure in figures
    ]
    prefill_plot, decode_plot = plots

    return prefill_plot, decode_plot
|
|