Spaces:

lmarena-ai
/

preference-proxy-evaluations

Running

Evan Frick

make human pref come first

b2821de 5 months ago

4.26 kB

	import streamlit as st
	import pandas as pd
	import json
	from os.path import split as path_split, splitext as path_splitext

	# Page-level configuration: browser tab title, full-width layout, and a
	# sidebar that starts out expanded.
	# NOTE(review): Streamlit's docs ask for set_page_config to run before other
	# Streamlit commands, so keep it ahead of st.title -- confirm for the pinned
	# Streamlit version.
	st.set_page_config(
	    page_title="PPE Metrics Explorer",
	    layout="wide",  # use the entire screen width
	    initial_sidebar_state="expanded",
	)

	# Heading rendered at the top of the page.
	st.title("PPE Metrics Explorer")

	@st.cache_data
	def load_data(file_path):
	    """Load and parse a JSON file.

	    The function is wrapped in ``st.cache_data``; presumably this lets
	    Streamlit reuse the parsed result across reruns for the same
	    ``file_path`` (confirm against the Streamlit caching docs).

	    Args:
	        file_path: Path of the JSON file to read.

	    Returns:
	        The decoded JSON content, exactly as produced by ``json.load``.

	    Raises:
	        OSError: If the file cannot be opened.
	        json.JSONDecodeError: If the file does not contain valid JSON.
	    """
	    # Pin the encoding: without it open() falls back to the locale's
	    # preferred encoding, which can mis-decode or reject a UTF-8 JSON file
	    # on systems whose default is not UTF-8.
	    with open(file_path, encoding="utf-8") as file:
	        return json.load(file)

	def contains_list(column):
	    """Report whether any cell of ``column`` is a Python ``list``.

	    Args:
	        column: A pandas Series (``main`` passes each DataFrame column in
	            turn via ``DataFrame.apply``).

	    Returns:
	        The result of ``.any()`` over the per-cell ``isinstance`` checks:
	        truthy when at least one cell is a list, falsy otherwise.
	    """

	    def _is_list(cell):
	        return isinstance(cell, list)

	    list_mask = column.apply(_is_list)
	    return list_mask.any()

	def _flatten_metrics(metrics):
	    """Flatten one level of nesting in a single model's metrics mapping.

	    Nested dicts (e.g. subcategories such as "overall" or "hard_prompt")
	    become compound ``"<subkey> - <metric_name>"`` entries; every other
	    value is copied through under its own key.
	    """
	    flattened = {}
	    for subkey, submetrics in metrics.items():
	        if isinstance(submetrics, dict):
	            for metric_name, value in submetrics.items():
	                flattened[f"{subkey} - {metric_name}"] = value
	        else:
	            flattened[subkey] = submetrics
	    return flattened


	def _build_records(benchmark_data):
	    """Turn ``{model: metrics}`` into a list of row dicts keyed by "Model"."""
	    records = []
	    for model, metrics in benchmark_data.items():
	        # Reduce the key to a bare name: drop the extension, then any
	        # leading directories.
	        # NOTE(review): splitext cuts at the last dot, so a key that is a
	        # plain model name containing a dot would be truncated -- confirm
	        # the keys in results.json are file paths.
	        model = path_split(path_splitext(model)[0])[-1]
	        if isinstance(metrics, dict):
	            records.append({"Model": model, **_flatten_metrics(metrics)})
	        else:
	            # A scalar result has no metric name of its own.
	            records.append({"Model": model, "Value": metrics})
	    return records


	def main():
	    """Render the metrics explorer page.

	    Loads ``results.json``, lets the user pick a benchmark, shows one row
	    per model with its (flattened) metrics, supports a case-insensitive
	    substring filter on column names, and offers the shown columns as a CSV
	    download.
	    """
	    data = load_data('results.json')

	    # List "human_preference_v1" first and everything else alphabetically.
	    # A tuple key keeps that guarantee whatever characters the other names
	    # start with.
	    benchmarks = sorted(
	        data, key=lambda name: (name != "human_preference_v1", name)
	    )
	    selected_benchmark = st.selectbox("Select a Benchmark", benchmarks)
	    benchmark_data = data[selected_benchmark]

	    records = _build_records(benchmark_data)
	    if not records:
	        # Without rows there is no "Model" column to index on; stop here
	        # instead of failing in set_index below.
	        st.warning("No results available for this benchmark.")
	        return

	    df = pd.DataFrame(records)

	    # Drop columns that contain lists.
	    df = df.loc[:, ~df.apply(contains_list)]

	    # Benchmarks whose name contains "human" keep their original column
	    # order; all others get case-insensitively sorted columns.
	    if "human" not in selected_benchmark:
	        df = df[sorted(df.columns, key=str.lower)]

	    df = df.set_index("Model")

	    # Three layout columns; only the first (narrow) one hosts the search
	    # box, the other two just take up the remaining width.
	    search_col, _, _ = st.columns([1, 3, 1])
	    with search_col:
	        # Streamlit discourages empty widget labels; pass a real label and
	        # hide it so the rendered page looks the same.
	        column_search = st.text_input(
	            "Search metrics",
	            placeholder="Search metrics...",
	            key="search",
	            label_visibility="collapsed",
	        )

	    if column_search:
	        # Case-insensitive substring match on column names.
	        needle = column_search.lower()
	        filtered_columns = [col for col in df.columns if needle in col.lower()]
	        if filtered_columns:
	            df_display = df[filtered_columns]
	        else:
	            st.warning("No columns match your search.")
	            df_display = pd.DataFrame()
	    else:
	        df_display = df

	    # Show rows sorted by the first visible column, best value first.
	    # ``.empty`` also covers a frame that has rows but no columns, where
	    # ``columns[0]`` would raise IndexError.
	    if df_display.empty:
	        table = df_display
	    else:
	        table = df_display.sort_values(df_display.columns[0], ascending=False)
	    st.dataframe(table, use_container_width=True)

	    # Offer the currently selected columns as a CSV download.
	    st.download_button(
	        label="Download data as CSV",
	        data=df_display.to_csv(),
	        file_name=f"{selected_benchmark}_metrics.csv",
	        mime='text/csv',
	    )


	if __name__ == "__main__":
	    main()