|
import json |
|
import re |
|
from datetime import datetime |
|
import hashlib |
|
import gradio as gr |
|
|
|
from gen_api_answer import ( |
|
atla_parse_model_response, |
|
get_atla_response |
|
) |
|
|
|
from prompts import ( |
|
ATLA_PROMPT, |
|
ATLA_PROMPT_WITH_REFERENCE |
|
) |
|
|
|
from random_sample_generation import ( |
|
get_random_human_ai_pair, |
|
get_random_human_ai_ground_truth_pair, |
|
generate_ai_response |
|
) |
|
|
|
from utils import Vote |
|
|
|
from prompts import ( |
|
DEFAULT_EVAL_PROMPT, |
|
DEFAULT_EVAL_PROMPT_EDITABLE, |
|
FIXED_EVAL_SUFFIX, |
|
DEFAULT_EVAL_CRITERIA |
|
) |
|
|
|
from common import CSS_STYLES, MAIN_TITLE, HOW_IT_WORKS |
|
|
|
|
|
|
|
def load_model_data():
    """Load judge-model metadata from data/models.jsonl.

    Returns:
        dict: mapping of model name -> {"organization", "license", "api_model"}.
        Returns {} when the file is missing. Malformed or incomplete lines are
        skipped with a warning instead of aborting the whole load (previously a
        single bad line raised and lost every entry).
    """
    model_data = {}
    try:
        with open("data/models.jsonl", "r", encoding="utf-8") as f:
            for line_no, line in enumerate(f, start=1):
                if not line.strip():
                    continue  # tolerate blank lines in the JSONL file
                try:
                    model = json.loads(line)
                    model_data[model["name"]] = {
                        "organization": model["organization"],
                        "license": model["license"],
                        "api_model": model["api_model"],
                    }
                except (json.JSONDecodeError, KeyError) as e:
                    print(f"Warning: skipping bad line {line_no} in models.jsonl: {e}")
    except FileNotFoundError:
        print("Warning: models.jsonl not found")
        return {}
    return model_data
|
|
|
|
|
# Loaded once at import time; used by get_vote_message for leaderboard lookups.
model_data = load_model_data()
|
|
|
def store_vote_data(prompt, response_a, response_b, model_a, model_b, winner, judge_id):
    """Build a Vote record for one A/B comparison and persist it.

    NOTE(review): `add_vote` and `db` are not defined or imported anywhere in
    this file, so calling this as-is raises NameError. Presumably they come
    from a database module -- confirm and import them. This function is also
    not referenced elsewhere in this file.
    """
    # A Gradio component exposes its content via .value; accept either a
    # component-like object or a plain string.
    prompt_value = prompt.value if hasattr(prompt, 'value') else prompt

    vote = Vote(
        timestamp=datetime.now().isoformat(),
        prompt=prompt_value,
        response_a=response_a,
        response_b=response_b,
        model_a=model_a,
        model_b=model_b,
        winner=winner,
        judge_id=judge_id,
    )
    add_vote(vote, db)
|
|
|
|
|
def parse_variables(prompt):
    """Extract unique {{variable}} names from a prompt template.

    Args:
        prompt: template string containing zero or more {{name}} placeholders.

    Returns:
        list[str]: stripped variable names, deduplicated, in first-appearance
        order.
    """
    variables = re.findall(r"{{(.*?)}}", prompt)
    # dict.fromkeys deduplicates while preserving insertion order -- replaces
    # the original side-effecting `seen.add(...)` trick inside a comprehension.
    return list(dict.fromkeys(x.strip() for x in variables))
|
|
|
|
|
def get_final_prompt(eval_prompt, variable_values):
    """Substitute each {{var}} placeholder in eval_prompt with its value.

    Variables absent from the template are silently ignored; placeholders
    with no matching entry are left untouched.
    """
    filled = eval_prompt
    for name, value in variable_values.items():
        placeholder = "{{" + name + "}}"
        filled = filled.replace(placeholder, value)
    return filled
|
|
|
|
|
|
|
def get_ip(request: gr.Request) -> str:
    """Get and hash the IP address from the request."""
    headers = request.headers
    if "cf-connecting-ip" in headers:
        # Cloudflare puts the true client address here.
        client_ip = headers["cf-connecting-ip"]
    elif "x-forwarded-for" in headers:
        # x-forwarded-for may hold a proxy chain; the first entry is the
        # client. split(",")[0] is a no-op when there is no comma.
        client_ip = headers["x-forwarded-for"].split(",")[0]
    else:
        client_ip = request.client.host

    # Store only a truncated hash, never the raw address.
    return hashlib.sha256(client_ip.encode()).hexdigest()[:16]
|
|
|
|
|
def get_vote_message(choice: str, model_a: str, model_b: str) -> tuple[str, str]:
    """Generate appropriate message based on vote and model rankings.

    Returns (title, message) tuple.

    NOTE(review): get_current_votes, get_leaderboard and get_model_rankings
    are not imported in this file; as written this raises NameError when
    called -- confirm which module they should come from.
    """
    voting_data = get_current_votes()
    leaderboard = get_leaderboard(model_data, voting_data, show_preliminary=True)
    rankings = get_model_rankings(leaderboard)
    # Models missing from the rankings default to position 0, which compares
    # as better than any listed position -- presumably intended; verify.
    pos_a = rankings.get(model_a, 0)
    pos_b = rankings.get(model_b, 0)

    if choice == "Tie":
        return "It's a tie!", "Keep voting responsibly π€"

    # Lower position number means higher rank, so the "favourite" is the
    # model with the smaller position.
    if (choice == "A" and pos_a < pos_b) or (choice == "B" and pos_b < pos_a):
        return "The favourite wins!", "Keep voting responsibly π€"
    else:
        return "The underdog wins!", "Keep voting responsibly π€"
|
|
|
|
|
def populate_random_example(request: gr.Request, compatible_mode: bool):
    """Generate a random human-AI conversation example and reset judge outputs."""
    if compatible_mode:
        # Reference mode: sample a triple that includes a ground-truth answer.
        human_msg, ai_msg, ground_truth_msg = get_random_human_ai_ground_truth_pair()
    else:
        human_msg, ai_msg = get_random_human_ai_pair()
        ground_truth_msg = ""

    # Order matches the outputs list wired to random_btn.click / demo.load:
    # human input, AI response, dice button, score, critique, ground truth.
    return [
        gr.update(value=human_msg),
        gr.update(value=ai_msg),
        gr.update(value="π²", variant="secondary"),
        gr.update(value=""),
        gr.update(value=""),
        gr.update(value=ground_truth_msg, visible=compatible_mode),
    ]
|
|
|
|
|
with gr.Blocks(theme="default", css=CSS_STYLES) as demo:
    gr.Markdown(MAIN_TITLE)
    gr.Markdown(HOW_IT_WORKS)

    # Hidden holder for the full (non-editable) evaluation prompt template.
    eval_prompt = gr.Textbox(
        value=DEFAULT_EVAL_PROMPT,
        visible=False
    )

    with gr.Tabs():
        with gr.TabItem("Playground"):
            with gr.Row():
                # Left column: the conversation under evaluation.
                with gr.Column(scale=1):
                    with gr.Group():
                        human_input = gr.TextArea(
                            label="π© User Input",
                            lines=5,
                            placeholder="Enter the human message here..."
                        )
                        with gr.Row():
                            # Enabled elsewhere only when the input is non-empty.
                            generate_btn = gr.Button(
                                "Generate AI Response",
                                size="sm",
                                interactive=False
                            )

                        ai_response = gr.TextArea(
                            label="π€ AI Response",
                            lines=10,
                            placeholder="Enter the AI response here..."
                        )

                        # Shown only while reference mode is enabled.
                        ground_truth = gr.TextArea(
                            label="π― Ground truth response",
                            lines=10,
                            placeholder="Enter the ground truth response here...",
                            visible=False
                        )

                    with gr.Row():
                        random_btn = gr.Button("π²", scale=2)
                        send_btn = gr.Button(
                            value="Run evaluation",
                            variant="primary",
                            size="lg",
                            scale=8
                        )

                # Right column: the judge's verdict.
                with gr.Column(scale=1):
                    gr.Markdown("### π©ββοΈ Selene-Mini Evaluation")
                    with gr.Group():
                        with gr.Row():
                            score = gr.Textbox(label="Score", lines=1, interactive=False)
                        critique = gr.TextArea(label="Critique", lines=12, interactive=False)

                    gr.Markdown("<br>")

            # Collapsible editor for the judge prompt / criteria.
            with gr.Accordion("π Edit Judge Prompt", open=False) as prompt_accordion:
                gr.Markdown("<br>")
                use_reference_toggle = gr.Checkbox(
                    label="Use a reference response",
                    value=False
                )

                # Free-form prompt editor (hidden by default).
                with gr.Column(visible=False) as default_prompt_editor:
                    eval_prompt_editable = gr.TextArea(
                        value=DEFAULT_EVAL_PROMPT_EDITABLE,
                        label="Evaluation Criteria",
                        lines=12
                    )

                    # Save/Cancel appear only while there are unsaved edits.
                    with gr.Row(visible=False) as edit_buttons_row:
                        cancel_prompt_btn = gr.Button("Cancel")
                        save_prompt_btn = gr.Button("Save", variant="primary")
                    gr.Markdown("*The sample being evaluated is always appended as:*")
                    gr.Markdown(f"```{FIXED_EVAL_SUFFIX}")

                # Criteria-only editor (the default view).
                with gr.Column(visible=True) as compatible_prompt_editor:
                    eval_criteria_text = gr.TextArea(
                        label="Evaluation Criteria",
                        lines=12,
                        value=DEFAULT_EVAL_CRITERIA,
                        placeholder="Enter the complete evaluation criteria and scoring rubric..."
                    )

    # Per-session state holders.
    # NOTE(review): model_a_state, model_b_state, final_prompt_state and
    # is_editing are not referenced elsewhere in this file -- candidates
    # for removal.
    model_a_state = gr.State()
    model_b_state = gr.State()
    final_prompt_state = gr.State()
    eval_prompt_previous = gr.State(value=DEFAULT_EVAL_PROMPT_EDITABLE)
    is_editing = gr.State(False)
    compatible_mode_state = gr.State(False)
|
|
|
def update_model_names(model_a, model_b): |
|
return gr.update(value=f"*Model: {model_a}*"), gr.update( |
|
value=f"*Model: {model_b}*" |
|
) |
|
|
|
|
|
    # Holds the most recent submission payload.
    # NOTE(review): not referenced elsewhere in this file.
    last_submission = gr.State({})
|
|
|
|
|
def save_prompt(new_prompt, previous_prompt): |
|
return [ |
|
gr.update(value=new_prompt), |
|
new_prompt, |
|
gr.update(visible=False) |
|
] |
|
|
|
def cancel_prompt(previous_prompt): |
|
return [ |
|
gr.update(value=previous_prompt), |
|
previous_prompt, |
|
gr.update(visible=False) |
|
] |
|
|
|
def show_edit_buttons(current_value, previous_value): |
|
|
|
return gr.update(visible=current_value != previous_value) |
|
|
|
|
|
    # Prompt-editor wiring: Save commits, Cancel reverts, and any edit to the
    # editor toggles the visibility of the Save/Cancel row.
    save_prompt_btn.click(
        fn=save_prompt,
        inputs=[eval_prompt_editable, eval_prompt_previous],
        outputs=[eval_prompt_editable, eval_prompt_previous, edit_buttons_row]
    )

    cancel_prompt_btn.click(
        fn=cancel_prompt,
        inputs=[eval_prompt_previous],
        outputs=[eval_prompt_editable, eval_prompt_previous, edit_buttons_row]
    )

    eval_prompt_editable.change(
        fn=show_edit_buttons,
        inputs=[eval_prompt_editable, eval_prompt_previous],
        outputs=edit_buttons_row
    )
|
|
|
|
|
def toggle_use_reference(checked): |
|
if checked: |
|
human_msg, ai_msg, ground_truth_msg = get_random_human_ai_ground_truth_pair() |
|
return { |
|
ground_truth: gr.update(visible=True, value=ground_truth_msg), |
|
human_input: gr.update(value=human_msg), |
|
ai_response: gr.update(value=ai_msg), |
|
score: gr.update(value=""), |
|
critique: gr.update(value=""), |
|
random_btn: gr.update(value="π²", variant="secondary"), |
|
} |
|
else: |
|
return { |
|
ground_truth: gr.update(visible=False) |
|
} |
|
|
|
|
|
    # Toggling reference mode repopulates the example and shows/hides the
    # ground-truth field (dict-style return targets a subset of outputs).
    use_reference_toggle.change(
        fn=toggle_use_reference,
        inputs=[use_reference_toggle],
        outputs=[
            ground_truth,
            human_input,
            ai_response,
            score,
            critique,
            random_btn,
        ]
    )

    # NOTE(review): not referenced elsewhere in this file -- candidate for removal.
    first_game_state = gr.State(True)
|
|
|
|
|
def submit_and_store( |
|
use_reference, |
|
eval_criteria_text, |
|
human_input, |
|
ai_response, |
|
ground_truth_input, |
|
): |
|
|
|
prompt_data = { |
|
'human_input': human_input, |
|
'ai_response': ai_response, |
|
'ground_truth_input': ground_truth_input if use_reference else '', |
|
'eval_criteria': eval_criteria_text, |
|
} |
|
|
|
|
|
base_prompt = ATLA_PROMPT_WITH_REFERENCE if use_reference else ATLA_PROMPT |
|
|
|
|
|
final_prompt = base_prompt.format( |
|
human_input=prompt_data['human_input'], |
|
ai_response=prompt_data['ai_response'], |
|
ground_truth_input=prompt_data['ground_truth_input'], |
|
eval_criteria=prompt_data['eval_criteria'] |
|
) |
|
|
|
|
|
response = get_atla_response( |
|
model_name="AtlaAI/Selene-1-Mini-Llama-3.1-8B", |
|
prompt=final_prompt, |
|
max_tokens=500, |
|
temperature=0.01 |
|
) |
|
|
|
|
|
score, critique = atla_parse_model_response(response) |
|
|
|
return [ |
|
score, |
|
critique, |
|
gr.update(value="Regenerate evaluation", variant="secondary", interactive=True), |
|
gr.update(value="π²"), |
|
] |
|
|
|
|
|
    def create_submit_handler():
        """Factory wrapping submit_and_store with a first-run flag.

        NOTE(review): dead code -- this factory is never called
        (send_btn.click uses submit_and_store directly) and `first_game`
        is written but never read. Candidate for removal.
        """
        first_game = True

        def handler(*args):
            nonlocal first_game
            result = submit_and_store(*args)
            # Flag flips after the first run, but nothing consumes it.
            first_game = False
            return result

        return handler
|
|
|
|
|
    # Run the evaluation; on completion the send button reads "Regenerate
    # evaluation" (see submit_and_store's return order).
    send_btn.click(
        fn=submit_and_store,
        inputs=[
            use_reference_toggle,
            eval_criteria_text,
            human_input,
            ai_response,
            ground_truth,
        ],
        outputs=[
            score,
            critique,
            send_btn,
            random_btn,
        ],
    )

    # Load a fresh random example and clear the previous judge output.
    random_btn.click(
        fn=populate_random_example,
        inputs=[use_reference_toggle],
        outputs=[
            human_input,
            ai_response,
            random_btn,
            score,
            critique,
            ground_truth,
        ]
    )
|
|
|
|
|
def handle_input_change(): |
|
"""Reset UI state when inputs are changed""" |
|
return [ |
|
gr.update(value="Run evaluation", variant="primary"), |
|
gr.update(value="π²", variant="secondary"), |
|
] |
|
|
|
|
|
    # Any manual edit to either conversation field resets the action buttons
    # back to their initial labels.
    human_input.change(
        fn=handle_input_change,
        inputs=[],
        outputs=[send_btn, random_btn]
    )

    ai_response.change(
        fn=handle_input_change,
        inputs=[],
        outputs=[send_btn, random_btn]
    )

    # Generate an AI response for the current user input, then disable the
    # button until the input changes again. generate_ai_response returns a
    # tuple; only its first element is the response text.
    generate_btn.click(
        fn=lambda msg: (
            generate_ai_response(msg)[0],
            gr.update(
                value="Generate AI Response",
                interactive=False
            )
        ),
        inputs=[human_input],
        outputs=[ai_response, generate_btn]
    )

    # Second change-handler on human_input (both handlers run): enables the
    # generate button only when the input is non-empty.
    human_input.change(
        fn=lambda x: gr.update(interactive=bool(x.strip())),
        inputs=[human_input],
        outputs=[generate_btn]
    )

    # Populate an initial example (non-reference mode) on page load.
    demo.load(
        fn=lambda: populate_random_example(None, False),
        inputs=[],
        outputs=[
            human_input,
            ai_response,
            random_btn,
            score,
            critique,
            ground_truth,
        ]
    )
|
|
|
if __name__ == "__main__":
    # Start the Gradio server when run as a script.
    demo.launch()