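"""Gradio playground for Atla's Selene-Mini evaluator.

Users provide a human/AI message pair (optionally with a ground-truth reference),
edit the evaluation criteria, and receive a score and critique from the
Selene-1-Mini judge model.
"""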
import json
import re
from datetime import datetime
import hashlib
import gradio as gr
from gen_api_answer import (
atla_parse_model_response,
get_atla_response
)
from prompts import (
    ATLA_PROMPT,
    ATLA_PROMPT_WITH_REFERENCE,
    DEFAULT_EVAL_PROMPT,
    DEFAULT_EVAL_PROMPT_EDITABLE,
    FIXED_EVAL_SUFFIX,
    DEFAULT_EVAL_CRITERIA,
)
from random_sample_generation import (
get_random_human_ai_pair,
get_random_human_ai_ground_truth_pair,
generate_ai_response
)
from utils import Vote
from common import CSS_STYLES, MAIN_TITLE, HOW_IT_WORKS
# Load the model_data from JSONL
def load_model_data():
model_data = {}
try:
with open("data/models.jsonl", "r") as f:
for line in f:
model = json.loads(line)
model_data[model["name"]] = {
"organization": model["organization"],
"license": model["license"],
"api_model": model["api_model"],
}
except FileNotFoundError:
print("Warning: models.jsonl not found")
return {}
return model_data
model_data = load_model_data()
def store_vote_data(prompt, response_a, response_b, model_a, model_b, winner, judge_id):
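    """Assemble a Vote record from an evaluation and persist it via the project's vote store."""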
prompt_value = prompt.value if hasattr(prompt, 'value') else prompt
vote = Vote(
timestamp=datetime.now().isoformat(),
prompt=prompt_value,
response_a=response_a,
response_b=response_b,
model_a=model_a,
model_b=model_b,
winner=winner,
judge_id=judge_id,
)
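    # NOTE: `add_vote` and `db` are assumed to be provided by the project's
    # vote-storage module; neither is defined or imported in this file.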
add_vote(vote, db)
def parse_variables(prompt):
# Extract variables enclosed in double curly braces
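    # e.g. "Rate {{ai_response}} given {{human_input}}" -> ["ai_response", "human_input"]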
variables = re.findall(r"{{(.*?)}}", prompt)
# Remove duplicates while preserving order
seen = set()
variables = [
x.strip() for x in variables if not (x.strip() in seen or seen.add(x.strip()))
]
return variables
def get_final_prompt(eval_prompt, variable_values):
# Replace variables in the eval prompt with their values
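    # e.g. {"human_input": "Hello"} turns every "{{human_input}}" in the prompt into "Hello"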
for var, val in variable_values.items():
eval_prompt = eval_prompt.replace("{{" + var + "}}", val)
return eval_prompt
def get_ip(request: gr.Request) -> str:
"""Get and hash the IP address from the request."""
if "cf-connecting-ip" in request.headers:
ip = request.headers["cf-connecting-ip"]
elif "x-forwarded-for" in request.headers:
ip = request.headers["x-forwarded-for"]
if "," in ip:
ip = ip.split(",")[0]
else:
ip = request.client.host
# Hash the IP address for privacy
return hashlib.sha256(ip.encode()).hexdigest()[:16]
def get_vote_message(choice: str, model_a: str, model_b: str) -> tuple[str, str]:
"""Generate appropriate message based on vote and model rankings.
Returns (title, message) tuple."""
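    # NOTE: get_current_votes, get_leaderboard and get_model_rankings are assumed to be
    # provided by the project's leaderboard helpers; they are not defined in this file.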
# Get current rankings
voting_data = get_current_votes()
leaderboard = get_leaderboard(model_data, voting_data, show_preliminary=True)
rankings = get_model_rankings(leaderboard)
pos_a = rankings.get(model_a, 0)
pos_b = rankings.get(model_b, 0)
    if choice == "Tie":
        return "It's a tie!", "Keep voting responsibly 🤗"
    # Check if vote aligns with leaderboard
    if (choice == "A" and pos_a < pos_b) or (choice == "B" and pos_b < pos_a):
        return "The favourite wins!", "Keep voting responsibly 🤗"
    else:
        return "The underdog wins!", "Keep voting responsibly 🤗"
def populate_random_example(request: gr.Request, compatible_mode: bool):
"""Generate a random human-AI conversation example and reset judge outputs."""
if compatible_mode:
human_msg, ai_msg, ground_truth_msg = get_random_human_ai_ground_truth_pair()
else:
human_msg, ai_msg = get_random_human_ai_pair()
ground_truth_msg = ""
return [
gr.update(value=human_msg),
gr.update(value=ai_msg),
        gr.update(value="🎲", variant="secondary"),
gr.update(value=""), # Clear score
gr.update(value=""), # Clear critique
gr.update(value=ground_truth_msg, visible=compatible_mode), # Set ground truth and visibility
]
with gr.Blocks(theme="default", css=CSS_STYLES) as demo:
gr.Markdown(MAIN_TITLE)
gr.Markdown(HOW_IT_WORKS)
# Hidden eval prompt that will always contain DEFAULT_EVAL_PROMPT
eval_prompt = gr.Textbox(
value=DEFAULT_EVAL_PROMPT,
visible=False
)
with gr.Tabs():
with gr.TabItem("Playground"):
with gr.Row():
# Left side - Input section
with gr.Column(scale=1):
with gr.Group():
human_input = gr.TextArea(
label="πŸ‘© User Input",
lines=5,
placeholder="Enter the human message here..."
)
with gr.Row():
generate_btn = gr.Button(
"Generate AI Response",
size="sm",
interactive=False
)
ai_response = gr.TextArea(
label="πŸ€– AI Response",
lines=10,
placeholder="Enter the AI response here..."
)
# Ground truth response (initially hidden)
ground_truth = gr.TextArea(
label="🎯 Ground truth response",
lines=10,
placeholder="Enter the ground truth response here...",
visible=False
)
with gr.Row():
random_btn = gr.Button("🎲", scale=2)
send_btn = gr.Button(
value="Run evaluation",
variant="primary",
size="lg",
scale=8
)
# Right side - Model outputs
with gr.Column(scale=1):
gr.Markdown("### πŸ‘©β€βš–οΈ Selene-Mini Evaluation")
with gr.Group():
with gr.Row():
score = gr.Textbox(label="Score", lines=1, interactive=False)
critique = gr.TextArea(label="Critique", lines=12, interactive=False)
gr.Markdown("<br>")
# Replace the "Edit Judge Prompt" Accordion section with:
with gr.Accordion("πŸ“ Edit Judge Prompt", open=False) as prompt_accordion:
gr.Markdown("<br>")
use_reference_toggle = gr.Checkbox(
label="Use a reference response",
value=False
)
# Hide the default prompt editor
with gr.Column(visible=False) as default_prompt_editor:
eval_prompt_editable = gr.TextArea(
value=DEFAULT_EVAL_PROMPT_EDITABLE,
label="Evaluation Criteria",
lines=12
)
with gr.Row(visible=False) as edit_buttons_row:
cancel_prompt_btn = gr.Button("Cancel")
save_prompt_btn = gr.Button("Save", variant="primary")
gr.Markdown("*The sample being evaluated is always appended as:*")
gr.Markdown(f"```{FIXED_EVAL_SUFFIX}")
# Show the compatible mode editor
with gr.Column(visible=True) as compatible_prompt_editor:
eval_criteria_text = gr.TextArea(
label="Evaluation Criteria",
lines=12,
value=DEFAULT_EVAL_CRITERIA,
placeholder="Enter the complete evaluation criteria and scoring rubric..."
)
# Define state variables for model tracking
model_a_state = gr.State()
model_b_state = gr.State()
final_prompt_state = gr.State()
eval_prompt_previous = gr.State(value=DEFAULT_EVAL_PROMPT_EDITABLE) # Initialize with default value
is_editing = gr.State(False) # Track editing state
compatible_mode_state = gr.State(False) # Track compatible mode state
# Update model names after responses are generated
def update_model_names(model_a, model_b):
return gr.update(value=f"*Model: {model_a}*"), gr.update(
value=f"*Model: {model_b}*"
)
# Store the last submitted prompt and variables for comparison
last_submission = gr.State({})
# Add handlers for save/cancel buttons
def save_prompt(new_prompt, previous_prompt):
return [
gr.update(value=new_prompt), # Update the prompt
new_prompt, # Update the previous prompt state
gr.update(visible=False) # Hide the buttons
]
def cancel_prompt(previous_prompt):
return [
gr.update(value=previous_prompt), # Revert to previous prompt
previous_prompt, # Keep the previous prompt state
gr.update(visible=False) # Hide the buttons
]
def show_edit_buttons(current_value, previous_value):
# Show buttons only if the current value differs from the previous value
return gr.update(visible=current_value != previous_value)
# Add handlers for save/cancel buttons and prompt changes
save_prompt_btn.click(
fn=save_prompt,
inputs=[eval_prompt_editable, eval_prompt_previous],
outputs=[eval_prompt_editable, eval_prompt_previous, edit_buttons_row]
)
cancel_prompt_btn.click(
fn=cancel_prompt,
inputs=[eval_prompt_previous],
outputs=[eval_prompt_editable, eval_prompt_previous, edit_buttons_row]
)
eval_prompt_editable.change(
fn=show_edit_buttons,
inputs=[eval_prompt_editable, eval_prompt_previous],
outputs=edit_buttons_row
)
# Function to toggle visibility based on compatible mode
def toggle_use_reference(checked):
if checked:
human_msg, ai_msg, ground_truth_msg = get_random_human_ai_ground_truth_pair()
return {
ground_truth: gr.update(visible=True, value=ground_truth_msg),
human_input: gr.update(value=human_msg),
ai_response: gr.update(value=ai_msg),
score: gr.update(value=""),
critique: gr.update(value=""),
                random_btn: gr.update(value="🎲", variant="secondary"),
}
else:
return {
ground_truth: gr.update(visible=False)
}
    # Toggling the reference option refreshes the example and clears any previous results
use_reference_toggle.change(
fn=toggle_use_reference,
inputs=[use_reference_toggle],
outputs=[
ground_truth,
human_input,
ai_response,
score,
critique,
random_btn,
]
)
# Add a new state variable to track first game
first_game_state = gr.State(True) # Initialize as True
    # Build the judge prompt from the inputs, query Selene-Mini, and parse out the score and critique
def submit_and_store(
use_reference,
eval_criteria_text,
human_input,
ai_response,
ground_truth_input,
):
# Build prompt data dictionary
prompt_data = {
'human_input': human_input,
'ai_response': ai_response,
'ground_truth_input': ground_truth_input if use_reference else '',
'eval_criteria': eval_criteria_text,
}
# Get base prompt based on whether reference is used
base_prompt = ATLA_PROMPT_WITH_REFERENCE if use_reference else ATLA_PROMPT
# Format the prompt
final_prompt = base_prompt.format(
human_input=prompt_data['human_input'],
ai_response=prompt_data['ai_response'],
ground_truth_input=prompt_data['ground_truth_input'],
eval_criteria=prompt_data['eval_criteria']
)
# Get response from Atla
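        # Near-zero temperature keeps the judge's scoring effectively deterministic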
response = get_atla_response(
model_name="AtlaAI/Selene-1-Mini-Llama-3.1-8B",
prompt=final_prompt,
max_tokens=500,
temperature=0.01
)
# Parse the response
score, critique = atla_parse_model_response(response)
return [
score,
critique,
gr.update(value="Regenerate evaluation", variant="secondary", interactive=True),
            gr.update(value="🎲"),
]
    # Note: the first-game tracking wrapper below is defined but not currently wired up; send_btn calls submit_and_store directly
def create_submit_handler():
first_game = True
def handler(*args):
nonlocal first_game
result = submit_and_store(*args)
first_game = False # Set to False after first submission
return result
return handler
# Update the send_btn click handler
send_btn.click(
fn=submit_and_store,
inputs=[
use_reference_toggle,
eval_criteria_text,
human_input,
ai_response,
ground_truth,
],
outputs=[
score,
critique,
send_btn,
random_btn,
],
)
# Add random button handler
random_btn.click(
fn=populate_random_example,
inputs=[use_reference_toggle],
outputs=[
human_input,
ai_response,
random_btn,
score,
critique,
ground_truth,
]
)
# Add input change handlers
def handle_input_change():
"""Reset UI state when inputs are changed"""
return [
gr.update(value="Run evaluation", variant="primary"), # send_btn
            gr.update(value="🎲", variant="secondary"), # random_btn
]
# Update the change handlers for inputs
human_input.change(
fn=handle_input_change,
inputs=[],
outputs=[send_btn, random_btn]
)
ai_response.change(
fn=handle_input_change,
inputs=[],
outputs=[send_btn, random_btn]
)
generate_btn.click(
fn=lambda msg: (
generate_ai_response(msg)[0], # Only take the response text
gr.update(
value="Generate AI Response", # Keep the label
interactive=False # Disable the button
)
),
inputs=[human_input],
outputs=[ai_response, generate_btn]
)
human_input.change(
fn=lambda x: gr.update(interactive=bool(x.strip())),
inputs=[human_input],
outputs=[generate_btn]
)
    # Populate a random example (with the reference response hidden) when the app first loads
demo.load(
fn=lambda: populate_random_example(None, False), # Pass False for initial compatible_mode
inputs=[],
outputs=[
human_input,
ai_response,
random_btn,
score,
critique,
ground_truth,
]
)
if __name__ == "__main__":
demo.launch()