|
import json |
|
import re |
|
from datetime import datetime |
|
import hashlib |
|
import gradio as gr |
|
|
|
from gen_api_answer import ( |
|
atla_parse_model_response, |
|
get_atla_response |
|
) |
|
|
|
from prompts import ( |
|
ATLA_PROMPT, |
|
ATLA_PROMPT_WITH_REFERENCE |
|
) |
|
|
|
from random_sample_generation import ( |
|
get_random_human_ai_pair, |
|
get_random_human_ai_ground_truth_pair, |
|
generate_ai_response |
|
) |
|
|
|
from utils import Vote |
|
|
|
from prompts import ( |
|
DEFAULT_EVAL_PROMPT, |
|
DEFAULT_EVAL_PROMPT_EDITABLE, |
|
FIXED_EVAL_SUFFIX, |
|
DEFAULT_EVAL_CRITERIA |
|
) |
|
|
|
from common import CSS_STYLES, MAIN_TITLE, HOW_IT_WORKS |
|
|
|
|
|
|
|
def load_model_data():
    """Load judge-model metadata from data/models.jsonl.

    Returns:
        dict: mapping of model name -> {"organization", "license", "api_model"}.
        Returns {} when the file is missing. Malformed or incomplete lines are
        skipped with a warning instead of aborting the whole load (previously a
        single bad line raised and lost every entry).
    """
    model_data = {}
    try:
        with open("data/models.jsonl", "r", encoding="utf-8") as f:
            for line_no, line in enumerate(f, start=1):
                if not line.strip():
                    continue  # tolerate blank lines in the JSONL file
                try:
                    model = json.loads(line)
                    model_data[model["name"]] = {
                        "organization": model["organization"],
                        "license": model["license"],
                        "api_model": model["api_model"],
                    }
                except (json.JSONDecodeError, KeyError) as e:
                    print(f"Warning: skipping bad line {line_no} in models.jsonl: {e}")
    except FileNotFoundError:
        print("Warning: models.jsonl not found")
        return {}
    return model_data
|
|
|
|
|
# Loaded once at import time; used by get_vote_message for leaderboard lookups.
model_data = load_model_data()
|
|
|
def store_vote_data(prompt, response_a, response_b, model_a, model_b, winner, judge_id):
    """Build a Vote record for one A/B comparison and persist it.

    NOTE(review): `add_vote` and `db` are not defined or imported anywhere in
    this file, so calling this as-is raises NameError. Presumably they come
    from a database module -- confirm and import them. This function is also
    not referenced elsewhere in this file.
    """
    # A Gradio component exposes its content via .value; accept either a
    # component-like object or a plain string.
    prompt_value = prompt.value if hasattr(prompt, 'value') else prompt

    vote = Vote(
        timestamp=datetime.now().isoformat(),
        prompt=prompt_value,
        response_a=response_a,
        response_b=response_b,
        model_a=model_a,
        model_b=model_b,
        winner=winner,
        judge_id=judge_id,
    )
    add_vote(vote, db)
|
|
|
|
|
def parse_variables(prompt):
    """Extract unique {{variable}} names from a prompt template.

    Args:
        prompt: template string containing zero or more {{name}} placeholders.

    Returns:
        list[str]: stripped variable names, deduplicated, in first-appearance
        order.
    """
    variables = re.findall(r"{{(.*?)}}", prompt)
    # dict.fromkeys deduplicates while preserving insertion order -- replaces
    # the original side-effecting `seen.add(...)` trick inside a comprehension.
    return list(dict.fromkeys(x.strip() for x in variables))
|
|
|
|
|
def get_final_prompt(eval_prompt, variable_values):
    """Substitute each {{var}} placeholder in eval_prompt with its value.

    Variables absent from the template are silently ignored; placeholders
    with no matching entry are left untouched.
    """
    filled = eval_prompt
    for name, value in variable_values.items():
        placeholder = "{{" + name + "}}"
        filled = filled.replace(placeholder, value)
    return filled
|
|
|
|
|
|
|
def get_ip(request: gr.Request) -> str:
    """Get and hash the IP address from the request."""
    headers = request.headers
    if "cf-connecting-ip" in headers:
        # Cloudflare puts the true client address here.
        client_ip = headers["cf-connecting-ip"]
    elif "x-forwarded-for" in headers:
        # x-forwarded-for may hold a proxy chain; the first entry is the
        # client. split(",")[0] is a no-op when there is no comma.
        client_ip = headers["x-forwarded-for"].split(",")[0]
    else:
        client_ip = request.client.host

    # Store only a truncated hash, never the raw address.
    return hashlib.sha256(client_ip.encode()).hexdigest()[:16]
|
|
|
|
|
def get_vote_message(choice: str, model_a: str, model_b: str) -> tuple[str, str]:
    """Generate appropriate message based on vote and model rankings.

    Returns (title, message) tuple.

    NOTE(review): get_current_votes, get_leaderboard and get_model_rankings
    are not imported in this file; as written this raises NameError when
    called -- confirm which module they should come from.
    """
    voting_data = get_current_votes()
    leaderboard = get_leaderboard(model_data, voting_data, show_preliminary=True)
    rankings = get_model_rankings(leaderboard)
    # Models missing from the rankings default to position 0, which compares
    # as better than any listed position -- presumably intended; verify.
    pos_a = rankings.get(model_a, 0)
    pos_b = rankings.get(model_b, 0)

    if choice == "Tie":
        return "It's a tie!", "Keep voting responsibly π€"

    # Lower position number means higher rank, so the "favourite" is the
    # model with the smaller position.
    if (choice == "A" and pos_a < pos_b) or (choice == "B" and pos_b < pos_a):
        return "The favourite wins!", "Keep voting responsibly π€"
    else:
        return "The underdog wins!", "Keep voting responsibly π€"
|
|
|
|
|
def populate_random_example(request: gr.Request, compatible_mode: bool):
    """Generate a random human-AI conversation example and reset judge outputs."""
    if compatible_mode:
        # Reference mode: sample a triple that includes a ground-truth answer.
        human_msg, ai_msg, ground_truth_msg = get_random_human_ai_ground_truth_pair()
    else:
        human_msg, ai_msg = get_random_human_ai_pair()
        ground_truth_msg = ""

    # Order matches the outputs list wired to random_btn.click / demo.load:
    # human input, AI response, dice button, score, critique, ground truth.
    return [
        gr.update(value=human_msg),
        gr.update(value=ai_msg),
        gr.update(value="π²", variant="secondary"),
        gr.update(value=""),
        gr.update(value=""),
        gr.update(value=ground_truth_msg, visible=compatible_mode),
    ]
|
|
|
|
|
with gr.Blocks(theme="default", css=CSS_STYLES) as demo:
    gr.Markdown(MAIN_TITLE)
    gr.Markdown(HOW_IT_WORKS)

    # Hidden holder for the full (non-editable) evaluation prompt template.
    eval_prompt = gr.Textbox(
        value=DEFAULT_EVAL_PROMPT,
        visible=False
    )

    with gr.Tabs():
        with gr.TabItem("Playground"):
            with gr.Row():
                # Left column: the conversation under evaluation.
                with gr.Column(scale=1):
                    with gr.Group():
                        human_input = gr.TextArea(
                            label="π© User Input",
                            lines=5,
                            placeholder="Enter the human message here..."
                        )
                        with gr.Row():
                            # Enabled elsewhere only when the input is non-empty.
                            generate_btn = gr.Button(
                                "Generate AI Response",
                                size="sm",
                                interactive=False
                            )

                        ai_response = gr.TextArea(
                            label="π€ AI Response",
                            lines=10,
                            placeholder="Enter the AI response here..."
                        )

                        # Shown only while reference mode is enabled.
                        ground_truth = gr.TextArea(
                            label="π― Ground truth response",
                            lines=10,
                            placeholder="Enter the ground truth response here...",
                            visible=False
                        )

                    with gr.Row():
                        random_btn = gr.Button("π²", scale=2)
                        send_btn = gr.Button(
                            value="Run evaluation",
                            variant="primary",
                            size="lg",
                            scale=8
                        )

                # Right column: the judge's verdict.
                with gr.Column(scale=1):
                    gr.Markdown("### π©ββοΈ Selene-Mini Evaluation")
                    with gr.Group():
                        with gr.Row():
                            score = gr.Textbox(label="Score", lines=1, interactive=False)
                        critique = gr.TextArea(label="Critique", lines=12, interactive=False)

                    gr.Markdown("<br>")

            # Collapsible editor for the judge prompt / criteria.
            with gr.Accordion("π Edit Judge Prompt", open=False) as prompt_accordion:
                gr.Markdown("<br>")
                use_reference_toggle = gr.Checkbox(
                    label="Use a reference response",
                    value=False
                )

                # Free-form prompt editor (hidden by default).
                with gr.Column(visible=False) as default_prompt_editor:
                    eval_prompt_editable = gr.TextArea(
                        value=DEFAULT_EVAL_PROMPT_EDITABLE,
                        label="Evaluation Criteria",
                        lines=12
                    )

                    # Save/Cancel appear only while there are unsaved edits.
                    with gr.Row(visible=False) as edit_buttons_row:
                        cancel_prompt_btn = gr.Button("Cancel")
                        save_prompt_btn = gr.Button("Save", variant="primary")
                    gr.Markdown("*The sample being evaluated is always appended as:*")
                    gr.Markdown(f"```{FIXED_EVAL_SUFFIX}")

                # Criteria-only editor (the default view).
                with gr.Column(visible=True) as compatible_prompt_editor:
                    eval_criteria_text = gr.TextArea(
                        label="Evaluation Criteria",
                        lines=12,
                        value=DEFAULT_EVAL_CRITERIA,
                        placeholder="Enter the complete evaluation criteria and scoring rubric..."
                    )

    # Per-session state holders.
    # NOTE(review): model_a_state, model_b_state, final_prompt_state and
    # is_editing are not referenced elsewhere in this file -- candidates
    # for removal.
    model_a_state = gr.State()
    model_b_state = gr.State()
    final_prompt_state = gr.State()
    eval_prompt_previous = gr.State(value=DEFAULT_EVAL_PROMPT_EDITABLE)
    is_editing = gr.State(False)
    compatible_mode_state = gr.State(False)
|
|
|
def update_model_names(model_a, model_b): |
|
return gr.update(value=f"*Model: {model_a}*"), gr.update( |
|
value=f"*Model: {model_b}*" |
|
) |
|
|
|
|
|
    # Holds the most recent submission payload.
    # NOTE(review): not referenced elsewhere in this file.
    last_submission = gr.State({})
|
|
|
|
|
def save_prompt(new_prompt, previous_prompt): |
|
return [ |
|
gr.update(value=new_prompt), |
|
new_prompt, |
|
gr.update(visible=False) |
|
] |
|
|
|
def cancel_prompt(previous_prompt): |
|
return [ |
|
gr.update(value=previous_prompt), |
|
previous_prompt, |
|
gr.update(visible=False) |
|
] |
|
|
|
def show_edit_buttons(current_value, previous_value): |
|
|
|
return gr.update(visible=current_value != previous_value) |
|
|
|
|
|
    # Prompt-editor wiring: Save commits, Cancel reverts, and any edit to the
    # editor toggles the visibility of the Save/Cancel row.
    save_prompt_btn.click(
        fn=save_prompt,
        inputs=[eval_prompt_editable, eval_prompt_previous],
        outputs=[eval_prompt_editable, eval_prompt_previous, edit_buttons_row]
    )

    cancel_prompt_btn.click(
        fn=cancel_prompt,
        inputs=[eval_prompt_previous],
        outputs=[eval_prompt_editable, eval_prompt_previous, edit_buttons_row]
    )

    eval_prompt_editable.change(
        fn=show_edit_buttons,
        inputs=[eval_prompt_editable, eval_prompt_previous],
        outputs=edit_buttons_row
    )
|
|
|
|
|
def toggle_use_reference(checked): |
|
if checked: |
|
human_msg, ai_msg, ground_truth_msg = get_random_human_ai_ground_truth_pair() |
|
return { |
|
ground_truth: gr.update(visible=True, value=ground_truth_msg), |
|
human_input: gr.update(value=human_msg), |
|
ai_response: gr.update(value=ai_msg), |
|
score: gr.update(value=""), |
|
critique: gr.update(value=""), |
|
random_btn: gr.update(value="π²", variant="secondary"), |
|
} |
|
else: |
|
return { |
|
ground_truth: gr.update(visible=False) |
|
} |
|
|
|
|
|
    # Toggling reference mode repopulates the example and shows/hides the
    # ground-truth field (dict-style return targets a subset of outputs).
    use_reference_toggle.change(
        fn=toggle_use_reference,
        inputs=[use_reference_toggle],
        outputs=[
            ground_truth,
            human_input,
            ai_response,
            score,
            critique,
            random_btn,
        ]
    )

    # NOTE(review): not referenced elsewhere in this file -- candidate for removal.
    first_game_state = gr.State(True)
|
|
|
|
|
def submit_and_store( |
|
use_reference, |
|
eval_criteria_text, |
|
human_input, |
|
ai_response, |
|
ground_truth_input, |
|
): |
|
|
|
prompt_data = { |
|
'human_input': human_input, |
|
'ai_response': ai_response, |
|
'ground_truth_input': ground_truth_input if use_reference else '', |
|
'eval_criteria': eval_criteria_text, |
|
} |
|
|
|
|
|
base_prompt = ATLA_PROMPT_WITH_REFERENCE if use_reference else ATLA_PROMPT |
|
|
|
|
|
final_prompt = base_prompt.format( |
|
human_input=prompt_data['human_input'], |
|
ai_response=prompt_data['ai_response'], |
|
ground_truth_input=prompt_data['ground_truth_input'], |
|
eval_criteria=prompt_data['eval_criteria'] |
|
) |
|
|
|
|
|
response = get_atla_response( |
|
model_name="AtlaAI/Selene-1-Mini-Llama-3.1-8B", |
|
prompt=final_prompt, |
|
max_tokens=500, |
|
temperature=0.01 |
|
) |
|
|
|
|
|
score, critique = atla_parse_model_response(response) |
|
|
|
return [ |
|
score, |
|
critique, |
|
gr.update(value="Regenerate evaluation", variant="secondary", interactive=True), |
|
gr.update(value="π²"), |
|
] |
|
|
|
|
|
    def create_submit_handler():
        """Factory wrapping submit_and_store with a first-run flag.

        NOTE(review): dead code -- this factory is never called
        (send_btn.click uses submit_and_store directly) and `first_game`
        is written but never read. Candidate for removal.
        """
        first_game = True

        def handler(*args):
            nonlocal first_game
            result = submit_and_store(*args)
            # Flag flips after the first run, but nothing consumes it.
            first_game = False
            return result

        return handler
|
|
|
|
|
    # Run the evaluation; on completion the send button reads "Regenerate
    # evaluation" (see submit_and_store's return order).
    send_btn.click(
        fn=submit_and_store,
        inputs=[
            use_reference_toggle,
            eval_criteria_text,
            human_input,
            ai_response,
            ground_truth,
        ],
        outputs=[
            score,
            critique,
            send_btn,
            random_btn,
        ],
    )

    # Load a fresh random example and clear the previous judge output.
    random_btn.click(
        fn=populate_random_example,
        inputs=[use_reference_toggle],
        outputs=[
            human_input,
            ai_response,
            random_btn,
            score,
            critique,
            ground_truth,
        ]
    )
|
|
|
|
|
def handle_input_change(): |
|
"""Reset UI state when inputs are changed""" |
|
return [ |
|
gr.update(value="Run evaluation", variant="primary"), |
|
gr.update(value="π²", variant="secondary"), |
|
] |
|
|
|
|
|
    # Any manual edit to either conversation field resets the action buttons
    # back to their initial labels.
    human_input.change(
        fn=handle_input_change,
        inputs=[],
        outputs=[send_btn, random_btn]
    )

    ai_response.change(
        fn=handle_input_change,
        inputs=[],
        outputs=[send_btn, random_btn]
    )

    # Generate an AI response for the current user input, then disable the
    # button until the input changes again. generate_ai_response returns a
    # tuple; only its first element is the response text.
    generate_btn.click(
        fn=lambda msg: (
            generate_ai_response(msg)[0],
            gr.update(
                value="Generate AI Response",
                interactive=False
            )
        ),
        inputs=[human_input],
        outputs=[ai_response, generate_btn]
    )

    # Second change-handler on human_input (both handlers run): enables the
    # generate button only when the input is non-empty.
    human_input.change(
        fn=lambda x: gr.update(interactive=bool(x.strip())),
        inputs=[human_input],
        outputs=[generate_btn]
    )

    # Populate an initial example (non-reference mode) on page load.
    demo.load(
        fn=lambda: populate_random_example(None, False),
        inputs=[],
        outputs=[
            human_input,
            ai_response,
            random_btn,
            score,
            critique,
            ground_truth,
        ]
    )
|
|
|
if __name__ == "__main__":
    # Start the Gradio server when run as a script.
    demo.launch()