"""Gradio playground for evaluating AI responses with Atla's Selene-Mini judge model."""

import json
import re
from datetime import datetime
import hashlib

import gradio as gr

from gen_api_answer import (
    atla_parse_model_response,
    get_atla_response
)
from prompts import (
    ATLA_PROMPT,
    ATLA_PROMPT_WITH_REFERENCE,
    DEFAULT_EVAL_PROMPT,
    DEFAULT_EVAL_PROMPT_EDITABLE,
    FIXED_EVAL_SUFFIX,
    DEFAULT_EVAL_CRITERIA
)
from random_sample_generation import (
    get_random_human_ai_pair,
    get_random_human_ai_ground_truth_pair,
    generate_ai_response
)
from utils import Vote
from common import CSS_STYLES, MAIN_TITLE, HOW_IT_WORKS


# Load the model data from JSONL
def load_model_data():
    model_data = {}
    try:
        with open("data/models.jsonl", "r") as f:
            for line in f:
                model = json.loads(line)
                model_data[model["name"]] = {
                    "organization": model["organization"],
                    "license": model["license"],
                    "api_model": model["api_model"],
                }
    except FileNotFoundError:
        print("Warning: models.jsonl not found")
        return {}
    return model_data


model_data = load_model_data()


def store_vote_data(prompt, response_a, response_b, model_a, model_b, winner, judge_id):
    prompt_value = prompt.value if hasattr(prompt, 'value') else prompt

    vote = Vote(
        timestamp=datetime.now().isoformat(),
        prompt=prompt_value,
        response_a=response_a,
        response_b=response_b,
        model_a=model_a,
        model_b=model_b,
        winner=winner,
        judge_id=judge_id,
    )
    # NOTE: add_vote and db are expected to be provided by the project's database
    # module; they are not imported in this file.
    add_vote(vote, db)


def parse_variables(prompt):
    # Extract variables enclosed in double curly braces
    variables = re.findall(r"{{(.*?)}}", prompt)
    # Remove duplicates while preserving order
    seen = set()
    variables = [
        x.strip() for x in variables if not (x.strip() in seen or seen.add(x.strip()))
    ]
    return variables


def get_final_prompt(eval_prompt, variable_values):
    # Replace variables in the eval prompt with their values
    for var, val in variable_values.items():
        eval_prompt = eval_prompt.replace("{{" + var + "}}", val)
    return eval_prompt


def get_ip(request: gr.Request) -> str:
    """Get and hash the IP address from the request."""
    if "cf-connecting-ip" in request.headers:
        ip = request.headers["cf-connecting-ip"]
    elif "x-forwarded-for" in request.headers:
        ip = request.headers["x-forwarded-for"]
        if "," in ip:
            ip = ip.split(",")[0]
    else:
        ip = request.client.host

    # Hash the IP address for privacy
    return hashlib.sha256(ip.encode()).hexdigest()[:16]


def get_vote_message(choice: str, model_a: str, model_b: str) -> tuple[str, str]:
    """Generate appropriate message based on vote and model rankings.
    Returns (title, message) tuple."""
    # NOTE: get_current_votes, get_leaderboard and get_model_rankings are expected to
    # be provided by the project's leaderboard module; they are not imported in this file.
    # Get current rankings
    voting_data = get_current_votes()
    leaderboard = get_leaderboard(model_data, voting_data, show_preliminary=True)
    rankings = get_model_rankings(leaderboard)
    pos_a = rankings.get(model_a, 0)
    pos_b = rankings.get(model_b, 0)

    if choice == "Tie":
        return "It's a tie!", "Keep voting responsibly 🤗"

    # Check if vote aligns with leaderboard
    if (choice == "A" and pos_a < pos_b) or (choice == "B" and pos_b < pos_a):
        return "The favourite wins!", "Keep voting responsibly 🤗"
    else:
        return "The underdog wins!", "Keep voting responsibly 🤗"


def populate_random_example(request: gr.Request, compatible_mode: bool):
    """Generate a random human-AI conversation example and reset judge outputs."""
    if compatible_mode:
        # Include a ground truth response when the reference mode is enabled
        human_msg, ai_msg, ground_truth_msg = get_random_human_ai_ground_truth_pair()
    else:
        human_msg, ai_msg = get_random_human_ai_pair()
        ground_truth_msg = ""

    return [
        gr.update(value=human_msg),
        gr.update(value=ai_msg),
        gr.update(value="🎲", variant="secondary"),
        gr.update(value=""),  # Clear score
        gr.update(value=""),  # Clear critique
        gr.update(value=ground_truth_msg, visible=compatible_mode),  # Set ground truth and visibility
    ]


with gr.Blocks(theme="default", css=CSS_STYLES) as demo:
    gr.Markdown(MAIN_TITLE)
    gr.Markdown(HOW_IT_WORKS)

    # Hidden eval prompt that will always contain DEFAULT_EVAL_PROMPT
    eval_prompt = gr.Textbox(
        value=DEFAULT_EVAL_PROMPT,
        visible=False
    )

    with gr.Tabs():
        with gr.TabItem("Playground"):
            with gr.Row():
                # Left side - Input section
                with gr.Column(scale=1):
                    with gr.Group():
                        human_input = gr.TextArea(
                            label="👩 User Input",
                            lines=5,
                            placeholder="Enter the human message here..."
                        )
                        with gr.Row():
                            generate_btn = gr.Button(
                                "Generate AI Response",
                                size="sm",
                                interactive=False
                            )
                        ai_response = gr.TextArea(
                            label="🤖 AI Response",
                            lines=10,
                            placeholder="Enter the AI response here..."
                        )
                        # Ground truth response (initially hidden)
                        ground_truth = gr.TextArea(
                            label="🎯 Ground truth response",
                            lines=10,
                            placeholder="Enter the ground truth response here...",
                            visible=False
                        )

                    with gr.Row():
                        random_btn = gr.Button("🎲", scale=2)
                        send_btn = gr.Button(
                            value="Run evaluation",
                            variant="primary",
                            size="lg",
                            scale=8
                        )

                # Right side - Model outputs
                with gr.Column(scale=1):
                    gr.Markdown("### 👩‍⚖️ Selene-Mini Evaluation")
                    with gr.Group():
                        with gr.Row():
                            score = gr.Textbox(label="Score", lines=1, interactive=False)
                        critique = gr.TextArea(label="Critique", lines=12, interactive=False)
                    gr.Markdown("<br>")  # Spacer; original string content lost in extraction, "<br>" assumed
") # Replace the "Edit Judge Prompt" Accordion section with: with gr.Accordion("📝 Edit Judge Prompt", open=False) as prompt_accordion: gr.Markdown("
") use_reference_toggle = gr.Checkbox( label="Use a reference response", value=False ) # Hide the default prompt editor with gr.Column(visible=False) as default_prompt_editor: eval_prompt_editable = gr.TextArea( value=DEFAULT_EVAL_PROMPT_EDITABLE, label="Evaluation Criteria", lines=12 ) with gr.Row(visible=False) as edit_buttons_row: cancel_prompt_btn = gr.Button("Cancel") save_prompt_btn = gr.Button("Save", variant="primary") gr.Markdown("*The sample being evaluated is always appended as:*") gr.Markdown(f"```{FIXED_EVAL_SUFFIX}") # Show the compatible mode editor with gr.Column(visible=True) as compatible_prompt_editor: eval_criteria_text = gr.TextArea( label="Evaluation Criteria", lines=12, value=DEFAULT_EVAL_CRITERIA, placeholder="Enter the complete evaluation criteria and scoring rubric..." ) # Define state variables for model tracking model_a_state = gr.State() model_b_state = gr.State() final_prompt_state = gr.State() eval_prompt_previous = gr.State(value=DEFAULT_EVAL_PROMPT_EDITABLE) # Initialize with default value is_editing = gr.State(False) # Track editing state compatible_mode_state = gr.State(False) # Track compatible mode state # Update model names after responses are generated def update_model_names(model_a, model_b): return gr.update(value=f"*Model: {model_a}*"), gr.update( value=f"*Model: {model_b}*" ) # Store the last submitted prompt and variables for comparison last_submission = gr.State({}) # Add handlers for save/cancel buttons def save_prompt(new_prompt, previous_prompt): return [ gr.update(value=new_prompt), # Update the prompt new_prompt, # Update the previous prompt state gr.update(visible=False) # Hide the buttons ] def cancel_prompt(previous_prompt): return [ gr.update(value=previous_prompt), # Revert to previous prompt previous_prompt, # Keep the previous prompt state gr.update(visible=False) # Hide the buttons ] def show_edit_buttons(current_value, previous_value): # Show buttons only if the current value differs from the previous value return gr.update(visible=current_value != previous_value) # Add handlers for save/cancel buttons and prompt changes save_prompt_btn.click( fn=save_prompt, inputs=[eval_prompt_editable, eval_prompt_previous], outputs=[eval_prompt_editable, eval_prompt_previous, edit_buttons_row] ) cancel_prompt_btn.click( fn=cancel_prompt, inputs=[eval_prompt_previous], outputs=[eval_prompt_editable, eval_prompt_previous, edit_buttons_row] ) eval_prompt_editable.change( fn=show_edit_buttons, inputs=[eval_prompt_editable, eval_prompt_previous], outputs=edit_buttons_row ) # Function to toggle visibility based on compatible mode def toggle_use_reference(checked): if checked: human_msg, ai_msg, ground_truth_msg = get_random_human_ai_ground_truth_pair() return { ground_truth: gr.update(visible=True, value=ground_truth_msg), human_input: gr.update(value=human_msg), ai_response: gr.update(value=ai_msg), score: gr.update(value=""), critique: gr.update(value=""), random_btn: gr.update(value="🎲", variant="secondary"), } else: return { ground_truth: gr.update(visible=False) } # Update the change handler to include all necessary outputs use_reference_toggle.change( fn=toggle_use_reference, inputs=[use_reference_toggle], outputs=[ ground_truth, human_input, ai_response, score, critique, random_btn, ] ) # Add a new state variable to track first game first_game_state = gr.State(True) # Initialize as True # Update the submit function to parse the evaluation criteria def submit_and_store( use_reference, eval_criteria_text, human_input, ai_response, 
        ground_truth_input,
    ):
        # Build prompt data dictionary
        prompt_data = {
            'human_input': human_input,
            'ai_response': ai_response,
            'ground_truth_input': ground_truth_input if use_reference else '',
            'eval_criteria': eval_criteria_text,
        }

        # Pick the base prompt depending on whether a reference response is used
        base_prompt = ATLA_PROMPT_WITH_REFERENCE if use_reference else ATLA_PROMPT

        # Format the prompt
        final_prompt = base_prompt.format(
            human_input=prompt_data['human_input'],
            ai_response=prompt_data['ai_response'],
            ground_truth_input=prompt_data['ground_truth_input'],
            eval_criteria=prompt_data['eval_criteria']
        )

        # Get response from Atla
        response = get_atla_response(
            model_name="AtlaAI/Selene-1-Mini-Llama-3.1-8B",
            prompt=final_prompt,
            max_tokens=500,
            temperature=0.01
        )

        # Parse the response into a score and a critique
        score, critique = atla_parse_model_response(response)

        return [
            score,
            critique,
            gr.update(value="Regenerate evaluation", variant="secondary", interactive=True),
            gr.update(value="🎲"),
        ]

    # Wraps submit_and_store so the first-game flag flips after the first submission.
    # (Currently unused: send_btn is wired directly to submit_and_store below.)
    def create_submit_handler():
        first_game = True

        def handler(*args):
            nonlocal first_game
            result = submit_and_store(*args)
            first_game = False  # Set to False after first submission
            return result

        return handler

    # Run the evaluation when the send button is clicked
    send_btn.click(
        fn=submit_and_store,
        inputs=[
            use_reference_toggle,
            eval_criteria_text,
            human_input,
            ai_response,
            ground_truth,
        ],
        outputs=[
            score,
            critique,
            send_btn,
            random_btn,
        ],
    )

    # Random example button handler
    random_btn.click(
        fn=populate_random_example,
        inputs=[use_reference_toggle],
        outputs=[
            human_input,
            ai_response,
            random_btn,
            score,
            critique,
            ground_truth,
        ]
    )

    # Input change handlers
    def handle_input_change():
        """Reset UI state when inputs are changed."""
        return [
            gr.update(value="Run evaluation", variant="primary"),  # send_btn
            gr.update(value="🎲", variant="secondary"),  # random_btn
        ]

    human_input.change(
        fn=handle_input_change,
        inputs=[],
        outputs=[send_btn, random_btn]
    )
    ai_response.change(
        fn=handle_input_change,
        inputs=[],
        outputs=[send_btn, random_btn]
    )

    generate_btn.click(
        fn=lambda msg: (
            generate_ai_response(msg)[0],  # Only take the response text
            gr.update(
                value="Generate AI Response",  # Keep the label
                interactive=False  # Disable the button
            )
        ),
        inputs=[human_input],
        outputs=[ai_response, generate_btn]
    )

    # Enable the generate button only when there is a non-empty human message
    human_input.change(
        fn=lambda x: gr.update(interactive=bool(x.strip())),
        inputs=[human_input],
        outputs=[generate_btn]
    )

    # Populate a random example on page load
    demo.load(
        fn=lambda: populate_random_example(None, False),  # Pass False for initial compatible_mode
        inputs=[],
        outputs=[
            human_input,
            ai_response,
            random_btn,
            score,
            critique,
            ground_truth,
        ]
    )

if __name__ == "__main__":
    demo.launch()