Spaces:
Running
Running
New models appear more often
Browse files
app.py
CHANGED
@@ -686,7 +686,7 @@ with gr.Blocks(theme="default", css=CSS_STYLES) as demo:
|
|
686 |
score3_description,
|
687 |
score4_description,
|
688 |
score5_description,
|
689 |
-
is_first_game,
|
690 |
):
|
691 |
# Build prompt data dictionary
|
692 |
prompt_data = {
|
@@ -705,21 +705,20 @@ with gr.Blocks(theme="default", css=CSS_STYLES) as demo:
|
|
705 |
active_models = [name for name, info in model_data.items()
|
706 |
if info.get("active", True)]
|
707 |
|
708 |
-
|
709 |
# Define new models list
|
710 |
-
new_models = ["Atla-8B-preview", "Flow-Judge-
|
711 |
|
712 |
if is_first_game:
|
713 |
-
# For the first game, ensure
|
714 |
-
|
715 |
-
other_models = [m for m in active_models if m !=
|
716 |
other_model = random.choice(other_models)
|
717 |
|
718 |
# Randomly assign new model to either position A or B
|
719 |
if random.random() < 0.5:
|
720 |
-
model_a, model_b =
|
721 |
else:
|
722 |
-
model_a, model_b = other_model,
|
723 |
else:
|
724 |
# For subsequent games, new models appear 40% of the time
|
725 |
if random.random() < 0.4:
|
@@ -758,12 +757,14 @@ with gr.Blocks(theme="default", css=CSS_STYLES) as demo:
|
|
758 |
is_atla_a = (model_data.get(model_a)['organization'] == 'Atla')
|
759 |
is_atla_b = (model_data.get(model_b)['organization'] == 'Atla')
|
760 |
is_flow_judge_a = (model_data.get(model_a)['organization'] == 'Flow AI')
|
761 |
-
is_flow_judge_b = (model_data.get(model_b)['organization'] == 'Flow AI')
|
|
|
|
|
762 |
|
763 |
if is_prometheus_a:
|
764 |
score_a_val, critique_a_val = prometheus_parse_model_response(response_a)
|
765 |
score_a_val = f"{score_a_val} / 5"
|
766 |
-
elif is_atla_a:
|
767 |
score_a_val, critique_a_val = atla_parse_model_response(response_a)
|
768 |
score_a_val = f"{score_a_val} / 5"
|
769 |
elif is_flow_judge_a:
|
@@ -776,7 +777,7 @@ with gr.Blocks(theme="default", css=CSS_STYLES) as demo:
|
|
776 |
if is_prometheus_b:
|
777 |
score_b_val, critique_b_val = prometheus_parse_model_response(response_b)
|
778 |
score_b_val = f"{score_b_val} / 5"
|
779 |
-
elif is_atla_b:
|
780 |
score_b_val, critique_b_val = atla_parse_model_response(response_b)
|
781 |
score_b_val = f"{score_b_val} / 5"
|
782 |
elif is_flow_judge_b:
|
|
|
686 |
score3_description,
|
687 |
score4_description,
|
688 |
score5_description,
|
689 |
+
is_first_game,
|
690 |
):
|
691 |
# Build prompt data dictionary
|
692 |
prompt_data = {
|
|
|
705 |
active_models = [name for name, info in model_data.items()
|
706 |
if info.get("active", True)]
|
707 |
|
|
|
708 |
# Define new models list
|
709 |
+
new_models = ["Atla-8B-preview", "Flow-Judge-0.1", "SFR-LLaMA-3.1-70B-Judge"] # add "Flow-Judge-1.0" once ready
|
710 |
|
711 |
if is_first_game:
|
712 |
+
# For the first game, ensure Salesforce model is one of the models to catch up on votes
|
713 |
+
salesforce_model = "SFR-LLaMA-3.1-70B-Judge"
|
714 |
+
other_models = [m for m in active_models if m != salesforce_model]
|
715 |
other_model = random.choice(other_models)
|
716 |
|
717 |
# Randomly assign new model to either position A or B
|
718 |
if random.random() < 0.5:
|
719 |
+
model_a, model_b = salesforce_model, other_model
|
720 |
else:
|
721 |
+
model_a, model_b = other_model, salesforce_model
|
722 |
else:
|
723 |
# For subsequent games, new models appear 40% of the time
|
724 |
if random.random() < 0.4:
|
|
|
757 |
is_atla_a = (model_data.get(model_a)['organization'] == 'Atla')
|
758 |
is_atla_b = (model_data.get(model_b)['organization'] == 'Atla')
|
759 |
is_flow_judge_a = (model_data.get(model_a)['organization'] == 'Flow AI')
|
760 |
+
is_flow_judge_b = (model_data.get(model_b)['organization'] == 'Flow AI')
|
761 |
+
is_salesforce_a = (model_data.get(model_a)['organization'] == 'Salesforce')
|
762 |
+
is_salesforce_b = (model_data.get(model_b)['organization'] == 'Salesforce')
|
763 |
|
764 |
if is_prometheus_a:
|
765 |
score_a_val, critique_a_val = prometheus_parse_model_response(response_a)
|
766 |
score_a_val = f"{score_a_val} / 5"
|
767 |
+
elif is_atla_a or is_salesforce_a: # Same parser for Atla and Salesforce
|
768 |
score_a_val, critique_a_val = atla_parse_model_response(response_a)
|
769 |
score_a_val = f"{score_a_val} / 5"
|
770 |
elif is_flow_judge_a:
|
|
|
777 |
if is_prometheus_b:
|
778 |
score_b_val, critique_b_val = prometheus_parse_model_response(response_b)
|
779 |
score_b_val = f"{score_b_val} / 5"
|
780 |
+
elif is_atla_b or is_salesforce_b: # Same parser for Atla and Salesforce
|
781 |
score_b_val, critique_b_val = atla_parse_model_response(response_b)
|
782 |
score_b_val = f"{score_b_val} / 5"
|
783 |
elif is_flow_judge_b:
|