ready for demo
Browse files- .gradio/certificate.pem +31 -0
- TestQuesitons.txt +0 -3
- app.py +452 -84
- static/testquestions.txt +10 -0
- testquestions.txt +10 -0
.gradio/certificate.pem
ADDED
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
-----BEGIN CERTIFICATE-----
|
2 |
+
MIIFazCCA1OgAwIBAgIRAIIQz7DSQONZRGPgu2OCiwAwDQYJKoZIhvcNAQELBQAw
|
3 |
+
TzELMAkGA1UEBhMCVVMxKTAnBgNVBAoTIEludGVybmV0IFNlY3VyaXR5IFJlc2Vh
|
4 |
+
cmNoIEdyb3VwMRUwEwYDVQQDEwxJU1JHIFJvb3QgWDEwHhcNMTUwNjA0MTEwNDM4
|
5 |
+
WhcNMzUwNjA0MTEwNDM4WjBPMQswCQYDVQQGEwJVUzEpMCcGA1UEChMgSW50ZXJu
|
6 |
+
ZXQgU2VjdXJpdHkgUmVzZWFyY2ggR3JvdXAxFTATBgNVBAMTDElTUkcgUm9vdCBY
|
7 |
+
MTCCAiIwDQYJKoZIhvcNAQEBBQADggIPADCCAgoCggIBAK3oJHP0FDfzm54rVygc
|
8 |
+
h77ct984kIxuPOZXoHj3dcKi/vVqbvYATyjb3miGbESTtrFj/RQSa78f0uoxmyF+
|
9 |
+
0TM8ukj13Xnfs7j/EvEhmkvBioZxaUpmZmyPfjxwv60pIgbz5MDmgK7iS4+3mX6U
|
10 |
+
A5/TR5d8mUgjU+g4rk8Kb4Mu0UlXjIB0ttov0DiNewNwIRt18jA8+o+u3dpjq+sW
|
11 |
+
T8KOEUt+zwvo/7V3LvSye0rgTBIlDHCNAymg4VMk7BPZ7hm/ELNKjD+Jo2FR3qyH
|
12 |
+
B5T0Y3HsLuJvW5iB4YlcNHlsdu87kGJ55tukmi8mxdAQ4Q7e2RCOFvu396j3x+UC
|
13 |
+
B5iPNgiV5+I3lg02dZ77DnKxHZu8A/lJBdiB3QW0KtZB6awBdpUKD9jf1b0SHzUv
|
14 |
+
KBds0pjBqAlkd25HN7rOrFleaJ1/ctaJxQZBKT5ZPt0m9STJEadao0xAH0ahmbWn
|
15 |
+
OlFuhjuefXKnEgV4We0+UXgVCwOPjdAvBbI+e0ocS3MFEvzG6uBQE3xDk3SzynTn
|
16 |
+
jh8BCNAw1FtxNrQHusEwMFxIt4I7mKZ9YIqioymCzLq9gwQbooMDQaHWBfEbwrbw
|
17 |
+
qHyGO0aoSCqI3Haadr8faqU9GY/rOPNk3sgrDQoo//fb4hVC1CLQJ13hef4Y53CI
|
18 |
+
rU7m2Ys6xt0nUW7/vGT1M0NPAgMBAAGjQjBAMA4GA1UdDwEB/wQEAwIBBjAPBgNV
|
19 |
+
HRMBAf8EBTADAQH/MB0GA1UdDgQWBBR5tFnme7bl5AFzgAiIyBpY9umbbjANBgkq
|
20 |
+
hkiG9w0BAQsFAAOCAgEAVR9YqbyyqFDQDLHYGmkgJykIrGF1XIpu+ILlaS/V9lZL
|
21 |
+
ubhzEFnTIZd+50xx+7LSYK05qAvqFyFWhfFQDlnrzuBZ6brJFe+GnY+EgPbk6ZGQ
|
22 |
+
3BebYhtF8GaV0nxvwuo77x/Py9auJ/GpsMiu/X1+mvoiBOv/2X/qkSsisRcOj/KK
|
23 |
+
NFtY2PwByVS5uCbMiogziUwthDyC3+6WVwW6LLv3xLfHTjuCvjHIInNzktHCgKQ5
|
24 |
+
ORAzI4JMPJ+GslWYHb4phowim57iaztXOoJwTdwJx4nLCgdNbOhdjsnvzqvHu7Ur
|
25 |
+
TkXWStAmzOVyyghqpZXjFaH3pO3JLF+l+/+sKAIuvtd7u+Nxe5AW0wdeRlN8NwdC
|
26 |
+
jNPElpzVmbUq4JUagEiuTDkHzsxHpFKVK7q4+63SM1N95R1NbdWhscdCb+ZAJzVc
|
27 |
+
oyi3B43njTOQ5yOf+1CceWxG1bQVs5ZufpsMljq4Ui0/1lvh+wjChP4kqKOJ2qxq
|
28 |
+
4RgqsahDYVvTH9w7jXbyLeiNdd8XM2w9U/t7y0Ff/9yi0GE44Za4rF2LN9d11TPA
|
29 |
+
mRGunUHBcnWEvgJBQl9nJEiU0Zsnvgc/ubhPgXRR4Xq37Z0j4r7g1SgEEzwxA57d
|
30 |
+
emyPxgcYxn/eR44/KJ4EBs+lVDR3veyJm+kXQ99b21/+jh5Xos1AnX5iItreGCc=
|
31 |
+
-----END CERTIFICATE-----
|
TestQuesitons.txt
DELETED
@@ -1,3 +0,0 @@
|
|
1 |
-
How many states are in america?
|
2 |
-
|
3 |
-
How much wood could a woodchuck chuck if a woodchuck could chuck wood?
|
|
|
|
|
|
|
|
app.py
CHANGED
@@ -7,26 +7,37 @@ import json
|
|
7 |
from dotenv import load_dotenv
|
8 |
import threading
|
9 |
from queue import Queue, Empty
|
|
|
10 |
|
11 |
# Load environment variables
|
12 |
load_dotenv()
|
13 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
14 |
MAX_QUESTIONS = 10 # Maximum number of questions to support
|
15 |
|
16 |
######
|
17 |
-
#
|
18 |
#
|
19 |
MODELS = [
|
20 |
-
|
21 |
-
"anthropic/claude-3-
|
22 |
-
"
|
23 |
-
"
|
24 |
-
"
|
25 |
-
"
|
26 |
-
"openai/gpt-
|
|
|
|
|
|
|
|
|
|
|
27 |
]
|
28 |
-
#
|
29 |
-
######
|
30 |
|
31 |
# Get configuration from environment variables
|
32 |
OPENROUTER_API_KEY = os.getenv('OPENROUTER_API_KEY')
|
@@ -44,7 +55,7 @@ def get_response(question, model):
|
|
44 |
}
|
45 |
|
46 |
data = {
|
47 |
-
"model": model,
|
48 |
"messages": [
|
49 |
{"role": "user", "content": question}
|
50 |
],
|
@@ -92,36 +103,158 @@ def read_questions(file_obj):
|
|
92 |
return questions
|
93 |
|
94 |
with gr.Blocks(title="Vibesmark Test Suite") as demo:
|
95 |
-
gr.Markdown("# Vibesmark Test Suite
|
96 |
|
97 |
# Store current state
|
98 |
-
state = gr.State({
|
|
|
|
|
|
|
|
|
|
|
|
|
99 |
|
100 |
-
|
101 |
with gr.Row():
|
102 |
-
|
103 |
-
|
104 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
105 |
|
106 |
-
with gr.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
107 |
question_display = gr.Markdown("### Upload a file to begin")
|
108 |
with gr.Row():
|
109 |
with gr.Column():
|
110 |
-
|
111 |
-
model1_display = gr.Markdown("")
|
112 |
-
response1_display = gr.Textbox(label="Response 1", interactive=False, lines=4)
|
113 |
with gr.Column():
|
114 |
-
|
115 |
-
|
116 |
-
|
117 |
-
|
118 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
119 |
|
120 |
def process_file(file, state):
|
121 |
if file is None:
|
122 |
raise gr.Error("Please upload a file first.")
|
123 |
questions = read_questions(file)
|
124 |
-
new_state = {
|
|
|
|
|
|
|
|
|
|
|
|
|
125 |
|
126 |
# Return outputs in order matching the outputs list in the event handler
|
127 |
return [
|
@@ -129,14 +262,22 @@ with gr.Blocks(title="Vibesmark Test Suite") as demo:
|
|
129 |
f"Question 1 / {len(questions)}", # question_counter
|
130 |
gr.update(interactive=False), # prev_btn
|
131 |
gr.update(interactive=len(questions) > 1), # next_btn
|
132 |
-
gr.update(value=""), # model1_display
|
133 |
gr.update(value=""), # response1_display
|
134 |
-
gr.update(value=""), # model2_display
|
135 |
gr.update(value=""), # response2_display
|
136 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
137 |
]
|
138 |
|
139 |
-
def navigate_question(direction, state):
|
|
|
|
|
|
|
|
|
140 |
questions = state["questions"]
|
141 |
current_index = state["current_index"]
|
142 |
|
@@ -144,21 +285,94 @@ with gr.Blocks(title="Vibesmark Test Suite") as demo:
|
|
144 |
current_index += 1
|
145 |
elif direction == "prev" and current_index > 0:
|
146 |
current_index -= 1
|
|
|
|
|
147 |
|
148 |
new_state = state.copy()
|
149 |
new_state["current_index"] = current_index
|
150 |
|
151 |
-
#
|
152 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
153 |
f"### Question {current_index + 1}:\n{questions[current_index]}", # question_display
|
154 |
f"Question {current_index + 1} / {len(questions)}", # question_counter
|
155 |
gr.update(interactive=current_index > 0), # prev_btn
|
156 |
gr.update(interactive=current_index < len(questions) - 1), # next_btn
|
157 |
-
|
158 |
-
|
159 |
-
gr.update(
|
160 |
-
gr.update(
|
161 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
162 |
]
|
163 |
|
164 |
def get_responses_in_parallel(question, model1, model2):
|
@@ -213,37 +427,81 @@ with gr.Blocks(title="Vibesmark Test Suite") as demo:
|
|
213 |
t1.join()
|
214 |
t2.join()
|
215 |
|
216 |
-
def
|
217 |
-
"""
|
218 |
-
|
219 |
-
|
220 |
-
|
221 |
-
|
222 |
-
|
223 |
-
|
224 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
225 |
|
226 |
-
|
227 |
-
|
228 |
-
|
229 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
230 |
|
231 |
-
|
232 |
-
|
233 |
-
|
234 |
-
|
235 |
-
|
236 |
-
|
|
|
|
|
|
|
|
|
237 |
]
|
238 |
|
239 |
-
|
240 |
-
|
241 |
-
|
242 |
-
gr.update(value=f"**{model_1}**"),
|
243 |
-
gr.update(value=partial1),
|
244 |
-
gr.update(value=f"**{model_2}**"),
|
245 |
-
gr.update(value=partial2)
|
246 |
-
]
|
247 |
|
248 |
# Connect events
|
249 |
file_input.change(
|
@@ -254,55 +512,135 @@ with gr.Blocks(title="Vibesmark Test Suite") as demo:
|
|
254 |
question_counter,
|
255 |
prev_btn,
|
256 |
next_btn,
|
257 |
-
model1_display,
|
258 |
response1_display,
|
259 |
-
model2_display,
|
260 |
response2_display,
|
261 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
262 |
]
|
263 |
)
|
264 |
|
265 |
prev_btn.click(
|
266 |
-
fn=
|
267 |
-
inputs=[
|
|
|
|
|
|
|
|
|
|
|
268 |
outputs=[
|
269 |
question_display,
|
270 |
question_counter,
|
271 |
prev_btn,
|
272 |
next_btn,
|
273 |
-
model1_display,
|
274 |
response1_display,
|
275 |
-
model2_display,
|
276 |
response2_display,
|
277 |
-
|
|
|
|
|
|
|
|
|
278 |
]
|
279 |
)
|
280 |
|
281 |
next_btn.click(
|
282 |
-
fn=
|
283 |
-
inputs=[
|
|
|
|
|
|
|
|
|
|
|
284 |
outputs=[
|
285 |
question_display,
|
286 |
question_counter,
|
287 |
prev_btn,
|
288 |
next_btn,
|
289 |
-
model1_display,
|
290 |
response1_display,
|
291 |
-
model2_display,
|
292 |
response2_display,
|
293 |
-
|
|
|
|
|
|
|
|
|
294 |
]
|
295 |
)
|
296 |
|
297 |
-
|
298 |
-
fn=
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
299 |
inputs=[state],
|
300 |
outputs=[
|
301 |
-
|
|
|
|
|
|
|
|
|
302 |
response1_display,
|
303 |
-
|
304 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
305 |
]
|
|
|
|
|
|
|
306 |
)
|
307 |
|
308 |
# Add footer with subtle styling
|
@@ -313,4 +651,34 @@ demo.queue()
|
|
313 |
|
314 |
# Launch with the appropriate host setting for deployment
|
315 |
if __name__ == "__main__":
|
316 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
7 |
from dotenv import load_dotenv
|
8 |
import threading
|
9 |
from queue import Queue, Empty
|
10 |
+
import shutil
|
11 |
|
12 |
# Load environment variables
|
13 |
load_dotenv()
|
14 |
|
15 |
+
# Create static directory if it doesn't exist
|
16 |
+
os.makedirs('static', exist_ok=True)
|
17 |
+
|
18 |
+
# Copy testquestions.txt to static directory if it exists
|
19 |
+
if os.path.exists('testquestions.txt'):
|
20 |
+
shutil.copy2('testquestions.txt', 'static/testquestions.txt')
|
21 |
+
|
22 |
MAX_QUESTIONS = 10 # Maximum number of questions to support
|
23 |
|
24 |
######
|
25 |
+
# Models configuration
|
26 |
#
|
27 |
MODELS = [
|
28 |
+
# Standard Language Models
|
29 |
+
{"display_name": "Claude 3 Opus", "model_id": "anthropic/claude-3-opus-20240229"},
|
30 |
+
{"display_name": "Claude 3.5 Sonnet", "model_id": "anthropic/claude-3.5-sonnet"},
|
31 |
+
{"display_name": "Gemini Pro", "model_id": "google/gemini-pro"},
|
32 |
+
{"display_name": "Mistral Medium", "model_id": "mistralai/mistral-medium"},
|
33 |
+
{"display_name": "Claude 2.1", "model_id": "anthropic/claude-2.1"},
|
34 |
+
{"display_name": "GPT-4 Turbo", "model_id": "openai/gpt-4-turbo-preview"},
|
35 |
+
{"display_name": "GPT-3.5 Turbo", "model_id": "openai/gpt-3.5-turbo"},
|
36 |
+
# Reasoning-specialized Models
|
37 |
+
{"display_name": "Reasoner: O1-Mini", "model_id": "openai/o1-mini"},
|
38 |
+
{"display_name": "Reasoner: O1 Preview", "model_id": "openai/o1-preview"},
|
39 |
+
{"display_name": "Reasoner: DeepSeek R1", "model_id": "deepseek/deepseek-r1"}
|
40 |
]
|
|
|
|
|
41 |
|
42 |
# Get configuration from environment variables
|
43 |
OPENROUTER_API_KEY = os.getenv('OPENROUTER_API_KEY')
|
|
|
55 |
}
|
56 |
|
57 |
data = {
|
58 |
+
"model": model, # model is now the direct model_id
|
59 |
"messages": [
|
60 |
{"role": "user", "content": question}
|
61 |
],
|
|
|
103 |
return questions
|
104 |
|
105 |
with gr.Blocks(title="Vibesmark Test Suite") as demo:
|
106 |
+
gr.Markdown("# Vibesmark Test Suite")
|
107 |
|
108 |
# Store current state
|
109 |
+
state = gr.State({
|
110 |
+
"questions": [],
|
111 |
+
"current_index": 0,
|
112 |
+
"preferences": {}, # Store preferences for each question
|
113 |
+
"current_model_order": {}, # Track which model is shown on which side
|
114 |
+
"test_started": False # Track if test has started
|
115 |
+
})
|
116 |
|
117 |
+
# Move model selection to the top
|
118 |
with gr.Row():
|
119 |
+
with gr.Column():
|
120 |
+
model1_selector = gr.Dropdown(
|
121 |
+
choices={model["model_id"]: model["display_name"] for model in MODELS},
|
122 |
+
label="Select First Model",
|
123 |
+
value="anthropic/claude-3.5-sonnet",
|
124 |
+
type="value",
|
125 |
+
allow_custom_value=False
|
126 |
+
)
|
127 |
+
with gr.Column():
|
128 |
+
model2_selector = gr.Dropdown(
|
129 |
+
choices={model["model_id"]: model["display_name"] for model in MODELS},
|
130 |
+
label="Select Second Model",
|
131 |
+
value="google/gemini-pro",
|
132 |
+
type="value",
|
133 |
+
allow_custom_value=False
|
134 |
+
)
|
135 |
|
136 |
+
with gr.Row():
|
137 |
+
with gr.Column():
|
138 |
+
gr.Markdown("Upload a `.txt` file with **one question per line**.")
|
139 |
+
file_input = gr.File(label="Upload your questions (.txt)")
|
140 |
+
with gr.Column():
|
141 |
+
gr.Markdown("Download example questions:")
|
142 |
+
gr.HTML('<a href="testquestions.txt" download>Download testquestions.txt</a>')
|
143 |
+
|
144 |
+
with gr.Row():
|
145 |
+
start_btn = gr.Button("Start Test", variant="primary")
|
146 |
+
finish_btn = gr.Button("Finish & Show Results", variant="secondary", interactive=False)
|
147 |
+
results_display = gr.Markdown("Click 'Finish & Show Results' when you're done to see the summary", visible=True)
|
148 |
+
|
149 |
+
# Add confirmation dialog
|
150 |
+
with gr.Row(visible=False) as confirm_dialog:
|
151 |
+
gr.Markdown("Are you sure you want to finish the test? This will reset all progress.")
|
152 |
+
with gr.Row():
|
153 |
+
confirm_btn = gr.Button("Yes, Finish Test", variant="primary")
|
154 |
+
cancel_btn = gr.Button("Cancel", variant="secondary")
|
155 |
+
|
156 |
+
with gr.Group(visible=False) as question_group:
|
157 |
question_display = gr.Markdown("### Upload a file to begin")
|
158 |
with gr.Row():
|
159 |
with gr.Column():
|
160 |
+
response1_display = gr.Textbox(label="Response A", interactive=False, lines=8)
|
|
|
|
|
161 |
with gr.Column():
|
162 |
+
response2_display = gr.Textbox(label="Response B", interactive=False, lines=8)
|
163 |
+
|
164 |
+
# Add preference selection buttons
|
165 |
+
with gr.Row():
|
166 |
+
prefer_a_btn = gr.Button("Prefer Response A", interactive=False, variant="secondary")
|
167 |
+
preference_display = gr.Markdown("Make your selection", container=True)
|
168 |
+
prefer_b_btn = gr.Button("Prefer Response B", interactive=False, variant="secondary")
|
169 |
+
|
170 |
+
# Add vertical spacing
|
171 |
+
gr.Row(height=30)
|
172 |
+
|
173 |
+
# Move navigation to bottom of question group
|
174 |
+
with gr.Row():
|
175 |
+
prev_btn = gr.Button("← Previous", interactive=False)
|
176 |
+
question_counter = gr.Markdown("Question 0 / 0")
|
177 |
+
next_btn = gr.Button("Next →", interactive=False)
|
178 |
+
|
179 |
+
def start_test(state, model_1, model_2):
    """Begin the comparison run and stream answers for the current question.

    Generator event handler. Every yielded list lines up, in order, with:
    state, model1_selector, model2_selector, start_btn, finish_btn,
    response1_display, response2_display, prefer_a_btn, prefer_b_btn,
    preference_display, question_group.

    Args:
        state: Session dict with "questions", "current_index",
            "preferences", "current_model_order" and "test_started".
        model_1: Value of the first model dropdown.
        model_2: Value of the second model dropdown.

    Raises:
        gr.Error: If no questions are loaded, or both dropdowns hold the
            same model.
    """
    if not state["questions"]:
        raise gr.Error("Please upload a file first.")
    if model_1 == model_2:
        raise gr.Error("Please select different models for comparison.")

    # Shallow copy: the nested dicts are still shared with the incoming state.
    updated = state.copy()
    updated["test_started"] = True
    idx = state["current_index"]
    prompt = state["questions"][idx]

    def preference_text(source):
        # Text for preference_display, based on any choice already stored.
        picked = source["preferences"].get(idx, None)
        if picked is None:
            return "Make your selection"
        return f"You preferred Response {picked}"

    def frame(text_a, text_b, pref_text):
        # One full set of output values for the handler.
        return [
            updated,
            gr.update(interactive=False),  # model1_selector
            gr.update(interactive=False),  # model2_selector
            gr.update(interactive=False),  # start_btn
            gr.update(interactive=True),   # finish_btn
            text_a,                        # response1_display
            text_b,                        # response2_display
            gr.update(interactive=True),   # prefer_a_btn
            gr.update(interactive=True),   # prefer_b_btn
            pref_text,                     # preference_display
            gr.update(visible=True),       # question_group
        ]

    # Lock the controls and clear both response boxes before any fetching.
    yield frame("", "", preference_text(state))

    # Coin flip decides which model is shown as Response A.
    if random.choice([True, False]):
        model_a, model_b = model_1, model_2
    else:
        model_a, model_b = model_2, model_1
    updated["current_model_order"][idx] = {"A": model_a, "B": model_b}

    # Re-read the stored preference on every chunk so the label follows
    # a choice made while the answers are still streaming.
    for chunk_a, chunk_b in get_responses_in_parallel(prompt, model_a, model_b):
        yield frame(chunk_a, chunk_b, preference_text(updated))
|
246 |
|
247 |
def process_file(file, state):
|
248 |
if file is None:
|
249 |
raise gr.Error("Please upload a file first.")
|
250 |
questions = read_questions(file)
|
251 |
+
new_state = {
|
252 |
+
"questions": questions,
|
253 |
+
"current_index": 0,
|
254 |
+
"preferences": {},
|
255 |
+
"current_model_order": {},
|
256 |
+
"test_started": False
|
257 |
+
}
|
258 |
|
259 |
# Return outputs in order matching the outputs list in the event handler
|
260 |
return [
|
|
|
262 |
f"Question 1 / {len(questions)}", # question_counter
|
263 |
gr.update(interactive=False), # prev_btn
|
264 |
gr.update(interactive=len(questions) > 1), # next_btn
|
|
|
265 |
gr.update(value=""), # response1_display
|
|
|
266 |
gr.update(value=""), # response2_display
|
267 |
+
gr.update(interactive=False), # prefer_a_btn
|
268 |
+
gr.update(interactive=False), # prefer_b_btn
|
269 |
+
"Make your selection", # preference_display
|
270 |
+
new_state, # state
|
271 |
+
gr.update(interactive=True), # start_btn
|
272 |
+
gr.update(interactive=False), # finish_btn
|
273 |
+
gr.update(visible=False) # question_group
|
274 |
]
|
275 |
|
276 |
+
def navigate_question(direction, state, model_1, model_2):
|
277 |
+
"""Navigate to next/prev question and start fetching responses"""
|
278 |
+
if not state["test_started"]:
|
279 |
+
raise gr.Error("Please start the test first")
|
280 |
+
|
281 |
questions = state["questions"]
|
282 |
current_index = state["current_index"]
|
283 |
|
|
|
285 |
current_index += 1
|
286 |
elif direction == "prev" and current_index > 0:
|
287 |
current_index -= 1
|
288 |
+
else:
|
289 |
+
raise gr.Error("No more questions in that direction")
|
290 |
|
291 |
new_state = state.copy()
|
292 |
new_state["current_index"] = current_index
|
293 |
|
294 |
+
# Get existing preference for this question if any
|
295 |
+
current_pref = state["preferences"].get(current_index, None)
|
296 |
+
pref_display = "Make your selection"
|
297 |
+
if current_pref is not None:
|
298 |
+
pref_display = f"You preferred Response {current_pref}"
|
299 |
+
|
300 |
+
# First yield to update the question display and clear responses
|
301 |
+
yield [
|
302 |
f"### Question {current_index + 1}:\n{questions[current_index]}", # question_display
|
303 |
f"Question {current_index + 1} / {len(questions)}", # question_counter
|
304 |
gr.update(interactive=current_index > 0), # prev_btn
|
305 |
gr.update(interactive=current_index < len(questions) - 1), # next_btn
|
306 |
+
"", # response1_display
|
307 |
+
"", # response2_display
|
308 |
+
gr.update(interactive=True), # prefer_a_btn - Enable immediately
|
309 |
+
gr.update(interactive=True), # prefer_b_btn - Enable immediately
|
310 |
+
pref_display, # preference_display
|
311 |
+
new_state, # state
|
312 |
+
gr.update(visible=True) # question_group
|
313 |
+
]
|
314 |
+
|
315 |
+
# Now start fetching responses
|
316 |
+
current_question = questions[current_index]
|
317 |
+
|
318 |
+
# Randomly decide which model goes on which side
|
319 |
+
if random.choice([True, False]):
|
320 |
+
model_a, model_b = model_1, model_2
|
321 |
+
else:
|
322 |
+
model_a, model_b = model_2, model_1
|
323 |
+
|
324 |
+
# Store the model order in state
|
325 |
+
new_state["current_model_order"][current_index] = {
|
326 |
+
"A": model_a,
|
327 |
+
"B": model_b
|
328 |
+
}
|
329 |
+
|
330 |
+
# Stream both model responses in parallel
|
331 |
+
for partial1, partial2 in get_responses_in_parallel(current_question, model_a, model_b):
|
332 |
+
# Check current preference again in case it changed during streaming
|
333 |
+
current_pref = new_state["preferences"].get(current_index, None)
|
334 |
+
pref_display = "Make your selection"
|
335 |
+
if current_pref is not None:
|
336 |
+
pref_display = f"You preferred Response {current_pref}"
|
337 |
+
|
338 |
+
yield [
|
339 |
+
f"### Question {current_index + 1}:\n{questions[current_index]}", # question_display
|
340 |
+
f"Question {current_index + 1} / {len(questions)}", # question_counter
|
341 |
+
gr.update(interactive=current_index > 0), # prev_btn
|
342 |
+
gr.update(interactive=current_index < len(questions) - 1), # next_btn
|
343 |
+
partial1, # response1_display
|
344 |
+
partial2, # response2_display
|
345 |
+
gr.update(interactive=True), # prefer_a_btn - Keep enabled during streaming
|
346 |
+
gr.update(interactive=True), # prefer_b_btn - Keep enabled during streaming
|
347 |
+
pref_display, # preference_display - Maintain current preference
|
348 |
+
new_state, # state
|
349 |
+
gr.update(visible=True) # question_group
|
350 |
+
]
|
351 |
+
|
352 |
+
def record_preference(choice, state):
    """Store the user's A/B pick for the current question.

    Args:
        choice: "A" or "B"; anything other than "A" is treated as B.
        state: Session dict with "current_index", "preferences" and
            "current_model_order".

    Returns:
        A two-item list: the updated state and the message for
        preference_display.
    """
    idx = state["current_index"]

    # Shallow copy on purpose: "preferences" stays the same dict object as in
    # the incoming state, so the write below is visible through both.
    updated = state.copy()
    updated["preferences"][idx] = choice

    # Which model sat behind each label for this question.
    sides = state["current_model_order"].get(idx, {})
    behind_a = sides.get("A", "Unknown")
    behind_b = sides.get("B", "Unknown")

    winner, loser = (behind_a, behind_b) if choice == "A" else (behind_b, behind_a)
    return [updated, f"You preferred {winner} over {loser}"]
|
377 |
|
378 |
def get_responses_in_parallel(question, model1, model2):
|
|
|
427 |
t1.join()
|
428 |
t2.join()
|
429 |
|
430 |
+
def reset_interface():
    """Build the values that return every widget to its pre-upload state.

    Returns:
        A 15-item list, in this order: model1_selector, model2_selector,
        start_btn, finish_btn, response1_display, response2_display,
        prefer_a_btn, prefer_b_btn, preference_display, question_display,
        question_counter, prev_btn, next_btn, a fresh state dict, and the
        question_group visibility update.
    """
    blank_state = {
        "questions": [],
        "current_index": 0,
        "preferences": {},
        "current_model_order": {},
        "test_started": False
    }

    values = []
    # model1_selector, model2_selector, start_btn become usable again
    values.extend(gr.update(interactive=True) for _ in range(3))
    values.append(gr.update(interactive=False))  # finish_btn
    # response1_display, response2_display are emptied
    values.extend(gr.update(value="") for _ in range(2))
    # prefer_a_btn, prefer_b_btn are disabled
    values.extend(gr.update(interactive=False) for _ in range(2))
    values.append("Make your selection")  # preference_display
    values.append(gr.update(value="### Upload a file to begin"))  # question_display
    values.append(gr.update(value="Question 0 / 0"))  # question_counter
    # prev_btn, next_btn are disabled
    values.extend(gr.update(interactive=False) for _ in range(2))
    values.append(blank_state)  # state
    values.append(gr.update(visible=False))  # question_group
    return values
|
455 |
|
456 |
+
def generate_results_summary(state):
    """Summarise which model won which questions, then reset the interface.

    Args:
        state: Session dict; reads "preferences" (question index -> "A"/"B")
            and "current_model_order" (question index -> {"A": id, "B": id}).

    Returns:
        A list whose first item is the Markdown summary (or a notice when
        nothing was recorded), followed by the reset values for the other
        widgets and the state.
    """
    # reset_interface() ends with the question_group visibility update, but
    # the confirm button wires this handler to results_display plus the 14
    # items before it. Drop that trailing item (and do not append another)
    # so the number of returned values matches the outputs.
    reset_values = reset_interface()[:-1]

    if not state["preferences"]:
        return ["No preferences recorded yet."] + reset_values

    # Resolve display names once instead of scanning MODELS per preference.
    display_names = {m["model_id"]: m["display_name"] for m in MODELS}

    wins = {}  # display name -> 1-based question numbers won
    for q_idx, choice in state["preferences"].items():
        model_order = state["current_model_order"].get(q_idx, {})
        if not model_order:
            # No recorded A/B assignment, so the pick cannot be attributed.
            continue
        preferred_model = model_order["A"] if choice == "A" else model_order["B"]
        label = display_names.get(preferred_model, preferred_model)
        wins.setdefault(label, []).append(q_idx + 1)

    # Sort so questions are listed numerically, not in click order.
    summary_parts = [
        f"**{label}** won questions {', '.join(str(n) for n in sorted(numbers))}"
        for label, numbers in wins.items()
    ]
    summary = "### Results Summary\n" + "\n\n".join(summary_parts)

    return [summary] + reset_values
|
489 |
|
490 |
+
def show_confirm_dialog(state):
    """Open the finish-confirmation row when a test is actually running.

    Args:
        state: Session dict; reads "test_started" and "questions".

    Returns:
        A two-item list matching the handler's outputs: the confirm_dialog
        visibility update and the value for results_display.
    """
    if not state["test_started"] or not state["questions"]:
        # results_display is a single Markdown component, so it gets the
        # plain message rather than a nested list of reset values.
        return [
            gr.update(visible=False),  # confirm_dialog
            "No test in progress to finish.",  # results_display
        ]
    return [
        gr.update(visible=True),  # confirm_dialog
        gr.update(),  # results_display: leave the current text untouched
    ]
|
501 |
|
502 |
+
def hide_confirm_dialog():
    """Return the update that hides the finish-confirmation row."""
    hidden = gr.update(visible=False)
    return hidden
|
|
|
|
|
|
|
|
|
|
|
505 |
|
506 |
# Connect events
|
507 |
file_input.change(
|
|
|
512 |
question_counter,
|
513 |
prev_btn,
|
514 |
next_btn,
|
|
|
515 |
response1_display,
|
|
|
516 |
response2_display,
|
517 |
+
prefer_a_btn,
|
518 |
+
prefer_b_btn,
|
519 |
+
preference_display,
|
520 |
+
state,
|
521 |
+
start_btn,
|
522 |
+
finish_btn,
|
523 |
+
question_group
|
524 |
]
|
525 |
)
|
526 |
|
527 |
prev_btn.click(
|
528 |
+
fn=navigate_question,
|
529 |
+
inputs=[
|
530 |
+
gr.State("prev"),
|
531 |
+
state,
|
532 |
+
model1_selector,
|
533 |
+
model2_selector
|
534 |
+
],
|
535 |
outputs=[
|
536 |
question_display,
|
537 |
question_counter,
|
538 |
prev_btn,
|
539 |
next_btn,
|
|
|
540 |
response1_display,
|
|
|
541 |
response2_display,
|
542 |
+
prefer_a_btn,
|
543 |
+
prefer_b_btn,
|
544 |
+
preference_display,
|
545 |
+
state,
|
546 |
+
question_group
|
547 |
]
|
548 |
)
|
549 |
|
550 |
next_btn.click(
|
551 |
+
fn=navigate_question,
|
552 |
+
inputs=[
|
553 |
+
gr.State("next"),
|
554 |
+
state,
|
555 |
+
model1_selector,
|
556 |
+
model2_selector
|
557 |
+
],
|
558 |
outputs=[
|
559 |
question_display,
|
560 |
question_counter,
|
561 |
prev_btn,
|
562 |
next_btn,
|
|
|
563 |
response1_display,
|
|
|
564 |
response2_display,
|
565 |
+
prefer_a_btn,
|
566 |
+
prefer_b_btn,
|
567 |
+
preference_display,
|
568 |
+
state,
|
569 |
+
question_group
|
570 |
]
|
571 |
)
|
572 |
|
573 |
+
start_btn.click(
|
574 |
+
fn=start_test,
|
575 |
+
inputs=[state, model1_selector, model2_selector],
|
576 |
+
outputs=[
|
577 |
+
state,
|
578 |
+
model1_selector,
|
579 |
+
model2_selector,
|
580 |
+
start_btn,
|
581 |
+
finish_btn,
|
582 |
+
response1_display,
|
583 |
+
response2_display,
|
584 |
+
prefer_a_btn,
|
585 |
+
prefer_b_btn,
|
586 |
+
preference_display,
|
587 |
+
question_group
|
588 |
+
]
|
589 |
+
)
|
590 |
+
|
591 |
+
# Connect preference buttons
|
592 |
+
prefer_a_btn.click(
|
593 |
+
fn=lambda state: record_preference("A", state),
|
594 |
+
inputs=[state],
|
595 |
+
outputs=[state, preference_display]
|
596 |
+
)
|
597 |
+
|
598 |
+
prefer_b_btn.click(
|
599 |
+
fn=lambda state: record_preference("B", state),
|
600 |
+
inputs=[state],
|
601 |
+
outputs=[state, preference_display]
|
602 |
+
)
|
603 |
+
|
604 |
+
# Connect results button to show confirmation first
|
605 |
+
finish_btn.click(
|
606 |
+
fn=show_confirm_dialog,
|
607 |
+
inputs=[state],
|
608 |
+
outputs=[
|
609 |
+
confirm_dialog,
|
610 |
+
results_display
|
611 |
+
]
|
612 |
+
)
|
613 |
+
|
614 |
+
# Connect cancel button
|
615 |
+
cancel_btn.click(
|
616 |
+
fn=hide_confirm_dialog,
|
617 |
+
outputs=[confirm_dialog]
|
618 |
+
)
|
619 |
+
|
620 |
+
# Connect confirm button to actual finish action
|
621 |
+
confirm_btn.click(
|
622 |
+
fn=generate_results_summary,
|
623 |
inputs=[state],
|
624 |
outputs=[
|
625 |
+
results_display,
|
626 |
+
model1_selector,
|
627 |
+
model2_selector,
|
628 |
+
start_btn,
|
629 |
+
finish_btn,
|
630 |
response1_display,
|
631 |
+
response2_display,
|
632 |
+
prefer_a_btn,
|
633 |
+
prefer_b_btn,
|
634 |
+
preference_display,
|
635 |
+
question_display,
|
636 |
+
question_counter,
|
637 |
+
prev_btn,
|
638 |
+
next_btn,
|
639 |
+
state
|
640 |
]
|
641 |
+
).then(
|
642 |
+
fn=hide_confirm_dialog,
|
643 |
+
outputs=[confirm_dialog]
|
644 |
)
|
645 |
|
646 |
# Add footer with subtle styling
|
|
|
651 |
|
652 |
# Launch with the appropriate host setting for deployment
|
653 |
if __name__ == "__main__":
    print("\nStarting Vibesmark Test Suite...")
    print("You can access the app at: http://localhost:7860")

    # Small FastAPI app whose only job is to serve the example question file
    from fastapi import FastAPI
    from fastapi.responses import FileResponse
    from fastapi.middleware.cors import CORSMiddleware

    app = FastAPI()

    # NOTE(review): wildcard origins together with allow_credentials=True is
    # very permissive for a static download; tighten if that is not intended.
    app.add_middleware(
        CORSMiddleware,
        allow_origins=["*"],
        allow_credentials=True,
        allow_methods=["*"],
        allow_headers=["*"],
    )

    @app.get("/testquestions.txt")
    async def get_example_file():
        """Serve the example questions file from the working directory."""
        return FileResponse("testquestions.txt")

    # Start the server without blocking so the sub-app can be attached to
    # the app that is actually serving requests.
    # NOTE(review): this assumes Gradio creates the served FastAPI app during
    # launch(), so a mount made before launch() would not be on it - confirm
    # against the installed Gradio version.
    demo.launch(
        server_name="0.0.0.0",  # Allows external connections
        server_port=7860,
        share=False,
        prevent_thread_lock=True,
    )

    # Mounted last, so Gradio's own routes keep priority and only otherwise
    # unmatched paths (such as /testquestions.txt) reach the sub-app.
    demo.app.mount("/", app)

    # Keep the main thread alive now that launch() no longer blocks.
    demo.block_thread()
|
static/testquestions.txt
ADDED
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
What is the capital of France?
|
2 |
+
Explain quantum entanglement in simple terms.
|
3 |
+
How does photosynthesis work?
|
4 |
+
What causes the seasons on Earth?
|
5 |
+
Write a haiku about artificial intelligence.
|
6 |
+
What are the key differences between classical and quantum computers?
|
7 |
+
Explain the concept of recursion in programming.
|
8 |
+
What is the significance of the number pi?
|
9 |
+
How do vaccines work to protect against diseases?
|
10 |
+
What causes the Northern Lights phenomenon?
|
testquestions.txt
ADDED
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
What is the capital of France?
|
2 |
+
Explain quantum entanglement in simple terms.
|
3 |
+
How does photosynthesis work?
|
4 |
+
What causes the seasons on Earth?
|
5 |
+
Write a haiku about artificial intelligence.
|
6 |
+
What are the key differences between classical and quantum computers?
|
7 |
+
Explain the concept of recursion in programming.
|
8 |
+
What is the significance of the number pi?
|
9 |
+
How do vaccines work to protect against diseases?
|
10 |
+
What causes the Northern Lights phenomenon?
|