kaizuberbuehler committed on
Commit
afb8d0c
·
1 Parent(s): 0a86c6a

Add new benchmarks; Several improvements

Browse files
app.py CHANGED
@@ -1,8 +1,10 @@
1
  import json
2
- from datetime import datetime, date
3
 
4
  import gradio as gr
5
  import plotly.graph_objects as go
 
 
6
 
7
 
8
  def create_big_five_capex_plot() -> go.Figure:
@@ -11,8 +13,8 @@ def create_big_five_capex_plot() -> go.Figure:
11
  data = [json.loads(line) for line in file if line.strip()]
12
 
13
  quarters: list[str] = [entry["Quarter"] for entry in data]
14
- companies = ['Microsoft', 'Google', 'Meta', 'Apple', 'Amazon']
15
- colors = ['#80bb00', '#ee161f', '#0065e3', '#000000', '#ff6200']
16
 
17
  x_positions = list(range(len(quarters)))
18
 
@@ -29,7 +31,7 @@ def create_big_five_capex_plot() -> go.Figure:
29
  fig = go.Figure(data=traces)
30
  fig.update_layout(
31
  barmode="stack",
32
- title="Capital Expenditures of the Big Five Tech Companies in Millions of USD per Quarter",
33
  xaxis_title="Quarter",
34
  yaxis_title="Capital Expenditures (Millions USD)",
35
  xaxis=dict(
@@ -37,7 +39,7 @@ def create_big_five_capex_plot() -> go.Figure:
37
  tickvals=x_positions,
38
  ticktext=quarters
39
  ),
40
- height=600
41
  )
42
 
43
  # Calculate the x position for the vertical dotted line.
@@ -86,12 +88,14 @@ def create_big_five_capex_plot() -> go.Figure:
86
 
87
  def create_simple_plot(data_path: str,
88
  name: str,
 
89
  start_date: datetime, end_date: datetime,
90
- min_value: int = 0, max_value: int = 100) -> go.Figure:
91
- simple_bench_leaderboard = []
 
92
  with open(data_path, 'r') as file:
93
  for line in file:
94
- simple_bench_leaderboard.append(json.loads(line))
95
 
96
  models = []
97
  with open("models.jsonl", 'r') as file:
@@ -99,7 +103,7 @@ def create_simple_plot(data_path: str,
99
  models.append(json.loads(line))
100
 
101
  data = []
102
- for entry in simple_bench_leaderboard:
103
  model_name = entry['model']
104
  score = entry['score']
105
  model_info = next((m for m in models if m['Name'] == model_name), None)
@@ -142,8 +146,8 @@ def create_simple_plot(data_path: str,
142
  ))
143
 
144
  fig.update_layout(
145
- title=f'{name} Over Time',
146
- xaxis_title='Release Date',
147
  yaxis_title=name,
148
  hovermode='x unified',
149
  xaxis=dict(
@@ -156,67 +160,237 @@ def create_simple_plot(data_path: str,
156
  height=800
157
  )
158
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
159
  return fig
160
 
161
 
162
  with gr.Blocks() as demo:
163
  with gr.Tab("System Performance Over Time"):
164
- with gr.Tab("ARC-AGI-Pub") as arc_agi_tab:
165
- arc_agi_plot: gr.Plot = gr.Plot()
166
- with gr.Tab("Simple Bench") as simple_bench_tab:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
167
  simple_bench_plot: gr.Plot = gr.Plot()
168
- with gr.Tab("PlanBench") as planbench_tab:
 
 
 
169
  planbench_plot: gr.Plot = gr.Plot()
170
  planbench_markdown: gr.Markdown = gr.Markdown(
171
  value="""Source: [Valmeekam et al. 2024](https://arxiv.org/abs/2409.13373)"""
172
  )
173
- with gr.Tab("Codeforces") as codeforces_tab:
174
- with gr.Tab("General-Purpose Systems"):
175
- codeforces_plot: gr.Plot = gr.Plot()
176
- with gr.Tab("BigCodeBench", interactive=False):
177
- bigcodebench_plot: gr.Plot = gr.Plot()
178
- with gr.Tab("GAIA", interactive=False):
179
  gaia_plot: gr.Plot = gr.Plot()
180
- with gr.Tab("GPQA", interactive=False):
 
 
 
 
 
 
 
 
 
 
 
181
  gpqa_plot: gr.Plot = gr.Plot()
182
- with gr.Tab("HumanEval", interactive=False):
183
- humaneval_plot: gr.Plot = gr.Plot()
184
- with gr.Tab("Chatbot Arena", interactive=False):
185
- chatbot_arena_plot: gr.Plot = gr.Plot()
186
- with gr.Tab("MATH", interactive=False):
187
- math_plot: gr.Plot = gr.Plot()
188
- with gr.Tab("OpenCompass", interactive=False):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
189
  opencompass_plot: gr.Plot = gr.Plot()
190
- with gr.Tab("SWE-bench", interactive=False):
 
 
 
191
  swe_bench_plot: gr.Plot = gr.Plot()
192
- with gr.Tab("WebArena", interactive=False):
 
 
 
193
  webarena_plot: gr.Plot = gr.Plot()
194
- with gr.Tab("ZeroEval", interactive=False):
195
- zeroeval_plot: gr.Plot = gr.Plot()
 
196
  with gr.Tab("Finance") as finance_tab:
197
- with gr.Tab("Big Five Capex") as big_five_capex_tab:
198
  big_five_capex_plot: gr.Plot = gr.Plot()
199
- with gr.Tab("NVIDIA Revenue", interactive=False) as nvidia_revenue:
200
  nvidia_revenue_plot: gr.Plot = gr.Plot()
201
  big_five_capex_tab.select(fn=create_big_five_capex_plot, outputs=big_five_capex_plot)
202
- finance_tab.select(fn=create_big_five_capex_plot, outputs=big_five_capex_plot)
 
 
 
 
 
 
 
203
  arc_agi_tab.select(fn=create_simple_plot,
204
- inputs=[gr.State("arc_agi_leaderboard.jsonl"), gr.State("ARC-AGI-Pub (Public Eval) Score"),
205
- gr.State(date(2024, 5, 13)), gr.State(date(2024, 12, 20))],
206
- outputs=arc_agi_plot)
 
 
 
 
 
 
 
 
 
 
 
 
 
207
  simple_bench_tab.select(fn=create_simple_plot,
208
- inputs=[gr.State("simple_bench_leaderboard.jsonl"), gr.State("Simple Bench Score"),
209
- gr.State(date(2023, 6, 13)), gr.State(date(2024, 8, 14))],
 
 
 
 
210
  outputs=simple_bench_plot)
211
  codeforces_tab.select(fn=create_simple_plot,
212
- inputs=[gr.State("codeforces_leaderboard.jsonl"), gr.State("Codeforces (Elo Rating)"),
213
- gr.State(date(2024, 5, 13)), gr.State(date(2024, 12, 20)),
214
- gr.State(800), gr.State(3000)],
 
 
 
215
  outputs=codeforces_plot)
216
  planbench_tab.select(fn=create_simple_plot,
217
- inputs=[gr.State("planbench_leaderboard.jsonl"), gr.State("PlanBench (Mystery Blocksworld, 0-shot) Score"),
218
- gr.State(date(2023, 3, 14)), gr.State(date(2024, 9, 23))],
 
 
219
  outputs=planbench_plot)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
220
 
221
 
222
  if __name__ == "__main__":
 
1
  import json
2
+ from datetime import datetime, date, timedelta
3
 
4
  import gradio as gr
5
  import plotly.graph_objects as go
6
+ from scipy.optimize import curve_fit
7
+ import numpy as np
8
 
9
 
10
  def create_big_five_capex_plot() -> go.Figure:
 
13
  data = [json.loads(line) for line in file if line.strip()]
14
 
15
  quarters: list[str] = [entry["Quarter"] for entry in data]
16
+ companies = ['Microsoft', 'Google', 'Meta', 'Amazon']
17
+ colors = ['#80bb00', '#ee161f', '#0065e3', '#ff6200']
18
 
19
  x_positions = list(range(len(quarters)))
20
 
 
31
  fig = go.Figure(data=traces)
32
  fig.update_layout(
33
  barmode="stack",
34
+ title="Capital Expenditures of Amazon, Meta, Google and Microsoft in Millions of USD per Quarter",
35
  xaxis_title="Quarter",
36
  yaxis_title="Capital Expenditures (Millions USD)",
37
  xaxis=dict(
 
39
  tickvals=x_positions,
40
  ticktext=quarters
41
  ),
42
+ height=800
43
  )
44
 
45
  # Calculate the x position for the vertical dotted line.
 
88
 
89
  def create_simple_plot(data_path: str,
90
  name: str,
91
+ subtitle: str,
92
  start_date: datetime, end_date: datetime,
93
+ min_value: int = 0, max_value: int = 100,
94
+ labeled_horizontal_lines: dict[str, float] = None) -> go.Figure:
95
+ leaderboard = []
96
  with open(data_path, 'r') as file:
97
  for line in file:
98
+ leaderboard.append(json.loads(line))
99
 
100
  models = []
101
  with open("models.jsonl", 'r') as file:
 
103
  models.append(json.loads(line))
104
 
105
  data = []
106
+ for entry in leaderboard:
107
  model_name = entry['model']
108
  score = entry['score']
109
  model_info = next((m for m in models if m['Name'] == model_name), None)
 
146
  ))
147
 
148
  fig.update_layout(
149
+ title=f'{name} Over Time<br><sup>{subtitle}</sup>',
150
+ xaxis_title='Publication or Release Date',
151
  yaxis_title=name,
152
  hovermode='x unified',
153
  xaxis=dict(
 
160
  height=800
161
  )
162
 
163
+ if labeled_horizontal_lines:
164
+ for label, y_value in labeled_horizontal_lines.items():
165
+ fig.add_hline(
166
+ y=y_value,
167
+ line_dash="dot",
168
+ line_color="black",
169
+ annotation_text=label,
170
+ annotation_position="right",
171
+ annotation=dict(
172
+ font_size=12,
173
+ font_color="black",
174
+ xanchor="left",
175
+ yanchor="middle",
176
+ xshift=10
177
+ )
178
+ )
179
+
180
  return fig
181
 
182
 
183
  with gr.Blocks() as demo:
184
  with gr.Tab("System Performance Over Time"):
185
+ with gr.Tab("Legend"):
186
+ legend_markdown: gr.Markdown = gr.Markdown(
187
+ value="""
188
+ ## Benchmarks and Top Scores
189
+
190
+ | Benchmark | Top Score |
191
+ |-----------|-----------|
192
+ | BigCodeBench | 🟠 36% |
193
+ | Simple Bench | 🟠 42% |
194
+ | PlanBench | 🟠 53% |
195
+ | GAIA | 🟡 65% |
196
+ | ARC-AGI-Pub (Semi-Private Eval) | 🟡 76% |
197
+ | GPQA | 🟡 76% |
198
+ | ZebraLogic | 🟡 81% |
199
+ | ARC-AGI-Pub (Public Eval) | 🟡 83% |
200
+ | ZeroEval | 🟡 86% |
201
+ | MATH-L5 | 🟡 89% |
202
+ | MMLU-Redux | 🟢 93% |
203
+ | CRUX | 🟢 96% |
204
+
205
+ ## Colors
206
+
207
+ | Color | Score Range |
208
+ |-------|------------|
209
+ | 🔴 Red | Below 30% |
210
+ | 🟠 Orange | 30% to 60% |
211
+ | 🟡 Yellow | 60% to 90% |
212
+ | 🟢 Green | Above 90% |"""
213
+ )
214
+ with gr.Tab("🟠 BigCodeBench") as bigcodebench_tab:
215
+ bigcodebench_plot: gr.Plot = gr.Plot()
216
+ bigcodebench_markdown: gr.Markdown = gr.Markdown(
217
+ value="""Source: [BigCodeBench Leaderboard](https://bigcode-bench.github.io/)"""
218
+ )
219
+ with gr.Tab("🟠 Simple Bench") as simple_bench_tab:
220
  simple_bench_plot: gr.Plot = gr.Plot()
221
+ simple_bench_markdown: gr.Markdown = gr.Markdown(
222
+ value="""Source: [SimpleBench Leaderboard](https://simple-bench.com/)"""
223
+ )
224
+ with gr.Tab("🟠 PlanBench") as planbench_tab:
225
  planbench_plot: gr.Plot = gr.Plot()
226
  planbench_markdown: gr.Markdown = gr.Markdown(
227
  value="""Source: [Valmeekam et al. 2024](https://arxiv.org/abs/2409.13373)"""
228
  )
229
+ with gr.Tab("🟡 GAIA") as gaia_tab:
 
 
 
 
 
230
  gaia_plot: gr.Plot = gr.Plot()
231
+ gaia_markdown: gr.Markdown = gr.Markdown(
232
+ value="""Source: [GAIA Leaderboard](https://huggingface.co/spaces/gaia-benchmark/leaderboard)"""
233
+ )
234
+ with gr.Tab("🟡 ARC-AGI-Pub") as arc_agi_tab:
235
+ with gr.Tab("🟡 Semi-Private Eval") as arc_agi_semi_private_eval_tab:
236
+ arc_agi_semi_private_eval_plot: gr.Plot = gr.Plot()
237
+ with gr.Tab("🟡 Public Eval") as arc_agi_public_eval_tab:
238
+ arc_agi_public_eval_plot: gr.Plot = gr.Plot()
239
+ arc_agi_markdown: gr.Markdown = gr.Markdown(
240
+ value="""Source: [ARC Prize 2024](https://arcprize.org/2024-results)"""
241
+ )
242
+ with gr.Tab("🟡 GPQA") as gpqa_tab:
243
  gpqa_plot: gr.Plot = gr.Plot()
244
+ gpqa_markdown: gr.Markdown = gr.Markdown(
245
+ value="""Source: [Epoch AI Benchmarking Dashboard](https://epoch.ai/data/ai-benchmarking-dashboard)"""
246
+ )
247
+ with gr.Tab("🟡 ZebraLogic") as zeroeval_zebralogic_tab:
248
+ zeroeval_zebralogic_plot: gr.Plot = gr.Plot()
249
+ zeroeval_zebralogic_markdown: gr.Markdown = gr.Markdown(
250
+ value="""Source: [ZeroEval Leaderboard](https://huggingface.co/spaces/allenai/ZeroEval)"""
251
+ )
252
+ with gr.Tab("🟡 ZeroEval") as zeroeval_average_tab:
253
+ zeroeval_average_plot: gr.Plot = gr.Plot()
254
+ zeroeval_average_markdown: gr.Markdown = gr.Markdown(
255
+ value="""Source: [ZeroEval Leaderboard](https://huggingface.co/spaces/allenai/ZeroEval)"""
256
+ )
257
+ with gr.Tab("🟡 MATH-L5") as zeroeval_math_l5_tab:
258
+ zeroeval_math_l5_plot: gr.Plot = gr.Plot()
259
+ zeroeval_math_l5_markdown: gr.Markdown = gr.Markdown(
260
+ value="""Source: [ZeroEval Leaderboard](https://huggingface.co/spaces/allenai/ZeroEval)"""
261
+ )
262
+ with gr.Tab("🟢 MMLU-Redux") as zeroeval_mmlu_redux_tab:
263
+ zeroeval_mmlu_redux_plot: gr.Plot = gr.Plot()
264
+ zeroeval_mmlu_redux_markdown: gr.Markdown = gr.Markdown(
265
+ value="""Source: [ZeroEval Leaderboard](https://huggingface.co/spaces/allenai/ZeroEval)"""
266
+ )
267
+ with gr.Tab("🟢 CRUX") as zeroeval_crux_tab:
268
+ zeroeval_crux_plot: gr.Plot = gr.Plot()
269
+ zeroeval_crux_markdown: gr.Markdown = gr.Markdown(
270
+ value="""Source: [ZeroEval Leaderboard](https://huggingface.co/spaces/allenai/ZeroEval)"""
271
+ )
272
+ with gr.Tab("Codeforces") as codeforces_tab:
273
+ codeforces_plot: gr.Plot = gr.Plot()
274
+ with gr.Tab("OpenCompass", visible=False):
275
  opencompass_plot: gr.Plot = gr.Plot()
276
+ opencompass_markdown: gr.Markdown = gr.Markdown(
277
+ value="""Source: [OpenCompass LLM Leaderboard](https://huggingface.co/spaces/opencompass/opencompass-llm-leaderboard)"""
278
+ )
279
+ with gr.Tab("SWE-bench", visible=False):
280
  swe_bench_plot: gr.Plot = gr.Plot()
281
+ swe_bench_markdown: gr.Markdown = gr.Markdown(
282
+ value="""Source: [SWE-bench Leaderboard](https://www.swebench.com/)"""
283
+ )
284
+ with gr.Tab("WebArena", visible=False):
285
  webarena_plot: gr.Plot = gr.Plot()
286
+ webarena_markdown: gr.Markdown = gr.Markdown(
287
+ value="""Source: [X-WebArena-Leaderboard](https://docs.google.com/spreadsheets/d/1M801lEpBbKSNwP-vDBkC_pF7LdyGU1f_ufZb_NWNBZQ)"""
288
+ )
289
  with gr.Tab("Finance") as finance_tab:
290
+ with gr.Tab("Big Tech Capex") as big_five_capex_tab:
291
  big_five_capex_plot: gr.Plot = gr.Plot()
292
+ with gr.Tab("NVIDIA Revenue", visible=False) as nvidia_revenue:
293
  nvidia_revenue_plot: gr.Plot = gr.Plot()
294
  big_five_capex_tab.select(fn=create_big_five_capex_plot, outputs=big_five_capex_plot)
295
+ arc_agi_public_eval_tab.select(fn=create_simple_plot,
296
+ inputs=[gr.State("arc_agi_leaderboard.jsonl"),
297
+ gr.State("ARC-AGI-Pub Score (Public Eval, $20 Compute Budget per Task, General-Purpose Systems)"),
298
+ gr.State("\"ARC can be seen as a general artificial intelligence benchmark, as a program synthesis benchmark, or as a psychometric intelligence test.\" (Chollet, 2019)"),
299
+ gr.State(date(2024, 5, 1)), gr.State(date(2025, 1, 1)),
300
+ gr.State(0), gr.State(100),
301
+ gr.State({"Humans\n(LeGris et al. 2024)": 64.2})],
302
+ outputs=arc_agi_public_eval_plot)
303
  arc_agi_tab.select(fn=create_simple_plot,
304
+ inputs=[gr.State("arc_agi_semi_private_eval_leaderboard.jsonl"),
305
+ gr.State("ARC-AGI-Pub Score (Semi-Private Eval, $20 Compute Budget per Task, General-Purpose Systems)"),
306
+ gr.State("\"ARC can be seen as a general artificial intelligence benchmark, as a program synthesis benchmark, or as a psychometric intelligence test.\" (Chollet, 2019)"),
307
+ gr.State(date(2024, 5, 1)), gr.State(date(2025, 1, 1)),
308
+ gr.State(0), gr.State(100),
309
+ gr.State({"MTurkers": 77})],
310
+ outputs=arc_agi_semi_private_eval_plot)
311
+ arc_agi_semi_private_eval_tab.select(fn=create_simple_plot,
312
+ inputs=[gr.State("arc_agi_semi_private_eval_leaderboard.jsonl"),
313
+ gr.State("ARC-AGI-Pub Score (Semi-Private Eval, $20 Compute Budget per Task, General-Purpose Systems)"),
314
+ gr.State("\"ARC can be seen as a general artificial intelligence benchmark, as a program synthesis benchmark, or as a psychometric intelligence test.\" (Chollet, 2019)"),
315
+ gr.State(date(2024, 5, 1)), gr.State(date(2025, 1, 1)),
316
+ gr.State(0), gr.State(100),
317
+ gr.State({"MTurkers": 77})],
318
+ outputs=arc_agi_semi_private_eval_plot)
319
+ finance_tab.select(fn=create_big_five_capex_plot, outputs=big_five_capex_plot)
320
  simple_bench_tab.select(fn=create_simple_plot,
321
+ inputs=[gr.State("simple_bench_leaderboard.jsonl"),
322
+ gr.State("Simple Bench Score"),
323
+ gr.State("\"multiple-choice text benchmark [...] [including] over 200 questions covering spatio-temporal reasoning, social intelligence, and what we call linguistic adversarial robustness\" (Philip & Hemang, 2024)"),
324
+ gr.State(date(2024, 4, 1)), gr.State(date(2025, 1, 1)),
325
+ gr.State(0), gr.State(100),
326
+ gr.State({"Humans": 83.7})],
327
  outputs=simple_bench_plot)
328
  codeforces_tab.select(fn=create_simple_plot,
329
+ inputs=[gr.State("codeforces_leaderboard.jsonl"),
330
+ gr.State("Codeforces Rating"),
331
+ gr.State("\"[Codeforces] is a platform where [programming] contests are held regularly, the participant's skills are reflected by their rating [...] The rating is a modification of Elo rating\" (Mirzayanov, 2011)"),
332
+ gr.State(date(2024, 5, 1)), gr.State(date(2025, 1, 1)),
333
+ gr.State(0), gr.State(4000),
334
+ gr.State({"Pupil": 1200, "Specialist": 1400, "Expert": 1600, "Candidate Master": 1900, "Master": 2100, "International Master": 2300, "Grandmaster": 2400, "International Grandmaster": 2600, "Legendary Grandmaster": 3000})],
335
  outputs=codeforces_plot)
336
  planbench_tab.select(fn=create_simple_plot,
337
+ inputs=[gr.State("planbench_leaderboard.jsonl"),
338
+ gr.State("PlanBench Score (Mystery Blocksworld, 0-shot)"),
339
+ gr.State("\"benchmark suite based on the kinds of domains used in the automated planning community [...] to test the capabilities of LLMs in planning or reasoning about actions and change.\" (Valmeekam et al. 2022)"),
340
+ gr.State(date(2023, 3, 1)), gr.State(date(2024, 9, 20))],
341
  outputs=planbench_plot)
342
+ bigcodebench_tab.select(fn=create_simple_plot,
343
+ inputs=[gr.State("bigcodebench_hard_average_leaderboard.jsonl"),
344
+ gr.State("BigCodeBench Score (Hard, Average of Complete and Instruct)"),
345
+ gr.State("\"benchmark that challenges LLMs to invoke multiple function calls as tools from 139 libraries and 7 domains for 1,140 fine-grained tasks\" (Zhuo et al. 2024)"),
346
+ gr.State(date(2023, 6, 1)), gr.State(date(2025, 1, 1))],
347
+ outputs=bigcodebench_plot)
348
+ gaia_tab.select(fn=create_simple_plot,
349
+ inputs=[gr.State("gaia_leaderboard.jsonl"),
350
+ gr.State("General AI Assistants (GAIA) Benchmark Score (Test Set, Average)"),
351
+ gr.State("\"real-world questions that require a set of fundamental abilities such as reasoning, multi-modality handling, web browsing, and generally tool-use proficiency\" (Mialon et al. 2023)"),
352
+ gr.State(date(2023, 3, 1)), gr.State(date(2025, 1, 1)),
353
+ gr.State(0), gr.State(100),
354
+ gr.State({"Humans": 92})],
355
+ outputs=gaia_plot)
356
+ gpqa_tab.select(fn=create_simple_plot,
357
+ inputs=[gr.State("gpqa_leaderboard.jsonl"),
358
+ gr.State("Graduate-Level Google-Proof Q&A (GPQA) Benchmark Score"),
359
+ gr.State("\"challenging dataset of 448 multiple-choice questions written by domain experts in biology, physics, and chemistry [that] are high-quality and extremely difficult\" (Rein et al. 2023)"),
360
+ gr.State(date(2023, 6, 1)), gr.State(date(2025, 1, 1)),
361
+ gr.State(25), gr.State(100),
362
+ gr.State({"Highly skilled non-expert validators": 34, "PhD-level domain experts": 65})],
363
+ outputs=gpqa_plot)
364
+ zeroeval_average_tab.select(fn=create_simple_plot,
365
+ inputs=[gr.State("zeroeval_average_leaderboard.jsonl"),
366
+ gr.State("ZeroEval Average (MMLU-Redux, ZebraLogic, CRUX and MATH-5) Score"),
367
+ gr.State("\"a simple unified framework for evaluating language models on various tasks\" (Ai2, 2024)"),
368
+ gr.State(date(2023, 3, 1)), gr.State(date(2025, 1, 1))],
369
+ outputs=zeroeval_average_plot)
370
+ zeroeval_mmlu_redux_tab.select(fn=create_simple_plot,
371
+ inputs=[gr.State("zeroeval_mmlu_redux_leaderboard.jsonl"),
372
+ gr.State("ZeroEval MMLU-Redux (Massive Multitask Language Understanding) Score"),
373
+ gr.State("\"knowledge reasoning\" (Ai2, 2024); \"subset of 3,000 manually re-annotated questions across 30 MMLU subjects\" (Gema et al. 2024)"),
374
+ gr.State(date(2023, 3, 1)), gr.State(date(2025, 1, 1))],
375
+ outputs=zeroeval_mmlu_redux_plot)
376
+ zeroeval_zebralogic_tab.select(fn=create_simple_plot,
377
+ inputs=[gr.State("zeroeval_zebralogic_leaderboard.jsonl"),
378
+ gr.State("ZeroEval ZebraLogic Score"),
379
+ gr.State("\"logical reasoning\" (Ai2, 2024); \"Each example is a Logic Grid Puzzle [...] often used to test humans' logical reasoning abilities\" (Lin, 2024)"),
380
+ gr.State(date(2023, 3, 1)), gr.State(date(2025, 1, 1))],
381
+ outputs=zeroeval_zebralogic_plot)
382
+ zeroeval_crux_tab.select(fn=create_simple_plot,
383
+ inputs=[gr.State("zeroeval_crux_leaderboard.jsonl"),
384
+ gr.State("ZeroEval CRUX (Code Reasoning, Understanding, and eXecution Evaluation) Score"),
385
+ gr.State("\"code reasoning\" (Ai2, 2024); \"benchmark consisting of 800 Python functions (3-13 lines). Each function comes with [...] two natural tasks: input prediction and output prediction.\" (Gu et al. 2024)"),
386
+ gr.State(date(2023, 3, 1)), gr.State(date(2025, 1, 1))],
387
+ outputs=zeroeval_crux_plot)
388
+ zeroeval_math_l5_tab.select(fn=create_simple_plot,
389
+ inputs=[gr.State("zeroeval_math_l5_leaderboard.jsonl"),
390
+ gr.State("ZeroEval MATH-L5 (Difficulty Level 5 of MATH) Score"),
391
+ gr.State("\"math reasoning\" (Ai2, 2024); \"dataset of 12,500 challenging competition mathematics problems. [...] a subject’s hardest problems are assigned a difficulty level of ‘5.’\" (Hendrycks et al. 2021)"),
392
+ gr.State(date(2023, 3, 1)), gr.State(date(2025, 1, 1))],
393
+ outputs=zeroeval_math_l5_plot)
394
 
395
 
396
  if __name__ == "__main__":
arc_agi_semi_private_eval_leaderboard.jsonl ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {"model": "o3", "score": 75.7}
2
+ {"model": "o1-2024-12-17", "score": 32}
3
+ {"model": "o1-preview-2024-09-12", "score": 18}
4
+ {"model": "claude-3-5-sonnet-20240620", "score": 14}
5
+ {"model": "gpt-4o-2024-05-13", "score": 5}
6
+ {"model": "gemini-1.5-pro-001", "score": 4.5}
big_five_capex.jsonl CHANGED
@@ -1,40 +1,40 @@
1
- {"Quarter": "2015 Q1", "Microsoft": 1391, "Google": 2927, "Meta": 502, "Apple": 2369, "Amazon": 871}
2
- {"Quarter": "2015 Q2", "Microsoft": 1781, "Google": 2515, "Meta": 549, "Apple": 2043, "Amazon": 1213}
3
- {"Quarter": "2015 Q3", "Microsoft": 1356, "Google": 2406, "Meta": 780, "Apple": 3618, "Amazon": 1195}
4
- {"Quarter": "2015 Q4", "Microsoft": 2024, "Google": 2102, "Meta": 692, "Apple": 3612, "Amazon": 1309}
5
- {"Quarter": "2016 Q1", "Microsoft": 2308, "Google": 2444, "Meta": 1132, "Apple": 2336, "Amazon": 1179}
6
- {"Quarter": "2016 Q2", "Microsoft": 2655, "Google": 2136, "Meta": 995, "Apple": 2809, "Amazon": 1711}
7
- {"Quarter": "2016 Q3", "Microsoft": 2163, "Google": 2554, "Meta": 1095, "Apple": 3977, "Amazon": 1841}
8
- {"Quarter": "2016 Q4", "Microsoft": 1988, "Google": 3078, "Meta": 1269, "Apple": 3334, "Amazon": 3073}
9
- {"Quarter": "2017 Q1", "Microsoft": 1695, "Google": 2508, "Meta": 1271, "Apple": 2975, "Amazon": 2148}
10
- {"Quarter": "2017 Q2", "Microsoft": 2283, "Google": 2831, "Meta": 1444, "Apple": 2277, "Amazon": 3113}
11
- {"Quarter": "2017 Q3", "Microsoft": 2132, "Google": 3538, "Meta": 1755, "Apple": 3865, "Amazon": 3074}
12
- {"Quarter": "2017 Q4", "Microsoft": 2586, "Google": 4307, "Meta": 2263, "Apple": 2810, "Amazon": 3619}
13
- {"Quarter": "2018 Q1", "Microsoft": 2934, "Google": 7299, "Meta": 2812, "Apple": 4195, "Amazon": 3098}
14
- {"Quarter": "2018 Q2", "Microsoft": 3980, "Google": 5477, "Meta": 3460, "Apple": 3267, "Amazon": 3243}
15
- {"Quarter": "2018 Q3", "Microsoft": 3602, "Google": 5282, "Meta": 3342, "Apple": 3041, "Amazon": 3352}
16
- {"Quarter": "2018 Q4", "Microsoft": 3707, "Google": 7081, "Meta": 4301, "Apple": 3355, "Amazon": 3734}
17
- {"Quarter": "2019 Q1", "Microsoft": 2565, "Google": 4638, "Meta": 3837, "Apple": 2363, "Amazon": 3290}
18
- {"Quarter": "2019 Q2", "Microsoft": 4051, "Google": 6126, "Meta": 3633, "Apple": 2000, "Amazon": 3562}
19
- {"Quarter": "2019 Q3", "Microsoft": 3385, "Google": 6732, "Meta": 3532, "Apple": 2777, "Amazon": 4697}
20
- {"Quarter": "2019 Q4", "Microsoft": 3545, "Google": 6052, "Meta": 4100, "Apple": 2107, "Amazon": 5312}
21
- {"Quarter": "2020 Q1", "Microsoft": 3767, "Google": 6005, "Meta": 3558, "Apple": 1853, "Amazon": 6795}
22
- {"Quarter": "2020 Q2", "Microsoft": 4744, "Google": 5391, "Meta": 3255, "Apple": 1565, "Amazon": 7459}
23
- {"Quarter": "2020 Q3", "Microsoft": 4907, "Google": 5406, "Meta": 3689, "Apple": 1784, "Amazon": 11063}
24
- {"Quarter": "2020 Q4", "Microsoft": 4174, "Google": 5479, "Meta": 4613, "Apple": 3500, "Amazon": 14823}
25
- {"Quarter": "2021 Q1", "Microsoft": 5089, "Google": 5942, "Meta": 4303, "Apple": 2269, "Amazon": 12082}
26
- {"Quarter": "2021 Q2", "Microsoft": 6452, "Google": 5496, "Meta": 4641, "Apple": 2093, "Amazon": 14288}
27
- {"Quarter": "2021 Q3", "Microsoft": 5810, "Google": 6819, "Meta": 4346, "Apple": 3223, "Amazon": 15748}
28
- {"Quarter": "2021 Q4", "Microsoft": 5865, "Google": 6383, "Meta": 5400, "Apple": 2803, "Amazon": 18935}
29
- {"Quarter": "2022 Q1", "Microsoft": 5340, "Google": 9786, "Meta": 5441, "Apple": 2514, "Amazon": 14951}
30
- {"Quarter": "2022 Q2", "Microsoft": 6871, "Google": 6828, "Meta": 7572, "Apple": 2102, "Amazon": 15724}
31
- {"Quarter": "2022 Q3", "Microsoft": 6283, "Google": 7276, "Meta": 9375, "Apple": 3289, "Amazon": 16378}
32
- {"Quarter": "2022 Q4", "Microsoft": 6274, "Google": 7595, "Meta": 9043, "Apple": 3787, "Amazon": 16592}
33
- {"Quarter": "2023 Q1", "Microsoft": 6607, "Google": 6289, "Meta": 6823, "Apple": 2916, "Amazon": 14207}
34
- {"Quarter": "2023 Q2", "Microsoft": 8943, "Google": 6888, "Meta": 6134, "Apple": 2093, "Amazon": 11455}
35
- {"Quarter": "2023 Q3", "Microsoft": 9917, "Google": 8055, "Meta": 6543, "Apple": 2163, "Amazon": 12479}
36
- {"Quarter": "2023 Q4", "Microsoft": 9735, "Google": 11019, "Meta": 7665, "Apple": 2392, "Amazon": 14588}
37
- {"Quarter": "2024 Q1", "Microsoft": 10952, "Google": 12012, "Meta": 6400, "Apple": 1996, "Amazon": 14925}
38
- {"Quarter": "2024 Q2", "Microsoft": 13873, "Google": 13186, "Meta": 8173, "Apple": 2151, "Amazon": 17620}
39
- {"Quarter": "2024 Q3", "Microsoft": 14923, "Google": 13016, "Meta": 8258, "Apple": 0, "Amazon": 22620}
40
- {"Quarter": "2024 Q4", "Microsoft": 15804, "Google": 14276, "Meta": 14425, "Apple": 0, "Amazon": 27834}
 
1
+ {"Quarter": "2015 Q1", "Microsoft": 1391, "Google": 2927, "Meta": 502, "Amazon": 871}
2
+ {"Quarter": "2015 Q2", "Microsoft": 1781, "Google": 2515, "Meta": 549, "Amazon": 1213}
3
+ {"Quarter": "2015 Q3", "Microsoft": 1356, "Google": 2406, "Meta": 780, "Amazon": 1195}
4
+ {"Quarter": "2015 Q4", "Microsoft": 2024, "Google": 2102, "Meta": 692, "Amazon": 1309}
5
+ {"Quarter": "2016 Q1", "Microsoft": 2308, "Google": 2444, "Meta": 1132, "Amazon": 1179}
6
+ {"Quarter": "2016 Q2", "Microsoft": 2655, "Google": 2136, "Meta": 995, "Amazon": 1711}
7
+ {"Quarter": "2016 Q3", "Microsoft": 2163, "Google": 2554, "Meta": 1095, "Amazon": 1841}
8
+ {"Quarter": "2016 Q4", "Microsoft": 1988, "Google": 3078, "Meta": 1269, "Amazon": 3073}
9
+ {"Quarter": "2017 Q1", "Microsoft": 1695, "Google": 2508, "Meta": 1271, "Amazon": 2148}
10
+ {"Quarter": "2017 Q2", "Microsoft": 2283, "Google": 2831, "Meta": 1444, "Amazon": 3113}
11
+ {"Quarter": "2017 Q3", "Microsoft": 2132, "Google": 3538, "Meta": 1755, "Amazon": 3074}
12
+ {"Quarter": "2017 Q4", "Microsoft": 2586, "Google": 4307, "Meta": 2263, "Amazon": 3619}
13
+ {"Quarter": "2018 Q1", "Microsoft": 2934, "Google": 7299, "Meta": 2812, "Amazon": 3098}
14
+ {"Quarter": "2018 Q2", "Microsoft": 3980, "Google": 5477, "Meta": 3460, "Amazon": 3243}
15
+ {"Quarter": "2018 Q3", "Microsoft": 3602, "Google": 5282, "Meta": 3342, "Amazon": 3352}
16
+ {"Quarter": "2018 Q4", "Microsoft": 3707, "Google": 7081, "Meta": 4301, "Amazon": 3734}
17
+ {"Quarter": "2019 Q1", "Microsoft": 2565, "Google": 4638, "Meta": 3837, "Amazon": 3290}
18
+ {"Quarter": "2019 Q2", "Microsoft": 4051, "Google": 6126, "Meta": 3633, "Amazon": 3562}
19
+ {"Quarter": "2019 Q3", "Microsoft": 3385, "Google": 6732, "Meta": 3532, "Amazon": 4697}
20
+ {"Quarter": "2019 Q4", "Microsoft": 3545, "Google": 6052, "Meta": 4100, "Amazon": 5312}
21
+ {"Quarter": "2020 Q1", "Microsoft": 3767, "Google": 6005, "Meta": 3558, "Amazon": 6795}
22
+ {"Quarter": "2020 Q2", "Microsoft": 4744, "Google": 5391, "Meta": 3255, "Amazon": 7459}
23
+ {"Quarter": "2020 Q3", "Microsoft": 4907, "Google": 5406, "Meta": 3689, "Amazon": 11063}
24
+ {"Quarter": "2020 Q4", "Microsoft": 4174, "Google": 5479, "Meta": 4613, "Amazon": 14823}
25
+ {"Quarter": "2021 Q1", "Microsoft": 5089, "Google": 5942, "Meta": 4303, "Amazon": 12082}
26
+ {"Quarter": "2021 Q2", "Microsoft": 6452, "Google": 5496, "Meta": 4641, "Amazon": 14288}
27
+ {"Quarter": "2021 Q3", "Microsoft": 5810, "Google": 6819, "Meta": 4346, "Amazon": 15748}
28
+ {"Quarter": "2021 Q4", "Microsoft": 5865, "Google": 6383, "Meta": 5400, "Amazon": 18935}
29
+ {"Quarter": "2022 Q1", "Microsoft": 5340, "Google": 9786, "Meta": 5441, "Amazon": 14951}
30
+ {"Quarter": "2022 Q2", "Microsoft": 6871, "Google": 6828, "Meta": 7572, "Amazon": 15724}
31
+ {"Quarter": "2022 Q3", "Microsoft": 6283, "Google": 7276, "Meta": 9375, "Amazon": 16378}
32
+ {"Quarter": "2022 Q4", "Microsoft": 6274, "Google": 7595, "Meta": 9043, "Amazon": 16592}
33
+ {"Quarter": "2023 Q1", "Microsoft": 6607, "Google": 6289, "Meta": 6823, "Amazon": 14207}
34
+ {"Quarter": "2023 Q2", "Microsoft": 8943, "Google": 6888, "Meta": 6134, "Amazon": 11455}
35
+ {"Quarter": "2023 Q3", "Microsoft": 9917, "Google": 8055, "Meta": 6543, "Amazon": 12479}
36
+ {"Quarter": "2023 Q4", "Microsoft": 9735, "Google": 11019, "Meta": 7665, "Amazon": 14588}
37
+ {"Quarter": "2024 Q1", "Microsoft": 10952, "Google": 12012, "Meta": 6400, "Amazon": 14925}
38
+ {"Quarter": "2024 Q2", "Microsoft": 13873, "Google": 13186, "Meta": 8173, "Amazon": 17620}
39
+ {"Quarter": "2024 Q3", "Microsoft": 14923, "Google": 13016, "Meta": 8258, "Amazon": 22620}
40
+ {"Quarter": "2024 Q4", "Microsoft": 15804, "Google": 14276, "Meta": 14425, "Amazon": 27834}
bigcodebench_hard_average_leaderboard.jsonl ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"model": "o1-2024-12-17", "score": 35.5}
2
+ {"model": "gemini-exp-1206", "score": 34.1}
3
+ {"model": "gpt-4-turbo-2024-04-09", "score": 32.1}
4
+ {"model": "athene-v2-chat", "score": 32.1}
5
+ {"model": "athene-v2-agent", "score": 31.4}
6
+ {"model": "gpt-4o-2024-11-20", "score": 31.1}
7
+ {"model": "gpt-4o-2024-08-06", "score": 30.8}
8
+ {"model": "qwen2.5-coder-32b-instruct", "score": 30.8}
9
+ {"model": "claude-3.5-sonnet-20241022", "score": 30.4}
10
+ {"model": "claude-3.5-haiku-20241022", "score": 30.1}
11
+ {"model": "claude-3.5-sonnet-20240620", "score": 29.4}
12
+ {"model": "deepseek-coder-v2-instruct (2024-07-24)", "score": 29.4}
13
+ {"model": "gemini-1.5-pro-exp-0827", "score": 29.4}
14
+ {"model": "gemini-exp-1114", "score": 29.4}
15
+ {"model": "o1-preview-2024-09-12 (temperature=1)", "score": 28.8}
16
+ {"model": "deepseek-v2-chat (2024-06-28)", "score": 28.7}
17
+ {"model": "llama-3.3-70b-instruct", "score": 28.4}
18
+ {"model": "gemini-2.0-flash-exp", "score": 28.1}
19
+ {"model": "gemini-1.5-pro-exp-0801", "score": 27.4}
20
+ {"model": "o1-mini-2024-09-12 (temperature=1)", "score": 27.4}
21
+ {"model": "gemini-exp-1121", "score": 27.4}
22
+ {"model": "gemini-2.0-flash-thinking-exp-1219", "score": 27.4}
23
+ {"model": "gpt-4o-2024-05-13", "score": 27.1}
24
+ {"model": "deepseek-coder-v2-instruct", "score": 27}
25
+ {"model": "gemini-1.5-pro-002", "score": 26.6}
26
+ {"model": "grok-beta", "score": 26.6}
27
+ {"model": "llama-3.1-405b-instruct", "score": 26.4}
28
+ {"model": "deepseek-v2.5-1210", "score": 26.4}
29
+ {"model": "deepseek-v2.5", "score": 26.1}
30
+ {"model": "claude-3-opus-20240229", "score": 26}
31
+ {"model": "mistral-large-instruct-2407", "score": 26}
32
+ {"model": "gemini-1.5-pro-api-0514", "score": 25.4}
33
+ {"model": "llama-3.1-70b-instruct", "score": 25.4}
34
+ {"model": "qwen2.5-72b-instruct", "score": 25.4}
35
+ {"model": "gpt-4o-mini-2024-07-18", "score": 25.3}
36
+ {"model": "llama-3-70b-instruct", "score": 24.6}
37
+ {"model": "qwen2.5-32b-instruct", "score": 24.6}
38
+ {"model": "llama-3.1-nemotron-70b-instruct", "score": 24.6}
39
+ {"model": "dracarys-llama-3.1-70b-instruct", "score": 24.3}
40
+ {"model": "gemini-1.5-flash-api-0514", "score": 23.6}
41
+ {"model": "llama-3-70b-synthia-v3.5", "score": 23.6}
42
+ {"model": "claude-3-sonnet-20240229", "score": 23.4}
43
+ {"model": "dracarys-72b-instruct", "score": 22.6}
44
+ {"model": "hermes-2-theta-llama-3-70b", "score": 22.3}
45
+ {"model": "phi-3.1-mini-128k-instruct", "score": 22}
46
+ {"model": "hermes-2-pro-llama-3-70b", "score": 21.6}
47
+ {"model": "gemini-1.5-flash-exp-0827", "score": 21.6}
48
+ {"model": "qwen2.5-14b-instruct", "score": 20.9}
49
+ {"model": "qwen2-72b-chat", "score": 20.6}
50
+ {"model": "codestral-22b-v0.1", "score": 20.6}
51
+ {"model": "qwen2.5-coder-7b-instruct", "score": 20.3}
52
+ {"model": "gemma-2-27b-instruct", "score": 20}
53
+ {"model": "gpt-3.5-turbo-0125", "score": 19.9}
54
+ {"model": "mixtral-8x22b-instruct", "score": 19.9}
55
+ {"model": "athene-70b", "score": 19.9}
56
+ {"model": "deepseek-coder-33b-instruct", "score": 19.3}
57
+ {"model": "whiterabbitneo-33b-v1.5", "score": 19.3}
58
+ {"model": "reflectioncoder-ds-33b", "score": 18.9}
59
+ {"model": "deepseek-v2-chat", "score": 18.6}
60
+ {"model": "opencoder-8b-instruct", "score": 18.5}
61
+ {"model": "claude-3-haiku-20240307", "score": 18.3}
62
+ {"model": "gpt-4-0613", "score": 17.6}
codeforces_leaderboard.jsonl CHANGED
@@ -1,4 +1,4 @@
1
- {"model": "o3", "score": 2727}
2
  {"model": "o3-mini", "score": 2073}
3
  {"model": "o1", "score": 1673}
4
  {"model": "o1-mini", "score": 1650}
 
1
+ {"model": "o3", "score": 2400}
2
  {"model": "o3-mini", "score": 2073}
3
  {"model": "o1", "score": 1673}
4
  {"model": "o1-mini", "score": 1650}
gaia_leaderboard.jsonl ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"model": "h2oGPTe Agent v1.6.8 (claude-3-5-sonnet)", "score": 65.12}
2
+ {"model": "Langfun Agent v2.0 (claude-3-5-sonnet, gemini-1.5-pro-002)", "score": 49.33}
3
+ {"model": "barcelona v0.1 (claude sonnet 3.5)", "score": 46.18}
4
+ {"model": "omne v0.1 (o1-preview, gpt-4o)", "score": 40.53}
5
+ {"model": "Trase Agent v0.2 (fine-tuned gemini, gpt-4o, o1-preview)", "score": 39.53}
6
+ {"model": "Multi Agent", "score": 38.87}
7
+ {"model": "DynaSaur (gpt-4o)", "score": 38.21}
8
+ {"model": "magentic-1 (o1)", "score": 38}
9
+ {"model": "Trase Agent v0.1 (fine-tuned gpt-4o)", "score": 35.55}
10
+ {"model": "sibyl system v0.2 (gpt-4o)", "score": 34.55}
11
+ {"model": "HuggingFaceAgents (gpt-4o)", "score": 33.33}
12
+ {"model": "tapeagent v0.2", "score": 33.22}
13
+ {"model": "little_potato (yanzw gpt-4o)", "score": 32.89}
14
+ {"model": "Multi-Agent Experiment v0.1 (gpt-4-turbo)", "score": 32.33}
15
+ {"model": "magentic-1", "score": 32.33}
16
+ {"model": "das_agent v0.2", "score": 32.33}
17
+ {"model": "cola_v0.4", "score": 31.89}
18
+ {"model": "modified hugging face agents + gpt-4o", "score": 31.23}
19
+ {"model": "das_agent", "score": 31}
20
+ {"model": "das_agent v0.3", "score": 30.33}
21
+ {"model": "cola_v0.3", "score": 30.23}
22
+ {"model": "gpt-4o-2024-08-06", "score": 29}
23
+ {"model": "replicated hugging face agents + gpt-4o", "score": 29}
24
+ {"model": "tapeagent v0.1", "score": 27.57}
25
+ {"model": "das_agent v0.4 mini (fixed)", "score": 26.91}
26
+ {"model": "sibyl system v0.2 (gpt-4o-2024-08-06)", "score": 26.58}
27
+ {"model": "das_agent v0.4 mini", "score": 25.91}
28
+ {"model": "mmac v1.1 (gpt4v gemini 1.5)", "score": 25.91}
29
+ {"model": "modified sibyl system", "score": 25.91}
30
+ {"model": "maac_v1", "score": 25.58}
31
+ {"model": "uk ai safety institute internal (gpt-4-turbo)", "score": 25}
32
+ {"model": "FRIDAY (gpt-4-turbo)", "score": 24.25}
33
+ {"model": "cola_abl", "score": 23.26}
34
+ {"model": "replicated hugging face agents + gpt-4o mini", "score": 22.67}
35
+ {"model": "tapeagent v0.2 mini", "score": 21.93}
36
+ {"model": "friday_without_learning (os-copilot gpt-4-turbo)", "score": 21.59}
37
+ {"model": "ceylon", "score": 17.06}
38
+ {"model": "tapeagent v0.1 mini", "score": 16.61}
39
+ {"model": "dip (gpt-4-turbo)", "score": 15.95}
40
+ {"model": "sibyl system v0.2 (gpt-4o-mini-2024-07-18)", "score": 15.61}
41
+ {"model": "cola_v0.2", "score": 15.28}
42
+ {"model": "chamomile", "score": 14.67}
43
+ {"model": "clarity v1", "score": 14.05}
44
+ {"model": "warm-up act (gpt-4-turbo)", "score": 12.96}
45
+ {"model": "frc v5", "score": 12}
46
+ {"model": "cola_v0.1", "score": 10.96}
47
+ {"model": "somedayv1.2", "score": 10.3}
48
+ {"model": "somedayv1.1", "score": 9.97}
49
+ {"model": "frc v4", "score": 9.33}
50
+ {"model": "stealth3", "score": 9.3}
51
+ {"model": "stealth2", "score": 8.97}
52
+ {"model": "someday1", "score": 8.97}
53
+ {"model": "frc v3", "score": 8.67}
54
+ {"model": "stealth", "score": 8.64}
55
+ {"model": "gpt-4-turbo", "score": 6.67}
56
+ {"model": "someday", "score": 6.31}
57
+ {"model": "AutoGPT4 (gpt-4)", "score": 5}
58
+ {"model": "gpt-4o-mini-2024-07-18", "score": 4.65}
59
+ {"model": "gpt-4", "score": 4}
60
+ {"model": "gpt-3.5-turbo", "score": 2.67}
61
+ {"model": "alphaagent v0.1 (gpt-4o)", "score": 2.33}
gpqa_leaderboard.jsonl ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {"model": "o1-2024-12-17", "score": 76}
2
+ {"model": "claude-3-5-sonnet-20240620", "score": 56}
3
+ {"model": "gpt-4o-2024-05-13", "score": 49}
4
+ {"model": "claude-3-opus-20240229", "score": 48}
5
+ {"model": "gemini-1.5-pro-001", "score": 45}
6
+ {"model": "gpt-4-1106-preview", "score": 43}
7
+ {"model": "claude-2.0", "score": 35}
8
+ {"model": "gpt-4-0613", "score": 33}
models.jsonl CHANGED
@@ -1,39 +1,60 @@
1
  {"Name": "o3", "Release Date": "2024-12-20", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
2
  {"Name": "o3-mini", "Release Date": "2024-12-20", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
 
 
 
 
 
 
3
  {"Name": "o1", "Release Date": "2024-09-12", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
 
 
 
 
 
4
  {"Name": "o1-preview-2024-09-12", "Release Date": "2024-09-12", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
5
  {"Name": "o1-preview", "Release Date": "2024-09-12", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
6
  {"Name": "o1-mini-2024-09-12", "Release Date": "2024-09-12", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
7
  {"Name": "o1-mini", "Release Date": "2024-09-12", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
8
  {"Name": "deepseek-v2.5", "Release Date": "2024-09-05", "Total Parameters": 236, "Active Parameters": 236, "API Cost": 0}
 
9
  {"Name": "qwen-plus-0828", "Release Date": "2024-08-28", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
10
  {"Name": "qwen-plus-0828", "Release Date": "2024-08-28", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
11
  {"Name": "gemini-1.5-pro-exp-0827", "Release Date": "2024-08-27", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
12
  {"Name": "gemini-1.5-flash-exp-0827", "Release Date": "2024-08-27", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
13
  {"Name": "gemini-1.5-flash-8b-exp-0827", "Release Date": "2024-08-27", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
14
  {"Name": "chatgpt-4o-latest", "Release Date": "2024-08-25", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
 
15
  {"Name": "command-r-plus-08-2024", "Release Date": "2024-08-21", "Total Parameters": 104, "Active Parameters": 104, "API Cost": 0}
16
  {"Name": "command-r-08-2024", "Release Date": "2024-08-19", "Total Parameters": 32, "Active Parameters": 32, "API Cost": 0}
17
  {"Name": "gemini-1.5-pro-exp-0801", "Release Date": "2024-08-01", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
 
18
  {"Name": "grok-2-2024-08-13", "Release Date": "2024-08-13", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
 
 
19
  {"Name": "gpt-4o", "Release Date": "2024-05-13", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
20
  {"Name": "gpt-4o-2024-05-13", "Release Date": "2024-05-13", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
21
  {"Name": "gpt-4o-mini-2024-07-18", "Release Date": "2024-07-18", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
22
  {"Name": "gemma-2-9b-it-simpo", "Release Date": "2024-07-17", "Total Parameters": 9, "Active Parameters": 9, "API Cost": 0}
23
  {"Name": "claude-3-5-sonnet-20240620", "Release Date": "2024-06-20", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
24
  {"Name": "grok-2-mini-2024-08-13", "Release Date": "2024-08-13", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
 
25
  {"Name": "gemini-advanced-0514", "Release Date": "2024-05-14", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
26
  {"Name": "llama-3.1-405b-instruct-bf16", "Release Date": "2024-07-23", "Total Parameters": 405, "Active Parameters": 405, "API Cost": 0}
27
  {"Name": "llama-3.1-405b-instruct-fp8", "Release Date": "2024-07-23", "Total Parameters": 405, "Active Parameters": 405, "API Cost": 0}
 
28
  {"Name": "llama-3.1-405b", "Release Date": "2024-07-23", "Total Parameters": 405, "Active Parameters": 405, "API Cost": 0}
29
  {"Name": "gpt-4o-2024-08-06", "Release Date": "2024-08-06", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
30
  {"Name": "gemini-1.5-pro-api-0514", "Release Date": "2024-05-14", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
31
  {"Name": "gemini-1.5-pro-api-0409-preview", "Release Date": "2024-04-09", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
32
  {"Name": "gpt-4-turbo-2024-04-09", "Release Date": "2024-04-09", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
33
  {"Name": "gpt-4-1106-preview", "Release Date": "2023-11-06", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
 
34
  {"Name": "mistral-large-2407", "Release Date": "2024-07-24", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
 
35
  {"Name": "athene-70b-0725", "Release Date": "2024-07-25", "Total Parameters": 70, "Active Parameters": 70, "API Cost": 0}
36
  {"Name": "claude-3-opus-20240229", "Release Date": "2024-02-29", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
 
37
  {"Name": "llama-3.1-70b-instruct", "Release Date": "2024-07-23", "Total Parameters": 70, "Active Parameters": 70, "API Cost": 0}
38
  {"Name": "gpt-4-0125-preview", "Release Date": "2024-01-25", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
39
  {"Name": "yi-large-preview", "Release Date": "2024-06-16", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
@@ -47,6 +68,7 @@
47
  {"Name": "nemotron-4-340b-instruct", "Release Date": "2024-06-14", "Total Parameters": 340, "Active Parameters": 340, "API Cost": 0}
48
  {"Name": "bard-jan-24-gemini-pro", "Release Date": "2024-01-01", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
49
  {"Name": "gemini-1.5-pro-001", "Release Date": "2024-02-15", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
 
50
  {"Name": "glm-4-0520", "Release Date": "2024-05-20", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0.63}
51
  {"Name": "llama-3-70b-instruct", "Release Date": "2024-04-18", "Total Parameters": 70, "Active Parameters": 70, "API Cost": 0}
52
  {"Name": "claude-3-sonnet-20240229", "Release Date": "2024-02-29", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
@@ -54,10 +76,10 @@
54
  {"Name": "reka-core-20240501", "Release Date": "2024-05-01", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
55
  {"Name": "command-r-plus", "Release Date": "2024-04-04", "Total Parameters": 104, "Active Parameters": 104, "API Cost": 0}
56
  {"Name": "gemma-2-9b-it", "Release Date": "2024-06-27", "Total Parameters": 9, "Active Parameters": 9, "API Cost": 0}
 
57
  {"Name": "qwen2-72b-instruct", "Release Date": "2024-06-07", "Total Parameters": 72, "Active Parameters": 0, "API Cost": 0}
58
  {"Name": "gpt-4", "Release Date": "2023-03-14", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
59
  {"Name": "gpt-4-0314", "Release Date": "2023-03-14", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
60
- {"Name": "gpt-4-0314", "Release Date": "2023-03-14", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
61
  {"Name": "qwen-max-0428", "Release Date": "2024-04-28", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
62
  {"Name": "glm-4-0116", "Release Date": "2024-01-16", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
63
  {"Name": "claude-3-haiku-20240307", "Release Date": "2024-03-07", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
@@ -93,6 +115,7 @@
93
  {"Name": "gemini-pro", "Release Date": "2023-12-13", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
94
  {"Name": "qwen1.5-14b-chat", "Release Date": "2024-02-04", "Total Parameters": 14, "Active Parameters": 14, "API Cost": 0}
95
  {"Name": "gpt-3.5-turbo-0314", "Release Date": "2023-03-14", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
 
96
  {"Name": "wizardlm-70b", "Release Date": "2023-08-09", "Total Parameters": 70, "Active Parameters": 70, "API Cost": 0}
97
  {"Name": "gpt-3.5-turbo-0125", "Release Date": "2024-01-25", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
98
  {"Name": "dbrx-instruct-preview", "Release Date": "2024-03-27", "Total Parameters": 132, "Active Parameters": 36, "API Cost": 0}
@@ -153,4 +176,94 @@
153
  {"Name": "stablelm-tuned-alpha-7b", "Release Date": "2023-04-20", "Total Parameters": 7, "Active Parameters": 7, "API Cost": 0}
154
  {"Name": "dolly-v2-12b", "Release Date": "2023-04-12", "Total Parameters": 12, "Active Parameters": 12, "API Cost": 0}
155
  {"Name": "llama-13b", "Release Date": "2023-02-27", "Total Parameters": 13, "Active Parameters": 13, "API Cost": 0}
156
- {"Name": "gpt-3.5", "Release Date": "2022-11-30", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  {"Name": "o3", "Release Date": "2024-12-20", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
2
  {"Name": "o3-mini", "Release Date": "2024-12-20", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
3
+ {"Name": "o1-2024-12-17", "Release Date": "2024-12-17", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
4
+ {"Name": "o1-2024-12-17 (temperature=1)", "Release Date": "2024-12-17", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
5
+ {"Name": "gemini-2.0-flash-exp", "Release Date": "2024-12-11", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
6
+ {"Name": "Jeremy Berman", "Release Date": "2024-12-06", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
7
+ {"Name": "gemini-exp-1206", "Release Date": "2024-12-06", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
8
+ {"Name": "llama-3.3-70b-instruct", "Release Date": "2024-12-06", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
9
  {"Name": "o1", "Release Date": "2024-09-12", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
10
+ {"Name": "claude-3-5-sonnet-20241022", "Release Date": "2024-10-22", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
11
+ {"Name": "claude-3.5-sonnet-20241022", "Release Date": "2024-10-22", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
12
+ {"Name": "claude-3-5-haiku-20241022", "Release Date": "2024-10-22", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
13
+ {"Name": "claude-3.5-haiku-20241022", "Release Date": "2024-10-22", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
14
+ {"Name": "gemini-1.5-pro-002", "Release Date": "2024-09-24", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
15
  {"Name": "o1-preview-2024-09-12", "Release Date": "2024-09-12", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
16
  {"Name": "o1-preview", "Release Date": "2024-09-12", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
17
  {"Name": "o1-mini-2024-09-12", "Release Date": "2024-09-12", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
18
  {"Name": "o1-mini", "Release Date": "2024-09-12", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
19
  {"Name": "deepseek-v2.5", "Release Date": "2024-09-05", "Total Parameters": 236, "Active Parameters": 236, "API Cost": 0}
20
+ {"Name": "deepseek-v2.5-0908", "Release Date": "2024-09-08", "Total Parameters": 236, "Active Parameters": 236, "API Cost": 0}
21
  {"Name": "qwen-plus-0828", "Release Date": "2024-08-28", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
22
  {"Name": "qwen-plus-0828", "Release Date": "2024-08-28", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
23
  {"Name": "gemini-1.5-pro-exp-0827", "Release Date": "2024-08-27", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
24
  {"Name": "gemini-1.5-flash-exp-0827", "Release Date": "2024-08-27", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
25
  {"Name": "gemini-1.5-flash-8b-exp-0827", "Release Date": "2024-08-27", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
26
  {"Name": "chatgpt-4o-latest", "Release Date": "2024-08-25", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
27
+ {"Name": "chatgpt-4o-latest-24-09-07", "Release Date": "2024-09-07", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
28
  {"Name": "command-r-plus-08-2024", "Release Date": "2024-08-21", "Total Parameters": 104, "Active Parameters": 104, "API Cost": 0}
29
  {"Name": "command-r-08-2024", "Release Date": "2024-08-19", "Total Parameters": 32, "Active Parameters": 32, "API Cost": 0}
30
  {"Name": "gemini-1.5-pro-exp-0801", "Release Date": "2024-08-01", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
31
+ {"Name": "grok-2-1212", "Release Date": "2024-12-12", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
32
  {"Name": "grok-2-2024-08-13", "Release Date": "2024-08-13", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
33
+ {"Name": "gpt-4o-2024-11-20", "Release Date": "2024-11-20", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
34
+ {"Name": "gpt-4o-2024-08-06", "Release Date": "2024-08-06", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
35
  {"Name": "gpt-4o", "Release Date": "2024-05-13", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
36
  {"Name": "gpt-4o-2024-05-13", "Release Date": "2024-05-13", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
37
  {"Name": "gpt-4o-mini-2024-07-18", "Release Date": "2024-07-18", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
38
  {"Name": "gemma-2-9b-it-simpo", "Release Date": "2024-07-17", "Total Parameters": 9, "Active Parameters": 9, "API Cost": 0}
39
  {"Name": "claude-3-5-sonnet-20240620", "Release Date": "2024-06-20", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
40
  {"Name": "grok-2-mini-2024-08-13", "Release Date": "2024-08-13", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
41
+ {"Name": "grok-beta", "Release Date": "2024-08-13", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
42
  {"Name": "gemini-advanced-0514", "Release Date": "2024-05-14", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
43
  {"Name": "llama-3.1-405b-instruct-bf16", "Release Date": "2024-07-23", "Total Parameters": 405, "Active Parameters": 405, "API Cost": 0}
44
  {"Name": "llama-3.1-405b-instruct-fp8", "Release Date": "2024-07-23", "Total Parameters": 405, "Active Parameters": 405, "API Cost": 0}
45
+ {"Name": "llama-3.1-405b-instruct", "Release Date": "2024-07-23", "Total Parameters": 405, "Active Parameters": 405, "API Cost": 0}
46
  {"Name": "llama-3.1-405b", "Release Date": "2024-07-23", "Total Parameters": 405, "Active Parameters": 405, "API Cost": 0}
47
  {"Name": "gpt-4o-2024-08-06", "Release Date": "2024-08-06", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
48
  {"Name": "gemini-1.5-pro-api-0514", "Release Date": "2024-05-14", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
49
  {"Name": "gemini-1.5-pro-api-0409-preview", "Release Date": "2024-04-09", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
50
  {"Name": "gpt-4-turbo-2024-04-09", "Release Date": "2024-04-09", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
51
  {"Name": "gpt-4-1106-preview", "Release Date": "2023-11-06", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
52
+ {"Name": "gpt-4-turbo", "Release Date": "2023-11-06", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
53
  {"Name": "mistral-large-2407", "Release Date": "2024-07-24", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
54
+ {"Name": "mistral-large-2", "Release Date": "2024-07-24", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
55
  {"Name": "athene-70b-0725", "Release Date": "2024-07-25", "Total Parameters": 70, "Active Parameters": 70, "API Cost": 0}
56
  {"Name": "claude-3-opus-20240229", "Release Date": "2024-02-29", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
57
+ {"Name": "meta-llama-3.1-70b-instruct", "Release Date": "2024-07-23", "Total Parameters": 70, "Active Parameters": 70, "API Cost": 0}
58
  {"Name": "llama-3.1-70b-instruct", "Release Date": "2024-07-23", "Total Parameters": 70, "Active Parameters": 70, "API Cost": 0}
59
  {"Name": "gpt-4-0125-preview", "Release Date": "2024-01-25", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
60
  {"Name": "yi-large-preview", "Release Date": "2024-06-16", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
 
68
  {"Name": "nemotron-4-340b-instruct", "Release Date": "2024-06-14", "Total Parameters": 340, "Active Parameters": 340, "API Cost": 0}
69
  {"Name": "bard-jan-24-gemini-pro", "Release Date": "2024-01-01", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
70
  {"Name": "gemini-1.5-pro-001", "Release Date": "2024-02-15", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
71
+ {"Name": "gemini-1.5-pro", "Release Date": "2024-02-15", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
72
  {"Name": "glm-4-0520", "Release Date": "2024-05-20", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0.63}
73
  {"Name": "llama-3-70b-instruct", "Release Date": "2024-04-18", "Total Parameters": 70, "Active Parameters": 70, "API Cost": 0}
74
  {"Name": "claude-3-sonnet-20240229", "Release Date": "2024-02-29", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
 
76
  {"Name": "reka-core-20240501", "Release Date": "2024-05-01", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
77
  {"Name": "command-r-plus", "Release Date": "2024-04-04", "Total Parameters": 104, "Active Parameters": 104, "API Cost": 0}
78
  {"Name": "gemma-2-9b-it", "Release Date": "2024-06-27", "Total Parameters": 9, "Active Parameters": 9, "API Cost": 0}
79
+ {"Name": "qwen2.5-coder-32b-instruct", "Release Date": "2024-09-18", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
80
  {"Name": "qwen2-72b-instruct", "Release Date": "2024-06-07", "Total Parameters": 72, "Active Parameters": 0, "API Cost": 0}
81
  {"Name": "gpt-4", "Release Date": "2023-03-14", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
82
  {"Name": "gpt-4-0314", "Release Date": "2023-03-14", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
 
83
  {"Name": "qwen-max-0428", "Release Date": "2024-04-28", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
84
  {"Name": "glm-4-0116", "Release Date": "2024-01-16", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
85
  {"Name": "claude-3-haiku-20240307", "Release Date": "2024-03-07", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
 
115
  {"Name": "gemini-pro", "Release Date": "2023-12-13", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
116
  {"Name": "qwen1.5-14b-chat", "Release Date": "2024-02-04", "Total Parameters": 14, "Active Parameters": 14, "API Cost": 0}
117
  {"Name": "gpt-3.5-turbo-0314", "Release Date": "2023-03-14", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
118
+ {"Name": "gpt-3.5-turbo", "Release Date": "2023-03-14", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
119
  {"Name": "wizardlm-70b", "Release Date": "2023-08-09", "Total Parameters": 70, "Active Parameters": 70, "API Cost": 0}
120
  {"Name": "gpt-3.5-turbo-0125", "Release Date": "2024-01-25", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
121
  {"Name": "dbrx-instruct-preview", "Release Date": "2024-03-27", "Total Parameters": 132, "Active Parameters": 36, "API Cost": 0}
 
176
  {"Name": "stablelm-tuned-alpha-7b", "Release Date": "2023-04-20", "Total Parameters": 7, "Active Parameters": 7, "API Cost": 0}
177
  {"Name": "dolly-v2-12b", "Release Date": "2023-04-12", "Total Parameters": 12, "Active Parameters": 12, "API Cost": 0}
178
  {"Name": "llama-13b", "Release Date": "2023-02-27", "Total Parameters": 13, "Active Parameters": 13, "API Cost": 0}
179
+ {"Name": "gpt-3.5", "Release Date": "2022-11-30", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
180
+ {"Name": "deepseek-coder-v2-instruct (2024-07-24)", "Release Date": "2024-07-24", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
181
+ {"Name": "gemini-exp-1114", "Release Date": "2024-11-14", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
182
+ {"Name": "athene-v2-chat", "Release Date": "2024-11-12", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
183
+ {"Name": "athene-v2-agent", "Release Date": "2024-11-12", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
184
+ {"Name": "claude-3.5-sonnet-20240620", "Release Date": "2024-06-20", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
185
+ {"Name": "o1-preview-2024-09-12 (temperature=1)", "Release Date": "2024-09-12", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
186
+ {"Name": "deepseek-v2-chat (2024-06-28)", "Release Date": "2024-06-28", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
187
+ {"Name": "o1-mini-2024-09-12 (temperature=1)", "Release Date": "2024-09-12", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
188
+ {"Name": "gemini-exp-1121", "Release Date": "2024-11-21", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
189
+ {"Name": "gemini-2.0-flash-thinking-exp-1219", "Release Date": "2024-12-19", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
190
+ {"Name": "deepseek-coder-v2-instruct", "Release Date": "2024-06-17", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
191
+ {"Name": "deepseek-v2.5-1210", "Release Date": "2024-12-10", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
192
+ {"Name": "mistral-large-instruct-2407", "Release Date": "2024-07-24", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
193
+ {"Name": "qwen2.5-72b-instruct", "Release Date": "2024-09-15", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
194
+ {"Name": "qwen2.5-32b-instruct", "Release Date": "2024-11-06", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
195
+ {"Name": "llama-3.1-nemotron-70b-instruct", "Release Date": "2024-10-02", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
196
+ {"Name": "dracarys-llama-3.1-70b-instruct", "Release Date": "2024-08-14", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
197
+ {"Name": "llama-3-70b-synthia-v3.5", "Release Date": "2024-05-26", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
198
+ {"Name": "dracarys-72b-instruct", "Release Date": "2024-08-14", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
199
+ {"Name": "hermes-2-theta-llama-3-70b", "Release Date": "2024-06-14", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
200
+ {"Name": "phi-3.1-mini-128k-instruct", "Release Date": "2024-07-01", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
201
+ {"Name": "hermes-2-pro-llama-3-70b", "Release Date": "2024-06-25", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
202
+ {"Name": "qwen2.5-14b-instruct", "Release Date": "2024-09-16", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
203
+ {"Name": "qwen2-72b-chat", "Release Date": "2024-05-22", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
204
+ {"Name": "codestral-22b-v0.1", "Release Date": "2024-05-29", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
205
+ {"Name": "qwen2.5-coder-7b-instruct", "Release Date": "2024-09-17", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
206
+ {"Name": "gemma-2-27b-instruct", "Release Date": "2024-06-27", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
207
+ {"Name": "mixtral-8x22b-instruct", "Release Date": "2024-04-17", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
208
+ {"Name": "athene-70b", "Release Date": "2024-07-19", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
209
+ {"Name": "deepseek-coder-33b-instruct", "Release Date": "2023-11-01", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
210
+ {"Name": "whiterabbitneo-33b-v1.5", "Release Date": "2024-02-09", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
211
+ {"Name": "reflectioncoder-ds-33b", "Release Date": "2024-05-28", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
212
+ {"Name": "deepseek-v2-chat", "Release Date": "2024-04-28", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
213
+ {"Name": "opencoder-8b-instruct", "Release Date": "2024-11-07", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
214
+ {"Name": "h2oGPTe Agent v1.6.8 (claude-3-5-sonnet)", "Release Date": "2024-12-21", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
215
+ {"Name": "Langfun Agent v2.0 (claude-3-5-sonnet, gemini-1.5-pro-002)", "Release Date": "2024-12-05", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
216
+ {"Name": "barcelona v0.1 (claude sonnet 3.5)", "Release Date": "2024-12-12", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
217
+ {"Name": "omne v0.1 (o1-preview, gpt-4o)", "Release Date": "2024-10-20", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
218
+ {"Name": "Trase Agent v0.2 (fine-tuned gemini, gpt-4o, o1-preview)", "Release Date": "2024-10-11", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
219
+ {"Name": "Multi Agent", "Release Date": "2024-10-10", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
220
+ {"Name": "DynaSaur (gpt-4o)", "Release Date": "2024-10-04", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
221
+ {"Name": "magentic-1 (o1)", "Release Date": "2024-10-19", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
222
+ {"Name": "Trase Agent v0.1 (fine-tuned gpt-4o)", "Release Date": "2024-09-04", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
223
+ {"Name": "sibyl system v0.2 (gpt-4o)", "Release Date": "2024-11-03", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
224
+ {"Name": "HuggingFaceAgents (gpt-4o)", "Release Date": "2024-06-27", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
225
+ {"Name": "tapeagent v0.2", "Release Date": "2024-12-09", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
226
+ {"Name": "little_potato (yanzw gpt-4o)", "Release Date": "2024-10-15", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
227
+ {"Name": "Multi-Agent Experiment v0.1 (gpt-4-turbo)", "Release Date": "2024-03-01", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
228
+ {"Name": "magentic-1", "Release Date": "2024-10-14", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
229
+ {"Name": "das_agent v0.2", "Release Date": "2024-09-18", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
230
+ {"Name": "cola_v0.4", "Release Date": "2024-11-02", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
231
+ {"Name": "modified hugging face agents + gpt-4o", "Release Date": "2024-09-03", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
232
+ {"Name": "das_agent", "Release Date": "2024-09-12", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
233
+ {"Name": "das_agent v0.3", "Release Date": "2024-09-19", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
234
+ {"Name": "cola_v0.3", "Release Date": "2024-11-01", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
235
+ {"Name": "replicated hugging face agents + gpt-4o", "Release Date": "2024-09-11", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
236
+ {"Name": "tapeagent v0.1", "Release Date": "2024-10-14", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
237
+ {"Name": "das_agent v0.4 mini (fixed)", "Release Date": "2024-10-14", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
238
+ {"Name": "sibyl system v0.2 (gpt-4o-2024-08-06)", "Release Date": "2024-09-20", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
239
+ {"Name": "das_agent v0.4 mini", "Release Date": "2024-10-14", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
240
+ {"Name": "mmac v1.1 (gpt4v gemini 1.5)", "Release Date": "2024-04-19", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
241
+ {"Name": "modified sibyl system", "Release Date": "2024-08-29", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
242
+ {"Name": "maac_v1", "Release Date": "2024-04-01", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
243
+ {"Name": "uk ai safety institute internal (gpt-4-turbo)", "Release Date": "2024-04-21", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
244
+ {"Name": "FRIDAY (gpt-4-turbo)", "Release Date": "2024-01-24", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
245
+ {"Name": "cola_abl", "Release Date": "2024-12-18", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
246
+ {"Name": "replicated hugging face agents + gpt-4o mini", "Release Date": "2024-10-09", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
247
+ {"Name": "tapeagent v0.2 mini", "Release Date": "2024-12-09", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
248
+ {"Name": "friday_without_learning (os-copilot gpt-4-turbo)", "Release Date": "2024-01-30", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
249
+ {"Name": "ceylon", "Release Date": "2024-04-19", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
250
+ {"Name": "tapeagent v0.1 mini", "Release Date": "2024-10-15", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
251
+ {"Name": "dip (gpt-4-turbo)", "Release Date": "2024-04-02", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
252
+ {"Name": "sibyl system v0.2 (gpt-4o-mini-2024-07-18)", "Release Date": "2024-10-08", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
253
+ {"Name": "cola_v0.2", "Release Date": "2024-10-24", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
254
+ {"Name": "chamomile", "Release Date": "2024-03-05", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
255
+ {"Name": "clarity v1", "Release Date": "2024-02-27", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
256
+ {"Name": "warm-up act (gpt-4-turbo)", "Release Date": "2024-02-22", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
257
+ {"Name": "frc v5", "Release Date": "2024-04-19", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
258
+ {"Name": "cola_v0.1", "Release Date": "2024-10-23", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
259
+ {"Name": "somedayv1.2", "Release Date": "2024-06-16", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
260
+ {"Name": "somedayv1.1", "Release Date": "2024-06-16", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
261
+ {"Name": "frc v4", "Release Date": "2024-04-18", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
262
+ {"Name": "stealth3", "Release Date": "2024-02-20", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
263
+ {"Name": "stealth2", "Release Date": "2024-02-20", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
264
+ {"Name": "someday1", "Release Date": "2024-06-16", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
265
+ {"Name": "frc v3", "Release Date": "2024-04-09", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
266
+ {"Name": "stealth", "Release Date": "2024-02-19", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
267
+ {"Name": "someday", "Release Date": "2024-06-16", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
268
+ {"Name": "AutoGPT4 (gpt-4)", "Release Date": "2023-11-03", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
269
+ {"Name": "alphaagent v0.1 (gpt-4o)", "Release Date": "2024-10-24", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
planbench_leaderboard.jsonl CHANGED
@@ -1,3 +1,4 @@
1
  {"model": "o1-preview-2024-09-12", "score": 52.8}
2
  {"model": "llama-3.1-405b", "score": 0.8}
3
- {"model": "gpt-4", "score": 0.16}
 
 
1
  {"model": "o1-preview-2024-09-12", "score": 52.8}
2
  {"model": "llama-3.1-405b", "score": 0.8}
3
+ {"model": "gpt-4", "score": 0.16}
4
+ {"model": "gpt-4o", "score": 0}
simple_bench_leaderboard.jsonl CHANGED
@@ -1,10 +1,17 @@
1
- {"model": "claude-3-5-sonnet-20240620", "score": 27}
2
- {"model": "gpt-4-1106-preview", "score": 26}
3
- {"model": "claude-3-opus-20240229", "score": 25}
4
- {"model": "llama-3.1-405b-instruct-fp8", "score": 22}
5
- {"model": "gemini-1.5-pro-001", "score": 21}
6
- {"model": "gpt-4-0613", "score": 18}
7
- {"model": "gpt-4o-2024-05-13", "score": 16}
8
- {"model": "deepseek-v2-api-0628", "score": 15}
9
- {"model": "mistral-large-2407", "score": 13}
10
- {"model": "gpt-4o-mini-2024-07-18", "score": 5}
 
 
 
 
 
 
 
 
1
+ {"model": "o1-preview-2024-09-12", "score": 41.7}
2
+ {"model": "claude-3-5-sonnet-20241022", "score": 41.4}
3
+ {"model": "o1-2024-12-17", "score": 36.7}
4
+ {"model": "gemini-exp-1206", "score": 31.1}
5
+ {"model": "claude-3-5-sonnet-20240620", "score": 27.5}
6
+ {"model": "gemini-1.5-pro-002", "score": 27.1}
7
+ {"model": "gpt-4-turbo-2024-04-09", "score": 25.1}
8
+ {"model": "claude-3-opus-20240229", "score": 23.5}
9
+ {"model": "llama-3.1-405b-instruct-fp8", "score": 23.0}
10
+ {"model": "grok-beta", "score": 22.7}
11
+ {"model": "mistral-large-2407", "score": 22.5}
12
+ {"model": "llama-3.3-70b-instruct", "score": 19.9}
13
+ {"model": "gemini-2.0-flash-exp", "score": 18.9}
14
+ {"model": "o1-mini-2024-09-12", "score": 18.1}
15
+ {"model": "gpt-4o-2024-08-06", "score": 17.8}
16
+ {"model": "command-r-plus", "score": 17.4}
17
+ {"model": "gpt-4o-mini-2024-07-18", "score": 10.7}
zeroeval_average_leaderboard.jsonl ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"model": "o1-preview-2024-09-12", "score": 86.1}
2
+ {"model": "o1-mini-2024-09-12", "score": 80.6}
3
+ {"model": "claude-3-5-sonnet-20241022", "score": 67.1}
4
+ {"model": "gemini-1.5-pro-exp-0827", "score": 66.1}
5
+ {"model": "gpt-4o-2024-08-06", "score": 65.6}
6
+ {"model": "chatgpt-4o-latest-24-09-07", "score": 64.6}
7
+ {"model": "gpt-4o-2024-05-13", "score": 64.3}
8
+ {"model": "claude-3-5-sonnet-20240620", "score": 63.0}
9
+ {"model": "grok-2-1212", "score": 62.8}
10
+ {"model": "qwen2.5-72b-instruct", "score": 61.6}
11
+ {"model": "llama-3.1-405b-instruct", "score": 59.8}
12
+ {"model": "gpt-4-turbo-2024-04-09", "score": 59.8}
13
+ {"model": "gemini-1.5-flash-exp-0827", "score": 59.0}
14
+ {"model": "mistral-large-2", "score": 58.9}
15
+ {"model": "gpt-4o-mini-2024-07-18", "score": 57.4}
16
+ {"model": "deepseek-v2.5-0908", "score": 54.3}
17
+ {"model": "claude-3-opus-20240229", "score": 54.2}
18
+ {"model": "meta-llama-3.1-70b-instruct", "score": 53.8}
19
+ {"model": "claude-3-5-haiku-20241022", "score": 53.4}
20
+ {"model": "gemini-1.5-pro", "score": 52.5}
21
+ {"model": "gpt-4-0314", "score": 52.3}
zeroeval_crux_leaderboard.jsonl ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"model": "o1-preview-2024-09-12", "score": 95.9}
2
+ {"model": "o1-mini-2024-09-12", "score": 93.8}
3
+ {"model": "claude-3-5-sonnet-20241022", "score": 83.9}
4
+ {"model": "gemini-1.5-pro-exp-0827", "score": 79.6}
5
+ {"model": "gpt-4o-2024-08-06", "score": 87.0}
6
+ {"model": "chatgpt-4o-latest-24-09-07", "score": 86.5}
7
+ {"model": "gpt-4o-2024-05-13", "score": 86.1}
8
+ {"model": "claude-3-5-sonnet-20240620", "score": 80.8}
9
+ {"model": "grok-2-1212", "score": 75.3}
10
+ {"model": "qwen2.5-72b-instruct", "score": 73.9}
11
+ {"model": "llama-3.1-405b-instruct", "score": 73.0}
12
+ {"model": "gpt-4-turbo-2024-04-09", "score": 78.9}
13
+ {"model": "gemini-1.5-flash-exp-0827", "score": 74.5}
14
+ {"model": "mistral-large-2", "score": 75.1}
15
+ {"model": "gpt-4o-mini-2024-07-18", "score": 75.9}
16
+ {"model": "deepseek-v2.5-0908", "score": 70.0}
17
+ {"model": "claude-3-opus-20240229", "score": 70.4}
18
+ {"model": "meta-llama-3.1-70b-instruct", "score": 64.3}
19
+ {"model": "claude-3-5-haiku-20241022", "score": 68.8}
20
+ {"model": "gemini-1.5-pro", "score": 68.0}
21
+ {"model": "gpt-4-0314", "score": 74.5}
zeroeval_math_l5_leaderboard.jsonl ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"model": "o1-preview-2024-09-12", "score": 84.5}
2
+ {"model": "o1-mini-2024-09-12", "score": 89.3}
3
+ {"model": "claude-3-5-sonnet-20241022", "score": 59.4}
4
+ {"model": "gemini-1.5-pro-exp-0827", "score": 68.1}
5
+ {"model": "gpt-4o-2024-08-06", "score": 55.3}
6
+ {"model": "chatgpt-4o-latest-24-09-07", "score": 53.1}
7
+ {"model": "gpt-4o-2024-05-13", "score": 54.8}
8
+ {"model": "claude-3-5-sonnet-20240620", "score": 51.9}
9
+ {"model": "grok-2-1212", "score": 60.9}
10
+ {"model": "qwen2.5-72b-instruct", "score": 60.2}
11
+ {"model": "llama-3.1-405b-instruct", "score": 49.8}
12
+ {"model": "gpt-4-turbo-2024-04-09", "score": 46.5}
13
+ {"model": "gemini-1.5-flash-exp-0827", "score": 54.5}
14
+ {"model": "mistral-large-2", "score": 48.5}
15
+ {"model": "gpt-4o-mini-2024-07-18", "score": 52.2}
16
+ {"model": "deepseek-v2.5-0908", "score": 44.7}
17
+ {"model": "claude-3-opus-20240229", "score": 36.9}
18
+ {"model": "meta-llama-3.1-70b-instruct", "score": 43.1}
19
+ {"model": "claude-3-5-haiku-20241022", "score": 46.5}
20
+ {"model": "gemini-1.5-pro", "score": 39.8}
21
+ {"model": "gpt-4-0314", "score": 26.1}
zeroeval_mmlu_redux_leaderboard.jsonl ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"model": "o1-preview-2024-09-12", "score": 92.8}
2
+ {"model": "o1-mini-2024-09-12", "score": 86.7}
3
+ {"model": "claude-3-5-sonnet-20241022", "score": 88.9}
4
+ {"model": "gemini-1.5-pro-exp-0827", "score": 86.1}
5
+ {"model": "gpt-4o-2024-08-06", "score": 88.3}
6
+ {"model": "chatgpt-4o-latest-24-09-07", "score": 88.9}
7
+ {"model": "gpt-4o-2024-05-13", "score": 88.0}
8
+ {"model": "claude-3-5-sonnet-20240620", "score": 86.0}
9
+ {"model": "grok-2-1212", "score": 87.4}
10
+ {"model": "qwen2.5-72b-instruct", "score": 85.6}
11
+ {"model": "llama-3.1-405b-instruct", "score": 86.2}
12
+ {"model": "gpt-4-turbo-2024-04-09", "score": 85.3}
13
+ {"model": "gemini-1.5-flash-exp-0827", "score": 82.1}
14
+ {"model": "mistral-large-2", "score": 83.0}
15
+ {"model": "gpt-4o-mini-2024-07-18", "score": 81.5}
16
+ {"model": "deepseek-v2.5-0908", "score": 80.4}
17
+ {"model": "claude-3-opus-20240229", "score": 82.5}
18
+ {"model": "meta-llama-3.1-70b-instruct", "score": 83.0}
19
+ {"model": "claude-3-5-haiku-20241022", "score": 79.6}
20
+ {"model": "gemini-1.5-pro", "score": 82.8}
21
+ {"model": "gpt-4-0314", "score": 81.6}
zeroeval_zebralogic_leaderboard.jsonl ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"model": "o1-2024-12-17", "score": 81.0}
2
+ {"model": "o1-preview-2024-09-12", "score": 71.4}
3
+ {"model": "o1-mini-2024-09-12", "score": 52.6}
4
+ {"model": "claude-3-5-sonnet-20241022", "score": 36.2}
5
+ {"model": "gemini-1.5-pro-exp-0827", "score": 30.5}
6
+ {"model": "gpt-4o-2024-08-06", "score": 31.7}
7
+ {"model": "chatgpt-4o-latest-24-09-07", "score": 29.9}
8
+ {"model": "gpt-4o-2024-05-13", "score": 28.2}
9
+ {"model": "claude-3-5-sonnet-20240620", "score": 33.4}
10
+ {"model": "grok-2-1212", "score": 27.7}
11
+ {"model": "qwen2.5-72b-instruct", "score": 26.6}
12
+ {"model": "llama-3.1-405b-instruct", "score": 30.1}
13
+ {"model": "gpt-4-turbo-2024-04-09", "score": 28.4}
14
+ {"model": "gemini-1.5-flash-exp-0827", "score": 25.0}
15
+ {"model": "mistral-large-2", "score": 29.0}
16
+ {"model": "gpt-4o-mini-2024-07-18", "score": 20.1}
17
+ {"model": "deepseek-v2.5-0908", "score": 22.1}
18
+ {"model": "claude-3-opus-20240229", "score": 27.0}
19
+ {"model": "meta-llama-3.1-70b-instruct", "score": 24.9}
20
+ {"model": "claude-3-5-haiku-20241022", "score": 18.7}
21
+ {"model": "gemini-1.5-pro", "score": 19.4}
22
+ {"model": "gpt-4-0314", "score": 27.1}