Commit
·
afb8d0c
1
Parent(s):
0a86c6a
Add new benchmarks; Several improvements
Browse files- app.py +220 -46
- arc_agi_semi_private_eval_leaderboard.jsonl +6 -0
- big_five_capex.jsonl +40 -40
- bigcodebench_hard_average_leaderboard.jsonl +62 -0
- codeforces_leaderboard.jsonl +1 -1
- gaia_leaderboard.jsonl +61 -0
- gpqa_leaderboard.jsonl +8 -0
- models.jsonl +115 -2
- planbench_leaderboard.jsonl +2 -1
- simple_bench_leaderboard.jsonl +17 -10
- zeroeval_average_leaderboard.jsonl +21 -0
- zeroeval_crux_leaderboard.jsonl +21 -0
- zeroeval_math_l5_leaderboard.jsonl +21 -0
- zeroeval_mmlu_redux_leaderboard.jsonl +21 -0
- zeroeval_zebralogic_leaderboard.jsonl +22 -0
app.py
CHANGED
@@ -1,8 +1,10 @@
|
|
1 |
import json
|
2 |
-
from datetime import datetime, date
|
3 |
|
4 |
import gradio as gr
|
5 |
import plotly.graph_objects as go
|
|
|
|
|
6 |
|
7 |
|
8 |
def create_big_five_capex_plot() -> go.Figure:
|
@@ -11,8 +13,8 @@ def create_big_five_capex_plot() -> go.Figure:
|
|
11 |
data = [json.loads(line) for line in file if line.strip()]
|
12 |
|
13 |
quarters: list[str] = [entry["Quarter"] for entry in data]
|
14 |
-
companies = ['Microsoft', 'Google', 'Meta', '
|
15 |
-
colors = ['#80bb00', '#ee161f', '#0065e3', '#
|
16 |
|
17 |
x_positions = list(range(len(quarters)))
|
18 |
|
@@ -29,7 +31,7 @@ def create_big_five_capex_plot() -> go.Figure:
|
|
29 |
fig = go.Figure(data=traces)
|
30 |
fig.update_layout(
|
31 |
barmode="stack",
|
32 |
-
title="Capital Expenditures of
|
33 |
xaxis_title="Quarter",
|
34 |
yaxis_title="Capital Expenditures (Millions USD)",
|
35 |
xaxis=dict(
|
@@ -37,7 +39,7 @@ def create_big_five_capex_plot() -> go.Figure:
|
|
37 |
tickvals=x_positions,
|
38 |
ticktext=quarters
|
39 |
),
|
40 |
-
height=
|
41 |
)
|
42 |
|
43 |
# Calculate the x position for the vertical dotted line.
|
@@ -86,12 +88,14 @@ def create_big_five_capex_plot() -> go.Figure:
|
|
86 |
|
87 |
def create_simple_plot(data_path: str,
|
88 |
name: str,
|
|
|
89 |
start_date: datetime, end_date: datetime,
|
90 |
-
min_value: int = 0, max_value: int = 100
|
91 |
-
|
|
|
92 |
with open(data_path, 'r') as file:
|
93 |
for line in file:
|
94 |
-
|
95 |
|
96 |
models = []
|
97 |
with open("models.jsonl", 'r') as file:
|
@@ -99,7 +103,7 @@ def create_simple_plot(data_path: str,
|
|
99 |
models.append(json.loads(line))
|
100 |
|
101 |
data = []
|
102 |
-
for entry in
|
103 |
model_name = entry['model']
|
104 |
score = entry['score']
|
105 |
model_info = next((m for m in models if m['Name'] == model_name), None)
|
@@ -142,8 +146,8 @@ def create_simple_plot(data_path: str,
|
|
142 |
))
|
143 |
|
144 |
fig.update_layout(
|
145 |
-
title=f'{name} Over Time',
|
146 |
-
xaxis_title='Release Date',
|
147 |
yaxis_title=name,
|
148 |
hovermode='x unified',
|
149 |
xaxis=dict(
|
@@ -156,67 +160,237 @@ def create_simple_plot(data_path: str,
|
|
156 |
height=800
|
157 |
)
|
158 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
159 |
return fig
|
160 |
|
161 |
|
162 |
with gr.Blocks() as demo:
|
163 |
with gr.Tab("System Performance Over Time"):
|
164 |
-
with gr.Tab("
|
165 |
-
|
166 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
167 |
simple_bench_plot: gr.Plot = gr.Plot()
|
168 |
-
|
|
|
|
|
|
|
169 |
planbench_plot: gr.Plot = gr.Plot()
|
170 |
planbench_markdown: gr.Markdown = gr.Markdown(
|
171 |
value="""Source: [Valmeekam et al. 2024](https://arxiv.org/abs/2409.13373)"""
|
172 |
)
|
173 |
-
with gr.Tab("
|
174 |
-
with gr.Tab("General-Purpose Systems"):
|
175 |
-
codeforces_plot: gr.Plot = gr.Plot()
|
176 |
-
with gr.Tab("BigCodeBench", interactive=False):
|
177 |
-
bigcodebench_plot: gr.Plot = gr.Plot()
|
178 |
-
with gr.Tab("GAIA", interactive=False):
|
179 |
gaia_plot: gr.Plot = gr.Plot()
|
180 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
181 |
gpqa_plot: gr.Plot = gr.Plot()
|
182 |
-
|
183 |
-
|
184 |
-
|
185 |
-
|
186 |
-
|
187 |
-
|
188 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
189 |
opencompass_plot: gr.Plot = gr.Plot()
|
190 |
-
|
|
|
|
|
|
|
191 |
swe_bench_plot: gr.Plot = gr.Plot()
|
192 |
-
|
|
|
|
|
|
|
193 |
webarena_plot: gr.Plot = gr.Plot()
|
194 |
-
|
195 |
-
|
|
|
196 |
with gr.Tab("Finance") as finance_tab:
|
197 |
-
with gr.Tab("Big
|
198 |
big_five_capex_plot: gr.Plot = gr.Plot()
|
199 |
-
with gr.Tab("NVIDIA Revenue",
|
200 |
nvidia_revenue_plot: gr.Plot = gr.Plot()
|
201 |
big_five_capex_tab.select(fn=create_big_five_capex_plot, outputs=big_five_capex_plot)
|
202 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
203 |
arc_agi_tab.select(fn=create_simple_plot,
|
204 |
-
inputs=[gr.State("
|
205 |
-
gr.State(
|
206 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
207 |
simple_bench_tab.select(fn=create_simple_plot,
|
208 |
-
inputs=[gr.State("simple_bench_leaderboard.jsonl"),
|
209 |
-
gr.State(
|
|
|
|
|
|
|
|
|
210 |
outputs=simple_bench_plot)
|
211 |
codeforces_tab.select(fn=create_simple_plot,
|
212 |
-
inputs=[gr.State("codeforces_leaderboard.jsonl"),
|
213 |
-
gr.State(
|
214 |
-
gr.State(
|
|
|
|
|
|
|
215 |
outputs=codeforces_plot)
|
216 |
planbench_tab.select(fn=create_simple_plot,
|
217 |
-
inputs=[gr.State("planbench_leaderboard.jsonl"),
|
218 |
-
gr.State(
|
|
|
|
|
219 |
outputs=planbench_plot)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
220 |
|
221 |
|
222 |
if __name__ == "__main__":
|
|
|
1 |
import json
|
2 |
+
from datetime import datetime, date, timedelta
|
3 |
|
4 |
import gradio as gr
|
5 |
import plotly.graph_objects as go
|
6 |
+
from scipy.optimize import curve_fit
|
7 |
+
import numpy as np
|
8 |
|
9 |
|
10 |
def create_big_five_capex_plot() -> go.Figure:
|
|
|
13 |
data = [json.loads(line) for line in file if line.strip()]
|
14 |
|
15 |
quarters: list[str] = [entry["Quarter"] for entry in data]
|
16 |
+
companies = ['Microsoft', 'Google', 'Meta', 'Amazon']
|
17 |
+
colors = ['#80bb00', '#ee161f', '#0065e3', '#ff6200']
|
18 |
|
19 |
x_positions = list(range(len(quarters)))
|
20 |
|
|
|
31 |
fig = go.Figure(data=traces)
|
32 |
fig.update_layout(
|
33 |
barmode="stack",
|
34 |
+
title="Capital Expenditures of Amazon, Meta, Google and Microsoft in Millions of USD per Quarter",
|
35 |
xaxis_title="Quarter",
|
36 |
yaxis_title="Capital Expenditures (Millions USD)",
|
37 |
xaxis=dict(
|
|
|
39 |
tickvals=x_positions,
|
40 |
ticktext=quarters
|
41 |
),
|
42 |
+
height=800
|
43 |
)
|
44 |
|
45 |
# Calculate the x position for the vertical dotted line.
|
|
|
88 |
|
89 |
def create_simple_plot(data_path: str,
|
90 |
name: str,
|
91 |
+
subtitle: str,
|
92 |
start_date: datetime, end_date: datetime,
|
93 |
+
min_value: int = 0, max_value: int = 100,
|
94 |
+
labeled_horizontal_lines: dict[str, float] = None) -> go.Figure:
|
95 |
+
leaderboard = []
|
96 |
with open(data_path, 'r') as file:
|
97 |
for line in file:
|
98 |
+
leaderboard.append(json.loads(line))
|
99 |
|
100 |
models = []
|
101 |
with open("models.jsonl", 'r') as file:
|
|
|
103 |
models.append(json.loads(line))
|
104 |
|
105 |
data = []
|
106 |
+
for entry in leaderboard:
|
107 |
model_name = entry['model']
|
108 |
score = entry['score']
|
109 |
model_info = next((m for m in models if m['Name'] == model_name), None)
|
|
|
146 |
))
|
147 |
|
148 |
fig.update_layout(
|
149 |
+
title=f'{name} Over Time<br><sup>{subtitle}</sup>',
|
150 |
+
xaxis_title='Publication or Release Date',
|
151 |
yaxis_title=name,
|
152 |
hovermode='x unified',
|
153 |
xaxis=dict(
|
|
|
160 |
height=800
|
161 |
)
|
162 |
|
163 |
+
if labeled_horizontal_lines:
|
164 |
+
for label, y_value in labeled_horizontal_lines.items():
|
165 |
+
fig.add_hline(
|
166 |
+
y=y_value,
|
167 |
+
line_dash="dot",
|
168 |
+
line_color="black",
|
169 |
+
annotation_text=label,
|
170 |
+
annotation_position="right",
|
171 |
+
annotation=dict(
|
172 |
+
font_size=12,
|
173 |
+
font_color="black",
|
174 |
+
xanchor="left",
|
175 |
+
yanchor="middle",
|
176 |
+
xshift=10
|
177 |
+
)
|
178 |
+
)
|
179 |
+
|
180 |
return fig
|
181 |
|
182 |
|
183 |
with gr.Blocks() as demo:
|
184 |
with gr.Tab("System Performance Over Time"):
|
185 |
+
with gr.Tab("Legend"):
|
186 |
+
legend_markdown: gr.Markdown = gr.Markdown(
|
187 |
+
value="""
|
188 |
+
## Benchmarks and Top Scores
|
189 |
+
|
190 |
+
| Benchmark | Top Score |
|
191 |
+
|-----------|-----------|
|
192 |
+
| BigCodeBench | 🟠 36% |
|
193 |
+
| Simple Bench | 🟠 42% |
|
194 |
+
| PlanBench | 🟠 53% |
|
195 |
+
| GAIA | 🟡 65% |
|
196 |
+
| ARC-AGI-Pub (Semi-Private Eval) | 🟡 76% |
|
197 |
+
| GPQA | 🟡 76% |
|
198 |
+
| ZebraLogic | 🟡 81% |
|
199 |
+
| ARC-AGI-Pub (Public Eval) | 🟡 83% |
|
200 |
+
| ZeroEval | 🟡 86% |
|
201 |
+
| MATH-L5 | 🟡 89% |
|
202 |
+
| MMLU-Redux | 🟢 93% |
|
203 |
+
| CRUX | 🟢 96% |
|
204 |
+
|
205 |
+
## Colors
|
206 |
+
|
207 |
+
| Color | Score Range |
|
208 |
+
|-------|------------|
|
209 |
+
| 🔴 Red | Below 30% |
|
210 |
+
| 🟠 Orange | 30% to 60% |
|
211 |
+
| 🟡 Yellow | 60% to 90% |
|
212 |
+
| 🟢 Green | Above 90% |"""
|
213 |
+
)
|
214 |
+
with gr.Tab("🟠 BigCodeBench") as bigcodebench_tab:
|
215 |
+
bigcodebench_plot: gr.Plot = gr.Plot()
|
216 |
+
bigcodebench_markdown: gr.Markdown = gr.Markdown(
|
217 |
+
value="""Source: [BigCodeBench Leaderboard](https://bigcode-bench.github.io/)"""
|
218 |
+
)
|
219 |
+
with gr.Tab("🟠 Simple Bench") as simple_bench_tab:
|
220 |
simple_bench_plot: gr.Plot = gr.Plot()
|
221 |
+
simple_bench_markdown: gr.Markdown = gr.Markdown(
|
222 |
+
value="""Source: [SimpleBench Leaderboard](https://simple-bench.com/)"""
|
223 |
+
)
|
224 |
+
with gr.Tab("🟠 PlanBench") as planbench_tab:
|
225 |
planbench_plot: gr.Plot = gr.Plot()
|
226 |
planbench_markdown: gr.Markdown = gr.Markdown(
|
227 |
value="""Source: [Valmeekam et al. 2024](https://arxiv.org/abs/2409.13373)"""
|
228 |
)
|
229 |
+
with gr.Tab("🟡 GAIA") as gaia_tab:
|
|
|
|
|
|
|
|
|
|
|
230 |
gaia_plot: gr.Plot = gr.Plot()
|
231 |
+
gaia_markdown: gr.Markdown = gr.Markdown(
|
232 |
+
value="""Source: [GAIA Leaderboard](https://huggingface.co/spaces/gaia-benchmark/leaderboard)"""
|
233 |
+
)
|
234 |
+
with gr.Tab("🟡 ARC-AGI-Pub") as arc_agi_tab:
|
235 |
+
with gr.Tab("🟡 Semi-Private Eval") as arc_agi_semi_private_eval_tab:
|
236 |
+
arc_agi_semi_private_eval_plot: gr.Plot = gr.Plot()
|
237 |
+
with gr.Tab("🟡 Public Eval") as arc_agi_public_eval_tab:
|
238 |
+
arc_agi_public_eval_plot: gr.Plot = gr.Plot()
|
239 |
+
arc_agi_markdown: gr.Markdown = gr.Markdown(
|
240 |
+
value="""Source: [ARC Prize 2024](https://arcprize.org/2024-results)"""
|
241 |
+
)
|
242 |
+
with gr.Tab("🟡 GPQA") as gpqa_tab:
|
243 |
gpqa_plot: gr.Plot = gr.Plot()
|
244 |
+
gpqa_markdown: gr.Markdown = gr.Markdown(
|
245 |
+
value="""Source: [Epoch AI Benchmarking Dashboard](https://epoch.ai/data/ai-benchmarking-dashboard)"""
|
246 |
+
)
|
247 |
+
with gr.Tab("🟡 ZebraLogic") as zeroeval_zebralogic_tab:
|
248 |
+
zeroeval_zebralogic_plot: gr.Plot = gr.Plot()
|
249 |
+
zeroeval_zebralogic_markdown: gr.Markdown = gr.Markdown(
|
250 |
+
value="""Source: [ZeroEval Leaderboard](https://huggingface.co/spaces/allenai/ZeroEval)"""
|
251 |
+
)
|
252 |
+
with gr.Tab("🟡 ZeroEval") as zeroeval_average_tab:
|
253 |
+
zeroeval_average_plot: gr.Plot = gr.Plot()
|
254 |
+
zeroeval_average_markdown: gr.Markdown = gr.Markdown(
|
255 |
+
value="""Source: [ZeroEval Leaderboard](https://huggingface.co/spaces/allenai/ZeroEval)"""
|
256 |
+
)
|
257 |
+
with gr.Tab("🟡 MATH-L5") as zeroeval_math_l5_tab:
|
258 |
+
zeroeval_math_l5_plot: gr.Plot = gr.Plot()
|
259 |
+
zeroeval_math_l5_markdown: gr.Markdown = gr.Markdown(
|
260 |
+
value="""Source: [ZeroEval Leaderboard](https://huggingface.co/spaces/allenai/ZeroEval)"""
|
261 |
+
)
|
262 |
+
with gr.Tab("🟢 MMLU-Redux") as zeroeval_mmlu_redux_tab:
|
263 |
+
zeroeval_mmlu_redux_plot: gr.Plot = gr.Plot()
|
264 |
+
zeroeval_mmlu_redux_markdown: gr.Markdown = gr.Markdown(
|
265 |
+
value="""Source: [ZeroEval Leaderboard](https://huggingface.co/spaces/allenai/ZeroEval)"""
|
266 |
+
)
|
267 |
+
with gr.Tab("🟢 CRUX") as zeroeval_crux_tab:
|
268 |
+
zeroeval_crux_plot: gr.Plot = gr.Plot()
|
269 |
+
zeroeval_crux_markdown: gr.Markdown = gr.Markdown(
|
270 |
+
value="""Source: [ZeroEval Leaderboard](https://huggingface.co/spaces/allenai/ZeroEval)"""
|
271 |
+
)
|
272 |
+
with gr.Tab("Codeforces") as codeforces_tab:
|
273 |
+
codeforces_plot: gr.Plot = gr.Plot()
|
274 |
+
with gr.Tab("OpenCompass", visible=False):
|
275 |
opencompass_plot: gr.Plot = gr.Plot()
|
276 |
+
opencompass_markdown: gr.Markdown = gr.Markdown(
|
277 |
+
value="""Source: [OpenCompass LLM Leaderboard](https://huggingface.co/spaces/opencompass/opencompass-llm-leaderboard)"""
|
278 |
+
)
|
279 |
+
with gr.Tab("SWE-bench", visible=False):
|
280 |
swe_bench_plot: gr.Plot = gr.Plot()
|
281 |
+
swe_bench_markdown: gr.Markdown = gr.Markdown(
|
282 |
+
value="""Source: [SWE-bench Leaderboard](https://www.swebench.com/)"""
|
283 |
+
)
|
284 |
+
with gr.Tab("WebArena", visible=False):
|
285 |
webarena_plot: gr.Plot = gr.Plot()
|
286 |
+
webarena_markdown: gr.Markdown = gr.Markdown(
|
287 |
+
value="""Source: [X-WebArena-Leaderboard](https://docs.google.com/spreadsheets/d/1M801lEpBbKSNwP-vDBkC_pF7LdyGU1f_ufZb_NWNBZQ)"""
|
288 |
+
)
|
289 |
with gr.Tab("Finance") as finance_tab:
|
290 |
+
with gr.Tab("Big Tech Capex") as big_five_capex_tab:
|
291 |
big_five_capex_plot: gr.Plot = gr.Plot()
|
292 |
+
with gr.Tab("NVIDIA Revenue", visible=False) as nvidia_revenue:
|
293 |
nvidia_revenue_plot: gr.Plot = gr.Plot()
|
294 |
big_five_capex_tab.select(fn=create_big_five_capex_plot, outputs=big_five_capex_plot)
|
295 |
+
arc_agi_public_eval_tab.select(fn=create_simple_plot,
|
296 |
+
inputs=[gr.State("arc_agi_leaderboard.jsonl"),
|
297 |
+
gr.State("ARC-AGI-Pub Score (Public Eval, $20 Compute Budget per Task, General-Purpose Systems)"),
|
298 |
+
gr.State("\"ARC can be seen as a general artificial intelligence benchmark, as a program synthesis benchmark, or as a psychometric intelligence test.\" (Chollet, 2019)"),
|
299 |
+
gr.State(date(2024, 5, 1)), gr.State(date(2025, 1, 1)),
|
300 |
+
gr.State(0), gr.State(100),
|
301 |
+
gr.State({"Humans\n(LeGris et al. 2024)": 64.2})],
|
302 |
+
outputs=arc_agi_public_eval_plot)
|
303 |
arc_agi_tab.select(fn=create_simple_plot,
|
304 |
+
inputs=[gr.State("arc_agi_semi_private_eval_leaderboard.jsonl"),
|
305 |
+
gr.State("ARC-AGI-Pub Score (Semi-Private Eval, $20 Compute Budget per Task, General-Purpose Systems)"),
|
306 |
+
gr.State("\"ARC can be seen as a general artificial intelligence benchmark, as a program synthesis benchmark, or as a psychometric intelligence test.\" (Chollet, 2019)"),
|
307 |
+
gr.State(date(2024, 5, 1)), gr.State(date(2025, 1, 1)),
|
308 |
+
gr.State(0), gr.State(100),
|
309 |
+
gr.State({"MTurkers": 77})],
|
310 |
+
outputs=arc_agi_semi_private_eval_plot)
|
311 |
+
arc_agi_semi_private_eval_tab.select(fn=create_simple_plot,
|
312 |
+
inputs=[gr.State("arc_agi_semi_private_eval_leaderboard.jsonl"),
|
313 |
+
gr.State("ARC-AGI-Pub Score (Semi-Private Eval, $20 Compute Budget per Task, General-Purpose Systems)"),
|
314 |
+
gr.State("\"ARC can be seen as a general artificial intelligence benchmark, as a program synthesis benchmark, or as a psychometric intelligence test.\" (Chollet, 2019)"),
|
315 |
+
gr.State(date(2024, 5, 1)), gr.State(date(2025, 1, 1)),
|
316 |
+
gr.State(0), gr.State(100),
|
317 |
+
gr.State({"MTurkers": 77})],
|
318 |
+
outputs=arc_agi_semi_private_eval_plot)
|
319 |
+
finance_tab.select(fn=create_big_five_capex_plot, outputs=big_five_capex_plot)
|
320 |
simple_bench_tab.select(fn=create_simple_plot,
|
321 |
+
inputs=[gr.State("simple_bench_leaderboard.jsonl"),
|
322 |
+
gr.State("Simple Bench Score"),
|
323 |
+
gr.State("\"multiple-choice text benchmark [...] [including] over 200 questions covering spatio-temporal reasoning, social intelligence, and what we call linguistic adversarial robustness\" (Philip & Hemang, 2024)"),
|
324 |
+
gr.State(date(2024, 4, 1)), gr.State(date(2025, 1, 1)),
|
325 |
+
gr.State(0), gr.State(100),
|
326 |
+
gr.State({"Humans": 83.7})],
|
327 |
outputs=simple_bench_plot)
|
328 |
codeforces_tab.select(fn=create_simple_plot,
|
329 |
+
inputs=[gr.State("codeforces_leaderboard.jsonl"),
|
330 |
+
gr.State("Codeforces Rating"),
|
331 |
+
gr.State("\"[Codeforces] is a platform where [programming] contests are held regularly, the participant's skills are reflected by their rating [...] The rating is a modification of Elo rating\" (Mirzayanov, 2011)"),
|
332 |
+
gr.State(date(2024, 5, 1)), gr.State(date(2025, 1, 1)),
|
333 |
+
gr.State(0), gr.State(4000),
|
334 |
+
gr.State({"Pupil": 1200, "Specialist": 1400, "Expert": 1600, "Candidate Master": 1900, "Master": 2100, "International Master": 2300, "Grandmaster": 2400, "International Grandmaster": 2600, "Legendary Grandmaster": 3000})],
|
335 |
outputs=codeforces_plot)
|
336 |
planbench_tab.select(fn=create_simple_plot,
|
337 |
+
inputs=[gr.State("planbench_leaderboard.jsonl"),
|
338 |
+
gr.State("PlanBench Score (Mystery Blocksworld, 0-shot)"),
|
339 |
+
gr.State("\"benchmark suite based on the kinds of domains used in the automated planning community [...] to test the capabilities of LLMs in planning or reasoning about actions and change.\" (Valmeekam et al. 2022)"),
|
340 |
+
gr.State(date(2023, 3, 1)), gr.State(date(2024, 9, 20))],
|
341 |
outputs=planbench_plot)
|
342 |
+
bigcodebench_tab.select(fn=create_simple_plot,
|
343 |
+
inputs=[gr.State("bigcodebench_hard_average_leaderboard.jsonl"),
|
344 |
+
gr.State("BigCodeBench Score (Hard, Average of Complete and Instruct)"),
|
345 |
+
gr.State("\"benchmark that challenges LLMs to invoke multiple function calls as tools from 139 libraries and 7 domains for 1,140 fine-grained tasks\" (Zhuo et al. 2024)"),
|
346 |
+
gr.State(date(2023, 6, 1)), gr.State(date(2025, 1, 1))],
|
347 |
+
outputs=bigcodebench_plot)
|
348 |
+
gaia_tab.select(fn=create_simple_plot,
|
349 |
+
inputs=[gr.State("gaia_leaderboard.jsonl"),
|
350 |
+
gr.State("General AI Assistants (GAIA) Benchmark Score (Test Set, Average)"),
|
351 |
+
gr.State("\"real-world questions that require a set of fundamental abilities such as reasoning, multi-modality handling, web browsing, and generally tool-use proficiency\" (Mialon et al. 2023)"),
|
352 |
+
gr.State(date(2023, 3, 1)), gr.State(date(2025, 1, 1)),
|
353 |
+
gr.State(0), gr.State(100),
|
354 |
+
gr.State({"Humans": 92})],
|
355 |
+
outputs=gaia_plot)
|
356 |
+
gpqa_tab.select(fn=create_simple_plot,
|
357 |
+
inputs=[gr.State("gpqa_leaderboard.jsonl"),
|
358 |
+
gr.State("Graduate-Level Google-Proof Q&A (GPQA) Benchmark Score"),
|
359 |
+
gr.State("\"challenging dataset of 448 multiple-choice questions written by domain experts in biology, physics, and chemistry [that] are high-quality and extremely difficult\" (Rein et al. 2023)"),
|
360 |
+
gr.State(date(2023, 6, 1)), gr.State(date(2025, 1, 1)),
|
361 |
+
gr.State(25), gr.State(100),
|
362 |
+
gr.State({"Highly skilled non-expert validators": 34, "PhD-level domain experts": 65})],
|
363 |
+
outputs=gpqa_plot)
|
364 |
+
zeroeval_average_tab.select(fn=create_simple_plot,
|
365 |
+
inputs=[gr.State("zeroeval_average_leaderboard.jsonl"),
|
366 |
+
gr.State("ZeroEval Average (MMLU-Redux, ZebraLogic, CRUX and MATH-5) Score"),
|
367 |
+
gr.State("\"a simple unified framework for evaluating language models on various tasks\" (Ai2, 2024)"),
|
368 |
+
gr.State(date(2023, 3, 1)), gr.State(date(2025, 1, 1))],
|
369 |
+
outputs=zeroeval_average_plot)
|
370 |
+
zeroeval_mmlu_redux_tab.select(fn=create_simple_plot,
|
371 |
+
inputs=[gr.State("zeroeval_mmlu_redux_leaderboard.jsonl"),
|
372 |
+
gr.State("ZeroEval MMLU-Redux (Massive Multitask Language Understanding) Score"),
|
373 |
+
gr.State("\"knowledge reasoning\" (Ai2, 2024); \"subset of 3,000 manually re-annotated questions across 30 MMLU subjects\" (Gema et al. 2024)"),
|
374 |
+
gr.State(date(2023, 3, 1)), gr.State(date(2025, 1, 1))],
|
375 |
+
outputs=zeroeval_mmlu_redux_plot)
|
376 |
+
zeroeval_zebralogic_tab.select(fn=create_simple_plot,
|
377 |
+
inputs=[gr.State("zeroeval_zebralogic_leaderboard.jsonl"),
|
378 |
+
gr.State("ZeroEval ZebraLogic Score"),
|
379 |
+
gr.State("\"logical reasoning\" (Ai2, 2024); \"Each example is a Logic Grid Puzzle [...] often used to test humans' logical reasoning abilities\" (Lin, 2024)"),
|
380 |
+
gr.State(date(2023, 3, 1)), gr.State(date(2025, 1, 1))],
|
381 |
+
outputs=zeroeval_zebralogic_plot)
|
382 |
+
zeroeval_crux_tab.select(fn=create_simple_plot,
|
383 |
+
inputs=[gr.State("zeroeval_crux_leaderboard.jsonl"),
|
384 |
+
gr.State("ZeroEval CRUX (Code Reasoning, Understanding, and eXecution Evaluation) Score"),
|
385 |
+
gr.State("\"code reasoning\" (Ai2, 2024); \"benchmark consisting of 800 Python functions (3-13 lines). Each function comes with [...] two natural tasks: input prediction and output prediction.\" (Gu et al. 2024)"),
|
386 |
+
gr.State(date(2023, 3, 1)), gr.State(date(2025, 1, 1))],
|
387 |
+
outputs=zeroeval_crux_plot)
|
388 |
+
zeroeval_math_l5_tab.select(fn=create_simple_plot,
|
389 |
+
inputs=[gr.State("zeroeval_math_l5_leaderboard.jsonl"),
|
390 |
+
gr.State("ZeroEval MATH-L5 (Difficulty Level 5 of MATH) Score"),
|
391 |
+
gr.State("\"math reasoning\" (Ai2, 2024); \"dataset of 12,500 challenging competition mathematics problems. [...] a subject’s hardest problems are assigned a difficulty level of ‘5.’\" (Hendrycks et al. 2021)"),
|
392 |
+
gr.State(date(2023, 3, 1)), gr.State(date(2025, 1, 1))],
|
393 |
+
outputs=zeroeval_math_l5_plot)
|
394 |
|
395 |
|
396 |
if __name__ == "__main__":
|
arc_agi_semi_private_eval_leaderboard.jsonl
ADDED
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{"model": "o3", "score": 75.7}
|
2 |
+
{"model": "o1-2024-12-17", "score": 32}
|
3 |
+
{"model": "o1-preview-2024-09-12", "score": 18}
|
4 |
+
{"model": "claude-3-5-sonnet-20240620", "score": 14}
|
5 |
+
{"model": "gpt-4o-2024-05-13", "score": 5}
|
6 |
+
{"model": "gemini-1.5-pro-001", "score": 4.5}
|
big_five_capex.jsonl
CHANGED
@@ -1,40 +1,40 @@
|
|
1 |
-
{"Quarter": "2015 Q1", "Microsoft": 1391, "Google": 2927, "Meta": 502, "
|
2 |
-
{"Quarter": "2015 Q2", "Microsoft": 1781, "Google": 2515, "Meta": 549, "
|
3 |
-
{"Quarter": "2015 Q3", "Microsoft": 1356, "Google": 2406, "Meta": 780, "
|
4 |
-
{"Quarter": "2015 Q4", "Microsoft": 2024, "Google": 2102, "Meta": 692, "
|
5 |
-
{"Quarter": "2016 Q1", "Microsoft": 2308, "Google": 2444, "Meta": 1132, "
|
6 |
-
{"Quarter": "2016 Q2", "Microsoft": 2655, "Google": 2136, "Meta": 995, "
|
7 |
-
{"Quarter": "2016 Q3", "Microsoft": 2163, "Google": 2554, "Meta": 1095, "
|
8 |
-
{"Quarter": "2016 Q4", "Microsoft": 1988, "Google": 3078, "Meta": 1269, "
|
9 |
-
{"Quarter": "2017 Q1", "Microsoft": 1695, "Google": 2508, "Meta": 1271, "
|
10 |
-
{"Quarter": "2017 Q2", "Microsoft": 2283, "Google": 2831, "Meta": 1444, "
|
11 |
-
{"Quarter": "2017 Q3", "Microsoft": 2132, "Google": 3538, "Meta": 1755, "
|
12 |
-
{"Quarter": "2017 Q4", "Microsoft": 2586, "Google": 4307, "Meta": 2263, "
|
13 |
-
{"Quarter": "2018 Q1", "Microsoft": 2934, "Google": 7299, "Meta": 2812, "
|
14 |
-
{"Quarter": "2018 Q2", "Microsoft": 3980, "Google": 5477, "Meta": 3460, "
|
15 |
-
{"Quarter": "2018 Q3", "Microsoft": 3602, "Google": 5282, "Meta": 3342, "
|
16 |
-
{"Quarter": "2018 Q4", "Microsoft": 3707, "Google": 7081, "Meta": 4301, "
|
17 |
-
{"Quarter": "2019 Q1", "Microsoft": 2565, "Google": 4638, "Meta": 3837, "
|
18 |
-
{"Quarter": "2019 Q2", "Microsoft": 4051, "Google": 6126, "Meta": 3633, "
|
19 |
-
{"Quarter": "2019 Q3", "Microsoft": 3385, "Google": 6732, "Meta": 3532, "
|
20 |
-
{"Quarter": "2019 Q4", "Microsoft": 3545, "Google": 6052, "Meta": 4100, "
|
21 |
-
{"Quarter": "2020 Q1", "Microsoft": 3767, "Google": 6005, "Meta": 3558, "
|
22 |
-
{"Quarter": "2020 Q2", "Microsoft": 4744, "Google": 5391, "Meta": 3255, "
|
23 |
-
{"Quarter": "2020 Q3", "Microsoft": 4907, "Google": 5406, "Meta": 3689, "
|
24 |
-
{"Quarter": "2020 Q4", "Microsoft": 4174, "Google": 5479, "Meta": 4613, "
|
25 |
-
{"Quarter": "2021 Q1", "Microsoft": 5089, "Google": 5942, "Meta": 4303, "
|
26 |
-
{"Quarter": "2021 Q2", "Microsoft": 6452, "Google": 5496, "Meta": 4641, "
|
27 |
-
{"Quarter": "2021 Q3", "Microsoft": 5810, "Google": 6819, "Meta": 4346, "
|
28 |
-
{"Quarter": "2021 Q4", "Microsoft": 5865, "Google": 6383, "Meta": 5400, "
|
29 |
-
{"Quarter": "2022 Q1", "Microsoft": 5340, "Google": 9786, "Meta": 5441, "
|
30 |
-
{"Quarter": "2022 Q2", "Microsoft": 6871, "Google": 6828, "Meta": 7572, "
|
31 |
-
{"Quarter": "2022 Q3", "Microsoft": 6283, "Google": 7276, "Meta": 9375, "
|
32 |
-
{"Quarter": "2022 Q4", "Microsoft": 6274, "Google": 7595, "Meta": 9043, "
|
33 |
-
{"Quarter": "2023 Q1", "Microsoft": 6607, "Google": 6289, "Meta": 6823, "
|
34 |
-
{"Quarter": "2023 Q2", "Microsoft": 8943, "Google": 6888, "Meta": 6134, "
|
35 |
-
{"Quarter": "2023 Q3", "Microsoft": 9917, "Google": 8055, "Meta": 6543, "
|
36 |
-
{"Quarter": "2023 Q4", "Microsoft": 9735, "Google": 11019, "Meta": 7665, "
|
37 |
-
{"Quarter": "2024 Q1", "Microsoft": 10952, "Google": 12012, "Meta": 6400, "
|
38 |
-
{"Quarter": "2024 Q2", "Microsoft": 13873, "Google": 13186, "Meta": 8173, "
|
39 |
-
{"Quarter": "2024 Q3", "Microsoft": 14923, "Google": 13016, "Meta": 8258, "
|
40 |
-
{"Quarter": "2024 Q4", "Microsoft": 15804, "Google": 14276, "Meta": 14425, "
|
|
|
1 |
+
{"Quarter": "2015 Q1", "Microsoft": 1391, "Google": 2927, "Meta": 502, "Amazon": 871}
|
2 |
+
{"Quarter": "2015 Q2", "Microsoft": 1781, "Google": 2515, "Meta": 549, "Amazon": 1213}
|
3 |
+
{"Quarter": "2015 Q3", "Microsoft": 1356, "Google": 2406, "Meta": 780, "Amazon": 1195}
|
4 |
+
{"Quarter": "2015 Q4", "Microsoft": 2024, "Google": 2102, "Meta": 692, "Amazon": 1309}
|
5 |
+
{"Quarter": "2016 Q1", "Microsoft": 2308, "Google": 2444, "Meta": 1132, "Amazon": 1179}
|
6 |
+
{"Quarter": "2016 Q2", "Microsoft": 2655, "Google": 2136, "Meta": 995, "Amazon": 1711}
|
7 |
+
{"Quarter": "2016 Q3", "Microsoft": 2163, "Google": 2554, "Meta": 1095, "Amazon": 1841}
|
8 |
+
{"Quarter": "2016 Q4", "Microsoft": 1988, "Google": 3078, "Meta": 1269, "Amazon": 3073}
|
9 |
+
{"Quarter": "2017 Q1", "Microsoft": 1695, "Google": 2508, "Meta": 1271, "Amazon": 2148}
|
10 |
+
{"Quarter": "2017 Q2", "Microsoft": 2283, "Google": 2831, "Meta": 1444, "Amazon": 3113}
|
11 |
+
{"Quarter": "2017 Q3", "Microsoft": 2132, "Google": 3538, "Meta": 1755, "Amazon": 3074}
|
12 |
+
{"Quarter": "2017 Q4", "Microsoft": 2586, "Google": 4307, "Meta": 2263, "Amazon": 3619}
|
13 |
+
{"Quarter": "2018 Q1", "Microsoft": 2934, "Google": 7299, "Meta": 2812, "Amazon": 3098}
|
14 |
+
{"Quarter": "2018 Q2", "Microsoft": 3980, "Google": 5477, "Meta": 3460, "Amazon": 3243}
|
15 |
+
{"Quarter": "2018 Q3", "Microsoft": 3602, "Google": 5282, "Meta": 3342, "Amazon": 3352}
|
16 |
+
{"Quarter": "2018 Q4", "Microsoft": 3707, "Google": 7081, "Meta": 4301, "Amazon": 3734}
|
17 |
+
{"Quarter": "2019 Q1", "Microsoft": 2565, "Google": 4638, "Meta": 3837, "Amazon": 3290}
|
18 |
+
{"Quarter": "2019 Q2", "Microsoft": 4051, "Google": 6126, "Meta": 3633, "Amazon": 3562}
|
19 |
+
{"Quarter": "2019 Q3", "Microsoft": 3385, "Google": 6732, "Meta": 3532, "Amazon": 4697}
|
20 |
+
{"Quarter": "2019 Q4", "Microsoft": 3545, "Google": 6052, "Meta": 4100, "Amazon": 5312}
|
21 |
+
{"Quarter": "2020 Q1", "Microsoft": 3767, "Google": 6005, "Meta": 3558, "Amazon": 6795}
|
22 |
+
{"Quarter": "2020 Q2", "Microsoft": 4744, "Google": 5391, "Meta": 3255, "Amazon": 7459}
|
23 |
+
{"Quarter": "2020 Q3", "Microsoft": 4907, "Google": 5406, "Meta": 3689, "Amazon": 11063}
|
24 |
+
{"Quarter": "2020 Q4", "Microsoft": 4174, "Google": 5479, "Meta": 4613, "Amazon": 14823}
|
25 |
+
{"Quarter": "2021 Q1", "Microsoft": 5089, "Google": 5942, "Meta": 4303, "Amazon": 12082}
|
26 |
+
{"Quarter": "2021 Q2", "Microsoft": 6452, "Google": 5496, "Meta": 4641, "Amazon": 14288}
|
27 |
+
{"Quarter": "2021 Q3", "Microsoft": 5810, "Google": 6819, "Meta": 4346, "Amazon": 15748}
|
28 |
+
{"Quarter": "2021 Q4", "Microsoft": 5865, "Google": 6383, "Meta": 5400, "Amazon": 18935}
|
29 |
+
{"Quarter": "2022 Q1", "Microsoft": 5340, "Google": 9786, "Meta": 5441, "Amazon": 14951}
|
30 |
+
{"Quarter": "2022 Q2", "Microsoft": 6871, "Google": 6828, "Meta": 7572, "Amazon": 15724}
|
31 |
+
{"Quarter": "2022 Q3", "Microsoft": 6283, "Google": 7276, "Meta": 9375, "Amazon": 16378}
|
32 |
+
{"Quarter": "2022 Q4", "Microsoft": 6274, "Google": 7595, "Meta": 9043, "Amazon": 16592}
|
33 |
+
{"Quarter": "2023 Q1", "Microsoft": 6607, "Google": 6289, "Meta": 6823, "Amazon": 14207}
|
34 |
+
{"Quarter": "2023 Q2", "Microsoft": 8943, "Google": 6888, "Meta": 6134, "Amazon": 11455}
|
35 |
+
{"Quarter": "2023 Q3", "Microsoft": 9917, "Google": 8055, "Meta": 6543, "Amazon": 12479}
|
36 |
+
{"Quarter": "2023 Q4", "Microsoft": 9735, "Google": 11019, "Meta": 7665, "Amazon": 14588}
|
37 |
+
{"Quarter": "2024 Q1", "Microsoft": 10952, "Google": 12012, "Meta": 6400, "Amazon": 14925}
|
38 |
+
{"Quarter": "2024 Q2", "Microsoft": 13873, "Google": 13186, "Meta": 8173, "Amazon": 17620}
|
39 |
+
{"Quarter": "2024 Q3", "Microsoft": 14923, "Google": 13016, "Meta": 8258, "Amazon": 22620}
|
40 |
+
{"Quarter": "2024 Q4", "Microsoft": 15804, "Google": 14276, "Meta": 14425, "Amazon": 27834}
|
bigcodebench_hard_average_leaderboard.jsonl
ADDED
@@ -0,0 +1,62 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{"model": "o1-2024-12-17", "score": 35.5}
|
2 |
+
{"model": "gemini-exp-1206", "score": 34.1}
|
3 |
+
{"model": "gpt-4-turbo-2024-04-09", "score": 32.1}
|
4 |
+
{"model": "athene-v2-chat", "score": 32.1}
|
5 |
+
{"model": "athene-v2-agent", "score": 31.4}
|
6 |
+
{"model": "gpt-4o-2024-11-20", "score": 31.1}
|
7 |
+
{"model": "gpt-4o-2024-08-06", "score": 30.8}
|
8 |
+
{"model": "qwen2.5-coder-32b-instruct", "score": 30.8}
|
9 |
+
{"model": "claude-3.5-sonnet-20241022", "score": 30.4}
|
10 |
+
{"model": "claude-3.5-haiku-20241022", "score": 30.1}
|
11 |
+
{"model": "claude-3.5-sonnet-20240620", "score": 29.4}
|
12 |
+
{"model": "deepseek-coder-v2-instruct (2024-07-24)", "score": 29.4}
|
13 |
+
{"model": "gemini-1.5-pro-exp-0827", "score": 29.4}
|
14 |
+
{"model": "gemini-exp-1114", "score": 29.4}
|
15 |
+
{"model": "o1-preview-2024-09-12 (temperature=1)", "score": 28.8}
|
16 |
+
{"model": "deepseek-v2-chat (2024-06-28)", "score": 28.7}
|
17 |
+
{"model": "llama-3.3-70b-instruct", "score": 28.4}
|
18 |
+
{"model": "gemini-2.0-flash-exp", "score": 28.1}
|
19 |
+
{"model": "gemini-1.5-pro-exp-0801", "score": 27.4}
|
20 |
+
{"model": "o1-mini-2024-09-12 (temperature=1)", "score": 27.4}
|
21 |
+
{"model": "gemini-exp-1121", "score": 27.4}
|
22 |
+
{"model": "gemini-2.0-flash-thinking-exp-1219", "score": 27.4}
|
23 |
+
{"model": "gpt-4o-2024-05-13", "score": 27.1}
|
24 |
+
{"model": "deepseek-coder-v2-instruct", "score": 27}
|
25 |
+
{"model": "gemini-1.5-pro-002", "score": 26.6}
|
26 |
+
{"model": "grok-beta", "score": 26.6}
|
27 |
+
{"model": "llama-3.1-405b-instruct", "score": 26.4}
|
28 |
+
{"model": "deepseek-v2.5-1210", "score": 26.4}
|
29 |
+
{"model": "deepseek-v2.5", "score": 26.1}
|
30 |
+
{"model": "claude-3-opus-20240229", "score": 26}
|
31 |
+
{"model": "mistral-large-instruct-2407", "score": 26}
|
32 |
+
{"model": "gemini-1.5-pro-api-0514", "score": 25.4}
|
33 |
+
{"model": "llama-3.1-70b-instruct", "score": 25.4}
|
34 |
+
{"model": "qwen2.5-72b-instruct", "score": 25.4}
|
35 |
+
{"model": "gpt-4o-mini-2024-07-18", "score": 25.3}
|
36 |
+
{"model": "llama-3-70b-instruct", "score": 24.6}
|
37 |
+
{"model": "qwen2.5-32b-instruct", "score": 24.6}
|
38 |
+
{"model": "llama-3.1-nemotron-70b-instruct", "score": 24.6}
|
39 |
+
{"model": "dracarys-llama-3.1-70b-instruct", "score": 24.3}
|
40 |
+
{"model": "gemini-1.5-flash-api-0514", "score": 23.6}
|
41 |
+
{"model": "llama-3-70b-synthia-v3.5", "score": 23.6}
|
42 |
+
{"model": "claude-3-sonnet-20240229", "score": 23.4}
|
43 |
+
{"model": "dracarys-72b-instruct", "score": 22.6}
|
44 |
+
{"model": "hermes-2-theta-llama-3-70b", "score": 22.3}
|
45 |
+
{"model": "phi-3.1-mini-128k-instruct", "score": 22}
|
46 |
+
{"model": "hermes-2-pro-llama-3-70b", "score": 21.6}
|
47 |
+
{"model": "gemini-1.5-flash-exp-0827", "score": 21.6}
|
48 |
+
{"model": "qwen2.5-14b-instruct", "score": 20.9}
|
49 |
+
{"model": "qwen2-72b-chat", "score": 20.6}
|
50 |
+
{"model": "codestral-22b-v0.1", "score": 20.6}
|
51 |
+
{"model": "qwen2.5-coder-7b-instruct", "score": 20.3}
|
52 |
+
{"model": "gemma-2-27b-instruct", "score": 20}
|
53 |
+
{"model": "gpt-3.5-turbo-0125", "score": 19.9}
|
54 |
+
{"model": "mixtral-8x22b-instruct", "score": 19.9}
|
55 |
+
{"model": "athene-70b", "score": 19.9}
|
56 |
+
{"model": "deepseek-coder-33b-instruct", "score": 19.3}
|
57 |
+
{"model": "whiterabbitneo-33b-v1.5", "score": 19.3}
|
58 |
+
{"model": "reflectioncoder-ds-33b", "score": 18.9}
|
59 |
+
{"model": "deepseek-v2-chat", "score": 18.6}
|
60 |
+
{"model": "opencoder-8b-instruct", "score": 18.5}
|
61 |
+
{"model": "claude-3-haiku-20240307", "score": 18.3}
|
62 |
+
{"model": "gpt-4-0613", "score": 17.6}
|
codeforces_leaderboard.jsonl
CHANGED
@@ -1,4 +1,4 @@
|
|
1 |
-
{"model": "o3", "score":
|
2 |
{"model": "o3-mini", "score": 2073}
|
3 |
{"model": "o1", "score": 1673}
|
4 |
{"model": "o1-mini", "score": 1650}
|
|
|
1 |
+
{"model": "o3", "score": 2400}
|
2 |
{"model": "o3-mini", "score": 2073}
|
3 |
{"model": "o1", "score": 1673}
|
4 |
{"model": "o1-mini", "score": 1650}
|
gaia_leaderboard.jsonl
ADDED
@@ -0,0 +1,61 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{"model": "h2oGPTe Agent v1.6.8 (claude-3-5-sonnet)", "score": 65.12}
|
2 |
+
{"model": "Langfun Agent v2.0 (claude-3-5-sonnet, gemini-1.5-pro-002)", "score": 49.33}
|
3 |
+
{"model": "barcelona v0.1 (claude sonnet 3.5)", "score": 46.18}
|
4 |
+
{"model": "omne v0.1 (o1-preview, gpt-4o)", "score": 40.53}
|
5 |
+
{"model": "Trase Agent v0.2 (fine-tuned gemini, gpt-4o, o1-preview)", "score": 39.53}
|
6 |
+
{"model": "Multi Agent", "score": 38.87}
|
7 |
+
{"model": "DynaSaur (gpt-4o)", "score": 38.21}
|
8 |
+
{"model": "magentic-1 (o1)", "score": 38}
|
9 |
+
{"model": "Trase Agent v0.1 (fine-tuned gpt-4o)", "score": 35.55}
|
10 |
+
{"model": "sibyl system v0.2 (gpt-4o)", "score": 34.55}
|
11 |
+
{"model": "HuggingFaceAgents (gpt-4o)", "score": 33.33}
|
12 |
+
{"model": "tapeagent v0.2", "score": 33.22}
|
13 |
+
{"model": "little_potato (yanzw gpt-4o)", "score": 32.89}
|
14 |
+
{"model": "Multi-Agent Experiment v0.1 (gpt-4-turbo)", "score": 32.33}
|
15 |
+
{"model": "magentic-1", "score": 32.33}
|
16 |
+
{"model": "das_agent v0.2", "score": 32.33}
|
17 |
+
{"model": "cola_v0.4", "score": 31.89}
|
18 |
+
{"model": "modified hugging face agents + gpt-4o", "score": 31.23}
|
19 |
+
{"model": "das_agent", "score": 31}
|
20 |
+
{"model": "das_agent v0.3", "score": 30.33}
|
21 |
+
{"model": "cola_v0.3", "score": 30.23}
|
22 |
+
{"model": "gpt-4o-2024-08-06", "score": 29}
|
23 |
+
{"model": "replicated hugging face agents + gpt-4o", "score": 29}
|
24 |
+
{"model": "tapeagent v0.1", "score": 27.57}
|
25 |
+
{"model": "das_agent v0.4 mini (fixed)", "score": 26.91}
|
26 |
+
{"model": "sibyl system v0.2 (gpt-4o-2024-08-06)", "score": 26.58}
|
27 |
+
{"model": "das_agent v0.4 mini", "score": 25.91}
|
28 |
+
{"model": "mmac v1.1 (gpt4v gemini 1.5)", "score": 25.91}
|
29 |
+
{"model": "modified sibyl system", "score": 25.91}
|
30 |
+
{"model": "maac_v1", "score": 25.58}
|
31 |
+
{"model": "uk ai safety institute internal (gpt-4-turbo)", "score": 25}
|
32 |
+
{"model": "FRIDAY (gpt-4-turbo)", "score": 24.25}
|
33 |
+
{"model": "cola_abl", "score": 23.26}
|
34 |
+
{"model": "replicated hugging face agents + gpt-4o mini", "score": 22.67}
|
35 |
+
{"model": "tapeagent v0.2 mini", "score": 21.93}
|
36 |
+
{"model": "friday_without_learning (os-copilot gpt-4-turbo)", "score": 21.59}
|
37 |
+
{"model": "ceylon", "score": 17.06}
|
38 |
+
{"model": "tapeagent v0.1 mini", "score": 16.61}
|
39 |
+
{"model": "dip (gpt-4-turbo)", "score": 15.95}
|
40 |
+
{"model": "sibyl system v0.2 (gpt-4o-mini-2024-07-18)", "score": 15.61}
|
41 |
+
{"model": "cola_v0.2", "score": 15.28}
|
42 |
+
{"model": "chamomile", "score": 14.67}
|
43 |
+
{"model": "clarity v1", "score": 14.05}
|
44 |
+
{"model": "warm-up act (gpt-4-turbo)", "score": 12.96}
|
45 |
+
{"model": "frc v5", "score": 12}
|
46 |
+
{"model": "cola_v0.1", "score": 10.96}
|
47 |
+
{"model": "somedayv1.2", "score": 10.3}
|
48 |
+
{"model": "somedayv1.1", "score": 9.97}
|
49 |
+
{"model": "frc v4", "score": 9.33}
|
50 |
+
{"model": "stealth3", "score": 9.3}
|
51 |
+
{"model": "stealth2", "score": 8.97}
|
52 |
+
{"model": "someday1", "score": 8.97}
|
53 |
+
{"model": "frc v3", "score": 8.67}
|
54 |
+
{"model": "stealth", "score": 8.64}
|
55 |
+
{"model": "gpt-4-turbo", "score": 6.67}
|
56 |
+
{"model": "someday", "score": 6.31}
|
57 |
+
{"model": "AutoGPT4 (gpt-4)", "score": 5}
|
58 |
+
{"model": "gpt-4o-mini-2024-07-18", "score": 4.65}
|
59 |
+
{"model": "gpt-4", "score": 4}
|
60 |
+
{"model": "gpt-3.5-turbo", "score": 2.67}
|
61 |
+
{"model": "alphaagent v0.1 (gpt-4o)", "score": 2.33}
|
gpqa_leaderboard.jsonl
ADDED
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{"model": "o1-2024-12-17", "score": 76}
|
2 |
+
{"model": "claude-3-5-sonnet-20240620", "score": 56}
|
3 |
+
{"model": "gpt-4o-2024-05-13", "score": 49}
|
4 |
+
{"model": "claude-3-opus-20240229", "score": 48}
|
5 |
+
{"model": "gemini-1.5-pro-001", "score": 45}
|
6 |
+
{"model": "gpt-4-1106-preview", "score": 43}
|
7 |
+
{"model": "claude-2.0", "score": 35}
|
8 |
+
{"model": "gpt-4-0613", "score": 33}
|
models.jsonl
CHANGED
@@ -1,39 +1,60 @@
|
|
1 |
{"Name": "o3", "Release Date": "2024-12-20", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
2 |
{"Name": "o3-mini", "Release Date": "2024-12-20", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
|
|
|
|
|
|
|
|
|
|
|
|
3 |
{"Name": "o1", "Release Date": "2024-09-12", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
|
|
|
|
|
|
|
|
|
|
4 |
{"Name": "o1-preview-2024-09-12", "Release Date": "2024-09-12", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
5 |
{"Name": "o1-preview", "Release Date": "2024-09-12", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
6 |
{"Name": "o1-mini-2024-09-12", "Release Date": "2024-09-12", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
7 |
{"Name": "o1-mini", "Release Date": "2024-09-12", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
8 |
{"Name": "deepseek-v2.5", "Release Date": "2024-09-05", "Total Parameters": 236, "Active Parameters": 236, "API Cost": 0}
|
|
|
9 |
{"Name": "qwen-plus-0828", "Release Date": "2024-08-28", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
10 |
{"Name": "qwen-plus-0828", "Release Date": "2024-08-28", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
11 |
{"Name": "gemini-1.5-pro-exp-0827", "Release Date": "2024-08-27", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
12 |
{"Name": "gemini-1.5-flash-exp-0827", "Release Date": "2024-08-27", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
13 |
{"Name": "gemini-1.5-flash-8b-exp-0827", "Release Date": "2024-08-27", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
14 |
{"Name": "chatgpt-4o-latest", "Release Date": "2024-08-25", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
|
|
15 |
{"Name": "command-r-plus-08-2024", "Release Date": "2024-08-21", "Total Parameters": 104, "Active Parameters": 104, "API Cost": 0}
|
16 |
{"Name": "command-r-08-2024", "Release Date": "2024-08-19", "Total Parameters": 32, "Active Parameters": 32, "API Cost": 0}
|
17 |
{"Name": "gemini-1.5-pro-exp-0801", "Release Date": "2024-08-01", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
|
|
18 |
{"Name": "grok-2-2024-08-13", "Release Date": "2024-08-13", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
|
|
|
|
19 |
{"Name": "gpt-4o", "Release Date": "2024-05-13", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
20 |
{"Name": "gpt-4o-2024-05-13", "Release Date": "2024-05-13", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
21 |
{"Name": "gpt-4o-mini-2024-07-18", "Release Date": "2024-07-18", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
22 |
{"Name": "gemma-2-9b-it-simpo", "Release Date": "2024-07-17", "Total Parameters": 9, "Active Parameters": 9, "API Cost": 0}
|
23 |
{"Name": "claude-3-5-sonnet-20240620", "Release Date": "2024-06-20", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
24 |
{"Name": "grok-2-mini-2024-08-13", "Release Date": "2024-08-13", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
|
|
25 |
{"Name": "gemini-advanced-0514", "Release Date": "2024-05-14", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
26 |
{"Name": "llama-3.1-405b-instruct-bf16", "Release Date": "2024-07-23", "Total Parameters": 405, "Active Parameters": 405, "API Cost": 0}
|
27 |
{"Name": "llama-3.1-405b-instruct-fp8", "Release Date": "2024-07-23", "Total Parameters": 405, "Active Parameters": 405, "API Cost": 0}
|
|
|
28 |
{"Name": "llama-3.1-405b", "Release Date": "2024-07-23", "Total Parameters": 405, "Active Parameters": 405, "API Cost": 0}
|
29 |
{"Name": "gpt-4o-2024-08-06", "Release Date": "2024-08-06", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
30 |
{"Name": "gemini-1.5-pro-api-0514", "Release Date": "2024-05-14", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
31 |
{"Name": "gemini-1.5-pro-api-0409-preview", "Release Date": "2024-04-09", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
32 |
{"Name": "gpt-4-turbo-2024-04-09", "Release Date": "2024-04-09", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
33 |
{"Name": "gpt-4-1106-preview", "Release Date": "2023-11-06", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
|
|
34 |
{"Name": "mistral-large-2407", "Release Date": "2024-07-24", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
|
|
35 |
{"Name": "athene-70b-0725", "Release Date": "2024-07-25", "Total Parameters": 70, "Active Parameters": 70, "API Cost": 0}
|
36 |
{"Name": "claude-3-opus-20240229", "Release Date": "2024-02-29", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
|
|
37 |
{"Name": "llama-3.1-70b-instruct", "Release Date": "2024-07-23", "Total Parameters": 70, "Active Parameters": 70, "API Cost": 0}
|
38 |
{"Name": "gpt-4-0125-preview", "Release Date": "2024-01-25", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
39 |
{"Name": "yi-large-preview", "Release Date": "2024-06-16", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
@@ -47,6 +68,7 @@
|
|
47 |
{"Name": "nemotron-4-340b-instruct", "Release Date": "2024-06-14", "Total Parameters": 340, "Active Parameters": 340, "API Cost": 0}
|
48 |
{"Name": "bard-jan-24-gemini-pro", "Release Date": "2024-01-01", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
49 |
{"Name": "gemini-1.5-pro-001", "Release Date": "2024-02-15", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
|
|
50 |
{"Name": "glm-4-0520", "Release Date": "2024-05-20", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0.63}
|
51 |
{"Name": "llama-3-70b-instruct", "Release Date": "2024-04-18", "Total Parameters": 70, "Active Parameters": 70, "API Cost": 0}
|
52 |
{"Name": "claude-3-sonnet-20240229", "Release Date": "2024-02-29", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
@@ -54,10 +76,10 @@
|
|
54 |
{"Name": "reka-core-20240501", "Release Date": "2024-05-01", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
55 |
{"Name": "command-r-plus", "Release Date": "2024-04-04", "Total Parameters": 104, "Active Parameters": 104, "API Cost": 0}
|
56 |
{"Name": "gemma-2-9b-it", "Release Date": "2024-06-27", "Total Parameters": 9, "Active Parameters": 9, "API Cost": 0}
|
|
|
57 |
{"Name": "qwen2-72b-instruct", "Release Date": "2024-06-07", "Total Parameters": 72, "Active Parameters": 0, "API Cost": 0}
|
58 |
{"Name": "gpt-4", "Release Date": "2023-03-14", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
59 |
{"Name": "gpt-4-0314", "Release Date": "2023-03-14", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
60 |
-
{"Name": "gpt-4-0314", "Release Date": "2023-03-14", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
61 |
{"Name": "qwen-max-0428", "Release Date": "2024-04-28", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
62 |
{"Name": "glm-4-0116", "Release Date": "2024-01-16", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
63 |
{"Name": "claude-3-haiku-20240307", "Release Date": "2024-03-07", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
@@ -93,6 +115,7 @@
|
|
93 |
{"Name": "gemini-pro", "Release Date": "2023-12-13", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
94 |
{"Name": "qwen1.5-14b-chat", "Release Date": "2024-02-04", "Total Parameters": 14, "Active Parameters": 14, "API Cost": 0}
|
95 |
{"Name": "gpt-3.5-turbo-0314", "Release Date": "2023-03-14", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
|
|
96 |
{"Name": "wizardlm-70b", "Release Date": "2023-08-09", "Total Parameters": 70, "Active Parameters": 70, "API Cost": 0}
|
97 |
{"Name": "gpt-3.5-turbo-0125", "Release Date": "2024-01-25", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
98 |
{"Name": "dbrx-instruct-preview", "Release Date": "2024-03-27", "Total Parameters": 132, "Active Parameters": 36, "API Cost": 0}
|
@@ -153,4 +176,94 @@
|
|
153 |
{"Name": "stablelm-tuned-alpha-7b", "Release Date": "2023-04-20", "Total Parameters": 7, "Active Parameters": 7, "API Cost": 0}
|
154 |
{"Name": "dolly-v2-12b", "Release Date": "2023-04-12", "Total Parameters": 12, "Active Parameters": 12, "API Cost": 0}
|
155 |
{"Name": "llama-13b", "Release Date": "2023-02-27", "Total Parameters": 13, "Active Parameters": 13, "API Cost": 0}
|
156 |
-
{"Name": "gpt-3.5", "Release Date": "2022-11-30", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
{"Name": "o3", "Release Date": "2024-12-20", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
2 |
{"Name": "o3-mini", "Release Date": "2024-12-20", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
3 |
+
{"Name": "o1-2024-12-17", "Release Date": "2024-12-17", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
4 |
+
{"Name": "o1-2024-12-17 (temperature=1)", "Release Date": "2024-12-17", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
5 |
+
{"Name": "gemini-2.0-flash-exp", "Release Date": "2024-12-11", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
6 |
+
{"Name": "Jeremy Berman", "Release Date": "2024-12-06", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
7 |
+
{"Name": "gemini-exp-1206", "Release Date": "2024-12-06", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
8 |
+
{"Name": "llama-3.3-70b-instruct", "Release Date": "2024-12-06", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
9 |
{"Name": "o1", "Release Date": "2024-09-12", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
10 |
+
{"Name": "claude-3-5-sonnet-20241022", "Release Date": "2024-10-22", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
11 |
+
{"Name": "claude-3.5-sonnet-20241022", "Release Date": "2024-10-22", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
12 |
+
{"Name": "claude-3-5-haiku-20241022", "Release Date": "2024-10-22", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
13 |
+
{"Name": "claude-3.5-haiku-20241022", "Release Date": "2024-10-22", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
14 |
+
{"Name": "gemini-1.5-pro-002", "Release Date": "2024-09-24", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
15 |
{"Name": "o1-preview-2024-09-12", "Release Date": "2024-09-12", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
16 |
{"Name": "o1-preview", "Release Date": "2024-09-12", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
17 |
{"Name": "o1-mini-2024-09-12", "Release Date": "2024-09-12", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
18 |
{"Name": "o1-mini", "Release Date": "2024-09-12", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
19 |
{"Name": "deepseek-v2.5", "Release Date": "2024-09-05", "Total Parameters": 236, "Active Parameters": 236, "API Cost": 0}
|
20 |
+
{"Name": "deepseek-v2.5-0908", "Release Date": "2024-09-08", "Total Parameters": 236, "Active Parameters": 236, "API Cost": 0}
|
21 |
{"Name": "qwen-plus-0828", "Release Date": "2024-08-28", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
22 |
{"Name": "qwen-plus-0828", "Release Date": "2024-08-28", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
23 |
{"Name": "gemini-1.5-pro-exp-0827", "Release Date": "2024-08-27", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
24 |
{"Name": "gemini-1.5-flash-exp-0827", "Release Date": "2024-08-27", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
25 |
{"Name": "gemini-1.5-flash-8b-exp-0827", "Release Date": "2024-08-27", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
26 |
{"Name": "chatgpt-4o-latest", "Release Date": "2024-08-25", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
27 |
+
{"Name": "chatgpt-4o-latest-24-09-07", "Release Date": "2024-09-07", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
28 |
{"Name": "command-r-plus-08-2024", "Release Date": "2024-08-21", "Total Parameters": 104, "Active Parameters": 104, "API Cost": 0}
|
29 |
{"Name": "command-r-08-2024", "Release Date": "2024-08-19", "Total Parameters": 32, "Active Parameters": 32, "API Cost": 0}
|
30 |
{"Name": "gemini-1.5-pro-exp-0801", "Release Date": "2024-08-01", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
31 |
+
{"Name": "grok-2-1212", "Release Date": "2024-12-12", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
32 |
{"Name": "grok-2-2024-08-13", "Release Date": "2024-08-13", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
33 |
+
{"Name": "gpt-4o-2024-11-20", "Release Date": "2024-11-20", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
34 |
+
{"Name": "gpt-4o-2024-08-06", "Release Date": "2024-08-06", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
35 |
{"Name": "gpt-4o", "Release Date": "2024-05-13", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
36 |
{"Name": "gpt-4o-2024-05-13", "Release Date": "2024-05-13", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
37 |
{"Name": "gpt-4o-mini-2024-07-18", "Release Date": "2024-07-18", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
38 |
{"Name": "gemma-2-9b-it-simpo", "Release Date": "2024-07-17", "Total Parameters": 9, "Active Parameters": 9, "API Cost": 0}
|
39 |
{"Name": "claude-3-5-sonnet-20240620", "Release Date": "2024-06-20", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
40 |
{"Name": "grok-2-mini-2024-08-13", "Release Date": "2024-08-13", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
41 |
+
{"Name": "grok-beta", "Release Date": "2024-08-13", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
42 |
{"Name": "gemini-advanced-0514", "Release Date": "2024-05-14", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
43 |
{"Name": "llama-3.1-405b-instruct-bf16", "Release Date": "2024-07-23", "Total Parameters": 405, "Active Parameters": 405, "API Cost": 0}
|
44 |
{"Name": "llama-3.1-405b-instruct-fp8", "Release Date": "2024-07-23", "Total Parameters": 405, "Active Parameters": 405, "API Cost": 0}
|
45 |
+
{"Name": "llama-3.1-405b-instruct", "Release Date": "2024-07-23", "Total Parameters": 405, "Active Parameters": 405, "API Cost": 0}
|
46 |
{"Name": "llama-3.1-405b", "Release Date": "2024-07-23", "Total Parameters": 405, "Active Parameters": 405, "API Cost": 0}
|
47 |
{"Name": "gpt-4o-2024-08-06", "Release Date": "2024-08-06", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
48 |
{"Name": "gemini-1.5-pro-api-0514", "Release Date": "2024-05-14", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
49 |
{"Name": "gemini-1.5-pro-api-0409-preview", "Release Date": "2024-04-09", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
50 |
{"Name": "gpt-4-turbo-2024-04-09", "Release Date": "2024-04-09", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
51 |
{"Name": "gpt-4-1106-preview", "Release Date": "2023-11-06", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
52 |
+
{"Name": "gpt-4-turbo", "Release Date": "2023-11-06", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
53 |
{"Name": "mistral-large-2407", "Release Date": "2024-07-24", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
54 |
+
{"Name": "mistral-large-2", "Release Date": "2024-07-24", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
55 |
{"Name": "athene-70b-0725", "Release Date": "2024-07-25", "Total Parameters": 70, "Active Parameters": 70, "API Cost": 0}
|
56 |
{"Name": "claude-3-opus-20240229", "Release Date": "2024-02-29", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
57 |
+
{"Name": "meta-llama-3.1-70b-instruct", "Release Date": "2024-07-23", "Total Parameters": 70, "Active Parameters": 70, "API Cost": 0}
|
58 |
{"Name": "llama-3.1-70b-instruct", "Release Date": "2024-07-23", "Total Parameters": 70, "Active Parameters": 70, "API Cost": 0}
|
59 |
{"Name": "gpt-4-0125-preview", "Release Date": "2024-01-25", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
60 |
{"Name": "yi-large-preview", "Release Date": "2024-06-16", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
|
|
68 |
{"Name": "nemotron-4-340b-instruct", "Release Date": "2024-06-14", "Total Parameters": 340, "Active Parameters": 340, "API Cost": 0}
|
69 |
{"Name": "bard-jan-24-gemini-pro", "Release Date": "2024-01-01", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
70 |
{"Name": "gemini-1.5-pro-001", "Release Date": "2024-02-15", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
71 |
+
{"Name": "gemini-1.5-pro", "Release Date": "2024-02-15", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
72 |
{"Name": "glm-4-0520", "Release Date": "2024-05-20", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0.63}
|
73 |
{"Name": "llama-3-70b-instruct", "Release Date": "2024-04-18", "Total Parameters": 70, "Active Parameters": 70, "API Cost": 0}
|
74 |
{"Name": "claude-3-sonnet-20240229", "Release Date": "2024-02-29", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
|
|
76 |
{"Name": "reka-core-20240501", "Release Date": "2024-05-01", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
77 |
{"Name": "command-r-plus", "Release Date": "2024-04-04", "Total Parameters": 104, "Active Parameters": 104, "API Cost": 0}
|
78 |
{"Name": "gemma-2-9b-it", "Release Date": "2024-06-27", "Total Parameters": 9, "Active Parameters": 9, "API Cost": 0}
|
79 |
+
{"Name": "qwen2.5-coder-32b-instruct", "Release Date": "2024-09-18", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
80 |
{"Name": "qwen2-72b-instruct", "Release Date": "2024-06-07", "Total Parameters": 72, "Active Parameters": 0, "API Cost": 0}
|
81 |
{"Name": "gpt-4", "Release Date": "2023-03-14", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
82 |
{"Name": "gpt-4-0314", "Release Date": "2023-03-14", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
|
|
83 |
{"Name": "qwen-max-0428", "Release Date": "2024-04-28", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
84 |
{"Name": "glm-4-0116", "Release Date": "2024-01-16", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
85 |
{"Name": "claude-3-haiku-20240307", "Release Date": "2024-03-07", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
|
|
115 |
{"Name": "gemini-pro", "Release Date": "2023-12-13", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
116 |
{"Name": "qwen1.5-14b-chat", "Release Date": "2024-02-04", "Total Parameters": 14, "Active Parameters": 14, "API Cost": 0}
|
117 |
{"Name": "gpt-3.5-turbo-0314", "Release Date": "2023-03-14", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
118 |
+
{"Name": "gpt-3.5-turbo", "Release Date": "2023-03-14", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
119 |
{"Name": "wizardlm-70b", "Release Date": "2023-08-09", "Total Parameters": 70, "Active Parameters": 70, "API Cost": 0}
|
120 |
{"Name": "gpt-3.5-turbo-0125", "Release Date": "2024-01-25", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
121 |
{"Name": "dbrx-instruct-preview", "Release Date": "2024-03-27", "Total Parameters": 132, "Active Parameters": 36, "API Cost": 0}
|
|
|
176 |
{"Name": "stablelm-tuned-alpha-7b", "Release Date": "2023-04-20", "Total Parameters": 7, "Active Parameters": 7, "API Cost": 0}
|
177 |
{"Name": "dolly-v2-12b", "Release Date": "2023-04-12", "Total Parameters": 12, "Active Parameters": 12, "API Cost": 0}
|
178 |
{"Name": "llama-13b", "Release Date": "2023-02-27", "Total Parameters": 13, "Active Parameters": 13, "API Cost": 0}
|
179 |
+
{"Name": "gpt-3.5", "Release Date": "2022-11-30", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
180 |
+
{"Name": "deepseek-coder-v2-instruct (2024-07-24)", "Release Date": "2024-07-24", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
181 |
+
{"Name": "gemini-exp-1114", "Release Date": "2024-11-14", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
182 |
+
{"Name": "athene-v2-chat", "Release Date": "2024-11-12", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
183 |
+
{"Name": "athene-v2-agent", "Release Date": "2024-11-12", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
184 |
+
{"Name": "claude-3.5-sonnet-20240620", "Release Date": "2024-06-20", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
185 |
+
{"Name": "o1-preview-2024-09-12 (temperature=1)", "Release Date": "2024-09-12", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
186 |
+
{"Name": "deepseek-v2-chat (2024-06-28)", "Release Date": "2024-06-28", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
187 |
+
{"Name": "o1-mini-2024-09-12 (temperature=1)", "Release Date": "2024-09-12", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
188 |
+
{"Name": "gemini-exp-1121", "Release Date": "2024-11-21", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
189 |
+
{"Name": "gemini-2.0-flash-thinking-exp-1219", "Release Date": "2024-12-19", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
190 |
+
{"Name": "deepseek-coder-v2-instruct", "Release Date": "2024-06-17", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
191 |
+
{"Name": "deepseek-v2.5-1210", "Release Date": "2024-12-10", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
192 |
+
{"Name": "mistral-large-instruct-2407", "Release Date": "2024-07-24", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
193 |
+
{"Name": "qwen2.5-72b-instruct", "Release Date": "2024-09-15", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
194 |
+
{"Name": "qwen2.5-32b-instruct", "Release Date": "2024-11-06", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
195 |
+
{"Name": "llama-3.1-nemotron-70b-instruct", "Release Date": "2024-10-02", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
196 |
+
{"Name": "dracarys-llama-3.1-70b-instruct", "Release Date": "2024-08-14", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
197 |
+
{"Name": "llama-3-70b-synthia-v3.5", "Release Date": "2024-05-26", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
198 |
+
{"Name": "dracarys-72b-instruct", "Release Date": "2024-08-14", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
199 |
+
{"Name": "hermes-2-theta-llama-3-70b", "Release Date": "2024-06-14", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
200 |
+
{"Name": "phi-3.1-mini-128k-instruct", "Release Date": "2024-07-01", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
201 |
+
{"Name": "hermes-2-pro-llama-3-70b", "Release Date": "2024-06-25", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
202 |
+
{"Name": "qwen2.5-14b-instruct", "Release Date": "2024-09-16", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
203 |
+
{"Name": "qwen2-72b-chat", "Release Date": "2024-05-22", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
204 |
+
{"Name": "codestral-22b-v0.1", "Release Date": "2024-05-29", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
205 |
+
{"Name": "qwen2.5-coder-7b-instruct", "Release Date": "2024-09-17", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
206 |
+
{"Name": "gemma-2-27b-instruct", "Release Date": "2024-06-27", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
207 |
+
{"Name": "mixtral-8x22b-instruct", "Release Date": "2024-04-17", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
208 |
+
{"Name": "athene-70b", "Release Date": "2024-07-19", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
209 |
+
{"Name": "deepseek-coder-33b-instruct", "Release Date": "2023-11-01", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
210 |
+
{"Name": "whiterabbitneo-33b-v1.5", "Release Date": "2024-02-09", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
211 |
+
{"Name": "reflectioncoder-ds-33b", "Release Date": "2024-05-28", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
212 |
+
{"Name": "deepseek-v2-chat", "Release Date": "2024-04-28", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
213 |
+
{"Name": "opencoder-8b-instruct", "Release Date": "2024-11-07", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
214 |
+
{"Name": "h2oGPTe Agent v1.6.8 (claude-3-5-sonnet)", "Release Date": "2024-12-21", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
215 |
+
{"Name": "Langfun Agent v2.0 (claude-3-5-sonnet, gemini-1.5-pro-002)", "Release Date": "2024-12-05", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
216 |
+
{"Name": "barcelona v0.1 (claude sonnet 3.5)", "Release Date": "2024-12-12", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
217 |
+
{"Name": "omne v0.1 (o1-preview, gpt-4o)", "Release Date": "2024-10-20", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
218 |
+
{"Name": "Trase Agent v0.2 (fine-tuned gemini, gpt-4o, o1-preview)", "Release Date": "2024-10-11", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
219 |
+
{"Name": "Multi Agent", "Release Date": "2024-10-10", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
220 |
+
{"Name": "DynaSaur (gpt-4o)", "Release Date": "2024-10-04", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
221 |
+
{"Name": "magentic-1 (o1)", "Release Date": "2024-10-19", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
222 |
+
{"Name": "Trase Agent v0.1 (fine-tuned gpt-4o)", "Release Date": "2024-09-04", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
223 |
+
{"Name": "sibyl system v0.2 (gpt-4o)", "Release Date": "2024-11-03", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
224 |
+
{"Name": "HuggingFaceAgents (gpt-4o)", "Release Date": "2024-06-27", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
225 |
+
{"Name": "tapeagent v0.2", "Release Date": "2024-12-09", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
226 |
+
{"Name": "little_potato (yanzw gpt-4o)", "Release Date": "2024-10-15", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
227 |
+
{"Name": "Multi-Agent Experiment v0.1 (gpt-4-turbo)", "Release Date": "2024-03-01", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
228 |
+
{"Name": "magentic-1", "Release Date": "2024-10-14", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
229 |
+
{"Name": "das_agent v0.2", "Release Date": "2024-09-18", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
230 |
+
{"Name": "cola_v0.4", "Release Date": "2024-11-02", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
231 |
+
{"Name": "modified hugging face agents + gpt-4o", "Release Date": "2024-09-03", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
232 |
+
{"Name": "das_agent", "Release Date": "2024-09-12", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
233 |
+
{"Name": "das_agent v0.3", "Release Date": "2024-09-19", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
234 |
+
{"Name": "cola_v0.3", "Release Date": "2024-11-01", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
235 |
+
{"Name": "replicated hugging face agents + gpt-4o", "Release Date": "2024-09-11", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
236 |
+
{"Name": "tapeagent v0.1", "Release Date": "2024-10-14", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
237 |
+
{"Name": "das_agent v0.4 mini (fixed)", "Release Date": "2024-10-14", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
238 |
+
{"Name": "sibyl system v0.2 (gpt-4o-2024-08-06)", "Release Date": "2024-09-20", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
239 |
+
{"Name": "das_agent v0.4 mini", "Release Date": "2024-10-14", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
240 |
+
{"Name": "mmac v1.1 (gpt4v gemini 1.5)", "Release Date": "2024-04-19", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
241 |
+
{"Name": "modified sibyl system", "Release Date": "2024-08-29", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
242 |
+
{"Name": "maac_v1", "Release Date": "2024-04-01", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
243 |
+
{"Name": "uk ai safety institute internal (gpt-4-turbo)", "Release Date": "2024-04-21", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
244 |
+
{"Name": "FRIDAY (gpt-4-turbo)", "Release Date": "2024-01-24", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
245 |
+
{"Name": "cola_abl", "Release Date": "2024-12-18", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
246 |
+
{"Name": "replicated hugging face agents + gpt-4o mini", "Release Date": "2024-10-09", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
247 |
+
{"Name": "tapeagent v0.2 mini", "Release Date": "2024-12-09", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
248 |
+
{"Name": "friday_without_learning (os-copilot gpt-4-turbo)", "Release Date": "2024-01-30", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
249 |
+
{"Name": "ceylon", "Release Date": "2024-04-19", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
250 |
+
{"Name": "tapeagent v0.1 mini", "Release Date": "2024-10-15", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
251 |
+
{"Name": "dip (gpt-4-turbo)", "Release Date": "2024-04-02", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
252 |
+
{"Name": "sibyl system v0.2 (gpt-4o-mini-2024-07-18)", "Release Date": "2024-10-08", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
253 |
+
{"Name": "cola_v0.2", "Release Date": "2024-10-24", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
254 |
+
{"Name": "chamomile", "Release Date": "2024-03-05", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
255 |
+
{"Name": "clarity v1", "Release Date": "2024-02-27", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
256 |
+
{"Name": "warm-up act (gpt-4-turbo)", "Release Date": "2024-02-22", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
257 |
+
{"Name": "frc v5", "Release Date": "2024-04-19", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
258 |
+
{"Name": "cola_v0.1", "Release Date": "2024-10-23", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
259 |
+
{"Name": "somedayv1.2", "Release Date": "2024-06-16", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
260 |
+
{"Name": "somedayv1.1", "Release Date": "2024-06-16", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
261 |
+
{"Name": "frc v4", "Release Date": "2024-04-18", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
262 |
+
{"Name": "stealth3", "Release Date": "2024-02-20", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
263 |
+
{"Name": "stealth2", "Release Date": "2024-02-20", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
264 |
+
{"Name": "someday1", "Release Date": "2024-06-16", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
265 |
+
{"Name": "frc v3", "Release Date": "2024-04-09", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
266 |
+
{"Name": "stealth", "Release Date": "2024-02-19", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
267 |
+
{"Name": "someday", "Release Date": "2024-06-16", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
268 |
+
{"Name": "AutoGPT4 (gpt-4)", "Release Date": "2023-11-03", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
269 |
+
{"Name": "alphaagent v0.1 (gpt-4o)", "Release Date": "2024-10-24", "Total Parameters": 0, "Active Parameters": 0, "API Cost": 0}
|
planbench_leaderboard.jsonl
CHANGED
@@ -1,3 +1,4 @@
|
|
1 |
{"model": "o1-preview-2024-09-12", "score": 52.8}
|
2 |
{"model": "llama-3.1-405b", "score": 0.8}
|
3 |
-
{"model": "gpt-4", "score": 0.16}
|
|
|
|
1 |
{"model": "o1-preview-2024-09-12", "score": 52.8}
|
2 |
{"model": "llama-3.1-405b", "score": 0.8}
|
3 |
+
{"model": "gpt-4", "score": 0.16}
|
4 |
+
{"model": "gpt-4o", "score": 0}
|
simple_bench_leaderboard.jsonl
CHANGED
@@ -1,10 +1,17 @@
|
|
1 |
-
{"model": "
|
2 |
-
{"model": "
|
3 |
-
{"model": "
|
4 |
-
{"model": "
|
5 |
-
{"model": "
|
6 |
-
{"model": "
|
7 |
-
{"model": "gpt-
|
8 |
-
{"model": "
|
9 |
-
{"model": "
|
10 |
-
{"model": "
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{"model": "o1-preview-2024-09-12", "score": 41.7}
|
2 |
+
{"model": "claude-3-5-sonnet-20241022", "score": 41.4}
|
3 |
+
{"model": "o1-2024-12-17", "score": 36.7}
|
4 |
+
{"model": "gemini-exp-1206", "score": 31.1}
|
5 |
+
{"model": "claude-3-5-sonnet-20240620", "score": 27.5}
|
6 |
+
{"model": "gemini-1.5-pro-002", "score": 27.1}
|
7 |
+
{"model": "gpt-4-turbo-2024-04-09", "score": 25.1}
|
8 |
+
{"model": "claude-3-opus-20240229", "score": 23.5}
|
9 |
+
{"model": "llama-3.1-405b-instruct-fp8", "score": 23.0}
|
10 |
+
{"model": "grok-beta", "score": 22.7}
|
11 |
+
{"model": "mistral-large-2407", "score": 22.5}
|
12 |
+
{"model": "llama-3.3-70b-instruct", "score": 19.9}
|
13 |
+
{"model": "gemini-2.0-flash-exp", "score": 18.9}
|
14 |
+
{"model": "o1-mini-2024-09-12", "score": 18.1}
|
15 |
+
{"model": "gpt-4o-2024-08-06", "score": 17.8}
|
16 |
+
{"model": "command-r-plus", "score": 17.4}
|
17 |
+
{"model": "gpt-4o-mini-2024-07-18", "score": 10.7}
|
zeroeval_average_leaderboard.jsonl
ADDED
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{"model": "o1-preview-2024-09-12", "score": 86.1}
|
2 |
+
{"model": "o1-mini-2024-09-12", "score": 80.6}
|
3 |
+
{"model": "claude-3-5-sonnet-20241022", "score": 67.1}
|
4 |
+
{"model": "gemini-1.5-pro-exp-0827", "score": 66.1}
|
5 |
+
{"model": "gpt-4o-2024-08-06", "score": 65.6}
|
6 |
+
{"model": "chatgpt-4o-latest-24-09-07", "score": 64.6}
|
7 |
+
{"model": "gpt-4o-2024-05-13", "score": 64.3}
|
8 |
+
{"model": "claude-3-5-sonnet-20240620", "score": 63.0}
|
9 |
+
{"model": "grok-2-1212", "score": 62.8}
|
10 |
+
{"model": "qwen2.5-72b-instruct", "score": 61.6}
|
11 |
+
{"model": "llama-3.1-405b-instruct", "score": 59.8}
|
12 |
+
{"model": "gpt-4-turbo-2024-04-09", "score": 59.8}
|
13 |
+
{"model": "gemini-1.5-flash-exp-0827", "score": 59.0}
|
14 |
+
{"model": "mistral-large-2", "score": 58.9}
|
15 |
+
{"model": "gpt-4o-mini-2024-07-18", "score": 57.4}
|
16 |
+
{"model": "deepseek-v2.5-0908", "score": 54.3}
|
17 |
+
{"model": "claude-3-opus-20240229", "score": 54.2}
|
18 |
+
{"model": "meta-llama-3.1-70b-instruct", "score": 53.8}
|
19 |
+
{"model": "claude-3-5-haiku-20241022", "score": 53.4}
|
20 |
+
{"model": "gemini-1.5-pro", "score": 52.5}
|
21 |
+
{"model": "gpt-4-0314", "score": 52.3}
|
zeroeval_crux_leaderboard.jsonl
ADDED
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{"model": "o1-preview-2024-09-12", "score": 95.9}
|
2 |
+
{"model": "o1-mini-2024-09-12", "score": 93.8}
|
3 |
+
{"model": "claude-3-5-sonnet-20241022", "score": 83.9}
|
4 |
+
{"model": "gemini-1.5-pro-exp-0827", "score": 79.6}
|
5 |
+
{"model": "gpt-4o-2024-08-06", "score": 87.0}
|
6 |
+
{"model": "chatgpt-4o-latest-24-09-07", "score": 86.5}
|
7 |
+
{"model": "gpt-4o-2024-05-13", "score": 86.1}
|
8 |
+
{"model": "claude-3-5-sonnet-20240620", "score": 80.8}
|
9 |
+
{"model": "grok-2-1212", "score": 75.3}
|
10 |
+
{"model": "qwen2.5-72b-instruct", "score": 73.9}
|
11 |
+
{"model": "llama-3.1-405b-instruct", "score": 73.0}
|
12 |
+
{"model": "gpt-4-turbo-2024-04-09", "score": 78.9}
|
13 |
+
{"model": "gemini-1.5-flash-exp-0827", "score": 74.5}
|
14 |
+
{"model": "mistral-large-2", "score": 75.1}
|
15 |
+
{"model": "gpt-4o-mini-2024-07-18", "score": 75.9}
|
16 |
+
{"model": "deepseek-v2.5-0908", "score": 70.0}
|
17 |
+
{"model": "claude-3-opus-20240229", "score": 70.4}
|
18 |
+
{"model": "meta-llama-3.1-70b-instruct", "score": 64.3}
|
19 |
+
{"model": "claude-3-5-haiku-20241022", "score": 68.8}
|
20 |
+
{"model": "gemini-1.5-pro", "score": 68.0}
|
21 |
+
{"model": "gpt-4-0314", "score": 74.5}
|
zeroeval_math_l5_leaderboard.jsonl
ADDED
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{"model": "o1-preview-2024-09-12", "score": 84.5}
|
2 |
+
{"model": "o1-mini-2024-09-12", "score": 89.3}
|
3 |
+
{"model": "claude-3-5-sonnet-20241022", "score": 59.4}
|
4 |
+
{"model": "gemini-1.5-pro-exp-0827", "score": 68.1}
|
5 |
+
{"model": "gpt-4o-2024-08-06", "score": 55.3}
|
6 |
+
{"model": "chatgpt-4o-latest-24-09-07", "score": 53.1}
|
7 |
+
{"model": "gpt-4o-2024-05-13", "score": 54.8}
|
8 |
+
{"model": "claude-3-5-sonnet-20240620", "score": 51.9}
|
9 |
+
{"model": "grok-2-1212", "score": 60.9}
|
10 |
+
{"model": "qwen2.5-72b-instruct", "score": 60.2}
|
11 |
+
{"model": "llama-3.1-405b-instruct", "score": 49.8}
|
12 |
+
{"model": "gpt-4-turbo-2024-04-09", "score": 46.5}
|
13 |
+
{"model": "gemini-1.5-flash-exp-0827", "score": 54.5}
|
14 |
+
{"model": "mistral-large-2", "score": 48.5}
|
15 |
+
{"model": "gpt-4o-mini-2024-07-18", "score": 52.2}
|
16 |
+
{"model": "deepseek-v2.5-0908", "score": 44.7}
|
17 |
+
{"model": "claude-3-opus-20240229", "score": 36.9}
|
18 |
+
{"model": "meta-llama-3.1-70b-instruct", "score": 43.1}
|
19 |
+
{"model": "claude-3-5-haiku-20241022", "score": 46.5}
|
20 |
+
{"model": "gemini-1.5-pro", "score": 39.8}
|
21 |
+
{"model": "gpt-4-0314", "score": 26.1}
|
zeroeval_mmlu_redux_leaderboard.jsonl
ADDED
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{"model": "o1-preview-2024-09-12", "score": 92.8}
|
2 |
+
{"model": "o1-mini-2024-09-12", "score": 86.7}
|
3 |
+
{"model": "claude-3-5-sonnet-20241022", "score": 88.9}
|
4 |
+
{"model": "gemini-1.5-pro-exp-0827", "score": 86.1}
|
5 |
+
{"model": "gpt-4o-2024-08-06", "score": 88.3}
|
6 |
+
{"model": "chatgpt-4o-latest-24-09-07", "score": 88.9}
|
7 |
+
{"model": "gpt-4o-2024-05-13", "score": 88.0}
|
8 |
+
{"model": "claude-3-5-sonnet-20240620", "score": 86.0}
|
9 |
+
{"model": "grok-2-1212", "score": 87.4}
|
10 |
+
{"model": "qwen2.5-72b-instruct", "score": 85.6}
|
11 |
+
{"model": "llama-3.1-405b-instruct", "score": 86.2}
|
12 |
+
{"model": "gpt-4-turbo-2024-04-09", "score": 85.3}
|
13 |
+
{"model": "gemini-1.5-flash-exp-0827", "score": 82.1}
|
14 |
+
{"model": "mistral-large-2", "score": 83.0}
|
15 |
+
{"model": "gpt-4o-mini-2024-07-18", "score": 81.5}
|
16 |
+
{"model": "deepseek-v2.5-0908", "score": 80.4}
|
17 |
+
{"model": "claude-3-opus-20240229", "score": 82.5}
|
18 |
+
{"model": "meta-llama-3.1-70b-instruct", "score": 83.0}
|
19 |
+
{"model": "claude-3-5-haiku-20241022", "score": 79.6}
|
20 |
+
{"model": "gemini-1.5-pro", "score": 82.8}
|
21 |
+
{"model": "gpt-4-0314", "score": 81.6}
|
zeroeval_zebralogic_leaderboard.jsonl
ADDED
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{"model": "o1-2024-12-17", "score": 81.0}
|
2 |
+
{"model": "o1-preview-2024-09-12", "score": 71.4}
|
3 |
+
{"model": "o1-mini-2024-09-12", "score": 52.6}
|
4 |
+
{"model": "claude-3-5-sonnet-20241022", "score": 36.2}
|
5 |
+
{"model": "gemini-1.5-pro-exp-0827", "score": 30.5}
|
6 |
+
{"model": "gpt-4o-2024-08-06", "score": 31.7}
|
7 |
+
{"model": "chatgpt-4o-latest-24-09-07", "score": 29.9}
|
8 |
+
{"model": "gpt-4o-2024-05-13", "score": 28.2}
|
9 |
+
{"model": "claude-3-5-sonnet-20240620", "score": 33.4}
|
10 |
+
{"model": "grok-2-1212", "score": 27.7}
|
11 |
+
{"model": "qwen2.5-72b-instruct", "score": 26.6}
|
12 |
+
{"model": "llama-3.1-405b-instruct", "score": 30.1}
|
13 |
+
{"model": "gpt-4-turbo-2024-04-09", "score": 28.4}
|
14 |
+
{"model": "gemini-1.5-flash-exp-0827", "score": 25.0}
|
15 |
+
{"model": "mistral-large-2", "score": 29.0}
|
16 |
+
{"model": "gpt-4o-mini-2024-07-18", "score": 20.1}
|
17 |
+
{"model": "deepseek-v2.5-0908", "score": 22.1}
|
18 |
+
{"model": "claude-3-opus-20240229", "score": 27.0}
|
19 |
+
{"model": "meta-llama-3.1-70b-instruct", "score": 24.9}
|
20 |
+
{"model": "claude-3-5-haiku-20241022", "score": 18.7}
|
21 |
+
{"model": "gemini-1.5-pro", "score": 19.4}
|
22 |
+
{"model": "gpt-4-0314", "score": 27.1}
|