open-agent-leaderboard / src /overall_results.csv
liaojiajia
add tot and math500 scores
cd01d35
Rank,Algorithm,LLM,Eval Date,Avg Score,gsm8k-Score,gsm8k-Cost($),AQuA-Score,AQuA-Cost($),MATH-500-Score,MATH-500-Cost($)
1.0,CoT,Qwen2.5-72B-Instruct,2025/1/22,86.43,92.87,0.7195,86.22,0.0808,80.2,0.349
2.0,SC-CoT,Qwen2.5-72B-Instruct,2025/1/22,84.30,93.86,5.9858,85.04,1.0348,74.0,3.1556
3.0,SC-CoT,Llama-3.3-70B-Instruct,2025/1/22,83.85,95.07,6.2005,82.28,1.0756,74.2,3.2239
4.0,CoT,Llama-3.3-70B-Instruct,2025/1/22,82.86,93.93,0.687,83.46,0.0927,71.2,0.3463
5.0,CoT,gpt-4o,2025/1/22,81.59,94.09,4.5367,82.68,1.0417,68.0,3.0569
6.0,IO,Llama-3.3-70B-Instruct,2025/1/22,81.45,92.27,0.4709,82.68,0.0798,69.4,0.2386
7.0,IO,Qwen2.5-72B-Instruct,2025/1/22,80.34,86.58,0.4899,84.25,0.0742,70.2,0.2506
8.0,SC-CoT,Qwen2.5-7B-Instruct,2025/1/22,79.35,91.13,0.0,79.92,0.0,67.0,0.0
9.0,CoT,Qwen2.5-7B-Instruct,2025/1/22,78.73,85.67,0.0,80.71,0.0,69.8,0.0
10.0,ReAct-Pro*,Llama-3.3-70B-Instruct,2025/1/22,77.12,87.64,10.1124,79.13,0.768,64.6,3.1806
11.0,CoT,Doubao-lite-32k,2025/1/7,77.00,89.31,0.0558,82.68,0.0066,59.0,0.0255
12.0,ReAct-Pro*,Qwen2.5-72B-Instruct,2025/1/22,74.43,87.26,10.5479,73.23,0.3177,62.8,3.4541
13.0,SC-CoT,Doubao-lite-32k,2025/1/7,72.52,87.26,0.2083,81.1,0.0519,49.2,0.1406
14.0,PoT,Qwen2.5-72B-Instruct,2025/1/22,71.58,92.34,0.7054,75.2,0.1645,47.2,0.233
15.0,PoT,gpt-4o,2025/1/22,71.50,93.1,4.2166,75.2,1.6087,46.2,1.5994
16.0,SC-CoT,gpt-4o,2025/1/22,70.44,90.3,31.0542,86.61,8.1485,34.4,19.6538
17.0,ReAct-Pro*,Doubao-lite-32k,2025/1/7,70.12,85.6,0.2512,77.56,0.0445,47.2,0.186
18.0,ReAct-Pro*,Qwen2.5-7B-Instruct,2025/1/22,68.69,82.87,0.0,74.41,0.0,48.8,0.0
19.0,IO,gpt-4o,2025/1/22,68.60,88.4,3.3463,75.59,1.1453,41.8,2.7907
20.0,IO,Qwen2.5-7B-Instruct,2025/1/22,65.13,57.24,0.0,78.74,0.0,59.4,0.0
21.0,PoT,Llama-3.3-70B-Instruct,2025/1/22,65.07,73.09,0.9736,79.53,0.1746,42.6,0.2839
22.0,CoT,deepseek-r1:1.5b,2025/1/23,63.90,70.66,0.0,71.65,0.0,49.4,0.0
23.0,IO,Doubao-lite-32k,2025/1/7,62.85,72.02,0.0354,79.13,0.0058,37.4,0.0187
24.0,PoT,Doubao-lite-32k,2025/1/7,61.29,79.61,0.0576,71.65,0.0147,32.6,0.0144
25.0,ToT,Qwen2.5-72B-Instruct,2025/1/22,60.26,88.88,23.5911,81.1,3.7389,10.8,9.0421
26.0,CoT,gpt-3.5-turbo,2025/1/7,59.84,78.7,0.6788,61.02,0.0957,39.8,0.3189
27.0,CoT,Internllm2_5-7B,2025/1/22,59.02,77.71,0.0,52.76,0.0,46.6,0.0
28.0,IO,deepseek-r1:1.5b,2025/1/22,58.95,64.14,0.0,68.9,0.0,43.8,0.0
29.0,ToT,Llama-3.3-70B-Instruct,2025/1/22,58.79,91.89,20.8753,83.07,2.9404,1.4,8.2699
30.0,ToT,gpt-4o,2025/1/22,58.61,91.13,86.8581,81.5,8.5295,3.2,40.8094
31.0,SC-CoT,gpt-3.5-turbo,2025/1/7,58.28,79.91,3.3938,66.14,0.7888,28.8,1.9764
32.0,ReAct-Pro*,gpt-4o,2025/1/22,58.26,63.31,39.0751,57.48,2.304,54.0,17.7735
33.0,PoT,Qwen2.5-7B-Instruct,2025/1/22,55.51,58.83,0.0,68.11,0.0,39.6,0.0
34.0,PoT,gpt-3.5-turbo,2025/1/7,55.04,76.88,0.6902,59.45,0.1748,28.8,0.168
35.0,ReAct-Pro*,gpt-3.5-turbo,2025/1/7,54.43,74.91,3.4633,64.57,0.4928,23.8,2.0406
36.0,SC-CoT,Llama-3.1-8B-Instruct,2025/1/22,54.37,73.46,0.0,59.45,0.0,30.2,0.0
37.0,CoT,Llama-3.1-8B-Instruct,2025/1/22,53.96,75.44,0.0,60.63,0.0,25.8,0.0
38.0,SC-CoT,deepseek-r1:1.5b,2025/2/10,50.80,55.34,0.0,59.06,0.0,38.0,0.0
39.0,ReAct-Pro*,Llama-3.1-8B-Instruct,2025/1/22,50.70,67.78,0.0,55.51,0.0,28.8,0.0
40.0,IO,Llama-3.1-8B-Instruct,2025/1/22,48.98,57.16,0.0,51.18,0.0,38.6,0.0
41.0,ToT,gpt-3.5-turbo,2025/1/7,44.94,67.93,9.1707,57.09,1.1513,9.8,5.2914
42.0,ToT,Qwen2.5-7B-Instruct,2025/1/22,42.52,72.21,0.0,53.94,0.0,1.4,0.0
43.0,ToT,Llama-3.1-8B-Instruct,2025/1/22,41.97,65.05,0.0,59.06,0.0,1.8,0.0
44.0,ReAct-Pro*,deepseek-r1:1.5b,2025/2/10,38.22,35.94,0.0,54.33,0.0,24.4,0.0
45.0,CoT,Qwen2-1.5B-Instruct,2025/1/22,37.08,55.5,0.0,40.55,0.0,15.2,0.0
46.0,PoT,Llama-3.1-8B-Instruct,2025/1/22,33.56,38.67,0.0,36.61,0.0,25.4,0.0
47.0,SC-CoT,Internllm2_5-7B,2025/1/22,32.46,48.22,0.0,39.37,0.0,9.8,0.0
48.0,IO,gpt-3.5-turbo,2025/1/7,31.34,37.83,0.3328,38.98,0.038,17.2,0.2436
49.0,PoT,Internllm2_5-7B,2025/1/22,29.94,38.21,0.0,36.61,0.0,15.0,0.0
50.0,ReAct-Pro*,Internllm2_5-7B,2025/1/22,29.75,33.51,0.0,40.94,0.0,14.8,0.0
51.0,ToT,Doubao-lite-32k,2025/1/7,28.10,37.83,0.8739,45.28,0.0881,1.2,0.2371
52.0,IO,Internllm2_5-7B,2025/1/22,27.35,11.6,0.0,47.64,0.0,22.8,0.0
53.0,CoT,Qwen2-0.5B-Instruct,2025/1/22,25.07,35.94,0.0,33.07,0.0,6.2,0.0
54.0,PoT,deepseek-r1:1.5b,2025/2/10,22.54,11.9,0.0,54.72,0.0,1.0,0.0
55.0,ReAct-Pro*,Qwen2-1.5B-Instruct,2025/1/22,19.55,24.87,0.0,25.59,0.0,8.2,0.0
56.0,ToT,Internllm2_5-7B,2025/1/22,18.96,20.85,0.0,35.83,0.0,0.2,0.0
57.0,IO,Qwen2-1.5B-Instruct,2025/1/22,17.60,16.68,0.0,29.13,0.0,7.0,0.0
58.0,ToT,Qwen2-1.5B-Instruct,2025/1/22,17.31,19.64,0.0,31.5,0.0,0.8,0.0
59.0,PoT,Qwen2-1.5B-Instruct,2025/1/22,16.67,18.5,0.0,30.71,0.0,0.8,0.0
60.0,ToT,deepseek-r1:1.5b,2025/2/10,16.11,23.12,0.0,24.8,0.0,0.4,0.0
61.0,IO,Qwen2-0.5B-Instruct,2025/1/22,14.83,14.71,0.0,27.17,0.0,2.6,0.0
62.0,SC-CoT,Qwen2-1.5B-Instruct,2025/1/22,13.06,11.75,0.0,23.62,0.0,3.8,0.0
63.0,ReAct-Pro*,Qwen2-0.5B-Instruct,2025/1/22,10.76,7.66,0.0,24.02,0.0,0.6,0.0
64.0,ToT,Qwen2-0.5B-Instruct,2025/1/22,9.97,0.0,0.0,29.92,0.0,0.0,0.0
65.0,PoT,Qwen2-0.5B-Instruct,2025/1/22,8.98,9.63,0.0,17.32,0.0,0.0,0.0
66.0,SC-CoT,Qwen2-0.5B-Instruct,2025/1/22,8.43,1.67,0.0,22.83,0.0,0.8,0.0