open-agent-leaderboard / src /overall_math_score.json
liaojiajia
add tot and math500 scores
cd01d35
{
"time": "2025-02-11 13:23:00",
"results": {
"IO": {
"META": {
"Algorithm": "IO",
"LLM": "gpt-3.5-turbo",
"Eval Date": "2025/1/7"
},
"gsm8k": {
"Score": 37.83,
"Cost($)": 0.3328
},
"AQuA": {
"Score": 38.98,
"Cost($)": 0.038
},
"MATH-500": {
"Score": 17.2,
"Cost($)": 0.2436
}
},
"ReAct-Pro*": {
"META": {
"Algorithm": "ReAct-Pro*",
"LLM": "gpt-3.5-turbo",
"Eval Date": "2025/1/7"
},
"gsm8k": {
"Score": 74.91,
"Cost($)": 3.4633
},
"AQuA": {
"Score": 64.57,
"Cost($)": 0.4928
},
"MATH-500": {
"Score": 23.8,
"Cost($)": 2.0406
}
},
"PoT": {
"META": {
"Algorithm": "PoT",
"LLM": "gpt-3.5-turbo",
"Eval Date": "2025/1/7"
},
"gsm8k": {
"Score": 76.88,
"Cost($)": 0.6902
},
"AQuA": {
"Score": 59.45,
"Cost($)": 0.1748
},
"MATH-500": {
"Score": 28.8,
"Cost($)": 0.168
}
},
"CoT": {
"META": {
"Algorithm": "CoT",
"LLM": "gpt-3.5-turbo",
"Eval Date": "2025/1/7"
},
"gsm8k": {
"Score": 78.7,
"Cost($)": 0.6788
},
"AQuA": {
"Score": 61.02,
"Cost($)": 0.0957
},
"MATH-500": {
"Score": 39.8,
"Cost($)": 0.3189
}
},
"SC-CoT": {
"META": {
"Algorithm": "SC-CoT",
"LLM": "gpt-3.5-turbo",
"Eval Date": "2025/1/7"
},
"gsm8k": {
"Score": 79.91,
"Cost($)": 3.3938
},
"AQuA": {
"Score": 66.14,
"Cost($)": 0.7888
},
"MATH-500": {
"Score": 28.8,
"Cost($)": 1.9764
}
},
"ToT": {
"META": {
"Algorithm": "ToT",
"LLM": "gpt-3.5-turbo",
"Eval Date": "2025/1/7"
},
"gsm8k": {
"Score": 67.93,
"Cost($)": 9.1707
},
"AQuA": {
"Score": 57.09,
"Cost($)": 1.1513
},
"MATH-500": {
"Score": 9.8,
"Cost($)": 5.2914
}
},
"IO-Doubao-lite-32k": {
"META": {
"Algorithm": "IO",
"LLM": "Doubao-lite-32k",
"Eval Date": "2025/1/7"
},
"gsm8k": {
"Score": 72.02,
"Cost($)": 0.0354
},
"AQuA": {
"Score": 79.13,
"Cost($)": 0.0058
},
"MATH-500": {
"Score": 37.4,
"Cost($)": 0.0187
}
},
"ReAct-Pro*-Doubao-lite-32k": {
"META": {
"Algorithm": "ReAct-Pro*",
"LLM": "Doubao-lite-32k",
"Eval Date": "2025/1/7"
},
"gsm8k": {
"Score": 85.6,
"Cost($)": 0.2512
},
"AQuA": {
"Score": 77.56,
"Cost($)": 0.0445
},
"MATH-500": {
"Score": 47.2,
"Cost($)": 0.186
}
},
"PoT-Doubao-lite-32k": {
"META": {
"Algorithm": "PoT",
"LLM": "Doubao-lite-32k",
"Eval Date": "2025/1/7"
},
"gsm8k": {
"Score": 79.61,
"Cost($)": 0.0576
},
"AQuA": {
"Score": 71.65,
"Cost($)": 0.0147
},
"MATH-500": {
"Score": 32.6,
"Cost($)": 0.0144
}
},
"CoT-Doubao-lite-32k": {
"META": {
"Algorithm": "CoT",
"LLM": "Doubao-lite-32k",
"Eval Date": "2025/1/7"
},
"gsm8k": {
"Score": 89.31,
"Cost($)": 0.0558
},
"AQuA": {
"Score": 82.68,
"Cost($)": 0.0066
},
"MATH-500": {
"Score": 59.0,
"Cost($)": 0.0255
}
},
"SC-CoT-Doubao-lite-32k": {
"META": {
"Algorithm": "SC-CoT",
"LLM": "Doubao-lite-32k",
"Eval Date": "2025/1/7"
},
"gsm8k": {
"Score": 87.26,
"Cost($)": 0.2083
},
"AQuA": {
"Score": 81.1,
"Cost($)": 0.0519
},
"MATH-500": {
"Score": 49.2,
"Cost($)": 0.1406
}
},
"ToT-Doubao-lite-32k": {
"META": {
"Algorithm": "ToT",
"LLM": "Doubao-lite-32k",
"Eval Date": "2025/1/7"
},
"gsm8k": {
"Score": 37.83,
"Cost($)": 0.8739
},
"AQuA": {
"Score": 45.28,
"Cost($)": 0.0881
},
"MATH-500": {
"Score": 1.2,
"Cost($)": 0.2371
}
},
"IO-gpt-4o": {
"META": {
"Algorithm": "IO",
"LLM": "gpt-4o",
"Eval Date": "2025/1/22"
},
"gsm8k": {
"Score": 88.4,
"Cost($)": 3.3463
},
"AQuA": {
"Score": 75.59,
"Cost($)": 1.1453
},
"MATH-500": {
"Score": 41.8,
"Cost($)": 2.7907
}
},
"ReAct-Pro*-gpt-4o": {
"META": {
"Algorithm": "ReAct-Pro*",
"LLM": "gpt-4o",
"Eval Date": "2025/1/22"
},
"gsm8k": {
"Score": 63.31,
"Cost($)": 39.0751
},
"AQuA": {
"Score": 57.48,
"Cost($)": 2.304
},
"MATH-500": {
"Score": 54.0,
"Cost($)": 17.7735
}
},
"PoT-gpt-4o": {
"META": {
"Algorithm": "PoT",
"LLM": "gpt-4o",
"Eval Date": "2025/1/22"
},
"gsm8k": {
"Score": 93.1,
"Cost($)": 4.2166
},
"AQuA": {
"Score": 75.2,
"Cost($)": 1.6087
},
"MATH-500": {
"Score": 46.2,
"Cost($)": 1.5994
}
},
"CoT-gpt-4o": {
"META": {
"Algorithm": "CoT",
"LLM": "gpt-4o",
"Eval Date": "2025/1/22"
},
"gsm8k": {
"Score": 94.09,
"Cost($)": 4.5367
},
"AQuA": {
"Score": 82.68,
"Cost($)": 1.0417
},
"MATH-500": {
"Score": 68.0,
"Cost($)": 3.0569
}
},
"SC-CoT-gpt-4o": {
"META": {
"Algorithm": "SC-CoT",
"LLM": "gpt-4o",
"Eval Date": "2025/1/22"
},
"gsm8k": {
"Score": 90.3,
"Cost($)": 31.0542
},
"AQuA": {
"Score": 86.61,
"Cost($)": 8.1485
},
"MATH-500": {
"Score": 34.4,
"Cost($)": 19.6538
}
},
"ToT-gpt-4o": {
"META": {
"Algorithm": "ToT",
"LLM": "gpt-4o",
"Eval Date": "2025/1/22"
},
"gsm8k": {
"Score": 91.13,
"Cost($)": 86.8581
},
"AQuA": {
"Score": 81.5,
"Cost($)": 8.5295
},
"MATH-500": {
"Score": 3.2,
"Cost($)": 40.8094
}
},
"IO-Qwen2.5-72B-Instruct": {
"META": {
"Algorithm": "IO",
"LLM": "Qwen2.5-72B-Instruct",
"Eval Date": "2025/1/22"
},
"gsm8k": {
"Score": 86.58,
"Cost($)": 0.4899
},
"AQuA": {
"Score": 84.25,
"Cost($)": 0.0742
},
"MATH-500": {
"Score": 70.2,
"Cost($)": 0.2506
}
},
"ReAct-Pro*-Qwen2.5-72B-Instruct": {
"META": {
"Algorithm": "ReAct-Pro*",
"LLM": "Qwen2.5-72B-Instruct",
"Eval Date": "2025/1/22"
},
"gsm8k": {
"Score": 87.26,
"Cost($)": 10.5479
},
"AQuA": {
"Score": 73.23,
"Cost($)": 0.3177
},
"MATH-500": {
"Score": 62.8,
"Cost($)": 3.4541
}
},
"PoT-Qwen2.5-72B-Instruct": {
"META": {
"Algorithm": "PoT",
"LLM": "Qwen2.5-72B-Instruct",
"Eval Date": "2025/1/22"
},
"gsm8k": {
"Score": 92.34,
"Cost($)": 0.7054
},
"AQuA": {
"Score": 75.2,
"Cost($)": 0.1645
},
"MATH-500": {
"Score": 47.2,
"Cost($)": 0.233
}
},
"CoT-Qwen2.5-72B-Instruct": {
"META": {
"Algorithm": "CoT",
"LLM": "Qwen2.5-72B-Instruct",
"Eval Date": "2025/1/22"
},
"gsm8k": {
"Score": 92.87,
"Cost($)": 0.7195
},
"AQuA": {
"Score": 86.22,
"Cost($)": 0.0808
},
"MATH-500": {
"Score": 80.2,
"Cost($)": 0.349
}
},
"SC-CoT-Qwen2.5-72B-Instruct": {
"META": {
"Algorithm": "SC-CoT",
"LLM": "Qwen2.5-72B-Instruct",
"Eval Date": "2025/1/22"
},
"gsm8k": {
"Score": 93.86,
"Cost($)": 5.9858
},
"AQuA": {
"Score": 85.04,
"Cost($)": 1.0348
},
"MATH-500": {
"Score": 74.0,
"Cost($)": 3.1556
}
},
"ToT-Qwen2.5-72B-Instruct": {
"META": {
"Algorithm": "ToT",
"LLM": "Qwen2.5-72B-Instruct",
"Eval Date": "2025/1/22"
},
"gsm8k": {
"Score": 88.88,
"Cost($)": 23.5911
},
"AQuA": {
"Score": 81.1,
"Cost($)": 3.7389
},
"MATH-500": {
"Score": 10.8,
"Cost($)": 9.0421
}
},
"IO-Llama-3.3-70B-Instruct": {
"META": {
"Algorithm": "IO",
"LLM": "Llama-3.3-70B-Instruct",
"Eval Date": "2025/1/22"
},
"gsm8k": {
"Score": 92.27,
"Cost($)": 0.4709
},
"AQuA": {
"Score": 82.68,
"Cost($)": 0.0798
},
"MATH-500": {
"Score": 69.4,
"Cost($)": 0.2386
}
},
"ReAct-Pro*-Llama-3.3-70B-Instruct": {
"META": {
"Algorithm": "ReAct-Pro*",
"LLM": "Llama-3.3-70B-Instruct",
"Eval Date": "2025/1/22"
},
"gsm8k": {
"Score": 87.64,
"Cost($)": 10.1124
},
"AQuA": {
"Score": 79.13,
"Cost($)": 0.768
},
"MATH-500": {
"Score": 64.6,
"Cost($)": 3.1806
}
},
"PoT-Llama-3.3-70B-Instruct": {
"META": {
"Algorithm": "PoT",
"LLM": "Llama-3.3-70B-Instruct",
"Eval Date": "2025/1/22"
},
"gsm8k": {
"Score": 73.09,
"Cost($)": 0.9736
},
"AQuA": {
"Score": 79.53,
"Cost($)": 0.1746
},
"MATH-500": {
"Score": 42.6,
"Cost($)": 0.2839
}
},
"CoT-Llama-3.3-70B-Instruct": {
"META": {
"Algorithm": "CoT",
"LLM": "Llama-3.3-70B-Instruct",
"Eval Date": "2025/1/22"
},
"gsm8k": {
"Score": 93.93,
"Cost($)": 0.687
},
"AQuA": {
"Score": 83.46,
"Cost($)": 0.0927
},
"MATH-500": {
"Score": 71.2,
"Cost($)": 0.3463
}
},
"SC-CoT-Llama-3.3-70B-Instruct": {
"META": {
"Algorithm": "SC-CoT",
"LLM": "Llama-3.3-70B-Instruct",
"Eval Date": "2025/1/22"
},
"gsm8k": {
"Score": 95.07,
"Cost($)": 6.2005
},
"AQuA": {
"Score": 82.28,
"Cost($)": 1.0756
},
"MATH-500": {
"Score": 74.2,
"Cost($)": 3.2239
}
},
"ToT-Llama-3.3-70B-Instruct": {
"META": {
"Algorithm": "ToT",
"LLM": "Llama-3.3-70B-Instruct",
"Eval Date": "2025/1/22"
},
"gsm8k": {
"Score": 91.89,
"Cost($)": 20.8753
},
"AQuA": {
"Score": 83.07,
"Cost($)": 2.9404
},
"MATH-500": {
"Score": 1.4,
"Cost($)": 8.2699
}
},
"IO-Qwen2.5-7B-Instruct": {
"META": {
"Algorithm": "IO",
"LLM": "Qwen2.5-7B-Instruct",
"Eval Date": "2025/1/22"
},
"gsm8k": {
"Score": 57.24,
"Cost($)": 0.0
},
"AQuA": {
"Score": 78.74,
"Cost($)": 0.0
},
"MATH-500": {
"Score": 59.4,
"Cost($)": 0.0
}
},
"ReAct-Pro*-Qwen2.5-7B-Instruct": {
"META": {
"Algorithm": "ReAct-Pro*",
"LLM": "Qwen2.5-7B-Instruct",
"Eval Date": "2025/1/22"
},
"gsm8k": {
"Score": 82.87,
"Cost($)": 0.0
},
"AQuA": {
"Score": 74.41,
"Cost($)": 0.0
},
"MATH-500": {
"Score": 48.8,
"Cost($)": 0.0
}
},
"PoT-Qwen2.5-7B-Instruct": {
"META": {
"Algorithm": "PoT",
"LLM": "Qwen2.5-7B-Instruct",
"Eval Date": "2025/1/22"
},
"gsm8k": {
"Score": 58.83,
"Cost($)": 0.0
},
"AQuA": {
"Score": 68.11,
"Cost($)": 0.0
},
"MATH-500": {
"Score": 39.6,
"Cost($)": 0.0
}
},
"CoT-Qwen2.5-7B-Instruct": {
"META": {
"Algorithm": "CoT",
"LLM": "Qwen2.5-7B-Instruct",
"Eval Date": "2025/1/22"
},
"gsm8k": {
"Score": 85.67,
"Cost($)": 0.0
},
"AQuA": {
"Score": 80.71,
"Cost($)": 0.0
},
"MATH-500": {
"Score": 69.8,
"Cost($)": 0.0
}
},
"SC-CoT-Qwen2.5-7B-Instruct": {
"META": {
"Algorithm": "SC-CoT",
"LLM": "Qwen2.5-7B-Instruct",
"Eval Date": "2025/1/22"
},
"gsm8k": {
"Score": 91.13,
"Cost($)": 0.0
},
"AQuA": {
"Score": 79.92,
"Cost($)": 0.0
},
"MATH-500": {
"Score": 67.0,
"Cost($)": 0.0
}
},
"ToT-Qwen2.5-7B-Instruct": {
"META": {
"Algorithm": "ToT",
"LLM": "Qwen2.5-7B-Instruct",
"Eval Date": "2025/1/22"
},
"gsm8k": {
"Score": 72.21,
"Cost($)": 0.0
},
"AQuA": {
"Score": 53.94,
"Cost($)": 0.0
},
"MATH-500": {
"Score": 1.4,
"Cost($)": 0.0
}
},
"IO-Llama-3.1-8B-Instruct": {
"META": {
"Algorithm": "IO",
"LLM": "Llama-3.1-8B-Instruct",
"Eval Date": "2025/1/22"
},
"gsm8k": {
"Score": 57.16,
"Cost($)": 0.0
},
"AQuA": {
"Score": 51.18,
"Cost($)": 0.0
},
"MATH-500": {
"Score": 38.6,
"Cost($)": 0.0
}
},
"ReAct-Pro*-Llama-3.1-8B-Instruct": {
"META": {
"Algorithm": "ReAct-Pro*",
"LLM": "Llama-3.1-8B-Instruct",
"Eval Date": "2025/1/22"
},
"gsm8k": {
"Score": 67.78,
"Cost($)": 0.0
},
"AQuA": {
"Score": 55.51,
"Cost($)": 0.0
},
"MATH-500": {
"Score": 28.8,
"Cost($)": 0.0
}
},
"PoT-Llama-3.1-8B-Instruct": {
"META": {
"Algorithm": "PoT",
"LLM": "Llama-3.1-8B-Instruct",
"Eval Date": "2025/1/22"
},
"gsm8k": {
"Score": 38.67,
"Cost($)": 0.0
},
"AQuA": {
"Score": 36.61,
"Cost($)": 0.0
},
"MATH-500": {
"Score": 25.4,
"Cost($)": 0.0
}
},
"CoT-Llama-3.1-8B-Instruct": {
"META": {
"Algorithm": "CoT",
"LLM": "Llama-3.1-8B-Instruct",
"Eval Date": "2025/1/22"
},
"gsm8k": {
"Score": 75.44,
"Cost($)": 0.0
},
"AQuA": {
"Score": 60.63,
"Cost($)": 0.0
},
"MATH-500": {
"Score": 25.8,
"Cost($)": 0.0
}
},
"SC-CoT-Llama-3.1-8B-Instruct": {
"META": {
"Algorithm": "SC-CoT",
"LLM": "Llama-3.1-8B-Instruct",
"Eval Date": "2025/1/22"
},
"gsm8k": {
"Score": 73.46,
"Cost($)": 0.0
},
"AQuA": {
"Score": 59.45,
"Cost($)": 0.0
},
"MATH-500": {
"Score": 30.2,
"Cost($)": 0.0
}
},
"ToT-Llama-3.1-8B-Instruct": {
"META": {
"Algorithm": "ToT",
"LLM": "Llama-3.1-8B-Instruct",
"Eval Date": "2025/1/22"
},
"gsm8k": {
"Score": 65.05,
"Cost($)": 0.0
},
"AQuA": {
"Score": 59.06,
"Cost($)": 0.0
},
"MATH-500": {
"Score": 1.8,
"Cost($)": 0.0
}
},
"IO-Internllm2_5-7B": {
"META": {
"Algorithm": "IO",
"LLM": "Internllm2_5-7B",
"Eval Date": "2025/1/22"
},
"gsm8k": {
"Score": 11.6,
"Cost($)": 0.0
},
"AQuA": {
"Score": 47.64,
"Cost($)": 0.0
},
"MATH-500": {
"Score": 22.8,
"Cost($)": 0.0
}
},
"ReAct-Pro*-Internllm2_5-7B": {
"META": {
"Algorithm": "ReAct-Pro*",
"LLM": "Internllm2_5-7B",
"Eval Date": "2025/1/22"
},
"gsm8k": {
"Score": 33.51,
"Cost($)": 0.0
},
"AQuA": {
"Score": 40.94,
"Cost($)": 0.0
},
"MATH-500": {
"Score": 14.8,
"Cost($)": 0.0
}
},
"PoT-Internllm2_5-7B": {
"META": {
"Algorithm": "PoT",
"LLM": "Internllm2_5-7B",
"Eval Date": "2025/1/22"
},
"gsm8k": {
"Score": 38.21,
"Cost($)": 0.0
},
"AQuA": {
"Score": 36.61,
"Cost($)": 0.0
},
"MATH-500": {
"Score": 15.0,
"Cost($)": 0.0
}
},
"CoT-Internllm2_5-7B": {
"META": {
"Algorithm": "CoT",
"LLM": "Internllm2_5-7B",
"Eval Date": "2025/1/22"
},
"gsm8k": {
"Score": 77.71,
"Cost($)": 0.0
},
"AQuA": {
"Score": 52.76,
"Cost($)": 0.0
},
"MATH-500": {
"Score": 46.6,
"Cost($)": 0.0
}
},
"SC-CoT-Internllm2_5-7B": {
"META": {
"Algorithm": "SC-CoT",
"LLM": "Internllm2_5-7B",
"Eval Date": "2025/1/22"
},
"gsm8k": {
"Score": 48.22,
"Cost($)": 0.0
},
"AQuA": {
"Score": 39.37,
"Cost($)": 0.0
},
"MATH-500": {
"Score": 9.8,
"Cost($)": 0.0
}
},
"ToT-Internllm2_5-7B": {
"META": {
"Algorithm": "ToT",
"LLM": "Internllm2_5-7B",
"Eval Date": "2025/1/22"
},
"gsm8k": {
"Score": 20.85,
"Cost($)": 0.0
},
"AQuA": {
"Score": 35.83,
"Cost($)": 0.0
},
"MATH-500": {
"Score": 0.2,
"Cost($)": 0.0
}
},
"IO-Qwen2-1.5B-Instruct": {
"META": {
"Algorithm": "IO",
"LLM": "Qwen2-1.5B-Instruct",
"Eval Date": "2025/1/22"
},
"gsm8k": {
"Score": 16.68,
"Cost($)": 0.0
},
"AQuA": {
"Score": 29.13,
"Cost($)": 0.0
},
"MATH-500": {
"Score": 7.0,
"Cost($)": 0.0
}
},
"ReAct-Pro*-Qwen2-1.5B-Instruct": {
"META": {
"Algorithm": "ReAct-Pro*",
"LLM": "Qwen2-1.5B-Instruct",
"Eval Date": "2025/1/22"
},
"gsm8k": {
"Score": 24.87,
"Cost($)": 0.0
},
"AQuA": {
"Score": 25.59,
"Cost($)": 0.0
},
"MATH-500": {
"Score": 8.2,
"Cost($)": 0.0
}
},
"PoT-Qwen2-1.5B-Instruct": {
"META": {
"Algorithm": "PoT",
"LLM": "Qwen2-1.5B-Instruct",
"Eval Date": "2025/1/22"
},
"gsm8k": {
"Score": 18.5,
"Cost($)": 0.0
},
"AQuA": {
"Score": 30.71,
"Cost($)": 0.0
},
"MATH-500": {
"Score": 0.8,
"Cost($)": 0.0
}
},
"CoT-Qwen2-1.5B-Instruct": {
"META": {
"Algorithm": "CoT",
"LLM": "Qwen2-1.5B-Instruct",
"Eval Date": "2025/1/22"
},
"gsm8k": {
"Score": 55.5,
"Cost($)": 0.0
},
"AQuA": {
"Score": 40.55,
"Cost($)": 0.0
},
"MATH-500": {
"Score": 15.2,
"Cost($)": 0.0
}
},
"SC-CoT-Qwen2-1.5B-Instruct": {
"META": {
"Algorithm": "SC-CoT",
"LLM": "Qwen2-1.5B-Instruct",
"Eval Date": "2025/1/22"
},
"gsm8k": {
"Score": 11.75,
"Cost($)": 0.0
},
"AQuA": {
"Score": 23.62,
"Cost($)": 0.0
},
"MATH-500": {
"Score": 3.8,
"Cost($)": 0.0
}
},
"ToT-Qwen2-1.5B-Instruct": {
"META": {
"Algorithm": "ToT",
"LLM": "Qwen2-1.5B-Instruct",
"Eval Date": "2025/1/22"
},
"gsm8k": {
"Score": 19.64,
"Cost($)": 0.0
},
"AQuA": {
"Score": 31.5,
"Cost($)": 0.0
},
"MATH-500": {
"Score": 0.8,
"Cost($)": 0.0
}
},
"IO-Qwen2-0.5B-Instruct": {
"META": {
"Algorithm": "IO",
"LLM": "Qwen2-0.5B-Instruct",
"Eval Date": "2025/1/22"
},
"gsm8k": {
"Score": 14.71,
"Cost($)": 0.0
},
"AQuA": {
"Score": 27.17,
"Cost($)": 0.0
},
"MATH-500": {
"Score": 2.6,
"Cost($)": 0.0
}
},
"ReAct-Pro*-Qwen2-0.5B-Instruct": {
"META": {
"Algorithm": "ReAct-Pro*",
"LLM": "Qwen2-0.5B-Instruct",
"Eval Date": "2025/1/22"
},
"gsm8k": {
"Score": 7.66,
"Cost($)": 0.0
},
"AQuA": {
"Score": 24.02,
"Cost($)": 0.0
},
"MATH-500": {
"Score": 0.6,
"Cost($)": 0.0
}
},
"PoT-Qwen2-0.5B-Instruct": {
"META": {
"Algorithm": "PoT",
"LLM": "Qwen2-0.5B-Instruct",
"Eval Date": "2025/1/22"
},
"gsm8k": {
"Score": 9.63,
"Cost($)": 0.0
},
"AQuA": {
"Score": 17.32,
"Cost($)": 0.0
},
"MATH-500": {
"Score": 0.0,
"Cost($)": 0.0
}
},
"CoT-Qwen2-0.5B-Instruct": {
"META": {
"Algorithm": "CoT",
"LLM": "Qwen2-0.5B-Instruct",
"Eval Date": "2025/1/22"
},
"gsm8k": {
"Score": 35.94,
"Cost($)": 0.0
},
"AQuA": {
"Score": 33.07,
"Cost($)": 0.0
},
"MATH-500": {
"Score": 6.2,
"Cost($)": 0.0
}
},
"SC-CoT-Qwen2-0.5B-Instruct": {
"META": {
"Algorithm": "SC-CoT",
"LLM": "Qwen2-0.5B-Instruct",
"Eval Date": "2025/1/22"
},
"gsm8k": {
"Score": 1.67,
"Cost($)": 0.0
},
"AQuA": {
"Score": 22.83,
"Cost($)": 0.0
},
"MATH-500": {
"Score": 0.8,
"Cost($)": 0.0
}
},
"ToT-Qwen2-0.5B-Instruct": {
"META": {
"Algorithm": "ToT",
"LLM": "Qwen2-0.5B-Instruct",
"Eval Date": "2025/1/22"
},
"gsm8k": {
"Score": 0.0,
"Cost($)": 0.0
},
"AQuA": {
"Score": 29.92,
"Cost($)": 0.0
},
"MATH-500": {
"Score": 0.0,
"Cost($)": 0.0
}
},
"IO-deepseek-r1:1.5b": {
"META": {
"Algorithm": "IO",
"LLM": "deepseek-r1:1.5b",
"Eval Date": "2025/1/22"
},
"gsm8k": {
"Score": 64.14,
"Cost($)": 0.0
},
"AQuA": {
"Score": 68.9,
"Cost($)": 0.0
},
"MATH-500": {
"Score": 43.8,
"Cost($)": 0.0
}
},
"ReAct-Pro*-deepseek-r1:1.5b": {
"META": {
"Algorithm": "ReAct-Pro*",
"LLM": "deepseek-r1:1.5b",
"Eval Date": "2025/2/10"
},
"gsm8k": {
"Score": 35.94,
"Cost($)": 0.0
},
"AQuA": {
"Score": 54.33,
"Cost($)": 0.0
},
"MATH-500": {
"Score": 24.4,
"Cost($)": 0.0
}
},
"PoT-deepseek-r1:1.5b": {
"META": {
"Algorithm": "PoT",
"LLM": "deepseek-r1:1.5b",
"Eval Date": "2025/2/10"
},
"gsm8k": {
"Score": 11.9,
"Cost($)": 0.0
},
"AQuA": {
"Score": 54.72,
"Cost($)": 0.0
},
"MATH-500": {
"Score": 1.0,
"Cost($)": 0.0
}
},
"CoT-deepseek-r1:1.5b": {
"META": {
"Algorithm": "CoT",
"LLM": "deepseek-r1:1.5b",
"Eval Date": "2025/1/23"
},
"gsm8k": {
"Score": 70.66,
"Cost($)": 0.0
},
"AQuA": {
"Score": 71.65,
"Cost($)": 0.0
},
"MATH-500": {
"Score": 49.4,
"Cost($)": 0.0
}
},
"SC-CoT-deepseek-r1:1.5b": {
"META": {
"Algorithm": "SC-CoT",
"LLM": "deepseek-r1:1.5b",
"Eval Date": "2025/2/10"
},
"gsm8k": {
"Score": 55.34,
"Cost($)": 0.0
},
"AQuA": {
"Score": 59.06,
"Cost($)": 0.0
},
"MATH-500": {
"Score": 38.0,
"Cost($)": 0.0
}
},
"ToT-deepseek-r1:1.5b": {
"META": {
"Algorithm": "ToT",
"LLM": "deepseek-r1:1.5b",
"Eval Date": "2025/2/10"
},
"gsm8k": {
"Score": 23.12,
"Cost($)": 0.0
},
"AQuA": {
"Score": 24.8,
"Cost($)": 0.0
},
"MATH-500": {
"Score": 0.4,
"Cost($)": 0.0
}
}
}
}