{ "time": "2025-02-11 13:22:59", "results": { "IO": { "gpt-3.5-turbo": { "META": { "Algorithm": "IO", "LLM": "gpt-3.5-turbo", "Eval Date": "2025/1/7" }, "gsm8k": { "Score": 37.83, "Pass rate": 0.9992, "Cost($)": 0.3328, "Framework": "", "X-shot": "8", "Samples": 1319, "All tokens": 586553, "Total input tokens": 546990, "Average input tokens": 415, "Total output tokens": 39563, "Average output tokens": 30 }, "AQuA": { "Score": 38.98, "Pass rate": 1.0, "Cost($)": 0.038, "Framework": "", "X-shot": "0", "Samples": 254, "All tokens": 42471, "Total input tokens": 25701, "Average input tokens": 101, "Total output tokens": 16770, "Average output tokens": 66 }, "MATH-500": { "Score": 17.2, "Pass rate": 1.0, "Cost($)": 0.2436, "Framework": "", "X-shot": "4", "Samples": 500, "All tokens": 265625, "Total input tokens": 154881, "Average input tokens": 310, "Total output tokens": 110744, "Average output tokens": 221 } }, "Doubao-lite-32k": { "META": { "Algorithm": "IO", "LLM": "Doubao-lite-32k", "Eval Date": "2025/1/7" }, "gsm8k": { "Score": 72.02, "Pass rate": 0.9992, "Cost($)": 0.0354, "Framework": "", "X-shot": "8", "Samples": 1319, "All tokens": 740483, "Total input tokens": 617377, "Average input tokens": 468, "Total output tokens": 123106, "Average output tokens": 93 }, "AQuA": { "Score": 79.13, "Pass rate": 1.0, "Cost($)": 0.0058, "Framework": "", "X-shot": "0", "Samples": 254, "All tokens": 87742, "Total input tokens": 33058, "Average input tokens": 130, "Total output tokens": 54684, "Average output tokens": 215 }, "MATH-500": { "Score": 37.4, "Pass rate": 1.0, "Cost($)": 0.0187, "Framework": "", "X-shot": "4", "Samples": 500, "All tokens": 311730, "Total input tokens": 166870, "Average input tokens": 334, "Total output tokens": 144860, "Average output tokens": 290 } }, "gpt-4o": { "META": { "Algorithm": "IO", "LLM": "gpt-4o", "Eval Date": "2025/1/22" }, "gsm8k": { "Score": 88.4, "Pass rate": 1.0, "Cost($)": 3.3463, "Framework": "", "X-shot": "8", "Samples": 1319, "All tokens": 741446, "Total input tokens": 542416, "Average input tokens": 411, "Total output tokens": 199030, "Average output tokens": 151 }, "AQuA": { "Score": 75.59, "Pass rate": 0.9724, "Cost($)": 1.1453, "Framework": "", "X-shot": "0", "Samples": 254, "All tokens": 133752, "Total input tokens": 25631, "Average input tokens": 101, "Total output tokens": 108121, "Average output tokens": 426 }, "MATH-500": { "Score": 41.8, "Pass rate": 1.0, "Cost($)": 2.7907, "Framework": "", "X-shot": "4", "Samples": 500, "All tokens": 394447, "Total input tokens": 153832, "Average input tokens": 308, "Total output tokens": 240615, "Average output tokens": 481 } }, "Qwen2.5-72B-Instruct": { "META": { "Algorithm": "IO", "LLM": "Qwen2.5-72B-Instruct", "Eval Date": "2025/1/22" }, "gsm8k": { "Score": 86.58, "Pass rate": 1.0, "Cost($)": 0.4899, "Framework": "", "X-shot": "8", "Samples": 1319, "All tokens": 869060, "Total input tokens": 555340, "Average input tokens": 421, "Total output tokens": 313720, "Average output tokens": 238 }, "AQuA": { "Score": 84.25, "Pass rate": 0.9961, "Cost($)": 0.0742, "Framework": "", "X-shot": "0", "Samples": 254, "All tokens": 131604, "Total input tokens": 25397, "Average input tokens": 100, "Total output tokens": 106207, "Average output tokens": 418 }, "MATH-500": { "Score": 70.2, "Pass rate": 1.0, "Cost($)": 0.2506, "Framework": "", "X-shot": "4", "Samples": 500, "All tokens": 444591, "Total input tokens": 169549, "Average input tokens": 339, "Total output tokens": 275042, "Average output tokens": 550 } }, "Llama-3.3-70B-Instruct": { "META": { "Algorithm": "IO", "LLM": "Llama-3.3-70B-Instruct", "Eval Date": "2025/1/22" }, "gsm8k": { "Score": 92.27, "Pass rate": 1.0, "Cost($)": 0.4709, "Framework": "", "X-shot": "8", "Samples": 1319, "All tokens": 835275, "Total input tokens": 583916, "Average input tokens": 443, "Total output tokens": 251359, "Average output tokens": 191 }, "AQuA": { "Score": 82.68, "Pass rate": 0.9921, "Cost($)": 0.0798, "Framework": "", "X-shot": "0", "Samples": 254, "All tokens": 141567, "Total input tokens": 32809, "Average input tokens": 129, "Total output tokens": 108758, "Average output tokens": 428 }, "MATH-500": { "Score": 69.4, "Pass rate": 1.0, "Cost($)": 0.2386, "Framework": "", "X-shot": "4", "Samples": 500, "All tokens": 423216, "Total input tokens": 155879, "Average input tokens": 312, "Total output tokens": 267337, "Average output tokens": 535 } }, "Qwen2.5-7B-Instruct": { "META": { "Algorithm": "IO", "LLM": "Qwen2.5-7B-Instruct", "Eval Date": "2025/1/22" }, "gsm8k": { "Score": 57.24, "Pass rate": 1.0, "Cost($)": 0.0, "Framework": "", "X-shot": "8", "Samples": 1319, "All tokens": 887913, "Total input tokens": 596229, "Average input tokens": 452, "Total output tokens": 291684, "Average output tokens": 221 }, "AQuA": { "Score": 78.74, "Pass rate": 0.9843, "Cost($)": 0.0, "Framework": "", "X-shot": "0", "Samples": 254, "All tokens": 137771, "Total input tokens": 33271, "Average input tokens": 131, "Total output tokens": 104500, "Average output tokens": 411 }, "MATH-500": { "Score": 59.4, "Pass rate": 1.0, "Cost($)": 0.0, "Framework": "", "X-shot": "4", "Samples": 500, "All tokens": 411362, "Total input tokens": 169549, "Average input tokens": 339, "Total output tokens": 241813, "Average output tokens": 484 } }, "Llama-3.1-8B-Instruct": { "META": { "Algorithm": "IO", "LLM": "Llama-3.1-8B-Instruct", "Eval Date": "2025/1/22" }, "gsm8k": { "Score": 57.16, "Pass rate": 0.9955, "Cost($)": 0.0, "Framework": "", "X-shot": "8", "Samples": 1319, "All tokens": 1745429, "Total input tokens": 550941, "Average input tokens": 418, "Total output tokens": 1194488, "Average output tokens": 906 }, "AQuA": { "Score": 51.18, "Pass rate": 0.9882, "Cost($)": 0.0, "Framework": "", "X-shot": "0", "Samples": 254, "All tokens": 133106, "Total input tokens": 26459, "Average input tokens": 104, "Total output tokens": 106647, "Average output tokens": 420 }, "MATH-500": { "Score": 38.6, "Pass rate": 1.0, "Cost($)": 0.0, "Framework": "", "X-shot": "4", "Samples": 500, "All tokens": 503934, "Total input tokens": 155563, "Average input tokens": 311, "Total output tokens": 348371, "Average output tokens": 697 } }, "Internllm2_5-7B": { "META": { "Algorithm": "IO", "LLM": "Internllm2_5-7B", "Eval Date": "2025/1/22" }, "gsm8k": { "Score": 11.6, "Pass rate": 0.9795, "Cost($)": 0.0, "Framework": "", "X-shot": "8", "Samples": 1319, "All tokens": 1113728, "Total input tokens": 679302, "Average input tokens": 515, "Total output tokens": 434426, "Average output tokens": 329 }, "AQuA": { "Score": 47.64, "Pass rate": 0.9094, "Cost($)": 0.0, "Framework": "", "X-shot": "0", "Samples": 254, "All tokens": 185041, "Total input tokens": 50232, "Average input tokens": 198, "Total output tokens": 134809, "Average output tokens": 531 }, "MATH-500": { "Score": 22.8, "Pass rate": 1.0, "Cost($)": 0.0, "Framework": "", "X-shot": "4", "Samples": 500, "All tokens": 467888, "Total input tokens": 201883, "Average input tokens": 404, "Total output tokens": 266005, "Average output tokens": 532 } }, "Qwen2-1.5B-Instruct": { "META": { "Algorithm": "IO", "LLM": "Qwen2-1.5B-Instruct", "Eval Date": "2025/1/22" }, "gsm8k": { "Score": 16.68, "Pass rate": 1.0, "Cost($)": 0.0, "Framework": "", "X-shot": "8", "Samples": 1319, "All tokens": 736996, "Total input tokens": 568530, "Average input tokens": 431, "Total output tokens": 168466, "Average output tokens": 128 }, "AQuA": { "Score": 29.13, "Pass rate": 0.9764, "Cost($)": 0.0, "Framework": "", "X-shot": "0", "Samples": 254, "All tokens": 71047, "Total input tokens": 27937, "Average input tokens": 110, "Total output tokens": 43110, "Average output tokens": 170 }, "MATH-500": { "Score": 7.0, "Pass rate": 1.0, "Cost($)": 0.0, "Framework": "", "X-shot": "4", "Samples": 500, "All tokens": 413878, "Total input tokens": 158777, "Average input tokens": 318, "Total output tokens": 255101, "Average output tokens": 510 } }, "Qwen2-0.5B-Instruct": { "META": { "Algorithm": "IO", "LLM": "Qwen2-0.5B-Instruct", "Eval Date": "2025/1/22" }, "gsm8k": { "Score": 14.71, "Pass rate": 1.0, "Cost($)": 0.0, "Framework": "", "X-shot": "8", "Samples": 1319, "All tokens": 834897, "Total input tokens": 568116, "Average input tokens": 431, "Total output tokens": 266781, "Average output tokens": 202 }, "AQuA": { "Score": 27.17, "Pass rate": 0.9882, "Cost($)": 0.0, "Framework": "", "X-shot": "0", "Samples": 254, "All tokens": 110415, "Total input tokens": 27937, "Average input tokens": 110, "Total output tokens": 82478, "Average output tokens": 325 }, "MATH-500": { "Score": 2.6, "Pass rate": 1.0, "Cost($)": 0.0, "Framework": "", "X-shot": "4", "Samples": 500, "All tokens": 429330, "Total input tokens": 159049, "Average input tokens": 318, "Total output tokens": 270281, "Average output tokens": 541 } }, "deepseek-r1:1.5b": { "META": { "Algorithm": "IO", "LLM": "deepseek-r1:1.5b", "Eval Date": "2025/1/22" }, "gsm8k": { "Score": 64.14, "Pass rate": 0.9962, "Cost($)": 0.0, "Framework": "", "X-shot": "8", "Samples": 1319, "All tokens": 1483051, "Total input tokens": 561935, "Average input tokens": 426, "Total output tokens": 921116, "Average output tokens": 698 }, "AQuA": { "Score": 68.9, "Pass rate": 0.9488, "Cost($)": 0.0, "Framework": "", "X-shot": "0", "Samples": 254, "All tokens": 351767, "Total input tokens": 26667, "Average input tokens": 105, "Total output tokens": 325100, "Average output tokens": 1280 }, "MATH-500": { "Score": 43.8, "Pass rate": 1.0, "Cost($)": 0.0, "Framework": "", "X-shot": "4", "Samples": 500, "All tokens": 1022548, "Total input tokens": 157049, "Average input tokens": 314, "Total output tokens": 865499, "Average output tokens": 1731 } } }, "ReAct-Pro*": { "gpt-3.5-turbo": { "META": { "Algorithm": "ReAct-Pro*", "LLM": "gpt-3.5-turbo", "Eval Date": "2025/1/7" }, "gsm8k": { "Score": 74.91, "Pass rate": 0.9939, "Cost($)": 3.4633, "Framework": "", "X-shot": "8", "Samples": 1319, "All tokens": 6646286, "Total input tokens": 6506164, "Average input tokens": 4933, "Total output tokens": 140122, "Average output tokens": 106 }, "AQuA": { "Score": 64.57, "Pass rate": 0.9803, "Cost($)": 0.4928, "Framework": "", "X-shot": "0", "Samples": 254, "All tokens": 903587, "Total input tokens": 862614, "Average input tokens": 3396, "Total output tokens": 40973, "Average output tokens": 161 }, "MATH-500": { "Score": 23.8, "Pass rate": 1.0, "Cost($)": 2.0406, "Framework": "", "X-shot": "4", "Samples": 500, "All tokens": 3832714, "Total input tokens": 3708461, "Average input tokens": 7417, "Total output tokens": 124253, "Average output tokens": 249 } }, "Doubao-lite-32k": { "META": { "Algorithm": "ReAct-Pro*", "LLM": "Doubao-lite-32k", "Eval Date": "2025/1/7" }, "gsm8k": { "Score": 85.6, "Pass rate": 0.9962, "Cost($)": 0.2512, "Framework": "", "X-shot": "8", "Samples": 1319, "All tokens": 5998639, "Total input tokens": 5862016, "Average input tokens": 4444, "Total output tokens": 136623, "Average output tokens": 104 }, "AQuA": { "Score": 77.56, "Pass rate": 0.9606, "Cost($)": 0.0445, "Framework": "", "X-shot": "0", "Samples": 254, "All tokens": 1032841, "Total input tokens": 977890, "Average input tokens": 3850, "Total output tokens": 54951, "Average output tokens": 216 }, "MATH-500": { "Score": 47.2, "Pass rate": 1.0, "Cost($)": 0.186, "Framework": "", "X-shot": "4", "Samples": 500, "All tokens": 4388666, "Total input tokens": 4234620, "Average input tokens": 8469, "Total output tokens": 154046, "Average output tokens": 308 } }, "gpt-4o": { "META": { "Algorithm": "ReAct-Pro*", "LLM": "gpt-4o", "Eval Date": "2025/1/22" }, "gsm8k": { "Score": 63.31, "Pass rate": 0.9955, "Cost($)": 39.0751, "Framework": "", "X-shot": "8", "Samples": 1319, "All tokens": 14715887, "Total input tokens": 14411173, "Average input tokens": 10926, "Total output tokens": 304714, "Average output tokens": 231 }, "AQuA": { "Score": 57.48, "Pass rate": 0.9724, "Cost($)": 2.304, "Framework": "", "X-shot": "0", "Samples": 254, "All tokens": 692096, "Total input tokens": 615589, "Average input tokens": 2424, "Total output tokens": 76507, "Average output tokens": 301 }, "MATH-500": { "Score": 54.0, "Pass rate": 1.0, "Cost($)": 17.7735, "Framework": "", "X-shot": "4", "Samples": 500, "All tokens": 6153255, "Total input tokens": 5834537, "Average input tokens": 11669, "Total output tokens": 318718, "Average output tokens": 637 } }, "Qwen2.5-72B-Instruct": { "META": { "Algorithm": "ReAct-Pro*", "LLM": "Qwen2.5-72B-Instruct", "Eval Date": "2025/1/22" }, "gsm8k": { "Score": 87.26, "Pass rate": 1.0, "Cost($)": 10.5479, "Framework": "", "X-shot": "8", "Samples": 1319, "All tokens": 18710437, "Total input tokens": 18160983, "Average input tokens": 13769, "Total output tokens": 549454, "Average output tokens": 417 }, "AQuA": { "Score": 73.23, "Pass rate": 1.0, "Cost($)": 0.3177, "Framework": "", "X-shot": "0", "Samples": 254, "All tokens": 563603, "Total input tokens": 441765, "Average input tokens": 1739, "Total output tokens": 121838, "Average output tokens": 480 }, "MATH-500": { "Score": 62.8, "Pass rate": 1.0, "Cost($)": 3.4541, "Framework": "", "X-shot": "4", "Samples": 500, "All tokens": 6127117, "Total input tokens": 5747268, "Average input tokens": 11495, "Total output tokens": 379849, "Average output tokens": 760 } }, "Llama-3.3-70B-Instruct": { "META": { "Algorithm": "ReAct-Pro*", "LLM": "Llama-3.3-70B-Instruct", "Eval Date": "2025/1/22" }, "gsm8k": { "Score": 87.64, "Pass rate": 0.9992, "Cost($)": 10.1124, "Framework": "", "X-shot": "8", "Samples": 1319, "All tokens": 17937864, "Total input tokens": 17038928, "Average input tokens": 12918, "Total output tokens": 898936, "Average output tokens": 682 }, "AQuA": { "Score": 79.13, "Pass rate": 0.9961, "Cost($)": 0.768, "Framework": "", "X-shot": "0", "Samples": 254, "All tokens": 1362379, "Total input tokens": 1119143, "Average input tokens": 4406, "Total output tokens": 243236, "Average output tokens": 958 }, "MATH-500": { "Score": 64.6, "Pass rate": 1.0, "Cost($)": 3.1806, "Framework": "", "X-shot": "4", "Samples": 500, "All tokens": 5641879, "Total input tokens": 5223611, "Average input tokens": 10447, "Total output tokens": 418268, "Average output tokens": 837 } }, "Qwen2.5-7B-Instruct": { "META": { "Algorithm": "ReAct-Pro*", "LLM": "Qwen2.5-7B-Instruct", "Eval Date": "2025/1/22" }, "gsm8k": { "Score": 82.87, "Pass rate": 1.0, "Cost($)": 0.0, "Framework": "", "X-shot": "8", "Samples": 1319, "All tokens": 14850914, "Total input tokens": 14355752, "Average input tokens": 10884, "Total output tokens": 495162, "Average output tokens": 375 }, "AQuA": { "Score": 74.41, "Pass rate": 0.9921, "Cost($)": 0.0, "Framework": "", "X-shot": "0", "Samples": 254, "All tokens": 695844, "Total input tokens": 564165, "Average input tokens": 2221, "Total output tokens": 131679, "Average output tokens": 518 }, "MATH-500": { "Score": 48.8, "Pass rate": 1.0, "Cost($)": 0.0, "Framework": "", "X-shot": "4", "Samples": 500, "All tokens": 4990240, "Total input tokens": 4646708, "Average input tokens": 9293, "Total output tokens": 343532, "Average output tokens": 687 } }, "Llama-3.1-8B-Instruct": { "META": { "Algorithm": "ReAct-Pro*", "LLM": "Llama-3.1-8B-Instruct", "Eval Date": "2025/1/22" }, "gsm8k": { "Score": 67.78, "Pass rate": 0.9856, "Cost($)": 0.0, "Framework": "", "X-shot": "8", "Samples": 1319, "All tokens": 22835767, "Total input tokens": 21044978, "Average input tokens": 15955, "Total output tokens": 1790789, "Average output tokens": 1358 }, "AQuA": { "Score": 55.51, "Pass rate": 0.9685, "Cost($)": 0.0, "Framework": "", "X-shot": "0", "Samples": 254, "All tokens": 4340821, "Total input tokens": 3764723, "Average input tokens": 14822, "Total output tokens": 576098, "Average output tokens": 2268 }, "MATH-500": { "Score": 28.8, "Pass rate": 1.0, "Cost($)": 0.0, "Framework": "", "X-shot": "4", "Samples": 500, "All tokens": 8763629, "Total input tokens": 7486706, "Average input tokens": 14973, "Total output tokens": 1276923, "Average output tokens": 2554 } }, "Internllm2_5-7B": { "META": { "Algorithm": "ReAct-Pro*", "LLM": "Internllm2_5-7B", "Eval Date": "2025/1/22" }, "gsm8k": { "Score": 33.51, "Pass rate": 0.9795, "Cost($)": 0.0, "Framework": "", "X-shot": "8", "Samples": 1319, "All tokens": 35669989, "Total input tokens": 30120070, "Average input tokens": 22836, "Total output tokens": 5549919, "Average output tokens": 4208 }, "AQuA": { "Score": 40.94, "Pass rate": 0.9685, "Cost($)": 0.0, "Framework": "", "X-shot": "0", "Samples": 254, "All tokens": 4428801, "Total input tokens": 3592039, "Average input tokens": 14142, "Total output tokens": 836762, "Average output tokens": 3294 }, "MATH-500": { "Score": 14.8, "Pass rate": 1.0, "Cost($)": 0.0, "Framework": "", "X-shot": "4", "Samples": 500, "All tokens": 14186105, "Total input tokens": 11831496, "Average input tokens": 23663, "Total output tokens": 2354609, "Average output tokens": 4709 } }, "Qwen2-1.5B-Instruct": { "META": { "Algorithm": "ReAct-Pro*", "LLM": "Qwen2-1.5B-Instruct", "Eval Date": "2025/1/22" }, "gsm8k": { "Score": 24.87, "Pass rate": 0.8021, "Cost($)": 0.0, "Framework": "", "X-shot": "8", "Samples": 1319, "All tokens": 9828001, "Total input tokens": 9133603, "Average input tokens": 6925, "Total output tokens": 694398, "Average output tokens": 526 }, "AQuA": { "Score": 25.59, "Pass rate": 0.9606, "Cost($)": 0.0, "Framework": "", "X-shot": "0", "Samples": 254, "All tokens": 5072004, "Total input tokens": 4555858, "Average input tokens": 17936, "Total output tokens": 516146, "Average output tokens": 2032 }, "MATH-500": { "Score": 8.2, "Pass rate": 1.0, "Cost($)": 0.0, "Framework": "", "X-shot": "4", "Samples": 500, "All tokens": 8987061, "Total input tokens": 8430774, "Average input tokens": 16862, "Total output tokens": 556287, "Average output tokens": 1113 } }, "Qwen2-0.5B-Instruct": { "META": { "Algorithm": "ReAct-Pro*", "LLM": "Qwen2-0.5B-Instruct", "Eval Date": "2025/1/22" }, "gsm8k": { "Score": 7.66, "Pass rate": 0.9522, "Cost($)": 0.0, "Framework": "", "X-shot": "8", "Samples": 1319, "All tokens": 55392611, "Total input tokens": 52431343, "Average input tokens": 39751, "Total output tokens": 2961268, "Average output tokens": 2245 }, "AQuA": { "Score": 24.02, "Pass rate": 0.9685, "Cost($)": 0.0, "Framework": "", "X-shot": "0", "Samples": 254, "All tokens": 7170087, "Total input tokens": 6344167, "Average input tokens": 24977, "Total output tokens": 825920, "Average output tokens": 3252 }, "MATH-500": { "Score": 0.6, "Pass rate": 1.0, "Cost($)": 0.0, "Framework": "", "X-shot": "4", "Samples": 500, "All tokens": 19442440, "Total input tokens": 18137392, "Average input tokens": 36275, "Total output tokens": 1305048, "Average output tokens": 2610 } }, "deepseek-r1:1.5b": { "META": { "Algorithm": "ReAct-Pro*", "LLM": "deepseek-r1:1.5b", "Eval Date": "2025/2/10" }, "gsm8k": { "Score": 35.94, "Pass rate": 0.9962, "Cost($)": 0.0, "Framework": "", "X-shot": "8", "Samples": 1319, "All tokens": 24219077, "Total input tokens": 19299381, "Average input tokens": 14632, "Total output tokens": 4919696, "Average output tokens": 3730 }, "AQuA": { "Score": 54.33, "Pass rate": 0.9646, "Cost($)": 0.0, "Framework": "", "X-shot": "0", "Samples": 254, "All tokens": 14445041, "Total input tokens": 10578715, "Average input tokens": 41648, "Total output tokens": 3866326, "Average output tokens": 15222 }, "MATH-500": { "Score": 24.4, "Pass rate": 1.0, "Cost($)": 0.0, "Framework": "", "X-shot": "4", "Samples": 500, "All tokens": 30177348, "Total input tokens": 20729970, "Average input tokens": 41460, "Total output tokens": 9447378, "Average output tokens": 18895 } } }, "PoT": { "gpt-3.5-turbo": { "META": { "Algorithm": "PoT", "LLM": "gpt-3.5-turbo", "Eval Date": "2025/1/7" }, "gsm8k": { "Score": 76.88, "Pass rate": 0.9924, "Cost($)": 0.6902, "Framework": "", "X-shot": "8", "Samples": 1319, "All tokens": 1187080, "Total input tokens": 1090418, "Average input tokens": 827, "Total output tokens": 96662, "Average output tokens": 73 }, "AQuA": { "Score": 59.45, "Pass rate": 1.0, "Cost($)": 0.1748, "Framework": "", "X-shot": "0", "Samples": 254, "All tokens": 266654, "Total input tokens": 225162, "Average input tokens": 886, "Total output tokens": 41492, "Average output tokens": 163 }, "MATH-500": { "Score": 28.8, "Pass rate": 0.838, "Cost($)": 0.168, "Framework": "", "X-shot": "4", "Samples": 500, "All tokens": 271916, "Total input tokens": 239902, "Average input tokens": 480, "Total output tokens": 32014, "Average output tokens": 64 } }, "Doubao-lite-32k": { "META": { "Algorithm": "PoT", "LLM": "Doubao-lite-32k", "Eval Date": "2025/1/7" }, "gsm8k": { "Score": 79.61, "Pass rate": 0.9257, "Cost($)": 0.0576, "Framework": "", "X-shot": "8", "Samples": 1319, "All tokens": 1288055, "Total input tokens": 1170038, "Average input tokens": 887, "Total output tokens": 118017, "Average output tokens": 89 }, "AQuA": { "Score": 71.65, "Pass rate": 0.9685, "Cost($)": 0.0147, "Framework": "", "X-shot": "0", "Samples": 254, "All tokens": 309436, "Total input tokens": 259863, "Average input tokens": 1023, "Total output tokens": 49573, "Average output tokens": 195 }, "MATH-500": { "Score": 32.6, "Pass rate": 0.68, "Cost($)": 0.0144, "Framework": "", "X-shot": "4", "Samples": 500, "All tokens": 303148, "Total input tokens": 254377, "Average input tokens": 509, "Total output tokens": 48771, "Average output tokens": 98 } }, "gpt-4o": { "META": { "Algorithm": "PoT", "LLM": "gpt-4o", "Eval Date": "2025/1/22" }, "gsm8k": { "Score": 93.1, "Pass rate": 0.9977, "Cost($)": 4.2166, "Framework": "", "X-shot": "8", "Samples": 1319, "All tokens": 1247912, "Total input tokens": 1101672, "Average input tokens": 835, "Total output tokens": 146240, "Average output tokens": 111 }, "AQuA": { "Score": 75.2, "Pass rate": 1.0, "Cost($)": 1.6087, "Framework": "", "X-shot": "0", "Samples": 254, "All tokens": 327908, "Total input tokens": 222717, "Average input tokens": 877, "Total output tokens": 105191, "Average output tokens": 414 }, "MATH-500": { "Score": 46.2, "Pass rate": 0.864, "Cost($)": 1.5994, "Framework": "", "X-shot": "4", "Samples": 500, "All tokens": 340960, "Total input tokens": 241357, "Average input tokens": 483, "Total output tokens": 99603, "Average output tokens": 199 } }, "Qwen2.5-72B-Instruct": { "META": { "Algorithm": "PoT", "LLM": "Qwen2.5-72B-Instruct", "Eval Date": "2025/1/22" }, "gsm8k": { "Score": 92.34, "Pass rate": 0.9939, "Cost($)": 0.7054, "Framework": "", "X-shot": "8", "Samples": 1319, "All tokens": 1251210, "Total input tokens": 1106682, "Average input tokens": 839, "Total output tokens": 144528, "Average output tokens": 110 }, "AQuA": { "Score": 75.2, "Pass rate": 1.0, "Cost($)": 0.1645, "Framework": "", "X-shot": "0", "Samples": 254, "All tokens": 291764, "Total input tokens": 249215, "Average input tokens": 981, "Total output tokens": 42549, "Average output tokens": 168 }, "MATH-500": { "Score": 47.2, "Pass rate": 0.822, "Cost($)": 0.233, "Framework": "", "X-shot": "4", "Samples": 500, "All tokens": 413372, "Total input tokens": 242549, "Average input tokens": 485, "Total output tokens": 170823, "Average output tokens": 342 } }, "Llama-3.3-70B-Instruct": { "META": { "Algorithm": "PoT", "LLM": "Llama-3.3-70B-Instruct", "Eval Date": "2025/1/22" }, "gsm8k": { "Score": 73.09, "Pass rate": 0.7961, "Cost($)": 0.9736, "Framework": "", "X-shot": "8", "Samples": 1319, "All tokens": 1727044, "Total input tokens": 1126025, "Average input tokens": 854, "Total output tokens": 601019, "Average output tokens": 456 }, "AQuA": { "Score": 79.53, "Pass rate": 0.9921, "Cost($)": 0.1746, "Framework": "", "X-shot": "0", "Samples": 254, "All tokens": 309799, "Total input tokens": 240735, "Average input tokens": 948, "Total output tokens": 69064, "Average output tokens": 272 }, "MATH-500": { "Score": 42.6, "Pass rate": 0.802, "Cost($)": 0.2839, "Framework": "", "X-shot": "4", "Samples": 500, "All tokens": 503596, "Total input tokens": 253879, "Average input tokens": 508, "Total output tokens": 249717, "Average output tokens": 499 } }, "Qwen2.5-7B-Instruct": { "META": { "Algorithm": "PoT", "LLM": "Qwen2.5-7B-Instruct", "Eval Date": "2025/1/22" }, "gsm8k": { "Score": 58.83, "Pass rate": 0.7051, "Cost($)": 0.0, "Framework": "", "X-shot": "8", "Samples": 1319, "All tokens": 1362822, "Total input tokens": 1145390, "Average input tokens": 868, "Total output tokens": 217432, "Average output tokens": 165 }, "AQuA": { "Score": 68.11, "Pass rate": 1.0, "Cost($)": 0.0, "Framework": "", "X-shot": "0", "Samples": 254, "All tokens": 313728, "Total input tokens": 264517, "Average input tokens": 1041, "Total output tokens": 49211, "Average output tokens": 194 }, "MATH-500": { "Score": 39.6, "Pass rate": 0.744, "Cost($)": 0.0, "Framework": "", "X-shot": "4", "Samples": 500, "All tokens": 408812, "Total input tokens": 258549, "Average input tokens": 517, "Total output tokens": 150263, "Average output tokens": 301 } }, "Llama-3.1-8B-Instruct": { "META": { "Algorithm": "PoT", "LLM": "Llama-3.1-8B-Instruct", "Eval Date": "2025/1/22" }, "gsm8k": { "Score": 38.67, "Pass rate": 0.5542, "Cost($)": 0.0, "Framework": "", "X-shot": "8", "Samples": 1319, "All tokens": 1391111, "Total input tokens": 1147538, "Average input tokens": 870, "Total output tokens": 243573, "Average output tokens": 185 }, "AQuA": { "Score": 36.61, "Pass rate": 0.9685, "Cost($)": 0.0, "Framework": "", "X-shot": "0", "Samples": 254, "All tokens": 290914, "Total input tokens": 240613, "Average input tokens": 947, "Total output tokens": 50301, "Average output tokens": 198 }, "MATH-500": { "Score": 25.4, "Pass rate": 0.684, "Cost($)": 0.0, "Framework": "", "X-shot": "4", "Samples": 500, "All tokens": 462271, "Total input tokens": 253879, "Average input tokens": 508, "Total output tokens": 208392, "Average output tokens": 417 } }, "Internllm2_5-7B": { "META": { "Algorithm": "PoT", "LLM": "Internllm2_5-7B", "Eval Date": "2025/1/22" }, "gsm8k": { "Score": 38.21, "Pass rate": 0.489, "Cost($)": 0.0, "Framework": "", "X-shot": "8", "Samples": 1319, "All tokens": 1324949, "Total input tokens": 1136843, "Average input tokens": 862, "Total output tokens": 188106, "Average output tokens": 143 }, "AQuA": { "Score": 36.61, "Pass rate": 0.9882, "Cost($)": 0.0, "Framework": "", "X-shot": "0", "Samples": 254, "All tokens": 301962, "Total input tokens": 233505, "Average input tokens": 919, "Total output tokens": 68457, "Average output tokens": 270 }, "MATH-500": { "Score": 15.0, "Pass rate": 0.324, "Cost($)": 0.0, "Framework": "", "X-shot": "4", "Samples": 500, "All tokens": 368709, "Total input tokens": 247883, "Average input tokens": 496, "Total output tokens": 120826, "Average output tokens": 242 } }, "Qwen2-1.5B-Instruct": { "META": { "Algorithm": "PoT", "LLM": "Qwen2-1.5B-Instruct", "Eval Date": "2025/1/22" }, "gsm8k": { "Score": 18.5, "Pass rate": 0.3101, "Cost($)": 0.0, "Framework": "", "X-shot": "8", "Samples": 1319, "All tokens": 1327522, "Total input tokens": 1151528, "Average input tokens": 873, "Total output tokens": 175994, "Average output tokens": 133 }, "AQuA": { "Score": 30.71, "Pass rate": 0.9646, "Cost($)": 0.0, "Framework": "", "X-shot": "0", "Samples": 254, "All tokens": 298475, "Total input tokens": 246560, "Average input tokens": 971, "Total output tokens": 51915, "Average output tokens": 204 }, "MATH-500": { "Score": 0.8, "Pass rate": 0.022, "Cost($)": 0.0, "Framework": "", "X-shot": "4", "Samples": 500, "All tokens": 786870, "Total input tokens": 248509, "Average input tokens": 497, "Total output tokens": 538361, "Average output tokens": 1077 } }, "Qwen2-0.5B-Instruct": { "META": { "Algorithm": "PoT", "LLM": "Qwen2-0.5B-Instruct", "Eval Date": "2025/1/22" }, "gsm8k": { "Score": 9.63, "Pass rate": 0.1691, "Cost($)": 0.0, "Framework": "", "X-shot": "8", "Samples": 1319, "All tokens": 1389135, "Total input tokens": 1151528, "Average input tokens": 873, "Total output tokens": 237607, "Average output tokens": 180 }, "AQuA": { "Score": 17.32, "Pass rate": 0.9213, "Cost($)": 0.0, "Framework": "", "X-shot": "0", "Samples": 254, "All tokens": 322281, "Total input tokens": 258867, "Average input tokens": 1019, "Total output tokens": 63414, "Average output tokens": 250 }, "MATH-500": { "Score": 0.0, "Pass rate": 0.0, "Cost($)": 0.0, "Framework": "", "X-shot": "4", "Samples": 500, "All tokens": 437202, "Total input tokens": 253549, "Average input tokens": 507, "Total output tokens": 183653, "Average output tokens": 367 } }, "deepseek-r1:1.5b": { "META": { "Algorithm": "PoT", "LLM": "deepseek-r1:1.5b", "Eval Date": "2025/2/10" }, "gsm8k": { "Score": 11.9, "Pass rate": 0.1744, "Cost($)": 0.0, "Framework": "", "X-shot": "8", "Samples": 1319, "All tokens": 1954509, "Total input tokens": 1138872, "Average input tokens": 863, "Total output tokens": 815637, "Average output tokens": 618 }, "AQuA": { "Score": 54.72, "Pass rate": 0.9724, "Cost($)": 0.0, "Framework": "", "X-shot": "0", "Samples": 254, "All tokens": 1016647, "Total input tokens": 250690, "Average input tokens": 987, "Total output tokens": 765957, "Average output tokens": 3016 }, "MATH-500": { "Score": 1.0, "Pass rate": 0.016, "Cost($)": 0.0, "Framework": "", "X-shot": "4", "Samples": 500, "All tokens": 1031067, "Total input tokens": 245549, "Average input tokens": 491, "Total output tokens": 785518, "Average output tokens": 1571 } } }, "CoT": { "gpt-3.5-turbo": { "META": { "Algorithm": "CoT", "LLM": "gpt-3.5-turbo", "Eval Date": "2025/1/7" }, "gsm8k": { "Score": 78.7, "Pass rate": 1.0, "Cost($)": 0.6788, "Framework": "", "X-shot": "8", "Samples": 1319, "All tokens": 1088041, "Total input tokens": 953242, "Average input tokens": 723, "Total output tokens": 134799, "Average output tokens": 102 }, "AQuA": { "Score": 61.02, "Pass rate": 0.937, "Cost($)": 0.0957, "Framework": "", "X-shot": "0", "Samples": 254, "All tokens": 80793, "Total input tokens": 25447, "Average input tokens": 100, "Total output tokens": 55346, "Average output tokens": 218 }, "MATH-500": { "Score": 39.8, "Pass rate": 1.0, "Cost($)": 0.3189, "Framework": "", "X-shot": "4", "Samples": 500, "All tokens": 432196, "Total input tokens": 329381, "Average input tokens": 659, "Total output tokens": 102815, "Average output tokens": 206 } }, "Doubao-lite-32k": { "META": { "Algorithm": "CoT", "LLM": "Doubao-lite-32k", "Eval Date": "2025/1/7" }, "gsm8k": { "Score": 89.31, "Pass rate": 1.0, "Cost($)": 0.0558, "Framework": "", "X-shot": "8", "Samples": 1319, "All tokens": 1201820, "Total input tokens": 1042095, "Average input tokens": 790, "Total output tokens": 159725, "Average output tokens": 121 }, "AQuA": { "Score": 82.68, "Pass rate": 0.9724, "Cost($)": 0.0066, "Framework": "", "X-shot": "0", "Samples": 254, "All tokens": 94577, "Total input tokens": 27978, "Average input tokens": 110, "Total output tokens": 66599, "Average output tokens": 262 }, "MATH-500": { "Score": 59.0, "Pass rate": 1.0, "Cost($)": 0.0255, "Framework": "", "X-shot": "4", "Samples": 500, "All tokens": 479941, "Total input tokens": 336370, "Average input tokens": 673, "Total output tokens": 143571, "Average output tokens": 287 } }, "gpt-4o": { "META": { "Algorithm": "CoT", "LLM": "gpt-4o", "Eval Date": "2025/1/22" }, "gsm8k": { "Score": 94.09, "Pass rate": 1.0, "Cost($)": 4.5367, "Framework": "", "X-shot": "8", "Samples": 1319, "All tokens": 1165166, "Total input tokens": 948668, "Average input tokens": 719, "Total output tokens": 216498, "Average output tokens": 164 }, "AQuA": { "Score": 82.68, "Pass rate": 0.9803, "Cost($)": 1.0417, "Framework": "", "X-shot": "0", "Samples": 254, "All tokens": 123017, "Total input tokens": 25123, "Average input tokens": 99, "Total output tokens": 97894, "Average output tokens": 385 }, "MATH-500": { "Score": 68.0, "Pass rate": 1.0, "Cost($)": 3.0569, "Framework": "", "X-shot": "4", "Samples": 500, "All tokens": 552688, "Total input tokens": 329332, "Average input tokens": 659, "Total output tokens": 223356, "Average output tokens": 447 } }, "Qwen2.5-72B-Instruct": { "META": { "Algorithm": "CoT", "LLM": "Qwen2.5-72B-Instruct", "Eval Date": "2025/1/22" }, "gsm8k": { "Score": 92.87, "Pass rate": 1.0, "Cost($)": 0.7195, "Framework": "", "X-shot": "8", "Samples": 1319, "All tokens": 1276252, "Total input tokens": 1005119, "Average input tokens": 762, "Total output tokens": 271133, "Average output tokens": 206 }, "AQuA": { "Score": 86.22, "Pass rate": 0.9921, "Cost($)": 0.0808, "Framework": "", "X-shot": "0", "Samples": 254, "All tokens": 143289, "Total input tokens": 25143, "Average input tokens": 99, "Total output tokens": 118146, "Average output tokens": 465 }, "MATH-500": { "Score": 80.2, "Pass rate": 1.0, "Cost($)": 0.349, "Framework": "", "X-shot": "4", "Samples": 500, "All tokens": 619015, "Total input tokens": 338549, "Average input tokens": 677, "Total output tokens": 280466, "Average output tokens": 561 } }, "Llama-3.3-70B-Instruct": { "META": { "Algorithm": "CoT", "LLM": "Llama-3.3-70B-Instruct", "Eval Date": "2025/1/22" }, "gsm8k": { "Score": 93.93, "Pass rate": 1.0, "Cost($)": 0.687, "Framework": "", "X-shot": "8", "Samples": 1319, "All tokens": 1218665, "Total input tokens": 990168, "Average input tokens": 751, "Total output tokens": 228497, "Average output tokens": 173 }, "AQuA": { "Score": 83.46, "Pass rate": 0.9843, "Cost($)": 0.0927, "Framework": "", "X-shot": "0", "Samples": 254, "All tokens": 164389, "Total input tokens": 32555, "Average input tokens": 128, "Total output tokens": 131834, "Average output tokens": 519 }, "MATH-500": { "Score": 71.2, "Pass rate": 1.0, "Cost($)": 0.3463, "Framework": "", "X-shot": "4", "Samples": 500, "All tokens": 614221, "Total input tokens": 342879, "Average input tokens": 686, "Total output tokens": 271342, "Average output tokens": 543 } }, "Qwen2.5-7B-Instruct": { "META": { "Algorithm": "CoT", "LLM": "Qwen2.5-7B-Instruct", "Eval Date": "2025/1/22" }, "gsm8k": { "Score": 85.67, "Pass rate": 1.0, "Cost($)": 0.0, "Framework": "", "X-shot": "8", "Samples": 1319, "All tokens": 1290805, "Total input tokens": 1046008, "Average input tokens": 793, "Total output tokens": 244797, "Average output tokens": 186 }, "AQuA": { "Score": 80.71, "Pass rate": 0.9961, "Cost($)": 0.0, "Framework": "", "X-shot": "0", "Samples": 254, "All tokens": 149736, "Total input tokens": 33017, "Average input tokens": 130, "Total output tokens": 116719, "Average output tokens": 460 }, "MATH-500": { "Score": 69.8, "Pass rate": 1.0, "Cost($)": 0.0, "Framework": "", "X-shot": "4", "Samples": 500, "All tokens": 617204, "Total input tokens": 354049, "Average input tokens": 708, "Total output tokens": 263155, "Average output tokens": 526 } }, "Llama-3.1-8B-Instruct": { "META": { "Algorithm": "CoT", "LLM": "Llama-3.1-8B-Instruct", "Eval Date": "2025/1/22" }, "gsm8k": { "Score": 75.44, "Pass rate": 0.9992, "Cost($)": 0.0, "Framework": "", "X-shot": "8", "Samples": 1319, "All tokens": 1248329, "Total input tokens": 990168, "Average input tokens": 751, "Total output tokens": 258161, "Average output tokens": 196 }, "AQuA": { "Score": 60.63, "Pass rate": 1.0, "Cost($)": 0.0, "Framework": "", "X-shot": "0", "Samples": 254, "All tokens": 144435, "Total input tokens": 32555, "Average input tokens": 128, "Total output tokens": 111880, "Average output tokens": 440 }, "MATH-500": { "Score": 25.8, "Pass rate": 1.0, "Cost($)": 0.0, "Framework": "", "X-shot": "4", "Samples": 500, "All tokens": 625568, "Total input tokens": 342879, "Average input tokens": 686, "Total output tokens": 282689, "Average output tokens": 565 } }, "Internllm2_5-7B": { "META": { "Algorithm": "CoT", "LLM": "Internllm2_5-7B", "Eval Date": "2025/1/22" }, "gsm8k": { "Score": 77.71, "Pass rate": 0.997, "Cost($)": 0.0, "Framework": "", "X-shot": "8", "Samples": 1319, "All tokens": 1202163, "Total input tokens": 968163, "Average input tokens": 734, "Total output tokens": 234000, "Average output tokens": 177 }, "AQuA": { "Score": 52.76, "Pass rate": 0.8937, "Cost($)": 0.0, "Framework": "", "X-shot": "0", "Samples": 254, "All tokens": 127520, "Total input tokens": 26610, "Average input tokens": 105, "Total output tokens": 100910, "Average output tokens": 397 }, "MATH-500": { "Score": 46.6, "Pass rate": 1.0, "Cost($)": 0.0, "Framework": "", "X-shot": "4", "Samples": 500, "All tokens": 546774, "Total input tokens": 332883, "Average input tokens": 666, "Total output tokens": 213891, "Average output tokens": 428 } }, "Qwen2-1.5B-Instruct": { "META": { "Algorithm": "CoT", "LLM": "Qwen2-1.5B-Instruct", "Eval Date": "2025/1/22" }, "gsm8k": { "Score": 55.5, "Pass rate": 1.0, "Cost($)": 0.0, "Framework": "", "X-shot": "8", "Samples": 1319, "All tokens": 1218525, "Total input tokens": 1032818, "Average input tokens": 783, "Total output tokens": 185707, "Average output tokens": 141 }, "AQuA": { "Score": 40.55, "Pass rate": 0.9882, "Cost($)": 0.0, "Framework": "", "X-shot": "0", "Samples": 254, "All tokens": 110040, "Total input tokens": 30477, "Average input tokens": 120, "Total output tokens": 79563, "Average output tokens": 313 }, "MATH-500": { "Score": 15.2, "Pass rate": 1.0, "Cost($)": 0.0, "Framework": "", "X-shot": "4", "Samples": 500, "All tokens": 536377, "Total input tokens": 349049, "Average input tokens": 698, "Total output tokens": 187328, "Average output tokens": 375 } }, "Qwen2-0.5B-Instruct": { "META": { "Algorithm": "CoT", "LLM": "Qwen2-0.5B-Instruct", "Eval Date": "2025/1/22" }, "gsm8k": { "Score": 35.94, "Pass rate": 0.9992, "Cost($)": 0.0, "Framework": "", "X-shot": "8", "Samples": 1319, "All tokens": 1223459, "Total input tokens": 1032818, "Average input tokens": 783, "Total output tokens": 190641, "Average output tokens": 145 }, "AQuA": { "Score": 33.07, "Pass rate": 0.9882, "Cost($)": 0.0, "Framework": "", "X-shot": "0", "Samples": 254, "All tokens": 117339, "Total input tokens": 30477, "Average input tokens": 120, "Total output tokens": 86862, "Average output tokens": 342 }, "MATH-500": { "Score": 6.2, "Pass rate": 1.0, "Cost($)": 0.0, "Framework": "", "X-shot": "4", "Samples": 500, "All tokens": 549188, "Total input tokens": 349049, "Average input tokens": 698, "Total output tokens": 200139, "Average output tokens": 400 } }, "deepseek-r1:1.5b": { "META": { "Algorithm": "CoT", "LLM": "deepseek-r1:1.5b", "Eval Date": "2025/1/23" }, "gsm8k": { "Score": 70.66, "Pass rate": 0.9977, "Cost($)": 0.0, "Framework": "", "X-shot": "8", "Samples": 1319, "All tokens": 2090625, "Total input tokens": 1011714, "Average input tokens": 767, "Total output tokens": 1078911, "Average output tokens": 818 }, "AQuA": { "Score": 71.65, "Pass rate": 0.9685, "Cost($)": 0.0, "Framework": "", "X-shot": "0", "Samples": 254, "All tokens": 333072, "Total input tokens": 26413, "Average input tokens": 104, "Total output tokens": 306659, "Average output tokens": 1207 }, "MATH-500": { "Score": 49.4, "Pass rate": 1.0, "Cost($)": 0.0, "Framework": "", "X-shot": "4", "Samples": 500, "All tokens": 1199129, "Total input tokens": 341549, "Average input tokens": 683, "Total output tokens": 857580, "Average output tokens": 1715 } } }, "SC-CoT": { "gpt-3.5-turbo": { "META": { "Algorithm": "SC-CoT", "LLM": "gpt-3.5-turbo", "Eval Date": "2025/1/7" }, "gsm8k": { "Score": 79.91, "Pass rate": 0.9992, "Cost($)": 3.3938, "Framework": "", "X-shot": "8", "Samples": 1319, "All tokens": 4089612, "Total input tokens": 2740652, "Average input tokens": 2078, "Total output tokens": 1348960, "Average output tokens": 1023 }, "AQuA": { "Score": 66.14, "Pass rate": 0.9921, "Cost($)": 0.7888, "Framework": "", "X-shot": "0", "Samples": 254, "All tokens": 847335, "Total input tokens": 482192, "Average input tokens": 1898, "Total output tokens": 365143, "Average output tokens": 1438 }, "MATH-500": { "Score": 28.8, "Pass rate": 1.0, "Cost($)": 1.9764, "Framework": "", "X-shot": "4", "Samples": 500, "All tokens": 2238812, "Total input tokens": 1381818, "Average input tokens": 2764, "Total output tokens": 856994, "Average output tokens": 1714 } }, "Doubao-lite-32k": { "META": { "Algorithm": "SC-CoT", "LLM": "Doubao-lite-32k", "Eval Date": "2025/1/7" }, "gsm8k": { "Score": 87.26, "Pass rate": 0.9992, "Cost($)": 0.2083, "Framework": "", "X-shot": "8", "Samples": 1319, "All tokens": 3888813, "Total input tokens": 2691714, "Average input tokens": 2041, "Total output tokens": 1197099, "Average output tokens": 908 }, "AQuA": { "Score": 81.1, "Pass rate": 0.9724, "Cost($)": 0.0519, "Framework": "", "X-shot": "0", "Samples": 254, "All tokens": 885986, "Total input tokens": 503751, "Average input tokens": 1983, "Total output tokens": 382235, "Average output tokens": 1505 }, "MATH-500": { "Score": 49.2, "Pass rate": 1.0, "Cost($)": 0.1406, "Framework": "", "X-shot": "4", "Samples": 500, "All tokens": 2470810, "Total input tokens": 1507651, "Average input tokens": 3015, "Total output tokens": 963159, "Average output tokens": 1926 } }, "gpt-4o": { "META": { "Algorithm": "SC-CoT", "LLM": "gpt-4o", "Eval Date": "2025/1/22" }, "gsm8k": { "Score": 90.3, "Pass rate": 0.9992, "Cost($)": 31.0542, "Framework": "", "X-shot": "8", "Samples": 1319, "All tokens": 5798173, "Total input tokens": 3590336, "Average input tokens": 2722, "Total output tokens": 2207837, "Average output tokens": 1674 }, "AQuA": { "Score": 86.61, "Pass rate": 0.9882, "Cost($)": 8.1485, "Framework": "", "X-shot": "0", "Samples": 254, "All tokens": 1373206, "Total input tokens": 744478, "Average input tokens": 2931, "Total output tokens": 628728, "Average output tokens": 2475 }, "MATH-500": { "Score": 34.4, "Pass rate": 1.0, "Cost($)": 19.6538, "Framework": "", "X-shot": "4", "Samples": 500, "All tokens": 3455323, "Total input tokens": 1986584, "Average input tokens": 3973, "Total output tokens": 1468739, "Average output tokens": 2937 } }, "Qwen2.5-72B-Instruct": { "META": { "Algorithm": "SC-CoT", "LLM": "Qwen2.5-72B-Instruct", "Eval Date": "2025/1/22" }, "gsm8k": { "Score": 93.86, "Pass rate": 1.0, "Cost($)": 5.9858, "Framework": "", "X-shot": "8", "Samples": 1319, "All tokens": 10618008, "Total input tokens": 8136223, "Average input tokens": 6168, "Total output tokens": 2481785, "Average output tokens": 1882 }, "AQuA": { "Score": 85.04, "Pass rate": 0.9921, "Cost($)": 1.0348, "Framework": "", "X-shot": "0", "Samples": 254, "All tokens": 1835669, "Total input tokens": 1051218, "Average input tokens": 4139, "Total output tokens": 784451, "Average output tokens": 3088 }, "MATH-500": { "Score": 74.0, "Pass rate": 1.0, "Cost($)": 3.1556, "Framework": "", "X-shot": "4", "Samples": 500, "All tokens": 5597513, "Total input tokens": 3823997, "Average input tokens": 7648, "Total output tokens": 1773516, "Average output tokens": 3547 } }, "Llama-3.3-70B-Instruct": { "META": { "Algorithm": "SC-CoT", "LLM": "Llama-3.3-70B-Instruct", "Eval Date": "2025/1/22" }, "gsm8k": { "Score": 95.07, "Pass rate": 1.0, "Cost($)": 6.2005, "Framework": "", "X-shot": "8", "Samples": 1319, "All tokens": 10998794, "Total input tokens": 8413717, "Average input tokens": 6379, "Total output tokens": 2585077, "Average output tokens": 1960 }, "AQuA": { "Score": 82.28, "Pass rate": 0.9921, "Cost($)": 1.0756, "Framework": "", "X-shot": "0", "Samples": 254, "All tokens": 1907924, "Total input tokens": 1135251, "Average input tokens": 4469, "Total output tokens": 772673, "Average output tokens": 3042 }, "MATH-500": { "Score": 74.2, "Pass rate": 1.0, "Cost($)": 3.2239, "Framework": "", "X-shot": "4", "Samples": 500, "All tokens": 5718739, "Total input tokens": 3959492, "Average input tokens": 7919, "Total output tokens": 1759247, "Average output tokens": 3518 } }, "Qwen2.5-7B-Instruct": { "META": { "Algorithm": "SC-CoT", "LLM": "Qwen2.5-7B-Instruct", "Eval Date": "2025/1/22" }, "gsm8k": { "Score": 91.13, "Pass rate": 1.0, "Cost($)": 0.0, "Framework": "", "X-shot": "8", "Samples": 1319, "All tokens": 11140985, "Total input tokens": 8586888, "Average input tokens": 6510, "Total output tokens": 2554097, "Average output tokens": 1936 }, "AQuA": { "Score": 79.92, "Pass rate": 1.0, "Cost($)": 0.0, "Framework": "", "X-shot": "0", "Samples": 254, "All tokens": 1845332, "Total input tokens": 1098280, "Average input tokens": 4324, "Total output tokens": 747052, "Average output tokens": 2941 }, "MATH-500": { "Score": 67.0, "Pass rate": 1.0, "Cost($)": 0.0, "Framework": "", "X-shot": "4", "Samples": 500, "All tokens": 5451484, "Total input tokens": 3833751, "Average input tokens": 7668, "Total output tokens": 1617733, "Average output tokens": 3235 } }, "Llama-3.1-8B-Instruct": { "META": { "Algorithm": "SC-CoT", "LLM": "Llama-3.1-8B-Instruct", "Eval Date": "2025/1/22" }, "gsm8k": { "Score": 73.46, "Pass rate": 0.9955, "Cost($)": 0.0, "Framework": "", "X-shot": "8", "Samples": 1319, "All tokens": 11778716, "Total input tokens": 8630514, "Average input tokens": 6543, "Total output tokens": 3148202, "Average output tokens": 2387 }, "AQuA": { "Score": 59.45, "Pass rate": 0.9724, "Cost($)": 0.0, "Framework": "", "X-shot": "0", "Samples": 254, "All tokens": 1651333, "Total input tokens": 971003, "Average input tokens": 3823, "Total output tokens": 680330, "Average output tokens": 2678 }, "MATH-500": { "Score": 30.2, "Pass rate": 1.0, "Cost($)": 0.0, "Framework": "", "X-shot": "4", "Samples": 500, "All tokens": 5034937, "Total input tokens": 3546673, "Average input tokens": 7093, "Total output tokens": 1488264, "Average output tokens": 2977 } }, "Internllm2_5-7B": { "META": { "Algorithm": "SC-CoT", "LLM": "Internllm2_5-7B", "Eval Date": "2025/1/22" }, "gsm8k": { "Score": 48.22, "Pass rate": 0.9841, "Cost($)": 0.0, "Framework": "", "X-shot": "8", "Samples": 1319, "All tokens": 14526431, "Total input tokens": 10678792, "Average input tokens": 8096, "Total output tokens": 3847639, "Average output tokens": 2917 }, "AQuA": { "Score": 39.37, "Pass rate": 0.9803, "Cost($)": 0.0, "Framework": "", "X-shot": "0", "Samples": 254, "All tokens": 2296222, "Total input tokens": 1420494, "Average input tokens": 5592, "Total output tokens": 875728, "Average output tokens": 3448 }, "MATH-500": { "Score": 9.8, "Pass rate": 1.0, "Cost($)": 0.0, "Framework": "", "X-shot": "4", "Samples": 500, "All tokens": 5838466, "Total input tokens": 4193296, "Average input tokens": 8387, "Total output tokens": 1645170, "Average output tokens": 3290 } }, "Qwen2-1.5B-Instruct": { "META": { "Algorithm": "SC-CoT", "LLM": "Qwen2-1.5B-Instruct", "Eval Date": "2025/1/22" }, "gsm8k": { "Score": 11.75, "Pass rate": 0.9189, "Cost($)": 0.0, "Framework": "", "X-shot": "8", "Samples": 1319, "All tokens": 12411942, "Total input tokens": 9066115, "Average input tokens": 6873, "Total output tokens": 3345827, "Average output tokens": 2537 }, "AQuA": { "Score": 23.62, "Pass rate": 0.9646, "Cost($)": 0.0, "Framework": "", "X-shot": "0", "Samples": 254, "All tokens": 1775335, "Total input tokens": 1034362, "Average input tokens": 4072, "Total output tokens": 740973, "Average output tokens": 2917 }, "MATH-500": { "Score": 3.8, "Pass rate": 0.99, "Cost($)": 0.0, "Framework": "", "X-shot": "4", "Samples": 500, "All tokens": 5569442, "Total input tokens": 3832429, "Average input tokens": 7665, "Total output tokens": 1737013, "Average output tokens": 3474 } }, "Qwen2-0.5B-Instruct": { "META": { "Algorithm": "SC-CoT", "LLM": "Qwen2-0.5B-Instruct", "Eval Date": "2025/1/22" }, "gsm8k": { "Score": 1.67, "Pass rate": 0.9469, "Cost($)": 0.0, "Framework": "", "X-shot": "8", "Samples": 1319, "All tokens": 16465720, "Total input tokens": 11019864, "Average input tokens": 8355, "Total output tokens": 5445856, "Average output tokens": 4129 }, "AQuA": { "Score": 22.83, "Pass rate": 0.9724, "Cost($)": 0.0, "Framework": "", "X-shot": "0", "Samples": 254, "All tokens": 2215091, "Total input tokens": 1246929, "Average input tokens": 4909, "Total output tokens": 968162, "Average output tokens": 3812 }, "MATH-500": { "Score": 0.8, "Pass rate": 1.0, "Cost($)": 0.0, "Framework": "", "X-shot": "4", "Samples": 500, "All tokens": 6862056, "Total input tokens": 4448663, "Average input tokens": 8897, "Total output tokens": 2413393, "Average output tokens": 4827 } }, "deepseek-r1:1.5b": { "META": { "Algorithm": "SC-CoT", "LLM": "deepseek-r1:1.5b", "Eval Date": "2025/2/10" }, "gsm8k": { "Score": 55.34, "Pass rate": 0.997, "Cost($)": 0.0, "Framework": "", "X-shot": "8", "Samples": 1319, "All tokens": 25785865, "Total input tokens": 14540096, "Average input tokens": 11024, "Total output tokens": 11245769, "Average output tokens": 8526 }, "AQuA": { "Score": 59.06, "Pass rate": 0.9685, "Cost($)": 0.0, "Framework": "", "X-shot": "0", "Samples": 254, "All tokens": 5802711, "Total input tokens": 2547772, "Average input tokens": 10031, "Total output tokens": 3254939, "Average output tokens": 12815 }, "MATH-500": { "Score": 38.0, "Pass rate": 1.0, "Cost($)": 0.0, "Framework": "", "X-shot": "4", "Samples": 500, "All tokens": 14742109, "Total input tokens": 7080559, "Average input tokens": 14161, "Total output tokens": 7661550, "Average output tokens": 15323 } } }, "ToT": { "gpt-3.5-turbo": { "META": { "Algorithm": "ToT", "LLM": "gpt-3.5-turbo", "Eval Date": "2025/1/7" }, "gsm8k": { "Score": 67.93, "Pass rate": 0.997, "Cost($)": 9.1707, "Framework": "", "X-shot": "8", "Samples": 1319, "All tokens": 16727175, "Total input tokens": 15920037, "Average input tokens": 12070, "Total output tokens": 807138, "Average output tokens": 612 }, "AQuA": { "Score": 57.09, "Pass rate": 0.9961, "Cost($)": 1.1513, "Framework": "", "X-shot": "0", "Samples": 254, "All tokens": 2001396, "Total input tokens": 1850767, "Average input tokens": 7286, "Total output tokens": 150629, "Average output tokens": 593 }, "MATH-500": { "Score": 9.8, "Pass rate": 1.0, "Cost($)": 5.2914, "Framework": "", "X-shot": "4", "Samples": 500, "All tokens": 10001767, "Total input tokens": 9711244, "Average input tokens": 19422, "Total output tokens": 290523, "Average output tokens": 581 } }, "Doubao-lite-32k": { "META": { "Algorithm": "ToT", "LLM": "Doubao-lite-32k", "Eval Date": "2025/1/7" }, "gsm8k": { "Score": 37.83, "Pass rate": 0.8734, "Cost($)": 0.8739, "Framework": "", "X-shot": "8", "Samples": 1319, "All tokens": 20274349, "Total input tokens": 19208597, "Average input tokens": 14563, "Total output tokens": 1065752, "Average output tokens": 808 }, "AQuA": { "Score": 45.28, "Pass rate": 0.7402, "Cost($)": 0.0881, "Framework": "", "X-shot": "0", "Samples": 254, "All tokens": 2000550, "Total input tokens": 1850249, "Average input tokens": 7284, "Total output tokens": 150301, "Average output tokens": 592 }, "MATH-500": { "Score": 1.2, "Pass rate": 0.942, "Cost($)": 0.2371, "Framework": "", "X-shot": "4", "Samples": 500, "All tokens": 5564500, "Total input tokens": 5338500, "Average input tokens": 10677, "Total output tokens": 226000, "Average output tokens": 452 } }, "gpt-4o": { "META": { "Algorithm": "ToT", "LLM": "gpt-4o", "Eval Date": "2025/1/22" }, "gsm8k": { "Score": 91.13, "Pass rate": 1.0, "Cost($)": 86.8581, "Framework": "", "X-shot": "8", "Samples": 1319, "All tokens": 30769735, "Total input tokens": 29445237, "Average input tokens": 22324, "Total output tokens": 1324498, "Average output tokens": 1004 }, "AQuA": { "Score": 81.5, "Pass rate": 0.9921, "Cost($)": 8.5295, "Framework": "", "X-shot": "0", "Samples": 254, "All tokens": 2613607, "Total input tokens": 2347538, "Average input tokens": 9242, "Total output tokens": 266069, "Average output tokens": 1048 }, "MATH-500": { "Score": 3.2, "Pass rate": 1.0, "Cost($)": 40.8094, "Framework": "", "X-shot": "4", "Samples": 500, "All tokens": 15242432, "Total input tokens": 14881985, "Average input tokens": 29764, "Total output tokens": 360447, "Average output tokens": 721 } }, "Qwen2.5-72B-Instruct": { "META": { "Algorithm": "ToT", "LLM": "Qwen2.5-72B-Instruct", "Eval Date": "2025/1/22" }, "gsm8k": { "Score": 88.88, "Pass rate": 1.0, "Cost($)": 23.5911, "Framework": "", "X-shot": "8", "Samples": 1319, "All tokens": 41847148, "Total input tokens": 40435361, "Average input tokens": 30656, "Total output tokens": 1411787, "Average output tokens": 1070 }, "AQuA": { "Score": 81.1, "Pass rate": 0.9921, "Cost($)": 3.7389, "Framework": "", "X-shot": "0", "Samples": 254, "All tokens": 6632255, "Total input tokens": 6371642, "Average input tokens": 25085, "Total output tokens": 260613, "Average output tokens": 1026 }, "MATH-500": { "Score": 10.8, "Pass rate": 1.0, "Cost($)": 9.0421, "Framework": "", "X-shot": "4", "Samples": 500, "All tokens": 16039361, "Total input tokens": 15657730, "Average input tokens": 31315, "Total output tokens": 381631, "Average output tokens": 763 } }, "Llama-3.3-70B-Instruct": { "META": { "Algorithm": "ToT", "LLM": "Llama-3.3-70B-Instruct", "Eval Date": "2025/1/22" }, "gsm8k": { "Score": 91.89, "Pass rate": 1.0, "Cost($)": 20.8753, "Framework": "", "X-shot": "8", "Samples": 1319, "All tokens": 37029687, "Total input tokens": 35096810, "Average input tokens": 26609, "Total output tokens": 1932877, "Average output tokens": 1465 }, "AQuA": { "Score": 83.07, "Pass rate": 1.0, "Cost($)": 2.9404, "Framework": "", "X-shot": "0", "Samples": 254, "All tokens": 5215848, "Total input tokens": 4735188, "Average input tokens": 18642, "Total output tokens": 480660, "Average output tokens": 1892 }, "MATH-500": { "Score": 1.4, "Pass rate": 0.698, "Cost($)": 8.2699, "Framework": "", "X-shot": "4", "Samples": 500, "All tokens": 14669500, "Total input tokens": 14099500, "Average input tokens": 28199, "Total output tokens": 570000, "Average output tokens": 1140 } }, "Qwen2.5-7B-Instruct": { "META": { "Algorithm": "ToT", "LLM": "Qwen2.5-7B-Instruct", "Eval Date": "2025/1/22" }, "gsm8k": { "Score": 72.21, "Pass rate": 0.9901, "Cost($)": 0.0, "Framework": "", "X-shot": "8", "Samples": 1319, "All tokens": 31657319, "Total input tokens": 20196528, "Average input tokens": 15312, "Total output tokens": 11460791, "Average output tokens": 8689 }, "AQuA": { "Score": 53.94, "Pass rate": 1.0, "Cost($)": 0.0, "Framework": "", "X-shot": "0", "Samples": 254, "All tokens": 8602682, "Total input tokens": 8224468, "Average input tokens": 32380, "Total output tokens": 378214, "Average output tokens": 1489 }, "MATH-500": { "Score": 1.4, "Pass rate": 0.916, "Cost($)": 0.0, "Framework": "", "X-shot": "4", "Samples": 500, "All tokens": 10167500, "Total input tokens": 9749000, "Average input tokens": 19498, "Total output tokens": 418500, "Average output tokens": 837 } }, "Llama-3.1-8B-Instruct": { "META": { "Algorithm": "ToT", "LLM": "Llama-3.1-8B-Instruct", "Eval Date": "2025/1/22" }, "gsm8k": { "Score": 65.05, "Pass rate": 0.9196, "Cost($)": 0.0, "Framework": "", "X-shot": "8", "Samples": 1319, "All tokens": 16432102, "Total input tokens": 15554967, "Average input tokens": 11793, "Total output tokens": 877135, "Average output tokens": 665 }, "AQuA": { "Score": 59.06, "Pass rate": 1.0, "Cost($)": 0.0, "Framework": "", "X-shot": "0", "Samples": 254, "All tokens": 5739684, "Total input tokens": 4896222, "Average input tokens": 19276, "Total output tokens": 843462, "Average output tokens": 3321 }, "MATH-500": { "Score": 1.8, "Pass rate": 0.908, "Cost($)": 0.0, "Framework": "", "X-shot": "4", "Samples": 500, "All tokens": 9035000, "Total input tokens": 7729000, "Average input tokens": 15458, "Total output tokens": 1306000, "Average output tokens": 2612 } }, "Internllm2_5-7B": { "META": { "Algorithm": "ToT", "LLM": "Internllm2_5-7B", "Eval Date": "2025/1/22" }, "gsm8k": { "Score": 20.85, "Pass rate": 0.7013, "Cost($)": 0.0, "Framework": "", "X-shot": "8", "Samples": 1319, "All tokens": 13178129, "Total input tokens": 11768118, "Average input tokens": 8922, "Total output tokens": 1410011, "Average output tokens": 1069 }, "AQuA": { "Score": 35.83, "Pass rate": 0.9961, "Cost($)": 0.0, "Framework": "", "X-shot": "0", "Samples": 254, "All tokens": 4734560, "Total input tokens": 4263136, "Average input tokens": 16784, "Total output tokens": 471424, "Average output tokens": 1856 }, "MATH-500": { "Score": 0.2, "Pass rate": 0.99, "Cost($)": 0.0, "Framework": "", "X-shot": "4", "Samples": 500, "All tokens": 8350500, "Total input tokens": 7515000, "Average input tokens": 15030, "Total output tokens": 835500, "Average output tokens": 1671 } }, "Qwen2-1.5B-Instruct": { "META": { "Algorithm": "ToT", "LLM": "Qwen2-1.5B-Instruct", "Eval Date": "2025/1/22" }, "gsm8k": { "Score": 19.64, "Pass rate": 0.7726, "Cost($)": 0.0, "Framework": "", "X-shot": "8", "Samples": 1319, "All tokens": 12758687, "Total input tokens": 12124248, "Average input tokens": 9192, "Total output tokens": 634439, "Average output tokens": 481 }, "AQuA": { "Score": 31.5, "Pass rate": 0.9882, "Cost($)": 0.0, "Framework": "", "X-shot": "0", "Samples": 254, "All tokens": 6250702, "Total input tokens": 6058022, "Average input tokens": 23850, "Total output tokens": 192680, "Average output tokens": 759 }, "MATH-500": { "Score": 0.8, "Pass rate": 0.972, "Cost($)": 0.0, "Framework": "", "X-shot": "4", "Samples": 500, "All tokens": 4535000, "Total input tokens": 4408000, "Average input tokens": 8816, "Total output tokens": 127000, "Average output tokens": 254 } }, "Qwen2-0.5B-Instruct": { "META": { "Algorithm": "ToT", "LLM": "Qwen2-0.5B-Instruct", "Eval Date": "2025/1/22" }, "gsm8k": { "Score": 0, "Pass rate": 0.0, "Cost($)": 0.0, "Framework": "", "X-shot": "8", "Samples": 1319, "All tokens": 0, "Total input tokens": 0, "Average input tokens": 0, "Total output tokens": 0, "Average output tokens": 0 }, "AQuA": { "Score": 29.92, "Pass rate": 1.0, "Cost($)": 0.0, "Framework": "", "X-shot": "0", "Samples": 254, "All tokens": 8700281, "Total input tokens": 8100085, "Average input tokens": 31890, "Total output tokens": 600196, "Average output tokens": 2363 }, "MATH-500": { "Score": 0.0, "Pass rate": 0.962, "Cost($)": 0.0, "Framework": "", "X-shot": "4", "Samples": 500, "All tokens": 5996500, "Total input tokens": 5590500, "Average input tokens": 11181, "Total output tokens": 406000, "Average output tokens": 812 } }, "deepseek-r1:1.5b": { "META": { "Algorithm": "ToT", "LLM": "deepseek-r1:1.5b", "Eval Date": "2025/2/10" }, "gsm8k": { "Score": 23.12, "Pass rate": 0.7248, "Cost($)": 0.0, "Framework": "", "X-shot": "8", "Samples": 1319, "All tokens": 3421486, "Total input tokens": 2738244, "Average input tokens": 2076, "Total output tokens": 683242, "Average output tokens": 518 }, "AQuA": { "Score": 24.8, "Pass rate": 0.5551, "Cost($)": 0.0, "Framework": "", "X-shot": "0", "Samples": 254, "All tokens": 794512, "Total input tokens": 605028, "Average input tokens": 2382, "Total output tokens": 189484, "Average output tokens": 746 }, "MATH-500": { "Score": 0.4, "Pass rate": 0.716, "Cost($)": 0.0, "Framework": "", "X-shot": "4", "Samples": 500, "All tokens": 1941500, "Total input tokens": 1831000, "Average input tokens": 3662, "Total output tokens": 110500, "Average output tokens": 221 } } } } }