Spaces:
Running
Running
{ | |
"time": "2025-02-11 13:22:59", | |
"results": { | |
"IO": { | |
"gpt-3.5-turbo": { | |
"META": { | |
"Algorithm": "IO", | |
"LLM": "gpt-3.5-turbo", | |
"Eval Date": "2025/1/7" | |
}, | |
"gsm8k": { | |
"Score": 37.83, | |
"Pass rate": 0.9992, | |
"Cost($)": 0.3328, | |
"Framework": "", | |
"X-shot": "8", | |
"Samples": 1319, | |
"All tokens": 586553, | |
"Total input tokens": 546990, | |
"Average input tokens": 415, | |
"Total output tokens": 39563, | |
"Average output tokens": 30 | |
}, | |
"AQuA": { | |
"Score": 38.98, | |
"Pass rate": 1.0, | |
"Cost($)": 0.038, | |
"Framework": "", | |
"X-shot": "0", | |
"Samples": 254, | |
"All tokens": 42471, | |
"Total input tokens": 25701, | |
"Average input tokens": 101, | |
"Total output tokens": 16770, | |
"Average output tokens": 66 | |
}, | |
"MATH-500": { | |
"Score": 17.2, | |
"Pass rate": 1.0, | |
"Cost($)": 0.2436, | |
"Framework": "", | |
"X-shot": "4", | |
"Samples": 500, | |
"All tokens": 265625, | |
"Total input tokens": 154881, | |
"Average input tokens": 310, | |
"Total output tokens": 110744, | |
"Average output tokens": 221 | |
} | |
}, | |
"Doubao-lite-32k": { | |
"META": { | |
"Algorithm": "IO", | |
"LLM": "Doubao-lite-32k", | |
"Eval Date": "2025/1/7" | |
}, | |
"gsm8k": { | |
"Score": 72.02, | |
"Pass rate": 0.9992, | |
"Cost($)": 0.0354, | |
"Framework": "", | |
"X-shot": "8", | |
"Samples": 1319, | |
"All tokens": 740483, | |
"Total input tokens": 617377, | |
"Average input tokens": 468, | |
"Total output tokens": 123106, | |
"Average output tokens": 93 | |
}, | |
"AQuA": { | |
"Score": 79.13, | |
"Pass rate": 1.0, | |
"Cost($)": 0.0058, | |
"Framework": "", | |
"X-shot": "0", | |
"Samples": 254, | |
"All tokens": 87742, | |
"Total input tokens": 33058, | |
"Average input tokens": 130, | |
"Total output tokens": 54684, | |
"Average output tokens": 215 | |
}, | |
"MATH-500": { | |
"Score": 37.4, | |
"Pass rate": 1.0, | |
"Cost($)": 0.0187, | |
"Framework": "", | |
"X-shot": "4", | |
"Samples": 500, | |
"All tokens": 311730, | |
"Total input tokens": 166870, | |
"Average input tokens": 334, | |
"Total output tokens": 144860, | |
"Average output tokens": 290 | |
} | |
}, | |
"gpt-4o": { | |
"META": { | |
"Algorithm": "IO", | |
"LLM": "gpt-4o", | |
"Eval Date": "2025/1/22" | |
}, | |
"gsm8k": { | |
"Score": 88.4, | |
"Pass rate": 1.0, | |
"Cost($)": 3.3463, | |
"Framework": "", | |
"X-shot": "8", | |
"Samples": 1319, | |
"All tokens": 741446, | |
"Total input tokens": 542416, | |
"Average input tokens": 411, | |
"Total output tokens": 199030, | |
"Average output tokens": 151 | |
}, | |
"AQuA": { | |
"Score": 75.59, | |
"Pass rate": 0.9724, | |
"Cost($)": 1.1453, | |
"Framework": "", | |
"X-shot": "0", | |
"Samples": 254, | |
"All tokens": 133752, | |
"Total input tokens": 25631, | |
"Average input tokens": 101, | |
"Total output tokens": 108121, | |
"Average output tokens": 426 | |
}, | |
"MATH-500": { | |
"Score": 41.8, | |
"Pass rate": 1.0, | |
"Cost($)": 2.7907, | |
"Framework": "", | |
"X-shot": "4", | |
"Samples": 500, | |
"All tokens": 394447, | |
"Total input tokens": 153832, | |
"Average input tokens": 308, | |
"Total output tokens": 240615, | |
"Average output tokens": 481 | |
} | |
}, | |
"Qwen2.5-72B-Instruct": { | |
"META": { | |
"Algorithm": "IO", | |
"LLM": "Qwen2.5-72B-Instruct", | |
"Eval Date": "2025/1/22" | |
}, | |
"gsm8k": { | |
"Score": 86.58, | |
"Pass rate": 1.0, | |
"Cost($)": 0.4899, | |
"Framework": "", | |
"X-shot": "8", | |
"Samples": 1319, | |
"All tokens": 869060, | |
"Total input tokens": 555340, | |
"Average input tokens": 421, | |
"Total output tokens": 313720, | |
"Average output tokens": 238 | |
}, | |
"AQuA": { | |
"Score": 84.25, | |
"Pass rate": 0.9961, | |
"Cost($)": 0.0742, | |
"Framework": "", | |
"X-shot": "0", | |
"Samples": 254, | |
"All tokens": 131604, | |
"Total input tokens": 25397, | |
"Average input tokens": 100, | |
"Total output tokens": 106207, | |
"Average output tokens": 418 | |
}, | |
"MATH-500": { | |
"Score": 70.2, | |
"Pass rate": 1.0, | |
"Cost($)": 0.2506, | |
"Framework": "", | |
"X-shot": "4", | |
"Samples": 500, | |
"All tokens": 444591, | |
"Total input tokens": 169549, | |
"Average input tokens": 339, | |
"Total output tokens": 275042, | |
"Average output tokens": 550 | |
} | |
}, | |
"Llama-3.3-70B-Instruct": { | |
"META": { | |
"Algorithm": "IO", | |
"LLM": "Llama-3.3-70B-Instruct", | |
"Eval Date": "2025/1/22" | |
}, | |
"gsm8k": { | |
"Score": 92.27, | |
"Pass rate": 1.0, | |
"Cost($)": 0.4709, | |
"Framework": "", | |
"X-shot": "8", | |
"Samples": 1319, | |
"All tokens": 835275, | |
"Total input tokens": 583916, | |
"Average input tokens": 443, | |
"Total output tokens": 251359, | |
"Average output tokens": 191 | |
}, | |
"AQuA": { | |
"Score": 82.68, | |
"Pass rate": 0.9921, | |
"Cost($)": 0.0798, | |
"Framework": "", | |
"X-shot": "0", | |
"Samples": 254, | |
"All tokens": 141567, | |
"Total input tokens": 32809, | |
"Average input tokens": 129, | |
"Total output tokens": 108758, | |
"Average output tokens": 428 | |
}, | |
"MATH-500": { | |
"Score": 69.4, | |
"Pass rate": 1.0, | |
"Cost($)": 0.2386, | |
"Framework": "", | |
"X-shot": "4", | |
"Samples": 500, | |
"All tokens": 423216, | |
"Total input tokens": 155879, | |
"Average input tokens": 312, | |
"Total output tokens": 267337, | |
"Average output tokens": 535 | |
} | |
}, | |
"Qwen2.5-7B-Instruct": { | |
"META": { | |
"Algorithm": "IO", | |
"LLM": "Qwen2.5-7B-Instruct", | |
"Eval Date": "2025/1/22" | |
}, | |
"gsm8k": { | |
"Score": 57.24, | |
"Pass rate": 1.0, | |
"Cost($)": 0.0, | |
"Framework": "", | |
"X-shot": "8", | |
"Samples": 1319, | |
"All tokens": 887913, | |
"Total input tokens": 596229, | |
"Average input tokens": 452, | |
"Total output tokens": 291684, | |
"Average output tokens": 221 | |
}, | |
"AQuA": { | |
"Score": 78.74, | |
"Pass rate": 0.9843, | |
"Cost($)": 0.0, | |
"Framework": "", | |
"X-shot": "0", | |
"Samples": 254, | |
"All tokens": 137771, | |
"Total input tokens": 33271, | |
"Average input tokens": 131, | |
"Total output tokens": 104500, | |
"Average output tokens": 411 | |
}, | |
"MATH-500": { | |
"Score": 59.4, | |
"Pass rate": 1.0, | |
"Cost($)": 0.0, | |
"Framework": "", | |
"X-shot": "4", | |
"Samples": 500, | |
"All tokens": 411362, | |
"Total input tokens": 169549, | |
"Average input tokens": 339, | |
"Total output tokens": 241813, | |
"Average output tokens": 484 | |
} | |
}, | |
"Llama-3.1-8B-Instruct": { | |
"META": { | |
"Algorithm": "IO", | |
"LLM": "Llama-3.1-8B-Instruct", | |
"Eval Date": "2025/1/22" | |
}, | |
"gsm8k": { | |
"Score": 57.16, | |
"Pass rate": 0.9955, | |
"Cost($)": 0.0, | |
"Framework": "", | |
"X-shot": "8", | |
"Samples": 1319, | |
"All tokens": 1745429, | |
"Total input tokens": 550941, | |
"Average input tokens": 418, | |
"Total output tokens": 1194488, | |
"Average output tokens": 906 | |
}, | |
"AQuA": { | |
"Score": 51.18, | |
"Pass rate": 0.9882, | |
"Cost($)": 0.0, | |
"Framework": "", | |
"X-shot": "0", | |
"Samples": 254, | |
"All tokens": 133106, | |
"Total input tokens": 26459, | |
"Average input tokens": 104, | |
"Total output tokens": 106647, | |
"Average output tokens": 420 | |
}, | |
"MATH-500": { | |
"Score": 38.6, | |
"Pass rate": 1.0, | |
"Cost($)": 0.0, | |
"Framework": "", | |
"X-shot": "4", | |
"Samples": 500, | |
"All tokens": 503934, | |
"Total input tokens": 155563, | |
"Average input tokens": 311, | |
"Total output tokens": 348371, | |
"Average output tokens": 697 | |
} | |
}, | |
"Internllm2_5-7B": { | |
"META": { | |
"Algorithm": "IO", | |
"LLM": "Internllm2_5-7B", | |
"Eval Date": "2025/1/22" | |
}, | |
"gsm8k": { | |
"Score": 11.6, | |
"Pass rate": 0.9795, | |
"Cost($)": 0.0, | |
"Framework": "", | |
"X-shot": "8", | |
"Samples": 1319, | |
"All tokens": 1113728, | |
"Total input tokens": 679302, | |
"Average input tokens": 515, | |
"Total output tokens": 434426, | |
"Average output tokens": 329 | |
}, | |
"AQuA": { | |
"Score": 47.64, | |
"Pass rate": 0.9094, | |
"Cost($)": 0.0, | |
"Framework": "", | |
"X-shot": "0", | |
"Samples": 254, | |
"All tokens": 185041, | |
"Total input tokens": 50232, | |
"Average input tokens": 198, | |
"Total output tokens": 134809, | |
"Average output tokens": 531 | |
}, | |
"MATH-500": { | |
"Score": 22.8, | |
"Pass rate": 1.0, | |
"Cost($)": 0.0, | |
"Framework": "", | |
"X-shot": "4", | |
"Samples": 500, | |
"All tokens": 467888, | |
"Total input tokens": 201883, | |
"Average input tokens": 404, | |
"Total output tokens": 266005, | |
"Average output tokens": 532 | |
} | |
}, | |
"Qwen2-1.5B-Instruct": { | |
"META": { | |
"Algorithm": "IO", | |
"LLM": "Qwen2-1.5B-Instruct", | |
"Eval Date": "2025/1/22" | |
}, | |
"gsm8k": { | |
"Score": 16.68, | |
"Pass rate": 1.0, | |
"Cost($)": 0.0, | |
"Framework": "", | |
"X-shot": "8", | |
"Samples": 1319, | |
"All tokens": 736996, | |
"Total input tokens": 568530, | |
"Average input tokens": 431, | |
"Total output tokens": 168466, | |
"Average output tokens": 128 | |
}, | |
"AQuA": { | |
"Score": 29.13, | |
"Pass rate": 0.9764, | |
"Cost($)": 0.0, | |
"Framework": "", | |
"X-shot": "0", | |
"Samples": 254, | |
"All tokens": 71047, | |
"Total input tokens": 27937, | |
"Average input tokens": 110, | |
"Total output tokens": 43110, | |
"Average output tokens": 170 | |
}, | |
"MATH-500": { | |
"Score": 7.0, | |
"Pass rate": 1.0, | |
"Cost($)": 0.0, | |
"Framework": "", | |
"X-shot": "4", | |
"Samples": 500, | |
"All tokens": 413878, | |
"Total input tokens": 158777, | |
"Average input tokens": 318, | |
"Total output tokens": 255101, | |
"Average output tokens": 510 | |
} | |
}, | |
"Qwen2-0.5B-Instruct": { | |
"META": { | |
"Algorithm": "IO", | |
"LLM": "Qwen2-0.5B-Instruct", | |
"Eval Date": "2025/1/22" | |
}, | |
"gsm8k": { | |
"Score": 14.71, | |
"Pass rate": 1.0, | |
"Cost($)": 0.0, | |
"Framework": "", | |
"X-shot": "8", | |
"Samples": 1319, | |
"All tokens": 834897, | |
"Total input tokens": 568116, | |
"Average input tokens": 431, | |
"Total output tokens": 266781, | |
"Average output tokens": 202 | |
}, | |
"AQuA": { | |
"Score": 27.17, | |
"Pass rate": 0.9882, | |
"Cost($)": 0.0, | |
"Framework": "", | |
"X-shot": "0", | |
"Samples": 254, | |
"All tokens": 110415, | |
"Total input tokens": 27937, | |
"Average input tokens": 110, | |
"Total output tokens": 82478, | |
"Average output tokens": 325 | |
}, | |
"MATH-500": { | |
"Score": 2.6, | |
"Pass rate": 1.0, | |
"Cost($)": 0.0, | |
"Framework": "", | |
"X-shot": "4", | |
"Samples": 500, | |
"All tokens": 429330, | |
"Total input tokens": 159049, | |
"Average input tokens": 318, | |
"Total output tokens": 270281, | |
"Average output tokens": 541 | |
} | |
}, | |
"deepseek-r1:1.5b": { | |
"META": { | |
"Algorithm": "IO", | |
"LLM": "deepseek-r1:1.5b", | |
"Eval Date": "2025/1/22" | |
}, | |
"gsm8k": { | |
"Score": 64.14, | |
"Pass rate": 0.9962, | |
"Cost($)": 0.0, | |
"Framework": "", | |
"X-shot": "8", | |
"Samples": 1319, | |
"All tokens": 1483051, | |
"Total input tokens": 561935, | |
"Average input tokens": 426, | |
"Total output tokens": 921116, | |
"Average output tokens": 698 | |
}, | |
"AQuA": { | |
"Score": 68.9, | |
"Pass rate": 0.9488, | |
"Cost($)": 0.0, | |
"Framework": "", | |
"X-shot": "0", | |
"Samples": 254, | |
"All tokens": 351767, | |
"Total input tokens": 26667, | |
"Average input tokens": 105, | |
"Total output tokens": 325100, | |
"Average output tokens": 1280 | |
}, | |
"MATH-500": { | |
"Score": 43.8, | |
"Pass rate": 1.0, | |
"Cost($)": 0.0, | |
"Framework": "", | |
"X-shot": "4", | |
"Samples": 500, | |
"All tokens": 1022548, | |
"Total input tokens": 157049, | |
"Average input tokens": 314, | |
"Total output tokens": 865499, | |
"Average output tokens": 1731 | |
} | |
} | |
}, | |
"ReAct-Pro*": { | |
"gpt-3.5-turbo": { | |
"META": { | |
"Algorithm": "ReAct-Pro*", | |
"LLM": "gpt-3.5-turbo", | |
"Eval Date": "2025/1/7" | |
}, | |
"gsm8k": { | |
"Score": 74.91, | |
"Pass rate": 0.9939, | |
"Cost($)": 3.4633, | |
"Framework": "", | |
"X-shot": "8", | |
"Samples": 1319, | |
"All tokens": 6646286, | |
"Total input tokens": 6506164, | |
"Average input tokens": 4933, | |
"Total output tokens": 140122, | |
"Average output tokens": 106 | |
}, | |
"AQuA": { | |
"Score": 64.57, | |
"Pass rate": 0.9803, | |
"Cost($)": 0.4928, | |
"Framework": "", | |
"X-shot": "0", | |
"Samples": 254, | |
"All tokens": 903587, | |
"Total input tokens": 862614, | |
"Average input tokens": 3396, | |
"Total output tokens": 40973, | |
"Average output tokens": 161 | |
}, | |
"MATH-500": { | |
"Score": 23.8, | |
"Pass rate": 1.0, | |
"Cost($)": 2.0406, | |
"Framework": "", | |
"X-shot": "4", | |
"Samples": 500, | |
"All tokens": 3832714, | |
"Total input tokens": 3708461, | |
"Average input tokens": 7417, | |
"Total output tokens": 124253, | |
"Average output tokens": 249 | |
} | |
}, | |
"Doubao-lite-32k": { | |
"META": { | |
"Algorithm": "ReAct-Pro*", | |
"LLM": "Doubao-lite-32k", | |
"Eval Date": "2025/1/7" | |
}, | |
"gsm8k": { | |
"Score": 85.6, | |
"Pass rate": 0.9962, | |
"Cost($)": 0.2512, | |
"Framework": "", | |
"X-shot": "8", | |
"Samples": 1319, | |
"All tokens": 5998639, | |
"Total input tokens": 5862016, | |
"Average input tokens": 4444, | |
"Total output tokens": 136623, | |
"Average output tokens": 104 | |
}, | |
"AQuA": { | |
"Score": 77.56, | |
"Pass rate": 0.9606, | |
"Cost($)": 0.0445, | |
"Framework": "", | |
"X-shot": "0", | |
"Samples": 254, | |
"All tokens": 1032841, | |
"Total input tokens": 977890, | |
"Average input tokens": 3850, | |
"Total output tokens": 54951, | |
"Average output tokens": 216 | |
}, | |
"MATH-500": { | |
"Score": 47.2, | |
"Pass rate": 1.0, | |
"Cost($)": 0.186, | |
"Framework": "", | |
"X-shot": "4", | |
"Samples": 500, | |
"All tokens": 4388666, | |
"Total input tokens": 4234620, | |
"Average input tokens": 8469, | |
"Total output tokens": 154046, | |
"Average output tokens": 308 | |
} | |
}, | |
"gpt-4o": { | |
"META": { | |
"Algorithm": "ReAct-Pro*", | |
"LLM": "gpt-4o", | |
"Eval Date": "2025/1/22" | |
}, | |
"gsm8k": { | |
"Score": 63.31, | |
"Pass rate": 0.9955, | |
"Cost($)": 39.0751, | |
"Framework": "", | |
"X-shot": "8", | |
"Samples": 1319, | |
"All tokens": 14715887, | |
"Total input tokens": 14411173, | |
"Average input tokens": 10926, | |
"Total output tokens": 304714, | |
"Average output tokens": 231 | |
}, | |
"AQuA": { | |
"Score": 57.48, | |
"Pass rate": 0.9724, | |
"Cost($)": 2.304, | |
"Framework": "", | |
"X-shot": "0", | |
"Samples": 254, | |
"All tokens": 692096, | |
"Total input tokens": 615589, | |
"Average input tokens": 2424, | |
"Total output tokens": 76507, | |
"Average output tokens": 301 | |
}, | |
"MATH-500": { | |
"Score": 54.0, | |
"Pass rate": 1.0, | |
"Cost($)": 17.7735, | |
"Framework": "", | |
"X-shot": "4", | |
"Samples": 500, | |
"All tokens": 6153255, | |
"Total input tokens": 5834537, | |
"Average input tokens": 11669, | |
"Total output tokens": 318718, | |
"Average output tokens": 637 | |
} | |
}, | |
"Qwen2.5-72B-Instruct": { | |
"META": { | |
"Algorithm": "ReAct-Pro*", | |
"LLM": "Qwen2.5-72B-Instruct", | |
"Eval Date": "2025/1/22" | |
}, | |
"gsm8k": { | |
"Score": 87.26, | |
"Pass rate": 1.0, | |
"Cost($)": 10.5479, | |
"Framework": "", | |
"X-shot": "8", | |
"Samples": 1319, | |
"All tokens": 18710437, | |
"Total input tokens": 18160983, | |
"Average input tokens": 13769, | |
"Total output tokens": 549454, | |
"Average output tokens": 417 | |
}, | |
"AQuA": { | |
"Score": 73.23, | |
"Pass rate": 1.0, | |
"Cost($)": 0.3177, | |
"Framework": "", | |
"X-shot": "0", | |
"Samples": 254, | |
"All tokens": 563603, | |
"Total input tokens": 441765, | |
"Average input tokens": 1739, | |
"Total output tokens": 121838, | |
"Average output tokens": 480 | |
}, | |
"MATH-500": { | |
"Score": 62.8, | |
"Pass rate": 1.0, | |
"Cost($)": 3.4541, | |
"Framework": "", | |
"X-shot": "4", | |
"Samples": 500, | |
"All tokens": 6127117, | |
"Total input tokens": 5747268, | |
"Average input tokens": 11495, | |
"Total output tokens": 379849, | |
"Average output tokens": 760 | |
} | |
}, | |
"Llama-3.3-70B-Instruct": { | |
"META": { | |
"Algorithm": "ReAct-Pro*", | |
"LLM": "Llama-3.3-70B-Instruct", | |
"Eval Date": "2025/1/22" | |
}, | |
"gsm8k": { | |
"Score": 87.64, | |
"Pass rate": 0.9992, | |
"Cost($)": 10.1124, | |
"Framework": "", | |
"X-shot": "8", | |
"Samples": 1319, | |
"All tokens": 17937864, | |
"Total input tokens": 17038928, | |
"Average input tokens": 12918, | |
"Total output tokens": 898936, | |
"Average output tokens": 682 | |
}, | |
"AQuA": { | |
"Score": 79.13, | |
"Pass rate": 0.9961, | |
"Cost($)": 0.768, | |
"Framework": "", | |
"X-shot": "0", | |
"Samples": 254, | |
"All tokens": 1362379, | |
"Total input tokens": 1119143, | |
"Average input tokens": 4406, | |
"Total output tokens": 243236, | |
"Average output tokens": 958 | |
}, | |
"MATH-500": { | |
"Score": 64.6, | |
"Pass rate": 1.0, | |
"Cost($)": 3.1806, | |
"Framework": "", | |
"X-shot": "4", | |
"Samples": 500, | |
"All tokens": 5641879, | |
"Total input tokens": 5223611, | |
"Average input tokens": 10447, | |
"Total output tokens": 418268, | |
"Average output tokens": 837 | |
} | |
}, | |
"Qwen2.5-7B-Instruct": { | |
"META": { | |
"Algorithm": "ReAct-Pro*", | |
"LLM": "Qwen2.5-7B-Instruct", | |
"Eval Date": "2025/1/22" | |
}, | |
"gsm8k": { | |
"Score": 82.87, | |
"Pass rate": 1.0, | |
"Cost($)": 0.0, | |
"Framework": "", | |
"X-shot": "8", | |
"Samples": 1319, | |
"All tokens": 14850914, | |
"Total input tokens": 14355752, | |
"Average input tokens": 10884, | |
"Total output tokens": 495162, | |
"Average output tokens": 375 | |
}, | |
"AQuA": { | |
"Score": 74.41, | |
"Pass rate": 0.9921, | |
"Cost($)": 0.0, | |
"Framework": "", | |
"X-shot": "0", | |
"Samples": 254, | |
"All tokens": 695844, | |
"Total input tokens": 564165, | |
"Average input tokens": 2221, | |
"Total output tokens": 131679, | |
"Average output tokens": 518 | |
}, | |
"MATH-500": { | |
"Score": 48.8, | |
"Pass rate": 1.0, | |
"Cost($)": 0.0, | |
"Framework": "", | |
"X-shot": "4", | |
"Samples": 500, | |
"All tokens": 4990240, | |
"Total input tokens": 4646708, | |
"Average input tokens": 9293, | |
"Total output tokens": 343532, | |
"Average output tokens": 687 | |
} | |
}, | |
"Llama-3.1-8B-Instruct": { | |
"META": { | |
"Algorithm": "ReAct-Pro*", | |
"LLM": "Llama-3.1-8B-Instruct", | |
"Eval Date": "2025/1/22" | |
}, | |
"gsm8k": { | |
"Score": 67.78, | |
"Pass rate": 0.9856, | |
"Cost($)": 0.0, | |
"Framework": "", | |
"X-shot": "8", | |
"Samples": 1319, | |
"All tokens": 22835767, | |
"Total input tokens": 21044978, | |
"Average input tokens": 15955, | |
"Total output tokens": 1790789, | |
"Average output tokens": 1358 | |
}, | |
"AQuA": { | |
"Score": 55.51, | |
"Pass rate": 0.9685, | |
"Cost($)": 0.0, | |
"Framework": "", | |
"X-shot": "0", | |
"Samples": 254, | |
"All tokens": 4340821, | |
"Total input tokens": 3764723, | |
"Average input tokens": 14822, | |
"Total output tokens": 576098, | |
"Average output tokens": 2268 | |
}, | |
"MATH-500": { | |
"Score": 28.8, | |
"Pass rate": 1.0, | |
"Cost($)": 0.0, | |
"Framework": "", | |
"X-shot": "4", | |
"Samples": 500, | |
"All tokens": 8763629, | |
"Total input tokens": 7486706, | |
"Average input tokens": 14973, | |
"Total output tokens": 1276923, | |
"Average output tokens": 2554 | |
} | |
}, | |
"Internllm2_5-7B": { | |
"META": { | |
"Algorithm": "ReAct-Pro*", | |
"LLM": "Internllm2_5-7B", | |
"Eval Date": "2025/1/22" | |
}, | |
"gsm8k": { | |
"Score": 33.51, | |
"Pass rate": 0.9795, | |
"Cost($)": 0.0, | |
"Framework": "", | |
"X-shot": "8", | |
"Samples": 1319, | |
"All tokens": 35669989, | |
"Total input tokens": 30120070, | |
"Average input tokens": 22836, | |
"Total output tokens": 5549919, | |
"Average output tokens": 4208 | |
}, | |
"AQuA": { | |
"Score": 40.94, | |
"Pass rate": 0.9685, | |
"Cost($)": 0.0, | |
"Framework": "", | |
"X-shot": "0", | |
"Samples": 254, | |
"All tokens": 4428801, | |
"Total input tokens": 3592039, | |
"Average input tokens": 14142, | |
"Total output tokens": 836762, | |
"Average output tokens": 3294 | |
}, | |
"MATH-500": { | |
"Score": 14.8, | |
"Pass rate": 1.0, | |
"Cost($)": 0.0, | |
"Framework": "", | |
"X-shot": "4", | |
"Samples": 500, | |
"All tokens": 14186105, | |
"Total input tokens": 11831496, | |
"Average input tokens": 23663, | |
"Total output tokens": 2354609, | |
"Average output tokens": 4709 | |
} | |
}, | |
"Qwen2-1.5B-Instruct": { | |
"META": { | |
"Algorithm": "ReAct-Pro*", | |
"LLM": "Qwen2-1.5B-Instruct", | |
"Eval Date": "2025/1/22" | |
}, | |
"gsm8k": { | |
"Score": 24.87, | |
"Pass rate": 0.8021, | |
"Cost($)": 0.0, | |
"Framework": "", | |
"X-shot": "8", | |
"Samples": 1319, | |
"All tokens": 9828001, | |
"Total input tokens": 9133603, | |
"Average input tokens": 6925, | |
"Total output tokens": 694398, | |
"Average output tokens": 526 | |
}, | |
"AQuA": { | |
"Score": 25.59, | |
"Pass rate": 0.9606, | |
"Cost($)": 0.0, | |
"Framework": "", | |
"X-shot": "0", | |
"Samples": 254, | |
"All tokens": 5072004, | |
"Total input tokens": 4555858, | |
"Average input tokens": 17936, | |
"Total output tokens": 516146, | |
"Average output tokens": 2032 | |
}, | |
"MATH-500": { | |
"Score": 8.2, | |
"Pass rate": 1.0, | |
"Cost($)": 0.0, | |
"Framework": "", | |
"X-shot": "4", | |
"Samples": 500, | |
"All tokens": 8987061, | |
"Total input tokens": 8430774, | |
"Average input tokens": 16862, | |
"Total output tokens": 556287, | |
"Average output tokens": 1113 | |
} | |
}, | |
"Qwen2-0.5B-Instruct": { | |
"META": { | |
"Algorithm": "ReAct-Pro*", | |
"LLM": "Qwen2-0.5B-Instruct", | |
"Eval Date": "2025/1/22" | |
}, | |
"gsm8k": { | |
"Score": 7.66, | |
"Pass rate": 0.9522, | |
"Cost($)": 0.0, | |
"Framework": "", | |
"X-shot": "8", | |
"Samples": 1319, | |
"All tokens": 55392611, | |
"Total input tokens": 52431343, | |
"Average input tokens": 39751, | |
"Total output tokens": 2961268, | |
"Average output tokens": 2245 | |
}, | |
"AQuA": { | |
"Score": 24.02, | |
"Pass rate": 0.9685, | |
"Cost($)": 0.0, | |
"Framework": "", | |
"X-shot": "0", | |
"Samples": 254, | |
"All tokens": 7170087, | |
"Total input tokens": 6344167, | |
"Average input tokens": 24977, | |
"Total output tokens": 825920, | |
"Average output tokens": 3252 | |
}, | |
"MATH-500": { | |
"Score": 0.6, | |
"Pass rate": 1.0, | |
"Cost($)": 0.0, | |
"Framework": "", | |
"X-shot": "4", | |
"Samples": 500, | |
"All tokens": 19442440, | |
"Total input tokens": 18137392, | |
"Average input tokens": 36275, | |
"Total output tokens": 1305048, | |
"Average output tokens": 2610 | |
} | |
}, | |
"deepseek-r1:1.5b": { | |
"META": { | |
"Algorithm": "ReAct-Pro*", | |
"LLM": "deepseek-r1:1.5b", | |
"Eval Date": "2025/2/10" | |
}, | |
"gsm8k": { | |
"Score": 35.94, | |
"Pass rate": 0.9962, | |
"Cost($)": 0.0, | |
"Framework": "", | |
"X-shot": "8", | |
"Samples": 1319, | |
"All tokens": 24219077, | |
"Total input tokens": 19299381, | |
"Average input tokens": 14632, | |
"Total output tokens": 4919696, | |
"Average output tokens": 3730 | |
}, | |
"AQuA": { | |
"Score": 54.33, | |
"Pass rate": 0.9646, | |
"Cost($)": 0.0, | |
"Framework": "", | |
"X-shot": "0", | |
"Samples": 254, | |
"All tokens": 14445041, | |
"Total input tokens": 10578715, | |
"Average input tokens": 41648, | |
"Total output tokens": 3866326, | |
"Average output tokens": 15222 | |
}, | |
"MATH-500": { | |
"Score": 24.4, | |
"Pass rate": 1.0, | |
"Cost($)": 0.0, | |
"Framework": "", | |
"X-shot": "4", | |
"Samples": 500, | |
"All tokens": 30177348, | |
"Total input tokens": 20729970, | |
"Average input tokens": 41460, | |
"Total output tokens": 9447378, | |
"Average output tokens": 18895 | |
} | |
} | |
}, | |
"PoT": { | |
"gpt-3.5-turbo": { | |
"META": { | |
"Algorithm": "PoT", | |
"LLM": "gpt-3.5-turbo", | |
"Eval Date": "2025/1/7" | |
}, | |
"gsm8k": { | |
"Score": 76.88, | |
"Pass rate": 0.9924, | |
"Cost($)": 0.6902, | |
"Framework": "", | |
"X-shot": "8", | |
"Samples": 1319, | |
"All tokens": 1187080, | |
"Total input tokens": 1090418, | |
"Average input tokens": 827, | |
"Total output tokens": 96662, | |
"Average output tokens": 73 | |
}, | |
"AQuA": { | |
"Score": 59.45, | |
"Pass rate": 1.0, | |
"Cost($)": 0.1748, | |
"Framework": "", | |
"X-shot": "0", | |
"Samples": 254, | |
"All tokens": 266654, | |
"Total input tokens": 225162, | |
"Average input tokens": 886, | |
"Total output tokens": 41492, | |
"Average output tokens": 163 | |
}, | |
"MATH-500": { | |
"Score": 28.8, | |
"Pass rate": 0.838, | |
"Cost($)": 0.168, | |
"Framework": "", | |
"X-shot": "4", | |
"Samples": 500, | |
"All tokens": 271916, | |
"Total input tokens": 239902, | |
"Average input tokens": 480, | |
"Total output tokens": 32014, | |
"Average output tokens": 64 | |
} | |
}, | |
"Doubao-lite-32k": { | |
"META": { | |
"Algorithm": "PoT", | |
"LLM": "Doubao-lite-32k", | |
"Eval Date": "2025/1/7" | |
}, | |
"gsm8k": { | |
"Score": 79.61, | |
"Pass rate": 0.9257, | |
"Cost($)": 0.0576, | |
"Framework": "", | |
"X-shot": "8", | |
"Samples": 1319, | |
"All tokens": 1288055, | |
"Total input tokens": 1170038, | |
"Average input tokens": 887, | |
"Total output tokens": 118017, | |
"Average output tokens": 89 | |
}, | |
"AQuA": { | |
"Score": 71.65, | |
"Pass rate": 0.9685, | |
"Cost($)": 0.0147, | |
"Framework": "", | |
"X-shot": "0", | |
"Samples": 254, | |
"All tokens": 309436, | |
"Total input tokens": 259863, | |
"Average input tokens": 1023, | |
"Total output tokens": 49573, | |
"Average output tokens": 195 | |
}, | |
"MATH-500": { | |
"Score": 32.6, | |
"Pass rate": 0.68, | |
"Cost($)": 0.0144, | |
"Framework": "", | |
"X-shot": "4", | |
"Samples": 500, | |
"All tokens": 303148, | |
"Total input tokens": 254377, | |
"Average input tokens": 509, | |
"Total output tokens": 48771, | |
"Average output tokens": 98 | |
} | |
}, | |
"gpt-4o": { | |
"META": { | |
"Algorithm": "PoT", | |
"LLM": "gpt-4o", | |
"Eval Date": "2025/1/22" | |
}, | |
"gsm8k": { | |
"Score": 93.1, | |
"Pass rate": 0.9977, | |
"Cost($)": 4.2166, | |
"Framework": "", | |
"X-shot": "8", | |
"Samples": 1319, | |
"All tokens": 1247912, | |
"Total input tokens": 1101672, | |
"Average input tokens": 835, | |
"Total output tokens": 146240, | |
"Average output tokens": 111 | |
}, | |
"AQuA": { | |
"Score": 75.2, | |
"Pass rate": 1.0, | |
"Cost($)": 1.6087, | |
"Framework": "", | |
"X-shot": "0", | |
"Samples": 254, | |
"All tokens": 327908, | |
"Total input tokens": 222717, | |
"Average input tokens": 877, | |
"Total output tokens": 105191, | |
"Average output tokens": 414 | |
}, | |
"MATH-500": { | |
"Score": 46.2, | |
"Pass rate": 0.864, | |
"Cost($)": 1.5994, | |
"Framework": "", | |
"X-shot": "4", | |
"Samples": 500, | |
"All tokens": 340960, | |
"Total input tokens": 241357, | |
"Average input tokens": 483, | |
"Total output tokens": 99603, | |
"Average output tokens": 199 | |
} | |
}, | |
"Qwen2.5-72B-Instruct": { | |
"META": { | |
"Algorithm": "PoT", | |
"LLM": "Qwen2.5-72B-Instruct", | |
"Eval Date": "2025/1/22" | |
}, | |
"gsm8k": { | |
"Score": 92.34, | |
"Pass rate": 0.9939, | |
"Cost($)": 0.7054, | |
"Framework": "", | |
"X-shot": "8", | |
"Samples": 1319, | |
"All tokens": 1251210, | |
"Total input tokens": 1106682, | |
"Average input tokens": 839, | |
"Total output tokens": 144528, | |
"Average output tokens": 110 | |
}, | |
"AQuA": { | |
"Score": 75.2, | |
"Pass rate": 1.0, | |
"Cost($)": 0.1645, | |
"Framework": "", | |
"X-shot": "0", | |
"Samples": 254, | |
"All tokens": 291764, | |
"Total input tokens": 249215, | |
"Average input tokens": 981, | |
"Total output tokens": 42549, | |
"Average output tokens": 168 | |
}, | |
"MATH-500": { | |
"Score": 47.2, | |
"Pass rate": 0.822, | |
"Cost($)": 0.233, | |
"Framework": "", | |
"X-shot": "4", | |
"Samples": 500, | |
"All tokens": 413372, | |
"Total input tokens": 242549, | |
"Average input tokens": 485, | |
"Total output tokens": 170823, | |
"Average output tokens": 342 | |
} | |
}, | |
"Llama-3.3-70B-Instruct": { | |
"META": { | |
"Algorithm": "PoT", | |
"LLM": "Llama-3.3-70B-Instruct", | |
"Eval Date": "2025/1/22" | |
}, | |
"gsm8k": { | |
"Score": 73.09, | |
"Pass rate": 0.7961, | |
"Cost($)": 0.9736, | |
"Framework": "", | |
"X-shot": "8", | |
"Samples": 1319, | |
"All tokens": 1727044, | |
"Total input tokens": 1126025, | |
"Average input tokens": 854, | |
"Total output tokens": 601019, | |
"Average output tokens": 456 | |
}, | |
"AQuA": { | |
"Score": 79.53, | |
"Pass rate": 0.9921, | |
"Cost($)": 0.1746, | |
"Framework": "", | |
"X-shot": "0", | |
"Samples": 254, | |
"All tokens": 309799, | |
"Total input tokens": 240735, | |
"Average input tokens": 948, | |
"Total output tokens": 69064, | |
"Average output tokens": 272 | |
}, | |
"MATH-500": { | |
"Score": 42.6, | |
"Pass rate": 0.802, | |
"Cost($)": 0.2839, | |
"Framework": "", | |
"X-shot": "4", | |
"Samples": 500, | |
"All tokens": 503596, | |
"Total input tokens": 253879, | |
"Average input tokens": 508, | |
"Total output tokens": 249717, | |
"Average output tokens": 499 | |
} | |
}, | |
"Qwen2.5-7B-Instruct": { | |
"META": { | |
"Algorithm": "PoT", | |
"LLM": "Qwen2.5-7B-Instruct", | |
"Eval Date": "2025/1/22" | |
}, | |
"gsm8k": { | |
"Score": 58.83, | |
"Pass rate": 0.7051, | |
"Cost($)": 0.0, | |
"Framework": "", | |
"X-shot": "8", | |
"Samples": 1319, | |
"All tokens": 1362822, | |
"Total input tokens": 1145390, | |
"Average input tokens": 868, | |
"Total output tokens": 217432, | |
"Average output tokens": 165 | |
}, | |
"AQuA": { | |
"Score": 68.11, | |
"Pass rate": 1.0, | |
"Cost($)": 0.0, | |
"Framework": "", | |
"X-shot": "0", | |
"Samples": 254, | |
"All tokens": 313728, | |
"Total input tokens": 264517, | |
"Average input tokens": 1041, | |
"Total output tokens": 49211, | |
"Average output tokens": 194 | |
}, | |
"MATH-500": { | |
"Score": 39.6, | |
"Pass rate": 0.744, | |
"Cost($)": 0.0, | |
"Framework": "", | |
"X-shot": "4", | |
"Samples": 500, | |
"All tokens": 408812, | |
"Total input tokens": 258549, | |
"Average input tokens": 517, | |
"Total output tokens": 150263, | |
"Average output tokens": 301 | |
} | |
}, | |
"Llama-3.1-8B-Instruct": { | |
"META": { | |
"Algorithm": "PoT", | |
"LLM": "Llama-3.1-8B-Instruct", | |
"Eval Date": "2025/1/22" | |
}, | |
"gsm8k": { | |
"Score": 38.67, | |
"Pass rate": 0.5542, | |
"Cost($)": 0.0, | |
"Framework": "", | |
"X-shot": "8", | |
"Samples": 1319, | |
"All tokens": 1391111, | |
"Total input tokens": 1147538, | |
"Average input tokens": 870, | |
"Total output tokens": 243573, | |
"Average output tokens": 185 | |
}, | |
"AQuA": { | |
"Score": 36.61, | |
"Pass rate": 0.9685, | |
"Cost($)": 0.0, | |
"Framework": "", | |
"X-shot": "0", | |
"Samples": 254, | |
"All tokens": 290914, | |
"Total input tokens": 240613, | |
"Average input tokens": 947, | |
"Total output tokens": 50301, | |
"Average output tokens": 198 | |
}, | |
"MATH-500": { | |
"Score": 25.4, | |
"Pass rate": 0.684, | |
"Cost($)": 0.0, | |
"Framework": "", | |
"X-shot": "4", | |
"Samples": 500, | |
"All tokens": 462271, | |
"Total input tokens": 253879, | |
"Average input tokens": 508, | |
"Total output tokens": 208392, | |
"Average output tokens": 417 | |
} | |
}, | |
"Internllm2_5-7B": { | |
"META": { | |
"Algorithm": "PoT", | |
"LLM": "Internllm2_5-7B", | |
"Eval Date": "2025/1/22" | |
}, | |
"gsm8k": { | |
"Score": 38.21, | |
"Pass rate": 0.489, | |
"Cost($)": 0.0, | |
"Framework": "", | |
"X-shot": "8", | |
"Samples": 1319, | |
"All tokens": 1324949, | |
"Total input tokens": 1136843, | |
"Average input tokens": 862, | |
"Total output tokens": 188106, | |
"Average output tokens": 143 | |
}, | |
"AQuA": { | |
"Score": 36.61, | |
"Pass rate": 0.9882, | |
"Cost($)": 0.0, | |
"Framework": "", | |
"X-shot": "0", | |
"Samples": 254, | |
"All tokens": 301962, | |
"Total input tokens": 233505, | |
"Average input tokens": 919, | |
"Total output tokens": 68457, | |
"Average output tokens": 270 | |
}, | |
"MATH-500": { | |
"Score": 15.0, | |
"Pass rate": 0.324, | |
"Cost($)": 0.0, | |
"Framework": "", | |
"X-shot": "4", | |
"Samples": 500, | |
"All tokens": 368709, | |
"Total input tokens": 247883, | |
"Average input tokens": 496, | |
"Total output tokens": 120826, | |
"Average output tokens": 242 | |
} | |
}, | |
"Qwen2-1.5B-Instruct": { | |
"META": { | |
"Algorithm": "PoT", | |
"LLM": "Qwen2-1.5B-Instruct", | |
"Eval Date": "2025/1/22" | |
}, | |
"gsm8k": { | |
"Score": 18.5, | |
"Pass rate": 0.3101, | |
"Cost($)": 0.0, | |
"Framework": "", | |
"X-shot": "8", | |
"Samples": 1319, | |
"All tokens": 1327522, | |
"Total input tokens": 1151528, | |
"Average input tokens": 873, | |
"Total output tokens": 175994, | |
"Average output tokens": 133 | |
}, | |
"AQuA": { | |
"Score": 30.71, | |
"Pass rate": 0.9646, | |
"Cost($)": 0.0, | |
"Framework": "", | |
"X-shot": "0", | |
"Samples": 254, | |
"All tokens": 298475, | |
"Total input tokens": 246560, | |
"Average input tokens": 971, | |
"Total output tokens": 51915, | |
"Average output tokens": 204 | |
}, | |
"MATH-500": { | |
"Score": 0.8, | |
"Pass rate": 0.022, | |
"Cost($)": 0.0, | |
"Framework": "", | |
"X-shot": "4", | |
"Samples": 500, | |
"All tokens": 786870, | |
"Total input tokens": 248509, | |
"Average input tokens": 497, | |
"Total output tokens": 538361, | |
"Average output tokens": 1077 | |
} | |
}, | |
"Qwen2-0.5B-Instruct": { | |
"META": { | |
"Algorithm": "PoT", | |
"LLM": "Qwen2-0.5B-Instruct", | |
"Eval Date": "2025/1/22" | |
}, | |
"gsm8k": { | |
"Score": 9.63, | |
"Pass rate": 0.1691, | |
"Cost($)": 0.0, | |
"Framework": "", | |
"X-shot": "8", | |
"Samples": 1319, | |
"All tokens": 1389135, | |
"Total input tokens": 1151528, | |
"Average input tokens": 873, | |
"Total output tokens": 237607, | |
"Average output tokens": 180 | |
}, | |
"AQuA": { | |
"Score": 17.32, | |
"Pass rate": 0.9213, | |
"Cost($)": 0.0, | |
"Framework": "", | |
"X-shot": "0", | |
"Samples": 254, | |
"All tokens": 322281, | |
"Total input tokens": 258867, | |
"Average input tokens": 1019, | |
"Total output tokens": 63414, | |
"Average output tokens": 250 | |
}, | |
"MATH-500": { | |
"Score": 0.0, | |
"Pass rate": 0.0, | |
"Cost($)": 0.0, | |
"Framework": "", | |
"X-shot": "4", | |
"Samples": 500, | |
"All tokens": 437202, | |
"Total input tokens": 253549, | |
"Average input tokens": 507, | |
"Total output tokens": 183653, | |
"Average output tokens": 367 | |
} | |
}, | |
"deepseek-r1:1.5b": { | |
"META": { | |
"Algorithm": "PoT", | |
"LLM": "deepseek-r1:1.5b", | |
"Eval Date": "2025/2/10" | |
}, | |
"gsm8k": { | |
"Score": 11.9, | |
"Pass rate": 0.1744, | |
"Cost($)": 0.0, | |
"Framework": "", | |
"X-shot": "8", | |
"Samples": 1319, | |
"All tokens": 1954509, | |
"Total input tokens": 1138872, | |
"Average input tokens": 863, | |
"Total output tokens": 815637, | |
"Average output tokens": 618 | |
}, | |
"AQuA": { | |
"Score": 54.72, | |
"Pass rate": 0.9724, | |
"Cost($)": 0.0, | |
"Framework": "", | |
"X-shot": "0", | |
"Samples": 254, | |
"All tokens": 1016647, | |
"Total input tokens": 250690, | |
"Average input tokens": 987, | |
"Total output tokens": 765957, | |
"Average output tokens": 3016 | |
}, | |
"MATH-500": { | |
"Score": 1.0, | |
"Pass rate": 0.016, | |
"Cost($)": 0.0, | |
"Framework": "", | |
"X-shot": "4", | |
"Samples": 500, | |
"All tokens": 1031067, | |
"Total input tokens": 245549, | |
"Average input tokens": 491, | |
"Total output tokens": 785518, | |
"Average output tokens": 1571 | |
} | |
} | |
}, | |
"CoT": { | |
"gpt-3.5-turbo": { | |
"META": { | |
"Algorithm": "CoT", | |
"LLM": "gpt-3.5-turbo", | |
"Eval Date": "2025/1/7" | |
}, | |
"gsm8k": { | |
"Score": 78.7, | |
"Pass rate": 1.0, | |
"Cost($)": 0.6788, | |
"Framework": "", | |
"X-shot": "8", | |
"Samples": 1319, | |
"All tokens": 1088041, | |
"Total input tokens": 953242, | |
"Average input tokens": 723, | |
"Total output tokens": 134799, | |
"Average output tokens": 102 | |
}, | |
"AQuA": { | |
"Score": 61.02, | |
"Pass rate": 0.937, | |
"Cost($)": 0.0957, | |
"Framework": "", | |
"X-shot": "0", | |
"Samples": 254, | |
"All tokens": 80793, | |
"Total input tokens": 25447, | |
"Average input tokens": 100, | |
"Total output tokens": 55346, | |
"Average output tokens": 218 | |
}, | |
"MATH-500": { | |
"Score": 39.8, | |
"Pass rate": 1.0, | |
"Cost($)": 0.3189, | |
"Framework": "", | |
"X-shot": "4", | |
"Samples": 500, | |
"All tokens": 432196, | |
"Total input tokens": 329381, | |
"Average input tokens": 659, | |
"Total output tokens": 102815, | |
"Average output tokens": 206 | |
} | |
}, | |
"Doubao-lite-32k": { | |
"META": { | |
"Algorithm": "CoT", | |
"LLM": "Doubao-lite-32k", | |
"Eval Date": "2025/1/7" | |
}, | |
"gsm8k": { | |
"Score": 89.31, | |
"Pass rate": 1.0, | |
"Cost($)": 0.0558, | |
"Framework": "", | |
"X-shot": "8", | |
"Samples": 1319, | |
"All tokens": 1201820, | |
"Total input tokens": 1042095, | |
"Average input tokens": 790, | |
"Total output tokens": 159725, | |
"Average output tokens": 121 | |
}, | |
"AQuA": { | |
"Score": 82.68, | |
"Pass rate": 0.9724, | |
"Cost($)": 0.0066, | |
"Framework": "", | |
"X-shot": "0", | |
"Samples": 254, | |
"All tokens": 94577, | |
"Total input tokens": 27978, | |
"Average input tokens": 110, | |
"Total output tokens": 66599, | |
"Average output tokens": 262 | |
}, | |
"MATH-500": { | |
"Score": 59.0, | |
"Pass rate": 1.0, | |
"Cost($)": 0.0255, | |
"Framework": "", | |
"X-shot": "4", | |
"Samples": 500, | |
"All tokens": 479941, | |
"Total input tokens": 336370, | |
"Average input tokens": 673, | |
"Total output tokens": 143571, | |
"Average output tokens": 287 | |
} | |
}, | |
"gpt-4o": { | |
"META": { | |
"Algorithm": "CoT", | |
"LLM": "gpt-4o", | |
"Eval Date": "2025/1/22" | |
}, | |
"gsm8k": { | |
"Score": 94.09, | |
"Pass rate": 1.0, | |
"Cost($)": 4.5367, | |
"Framework": "", | |
"X-shot": "8", | |
"Samples": 1319, | |
"All tokens": 1165166, | |
"Total input tokens": 948668, | |
"Average input tokens": 719, | |
"Total output tokens": 216498, | |
"Average output tokens": 164 | |
}, | |
"AQuA": { | |
"Score": 82.68, | |
"Pass rate": 0.9803, | |
"Cost($)": 1.0417, | |
"Framework": "", | |
"X-shot": "0", | |
"Samples": 254, | |
"All tokens": 123017, | |
"Total input tokens": 25123, | |
"Average input tokens": 99, | |
"Total output tokens": 97894, | |
"Average output tokens": 385 | |
}, | |
"MATH-500": { | |
"Score": 68.0, | |
"Pass rate": 1.0, | |
"Cost($)": 3.0569, | |
"Framework": "", | |
"X-shot": "4", | |
"Samples": 500, | |
"All tokens": 552688, | |
"Total input tokens": 329332, | |
"Average input tokens": 659, | |
"Total output tokens": 223356, | |
"Average output tokens": 447 | |
} | |
}, | |
"Qwen2.5-72B-Instruct": { | |
"META": { | |
"Algorithm": "CoT", | |
"LLM": "Qwen2.5-72B-Instruct", | |
"Eval Date": "2025/1/22" | |
}, | |
"gsm8k": { | |
"Score": 92.87, | |
"Pass rate": 1.0, | |
"Cost($)": 0.7195, | |
"Framework": "", | |
"X-shot": "8", | |
"Samples": 1319, | |
"All tokens": 1276252, | |
"Total input tokens": 1005119, | |
"Average input tokens": 762, | |
"Total output tokens": 271133, | |
"Average output tokens": 206 | |
}, | |
"AQuA": { | |
"Score": 86.22, | |
"Pass rate": 0.9921, | |
"Cost($)": 0.0808, | |
"Framework": "", | |
"X-shot": "0", | |
"Samples": 254, | |
"All tokens": 143289, | |
"Total input tokens": 25143, | |
"Average input tokens": 99, | |
"Total output tokens": 118146, | |
"Average output tokens": 465 | |
}, | |
"MATH-500": { | |
"Score": 80.2, | |
"Pass rate": 1.0, | |
"Cost($)": 0.349, | |
"Framework": "", | |
"X-shot": "4", | |
"Samples": 500, | |
"All tokens": 619015, | |
"Total input tokens": 338549, | |
"Average input tokens": 677, | |
"Total output tokens": 280466, | |
"Average output tokens": 561 | |
} | |
}, | |
"Llama-3.3-70B-Instruct": { | |
"META": { | |
"Algorithm": "CoT", | |
"LLM": "Llama-3.3-70B-Instruct", | |
"Eval Date": "2025/1/22" | |
}, | |
"gsm8k": { | |
"Score": 93.93, | |
"Pass rate": 1.0, | |
"Cost($)": 0.687, | |
"Framework": "", | |
"X-shot": "8", | |
"Samples": 1319, | |
"All tokens": 1218665, | |
"Total input tokens": 990168, | |
"Average input tokens": 751, | |
"Total output tokens": 228497, | |
"Average output tokens": 173 | |
}, | |
"AQuA": { | |
"Score": 83.46, | |
"Pass rate": 0.9843, | |
"Cost($)": 0.0927, | |
"Framework": "", | |
"X-shot": "0", | |
"Samples": 254, | |
"All tokens": 164389, | |
"Total input tokens": 32555, | |
"Average input tokens": 128, | |
"Total output tokens": 131834, | |
"Average output tokens": 519 | |
}, | |
"MATH-500": { | |
"Score": 71.2, | |
"Pass rate": 1.0, | |
"Cost($)": 0.3463, | |
"Framework": "", | |
"X-shot": "4", | |
"Samples": 500, | |
"All tokens": 614221, | |
"Total input tokens": 342879, | |
"Average input tokens": 686, | |
"Total output tokens": 271342, | |
"Average output tokens": 543 | |
} | |
}, | |
"Qwen2.5-7B-Instruct": { | |
"META": { | |
"Algorithm": "CoT", | |
"LLM": "Qwen2.5-7B-Instruct", | |
"Eval Date": "2025/1/22" | |
}, | |
"gsm8k": { | |
"Score": 85.67, | |
"Pass rate": 1.0, | |
"Cost($)": 0.0, | |
"Framework": "", | |
"X-shot": "8", | |
"Samples": 1319, | |
"All tokens": 1290805, | |
"Total input tokens": 1046008, | |
"Average input tokens": 793, | |
"Total output tokens": 244797, | |
"Average output tokens": 186 | |
}, | |
"AQuA": { | |
"Score": 80.71, | |
"Pass rate": 0.9961, | |
"Cost($)": 0.0, | |
"Framework": "", | |
"X-shot": "0", | |
"Samples": 254, | |
"All tokens": 149736, | |
"Total input tokens": 33017, | |
"Average input tokens": 130, | |
"Total output tokens": 116719, | |
"Average output tokens": 460 | |
}, | |
"MATH-500": { | |
"Score": 69.8, | |
"Pass rate": 1.0, | |
"Cost($)": 0.0, | |
"Framework": "", | |
"X-shot": "4", | |
"Samples": 500, | |
"All tokens": 617204, | |
"Total input tokens": 354049, | |
"Average input tokens": 708, | |
"Total output tokens": 263155, | |
"Average output tokens": 526 | |
} | |
}, | |
"Llama-3.1-8B-Instruct": { | |
"META": { | |
"Algorithm": "CoT", | |
"LLM": "Llama-3.1-8B-Instruct", | |
"Eval Date": "2025/1/22" | |
}, | |
"gsm8k": { | |
"Score": 75.44, | |
"Pass rate": 0.9992, | |
"Cost($)": 0.0, | |
"Framework": "", | |
"X-shot": "8", | |
"Samples": 1319, | |
"All tokens": 1248329, | |
"Total input tokens": 990168, | |
"Average input tokens": 751, | |
"Total output tokens": 258161, | |
"Average output tokens": 196 | |
}, | |
"AQuA": { | |
"Score": 60.63, | |
"Pass rate": 1.0, | |
"Cost($)": 0.0, | |
"Framework": "", | |
"X-shot": "0", | |
"Samples": 254, | |
"All tokens": 144435, | |
"Total input tokens": 32555, | |
"Average input tokens": 128, | |
"Total output tokens": 111880, | |
"Average output tokens": 440 | |
}, | |
"MATH-500": { | |
"Score": 25.8, | |
"Pass rate": 1.0, | |
"Cost($)": 0.0, | |
"Framework": "", | |
"X-shot": "4", | |
"Samples": 500, | |
"All tokens": 625568, | |
"Total input tokens": 342879, | |
"Average input tokens": 686, | |
"Total output tokens": 282689, | |
"Average output tokens": 565 | |
} | |
}, | |
"Internllm2_5-7B": { | |
"META": { | |
"Algorithm": "CoT", | |
"LLM": "Internllm2_5-7B", | |
"Eval Date": "2025/1/22" | |
}, | |
"gsm8k": { | |
"Score": 77.71, | |
"Pass rate": 0.997, | |
"Cost($)": 0.0, | |
"Framework": "", | |
"X-shot": "8", | |
"Samples": 1319, | |
"All tokens": 1202163, | |
"Total input tokens": 968163, | |
"Average input tokens": 734, | |
"Total output tokens": 234000, | |
"Average output tokens": 177 | |
}, | |
"AQuA": { | |
"Score": 52.76, | |
"Pass rate": 0.8937, | |
"Cost($)": 0.0, | |
"Framework": "", | |
"X-shot": "0", | |
"Samples": 254, | |
"All tokens": 127520, | |
"Total input tokens": 26610, | |
"Average input tokens": 105, | |
"Total output tokens": 100910, | |
"Average output tokens": 397 | |
}, | |
"MATH-500": { | |
"Score": 46.6, | |
"Pass rate": 1.0, | |
"Cost($)": 0.0, | |
"Framework": "", | |
"X-shot": "4", | |
"Samples": 500, | |
"All tokens": 546774, | |
"Total input tokens": 332883, | |
"Average input tokens": 666, | |
"Total output tokens": 213891, | |
"Average output tokens": 428 | |
} | |
}, | |
"Qwen2-1.5B-Instruct": { | |
"META": { | |
"Algorithm": "CoT", | |
"LLM": "Qwen2-1.5B-Instruct", | |
"Eval Date": "2025/1/22" | |
}, | |
"gsm8k": { | |
"Score": 55.5, | |
"Pass rate": 1.0, | |
"Cost($)": 0.0, | |
"Framework": "", | |
"X-shot": "8", | |
"Samples": 1319, | |
"All tokens": 1218525, | |
"Total input tokens": 1032818, | |
"Average input tokens": 783, | |
"Total output tokens": 185707, | |
"Average output tokens": 141 | |
}, | |
"AQuA": { | |
"Score": 40.55, | |
"Pass rate": 0.9882, | |
"Cost($)": 0.0, | |
"Framework": "", | |
"X-shot": "0", | |
"Samples": 254, | |
"All tokens": 110040, | |
"Total input tokens": 30477, | |
"Average input tokens": 120, | |
"Total output tokens": 79563, | |
"Average output tokens": 313 | |
}, | |
"MATH-500": { | |
"Score": 15.2, | |
"Pass rate": 1.0, | |
"Cost($)": 0.0, | |
"Framework": "", | |
"X-shot": "4", | |
"Samples": 500, | |
"All tokens": 536377, | |
"Total input tokens": 349049, | |
"Average input tokens": 698, | |
"Total output tokens": 187328, | |
"Average output tokens": 375 | |
} | |
}, | |
"Qwen2-0.5B-Instruct": { | |
"META": { | |
"Algorithm": "CoT", | |
"LLM": "Qwen2-0.5B-Instruct", | |
"Eval Date": "2025/1/22" | |
}, | |
"gsm8k": { | |
"Score": 35.94, | |
"Pass rate": 0.9992, | |
"Cost($)": 0.0, | |
"Framework": "", | |
"X-shot": "8", | |
"Samples": 1319, | |
"All tokens": 1223459, | |
"Total input tokens": 1032818, | |
"Average input tokens": 783, | |
"Total output tokens": 190641, | |
"Average output tokens": 145 | |
}, | |
"AQuA": { | |
"Score": 33.07, | |
"Pass rate": 0.9882, | |
"Cost($)": 0.0, | |
"Framework": "", | |
"X-shot": "0", | |
"Samples": 254, | |
"All tokens": 117339, | |
"Total input tokens": 30477, | |
"Average input tokens": 120, | |
"Total output tokens": 86862, | |
"Average output tokens": 342 | |
}, | |
"MATH-500": { | |
"Score": 6.2, | |
"Pass rate": 1.0, | |
"Cost($)": 0.0, | |
"Framework": "", | |
"X-shot": "4", | |
"Samples": 500, | |
"All tokens": 549188, | |
"Total input tokens": 349049, | |
"Average input tokens": 698, | |
"Total output tokens": 200139, | |
"Average output tokens": 400 | |
} | |
}, | |
"deepseek-r1:1.5b": { | |
"META": { | |
"Algorithm": "CoT", | |
"LLM": "deepseek-r1:1.5b", | |
"Eval Date": "2025/1/23" | |
}, | |
"gsm8k": { | |
"Score": 70.66, | |
"Pass rate": 0.9977, | |
"Cost($)": 0.0, | |
"Framework": "", | |
"X-shot": "8", | |
"Samples": 1319, | |
"All tokens": 2090625, | |
"Total input tokens": 1011714, | |
"Average input tokens": 767, | |
"Total output tokens": 1078911, | |
"Average output tokens": 818 | |
}, | |
"AQuA": { | |
"Score": 71.65, | |
"Pass rate": 0.9685, | |
"Cost($)": 0.0, | |
"Framework": "", | |
"X-shot": "0", | |
"Samples": 254, | |
"All tokens": 333072, | |
"Total input tokens": 26413, | |
"Average input tokens": 104, | |
"Total output tokens": 306659, | |
"Average output tokens": 1207 | |
}, | |
"MATH-500": { | |
"Score": 49.4, | |
"Pass rate": 1.0, | |
"Cost($)": 0.0, | |
"Framework": "", | |
"X-shot": "4", | |
"Samples": 500, | |
"All tokens": 1199129, | |
"Total input tokens": 341549, | |
"Average input tokens": 683, | |
"Total output tokens": 857580, | |
"Average output tokens": 1715 | |
} | |
} | |
}, | |
"SC-CoT": { | |
"gpt-3.5-turbo": { | |
"META": { | |
"Algorithm": "SC-CoT", | |
"LLM": "gpt-3.5-turbo", | |
"Eval Date": "2025/1/7" | |
}, | |
"gsm8k": { | |
"Score": 79.91, | |
"Pass rate": 0.9992, | |
"Cost($)": 3.3938, | |
"Framework": "", | |
"X-shot": "8", | |
"Samples": 1319, | |
"All tokens": 4089612, | |
"Total input tokens": 2740652, | |
"Average input tokens": 2078, | |
"Total output tokens": 1348960, | |
"Average output tokens": 1023 | |
}, | |
"AQuA": { | |
"Score": 66.14, | |
"Pass rate": 0.9921, | |
"Cost($)": 0.7888, | |
"Framework": "", | |
"X-shot": "0", | |
"Samples": 254, | |
"All tokens": 847335, | |
"Total input tokens": 482192, | |
"Average input tokens": 1898, | |
"Total output tokens": 365143, | |
"Average output tokens": 1438 | |
}, | |
"MATH-500": { | |
"Score": 28.8, | |
"Pass rate": 1.0, | |
"Cost($)": 1.9764, | |
"Framework": "", | |
"X-shot": "4", | |
"Samples": 500, | |
"All tokens": 2238812, | |
"Total input tokens": 1381818, | |
"Average input tokens": 2764, | |
"Total output tokens": 856994, | |
"Average output tokens": 1714 | |
} | |
}, | |
"Doubao-lite-32k": { | |
"META": { | |
"Algorithm": "SC-CoT", | |
"LLM": "Doubao-lite-32k", | |
"Eval Date": "2025/1/7" | |
}, | |
"gsm8k": { | |
"Score": 87.26, | |
"Pass rate": 0.9992, | |
"Cost($)": 0.2083, | |
"Framework": "", | |
"X-shot": "8", | |
"Samples": 1319, | |
"All tokens": 3888813, | |
"Total input tokens": 2691714, | |
"Average input tokens": 2041, | |
"Total output tokens": 1197099, | |
"Average output tokens": 908 | |
}, | |
"AQuA": { | |
"Score": 81.1, | |
"Pass rate": 0.9724, | |
"Cost($)": 0.0519, | |
"Framework": "", | |
"X-shot": "0", | |
"Samples": 254, | |
"All tokens": 885986, | |
"Total input tokens": 503751, | |
"Average input tokens": 1983, | |
"Total output tokens": 382235, | |
"Average output tokens": 1505 | |
}, | |
"MATH-500": { | |
"Score": 49.2, | |
"Pass rate": 1.0, | |
"Cost($)": 0.1406, | |
"Framework": "", | |
"X-shot": "4", | |
"Samples": 500, | |
"All tokens": 2470810, | |
"Total input tokens": 1507651, | |
"Average input tokens": 3015, | |
"Total output tokens": 963159, | |
"Average output tokens": 1926 | |
} | |
}, | |
"gpt-4o": { | |
"META": { | |
"Algorithm": "SC-CoT", | |
"LLM": "gpt-4o", | |
"Eval Date": "2025/1/22" | |
}, | |
"gsm8k": { | |
"Score": 90.3, | |
"Pass rate": 0.9992, | |
"Cost($)": 31.0542, | |
"Framework": "", | |
"X-shot": "8", | |
"Samples": 1319, | |
"All tokens": 5798173, | |
"Total input tokens": 3590336, | |
"Average input tokens": 2722, | |
"Total output tokens": 2207837, | |
"Average output tokens": 1674 | |
}, | |
"AQuA": { | |
"Score": 86.61, | |
"Pass rate": 0.9882, | |
"Cost($)": 8.1485, | |
"Framework": "", | |
"X-shot": "0", | |
"Samples": 254, | |
"All tokens": 1373206, | |
"Total input tokens": 744478, | |
"Average input tokens": 2931, | |
"Total output tokens": 628728, | |
"Average output tokens": 2475 | |
}, | |
"MATH-500": { | |
"Score": 34.4, | |
"Pass rate": 1.0, | |
"Cost($)": 19.6538, | |
"Framework": "", | |
"X-shot": "4", | |
"Samples": 500, | |
"All tokens": 3455323, | |
"Total input tokens": 1986584, | |
"Average input tokens": 3973, | |
"Total output tokens": 1468739, | |
"Average output tokens": 2937 | |
} | |
}, | |
"Qwen2.5-72B-Instruct": { | |
"META": { | |
"Algorithm": "SC-CoT", | |
"LLM": "Qwen2.5-72B-Instruct", | |
"Eval Date": "2025/1/22" | |
}, | |
"gsm8k": { | |
"Score": 93.86, | |
"Pass rate": 1.0, | |
"Cost($)": 5.9858, | |
"Framework": "", | |
"X-shot": "8", | |
"Samples": 1319, | |
"All tokens": 10618008, | |
"Total input tokens": 8136223, | |
"Average input tokens": 6168, | |
"Total output tokens": 2481785, | |
"Average output tokens": 1882 | |
}, | |
"AQuA": { | |
"Score": 85.04, | |
"Pass rate": 0.9921, | |
"Cost($)": 1.0348, | |
"Framework": "", | |
"X-shot": "0", | |
"Samples": 254, | |
"All tokens": 1835669, | |
"Total input tokens": 1051218, | |
"Average input tokens": 4139, | |
"Total output tokens": 784451, | |
"Average output tokens": 3088 | |
}, | |
"MATH-500": { | |
"Score": 74.0, | |
"Pass rate": 1.0, | |
"Cost($)": 3.1556, | |
"Framework": "", | |
"X-shot": "4", | |
"Samples": 500, | |
"All tokens": 5597513, | |
"Total input tokens": 3823997, | |
"Average input tokens": 7648, | |
"Total output tokens": 1773516, | |
"Average output tokens": 3547 | |
} | |
}, | |
"Llama-3.3-70B-Instruct": { | |
"META": { | |
"Algorithm": "SC-CoT", | |
"LLM": "Llama-3.3-70B-Instruct", | |
"Eval Date": "2025/1/22" | |
}, | |
"gsm8k": { | |
"Score": 95.07, | |
"Pass rate": 1.0, | |
"Cost($)": 6.2005, | |
"Framework": "", | |
"X-shot": "8", | |
"Samples": 1319, | |
"All tokens": 10998794, | |
"Total input tokens": 8413717, | |
"Average input tokens": 6379, | |
"Total output tokens": 2585077, | |
"Average output tokens": 1960 | |
}, | |
"AQuA": { | |
"Score": 82.28, | |
"Pass rate": 0.9921, | |
"Cost($)": 1.0756, | |
"Framework": "", | |
"X-shot": "0", | |
"Samples": 254, | |
"All tokens": 1907924, | |
"Total input tokens": 1135251, | |
"Average input tokens": 4469, | |
"Total output tokens": 772673, | |
"Average output tokens": 3042 | |
}, | |
"MATH-500": { | |
"Score": 74.2, | |
"Pass rate": 1.0, | |
"Cost($)": 3.2239, | |
"Framework": "", | |
"X-shot": "4", | |
"Samples": 500, | |
"All tokens": 5718739, | |
"Total input tokens": 3959492, | |
"Average input tokens": 7919, | |
"Total output tokens": 1759247, | |
"Average output tokens": 3518 | |
} | |
}, | |
"Qwen2.5-7B-Instruct": { | |
"META": { | |
"Algorithm": "SC-CoT", | |
"LLM": "Qwen2.5-7B-Instruct", | |
"Eval Date": "2025/1/22" | |
}, | |
"gsm8k": { | |
"Score": 91.13, | |
"Pass rate": 1.0, | |
"Cost($)": 0.0, | |
"Framework": "", | |
"X-shot": "8", | |
"Samples": 1319, | |
"All tokens": 11140985, | |
"Total input tokens": 8586888, | |
"Average input tokens": 6510, | |
"Total output tokens": 2554097, | |
"Average output tokens": 1936 | |
}, | |
"AQuA": { | |
"Score": 79.92, | |
"Pass rate": 1.0, | |
"Cost($)": 0.0, | |
"Framework": "", | |
"X-shot": "0", | |
"Samples": 254, | |
"All tokens": 1845332, | |
"Total input tokens": 1098280, | |
"Average input tokens": 4324, | |
"Total output tokens": 747052, | |
"Average output tokens": 2941 | |
}, | |
"MATH-500": { | |
"Score": 67.0, | |
"Pass rate": 1.0, | |
"Cost($)": 0.0, | |
"Framework": "", | |
"X-shot": "4", | |
"Samples": 500, | |
"All tokens": 5451484, | |
"Total input tokens": 3833751, | |
"Average input tokens": 7668, | |
"Total output tokens": 1617733, | |
"Average output tokens": 3235 | |
} | |
}, | |
"Llama-3.1-8B-Instruct": { | |
"META": { | |
"Algorithm": "SC-CoT", | |
"LLM": "Llama-3.1-8B-Instruct", | |
"Eval Date": "2025/1/22" | |
}, | |
"gsm8k": { | |
"Score": 73.46, | |
"Pass rate": 0.9955, | |
"Cost($)": 0.0, | |
"Framework": "", | |
"X-shot": "8", | |
"Samples": 1319, | |
"All tokens": 11778716, | |
"Total input tokens": 8630514, | |
"Average input tokens": 6543, | |
"Total output tokens": 3148202, | |
"Average output tokens": 2387 | |
}, | |
"AQuA": { | |
"Score": 59.45, | |
"Pass rate": 0.9724, | |
"Cost($)": 0.0, | |
"Framework": "", | |
"X-shot": "0", | |
"Samples": 254, | |
"All tokens": 1651333, | |
"Total input tokens": 971003, | |
"Average input tokens": 3823, | |
"Total output tokens": 680330, | |
"Average output tokens": 2678 | |
}, | |
"MATH-500": { | |
"Score": 30.2, | |
"Pass rate": 1.0, | |
"Cost($)": 0.0, | |
"Framework": "", | |
"X-shot": "4", | |
"Samples": 500, | |
"All tokens": 5034937, | |
"Total input tokens": 3546673, | |
"Average input tokens": 7093, | |
"Total output tokens": 1488264, | |
"Average output tokens": 2977 | |
} | |
}, | |
"Internllm2_5-7B": { | |
"META": { | |
"Algorithm": "SC-CoT", | |
"LLM": "Internllm2_5-7B", | |
"Eval Date": "2025/1/22" | |
}, | |
"gsm8k": { | |
"Score": 48.22, | |
"Pass rate": 0.9841, | |
"Cost($)": 0.0, | |
"Framework": "", | |
"X-shot": "8", | |
"Samples": 1319, | |
"All tokens": 14526431, | |
"Total input tokens": 10678792, | |
"Average input tokens": 8096, | |
"Total output tokens": 3847639, | |
"Average output tokens": 2917 | |
}, | |
"AQuA": { | |
"Score": 39.37, | |
"Pass rate": 0.9803, | |
"Cost($)": 0.0, | |
"Framework": "", | |
"X-shot": "0", | |
"Samples": 254, | |
"All tokens": 2296222, | |
"Total input tokens": 1420494, | |
"Average input tokens": 5592, | |
"Total output tokens": 875728, | |
"Average output tokens": 3448 | |
}, | |
"MATH-500": { | |
"Score": 9.8, | |
"Pass rate": 1.0, | |
"Cost($)": 0.0, | |
"Framework": "", | |
"X-shot": "4", | |
"Samples": 500, | |
"All tokens": 5838466, | |
"Total input tokens": 4193296, | |
"Average input tokens": 8387, | |
"Total output tokens": 1645170, | |
"Average output tokens": 3290 | |
} | |
}, | |
"Qwen2-1.5B-Instruct": { | |
"META": { | |
"Algorithm": "SC-CoT", | |
"LLM": "Qwen2-1.5B-Instruct", | |
"Eval Date": "2025/1/22" | |
}, | |
"gsm8k": { | |
"Score": 11.75, | |
"Pass rate": 0.9189, | |
"Cost($)": 0.0, | |
"Framework": "", | |
"X-shot": "8", | |
"Samples": 1319, | |
"All tokens": 12411942, | |
"Total input tokens": 9066115, | |
"Average input tokens": 6873, | |
"Total output tokens": 3345827, | |
"Average output tokens": 2537 | |
}, | |
"AQuA": { | |
"Score": 23.62, | |
"Pass rate": 0.9646, | |
"Cost($)": 0.0, | |
"Framework": "", | |
"X-shot": "0", | |
"Samples": 254, | |
"All tokens": 1775335, | |
"Total input tokens": 1034362, | |
"Average input tokens": 4072, | |
"Total output tokens": 740973, | |
"Average output tokens": 2917 | |
}, | |
"MATH-500": { | |
"Score": 3.8, | |
"Pass rate": 0.99, | |
"Cost($)": 0.0, | |
"Framework": "", | |
"X-shot": "4", | |
"Samples": 500, | |
"All tokens": 5569442, | |
"Total input tokens": 3832429, | |
"Average input tokens": 7665, | |
"Total output tokens": 1737013, | |
"Average output tokens": 3474 | |
} | |
}, | |
"Qwen2-0.5B-Instruct": { | |
"META": { | |
"Algorithm": "SC-CoT", | |
"LLM": "Qwen2-0.5B-Instruct", | |
"Eval Date": "2025/1/22" | |
}, | |
"gsm8k": { | |
"Score": 1.67, | |
"Pass rate": 0.9469, | |
"Cost($)": 0.0, | |
"Framework": "", | |
"X-shot": "8", | |
"Samples": 1319, | |
"All tokens": 16465720, | |
"Total input tokens": 11019864, | |
"Average input tokens": 8355, | |
"Total output tokens": 5445856, | |
"Average output tokens": 4129 | |
}, | |
"AQuA": { | |
"Score": 22.83, | |
"Pass rate": 0.9724, | |
"Cost($)": 0.0, | |
"Framework": "", | |
"X-shot": "0", | |
"Samples": 254, | |
"All tokens": 2215091, | |
"Total input tokens": 1246929, | |
"Average input tokens": 4909, | |
"Total output tokens": 968162, | |
"Average output tokens": 3812 | |
}, | |
"MATH-500": { | |
"Score": 0.8, | |
"Pass rate": 1.0, | |
"Cost($)": 0.0, | |
"Framework": "", | |
"X-shot": "4", | |
"Samples": 500, | |
"All tokens": 6862056, | |
"Total input tokens": 4448663, | |
"Average input tokens": 8897, | |
"Total output tokens": 2413393, | |
"Average output tokens": 4827 | |
} | |
}, | |
"deepseek-r1:1.5b": { | |
"META": { | |
"Algorithm": "SC-CoT", | |
"LLM": "deepseek-r1:1.5b", | |
"Eval Date": "2025/2/10" | |
}, | |
"gsm8k": { | |
"Score": 55.34, | |
"Pass rate": 0.997, | |
"Cost($)": 0.0, | |
"Framework": "", | |
"X-shot": "8", | |
"Samples": 1319, | |
"All tokens": 25785865, | |
"Total input tokens": 14540096, | |
"Average input tokens": 11024, | |
"Total output tokens": 11245769, | |
"Average output tokens": 8526 | |
}, | |
"AQuA": { | |
"Score": 59.06, | |
"Pass rate": 0.9685, | |
"Cost($)": 0.0, | |
"Framework": "", | |
"X-shot": "0", | |
"Samples": 254, | |
"All tokens": 5802711, | |
"Total input tokens": 2547772, | |
"Average input tokens": 10031, | |
"Total output tokens": 3254939, | |
"Average output tokens": 12815 | |
}, | |
"MATH-500": { | |
"Score": 38.0, | |
"Pass rate": 1.0, | |
"Cost($)": 0.0, | |
"Framework": "", | |
"X-shot": "4", | |
"Samples": 500, | |
"All tokens": 14742109, | |
"Total input tokens": 7080559, | |
"Average input tokens": 14161, | |
"Total output tokens": 7661550, | |
"Average output tokens": 15323 | |
} | |
} | |
}, | |
"ToT": { | |
"gpt-3.5-turbo": { | |
"META": { | |
"Algorithm": "ToT", | |
"LLM": "gpt-3.5-turbo", | |
"Eval Date": "2025/1/7" | |
}, | |
"gsm8k": { | |
"Score": 67.93, | |
"Pass rate": 0.997, | |
"Cost($)": 9.1707, | |
"Framework": "", | |
"X-shot": "8", | |
"Samples": 1319, | |
"All tokens": 16727175, | |
"Total input tokens": 15920037, | |
"Average input tokens": 12070, | |
"Total output tokens": 807138, | |
"Average output tokens": 612 | |
}, | |
"AQuA": { | |
"Score": 57.09, | |
"Pass rate": 0.9961, | |
"Cost($)": 1.1513, | |
"Framework": "", | |
"X-shot": "0", | |
"Samples": 254, | |
"All tokens": 2001396, | |
"Total input tokens": 1850767, | |
"Average input tokens": 7286, | |
"Total output tokens": 150629, | |
"Average output tokens": 593 | |
}, | |
"MATH-500": { | |
"Score": 9.8, | |
"Pass rate": 1.0, | |
"Cost($)": 5.2914, | |
"Framework": "", | |
"X-shot": "4", | |
"Samples": 500, | |
"All tokens": 10001767, | |
"Total input tokens": 9711244, | |
"Average input tokens": 19422, | |
"Total output tokens": 290523, | |
"Average output tokens": 581 | |
} | |
}, | |
"Doubao-lite-32k": { | |
"META": { | |
"Algorithm": "ToT", | |
"LLM": "Doubao-lite-32k", | |
"Eval Date": "2025/1/7" | |
}, | |
"gsm8k": { | |
"Score": 37.83, | |
"Pass rate": 0.8734, | |
"Cost($)": 0.8739, | |
"Framework": "", | |
"X-shot": "8", | |
"Samples": 1319, | |
"All tokens": 20274349, | |
"Total input tokens": 19208597, | |
"Average input tokens": 14563, | |
"Total output tokens": 1065752, | |
"Average output tokens": 808 | |
}, | |
"AQuA": { | |
"Score": 45.28, | |
"Pass rate": 0.7402, | |
"Cost($)": 0.0881, | |
"Framework": "", | |
"X-shot": "0", | |
"Samples": 254, | |
"All tokens": 2000550, | |
"Total input tokens": 1850249, | |
"Average input tokens": 7284, | |
"Total output tokens": 150301, | |
"Average output tokens": 592 | |
}, | |
"MATH-500": { | |
"Score": 1.2, | |
"Pass rate": 0.942, | |
"Cost($)": 0.2371, | |
"Framework": "", | |
"X-shot": "4", | |
"Samples": 500, | |
"All tokens": 5564500, | |
"Total input tokens": 5338500, | |
"Average input tokens": 10677, | |
"Total output tokens": 226000, | |
"Average output tokens": 452 | |
} | |
}, | |
"gpt-4o": { | |
"META": { | |
"Algorithm": "ToT", | |
"LLM": "gpt-4o", | |
"Eval Date": "2025/1/22" | |
}, | |
"gsm8k": { | |
"Score": 91.13, | |
"Pass rate": 1.0, | |
"Cost($)": 86.8581, | |
"Framework": "", | |
"X-shot": "8", | |
"Samples": 1319, | |
"All tokens": 30769735, | |
"Total input tokens": 29445237, | |
"Average input tokens": 22324, | |
"Total output tokens": 1324498, | |
"Average output tokens": 1004 | |
}, | |
"AQuA": { | |
"Score": 81.5, | |
"Pass rate": 0.9921, | |
"Cost($)": 8.5295, | |
"Framework": "", | |
"X-shot": "0", | |
"Samples": 254, | |
"All tokens": 2613607, | |
"Total input tokens": 2347538, | |
"Average input tokens": 9242, | |
"Total output tokens": 266069, | |
"Average output tokens": 1048 | |
}, | |
"MATH-500": { | |
"Score": 3.2, | |
"Pass rate": 1.0, | |
"Cost($)": 40.8094, | |
"Framework": "", | |
"X-shot": "4", | |
"Samples": 500, | |
"All tokens": 15242432, | |
"Total input tokens": 14881985, | |
"Average input tokens": 29764, | |
"Total output tokens": 360447, | |
"Average output tokens": 721 | |
} | |
}, | |
"Qwen2.5-72B-Instruct": { | |
"META": { | |
"Algorithm": "ToT", | |
"LLM": "Qwen2.5-72B-Instruct", | |
"Eval Date": "2025/1/22" | |
}, | |
"gsm8k": { | |
"Score": 88.88, | |
"Pass rate": 1.0, | |
"Cost($)": 23.5911, | |
"Framework": "", | |
"X-shot": "8", | |
"Samples": 1319, | |
"All tokens": 41847148, | |
"Total input tokens": 40435361, | |
"Average input tokens": 30656, | |
"Total output tokens": 1411787, | |
"Average output tokens": 1070 | |
}, | |
"AQuA": { | |
"Score": 81.1, | |
"Pass rate": 0.9921, | |
"Cost($)": 3.7389, | |
"Framework": "", | |
"X-shot": "0", | |
"Samples": 254, | |
"All tokens": 6632255, | |
"Total input tokens": 6371642, | |
"Average input tokens": 25085, | |
"Total output tokens": 260613, | |
"Average output tokens": 1026 | |
}, | |
"MATH-500": { | |
"Score": 10.8, | |
"Pass rate": 1.0, | |
"Cost($)": 9.0421, | |
"Framework": "", | |
"X-shot": "4", | |
"Samples": 500, | |
"All tokens": 16039361, | |
"Total input tokens": 15657730, | |
"Average input tokens": 31315, | |
"Total output tokens": 381631, | |
"Average output tokens": 763 | |
} | |
}, | |
"Llama-3.3-70B-Instruct": { | |
"META": { | |
"Algorithm": "ToT", | |
"LLM": "Llama-3.3-70B-Instruct", | |
"Eval Date": "2025/1/22" | |
}, | |
"gsm8k": { | |
"Score": 91.89, | |
"Pass rate": 1.0, | |
"Cost($)": 20.8753, | |
"Framework": "", | |
"X-shot": "8", | |
"Samples": 1319, | |
"All tokens": 37029687, | |
"Total input tokens": 35096810, | |
"Average input tokens": 26609, | |
"Total output tokens": 1932877, | |
"Average output tokens": 1465 | |
}, | |
"AQuA": { | |
"Score": 83.07, | |
"Pass rate": 1.0, | |
"Cost($)": 2.9404, | |
"Framework": "", | |
"X-shot": "0", | |
"Samples": 254, | |
"All tokens": 5215848, | |
"Total input tokens": 4735188, | |
"Average input tokens": 18642, | |
"Total output tokens": 480660, | |
"Average output tokens": 1892 | |
}, | |
"MATH-500": { | |
"Score": 1.4, | |
"Pass rate": 0.698, | |
"Cost($)": 8.2699, | |
"Framework": "", | |
"X-shot": "4", | |
"Samples": 500, | |
"All tokens": 14669500, | |
"Total input tokens": 14099500, | |
"Average input tokens": 28199, | |
"Total output tokens": 570000, | |
"Average output tokens": 1140 | |
} | |
}, | |
"Qwen2.5-7B-Instruct": { | |
"META": { | |
"Algorithm": "ToT", | |
"LLM": "Qwen2.5-7B-Instruct", | |
"Eval Date": "2025/1/22" | |
}, | |
"gsm8k": { | |
"Score": 72.21, | |
"Pass rate": 0.9901, | |
"Cost($)": 0.0, | |
"Framework": "", | |
"X-shot": "8", | |
"Samples": 1319, | |
"All tokens": 31657319, | |
"Total input tokens": 20196528, | |
"Average input tokens": 15312, | |
"Total output tokens": 11460791, | |
"Average output tokens": 8689 | |
}, | |
"AQuA": { | |
"Score": 53.94, | |
"Pass rate": 1.0, | |
"Cost($)": 0.0, | |
"Framework": "", | |
"X-shot": "0", | |
"Samples": 254, | |
"All tokens": 8602682, | |
"Total input tokens": 8224468, | |
"Average input tokens": 32380, | |
"Total output tokens": 378214, | |
"Average output tokens": 1489 | |
}, | |
"MATH-500": { | |
"Score": 1.4, | |
"Pass rate": 0.916, | |
"Cost($)": 0.0, | |
"Framework": "", | |
"X-shot": "4", | |
"Samples": 500, | |
"All tokens": 10167500, | |
"Total input tokens": 9749000, | |
"Average input tokens": 19498, | |
"Total output tokens": 418500, | |
"Average output tokens": 837 | |
} | |
}, | |
"Llama-3.1-8B-Instruct": { | |
"META": { | |
"Algorithm": "ToT", | |
"LLM": "Llama-3.1-8B-Instruct", | |
"Eval Date": "2025/1/22" | |
}, | |
"gsm8k": { | |
"Score": 65.05, | |
"Pass rate": 0.9196, | |
"Cost($)": 0.0, | |
"Framework": "", | |
"X-shot": "8", | |
"Samples": 1319, | |
"All tokens": 16432102, | |
"Total input tokens": 15554967, | |
"Average input tokens": 11793, | |
"Total output tokens": 877135, | |
"Average output tokens": 665 | |
}, | |
"AQuA": { | |
"Score": 59.06, | |
"Pass rate": 1.0, | |
"Cost($)": 0.0, | |
"Framework": "", | |
"X-shot": "0", | |
"Samples": 254, | |
"All tokens": 5739684, | |
"Total input tokens": 4896222, | |
"Average input tokens": 19276, | |
"Total output tokens": 843462, | |
"Average output tokens": 3321 | |
}, | |
"MATH-500": { | |
"Score": 1.8, | |
"Pass rate": 0.908, | |
"Cost($)": 0.0, | |
"Framework": "", | |
"X-shot": "4", | |
"Samples": 500, | |
"All tokens": 9035000, | |
"Total input tokens": 7729000, | |
"Average input tokens": 15458, | |
"Total output tokens": 1306000, | |
"Average output tokens": 2612 | |
} | |
}, | |
"Internllm2_5-7B": { | |
"META": { | |
"Algorithm": "ToT", | |
"LLM": "Internllm2_5-7B", | |
"Eval Date": "2025/1/22" | |
}, | |
"gsm8k": { | |
"Score": 20.85, | |
"Pass rate": 0.7013, | |
"Cost($)": 0.0, | |
"Framework": "", | |
"X-shot": "8", | |
"Samples": 1319, | |
"All tokens": 13178129, | |
"Total input tokens": 11768118, | |
"Average input tokens": 8922, | |
"Total output tokens": 1410011, | |
"Average output tokens": 1069 | |
}, | |
"AQuA": { | |
"Score": 35.83, | |
"Pass rate": 0.9961, | |
"Cost($)": 0.0, | |
"Framework": "", | |
"X-shot": "0", | |
"Samples": 254, | |
"All tokens": 4734560, | |
"Total input tokens": 4263136, | |
"Average input tokens": 16784, | |
"Total output tokens": 471424, | |
"Average output tokens": 1856 | |
}, | |
"MATH-500": { | |
"Score": 0.2, | |
"Pass rate": 0.99, | |
"Cost($)": 0.0, | |
"Framework": "", | |
"X-shot": "4", | |
"Samples": 500, | |
"All tokens": 8350500, | |
"Total input tokens": 7515000, | |
"Average input tokens": 15030, | |
"Total output tokens": 835500, | |
"Average output tokens": 1671 | |
} | |
}, | |
"Qwen2-1.5B-Instruct": { | |
"META": { | |
"Algorithm": "ToT", | |
"LLM": "Qwen2-1.5B-Instruct", | |
"Eval Date": "2025/1/22" | |
}, | |
"gsm8k": { | |
"Score": 19.64, | |
"Pass rate": 0.7726, | |
"Cost($)": 0.0, | |
"Framework": "", | |
"X-shot": "8", | |
"Samples": 1319, | |
"All tokens": 12758687, | |
"Total input tokens": 12124248, | |
"Average input tokens": 9192, | |
"Total output tokens": 634439, | |
"Average output tokens": 481 | |
}, | |
"AQuA": { | |
"Score": 31.5, | |
"Pass rate": 0.9882, | |
"Cost($)": 0.0, | |
"Framework": "", | |
"X-shot": "0", | |
"Samples": 254, | |
"All tokens": 6250702, | |
"Total input tokens": 6058022, | |
"Average input tokens": 23850, | |
"Total output tokens": 192680, | |
"Average output tokens": 759 | |
}, | |
"MATH-500": { | |
"Score": 0.8, | |
"Pass rate": 0.972, | |
"Cost($)": 0.0, | |
"Framework": "", | |
"X-shot": "4", | |
"Samples": 500, | |
"All tokens": 4535000, | |
"Total input tokens": 4408000, | |
"Average input tokens": 8816, | |
"Total output tokens": 127000, | |
"Average output tokens": 254 | |
} | |
}, | |
"Qwen2-0.5B-Instruct": { | |
"META": { | |
"Algorithm": "ToT", | |
"LLM": "Qwen2-0.5B-Instruct", | |
"Eval Date": "2025/1/22" | |
}, | |
"gsm8k": { | |
"Score": 0, | |
"Pass rate": 0.0, | |
"Cost($)": 0.0, | |
"Framework": "", | |
"X-shot": "8", | |
"Samples": 1319, | |
"All tokens": 0, | |
"Total input tokens": 0, | |
"Average input tokens": 0, | |
"Total output tokens": 0, | |
"Average output tokens": 0 | |
}, | |
"AQuA": { | |
"Score": 29.92, | |
"Pass rate": 1.0, | |
"Cost($)": 0.0, | |
"Framework": "", | |
"X-shot": "0", | |
"Samples": 254, | |
"All tokens": 8700281, | |
"Total input tokens": 8100085, | |
"Average input tokens": 31890, | |
"Total output tokens": 600196, | |
"Average output tokens": 2363 | |
}, | |
"MATH-500": { | |
"Score": 0.0, | |
"Pass rate": 0.962, | |
"Cost($)": 0.0, | |
"Framework": "", | |
"X-shot": "4", | |
"Samples": 500, | |
"All tokens": 5996500, | |
"Total input tokens": 5590500, | |
"Average input tokens": 11181, | |
"Total output tokens": 406000, | |
"Average output tokens": 812 | |
} | |
}, | |
"deepseek-r1:1.5b": { | |
"META": { | |
"Algorithm": "ToT", | |
"LLM": "deepseek-r1:1.5b", | |
"Eval Date": "2025/2/10" | |
}, | |
"gsm8k": { | |
"Score": 23.12, | |
"Pass rate": 0.7248, | |
"Cost($)": 0.0, | |
"Framework": "", | |
"X-shot": "8", | |
"Samples": 1319, | |
"All tokens": 3421486, | |
"Total input tokens": 2738244, | |
"Average input tokens": 2076, | |
"Total output tokens": 683242, | |
"Average output tokens": 518 | |
}, | |
"AQuA": { | |
"Score": 24.8, | |
"Pass rate": 0.5551, | |
"Cost($)": 0.0, | |
"Framework": "", | |
"X-shot": "0", | |
"Samples": 254, | |
"All tokens": 794512, | |
"Total input tokens": 605028, | |
"Average input tokens": 2382, | |
"Total output tokens": 189484, | |
"Average output tokens": 746 | |
}, | |
"MATH-500": { | |
"Score": 0.4, | |
"Pass rate": 0.716, | |
"Cost($)": 0.0, | |
"Framework": "", | |
"X-shot": "4", | |
"Samples": 500, | |
"All tokens": 1941500, | |
"Total input tokens": 1831000, | |
"Average input tokens": 3662, | |
"Total output tokens": 110500, | |
"Average output tokens": 221 | |
} | |
} | |
} | |
} | |
} |