[ { "config": { "model_name": "ChatGPT-4o-latest (2024-09-03)", "organization": "OpenAI", "license": "Proprietary", "knowledge_cutoff": "2023/10" }, "results": { "OVERALL": { "Average Score": 94.49771627042422, "Standard Deviation": 0.251607817784525, "Rank": 4 }, "Geometry": { "Average Score": 81.11505705795187, "Standard Deviation": null, "Rank": 6 }, "Algebra": { "Average Score": 91.79122001491199, "Standard Deviation": null, "Rank": 8 }, "Probability": { "Average Score": 88.00190397870577, "Standard Deviation": null, "Rank": 4 }, "Logical": { "Average Score": 97.47223448912972, "Standard Deviation": null, "Rank": 2 }, "Social": { "Average Score": 89.73262585993845, "Standard Deviation": null, "Rank": 7 }, "Chemistry": { "Average Score": 90.48070030738856, "Standard Deviation": null, "Rank": 3 }, "CPP": { "Average Score": 100.0, "Standard Deviation": null, "Rank": 1 }, "Physics": { "Average Score": 99.7043774383865, "Standard Deviation": null, "Rank": 2 }, "Biology": { "Average Score": 95.98449860487872, "Standard Deviation": null, "Rank": 3 } } }, { "config": { "model_name": "gpt-4o-2024-08-06", "organization": "OpenAI", "license": "Proprietary", "knowledge_cutoff": "2023/10" }, "results": { "OVERALL": { "Average Score": 83.33484787198124, "Standard Deviation": 3.0334254138998893, "Rank": 12 }, "Geometry": { "Average Score": 85.73211137938175, "Standard Deviation": null, "Rank": 2 }, "Algebra": { "Average Score": 95.29454759516874, "Standard Deviation": null, "Rank": 5 }, "Probability": { "Average Score": 80.9483280228488, "Standard Deviation": null, "Rank": 7 }, "Logical": { "Average Score": 78.93507998348575, "Standard Deviation": null, "Rank": 12 }, "Social": { "Average Score": 78.21553692695771, "Standard Deviation": null, "Rank": 11 }, "Chemistry": { "Average Score": 79.46337310221962, "Standard Deviation": null, "Rank": 9 }, "CPP": { "Average Score": 92.43090226400756, "Standard Deviation": null, "Rank": 2 }, "Physics": { "Average Score": 92.63882355350016, "Standard Deviation": null, "Rank": 6 }, "Biology": { "Average Score": 79.88713500945879, "Standard Deviation": null, "Rank": 14 } } }, { "config": { "model_name": "gpt-4o-2024-05-13", "organization": "OpenAI", "license": "Proprietary", "knowledge_cutoff": "2023/10" }, "results": { "OVERALL": { "Average Score": 89.31218128337491, "Standard Deviation": 0.5511990686487255, "Rank": 8 }, "Geometry": { "Average Score": 81.70458958633901, "Standard Deviation": null, "Rank": 4 }, "Algebra": { "Average Score": 90.16488595415144, "Standard Deviation": null, "Rank": 9 }, "Probability": { "Average Score": 83.8098272382245, "Standard Deviation": null, "Rank": 5 }, "Logical": { "Average Score": 88.2742970015626, "Standard Deviation": null, "Rank": 9 }, "Social": { "Average Score": 71.51855733216095, "Standard Deviation": null, "Rank": 15 }, "Chemistry": { "Average Score": 84.0147961443266, "Standard Deviation": null, "Rank": 7 }, "CPP": { "Average Score": 79.1592634699295, "Standard Deviation": null, "Rank": 6 }, "Physics": { "Average Score": 96.44583156689123, "Standard Deviation": null, "Rank": 3 }, "Biology": { "Average Score": 86.17947030919935, "Standard Deviation": null, "Rank": 10 } } }, { "config": { "model_name": "gpt-4-turbo-2024-04-09", "organization": "OpenAI", "license": "Proprietary", "knowledge_cutoff": "2023/12" }, "results": { "OVERALL": { "Average Score": 89.20222265636137, "Standard Deviation": 0.9498836008363539, "Rank": 9 }, "Geometry": { "Average Score": 77.90202019775627, "Standard Deviation": null, "Rank": 8 }, 
"Algebra": { "Average Score": 84.83537307564205, "Standard Deviation": null, "Rank": 12 }, "Probability": { "Average Score": 80.01448545719413, "Standard Deviation": null, "Rank": 9 }, "Logical": { "Average Score": 89.63955736396734, "Standard Deviation": null, "Rank": 8 }, "Social": { "Average Score": 77.25088451567024, "Standard Deviation": null, "Rank": 12 }, "Chemistry": { "Average Score": 78.97054235015905, "Standard Deviation": null, "Rank": 11 }, "CPP": { "Average Score": 70.73143363230263, "Standard Deviation": null, "Rank": 11 }, "Physics": { "Average Score": 90.33497346058968, "Standard Deviation": null, "Rank": 7 }, "Biology": { "Average Score": 86.17949760404831, "Standard Deviation": null, "Rank": 9 } } }, { "config": { "model_name": "gemini-1.5-pro-001", "organization": "Google", "license": "Proprietary", "knowledge_cutoff": "2023/11" }, "results": { "OVERALL": { "Average Score": 82.91139866415075, "Standard Deviation": 3.013751980804677, "Rank": 13 }, "Geometry": { "Average Score": 83.6654007694722, "Standard Deviation": null, "Rank": 3 }, "Algebra": { "Average Score": 98.84487439119522, "Standard Deviation": null, "Rank": 3 }, "Probability": { "Average Score": 75.94594518060929, "Standard Deviation": null, "Rank": 13 }, "Logical": { "Average Score": 78.89834475831927, "Standard Deviation": null, "Rank": 14 }, "Social": { "Average Score": 78.21569899283614, "Standard Deviation": null, "Rank": 10 }, "Physics": { "Average Score": 88.41290613720335, "Standard Deviation": null, "Rank": 10 }, "Biology": { "Average Score": 86.45347978614136, "Standard Deviation": null, "Rank": 8 } } }, { "config": { "model_name": "qwen2-72b-instruct", "organization": "Alibaba", "license": "Qianwen LICENSE", "knowledge_cutoff": "2024/09" }, "results": { "OVERALL": { "Average Score": 80.51855735113782, "Standard Deviation": 2.389693257324127, "Rank": 15 }, "Geometry": { "Average Score": 68.80768467173304, "Standard Deviation": null, "Rank": 13 }, "Algebra": { "Average Score": 95.86210030199506, "Standard Deviation": null, "Rank": 4 }, "Probability": { "Average Score": 82.29702731445691, "Standard Deviation": null, "Rank": 6 }, "Logical": { "Average Score": 73.55135235722557, "Standard Deviation": null, "Rank": 19 }, "Social": { "Average Score": 57.41502695932332, "Standard Deviation": null, "Rank": 19 }, "Chemistry": { "Average Score": 75.8879803782176, "Standard Deviation": null, "Rank": 13 }, "CPP": { "Average Score": 73.54037778797029, "Standard Deviation": null, "Rank": 7 }, "Physics": { "Average Score": 82.02738090295061, "Standard Deviation": null, "Rank": 16 }, "Biology": { "Average Score": 66.99838962851355, "Standard Deviation": null, "Rank": 22 } } }, { "config": { "model_name": "gpt-4o-mini-2024-07-18", "organization": "OpenAI", "license": "Proprietary", "knowledge_cutoff": "2023/10" }, "results": { "OVERALL": { "Average Score": 85.79551424780102, "Standard Deviation": 2.25059599602412, "Rank": 11 }, "Geometry": { "Average Score": 78.03415885586699, "Standard Deviation": null, "Rank": 7 }, "Algebra": { "Average Score": 90.10621818673319, "Standard Deviation": null, "Rank": 10 }, "Probability": { "Average Score": 80.94824796859724, "Standard Deviation": null, "Rank": 8 }, "Logical": { "Average Score": 86.1004659652016, "Standard Deviation": null, "Rank": 10 }, "Social": { "Average Score": 74.20253943841105, "Standard Deviation": null, "Rank": 13 }, "Chemistry": { "Average Score": 75.44768883899778, "Standard Deviation": null, "Rank": 15 }, "CPP": { "Average Score": 88.3877070580296, 
"Standard Deviation": null, "Rank": 3 }, "Physics": { "Average Score": 90.33492089386435, "Standard Deviation": null, "Rank": 8 }, "Biology": { "Average Score": 79.03781031583883, "Standard Deviation": null, "Rank": 15 } } }, { "config": { "model_name": "claude-3.5-sonnet", "organization": "Anthropic", "license": "Proprietary", "knowledge_cutoff": "2024/04" }, "results": { "OVERALL": { "Average Score": 90.30644459276833, "Standard Deviation": 0.6105034066546057, "Rank": 7 }, "Geometry": { "Average Score": 72.63402106402285, "Standard Deviation": null, "Rank": 12 }, "Algebra": { "Average Score": 83.32075177480141, "Standard Deviation": null, "Rank": 14 }, "Probability": { "Average Score": 76.7319625254773, "Standard Deviation": null, "Rank": 11 }, "Logical": { "Average Score": 90.00404188010565, "Standard Deviation": null, "Rank": 7 }, "Social": { "Average Score": 99.89849499454823, "Standard Deviation": null, "Rank": 2 }, "Chemistry": { "Average Score": 85.86402884262867, "Standard Deviation": null, "Rank": 4 }, "CPP": { "Average Score": 82.37734076815008, "Standard Deviation": null, "Rank": 5 }, "Physics": { "Average Score": 92.83215449096147, "Standard Deviation": null, "Rank": 5 }, "Biology": { "Average Score": 85.76627192038262, "Standard Deviation": null, "Rank": 11 } } }, { "config": { "model_name": "claude-3.5-sonnet-20241022", "organization": "Anthropic", "license": "Proprietary", "knowledge_cutoff": "UNKNOW" }, "results": { "OVERALL": { "Average Score": 81.7399750668719, "Standard Deviation": 6.158375141726245, "Rank": 14 }, "Geometry": { "Average Score": 72.63581025178527, "Standard Deviation": null, "Rank": 11 }, "Algebra": { "Average Score": 89.50323347048936, "Standard Deviation": null, "Rank": 11 }, "Probability": { "Average Score": 73.919, "Standard Deviation": null, "Rank": 13 }, "Logical": { "Average Score": 90.514, "Standard Deviation": null, "Rank": 7 }, "Social": { "Average Score": 84.505, "Standard Deviation": null, "Rank": 7 }, "Chemistry": { "Average Score": 85.15970597010583, "Standard Deviation": null, "Rank": 6 }, "Physics": { "Average Score": 100.0, "Standard Deviation": null, "Rank": 1 }, "Biology": { "Average Score": 85.56526806360797, "Standard Deviation": null, "Rank": 12 } } }, { "config": { "model_name": "o1-mini", "organization": "OpenAI", "license": "Proprietary", "knowledge_cutoff": "2023/10" }, "results": { "OVERALL": { "Average Score": 97.50448224920098, "Standard Deviation": 0.18820973784944708, "Rank": 2 }, "Geometry": { "Average Score": 100.0, "Standard Deviation": null, "Rank": 1 }, "Algebra": { "Average Score": 100.0, "Standard Deviation": null, "Rank": 1 }, "Probability": { "Average Score": 100.0, "Standard Deviation": null, "Rank": 1 }, "Logical": { "Average Score": 96.62093396445893, "Standard Deviation": null, "Rank": 3 }, "Social": { "Average Score": 98.93701302706319, "Standard Deviation": null, "Rank": 4 }, "Chemistry": { "Average Score": 93.52027415963765, "Standard Deviation": null, "Rank": 2 }, "Biology": { "Average Score": 99.9210788257773, "Standard Deviation": null, "Rank": 2 } } }, { "config": { "model_name": "o1-preview", "organization": "OpenAI", "license": "Proprietary", "knowledge_cutoff": "2023/10" }, "results": { "OVERALL": { "Average Score": 92.95670511909181, "Standard Deviation": 0.26193636312885404, "Rank": 5 }, "Geometry": { "Average Score": 81.70453162182778, "Standard Deviation": null, "Rank": 5 }, "Algebra": { "Average Score": 99.2204666813678, "Standard Deviation": null, "Rank": 2 }, "Probability": { "Average Score": 
96.11141903959506, "Standard Deviation": null, "Rank": 2 }, "Logical": { "Average Score": 100.0, "Standard Deviation": null, "Rank": 1 }, "Social": { "Average Score": 99.35681400812317, "Standard Deviation": null, "Rank": 3 }, "Biology": { "Average Score": 100.0, "Standard Deviation": null, "Rank": 1 } } }, { "config": { "model_name": "gemini-1.5-flash-001", "organization": "Google", "license": "Proprietary", "knowledge_cutoff": "2023/11" }, "results": { "OVERALL": { "Average Score": 63.90738369106308, "Standard Deviation": 2.5840022803072342, "Rank": 20 }, "Geometry": { "Average Score": 62.78784730869374, "Standard Deviation": null, "Rank": 16 }, "Algebra": { "Average Score": 84.4516255656167, "Standard Deviation": null, "Rank": 13 }, "Probability": { "Average Score": 71.21668893483972, "Standard Deviation": null, "Rank": 15 }, "Logical": { "Average Score": 73.55137041991937, "Standard Deviation": null, "Rank": 17 }, "Social": { "Average Score": 71.51839473022034, "Standard Deviation": null, "Rank": 16 }, "Chemistry": { "Average Score": 78.9281328399534, "Standard Deviation": null, "Rank": 12 }, "CPP": { "Average Score": 72.1127762005651, "Standard Deviation": null, "Rank": 10 }, "Physics": { "Average Score": 86.21163726768592, "Standard Deviation": null, "Rank": 14 }, "Biology": { "Average Score": 77.50881946688955, "Standard Deviation": null, "Rank": 16 } } }, { "config": { "model_name": "gpt4-1106", "organization": "OpenAI", "license": "Proprietary", "knowledge_cutoff": "2023/04" }, "results": { "OVERALL": { "Average Score": 88.08481721079524, "Standard Deviation": 1.4421920877285703, "Rank": 10 }, "Geometry": { "Average Score": 59.2110329866853, "Standard Deviation": null, "Rank": 17 }, "Algebra": { "Average Score": 80.79050620153212, "Standard Deviation": null, "Rank": 15 }, "Probability": { "Average Score": 74.36123524515216, "Standard Deviation": null, "Rank": 14 }, "Logical": { "Average Score": 77.02518347398768, "Standard Deviation": null, "Rank": 15 }, "Social": { "Average Score": 51.13078063545894, "Standard Deviation": null, "Rank": 25 }, "Chemistry": { "Average Score": 72.4125941071821, "Standard Deviation": null, "Rank": 16 }, "CPP": { "Average Score": 69.11824072252848, "Standard Deviation": null, "Rank": 12 }, "Physics": { "Average Score": 87.0543996394885, "Standard Deviation": null, "Rank": 13 }, "Biology": { "Average Score": 82.36213636857161, "Standard Deviation": null, "Rank": 13 } } }, { "config": { "model_name": "gemma-2-27b-it", "organization": "Google", "license": "Gemma License", "knowledge_cutoff": "2024/06" }, "results": { "OVERALL": { "Average Score": 70.59188609288081, "Standard Deviation": 8.717841670213112, "Rank": 19 }, "Geometry": { "Average Score": 58.00008857041582, "Standard Deviation": null, "Rank": 19 }, "Algebra": { "Average Score": 77.82927803658924, "Standard Deviation": null, "Rank": 19 }, "Probability": { "Average Score": 69.63382706259532, "Standard Deviation": null, "Rank": 18 }, "Logical": { "Average Score": 73.55136762438677, "Standard Deviation": null, "Rank": 18 }, "Social": { "Average Score": 57.17847568664103, "Standard Deviation": null, "Rank": 20 }, "Chemistry": { "Average Score": 68.65449070488427, "Standard Deviation": null, "Rank": 20 }, "CPP": { "Average Score": 63.28920072143611, "Standard Deviation": null, "Rank": 14 }, "Physics": { "Average Score": 76.8395150041688, "Standard Deviation": null, "Rank": 19 }, "Biology": { "Average Score": 66.99846220210911, "Standard Deviation": null, "Rank": 21 } } }, { "config": { "model_name":
"claude-3-opus", "organization": "Anthropic", "license": "Proprietary", "knowledge_cutoff": "2023/08" }, "results": { "OVERALL": { "Average Score": 79.77338364506384, "Standard Deviation": 2.32886155429398, "Rank": 16 }, "Geometry": { "Average Score": 57.5200576513199, "Standard Deviation": null, "Rank": 20 }, "Algebra": { "Average Score": 76.89230078890219, "Standard Deviation": null, "Rank": 20 }, "Probability": { "Average Score": 71.20578106177237, "Standard Deviation": null, "Rank": 16 }, "Logical": { "Average Score": 78.93505058041774, "Standard Deviation": null, "Rank": 13 }, "Social": { "Average Score": 88.40491896661747, "Standard Deviation": null, "Rank": 8 }, "Chemistry": { "Average Score": 79.0571776580065, "Standard Deviation": null, "Rank": 10 }, "CPP": { "Average Score": 73.5404403567132, "Standard Deviation": null, "Rank": 8 }, "Physics": { "Average Score": 87.28118117714033, "Standard Deviation": null, "Rank": 12 }, "Biology": { "Average Score": 71.23527633371832, "Standard Deviation": null, "Rank": 20 } } }, { "config": { "model_name": "gemma-2-9b-it-simpo", "organization": "Google", "license": "Gemma License", "knowledge_cutoff": "2024/07" }, "results": { "OVERALL": { "Average Score": "N/A", "Standard Deviation": "N/A", "Rank": "N/A" }, "Geometry": { "Average Score": 57.520011750672175, "Standard Deviation": null, "Rank": 21 }, "Algebra": { "Average Score": 72.3731046476544, "Standard Deviation": null, "Rank": 21 }, "Probability": { "Average Score": 61.79614379365174, "Standard Deviation": null, "Rank": 22 }, "Logical": { "Average Score": 64.62661472571767, "Standard Deviation": null, "Rank": 23 }, "Social": { "Average Score": 87.65488278831526, "Standard Deviation": null, "Rank": 9 }, "Chemistry": { "Average Score": 85.36850564169866, "Standard Deviation": null, "Rank": 5 }, "CPP": { "Average Score": 73.43757596214863, "Standard Deviation": null, "Rank": 9 }, "Physics": { "Average Score": 82.02727994935249, "Standard Deviation": null, "Rank": 17 }, "Biology": { "Average Score": 88.80821937078267, "Standard Deviation": null, "Rank": 7 } } }, { "config": { "model_name": "qwen1.5-72b-chat", "organization": "Alibaba", "license": "Qianwen LICENSE", "knowledge_cutoff": "2024/03" }, "results": { "OVERALL": { "Average Score": 61.57517122936127, "Standard Deviation": 5.01096656930536, "Rank": 21 }, "Geometry": { "Average Score": 49.36591842356095, "Standard Deviation": null, "Rank": 28 }, "Algebra": { "Average Score": 71.12615153442515, "Standard Deviation": null, "Rank": 22 }, "Probability": { "Average Score": 51.76027345875035, "Standard Deviation": null, "Rank": 28 }, "Logical": { "Average Score": 34.74438889550426, "Standard Deviation": null, "Rank": 39 }, "Social": { "Average Score": 47.47112348597555, "Standard Deviation": null, "Rank": 27 }, "Chemistry": { "Average Score": 51.65772092991593, "Standard Deviation": null, "Rank": 25 }, "CPP": { "Average Score": 48.69302376665551, "Standard Deviation": null, "Rank": 20 }, "Physics": { "Average Score": 62.45893584822384, "Standard Deviation": null, "Rank": 27 }, "Biology": { "Average Score": 56.96571500324531, "Standard Deviation": null, "Rank": 27 } } }, { "config": { "model_name": "qwen1.5-32b-chat", "organization": "Alibaba", "license": "Qianwen LICENSE", "knowledge_cutoff": "2024/03" }, "results": { "OVERALL": { "Average Score": 45.3199699974334, "Standard Deviation": 3.7527776450894996, "Rank": 31 }, "Geometry": { "Average Score": 45.66389348479106, "Standard Deviation": null, "Rank": 30 }, "Algebra": { "Average Score": 
64.9403510842088, "Standard Deviation": null, "Rank": 25 }, "Probability": { "Average Score": 51.99376831114535, "Standard Deviation": null, "Rank": 27 }, "Logical": { "Average Score": 39.30230377209954, "Standard Deviation": null, "Rank": 36 }, "Social": { "Average Score": 45.679222078247186, "Standard Deviation": null, "Rank": 28 }, "Chemistry": { "Average Score": 46.41262433996582, "Standard Deviation": null, "Rank": 28 }, "CPP": { "Average Score": 45.14284028264288, "Standard Deviation": null, "Rank": 24 }, "Physics": { "Average Score": 65.80533740982938, "Standard Deviation": null, "Rank": 25 }, "Biology": { "Average Score": 50.767985684362536, "Standard Deviation": null, "Rank": 33 } } }, { "config": { "model_name": "google-gemma-2-9b-it", "organization": "Google", "license": "Gemma License", "knowledge_cutoff": "2024/06" }, "results": { "OVERALL": { "Average Score": 59.024943267290716, "Standard Deviation": 3.979239820929726, "Rank": 23 }, "Geometry": { "Average Score": 53.495866814128156, "Standard Deviation": null, "Rank": 24 }, "Algebra": { "Average Score": 65.98776390439404, "Standard Deviation": null, "Rank": 23 }, "Probability": { "Average Score": 65.76699220336998, "Standard Deviation": null, "Rank": 21 }, "Logical": { "Average Score": 71.04386923330611, "Standard Deviation": null, "Rank": 20 }, "Social": { "Average Score": 73.74087367208867, "Standard Deviation": null, "Rank": 14 }, "Chemistry": { "Average Score": 57.074735438190935, "Standard Deviation": null, "Rank": 22 }, "CPP": { "Average Score": 54.03167523687635, "Standard Deviation": null, "Rank": 17 }, "Physics": { "Average Score": 63.03919029129539, "Standard Deviation": null, "Rank": 26 }, "Biology": { "Average Score": 63.18363754826406, "Standard Deviation": null, "Rank": 23 } } }, { "config": { "model_name": "yi-1.5-34b-chat", "organization": "01 AI", "license": "Proprietary", "knowledge_cutoff": "2024/05" }, "results": { "OVERALL": { "Average Score": 71.78031967728624, "Standard Deviation": 12.994861744386325, "Rank": 18 }, "Geometry": { "Average Score": 54.06826621860964, "Standard Deviation": null, "Rank": 23 }, "Algebra": { "Average Score": 65.66679210942144, "Standard Deviation": null, "Rank": 24 }, "Probability": { "Average Score": 66.46858903563573, "Standard Deviation": null, "Rank": 20 }, "Logical": { "Average Score": 67.36081192984079, "Standard Deviation": null, "Rank": 21 }, "Social": { "Average Score": 53.898293694371446, "Standard Deviation": null, "Rank": 22 }, "Chemistry": { "Average Score": 56.1520167017115, "Standard Deviation": null, "Rank": 23 }, "CPP": { "Average Score": 52.148798061768964, "Standard Deviation": null, "Rank": 18 }, "Physics": { "Average Score": 73.06547347263036, "Standard Deviation": null, "Rank": 21 }, "Biology": { "Average Score": 72.47949036617567, "Standard Deviation": null, "Rank": 18 } } }, { "config": { "model_name": "meta-llama-3.1-70b-instruct", "organization": "Meta", "license": "Llama 3.1 Community", "knowledge_cutoff": "2023/12" }, "results": { "OVERALL": { "Average Score": 72.88379857527117, "Standard Deviation": 3.7053577253028176, "Rank": 17 }, "Geometry": { "Average Score": 62.78788327507421, "Standard Deviation": null, "Rank": 15 }, "Algebra": { "Average Score": 80.79028754890449, "Standard Deviation": null, "Rank": 16 }, "Probability": { "Average Score": 69.6338691921361, "Standard Deviation": null, "Rank": 17 }, "Logical": { "Average Score": 74.43905975120572, "Standard Deviation": null, "Rank": 16 }, "Social": { "Average Score": 61.22534257022315, "Standard
Deviation": null, "Rank": 18 }, "Chemistry": { "Average Score": 70.9160725889497, "Standard Deviation": null, "Rank": 18 }, "CPP": { "Average Score": 84.36815192532764, "Standard Deviation": null, "Rank": 4 }, "Physics": { "Average Score": 82.02759904132307, "Standard Deviation": null, "Rank": 15 }, "Biology": { "Average Score": 72.47948013923437, "Standard Deviation": null, "Rank": 19 } } }, { "config": { "model_name": "meta-llama-3.1-8b-instruct", "organization": "Meta", "license": "Llama 3.1 Community", "knowledge_cutoff": "2023/12" }, "results": { "OVERALL": { "Average Score": 52.21824740443002, "Standard Deviation": 3.7833302779202937, "Rank": 27 }, "Geometry": { "Average Score": 43.03691891008171, "Standard Deviation": null, "Rank": 32 }, "Algebra": { "Average Score": 64.13661497122277, "Standard Deviation": null, "Rank": 26 }, "Probability": { "Average Score": 55.37882298464668, "Standard Deviation": null, "Rank": 25 }, "Logical": { "Average Score": 53.843773408414144, "Standard Deviation": null, "Rank": 29 }, "Social": { "Average Score": 44.993575656549545, "Standard Deviation": null, "Rank": 30 }, "Chemistry": { "Average Score": 43.98798267082055, "Standard Deviation": null, "Rank": 31 }, "CPP": { "Average Score": 44.41846841004584, "Standard Deviation": null, "Rank": 26 }, "Physics": { "Average Score": 49.65976817230991, "Standard Deviation": null, "Rank": 37 }, "Biology": { "Average Score": 52.132998637966764, "Standard Deviation": null, "Rank": 32 } } }, { "config": { "model_name": "gpt3.5-turbo-0125", "organization": "OpenAI", "license": "Proprietary", "knowledge_cutoff": "2021/09" }, "results": { "OVERALL": { "Average Score": 32.61987548870099, "Standard Deviation": 7.421068133219178, "Rank": 41 }, "Geometry": { "Average Score": 52.43446046073764, "Standard Deviation": null, "Rank": 25 }, "Algebra": { "Average Score": 62.62345918733465, "Standard Deviation": null, "Rank": 27 }, "Probability": { "Average Score": 46.778615832700474, "Standard Deviation": null, "Rank": 30 }, "Logical": { "Average Score": 20.161483818418485, "Standard Deviation": null, "Rank": 48 }, "Social": { "Average Score": 36.005021312700556, "Standard Deviation": null, "Rank": 43 }, "Chemistry": { "Average Score": 41.27375172990709, "Standard Deviation": null, "Rank": 34 }, "CPP": { "Average Score": 40.46958736582551, "Standard Deviation": null, "Rank": 29 }, "Physics": { "Average Score": 53.13517938912883, "Standard Deviation": null, "Rank": 33 }, "Biology": { "Average Score": 40.750963952571375, "Standard Deviation": null, "Rank": 43 } } }, { "config": { "model_name": "llama-3-70b-instruct", "organization": "Meta", "license": "Llama 3 Community", "knowledge_cutoff": "2023/12" }, "results": { "OVERALL": { "Average Score": 58.67095788786492, "Standard Deviation": 3.916500171452786, "Rank": 25 }, "Geometry": { "Average Score": 47.16123420770543, "Standard Deviation": null, "Rank": 29 }, "Algebra": { "Average Score": 62.38398769226985, "Standard Deviation": null, "Rank": 28 }, "Probability": { "Average Score": 57.7568005808253, "Standard Deviation": null, "Rank": 23 }, "Logical": { "Average Score": 84.45551822980201, "Standard Deviation": null, "Rank": 11 }, "Social": { "Average Score": 52.450283668620365, "Standard Deviation": null, "Rank": 23 }, "Chemistry": { "Average Score": 70.91630635362482, "Standard Deviation": null, "Rank": 17 }, "CPP": { "Average Score": 65.32140697218945, "Standard Deviation": null, "Rank": 13 }, "Physics": { "Average Score": 78.08120808341037, "Standard Deviation": null, "Rank": 
18 }, "Biology": { "Average Score": 60.6111504865126, "Standard Deviation": null, "Rank": 25 } } }, { "config": { "model_name": "claude-3-sonnet", "organization": "Anthropic", "license": "Proprietary", "knowledge_cutoff": "2023/08" }, "results": { "OVERALL": { "Average Score": 61.25499749085383, "Standard Deviation": 5.012226129836105, "Rank": 22 }, "Geometry": { "Average Score": 52.4291917862642, "Standard Deviation": null, "Rank": 26 }, "Algebra": { "Average Score": 60.40928261066776, "Standard Deviation": null, "Rank": 29 }, "Probability": { "Average Score": 57.4556182999398, "Standard Deviation": null, "Rank": 24 }, "Logical": { "Average Score": 66.81740129837053, "Standard Deviation": null, "Rank": 22 }, "Social": { "Average Score": 69.99747730347514, "Standard Deviation": null, "Rank": 17 }, "Chemistry": { "Average Score": 68.8316074174692, "Standard Deviation": null, "Rank": 19 }, "CPP": { "Average Score": 61.33538592327427, "Standard Deviation": null, "Rank": 15 }, "Physics": { "Average Score": 75.18056969699853, "Standard Deviation": null, "Rank": 20 }, "Biology": { "Average Score": 77.09610271458331, "Standard Deviation": null, "Rank": 17 } } }, { "config": { "model_name": "qwen1.5-14b-chat", "organization": "Alibaba", "license": "Qianwen LICENSE", "knowledge_cutoff": "2024/02" }, "results": { "OVERALL": { "Average Score": 44.55620746942043, "Standard Deviation": 3.997156497824947, "Rank": 32 }, "Geometry": { "Average Score": 36.7560037779628, "Standard Deviation": null, "Rank": 34 }, "Algebra": { "Average Score": 59.50136116119945, "Standard Deviation": null, "Rank": 30 }, "Probability": { "Average Score": 40.080049006314795, "Standard Deviation": null, "Rank": 35 }, "Logical": { "Average Score": 34.744529623515994, "Standard Deviation": null, "Rank": 38 }, "Social": { "Average Score": 40.62146960769885, "Standard Deviation": null, "Rank": 36 }, "Chemistry": { "Average Score": 38.9739127306118, "Standard Deviation": null, "Rank": 37 }, "CPP": { "Average Score": 38.552779976347026, "Standard Deviation": null, "Rank": 31 }, "Physics": { "Average Score": 57.98313138991904, "Standard Deviation": null, "Rank": 31 }, "Biology": { "Average Score": 45.732215792439575, "Standard Deviation": null, "Rank": 40 } } }, { "config": { "model_name": "claude-3-haiku", "organization": "Anthropic", "license": "Proprietary", "knowledge_cutoff": "2023/08" }, "results": { "OVERALL": { "Average Score": 54.96475677538885, "Standard Deviation": 5.908641649857827, "Rank": 26 }, "Geometry": { "Average Score": 43.48740351644307, "Standard Deviation": null, "Rank": 31 }, "Algebra": { "Average Score": 55.72045911130164, "Standard Deviation": null, "Rank": 33 }, "Probability": { "Average Score": 53.07470665022828, "Standard Deviation": null, "Rank": 26 }, "Logical": { "Average Score": 63.661198382201675, "Standard Deviation": null, "Rank": 24 }, "Social": { "Average Score": 56.49297908205363, "Standard Deviation": null, "Rank": 21 }, "Chemistry": { "Average Score": 60.28485867590517, "Standard Deviation": null, "Rank": 21 }, "CPP": { "Average Score": 56.40200048817984, "Standard Deviation": null, "Rank": 16 }, "Physics": { "Average Score": 67.69802411023282, "Standard Deviation": null, "Rank": 24 }, "Biology": { "Average Score": 60.63801358326118, "Standard Deviation": null, "Rank": 24 } } }, { "config": { "model_name": "claude-2.1", "organization": "Anthropic", "license": "Proprietary", "knowledge_cutoff": "Unknown" }, "results": { "OVERALL": { "Average Score": 39.436633770824685, "Standard Deviation": 
1.0979568551024126, "Rank": 36 }, "Geometry": { "Average Score": 52.12445910303711, "Standard Deviation": null, "Rank": 27 }, "Algebra": { "Average Score": 55.51421646167608, "Standard Deviation": null, "Rank": 34 }, "Probability": { "Average Score": 44.720527688076, "Standard Deviation": null, "Rank": 33 }, "Logical": { "Average Score": 61.64930710809233, "Standard Deviation": null, "Rank": 25 }, "Social": { "Average Score": 41.24714538607354, "Standard Deviation": null, "Rank": 35 }, "Chemistry": { "Average Score": 49.503134730071984, "Standard Deviation": null, "Rank": 26 }, "CPP": { "Average Score": 47.23672563994903, "Standard Deviation": null, "Rank": 21 }, "Physics": { "Average Score": 71.80748688814478, "Standard Deviation": null, "Rank": 22 }, "Biology": { "Average Score": 56.35051024959833, "Standard Deviation": null, "Rank": 28 } } }, { "config": { "model_name": "mixtral-8x7b-instruct-v0.1", "organization": "Mistral", "license": "Apache 2.0", "knowledge_cutoff": "2023/12" }, "results": { "OVERALL": { "Average Score": 41.89229040550289, "Standard Deviation": 1.0093122675555612, "Rank": 33 }, "Geometry": { "Average Score": 33.703560702831055, "Standard Deviation": null, "Rank": 38 }, "Algebra": { "Average Score": 50.89266418264096, "Standard Deviation": null, "Rank": 37 }, "Probability": { "Average Score": 44.763608895327415, "Standard Deviation": null, "Rank": 32 }, "Logical": { "Average Score": 40.32090734088309, "Standard Deviation": null, "Rank": 35 }, "Social": { "Average Score": 36.25120096194333, "Standard Deviation": null, "Rank": 42 }, "Chemistry": { "Average Score": 45.537417249801685, "Standard Deviation": null, "Rank": 29 }, "CPP": { "Average Score": 44.533118241976666, "Standard Deviation": null, "Rank": 25 }, "Physics": { "Average Score": 59.27177919021739, "Standard Deviation": null, "Rank": 29 }, "Biology": { "Average Score": 53.73577835290789, "Standard Deviation": null, "Rank": 29 } } }, { "config": { "model_name": "claude-2.0", "organization": "Anthropic", "license": "Proprietary", "knowledge_cutoff": "Unknown" }, "results": { "OVERALL": { "Average Score": 29.746629448410072, "Standard Deviation": 2.904279782741168, "Rank": 44 }, "Geometry": { "Average Score": 38.83959305205546, "Standard Deviation": null, "Rank": 33 }, "Algebra": { "Average Score": 50.95581898913443, "Standard Deviation": null, "Rank": 36 }, "Probability": { "Average Score": 46.77856061078482, "Standard Deviation": null, "Rank": 31 }, "Logical": { "Average Score": 55.87663184155831, "Standard Deviation": null, "Rank": 28 }, "Social": { "Average Score": 52.418630462591864, "Standard Deviation": null, "Rank": 24 }, "Chemistry": { "Average Score": 54.485802241006866, "Standard Deviation": null, "Rank": 24 }, "CPP": { "Average Score": 50.773143448036464, "Standard Deviation": null, "Rank": 19 }, "Physics": { "Average Score": 70.21815140033613, "Standard Deviation": null, "Rank": 23 }, "Biology": { "Average Score": 58.06960426451617, "Standard Deviation": null, "Rank": 26 } } }, { "config": { "model_name": "starling-lm-7b-beta", "organization": "Nexusflow", "license": "Apache-2.0", "knowledge_cutoff": "2024/03" }, "results": { "OVERALL": { "Average Score": 49.37320778476737, "Standard Deviation": 3.6745696228749076, "Rank": 28 }, "Geometry": { "Average Score": 34.931531551032506, "Standard Deviation": null, "Rank": 37 }, "Algebra": { "Average Score": 51.66718360952931, "Standard Deviation": null, "Rank": 35 }, "Probability": { "Average Score": 40.79623349276488, "Standard Deviation": null, "Rank":
34 }, "Logical": { "Average Score": 47.86775375284415, "Standard Deviation": null, "Rank": 30 }, "Social": { "Average Score": 42.30631821350664, "Standard Deviation": null, "Rank": 33 }, "Chemistry": { "Average Score": 38.68957842968336, "Standard Deviation": null, "Rank": 38 }, "CPP": { "Average Score": 38.27587102395908, "Standard Deviation": null, "Rank": 32 }, "Physics": { "Average Score": 43.122496379867655, "Standard Deviation": null, "Rank": 40 }, "Biology": { "Average Score": 49.80517713841127, "Standard Deviation": null, "Rank": 35 } } }, { "config": { "model_name": "gemini-1.0-pro-001", "organization": "Google", "license": "Proprietary", "knowledge_cutoff": "2023/04" }, "results": { "OVERALL": { "Average Score": 37.757029496159134, "Standard Deviation": 2.4871563947325797, "Rank": 38 }, "Geometry": { "Average Score": 35.792088134579124, "Standard Deviation": null, "Rank": 36 }, "Algebra": { "Average Score": 50.157930404365224, "Standard Deviation": null, "Rank": 38 }, "Probability": { "Average Score": 25.033769367203313, "Standard Deviation": null, "Rank": 47 }, "Logical": { "Average Score": 23.38732786204667, "Standard Deviation": null, "Rank": 46 }, "Social": { "Average Score": 26.25171796810704, "Standard Deviation": null, "Rank": 51 }, "Chemistry": { "Average Score": 43.59712830576298, "Standard Deviation": null, "Rank": 32 }, "CPP": { "Average Score": 45.22204471452975, "Standard Deviation": null, "Rank": 23 }, "Physics": { "Average Score": 62.1145967631314, "Standard Deviation": null, "Rank": 28 }, "Biology": { "Average Score": 38.93328880463975, "Standard Deviation": null, "Rank": 46 } } }, { "config": { "model_name": "openchat-3.5-0106", "organization": "OpenChat", "license": "Apache-2.0", "knowledge_cutoff": "2024/01" }, "results": { "OVERALL": { "Average Score": 39.892305843585234, "Standard Deviation": 2.147396504115797, "Rank": 35 }, "Geometry": { "Average Score": 29.941588970091672, "Standard Deviation": null, "Rank": 40 }, "Algebra": { "Average Score": 47.48449168554534, "Standard Deviation": null, "Rank": 39 }, "Probability": { "Average Score": 39.64777697224284, "Standard Deviation": null, "Rank": 36 }, "Logical": { "Average Score": 41.361836834955504, "Standard Deviation": null, "Rank": 33 }, "Social": { "Average Score": 36.716597579856675, "Standard Deviation": null, "Rank": 41 }, "Chemistry": { "Average Score": 32.618034432282414, "Standard Deviation": null, "Rank": 41 }, "CPP": { "Average Score": 33.70639271807677, "Standard Deviation": null, "Rank": 33 }, "Physics": { "Average Score": 41.117269227834775, "Standard Deviation": null, "Rank": 42 }, "Biology": { "Average Score": 46.46694211682319, "Standard Deviation": null, "Rank": 38 } } }, { "config": { "model_name": "openchat-3.5", "organization": "OpenChat", "license": "Apache-2.0", "knowledge_cutoff": "2023/11" }, "results": { "OVERALL": { "Average Score": 38.241198423073044, "Standard Deviation": 0.5484943791516782, "Rank": 37 }, "Geometry": { "Average Score": 30.89638678506991, "Standard Deviation": null, "Rank": 39 }, "Algebra": { "Average Score": 41.83128388520244, "Standard Deviation": null, "Rank": 42 }, "Probability": { "Average Score": 36.10478976665624, "Standard Deviation": null, "Rank": 39 }, "Logical": { "Average Score": 40.320934300651516, "Standard Deviation": null, "Rank": 34 }, "Social": { "Average Score": 43.49055300551458, "Standard Deviation": null, "Rank": 31 }, "Chemistry": { "Average Score": 34.73882038803731, "Standard Deviation": null, "Rank": 40 }, "CPP": { "Average Score": 
33.020911255646965, "Standard Deviation": null, "Rank": 34 }, "Physics": { "Average Score": 43.28671808104924, "Standard Deviation": null, "Rank": 39 }, "Biology": { "Average Score": 37.18520956253795, "Standard Deviation": null, "Rank": 47 } } }, { "config": { "model_name": "command-r-(08-2024)", "organization": "Cohere", "license": "CC-BY-NC-4.0", "knowledge_cutoff": "2024/08" }, "results": { "OVERALL": { "Average Score": 45.419599943563604, "Standard Deviation": 3.867586763039621, "Rank": 30 }, "Geometry": { "Average Score": 36.68143035371426, "Standard Deviation": null, "Rank": 35 }, "Algebra": { "Average Score": 41.64517540472657, "Standard Deviation": null, "Rank": 43 }, "Probability": { "Average Score": 37.95189112967414, "Standard Deviation": null, "Rank": 38 }, "Logical": { "Average Score": 25.409088658564166, "Standard Deviation": null, "Rank": 43 }, "Social": { "Average Score": 40.389393367109264, "Standard Deviation": null, "Rank": 37 }, "Chemistry": { "Average Score": 40.08660883479598, "Standard Deviation": null, "Rank": 36 }, "CPP": { "Average Score": 39.61492485677676, "Standard Deviation": null, "Rank": 30 }, "Physics": { "Average Score": 49.51833550380945, "Standard Deviation": null, "Rank": 38 }, "Biology": { "Average Score": 46.55085862120477, "Standard Deviation": null, "Rank": 37 } } }, { "config": { "model_name": "gemma-1.1-7b-it", "organization": "Google", "license": "Gemma License", "knowledge_cutoff": "2024/02" }, "results": { "OVERALL": { "Average Score": 31.46481370727848, "Standard Deviation": 5.403408635399989, "Rank": 42 }, "Geometry": { "Average Score": 26.078500005143134, "Standard Deviation": null, "Rank": 45 }, "Algebra": { "Average Score": 40.92453155837702, "Standard Deviation": null, "Rank": 44 }, "Probability": { "Average Score": 31.502661407350192, "Standard Deviation": null, "Rank": 44 }, "Logical": { "Average Score": 39.27282391466396, "Standard Deviation": null, "Rank": 37 }, "Social": { "Average Score": 31.639615427886643, "Standard Deviation": null, "Rank": 46 }, "Chemistry": { "Average Score": 43.59704806585925, "Standard Deviation": null, "Rank": 33 }, "CPP": { "Average Score": 42.666504105798204, "Standard Deviation": null, "Rank": 27 }, "Physics": { "Average Score": 49.845369349755345, "Standard Deviation": null, "Rank": 36 }, "Biology": { "Average Score": 45.813201684684124, "Standard Deviation": null, "Rank": 39 } } }, { "config": { "model_name": "llama3-8b-instruct", "organization": "Meta", "license": "Llama 3 Community", "knowledge_cutoff": "2023/03" }, "results": { "OVERALL": { "Average Score": 36.30010331322555, "Standard Deviation": 2.6021295258334334, "Rank": 40 }, "Geometry": { "Average Score": 28.61237715170709, "Standard Deviation": null, "Rank": 42 }, "Algebra": { "Average Score": 42.6394310988214, "Standard Deviation": null, "Rank": 41 }, "Probability": { "Average Score": 35.51226405104781, "Standard Deviation": null, "Rank": 40 }, "Logical": { "Average Score": 59.594410427422616, "Standard Deviation": null, "Rank": 26 }, "Social": { "Average Score": 42.58469219441349, "Standard Deviation": null, "Rank": 32 }, "Chemistry": { "Average Score": 48.45708298495634, "Standard Deviation": null, "Rank": 27 }, "CPP": { "Average Score": 45.35392139264795, "Standard Deviation": null, "Rank": 22 }, "Physics": { "Average Score": 58.61979255906953, "Standard Deviation": null, "Rank": 30 }, "Biology": { "Average Score": 50.39755478099045, "Standard Deviation": null, "Rank": 34 } } }, { "config": { "model_name": "gemma-2-2b-it", "organization": 
"Google", "license": "Gemma License", "knowledge_cutoff": "2024/07" }, "results": { "OVERALL": { "Average Score": 58.76741528626868, "Standard Deviation": 5.683174110350625, "Rank": 24 }, "Geometry": { "Average Score": 29.901411513695468, "Standard Deviation": null, "Rank": 41 }, "Algebra": { "Average Score": 40.60048971047775, "Standard Deviation": null, "Rank": 45 }, "Probability": { "Average Score": 33.448597365831304, "Standard Deviation": null, "Rank": 42 }, "Logical": { "Average Score": 43.89688208707135, "Standard Deviation": null, "Rank": 31 }, "Social": { "Average Score": 48.769368715100335, "Standard Deviation": null, "Rank": 26 }, "Chemistry": { "Average Score": 28.982153819366474, "Standard Deviation": null, "Rank": 44 }, "CPP": { "Average Score": 30.53406933106768, "Standard Deviation": null, "Rank": 36 }, "Physics": { "Average Score": 22.78354134298823, "Standard Deviation": null, "Rank": 49 }, "Biology": { "Average Score": 53.59359459245764, "Standard Deviation": null, "Rank": 30 } } }, { "config": { "model_name": "starling-lm-7b-alpha", "organization": "Nexusflow", "license": "Apache-2.0", "knowledge_cutoff": "2023/11" }, "results": { "OVERALL": { "Average Score": 36.98646367219327, "Standard Deviation": 0.5488180472607256, "Rank": 39 }, "Geometry": { "Average Score": 26.472892835994372, "Standard Deviation": null, "Rank": 44 }, "Algebra": { "Average Score": 38.4553696839335, "Standard Deviation": null, "Rank": 47 }, "Probability": { "Average Score": 33.907837077924526, "Standard Deviation": null, "Rank": 41 }, "Logical": { "Average Score": 33.129169647630114, "Standard Deviation": null, "Rank": 41 }, "Social": { "Average Score": 39.97855588617487, "Standard Deviation": null, "Rank": 38 }, "Chemistry": { "Average Score": 29.187364253387454, "Standard Deviation": null, "Rank": 43 }, "CPP": { "Average Score": 30.07926487356878, "Standard Deviation": null, "Rank": 37 }, "Physics": { "Average Score": 32.39068796677421, "Standard Deviation": null, "Rank": 43 }, "Biology": { "Average Score": 40.884001946009214, "Standard Deviation": null, "Rank": 41 } } }, { "config": { "model_name": "qwen1.5-4b-chat", "organization": "Alibaba", "license": "Qianwen LICENSE", "knowledge_cutoff": "2024/02" }, "results": { "OVERALL": { "Average Score": 9.87888465860545, "Standard Deviation": 0.8496756485041839, "Rank": 58 }, "Geometry": { "Average Score": 16.727214095722648, "Standard Deviation": null, "Rank": 51 }, "Algebra": { "Average Score": 30.868954326245674, "Standard Deviation": null, "Rank": 48 }, "Probability": { "Average Score": 12.542151831707827, "Standard Deviation": null, "Rank": 52 }, "Logical": { "Average Score": 13.591142976589552, "Standard Deviation": null, "Rank": 55 }, "Social": { "Average Score": 29.86221951671923, "Standard Deviation": null, "Rank": 47 }, "Chemistry": { "Average Score": 15.258365841050109, "Standard Deviation": null, "Rank": 57 }, "CPP": { "Average Score": 13.21208067122554, "Standard Deviation": null, "Rank": 47 }, "Physics": { "Average Score": 12.8962411286233, "Standard Deviation": null, "Rank": 56 }, "Biology": { "Average Score": 8.598267308776672, "Standard Deviation": null, "Rank": 61 } } }, { "config": { "model_name": "command-r-(04-2024)", "organization": "Cohere", "license": "CC-BY-NC-4.0", "knowledge_cutoff": "2024/04" }, "results": { "OVERALL": { "Average Score": 41.52933196050375, "Standard Deviation": 2.241081240676662, "Rank": 34 }, "Geometry": { "Average Score": 25.015789717085156, "Standard Deviation": null, "Rank": 47 }, "Algebra": { "Average 
Score": 30.86273392294722, "Standard Deviation": null, "Rank": 49 }, "Probability": { "Average Score": 32.69230455171987, "Standard Deviation": null, "Rank": 43 }, "Logical": { "Average Score": 34.412636294090625, "Standard Deviation": null, "Rank": 40 }, "Social": { "Average Score": 41.24738365139523, "Standard Deviation": null, "Rank": 34 }, "Chemistry": { "Average Score": 40.79571212108303, "Standard Deviation": null, "Rank": 35 }, "CPP": { "Average Score": 41.346336503003236, "Standard Deviation": null, "Rank": 28 }, "Physics": { "Average Score": 52.309001772076435, "Standard Deviation": null, "Rank": 34 }, "Biology": { "Average Score": 49.100219607909104, "Standard Deviation": null, "Rank": 36 } } }, { "config": { "model_name": "vicuna-33b", "organization": "LMSYS", "license": "Non-commercial", "knowledge_cutoff": "2023/08" }, "results": { "OVERALL": { "Average Score": 26.771867469042252, "Standard Deviation": 2.2628124527776685, "Rank": 45 }, "Geometry": { "Average Score": 17.75361072083444, "Standard Deviation": null, "Rank": 50 }, "Algebra": { "Average Score": 24.801410292720103, "Standard Deviation": null, "Rank": 50 }, "Probability": { "Average Score": 18.923598681430988, "Standard Deviation": null, "Rank": 50 }, "Logical": { "Average Score": 22.485046383293895, "Standard Deviation": null, "Rank": 47 }, "Social": { "Average Score": 37.63057970959196, "Standard Deviation": null, "Rank": 40 }, "Chemistry": { "Average Score": 28.982029986253178, "Standard Deviation": null, "Rank": 45 }, "CPP": { "Average Score": 28.01838653090379, "Standard Deviation": null, "Rank": 38 }, "Physics": { "Average Score": 28.904101398112875, "Standard Deviation": null, "Rank": 45 }, "Biology": { "Average Score": 40.66824421437282, "Standard Deviation": null, "Rank": 44 } } }, { "config": { "model_name": "gemma-7b-it", "organization": "Google", "license": "Gemma License", "knowledge_cutoff": "2024/02" }, "results": { "OVERALL": { "Average Score": 23.946098797294113, "Standard Deviation": 1.882540513317503, "Rank": 48 }, "Geometry": { "Average Score": 20.947476737376597, "Standard Deviation": null, "Rank": 48 }, "Algebra": { "Average Score": 23.018014851651127, "Standard Deviation": null, "Rank": 52 }, "Probability": { "Average Score": 15.37360248124904, "Standard Deviation": null, "Rank": 51 }, "Logical": { "Average Score": 23.856001036256362, "Standard Deviation": null, "Rank": 44 }, "Social": { "Average Score": 33.803173718782276, "Standard Deviation": null, "Rank": 44 }, "Chemistry": { "Average Score": 28.96403210090221, "Standard Deviation": null, "Rank": 46 }, "CPP": { "Average Score": 28.014658234926813, "Standard Deviation": null, "Rank": 39 }, "Physics": { "Average Score": 31.52560551567879, "Standard Deviation": null, "Rank": 44 }, "Biology": { "Average Score": 33.30740831237261, "Standard Deviation": null, "Rank": 48 } } }, { "config": { "model_name": "mistral-7b-instruct-2", "organization": "Mistral", "license": "Apache 2.0", "knowledge_cutoff": "2023/12" }, "results": { "OVERALL": { "Average Score": 30.425212839239084, "Standard Deviation": 3.2420324833230745, "Rank": 43 }, "Geometry": { "Average Score": 17.98077256453581, "Standard Deviation": null, "Rank": 49 }, "Algebra": { "Average Score": 23.03227606898818, "Standard Deviation": null, "Rank": 51 }, "Probability": { "Average Score": 22.515548503444595, "Standard Deviation": null, "Rank": 48 }, "Logical": { "Average Score": 28.172299674407935, "Standard Deviation": null, "Rank": 42 }, "Social": { "Average Score": 32.34681006422513, 
"Standard Deviation": null, "Rank": 45 }, "Chemistry": { "Average Score": 29.847754052571794, "Standard Deviation": null, "Rank": 42 }, "CPP": { "Average Score": 31.382959631870822, "Standard Deviation": null, "Rank": 35 }, "Physics": { "Average Score": 42.179522893964496, "Standard Deviation": null, "Rank": 41 }, "Biology": { "Average Score": 40.80741758174906, "Standard Deviation": null, "Rank": 42 } } }, { "config": { "model_name": "mistral-7b-instruct-1", "organization": "Mistral", "license": "Apache 2.0", "knowledge_cutoff": "2023/12" }, "results": { "OVERALL": { "Average Score": 19.00770440704137, "Standard Deviation": 2.5108129577834823, "Rank": 55 }, "Geometry": { "Average Score": 11.76124122331528, "Standard Deviation": null, "Rank": 55 }, "Algebra": { "Average Score": 20.16800788676758, "Standard Deviation": null, "Rank": 53 }, "Probability": { "Average Score": 21.982214302316194, "Standard Deviation": null, "Rank": 49 }, "Logical": { "Average Score": 16.458119477880455, "Standard Deviation": null, "Rank": 51 }, "Social": { "Average Score": 11.83909143203254, "Standard Deviation": null, "Rank": 56 }, "Chemistry": { "Average Score": 20.227175038540732, "Standard Deviation": null, "Rank": 52 }, "CPP": { "Average Score": 18.929093202755805, "Standard Deviation": null, "Rank": 42 }, "Physics": { "Average Score": 16.942666711550366, "Standard Deviation": null, "Rank": 53 }, "Biology": { "Average Score": 14.862055999215585, "Standard Deviation": null, "Rank": 56 } } }, { "config": { "model_name": "vicuna-13b", "organization": "LMSYS", "license": "Non-commercial", "knowledge_cutoff": "2023/07" }, "results": { "OVERALL": { "Average Score": 17.596440211877606, "Standard Deviation": 2.1378036693126887, "Rank": 56 }, "Geometry": { "Average Score": 13.613562588758793, "Standard Deviation": null, "Rank": 54 }, "Algebra": { "Average Score": 17.777580357601646, "Standard Deviation": null, "Rank": 54 }, "Probability": { "Average Score": 11.773651220819335, "Standard Deviation": null, "Rank": 53 }, "Logical": { "Average Score": 16.62840722654711, "Standard Deviation": null, "Rank": 50 }, "Social": { "Average Score": 12.015284814277452, "Standard Deviation": null, "Rank": 54 }, "Chemistry": { "Average Score": 22.59071707495557, "Standard Deviation": null, "Rank": 49 }, "CPP": { "Average Score": 21.840013221590294, "Standard Deviation": null, "Rank": 40 }, "Physics": { "Average Score": 23.12484986614339, "Standard Deviation": null, "Rank": 48 }, "Biology": { "Average Score": 32.46475144310054, "Standard Deviation": null, "Rank": 49 } } }, { "config": { "model_name": "zephyr-7b-beta", "organization": "HuggingFace", "license": "MIT", "knowledge_cutoff": "2023/10" }, "results": { "OVERALL": { "Average Score": 9.430771900746599, "Standard Deviation": 0.5392686957469028, "Rank": 59 }, "Geometry": { "Average Score": 8.776172464719641, "Standard Deviation": null, "Rank": 56 }, "Algebra": { "Average Score": 12.864251022808256, "Standard Deviation": null, "Rank": 55 }, "Probability": { "Average Score": 6.856387198441145, "Standard Deviation": null, "Rank": 58 }, "Logical": { "Average Score": 7.23067331414496, "Standard Deviation": null, "Rank": 59 }, "Social": { "Average Score": 0.0, "Standard Deviation": null, "Rank": 61 }, "Chemistry": { "Average Score": 16.809164907349935, "Standard Deviation": null, "Rank": 54 }, "CPP": { "Average Score": 18.92902220864132, "Standard Deviation": null, "Rank": 43 }, "Physics": { "Average Score": 17.655293480361614, "Standard Deviation": null, "Rank": 52 }, "Biology": { 
"Average Score": 12.415097886994968, "Standard Deviation": null, "Rank": 58 } } }, { "config": { "model_name": "gemma-1.1-2b-it", "organization": "Google", "license": "Gemma License", "knowledge_cutoff": "2024/02" }, "results": { "OVERALL": { "Average Score": 21.90250655573766, "Standard Deviation": 1.9871388098125085, "Rank": 52 }, "Geometry": { "Average Score": 13.697788759430225, "Standard Deviation": null, "Rank": 53 }, "Algebra": { "Average Score": 12.157310639752737, "Standard Deviation": null, "Rank": 56 }, "Probability": { "Average Score": 7.449868080506948, "Standard Deviation": null, "Rank": 56 }, "Logical": { "Average Score": 10.62657710416428, "Standard Deviation": null, "Rank": 57 }, "Social": { "Average Score": 29.175325965898267, "Standard Deviation": null, "Rank": 48 }, "Chemistry": { "Average Score": 21.740619629476075, "Standard Deviation": null, "Rank": 50 }, "CPP": { "Average Score": 20.724691953843916, "Standard Deviation": null, "Rank": 41 }, "Physics": { "Average Score": 23.632640386132042, "Standard Deviation": null, "Rank": 47 }, "Biology": { "Average Score": 29.750661487753543, "Standard Deviation": null, "Rank": 50 } } }, { "config": { "model_name": "llama2-7b-chat", "organization": "Meta", "license": "Llama 2 Community", "knowledge_cutoff": "2023/07" }, "results": { "OVERALL": { "Average Score": 23.15262700172829, "Standard Deviation": 1.5180515912969421, "Rank": 50 }, "Geometry": { "Average Score": 6.062981955604592, "Standard Deviation": null, "Rank": 57 }, "Algebra": { "Average Score": 9.702442741719038, "Standard Deviation": null, "Rank": 58 }, "Probability": { "Average Score": 7.323764901851239, "Standard Deviation": null, "Rank": 57 }, "Logical": { "Average Score": 20.042615636879354, "Standard Deviation": null, "Rank": 49 }, "Social": { "Average Score": 28.003092092497983, "Standard Deviation": null, "Rank": 49 }, "Chemistry": { "Average Score": 20.22732766050842, "Standard Deviation": null, "Rank": 51 }, "CPP": { "Average Score": 15.730513733660898, "Standard Deviation": null, "Rank": 45 }, "Physics": { "Average Score": 12.866623115939365, "Standard Deviation": null, "Rank": 57 }, "Biology": { "Average Score": 29.435323133887913, "Standard Deviation": null, "Rank": 51 } } }, { "config": { "model_name": "gemma-2b-it", "organization": "Google", "license": "Gemma License", "knowledge_cutoff": "2024/02" }, "results": { "OVERALL": { "Average Score": 20.296640473489866, "Standard Deviation": 2.333666507610861, "Rank": 53 }, "Geometry": { "Average Score": 16.155982788407485, "Standard Deviation": null, "Rank": 52 }, "Algebra": { "Average Score": 9.997670449242714, "Standard Deviation": null, "Rank": 57 }, "Probability": { "Average Score": 6.055292262170126, "Standard Deviation": null, "Rank": 59 }, "Logical": { "Average Score": 5.200573121259635, "Standard Deviation": null, "Rank": 60 }, "Social": { "Average Score": 9.560337024016134, "Standard Deviation": null, "Rank": 58 }, "Chemistry": { "Average Score": 16.613881599313693, "Standard Deviation": null, "Rank": 55 }, "CPP": { "Average Score": 17.2715657115764, "Standard Deviation": null, "Rank": 44 }, "Physics": { "Average Score": 17.72258050873005, "Standard Deviation": null, "Rank": 51 }, "Biology": { "Average Score": 10.891363209321185, "Standard Deviation": null, "Rank": 59 } } }, { "config": { "model_name": "llama2-13b-chat", "organization": "Meta", "license": "Llama 2 Community", "knowledge_cutoff": "2023/07" }, "results": { "OVERALL": { "Average Score": 22.40246822660458, "Standard Deviation": 
1.5744155926563603, "Rank": 51 }, "Geometry": { "Average Score": 4.287260426268335, "Standard Deviation": null, "Rank": 59 }, "Algebra": { "Average Score": 7.122650832792122, "Standard Deviation": null, "Rank": 59 }, "Probability": { "Average Score": 10.367779885088286, "Standard Deviation": null, "Rank": 54 }, "Logical": { "Average Score": 23.416885515011753, "Standard Deviation": null, "Rank": 45 }, "Social": { "Average Score": 26.251837552806705, "Standard Deviation": null, "Rank": 50 }, "Chemistry": { "Average Score": 15.236408439765913, "Standard Deviation": null, "Rank": 58 }, "CPP": { "Average Score": 13.17258252933903, "Standard Deviation": null, "Rank": 48 }, "Physics": { "Average Score": 9.756032013938237, "Standard Deviation": null, "Rank": 58 }, "Biology": { "Average Score": 14.373926163839833, "Standard Deviation": null, "Rank": 57 } } }, { "config": { "model_name": "vicuna-7b", "organization": "LMSYS", "license": "Non-commercial", "knowledge_cutoff": "2023/07" }, "results": { "OVERALL": { "Average Score": 16.947504584923095, "Standard Deviation": 2.1935303160759494, "Rank": 57 }, "Geometry": { "Average Score": 5.6556788835908565, "Standard Deviation": null, "Rank": 58 }, "Algebra": { "Average Score": 6.937810777972691, "Standard Deviation": null, "Rank": 60 }, "Probability": { "Average Score": 7.449902539116639, "Standard Deviation": null, "Rank": 55 }, "Logical": { "Average Score": 11.53991650872671, "Standard Deviation": null, "Rank": 56 }, "Social": { "Average Score": 10.510431618145562, "Standard Deviation": null, "Rank": 57 }, "Chemistry": { "Average Score": 15.565621989451936, "Standard Deviation": null, "Rank": 56 }, "CPP": { "Average Score": 14.255194156624162, "Standard Deviation": null, "Rank": 46 }, "Physics": { "Average Score": 13.654470501928998, "Standard Deviation": null, "Rank": 55 }, "Biology": { "Average Score": 16.31264249867034, "Standard Deviation": null, "Rank": 55 } } }, { "config": { "model_name": "koala-13b", "organization": "UC Berkeley", "license": "Non-commercial", "knowledge_cutoff": "2023/04" }, "results": { "OVERALL": { "Average Score": 8.83755726181737, "Standard Deviation": 0.6967904064276641, "Rank": 60 }, "Geometry": { "Average Score": 0.16630617078665783, "Standard Deviation": null, "Rank": 60 }, "Algebra": { "Average Score": 2.2176438662182405, "Standard Deviation": null, "Rank": 61 }, "Probability": { "Average Score": 3.0086045641099886, "Standard Deviation": null, "Rank": 60 }, "Logical": { "Average Score": 8.007902379487398, "Standard Deviation": null, "Rank": 58 }, "Social": { "Average Score": 9.267400643797334, "Standard Deviation": null, "Rank": 59 }, "Chemistry": { "Average Score": 6.881971917535636, "Standard Deviation": null, "Rank": 59 }, "CPP": { "Average Score": 6.36433272373514, "Standard Deviation": null, "Rank": 49 }, "Physics": { "Average Score": 1.4745736403582252, "Standard Deviation": null, "Rank": 59 }, "Biology": { "Average Score": 10.173901160370301, "Standard Deviation": null, "Rank": 60 } } }, { "config": { "model_name": "openassistant-pythia-12b", "organization": "OpenAssistant", "license": "Non-commercial", "knowledge_cutoff": "2023/04" }, "results": { "OVERALL": { "Average Score": 0.0, "Standard Deviation": 0.0, "Rank": 61 }, "Geometry": { "Average Score": 0.0, "Standard Deviation": null, "Rank": 61 }, "Algebra": { "Average Score": 0.0, "Standard Deviation": null, "Rank": 62 }, "Probability": { "Average Score": 0.0, "Standard Deviation": null, "Rank": 61 }, "Logical": { "Average Score": 0.0, "Standard Deviation": 
null, "Rank": 61 }, "Social": { "Average Score": 1.5648937446490145, "Standard Deviation": null, "Rank": 60 }, "Chemistry": { "Average Score": 0.0, "Standard Deviation": null, "Rank": 60 }, "CPP": { "Average Score": 0.0, "Standard Deviation": null, "Rank": 50 }, "Physics": { "Average Score": 0.0, "Standard Deviation": null, "Rank": 60 }, "Biology": { "Average Score": 0.0, "Standard Deviation": null, "Rank": 62 } } }, { "config": { "model_name": "nemotron-70b", "organization": "NVIDIA", "license": "Unknown", "knowledge_cutoff": "Unknown" }, "results": { "OVERALL": { "Average Score": 100.0, "Standard Deviation": 0.0, "Rank": 1 }, "Algebra": { "Average Score": 80.66812253661826, "Standard Deviation": null, "Rank": 17 }, "Geometry": { "Average Score": 64.79317124458657, "Standard Deviation": null, "Rank": 14 }, "Probability": { "Average Score": 77.90998100977566, "Standard Deviation": null, "Rank": 10 }, "Logical": { "Average Score": 92.79205249453312, "Standard Deviation": null, "Rank": 5 }, "Social": { "Average Score": 100.0, "Standard Deviation": null, "Rank": 1 }, "Chemistry": { "Average Score": 75.51792600714916, "Standard Deviation": null, "Rank": 14 }, "Physics": { "Average Score": 87.87343018217607, "Standard Deviation": null, "Rank": 11 }, "Biology": { "Average Score": 89.70989044405452, "Standard Deviation": null, "Rank": 6 } } }, { "config": { "model_name": "llama-3.2-3b-it", "organization": "Meta", "license": "Llama 3 Community", "knowledge_cutoff": "Unknown" }, "results": { "OVERALL": { "Average Score": 24.55648638012998, "Standard Deviation": 2.7438328116042396, "Rank": 47 }, "Algebra": { "Average Score": 58.282081682035965, "Standard Deviation": null, "Rank": 32 }, "Probability": { "Average Score": 38.82178804612166, "Standard Deviation": null, "Rank": 37 }, "Logical": { "Average Score": 14.284884351545829, "Standard Deviation": null, "Rank": 53 }, "Social": { "Average Score": 12.015170971293347, "Standard Deviation": null, "Rank": 55 }, "Chemistry": { "Average Score": 28.594555260782386, "Standard Deviation": null, "Rank": 47 }, "Physics": { "Average Score": 28.49646725691165, "Standard Deviation": null, "Rank": 46 }, "Biology": { "Average Score": 19.26616886675504, "Standard Deviation": null, "Rank": 54 } } }, { "config": { "model_name": "glm-4-plus", "organization": "Unknown", "license": "Unknown", "knowledge_cutoff": "Unknown" }, "results": { "Physics": { "Average Score": 93.38486963586884, "Standard Deviation": null, "Rank": 4 }, "Biology": { "Average Score": 92.22645537080881, "Standard Deviation": null, "Rank": 4 }, "Chemistry": { "Average Score": 83.011021476943, "Standard Deviation": null, "Rank": 8 }, "Social": { "Average Score": 96.10166232633848, "Standard Deviation": null, "Rank": 5 }, "Logical": { "Average Score": 92.48639421432455, "Standard Deviation": null, "Rank": 6 }, "Algebra": { "Average Score": 91.79128700104991, "Standard Deviation": null, "Rank": 7 }, "Geometry": { "Average Score": 75.41344471165868, "Standard Deviation": null, "Rank": 10 }, "Probability": { "Average Score": 76.73191937524591, "Standard Deviation": null, "Rank": 12 }, "OVERALL": { "Average Score": 92.39089671677698, "Standard Deviation": 0.5005865827133669, "Rank": 6 } } }, { "config": { "model_name": "yi-lightning", "organization": "Unknown", "license": "Unknown", "knowledge_cutoff": "Unknown" }, "results": { "Physics": { "Average Score": 88.49402753650628, "Standard Deviation": null, "Rank": 9 }, "Biology": { "Average Score": 90.37891957676416, "Standard Deviation": null, "Rank": 5 }, 
"Chemistry": { "Average Score": 100.0, "Standard Deviation": null, "Rank": 1 }, "Social": { "Average Score": 92.14580653902937, "Standard Deviation": null, "Rank": 6 }, "Logical": { "Average Score": 94.75701503537329, "Standard Deviation": null, "Rank": 4 }, "Algebra": { "Average Score": 93.3186019721947, "Standard Deviation": null, "Rank": 6 }, "Geometry": { "Average Score": 76.16313216563569, "Standard Deviation": null, "Rank": 9 }, "Probability": { "Average Score": 92.54460354742838, "Standard Deviation": null, "Rank": 3 }, "OVERALL": { "Average Score": 96.802929532644, "Standard Deviation": 0.27491691197906704, "Rank": 3 } } }, { "config": { "model_name": "ministral-8b-it", "organization": "Unknown", "license": "Unknown", "knowledge_cutoff": "Unknown" }, "results": { "Physics": { "Average Score": 57.14492748742418, "Standard Deviation": null, "Rank": 32 }, "Biology": { "Average Score": 53.5479824847229, "Standard Deviation": null, "Rank": 31 }, "Chemistry": { "Average Score": 45.51400153833142, "Standard Deviation": null, "Rank": 30 }, "Social": { "Average Score": 45.54025353861784, "Standard Deviation": null, "Rank": 29 }, "Logical": { "Average Score": 59.25000685096734, "Standard Deviation": null, "Rank": 27 }, "Algebra": { "Average Score": 58.56021213895309, "Standard Deviation": null, "Rank": 31 }, "Geometry": { "Average Score": 54.902884398306554, "Standard Deviation": null, "Rank": 22 }, "Probability": { "Average Score": 49.69358274321923, "Standard Deviation": null, "Rank": 29 }, "OVERALL": { "Average Score": 45.88665474541969, "Standard Deviation": 4.242263667629549, "Rank": 29 } } }, { "config": { "model_name": "qwen2.5-1.5b", "organization": "Unknown", "license": "Unknown", "knowledge_cutoff": "Unknown" }, "results": { "Physics": { "Average Score": 50.38291508013627, "Standard Deviation": null, "Rank": 35 }, "Biology": { "Average Score": 40.134558844170826, "Standard Deviation": null, "Rank": 45 }, "Chemistry": { "Average Score": 34.891253153439166, "Standard Deviation": null, "Rank": 39 }, "Social": { "Average Score": 39.812806552940735, "Standard Deviation": null, "Rank": 39 }, "Logical": { "Average Score": 42.70305684307474, "Standard Deviation": null, "Rank": 32 }, "Algebra": { "Average Score": 79.30455838359877, "Standard Deviation": null, "Rank": 18 }, "Geometry": { "Average Score": 58.56739922365014, "Standard Deviation": null, "Rank": 18 }, "Probability": { "Average Score": 68.07725566867765, "Standard Deviation": null, "Rank": 19 }, "OVERALL": { "Average Score": 23.25904934716627, "Standard Deviation": 1.5089621200216172, "Rank": 49 } } }, { "config": { "model_name": "smollm2-1.7b", "organization": "Unknown", "license": "Unknown", "knowledge_cutoff": "Unknown" }, "results": { "Physics": { "Average Score": 20.328651604714242, "Standard Deviation": null, "Rank": 50 }, "Biology": { "Average Score": 23.55167655906088, "Standard Deviation": null, "Rank": 53 }, "Chemistry": { "Average Score": 17.90654461263675, "Standard Deviation": null, "Rank": 53 }, "Social": { "Average Score": 18.586981509149783, "Standard Deviation": null, "Rank": 53 }, "Logical": { "Average Score": 13.753294179366819, "Standard Deviation": null, "Rank": 54 }, "Algebra": { "Average Score": 38.86009773073664, "Standard Deviation": null, "Rank": 46 }, "Geometry": { "Average Score": 26.65205080537627, "Standard Deviation": null, "Rank": 43 }, "Probability": { "Average Score": 28.77646355213561, "Standard Deviation": null, "Rank": 45 }, "OVERALL": { "Average Score": 20.14565641258473, "Standard 
Deviation": 2.3679638882398857, "Rank": 54 } } }, { "config": { "model_name": "llama-3.2-1b-it", "organization": "Unknown", "license": "Unknown", "knowledge_cutoff": "Unknown" }, "results": { "Physics": { "Average Score": 13.730639722217427, "Standard Deviation": null, "Rank": 54 }, "Biology": { "Average Score": 25.09504378386352, "Standard Deviation": null, "Rank": 52 }, "Chemistry": { "Average Score": 22.71076097859151, "Standard Deviation": null, "Rank": 48 }, "Social": { "Average Score": 20.34042449083379, "Standard Deviation": null, "Rank": 52 }, "Logical": { "Average Score": 15.338736069283176, "Standard Deviation": null, "Rank": 52 }, "Algebra": { "Average Score": 43.69053020706735, "Standard Deviation": null, "Rank": 40 }, "Geometry": { "Average Score": 25.35058286701741, "Standard Deviation": null, "Rank": 46 }, "Probability": { "Average Score": 28.620674481486535, "Standard Deviation": null, "Rank": 46 }, "OVERALL": { "Average Score": 24.93401522355894, "Standard Deviation": 2.6710490374694014, "Rank": 46 } } } ]