[
{
"config": {
"model_name": "ChatGPT-4o-latest (2024-09-03)",
"organization": "OpenAI",
"license": "Proprietary",
"knowledge_cutoff": "2023/10"
},
"results": {
"OVERALL": {
"Average Score": 94.49771627042422,
"Standard Deviation": 0.251607817784525,
"Rank": 4
},
"Geometry": {
"Average Score": 81.11505705795187,
"Standard Deviation": null,
"Rank": 6
},
"Algebra": {
"Average Score": 91.79122001491199,
"Standard Deviation": null,
"Rank": 8
},
"Probability": {
"Average Score": 88.00190397870577,
"Standard Deviation": null,
"Rank": 4
},
"Logical": {
"Average Score": 97.47223448912972,
"Standard Deviation": null,
"Rank": 2
},
"Social": {
"Average Score": 89.73262585993845,
"Standard Deviation": null,
"Rank": 7
},
"Chemistry": {
"Average Score": 90.48070030738856,
"Standard Deviation": null,
"Rank": 3
},
"CPP": {
"Average Score": 100.0,
"Standard Deviation": null,
"Rank": 1
},
"Physics": {
"Average Score": 99.7043774383865,
"Standard Deviation": null,
"Rank": 2
},
"Biology": {
"Average Score": 95.98449860487872,
"Standard Deviation": null,
"Rank": 3
}
}
},
{
"config": {
"model_name": "gpt-4o-2024-08-06",
"organization": "OpenAI",
"license": "Proprietary",
"knowledge_cutoff": "2023/10"
},
"results": {
"OVERALL": {
"Average Score": 83.33484787198124,
"Standard Deviation": 3.0334254138998893,
"Rank": 12
},
"Geometry": {
"Average Score": 85.73211137938175,
"Standard Deviation": null,
"Rank": 2
},
"Algebra": {
"Average Score": 95.29454759516874,
"Standard Deviation": null,
"Rank": 5
},
"Probability": {
"Average Score": 80.9483280228488,
"Standard Deviation": null,
"Rank": 7
},
"Logical": {
"Average Score": 78.93507998348575,
"Standard Deviation": null,
"Rank": 12
},
"Social": {
"Average Score": 78.21553692695771,
"Standard Deviation": null,
"Rank": 11
},
"Chemistry": {
"Average Score": 79.46337310221962,
"Standard Deviation": null,
"Rank": 9
},
"CPP": {
"Average Score": 92.43090226400756,
"Standard Deviation": null,
"Rank": 2
},
"Physics": {
"Average Score": 92.63882355350016,
"Standard Deviation": null,
"Rank": 6
},
"Biology": {
"Average Score": 79.88713500945879,
"Standard Deviation": null,
"Rank": 14
}
}
},
{
"config": {
"model_name": "gpt-4o-2024-05-13",
"organization": "OpenAI",
"license": "Proprietary",
"knowledge_cutoff": "2023/10"
},
"results": {
"OVERALL": {
"Average Score": 89.31218128337491,
"Standard Deviation": 0.5511990686487255,
"Rank": 8
},
"Geometry": {
"Average Score": 81.70458958633901,
"Standard Deviation": null,
"Rank": 4
},
"Algebra": {
"Average Score": 90.16488595415144,
"Standard Deviation": null,
"Rank": 9
},
"Probability": {
"Average Score": 83.8098272382245,
"Standard Deviation": null,
"Rank": 5
},
"Logical": {
"Average Score": 88.2742970015626,
"Standard Deviation": null,
"Rank": 9
},
"Social": {
"Average Score": 71.51855733216095,
"Standard Deviation": null,
"Rank": 15
},
"Chemistry": {
"Average Score": 84.0147961443266,
"Standard Deviation": null,
"Rank": 7
},
"CPP": {
"Average Score": 79.1592634699295,
"Standard Deviation": null,
"Rank": 6
},
"Physics": {
"Average Score": 96.44583156689123,
"Standard Deviation": null,
"Rank": 3
},
"Biology": {
"Average Score": 86.17947030919935,
"Standard Deviation": null,
"Rank": 10
}
}
},
{
"config": {
"model_name": "gpt-4-turbo-2024-04-09",
"organization": "OpenAI",
"license": "Proprietary",
"knowledge_cutoff": "2023/12"
},
"results": {
"OVERALL": {
"Average Score": 89.20222265636137,
"Standard Deviation": 0.9498836008363539,
"Rank": 9
},
"Geometry": {
"Average Score": 77.90202019775627,
"Standard Deviation": null,
"Rank": 8
},
"Algebra": {
"Average Score": 84.83537307564205,
"Standard Deviation": null,
"Rank": 12
},
"Probability": {
"Average Score": 80.01448545719413,
"Standard Deviation": null,
"Rank": 9
},
"Logical": {
"Average Score": 89.63955736396734,
"Standard Deviation": null,
"Rank": 8
},
"Social": {
"Average Score": 77.25088451567024,
"Standard Deviation": null,
"Rank": 12
},
"Chemistry": {
"Average Score": 78.97054235015905,
"Standard Deviation": null,
"Rank": 11
},
"CPP": {
"Average Score": 70.73143363230263,
"Standard Deviation": null,
"Rank": 11
},
"Physics": {
"Average Score": 90.33497346058968,
"Standard Deviation": null,
"Rank": 7
},
"Biology": {
"Average Score": 86.17949760404831,
"Standard Deviation": null,
"Rank": 9
}
}
},
{
"config": {
"model_name": "gemini-1.5-pro-001",
"organization": "Google",
"license": "Proprietary",
"knowledge_cutoff": "2023/11"
},
"results": {
"OVERALL": {
"Average Score": 82.91139866415075,
"Standard Deviation": 3.013751980804677,
"Rank": 13
},
"Geometry": {
"Average Score": 83.6654007694722,
"Standard Deviation": null,
"Rank": 3
},
"Algebra": {
"Average Score": 98.84487439119522,
"Standard Deviation": null,
"Rank": 3
},
"Probability": {
"Average Score": 75.94594518060929,
"Standard Deviation": null,
"Rank": 13
},
"Logical": {
"Average Score": 78.89834475831927,
"Standard Deviation": null,
"Rank": 14
},
"Social": {
"Average Score": 78.21569899283614,
"Standard Deviation": null,
"Rank": 10
},
"Physics": {
"Average Score": 88.41290613720335,
"Standard Deviation": null,
"Rank": 10
},
"Biology": {
"Average Score": 86.45347978614136,
"Standard Deviation": null,
"Rank": 8
}
}
},
{
"config": {
"model_name": "qwen2-72b-instruct",
"organization": "Alibaba",
"license": "Qianwen LICENSE",
"knowledge_cutoff": "2024/09"
},
"results": {
"OVERALL": {
"Average Score": 80.51855735113782,
"Standard Deviation": 2.389693257324127,
"Rank": 15
},
"Geometry": {
"Average Score": 68.80768467173304,
"Standard Deviation": null,
"Rank": 13
},
"Algebra": {
"Average Score": 95.86210030199506,
"Standard Deviation": null,
"Rank": 4
},
"Probability": {
"Average Score": 82.29702731445691,
"Standard Deviation": null,
"Rank": 6
},
"Logical": {
"Average Score": 73.55135235722557,
"Standard Deviation": null,
"Rank": 19
},
"Social": {
"Average Score": 57.41502695932332,
"Standard Deviation": null,
"Rank": 19
},
"Chemistry": {
"Average Score": 75.8879803782176,
"Standard Deviation": null,
"Rank": 13
},
"CPP": {
"Average Score": 73.54037778797029,
"Standard Deviation": null,
"Rank": 7
},
"Physics": {
"Average Score": 82.02738090295061,
"Standard Deviation": null,
"Rank": 16
},
"Biology": {
"Average Score": 66.99838962851355,
"Standard Deviation": null,
"Rank": 22
}
}
},
{
"config": {
"model_name": "gpt-4o-mini-2024-07-18",
"organization": "OpenAI",
"license": "Proprietary",
"knowledge_cutoff": "2023/10"
},
"results": {
"OVERALL": {
"Average Score": 85.79551424780102,
"Standard Deviation": 2.25059599602412,
"Rank": 11
},
"Geometry": {
"Average Score": 78.03415885586699,
"Standard Deviation": null,
"Rank": 7
},
"Algebra": {
"Average Score": 90.10621818673319,
"Standard Deviation": null,
"Rank": 10
},
"Probability": {
"Average Score": 80.94824796859724,
"Standard Deviation": null,
"Rank": 8
},
"Logical": {
"Average Score": 86.1004659652016,
"Standard Deviation": null,
"Rank": 10
},
"Social": {
"Average Score": 74.20253943841105,
"Standard Deviation": null,
"Rank": 13
},
"Chemistry": {
"Average Score": 75.44768883899778,
"Standard Deviation": null,
"Rank": 15
},
"CPP": {
"Average Score": 88.3877070580296,
"Standard Deviation": null,
"Rank": 3
},
"Physics": {
"Average Score": 90.33492089386435,
"Standard Deviation": null,
"Rank": 8
},
"Biology": {
"Average Score": 79.03781031583883,
"Standard Deviation": null,
"Rank": 15
}
}
},
{
"config": {
"model_name": "claude-3.5-sonnet",
"organization": "Anthropic",
"license": "Proprietary",
"knowledge_cutoff": "2024/04"
},
"results": {
"OVERALL": {
"Average Score": 90.30644459276833,
"Standard Deviation": 0.6105034066546057,
"Rank": 7
},
"Geometry": {
"Average Score": 72.63402106402285,
"Standard Deviation": null,
"Rank": 12
},
"Algebra": {
"Average Score": 83.32075177480141,
"Standard Deviation": null,
"Rank": 14
},
"Probability": {
"Average Score": 76.7319625254773,
"Standard Deviation": null,
"Rank": 11
},
"Logical": {
"Average Score": 90.00404188010565,
"Standard Deviation": null,
"Rank": 7
},
"Social": {
"Average Score": 99.89849499454823,
"Standard Deviation": null,
"Rank": 2
},
"Chemistry": {
"Average Score": 85.86402884262867,
"Standard Deviation": null,
"Rank": 4
},
"CPP": {
"Average Score": 82.37734076815008,
"Standard Deviation": null,
"Rank": 5
},
"Physics": {
"Average Score": 92.83215449096147,
"Standard Deviation": null,
"Rank": 5
},
"Biology": {
"Average Score": 85.76627192038262,
"Standard Deviation": null,
"Rank": 11
}
}
},
{
"config": {
"model_name": "claude-3.5-sonnet-20241022",
"organization": "Anthropic",
"license": "Proprietary",
"knowledge_cutoff": "UNKNOW"
},
"results": {
"OVERALL": {
"Average Score": 81.7399750668719,
"Standard Deviation": 6.158375141726245,
"Rank": 14
},
"Geometry": {
"Average Score": 72.63581025178527,
"Standard Deviation": null,
"Rank": 11
},
"Algebra": {
"Average Score": 89.50323347048936,
"Standard Deviation": null,
"Rank": 11
},
"Probability": {
"Average Score": 73.919,
"Standard Deviation": null,
"Rank": 13
},
"Logical": {
"Average Score": 90.514,
"Standard Deviation": null,
"Rank": 7
},
"Social": {
"Average Score": 84.505,
"Standard Deviation": null,
"Rank": 7
},
"Chemistry": {
"Average Score": 85.15970597010583,
"Standard Deviation": null,
"Rank": 6
},
"Physics": {
"Average Score": 100.0,
"Standard Deviation": null,
"Rank": 1
},
"Biology": {
"Average Score": 85.56526806360797,
"Standard Deviation": null,
"Rank": 12
}
}
},
{
"config": {
"model_name": "o1-mini",
"organization": "OpenAI",
"license": "Proprietary",
"knowledge_cutoff": "2023/10"
},
"results": {
"OVERALL": {
"Average Score": 97.50448224920098,
"Standard Deviation": 0.18820973784944708,
"Rank": 2
},
"Geometry": {
"Average Score": 100.0,
"Standard Deviation": null,
"Rank": 1
},
"Algebra": {
"Average Score": 100.0,
"Standard Deviation": null,
"Rank": 1
},
"Probability": {
"Average Score": 100.0,
"Standard Deviation": null,
"Rank": 1
},
"Logical": {
"Average Score": 96.62093396445893,
"Standard Deviation": null,
"Rank": 3
},
"Social": {
"Average Score": 98.93701302706319,
"Standard Deviation": null,
"Rank": 4
},
"Chemistry": {
"Average Score": 93.52027415963765,
"Standard Deviation": null,
"Rank": 2
},
"Biology": {
"Average Score": 99.9210788257773,
"Standard Deviation": null,
"Rank": 2
}
}
},
{
"config": {
"model_name": "o1-preview",
"organization": "OpenAI",
"license": "Proprietary",
"knowledge_cutoff": "2023/10"
},
"results": {
"OVERALL": {
"Average Score": 92.95670511909181,
"Standard Deviation": 0.26193636312885404,
"Rank": 5
},
"Geometry": {
"Average Score": 81.70453162182778,
"Standard Deviation": null,
"Rank": 5
},
"Algebra": {
"Average Score": 99.2204666813678,
"Standard Deviation": null,
"Rank": 2
},
"Probability": {
"Average Score": 96.11141903959506,
"Standard Deviation": null,
"Rank": 2
},
"Logical": {
"Average Score": 100.0,
"Standard Deviation": null,
"Rank": 1
},
"Social": {
"Average Score": 99.35681400812317,
"Standard Deviation": null,
"Rank": 3
},
"Biology": {
"Average Score": 100.0,
"Standard Deviation": null,
"Rank": 1
}
}
},
{
"config": {
"model_name": "gemini-1.5-flash-001",
"organization": "Google",
"license": "Proprietary",
"knowledge_cutoff": "2023/11"
},
"results": {
"OVERALL": {
"Average Score": 63.90738369106308,
"Standard Deviation": 2.5840022803072342,
"Rank": 20
},
"Geometry": {
"Average Score": 62.78784730869374,
"Standard Deviation": null,
"Rank": 16
},
"Algebra": {
"Average Score": 84.4516255656167,
"Standard Deviation": null,
"Rank": 13
},
"Probability": {
"Average Score": 71.21668893483972,
"Standard Deviation": null,
"Rank": 15
},
"Logical": {
"Average Score": 73.55137041991937,
"Standard Deviation": null,
"Rank": 17
},
"Social": {
"Average Score": 71.51839473022034,
"Standard Deviation": null,
"Rank": 16
},
"Chemistry": {
"Average Score": 78.9281328399534,
"Standard Deviation": null,
"Rank": 12
},
"CPP": {
"Average Score": 72.1127762005651,
"Standard Deviation": null,
"Rank": 10
},
"Physics": {
"Average Score": 86.21163726768592,
"Standard Deviation": null,
"Rank": 14
},
"Biology": {
"Average Score": 77.50881946688955,
"Standard Deviation": null,
"Rank": 16
}
}
},
{
"config": {
"model_name": "gpt4-1106",
"organization": "OpenAI",
"license": "Proprietary",
"knowledge_cutoff": "2024/04"
},
"results": {
"OVERALL": {
"Average Score": 88.08481721079524,
"Standard Deviation": 1.4421920877285703,
"Rank": 10
},
"Geometry": {
"Average Score": 59.2110329866853,
"Standard Deviation": null,
"Rank": 17
},
"Algebra": {
"Average Score": 80.79050620153212,
"Standard Deviation": null,
"Rank": 15
},
"Probability": {
"Average Score": 74.36123524515216,
"Standard Deviation": null,
"Rank": 14
},
"Logical": {
"Average Score": 77.02518347398768,
"Standard Deviation": null,
"Rank": 15
},
"Social": {
"Average Score": 51.13078063545894,
"Standard Deviation": null,
"Rank": 25
},
"Chemistry": {
"Average Score": 72.4125941071821,
"Standard Deviation": null,
"Rank": 16
},
"CPP": {
"Average Score": 69.11824072252848,
"Standard Deviation": null,
"Rank": 12
},
"Physics": {
"Average Score": 87.0543996394885,
"Standard Deviation": null,
"Rank": 13
},
"Biology": {
"Average Score": 82.36213636857161,
"Standard Deviation": null,
"Rank": 13
}
}
},
{
"config": {
"model_name": "gemma-2-27b-it",
"organization": "Google",
"license": "Gemma License",
"knowledge_cutoff": "2024/06"
},
"results": {
"OVERALL": {
"Average Score": 70.59188609288081,
"Standard Deviation": 8.717841670213112,
"Rank": 19
},
"Geometry": {
"Average Score": 58.00008857041582,
"Standard Deviation": null,
"Rank": 19
},
"Algebra": {
"Average Score": 77.82927803658924,
"Standard Deviation": null,
"Rank": 19
},
"Probability": {
"Average Score": 69.63382706259532,
"Standard Deviation": null,
"Rank": 18
},
"Logical": {
"Average Score": 73.55136762438677,
"Standard Deviation": null,
"Rank": 18
},
"Social": {
"Average Score": 57.17847568664103,
"Standard Deviation": null,
"Rank": 20
},
"Chemistry": {
"Average Score": 68.65449070488427,
"Standard Deviation": null,
"Rank": 20
},
"CPP": {
"Average Score": 63.28920072143611,
"Standard Deviation": null,
"Rank": 14
},
"Physics": {
"Average Score": 76.8395150041688,
"Standard Deviation": null,
"Rank": 19
},
"Biology": {
"Average Score": 66.99846220210911,
"Standard Deviation": null,
"Rank": 21
}
}
},
{
"config": {
"model_name": "claude-3-opus",
"organization": "Anthropic",
"license": "Proprietary",
"knowledge_cutoff": "2023/08"
},
"results": {
"OVERALL": {
"Average Score": 79.77338364506384,
"Standard Deviation": 2.32886155429398,
"Rank": 16
},
"Geometry": {
"Average Score": 57.5200576513199,
"Standard Deviation": null,
"Rank": 20
},
"Algebra": {
"Average Score": 76.89230078890219,
"Standard Deviation": null,
"Rank": 20
},
"Probability": {
"Average Score": 71.20578106177237,
"Standard Deviation": null,
"Rank": 16
},
"Logical": {
"Average Score": 78.93505058041774,
"Standard Deviation": null,
"Rank": 13
},
"Social": {
"Average Score": 88.40491896661747,
"Standard Deviation": null,
"Rank": 8
},
"Chemistry": {
"Average Score": 79.0571776580065,
"Standard Deviation": null,
"Rank": 10
},
"CPP": {
"Average Score": 73.5404403567132,
"Standard Deviation": null,
"Rank": 8
},
"Physics": {
"Average Score": 87.28118117714033,
"Standard Deviation": null,
"Rank": 12
},
"Biology": {
"Average Score": 71.23527633371832,
"Standard Deviation": null,
"Rank": 20
}
}
},
{
"config": {
"model_name": "gemma-2-9b-it-simpo",
"organization": "Google",
"license": "Gemma License",
"knowledge_cutoff": "2024/07"
},
"results": {
"OVERALL": {
"Average Score": "N/A",
"Standard Deviation": "N/A",
"Rank": "N/A"
},
"Geometry": {
"Average Score": 57.520011750672175,
"Standard Deviation": null,
"Rank": 21
},
"Algebra": {
"Average Score": 72.3731046476544,
"Standard Deviation": null,
"Rank": 21
},
"Probability": {
"Average Score": 61.79614379365174,
"Standard Deviation": null,
"Rank": 22
},
"Logical": {
"Average Score": 64.62661472571767,
"Standard Deviation": null,
"Rank": 23
},
"Social": {
"Average Score": 87.65488278831526,
"Standard Deviation": null,
"Rank": 9
},
"Chemistry": {
"Average Score": 85.36850564169866,
"Standard Deviation": null,
"Rank": 5
},
"CPP": {
"Average Score": 73.43757596214863,
"Standard Deviation": null,
"Rank": 9
},
"Physics": {
"Average Score": 82.02727994935249,
"Standard Deviation": null,
"Rank": 17
},
"Biology": {
"Average Score": 88.80821937078267,
"Standard Deviation": null,
"Rank": 7
}
}
},
{
"config": {
"model_name": "qwen1.5-72b-chat",
"organization": "Alibaba",
"license": "Qianwen LICENSE",
"knowledge_cutoff": "2024/03"
},
"results": {
"OVERALL": {
"Average Score": 61.57517122936127,
"Standard Deviation": 5.01096656930536,
"Rank": 21
},
"Geometry": {
"Average Score": 49.36591842356095,
"Standard Deviation": null,
"Rank": 28
},
"Algebra": {
"Average Score": 71.12615153442515,
"Standard Deviation": null,
"Rank": 22
},
"Probability": {
"Average Score": 51.76027345875035,
"Standard Deviation": null,
"Rank": 28
},
"Logical": {
"Average Score": 34.74438889550426,
"Standard Deviation": null,
"Rank": 39
},
"Social": {
"Average Score": 47.47112348597555,
"Standard Deviation": null,
"Rank": 27
},
"Chemistry": {
"Average Score": 51.65772092991593,
"Standard Deviation": null,
"Rank": 25
},
"CPP": {
"Average Score": 48.69302376665551,
"Standard Deviation": null,
"Rank": 20
},
"Physics": {
"Average Score": 62.45893584822384,
"Standard Deviation": null,
"Rank": 27
},
"Biology": {
"Average Score": 56.96571500324531,
"Standard Deviation": null,
"Rank": 27
}
}
},
{
"config": {
"model_name": "qwen1.5-32b-chat",
"organization": "Alibaba",
"license": "Qianwen LICENSE",
"knowledge_cutoff": "2024/03"
},
"results": {
"OVERALL": {
"Average Score": 45.3199699974334,
"Standard Deviation": 3.7527776450894996,
"Rank": 31
},
"Geometry": {
"Average Score": 45.66389348479106,
"Standard Deviation": null,
"Rank": 30
},
"Algebra": {
"Average Score": 64.9403510842088,
"Standard Deviation": null,
"Rank": 25
},
"Probability": {
"Average Score": 51.99376831114535,
"Standard Deviation": null,
"Rank": 27
},
"Logical": {
"Average Score": 39.30230377209954,
"Standard Deviation": null,
"Rank": 36
},
"Social": {
"Average Score": 45.679222078247186,
"Standard Deviation": null,
"Rank": 28
},
"Chemistry": {
"Average Score": 46.41262433996582,
"Standard Deviation": null,
"Rank": 28
},
"CPP": {
"Average Score": 45.14284028264288,
"Standard Deviation": null,
"Rank": 24
},
"Physics": {
"Average Score": 65.80533740982938,
"Standard Deviation": null,
"Rank": 25
},
"Biology": {
"Average Score": 50.767985684362536,
"Standard Deviation": null,
"Rank": 33
}
}
},
{
"config": {
"model_name": "google-gemma-2-9b-it",
"organization": "Google",
"license": "Proprietary",
"knowledge_cutoff": "2024/06"
},
"results": {
"OVERALL": {
"Average Score": 59.024943267290716,
"Standard Deviation": 3.979239820929726,
"Rank": 23
},
"Geometry": {
"Average Score": 53.495866814128156,
"Standard Deviation": null,
"Rank": 24
},
"Algebra": {
"Average Score": 65.98776390439404,
"Standard Deviation": null,
"Rank": 23
},
"Probability": {
"Average Score": 65.76699220336998,
"Standard Deviation": null,
"Rank": 21
},
"Logical": {
"Average Score": 71.04386923330611,
"Standard Deviation": null,
"Rank": 20
},
"Social": {
"Average Score": 73.74087367208867,
"Standard Deviation": null,
"Rank": 14
},
"Chemistry": {
"Average Score": 57.074735438190935,
"Standard Deviation": null,
"Rank": 22
},
"CPP": {
"Average Score": 54.03167523687635,
"Standard Deviation": null,
"Rank": 17
},
"Physics": {
"Average Score": 63.03919029129539,
"Standard Deviation": null,
"Rank": 26
},
"Biology": {
"Average Score": 63.18363754826406,
"Standard Deviation": null,
"Rank": 23
}
}
},
{
"config": {
"model_name": "yi-1.5-34b-chat",
"organization": "01 AI",
"license": "Proprietary",
"knowledge_cutoff": "2024/05"
},
"results": {
"OVERALL": {
"Average Score": 71.78031967728624,
"Standard Deviation": 12.994861744386325,
"Rank": 18
},
"Geometry": {
"Average Score": 54.06826621860964,
"Standard Deviation": null,
"Rank": 23
},
"Algebra": {
"Average Score": 65.66679210942144,
"Standard Deviation": null,
"Rank": 24
},
"Probability": {
"Average Score": 66.46858903563573,
"Standard Deviation": null,
"Rank": 20
},
"Logical": {
"Average Score": 67.36081192984079,
"Standard Deviation": null,
"Rank": 21
},
"Social": {
"Average Score": 53.898293694371446,
"Standard Deviation": null,
"Rank": 22
},
"Chemistry": {
"Average Score": 56.1520167017115,
"Standard Deviation": null,
"Rank": 23
},
"CPP": {
"Average Score": 52.148798061768964,
"Standard Deviation": null,
"Rank": 18
},
"Physics": {
"Average Score": 73.06547347263036,
"Standard Deviation": null,
"Rank": 21
},
"Biology": {
"Average Score": 72.47949036617567,
"Standard Deviation": null,
"Rank": 18
}
}
},
{
"config": {
"model_name": "meta-llama-3.1-70b-instruct",
"organization": "Meta",
"license": "Llama 3.1 Community",
"knowledge_cutoff": "2023/12"
},
"results": {
"OVERALL": {
"Average Score": 72.88379857527117,
"Standard Deviation": 3.7053577253028176,
"Rank": 17
},
"Geometry": {
"Average Score": 62.78788327507421,
"Standard Deviation": null,
"Rank": 15
},
"Algebra": {
"Average Score": 80.79028754890449,
"Standard Deviation": null,
"Rank": 16
},
"Probability": {
"Average Score": 69.6338691921361,
"Standard Deviation": null,
"Rank": 17
},
"Logical": {
"Average Score": 74.43905975120572,
"Standard Deviation": null,
"Rank": 16
},
"Social": {
"Average Score": 61.22534257022315,
"Standard Deviation": null,
"Rank": 18
},
"Chemistry": {
"Average Score": 70.9160725889497,
"Standard Deviation": null,
"Rank": 18
},
"CPP": {
"Average Score": 84.36815192532764,
"Standard Deviation": null,
"Rank": 4
},
"Physics": {
"Average Score": 82.02759904132307,
"Standard Deviation": null,
"Rank": 15
},
"Biology": {
"Average Score": 72.47948013923437,
"Standard Deviation": null,
"Rank": 19
}
}
},
{
"config": {
"model_name": "meta-llama-3.1-8b-instruct",
"organization": "Meta",
"license": "Llama 3.1 Community",
"knowledge_cutoff": "2023/12"
},
"results": {
"OVERALL": {
"Average Score": 52.21824740443002,
"Standard Deviation": 3.7833302779202937,
"Rank": 27
},
"Geometry": {
"Average Score": 43.03691891008171,
"Standard Deviation": null,
"Rank": 32
},
"Algebra": {
"Average Score": 64.13661497122277,
"Standard Deviation": null,
"Rank": 26
},
"Probability": {
"Average Score": 55.37882298464668,
"Standard Deviation": null,
"Rank": 25
},
"Logical": {
"Average Score": 53.843773408414144,
"Standard Deviation": null,
"Rank": 29
},
"Social": {
"Average Score": 44.993575656549545,
"Standard Deviation": null,
"Rank": 30
},
"Chemistry": {
"Average Score": 43.98798267082055,
"Standard Deviation": null,
"Rank": 31
},
"CPP": {
"Average Score": 44.41846841004584,
"Standard Deviation": null,
"Rank": 26
},
"Physics": {
"Average Score": 49.65976817230991,
"Standard Deviation": null,
"Rank": 37
},
"Biology": {
"Average Score": 52.132998637966764,
"Standard Deviation": null,
"Rank": 32
}
}
},
{
"config": {
"model_name": "gpt3.5-turbo-0125",
"organization": "OpenAI",
"license": "Proprietary",
"knowledge_cutoff": "2021/09"
},
"results": {
"OVERALL": {
"Average Score": 32.61987548870099,
"Standard Deviation": 7.421068133219178,
"Rank": 41
},
"Geometry": {
"Average Score": 52.43446046073764,
"Standard Deviation": null,
"Rank": 25
},
"Algebra": {
"Average Score": 62.62345918733465,
"Standard Deviation": null,
"Rank": 27
},
"Probability": {
"Average Score": 46.778615832700474,
"Standard Deviation": null,
"Rank": 30
},
"Logical": {
"Average Score": 20.161483818418485,
"Standard Deviation": null,
"Rank": 48
},
"Social": {
"Average Score": 36.005021312700556,
"Standard Deviation": null,
"Rank": 43
},
"Chemistry": {
"Average Score": 41.27375172990709,
"Standard Deviation": null,
"Rank": 34
},
"CPP": {
"Average Score": 40.46958736582551,
"Standard Deviation": null,
"Rank": 29
},
"Physics": {
"Average Score": 53.13517938912883,
"Standard Deviation": null,
"Rank": 33
},
"Biology": {
"Average Score": 40.750963952571375,
"Standard Deviation": null,
"Rank": 43
}
}
},
{
"config": {
"model_name": "llama-3-70b-instruct",
"organization": "Meta",
"license": "Llama 3 Community",
"knowledge_cutoff": "2023/12"
},
"results": {
"OVERALL": {
"Average Score": 58.67095788786492,
"Standard Deviation": 3.916500171452786,
"Rank": 25
},
"Geometry": {
"Average Score": 47.16123420770543,
"Standard Deviation": null,
"Rank": 29
},
"Algebra": {
"Average Score": 62.38398769226985,
"Standard Deviation": null,
"Rank": 28
},
"Probability": {
"Average Score": 57.7568005808253,
"Standard Deviation": null,
"Rank": 23
},
"Logical": {
"Average Score": 84.45551822980201,
"Standard Deviation": null,
"Rank": 11
},
"Social": {
"Average Score": 52.450283668620365,
"Standard Deviation": null,
"Rank": 23
},
"Chemistry": {
"Average Score": 70.91630635362482,
"Standard Deviation": null,
"Rank": 17
},
"CPP": {
"Average Score": 65.32140697218945,
"Standard Deviation": null,
"Rank": 13
},
"Physics": {
"Average Score": 78.08120808341037,
"Standard Deviation": null,
"Rank": 18
},
"Biology": {
"Average Score": 60.6111504865126,
"Standard Deviation": null,
"Rank": 25
}
}
},
{
"config": {
"model_name": "claude-3-sonnet",
"organization": "Anthropic",
"license": "Proprietary",
"knowledge_cutoff": "2023/08"
},
"results": {
"OVERALL": {
"Average Score": 61.25499749085383,
"Standard Deviation": 5.012226129836105,
"Rank": 22
},
"Geometry": {
"Average Score": 52.4291917862642,
"Standard Deviation": null,
"Rank": 26
},
"Algebra": {
"Average Score": 60.40928261066776,
"Standard Deviation": null,
"Rank": 29
},
"Probability": {
"Average Score": 57.4556182999398,
"Standard Deviation": null,
"Rank": 24
},
"Logical": {
"Average Score": 66.81740129837053,
"Standard Deviation": null,
"Rank": 22
},
"Social": {
"Average Score": 69.99747730347514,
"Standard Deviation": null,
"Rank": 17
},
"Chemistry": {
"Average Score": 68.8316074174692,
"Standard Deviation": null,
"Rank": 19
},
"CPP": {
"Average Score": 61.33538592327427,
"Standard Deviation": null,
"Rank": 15
},
"Physics": {
"Average Score": 75.18056969699853,
"Standard Deviation": null,
"Rank": 20
},
"Biology": {
"Average Score": 77.09610271458331,
"Standard Deviation": null,
"Rank": 17
}
}
},
{
"config": {
"model_name": "qwen1.5-14b-chat",
"organization": "Alibaba",
"license": "Qianwen LICENSE",
"knowledge_cutoff": "2024/02"
},
"results": {
"OVERALL": {
"Average Score": 44.55620746942043,
"Standard Deviation": 3.997156497824947,
"Rank": 32
},
"Geometry": {
"Average Score": 36.7560037779628,
"Standard Deviation": null,
"Rank": 34
},
"Algebra": {
"Average Score": 59.50136116119945,
"Standard Deviation": null,
"Rank": 30
},
"Probability": {
"Average Score": 40.080049006314795,
"Standard Deviation": null,
"Rank": 35
},
"Logical": {
"Average Score": 34.744529623515994,
"Standard Deviation": null,
"Rank": 38
},
"Social": {
"Average Score": 40.62146960769885,
"Standard Deviation": null,
"Rank": 36
},
"Chemistry": {
"Average Score": 38.9739127306118,
"Standard Deviation": null,
"Rank": 37
},
"CPP": {
"Average Score": 38.552779976347026,
"Standard Deviation": null,
"Rank": 31
},
"Physics": {
"Average Score": 57.98313138991904,
"Standard Deviation": null,
"Rank": 31
},
"Biology": {
"Average Score": 45.732215792439575,
"Standard Deviation": null,
"Rank": 40
}
}
},
{
"config": {
"model_name": "claude-3-haiku",
"organization": "Anthropic",
"license": "Proprietary",
"knowledge_cutoff": "2023/08"
},
"results": {
"OVERALL": {
"Average Score": 54.96475677538885,
"Standard Deviation": 5.908641649857827,
"Rank": 26
},
"Geometry": {
"Average Score": 43.48740351644307,
"Standard Deviation": null,
"Rank": 31
},
"Algebra": {
"Average Score": 55.72045911130164,
"Standard Deviation": null,
"Rank": 33
},
"Probability": {
"Average Score": 53.07470665022828,
"Standard Deviation": null,
"Rank": 26
},
"Logical": {
"Average Score": 63.661198382201675,
"Standard Deviation": null,
"Rank": 24
},
"Social": {
"Average Score": 56.49297908205363,
"Standard Deviation": null,
"Rank": 21
},
"Chemistry": {
"Average Score": 60.28485867590517,
"Standard Deviation": null,
"Rank": 21
},
"CPP": {
"Average Score": 56.40200048817984,
"Standard Deviation": null,
"Rank": 16
},
"Physics": {
"Average Score": 67.69802411023282,
"Standard Deviation": null,
"Rank": 24
},
"Biology": {
"Average Score": 60.63801358326118,
"Standard Deviation": null,
"Rank": 24
}
}
},
{
"config": {
"model_name": "claude-2.1",
"organization": "Anthropic",
"license": "Proprietary",
"knowledge_cutoff": "Unknown"
},
"results": {
"OVERALL": {
"Average Score": 39.436633770824685,
"Standard Deviation": 1.0979568551024126,
"Rank": 36
},
"Geometry": {
"Average Score": 52.12445910303711,
"Standard Deviation": null,
"Rank": 27
},
"Algebra": {
"Average Score": 55.51421646167608,
"Standard Deviation": null,
"Rank": 34
},
"Probability": {
"Average Score": 44.720527688076,
"Standard Deviation": null,
"Rank": 33
},
"Logical": {
"Average Score": 61.64930710809233,
"Standard Deviation": null,
"Rank": 25
},
"Social": {
"Average Score": 41.24714538607354,
"Standard Deviation": null,
"Rank": 35
},
"Chemistry": {
"Average Score": 49.503134730071984,
"Standard Deviation": null,
"Rank": 26
},
"CPP": {
"Average Score": 47.23672563994903,
"Standard Deviation": null,
"Rank": 21
},
"Physics": {
"Average Score": 71.80748688814478,
"Standard Deviation": null,
"Rank": 22
},
"Biology": {
"Average Score": 56.35051024959833,
"Standard Deviation": null,
"Rank": 28
}
}
},
{
"config": {
"model_name": "mistral-8x7b-instruct-v0.1",
"organization": "Mistral",
"license": "Apache 2.0",
"knowledge_cutoff": "2023/12"
},
"results": {
"OVERALL": {
"Average Score": 41.89229040550289,
"Standard Deviation": 1.0093122675555612,
"Rank": 33
},
"Geometry": {
"Average Score": 33.703560702831055,
"Standard Deviation": null,
"Rank": 38
},
"Algebra": {
"Average Score": 50.89266418264096,
"Standard Deviation": null,
"Rank": 37
},
"Probability": {
"Average Score": 44.763608895327415,
"Standard Deviation": null,
"Rank": 32
},
"Logical": {
"Average Score": 40.32090734088309,
"Standard Deviation": null,
"Rank": 35
},
"Social": {
"Average Score": 36.25120096194333,
"Standard Deviation": null,
"Rank": 42
},
"Chemistry": {
"Average Score": 45.537417249801685,
"Standard Deviation": null,
"Rank": 29
},
"CPP": {
"Average Score": 44.533118241976666,
"Standard Deviation": null,
"Rank": 25
},
"Physics": {
"Average Score": 59.27177919021739,
"Standard Deviation": null,
"Rank": 29
},
"Biology": {
"Average Score": 53.73577835290789,
"Standard Deviation": null,
"Rank": 29
}
}
},
{
"config": {
"model_name": "claude-2.0",
"organization": "Anthropic",
"license": "Proprietary",
"knowledge_cutoff": "Unknown"
},
"results": {
"OVERALL": {
"Average Score": 29.746629448410072,
"Standard Deviation": 2.904279782741168,
"Rank": 44
},
"Geometry": {
"Average Score": 38.83959305205546,
"Standard Deviation": null,
"Rank": 33
},
"Algebra": {
"Average Score": 50.95581898913443,
"Standard Deviation": null,
"Rank": 36
},
"Probability": {
"Average Score": 46.77856061078482,
"Standard Deviation": null,
"Rank": 31
},
"Logical": {
"Average Score": 55.87663184155831,
"Standard Deviation": null,
"Rank": 28
},
"Social": {
"Average Score": 52.418630462591864,
"Standard Deviation": null,
"Rank": 24
},
"Chemistry": {
"Average Score": 54.485802241006866,
"Standard Deviation": null,
"Rank": 24
},
"CPP": {
"Average Score": 50.773143448036464,
"Standard Deviation": null,
"Rank": 19
},
"Physics": {
"Average Score": 70.21815140033613,
"Standard Deviation": null,
"Rank": 23
},
"Biology": {
"Average Score": 58.06960426451617,
"Standard Deviation": null,
"Rank": 26
}
}
},
{
"config": {
"model_name": "starling-lm-7b-beta",
"organization": "Nexusflow",
"license": "Apache-2.0",
"knowledge_cutoff": "2024/03"
},
"results": {
"OVERALL": {
"Average Score": 49.37320778476737,
"Standard Deviation": 3.6745696228749076,
"Rank": 28
},
"Geometry": {
"Average Score": 34.931531551032506,
"Standard Deviation": null,
"Rank": 37
},
"Algebra": {
"Average Score": 51.66718360952931,
"Standard Deviation": null,
"Rank": 35
},
"Probability": {
"Average Score": 40.79623349276488,
"Standard Deviation": null,
"Rank": 34
},
"Logical": {
"Average Score": 47.86775375284415,
"Standard Deviation": null,
"Rank": 30
},
"Social": {
"Average Score": 42.30631821350664,
"Standard Deviation": null,
"Rank": 33
},
"Chemistry": {
"Average Score": 38.68957842968336,
"Standard Deviation": null,
"Rank": 38
},
"CPP": {
"Average Score": 38.27587102395908,
"Standard Deviation": null,
"Rank": 32
},
"Physics": {
"Average Score": 43.122496379867655,
"Standard Deviation": null,
"Rank": 40
},
"Biology": {
"Average Score": 49.80517713841127,
"Standard Deviation": null,
"Rank": 35
}
}
},
{
"config": {
"model_name": "gemini-1.0-pro-001",
"organization": "Google",
"license": "Proprietary",
"knowledge_cutoff": "2023/04"
},
"results": {
"OVERALL": {
"Average Score": 37.757029496159134,
"Standard Deviation": 2.4871563947325797,
"Rank": 38
},
"Geometry": {
"Average Score": 35.792088134579124,
"Standard Deviation": null,
"Rank": 36
},
"Algebra": {
"Average Score": 50.157930404365224,
"Standard Deviation": null,
"Rank": 38
},
"Probability": {
"Average Score": 25.033769367203313,
"Standard Deviation": null,
"Rank": 47
},
"Logical": {
"Average Score": 23.38732786204667,
"Standard Deviation": null,
"Rank": 46
},
"Social": {
"Average Score": 26.25171796810704,
"Standard Deviation": null,
"Rank": 51
},
"Chemistry": {
"Average Score": 43.59712830576298,
"Standard Deviation": null,
"Rank": 32
},
"CPP": {
"Average Score": 45.22204471452975,
"Standard Deviation": null,
"Rank": 23
},
"Physics": {
"Average Score": 62.1145967631314,
"Standard Deviation": null,
"Rank": 28
},
"Biology": {
"Average Score": 38.93328880463975,
"Standard Deviation": null,
"Rank": 46
}
}
},
{
"config": {
"model_name": "openchat-3.5-0106",
"organization": "OpenChat",
"license": "Apache-2.0",
"knowledge_cutoff": "2024/01"
},
"results": {
"OVERALL": {
"Average Score": 39.892305843585234,
"Standard Deviation": 2.147396504115797,
"Rank": 35
},
"Geometry": {
"Average Score": 29.941588970091672,
"Standard Deviation": null,
"Rank": 40
},
"Algebra": {
"Average Score": 47.48449168554534,
"Standard Deviation": null,
"Rank": 39
},
"Probability": {
"Average Score": 39.64777697224284,
"Standard Deviation": null,
"Rank": 36
},
"Logical": {
"Average Score": 41.361836834955504,
"Standard Deviation": null,
"Rank": 33
},
"Social": {
"Average Score": 36.716597579856675,
"Standard Deviation": null,
"Rank": 41
},
"Chemistry": {
"Average Score": 32.618034432282414,
"Standard Deviation": null,
"Rank": 41
},
"CPP": {
"Average Score": 33.70639271807677,
"Standard Deviation": null,
"Rank": 33
},
"Physics": {
"Average Score": 41.117269227834775,
"Standard Deviation": null,
"Rank": 42
},
"Biology": {
"Average Score": 46.46694211682319,
"Standard Deviation": null,
"Rank": 38
}
}
},
{
"config": {
"model_name": "openchat-3.5",
"organization": "OpenChat",
"license": "Apache-2.0",
"knowledge_cutoff": "2023/11"
},
"results": {
"OVERALL": {
"Average Score": 38.241198423073044,
"Standard Deviation": 0.5484943791516782,
"Rank": 37
},
"Geometry": {
"Average Score": 30.89638678506991,
"Standard Deviation": null,
"Rank": 39
},
"Algebra": {
"Average Score": 41.83128388520244,
"Standard Deviation": null,
"Rank": 42
},
"Probability": {
"Average Score": 36.10478976665624,
"Standard Deviation": null,
"Rank": 39
},
"Logical": {
"Average Score": 40.320934300651516,
"Standard Deviation": null,
"Rank": 34
},
"Social": {
"Average Score": 43.49055300551458,
"Standard Deviation": null,
"Rank": 31
},
"Chemistry": {
"Average Score": 34.73882038803731,
"Standard Deviation": null,
"Rank": 40
},
"CPP": {
"Average Score": 33.020911255646965,
"Standard Deviation": null,
"Rank": 34
},
"Physics": {
"Average Score": 43.28671808104924,
"Standard Deviation": null,
"Rank": 39
},
"Biology": {
"Average Score": 37.18520956253795,
"Standard Deviation": null,
"Rank": 47
}
}
},
{
"config": {
"model_name": "command-r-(08-2024)",
"organization": "Cohere",
"license": "CC-BY-NC-4.0",
"knowledge_cutoff": "2024/08"
},
"results": {
"OVERALL": {
"Average Score": 45.419599943563604,
"Standard Deviation": 3.867586763039621,
"Rank": 30
},
"Geometry": {
"Average Score": 36.68143035371426,
"Standard Deviation": null,
"Rank": 35
},
"Algebra": {
"Average Score": 41.64517540472657,
"Standard Deviation": null,
"Rank": 43
},
"Probability": {
"Average Score": 37.95189112967414,
"Standard Deviation": null,
"Rank": 38
},
"Logical": {
"Average Score": 25.409088658564166,
"Standard Deviation": null,
"Rank": 43
},
"Social": {
"Average Score": 40.389393367109264,
"Standard Deviation": null,
"Rank": 37
},
"Chemistry": {
"Average Score": 40.08660883479598,
"Standard Deviation": null,
"Rank": 36
},
"CPP": {
"Average Score": 39.61492485677676,
"Standard Deviation": null,
"Rank": 30
},
"Physics": {
"Average Score": 49.51833550380945,
"Standard Deviation": null,
"Rank": 38
},
"Biology": {
"Average Score": 46.55085862120477,
"Standard Deviation": null,
"Rank": 37
}
}
},
{
"config": {
"model_name": "gemma-1.1-7b-it",
"organization": "Google",
"license": "Gemma License",
"knowledge_cutoff": "2024/02"
},
"results": {
"OVERALL": {
"Average Score": 31.46481370727848,
"Standard Deviation": 5.403408635399989,
"Rank": 42
},
"Geometry": {
"Average Score": 26.078500005143134,
"Standard Deviation": null,
"Rank": 45
},
"Algebra": {
"Average Score": 40.92453155837702,
"Standard Deviation": null,
"Rank": 44
},
"Probability": {
"Average Score": 31.502661407350192,
"Standard Deviation": null,
"Rank": 44
},
"Logical": {
"Average Score": 39.27282391466396,
"Standard Deviation": null,
"Rank": 37
},
"Social": {
"Average Score": 31.639615427886643,
"Standard Deviation": null,
"Rank": 46
},
"Chemistry": {
"Average Score": 43.59704806585925,
"Standard Deviation": null,
"Rank": 33
},
"CPP": {
"Average Score": 42.666504105798204,
"Standard Deviation": null,
"Rank": 27
},
"Physics": {
"Average Score": 49.845369349755345,
"Standard Deviation": null,
"Rank": 36
},
"Biology": {
"Average Score": 45.813201684684124,
"Standard Deviation": null,
"Rank": 39
}
}
},
{
"config": {
"model_name": "llama3-8b-instruct",
"organization": "Meta",
"license": "Llama 3 Community",
"knowledge_cutoff": "2023/03"
},
"results": {
"OVERALL": {
"Average Score": 36.30010331322555,
"Standard Deviation": 2.6021295258334334,
"Rank": 40
},
"Geometry": {
"Average Score": 28.61237715170709,
"Standard Deviation": null,
"Rank": 42
},
"Algebra": {
"Average Score": 42.6394310988214,
"Standard Deviation": null,
"Rank": 41
},
"Probability": {
"Average Score": 35.51226405104781,
"Standard Deviation": null,
"Rank": 40
},
"Logical": {
"Average Score": 59.594410427422616,
"Standard Deviation": null,
"Rank": 26
},
"Social": {
"Average Score": 42.58469219441349,
"Standard Deviation": null,
"Rank": 32
},
"Chemistry": {
"Average Score": 48.45708298495634,
"Standard Deviation": null,
"Rank": 27
},
"CPP": {
"Average Score": 45.35392139264795,
"Standard Deviation": null,
"Rank": 22
},
"Physics": {
"Average Score": 58.61979255906953,
"Standard Deviation": null,
"Rank": 30
},
"Biology": {
"Average Score": 50.39755478099045,
"Standard Deviation": null,
"Rank": 34
}
}
},
{
"config": {
"model_name": "gemma-2-2b-it",
"organization": "Google",
"license": "Gemma License",
"knowledge_cutoff": "2024/07"
},
"results": {
"OVERALL": {
"Average Score": 58.76741528626868,
"Standard Deviation": 5.683174110350625,
"Rank": 24
},
"Geometry": {
"Average Score": 29.901411513695468,
"Standard Deviation": null,
"Rank": 41
},
"Algebra": {
"Average Score": 40.60048971047775,
"Standard Deviation": null,
"Rank": 45
},
"Probability": {
"Average Score": 33.448597365831304,
"Standard Deviation": null,
"Rank": 42
},
"Logical": {
"Average Score": 43.89688208707135,
"Standard Deviation": null,
"Rank": 31
},
"Social": {
"Average Score": 48.769368715100335,
"Standard Deviation": null,
"Rank": 26
},
"Chemistry": {
"Average Score": 28.982153819366474,
"Standard Deviation": null,
"Rank": 44
},
"CPP": {
"Average Score": 30.53406933106768,
"Standard Deviation": null,
"Rank": 36
},
"Physics": {
"Average Score": 22.78354134298823,
"Standard Deviation": null,
"Rank": 49
},
"Biology": {
"Average Score": 53.59359459245764,
"Standard Deviation": null,
"Rank": 30
}
}
},
{
"config": {
"model_name": "starling-lm-7b-alpha",
"organization": "Nexusflow",
"license": "Apache-2.0",
"knowledge_cutoff": "2023/11"
},
"results": {
"OVERALL": {
"Average Score": 36.98646367219327,
"Standard Deviation": 0.5488180472607256,
"Rank": 39
},
"Geometry": {
"Average Score": 26.472892835994372,
"Standard Deviation": null,
"Rank": 44
},
"Algebra": {
"Average Score": 38.4553696839335,
"Standard Deviation": null,
"Rank": 47
},
"Probability": {
"Average Score": 33.907837077924526,
"Standard Deviation": null,
"Rank": 41
},
"Logical": {
"Average Score": 33.129169647630114,
"Standard Deviation": null,
"Rank": 41
},
"Social": {
"Average Score": 39.97855588617487,
"Standard Deviation": null,
"Rank": 38
},
"Chemistry": {
"Average Score": 29.187364253387454,
"Standard Deviation": null,
"Rank": 43
},
"CPP": {
"Average Score": 30.07926487356878,
"Standard Deviation": null,
"Rank": 37
},
"Physics": {
"Average Score": 32.39068796677421,
"Standard Deviation": null,
"Rank": 43
},
"Biology": {
"Average Score": 40.884001946009214,
"Standard Deviation": null,
"Rank": 41
}
}
},
{
"config": {
"model_name": "qwen1.5-4b-chat",
"organization": "Alibaba",
"license": "Qianwen LICENSE",
"knowledge_cutoff": "2024/02"
},
"results": {
"OVERALL": {
"Average Score": 9.87888465860545,
"Standard Deviation": 0.8496756485041839,
"Rank": 58
},
"Geometry": {
"Average Score": 16.727214095722648,
"Standard Deviation": null,
"Rank": 51
},
"Algebra": {
"Average Score": 30.868954326245674,
"Standard Deviation": null,
"Rank": 48
},
"Probability": {
"Average Score": 12.542151831707827,
"Standard Deviation": null,
"Rank": 52
},
"Logical": {
"Average Score": 13.591142976589552,
"Standard Deviation": null,
"Rank": 55
},
"Social": {
"Average Score": 29.86221951671923,
"Standard Deviation": null,
"Rank": 47
},
"Chemistry": {
"Average Score": 15.258365841050109,
"Standard Deviation": null,
"Rank": 57
},
"CPP": {
"Average Score": 13.21208067122554,
"Standard Deviation": null,
"Rank": 47
},
"Physics": {
"Average Score": 12.8962411286233,
"Standard Deviation": null,
"Rank": 56
},
"Biology": {
"Average Score": 8.598267308776672,
"Standard Deviation": null,
"Rank": 61
}
}
},
{
"config": {
"model_name": "command-r-(04-2024)",
"organization": "Cohere",
"license": "CC-BY-NC-4.0",
"knowledge_cutoff": "2024/04"
},
"results": {
"OVERALL": {
"Average Score": 41.52933196050375,
"Standard Deviation": 2.241081240676662,
"Rank": 34
},
"Geometry": {
"Average Score": 25.015789717085156,
"Standard Deviation": null,
"Rank": 47
},
"Algebra": {
"Average Score": 30.86273392294722,
"Standard Deviation": null,
"Rank": 49
},
"Probability": {
"Average Score": 32.69230455171987,
"Standard Deviation": null,
"Rank": 43
},
"Logical": {
"Average Score": 34.412636294090625,
"Standard Deviation": null,
"Rank": 40
},
"Social": {
"Average Score": 41.24738365139523,
"Standard Deviation": null,
"Rank": 34
},
"Chemistry": {
"Average Score": 40.79571212108303,
"Standard Deviation": null,
"Rank": 35
},
"CPP": {
"Average Score": 41.346336503003236,
"Standard Deviation": null,
"Rank": 28
},
"Physics": {
"Average Score": 52.309001772076435,
"Standard Deviation": null,
"Rank": 34
},
"Biology": {
"Average Score": 49.100219607909104,
"Standard Deviation": null,
"Rank": 36
}
}
},
{
"config": {
"model_name": "vicuna-33b",
"organization": "LMSYS",
"license": "Non-commercial",
"knowledge_cutoff": "2023/08"
},
"results": {
"OVERALL": {
"Average Score": 26.771867469042252,
"Standard Deviation": 2.2628124527776685,
"Rank": 45
},
"Geometry": {
"Average Score": 17.75361072083444,
"Standard Deviation": null,
"Rank": 50
},
"Algebra": {
"Average Score": 24.801410292720103,
"Standard Deviation": null,
"Rank": 50
},
"Probability": {
"Average Score": 18.923598681430988,
"Standard Deviation": null,
"Rank": 50
},
"Logical": {
"Average Score": 22.485046383293895,
"Standard Deviation": null,
"Rank": 47
},
"Social": {
"Average Score": 37.63057970959196,
"Standard Deviation": null,
"Rank": 40
},
"Chemistry": {
"Average Score": 28.982029986253178,
"Standard Deviation": null,
"Rank": 45
},
"CPP": {
"Average Score": 28.01838653090379,
"Standard Deviation": null,
"Rank": 38
},
"Physics": {
"Average Score": 28.904101398112875,
"Standard Deviation": null,
"Rank": 45
},
"Biology": {
"Average Score": 40.66824421437282,
"Standard Deviation": null,
"Rank": 44
}
}
},
{
"config": {
"model_name": "gemma-7b-it",
"organization": "Google",
"license": "Gemma License",
"knowledge_cutoff": "2024/02"
},
"results": {
"OVERALL": {
"Average Score": 23.946098797294113,
"Standard Deviation": 1.882540513317503,
"Rank": 48
},
"Geometry": {
"Average Score": 20.947476737376597,
"Standard Deviation": null,
"Rank": 48
},
"Algebra": {
"Average Score": 23.018014851651127,
"Standard Deviation": null,
"Rank": 52
},
"Probability": {
"Average Score": 15.37360248124904,
"Standard Deviation": null,
"Rank": 51
},
"Logical": {
"Average Score": 23.856001036256362,
"Standard Deviation": null,
"Rank": 44
},
"Social": {
"Average Score": 33.803173718782276,
"Standard Deviation": null,
"Rank": 44
},
"Chemistry": {
"Average Score": 28.96403210090221,
"Standard Deviation": null,
"Rank": 46
},
"CPP": {
"Average Score": 28.014658234926813,
"Standard Deviation": null,
"Rank": 39
},
"Physics": {
"Average Score": 31.52560551567879,
"Standard Deviation": null,
"Rank": 44
},
"Biology": {
"Average Score": 33.30740831237261,
"Standard Deviation": null,
"Rank": 48
}
}
},
{
"config": {
"model_name": "mistral-7b-instruct-2",
"organization": "Mistral",
"license": "Apache 2.0",
"knowledge_cutoff": "2023/12"
},
"results": {
"OVERALL": {
"Average Score": 30.425212839239084,
"Standard Deviation": 3.2420324833230745,
"Rank": 43
},
"Geometry": {
"Average Score": 17.98077256453581,
"Standard Deviation": null,
"Rank": 49
},
"Algebra": {
"Average Score": 23.03227606898818,
"Standard Deviation": null,
"Rank": 51
},
"Probability": {
"Average Score": 22.515548503444595,
"Standard Deviation": null,
"Rank": 48
},
"Logical": {
"Average Score": 28.172299674407935,
"Standard Deviation": null,
"Rank": 42
},
"Social": {
"Average Score": 32.34681006422513,
"Standard Deviation": null,
"Rank": 45
},
"Chemistry": {
"Average Score": 29.847754052571794,
"Standard Deviation": null,
"Rank": 42
},
"CPP": {
"Average Score": 31.382959631870822,
"Standard Deviation": null,
"Rank": 35
},
"Physics": {
"Average Score": 42.179522893964496,
"Standard Deviation": null,
"Rank": 41
},
"Biology": {
"Average Score": 40.80741758174906,
"Standard Deviation": null,
"Rank": 42
}
}
},
{
"config": {
"model_name": "mistral-7b-instruct-1",
"organization": "Mistral",
"license": "Apache 2.0",
"knowledge_cutoff": "2023/12"
},
"results": {
"OVERALL": {
"Average Score": 19.00770440704137,
"Standard Deviation": 2.5108129577834823,
"Rank": 55
},
"Geometry": {
"Average Score": 11.76124122331528,
"Standard Deviation": null,
"Rank": 55
},
"Algebra": {
"Average Score": 20.16800788676758,
"Standard Deviation": null,
"Rank": 53
},
"Probability": {
"Average Score": 21.982214302316194,
"Standard Deviation": null,
"Rank": 49
},
"Logical": {
"Average Score": 16.458119477880455,
"Standard Deviation": null,
"Rank": 51
},
"Social": {
"Average Score": 11.83909143203254,
"Standard Deviation": null,
"Rank": 56
},
"Chemistry": {
"Average Score": 20.227175038540732,
"Standard Deviation": null,
"Rank": 52
},
"CPP": {
"Average Score": 18.929093202755805,
"Standard Deviation": null,
"Rank": 42
},
"Physics": {
"Average Score": 16.942666711550366,
"Standard Deviation": null,
"Rank": 53
},
"Biology": {
"Average Score": 14.862055999215585,
"Standard Deviation": null,
"Rank": 56
}
}
},
{
"config": {
"model_name": "vicuna-13b",
"organization": "LMSYS",
"license": "Non-commercial",
"knowledge_cutoff": "2023/07"
},
"results": {
"OVERALL": {
"Average Score": 17.596440211877606,
"Standard Deviation": 2.1378036693126887,
"Rank": 56
},
"Geometry": {
"Average Score": 13.613562588758793,
"Standard Deviation": null,
"Rank": 54
},
"Algebra": {
"Average Score": 17.777580357601646,
"Standard Deviation": null,
"Rank": 54
},
"Probability": {
"Average Score": 11.773651220819335,
"Standard Deviation": null,
"Rank": 53
},
"Logical": {
"Average Score": 16.62840722654711,
"Standard Deviation": null,
"Rank": 50
},
"Social": {
"Average Score": 12.015284814277452,
"Standard Deviation": null,
"Rank": 54
},
"Chemistry": {
"Average Score": 22.59071707495557,
"Standard Deviation": null,
"Rank": 49
},
"CPP": {
"Average Score": 21.840013221590294,
"Standard Deviation": null,
"Rank": 40
},
"Physics": {
"Average Score": 23.12484986614339,
"Standard Deviation": null,
"Rank": 48
},
"Biology": {
"Average Score": 32.46475144310054,
"Standard Deviation": null,
"Rank": 49
}
}
},
{
"config": {
"model_name": "zephyr-7b-beta",
"organization": "HuggingFace",
"license": "MIT",
"knowledge_cutoff": "2023/10"
},
"results": {
"OVERALL": {
"Average Score": 9.430771900746599,
"Standard Deviation": 0.5392686957469028,
"Rank": 59
},
"Geometry": {
"Average Score": 8.776172464719641,
"Standard Deviation": null,
"Rank": 56
},
"Algebra": {
"Average Score": 12.864251022808256,
"Standard Deviation": null,
"Rank": 55
},
"Probability": {
"Average Score": 6.856387198441145,
"Standard Deviation": null,
"Rank": 58
},
"Logical": {
"Average Score": 7.23067331414496,
"Standard Deviation": null,
"Rank": 59
},
"Social": {
"Average Score": 0.0,
"Standard Deviation": null,
"Rank": 61
},
"Chemistry": {
"Average Score": 16.809164907349935,
"Standard Deviation": null,
"Rank": 54
},
"CPP": {
"Average Score": 18.92902220864132,
"Standard Deviation": null,
"Rank": 43
},
"Physics": {
"Average Score": 17.655293480361614,
"Standard Deviation": null,
"Rank": 52
},
"Biology": {
"Average Score": 12.415097886994968,
"Standard Deviation": null,
"Rank": 58
}
}
},
{
"config": {
"model_name": "gemma-1.1-2b-it",
"organization": "Google",
"license": "Gemma License",
"knowledge_cutoff": "2024/02"
},
"results": {
"OVERALL": {
"Average Score": 21.90250655573766,
"Standard Deviation": 1.9871388098125085,
"Rank": 52
},
"Geometry": {
"Average Score": 13.697788759430225,
"Standard Deviation": null,
"Rank": 53
},
"Algebra": {
"Average Score": 12.157310639752737,
"Standard Deviation": null,
"Rank": 56
},
"Probability": {
"Average Score": 7.449868080506948,
"Standard Deviation": null,
"Rank": 56
},
"Logical": {
"Average Score": 10.62657710416428,
"Standard Deviation": null,
"Rank": 57
},
"Social": {
"Average Score": 29.175325965898267,
"Standard Deviation": null,
"Rank": 48
},
"Chemistry": {
"Average Score": 21.740619629476075,
"Standard Deviation": null,
"Rank": 50
},
"CPP": {
"Average Score": 20.724691953843916,
"Standard Deviation": null,
"Rank": 41
},
"Physics": {
"Average Score": 23.632640386132042,
"Standard Deviation": null,
"Rank": 47
},
"Biology": {
"Average Score": 29.750661487753543,
"Standard Deviation": null,
"Rank": 50
}
}
},
{
"config": {
"model_name": "llama2-7b-chat",
"organization": "Meta",
"license": "Llama 2 Community",
"knowledge_cutoff": "2023/07"
},
"results": {
"OVERALL": {
"Average Score": 23.15262700172829,
"Standard Deviation": 1.5180515912969421,
"Rank": 50
},
"Geometry": {
"Average Score": 6.062981955604592,
"Standard Deviation": null,
"Rank": 57
},
"Algebra": {
"Average Score": 9.702442741719038,
"Standard Deviation": null,
"Rank": 58
},
"Probability": {
"Average Score": 7.323764901851239,
"Standard Deviation": null,
"Rank": 57
},
"Logical": {
"Average Score": 20.042615636879354,
"Standard Deviation": null,
"Rank": 49
},
"Social": {
"Average Score": 28.003092092497983,
"Standard Deviation": null,
"Rank": 49
},
"Chemistry": {
"Average Score": 20.22732766050842,
"Standard Deviation": null,
"Rank": 51
},
"CPP": {
"Average Score": 15.730513733660898,
"Standard Deviation": null,
"Rank": 45
},
"Physics": {
"Average Score": 12.866623115939365,
"Standard Deviation": null,
"Rank": 57
},
"Biology": {
"Average Score": 29.435323133887913,
"Standard Deviation": null,
"Rank": 51
}
}
},
{
"config": {
"model_name": "gemma-2b-it",
"organization": "Google",
"license": "Gemma License",
"knowledge_cutoff": "2024/02"
},
"results": {
"OVERALL": {
"Average Score": 20.296640473489866,
"Standard Deviation": 2.333666507610861,
"Rank": 53
},
"Geometry": {
"Average Score": 16.155982788407485,
"Standard Deviation": null,
"Rank": 52
},
"Algebra": {
"Average Score": 9.997670449242714,
"Standard Deviation": null,
"Rank": 57
},
"Probability": {
"Average Score": 6.055292262170126,
"Standard Deviation": null,
"Rank": 59
},
"Logical": {
"Average Score": 5.200573121259635,
"Standard Deviation": null,
"Rank": 60
},
"Social": {
"Average Score": 9.560337024016134,
"Standard Deviation": null,
"Rank": 58
},
"Chemistry": {
"Average Score": 16.613881599313693,
"Standard Deviation": null,
"Rank": 55
},
"CPP": {
"Average Score": 17.2715657115764,
"Standard Deviation": null,
"Rank": 44
},
"Physics": {
"Average Score": 17.72258050873005,
"Standard Deviation": null,
"Rank": 51
},
"Biology": {
"Average Score": 10.891363209321185,
"Standard Deviation": null,
"Rank": 59
}
}
},
{
"config": {
"model_name": "llama2-13b-chat",
"organization": "Meta",
"license": "Llama 2 Community",
"knowledge_cutoff": "2023/07"
},
"results": {
"OVERALL": {
"Average Score": 22.40246822660458,
"Standard Deviation": 1.5744155926563603,
"Rank": 51
},
"Geometry": {
"Average Score": 4.287260426268335,
"Standard Deviation": null,
"Rank": 59
},
"Algebra": {
"Average Score": 7.122650832792122,
"Standard Deviation": null,
"Rank": 59
},
"Probability": {
"Average Score": 10.367779885088286,
"Standard Deviation": null,
"Rank": 54
},
"Logical": {
"Average Score": 23.416885515011753,
"Standard Deviation": null,
"Rank": 45
},
"Social": {
"Average Score": 26.251837552806705,
"Standard Deviation": null,
"Rank": 50
},
"Chemistry": {
"Average Score": 15.236408439765913,
"Standard Deviation": null,
"Rank": 58
},
"CPP": {
"Average Score": 13.17258252933903,
"Standard Deviation": null,
"Rank": 48
},
"Physics": {
"Average Score": 9.756032013938237,
"Standard Deviation": null,
"Rank": 58
},
"Biology": {
"Average Score": 14.373926163839833,
"Standard Deviation": null,
"Rank": 57
}
}
},
{
"config": {
"model_name": "vicuna-7b",
"organization": "LMSYS",
"license": "Non-commercial",
"knowledge_cutoff": "2023/07"
},
"results": {
"OVERALL": {
"Average Score": 16.947504584923095,
"Standard Deviation": 2.1935303160759494,
"Rank": 57
},
"Geometry": {
"Average Score": 5.6556788835908565,
"Standard Deviation": null,
"Rank": 58
},
"Algebra": {
"Average Score": 6.937810777972691,
"Standard Deviation": null,
"Rank": 60
},
"Probability": {
"Average Score": 7.449902539116639,
"Standard Deviation": null,
"Rank": 55
},
"Logical": {
"Average Score": 11.53991650872671,
"Standard Deviation": null,
"Rank": 56
},
"Social": {
"Average Score": 10.510431618145562,
"Standard Deviation": null,
"Rank": 57
},
"Chemistry": {
"Average Score": 15.565621989451936,
"Standard Deviation": null,
"Rank": 56
},
"CPP": {
"Average Score": 14.255194156624162,
"Standard Deviation": null,
"Rank": 46
},
"Physics": {
"Average Score": 13.654470501928998,
"Standard Deviation": null,
"Rank": 55
},
"Biology": {
"Average Score": 16.31264249867034,
"Standard Deviation": null,
"Rank": 55
}
}
},
{
"config": {
"model_name": "koala-13b",
"organization": "UC Berkeley",
"license": "Non-commercial",
"knowledge_cutoff": "2023/04"
},
"results": {
"OVERALL": {
"Average Score": 8.83755726181737,
"Standard Deviation": 0.6967904064276641,
"Rank": 60
},
"Geometry": {
"Average Score": 0.16630617078665783,
"Standard Deviation": null,
"Rank": 60
},
"Algebra": {
"Average Score": 2.2176438662182405,
"Standard Deviation": null,
"Rank": 61
},
"Probability": {
"Average Score": 3.0086045641099886,
"Standard Deviation": null,
"Rank": 60
},
"Logical": {
"Average Score": 8.007902379487398,
"Standard Deviation": null,
"Rank": 58
},
"Social": {
"Average Score": 9.267400643797334,
"Standard Deviation": null,
"Rank": 59
},
"Chemistry": {
"Average Score": 6.881971917535636,
"Standard Deviation": null,
"Rank": 59
},
"CPP": {
"Average Score": 6.36433272373514,
"Standard Deviation": null,
"Rank": 49
},
"Physics": {
"Average Score": 1.4745736403582252,
"Standard Deviation": null,
"Rank": 59
},
"Biology": {
"Average Score": 10.173901160370301,
"Standard Deviation": null,
"Rank": 60
}
}
},
{
"config": {
"model_name": "openassistant-pythia-12b",
"organization": "OpenAssistant",
"license": "Non-commercial",
"knowledge_cutoff": "2023/04"
},
"results": {
"OVERALL": {
"Average Score": 0.0,
"Standard Deviation": 0.0,
"Rank": 61
},
"Geometry": {
"Average Score": 0.0,
"Standard Deviation": null,
"Rank": 61
},
"Algebra": {
"Average Score": 0.0,
"Standard Deviation": null,
"Rank": 62
},
"Probability": {
"Average Score": 0.0,
"Standard Deviation": null,
"Rank": 61
},
"Logical": {
"Average Score": 0.0,
"Standard Deviation": null,
"Rank": 61
},
"Social": {
"Average Score": 1.5648937446490145,
"Standard Deviation": null,
"Rank": 60
},
"Chemistry": {
"Average Score": 0.0,
"Standard Deviation": null,
"Rank": 60
},
"CPP": {
"Average Score": 0.0,
"Standard Deviation": null,
"Rank": 50
},
"Physics": {
"Average Score": 0.0,
"Standard Deviation": null,
"Rank": 60
},
"Biology": {
"Average Score": 0.0,
"Standard Deviation": null,
"Rank": 62
}
}
},
{
"config": {
"model_name": "nemotron-70b",
"organization": "NVIDIA",
"license": "Unknown",
"knowledge_cutoff": "Unknown"
},
"results": {
"OVERALL": {
"Average Score": 100.0,
"Standard Deviation": 0.0,
"Rank": 1
},
"Algebra": {
"Average Score": 80.66812253661826,
"Standard Deviation": null,
"Rank": 17
},
"Geometry": {
"Average Score": 64.79317124458657,
"Standard Deviation": null,
"Rank": 14
},
"Probability": {
"Average Score": 77.90998100977566,
"Standard Deviation": null,
"Rank": 10
},
"Logical": {
"Average Score": 92.79205249453312,
"Standard Deviation": null,
"Rank": 5
},
"Social": {
"Average Score": 100.0,
"Standard Deviation": null,
"Rank": 1
},
"Chemistry": {
"Average Score": 75.51792600714916,
"Standard Deviation": null,
"Rank": 14
},
"Physics": {
"Average Score": 87.87343018217607,
"Standard Deviation": null,
"Rank": 11
},
"Biology": {
"Average Score": 89.70989044405452,
"Standard Deviation": null,
"Rank": 6
}
}
},
{
"config": {
"model_name": "llama-3.2-3b-it",
"organization": "Meta",
"license": "Llama 3 Community",
"knowledge_cutoff": "Unknown"
},
"results": {
"OVERALL": {
"Average Score": 24.55648638012998,
"Standard Deviation": 2.7438328116042396,
"Rank": 47
},
"Algebra": {
"Average Score": 58.282081682035965,
"Standard Deviation": null,
"Rank": 32
},
"Probability": {
"Average Score": 38.82178804612166,
"Standard Deviation": null,
"Rank": 37
},
"Logical": {
"Average Score": 14.284884351545829,
"Standard Deviation": null,
"Rank": 53
},
"Social": {
"Average Score": 12.015170971293347,
"Standard Deviation": null,
"Rank": 55
},
"Chemistry": {
"Average Score": 28.594555260782386,
"Standard Deviation": null,
"Rank": 47
},
"Physics": {
"Average Score": 28.49646725691165,
"Standard Deviation": null,
"Rank": 46
},
"Biology": {
"Average Score": 19.26616886675504,
"Standard Deviation": null,
"Rank": 54
}
}
},
{
"config": {
"model_name": "glm-4-plus",
"organization": "Unknown",
"license": "Unknown",
"knowledge_cutoff": "Unknown"
},
"results": {
"Physics": {
"Average Score": 93.38486963586884,
"Standard Deviation": null,
"Rank": 4
},
"Biology": {
"Average Score": 92.22645537080881,
"Standard Deviation": null,
"Rank": 4
},
"Chemistry": {
"Average Score": 83.011021476943,
"Standard Deviation": null,
"Rank": 8
},
"Social": {
"Average Score": 96.10166232633848,
"Standard Deviation": null,
"Rank": 5
},
"Logical": {
"Average Score": 92.48639421432455,
"Standard Deviation": null,
"Rank": 6
},
"Algebra": {
"Average Score": 91.79128700104991,
"Standard Deviation": null,
"Rank": 7
},
"Geometry": {
"Average Score": 75.41344471165868,
"Standard Deviation": null,
"Rank": 10
},
"Probability": {
"Average Score": 76.73191937524591,
"Standard Deviation": null,
"Rank": 12
},
"OVERALL": {
"Average Score": 92.39089671677698,
"Standard Deviation": 0.5005865827133669,
"Rank": 6
}
}
},
{
"config": {
"model_name": "yi-lightning",
"organization": "Unknown",
"license": "Unknown",
"knowledge_cutoff": "Unknown"
},
"results": {
"Physics": {
"Average Score": 88.49402753650628,
"Standard Deviation": null,
"Rank": 9
},
"Biology": {
"Average Score": 90.37891957676416,
"Standard Deviation": null,
"Rank": 5
},
"Chemistry": {
"Average Score": 100.0,
"Standard Deviation": null,
"Rank": 1
},
"Social": {
"Average Score": 92.14580653902937,
"Standard Deviation": null,
"Rank": 6
},
"Logical": {
"Average Score": 94.75701503537329,
"Standard Deviation": null,
"Rank": 4
},
"Algebra": {
"Average Score": 93.3186019721947,
"Standard Deviation": null,
"Rank": 6
},
"Geometry": {
"Average Score": 76.16313216563569,
"Standard Deviation": null,
"Rank": 9
},
"Probability": {
"Average Score": 92.54460354742838,
"Standard Deviation": null,
"Rank": 3
},
"OVERALL": {
"Average Score": 96.802929532644,
"Standard Deviation": 0.27491691197906704,
"Rank": 3
}
}
},
{
"config": {
"model_name": "ministral-8b-it",
"organization": "Unknown",
"license": "Unknown",
"knowledge_cutoff": "Unknown"
},
"results": {
"Physics": {
"Average Score": 57.14492748742418,
"Standard Deviation": null,
"Rank": 32
},
"Biology": {
"Average Score": 53.5479824847229,
"Standard Deviation": null,
"Rank": 31
},
"Chemistry": {
"Average Score": 45.51400153833142,
"Standard Deviation": null,
"Rank": 30
},
"Social": {
"Average Score": 45.54025353861784,
"Standard Deviation": null,
"Rank": 29
},
"Logical": {
"Average Score": 59.25000685096734,
"Standard Deviation": null,
"Rank": 27
},
"Algebra": {
"Average Score": 58.56021213895309,
"Standard Deviation": null,
"Rank": 31
},
"Geometry": {
"Average Score": 54.902884398306554,
"Standard Deviation": null,
"Rank": 22
},
"Probability": {
"Average Score": 49.69358274321923,
"Standard Deviation": null,
"Rank": 29
},
"OVERALL": {
"Average Score": 45.88665474541969,
"Standard Deviation": 4.242263667629549,
"Rank": 29
}
}
},
{
"config": {
"model_name": "qwen2.5-1.5b",
"organization": "Unknown",
"license": "Unknown",
"knowledge_cutoff": "Unknown"
},
"results": {
"Physics": {
"Average Score": 50.38291508013627,
"Standard Deviation": null,
"Rank": 35
},
"Biology": {
"Average Score": 40.134558844170826,
"Standard Deviation": null,
"Rank": 45
},
"Chemistry": {
"Average Score": 34.891253153439166,
"Standard Deviation": null,
"Rank": 39
},
"Social": {
"Average Score": 39.812806552940735,
"Standard Deviation": null,
"Rank": 39
},
"Logical": {
"Average Score": 42.70305684307474,
"Standard Deviation": null,
"Rank": 32
},
"Algebra": {
"Average Score": 79.30455838359877,
"Standard Deviation": null,
"Rank": 18
},
"Geometry": {
"Average Score": 58.56739922365014,
"Standard Deviation": null,
"Rank": 18
},
"Probability": {
"Average Score": 68.07725566867765,
"Standard Deviation": null,
"Rank": 19
},
"OVERALL": {
"Average Score": 23.25904934716627,
"Standard Deviation": 1.5089621200216172,
"Rank": 49
}
}
},
{
"config": {
"model_name": "smollm2-1.7b",
"organization": "Unknown",
"license": "Unknown",
"knowledge_cutoff": "Unknown"
},
"results": {
"Physics": {
"Average Score": 20.328651604714242,
"Standard Deviation": null,
"Rank": 50
},
"Biology": {
"Average Score": 23.55167655906088,
"Standard Deviation": null,
"Rank": 53
},
"Chemistry": {
"Average Score": 17.90654461263675,
"Standard Deviation": null,
"Rank": 53
},
"Social": {
"Average Score": 18.586981509149783,
"Standard Deviation": null,
"Rank": 53
},
"Logical": {
"Average Score": 13.753294179366819,
"Standard Deviation": null,
"Rank": 54
},
"Algebra": {
"Average Score": 38.86009773073664,
"Standard Deviation": null,
"Rank": 46
},
"Geometry": {
"Average Score": 26.65205080537627,
"Standard Deviation": null,
"Rank": 43
},
"Probability": {
"Average Score": 28.77646355213561,
"Standard Deviation": null,
"Rank": 45
},
"OVERALL": {
"Average Score": 20.14565641258473,
"Standard Deviation": 2.3679638882398857,
"Rank": 54
}
}
},
{
"config": {
"model_name": "llama-3.2-1b-it",
"organization": "Unknown",
"license": "Unknown",
"knowledge_cutoff": "Unknown"
},
"results": {
"Physics": {
"Average Score": 13.730639722217427,
"Standard Deviation": null,
"Rank": 54
},
"Biology": {
"Average Score": 25.09504378386352,
"Standard Deviation": null,
"Rank": 52
},
"Chemistry": {
"Average Score": 22.71076097859151,
"Standard Deviation": null,
"Rank": 48
},
"Social": {
"Average Score": 20.34042449083379,
"Standard Deviation": null,
"Rank": 52
},
"Logical": {
"Average Score": 15.338736069283176,
"Standard Deviation": null,
"Rank": 52
},
"Algebra": {
"Average Score": 43.69053020706735,
"Standard Deviation": null,
"Rank": 40
},
"Geometry": {
"Average Score": 25.35058286701741,
"Standard Deviation": null,
"Rank": 46
},
"Probability": {
"Average Score": 28.620674481486535,
"Standard Deviation": null,
"Rank": 46
},
"OVERALL": {
"Average Score": 24.93401522355894,
"Standard Deviation": 2.6710490374694014,
"Rank": 46
}
}
}
]