|
[
|
|
{
|
|
"config": {
|
|
"model_name": "ChatGPT-4o-latest (2024-09-03)",
|
|
"organization": "OpenAI",
|
|
"license": "Proprietary",
|
|
"knowledge_cutoff": "2023/10"
|
|
},
|
|
"results": {
|
|
"OVERALL": {
|
|
"Average Score": 94.49771627042422,
|
|
"Standard Deviation": 0.251607817784525,
|
|
"Rank": 4
|
|
},
|
|
"Geometry": {
|
|
"Average Score": 81.11505705795187,
|
|
"Standard Deviation": null,
|
|
"Rank": 6
|
|
},
|
|
"Algebra": {
|
|
"Average Score": 91.79122001491199,
|
|
"Standard Deviation": null,
|
|
"Rank": 8
|
|
},
|
|
"Probability": {
|
|
"Average Score": 88.00190397870577,
|
|
"Standard Deviation": null,
|
|
"Rank": 4
|
|
},
|
|
"Logical": {
|
|
"Average Score": 97.47223448912972,
|
|
"Standard Deviation": null,
|
|
"Rank": 2
|
|
},
|
|
"Social": {
|
|
"Average Score": 89.73262585993845,
|
|
"Standard Deviation": null,
|
|
"Rank": 7
|
|
},
|
|
"Chemistry": {
|
|
"Average Score": 90.48070030738856,
|
|
"Standard Deviation": null,
|
|
"Rank": 3
|
|
},
|
|
"CPP": {
|
|
"Average Score": 100.0,
|
|
"Standard Deviation": null,
|
|
"Rank": 1
|
|
},
|
|
"Physics": {
|
|
"Average Score": 99.7043774383865,
|
|
"Standard Deviation": null,
|
|
"Rank": 2
|
|
},
|
|
"Biology": {
|
|
"Average Score": 95.98449860487872,
|
|
"Standard Deviation": null,
|
|
"Rank": 3
|
|
}
|
|
}
|
|
},
|
|
{
|
|
"config": {
|
|
"model_name": "gpt-4o-2024-08-06",
|
|
"organization": "OpenAI",
|
|
"license": "Proprietary",
|
|
"knowledge_cutoff": "2023/10"
|
|
},
|
|
"results": {
|
|
"OVERALL": {
|
|
"Average Score": 83.33484787198124,
|
|
"Standard Deviation": 3.0334254138998893,
|
|
"Rank": 12
|
|
},
|
|
"Geometry": {
|
|
"Average Score": 85.73211137938175,
|
|
"Standard Deviation": null,
|
|
"Rank": 2
|
|
},
|
|
"Algebra": {
|
|
"Average Score": 95.29454759516874,
|
|
"Standard Deviation": null,
|
|
"Rank": 5
|
|
},
|
|
"Probability": {
|
|
"Average Score": 80.9483280228488,
|
|
"Standard Deviation": null,
|
|
"Rank": 7
|
|
},
|
|
"Logical": {
|
|
"Average Score": 78.93507998348575,
|
|
"Standard Deviation": null,
|
|
"Rank": 12
|
|
},
|
|
"Social": {
|
|
"Average Score": 78.21553692695771,
|
|
"Standard Deviation": null,
|
|
"Rank": 11
|
|
},
|
|
"Chemistry": {
|
|
"Average Score": 79.46337310221962,
|
|
"Standard Deviation": null,
|
|
"Rank": 9
|
|
},
|
|
"CPP": {
|
|
"Average Score": 92.43090226400756,
|
|
"Standard Deviation": null,
|
|
"Rank": 2
|
|
},
|
|
"Physics": {
|
|
"Average Score": 92.63882355350016,
|
|
"Standard Deviation": null,
|
|
"Rank": 6
|
|
},
|
|
"Biology": {
|
|
"Average Score": 79.88713500945879,
|
|
"Standard Deviation": null,
|
|
"Rank": 14
|
|
}
|
|
}
|
|
},
|
|
{
|
|
"config": {
|
|
"model_name": "gpt-4o-2024-05-13",
|
|
"organization": "OpenAI",
|
|
"license": "Proprietary",
|
|
"knowledge_cutoff": "2023/10"
|
|
},
|
|
"results": {
|
|
"OVERALL": {
|
|
"Average Score": 89.31218128337491,
|
|
"Standard Deviation": 0.5511990686487255,
|
|
"Rank": 8
|
|
},
|
|
"Geometry": {
|
|
"Average Score": 81.70458958633901,
|
|
"Standard Deviation": null,
|
|
"Rank": 4
|
|
},
|
|
"Algebra": {
|
|
"Average Score": 90.16488595415144,
|
|
"Standard Deviation": null,
|
|
"Rank": 9
|
|
},
|
|
"Probability": {
|
|
"Average Score": 83.8098272382245,
|
|
"Standard Deviation": null,
|
|
"Rank": 5
|
|
},
|
|
"Logical": {
|
|
"Average Score": 88.2742970015626,
|
|
"Standard Deviation": null,
|
|
"Rank": 9
|
|
},
|
|
"Social": {
|
|
"Average Score": 71.51855733216095,
|
|
"Standard Deviation": null,
|
|
"Rank": 15
|
|
},
|
|
"Chemistry": {
|
|
"Average Score": 84.0147961443266,
|
|
"Standard Deviation": null,
|
|
"Rank": 7
|
|
},
|
|
"CPP": {
|
|
"Average Score": 79.1592634699295,
|
|
"Standard Deviation": null,
|
|
"Rank": 6
|
|
},
|
|
"Physics": {
|
|
"Average Score": 96.44583156689123,
|
|
"Standard Deviation": null,
|
|
"Rank": 3
|
|
},
|
|
"Biology": {
|
|
"Average Score": 86.17947030919935,
|
|
"Standard Deviation": null,
|
|
"Rank": 10
|
|
}
|
|
}
|
|
},
|
|
{
|
|
"config": {
|
|
"model_name": "gpt-4-turbo-2024-04-09",
|
|
"organization": "OpenAI",
|
|
"license": "Proprietary",
|
|
"knowledge_cutoff": "2023/12"
|
|
},
|
|
"results": {
|
|
"OVERALL": {
|
|
"Average Score": 89.20222265636137,
|
|
"Standard Deviation": 0.9498836008363539,
|
|
"Rank": 9
|
|
},
|
|
"Geometry": {
|
|
"Average Score": 77.90202019775627,
|
|
"Standard Deviation": null,
|
|
"Rank": 8
|
|
},
|
|
"Algebra": {
|
|
"Average Score": 84.83537307564205,
|
|
"Standard Deviation": null,
|
|
"Rank": 12
|
|
},
|
|
"Probability": {
|
|
"Average Score": 80.01448545719413,
|
|
"Standard Deviation": null,
|
|
"Rank": 9
|
|
},
|
|
"Logical": {
|
|
"Average Score": 89.63955736396734,
|
|
"Standard Deviation": null,
|
|
"Rank": 8
|
|
},
|
|
"Social": {
|
|
"Average Score": 77.25088451567024,
|
|
"Standard Deviation": null,
|
|
"Rank": 12
|
|
},
|
|
"Chemistry": {
|
|
"Average Score": 78.97054235015905,
|
|
"Standard Deviation": null,
|
|
"Rank": 11
|
|
},
|
|
"CPP": {
|
|
"Average Score": 70.73143363230263,
|
|
"Standard Deviation": null,
|
|
"Rank": 11
|
|
},
|
|
"Physics": {
|
|
"Average Score": 90.33497346058968,
|
|
"Standard Deviation": null,
|
|
"Rank": 7
|
|
},
|
|
"Biology": {
|
|
"Average Score": 86.17949760404831,
|
|
"Standard Deviation": null,
|
|
"Rank": 9
|
|
}
|
|
}
|
|
},
|
|
{
|
|
"config": {
|
|
"model_name": "gemini-1.5-pro-001",
|
|
"organization": "Google",
|
|
"license": "Proprietary",
|
|
"knowledge_cutoff": "2023/11"
|
|
},
|
|
"results": {
|
|
"OVERALL": {
|
|
"Average Score": 82.91139866415075,
|
|
"Standard Deviation": 3.013751980804677,
|
|
"Rank": 13
|
|
},
|
|
"Geometry": {
|
|
"Average Score": 83.6654007694722,
|
|
"Standard Deviation": null,
|
|
"Rank": 3
|
|
},
|
|
"Algebra": {
|
|
"Average Score": 98.84487439119522,
|
|
"Standard Deviation": null,
|
|
"Rank": 3
|
|
},
|
|
"Probability": {
|
|
"Average Score": 75.94594518060929,
|
|
"Standard Deviation": null,
|
|
"Rank": 13
|
|
},
|
|
"Logical": {
|
|
"Average Score": 78.89834475831927,
|
|
"Standard Deviation": null,
|
|
"Rank": 14
|
|
},
|
|
"Social": {
|
|
"Average Score": 78.21569899283614,
|
|
"Standard Deviation": null,
|
|
"Rank": 10
|
|
},
|
|
"Physics": {
|
|
"Average Score": 88.41290613720335,
|
|
"Standard Deviation": null,
|
|
"Rank": 10
|
|
},
|
|
"Biology": {
|
|
"Average Score": 86.45347978614136,
|
|
"Standard Deviation": null,
|
|
"Rank": 8
|
|
}
|
|
}
|
|
},
|
|
{
|
|
"config": {
|
|
"model_name": "qwen2-72b-instruct",
|
|
"organization": "Alibaba",
|
|
"license": "Qianwen LICENSE",
|
|
"knowledge_cutoff": "2024/09"
|
|
},
|
|
"results": {
|
|
"OVERALL": {
|
|
"Average Score": 80.51855735113782,
|
|
"Standard Deviation": 2.389693257324127,
|
|
"Rank": 15
|
|
},
|
|
"Geometry": {
|
|
"Average Score": 68.80768467173304,
|
|
"Standard Deviation": null,
|
|
"Rank": 13
|
|
},
|
|
"Algebra": {
|
|
"Average Score": 95.86210030199506,
|
|
"Standard Deviation": null,
|
|
"Rank": 4
|
|
},
|
|
"Probability": {
|
|
"Average Score": 82.29702731445691,
|
|
"Standard Deviation": null,
|
|
"Rank": 6
|
|
},
|
|
"Logical": {
|
|
"Average Score": 73.55135235722557,
|
|
"Standard Deviation": null,
|
|
"Rank": 19
|
|
},
|
|
"Social": {
|
|
"Average Score": 57.41502695932332,
|
|
"Standard Deviation": null,
|
|
"Rank": 19
|
|
},
|
|
"Chemistry": {
|
|
"Average Score": 75.8879803782176,
|
|
"Standard Deviation": null,
|
|
"Rank": 13
|
|
},
|
|
"CPP": {
|
|
"Average Score": 73.54037778797029,
|
|
"Standard Deviation": null,
|
|
"Rank": 7
|
|
},
|
|
"Physics": {
|
|
"Average Score": 82.02738090295061,
|
|
"Standard Deviation": null,
|
|
"Rank": 16
|
|
},
|
|
"Biology": {
|
|
"Average Score": 66.99838962851355,
|
|
"Standard Deviation": null,
|
|
"Rank": 22
|
|
}
|
|
}
|
|
},
|
|
{
|
|
"config": {
|
|
"model_name": "gpt-4o-mini-2024-07-18",
|
|
"organization": "OpenAI",
|
|
"license": "Proprietary",
|
|
"knowledge_cutoff": "2023/10"
|
|
},
|
|
"results": {
|
|
"OVERALL": {
|
|
"Average Score": 85.79551424780102,
|
|
"Standard Deviation": 2.25059599602412,
|
|
"Rank": 11
|
|
},
|
|
"Geometry": {
|
|
"Average Score": 78.03415885586699,
|
|
"Standard Deviation": null,
|
|
"Rank": 7
|
|
},
|
|
"Algebra": {
|
|
"Average Score": 90.10621818673319,
|
|
"Standard Deviation": null,
|
|
"Rank": 10
|
|
},
|
|
"Probability": {
|
|
"Average Score": 80.94824796859724,
|
|
"Standard Deviation": null,
|
|
"Rank": 8
|
|
},
|
|
"Logical": {
|
|
"Average Score": 86.1004659652016,
|
|
"Standard Deviation": null,
|
|
"Rank": 10
|
|
},
|
|
"Social": {
|
|
"Average Score": 74.20253943841105,
|
|
"Standard Deviation": null,
|
|
"Rank": 13
|
|
},
|
|
"Chemistry": {
|
|
"Average Score": 75.44768883899778,
|
|
"Standard Deviation": null,
|
|
"Rank": 15
|
|
},
|
|
"CPP": {
|
|
"Average Score": 88.3877070580296,
|
|
"Standard Deviation": null,
|
|
"Rank": 3
|
|
},
|
|
"Physics": {
|
|
"Average Score": 90.33492089386435,
|
|
"Standard Deviation": null,
|
|
"Rank": 8
|
|
},
|
|
"Biology": {
|
|
"Average Score": 79.03781031583883,
|
|
"Standard Deviation": null,
|
|
"Rank": 15
|
|
}
|
|
}
|
|
},
|
|
{
|
|
"config": {
|
|
"model_name": "claude-3.5-sonnet",
|
|
"organization": "Anthropic",
|
|
"license": "Proprietary",
|
|
"knowledge_cutoff": "2024/04"
|
|
},
|
|
"results": {
|
|
"OVERALL": {
|
|
"Average Score": 90.30644459276833,
|
|
"Standard Deviation": 0.6105034066546057,
|
|
"Rank": 7
|
|
},
|
|
"Geometry": {
|
|
"Average Score": 72.63402106402285,
|
|
"Standard Deviation": null,
|
|
"Rank": 12
|
|
},
|
|
"Algebra": {
|
|
"Average Score": 83.32075177480141,
|
|
"Standard Deviation": null,
|
|
"Rank": 14
|
|
},
|
|
"Probability": {
|
|
"Average Score": 76.7319625254773,
|
|
"Standard Deviation": null,
|
|
"Rank": 11
|
|
},
|
|
"Logical": {
|
|
"Average Score": 90.00404188010565,
|
|
"Standard Deviation": null,
|
|
"Rank": 7
|
|
},
|
|
"Social": {
|
|
"Average Score": 99.89849499454823,
|
|
"Standard Deviation": null,
|
|
"Rank": 2
|
|
},
|
|
"Chemistry": {
|
|
"Average Score": 85.86402884262867,
|
|
"Standard Deviation": null,
|
|
"Rank": 4
|
|
},
|
|
"CPP": {
|
|
"Average Score": 82.37734076815008,
|
|
"Standard Deviation": null,
|
|
"Rank": 5
|
|
},
|
|
"Physics": {
|
|
"Average Score": 92.83215449096147,
|
|
"Standard Deviation": null,
|
|
"Rank": 5
|
|
},
|
|
"Biology": {
|
|
"Average Score": 85.76627192038262,
|
|
"Standard Deviation": null,
|
|
"Rank": 11
|
|
}
|
|
}
|
|
},
|
|
{
|
|
"config": {
|
|
"model_name": "claude-3.5-sonnet-20241022",
|
|
"organization": "Anthropic",
|
|
"license": "Proprietary",
|
|
"knowledge_cutoff": "UNKNOW"
|
|
},
|
|
"results": {
|
|
"OVERALL": {
|
|
"Average Score": 81.7399750668719,
|
|
"Standard Deviation": 6.158375141726245,
|
|
"Rank": 14
|
|
},
|
|
"Geometry": {
|
|
"Average Score": 72.63581025178527,
|
|
"Standard Deviation": null,
|
|
"Rank": 11
|
|
},
|
|
"Algebra": {
|
|
"Average Score": 89.50323347048936,
|
|
"Standard Deviation": null,
|
|
"Rank": 11
|
|
},
|
|
"Probability": {
|
|
"Average Score": 73.919,
|
|
"Standard Deviation": null,
|
|
"Rank": 13
|
|
},
|
|
"Logical": {
|
|
"Average Score": 90.514,
|
|
"Standard Deviation": null,
|
|
"Rank": 7
|
|
},
|
|
"Social": {
|
|
"Average Score": 84.505,
|
|
"Standard Deviation": null,
|
|
"Rank": 7
|
|
},
|
|
"Chemistry": {
|
|
"Average Score": 85.15970597010583,
|
|
"Standard Deviation": null,
|
|
"Rank": 6
|
|
},
|
|
"Physics": {
|
|
"Average Score": 100.0,
|
|
"Standard Deviation": null,
|
|
"Rank": 1
|
|
},
|
|
"Biology": {
|
|
"Average Score": 85.56526806360797,
|
|
"Standard Deviation": null,
|
|
"Rank": 12
|
|
}
|
|
}
|
|
},
|
|
{
|
|
"config": {
|
|
"model_name": "o1-mini",
|
|
"organization": "OpenAI",
|
|
"license": "Proprietary",
|
|
"knowledge_cutoff": "2023/10"
|
|
},
|
|
"results": {
|
|
"OVERALL": {
|
|
"Average Score": 97.50448224920098,
|
|
"Standard Deviation": 0.18820973784944708,
|
|
"Rank": 2
|
|
},
|
|
"Geometry": {
|
|
"Average Score": 100.0,
|
|
"Standard Deviation": null,
|
|
"Rank": 1
|
|
},
|
|
"Algebra": {
|
|
"Average Score": 100.0,
|
|
"Standard Deviation": null,
|
|
"Rank": 1
|
|
},
|
|
"Probability": {
|
|
"Average Score": 100.0,
|
|
"Standard Deviation": null,
|
|
"Rank": 1
|
|
},
|
|
"Logical": {
|
|
"Average Score": 96.62093396445893,
|
|
"Standard Deviation": null,
|
|
"Rank": 3
|
|
},
|
|
"Social": {
|
|
"Average Score": 98.93701302706319,
|
|
"Standard Deviation": null,
|
|
"Rank": 4
|
|
},
|
|
"Chemistry": {
|
|
"Average Score": 93.52027415963765,
|
|
"Standard Deviation": null,
|
|
"Rank": 2
|
|
},
|
|
"Biology": {
|
|
"Average Score": 99.9210788257773,
|
|
"Standard Deviation": null,
|
|
"Rank": 2
|
|
}
|
|
}
|
|
},
|
|
{
|
|
"config": {
|
|
"model_name": "o1-preview",
|
|
"organization": "OpenAI",
|
|
"license": "Proprietary",
|
|
"knowledge_cutoff": "2023/10"
|
|
},
|
|
"results": {
|
|
"OVERALL": {
|
|
"Average Score": 92.95670511909181,
|
|
"Standard Deviation": 0.26193636312885404,
|
|
"Rank": 5
|
|
},
|
|
"Geometry": {
|
|
"Average Score": 81.70453162182778,
|
|
"Standard Deviation": null,
|
|
"Rank": 5
|
|
},
|
|
"Algebra": {
|
|
"Average Score": 99.2204666813678,
|
|
"Standard Deviation": null,
|
|
"Rank": 2
|
|
},
|
|
"Probability": {
|
|
"Average Score": 96.11141903959506,
|
|
"Standard Deviation": null,
|
|
"Rank": 2
|
|
},
|
|
"Logical": {
|
|
"Average Score": 100.0,
|
|
"Standard Deviation": null,
|
|
"Rank": 1
|
|
},
|
|
"Social": {
|
|
"Average Score": 99.35681400812317,
|
|
"Standard Deviation": null,
|
|
"Rank": 3
|
|
},
|
|
"Biology": {
|
|
"Average Score": 100.0,
|
|
"Standard Deviation": null,
|
|
"Rank": 1
|
|
}
|
|
}
|
|
},
|
|
{
|
|
"config": {
|
|
"model_name": "gemini-1.5-flash-001",
|
|
"organization": "Google",
|
|
"license": "Proprietary",
|
|
"knowledge_cutoff": "2023/11"
|
|
},
|
|
"results": {
|
|
"OVERALL": {
|
|
"Average Score": 63.90738369106308,
|
|
"Standard Deviation": 2.5840022803072342,
|
|
"Rank": 20
|
|
},
|
|
"Geometry": {
|
|
"Average Score": 62.78784730869374,
|
|
"Standard Deviation": null,
|
|
"Rank": 16
|
|
},
|
|
"Algebra": {
|
|
"Average Score": 84.4516255656167,
|
|
"Standard Deviation": null,
|
|
"Rank": 13
|
|
},
|
|
"Probability": {
|
|
"Average Score": 71.21668893483972,
|
|
"Standard Deviation": null,
|
|
"Rank": 15
|
|
},
|
|
"Logical": {
|
|
"Average Score": 73.55137041991937,
|
|
"Standard Deviation": null,
|
|
"Rank": 17
|
|
},
|
|
"Social": {
|
|
"Average Score": 71.51839473022034,
|
|
"Standard Deviation": null,
|
|
"Rank": 16
|
|
},
|
|
"Chemistry": {
|
|
"Average Score": 78.9281328399534,
|
|
"Standard Deviation": null,
|
|
"Rank": 12
|
|
},
|
|
"CPP": {
|
|
"Average Score": 72.1127762005651,
|
|
"Standard Deviation": null,
|
|
"Rank": 10
|
|
},
|
|
"Physics": {
|
|
"Average Score": 86.21163726768592,
|
|
"Standard Deviation": null,
|
|
"Rank": 14
|
|
},
|
|
"Biology": {
|
|
"Average Score": 77.50881946688955,
|
|
"Standard Deviation": null,
|
|
"Rank": 16
|
|
}
|
|
}
|
|
},
|
|
{
|
|
"config": {
|
|
"model_name": "gpt4-1106",
|
|
"organization": "OpenAI",
|
|
"license": "Proprietary",
|
|
"knowledge_cutoff": "2024/04"
|
|
},
|
|
"results": {
|
|
"OVERALL": {
|
|
"Average Score": 88.08481721079524,
|
|
"Standard Deviation": 1.4421920877285703,
|
|
"Rank": 10
|
|
},
|
|
"Geometry": {
|
|
"Average Score": 59.2110329866853,
|
|
"Standard Deviation": null,
|
|
"Rank": 17
|
|
},
|
|
"Algebra": {
|
|
"Average Score": 80.79050620153212,
|
|
"Standard Deviation": null,
|
|
"Rank": 15
|
|
},
|
|
"Probability": {
|
|
"Average Score": 74.36123524515216,
|
|
"Standard Deviation": null,
|
|
"Rank": 14
|
|
},
|
|
"Logical": {
|
|
"Average Score": 77.02518347398768,
|
|
"Standard Deviation": null,
|
|
"Rank": 15
|
|
},
|
|
"Social": {
|
|
"Average Score": 51.13078063545894,
|
|
"Standard Deviation": null,
|
|
"Rank": 25
|
|
},
|
|
"Chemistry": {
|
|
"Average Score": 72.4125941071821,
|
|
"Standard Deviation": null,
|
|
"Rank": 16
|
|
},
|
|
"CPP": {
|
|
"Average Score": 69.11824072252848,
|
|
"Standard Deviation": null,
|
|
"Rank": 12
|
|
},
|
|
"Physics": {
|
|
"Average Score": 87.0543996394885,
|
|
"Standard Deviation": null,
|
|
"Rank": 13
|
|
},
|
|
"Biology": {
|
|
"Average Score": 82.36213636857161,
|
|
"Standard Deviation": null,
|
|
"Rank": 13
|
|
}
|
|
}
|
|
},
|
|
{
|
|
"config": {
|
|
"model_name": "gemma-2-27b-it",
|
|
"organization": "Google",
|
|
"license": "Gemma License",
|
|
"knowledge_cutoff": "2024/06"
|
|
},
|
|
"results": {
|
|
"OVERALL": {
|
|
"Average Score": 70.59188609288081,
|
|
"Standard Deviation": 8.717841670213112,
|
|
"Rank": 19
|
|
},
|
|
"Geometry": {
|
|
"Average Score": 58.00008857041582,
|
|
"Standard Deviation": null,
|
|
"Rank": 19
|
|
},
|
|
"Algebra": {
|
|
"Average Score": 77.82927803658924,
|
|
"Standard Deviation": null,
|
|
"Rank": 19
|
|
},
|
|
"Probability": {
|
|
"Average Score": 69.63382706259532,
|
|
"Standard Deviation": null,
|
|
"Rank": 18
|
|
},
|
|
"Logical": {
|
|
"Average Score": 73.55136762438677,
|
|
"Standard Deviation": null,
|
|
"Rank": 18
|
|
},
|
|
"Social": {
|
|
"Average Score": 57.17847568664103,
|
|
"Standard Deviation": null,
|
|
"Rank": 20
|
|
},
|
|
"Chemistry": {
|
|
"Average Score": 68.65449070488427,
|
|
"Standard Deviation": null,
|
|
"Rank": 20
|
|
},
|
|
"CPP": {
|
|
"Average Score": 63.28920072143611,
|
|
"Standard Deviation": null,
|
|
"Rank": 14
|
|
},
|
|
"Physics": {
|
|
"Average Score": 76.8395150041688,
|
|
"Standard Deviation": null,
|
|
"Rank": 19
|
|
},
|
|
"Biology": {
|
|
"Average Score": 66.99846220210911,
|
|
"Standard Deviation": null,
|
|
"Rank": 21
|
|
}
|
|
}
|
|
},
|
|
{
|
|
"config": {
|
|
"model_name": "claude-3-opus",
|
|
"organization": "Anthropic",
|
|
"license": "Proprietary",
|
|
"knowledge_cutoff": "2023/08"
|
|
},
|
|
"results": {
|
|
"OVERALL": {
|
|
"Average Score": 79.77338364506384,
|
|
"Standard Deviation": 2.32886155429398,
|
|
"Rank": 16
|
|
},
|
|
"Geometry": {
|
|
"Average Score": 57.5200576513199,
|
|
"Standard Deviation": null,
|
|
"Rank": 20
|
|
},
|
|
"Algebra": {
|
|
"Average Score": 76.89230078890219,
|
|
"Standard Deviation": null,
|
|
"Rank": 20
|
|
},
|
|
"Probability": {
|
|
"Average Score": 71.20578106177237,
|
|
"Standard Deviation": null,
|
|
"Rank": 16
|
|
},
|
|
"Logical": {
|
|
"Average Score": 78.93505058041774,
|
|
"Standard Deviation": null,
|
|
"Rank": 13
|
|
},
|
|
"Social": {
|
|
"Average Score": 88.40491896661747,
|
|
"Standard Deviation": null,
|
|
"Rank": 8
|
|
},
|
|
"Chemistry": {
|
|
"Average Score": 79.0571776580065,
|
|
"Standard Deviation": null,
|
|
"Rank": 10
|
|
},
|
|
"CPP": {
|
|
"Average Score": 73.5404403567132,
|
|
"Standard Deviation": null,
|
|
"Rank": 8
|
|
},
|
|
"Physics": {
|
|
"Average Score": 87.28118117714033,
|
|
"Standard Deviation": null,
|
|
"Rank": 12
|
|
},
|
|
"Biology": {
|
|
"Average Score": 71.23527633371832,
|
|
"Standard Deviation": null,
|
|
"Rank": 20
|
|
}
|
|
}
|
|
},
|
|
{
|
|
"config": {
|
|
"model_name": "gemma-2-9b-it-simpo",
|
|
"organization": "Google",
|
|
"license": "Gemma License",
|
|
"knowledge_cutoff": "2024/07"
|
|
},
|
|
"results": {
|
|
"OVERALL": {
|
|
"Average Score": "N/A",
|
|
"Standard Deviation": "N/A",
|
|
"Rank": "N/A"
|
|
},
|
|
"Geometry": {
|
|
"Average Score": 57.520011750672175,
|
|
"Standard Deviation": null,
|
|
"Rank": 21
|
|
},
|
|
"Algebra": {
|
|
"Average Score": 72.3731046476544,
|
|
"Standard Deviation": null,
|
|
"Rank": 21
|
|
},
|
|
"Probability": {
|
|
"Average Score": 61.79614379365174,
|
|
"Standard Deviation": null,
|
|
"Rank": 22
|
|
},
|
|
"Logical": {
|
|
"Average Score": 64.62661472571767,
|
|
"Standard Deviation": null,
|
|
"Rank": 23
|
|
},
|
|
"Social": {
|
|
"Average Score": 87.65488278831526,
|
|
"Standard Deviation": null,
|
|
"Rank": 9
|
|
},
|
|
"Chemistry": {
|
|
"Average Score": 85.36850564169866,
|
|
"Standard Deviation": null,
|
|
"Rank": 5
|
|
},
|
|
"CPP": {
|
|
"Average Score": 73.43757596214863,
|
|
"Standard Deviation": null,
|
|
"Rank": 9
|
|
},
|
|
"Physics": {
|
|
"Average Score": 82.02727994935249,
|
|
"Standard Deviation": null,
|
|
"Rank": 17
|
|
},
|
|
"Biology": {
|
|
"Average Score": 88.80821937078267,
|
|
"Standard Deviation": null,
|
|
"Rank": 7
|
|
}
|
|
}
|
|
},
|
|
{
|
|
"config": {
|
|
"model_name": "qwen1.5-72b-chat",
|
|
"organization": "Alibaba",
|
|
"license": "Qianwen LICENSE",
|
|
"knowledge_cutoff": "2024/03"
|
|
},
|
|
"results": {
|
|
"OVERALL": {
|
|
"Average Score": 61.57517122936127,
|
|
"Standard Deviation": 5.01096656930536,
|
|
"Rank": 21
|
|
},
|
|
"Geometry": {
|
|
"Average Score": 49.36591842356095,
|
|
"Standard Deviation": null,
|
|
"Rank": 28
|
|
},
|
|
"Algebra": {
|
|
"Average Score": 71.12615153442515,
|
|
"Standard Deviation": null,
|
|
"Rank": 22
|
|
},
|
|
"Probability": {
|
|
"Average Score": 51.76027345875035,
|
|
"Standard Deviation": null,
|
|
"Rank": 28
|
|
},
|
|
"Logical": {
|
|
"Average Score": 34.74438889550426,
|
|
"Standard Deviation": null,
|
|
"Rank": 39
|
|
},
|
|
"Social": {
|
|
"Average Score": 47.47112348597555,
|
|
"Standard Deviation": null,
|
|
"Rank": 27
|
|
},
|
|
"Chemistry": {
|
|
"Average Score": 51.65772092991593,
|
|
"Standard Deviation": null,
|
|
"Rank": 25
|
|
},
|
|
"CPP": {
|
|
"Average Score": 48.69302376665551,
|
|
"Standard Deviation": null,
|
|
"Rank": 20
|
|
},
|
|
"Physics": {
|
|
"Average Score": 62.45893584822384,
|
|
"Standard Deviation": null,
|
|
"Rank": 27
|
|
},
|
|
"Biology": {
|
|
"Average Score": 56.96571500324531,
|
|
"Standard Deviation": null,
|
|
"Rank": 27
|
|
}
|
|
}
|
|
},
|
|
{
|
|
"config": {
|
|
"model_name": "qwen1.5-32b-chat",
|
|
"organization": "Alibaba",
|
|
"license": "Qianwen LICENSE",
|
|
"knowledge_cutoff": "2024/03"
|
|
},
|
|
"results": {
|
|
"OVERALL": {
|
|
"Average Score": 45.3199699974334,
|
|
"Standard Deviation": 3.7527776450894996,
|
|
"Rank": 31
|
|
},
|
|
"Geometry": {
|
|
"Average Score": 45.66389348479106,
|
|
"Standard Deviation": null,
|
|
"Rank": 30
|
|
},
|
|
"Algebra": {
|
|
"Average Score": 64.9403510842088,
|
|
"Standard Deviation": null,
|
|
"Rank": 25
|
|
},
|
|
"Probability": {
|
|
"Average Score": 51.99376831114535,
|
|
"Standard Deviation": null,
|
|
"Rank": 27
|
|
},
|
|
"Logical": {
|
|
"Average Score": 39.30230377209954,
|
|
"Standard Deviation": null,
|
|
"Rank": 36
|
|
},
|
|
"Social": {
|
|
"Average Score": 45.679222078247186,
|
|
"Standard Deviation": null,
|
|
"Rank": 28
|
|
},
|
|
"Chemistry": {
|
|
"Average Score": 46.41262433996582,
|
|
"Standard Deviation": null,
|
|
"Rank": 28
|
|
},
|
|
"CPP": {
|
|
"Average Score": 45.14284028264288,
|
|
"Standard Deviation": null,
|
|
"Rank": 24
|
|
},
|
|
"Physics": {
|
|
"Average Score": 65.80533740982938,
|
|
"Standard Deviation": null,
|
|
"Rank": 25
|
|
},
|
|
"Biology": {
|
|
"Average Score": 50.767985684362536,
|
|
"Standard Deviation": null,
|
|
"Rank": 33
|
|
}
|
|
}
|
|
},
|
|
{
|
|
"config": {
|
|
"model_name": "google-gemma-2-9b-it",
|
|
"organization": "Google",
|
|
"license": "Proprietary",
|
|
"knowledge_cutoff": "2024/06"
|
|
},
|
|
"results": {
|
|
"OVERALL": {
|
|
"Average Score": 59.024943267290716,
|
|
"Standard Deviation": 3.979239820929726,
|
|
"Rank": 23
|
|
},
|
|
"Geometry": {
|
|
"Average Score": 53.495866814128156,
|
|
"Standard Deviation": null,
|
|
"Rank": 24
|
|
},
|
|
"Algebra": {
|
|
"Average Score": 65.98776390439404,
|
|
"Standard Deviation": null,
|
|
"Rank": 23
|
|
},
|
|
"Probability": {
|
|
"Average Score": 65.76699220336998,
|
|
"Standard Deviation": null,
|
|
"Rank": 21
|
|
},
|
|
"Logical": {
|
|
"Average Score": 71.04386923330611,
|
|
"Standard Deviation": null,
|
|
"Rank": 20
|
|
},
|
|
"Social": {
|
|
"Average Score": 73.74087367208867,
|
|
"Standard Deviation": null,
|
|
"Rank": 14
|
|
},
|
|
"Chemistry": {
|
|
"Average Score": 57.074735438190935,
|
|
"Standard Deviation": null,
|
|
"Rank": 22
|
|
},
|
|
"CPP": {
|
|
"Average Score": 54.03167523687635,
|
|
"Standard Deviation": null,
|
|
"Rank": 17
|
|
},
|
|
"Physics": {
|
|
"Average Score": 63.03919029129539,
|
|
"Standard Deviation": null,
|
|
"Rank": 26
|
|
},
|
|
"Biology": {
|
|
"Average Score": 63.18363754826406,
|
|
"Standard Deviation": null,
|
|
"Rank": 23
|
|
}
|
|
}
|
|
},
|
|
{
|
|
"config": {
|
|
"model_name": "yi-1.5-34b-chat",
|
|
"organization": "01 AI",
|
|
"license": "Proprietary",
|
|
"knowledge_cutoff": "2024/05"
|
|
},
|
|
"results": {
|
|
"OVERALL": {
|
|
"Average Score": 71.78031967728624,
|
|
"Standard Deviation": 12.994861744386325,
|
|
"Rank": 18
|
|
},
|
|
"Geometry": {
|
|
"Average Score": 54.06826621860964,
|
|
"Standard Deviation": null,
|
|
"Rank": 23
|
|
},
|
|
"Algebra": {
|
|
"Average Score": 65.66679210942144,
|
|
"Standard Deviation": null,
|
|
"Rank": 24
|
|
},
|
|
"Probability": {
|
|
"Average Score": 66.46858903563573,
|
|
"Standard Deviation": null,
|
|
"Rank": 20
|
|
},
|
|
"Logical": {
|
|
"Average Score": 67.36081192984079,
|
|
"Standard Deviation": null,
|
|
"Rank": 21
|
|
},
|
|
"Social": {
|
|
"Average Score": 53.898293694371446,
|
|
"Standard Deviation": null,
|
|
"Rank": 22
|
|
},
|
|
"Chemistry": {
|
|
"Average Score": 56.1520167017115,
|
|
"Standard Deviation": null,
|
|
"Rank": 23
|
|
},
|
|
"CPP": {
|
|
"Average Score": 52.148798061768964,
|
|
"Standard Deviation": null,
|
|
"Rank": 18
|
|
},
|
|
"Physics": {
|
|
"Average Score": 73.06547347263036,
|
|
"Standard Deviation": null,
|
|
"Rank": 21
|
|
},
|
|
"Biology": {
|
|
"Average Score": 72.47949036617567,
|
|
"Standard Deviation": null,
|
|
"Rank": 18
|
|
}
|
|
}
|
|
},
|
|
{
|
|
"config": {
|
|
"model_name": "meta-llama-3.1-70b-instruct",
|
|
"organization": "Meta",
|
|
"license": "Llama 3.1 Community",
|
|
"knowledge_cutoff": "2023/12"
|
|
},
|
|
"results": {
|
|
"OVERALL": {
|
|
"Average Score": 72.88379857527117,
|
|
"Standard Deviation": 3.7053577253028176,
|
|
"Rank": 17
|
|
},
|
|
"Geometry": {
|
|
"Average Score": 62.78788327507421,
|
|
"Standard Deviation": null,
|
|
"Rank": 15
|
|
},
|
|
"Algebra": {
|
|
"Average Score": 80.79028754890449,
|
|
"Standard Deviation": null,
|
|
"Rank": 16
|
|
},
|
|
"Probability": {
|
|
"Average Score": 69.6338691921361,
|
|
"Standard Deviation": null,
|
|
"Rank": 17
|
|
},
|
|
"Logical": {
|
|
"Average Score": 74.43905975120572,
|
|
"Standard Deviation": null,
|
|
"Rank": 16
|
|
},
|
|
"Social": {
|
|
"Average Score": 61.22534257022315,
|
|
"Standard Deviation": null,
|
|
"Rank": 18
|
|
},
|
|
"Chemistry": {
|
|
"Average Score": 70.9160725889497,
|
|
"Standard Deviation": null,
|
|
"Rank": 18
|
|
},
|
|
"CPP": {
|
|
"Average Score": 84.36815192532764,
|
|
"Standard Deviation": null,
|
|
"Rank": 4
|
|
},
|
|
"Physics": {
|
|
"Average Score": 82.02759904132307,
|
|
"Standard Deviation": null,
|
|
"Rank": 15
|
|
},
|
|
"Biology": {
|
|
"Average Score": 72.47948013923437,
|
|
"Standard Deviation": null,
|
|
"Rank": 19
|
|
}
|
|
}
|
|
},
|
|
{
|
|
"config": {
|
|
"model_name": "meta-llama-3.1-8b-instruct",
|
|
"organization": "Meta",
|
|
"license": "Llama 3.1 Community",
|
|
"knowledge_cutoff": "2023/12"
|
|
},
|
|
"results": {
|
|
"OVERALL": {
|
|
"Average Score": 52.21824740443002,
|
|
"Standard Deviation": 3.7833302779202937,
|
|
"Rank": 27
|
|
},
|
|
"Geometry": {
|
|
"Average Score": 43.03691891008171,
|
|
"Standard Deviation": null,
|
|
"Rank": 32
|
|
},
|
|
"Algebra": {
|
|
"Average Score": 64.13661497122277,
|
|
"Standard Deviation": null,
|
|
"Rank": 26
|
|
},
|
|
"Probability": {
|
|
"Average Score": 55.37882298464668,
|
|
"Standard Deviation": null,
|
|
"Rank": 25
|
|
},
|
|
"Logical": {
|
|
"Average Score": 53.843773408414144,
|
|
"Standard Deviation": null,
|
|
"Rank": 29
|
|
},
|
|
"Social": {
|
|
"Average Score": 44.993575656549545,
|
|
"Standard Deviation": null,
|
|
"Rank": 30
|
|
},
|
|
"Chemistry": {
|
|
"Average Score": 43.98798267082055,
|
|
"Standard Deviation": null,
|
|
"Rank": 31
|
|
},
|
|
"CPP": {
|
|
"Average Score": 44.41846841004584,
|
|
"Standard Deviation": null,
|
|
"Rank": 26
|
|
},
|
|
"Physics": {
|
|
"Average Score": 49.65976817230991,
|
|
"Standard Deviation": null,
|
|
"Rank": 37
|
|
},
|
|
"Biology": {
|
|
"Average Score": 52.132998637966764,
|
|
"Standard Deviation": null,
|
|
"Rank": 32
|
|
}
|
|
}
|
|
},
|
|
{
|
|
"config": {
|
|
"model_name": "gpt3.5-turbo-0125",
|
|
"organization": "OpenAI",
|
|
"license": "Proprietary",
|
|
"knowledge_cutoff": "2021/09"
|
|
},
|
|
"results": {
|
|
"OVERALL": {
|
|
"Average Score": 32.61987548870099,
|
|
"Standard Deviation": 7.421068133219178,
|
|
"Rank": 41
|
|
},
|
|
"Geometry": {
|
|
"Average Score": 52.43446046073764,
|
|
"Standard Deviation": null,
|
|
"Rank": 25
|
|
},
|
|
"Algebra": {
|
|
"Average Score": 62.62345918733465,
|
|
"Standard Deviation": null,
|
|
"Rank": 27
|
|
},
|
|
"Probability": {
|
|
"Average Score": 46.778615832700474,
|
|
"Standard Deviation": null,
|
|
"Rank": 30
|
|
},
|
|
"Logical": {
|
|
"Average Score": 20.161483818418485,
|
|
"Standard Deviation": null,
|
|
"Rank": 48
|
|
},
|
|
"Social": {
|
|
"Average Score": 36.005021312700556,
|
|
"Standard Deviation": null,
|
|
"Rank": 43
|
|
},
|
|
"Chemistry": {
|
|
"Average Score": 41.27375172990709,
|
|
"Standard Deviation": null,
|
|
"Rank": 34
|
|
},
|
|
"CPP": {
|
|
"Average Score": 40.46958736582551,
|
|
"Standard Deviation": null,
|
|
"Rank": 29
|
|
},
|
|
"Physics": {
|
|
"Average Score": 53.13517938912883,
|
|
"Standard Deviation": null,
|
|
"Rank": 33
|
|
},
|
|
"Biology": {
|
|
"Average Score": 40.750963952571375,
|
|
"Standard Deviation": null,
|
|
"Rank": 43
|
|
}
|
|
}
|
|
},
|
|
{
|
|
"config": {
|
|
"model_name": "llama-3-70b-instruct",
|
|
"organization": "Meta",
|
|
"license": "Llama 3 Community",
|
|
"knowledge_cutoff": "2023/12"
|
|
},
|
|
"results": {
|
|
"OVERALL": {
|
|
"Average Score": 58.67095788786492,
|
|
"Standard Deviation": 3.916500171452786,
|
|
"Rank": 25
|
|
},
|
|
"Geometry": {
|
|
"Average Score": 47.16123420770543,
|
|
"Standard Deviation": null,
|
|
"Rank": 29
|
|
},
|
|
"Algebra": {
|
|
"Average Score": 62.38398769226985,
|
|
"Standard Deviation": null,
|
|
"Rank": 28
|
|
},
|
|
"Probability": {
|
|
"Average Score": 57.7568005808253,
|
|
"Standard Deviation": null,
|
|
"Rank": 23
|
|
},
|
|
"Logical": {
|
|
"Average Score": 84.45551822980201,
|
|
"Standard Deviation": null,
|
|
"Rank": 11
|
|
},
|
|
"Social": {
|
|
"Average Score": 52.450283668620365,
|
|
"Standard Deviation": null,
|
|
"Rank": 23
|
|
},
|
|
"Chemistry": {
|
|
"Average Score": 70.91630635362482,
|
|
"Standard Deviation": null,
|
|
"Rank": 17
|
|
},
|
|
"CPP": {
|
|
"Average Score": 65.32140697218945,
|
|
"Standard Deviation": null,
|
|
"Rank": 13
|
|
},
|
|
"Physics": {
|
|
"Average Score": 78.08120808341037,
|
|
"Standard Deviation": null,
|
|
"Rank": 18
|
|
},
|
|
"Biology": {
|
|
"Average Score": 60.6111504865126,
|
|
"Standard Deviation": null,
|
|
"Rank": 25
|
|
}
|
|
}
|
|
},
|
|
{
|
|
"config": {
|
|
"model_name": "claude-3-sonnet",
|
|
"organization": "Anthropic",
|
|
"license": "Proprietary",
|
|
"knowledge_cutoff": "2023/08"
|
|
},
|
|
"results": {
|
|
"OVERALL": {
|
|
"Average Score": 61.25499749085383,
|
|
"Standard Deviation": 5.012226129836105,
|
|
"Rank": 22
|
|
},
|
|
"Geometry": {
|
|
"Average Score": 52.4291917862642,
|
|
"Standard Deviation": null,
|
|
"Rank": 26
|
|
},
|
|
"Algebra": {
|
|
"Average Score": 60.40928261066776,
|
|
"Standard Deviation": null,
|
|
"Rank": 29
|
|
},
|
|
"Probability": {
|
|
"Average Score": 57.4556182999398,
|
|
"Standard Deviation": null,
|
|
"Rank": 24
|
|
},
|
|
"Logical": {
|
|
"Average Score": 66.81740129837053,
|
|
"Standard Deviation": null,
|
|
"Rank": 22
|
|
},
|
|
"Social": {
|
|
"Average Score": 69.99747730347514,
|
|
"Standard Deviation": null,
|
|
"Rank": 17
|
|
},
|
|
"Chemistry": {
|
|
"Average Score": 68.8316074174692,
|
|
"Standard Deviation": null,
|
|
"Rank": 19
|
|
},
|
|
"CPP": {
|
|
"Average Score": 61.33538592327427,
|
|
"Standard Deviation": null,
|
|
"Rank": 15
|
|
},
|
|
"Physics": {
|
|
"Average Score": 75.18056969699853,
|
|
"Standard Deviation": null,
|
|
"Rank": 20
|
|
},
|
|
"Biology": {
|
|
"Average Score": 77.09610271458331,
|
|
"Standard Deviation": null,
|
|
"Rank": 17
|
|
}
|
|
}
|
|
},
|
|
{
|
|
"config": {
|
|
"model_name": "qwen1.5-14b-chat",
|
|
"organization": "Alibaba",
|
|
"license": "Qianwen LICENSE",
|
|
"knowledge_cutoff": "2024/02"
|
|
},
|
|
"results": {
|
|
"OVERALL": {
|
|
"Average Score": 44.55620746942043,
|
|
"Standard Deviation": 3.997156497824947,
|
|
"Rank": 32
|
|
},
|
|
"Geometry": {
|
|
"Average Score": 36.7560037779628,
|
|
"Standard Deviation": null,
|
|
"Rank": 34
|
|
},
|
|
"Algebra": {
|
|
"Average Score": 59.50136116119945,
|
|
"Standard Deviation": null,
|
|
"Rank": 30
|
|
},
|
|
"Probability": {
|
|
"Average Score": 40.080049006314795,
|
|
"Standard Deviation": null,
|
|
"Rank": 35
|
|
},
|
|
"Logical": {
|
|
"Average Score": 34.744529623515994,
|
|
"Standard Deviation": null,
|
|
"Rank": 38
|
|
},
|
|
"Social": {
|
|
"Average Score": 40.62146960769885,
|
|
"Standard Deviation": null,
|
|
"Rank": 36
|
|
},
|
|
"Chemistry": {
|
|
"Average Score": 38.9739127306118,
|
|
"Standard Deviation": null,
|
|
"Rank": 37
|
|
},
|
|
"CPP": {
|
|
"Average Score": 38.552779976347026,
|
|
"Standard Deviation": null,
|
|
"Rank": 31
|
|
},
|
|
"Physics": {
|
|
"Average Score": 57.98313138991904,
|
|
"Standard Deviation": null,
|
|
"Rank": 31
|
|
},
|
|
"Biology": {
|
|
"Average Score": 45.732215792439575,
|
|
"Standard Deviation": null,
|
|
"Rank": 40
|
|
}
|
|
}
|
|
},
|
|
{
|
|
"config": {
|
|
"model_name": "claude-3-haiku",
|
|
"organization": "Anthropic",
|
|
"license": "Proprietary",
|
|
"knowledge_cutoff": "2023/08"
|
|
},
|
|
"results": {
|
|
"OVERALL": {
|
|
"Average Score": 54.96475677538885,
|
|
"Standard Deviation": 5.908641649857827,
|
|
"Rank": 26
|
|
},
|
|
"Geometry": {
|
|
"Average Score": 43.48740351644307,
|
|
"Standard Deviation": null,
|
|
"Rank": 31
|
|
},
|
|
"Algebra": {
|
|
"Average Score": 55.72045911130164,
|
|
"Standard Deviation": null,
|
|
"Rank": 33
|
|
},
|
|
"Probability": {
|
|
"Average Score": 53.07470665022828,
|
|
"Standard Deviation": null,
|
|
"Rank": 26
|
|
},
|
|
"Logical": {
|
|
"Average Score": 63.661198382201675,
|
|
"Standard Deviation": null,
|
|
"Rank": 24
|
|
},
|
|
"Social": {
|
|
"Average Score": 56.49297908205363,
|
|
"Standard Deviation": null,
|
|
"Rank": 21
|
|
},
|
|
"Chemistry": {
|
|
"Average Score": 60.28485867590517,
|
|
"Standard Deviation": null,
|
|
"Rank": 21
|
|
},
|
|
"CPP": {
|
|
"Average Score": 56.40200048817984,
|
|
"Standard Deviation": null,
|
|
"Rank": 16
|
|
},
|
|
"Physics": {
|
|
"Average Score": 67.69802411023282,
|
|
"Standard Deviation": null,
|
|
"Rank": 24
|
|
},
|
|
"Biology": {
|
|
"Average Score": 60.63801358326118,
|
|
"Standard Deviation": null,
|
|
"Rank": 24
|
|
}
|
|
}
|
|
},
|
|
{
|
|
"config": {
|
|
"model_name": "claude-2.1",
|
|
"organization": "Anthropic",
|
|
"license": "Proprietary",
|
|
"knowledge_cutoff": "Unknown"
|
|
},
|
|
"results": {
|
|
"OVERALL": {
|
|
"Average Score": 39.436633770824685,
|
|
"Standard Deviation": 1.0979568551024126,
|
|
"Rank": 36
|
|
},
|
|
"Geometry": {
|
|
"Average Score": 52.12445910303711,
|
|
"Standard Deviation": null,
|
|
"Rank": 27
|
|
},
|
|
"Algebra": {
|
|
"Average Score": 55.51421646167608,
|
|
"Standard Deviation": null,
|
|
"Rank": 34
|
|
},
|
|
"Probability": {
|
|
"Average Score": 44.720527688076,
|
|
"Standard Deviation": null,
|
|
"Rank": 33
|
|
},
|
|
"Logical": {
|
|
"Average Score": 61.64930710809233,
|
|
"Standard Deviation": null,
|
|
"Rank": 25
|
|
},
|
|
"Social": {
|
|
"Average Score": 41.24714538607354,
|
|
"Standard Deviation": null,
|
|
"Rank": 35
|
|
},
|
|
"Chemistry": {
|
|
"Average Score": 49.503134730071984,
|
|
"Standard Deviation": null,
|
|
"Rank": 26
|
|
},
|
|
"CPP": {
|
|
"Average Score": 47.23672563994903,
|
|
"Standard Deviation": null,
|
|
"Rank": 21
|
|
},
|
|
"Physics": {
|
|
"Average Score": 71.80748688814478,
|
|
"Standard Deviation": null,
|
|
"Rank": 22
|
|
},
|
|
"Biology": {
|
|
"Average Score": 56.35051024959833,
|
|
"Standard Deviation": null,
|
|
"Rank": 28
|
|
}
|
|
}
|
|
},
|
|
{
|
|
"config": {
|
|
"model_name": "mistral-8x7b-instruct-v0.1",
|
|
"organization": "Mistral",
|
|
"license": "Apache 2.0",
|
|
"knowledge_cutoff": "2023/12"
|
|
},
|
|
"results": {
|
|
"OVERALL": {
|
|
"Average Score": 41.89229040550289,
|
|
"Standard Deviation": 1.0093122675555612,
|
|
"Rank": 33
|
|
},
|
|
"Geometry": {
|
|
"Average Score": 33.703560702831055,
|
|
"Standard Deviation": null,
|
|
"Rank": 38
|
|
},
|
|
"Algebra": {
|
|
"Average Score": 50.89266418264096,
|
|
"Standard Deviation": null,
|
|
"Rank": 37
|
|
},
|
|
"Probability": {
|
|
"Average Score": 44.763608895327415,
|
|
"Standard Deviation": null,
|
|
"Rank": 32
|
|
},
|
|
"Logical": {
|
|
"Average Score": 40.32090734088309,
|
|
"Standard Deviation": null,
|
|
"Rank": 35
|
|
},
|
|
"Social": {
|
|
"Average Score": 36.25120096194333,
|
|
"Standard Deviation": null,
|
|
"Rank": 42
|
|
},
|
|
"Chemistry": {
|
|
"Average Score": 45.537417249801685,
|
|
"Standard Deviation": null,
|
|
"Rank": 29
|
|
},
|
|
"CPP": {
|
|
"Average Score": 44.533118241976666,
|
|
"Standard Deviation": null,
|
|
"Rank": 25
|
|
},
|
|
"Physics": {
|
|
"Average Score": 59.27177919021739,
|
|
"Standard Deviation": null,
|
|
"Rank": 29
|
|
},
|
|
"Biology": {
|
|
"Average Score": 53.73577835290789,
|
|
"Standard Deviation": null,
|
|
"Rank": 29
|
|
}
|
|
}
|
|
},
|
|
{
|
|
"config": {
|
|
"model_name": "claude-2.0",
|
|
"organization": "Anthropic",
|
|
"license": "Proprietary",
|
|
"knowledge_cutoff": "Unknown"
|
|
},
|
|
"results": {
|
|
"OVERALL": {
|
|
"Average Score": 29.746629448410072,
|
|
"Standard Deviation": 2.904279782741168,
|
|
"Rank": 44
|
|
},
|
|
"Geometry": {
|
|
"Average Score": 38.83959305205546,
|
|
"Standard Deviation": null,
|
|
"Rank": 33
|
|
},
|
|
"Algebra": {
|
|
"Average Score": 50.95581898913443,
|
|
"Standard Deviation": null,
|
|
"Rank": 36
|
|
},
|
|
"Probability": {
|
|
"Average Score": 46.77856061078482,
|
|
"Standard Deviation": null,
|
|
"Rank": 31
|
|
},
|
|
"Logical": {
|
|
"Average Score": 55.87663184155831,
|
|
"Standard Deviation": null,
|
|
"Rank": 28
|
|
},
|
|
"Social": {
|
|
"Average Score": 52.418630462591864,
|
|
"Standard Deviation": null,
|
|
"Rank": 24
|
|
},
|
|
"Chemistry": {
|
|
"Average Score": 54.485802241006866,
|
|
"Standard Deviation": null,
|
|
"Rank": 24
|
|
},
|
|
"CPP": {
|
|
"Average Score": 50.773143448036464,
|
|
"Standard Deviation": null,
|
|
"Rank": 19
|
|
},
|
|
"Physics": {
|
|
"Average Score": 70.21815140033613,
|
|
"Standard Deviation": null,
|
|
"Rank": 23
|
|
},
|
|
"Biology": {
|
|
"Average Score": 58.06960426451617,
|
|
"Standard Deviation": null,
|
|
"Rank": 26
|
|
}
|
|
}
|
|
},
|
|
{
|
|
"config": {
|
|
"model_name": "starling-lm-7b-beta",
|
|
"organization": "Nexusflow",
|
|
"license": "Apache-2.0",
|
|
"knowledge_cutoff": "2024/03"
|
|
},
|
|
"results": {
|
|
"OVERALL": {
|
|
"Average Score": 49.37320778476737,
|
|
"Standard Deviation": 3.6745696228749076,
|
|
"Rank": 28
|
|
},
|
|
"Geometry": {
|
|
"Average Score": 34.931531551032506,
|
|
"Standard Deviation": null,
|
|
"Rank": 37
|
|
},
|
|
"Algebra": {
|
|
"Average Score": 51.66718360952931,
|
|
"Standard Deviation": null,
|
|
"Rank": 35
|
|
},
|
|
"Probability": {
|
|
"Average Score": 40.79623349276488,
|
|
"Standard Deviation": null,
|
|
"Rank": 34
|
|
},
|
|
"Logical": {
|
|
"Average Score": 47.86775375284415,
|
|
"Standard Deviation": null,
|
|
"Rank": 30
|
|
},
|
|
"Social": {
|
|
"Average Score": 42.30631821350664,
|
|
"Standard Deviation": null,
|
|
"Rank": 33
|
|
},
|
|
"Chemistry": {
|
|
"Average Score": 38.68957842968336,
|
|
"Standard Deviation": null,
|
|
"Rank": 38
|
|
},
|
|
"CPP": {
|
|
"Average Score": 38.27587102395908,
|
|
"Standard Deviation": null,
|
|
"Rank": 32
|
|
},
|
|
"Physics": {
|
|
"Average Score": 43.122496379867655,
|
|
"Standard Deviation": null,
|
|
"Rank": 40
|
|
},
|
|
"Biology": {
|
|
"Average Score": 49.80517713841127,
|
|
"Standard Deviation": null,
|
|
"Rank": 35
|
|
}
|
|
}
|
|
},
|
|
{
|
|
"config": {
|
|
"model_name": "gemini-1.0-pro-001",
|
|
"organization": "Google",
|
|
"license": "Proprietary",
|
|
"knowledge_cutoff": "2023/04"
|
|
},
|
|
"results": {
|
|
"OVERALL": {
|
|
"Average Score": 37.757029496159134,
|
|
"Standard Deviation": 2.4871563947325797,
|
|
"Rank": 38
|
|
},
|
|
"Geometry": {
|
|
"Average Score": 35.792088134579124,
|
|
"Standard Deviation": null,
|
|
"Rank": 36
|
|
},
|
|
"Algebra": {
|
|
"Average Score": 50.157930404365224,
|
|
"Standard Deviation": null,
|
|
"Rank": 38
|
|
},
|
|
"Probability": {
|
|
"Average Score": 25.033769367203313,
|
|
"Standard Deviation": null,
|
|
"Rank": 47
|
|
},
|
|
"Logical": {
|
|
"Average Score": 23.38732786204667,
|
|
"Standard Deviation": null,
|
|
"Rank": 46
|
|
},
|
|
"Social": {
|
|
"Average Score": 26.25171796810704,
|
|
"Standard Deviation": null,
|
|
"Rank": 51
|
|
},
|
|
"Chemistry": {
|
|
"Average Score": 43.59712830576298,
|
|
"Standard Deviation": null,
|
|
"Rank": 32
|
|
},
|
|
"CPP": {
|
|
"Average Score": 45.22204471452975,
|
|
"Standard Deviation": null,
|
|
"Rank": 23
|
|
},
|
|
"Physics": {
|
|
"Average Score": 62.1145967631314,
|
|
"Standard Deviation": null,
|
|
"Rank": 28
|
|
},
|
|
"Biology": {
|
|
"Average Score": 38.93328880463975,
|
|
"Standard Deviation": null,
|
|
"Rank": 46
|
|
}
|
|
}
|
|
},
|
|
{
|
|
"config": {
|
|
"model_name": "openchat-3.5-0106",
|
|
"organization": "OpenChat",
|
|
"license": "Apache-2.0",
|
|
"knowledge_cutoff": "2024/01"
|
|
},
|
|
"results": {
|
|
"OVERALL": {
|
|
"Average Score": 39.892305843585234,
|
|
"Standard Deviation": 2.147396504115797,
|
|
"Rank": 35
|
|
},
|
|
"Geometry": {
|
|
"Average Score": 29.941588970091672,
|
|
"Standard Deviation": null,
|
|
"Rank": 40
|
|
},
|
|
"Algebra": {
|
|
"Average Score": 47.48449168554534,
|
|
"Standard Deviation": null,
|
|
"Rank": 39
|
|
},
|
|
"Probability": {
|
|
"Average Score": 39.64777697224284,
|
|
"Standard Deviation": null,
|
|
"Rank": 36
|
|
},
|
|
"Logical": {
|
|
"Average Score": 41.361836834955504,
|
|
"Standard Deviation": null,
|
|
"Rank": 33
|
|
},
|
|
"Social": {
|
|
"Average Score": 36.716597579856675,
|
|
"Standard Deviation": null,
|
|
"Rank": 41
|
|
},
|
|
"Chemistry": {
|
|
"Average Score": 32.618034432282414,
|
|
"Standard Deviation": null,
|
|
"Rank": 41
|
|
},
|
|
"CPP": {
|
|
"Average Score": 33.70639271807677,
|
|
"Standard Deviation": null,
|
|
"Rank": 33
|
|
},
|
|
"Physics": {
|
|
"Average Score": 41.117269227834775,
|
|
"Standard Deviation": null,
|
|
"Rank": 42
|
|
},
|
|
"Biology": {
|
|
"Average Score": 46.46694211682319,
|
|
"Standard Deviation": null,
|
|
"Rank": 38
|
|
}
|
|
}
|
|
},
|
|
{
|
|
"config": {
|
|
"model_name": "openchat-3.5",
|
|
"organization": "OpenChat",
|
|
"license": "Apache-2.0",
|
|
"knowledge_cutoff": "2023/11"
|
|
},
|
|
"results": {
|
|
"OVERALL": {
|
|
"Average Score": 38.241198423073044,
|
|
"Standard Deviation": 0.5484943791516782,
|
|
"Rank": 37
|
|
},
|
|
"Geometry": {
|
|
"Average Score": 30.89638678506991,
|
|
"Standard Deviation": null,
|
|
"Rank": 39
|
|
},
|
|
"Algebra": {
|
|
"Average Score": 41.83128388520244,
|
|
"Standard Deviation": null,
|
|
"Rank": 42
|
|
},
|
|
"Probability": {
|
|
"Average Score": 36.10478976665624,
|
|
"Standard Deviation": null,
|
|
"Rank": 39
|
|
},
|
|
"Logical": {
|
|
"Average Score": 40.320934300651516,
|
|
"Standard Deviation": null,
|
|
"Rank": 34
|
|
},
|
|
"Social": {
|
|
"Average Score": 43.49055300551458,
|
|
"Standard Deviation": null,
|
|
"Rank": 31
|
|
},
|
|
"Chemistry": {
|
|
"Average Score": 34.73882038803731,
|
|
"Standard Deviation": null,
|
|
"Rank": 40
|
|
},
|
|
"CPP": {
|
|
"Average Score": 33.020911255646965,
|
|
"Standard Deviation": null,
|
|
"Rank": 34
|
|
},
|
|
"Physics": {
|
|
"Average Score": 43.28671808104924,
|
|
"Standard Deviation": null,
|
|
"Rank": 39
|
|
},
|
|
"Biology": {
|
|
"Average Score": 37.18520956253795,
|
|
"Standard Deviation": null,
|
|
"Rank": 47
|
|
}
|
|
}
|
|
},
|
|
{
|
|
"config": {
|
|
"model_name": "command-r-(08-2024)",
|
|
"organization": "Cohere",
|
|
"license": "CC-BY-NC-4.0",
|
|
"knowledge_cutoff": "2024/08"
|
|
},
|
|
"results": {
|
|
"OVERALL": {
|
|
"Average Score": 45.419599943563604,
|
|
"Standard Deviation": 3.867586763039621,
|
|
"Rank": 30
|
|
},
|
|
"Geometry": {
|
|
"Average Score": 36.68143035371426,
|
|
"Standard Deviation": null,
|
|
"Rank": 35
|
|
},
|
|
"Algebra": {
|
|
"Average Score": 41.64517540472657,
|
|
"Standard Deviation": null,
|
|
"Rank": 43
|
|
},
|
|
"Probability": {
|
|
"Average Score": 37.95189112967414,
|
|
"Standard Deviation": null,
|
|
"Rank": 38
|
|
},
|
|
"Logical": {
|
|
"Average Score": 25.409088658564166,
|
|
"Standard Deviation": null,
|
|
"Rank": 43
|
|
},
|
|
"Social": {
|
|
"Average Score": 40.389393367109264,
|
|
"Standard Deviation": null,
|
|
"Rank": 37
|
|
},
|
|
"Chemistry": {
|
|
"Average Score": 40.08660883479598,
|
|
"Standard Deviation": null,
|
|
"Rank": 36
|
|
},
|
|
"CPP": {
|
|
"Average Score": 39.61492485677676,
|
|
"Standard Deviation": null,
|
|
"Rank": 30
|
|
},
|
|
"Physics": {
|
|
"Average Score": 49.51833550380945,
|
|
"Standard Deviation": null,
|
|
"Rank": 38
|
|
},
|
|
"Biology": {
|
|
"Average Score": 46.55085862120477,
|
|
"Standard Deviation": null,
|
|
"Rank": 37
|
|
}
|
|
}
|
|
},
|
|
{
|
|
"config": {
|
|
"model_name": "gemma-1.1-7b-it",
|
|
"organization": "Google",
|
|
"license": "Gemma License",
|
|
"knowledge_cutoff": "2024/02"
|
|
},
|
|
"results": {
|
|
"OVERALL": {
|
|
"Average Score": 31.46481370727848,
|
|
"Standard Deviation": 5.403408635399989,
|
|
"Rank": 42
|
|
},
|
|
"Geometry": {
|
|
"Average Score": 26.078500005143134,
|
|
"Standard Deviation": null,
|
|
"Rank": 45
|
|
},
|
|
"Algebra": {
|
|
"Average Score": 40.92453155837702,
|
|
"Standard Deviation": null,
|
|
"Rank": 44
|
|
},
|
|
"Probability": {
|
|
"Average Score": 31.502661407350192,
|
|
"Standard Deviation": null,
|
|
"Rank": 44
|
|
},
|
|
"Logical": {
|
|
"Average Score": 39.27282391466396,
|
|
"Standard Deviation": null,
|
|
"Rank": 37
|
|
},
|
|
"Social": {
|
|
"Average Score": 31.639615427886643,
|
|
"Standard Deviation": null,
|
|
"Rank": 46
|
|
},
|
|
"Chemistry": {
|
|
"Average Score": 43.59704806585925,
|
|
"Standard Deviation": null,
|
|
"Rank": 33
|
|
},
|
|
"CPP": {
|
|
"Average Score": 42.666504105798204,
|
|
"Standard Deviation": null,
|
|
"Rank": 27
|
|
},
|
|
"Physics": {
|
|
"Average Score": 49.845369349755345,
|
|
"Standard Deviation": null,
|
|
"Rank": 36
|
|
},
|
|
"Biology": {
|
|
"Average Score": 45.813201684684124,
|
|
"Standard Deviation": null,
|
|
"Rank": 39
|
|
}
|
|
}
|
|
},
|
|
{
|
|
"config": {
|
|
"model_name": "llama3-8b-instruct",
|
|
"organization": "Meta",
|
|
"license": "Llama 3 Community",
|
|
"knowledge_cutoff": "2023/03"
|
|
},
|
|
"results": {
|
|
"OVERALL": {
|
|
"Average Score": 36.30010331322555,
|
|
"Standard Deviation": 2.6021295258334334,
|
|
"Rank": 40
|
|
},
|
|
"Geometry": {
|
|
"Average Score": 28.61237715170709,
|
|
"Standard Deviation": null,
|
|
"Rank": 42
|
|
},
|
|
"Algebra": {
|
|
"Average Score": 42.6394310988214,
|
|
"Standard Deviation": null,
|
|
"Rank": 41
|
|
},
|
|
"Probability": {
|
|
"Average Score": 35.51226405104781,
|
|
"Standard Deviation": null,
|
|
"Rank": 40
|
|
},
|
|
"Logical": {
|
|
"Average Score": 59.594410427422616,
|
|
"Standard Deviation": null,
|
|
"Rank": 26
|
|
},
|
|
"Social": {
|
|
"Average Score": 42.58469219441349,
|
|
"Standard Deviation": null,
|
|
"Rank": 32
|
|
},
|
|
"Chemistry": {
|
|
"Average Score": 48.45708298495634,
|
|
"Standard Deviation": null,
|
|
"Rank": 27
|
|
},
|
|
"CPP": {
|
|
"Average Score": 45.35392139264795,
|
|
"Standard Deviation": null,
|
|
"Rank": 22
|
|
},
|
|
"Physics": {
|
|
"Average Score": 58.61979255906953,
|
|
"Standard Deviation": null,
|
|
"Rank": 30
|
|
},
|
|
"Biology": {
|
|
"Average Score": 50.39755478099045,
|
|
"Standard Deviation": null,
|
|
"Rank": 34
|
|
}
|
|
}
|
|
},
|
|
{
|
|
"config": {
|
|
"model_name": "gemma-2-2b-it",
|
|
"organization": "Google",
|
|
"license": "Gemma License",
|
|
"knowledge_cutoff": "2024/07"
|
|
},
|
|
"results": {
|
|
"OVERALL": {
|
|
"Average Score": 58.76741528626868,
|
|
"Standard Deviation": 5.683174110350625,
|
|
"Rank": 24
|
|
},
|
|
"Geometry": {
|
|
"Average Score": 29.901411513695468,
|
|
"Standard Deviation": null,
|
|
"Rank": 41
|
|
},
|
|
"Algebra": {
|
|
"Average Score": 40.60048971047775,
|
|
"Standard Deviation": null,
|
|
"Rank": 45
|
|
},
|
|
"Probability": {
|
|
"Average Score": 33.448597365831304,
|
|
"Standard Deviation": null,
|
|
"Rank": 42
|
|
},
|
|
"Logical": {
|
|
"Average Score": 43.89688208707135,
|
|
"Standard Deviation": null,
|
|
"Rank": 31
|
|
},
|
|
"Social": {
|
|
"Average Score": 48.769368715100335,
|
|
"Standard Deviation": null,
|
|
"Rank": 26
|
|
},
|
|
"Chemistry": {
|
|
"Average Score": 28.982153819366474,
|
|
"Standard Deviation": null,
|
|
"Rank": 44
|
|
},
|
|
"CPP": {
|
|
"Average Score": 30.53406933106768,
|
|
"Standard Deviation": null,
|
|
"Rank": 36
|
|
},
|
|
"Physics": {
|
|
"Average Score": 22.78354134298823,
|
|
"Standard Deviation": null,
|
|
"Rank": 49
|
|
},
|
|
"Biology": {
|
|
"Average Score": 53.59359459245764,
|
|
"Standard Deviation": null,
|
|
"Rank": 30
|
|
}
|
|
}
|
|
},
|
|
{
|
|
"config": {
|
|
"model_name": "starling-lm-7b-alpha",
|
|
"organization": "Nexusflow",
|
|
"license": "Apache-2.0",
|
|
"knowledge_cutoff": "2023/11"
|
|
},
|
|
"results": {
|
|
"OVERALL": {
|
|
"Average Score": 36.98646367219327,
|
|
"Standard Deviation": 0.5488180472607256,
|
|
"Rank": 39
|
|
},
|
|
"Geometry": {
|
|
"Average Score": 26.472892835994372,
|
|
"Standard Deviation": null,
|
|
"Rank": 44
|
|
},
|
|
"Algebra": {
|
|
"Average Score": 38.4553696839335,
|
|
"Standard Deviation": null,
|
|
"Rank": 47
|
|
},
|
|
"Probability": {
|
|
"Average Score": 33.907837077924526,
|
|
"Standard Deviation": null,
|
|
"Rank": 41
|
|
},
|
|
"Logical": {
|
|
"Average Score": 33.129169647630114,
|
|
"Standard Deviation": null,
|
|
"Rank": 41
|
|
},
|
|
"Social": {
|
|
"Average Score": 39.97855588617487,
|
|
"Standard Deviation": null,
|
|
"Rank": 38
|
|
},
|
|
"Chemistry": {
|
|
"Average Score": 29.187364253387454,
|
|
"Standard Deviation": null,
|
|
"Rank": 43
|
|
},
|
|
"CPP": {
|
|
"Average Score": 30.07926487356878,
|
|
"Standard Deviation": null,
|
|
"Rank": 37
|
|
},
|
|
"Physics": {
|
|
"Average Score": 32.39068796677421,
|
|
"Standard Deviation": null,
|
|
"Rank": 43
|
|
},
|
|
"Biology": {
|
|
"Average Score": 40.884001946009214,
|
|
"Standard Deviation": null,
|
|
"Rank": 41
|
|
}
|
|
}
|
|
},
|
|
{
|
|
"config": {
|
|
"model_name": "qwen1.5-4b-chat",
|
|
"organization": "Alibaba",
|
|
"license": "Qianwen LICENSE",
|
|
"knowledge_cutoff": "2024/02"
|
|
},
|
|
"results": {
|
|
"OVERALL": {
|
|
"Average Score": 9.87888465860545,
|
|
"Standard Deviation": 0.8496756485041839,
|
|
"Rank": 58
|
|
},
|
|
"Geometry": {
|
|
"Average Score": 16.727214095722648,
|
|
"Standard Deviation": null,
|
|
"Rank": 51
|
|
},
|
|
"Algebra": {
|
|
"Average Score": 30.868954326245674,
|
|
"Standard Deviation": null,
|
|
"Rank": 48
|
|
},
|
|
"Probability": {
|
|
"Average Score": 12.542151831707827,
|
|
"Standard Deviation": null,
|
|
"Rank": 52
|
|
},
|
|
"Logical": {
|
|
"Average Score": 13.591142976589552,
|
|
"Standard Deviation": null,
|
|
"Rank": 55
|
|
},
|
|
"Social": {
|
|
"Average Score": 29.86221951671923,
|
|
"Standard Deviation": null,
|
|
"Rank": 47
|
|
},
|
|
"Chemistry": {
|
|
"Average Score": 15.258365841050109,
|
|
"Standard Deviation": null,
|
|
"Rank": 57
|
|
},
|
|
"CPP": {
|
|
"Average Score": 13.21208067122554,
|
|
"Standard Deviation": null,
|
|
"Rank": 47
|
|
},
|
|
"Physics": {
|
|
"Average Score": 12.8962411286233,
|
|
"Standard Deviation": null,
|
|
"Rank": 56
|
|
},
|
|
"Biology": {
|
|
"Average Score": 8.598267308776672,
|
|
"Standard Deviation": null,
|
|
"Rank": 61
|
|
}
|
|
}
|
|
},
|
|
{
|
|
"config": {
|
|
"model_name": "command-r-(04-2024)",
|
|
"organization": "Cohere",
|
|
"license": "CC-BY-NC-4.0",
|
|
"knowledge_cutoff": "2024/04"
|
|
},
|
|
"results": {
|
|
"OVERALL": {
|
|
"Average Score": 41.52933196050375,
|
|
"Standard Deviation": 2.241081240676662,
|
|
"Rank": 34
|
|
},
|
|
"Geometry": {
|
|
"Average Score": 25.015789717085156,
|
|
"Standard Deviation": null,
|
|
"Rank": 47
|
|
},
|
|
"Algebra": {
|
|
"Average Score": 30.86273392294722,
|
|
"Standard Deviation": null,
|
|
"Rank": 49
|
|
},
|
|
"Probability": {
|
|
"Average Score": 32.69230455171987,
|
|
"Standard Deviation": null,
|
|
"Rank": 43
|
|
},
|
|
"Logical": {
|
|
"Average Score": 34.412636294090625,
|
|
"Standard Deviation": null,
|
|
"Rank": 40
|
|
},
|
|
"Social": {
|
|
"Average Score": 41.24738365139523,
|
|
"Standard Deviation": null,
|
|
"Rank": 34
|
|
},
|
|
"Chemistry": {
|
|
"Average Score": 40.79571212108303,
|
|
"Standard Deviation": null,
|
|
"Rank": 35
|
|
},
|
|
"CPP": {
|
|
"Average Score": 41.346336503003236,
|
|
"Standard Deviation": null,
|
|
"Rank": 28
|
|
},
|
|
"Physics": {
|
|
"Average Score": 52.309001772076435,
|
|
"Standard Deviation": null,
|
|
"Rank": 34
|
|
},
|
|
"Biology": {
|
|
"Average Score": 49.100219607909104,
|
|
"Standard Deviation": null,
|
|
"Rank": 36
|
|
}
|
|
}
|
|
},
|
|
{
|
|
"config": {
|
|
"model_name": "vicuna-33b",
|
|
"organization": "LMSYS",
|
|
"license": "Non-commercial",
|
|
"knowledge_cutoff": "2023/08"
|
|
},
|
|
"results": {
|
|
"OVERALL": {
|
|
"Average Score": 26.771867469042252,
|
|
"Standard Deviation": 2.2628124527776685,
|
|
"Rank": 45
|
|
},
|
|
"Geometry": {
|
|
"Average Score": 17.75361072083444,
|
|
"Standard Deviation": null,
|
|
"Rank": 50
|
|
},
|
|
"Algebra": {
|
|
"Average Score": 24.801410292720103,
|
|
"Standard Deviation": null,
|
|
"Rank": 50
|
|
},
|
|
"Probability": {
|
|
"Average Score": 18.923598681430988,
|
|
"Standard Deviation": null,
|
|
"Rank": 50
|
|
},
|
|
"Logical": {
|
|
"Average Score": 22.485046383293895,
|
|
"Standard Deviation": null,
|
|
"Rank": 47
|
|
},
|
|
"Social": {
|
|
"Average Score": 37.63057970959196,
|
|
"Standard Deviation": null,
|
|
"Rank": 40
|
|
},
|
|
"Chemistry": {
|
|
"Average Score": 28.982029986253178,
|
|
"Standard Deviation": null,
|
|
"Rank": 45
|
|
},
|
|
"CPP": {
|
|
"Average Score": 28.01838653090379,
|
|
"Standard Deviation": null,
|
|
"Rank": 38
|
|
},
|
|
"Physics": {
|
|
"Average Score": 28.904101398112875,
|
|
"Standard Deviation": null,
|
|
"Rank": 45
|
|
},
|
|
"Biology": {
|
|
"Average Score": 40.66824421437282,
|
|
"Standard Deviation": null,
|
|
"Rank": 44
|
|
}
|
|
}
|
|
},
|
|
{
|
|
"config": {
|
|
"model_name": "gemma-7b-it",
|
|
"organization": "Google",
|
|
"license": "Gemma License",
|
|
"knowledge_cutoff": "2024/02"
|
|
},
|
|
"results": {
|
|
"OVERALL": {
|
|
"Average Score": 23.946098797294113,
|
|
"Standard Deviation": 1.882540513317503,
|
|
"Rank": 48
|
|
},
|
|
"Geometry": {
|
|
"Average Score": 20.947476737376597,
|
|
"Standard Deviation": null,
|
|
"Rank": 48
|
|
},
|
|
"Algebra": {
|
|
"Average Score": 23.018014851651127,
|
|
"Standard Deviation": null,
|
|
"Rank": 52
|
|
},
|
|
"Probability": {
|
|
"Average Score": 15.37360248124904,
|
|
"Standard Deviation": null,
|
|
"Rank": 51
|
|
},
|
|
"Logical": {
|
|
"Average Score": 23.856001036256362,
|
|
"Standard Deviation": null,
|
|
"Rank": 44
|
|
},
|
|
"Social": {
|
|
"Average Score": 33.803173718782276,
|
|
"Standard Deviation": null,
|
|
"Rank": 44
|
|
},
|
|
"Chemistry": {
|
|
"Average Score": 28.96403210090221,
|
|
"Standard Deviation": null,
|
|
"Rank": 46
|
|
},
|
|
"CPP": {
|
|
"Average Score": 28.014658234926813,
|
|
"Standard Deviation": null,
|
|
"Rank": 39
|
|
},
|
|
"Physics": {
|
|
"Average Score": 31.52560551567879,
|
|
"Standard Deviation": null,
|
|
"Rank": 44
|
|
},
|
|
"Biology": {
|
|
"Average Score": 33.30740831237261,
|
|
"Standard Deviation": null,
|
|
"Rank": 48
|
|
}
|
|
}
|
|
},
|
|
{
|
|
"config": {
|
|
"model_name": "mistral-7b-instruct-2",
|
|
"organization": "Mistral",
|
|
"license": "Apache 2.0",
|
|
"knowledge_cutoff": "2023/12"
|
|
},
|
|
"results": {
|
|
"OVERALL": {
|
|
"Average Score": 30.425212839239084,
|
|
"Standard Deviation": 3.2420324833230745,
|
|
"Rank": 43
|
|
},
|
|
"Geometry": {
|
|
"Average Score": 17.98077256453581,
|
|
"Standard Deviation": null,
|
|
"Rank": 49
|
|
},
|
|
"Algebra": {
|
|
"Average Score": 23.03227606898818,
|
|
"Standard Deviation": null,
|
|
"Rank": 51
|
|
},
|
|
"Probability": {
|
|
"Average Score": 22.515548503444595,
|
|
"Standard Deviation": null,
|
|
"Rank": 48
|
|
},
|
|
"Logical": {
|
|
"Average Score": 28.172299674407935,
|
|
"Standard Deviation": null,
|
|
"Rank": 42
|
|
},
|
|
"Social": {
|
|
"Average Score": 32.34681006422513,
|
|
"Standard Deviation": null,
|
|
"Rank": 45
|
|
},
|
|
"Chemistry": {
|
|
"Average Score": 29.847754052571794,
|
|
"Standard Deviation": null,
|
|
"Rank": 42
|
|
},
|
|
"CPP": {
|
|
"Average Score": 31.382959631870822,
|
|
"Standard Deviation": null,
|
|
"Rank": 35
|
|
},
|
|
"Physics": {
|
|
"Average Score": 42.179522893964496,
|
|
"Standard Deviation": null,
|
|
"Rank": 41
|
|
},
|
|
"Biology": {
|
|
"Average Score": 40.80741758174906,
|
|
"Standard Deviation": null,
|
|
"Rank": 42
|
|
}
|
|
}
|
|
},
|
|
{
|
|
"config": {
|
|
"model_name": "mistral-7b-instruct-1",
|
|
"organization": "Mistral",
|
|
"license": "Apache 2.0",
|
|
"knowledge_cutoff": "2023/12"
|
|
},
|
|
"results": {
|
|
"OVERALL": {
|
|
"Average Score": 19.00770440704137,
|
|
"Standard Deviation": 2.5108129577834823,
|
|
"Rank": 55
|
|
},
|
|
"Geometry": {
|
|
"Average Score": 11.76124122331528,
|
|
"Standard Deviation": null,
|
|
"Rank": 55
|
|
},
|
|
"Algebra": {
|
|
"Average Score": 20.16800788676758,
|
|
"Standard Deviation": null,
|
|
"Rank": 53
|
|
},
|
|
"Probability": {
|
|
"Average Score": 21.982214302316194,
|
|
"Standard Deviation": null,
|
|
"Rank": 49
|
|
},
|
|
"Logical": {
|
|
"Average Score": 16.458119477880455,
|
|
"Standard Deviation": null,
|
|
"Rank": 51
|
|
},
|
|
"Social": {
|
|
"Average Score": 11.83909143203254,
|
|
"Standard Deviation": null,
|
|
"Rank": 56
|
|
},
|
|
"Chemistry": {
|
|
"Average Score": 20.227175038540732,
|
|
"Standard Deviation": null,
|
|
"Rank": 52
|
|
},
|
|
"CPP": {
|
|
"Average Score": 18.929093202755805,
|
|
"Standard Deviation": null,
|
|
"Rank": 42
|
|
},
|
|
"Physics": {
|
|
"Average Score": 16.942666711550366,
|
|
"Standard Deviation": null,
|
|
"Rank": 53
|
|
},
|
|
"Biology": {
|
|
"Average Score": 14.862055999215585,
|
|
"Standard Deviation": null,
|
|
"Rank": 56
|
|
}
|
|
}
|
|
},
|
|
{
|
|
"config": {
|
|
"model_name": "vicuna-13b",
|
|
"organization": "LMSYS",
|
|
"license": "Non-commercial",
|
|
"knowledge_cutoff": "2023/07"
|
|
},
|
|
"results": {
|
|
"OVERALL": {
|
|
"Average Score": 17.596440211877606,
|
|
"Standard Deviation": 2.1378036693126887,
|
|
"Rank": 56
|
|
},
|
|
"Geometry": {
|
|
"Average Score": 13.613562588758793,
|
|
"Standard Deviation": null,
|
|
"Rank": 54
|
|
},
|
|
"Algebra": {
|
|
"Average Score": 17.777580357601646,
|
|
"Standard Deviation": null,
|
|
"Rank": 54
|
|
},
|
|
"Probability": {
|
|
"Average Score": 11.773651220819335,
|
|
"Standard Deviation": null,
|
|
"Rank": 53
|
|
},
|
|
"Logical": {
|
|
"Average Score": 16.62840722654711,
|
|
"Standard Deviation": null,
|
|
"Rank": 50
|
|
},
|
|
"Social": {
|
|
"Average Score": 12.015284814277452,
|
|
"Standard Deviation": null,
|
|
"Rank": 54
|
|
},
|
|
"Chemistry": {
|
|
"Average Score": 22.59071707495557,
|
|
"Standard Deviation": null,
|
|
"Rank": 49
|
|
},
|
|
"CPP": {
|
|
"Average Score": 21.840013221590294,
|
|
"Standard Deviation": null,
|
|
"Rank": 40
|
|
},
|
|
"Physics": {
|
|
"Average Score": 23.12484986614339,
|
|
"Standard Deviation": null,
|
|
"Rank": 48
|
|
},
|
|
"Biology": {
|
|
"Average Score": 32.46475144310054,
|
|
"Standard Deviation": null,
|
|
"Rank": 49
|
|
}
|
|
}
|
|
},
|
|
{
|
|
"config": {
|
|
"model_name": "zephyr-7b-beta",
|
|
"organization": "HuggingFace",
|
|
"license": "MIT",
|
|
"knowledge_cutoff": "2023/10"
|
|
},
|
|
"results": {
|
|
"OVERALL": {
|
|
"Average Score": 9.430771900746599,
|
|
"Standard Deviation": 0.5392686957469028,
|
|
"Rank": 59
|
|
},
|
|
"Geometry": {
|
|
"Average Score": 8.776172464719641,
|
|
"Standard Deviation": null,
|
|
"Rank": 56
|
|
},
|
|
"Algebra": {
|
|
"Average Score": 12.864251022808256,
|
|
"Standard Deviation": null,
|
|
"Rank": 55
|
|
},
|
|
"Probability": {
|
|
"Average Score": 6.856387198441145,
|
|
"Standard Deviation": null,
|
|
"Rank": 58
|
|
},
|
|
"Logical": {
|
|
"Average Score": 7.23067331414496,
|
|
"Standard Deviation": null,
|
|
"Rank": 59
|
|
},
|
|
"Social": {
|
|
"Average Score": 0.0,
|
|
"Standard Deviation": null,
|
|
"Rank": 61
|
|
},
|
|
"Chemistry": {
|
|
"Average Score": 16.809164907349935,
|
|
"Standard Deviation": null,
|
|
"Rank": 54
|
|
},
|
|
"CPP": {
|
|
"Average Score": 18.92902220864132,
|
|
"Standard Deviation": null,
|
|
"Rank": 43
|
|
},
|
|
"Physics": {
|
|
"Average Score": 17.655293480361614,
|
|
"Standard Deviation": null,
|
|
"Rank": 52
|
|
},
|
|
"Biology": {
|
|
"Average Score": 12.415097886994968,
|
|
"Standard Deviation": null,
|
|
"Rank": 58
|
|
}
|
|
}
|
|
},
|
|
{
|
|
"config": {
|
|
"model_name": "gemma-1.1-2b-it",
|
|
"organization": "Google",
|
|
"license": "Gemma License",
|
|
"knowledge_cutoff": "2024/02"
|
|
},
|
|
"results": {
|
|
"OVERALL": {
|
|
"Average Score": 21.90250655573766,
|
|
"Standard Deviation": 1.9871388098125085,
|
|
"Rank": 52
|
|
},
|
|
"Geometry": {
|
|
"Average Score": 13.697788759430225,
|
|
"Standard Deviation": null,
|
|
"Rank": 53
|
|
},
|
|
"Algebra": {
|
|
"Average Score": 12.157310639752737,
|
|
"Standard Deviation": null,
|
|
"Rank": 56
|
|
},
|
|
"Probability": {
|
|
"Average Score": 7.449868080506948,
|
|
"Standard Deviation": null,
|
|
"Rank": 56
|
|
},
|
|
"Logical": {
|
|
"Average Score": 10.62657710416428,
|
|
"Standard Deviation": null,
|
|
"Rank": 57
|
|
},
|
|
"Social": {
|
|
"Average Score": 29.175325965898267,
|
|
"Standard Deviation": null,
|
|
"Rank": 48
|
|
},
|
|
"Chemistry": {
|
|
"Average Score": 21.740619629476075,
|
|
"Standard Deviation": null,
|
|
"Rank": 50
|
|
},
|
|
"CPP": {
|
|
"Average Score": 20.724691953843916,
|
|
"Standard Deviation": null,
|
|
"Rank": 41
|
|
},
|
|
"Physics": {
|
|
"Average Score": 23.632640386132042,
|
|
"Standard Deviation": null,
|
|
"Rank": 47
|
|
},
|
|
"Biology": {
|
|
"Average Score": 29.750661487753543,
|
|
"Standard Deviation": null,
|
|
"Rank": 50
|
|
}
|
|
}
|
|
},
|
|
{
|
|
"config": {
|
|
"model_name": "llama2-7b-chat",
|
|
"organization": "Meta",
|
|
"license": "Llama 2 Community",
|
|
"knowledge_cutoff": "2023/07"
|
|
},
|
|
"results": {
|
|
"OVERALL": {
|
|
"Average Score": 23.15262700172829,
|
|
"Standard Deviation": 1.5180515912969421,
|
|
"Rank": 50
|
|
},
|
|
"Geometry": {
|
|
"Average Score": 6.062981955604592,
|
|
"Standard Deviation": null,
|
|
"Rank": 57
|
|
},
|
|
"Algebra": {
|
|
"Average Score": 9.702442741719038,
|
|
"Standard Deviation": null,
|
|
"Rank": 58
|
|
},
|
|
"Probability": {
|
|
"Average Score": 7.323764901851239,
|
|
"Standard Deviation": null,
|
|
"Rank": 57
|
|
},
|
|
"Logical": {
|
|
"Average Score": 20.042615636879354,
|
|
"Standard Deviation": null,
|
|
"Rank": 49
|
|
},
|
|
"Social": {
|
|
"Average Score": 28.003092092497983,
|
|
"Standard Deviation": null,
|
|
"Rank": 49
|
|
},
|
|
"Chemistry": {
|
|
"Average Score": 20.22732766050842,
|
|
"Standard Deviation": null,
|
|
"Rank": 51
|
|
},
|
|
"CPP": {
|
|
"Average Score": 15.730513733660898,
|
|
"Standard Deviation": null,
|
|
"Rank": 45
|
|
},
|
|
"Physics": {
|
|
"Average Score": 12.866623115939365,
|
|
"Standard Deviation": null,
|
|
"Rank": 57
|
|
},
|
|
"Biology": {
|
|
"Average Score": 29.435323133887913,
|
|
"Standard Deviation": null,
|
|
"Rank": 51
|
|
}
|
|
}
|
|
},
|
|
{
|
|
"config": {
|
|
"model_name": "gemma-2b-it",
|
|
"organization": "Google",
|
|
"license": "Gemma License",
|
|
"knowledge_cutoff": "2024/02"
|
|
},
|
|
"results": {
|
|
"OVERALL": {
|
|
"Average Score": 20.296640473489866,
|
|
"Standard Deviation": 2.333666507610861,
|
|
"Rank": 53
|
|
},
|
|
"Geometry": {
|
|
"Average Score": 16.155982788407485,
|
|
"Standard Deviation": null,
|
|
"Rank": 52
|
|
},
|
|
"Algebra": {
|
|
"Average Score": 9.997670449242714,
|
|
"Standard Deviation": null,
|
|
"Rank": 57
|
|
},
|
|
"Probability": {
|
|
"Average Score": 6.055292262170126,
|
|
"Standard Deviation": null,
|
|
"Rank": 59
|
|
},
|
|
"Logical": {
|
|
"Average Score": 5.200573121259635,
|
|
"Standard Deviation": null,
|
|
"Rank": 60
|
|
},
|
|
"Social": {
|
|
"Average Score": 9.560337024016134,
|
|
"Standard Deviation": null,
|
|
"Rank": 58
|
|
},
|
|
"Chemistry": {
|
|
"Average Score": 16.613881599313693,
|
|
"Standard Deviation": null,
|
|
"Rank": 55
|
|
},
|
|
"CPP": {
|
|
"Average Score": 17.2715657115764,
|
|
"Standard Deviation": null,
|
|
"Rank": 44
|
|
},
|
|
"Physics": {
|
|
"Average Score": 17.72258050873005,
|
|
"Standard Deviation": null,
|
|
"Rank": 51
|
|
},
|
|
"Biology": {
|
|
"Average Score": 10.891363209321185,
|
|
"Standard Deviation": null,
|
|
"Rank": 59
|
|
}
|
|
}
|
|
},
|
|
{
|
|
"config": {
|
|
"model_name": "llama2-13b-chat",
|
|
"organization": "Meta",
|
|
"license": "Llama 2 Community",
|
|
"knowledge_cutoff": "2023/07"
|
|
},
|
|
"results": {
|
|
"OVERALL": {
|
|
"Average Score": 22.40246822660458,
|
|
"Standard Deviation": 1.5744155926563603,
|
|
"Rank": 51
|
|
},
|
|
"Geometry": {
|
|
"Average Score": 4.287260426268335,
|
|
"Standard Deviation": null,
|
|
"Rank": 59
|
|
},
|
|
"Algebra": {
|
|
"Average Score": 7.122650832792122,
|
|
"Standard Deviation": null,
|
|
"Rank": 59
|
|
},
|
|
"Probability": {
|
|
"Average Score": 10.367779885088286,
|
|
"Standard Deviation": null,
|
|
"Rank": 54
|
|
},
|
|
"Logical": {
|
|
"Average Score": 23.416885515011753,
|
|
"Standard Deviation": null,
|
|
"Rank": 45
|
|
},
|
|
"Social": {
|
|
"Average Score": 26.251837552806705,
|
|
"Standard Deviation": null,
|
|
"Rank": 50
|
|
},
|
|
"Chemistry": {
|
|
"Average Score": 15.236408439765913,
|
|
"Standard Deviation": null,
|
|
"Rank": 58
|
|
},
|
|
"CPP": {
|
|
"Average Score": 13.17258252933903,
|
|
"Standard Deviation": null,
|
|
"Rank": 48
|
|
},
|
|
"Physics": {
|
|
"Average Score": 9.756032013938237,
|
|
"Standard Deviation": null,
|
|
"Rank": 58
|
|
},
|
|
"Biology": {
|
|
"Average Score": 14.373926163839833,
|
|
"Standard Deviation": null,
|
|
"Rank": 57
|
|
}
|
|
}
|
|
},
|
|
{
|
|
"config": {
|
|
"model_name": "vicuna-7b",
|
|
"organization": "LMSYS",
|
|
"license": "Non-commercial",
|
|
"knowledge_cutoff": "2023/07"
|
|
},
|
|
"results": {
|
|
"OVERALL": {
|
|
"Average Score": 16.947504584923095,
|
|
"Standard Deviation": 2.1935303160759494,
|
|
"Rank": 57
|
|
},
|
|
"Geometry": {
|
|
"Average Score": 5.6556788835908565,
|
|
"Standard Deviation": null,
|
|
"Rank": 58
|
|
},
|
|
"Algebra": {
|
|
"Average Score": 6.937810777972691,
|
|
"Standard Deviation": null,
|
|
"Rank": 60
|
|
},
|
|
"Probability": {
|
|
"Average Score": 7.449902539116639,
|
|
"Standard Deviation": null,
|
|
"Rank": 55
|
|
},
|
|
"Logical": {
|
|
"Average Score": 11.53991650872671,
|
|
"Standard Deviation": null,
|
|
"Rank": 56
|
|
},
|
|
"Social": {
|
|
"Average Score": 10.510431618145562,
|
|
"Standard Deviation": null,
|
|
"Rank": 57
|
|
},
|
|
"Chemistry": {
|
|
"Average Score": 15.565621989451936,
|
|
"Standard Deviation": null,
|
|
"Rank": 56
|
|
},
|
|
"CPP": {
|
|
"Average Score": 14.255194156624162,
|
|
"Standard Deviation": null,
|
|
"Rank": 46
|
|
},
|
|
"Physics": {
|
|
"Average Score": 13.654470501928998,
|
|
"Standard Deviation": null,
|
|
"Rank": 55
|
|
},
|
|
"Biology": {
|
|
"Average Score": 16.31264249867034,
|
|
"Standard Deviation": null,
|
|
"Rank": 55
|
|
}
|
|
}
|
|
},
|
|
{
|
|
"config": {
|
|
"model_name": "koala-13b",
|
|
"organization": "UC Berkeley",
|
|
"license": "Non-commercial",
|
|
"knowledge_cutoff": "2023/04"
|
|
},
|
|
"results": {
|
|
"OVERALL": {
|
|
"Average Score": 8.83755726181737,
|
|
"Standard Deviation": 0.6967904064276641,
|
|
"Rank": 60
|
|
},
|
|
"Geometry": {
|
|
"Average Score": 0.16630617078665783,
|
|
"Standard Deviation": null,
|
|
"Rank": 60
|
|
},
|
|
"Algebra": {
|
|
"Average Score": 2.2176438662182405,
|
|
"Standard Deviation": null,
|
|
"Rank": 61
|
|
},
|
|
"Probability": {
|
|
"Average Score": 3.0086045641099886,
|
|
"Standard Deviation": null,
|
|
"Rank": 60
|
|
},
|
|
"Logical": {
|
|
"Average Score": 8.007902379487398,
|
|
"Standard Deviation": null,
|
|
"Rank": 58
|
|
},
|
|
"Social": {
|
|
"Average Score": 9.267400643797334,
|
|
"Standard Deviation": null,
|
|
"Rank": 59
|
|
},
|
|
"Chemistry": {
|
|
"Average Score": 6.881971917535636,
|
|
"Standard Deviation": null,
|
|
"Rank": 59
|
|
},
|
|
"CPP": {
|
|
"Average Score": 6.36433272373514,
|
|
"Standard Deviation": null,
|
|
"Rank": 49
|
|
},
|
|
"Physics": {
|
|
"Average Score": 1.4745736403582252,
|
|
"Standard Deviation": null,
|
|
"Rank": 59
|
|
},
|
|
"Biology": {
|
|
"Average Score": 10.173901160370301,
|
|
"Standard Deviation": null,
|
|
"Rank": 60
|
|
}
|
|
}
|
|
},
|
|
{
|
|
"config": {
|
|
"model_name": "openassistant-pythia-12b",
|
|
"organization": "OpenAssistant",
|
|
"license": "Non-commercial",
|
|
"knowledge_cutoff": "2023/04"
|
|
},
|
|
"results": {
|
|
"OVERALL": {
|
|
"Average Score": 0.0,
|
|
"Standard Deviation": 0.0,
|
|
"Rank": 61
|
|
},
|
|
"Geometry": {
|
|
"Average Score": 0.0,
|
|
"Standard Deviation": null,
|
|
"Rank": 61
|
|
},
|
|
"Algebra": {
|
|
"Average Score": 0.0,
|
|
"Standard Deviation": null,
|
|
"Rank": 62
|
|
},
|
|
"Probability": {
|
|
"Average Score": 0.0,
|
|
"Standard Deviation": null,
|
|
"Rank": 61
|
|
},
|
|
"Logical": {
|
|
"Average Score": 0.0,
|
|
"Standard Deviation": null,
|
|
"Rank": 61
|
|
},
|
|
"Social": {
|
|
"Average Score": 1.5648937446490145,
|
|
"Standard Deviation": null,
|
|
"Rank": 60
|
|
},
|
|
"Chemistry": {
|
|
"Average Score": 0.0,
|
|
"Standard Deviation": null,
|
|
"Rank": 60
|
|
},
|
|
"CPP": {
|
|
"Average Score": 0.0,
|
|
"Standard Deviation": null,
|
|
"Rank": 50
|
|
},
|
|
"Physics": {
|
|
"Average Score": 0.0,
|
|
"Standard Deviation": null,
|
|
"Rank": 60
|
|
},
|
|
"Biology": {
|
|
"Average Score": 0.0,
|
|
"Standard Deviation": null,
|
|
"Rank": 62
|
|
}
|
|
}
|
|
},
|
|
{
|
|
"config": {
|
|
"model_name": "nemotron-70b",
|
|
"organization": "NVIDIA",
|
|
"license": "Unknown",
|
|
"knowledge_cutoff": "Unknown"
|
|
},
|
|
"results": {
|
|
"OVERALL": {
|
|
"Average Score": 100.0,
|
|
"Standard Deviation": 0.0,
|
|
"Rank": 1
|
|
},
|
|
"Algebra": {
|
|
"Average Score": 80.66812253661826,
|
|
"Standard Deviation": null,
|
|
"Rank": 17
|
|
},
|
|
"Geometry": {
|
|
"Average Score": 64.79317124458657,
|
|
"Standard Deviation": null,
|
|
"Rank": 14
|
|
},
|
|
"Probability": {
|
|
"Average Score": 77.90998100977566,
|
|
"Standard Deviation": null,
|
|
"Rank": 10
|
|
},
|
|
"Logical": {
|
|
"Average Score": 92.79205249453312,
|
|
"Standard Deviation": null,
|
|
"Rank": 5
|
|
},
|
|
"Social": {
|
|
"Average Score": 100.0,
|
|
"Standard Deviation": null,
|
|
"Rank": 1
|
|
},
|
|
"Chemistry": {
|
|
"Average Score": 75.51792600714916,
|
|
"Standard Deviation": null,
|
|
"Rank": 14
|
|
},
|
|
"Physics": {
|
|
"Average Score": 87.87343018217607,
|
|
"Standard Deviation": null,
|
|
"Rank": 11
|
|
},
|
|
"Biology": {
|
|
"Average Score": 89.70989044405452,
|
|
"Standard Deviation": null,
|
|
"Rank": 6
|
|
}
|
|
}
|
|
},
|
|
{
|
|
"config": {
|
|
"model_name": "llama-3.2-3b-it",
|
|
"organization": "Meta",
|
|
"license": "Llama 3 Community",
|
|
"knowledge_cutoff": "Unknown"
|
|
},
|
|
"results": {
|
|
"OVERALL": {
|
|
"Average Score": 24.55648638012998,
|
|
"Standard Deviation": 2.7438328116042396,
|
|
"Rank": 47
|
|
},
|
|
"Algebra": {
|
|
"Average Score": 58.282081682035965,
|
|
"Standard Deviation": null,
|
|
"Rank": 32
|
|
},
|
|
"Probability": {
|
|
"Average Score": 38.82178804612166,
|
|
"Standard Deviation": null,
|
|
"Rank": 37
|
|
},
|
|
"Logical": {
|
|
"Average Score": 14.284884351545829,
|
|
"Standard Deviation": null,
|
|
"Rank": 53
|
|
},
|
|
"Social": {
|
|
"Average Score": 12.015170971293347,
|
|
"Standard Deviation": null,
|
|
"Rank": 55
|
|
},
|
|
"Chemistry": {
|
|
"Average Score": 28.594555260782386,
|
|
"Standard Deviation": null,
|
|
"Rank": 47
|
|
},
|
|
"Physics": {
|
|
"Average Score": 28.49646725691165,
|
|
"Standard Deviation": null,
|
|
"Rank": 46
|
|
},
|
|
"Biology": {
|
|
"Average Score": 19.26616886675504,
|
|
"Standard Deviation": null,
|
|
"Rank": 54
|
|
}
|
|
}
|
|
},
|
|
{
|
|
"config": {
|
|
"model_name": "glm-4-plus",
|
|
"organization": "Unknown",
|
|
"license": "Unknown",
|
|
"knowledge_cutoff": "Unknown"
|
|
},
|
|
"results": {
|
|
"Physics": {
|
|
"Average Score": 93.38486963586884,
|
|
"Standard Deviation": null,
|
|
"Rank": 4
|
|
},
|
|
"Biology": {
|
|
"Average Score": 92.22645537080881,
|
|
"Standard Deviation": null,
|
|
"Rank": 4
|
|
},
|
|
"Chemistry": {
|
|
"Average Score": 83.011021476943,
|
|
"Standard Deviation": null,
|
|
"Rank": 8
|
|
},
|
|
"Social": {
|
|
"Average Score": 96.10166232633848,
|
|
"Standard Deviation": null,
|
|
"Rank": 5
|
|
},
|
|
"Logical": {
|
|
"Average Score": 92.48639421432455,
|
|
"Standard Deviation": null,
|
|
"Rank": 6
|
|
},
|
|
"Algebra": {
|
|
"Average Score": 91.79128700104991,
|
|
"Standard Deviation": null,
|
|
"Rank": 7
|
|
},
|
|
"Geometry": {
|
|
"Average Score": 75.41344471165868,
|
|
"Standard Deviation": null,
|
|
"Rank": 10
|
|
},
|
|
"Probability": {
|
|
"Average Score": 76.73191937524591,
|
|
"Standard Deviation": null,
|
|
"Rank": 12
|
|
},
|
|
"OVERALL": {
|
|
"Average Score": 92.39089671677698,
|
|
"Standard Deviation": 0.5005865827133669,
|
|
"Rank": 6
|
|
}
|
|
}
|
|
},
|
|
{
|
|
"config": {
|
|
"model_name": "yi-lightning",
|
|
"organization": "Unknown",
|
|
"license": "Unknown",
|
|
"knowledge_cutoff": "Unknown"
|
|
},
|
|
"results": {
|
|
"Physics": {
|
|
"Average Score": 88.49402753650628,
|
|
"Standard Deviation": null,
|
|
"Rank": 9
|
|
},
|
|
"Biology": {
|
|
"Average Score": 90.37891957676416,
|
|
"Standard Deviation": null,
|
|
"Rank": 5
|
|
},
|
|
"Chemistry": {
|
|
"Average Score": 100.0,
|
|
"Standard Deviation": null,
|
|
"Rank": 1
|
|
},
|
|
"Social": {
|
|
"Average Score": 92.14580653902937,
|
|
"Standard Deviation": null,
|
|
"Rank": 6
|
|
},
|
|
"Logical": {
|
|
"Average Score": 94.75701503537329,
|
|
"Standard Deviation": null,
|
|
"Rank": 4
|
|
},
|
|
"Algebra": {
|
|
"Average Score": 93.3186019721947,
|
|
"Standard Deviation": null,
|
|
"Rank": 6
|
|
},
|
|
"Geometry": {
|
|
"Average Score": 76.16313216563569,
|
|
"Standard Deviation": null,
|
|
"Rank": 9
|
|
},
|
|
"Probability": {
|
|
"Average Score": 92.54460354742838,
|
|
"Standard Deviation": null,
|
|
"Rank": 3
|
|
},
|
|
"OVERALL": {
|
|
"Average Score": 96.802929532644,
|
|
"Standard Deviation": 0.27491691197906704,
|
|
"Rank": 3
|
|
}
|
|
}
|
|
},
|
|
{
|
|
"config": {
|
|
"model_name": "ministral-8b-it",
|
|
"organization": "Unknown",
|
|
"license": "Unknown",
|
|
"knowledge_cutoff": "Unknown"
|
|
},
|
|
"results": {
|
|
"Physics": {
|
|
"Average Score": 57.14492748742418,
|
|
"Standard Deviation": null,
|
|
"Rank": 32
|
|
},
|
|
"Biology": {
|
|
"Average Score": 53.5479824847229,
|
|
"Standard Deviation": null,
|
|
"Rank": 31
|
|
},
|
|
"Chemistry": {
|
|
"Average Score": 45.51400153833142,
|
|
"Standard Deviation": null,
|
|
"Rank": 30
|
|
},
|
|
"Social": {
|
|
"Average Score": 45.54025353861784,
|
|
"Standard Deviation": null,
|
|
"Rank": 29
|
|
},
|
|
"Logical": {
|
|
"Average Score": 59.25000685096734,
|
|
"Standard Deviation": null,
|
|
"Rank": 27
|
|
},
|
|
"Algebra": {
|
|
"Average Score": 58.56021213895309,
|
|
"Standard Deviation": null,
|
|
"Rank": 31
|
|
},
|
|
"Geometry": {
|
|
"Average Score": 54.902884398306554,
|
|
"Standard Deviation": null,
|
|
"Rank": 22
|
|
},
|
|
"Probability": {
|
|
"Average Score": 49.69358274321923,
|
|
"Standard Deviation": null,
|
|
"Rank": 29
|
|
},
|
|
"OVERALL": {
|
|
"Average Score": 45.88665474541969,
|
|
"Standard Deviation": 4.242263667629549,
|
|
"Rank": 29
|
|
}
|
|
}
|
|
},
|
|
{
|
|
"config": {
|
|
"model_name": "qwen2.5-1.5b",
|
|
"organization": "Unknown",
|
|
"license": "Unknown",
|
|
"knowledge_cutoff": "Unknown"
|
|
},
|
|
"results": {
|
|
"Physics": {
|
|
"Average Score": 50.38291508013627,
|
|
"Standard Deviation": null,
|
|
"Rank": 35
|
|
},
|
|
"Biology": {
|
|
"Average Score": 40.134558844170826,
|
|
"Standard Deviation": null,
|
|
"Rank": 45
|
|
},
|
|
"Chemistry": {
|
|
"Average Score": 34.891253153439166,
|
|
"Standard Deviation": null,
|
|
"Rank": 39
|
|
},
|
|
"Social": {
|
|
"Average Score": 39.812806552940735,
|
|
"Standard Deviation": null,
|
|
"Rank": 39
|
|
},
|
|
"Logical": {
|
|
"Average Score": 42.70305684307474,
|
|
"Standard Deviation": null,
|
|
"Rank": 32
|
|
},
|
|
"Algebra": {
|
|
"Average Score": 79.30455838359877,
|
|
"Standard Deviation": null,
|
|
"Rank": 18
|
|
},
|
|
"Geometry": {
|
|
"Average Score": 58.56739922365014,
|
|
"Standard Deviation": null,
|
|
"Rank": 18
|
|
},
|
|
"Probability": {
|
|
"Average Score": 68.07725566867765,
|
|
"Standard Deviation": null,
|
|
"Rank": 19
|
|
},
|
|
"OVERALL": {
|
|
"Average Score": 23.25904934716627,
|
|
"Standard Deviation": 1.5089621200216172,
|
|
"Rank": 49
|
|
}
|
|
}
|
|
},
|
|
{
|
|
"config": {
|
|
"model_name": "smollm2-1.7b",
|
|
"organization": "Unknown",
|
|
"license": "Unknown",
|
|
"knowledge_cutoff": "Unknown"
|
|
},
|
|
"results": {
|
|
"Physics": {
|
|
"Average Score": 20.328651604714242,
|
|
"Standard Deviation": null,
|
|
"Rank": 50
|
|
},
|
|
"Biology": {
|
|
"Average Score": 23.55167655906088,
|
|
"Standard Deviation": null,
|
|
"Rank": 53
|
|
},
|
|
"Chemistry": {
|
|
"Average Score": 17.90654461263675,
|
|
"Standard Deviation": null,
|
|
"Rank": 53
|
|
},
|
|
"Social": {
|
|
"Average Score": 18.586981509149783,
|
|
"Standard Deviation": null,
|
|
"Rank": 53
|
|
},
|
|
"Logical": {
|
|
"Average Score": 13.753294179366819,
|
|
"Standard Deviation": null,
|
|
"Rank": 54
|
|
},
|
|
"Algebra": {
|
|
"Average Score": 38.86009773073664,
|
|
"Standard Deviation": null,
|
|
"Rank": 46
|
|
},
|
|
"Geometry": {
|
|
"Average Score": 26.65205080537627,
|
|
"Standard Deviation": null,
|
|
"Rank": 43
|
|
},
|
|
"Probability": {
|
|
"Average Score": 28.77646355213561,
|
|
"Standard Deviation": null,
|
|
"Rank": 45
|
|
},
|
|
"OVERALL": {
|
|
"Average Score": 20.14565641258473,
|
|
"Standard Deviation": 2.3679638882398857,
|
|
"Rank": 54
|
|
}
|
|
}
|
|
},
|
|
{
|
|
"config": {
|
|
"model_name": "llama-3.2-1b-it",
|
|
"organization": "Unknown",
|
|
"license": "Unknown",
|
|
"knowledge_cutoff": "Unknown"
|
|
},
|
|
"results": {
|
|
"Physics": {
|
|
"Average Score": 13.730639722217427,
|
|
"Standard Deviation": null,
|
|
"Rank": 54
|
|
},
|
|
"Biology": {
|
|
"Average Score": 25.09504378386352,
|
|
"Standard Deviation": null,
|
|
"Rank": 52
|
|
},
|
|
"Chemistry": {
|
|
"Average Score": 22.71076097859151,
|
|
"Standard Deviation": null,
|
|
"Rank": 48
|
|
},
|
|
"Social": {
|
|
"Average Score": 20.34042449083379,
|
|
"Standard Deviation": null,
|
|
"Rank": 52
|
|
},
|
|
"Logical": {
|
|
"Average Score": 15.338736069283176,
|
|
"Standard Deviation": null,
|
|
"Rank": 52
|
|
},
|
|
"Algebra": {
|
|
"Average Score": 43.69053020706735,
|
|
"Standard Deviation": null,
|
|
"Rank": 40
|
|
},
|
|
"Geometry": {
|
|
"Average Score": 25.35058286701741,
|
|
"Standard Deviation": null,
|
|
"Rank": 46
|
|
},
|
|
"Probability": {
|
|
"Average Score": 28.620674481486535,
|
|
"Standard Deviation": null,
|
|
"Rank": 46
|
|
},
|
|
"OVERALL": {
|
|
"Average Score": 24.93401522355894,
|
|
"Standard Deviation": 2.6710490374694014,
|
|
"Rank": 46
|
|
}
|
|
}
|
|
}
|
|
] |