diff --git "a/README.md" "b/README.md" --- "a/README.md" +++ "b/README.md" @@ -90,80 +90,80 @@ TESTS: TQA_mc | 0.356 | 0.701 | 0.767 | 0.640 | 0.403 | 0.558 | 0.648 | 0.564 | 0.555 | 0.561 | 0.549 | 0.542 | 0.624 | 0.547 | 0.643 | 0.621 | 0.581 | 0.742 | 0.503 | 0.657 | - | 0.563 | 0.709 | TQA_tf | 0.510 | 0.692 | 0.725 | 0.457 | 0.421 | 0.473 | 0.593 | 0.512 | 0.566 | 0.572 | 0.548 | 0.479 | 0.548 | 0.536 | 0.541 | 0.483 | 0.487 | 0.670 | 0.435 | 0.568 | - | 0.487 | 0.485 | TQA | 0.492 | 0.693 | 0.730 | 0.478 | 0.419 | 0.483 | 0.599 | 0.518 | 0.565 | 0.571 | 0.548 | 0.486 | 0.556 | 0.537 | 0.553 | 0.499 | 0.498 | 0.679 | 0.442 | 0.578 | - | 0.496 | 0.511 | - ARC_challenge | 0.671 | 0.882 | 0.897 | 0.853 | - | 0.740 | 0.812 | 0.776 | 0.706 | 0.775 | 0.688 | 0.773 | 0.797 | 0.819 | 0.833 | 0.813 | 0.802 | 0.888 | 0.818 | 0.851 | - | 0.755 | 0.871 | - ARC_easy | 0.846 | 0.952 | 0.963 | 0.940 | - | 0.893 | 0.936 | 0.906 | 0.843 | 0.910 | 0.843 | 0.908 | 0.910 | 0.914 | 0.935 | 0.934 | 0.932 | 0.965 | 0.923 | 0.946 | - | 0.899 | 0.962 | - ARC | 0.788 | 0.929 | 0.941 | 0.911 | - | 0.842 | 0.895 | 0.863 | 0.798 | 0.866 | 0.792 | 0.864 | 0.873 | 0.883 | 0.901 | 0.894 | 0.889 | 0.939 | 0.888 | 0.915 | - | 0.851 | 0.932 | - RACE_high | 0.580 | 0.802 | 0.833 | 0.787 | - | 0.641 | 0.826 | 0.679 | 0.589 | 0.736 | 0.607 | 0.726 | 0.771 | 0.773 | 0.648 | 0.613 | 0.625 | 0.779 | 0.779 | 0.788 | - | 0.741 | 0.764 | - RACE_middle | 0.610 | 0.849 | 0.883 | 0.825 | - | 0.737 | 0.863 | 0.734 | 0.680 | 0.800 | 0.696 | 0.782 | 0.807 | 0.834 | 0.722 | 0.706 | 0.692 | 0.832 | 0.827 | 0.853 | - | 0.809 | 0.824 | - RACE | 0.589 | 0.816 | 0.847 | 0.798 | - | 0.669 | 0.837 | 0.695 | 0.615 | 0.755 | 0.633 | 0.743 | 0.781 | 0.791 | 0.670 | 0.640 | 0.645 | 0.795 | 0.793 | 0.807 | - | 0.761 | 0.781 | + ARC_challenge | 0.671 | 0.882 | 0.897 | 0.853 | 0.620 | 0.740 | 0.812 | 0.776 | 0.706 | 0.775 | 0.688 | 0.773 | 0.797 | 0.819 | 0.833 | 0.813 | 0.802 | 0.888 | 0.818 | 0.851 | - | 0.755 | 0.871 | + ARC_easy | 0.846 | 0.952 | 0.963 | 0.940 | 0.806 | 0.893 | 0.936 | 0.906 | 0.843 | 0.910 | 0.843 | 0.908 | 0.910 | 0.914 | 0.935 | 0.934 | 0.932 | 0.965 | 0.923 | 0.946 | - | 0.899 | 0.962 | + ARC | 0.788 | 0.929 | 0.941 | 0.911 | 0.744 | 0.842 | 0.895 | 0.863 | 0.798 | 0.866 | 0.792 | 0.864 | 0.873 | 0.883 | 0.901 | 0.894 | 0.889 | 0.939 | 0.888 | 0.915 | - | 0.851 | 0.932 | + RACE_high | 0.580 | 0.802 | 0.833 | 0.787 | 0.551 | 0.641 | 0.826 | 0.679 | 0.589 | 0.736 | 0.607 | 0.726 | 0.771 | 0.773 | 0.648 | 0.613 | 0.625 | 0.779 | 0.779 | 0.788 | - | 0.741 | 0.764 | + RACE_middle | 0.610 | 0.849 | 0.883 | 0.825 | 0.633 | 0.737 | 0.863 | 0.734 | 0.680 | 0.800 | 0.696 | 0.782 | 0.807 | 0.834 | 0.722 | 0.706 | 0.692 | 0.832 | 0.827 | 0.853 | - | 0.809 | 0.824 | + RACE | 0.589 | 0.816 | 0.847 | 0.798 | 0.575 | 0.669 | 0.837 | 0.695 | 0.615 | 0.755 | 0.633 | 0.743 | 0.781 | 0.791 | 0.670 | 0.640 | 0.645 | 0.795 | 0.793 | 0.807 | - | 0.761 | 0.781 | MMLU - abstract_algebra | 0.140 | 0.330 | 0.310 | 0.210 | - | 0.210 | 0.230 | 0.200 | 0.270 | 0.210 | 0.190 | 0.330 | 0.220 | 0.170 | 0.250 | 0.300 | 0.210 | 0.390 | 0.480 | 0.430 | - | 0.140 | 0.340 | - anatomy | 0.414 | 0.626 | 0.607 | 0.511 | - | 0.474 | 0.614 | 0.555 | 0.540 | 0.540 | 0.447 | 0.555 | 0.552 | 0.537 | 0.577 | 0.570 | 0.585 | 0.666 | 0.488 | 0.622 | - | 0.477 | 0.607 | - astronomy | 0.467 | 0.760 | 0.828 | 0.651 | - | 0.638 | 0.723 | 0.677 | 0.565 | 0.671 | 0.573 | 0.651 | 0.620 | 0.646 | 0.677 | 0.703 | 0.703 | 0.796 | 0.723 | 0.769 | - | 0.586 | 0.756 | - business_ethics | 0.430 | 0.620 | 0.670 | 0.610 | - | 0.480 | 0.640 | 0.550 | 0.480 | 0.540 | 0.520 | 0.630 | 0.540 | 0.530 | 0.620 | 0.620 | 0.620 | 0.710 | 0.670 | 0.710 | - | 0.570 | 0.740 | - clinical_knowledge | 0.550 | 0.743 | 0.788 | 0.622 | - | 0.600 | 0.716 | 0.675 | 0.592 | 0.664 | 0.581 | 0.664 | 0.637 | 0.649 | 0.686 | 0.713 | 0.698 | 0.750 | 0.686 | 0.713 | - | 0.577 | 0.735 | - college_biology | 0.625 | 0.854 | 0.895 | 0.715 | - | 0.666 | 0.701 | 0.722 | 0.625 | 0.708 | 0.625 | 0.694 | 0.631 | 0.659 | 0.791 | 0.805 | 0.763 | 0.819 | 0.743 | 0.784 | - | 0.618 | 0.833 | - college_chemistry | 0.330 | 0.470 | 0.430 | 0.380 | - | 0.330 | 0.410 | 0.400 | 0.310 | 0.370 | 0.350 | 0.340 | 0.380 | 0.400 | 0.450 | 0.460 | 0.430 | 0.440 | 0.380 | 0.490 | - | 0.330 | 0.450 | - college_computer_science | 0.290 | 0.460 | 0.580 | 0.480 | - | 0.400 | 0.490 | 0.400 | 0.350 | 0.410 | 0.320 | 0.400 | 0.440 | 0.400 | 0.440 | 0.480 | 0.410 | 0.510 | 0.520 | 0.590 | - | 0.370 | 0.480 | - college_mathematics | 0.100 | 0.260 | 0.300 | 0.280 | - | 0.160 | 0.230 | 0.260 | 0.210 | 0.180 | 0.180 | 0.180 | 0.200 | 0.200 | 0.200 | 0.270 | 0.170 | 0.340 | 0.260 | 0.350 | - | 0.170 | 0.310 | - college_medicine | 0.491 | 0.658 | 0.716 | 0.589 | - | 0.526 | 0.606 | 0.589 | 0.491 | 0.543 | 0.456 | 0.572 | 0.566 | 0.543 | 0.572 | 0.612 | 0.566 | 0.682 | 0.589 | 0.624 | - | 0.485 | 0.653 | - college_physics | 0.235 | 0.352 | 0.421 | 0.323 | - | 0.235 | 0.362 | 0.313 | 0.303 | 0.264 | 0.254 | 0.245 | 0.205 | 0.303 | 0.352 | 0.333 | 0.294 | 0.372 | 0.343 | 0.372 | - | 0.245 | 0.382 | - computer_security | 0.580 | 0.730 | 0.710 | 0.730 | - | 0.680 | 0.690 | 0.690 | 0.620 | 0.640 | 0.600 | 0.680 | 0.670 | 0.660 | 0.680 | 0.700 | 0.650 | 0.700 | 0.650 | 0.710 | - | 0.610 | 0.700 | - conceptual_physics | 0.395 | 0.638 | 0.727 | 0.587 | - | 0.468 | 0.612 | 0.463 | 0.361 | 0.404 | 0.365 | 0.446 | 0.468 | 0.472 | 0.565 | 0.565 | 0.553 | 0.685 | 0.561 | 0.642 | - | 0.442 | 0.693 | - econometrics | 0.271 | 0.557 | 0.587 | 0.464 | - | 0.377 | 0.482 | 0.482 | 0.359 | 0.333 | 0.318 | 0.429 | 0.362 | 0.424 | 0.456 | 0.456 | 0.421 | 0.543 | 0.535 | 0.596 | - | 0.345 | 0.526 | - electrical_engineering | 0.462 | 0.558 | 0.593 | 0.572 | - | 0.420 | 0.544 | 0.524 | 0.462 | 0.455 | 0.393 | 0.482 | 0.468 | 0.510 | 0.468 | 0.496 | 0.475 | 0.565 | 0.606 | 0.606 | - | 0.324 | 0.586 | - elementary_mathematics | 0.261 | 0.476 | 0.476 | 0.373 | - | 0.283 | 0.529 | 0.357 | 0.280 | 0.309 | 0.222 | 0.312 | 0.283 | 0.304 | 0.373 | 0.423 | 0.388 | 0.537 | 0.481 | 0.568 | - | 0.304 | 0.455 | - formal_logic | 0.214 | 0.293 | 0.468 | 0.357 | - | 0.317 | 0.396 | 0.420 | 0.253 | 0.349 | 0.277 | 0.396 | 0.261 | 0.261 | 0.412 | 0.452 | 0.380 | 0.523 | 0.420 | 0.428 | - | 0.190 | 0.484 | - global_facts | 0.100 | 0.330 | 0.370 | 0.240 | - | 0.220 | 0.390 | 0.150 | 0.110 | 0.200 | 0.160 | 0.280 | 0.160 | 0.210 | 0.220 | 0.240 | 0.130 | 0.360 | 0.300 | 0.260 | - | 0.220 | 0.240 | - high_school_biology | 0.651 | 0.851 | 0.890 | 0.809 | - | 0.696 | 0.790 | 0.729 | 0.677 | 0.741 | 0.654 | 0.748 | 0.677 | 0.706 | 0.774 | 0.793 | 0.774 | 0.861 | 0.761 | 0.806 | - | 0.670 | 0.858 | - high_school_chemistry | 0.315 | 0.586 | 0.600 | 0.517 | - | 0.408 | 0.522 | 0.467 | 0.433 | 0.389 | 0.310 | 0.379 | 0.384 | 0.428 | 0.482 | 0.512 | 0.492 | 0.551 | 0.467 | 0.536 | - | 0.315 | 0.581 | - high_school_computer_science | 0.440 | 0.710 | 0.770 | 0.660 | - | 0.620 | 0.700 | 0.610 | 0.540 | 0.610 | 0.490 | 0.630 | 0.580 | 0.560 | 0.610 | 0.610 | 0.580 | 0.690 | 0.710 | 0.770 | - | 0.560 | 0.720 | - high_school_european_history | 0.672 | 0.806 | 0.830 | 0.830 | - | 0.739 | 0.751 | 0.709 | 0.672 | 0.696 | 0.678 | 0.709 | 0.733 | 0.733 | 0.690 | 0.727 | 0.672 | 0.806 | 0.751 | 0.800 | - | 0.745 | 0.787 | - high_school_geography | 0.676 | 0.878 | 0.888 | 0.818 | - | 0.737 | 0.843 | 0.757 | 0.671 | 0.727 | 0.671 | 0.752 | 0.727 | 0.732 | 0.747 | 0.792 | 0.737 | 0.843 | 0.797 | 0.833 | - | 0.717 | 0.843 | - high_school_government_and_politics | 0.730 | 0.926 | 0.963 | 0.870 | - | 0.829 | 0.865 | 0.818 | 0.725 | 0.849 | 0.805 | 0.875 | 0.863 | 0.836 | 0.875 | 0.849 | 0.834 | 0.937 | 0.865 | 0.917 | - | 0.805 | 0.917 | - high_school_macroeconomics | 0.487 | 0.717 | 0.758 | 0.653 | - | 0.520 | 0.633 | 0.556 | 0.497 | 0.525 | 0.478 | 0.528 | 0.532 | 0.521 | 0.635 | 0.646 | 0.635 | 0.756 | 0.687 | 0.684 | - | 0.496 | 0.710 | - high_school_mathematics | 0.200 | 0.277 | 0.325 | 0.240 | - | 0.177 | 0.348 | 0.255 | 0.233 | 0.270 | 0.162 | 0.162 | 0.237 | 0.203 | 0.203 | 0.214 | 0.203 | 0.281 | 0.351 | 0.422 | - | 0.174 | 0.266 | - high_school_microeconomics | 0.521 | 0.801 | 0.852 | 0.773 | - | 0.630 | 0.714 | 0.684 | 0.575 | 0.609 | 0.540 | 0.630 | 0.603 | 0.654 | 0.773 | 0.794 | 0.743 | 0.886 | 0.802 | 0.827 | - | 0.594 | 0.848 | - high_school_physics | 0.218 | 0.423 | 0.496 | 0.364 | - | 0.231 | 0.337 | 0.317 | 0.211 | 0.284 | 0.165 | 0.251 | 0.245 | 0.331 | 0.384 | 0.377 | 0.384 | 0.463 | 0.298 | 0.456 | - | 0.192 | 0.456 | - high_school_psychology | 0.761 | 0.896 | 0.910 | 0.858 | - | 0.788 | 0.838 | 0.834 | 0.761 | 0.809 | 0.764 | 0.814 | 0.817 | 0.797 | 0.823 | 0.855 | 0.844 | 0.884 | 0.827 | 0.856 | - | 0.779 | 0.880 | - high_school_statistics | 0.347 | 0.574 | 0.615 | 0.500 | - | 0.393 | 0.490 | 0.462 | 0.342 | 0.467 | 0.361 | 0.476 | 0.402 | 0.421 | 0.472 | 0.569 | 0.523 | 0.615 | 0.550 | 0.648 | - | 0.407 | 0.555 | - high_school_us_history | 0.656 | 0.829 | 0.867 | 0.867 | - | 0.740 | 0.759 | 0.784 | 0.696 | 0.823 | 0.699 | 0.799 | 0.782 | 0.792 | 0.764 | 0.759 | 0.735 | 0.833 | 0.803 | 0.852 | - | 0.803 | 0.857 | - high_school_world_history | 0.700 | 0.872 | 0.881 | 0.827 | - | 0.763 | 0.780 | 0.789 | 0.725 | 0.776 | 0.720 | 0.797 | 0.750 | 0.826 | 0.729 | 0.746 | 0.742 | 0.835 | 0.805 | 0.827 | - | 0.783 | 0.848 | - human_aging | 0.497 | 0.690 | 0.739 | 0.591 | - | 0.565 | 0.650 | 0.618 | 0.569 | 0.605 | 0.542 | 0.609 | 0.632 | 0.623 | 0.596 | 0.582 | 0.547 | 0.672 | 0.609 | 0.690 | - | 0.587 | 0.695 | - human_sexuality | 0.519 | 0.746 | 0.755 | 0.633 | - | 0.702 | 0.702 | 0.671 | 0.587 | 0.679 | 0.569 | 0.618 | 0.615 | 0.646 | 0.618 | 0.664 | 0.587 | 0.748 | 0.648 | 0.717 | - | 0.584 | 0.770 | - international_law | 0.644 | 0.801 | 0.760 | 0.752 | - | 0.685 | 0.785 | 0.776 | 0.710 | 0.752 | 0.710 | 0.694 | 0.768 | 0.743 | 0.694 | 0.735 | 0.727 | 0.826 | 0.776 | 0.785 | - | 0.685 | 0.859 | - jurisprudence | 0.611 | 0.785 | 0.833 | 0.722 | - | 0.685 | 0.712 | 0.731 | 0.574 | 0.722 | 0.626 | 0.750 | 0.719 | 0.719 | 0.722 | 0.722 | 0.750 | 0.787 | 0.787 | 0.750 | - | 0.654 | 0.824 | - logical_fallacies | 0.625 | 0.811 | 0.797 | 0.754 | - | 0.711 | 0.785 | 0.736 | 0.687 | 0.705 | 0.660 | 0.730 | 0.666 | 0.691 | 0.791 | 0.785 | 0.754 | 0.852 | 0.736 | 0.766 | - | 0.641 | 0.809 | - machine_learning | 0.241 | 0.437 | 0.571 | 0.401 | - | 0.357 | 0.464 | 0.366 | 0.285 | 0.366 | 0.321 | 0.366 | 0.366 | 0.348 | 0.383 | 0.437 | 0.375 | 0.500 | 0.383 | 0.410 | - | 0.312 | 0.526 | - management | 0.737 | 0.825 | 0.844 | 0.766 | - | 0.708 | 0.815 | 0.737 | 0.669 | 0.766 | 0.708 | 0.699 | 0.737 | 0.737 | 0.786 | 0.786 | 0.776 | 0.815 | 0.737 | 0.825 | - | 0.747 | 0.786 | - marketing | 0.760 | 0.863 | 0.893 | 0.858 | - | 0.799 | 0.829 | 0.837 | 0.799 | 0.794 | 0.756 | 0.811 | 0.833 | 0.816 | 0.824 | 0.820 | 0.803 | 0.880 | 0.841 | 0.893 | - | 0.782 | 0.846 | - medical_genetics | 0.580 | 0.780 | 0.810 | 0.640 | - | 0.660 | 0.690 | 0.720 | 0.660 | 0.660 | 0.600 | 0.740 | 0.630 | 0.660 | 0.710 | 0.710 | 0.700 | 0.830 | 0.690 | 0.770 | - | 0.640 | 0.820 | - miscellaneous | 0.698 | 0.830 | 0.854 | 0.796 | - | 0.777 | 0.787 | 0.773 | 0.736 | 0.759 | 0.727 | 0.782 | 0.766 | 0.756 | 0.756 | 0.777 | 0.759 | 0.837 | 0.814 | 0.814 | - | 0.746 | 0.828 | - moral_disputes | 0.526 | 0.680 | 0.736 | 0.612 | - | 0.595 | 0.589 | 0.621 | 0.560 | 0.572 | 0.524 | 0.552 | 0.598 | 0.645 | 0.635 | 0.615 | 0.621 | 0.696 | 0.658 | 0.676 | - | 0.554 | 0.708 | - moral_scenarios | 0.227 | 0.325 | 0.366 | 0.360 | - | 0.177 | 0.280 | 0.205 | 0.410 | 0.246 | 0.122 | 0.226 | 0.229 | 0.327 | 0.288 | 0.366 | 0.404 | 0.538 | 0.336 | 0.368 | - | 0.188 | 0.477 | - nutrition | 0.591 | 0.683 | 0.758 | 0.653 | - | 0.611 | 0.650 | 0.689 | 0.620 | 0.647 | 0.555 | 0.614 | 0.624 | 0.633 | 0.630 | 0.669 | 0.620 | 0.751 | 0.692 | 0.745 | - | 0.575 | 0.722 | - philosophy | 0.527 | 0.658 | 0.713 | 0.659 | - | 0.627 | 0.636 | 0.617 | 0.578 | 0.598 | 0.587 | 0.633 | 0.612 | 0.580 | 0.598 | 0.630 | 0.588 | 0.704 | 0.646 | 0.688 | - | 0.554 | 0.717 | - prehistory | 0.518 | 0.728 | 0.783 | 0.663 | - | 0.635 | 0.669 | 0.700 | 0.604 | 0.623 | 0.580 | 0.675 | 0.697 | 0.648 | 0.675 | 0.697 | 0.663 | 0.774 | 0.675 | 0.756 | - | 0.595 | 0.783 | - professional_accounting | 0.326 | 0.496 | 0.514 | 0.425 | - | 0.354 | 0.453 | 0.393 | 0.336 | 0.382 | 0.336 | 0.421 | 0.361 | 0.382 | 0.397 | 0.418 | 0.386 | 0.578 | 0.443 | 0.460 | - | 0.358 | 0.514 | - professional_law | 0.307 | 0.478 | 0.528 | 0.408 | - | 0.384 | 0.359 | 0.397 | 0.369 | 0.383 | 0.333 | 0.379 | 0.399 | 0.383 | 0.405 | 0.410 | 0.401 | 0.498 | 0.423 | 0.402 | - | 0.350 | 0.481 | - professional_medicine | 0.485 | 0.756 | 0.794 | 0.680 | - | 0.580 | 0.665 | 0.724 | 0.713 | 0.672 | 0.564 | 0.705 | 0.642 | 0.619 | 0.643 | 0.687 | 0.658 | 0.794 | 0.658 | 0.683 | - | 0.645 | 0.794 | - professional_psychology | 0.477 | 0.728 | 0.805 | 0.609 | - | 0.535 | 0.599 | 0.642 | 0.509 | 0.565 | 0.521 | 0.602 | 0.560 | 0.588 | 0.638 | 0.655 | 0.617 | 0.764 | 0.671 | 0.702 | - | 0.529 | 0.759 | - public_relations | 0.563 | 0.700 | 0.672 | 0.627 | - | 0.581 | 0.636 | 0.518 | 0.545 | 0.581 | 0.554 | 0.627 | 0.581 | 0.518 | 0.627 | 0.554 | 0.572 | 0.672 | 0.636 | 0.645 | - | 0.554 | 0.581 | - security_studies | 0.616 | 0.746 | 0.763 | 0.632 | - | 0.648 | 0.759 | 0.665 | 0.616 | 0.673 | 0.600 | 0.608 | 0.612 | 0.628 | 0.697 | 0.669 | 0.673 | 0.738 | 0.665 | 0.718 | - | 0.575 | 0.730 | - sociology | 0.666 | 0.815 | 0.860 | 0.741 | - | 0.711 | 0.810 | 0.786 | 0.741 | 0.771 | 0.716 | 0.786 | 0.776 | 0.781 | 0.800 | 0.820 | 0.781 | 0.850 | 0.825 | 0.825 | - | 0.741 | 0.835 | - us_foreign_policy | 0.690 | 0.868 | 0.840 | 0.800 | - | 0.790 | 0.790 | 0.800 | 0.800 | 0.840 | 0.757 | 0.740 | 0.787 | 0.787 | 0.740 | 0.760 | 0.770 | 0.850 | 0.800 | 0.820 | - | 0.757 | 0.810 | - virology | 0.433 | 0.472 | 0.506 | 0.439 | - | 0.427 | 0.415 | 0.439 | 0.415 | 0.475 | 0.387 | 0.421 | 0.436 | 0.448 | 0.379 | 0.403 | 0.367 | 0.487 | 0.457 | 0.457 | - | 0.381 | 0.487 | - world_religions | 0.678 | 0.800 | 0.847 | 0.766 | - | 0.771 | 0.801 | 0.789 | 0.742 | 0.777 | 0.747 | 0.789 | 0.800 | 0.747 | 0.742 | 0.742 | 0.725 | 0.801 | 0.766 | 0.818 | - | 0.705 | 0.812 | - MMLU | 0.475 | 0.647 | 0.687 | 0.595 | - | 0.530 | 0.595 | 0.570 | 0.525 | 0.550 | 0.486 | 0.555 | 0.544 | 0.553 | 0.578 | 0.599 | 0.578 | 0.682 | 0.610 | 0.643 | - | 0.509 | 0.666 | + abstract_algebra | 0.140 | 0.330 | 0.310 | 0.210 | 0.180 | 0.210 | 0.230 | 0.200 | 0.270 | 0.210 | 0.190 | 0.330 | 0.220 | 0.170 | 0.250 | 0.300 | 0.210 | 0.390 | 0.480 | 0.430 | - | 0.140 | 0.340 | + anatomy | 0.414 | 0.626 | 0.607 | 0.511 | 0.362 | 0.474 | 0.614 | 0.555 | 0.540 | 0.540 | 0.447 | 0.555 | 0.552 | 0.537 | 0.577 | 0.570 | 0.585 | 0.666 | 0.488 | 0.622 | - | 0.477 | 0.607 | + astronomy | 0.467 | 0.760 | 0.828 | 0.651 | 0.526 | 0.638 | 0.723 | 0.677 | 0.565 | 0.671 | 0.573 | 0.651 | 0.620 | 0.646 | 0.677 | 0.703 | 0.703 | 0.796 | 0.723 | 0.769 | - | 0.586 | 0.756 | + business_ethics | 0.430 | 0.620 | 0.670 | 0.610 | 0.450 | 0.480 | 0.640 | 0.550 | 0.480 | 0.540 | 0.520 | 0.630 | 0.540 | 0.530 | 0.620 | 0.620 | 0.620 | 0.710 | 0.670 | 0.710 | - | 0.570 | 0.740 | + clinical_knowledge | 0.550 | 0.743 | 0.788 | 0.622 | 0.494 | 0.600 | 0.716 | 0.675 | 0.592 | 0.664 | 0.581 | 0.664 | 0.637 | 0.649 | 0.686 | 0.713 | 0.698 | 0.750 | 0.686 | 0.713 | - | 0.577 | 0.735 | + college_biology | 0.625 | 0.854 | 0.895 | 0.715 | 0.486 | 0.666 | 0.701 | 0.722 | 0.625 | 0.708 | 0.625 | 0.694 | 0.631 | 0.659 | 0.791 | 0.805 | 0.763 | 0.819 | 0.743 | 0.784 | - | 0.618 | 0.833 | + college_chemistry | 0.330 | 0.470 | 0.430 | 0.380 | 0.320 | 0.330 | 0.410 | 0.400 | 0.310 | 0.370 | 0.350 | 0.340 | 0.380 | 0.400 | 0.450 | 0.460 | 0.430 | 0.440 | 0.380 | 0.490 | - | 0.330 | 0.450 | + college_computer_science | 0.290 | 0.460 | 0.580 | 0.480 | 0.250 | 0.400 | 0.490 | 0.400 | 0.350 | 0.410 | 0.320 | 0.400 | 0.440 | 0.400 | 0.440 | 0.480 | 0.410 | 0.510 | 0.520 | 0.590 | - | 0.370 | 0.480 | + college_mathematics | 0.100 | 0.260 | 0.300 | 0.280 | 0.150 | 0.160 | 0.230 | 0.260 | 0.210 | 0.180 | 0.180 | 0.180 | 0.200 | 0.200 | 0.200 | 0.270 | 0.170 | 0.340 | 0.260 | 0.350 | - | 0.170 | 0.310 | + college_medicine | 0.491 | 0.658 | 0.716 | 0.589 | 0.439 | 0.526 | 0.606 | 0.589 | 0.491 | 0.543 | 0.456 | 0.572 | 0.566 | 0.543 | 0.572 | 0.612 | 0.566 | 0.682 | 0.589 | 0.624 | - | 0.485 | 0.653 | + college_physics | 0.235 | 0.352 | 0.421 | 0.323 | 0.215 | 0.235 | 0.362 | 0.313 | 0.303 | 0.264 | 0.254 | 0.245 | 0.205 | 0.303 | 0.352 | 0.333 | 0.294 | 0.372 | 0.343 | 0.372 | - | 0.245 | 0.382 | + computer_security | 0.580 | 0.730 | 0.710 | 0.730 | 0.640 | 0.680 | 0.690 | 0.690 | 0.620 | 0.640 | 0.600 | 0.680 | 0.670 | 0.660 | 0.680 | 0.700 | 0.650 | 0.700 | 0.650 | 0.710 | - | 0.610 | 0.700 | + conceptual_physics | 0.395 | 0.638 | 0.727 | 0.587 | 0.344 | 0.468 | 0.612 | 0.463 | 0.361 | 0.404 | 0.365 | 0.446 | 0.468 | 0.472 | 0.565 | 0.565 | 0.553 | 0.685 | 0.561 | 0.642 | - | 0.442 | 0.693 | + econometrics | 0.271 | 0.557 | 0.587 | 0.464 | 0.219 | 0.377 | 0.482 | 0.482 | 0.359 | 0.333 | 0.318 | 0.429 | 0.362 | 0.424 | 0.456 | 0.456 | 0.421 | 0.543 | 0.535 | 0.596 | - | 0.345 | 0.526 | + electrical_engineering | 0.462 | 0.558 | 0.593 | 0.572 | 0.351 | 0.420 | 0.544 | 0.524 | 0.462 | 0.455 | 0.393 | 0.482 | 0.468 | 0.510 | 0.468 | 0.496 | 0.475 | 0.565 | 0.606 | 0.606 | - | 0.324 | 0.586 | + elementary_mathematics | 0.261 | 0.476 | 0.476 | 0.373 | 0.174 | 0.283 | 0.529 | 0.357 | 0.280 | 0.309 | 0.222 | 0.312 | 0.283 | 0.304 | 0.373 | 0.423 | 0.388 | 0.537 | 0.481 | 0.568 | - | 0.304 | 0.455 | + formal_logic | 0.214 | 0.293 | 0.468 | 0.357 | 0.238 | 0.317 | 0.396 | 0.420 | 0.253 | 0.349 | 0.277 | 0.396 | 0.261 | 0.261 | 0.412 | 0.452 | 0.380 | 0.523 | 0.420 | 0.428 | - | 0.190 | 0.484 | + global_facts | 0.100 | 0.330 | 0.370 | 0.240 | 0.120 | 0.220 | 0.390 | 0.150 | 0.110 | 0.200 | 0.160 | 0.280 | 0.160 | 0.210 | 0.220 | 0.240 | 0.130 | 0.360 | 0.300 | 0.260 | - | 0.220 | 0.240 | + high_school_biology | 0.651 | 0.851 | 0.890 | 0.809 | 0.574 | 0.696 | 0.790 | 0.729 | 0.677 | 0.741 | 0.654 | 0.748 | 0.677 | 0.706 | 0.774 | 0.793 | 0.774 | 0.861 | 0.761 | 0.806 | - | 0.670 | 0.858 | + high_school_chemistry | 0.315 | 0.586 | 0.600 | 0.517 | 0.295 | 0.408 | 0.522 | 0.467 | 0.433 | 0.389 | 0.310 | 0.379 | 0.384 | 0.428 | 0.482 | 0.512 | 0.492 | 0.551 | 0.467 | 0.536 | - | 0.315 | 0.581 | + high_school_computer_science | 0.440 | 0.710 | 0.770 | 0.660 | 0.500 | 0.620 | 0.700 | 0.610 | 0.540 | 0.610 | 0.490 | 0.630 | 0.580 | 0.560 | 0.610 | 0.610 | 0.580 | 0.690 | 0.710 | 0.770 | - | 0.560 | 0.720 | + high_school_european_history | 0.672 | 0.806 | 0.830 | 0.830 | 0.618 | 0.739 | 0.751 | 0.709 | 0.672 | 0.696 | 0.678 | 0.709 | 0.733 | 0.733 | 0.690 | 0.727 | 0.672 | 0.806 | 0.751 | 0.800 | - | 0.745 | 0.787 | + high_school_geography | 0.676 | 0.878 | 0.888 | 0.818 | 0.565 | 0.737 | 0.843 | 0.757 | 0.671 | 0.727 | 0.671 | 0.752 | 0.727 | 0.732 | 0.747 | 0.792 | 0.737 | 0.843 | 0.797 | 0.833 | - | 0.717 | 0.843 | + high_school_government_and_politics | 0.730 | 0.926 | 0.963 | 0.870 | 0.704 | 0.829 | 0.865 | 0.818 | 0.725 | 0.849 | 0.805 | 0.875 | 0.863 | 0.836 | 0.875 | 0.849 | 0.834 | 0.937 | 0.865 | 0.917 | - | 0.805 | 0.917 | + high_school_macroeconomics | 0.487 | 0.717 | 0.758 | 0.653 | 0.410 | 0.520 | 0.633 | 0.556 | 0.497 | 0.525 | 0.478 | 0.528 | 0.532 | 0.521 | 0.635 | 0.646 | 0.635 | 0.756 | 0.687 | 0.684 | - | 0.496 | 0.710 | + high_school_mathematics | 0.200 | 0.277 | 0.325 | 0.240 | 0.166 | 0.177 | 0.348 | 0.255 | 0.233 | 0.270 | 0.162 | 0.162 | 0.237 | 0.203 | 0.203 | 0.214 | 0.203 | 0.281 | 0.351 | 0.422 | - | 0.174 | 0.266 | + high_school_microeconomics | 0.521 | 0.801 | 0.852 | 0.773 | 0.470 | 0.630 | 0.714 | 0.684 | 0.575 | 0.609 | 0.540 | 0.630 | 0.603 | 0.654 | 0.773 | 0.794 | 0.743 | 0.886 | 0.802 | 0.827 | - | 0.594 | 0.848 | + high_school_physics | 0.218 | 0.423 | 0.496 | 0.364 | 0.145 | 0.231 | 0.337 | 0.317 | 0.211 | 0.284 | 0.165 | 0.251 | 0.245 | 0.331 | 0.384 | 0.377 | 0.384 | 0.463 | 0.298 | 0.456 | - | 0.192 | 0.456 | + high_school_psychology | 0.761 | 0.896 | 0.910 | 0.858 | 0.669 | 0.788 | 0.838 | 0.834 | 0.761 | 0.809 | 0.764 | 0.814 | 0.817 | 0.797 | 0.823 | 0.855 | 0.844 | 0.884 | 0.827 | 0.856 | - | 0.779 | 0.880 | + high_school_statistics | 0.347 | 0.574 | 0.615 | 0.500 | 0.324 | 0.393 | 0.490 | 0.462 | 0.342 | 0.467 | 0.361 | 0.476 | 0.402 | 0.421 | 0.472 | 0.569 | 0.523 | 0.615 | 0.550 | 0.648 | - | 0.407 | 0.555 | + high_school_us_history | 0.656 | 0.829 | 0.867 | 0.867 | 0.602 | 0.740 | 0.759 | 0.784 | 0.696 | 0.823 | 0.699 | 0.799 | 0.782 | 0.792 | 0.764 | 0.759 | 0.735 | 0.833 | 0.803 | 0.852 | - | 0.803 | 0.857 | + high_school_world_history | 0.700 | 0.872 | 0.881 | 0.827 | 0.632 | 0.763 | 0.780 | 0.789 | 0.725 | 0.776 | 0.720 | 0.797 | 0.750 | 0.826 | 0.729 | 0.746 | 0.742 | 0.835 | 0.805 | 0.827 | - | 0.783 | 0.848 | + human_aging | 0.497 | 0.690 | 0.739 | 0.591 | 0.502 | 0.565 | 0.650 | 0.618 | 0.569 | 0.605 | 0.542 | 0.609 | 0.632 | 0.623 | 0.596 | 0.582 | 0.547 | 0.672 | 0.609 | 0.690 | - | 0.587 | 0.695 | + human_sexuality | 0.519 | 0.746 | 0.755 | 0.633 | 0.557 | 0.702 | 0.702 | 0.671 | 0.587 | 0.679 | 0.569 | 0.618 | 0.615 | 0.646 | 0.618 | 0.664 | 0.587 | 0.748 | 0.648 | 0.717 | - | 0.584 | 0.770 | + international_law | 0.644 | 0.801 | 0.760 | 0.752 | 0.636 | 0.685 | 0.785 | 0.776 | 0.710 | 0.752 | 0.710 | 0.694 | 0.768 | 0.743 | 0.694 | 0.735 | 0.727 | 0.826 | 0.776 | 0.785 | - | 0.685 | 0.859 | + jurisprudence | 0.611 | 0.785 | 0.833 | 0.722 | 0.629 | 0.685 | 0.712 | 0.731 | 0.574 | 0.722 | 0.626 | 0.750 | 0.719 | 0.719 | 0.722 | 0.722 | 0.750 | 0.787 | 0.787 | 0.750 | - | 0.654 | 0.824 | + logical_fallacies | 0.625 | 0.811 | 0.797 | 0.754 | 0.656 | 0.711 | 0.785 | 0.736 | 0.687 | 0.705 | 0.660 | 0.730 | 0.666 | 0.691 | 0.791 | 0.785 | 0.754 | 0.852 | 0.736 | 0.766 | - | 0.641 | 0.809 | + machine_learning | 0.241 | 0.437 | 0.571 | 0.401 | 0.267 | 0.357 | 0.464 | 0.366 | 0.285 | 0.366 | 0.321 | 0.366 | 0.366 | 0.348 | 0.383 | 0.437 | 0.375 | 0.500 | 0.383 | 0.410 | - | 0.312 | 0.526 | + management | 0.737 | 0.825 | 0.844 | 0.766 | 0.572 | 0.708 | 0.815 | 0.737 | 0.669 | 0.766 | 0.708 | 0.699 | 0.737 | 0.737 | 0.786 | 0.786 | 0.776 | 0.815 | 0.737 | 0.825 | - | 0.747 | 0.786 | + marketing | 0.760 | 0.863 | 0.893 | 0.858 | 0.735 | 0.799 | 0.829 | 0.837 | 0.799 | 0.794 | 0.756 | 0.811 | 0.833 | 0.816 | 0.824 | 0.820 | 0.803 | 0.880 | 0.841 | 0.893 | - | 0.782 | 0.846 | + medical_genetics | 0.580 | 0.780 | 0.810 | 0.640 | 0.580 | 0.660 | 0.690 | 0.720 | 0.660 | 0.660 | 0.600 | 0.740 | 0.630 | 0.660 | 0.710 | 0.710 | 0.700 | 0.830 | 0.690 | 0.770 | - | 0.640 | 0.820 | + miscellaneous | 0.698 | 0.830 | 0.854 | 0.796 | 0.650 | 0.777 | 0.787 | 0.773 | 0.736 | 0.759 | 0.727 | 0.782 | 0.766 | 0.756 | 0.756 | 0.777 | 0.759 | 0.837 | 0.814 | 0.814 | - | 0.746 | 0.828 | + moral_disputes | 0.526 | 0.680 | 0.736 | 0.612 | 0.456 | 0.595 | 0.589 | 0.621 | 0.560 | 0.572 | 0.524 | 0.552 | 0.598 | 0.645 | 0.635 | 0.615 | 0.621 | 0.696 | 0.658 | 0.676 | - | 0.554 | 0.708 | + moral_scenarios | 0.227 | 0.325 | 0.366 | 0.360 | 0.083 | 0.177 | 0.280 | 0.205 | 0.410 | 0.246 | 0.122 | 0.226 | 0.229 | 0.327 | 0.288 | 0.366 | 0.404 | 0.538 | 0.336 | 0.368 | - | 0.188 | 0.477 | + nutrition | 0.591 | 0.683 | 0.758 | 0.653 | 0.486 | 0.611 | 0.650 | 0.689 | 0.620 | 0.647 | 0.555 | 0.614 | 0.624 | 0.633 | 0.630 | 0.669 | 0.620 | 0.751 | 0.692 | 0.745 | - | 0.575 | 0.722 | + philosophy | 0.527 | 0.658 | 0.713 | 0.659 | 0.472 | 0.627 | 0.636 | 0.617 | 0.578 | 0.598 | 0.587 | 0.633 | 0.612 | 0.580 | 0.598 | 0.630 | 0.588 | 0.704 | 0.646 | 0.688 | - | 0.554 | 0.717 | + prehistory | 0.518 | 0.728 | 0.783 | 0.663 | 0.478 | 0.635 | 0.669 | 0.700 | 0.604 | 0.623 | 0.580 | 0.675 | 0.697 | 0.648 | 0.675 | 0.697 | 0.663 | 0.774 | 0.675 | 0.756 | - | 0.595 | 0.783 | + professional_accounting | 0.326 | 0.496 | 0.514 | 0.425 | 0.265 | 0.354 | 0.453 | 0.393 | 0.336 | 0.382 | 0.336 | 0.421 | 0.361 | 0.382 | 0.397 | 0.418 | 0.386 | 0.578 | 0.443 | 0.460 | - | 0.358 | 0.514 | + professional_law | 0.307 | 0.478 | 0.528 | 0.408 | 0.332 | 0.384 | 0.359 | 0.397 | 0.369 | 0.383 | 0.333 | 0.379 | 0.399 | 0.383 | 0.405 | 0.410 | 0.401 | 0.498 | 0.423 | 0.402 | - | 0.350 | 0.481 | + professional_medicine | 0.485 | 0.756 | 0.794 | 0.680 | 0.386 | 0.580 | 0.665 | 0.724 | 0.713 | 0.672 | 0.564 | 0.705 | 0.642 | 0.619 | 0.643 | 0.687 | 0.658 | 0.794 | 0.658 | 0.683 | - | 0.645 | 0.794 | + professional_psychology | 0.477 | 0.728 | 0.805 | 0.609 | 0.447 | 0.535 | 0.599 | 0.642 | 0.509 | 0.565 | 0.521 | 0.602 | 0.560 | 0.588 | 0.638 | 0.655 | 0.617 | 0.764 | 0.671 | 0.702 | - | 0.529 | 0.759 | + public_relations | 0.563 | 0.700 | 0.672 | 0.627 | 0.518 | 0.581 | 0.636 | 0.518 | 0.545 | 0.581 | 0.554 | 0.627 | 0.581 | 0.518 | 0.627 | 0.554 | 0.572 | 0.672 | 0.636 | 0.645 | - | 0.554 | 0.581 | + security_studies | 0.616 | 0.746 | 0.763 | 0.632 | 0.510 | 0.648 | 0.759 | 0.665 | 0.616 | 0.673 | 0.600 | 0.608 | 0.612 | 0.628 | 0.697 | 0.669 | 0.673 | 0.738 | 0.665 | 0.718 | - | 0.575 | 0.730 | + sociology | 0.666 | 0.815 | 0.860 | 0.741 | 0.626 | 0.711 | 0.810 | 0.786 | 0.741 | 0.771 | 0.716 | 0.786 | 0.776 | 0.781 | 0.800 | 0.820 | 0.781 | 0.850 | 0.825 | 0.825 | - | 0.741 | 0.835 | + us_foreign_policy | 0.690 | 0.868 | 0.840 | 0.800 | 0.690 | 0.790 | 0.790 | 0.800 | 0.800 | 0.840 | 0.757 | 0.740 | 0.787 | 0.787 | 0.740 | 0.760 | 0.770 | 0.850 | 0.800 | 0.820 | - | 0.757 | 0.810 | + virology | 0.433 | 0.472 | 0.506 | 0.439 | 0.367 | 0.427 | 0.415 | 0.439 | 0.415 | 0.475 | 0.387 | 0.421 | 0.436 | 0.448 | 0.379 | 0.403 | 0.367 | 0.487 | 0.457 | 0.457 | - | 0.381 | 0.487 | + world_religions | 0.678 | 0.800 | 0.847 | 0.766 | 0.643 | 0.771 | 0.801 | 0.789 | 0.742 | 0.777 | 0.747 | 0.789 | 0.800 | 0.747 | 0.742 | 0.742 | 0.725 | 0.801 | 0.766 | 0.818 | - | 0.705 | 0.812 | + MMLU | 0.475 | 0.647 | 0.687 | 0.595 | 0.429 | 0.530 | 0.595 | 0.570 | 0.525 | 0.550 | 0.486 | 0.555 | 0.544 | 0.553 | 0.578 | 0.599 | 0.578 | 0.682 | 0.610 | 0.643 | - | 0.509 | 0.666 | AGIEVAL - aquarat | 0.460 | 0.665 | 0.602 | 0.637 | - | 0.578 | 0.657 | 0.598 | 0.633 | 0.712 | 0.279 | 0.322 | 0.582 | 0.157 | 0.338 | 0.409 | 0.574 | 0.614 | 0.712 | 0.830 | - | 0.425 | 0.590 | - logiqa | 0.321 | 0.447 | 0.477 | 0.416 | - | 0.321 | 0.393 | 0.328 | 0.265 | 0.324 | 0.264 | 0.311 | 0.330 | 0.327 | 0.285 | 0.281 | 0.267 | 0.405 | 0.359 | 0.436 | - | 0.290 | 0.391 | - lsatar | 0.191 | 0.208 | 0.260 | 0.217 | - | 0.234 | 0.239 | 0.295 | 0.239 | 0.186 | 0.186 | 0.200 | 0.278 | 0.208 | 0.252 | 0.256 | 0.247 | 0.208 | 0.252 | 0.300 | - | 0.173 | 0.226 | - lsatlr | 0.337 | 0.635 | 0.654 | 0.515 | - | 0.417 | 0.513 | 0.441 | 0.327 | 0.447 | 0.366 | 0.445 | 0.523 | 0.570 | 0.429 | 0.415 | 0.386 | 0.598 | 0.456 | 0.603 | - | 0.449 | 0.576 | - lsatrc | 0.431 | 0.750 | 0.754 | 0.643 | - | 0.553 | 0.643 | 0.624 | 0.486 | 0.635 | 0.520 | 0.650 | 0.613 | 0.617 | 0.557 | 0.531 | 0.524 | 0.672 | 0.583 | 0.687 | - | 0.654 | 0.706 | - saten | 0.665 | 0.834 | 0.868 | 0.820 | - | 0.776 | 0.820 | 0.781 | 0.689 | 0.825 | 0.679 | 0.747 | 0.786 | 0.786 | 0.713 | 0.713 | 0.708 | 0.800 | 0.781 | 0.844 | - | 0.757 | 0.796 | - satmath | 0.627 | 0.886 | 0.768 | 0.868 | - | 0.763 | 0.904 | 0.618 | 0.845 | 0.886 | 0.400 | 0.395 | 0.690 | 0.413 | 0.509 | 0.713 | 0.754 | 0.727 | 0.900 | 0.963 | - | 0.540 | 0.768 | - AGIEVAL | 0.398 | 0.598 | 0.602 | 0.546 | - | 0.470 | 0.547 | 0.480 | 0.433 | 0.512 | 0.359 | 0.416 | 0.501 | 0.432 | 0.409 | 0.429 | 0.438 | 0.546 | 0.522 | 0.616 | - | 0.434 | 0.544 | + aquarat | 0.460 | 0.665 | 0.602 | 0.637 | 0.452 | 0.578 | 0.657 | 0.598 | 0.633 | 0.712 | 0.279 | 0.322 | 0.582 | 0.157 | 0.338 | 0.409 | 0.574 | 0.614 | 0.712 | 0.830 | - | 0.425 | 0.590 | + logiqa | 0.321 | 0.447 | 0.477 | 0.416 | 0.274 | 0.321 | 0.393 | 0.328 | 0.265 | 0.324 | 0.264 | 0.311 | 0.330 | 0.327 | 0.285 | 0.281 | 0.267 | 0.405 | 0.359 | 0.436 | - | 0.290 | 0.391 | + lsatar | 0.191 | 0.208 | 0.260 | 0.217 | 0.208 | 0.234 | 0.239 | 0.295 | 0.239 | 0.186 | 0.186 | 0.200 | 0.278 | 0.208 | 0.252 | 0.256 | 0.247 | 0.208 | 0.252 | 0.300 | - | 0.173 | 0.226 | + lsatlr | 0.337 | 0.635 | 0.654 | 0.515 | 0.296 | 0.417 | 0.513 | 0.441 | 0.327 | 0.447 | 0.366 | 0.445 | 0.523 | 0.570 | 0.429 | 0.415 | 0.386 | 0.598 | 0.456 | 0.603 | - | 0.449 | 0.576 | + lsatrc | 0.431 | 0.750 | 0.754 | 0.643 | 0.390 | 0.553 | 0.643 | 0.624 | 0.486 | 0.635 | 0.520 | 0.650 | 0.613 | 0.617 | 0.557 | 0.531 | 0.524 | 0.672 | 0.583 | 0.687 | - | 0.654 | 0.706 | + saten | 0.665 | 0.834 | 0.868 | 0.820 | 0.524 | 0.776 | 0.820 | 0.781 | 0.689 | 0.825 | 0.679 | 0.747 | 0.786 | 0.786 | 0.713 | 0.713 | 0.708 | 0.800 | 0.781 | 0.844 | - | 0.757 | 0.796 | + satmath | 0.627 | 0.886 | 0.768 | 0.868 | 0.631 | 0.763 | 0.904 | 0.618 | 0.845 | 0.886 | 0.400 | 0.395 | 0.690 | 0.413 | 0.509 | 0.713 | 0.754 | 0.727 | 0.900 | 0.963 | - | 0.540 | 0.768 | + AGIEVAL | 0.398 | 0.598 | 0.602 | 0.546 | 0.361 | 0.470 | 0.547 | 0.480 | 0.433 | 0.512 | 0.359 | 0.416 | 0.501 | 0.432 | 0.409 | 0.429 | 0.438 | 0.546 | 0.522 | 0.616 | - | 0.434 | 0.544 | AGIEVALC_biology | - | - | - | 0.765 | - | - | 0.830 | - | - | - | - | - | 0.304 | 0.408 | - | - | - | - | - | - | - | 0.356 | - | AGIEVALC_chemistry | - | - | - | 0.696 | - | - | 0.598 | - | - | - | - | - | 0.215 | 0.313 | - | - | - | - | - | - | - | 0.171 | - | AGIEVALC_chinese | - | - | - | 0.650 | - | - | 0.682 | - | - | - | - | - | 0.300 | 0.313 | - | - | - | - | - | - | - | 0.239 | - | @@ -178,68 +178,68 @@ TESTS: AGIEVALC_physics | - | - | - | 0.436 | - | - | 0.436 | - | - | - | - | - | 0.183 | 0.235 | - | - | - | - | - | - | - | 0.178 | - | AGIEVALC | - | - | - | 0.597 | - | - | 0.632 | - | - | - | - | - | 0.322 | 0.363 | - | - | - | - | - | - | - | 0.297 | - | BBH - boolean_expressions | 0.556 | 0.768 | 0.460 | 0.868 | - | 0.856 | 0.688 | 0.844 | 0.480 | 0.728 | 0.764 | 0.780 | 0.824 | 0.664 | 0.800 | 0.852 | 0.832 | 0.696 | 0.808 | 0.880 | - | 0.720 | 0.540 | - causal_judgement | 0.524 | 0.598 | 0.604 | 0.550 | - | 0.598 | 0.689 | 0.540 | 0.518 | 0.593 | 0.588 | 0.625 | 0.614 | 0.604 | 0.598 | 0.588 | 0.593 | 0.588 | 0.625 | 0.513 | - | 0.636 | 0.641 | - date_understanding | 0.592 | 0.748 | 0.788 | 0.572 | - | 0.676 | 0.832 | 0.716 | 0.664 | 0.772 | 0.548 | 0.668 | 0.592 | 0.608 | 0.568 | 0.696 | 0.576 | 0.780 | 0.544 | 0.740 | - | 0.556 | 0.724 | - disambiguation_qa | 0.532 | 0.660 | 0.720 | 0.636 | - | 0.656 | 0.732 | 0.516 | 0.472 | 0.644 | 0.600 | 0.596 | 0.728 | 0.704 | 0.592 | 0.720 | 0.752 | 0.692 | 0.660 | 0.636 | - | 0.576 | 0.640 | - dyck_languages | 0.476 | 0.728 | 0.600 | 0.544 | - | 0.700 | 0.728 | 0.796 | 0.680 | 0.756 | 0.744 | 0.712 | 0.664 | 0.732 | 0.424 | 0.580 | 0.468 | 0.532 | 0.756 | 0.836 | - | 0.684 | 0.572 | - formal_fallacies | 0.532 | 0.832 | 0.760 | 0.660 | - | 0.632 | 0.920 | 0.984 | 0.816 | 0.532 | 0.852 | 0.996 | 0.632 | 0.564 | 0.920 | 0.808 | 0.808 | 0.944 | 0.632 | 0.628 | - | 0.776 | 0.576 | - geometric_shapes | 0.204 | 0.436 | 0.420 | 0.400 | - | 0.400 | 0.840 | 0.440 | 0.416 | 0.520 | 0.288 | 0.404 | 0.348 | 0.344 | 0.248 | 0.416 | 0.292 | 0.328 | 0.356 | 0.604 | - | 0.268 | 0.400 | - hyperbaton | 0.704 | 0.884 | 0.836 | 0.824 | - | 0.692 | 0.928 | 0.880 | 0.624 | 0.804 | 0.656 | 0.644 | 0.828 | 0.724 | 0.940 | 0.936 | 0.936 | 0.952 | 0.704 | 0.792 | - | 0.744 | 0.900 | - logical_deduction_five_objects | 0.300 | 0.568 | 0.608 | 0.516 | - | 0.448 | 0.660 | 0.568 | 0.484 | 0.592 | 0.352 | 0.556 | 0.384 | 0.472 | 0.432 | 0.632 | 0.532 | 0.532 | 0.556 | 0.728 | - | 0.436 | 0.612 | - logical_deduction_seven_objects | 0.284 | 0.560 | 0.552 | 0.500 | - | 0.428 | 0.648 | 0.488 | 0.408 | 0.500 | 0.296 | 0.452 | 0.320 | 0.400 | 0.308 | 0.568 | 0.500 | 0.444 | 0.464 | 0.656 | - | 0.388 | 0.560 | - logical_deduction_three_objects | 0.440 | 0.844 | 0.892 | 0.840 | - | 0.668 | 0.896 | 0.804 | 0.652 | 0.844 | 0.608 | 0.800 | 0.664 | 0.620 | 0.688 | 0.844 | 0.804 | 0.884 | 0.736 | 0.956 | - | 0.664 | 0.888 | - movie_recommendation | 0.568 | 0.552 | 0.508 | 0.648 | - | 0.512 | 0.884 | 0.536 | 0.456 | 0.604 | 0.508 | 0.448 | 0.552 | 0.540 | 0.540 | 0.520 | 0.508 | 0.584 | 0.548 | 0.536 | - | 0.584 | 0.676 | - multistep_arithmetic_two | 0.288 | 0.488 | 0.472 | 0.524 | - | 0.496 | 0.372 | 0.700 | 0.532 | 0.540 | 0.108 | 0.432 | 0.164 | 0.292 | 0.272 | 0.836 | 0.420 | 0.460 | 0.532 | 0.948 | - | 0.252 | 0.536 | - navigate | 0.580 | 0.596 | 0.648 | 0.420 | - | 0.596 | 0.452 | 0.580 | 0.580 | 0.572 | 0.600 | 0.588 | 0.568 | 0.580 | 0.596 | 0.588 | 0.584 | 0.636 | 0.596 | 0.596 | - | 0.520 | 0.652 | - object_counting | 0.612 | 0.848 | 0.856 | 0.660 | - | 0.756 | 0.644 | 0.864 | 0.808 | 0.908 | 0.608 | 0.716 | 0.564 | 0.796 | 0.244 | 0.836 | 0.344 | 0.372 | 0.660 | 0.804 | - | 0.680 | 0.756 | - penguins_in_a_table | 0.506 | 0.890 | 0.842 | 0.917 | - | 0.643 | 0.815 | 0.856 | 0.801 | 0.917 | 0.623 | 0.801 | 0.575 | 0.760 | 0.465 | 0.883 | 0.712 | 0.815 | 0.835 | 0.924 | - | 0.636 | 0.828 | - reasoning_about_colored_objects | 0.484 | 0.744 | 0.900 | 0.796 | - | 0.668 | 0.904 | 0.824 | 0.568 | 0.904 | 0.608 | 0.752 | 0.648 | 0.752 | 0.656 | 0.808 | 0.656 | 0.896 | 0.764 | 0.868 | - | 0.600 | 0.840 | - ruin_names | 0.480 | 0.716 | 0.760 | 0.652 | - | 0.516 | 0.932 | 0.744 | 0.532 | 0.556 | 0.400 | 0.584 | 0.408 | 0.592 | 0.596 | 0.612 | 0.600 | 0.636 | 0.564 | 0.544 | - | 0.536 | 0.616 | - salient_translation_error_detection | 0.420 | 0.548 | 0.568 | 0.488 | - | 0.484 | 0.644 | 0.512 | 0.464 | 0.556 | 0.444 | 0.472 | 0.524 | 0.560 | 0.408 | 0.520 | 0.532 | 0.596 | 0.456 | 0.572 | - | 0.532 | 0.588 | - snarks | 0.584 | 0.691 | 0.719 | 0.707 | - | 0.623 | 0.820 | 0.651 | 0.657 | 0.691 | 0.606 | 0.691 | 0.533 | 0.640 | 0.735 | 0.747 | 0.786 | 0.747 | 0.657 | 0.780 | - | 0.646 | 0.837 | - sports_understanding | 0.724 | 0.788 | 0.816 | 0.468 | - | 0.768 | 0.920 | 0.720 | 0.644 | 0.640 | 0.716 | 0.800 | 0.836 | 0.792 | 0.596 | 0.596 | 0.600 | 0.748 | 0.776 | 0.684 | - | 0.828 | 0.740 | - temporal_sequences | 0.124 | 0.708 | 0.748 | 0.840 | - | 0.584 | 0.976 | 0.856 | 0.712 | 0.360 | 0.404 | 0.544 | 0.524 | 0.508 | 0.800 | 0.784 | 0.508 | 0.892 | 0.596 | 0.820 | - | 0.568 | 0.920 | - tracking_shuffled_objects_five_objects | 0.216 | 0.600 | 0.692 | 0.536 | - | 0.492 | 0.572 | 0.656 | 0.500 | 0.792 | 0.344 | 0.736 | 0.356 | 0.468 | 0.612 | 0.940 | 0.712 | 0.776 | 0.476 | 0.908 | - | 0.364 | 0.420 | - tracking_shuffled_objects_seven_objects | 0.152 | 0.572 | 0.640 | 0.436 | - | 0.648 | 0.480 | 0.592 | 0.420 | 0.728 | 0.296 | 0.596 | 0.284 | 0.396 | 0.568 | 0.896 | 0.612 | 0.652 | 0.416 | 0.868 | - | 0.372 | 0.436 | - tracking_shuffled_objects_three_objects | 0.292 | 0.732 | 0.848 | 0.696 | - | 0.488 | 0.528 | 0.728 | 0.608 | 0.832 | 0.436 | 0.832 | 0.412 | 0.724 | 0.572 | 0.960 | 0.788 | 0.888 | 0.524 | 0.872 | - | 0.536 | 0.660 | - web_of_lies | 0.508 | 0.520 | 0.488 | 0.488 | - | 0.552 | 0.536 | 0.512 | 0.544 | 0.492 | 0.488 | 0.512 | 0.488 | 0.512 | 0.512 | 0.488 | 0.492 | 0.548 | 0.552 | 0.532 | - | 0.488 | 0.520 | - word_sorting | 0.100 | 0.404 | 0.540 | 0.392 | - | 0.356 | 0.452 | 0.512 | 0.360 | 0.340 | 0.280 | 0.392 | 0.344 | 0.500 | 0.168 | 0.204 | 0.152 | 0.236 | 0.208 | 0.220 | - | 0.336 | 0.276 | - BBH | 0.432 | 0.664 | 0.674 | 0.608 | - | 0.589 | 0.719 | 0.681 | 0.566 | 0.652 | 0.506 | 0.631 | 0.531 | 0.583 | 0.549 | 0.696 | 0.592 | 0.658 | 0.587 | 0.718 | - | 0.549 | 0.637 | + boolean_expressions | 0.556 | 0.768 | 0.460 | 0.868 | 0.812 | 0.856 | 0.688 | 0.844 | 0.480 | 0.728 | 0.764 | 0.780 | 0.824 | 0.664 | 0.800 | 0.852 | 0.832 | 0.696 | 0.808 | 0.880 | - | 0.720 | 0.540 | + causal_judgement | 0.524 | 0.598 | 0.604 | 0.550 | 0.550 | 0.598 | 0.689 | 0.540 | 0.518 | 0.593 | 0.588 | 0.625 | 0.614 | 0.604 | 0.598 | 0.588 | 0.593 | 0.588 | 0.625 | 0.513 | - | 0.636 | 0.641 | + date_understanding | 0.592 | 0.748 | 0.788 | 0.572 | 0.588 | 0.676 | 0.832 | 0.716 | 0.664 | 0.772 | 0.548 | 0.668 | 0.592 | 0.608 | 0.568 | 0.696 | 0.576 | 0.780 | 0.544 | 0.740 | - | 0.556 | 0.724 | + disambiguation_qa | 0.532 | 0.660 | 0.720 | 0.636 | 0.612 | 0.656 | 0.732 | 0.516 | 0.472 | 0.644 | 0.600 | 0.596 | 0.728 | 0.704 | 0.592 | 0.720 | 0.752 | 0.692 | 0.660 | 0.636 | - | 0.576 | 0.640 | + dyck_languages | 0.476 | 0.728 | 0.600 | 0.544 | 0.548 | 0.700 | 0.728 | 0.796 | 0.680 | 0.756 | 0.744 | 0.712 | 0.664 | 0.732 | 0.424 | 0.580 | 0.468 | 0.532 | 0.756 | 0.836 | - | 0.684 | 0.572 | + formal_fallacies | 0.532 | 0.832 | 0.760 | 0.660 | 0.956 | 0.632 | 0.920 | 0.984 | 0.816 | 0.532 | 0.852 | 0.996 | 0.632 | 0.564 | 0.920 | 0.808 | 0.808 | 0.944 | 0.632 | 0.628 | - | 0.776 | 0.576 | + geometric_shapes | 0.204 | 0.436 | 0.420 | 0.400 | 0.288 | 0.400 | 0.840 | 0.440 | 0.416 | 0.520 | 0.288 | 0.404 | 0.348 | 0.344 | 0.248 | 0.416 | 0.292 | 0.328 | 0.356 | 0.604 | - | 0.268 | 0.400 | + hyperbaton | 0.704 | 0.884 | 0.836 | 0.824 | 0.656 | 0.692 | 0.928 | 0.880 | 0.624 | 0.804 | 0.656 | 0.644 | 0.828 | 0.724 | 0.940 | 0.936 | 0.936 | 0.952 | 0.704 | 0.792 | - | 0.744 | 0.900 | + logical_deduction_five_objects | 0.300 | 0.568 | 0.608 | 0.516 | 0.364 | 0.448 | 0.660 | 0.568 | 0.484 | 0.592 | 0.352 | 0.556 | 0.384 | 0.472 | 0.432 | 0.632 | 0.532 | 0.532 | 0.556 | 0.728 | - | 0.436 | 0.612 | + logical_deduction_seven_objects | 0.284 | 0.560 | 0.552 | 0.500 | 0.324 | 0.428 | 0.648 | 0.488 | 0.408 | 0.500 | 0.296 | 0.452 | 0.320 | 0.400 | 0.308 | 0.568 | 0.500 | 0.444 | 0.464 | 0.656 | - | 0.388 | 0.560 | + logical_deduction_three_objects | 0.440 | 0.844 | 0.892 | 0.840 | 0.516 | 0.668 | 0.896 | 0.804 | 0.652 | 0.844 | 0.608 | 0.800 | 0.664 | 0.620 | 0.688 | 0.844 | 0.804 | 0.884 | 0.736 | 0.956 | - | 0.664 | 0.888 | + movie_recommendation | 0.568 | 0.552 | 0.508 | 0.648 | 0.480 | 0.512 | 0.884 | 0.536 | 0.456 | 0.604 | 0.508 | 0.448 | 0.552 | 0.540 | 0.540 | 0.520 | 0.508 | 0.584 | 0.548 | 0.536 | - | 0.584 | 0.676 | + multistep_arithmetic_two | 0.288 | 0.488 | 0.472 | 0.524 | 0.272 | 0.496 | 0.372 | 0.700 | 0.532 | 0.540 | 0.108 | 0.432 | 0.164 | 0.292 | 0.272 | 0.836 | 0.420 | 0.460 | 0.532 | 0.948 | - | 0.252 | 0.536 | + navigate | 0.580 | 0.596 | 0.648 | 0.420 | 0.580 | 0.596 | 0.452 | 0.580 | 0.580 | 0.572 | 0.600 | 0.588 | 0.568 | 0.580 | 0.596 | 0.588 | 0.584 | 0.636 | 0.596 | 0.596 | - | 0.520 | 0.652 | + object_counting | 0.612 | 0.848 | 0.856 | 0.660 | 0.652 | 0.756 | 0.644 | 0.864 | 0.808 | 0.908 | 0.608 | 0.716 | 0.564 | 0.796 | 0.244 | 0.836 | 0.344 | 0.372 | 0.660 | 0.804 | - | 0.680 | 0.756 | + penguins_in_a_table | 0.506 | 0.890 | 0.842 | 0.917 | 0.547 | 0.643 | 0.815 | 0.856 | 0.801 | 0.917 | 0.623 | 0.801 | 0.575 | 0.760 | 0.465 | 0.883 | 0.712 | 0.815 | 0.835 | 0.924 | - | 0.636 | 0.828 | + reasoning_about_colored_objects | 0.484 | 0.744 | 0.900 | 0.796 | 0.504 | 0.668 | 0.904 | 0.824 | 0.568 | 0.904 | 0.608 | 0.752 | 0.648 | 0.752 | 0.656 | 0.808 | 0.656 | 0.896 | 0.764 | 0.868 | - | 0.600 | 0.840 | + ruin_names | 0.480 | 0.716 | 0.760 | 0.652 | 0.376 | 0.516 | 0.932 | 0.744 | 0.532 | 0.556 | 0.400 | 0.584 | 0.408 | 0.592 | 0.596 | 0.612 | 0.600 | 0.636 | 0.564 | 0.544 | - | 0.536 | 0.616 | + salient_translation_error_detection | 0.420 | 0.548 | 0.568 | 0.488 | 0.348 | 0.484 | 0.644 | 0.512 | 0.464 | 0.556 | 0.444 | 0.472 | 0.524 | 0.560 | 0.408 | 0.520 | 0.532 | 0.596 | 0.456 | 0.572 | - | 0.532 | 0.588 | + snarks | 0.584 | 0.691 | 0.719 | 0.707 | 0.578 | 0.623 | 0.820 | 0.651 | 0.657 | 0.691 | 0.606 | 0.691 | 0.533 | 0.640 | 0.735 | 0.747 | 0.786 | 0.747 | 0.657 | 0.780 | - | 0.646 | 0.837 | + sports_understanding | 0.724 | 0.788 | 0.816 | 0.468 | 0.708 | 0.768 | 0.920 | 0.720 | 0.644 | 0.640 | 0.716 | 0.800 | 0.836 | 0.792 | 0.596 | 0.596 | 0.600 | 0.748 | 0.776 | 0.684 | - | 0.828 | 0.740 | + temporal_sequences | 0.124 | 0.708 | 0.748 | 0.840 | 0.244 | 0.584 | 0.976 | 0.856 | 0.712 | 0.360 | 0.404 | 0.544 | 0.524 | 0.508 | 0.800 | 0.784 | 0.508 | 0.892 | 0.596 | 0.820 | - | 0.568 | 0.920 | + tracking_shuffled_objects_five_objects | 0.216 | 0.600 | 0.692 | 0.536 | 0.608 | 0.492 | 0.572 | 0.656 | 0.500 | 0.792 | 0.344 | 0.736 | 0.356 | 0.468 | 0.612 | 0.940 | 0.712 | 0.776 | 0.476 | 0.908 | - | 0.364 | 0.420 | + tracking_shuffled_objects_seven_objects | 0.152 | 0.572 | 0.640 | 0.436 | 0.464 | 0.648 | 0.480 | 0.592 | 0.420 | 0.728 | 0.296 | 0.596 | 0.284 | 0.396 | 0.568 | 0.896 | 0.612 | 0.652 | 0.416 | 0.868 | - | 0.372 | 0.436 | + tracking_shuffled_objects_three_objects | 0.292 | 0.732 | 0.848 | 0.696 | 0.636 | 0.488 | 0.528 | 0.728 | 0.608 | 0.832 | 0.436 | 0.832 | 0.412 | 0.724 | 0.572 | 0.960 | 0.788 | 0.888 | 0.524 | 0.872 | - | 0.536 | 0.660 | + web_of_lies | 0.508 | 0.520 | 0.488 | 0.488 | 0.480 | 0.552 | 0.536 | 0.512 | 0.544 | 0.492 | 0.488 | 0.512 | 0.488 | 0.512 | 0.512 | 0.488 | 0.492 | 0.548 | 0.552 | 0.532 | - | 0.488 | 0.520 | + word_sorting | 0.100 | 0.404 | 0.540 | 0.392 | 0.220 | 0.356 | 0.452 | 0.512 | 0.360 | 0.340 | 0.280 | 0.392 | 0.344 | 0.500 | 0.168 | 0.204 | 0.152 | 0.236 | 0.208 | 0.220 | - | 0.336 | 0.276 | + BBH | 0.432 | 0.664 | 0.674 | 0.608 | 0.513 | 0.589 | 0.719 | 0.681 | 0.566 | 0.652 | 0.506 | 0.631 | 0.531 | 0.583 | 0.549 | 0.696 | 0.592 | 0.658 | 0.587 | 0.718 | - | 0.549 | 0.637 | MUSR - murder_mystery | 0.552 | 0.668 | 0.576 | 0.584 | - | 0.484 | 0.572 | 0.584 | 0.576 | 0.624 | 0.516 | 0.656 | 0.592 | 0.272 | 0.636 | 0.636 | 0.620 | 0.600 | 0.516 | 0.584 | - | 0.588 | 0.532 | - object_placements | 0.449 | 0.519 | 0.542 | 0.531 | - | 0.523 | 0.492 | 0.546 | 0.523 | 0.484 | 0.453 | 0.542 | 0.527 | 0.523 | 0.496 | 0.503 | 0.457 | 0.519 | 0.511 | 0.554 | - | 0.500 | 0.425 | - team_allocation | 0.352 | 0.460 | 0.476 | 0.588 | - | 0.492 | 0.500 | 0.460 | 0.396 | 0.504 | 0.356 | 0.448 | 0.456 | 0.516 | 0.520 | 0.536 | 0.480 | 0.560 | 0.440 | 0.476 | - | 0.504 | 0.556 | - MUSR | 0.451 | 0.548 | 0.531 | 0.567 | - | 0.500 | 0.521 | 0.530 | 0.498 | 0.537 | 0.441 | 0.548 | 0.525 | 0.437 | 0.550 | 0.558 | 0.518 | 0.559 | 0.489 | 0.538 | - | 0.530 | 0.503 | + murder_mystery | 0.552 | 0.668 | 0.576 | 0.584 | 0.532 | 0.484 | 0.572 | 0.584 | 0.576 | 0.624 | 0.516 | 0.656 | 0.592 | 0.272 | 0.636 | 0.636 | 0.620 | 0.600 | 0.516 | 0.584 | - | 0.588 | 0.532 | + object_placements | 0.449 | 0.519 | 0.542 | 0.531 | 0.445 | 0.523 | 0.492 | 0.546 | 0.523 | 0.484 | 0.453 | 0.542 | 0.527 | 0.523 | 0.496 | 0.503 | 0.457 | 0.519 | 0.511 | 0.554 | - | 0.500 | 0.425 | + team_allocation | 0.352 | 0.460 | 0.476 | 0.588 | 0.356 | 0.492 | 0.500 | 0.460 | 0.396 | 0.504 | 0.356 | 0.448 | 0.456 | 0.516 | 0.520 | 0.536 | 0.480 | 0.560 | 0.440 | 0.476 | - | 0.504 | 0.556 | + MUSR | 0.451 | 0.548 | 0.531 | 0.567 | 0.444 | 0.500 | 0.521 | 0.530 | 0.498 | 0.537 | 0.441 | 0.548 | 0.525 | 0.437 | 0.550 | 0.558 | 0.518 | 0.559 | 0.489 | 0.538 | - | 0.530 | 0.503 | MMLUPRO - biology | 0.582 | 0.747 | 0.772 | 0.695 | - | 0.609 | 0.687 | 0.686 | 0.623 | 0.659 | 0.582 | 0.651 | 0.619 | 0.592 | 0.676 | 0.702 | 0.662 | 0.725 | 0.668 | 0.729 | - | 0.570 | 0.684 | - business | 0.356 | 0.583 | 0.626 | 0.562 | - | 0.171 | 0.510 | 0.558 | 0.458 | 0.536 | 0.335 | 0.510 | 0.404 | 0.429 | 0.465 | 0.571 | 0.509 | 0.476 | 0.590 | 0.661 | - | 0.335 | 0.496 | - chemistry | 0.271 | 0.503 | 0.546 | 0.467 | - | 0.203 | 0.343 | 0.467 | 0.390 | 0.375 | - | 0.366 | 0.263 | 0.271 | 0.431 | 0.463 | 0.296 | 0.312 | 0.413 | 0.580 | - | 0.196 | 0.407 | - computer_science | 0.300 | 0.482 | 0.560 | 0.502 | - | 0.431 | 0.487 | 0.485 | 0.414 | 0.541 | - | 0.456 | 0.426 | 0.424 | 0.458 | 0.475 | 0.448 | 0.521 | 0.482 | 0.604 | - | 0.339 | 0.512 | - economics | 0.408 | 0.668 | 0.678 | 0.610 | - | 0.518 | 0.540 | 0.568 | 0.492 | 0.542 | - | 0.541 | 0.484 | 0.490 | 0.558 | 0.609 | 0.587 | 0.575 | 0.574 | 0.687 | - | 0.463 | - | - engineering | 0.253 | 0.406 | 0.414 | 0.298 | - | 0.266 | 0.301 | 0.378 | 0.302 | 0.330 | - | 0.317 | 0.237 | 0.237 | 0.297 | 0.297 | 0.283 | 0.342 | 0.356 | 0.420 | - | 0.180 | - | - health | 0.333 | 0.545 | 0.621 | 0.496 | - | 0.432 | 0.458 | 0.558 | 0.437 | 0.464 | - | 0.498 | 0.422 | 0.414 | 0.479 | 0.515 | 0.466 | 0.588 | 0.442 | 0.569 | - | 0.381 | - | - history | 0.275 | 0.493 | 0.490 | 0.438 | - | 0.370 | 0.391 | 0.451 | 0.380 | 0.409 | - | 0.425 | 0.359 | 0.364 | 0.388 | 0.380 | 0.380 | 0.496 | 0.391 | 0.464 | - | 0.380 | - | - law | 0.198 | 0.343 | 0.405 | 0.284 | - | 0.278 | 0.237 | 0.303 | 0.243 | 0.276 | - | 0.279 | 0.238 | 0.262 | 0.306 | 0.276 | 0 | 0.384 | 0.271 | 0.292 | - | 0.217 | - | - math | 0.309 | 0.538 | 0.570 | 0.523 | - | 0.393 | 0.508 | 0.555 | 0.511 | 0.543 | - | 0.416 | 0.369 | 0.418 | 0.468 | 0.522 | 0.458 | 0.391 | 0.592 | 0.723 | - | 0.270 | - | - other | 0.325 | 0.551 | 0.574 | 0.458 | - | 0.415 | 0.440 | 0.487 | 0.389 | 0.464 | - | 0.456 | 0.416 | 0.401 | 0.457 | 0.500 | 0.433 | 0.532 | 0.444 | 0.551 | - | 0.400 | - | - philosophy | 0.272 | 0.448 | 0.488 | 0.412 | - | 0.386 | 0.372 | 0.382 | 0.326 | 0.366 | - | 0.390 | 0.360 | 0.346 | 0.386 | 0.406 | 0.390 | 0.494 | 0.374 | 0.464 | - | 0.326 | - | - physics | 0.275 | 0.501 | 0.559 | 0.461 | - | 0.328 | 0.344 | 0.488 | 0.397 | 0.414 | - | 0.370 | 0.317 | 0.309 | 0.423 | 0.455 | 0.425 | 0.367 | 0.457 | 0.602 | - | 0.240 | - | - psychology | 0.494 | 0.647 | 0.692 | 0.602 | - | 0.533 | 0.572 | 0.637 | 0.518 | 0.595 | - | 0.588 | 0.543 | 0.552 | 0.583 | 0.621 | 0.572 | 0.676 | 0.595 | 0.644 | - | 0.525 | - | - MMLUPRO | 0.326 | 0.528 | 0.568 | 0.480 | - | 0.368 | 0.432 | 0.499 | 0.419 | 0.458 | 0.453 | 0.436 | 0.376 | 0.382 | 0.451 | 0.482 | 0.408 | 0.470 | 0.475 | 0.575 | - | 0.326 | 0.509 | + biology | 0.582 | 0.747 | 0.772 | 0.695 | 0.532 | 0.609 | 0.687 | 0.686 | 0.623 | 0.659 | 0.582 | 0.651 | 0.619 | 0.592 | 0.676 | 0.702 | 0.662 | 0.725 | 0.668 | 0.729 | - | 0.570 | 0.684 | + business | 0.356 | 0.583 | 0.626 | 0.562 | 0.321 | 0.171 | 0.510 | 0.558 | 0.458 | 0.536 | 0.335 | 0.510 | 0.404 | 0.429 | 0.465 | 0.571 | 0.509 | 0.476 | 0.590 | 0.661 | - | 0.335 | 0.496 | + chemistry | 0.271 | 0.503 | 0.546 | 0.467 | 0.222 | 0.203 | 0.343 | 0.467 | 0.390 | 0.375 | - | 0.366 | 0.263 | 0.271 | 0.431 | 0.463 | 0.296 | 0.312 | 0.413 | 0.580 | - | 0.196 | 0.407 | + computer_science | 0.300 | 0.482 | 0.560 | 0.502 | 0.329 | 0.431 | 0.487 | 0.485 | 0.414 | 0.541 | - | 0.456 | 0.426 | 0.424 | 0.458 | 0.475 | 0.448 | 0.521 | 0.482 | 0.604 | - | 0.339 | 0.512 | + economics | 0.408 | 0.668 | 0.678 | 0.610 | 0.396 | 0.518 | 0.540 | 0.568 | 0.492 | 0.542 | - | 0.541 | 0.484 | 0.490 | 0.558 | 0.609 | 0.587 | 0.575 | 0.574 | 0.687 | - | 0.463 | - | + engineering | 0.253 | 0.406 | 0.414 | 0.298 | 0.216 | 0.266 | 0.301 | 0.378 | 0.302 | 0.330 | - | 0.317 | 0.237 | 0.237 | 0.297 | 0.297 | 0.283 | 0.342 | 0.356 | 0.420 | - | 0.180 | - | + health | 0.333 | 0.545 | 0.621 | 0.496 | 0.288 | 0.432 | 0.458 | 0.558 | 0.437 | 0.464 | - | 0.498 | 0.422 | 0.414 | 0.479 | 0.515 | 0.466 | 0.588 | 0.442 | 0.569 | - | 0.381 | - | + history | 0.275 | 0.493 | 0.490 | 0.438 | 0.230 | 0.370 | 0.391 | 0.451 | 0.380 | 0.409 | - | 0.425 | 0.359 | 0.364 | 0.388 | 0.380 | 0.380 | 0.496 | 0.391 | 0.464 | - | 0.380 | - | + law | 0.198 | 0.343 | 0.405 | 0.284 | 0.192 | 0.278 | 0.237 | 0.303 | 0.243 | 0.276 | - | 0.279 | 0.238 | 0.262 | 0.306 | 0.276 | 0 | 0.384 | 0.271 | 0.292 | - | 0.217 | - | + math | 0.309 | 0.538 | 0.570 | 0.523 | 0.316 | 0.393 | 0.508 | 0.555 | 0.511 | 0.543 | - | 0.416 | 0.369 | 0.418 | 0.468 | 0.522 | 0.458 | 0.391 | 0.592 | 0.723 | - | 0.270 | - | + other | 0.325 | 0.551 | 0.574 | 0.458 | 0.299 | 0.415 | 0.440 | 0.487 | 0.389 | 0.464 | - | 0.456 | 0.416 | 0.401 | 0.457 | 0.500 | 0.433 | 0.532 | 0.444 | 0.551 | - | 0.400 | - | + philosophy | 0.272 | 0.448 | 0.488 | 0.412 | 0.288 | 0.386 | 0.372 | 0.382 | 0.326 | 0.366 | - | 0.390 | 0.360 | 0.346 | 0.386 | 0.406 | 0.390 | 0.494 | 0.374 | 0.464 | - | 0.326 | - | + physics | 0.275 | 0.501 | 0.559 | 0.461 | 0.227 | 0.328 | 0.344 | 0.488 | 0.397 | 0.414 | - | 0.370 | 0.317 | 0.309 | 0.423 | 0.455 | 0.425 | 0.367 | 0.457 | 0.602 | - | 0.240 | - | + psychology | 0.494 | 0.647 | 0.692 | 0.602 | 0.421 | 0.533 | 0.572 | 0.637 | 0.518 | 0.595 | - | 0.588 | 0.543 | 0.552 | 0.583 | 0.621 | 0.572 | 0.676 | 0.595 | 0.644 | - | 0.525 | - | + MMLUPRO | 0.326 | 0.528 | 0.568 | 0.480 | 0.297 | 0.368 | 0.432 | 0.499 | 0.419 | 0.458 | 0.453 | 0.436 | 0.376 | 0.382 | 0.451 | 0.482 | 0.408 | 0.470 | 0.475 | 0.575 | - | 0.326 | 0.509 | CATEGORIES - REASONING | 0.570 | 0.788 | 0.814 | 0.811 | - | 0.701 | 0.845 | 0.713 | 0.606 | 0.779 | 0.628 | 0.730 | 0.785 | 0.799 | 0.741 | 0.724 | 0.691 | 0.805 | 0.755 | 0.809 | - | 0.784 | 0.806 | - UNDERSTANDING | 0.538 | 0.707 | 0.742 | 0.670 | - | 0.598 | 0.685 | 0.631 | 0.579 | 0.633 | 0.563 | 0.633 | 0.644 | 0.651 | 0.629 | 0.614 | 0.622 | 0.727 | 0.674 | 0.696 | - | 0.617 | 0.713 | - LANGUAGE | 0.624 | 0.735 | 0.755 | 0.783 | - | 0.800 | 0.732 | 0.747 | 0.705 | 0.776 | 0.766 | 0.714 | 0.744 | 0.733 | 0.618 | 0.677 | 0.613 | 0.632 | 0.735 | 0.724 | - | 0.654 | 0.708 | - KNOWLEDGE | 0.505 | 0.689 | 0.733 | 0.544 | - | 0.516 | 0.601 | 0.547 | 0.536 | 0.569 | 0.582 | 0.546 | 0.542 | 0.533 | 0.546 | 0.517 | 0.519 | 0.663 | 0.500 | 0.589 | - | 0.489 | 0.559 | - COT | 0.350 | 0.550 | 0.582 | 0.500 | - | 0.409 | 0.505 | 0.530 | 0.446 | 0.479 | 0.498 | 0.466 | 0.416 | 0.424 | 0.474 | 0.506 | 0.440 | 0.503 | 0.492 | 0.581 | - | 0.377 | 0.563 | - MATHCOT | 0.482 | 0.735 | 0.740 | 0.671 | - | 0.661 | 0.693 | 0.729 | 0.647 | 0.767 | 0.495 | 0.671 | 0.545 | 0.574 | 0.592 | 0.771 | 0.640 | 0.666 | 0.685 | 0.838 | - | 0.535 | 0.683 | - CODE | 0.331 | 0.495 | 0.568 | 0.475 | - | 0.445 | 0.344 | 0.463 | 0.366 | 0.514 | 0.321 | 0.433 | 0.324 | 0.316 | 0.389 | 0.419 | 0.376 | 0.350 | 0.390 | 0.528 | 0.471 | 0.233 | 0.368 | + REASONING | 0.570 | 0.788 | 0.814 | 0.811 | 0.590 | 0.701 | 0.845 | 0.713 | 0.606 | 0.779 | 0.628 | 0.730 | 0.785 | 0.799 | 0.741 | 0.724 | 0.691 | 0.805 | 0.755 | 0.809 | - | 0.784 | 0.806 | + UNDERSTANDING | 0.538 | 0.707 | 0.742 | 0.670 | 0.511 | 0.598 | 0.685 | 0.631 | 0.579 | 0.633 | 0.563 | 0.633 | 0.644 | 0.651 | 0.629 | 0.614 | 0.622 | 0.727 | 0.674 | 0.696 | - | 0.617 | 0.713 | + LANGUAGE | 0.624 | 0.735 | 0.755 | 0.783 | 0.745 | 0.800 | 0.732 | 0.747 | 0.705 | 0.776 | 0.766 | 0.714 | 0.744 | 0.733 | 0.618 | 0.677 | 0.613 | 0.632 | 0.735 | 0.724 | - | 0.654 | 0.708 | + KNOWLEDGE | 0.505 | 0.689 | 0.733 | 0.544 | 0.455 | 0.516 | 0.601 | 0.547 | 0.536 | 0.569 | 0.582 | 0.546 | 0.542 | 0.533 | 0.546 | 0.517 | 0.519 | 0.663 | 0.500 | 0.589 | - | 0.489 | 0.559 | + COT | 0.350 | 0.550 | 0.582 | 0.500 | 0.338 | 0.409 | 0.505 | 0.530 | 0.446 | 0.479 | 0.498 | 0.466 | 0.416 | 0.424 | 0.474 | 0.506 | 0.440 | 0.503 | 0.492 | 0.581 | - | 0.377 | 0.563 | + MATHCOT | 0.482 | 0.735 | 0.740 | 0.671 | 0.573 | 0.661 | 0.693 | 0.729 | 0.647 | 0.767 | 0.495 | 0.671 | 0.545 | 0.574 | 0.592 | 0.771 | 0.640 | 0.666 | 0.685 | 0.838 | - | 0.535 | 0.683 | + CODE | 0.331 | 0.495 | 0.568 | 0.475 | 0.338 | 0.445 | 0.344 | 0.463 | 0.366 | 0.514 | 0.321 | 0.433 | 0.324 | 0.316 | 0.389 | 0.419 | 0.376 | 0.350 | 0.390 | 0.528 | 0.471 | 0.233 | 0.368 | DISCIPLINES - NLP | 0.568 | 0.755 | 0.786 | 0.728 | - | 0.666 | 0.774 | 0.677 | 0.609 | 0.723 | 0.642 | 0.685 | 0.739 | 0.737 | 0.667 | 0.647 | 0.637 | 0.744 | 0.693 | 0.734 | - | 0.712 | 0.725 | - MATH | 0.398 | 0.635 | 0.652 | 0.592 | - | 0.547 | 0.633 | 0.629 | 0.556 | 0.636 | 0.452 | 0.563 | 0.482 | 0.505 | 0.525 | 0.647 | 0.543 | 0.584 | 0.617 | 0.747 | - | 0.454 | 0.625 | - SCIENCE | 0.555 | 0.739 | 0.769 | 0.698 | - | 0.591 | 0.664 | 0.676 | 0.605 | 0.645 | 0.673 | 0.636 | 0.581 | 0.596 | 0.685 | 0.696 | 0.660 | 0.697 | 0.678 | 0.754 | - | 0.544 | 0.763 | - ENGINEERING | 0.280 | 0.426 | 0.438 | 0.333 | - | 0.286 | 0.333 | 0.397 | 0.323 | 0.346 | 0.393 | 0.339 | 0.267 | 0.272 | 0.319 | 0.323 | 0.308 | 0.371 | 0.388 | 0.444 | - | 0.199 | 0.586 | - MEDICINE | 0.400 | 0.595 | 0.648 | 0.530 | - | 0.466 | 0.515 | 0.577 | 0.496 | 0.512 | 0.447 | 0.541 | 0.485 | 0.503 | 0.525 | 0.537 | 0.501 | 0.633 | 0.510 | 0.580 | - | 0.457 | 0.635 | - HUMANITIES | 0.485 | 0.645 | 0.679 | 0.610 | - | 0.541 | 0.603 | 0.578 | 0.529 | 0.563 | 0.536 | 0.564 | 0.535 | 0.547 | 0.567 | 0.588 | 0.567 | 0.671 | 0.591 | 0.638 | - | 0.508 | 0.701 | - LAW | 0.302 | 0.483 | 0.524 | 0.431 | - | 0.377 | 0.420 | 0.406 | 0.344 | 0.387 | 0.370 | 0.390 | 0.367 | 0.374 | 0.399 | 0.392 | 0.310 | 0.498 | 0.409 | 0.435 | - | 0.327 | 0.527 | + NLP | 0.568 | 0.755 | 0.786 | 0.728 | 0.587 | 0.666 | 0.774 | 0.677 | 0.609 | 0.723 | 0.642 | 0.685 | 0.739 | 0.737 | 0.667 | 0.647 | 0.637 | 0.744 | 0.693 | 0.734 | - | 0.712 | 0.725 | + MATH | 0.398 | 0.635 | 0.652 | 0.592 | 0.456 | 0.547 | 0.633 | 0.629 | 0.556 | 0.636 | 0.452 | 0.563 | 0.482 | 0.505 | 0.525 | 0.647 | 0.543 | 0.584 | 0.617 | 0.747 | - | 0.454 | 0.625 | + SCIENCE | 0.555 | 0.739 | 0.769 | 0.698 | 0.507 | 0.591 | 0.664 | 0.676 | 0.605 | 0.645 | 0.673 | 0.636 | 0.581 | 0.596 | 0.685 | 0.696 | 0.660 | 0.697 | 0.678 | 0.754 | - | 0.544 | 0.763 | + ENGINEERING | 0.280 | 0.426 | 0.438 | 0.333 | 0.234 | 0.286 | 0.333 | 0.397 | 0.323 | 0.346 | 0.393 | 0.339 | 0.267 | 0.272 | 0.319 | 0.323 | 0.308 | 0.371 | 0.388 | 0.444 | - | 0.199 | 0.586 | + MEDICINE | 0.400 | 0.595 | 0.648 | 0.530 | 0.351 | 0.466 | 0.515 | 0.577 | 0.496 | 0.512 | 0.447 | 0.541 | 0.485 | 0.503 | 0.525 | 0.537 | 0.501 | 0.633 | 0.510 | 0.580 | - | 0.457 | 0.635 | + HUMANITIES | 0.485 | 0.645 | 0.679 | 0.610 | 0.428 | 0.541 | 0.603 | 0.578 | 0.529 | 0.563 | 0.536 | 0.564 | 0.535 | 0.547 | 0.567 | 0.588 | 0.567 | 0.671 | 0.591 | 0.638 | - | 0.508 | 0.701 | + LAW | 0.302 | 0.483 | 0.524 | 0.431 | 0.299 | 0.377 | 0.420 | 0.406 | 0.344 | 0.387 | 0.370 | 0.390 | 0.367 | 0.374 | 0.399 | 0.392 | 0.310 | 0.498 | 0.409 | 0.435 | - | 0.327 | 0.527 |