diff --git "a/llm_eval_harness_GPU_version.ipynb" "b/llm_eval_harness_GPU_version.ipynb" deleted file mode 100644--- "a/llm_eval_harness_GPU_version.ipynb" +++ /dev/null @@ -1,6404 +0,0 @@ -{ - "nbformat": 4, - "nbformat_minor": 0, - "metadata": { - "colab": { - "provenance": [], - "gpuType": "T4" - }, - "kernelspec": { - "name": "python3", - "display_name": "Python 3" - }, - "language_info": { - "name": "python" - }, - "accelerator": "GPU" - }, - "cells": [ - { - "cell_type": "markdown", - "source": [ - "Initial setup" - ], - "metadata": { - "id": "U8RTc2PmnX-v" - } - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "yyfn6cOuRl4Z", - "collapsed": true - }, - "outputs": [], - "source": [ - "!pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git\n", - "!pip install bitsandbytes accelerate\n", - "from huggingface_hub import notebook_login\n", - "notebook_login()\n" - ] - }, - { - "cell_type": "code", - "source": [ - "!pip install -r https://huggingface.co/flunardelli/llm-metaeval/raw/main/requirements.txt\n" - ], - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "kGW7vfRkrqHe", - "outputId": "cac5738c-7b70-44dc-afe1-4b8805383d3e" - }, - "execution_count": null, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "Collecting lm_eval@ git+https://github.com/EleutherAI/lm-evaluation-harness.git@62b4364dd0c6c96ef33a28dcc57875381f4d2845 (from -r https://huggingface.co/flunardelli/llm-metaeval/raw/main/requirements.txt (line 3))\n", - " Cloning https://github.com/EleutherAI/lm-evaluation-harness.git (to revision 62b4364dd0c6c96ef33a28dcc57875381f4d2845) to /tmp/pip-install-_2syf_eu/lm-eval_daa793b6fa3540f7938eeafa33886360\n", - " Running command git clone --filter=blob:none --quiet https://github.com/EleutherAI/lm-evaluation-harness.git /tmp/pip-install-_2syf_eu/lm-eval_daa793b6fa3540f7938eeafa33886360\n", - " Running command git rev-parse -q --verify 'sha^62b4364dd0c6c96ef33a28dcc57875381f4d2845'\n", - " Running command git fetch -q https://github.com/EleutherAI/lm-evaluation-harness.git 62b4364dd0c6c96ef33a28dcc57875381f4d2845\n", - " Resolved https://github.com/EleutherAI/lm-evaluation-harness.git to commit 62b4364dd0c6c96ef33a28dcc57875381f4d2845\n", - " Installing build dependencies ... \u001b[?25l\u001b[?25hdone\n", - " Getting requirements to build wheel ... \u001b[?25l\u001b[?25hdone\n", - " Preparing metadata (pyproject.toml) ... \u001b[?25l\u001b[?25hdone\n", - "Requirement already satisfied: accelerate==1.1.1 in /usr/local/lib/python3.10/dist-packages (from -r https://huggingface.co/flunardelli/llm-metaeval/raw/main/requirements.txt (line 1)) (1.1.1)\n", - "Requirement already satisfied: bitsandbytes==0.44.1 in /usr/local/lib/python3.10/dist-packages (from -r https://huggingface.co/flunardelli/llm-metaeval/raw/main/requirements.txt (line 2)) (0.44.1)\n", - "Requirement already satisfied: huggingface-hub>=0.21.0 in /usr/local/lib/python3.10/dist-packages (from accelerate==1.1.1->-r https://huggingface.co/flunardelli/llm-metaeval/raw/main/requirements.txt (line 1)) (0.26.2)\n", - "Requirement already satisfied: numpy<3.0.0,>=1.17 in /usr/local/lib/python3.10/dist-packages (from accelerate==1.1.1->-r https://huggingface.co/flunardelli/llm-metaeval/raw/main/requirements.txt (line 1)) (1.26.4)\n", - "Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.10/dist-packages (from accelerate==1.1.1->-r https://huggingface.co/flunardelli/llm-metaeval/raw/main/requirements.txt (line 1)) (24.2)\n", - "Requirement already satisfied: psutil in /usr/local/lib/python3.10/dist-packages (from accelerate==1.1.1->-r https://huggingface.co/flunardelli/llm-metaeval/raw/main/requirements.txt (line 1)) (5.9.5)\n", - "Requirement already satisfied: pyyaml in /usr/local/lib/python3.10/dist-packages (from accelerate==1.1.1->-r https://huggingface.co/flunardelli/llm-metaeval/raw/main/requirements.txt (line 1)) (6.0.2)\n", - "Requirement already satisfied: safetensors>=0.4.3 in /usr/local/lib/python3.10/dist-packages (from accelerate==1.1.1->-r https://huggingface.co/flunardelli/llm-metaeval/raw/main/requirements.txt (line 1)) (0.4.5)\n", - "Requirement already satisfied: torch>=1.10.0 in /usr/local/lib/python3.10/dist-packages (from accelerate==1.1.1->-r https://huggingface.co/flunardelli/llm-metaeval/raw/main/requirements.txt (line 1)) (2.5.1+cu121)\n", - "Requirement already satisfied: evaluate in /usr/local/lib/python3.10/dist-packages (from lm_eval@ git+https://github.com/EleutherAI/lm-evaluation-harness.git@62b4364dd0c6c96ef33a28dcc57875381f4d2845->-r https://huggingface.co/flunardelli/llm-metaeval/raw/main/requirements.txt (line 3)) (0.4.3)\n", - "Requirement already satisfied: datasets>=2.16.0 in /usr/local/lib/python3.10/dist-packages (from lm_eval@ git+https://github.com/EleutherAI/lm-evaluation-harness.git@62b4364dd0c6c96ef33a28dcc57875381f4d2845->-r https://huggingface.co/flunardelli/llm-metaeval/raw/main/requirements.txt (line 3)) (3.1.0)\n", - "Requirement already satisfied: jsonlines in /usr/local/lib/python3.10/dist-packages (from lm_eval@ git+https://github.com/EleutherAI/lm-evaluation-harness.git@62b4364dd0c6c96ef33a28dcc57875381f4d2845->-r https://huggingface.co/flunardelli/llm-metaeval/raw/main/requirements.txt (line 3)) (4.0.0)\n", - "Requirement already satisfied: numexpr in /usr/local/lib/python3.10/dist-packages (from lm_eval@ git+https://github.com/EleutherAI/lm-evaluation-harness.git@62b4364dd0c6c96ef33a28dcc57875381f4d2845->-r https://huggingface.co/flunardelli/llm-metaeval/raw/main/requirements.txt (line 3)) (2.10.1)\n", - "Requirement already satisfied: peft>=0.2.0 in /usr/local/lib/python3.10/dist-packages (from lm_eval@ git+https://github.com/EleutherAI/lm-evaluation-harness.git@62b4364dd0c6c96ef33a28dcc57875381f4d2845->-r https://huggingface.co/flunardelli/llm-metaeval/raw/main/requirements.txt (line 3)) (0.13.2)\n", - "Requirement already satisfied: pybind11>=2.6.2 in /usr/local/lib/python3.10/dist-packages (from lm_eval@ git+https://github.com/EleutherAI/lm-evaluation-harness.git@62b4364dd0c6c96ef33a28dcc57875381f4d2845->-r https://huggingface.co/flunardelli/llm-metaeval/raw/main/requirements.txt (line 3)) (2.13.6)\n", - "Requirement already satisfied: pytablewriter in /usr/local/lib/python3.10/dist-packages (from lm_eval@ git+https://github.com/EleutherAI/lm-evaluation-harness.git@62b4364dd0c6c96ef33a28dcc57875381f4d2845->-r https://huggingface.co/flunardelli/llm-metaeval/raw/main/requirements.txt (line 3)) (1.2.0)\n", - "Requirement already satisfied: rouge-score>=0.0.4 in /usr/local/lib/python3.10/dist-packages (from lm_eval@ git+https://github.com/EleutherAI/lm-evaluation-harness.git@62b4364dd0c6c96ef33a28dcc57875381f4d2845->-r https://huggingface.co/flunardelli/llm-metaeval/raw/main/requirements.txt (line 3)) (0.1.2)\n", - "Requirement already satisfied: sacrebleu>=1.5.0 in /usr/local/lib/python3.10/dist-packages (from lm_eval@ git+https://github.com/EleutherAI/lm-evaluation-harness.git@62b4364dd0c6c96ef33a28dcc57875381f4d2845->-r https://huggingface.co/flunardelli/llm-metaeval/raw/main/requirements.txt (line 3)) (2.4.3)\n", - "Requirement already satisfied: scikit-learn>=0.24.1 in /usr/local/lib/python3.10/dist-packages (from lm_eval@ git+https://github.com/EleutherAI/lm-evaluation-harness.git@62b4364dd0c6c96ef33a28dcc57875381f4d2845->-r https://huggingface.co/flunardelli/llm-metaeval/raw/main/requirements.txt (line 3)) (1.5.2)\n", - "Requirement already satisfied: sqlitedict in /usr/local/lib/python3.10/dist-packages (from lm_eval@ git+https://github.com/EleutherAI/lm-evaluation-harness.git@62b4364dd0c6c96ef33a28dcc57875381f4d2845->-r https://huggingface.co/flunardelli/llm-metaeval/raw/main/requirements.txt (line 3)) (2.1.0)\n", - "Requirement already satisfied: tqdm-multiprocess in /usr/local/lib/python3.10/dist-packages (from lm_eval@ git+https://github.com/EleutherAI/lm-evaluation-harness.git@62b4364dd0c6c96ef33a28dcc57875381f4d2845->-r https://huggingface.co/flunardelli/llm-metaeval/raw/main/requirements.txt (line 3)) (0.0.11)\n", - "Requirement already satisfied: transformers>=4.1 in /usr/local/lib/python3.10/dist-packages (from lm_eval@ git+https://github.com/EleutherAI/lm-evaluation-harness.git@62b4364dd0c6c96ef33a28dcc57875381f4d2845->-r https://huggingface.co/flunardelli/llm-metaeval/raw/main/requirements.txt (line 3)) (4.46.2)\n", - "Requirement already satisfied: zstandard in /usr/local/lib/python3.10/dist-packages (from lm_eval@ git+https://github.com/EleutherAI/lm-evaluation-harness.git@62b4364dd0c6c96ef33a28dcc57875381f4d2845->-r https://huggingface.co/flunardelli/llm-metaeval/raw/main/requirements.txt (line 3)) (0.23.0)\n", - "Requirement already satisfied: dill in /usr/local/lib/python3.10/dist-packages (from lm_eval@ git+https://github.com/EleutherAI/lm-evaluation-harness.git@62b4364dd0c6c96ef33a28dcc57875381f4d2845->-r https://huggingface.co/flunardelli/llm-metaeval/raw/main/requirements.txt (line 3)) (0.3.8)\n", - "Requirement already satisfied: word2number in /usr/local/lib/python3.10/dist-packages (from lm_eval@ git+https://github.com/EleutherAI/lm-evaluation-harness.git@62b4364dd0c6c96ef33a28dcc57875381f4d2845->-r https://huggingface.co/flunardelli/llm-metaeval/raw/main/requirements.txt (line 3)) (1.1)\n", - "Requirement already satisfied: more-itertools in /usr/local/lib/python3.10/dist-packages (from lm_eval@ git+https://github.com/EleutherAI/lm-evaluation-harness.git@62b4364dd0c6c96ef33a28dcc57875381f4d2845->-r https://huggingface.co/flunardelli/llm-metaeval/raw/main/requirements.txt (line 3)) (10.5.0)\n", - "Requirement already satisfied: filelock in /usr/local/lib/python3.10/dist-packages (from datasets>=2.16.0->lm_eval@ git+https://github.com/EleutherAI/lm-evaluation-harness.git@62b4364dd0c6c96ef33a28dcc57875381f4d2845->-r https://huggingface.co/flunardelli/llm-metaeval/raw/main/requirements.txt (line 3)) (3.16.1)\n", - "Requirement already satisfied: pyarrow>=15.0.0 in /usr/local/lib/python3.10/dist-packages (from datasets>=2.16.0->lm_eval@ git+https://github.com/EleutherAI/lm-evaluation-harness.git@62b4364dd0c6c96ef33a28dcc57875381f4d2845->-r https://huggingface.co/flunardelli/llm-metaeval/raw/main/requirements.txt (line 3)) (17.0.0)\n", - "Requirement already satisfied: pandas in /usr/local/lib/python3.10/dist-packages (from datasets>=2.16.0->lm_eval@ git+https://github.com/EleutherAI/lm-evaluation-harness.git@62b4364dd0c6c96ef33a28dcc57875381f4d2845->-r https://huggingface.co/flunardelli/llm-metaeval/raw/main/requirements.txt (line 3)) (2.2.2)\n", - "Requirement already satisfied: requests>=2.32.2 in /usr/local/lib/python3.10/dist-packages (from datasets>=2.16.0->lm_eval@ git+https://github.com/EleutherAI/lm-evaluation-harness.git@62b4364dd0c6c96ef33a28dcc57875381f4d2845->-r https://huggingface.co/flunardelli/llm-metaeval/raw/main/requirements.txt (line 3)) (2.32.3)\n", - "Requirement already satisfied: tqdm>=4.66.3 in /usr/local/lib/python3.10/dist-packages (from datasets>=2.16.0->lm_eval@ git+https://github.com/EleutherAI/lm-evaluation-harness.git@62b4364dd0c6c96ef33a28dcc57875381f4d2845->-r https://huggingface.co/flunardelli/llm-metaeval/raw/main/requirements.txt (line 3)) (4.66.6)\n", - "Requirement already satisfied: xxhash in /usr/local/lib/python3.10/dist-packages (from datasets>=2.16.0->lm_eval@ git+https://github.com/EleutherAI/lm-evaluation-harness.git@62b4364dd0c6c96ef33a28dcc57875381f4d2845->-r https://huggingface.co/flunardelli/llm-metaeval/raw/main/requirements.txt (line 3)) (3.5.0)\n", - "Requirement already satisfied: multiprocess<0.70.17 in /usr/local/lib/python3.10/dist-packages (from datasets>=2.16.0->lm_eval@ git+https://github.com/EleutherAI/lm-evaluation-harness.git@62b4364dd0c6c96ef33a28dcc57875381f4d2845->-r https://huggingface.co/flunardelli/llm-metaeval/raw/main/requirements.txt (line 3)) (0.70.16)\n", - "Requirement already satisfied: fsspec<=2024.9.0,>=2023.1.0 in /usr/local/lib/python3.10/dist-packages (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets>=2.16.0->lm_eval@ git+https://github.com/EleutherAI/lm-evaluation-harness.git@62b4364dd0c6c96ef33a28dcc57875381f4d2845->-r https://huggingface.co/flunardelli/llm-metaeval/raw/main/requirements.txt (line 3)) (2024.9.0)\n", - "Requirement already satisfied: aiohttp in /usr/local/lib/python3.10/dist-packages (from datasets>=2.16.0->lm_eval@ git+https://github.com/EleutherAI/lm-evaluation-harness.git@62b4364dd0c6c96ef33a28dcc57875381f4d2845->-r https://huggingface.co/flunardelli/llm-metaeval/raw/main/requirements.txt (line 3)) (3.11.1)\n", - "Requirement already satisfied: typing-extensions>=3.7.4.3 in /usr/local/lib/python3.10/dist-packages (from huggingface-hub>=0.21.0->accelerate==1.1.1->-r https://huggingface.co/flunardelli/llm-metaeval/raw/main/requirements.txt (line 1)) (4.12.2)\n", - "Requirement already satisfied: absl-py in /usr/local/lib/python3.10/dist-packages (from rouge-score>=0.0.4->lm_eval@ git+https://github.com/EleutherAI/lm-evaluation-harness.git@62b4364dd0c6c96ef33a28dcc57875381f4d2845->-r https://huggingface.co/flunardelli/llm-metaeval/raw/main/requirements.txt (line 3)) (1.4.0)\n", - "Requirement already satisfied: nltk in /usr/local/lib/python3.10/dist-packages (from rouge-score>=0.0.4->lm_eval@ git+https://github.com/EleutherAI/lm-evaluation-harness.git@62b4364dd0c6c96ef33a28dcc57875381f4d2845->-r https://huggingface.co/flunardelli/llm-metaeval/raw/main/requirements.txt (line 3)) (3.9.1)\n", - "Requirement already satisfied: six>=1.14.0 in /usr/local/lib/python3.10/dist-packages (from rouge-score>=0.0.4->lm_eval@ git+https://github.com/EleutherAI/lm-evaluation-harness.git@62b4364dd0c6c96ef33a28dcc57875381f4d2845->-r https://huggingface.co/flunardelli/llm-metaeval/raw/main/requirements.txt (line 3)) (1.16.0)\n", - "Requirement already satisfied: portalocker in /usr/local/lib/python3.10/dist-packages (from sacrebleu>=1.5.0->lm_eval@ git+https://github.com/EleutherAI/lm-evaluation-harness.git@62b4364dd0c6c96ef33a28dcc57875381f4d2845->-r https://huggingface.co/flunardelli/llm-metaeval/raw/main/requirements.txt (line 3)) (3.0.0)\n", - "Requirement already satisfied: regex in /usr/local/lib/python3.10/dist-packages (from sacrebleu>=1.5.0->lm_eval@ git+https://github.com/EleutherAI/lm-evaluation-harness.git@62b4364dd0c6c96ef33a28dcc57875381f4d2845->-r https://huggingface.co/flunardelli/llm-metaeval/raw/main/requirements.txt (line 3)) (2024.9.11)\n", - "Requirement already satisfied: tabulate>=0.8.9 in /usr/local/lib/python3.10/dist-packages (from sacrebleu>=1.5.0->lm_eval@ git+https://github.com/EleutherAI/lm-evaluation-harness.git@62b4364dd0c6c96ef33a28dcc57875381f4d2845->-r https://huggingface.co/flunardelli/llm-metaeval/raw/main/requirements.txt (line 3)) (0.9.0)\n", - "Requirement already satisfied: colorama in /usr/local/lib/python3.10/dist-packages (from sacrebleu>=1.5.0->lm_eval@ git+https://github.com/EleutherAI/lm-evaluation-harness.git@62b4364dd0c6c96ef33a28dcc57875381f4d2845->-r https://huggingface.co/flunardelli/llm-metaeval/raw/main/requirements.txt (line 3)) (0.4.6)\n", - "Requirement already satisfied: lxml in /usr/local/lib/python3.10/dist-packages (from sacrebleu>=1.5.0->lm_eval@ git+https://github.com/EleutherAI/lm-evaluation-harness.git@62b4364dd0c6c96ef33a28dcc57875381f4d2845->-r https://huggingface.co/flunardelli/llm-metaeval/raw/main/requirements.txt (line 3)) (5.3.0)\n", - "Requirement already satisfied: scipy>=1.6.0 in /usr/local/lib/python3.10/dist-packages (from scikit-learn>=0.24.1->lm_eval@ git+https://github.com/EleutherAI/lm-evaluation-harness.git@62b4364dd0c6c96ef33a28dcc57875381f4d2845->-r https://huggingface.co/flunardelli/llm-metaeval/raw/main/requirements.txt (line 3)) (1.13.1)\n", - "Requirement already satisfied: joblib>=1.2.0 in /usr/local/lib/python3.10/dist-packages (from scikit-learn>=0.24.1->lm_eval@ git+https://github.com/EleutherAI/lm-evaluation-harness.git@62b4364dd0c6c96ef33a28dcc57875381f4d2845->-r https://huggingface.co/flunardelli/llm-metaeval/raw/main/requirements.txt (line 3)) (1.4.2)\n", - "Requirement already satisfied: threadpoolctl>=3.1.0 in /usr/local/lib/python3.10/dist-packages (from scikit-learn>=0.24.1->lm_eval@ git+https://github.com/EleutherAI/lm-evaluation-harness.git@62b4364dd0c6c96ef33a28dcc57875381f4d2845->-r https://huggingface.co/flunardelli/llm-metaeval/raw/main/requirements.txt (line 3)) (3.5.0)\n", - "Requirement already satisfied: networkx in /usr/local/lib/python3.10/dist-packages (from torch>=1.10.0->accelerate==1.1.1->-r https://huggingface.co/flunardelli/llm-metaeval/raw/main/requirements.txt (line 1)) (3.4.2)\n", - "Requirement already satisfied: jinja2 in /usr/local/lib/python3.10/dist-packages (from torch>=1.10.0->accelerate==1.1.1->-r https://huggingface.co/flunardelli/llm-metaeval/raw/main/requirements.txt (line 1)) (3.1.4)\n", - "Requirement already satisfied: sympy==1.13.1 in /usr/local/lib/python3.10/dist-packages (from torch>=1.10.0->accelerate==1.1.1->-r https://huggingface.co/flunardelli/llm-metaeval/raw/main/requirements.txt (line 1)) (1.13.1)\n", - "Requirement already satisfied: mpmath<1.4,>=1.1.0 in /usr/local/lib/python3.10/dist-packages (from sympy==1.13.1->torch>=1.10.0->accelerate==1.1.1->-r https://huggingface.co/flunardelli/llm-metaeval/raw/main/requirements.txt (line 1)) (1.3.0)\n", - "Requirement already satisfied: tokenizers<0.21,>=0.20 in /usr/local/lib/python3.10/dist-packages (from transformers>=4.1->lm_eval@ git+https://github.com/EleutherAI/lm-evaluation-harness.git@62b4364dd0c6c96ef33a28dcc57875381f4d2845->-r https://huggingface.co/flunardelli/llm-metaeval/raw/main/requirements.txt (line 3)) (0.20.3)\n", - "Requirement already satisfied: attrs>=19.2.0 in /usr/local/lib/python3.10/dist-packages (from jsonlines->lm_eval@ git+https://github.com/EleutherAI/lm-evaluation-harness.git@62b4364dd0c6c96ef33a28dcc57875381f4d2845->-r https://huggingface.co/flunardelli/llm-metaeval/raw/main/requirements.txt (line 3)) (24.2.0)\n", - "Requirement already satisfied: setuptools>=38.3.0 in /usr/local/lib/python3.10/dist-packages (from pytablewriter->lm_eval@ git+https://github.com/EleutherAI/lm-evaluation-harness.git@62b4364dd0c6c96ef33a28dcc57875381f4d2845->-r https://huggingface.co/flunardelli/llm-metaeval/raw/main/requirements.txt (line 3)) (75.1.0)\n", - "Requirement already satisfied: DataProperty<2,>=1.0.1 in /usr/local/lib/python3.10/dist-packages (from pytablewriter->lm_eval@ git+https://github.com/EleutherAI/lm-evaluation-harness.git@62b4364dd0c6c96ef33a28dcc57875381f4d2845->-r https://huggingface.co/flunardelli/llm-metaeval/raw/main/requirements.txt (line 3)) (1.0.1)\n", - "Requirement already satisfied: mbstrdecoder<2,>=1.0.0 in /usr/local/lib/python3.10/dist-packages (from pytablewriter->lm_eval@ git+https://github.com/EleutherAI/lm-evaluation-harness.git@62b4364dd0c6c96ef33a28dcc57875381f4d2845->-r https://huggingface.co/flunardelli/llm-metaeval/raw/main/requirements.txt (line 3)) (1.1.3)\n", - "Requirement already satisfied: pathvalidate<4,>=2.3.0 in /usr/local/lib/python3.10/dist-packages (from pytablewriter->lm_eval@ git+https://github.com/EleutherAI/lm-evaluation-harness.git@62b4364dd0c6c96ef33a28dcc57875381f4d2845->-r https://huggingface.co/flunardelli/llm-metaeval/raw/main/requirements.txt (line 3)) (3.2.1)\n", - "Requirement already satisfied: tabledata<2,>=1.3.1 in /usr/local/lib/python3.10/dist-packages (from pytablewriter->lm_eval@ git+https://github.com/EleutherAI/lm-evaluation-harness.git@62b4364dd0c6c96ef33a28dcc57875381f4d2845->-r https://huggingface.co/flunardelli/llm-metaeval/raw/main/requirements.txt (line 3)) (1.3.3)\n", - "Requirement already satisfied: tcolorpy<1,>=0.0.5 in /usr/local/lib/python3.10/dist-packages (from pytablewriter->lm_eval@ git+https://github.com/EleutherAI/lm-evaluation-harness.git@62b4364dd0c6c96ef33a28dcc57875381f4d2845->-r https://huggingface.co/flunardelli/llm-metaeval/raw/main/requirements.txt (line 3)) (0.1.6)\n", - "Requirement already satisfied: typepy<2,>=1.3.2 in /usr/local/lib/python3.10/dist-packages (from typepy[datetime]<2,>=1.3.2->pytablewriter->lm_eval@ git+https://github.com/EleutherAI/lm-evaluation-harness.git@62b4364dd0c6c96ef33a28dcc57875381f4d2845->-r https://huggingface.co/flunardelli/llm-metaeval/raw/main/requirements.txt (line 3)) (1.3.2)\n", - "Requirement already satisfied: aiohappyeyeballs>=2.3.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets>=2.16.0->lm_eval@ git+https://github.com/EleutherAI/lm-evaluation-harness.git@62b4364dd0c6c96ef33a28dcc57875381f4d2845->-r https://huggingface.co/flunardelli/llm-metaeval/raw/main/requirements.txt (line 3)) (2.4.3)\n", - "Requirement already satisfied: aiosignal>=1.1.2 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets>=2.16.0->lm_eval@ git+https://github.com/EleutherAI/lm-evaluation-harness.git@62b4364dd0c6c96ef33a28dcc57875381f4d2845->-r https://huggingface.co/flunardelli/llm-metaeval/raw/main/requirements.txt (line 3)) (1.3.1)\n", - "Requirement already satisfied: frozenlist>=1.1.1 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets>=2.16.0->lm_eval@ git+https://github.com/EleutherAI/lm-evaluation-harness.git@62b4364dd0c6c96ef33a28dcc57875381f4d2845->-r https://huggingface.co/flunardelli/llm-metaeval/raw/main/requirements.txt (line 3)) (1.5.0)\n", - "Requirement already satisfied: multidict<7.0,>=4.5 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets>=2.16.0->lm_eval@ git+https://github.com/EleutherAI/lm-evaluation-harness.git@62b4364dd0c6c96ef33a28dcc57875381f4d2845->-r https://huggingface.co/flunardelli/llm-metaeval/raw/main/requirements.txt (line 3)) (6.1.0)\n", - "Requirement already satisfied: propcache>=0.2.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets>=2.16.0->lm_eval@ git+https://github.com/EleutherAI/lm-evaluation-harness.git@62b4364dd0c6c96ef33a28dcc57875381f4d2845->-r https://huggingface.co/flunardelli/llm-metaeval/raw/main/requirements.txt (line 3)) (0.2.0)\n", - "Requirement already satisfied: yarl<2.0,>=1.17.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets>=2.16.0->lm_eval@ git+https://github.com/EleutherAI/lm-evaluation-harness.git@62b4364dd0c6c96ef33a28dcc57875381f4d2845->-r https://huggingface.co/flunardelli/llm-metaeval/raw/main/requirements.txt (line 3)) (1.17.1)\n", - "Requirement already satisfied: async-timeout<6.0,>=4.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets>=2.16.0->lm_eval@ git+https://github.com/EleutherAI/lm-evaluation-harness.git@62b4364dd0c6c96ef33a28dcc57875381f4d2845->-r https://huggingface.co/flunardelli/llm-metaeval/raw/main/requirements.txt (line 3)) (4.0.3)\n", - "Requirement already satisfied: chardet<6,>=3.0.4 in /usr/local/lib/python3.10/dist-packages (from mbstrdecoder<2,>=1.0.0->pytablewriter->lm_eval@ git+https://github.com/EleutherAI/lm-evaluation-harness.git@62b4364dd0c6c96ef33a28dcc57875381f4d2845->-r https://huggingface.co/flunardelli/llm-metaeval/raw/main/requirements.txt (line 3)) (5.2.0)\n", - "Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.10/dist-packages (from requests>=2.32.2->datasets>=2.16.0->lm_eval@ git+https://github.com/EleutherAI/lm-evaluation-harness.git@62b4364dd0c6c96ef33a28dcc57875381f4d2845->-r https://huggingface.co/flunardelli/llm-metaeval/raw/main/requirements.txt (line 3)) (3.4.0)\n", - "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests>=2.32.2->datasets>=2.16.0->lm_eval@ git+https://github.com/EleutherAI/lm-evaluation-harness.git@62b4364dd0c6c96ef33a28dcc57875381f4d2845->-r https://huggingface.co/flunardelli/llm-metaeval/raw/main/requirements.txt (line 3)) (3.10)\n", - "Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests>=2.32.2->datasets>=2.16.0->lm_eval@ git+https://github.com/EleutherAI/lm-evaluation-harness.git@62b4364dd0c6c96ef33a28dcc57875381f4d2845->-r https://huggingface.co/flunardelli/llm-metaeval/raw/main/requirements.txt (line 3)) (2.2.3)\n", - "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests>=2.32.2->datasets>=2.16.0->lm_eval@ git+https://github.com/EleutherAI/lm-evaluation-harness.git@62b4364dd0c6c96ef33a28dcc57875381f4d2845->-r https://huggingface.co/flunardelli/llm-metaeval/raw/main/requirements.txt (line 3)) (2024.8.30)\n", - "Requirement already satisfied: python-dateutil<3.0.0,>=2.8.0 in /usr/local/lib/python3.10/dist-packages (from typepy[datetime]<2,>=1.3.2->pytablewriter->lm_eval@ git+https://github.com/EleutherAI/lm-evaluation-harness.git@62b4364dd0c6c96ef33a28dcc57875381f4d2845->-r https://huggingface.co/flunardelli/llm-metaeval/raw/main/requirements.txt (line 3)) (2.8.2)\n", - "Requirement already satisfied: pytz>=2018.9 in /usr/local/lib/python3.10/dist-packages (from typepy[datetime]<2,>=1.3.2->pytablewriter->lm_eval@ git+https://github.com/EleutherAI/lm-evaluation-harness.git@62b4364dd0c6c96ef33a28dcc57875381f4d2845->-r https://huggingface.co/flunardelli/llm-metaeval/raw/main/requirements.txt (line 3)) (2024.2)\n", - "Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.10/dist-packages (from jinja2->torch>=1.10.0->accelerate==1.1.1->-r https://huggingface.co/flunardelli/llm-metaeval/raw/main/requirements.txt (line 1)) (3.0.2)\n", - "Requirement already satisfied: click in /usr/local/lib/python3.10/dist-packages (from nltk->rouge-score>=0.0.4->lm_eval@ git+https://github.com/EleutherAI/lm-evaluation-harness.git@62b4364dd0c6c96ef33a28dcc57875381f4d2845->-r https://huggingface.co/flunardelli/llm-metaeval/raw/main/requirements.txt (line 3)) (8.1.7)\n", - "Requirement already satisfied: tzdata>=2022.7 in /usr/local/lib/python3.10/dist-packages (from pandas->datasets>=2.16.0->lm_eval@ git+https://github.com/EleutherAI/lm-evaluation-harness.git@62b4364dd0c6c96ef33a28dcc57875381f4d2845->-r https://huggingface.co/flunardelli/llm-metaeval/raw/main/requirements.txt (line 3)) (2024.2)\n" - ] - } - ] - }, - { - "cell_type": "code", - "source": [ - "%%writefile requirements.txt\n", - "accelerate==1.1.1\n", - "bitsandbytes==0.44.1\n", - "lm_eval @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@v0.4.5\n" - ], - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "KQhZNknaoGxD", - "outputId": "71009cd8-08ce-41ca-dd23-6001721e69e2" - }, - "execution_count": null, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "Overwriting requirements.txt\n" - ] - } - ] - }, - { - "cell_type": "code", - "source": [ - "!pip install -r requirements.txt" - ], - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "2I850FIsCVNw", - "outputId": "74bcc8a8-1f48-4d60-9216-941c24cec506" - }, - "execution_count": null, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "Collecting lm_eval@ git+https://github.com/EleutherAI/lm-evaluation-harness.git@v0.4.5 (from -r requirements.txt (line 3))\n", - " Cloning https://github.com/EleutherAI/lm-evaluation-harness.git (to revision v0.4.5) to /tmp/pip-install-bh1erx9a/lm-eval_146ffc5995ba4667919c510a952f3f6a\n", - " Running command git clone --filter=blob:none --quiet https://github.com/EleutherAI/lm-evaluation-harness.git /tmp/pip-install-bh1erx9a/lm-eval_146ffc5995ba4667919c510a952f3f6a\n", - " Running command git checkout -q 0845b588303f1f59af98dd1c5bdbd78a9e75a1e2\n", - " Resolved https://github.com/EleutherAI/lm-evaluation-harness.git to commit 0845b588303f1f59af98dd1c5bdbd78a9e75a1e2\n", - " Installing build dependencies ... \u001b[?25l\u001b[?25hdone\n", - " Getting requirements to build wheel ... \u001b[?25l\u001b[?25hdone\n", - " Preparing metadata (pyproject.toml) ... \u001b[?25l\u001b[?25hdone\n", - "Requirement already satisfied: accelerate==1.1.1 in /usr/local/lib/python3.10/dist-packages (from -r requirements.txt (line 1)) (1.1.1)\n", - "Requirement already satisfied: bitsandbytes==0.44.1 in /usr/local/lib/python3.10/dist-packages (from -r requirements.txt (line 2)) (0.44.1)\n", - "Requirement already satisfied: huggingface-hub>=0.21.0 in /usr/local/lib/python3.10/dist-packages (from accelerate==1.1.1->-r requirements.txt (line 1)) (0.26.2)\n", - "Requirement already satisfied: numpy<3.0.0,>=1.17 in /usr/local/lib/python3.10/dist-packages (from accelerate==1.1.1->-r requirements.txt (line 1)) (1.26.4)\n", - "Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.10/dist-packages (from accelerate==1.1.1->-r requirements.txt (line 1)) (24.2)\n", - "Requirement already satisfied: psutil in /usr/local/lib/python3.10/dist-packages (from accelerate==1.1.1->-r requirements.txt (line 1)) (5.9.5)\n", - "Requirement already satisfied: pyyaml in /usr/local/lib/python3.10/dist-packages (from accelerate==1.1.1->-r requirements.txt (line 1)) (6.0.2)\n", - "Requirement already satisfied: safetensors>=0.4.3 in /usr/local/lib/python3.10/dist-packages (from accelerate==1.1.1->-r requirements.txt (line 1)) (0.4.5)\n", - "Requirement already satisfied: torch>=1.10.0 in /usr/local/lib/python3.10/dist-packages (from accelerate==1.1.1->-r requirements.txt (line 1)) (2.5.1+cu121)\n", - "Requirement already satisfied: evaluate in /usr/local/lib/python3.10/dist-packages (from lm_eval@ git+https://github.com/EleutherAI/lm-evaluation-harness.git@v0.4.5->-r requirements.txt (line 3)) (0.4.3)\n", - "Requirement already satisfied: datasets>=2.16.0 in /usr/local/lib/python3.10/dist-packages (from lm_eval@ git+https://github.com/EleutherAI/lm-evaluation-harness.git@v0.4.5->-r requirements.txt (line 3)) (3.1.0)\n", - "Requirement already satisfied: jsonlines in /usr/local/lib/python3.10/dist-packages (from lm_eval@ git+https://github.com/EleutherAI/lm-evaluation-harness.git@v0.4.5->-r requirements.txt (line 3)) (4.0.0)\n", - "Requirement already satisfied: numexpr in /usr/local/lib/python3.10/dist-packages (from lm_eval@ git+https://github.com/EleutherAI/lm-evaluation-harness.git@v0.4.5->-r requirements.txt (line 3)) (2.10.1)\n", - "Requirement already satisfied: peft>=0.2.0 in /usr/local/lib/python3.10/dist-packages (from lm_eval@ git+https://github.com/EleutherAI/lm-evaluation-harness.git@v0.4.5->-r requirements.txt (line 3)) (0.13.2)\n", - "Requirement already satisfied: pybind11>=2.6.2 in /usr/local/lib/python3.10/dist-packages (from lm_eval@ git+https://github.com/EleutherAI/lm-evaluation-harness.git@v0.4.5->-r requirements.txt (line 3)) (2.13.6)\n", - "Requirement already satisfied: pytablewriter in /usr/local/lib/python3.10/dist-packages (from lm_eval@ git+https://github.com/EleutherAI/lm-evaluation-harness.git@v0.4.5->-r requirements.txt (line 3)) (1.2.0)\n", - "Requirement already satisfied: rouge-score>=0.0.4 in /usr/local/lib/python3.10/dist-packages (from lm_eval@ git+https://github.com/EleutherAI/lm-evaluation-harness.git@v0.4.5->-r requirements.txt (line 3)) (0.1.2)\n", - "Requirement already satisfied: sacrebleu>=1.5.0 in /usr/local/lib/python3.10/dist-packages (from lm_eval@ git+https://github.com/EleutherAI/lm-evaluation-harness.git@v0.4.5->-r requirements.txt (line 3)) (2.4.3)\n", - "Requirement already satisfied: scikit-learn>=0.24.1 in /usr/local/lib/python3.10/dist-packages (from lm_eval@ git+https://github.com/EleutherAI/lm-evaluation-harness.git@v0.4.5->-r requirements.txt (line 3)) (1.5.2)\n", - "Requirement already satisfied: sqlitedict in /usr/local/lib/python3.10/dist-packages (from lm_eval@ git+https://github.com/EleutherAI/lm-evaluation-harness.git@v0.4.5->-r requirements.txt (line 3)) (2.1.0)\n", - "Requirement already satisfied: tqdm-multiprocess in /usr/local/lib/python3.10/dist-packages (from lm_eval@ git+https://github.com/EleutherAI/lm-evaluation-harness.git@v0.4.5->-r requirements.txt (line 3)) (0.0.11)\n", - "Requirement already satisfied: transformers>=4.1 in /usr/local/lib/python3.10/dist-packages (from lm_eval@ git+https://github.com/EleutherAI/lm-evaluation-harness.git@v0.4.5->-r requirements.txt (line 3)) (4.46.2)\n", - "Requirement already satisfied: zstandard in /usr/local/lib/python3.10/dist-packages (from lm_eval@ git+https://github.com/EleutherAI/lm-evaluation-harness.git@v0.4.5->-r requirements.txt (line 3)) (0.23.0)\n", - "Requirement already satisfied: dill in /usr/local/lib/python3.10/dist-packages (from lm_eval@ git+https://github.com/EleutherAI/lm-evaluation-harness.git@v0.4.5->-r requirements.txt (line 3)) (0.3.8)\n", - "Requirement already satisfied: word2number in /usr/local/lib/python3.10/dist-packages (from lm_eval@ git+https://github.com/EleutherAI/lm-evaluation-harness.git@v0.4.5->-r requirements.txt (line 3)) (1.1)\n", - "Requirement already satisfied: more-itertools in /usr/local/lib/python3.10/dist-packages (from lm_eval@ git+https://github.com/EleutherAI/lm-evaluation-harness.git@v0.4.5->-r requirements.txt (line 3)) (10.5.0)\n", - "Requirement already satisfied: filelock in /usr/local/lib/python3.10/dist-packages (from datasets>=2.16.0->lm_eval@ git+https://github.com/EleutherAI/lm-evaluation-harness.git@v0.4.5->-r requirements.txt (line 3)) (3.16.1)\n", - "Requirement already satisfied: pyarrow>=15.0.0 in /usr/local/lib/python3.10/dist-packages (from datasets>=2.16.0->lm_eval@ git+https://github.com/EleutherAI/lm-evaluation-harness.git@v0.4.5->-r requirements.txt (line 3)) (17.0.0)\n", - "Requirement already satisfied: pandas in /usr/local/lib/python3.10/dist-packages (from datasets>=2.16.0->lm_eval@ git+https://github.com/EleutherAI/lm-evaluation-harness.git@v0.4.5->-r requirements.txt (line 3)) (2.2.2)\n", - "Requirement already satisfied: requests>=2.32.2 in /usr/local/lib/python3.10/dist-packages (from datasets>=2.16.0->lm_eval@ git+https://github.com/EleutherAI/lm-evaluation-harness.git@v0.4.5->-r requirements.txt (line 3)) (2.32.3)\n", - "Requirement already satisfied: tqdm>=4.66.3 in /usr/local/lib/python3.10/dist-packages (from datasets>=2.16.0->lm_eval@ git+https://github.com/EleutherAI/lm-evaluation-harness.git@v0.4.5->-r requirements.txt (line 3)) (4.66.6)\n", - "Requirement already satisfied: xxhash in /usr/local/lib/python3.10/dist-packages (from datasets>=2.16.0->lm_eval@ git+https://github.com/EleutherAI/lm-evaluation-harness.git@v0.4.5->-r requirements.txt (line 3)) (3.5.0)\n", - "Requirement already satisfied: multiprocess<0.70.17 in /usr/local/lib/python3.10/dist-packages (from datasets>=2.16.0->lm_eval@ git+https://github.com/EleutherAI/lm-evaluation-harness.git@v0.4.5->-r requirements.txt (line 3)) (0.70.16)\n", - "Requirement already satisfied: fsspec<=2024.9.0,>=2023.1.0 in /usr/local/lib/python3.10/dist-packages (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets>=2.16.0->lm_eval@ git+https://github.com/EleutherAI/lm-evaluation-harness.git@v0.4.5->-r requirements.txt (line 3)) (2024.9.0)\n", - "Requirement already satisfied: aiohttp in /usr/local/lib/python3.10/dist-packages (from datasets>=2.16.0->lm_eval@ git+https://github.com/EleutherAI/lm-evaluation-harness.git@v0.4.5->-r requirements.txt (line 3)) (3.11.1)\n", - "Requirement already satisfied: typing-extensions>=3.7.4.3 in /usr/local/lib/python3.10/dist-packages (from huggingface-hub>=0.21.0->accelerate==1.1.1->-r requirements.txt (line 1)) (4.12.2)\n", - "Requirement already satisfied: absl-py in /usr/local/lib/python3.10/dist-packages (from rouge-score>=0.0.4->lm_eval@ git+https://github.com/EleutherAI/lm-evaluation-harness.git@v0.4.5->-r requirements.txt (line 3)) (1.4.0)\n", - "Requirement already satisfied: nltk in /usr/local/lib/python3.10/dist-packages (from rouge-score>=0.0.4->lm_eval@ git+https://github.com/EleutherAI/lm-evaluation-harness.git@v0.4.5->-r requirements.txt (line 3)) (3.9.1)\n", - "Requirement already satisfied: six>=1.14.0 in /usr/local/lib/python3.10/dist-packages (from rouge-score>=0.0.4->lm_eval@ git+https://github.com/EleutherAI/lm-evaluation-harness.git@v0.4.5->-r requirements.txt (line 3)) (1.16.0)\n", - "Requirement already satisfied: portalocker in /usr/local/lib/python3.10/dist-packages (from sacrebleu>=1.5.0->lm_eval@ git+https://github.com/EleutherAI/lm-evaluation-harness.git@v0.4.5->-r requirements.txt (line 3)) (3.0.0)\n", - "Requirement already satisfied: regex in /usr/local/lib/python3.10/dist-packages (from sacrebleu>=1.5.0->lm_eval@ git+https://github.com/EleutherAI/lm-evaluation-harness.git@v0.4.5->-r requirements.txt (line 3)) (2024.9.11)\n", - "Requirement already satisfied: tabulate>=0.8.9 in /usr/local/lib/python3.10/dist-packages (from sacrebleu>=1.5.0->lm_eval@ git+https://github.com/EleutherAI/lm-evaluation-harness.git@v0.4.5->-r requirements.txt (line 3)) (0.9.0)\n", - "Requirement already satisfied: colorama in /usr/local/lib/python3.10/dist-packages (from sacrebleu>=1.5.0->lm_eval@ git+https://github.com/EleutherAI/lm-evaluation-harness.git@v0.4.5->-r requirements.txt (line 3)) (0.4.6)\n", - "Requirement already satisfied: lxml in /usr/local/lib/python3.10/dist-packages (from sacrebleu>=1.5.0->lm_eval@ git+https://github.com/EleutherAI/lm-evaluation-harness.git@v0.4.5->-r requirements.txt (line 3)) (5.3.0)\n", - "Requirement already satisfied: scipy>=1.6.0 in /usr/local/lib/python3.10/dist-packages (from scikit-learn>=0.24.1->lm_eval@ git+https://github.com/EleutherAI/lm-evaluation-harness.git@v0.4.5->-r requirements.txt (line 3)) (1.13.1)\n", - "Requirement already satisfied: joblib>=1.2.0 in /usr/local/lib/python3.10/dist-packages (from scikit-learn>=0.24.1->lm_eval@ git+https://github.com/EleutherAI/lm-evaluation-harness.git@v0.4.5->-r requirements.txt (line 3)) (1.4.2)\n", - "Requirement already satisfied: threadpoolctl>=3.1.0 in /usr/local/lib/python3.10/dist-packages (from scikit-learn>=0.24.1->lm_eval@ git+https://github.com/EleutherAI/lm-evaluation-harness.git@v0.4.5->-r requirements.txt (line 3)) (3.5.0)\n", - "Requirement already satisfied: networkx in /usr/local/lib/python3.10/dist-packages (from torch>=1.10.0->accelerate==1.1.1->-r requirements.txt (line 1)) (3.4.2)\n", - "Requirement already satisfied: jinja2 in /usr/local/lib/python3.10/dist-packages (from torch>=1.10.0->accelerate==1.1.1->-r requirements.txt (line 1)) (3.1.4)\n", - "Requirement already satisfied: sympy==1.13.1 in /usr/local/lib/python3.10/dist-packages (from torch>=1.10.0->accelerate==1.1.1->-r requirements.txt (line 1)) (1.13.1)\n", - "Requirement already satisfied: mpmath<1.4,>=1.1.0 in /usr/local/lib/python3.10/dist-packages (from sympy==1.13.1->torch>=1.10.0->accelerate==1.1.1->-r requirements.txt (line 1)) (1.3.0)\n", - "Requirement already satisfied: tokenizers<0.21,>=0.20 in /usr/local/lib/python3.10/dist-packages (from transformers>=4.1->lm_eval@ git+https://github.com/EleutherAI/lm-evaluation-harness.git@v0.4.5->-r requirements.txt (line 3)) (0.20.3)\n", - "Requirement already satisfied: attrs>=19.2.0 in /usr/local/lib/python3.10/dist-packages (from jsonlines->lm_eval@ git+https://github.com/EleutherAI/lm-evaluation-harness.git@v0.4.5->-r requirements.txt (line 3)) (24.2.0)\n", - "Requirement already satisfied: setuptools>=38.3.0 in /usr/local/lib/python3.10/dist-packages (from pytablewriter->lm_eval@ git+https://github.com/EleutherAI/lm-evaluation-harness.git@v0.4.5->-r requirements.txt (line 3)) (75.1.0)\n", - "Requirement already satisfied: DataProperty<2,>=1.0.1 in /usr/local/lib/python3.10/dist-packages (from pytablewriter->lm_eval@ git+https://github.com/EleutherAI/lm-evaluation-harness.git@v0.4.5->-r requirements.txt (line 3)) (1.0.1)\n", - "Requirement already satisfied: mbstrdecoder<2,>=1.0.0 in /usr/local/lib/python3.10/dist-packages (from pytablewriter->lm_eval@ git+https://github.com/EleutherAI/lm-evaluation-harness.git@v0.4.5->-r requirements.txt (line 3)) (1.1.3)\n", - "Requirement already satisfied: pathvalidate<4,>=2.3.0 in /usr/local/lib/python3.10/dist-packages (from pytablewriter->lm_eval@ git+https://github.com/EleutherAI/lm-evaluation-harness.git@v0.4.5->-r requirements.txt (line 3)) (3.2.1)\n", - "Requirement already satisfied: tabledata<2,>=1.3.1 in /usr/local/lib/python3.10/dist-packages (from pytablewriter->lm_eval@ git+https://github.com/EleutherAI/lm-evaluation-harness.git@v0.4.5->-r requirements.txt (line 3)) (1.3.3)\n", - "Requirement already satisfied: tcolorpy<1,>=0.0.5 in /usr/local/lib/python3.10/dist-packages (from pytablewriter->lm_eval@ git+https://github.com/EleutherAI/lm-evaluation-harness.git@v0.4.5->-r requirements.txt (line 3)) (0.1.6)\n", - "Requirement already satisfied: typepy<2,>=1.3.2 in /usr/local/lib/python3.10/dist-packages (from typepy[datetime]<2,>=1.3.2->pytablewriter->lm_eval@ git+https://github.com/EleutherAI/lm-evaluation-harness.git@v0.4.5->-r requirements.txt (line 3)) (1.3.2)\n", - "Requirement already satisfied: aiohappyeyeballs>=2.3.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets>=2.16.0->lm_eval@ git+https://github.com/EleutherAI/lm-evaluation-harness.git@v0.4.5->-r requirements.txt (line 3)) (2.4.3)\n", - "Requirement already satisfied: aiosignal>=1.1.2 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets>=2.16.0->lm_eval@ git+https://github.com/EleutherAI/lm-evaluation-harness.git@v0.4.5->-r requirements.txt (line 3)) (1.3.1)\n", - "Requirement already satisfied: frozenlist>=1.1.1 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets>=2.16.0->lm_eval@ git+https://github.com/EleutherAI/lm-evaluation-harness.git@v0.4.5->-r requirements.txt (line 3)) (1.5.0)\n", - "Requirement already satisfied: multidict<7.0,>=4.5 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets>=2.16.0->lm_eval@ git+https://github.com/EleutherAI/lm-evaluation-harness.git@v0.4.5->-r requirements.txt (line 3)) (6.1.0)\n", - "Requirement already satisfied: propcache>=0.2.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets>=2.16.0->lm_eval@ git+https://github.com/EleutherAI/lm-evaluation-harness.git@v0.4.5->-r requirements.txt (line 3)) (0.2.0)\n", - "Requirement already satisfied: yarl<2.0,>=1.17.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets>=2.16.0->lm_eval@ git+https://github.com/EleutherAI/lm-evaluation-harness.git@v0.4.5->-r requirements.txt (line 3)) (1.17.1)\n", - "Requirement already satisfied: async-timeout<6.0,>=4.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets>=2.16.0->lm_eval@ git+https://github.com/EleutherAI/lm-evaluation-harness.git@v0.4.5->-r requirements.txt (line 3)) (4.0.3)\n", - "Requirement already satisfied: chardet<6,>=3.0.4 in /usr/local/lib/python3.10/dist-packages (from mbstrdecoder<2,>=1.0.0->pytablewriter->lm_eval@ git+https://github.com/EleutherAI/lm-evaluation-harness.git@v0.4.5->-r requirements.txt (line 3)) (5.2.0)\n", - "Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.10/dist-packages (from requests>=2.32.2->datasets>=2.16.0->lm_eval@ git+https://github.com/EleutherAI/lm-evaluation-harness.git@v0.4.5->-r requirements.txt (line 3)) (3.4.0)\n", - "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests>=2.32.2->datasets>=2.16.0->lm_eval@ git+https://github.com/EleutherAI/lm-evaluation-harness.git@v0.4.5->-r requirements.txt (line 3)) (3.10)\n", - "Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests>=2.32.2->datasets>=2.16.0->lm_eval@ git+https://github.com/EleutherAI/lm-evaluation-harness.git@v0.4.5->-r requirements.txt (line 3)) (2.2.3)\n", - "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests>=2.32.2->datasets>=2.16.0->lm_eval@ git+https://github.com/EleutherAI/lm-evaluation-harness.git@v0.4.5->-r requirements.txt (line 3)) (2024.8.30)\n", - "Requirement already satisfied: python-dateutil<3.0.0,>=2.8.0 in /usr/local/lib/python3.10/dist-packages (from typepy[datetime]<2,>=1.3.2->pytablewriter->lm_eval@ git+https://github.com/EleutherAI/lm-evaluation-harness.git@v0.4.5->-r requirements.txt (line 3)) (2.8.2)\n", - "Requirement already satisfied: pytz>=2018.9 in /usr/local/lib/python3.10/dist-packages (from typepy[datetime]<2,>=1.3.2->pytablewriter->lm_eval@ git+https://github.com/EleutherAI/lm-evaluation-harness.git@v0.4.5->-r requirements.txt (line 3)) (2024.2)\n", - "Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.10/dist-packages (from jinja2->torch>=1.10.0->accelerate==1.1.1->-r requirements.txt (line 1)) (3.0.2)\n", - "Requirement already satisfied: click in /usr/local/lib/python3.10/dist-packages (from nltk->rouge-score>=0.0.4->lm_eval@ git+https://github.com/EleutherAI/lm-evaluation-harness.git@v0.4.5->-r requirements.txt (line 3)) (8.1.7)\n", - "Requirement already satisfied: tzdata>=2022.7 in /usr/local/lib/python3.10/dist-packages (from pandas->datasets>=2.16.0->lm_eval@ git+https://github.com/EleutherAI/lm-evaluation-harness.git@v0.4.5->-r requirements.txt (line 3)) (2024.2)\n" - ] - } - ] - }, - { - "cell_type": "code", - "source": [ - "YAML_mmlu_en_us_string = \"\"\"\n", - "task: mmlu_en_us\n", - "dataset_path: cais/mmlu\n", - "dataset_name: all\n", - "description: \"MMLU dataset in English\"\n", - "test_split: test\n", - "fewshot_split: dev\n", - "fewshot_config:\n", - " sampler: first_n\n", - "output_type: multiple_choice\n", - "doc_to_text: \"{{question.strip()}}\\nA. {{choices[0]}}\\nB. {{choices[1]}}\\nC. {{choices[2]}}\\nD. {{choices[3]}}\\nAnswer:\"\n", - "doc_to_choice: [\"A\", \"B\", \"C\", \"D\"]\n", - "doc_to_target: answer\n", - "metric_list:\n", - " - metric: acc\n", - " aggregation: mean\n", - " higher_is_better: true\n", - " - metric: acc_norm\n", - " aggregation: mean\n", - " higher_is_better: true\n", - "\"\"\"\n", - "with open(\"mmlu_en_us.yaml\", \"w\") as f:\n", - " f.write(YAML_mmlu_en_us_string)" - ], - "metadata": { - "id": "xP0cC_sHih7C" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "!lm_eval --model hf \\\n", - "--model_args pretrained=meta-llama/Llama-3.2-3B-Instruct \\\n", - "--tasks mmlu \\\n", - "--output output/mmlu/ \\\n", - "--use_cache cache \\\n", - "--log_samples" - ], - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "oIACOAhDW5ow", - "outputId": "4432c904-941c-4bc8-d48a-c1ce7ad49006" - }, - "execution_count": null, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "2024-11-19 01:29:44.260278: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered\n", - "2024-11-19 01:29:44.290343: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered\n", - "2024-11-19 01:29:44.299365: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered\n", - "2024-11-19 01:29:44.322349: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.\n", - "To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.\n", - "2024-11-19 01:29:45.890423: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT\n", - "2024-11-19:01:29:48,414 INFO [__main__.py:279] Verbosity set to INFO\n", - "2024-11-19:01:30:01,682 INFO [__main__.py:376] Selected Tasks: ['mmlu']\n", - "2024-11-19:01:30:01,684 INFO [evaluator.py:164] Setting random seed to 0 | Setting numpy seed to 1234 | Setting torch manual seed to 1234 | Setting fewshot manual seed to 1234\n", - "2024-11-19:01:30:01,684 INFO [evaluator.py:201] Initializing hf model, with arguments: {'pretrained': 'meta-llama/Llama-3.2-3B-Instruct'}\n", - "2024-11-19:01:30:01,691 INFO [huggingface.py:131] Using device 'cuda'\n", - "config.json: 100% 878/878 [00:00<00:00, 3.85MB/s]\n", - "tokenizer_config.json: 100% 54.5k/54.5k [00:00<00:00, 54.4MB/s]\n", - "tokenizer.json: 100% 9.09M/9.09M [00:00<00:00, 23.8MB/s]\n", - "special_tokens_map.json: 100% 296/296 [00:00<00:00, 1.71MB/s]\n", - "2024-11-19:01:30:03,142 INFO [huggingface.py:368] Model parallel was set to False, max memory was not set, and device map was set to {'': 'cuda'}\n", - "model.safetensors.index.json: 100% 20.9k/20.9k [00:00<00:00, 64.4MB/s]\n", - "Downloading shards: 0% 0/2 [00:00\n", - " sys.exit(cli_evaluate())\n", - " File \"/usr/local/lib/python3.10/dist-packages/lm_eval/__main__.py\", line 382, in cli_evaluate\n", - " results = evaluator.simple_evaluate(\n", - " File \"/usr/local/lib/python3.10/dist-packages/lm_eval/utils.py\", line 397, in _wrapper\n", - " return fn(*args, **kwargs)\n", - " File \"/usr/local/lib/python3.10/dist-packages/lm_eval/evaluator.py\", line 204, in simple_evaluate\n", - " lm = lm_eval.api.registry.get_model(model).create_from_arg_string(\n", - " File \"/usr/local/lib/python3.10/dist-packages/lm_eval/api/model.py\", line 147, in create_from_arg_string\n", - " return cls(**args, **args2)\n", - " File \"/usr/local/lib/python3.10/dist-packages/lm_eval/models/huggingface.py\", line 184, in __init__\n", - " self._create_model(\n", - " File \"/usr/local/lib/python3.10/dist-packages/lm_eval/models/huggingface.py\", line 576, in _create_model\n", - " self._model = self.AUTO_MODEL_CLASS.from_pretrained(\n", - " File \"/usr/local/lib/python3.10/dist-packages/transformers/models/auto/auto_factory.py\", line 564, in from_pretrained\n", - " return model_class.from_pretrained(\n", - " File \"/usr/local/lib/python3.10/dist-packages/transformers/modeling_utils.py\", line 3974, in from_pretrained\n", - " resolved_archive_file, sharded_metadata = get_checkpoint_shard_files(\n", - " File \"/usr/local/lib/python3.10/dist-packages/transformers/utils/hub.py\", line 1098, in get_checkpoint_shard_files\n", - " cached_filename = cached_file(\n", - " File \"/usr/local/lib/python3.10/dist-packages/transformers/utils/hub.py\", line 403, in cached_file\n", - " resolved_file = hf_hub_download(\n", - " File \"/usr/local/lib/python3.10/dist-packages/huggingface_hub/utils/_validators.py\", line 114, in _inner_fn\n", - " return fn(*args, **kwargs)\n", - " File \"/usr/local/lib/python3.10/dist-packages/huggingface_hub/file_download.py\", line 862, in hf_hub_download\n", - " return _hf_hub_download_to_cache_dir(\n", - " File \"/usr/local/lib/python3.10/dist-packages/huggingface_hub/file_download.py\", line 1011, in _hf_hub_download_to_cache_dir\n", - " _download_to_tmp_and_move(\n", - " File \"/usr/local/lib/python3.10/dist-packages/huggingface_hub/file_download.py\", line 1545, in _download_to_tmp_and_move\n", - " http_get(\n", - " File \"/usr/local/lib/python3.10/dist-packages/huggingface_hub/file_download.py\", line 454, in http_get\n", - " for chunk in r.iter_content(chunk_size=constants.DOWNLOAD_CHUNK_SIZE):\n", - " File \"/usr/local/lib/python3.10/dist-packages/requests/models.py\", line 820, in generate\n", - " yield from self.raw.stream(chunk_size, decode_content=True)\n", - " File \"/usr/local/lib/python3.10/dist-packages/urllib3/response.py\", line 1060, in stream\n", - " data = self.read(amt=amt, decode_content=decode_content)\n", - " File \"/usr/local/lib/python3.10/dist-packages/urllib3/response.py\", line 949, in read\n", - " data = self._raw_read(amt)\n", - " File \"/usr/local/lib/python3.10/dist-packages/urllib3/response.py\", line 872, in _raw_read\n", - " with self._error_catcher():\n", - " File \"/usr/lib/python3.10/contextlib.py\", line 153, in __exit__\n", - " self.gen.throw(typ, value, traceback)\n", - " File \"/usr/local/lib/python3.10/dist-packages/urllib3/response.py\", line 748, in _error_catcher\n", - " yield\n", - "KeyboardInterrupt\n", - "^C\n" - ] - } - ] - }, - { - "cell_type": "code", - "source": [ - "# !lm-eval --tasks list" - ], - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "ybXvauzdUHdQ", - "outputId": "a445b537-3ca9-46c7-c8ad-178068ccd919", - "collapsed": true - }, - "execution_count": null, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "2024-11-17 01:23:05.594234: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered\n", - "2024-11-17 01:23:05.614502: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered\n", - "2024-11-17 01:23:05.620482: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered\n", - "2024-11-17 01:23:05.635150: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.\n", - "To enable the following instructions: AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.\n", - "2024-11-17 01:23:06.706535: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT\n", - "2024-11-17:01:23:11,383 INFO [__main__.py:279] Verbosity set to INFO\n", - "2024-11-17:01:23:17,947 INFO [__init__.py:459] The tag 'arc_ca' is already registered as a group, this tag will not be registered. This may affect tasks you want to call.\n", - "2024-11-17:01:23:17,951 INFO [__init__.py:459] The tag 'arc_ca' is already registered as a group, this tag will not be registered. This may affect tasks you want to call.\n", - "\n", - "| Group | Config Location |\n", - "|------------------------------------------------|---------------------------------------------------------------------------------------------------------------------------------------------|\n", - "|aclue |lm_eval/tasks/aclue/_aclue.yaml |\n", - "|aexams |lm_eval/tasks/aexams/_aexams.yaml |\n", - "|agieval |lm_eval/tasks/agieval/agieval.yaml |\n", - "|agieval_cn |lm_eval/tasks/agieval/agieval_cn.yaml |\n", - "|agieval_en |lm_eval/tasks/agieval/agieval_en.yaml |\n", - "|agieval_nous |lm_eval/tasks/agieval/agieval_nous.yaml |\n", - "|arabic_leaderboard_acva |lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_avca/arabic_leaderboard_acva.yaml |\n", - "|arabic_leaderboard_acva_light |lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_avca_light/arabic_leaderboard_acva_light.yaml |\n", - "|arabic_leaderboard_alghafa |lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_alghafa/arabic_leaderboard_alghafa.yaml |\n", - "|arabic_leaderboard_alghafa_light |lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_alghafa_light/arabic_leaderboard_alghafa_light.yaml |\n", - "|arabic_leaderboard_arabic_exams |lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_exams/arabic_leaderboard_arabic_exams.yaml |\n", - "|arabic_leaderboard_arabic_exams_light |lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_arabic_exams_light/arabic_leaderboard_arabic_exams_light.yaml |\n", - "|arabic_leaderboard_arabic_mmlu |lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mmlu/arabic_leaderboard_arabic_mmlu.yaml |\n", - "|arabic_leaderboard_arabic_mmlu_light |lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_arabic_mmlu_light/arabic_leaderboard_arabic_mmlu_light.yaml |\n", - "|arabic_leaderboard_arabic_mt_arc_challenge |lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mt_arc_challenge/arabic_leaderboard_arabic_mt_arc_challenge.yaml |\n", - "|arabic_leaderboard_arabic_mt_arc_challenge_light|lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_arabic_mt_arc_challenge_light/arabic_leaderboard_arabic_mt_arc_challenge_light.yaml|\n", - "|arabic_leaderboard_arabic_mt_arc_easy |lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mt_arc_easy/arabic_leaderboard_arabic_mt_arc_easy.yaml |\n", - "|arabic_leaderboard_arabic_mt_arc_easy_light |lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_arabic_mt_arc_easy_light/arabic_leaderboard_arabic_mt_arc_easy_light.yaml |\n", - "|arabic_leaderboard_arabic_mt_boolq |lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mt_boolq/arabic_leaderboard_arabic_mt_boolq.yaml |\n", - "|arabic_leaderboard_arabic_mt_boolq_light |lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_arabic_mt_boolq_light/arabic_leaderboard_arabic_mt_boolq_light.yaml |\n", - "|arabic_leaderboard_arabic_mt_copa |lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mt_copa/arabic_leaderboard_arabic_mt_copa.yaml |\n", - "|arabic_leaderboard_arabic_mt_copa_light |lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_arabic_mt_copa_light/arbic_leaderboard_arabic_mt_copa_light.yaml |\n", - "|arabic_leaderboard_arabic_mt_hellaswag |lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mt_hellaswag/arabic_leaderboard_arabic_mt_hellaswag.yaml |\n", - "|arabic_leaderboard_arabic_mt_hellaswag_light |lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_arabic_mt_hellaswag_light/arabic_leaderboard_arabic_mt_hellaswag_light.yaml |\n", - "|arabic_leaderboard_arabic_mt_mmlu |lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mt_mmlu/arabic_leaderboard_arabic_mt_mmlu.yaml |\n", - "|arabic_leaderboard_arabic_mt_mmlu_light |lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_arabic_mt_mmlu_light/arabic_leaderboard_arabic_mt_mmlu_light.yaml |\n", - "|arabic_leaderboard_arabic_mt_openbook_qa |lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mt_openbook_qa/arabic_leaderboard_arabic_mt_openbook_qa.yaml |\n", - "|arabic_leaderboard_arabic_mt_openbook_qa_light |lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_arabic_mt_openbook_qa_light/arabic_leaderboard_arabic_mt_openbook_qa_light.yaml |\n", - "|arabic_leaderboard_arabic_mt_piqa |lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mt_piqa/arabic_leaderboard_arabic_mt_piqa.yaml |\n", - "|arabic_leaderboard_arabic_mt_piqa_light |lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_arabic_mt_piqa_light/arabic_leaderboard_arabic_mt_piqa_light.yaml |\n", - "|arabic_leaderboard_arabic_mt_race |lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mt_race/arabic_leaderboard_arabic_mt_race.yaml |\n", - "|arabic_leaderboard_arabic_mt_race_light |lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_arabic_mt_race_light/arabic_leaderboard_arabic_mt_race_light.yaml |\n", - "|arabic_leaderboard_arabic_mt_sciq |lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mt_sciq/arabic_leaderboard_arabic_mt_sciq.yaml |\n", - "|arabic_leaderboard_arabic_mt_sciq_light |lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_arabic_mt_sciq_light/arabic_leaderboard_arabic_mt_sciq_light.yaml |\n", - "|arabic_leaderboard_arabic_mt_toxigen |lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mt_toxigen/arabic_leaderboard_arabic_mt_toxigen.yaml |\n", - "|arabic_leaderboard_arabic_mt_toxigen_light |lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_arabic_mt_toxigen_light/arabic_leaderboard_arabic_mt_toxigen_light.yaml |\n", - "|arabic_leaderboard_complete |lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_complete.yaml |\n", - "|arabic_leaderboard_light |lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_light.yaml |\n", - "|arabicmmlu |lm_eval/tasks/arabicmmlu/_arabicmmlu.yaml |\n", - "|arabicmmlu_humanities |lm_eval/tasks/arabicmmlu/_arabicmmlu_humanities.yaml |\n", - "|arabicmmlu_language |lm_eval/tasks/arabicmmlu/_arabicmmlu_language.yaml |\n", - "|arabicmmlu_other |lm_eval/tasks/arabicmmlu/_arabicmmlu_other.yaml |\n", - "|arabicmmlu_social_science |lm_eval/tasks/arabicmmlu/_arabicmmlu_social_science.yaml |\n", - "|arabicmmlu_stem |lm_eval/tasks/arabicmmlu/_arabicmmlu_stem.yaml |\n", - "|basque_bench |lm_eval/tasks/basque_bench/basque_bench.yaml |\n", - "|bbh |lm_eval/tasks/bbh/cot_fewshot/_bbh.yaml |\n", - "|bbh_cot_fewshot |lm_eval/tasks/bbh/cot_fewshot/_bbh_cot_fewshot.yaml |\n", - "|bbh_cot_zeroshot |lm_eval/tasks/bbh/cot_zeroshot/_bbh_cot_zeroshot.yaml |\n", - "|bbh_fewshot |lm_eval/tasks/bbh/fewshot/_bbh_fewshot.yaml |\n", - "|bbh_zeroshot |lm_eval/tasks/bbh/zeroshot/_bbh_zeroshot.yaml |\n", - "|belebele |lm_eval/tasks/belebele/_belebele.yaml |\n", - "|blimp |lm_eval/tasks/blimp/_blimp.yaml |\n", - "|catalan_bench |lm_eval/tasks/catalan_bench/catalan_bench.yaml |\n", - "|ceval-valid |lm_eval/tasks/ceval/_ceval-valid.yaml |\n", - "|cmmlu |lm_eval/tasks/cmmlu/_cmmlu.yaml |\n", - "|csatqa |lm_eval/tasks/csatqa/_csatqa.yaml |\n", - "|flan_held_in |lm_eval/tasks/benchmarks/flan/flan_held_in.yaml |\n", - "|flan_held_out |lm_eval/tasks/benchmarks/flan/flan_held_out.yaml |\n", - "|flores_ca |lm_eval/tasks/catalan_bench/flores_ca/flores_ca.yaml |\n", - "|flores_es |lm_eval/tasks/spanish_bench/flores_es/flores_es.yaml |\n", - "|flores_eu |lm_eval/tasks/basque_bench/flores_eu/flores_eu.yaml |\n", - "|flores_gl |lm_eval/tasks/galician_bench/flores_gl/flores_gl.yaml |\n", - "|flores_pt |lm_eval/tasks/portuguese_bench/flores_pt/flores_pt.yaml |\n", - "|galician_bench |lm_eval/tasks/galician_bench/galician_bench.yaml |\n", - "|haerae |lm_eval/tasks/haerae/_haerae.yaml |\n", - "|hendrycks_math |lm_eval/tasks/hendrycks_math/hendrycks_math.yaml |\n", - "|japanese_leaderboard |lm_eval/tasks/japanese_leaderboard/_ja_leaderboard.yaml |\n", - "|kormedmcqa |lm_eval/tasks/kormedmcqa/_kormedmcqa.yaml |\n", - "|leaderboard |lm_eval/tasks/leaderboard/leaderboard.yaml |\n", - "|leaderboard_bbh |lm_eval/tasks/leaderboard/bbh_mc/_leaderboard_bbh.yaml |\n", - "|leaderboard_gpqa |lm_eval/tasks/leaderboard/gpqa/_leaderboard_gpqa.yaml |\n", - "|leaderboard_instruction_following |lm_eval/tasks/leaderboard/ifeval/_leaderboard_instruction_following.yaml |\n", - "|leaderboard_math_hard |lm_eval/tasks/leaderboard/math/_leaderboard_math.yaml |\n", - "|leaderboard_musr |lm_eval/tasks/leaderboard/musr/_musr.yaml |\n", - "|lingoly |lm_eval/tasks/lingoly/lingoly_group.yaml |\n", - "|med_concepts_qa |lm_eval/tasks/med_concepts_qa/_med_concepts_qa.yaml |\n", - "|med_concepts_qa_atc |lm_eval/tasks/med_concepts_qa/_med_concepts_qa_atc.yaml |\n", - "|med_concepts_qa_icd10cm |lm_eval/tasks/med_concepts_qa/_med_concepts_qa_icd10cm.yaml |\n", - "|med_concepts_qa_icd10proc |lm_eval/tasks/med_concepts_qa/_med_concepts_qa_icd10proc.yaml |\n", - "|med_concepts_qa_icd9cm |lm_eval/tasks/med_concepts_qa/_med_concepts_qa_icd9cm.yaml |\n", - "|med_concepts_qa_icd9proc |lm_eval/tasks/med_concepts_qa/_med_concepts_qa_icd9proc.yaml |\n", - "|mela |lm_eval/tasks/mela/_mela.yaml |\n", - "|minerva_math |lm_eval/tasks/benchmarks/minerva_math.yaml |\n", - "|mmlu |lm_eval/tasks/mmlu/default/_mmlu.yaml |\n", - "|mmlu_continuation |lm_eval/tasks/mmlu/continuation/_mmlu.yaml |\n", - "|mmlu_flan_cot_fewshot |lm_eval/tasks/mmlu/flan_cot_fewshot/_mmlu.yaml |\n", - "|mmlu_flan_cot_zeroshot |lm_eval/tasks/mmlu/flan_cot_zeroshot/_mmlu.yaml |\n", - "|mmlu_flan_n_shot_generative |lm_eval/tasks/mmlu/flan_n_shot/generative/_mmlu.yaml |\n", - "|mmlu_flan_n_shot_loglikelihood |lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/_mmlu.yaml |\n", - "|mmlu_generative |lm_eval/tasks/mmlu/generative/_mmlu.yaml |\n", - "|mmlu_humanities |lm_eval/tasks/mmlu/default/_mmlu_humanities.yaml |\n", - "|mmlu_other |lm_eval/tasks/mmlu/default/_mmlu_other.yaml |\n", - "|mmlu_pro |lm_eval/tasks/mmlu_pro/_mmlu_pro.yaml |\n", - "|mmlu_social_sciences |lm_eval/tasks/mmlu/default/_mmlu_social_sciences.yaml |\n", - "|mmlu_stem |lm_eval/tasks/mmlu/default/_mmlu_stem.yaml |\n", - "|mmlusr |lm_eval/tasks/mmlusr/question_and_answer/_question_and_answer.yaml |\n", - "|mmlusr_answer_only |lm_eval/tasks/mmlusr/answer_only/_answer_only.yaml |\n", - "|mmlusr_question_only |lm_eval/tasks/mmlusr/question_only/_question_only.yaml |\n", - "|mmmu_val |lm_eval/tasks/mmmu/_mmmu.yaml |\n", - "|mmmu_val_art_and_design |lm_eval/tasks/mmmu/_art_and_design.yaml |\n", - "|mmmu_val_business |lm_eval/tasks/mmmu/_business.yaml |\n", - "|mmmu_val_health_and_medicine |lm_eval/tasks/mmmu/_health_and_medicine.yaml |\n", - "|mmmu_val_humanities_and_social_science |lm_eval/tasks/mmmu/_humanities_and_social_sciences.yaml |\n", - "|mmmu_val_science |lm_eval/tasks/mmmu/_science.yaml |\n", - "|mmmu_val_tech_and_engineering |lm_eval/tasks/mmmu/_tech_and_engineering.yaml |\n", - "|multimedqa |lm_eval/tasks/benchmarks/multimedqa/multimedqa.yaml |\n", - "|openllm |lm_eval/tasks/benchmarks/openllm.yaml |\n", - "|pawsx |lm_eval/tasks/paws-x/_pawsx.yaml |\n", - "|portuguese_bench |lm_eval/tasks/portuguese_bench/portuguese_bench.yaml |\n", - "|pythia |lm_eval/tasks/benchmarks/pythia.yaml |\n", - "|spanish_bench |lm_eval/tasks/spanish_bench/spanish_bench.yaml |\n", - "|t0_eval |lm_eval/tasks/benchmarks/t0_eval.yaml |\n", - "|tinyBenchmarks |lm_eval/tasks/tinyBenchmarks/tinyBenchmarks.yaml |\n", - "|tmlu |lm_eval/tasks/tmlu/default/_tmlu.yaml |\n", - "|tmmluplus |lm_eval/tasks/tmmluplus/default/_tmmluplus.yaml |\n", - "|tmmluplus_STEM |lm_eval/tasks/tmmluplus/default/_tmmluplus_STEM.yaml |\n", - "|tmmluplus_humanities |lm_eval/tasks/tmmluplus/default/_tmmluplus_humanities.yaml |\n", - "|tmmluplus_other |lm_eval/tasks/tmmluplus/default/_tmmluplus_other.yaml |\n", - "|tmmluplus_social_sciences |lm_eval/tasks/tmmluplus/default/_tmmluplus_social_sciences.yaml |\n", - "|wmdp |lm_eval/tasks/wmdp/_wmdp.yaml |\n", - "|xcopa |lm_eval/tasks/xcopa/_xcopa.yaml |\n", - "|xnli |lm_eval/tasks/xnli/_xnli.yaml |\n", - "|xstorycloze |lm_eval/tasks/xstorycloze/_xstorycloze.yaml |\n", - "|xwinograd |lm_eval/tasks/xwinograd/_xwinograd.yaml |\n", - "\n", - "\n", - "| Tag |\n", - "|------------------------------------------------|\n", - "|advanced_ai_risk |\n", - "|ai2_arc |\n", - "|anli |\n", - "|arabicmmlu_humanities_tasks |\n", - "|arabicmmlu_language_tasks |\n", - "|arabicmmlu_other_tasks |\n", - "|arabicmmlu_social_science_tasks |\n", - "|arabicmmlu_stem_tasks |\n", - "|arc_challenge_mt |\n", - "|arc_multilingual |\n", - "|arithmetic |\n", - "|basque-glue |\n", - "|bertaqa |\n", - "|cabreu |\n", - "|chain_of_thought |\n", - "|copal_id |\n", - "|crows_pairs |\n", - "|eus_exams_es |\n", - "|eus_exams_eu |\n", - "|flores |\n", - "|freebase |\n", - "|french_bench |\n", - "|french_bench_extra |\n", - "|french_bench_gen |\n", - "|french_bench_mc |\n", - "|french_bench_perplexity |\n", - "|glue |\n", - "|gpqa |\n", - "|gpt3_translation_benchmarks |\n", - "|headqa |\n", - "|hellaswag_multilingual |\n", - "|hendrycks_ethics |\n", - "|inverse_scaling_mc |\n", - "|iwslt2017 |\n", - "|kbl |\n", - "|kbl_bar_exam_em |\n", - "|kbl_bar_exam_em_civil |\n", - "|kbl_bar_exam_em_criminal |\n", - "|kbl_bar_exam_em_public |\n", - "|kbl_bar_exam_em_responsibility |\n", - "|kbl_knowledge_em |\n", - "|kbl_reasoning_em |\n", - "|kmmlu |\n", - "|kmmlu_direct |\n", - "|kmmlu_hard |\n", - "|kmmlu_hard_cot |\n", - "|kmmlu_hard_direct |\n", - "|kobest |\n", - "|lambada |\n", - "|lambada_cloze |\n", - "|lambada_multilingual |\n", - "|m_mmlu |\n", - "|math_word_problems |\n", - "|med_concepts_qa_atc_tasks |\n", - "|med_concepts_qa_icd10cm_tasks |\n", - "|med_concepts_qa_icd10proc_tasks |\n", - "|med_concepts_qa_icd9cm_tasks |\n", - "|med_concepts_qa_icd9proc_tasks |\n", - "|mgsm_cot_native |\n", - "|mgsm_direct |\n", - "|mmlu_continuation_humanities |\n", - "|mmlu_continuation_other |\n", - "|mmlu_continuation_social_sciences |\n", - "|mmlu_continuation_stem |\n", - "|mmlu_flan_cot_fewshot_humanities |\n", - "|mmlu_flan_cot_fewshot_other |\n", - "|mmlu_flan_cot_fewshot_social_sciences |\n", - "|mmlu_flan_cot_fewshot_stem |\n", - "|mmlu_flan_cot_zeroshot_humanities |\n", - "|mmlu_flan_cot_zeroshot_other |\n", - "|mmlu_flan_cot_zeroshot_social_sciences |\n", - "|mmlu_flan_cot_zeroshot_stem |\n", - "|mmlu_flan_n_shot_generative_humanities |\n", - "|mmlu_flan_n_shot_generative_other |\n", - "|mmlu_flan_n_shot_generative_social_sciences |\n", - "|mmlu_flan_n_shot_generative_stem |\n", - "|mmlu_flan_n_shot_loglikelihood_humanities |\n", - "|mmlu_flan_n_shot_loglikelihood_other |\n", - "|mmlu_flan_n_shot_loglikelihood_social_sciences |\n", - "|mmlu_flan_n_shot_loglikelihood_stem |\n", - "|mmlu_humanities_generative |\n", - "|mmlu_humanities_tasks |\n", - "|mmlu_other_generative |\n", - "|mmlu_other_tasks |\n", - "|mmlu_social_sciences_generative |\n", - "|mmlu_social_sciences_tasks |\n", - "|mmlu_stem_generative |\n", - "|mmlu_stem_tasks |\n", - "|mmlusr_answer_only_humanities_tasks |\n", - "|mmlusr_answer_only_other_tasks |\n", - "|mmlusr_answer_only_social_sciences_tasks |\n", - "|mmlusr_answer_only_stem_tasks |\n", - "|mmlusr_question_and_answer_humanities_tasks |\n", - "|mmlusr_question_and_answer_other_tasks |\n", - "|mmlusr_question_and_answer_social_sciences_tasks|\n", - "|mmlusr_question_and_answer_stem_tasks |\n", - "|mmlusr_question_only_humanities_tasks |\n", - "|mmlusr_question_only_other_tasks |\n", - "|mmlusr_question_only_social_sciences_tasks |\n", - "|mmlusr_question_only_stem_tasks |\n", - "|multiple_choice |\n", - "|paloma |\n", - "|persona |\n", - "|phrases_es |\n", - "|phrases_va |\n", - "|polemo2 |\n", - "|qa4mre |\n", - "|qasper |\n", - "|self_consistency |\n", - "|storycloze |\n", - "|super-glue-lm-eval-v1 |\n", - "|super-glue-lm-eval-v1-seq2seq |\n", - "|super-glue-t5-prompt |\n", - "|sycophancy |\n", - "|tmlu_humanities_tasks |\n", - "|tmlu_other_tasks |\n", - "|tmlu_social_sciences_tasks |\n", - "|tmlu_stem_tasks |\n", - "|tmlu_taiwan_specific |\n", - "|tmmluplus_STEM_tasks |\n", - "|tmmluplus_humanities_tasks |\n", - "|tmmluplus_other_tasks |\n", - "|tmmluplus_social_sciences_tasks |\n", - "|translation |\n", - "|truthfulqa |\n", - "|truthfulqa_gl |\n", - "|truthfulqa_multilingual |\n", - "|turkishmmlu |\n", - "|turkishmmlu_cot |\n", - "|unscramble |\n", - "|wmt14 |\n", - "|wmt16 |\n", - "|xnli_eu_mt_native |\n", - "|xquad |\n", - "\n", - "\n", - "| Task | Config Location | Output Type |\n", - "|--------------------------------------------------------------------------------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------|---------------------|\n", - "|20_newsgroups |lm_eval/tasks/unitxt/20_newsgroups.yaml | |\n", - "|aclue_ancient_chinese_culture |lm_eval/tasks/aclue/aclue_ancient_chinese_culture.yaml |multiple_choice |\n", - "|aclue_ancient_literature |lm_eval/tasks/aclue/aclue_ancient_literature.yaml |multiple_choice |\n", - "|aclue_ancient_medical |lm_eval/tasks/aclue/aclue_ancient_medical.yaml |multiple_choice |\n", - "|aclue_ancient_phonetics |lm_eval/tasks/aclue/aclue_ancient_phonetics.yaml |multiple_choice |\n", - "|aclue_basic_ancient_chinese |lm_eval/tasks/aclue/aclue_basic_ancient_chinese.yaml |multiple_choice |\n", - "|aclue_couplet_prediction |lm_eval/tasks/aclue/aclue_couplet_prediction.yaml |multiple_choice |\n", - "|aclue_homographic_character_resolution |lm_eval/tasks/aclue/aclue_homographic_character_resolution.yaml |multiple_choice |\n", - "|aclue_named_entity_recognition |lm_eval/tasks/aclue/aclue_named_entity_recognition.yaml |multiple_choice |\n", - "|aclue_poetry_appreciate |lm_eval/tasks/aclue/aclue_poetry_appreciate.yaml |multiple_choice |\n", - "|aclue_poetry_context_prediction |lm_eval/tasks/aclue/aclue_poetry_context_prediction.yaml |multiple_choice |\n", - "|aclue_poetry_quality_assessment |lm_eval/tasks/aclue/aclue_poetry_quality_assessment.yaml |multiple_choice |\n", - "|aclue_poetry_sentiment_analysis |lm_eval/tasks/aclue/aclue_poetry_sentiment_analysis.yaml |multiple_choice |\n", - "|aclue_polysemy_resolution |lm_eval/tasks/aclue/aclue_polysemy_resolution.yaml |multiple_choice |\n", - "|aclue_reading_comprehension |lm_eval/tasks/aclue/aclue_reading_comprehension.yaml |multiple_choice |\n", - "|aclue_sentence_segmentation |lm_eval/tasks/aclue/aclue_sentence_segmentation.yaml |multiple_choice |\n", - "|advanced_ai_risk_fewshot-coordinate-itself |lm_eval/tasks/model_written_evals/advanced_ai_risk/fewshot-coordinate-itself.yaml |multiple_choice |\n", - "|advanced_ai_risk_fewshot-coordinate-other-ais |lm_eval/tasks/model_written_evals/advanced_ai_risk/fewshot-coordinate-other-ais.yaml |multiple_choice |\n", - "|advanced_ai_risk_fewshot-coordinate-other-versions |lm_eval/tasks/model_written_evals/advanced_ai_risk/fewshot-coordinate-other-versions.yaml |multiple_choice |\n", - "|advanced_ai_risk_fewshot-corrigible-less-HHH |lm_eval/tasks/model_written_evals/advanced_ai_risk/fewshot-corrigible-less-HHH.yaml |multiple_choice |\n", - "|advanced_ai_risk_fewshot-corrigible-more-HHH |lm_eval/tasks/model_written_evals/advanced_ai_risk/fewshot-corrigible-more-HHH.yaml |multiple_choice |\n", - "|advanced_ai_risk_fewshot-corrigible-neutral-HHH |lm_eval/tasks/model_written_evals/advanced_ai_risk/fewshot-corrigible-neutral-HHH.yaml |multiple_choice |\n", - "|advanced_ai_risk_fewshot-myopic-reward |lm_eval/tasks/model_written_evals/advanced_ai_risk/fewshot-myopic-reward.yaml |multiple_choice |\n", - "|advanced_ai_risk_fewshot-one-box-tendency |lm_eval/tasks/model_written_evals/advanced_ai_risk/fewshot-one-box-tendency.yaml |multiple_choice |\n", - "|advanced_ai_risk_fewshot-power-seeking-inclination |lm_eval/tasks/model_written_evals/advanced_ai_risk/fewshot-power-seeking-inclination.yaml |multiple_choice |\n", - "|advanced_ai_risk_fewshot-self-awareness-general-ai |lm_eval/tasks/model_written_evals/advanced_ai_risk/fewshot-self-awareness-general-ai.yaml |multiple_choice |\n", - "|advanced_ai_risk_fewshot-self-awareness-good-text-model |lm_eval/tasks/model_written_evals/advanced_ai_risk/fewshot-self-awareness-good-text-model.yaml |multiple_choice |\n", - "|advanced_ai_risk_fewshot-self-awareness-text-model |lm_eval/tasks/model_written_evals/advanced_ai_risk/fewshot-self-awareness-text-model.yaml |multiple_choice |\n", - "|advanced_ai_risk_fewshot-self-awareness-training-architecture |lm_eval/tasks/model_written_evals/advanced_ai_risk/fewshot-self-awareness-training-architecture.yaml |multiple_choice |\n", - "|advanced_ai_risk_fewshot-self-awareness-training-web-gpt |lm_eval/tasks/model_written_evals/advanced_ai_risk/fewshot-self-awareness-training-web-gpt.yaml |multiple_choice |\n", - "|advanced_ai_risk_fewshot-survival-instinct |lm_eval/tasks/model_written_evals/advanced_ai_risk/fewshot-survival-instinct.yaml |multiple_choice |\n", - "|advanced_ai_risk_fewshot-wealth-seeking-inclination |lm_eval/tasks/model_written_evals/advanced_ai_risk/fewshot-wealth-seeking-inclination.yaml |multiple_choice |\n", - "|advanced_ai_risk_human-coordinate-itself |lm_eval/tasks/model_written_evals/advanced_ai_risk/human-coordinate-itself.yaml |multiple_choice |\n", - "|advanced_ai_risk_human-coordinate-other-ais |lm_eval/tasks/model_written_evals/advanced_ai_risk/human-coordinate-other-ais.yaml |multiple_choice |\n", - "|advanced_ai_risk_human-coordinate-other-versions |lm_eval/tasks/model_written_evals/advanced_ai_risk/human-coordinate-other-versions.yaml |multiple_choice |\n", - "|advanced_ai_risk_human-corrigible-less-HHH |lm_eval/tasks/model_written_evals/advanced_ai_risk/human-corrigible-less-HHH.yaml |multiple_choice |\n", - "|advanced_ai_risk_human-corrigible-more-HHH |lm_eval/tasks/model_written_evals/advanced_ai_risk/human-corrigible-more-HHH.yaml |multiple_choice |\n", - "|advanced_ai_risk_human-corrigible-neutral-HHH |lm_eval/tasks/model_written_evals/advanced_ai_risk/human-corrigible-neutral-HHH.yaml |multiple_choice |\n", - "|advanced_ai_risk_human-myopic-reward |lm_eval/tasks/model_written_evals/advanced_ai_risk/human-myopic-reward.yaml |multiple_choice |\n", - "|advanced_ai_risk_human-one-box-tendency |lm_eval/tasks/model_written_evals/advanced_ai_risk/human-one-box-tendency.yaml |multiple_choice |\n", - "|advanced_ai_risk_human-power-seeking-inclination |lm_eval/tasks/model_written_evals/advanced_ai_risk/human-power-seeking-inclination.yaml |multiple_choice |\n", - "|advanced_ai_risk_human-self-awareness-general-ai |lm_eval/tasks/model_written_evals/advanced_ai_risk/human-self-awareness-general-ai.yaml |multiple_choice |\n", - "|advanced_ai_risk_human-self-awareness-good-text-model |lm_eval/tasks/model_written_evals/advanced_ai_risk/human-self-awareness-good-text-model.yaml |multiple_choice |\n", - "|advanced_ai_risk_human-self-awareness-text-model |lm_eval/tasks/model_written_evals/advanced_ai_risk/human-self-awareness-text-model.yaml |multiple_choice |\n", - "|advanced_ai_risk_human-self-awareness-training-architecture |lm_eval/tasks/model_written_evals/advanced_ai_risk/human-self-awareness-training-architecture.yaml |multiple_choice |\n", - "|advanced_ai_risk_human-self-awareness-web-gpt |lm_eval/tasks/model_written_evals/advanced_ai_risk/human-self-awareness-web-gpt.yaml |multiple_choice |\n", - "|advanced_ai_risk_human-survival-instinct |lm_eval/tasks/model_written_evals/advanced_ai_risk/human-survival-instinct.yaml |multiple_choice |\n", - "|advanced_ai_risk_human-wealth-seeking-inclination |lm_eval/tasks/model_written_evals/advanced_ai_risk/human-wealth-seeking-inclination.yaml |multiple_choice |\n", - "|advanced_ai_risk_lm-coordinate-itself |lm_eval/tasks/model_written_evals/advanced_ai_risk/lm-coordinate-itself.yaml |multiple_choice |\n", - "|advanced_ai_risk_lm-coordinate-other-ais |lm_eval/tasks/model_written_evals/advanced_ai_risk/lm-coordinate-other-ais.yaml |multiple_choice |\n", - "|advanced_ai_risk_lm-coordinate-other-versions |lm_eval/tasks/model_written_evals/advanced_ai_risk/lm-coordinate-other-versions.yaml |multiple_choice |\n", - "|advanced_ai_risk_lm-corrigible-less-HHH |lm_eval/tasks/model_written_evals/advanced_ai_risk/lm-corrigible-less-HHH.yaml |multiple_choice |\n", - "|advanced_ai_risk_lm-corrigible-more-HHH |lm_eval/tasks/model_written_evals/advanced_ai_risk/lm-corrigible-more-HHH.yaml |multiple_choice |\n", - "|advanced_ai_risk_lm-corrigible-neutral-HHH |lm_eval/tasks/model_written_evals/advanced_ai_risk/lm-corrigible-neutral-HHH.yaml |multiple_choice |\n", - "|advanced_ai_risk_lm-myopic-reward |lm_eval/tasks/model_written_evals/advanced_ai_risk/lm-myopic-reward.yaml |multiple_choice |\n", - "|advanced_ai_risk_lm-one-box-tendency |lm_eval/tasks/model_written_evals/advanced_ai_risk/lm-one-box-tendency.yaml |multiple_choice |\n", - "|advanced_ai_risk_lm-power-seeking-inclination |lm_eval/tasks/model_written_evals/advanced_ai_risk/lm-power-seeking-inclination.yaml |multiple_choice |\n", - "|advanced_ai_risk_lm-self-awareness-general-ai |lm_eval/tasks/model_written_evals/advanced_ai_risk/lm-self-awareness-general-ai.yaml |multiple_choice |\n", - "|advanced_ai_risk_lm-self-awareness-good-text-model |lm_eval/tasks/model_written_evals/advanced_ai_risk/lm-self-awareness-good-text-model.yaml |multiple_choice |\n", - "|advanced_ai_risk_lm-self-awareness-text-model |lm_eval/tasks/model_written_evals/advanced_ai_risk/lm-self-awareness-text-model.yaml |multiple_choice |\n", - "|advanced_ai_risk_lm-self-awareness-training-architecture |lm_eval/tasks/model_written_evals/advanced_ai_risk/lm-self-awareness-training-architecture.yaml |multiple_choice |\n", - "|advanced_ai_risk_lm-self-awareness-training-nn-architecture |lm_eval/tasks/model_written_evals/advanced_ai_risk/lm-self-awareness-training-nn-architecture.yaml |multiple_choice |\n", - "|advanced_ai_risk_lm-self-awareness-training-web-gpt |lm_eval/tasks/model_written_evals/advanced_ai_risk/lm-self-awareness-training-web-gpt.yaml |multiple_choice |\n", - "|advanced_ai_risk_lm-survival-instinct |lm_eval/tasks/model_written_evals/advanced_ai_risk/lm-survival-instinct.yaml |multiple_choice |\n", - "|advanced_ai_risk_lm-wealth-seeking-inclination |lm_eval/tasks/model_written_evals/advanced_ai_risk/lm-wealth-seeking-inclination.yaml |multiple_choice |\n", - "|aexams_Biology |lm_eval/tasks/aexams/aexams_Biology.yaml |multiple_choice |\n", - "|aexams_IslamicStudies |lm_eval/tasks/aexams/aexams_IslamicStudies.yaml |multiple_choice |\n", - "|aexams_Physics |lm_eval/tasks/aexams/aexams_Physics.yaml |multiple_choice |\n", - "|aexams_Science |lm_eval/tasks/aexams/aexams_Science.yaml |multiple_choice |\n", - "|aexams_Social |lm_eval/tasks/aexams/aexams_Social.yaml |multiple_choice |\n", - "|afrimgsm_direct_amh |lm_eval/tasks/afrimgsm/direct/afrimgsm_direct_amh.yaml |generate_until |\n", - "|afrimgsm_direct_eng |lm_eval/tasks/afrimgsm/direct/afrimgsm_direct_eng.yaml |generate_until |\n", - "|afrimgsm_direct_ewe |lm_eval/tasks/afrimgsm/direct/afrimgsm_direct_ewe.yaml |generate_until |\n", - "|afrimgsm_direct_fra |lm_eval/tasks/afrimgsm/direct/afrimgsm_direct_fra.yaml |generate_until |\n", - "|afrimgsm_direct_hau |lm_eval/tasks/afrimgsm/direct/afrimgsm_direct_hau.yaml |generate_until |\n", - "|afrimgsm_direct_ibo |lm_eval/tasks/afrimgsm/direct/afrimgsm_direct_ibo.yaml |generate_until |\n", - "|afrimgsm_direct_kin |lm_eval/tasks/afrimgsm/direct/afrimgsm_direct_kin.yaml |generate_until |\n", - "|afrimgsm_direct_lin |lm_eval/tasks/afrimgsm/direct/afrimgsm_direct_lin.yaml |generate_until |\n", - "|afrimgsm_direct_lug |lm_eval/tasks/afrimgsm/direct/afrimgsm_direct_lug.yaml |generate_until |\n", - "|afrimgsm_direct_orm |lm_eval/tasks/afrimgsm/direct/afrimgsm_direct_orm.yaml |generate_until |\n", - "|afrimgsm_direct_sna |lm_eval/tasks/afrimgsm/direct/afrimgsm_direct_sna.yaml |generate_until |\n", - "|afrimgsm_direct_sot |lm_eval/tasks/afrimgsm/direct/afrimgsm_direct_sot.yaml |generate_until |\n", - "|afrimgsm_direct_swa |lm_eval/tasks/afrimgsm/direct/afrimgsm_direct_swa.yaml |generate_until |\n", - "|afrimgsm_direct_twi |lm_eval/tasks/afrimgsm/direct/afrimgsm_direct_twi.yaml |generate_until |\n", - "|afrimgsm_direct_wol |lm_eval/tasks/afrimgsm/direct/afrimgsm_direct_wol.yaml |generate_until |\n", - "|afrimgsm_direct_xho |lm_eval/tasks/afrimgsm/direct/afrimgsm_direct_xho.yaml |generate_until |\n", - "|afrimgsm_direct_yor |lm_eval/tasks/afrimgsm/direct/afrimgsm_direct_yor.yaml |generate_until |\n", - "|afrimgsm_direct_zul |lm_eval/tasks/afrimgsm/direct/afrimgsm_direct_zul.yaml |generate_until |\n", - "|afrimgsm_en_cot_amh |lm_eval/tasks/afrimgsm/en_cot/afrimgsm_en_cot_amh.yaml |generate_until |\n", - "|afrimgsm_en_cot_eng |lm_eval/tasks/afrimgsm/en_cot/afrimgsm_en_cot_eng.yaml |generate_until |\n", - "|afrimgsm_en_cot_ewe |lm_eval/tasks/afrimgsm/en_cot/afrimgsm_en_cot_ewe.yaml |generate_until |\n", - "|afrimgsm_en_cot_fra |lm_eval/tasks/afrimgsm/en_cot/afrimgsm_en_cot_fra.yaml |generate_until |\n", - "|afrimgsm_en_cot_hau |lm_eval/tasks/afrimgsm/en_cot/afrimgsm_en_cot_hau.yaml |generate_until |\n", - "|afrimgsm_en_cot_ibo |lm_eval/tasks/afrimgsm/en_cot/afrimgsm_en_cot_ibo.yaml |generate_until |\n", - "|afrimgsm_en_cot_kin |lm_eval/tasks/afrimgsm/en_cot/afrimgsm_en_cot_kin.yaml |generate_until |\n", - "|afrimgsm_en_cot_lin |lm_eval/tasks/afrimgsm/en_cot/afrimgsm_en_cot_lin.yaml |generate_until |\n", - "|afrimgsm_en_cot_lug |lm_eval/tasks/afrimgsm/en_cot/afrimgsm_en_cot_lug.yaml |generate_until |\n", - "|afrimgsm_en_cot_orm |lm_eval/tasks/afrimgsm/en_cot/afrimgsm_en_cot_orm.yaml |generate_until |\n", - "|afrimgsm_en_cot_sna |lm_eval/tasks/afrimgsm/en_cot/afrimgsm_en_cot_sna.yaml |generate_until |\n", - "|afrimgsm_en_cot_sot |lm_eval/tasks/afrimgsm/en_cot/afrimgsm_en_cot_sot.yaml |generate_until |\n", - "|afrimgsm_en_cot_swa |lm_eval/tasks/afrimgsm/en_cot/afrimgsm_en_cot_swa.yaml |generate_until |\n", - "|afrimgsm_en_cot_twi |lm_eval/tasks/afrimgsm/en_cot/afrimgsm_en_cot_twi.yaml |generate_until |\n", - "|afrimgsm_en_cot_wol |lm_eval/tasks/afrimgsm/en_cot/afrimgsm_en_cot_wol.yaml |generate_until |\n", - "|afrimgsm_en_cot_xho |lm_eval/tasks/afrimgsm/en_cot/afrimgsm_en_cot_xho.yaml |generate_until |\n", - "|afrimgsm_en_cot_yor |lm_eval/tasks/afrimgsm/en_cot/afrimgsm_en_cot_yor.yaml |generate_until |\n", - "|afrimgsm_en_cot_zul |lm_eval/tasks/afrimgsm/en_cot/afrimgsm_en_cot_zul.yaml |generate_until |\n", - "|afrimgsm_translate_direct_amh |lm_eval/tasks/afrimgsm/translate/afrimgsm_translate_amh.yaml |generate_until |\n", - "|afrimgsm_translate_direct_eng |lm_eval/tasks/afrimgsm/translate/afrimgsm_translate_eng.yaml |generate_until |\n", - "|afrimgsm_translate_direct_ewe |lm_eval/tasks/afrimgsm/translate/afrimgsm_translate_ewe.yaml |generate_until |\n", - "|afrimgsm_translate_direct_fra |lm_eval/tasks/afrimgsm/translate/afrimgsm_translate_fra.yaml |generate_until |\n", - "|afrimgsm_translate_direct_hau |lm_eval/tasks/afrimgsm/translate/afrimgsm_translate_hau.yaml |generate_until |\n", - "|afrimgsm_translate_direct_ibo |lm_eval/tasks/afrimgsm/translate/afrimgsm_translate_ibo.yaml |generate_until |\n", - "|afrimgsm_translate_direct_kin |lm_eval/tasks/afrimgsm/translate/afrimgsm_translate_kin.yaml |generate_until |\n", - "|afrimgsm_translate_direct_lin |lm_eval/tasks/afrimgsm/translate/afrimgsm_translate_lin.yaml |generate_until |\n", - "|afrimgsm_translate_direct_lug |lm_eval/tasks/afrimgsm/translate/afrimgsm_translate_lug.yaml |generate_until |\n", - "|afrimgsm_translate_direct_orm |lm_eval/tasks/afrimgsm/translate/afrimgsm_translate_orm.yaml |generate_until |\n", - "|afrimgsm_translate_direct_sna |lm_eval/tasks/afrimgsm/translate/afrimgsm_translate_sna.yaml |generate_until |\n", - "|afrimgsm_translate_direct_sot |lm_eval/tasks/afrimgsm/translate/afrimgsm_translate_sot.yaml |generate_until |\n", - "|afrimgsm_translate_direct_swa |lm_eval/tasks/afrimgsm/translate/afrimgsm_translate_swa.yaml |generate_until |\n", - "|afrimgsm_translate_direct_twi |lm_eval/tasks/afrimgsm/translate/afrimgsm_translate_twi.yaml |generate_until |\n", - "|afrimgsm_translate_direct_wol |lm_eval/tasks/afrimgsm/translate/afrimgsm_translate_wol.yaml |generate_until |\n", - "|afrimgsm_translate_direct_xho |lm_eval/tasks/afrimgsm/translate/afrimgsm_translate_xho.yaml |generate_until |\n", - "|afrimgsm_translate_direct_yor |lm_eval/tasks/afrimgsm/translate/afrimgsm_translate_yor.yaml |generate_until |\n", - "|afrimgsm_translate_direct_zul |lm_eval/tasks/afrimgsm/translate/afrimgsm_translate_zul.yaml |generate_until |\n", - "|afrimmlu_direct_amh |lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_amh.yaml |multiple_choice |\n", - "|afrimmlu_direct_eng |lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_eng.yaml |multiple_choice |\n", - "|afrimmlu_direct_ewe |lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_ewe.yaml |multiple_choice |\n", - "|afrimmlu_direct_fra |lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_fra.yaml |multiple_choice |\n", - "|afrimmlu_direct_hau |lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_hau.yaml |multiple_choice |\n", - "|afrimmlu_direct_ibo |lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_ibo.yaml |multiple_choice |\n", - "|afrimmlu_direct_kin |lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_kin.yaml |multiple_choice |\n", - "|afrimmlu_direct_lin |lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_lin.yaml |multiple_choice |\n", - "|afrimmlu_direct_lug |lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_lug.yaml |multiple_choice |\n", - "|afrimmlu_direct_orm |lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_orm.yaml |multiple_choice |\n", - "|afrimmlu_direct_sna |lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_sna.yaml |multiple_choice |\n", - "|afrimmlu_direct_sot |lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_sot.yaml |multiple_choice |\n", - "|afrimmlu_direct_swa |lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_swa.yaml |multiple_choice |\n", - "|afrimmlu_direct_twi |lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_twi.yaml |multiple_choice |\n", - "|afrimmlu_direct_wol |lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_wol.yaml |multiple_choice |\n", - "|afrimmlu_direct_xho |lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_xho.yaml |multiple_choice |\n", - "|afrimmlu_direct_yor |lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_yor.yaml |multiple_choice |\n", - "|afrimmlu_direct_zul |lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_zul.yaml |multiple_choice |\n", - "|afrimmlu_translate_amh |lm_eval/tasks/afrimmlu/translate/afrimmlu_translate_amh.yaml |multiple_choice |\n", - "|afrimmlu_translate_eng |lm_eval/tasks/afrimmlu/translate/afrimmlu_translate_eng.yaml |multiple_choice |\n", - "|afrimmlu_translate_ewe |lm_eval/tasks/afrimmlu/translate/afrimmlu_translate_ewe.yaml |multiple_choice |\n", - "|afrimmlu_translate_fra |lm_eval/tasks/afrimmlu/translate/afrimmlu_translate_fra.yaml |multiple_choice |\n", - "|afrimmlu_translate_hau |lm_eval/tasks/afrimmlu/translate/afrimmlu_translate_hau.yaml |multiple_choice |\n", - "|afrimmlu_translate_ibo |lm_eval/tasks/afrimmlu/translate/afrimmlu_translate_ibo.yaml |multiple_choice |\n", - "|afrimmlu_translate_kin |lm_eval/tasks/afrimmlu/translate/afrimmlu_translate_kin.yaml |multiple_choice |\n", - "|afrimmlu_translate_lin |lm_eval/tasks/afrimmlu/translate/afrimmlu_translate_lin.yaml |multiple_choice |\n", - "|afrimmlu_translate_lug |lm_eval/tasks/afrimmlu/translate/afrimmlu_translate_lug.yaml |multiple_choice |\n", - "|afrimmlu_translate_orm |lm_eval/tasks/afrimmlu/translate/afrimmlu_translate_orm.yaml |multiple_choice |\n", - "|afrimmlu_translate_sna |lm_eval/tasks/afrimmlu/translate/afrimmlu_translate_sna.yaml |multiple_choice |\n", - "|afrimmlu_translate_sot |lm_eval/tasks/afrimmlu/translate/afrimmlu_translate_sot.yaml |multiple_choice |\n", - "|afrimmlu_translate_swa |lm_eval/tasks/afrimmlu/translate/afrimmlu_translate_swa.yaml |multiple_choice |\n", - "|afrimmlu_translate_twi |lm_eval/tasks/afrimmlu/translate/afrimmlu_translate_twi.yaml |multiple_choice |\n", - "|afrimmlu_translate_wol |lm_eval/tasks/afrimmlu/translate/afrimmlu_translate_wol.yaml |multiple_choice |\n", - "|afrimmlu_translate_xho |lm_eval/tasks/afrimmlu/translate/afrimmlu_translate_xho.yaml |multiple_choice |\n", - "|afrimmlu_translate_yor |lm_eval/tasks/afrimmlu/translate/afrimmlu_translate_yor.yaml |multiple_choice |\n", - "|afrimmlu_translate_zul |lm_eval/tasks/afrimmlu/translate/afrimmlu_translate_zul.yaml |multiple_choice |\n", - "|afrixnli_en_direct_amh |lm_eval/tasks/afrixnli/anli prompt/en-direct/afrixnli_en_direct_amh.yaml |multiple_choice |\n", - "|afrixnli_en_direct_eng |lm_eval/tasks/afrixnli/anli prompt/en-direct/afrixnli_en_direct_eng.yaml |multiple_choice |\n", - "|afrixnli_en_direct_ewe |lm_eval/tasks/afrixnli/anli prompt/en-direct/afrixnli_en_direct_ewe.yaml |multiple_choice |\n", - "|afrixnli_en_direct_fra |lm_eval/tasks/afrixnli/anli prompt/en-direct/afrixnli_en_direct_fra.yaml |multiple_choice |\n", - "|afrixnli_en_direct_hau |lm_eval/tasks/afrixnli/anli prompt/en-direct/afrixnli_en_direct_hau.yaml |multiple_choice |\n", - "|afrixnli_en_direct_ibo |lm_eval/tasks/afrixnli/anli prompt/en-direct/afrixnli_en_direct_ibo.yaml |multiple_choice |\n", - "|afrixnli_en_direct_kin |lm_eval/tasks/afrixnli/anli prompt/en-direct/afrixnli_en_direct_kin.yaml |multiple_choice |\n", - "|afrixnli_en_direct_lin |lm_eval/tasks/afrixnli/anli prompt/en-direct/afrixnli_en_direct_lin.yaml |multiple_choice |\n", - "|afrixnli_en_direct_lug |lm_eval/tasks/afrixnli/anli prompt/en-direct/afrixnli_en_direct_lug.yaml |multiple_choice |\n", - "|afrixnli_en_direct_orm |lm_eval/tasks/afrixnli/anli prompt/en-direct/afrixnli_en_direct_orm.yaml |multiple_choice |\n", - "|afrixnli_en_direct_sna |lm_eval/tasks/afrixnli/anli prompt/en-direct/afrixnli_en_direct_sna.yaml |multiple_choice |\n", - "|afrixnli_en_direct_sot |lm_eval/tasks/afrixnli/anli prompt/en-direct/afrixnli_en_direct_sot.yaml |multiple_choice |\n", - "|afrixnli_en_direct_swa |lm_eval/tasks/afrixnli/anli prompt/en-direct/afrixnli_en_direct_swa.yaml |multiple_choice |\n", - "|afrixnli_en_direct_twi |lm_eval/tasks/afrixnli/anli prompt/en-direct/afrixnli_en_direct_twi.yaml |multiple_choice |\n", - "|afrixnli_en_direct_wol |lm_eval/tasks/afrixnli/anli prompt/en-direct/afrixnli_en_direct_wol.yaml |multiple_choice |\n", - "|afrixnli_en_direct_xho |lm_eval/tasks/afrixnli/anli prompt/en-direct/afrixnli_en_direct_xho.yaml |multiple_choice |\n", - "|afrixnli_en_direct_yor |lm_eval/tasks/afrixnli/anli prompt/en-direct/afrixnli_en_direct_yor.yaml |multiple_choice |\n", - "|afrixnli_en_direct_zul |lm_eval/tasks/afrixnli/anli prompt/en-direct/afrixnli_en_direct_zul.yaml |multiple_choice |\n", - "|afrixnli_manual_direct_amh |lm_eval/tasks/afrixnli/lai prompt/direct/afrixnli_manual_direct_amh.yaml |multiple_choice |\n", - "|afrixnli_manual_direct_eng |lm_eval/tasks/afrixnli/lai prompt/direct/afrixnli_manual_direct_eng.yaml |multiple_choice |\n", - "|afrixnli_manual_direct_ewe |lm_eval/tasks/afrixnli/lai prompt/direct/afrixnli_manual_direct_ewe.yaml |multiple_choice |\n", - "|afrixnli_manual_direct_fra |lm_eval/tasks/afrixnli/lai prompt/direct/afrixnli_manual_direct_fra.yaml |multiple_choice |\n", - "|afrixnli_manual_direct_hau |lm_eval/tasks/afrixnli/lai prompt/direct/afrixnli_manual_direct_hau.yaml |multiple_choice |\n", - "|afrixnli_manual_direct_ibo |lm_eval/tasks/afrixnli/lai prompt/direct/afrixnli_manual_direct_ibo.yaml |multiple_choice |\n", - "|afrixnli_manual_direct_kin |lm_eval/tasks/afrixnli/lai prompt/direct/afrixnli_manual_direct_kin.yaml |multiple_choice |\n", - "|afrixnli_manual_direct_lin |lm_eval/tasks/afrixnli/lai prompt/direct/afrixnli_manual_direct_lin.yaml |multiple_choice |\n", - "|afrixnli_manual_direct_lug |lm_eval/tasks/afrixnli/lai prompt/direct/afrixnli_manual_direct_lug.yaml |multiple_choice |\n", - "|afrixnli_manual_direct_orm |lm_eval/tasks/afrixnli/lai prompt/direct/afrixnli_manual_direct_orm.yaml |multiple_choice |\n", - "|afrixnli_manual_direct_sna |lm_eval/tasks/afrixnli/lai prompt/direct/afrixnli_manual_direct_sna.yaml |multiple_choice |\n", - "|afrixnli_manual_direct_sot |lm_eval/tasks/afrixnli/lai prompt/direct/afrixnli_manual_direct_sot.yaml |multiple_choice |\n", - "|afrixnli_manual_direct_swa |lm_eval/tasks/afrixnli/lai prompt/direct/afrixnli_manual_direct_swa.yaml |multiple_choice |\n", - "|afrixnli_manual_direct_twi |lm_eval/tasks/afrixnli/lai prompt/direct/afrixnli_manual_direct_twi.yaml |multiple_choice |\n", - "|afrixnli_manual_direct_wol |lm_eval/tasks/afrixnli/lai prompt/direct/afrixnli_manual_direct_wol.yaml |multiple_choice |\n", - "|afrixnli_manual_direct_xho |lm_eval/tasks/afrixnli/lai prompt/direct/afrixnli_manual_direct_xho.yaml |multiple_choice |\n", - "|afrixnli_manual_direct_yor |lm_eval/tasks/afrixnli/lai prompt/direct/afrixnli_manual_direct_yor.yaml |multiple_choice |\n", - "|afrixnli_manual_direct_zul |lm_eval/tasks/afrixnli/lai prompt/direct/afrixnli_manual_direct_zul.yaml |multiple_choice |\n", - "|afrixnli_manual_translate_amh |lm_eval/tasks/afrixnli/lai prompt/translate/afrixnli_manual_translate_amh.yaml |multiple_choice |\n", - "|afrixnli_manual_translate_ewe |lm_eval/tasks/afrixnli/lai prompt/translate/afrixnli_manual_translate_ewe.yaml |multiple_choice |\n", - "|afrixnli_manual_translate_fra |lm_eval/tasks/afrixnli/lai prompt/translate/afrixnli_manual_translate_fra.yaml |multiple_choice |\n", - "|afrixnli_manual_translate_hau |lm_eval/tasks/afrixnli/lai prompt/translate/afrixnli_manual_translate_hau.yaml |multiple_choice |\n", - "|afrixnli_manual_translate_ibo |lm_eval/tasks/afrixnli/lai prompt/translate/afrixnli_manual_translate_ibo.yaml |multiple_choice |\n", - "|afrixnli_manual_translate_kin |lm_eval/tasks/afrixnli/lai prompt/translate/afrixnli_manual_translate_kin.yaml |multiple_choice |\n", - "|afrixnli_manual_translate_lin |lm_eval/tasks/afrixnli/lai prompt/translate/afrixnli_manual_translate_lin.yaml |multiple_choice |\n", - "|afrixnli_manual_translate_lug |lm_eval/tasks/afrixnli/lai prompt/translate/afrixnli_manual_translate_lug.yaml |multiple_choice |\n", - "|afrixnli_manual_translate_orm |lm_eval/tasks/afrixnli/lai prompt/translate/afrixnli_manual_translate_orm.yaml |multiple_choice |\n", - "|afrixnli_manual_translate_sna |lm_eval/tasks/afrixnli/lai prompt/translate/afrixnli_manual_translate_sna.yaml |multiple_choice |\n", - "|afrixnli_manual_translate_sot |lm_eval/tasks/afrixnli/lai prompt/translate/afrixnli_manual_translate_sot.yaml |multiple_choice |\n", - "|afrixnli_manual_translate_swa |lm_eval/tasks/afrixnli/lai prompt/translate/afrixnli_manual_translate_swa.yaml |multiple_choice |\n", - "|afrixnli_manual_translate_twi |lm_eval/tasks/afrixnli/lai prompt/translate/afrixnli_manual_translate_twi.yaml |multiple_choice |\n", - "|afrixnli_manual_translate_wol |lm_eval/tasks/afrixnli/lai prompt/translate/afrixnli_manual_translate_wol.yaml |multiple_choice |\n", - "|afrixnli_manual_translate_xho |lm_eval/tasks/afrixnli/lai prompt/translate/afrixnli_manual_translate_xho.yaml |multiple_choice |\n", - "|afrixnli_manual_translate_yor |lm_eval/tasks/afrixnli/lai prompt/translate/afrixnli_manual_translate_yor.yaml |multiple_choice |\n", - "|afrixnli_manual_translate_zul |lm_eval/tasks/afrixnli/lai prompt/translate/afrixnli_manual_translate_zul.yaml |multiple_choice |\n", - "|afrixnli_native_direct_amh |lm_eval/tasks/afrixnli/anli prompt/native-direct/afrixnli_native_direct_amh.yaml |multiple_choice |\n", - "|afrixnli_native_direct_eng |lm_eval/tasks/afrixnli/anli prompt/native-direct/afrixnli_native_direct_eng.yaml |multiple_choice |\n", - "|afrixnli_native_direct_ewe |lm_eval/tasks/afrixnli/anli prompt/native-direct/afrixnli_native_direct_ewe.yaml |multiple_choice |\n", - "|afrixnli_native_direct_fra |lm_eval/tasks/afrixnli/anli prompt/native-direct/afrixnli_native_direct_fra.yaml |multiple_choice |\n", - "|afrixnli_native_direct_hau |lm_eval/tasks/afrixnli/anli prompt/native-direct/afrixnli_native_direct_hau.yaml |multiple_choice |\n", - "|afrixnli_native_direct_ibo |lm_eval/tasks/afrixnli/anli prompt/native-direct/afrixnli_native_direct_ibo.yaml |multiple_choice |\n", - "|afrixnli_native_direct_kin |lm_eval/tasks/afrixnli/anli prompt/native-direct/afrixnli_native_direct_kin.yaml |multiple_choice |\n", - "|afrixnli_native_direct_lin |lm_eval/tasks/afrixnli/anli prompt/native-direct/afrixnli_native_direct_lin.yaml |multiple_choice |\n", - "|afrixnli_native_direct_lug |lm_eval/tasks/afrixnli/anli prompt/native-direct/afrixnli_native_direct_lug.yaml |multiple_choice |\n", - "|afrixnli_native_direct_orm |lm_eval/tasks/afrixnli/anli prompt/native-direct/afrixnli_native_direct_orm.yaml |multiple_choice |\n", - "|afrixnli_native_direct_sna |lm_eval/tasks/afrixnli/anli prompt/native-direct/afrixnli_native_direct_sna.yaml |multiple_choice |\n", - "|afrixnli_native_direct_sot |lm_eval/tasks/afrixnli/anli prompt/native-direct/afrixnli_native_direct_sot.yaml |multiple_choice |\n", - "|afrixnli_native_direct_swa |lm_eval/tasks/afrixnli/anli prompt/native-direct/afrixnli_native_direct_swa.yaml |multiple_choice |\n", - "|afrixnli_native_direct_twi |lm_eval/tasks/afrixnli/anli prompt/native-direct/afrixnli_native_direct_twi.yaml |multiple_choice |\n", - "|afrixnli_native_direct_wol |lm_eval/tasks/afrixnli/anli prompt/native-direct/afrixnli_native_direct_wol.yaml |multiple_choice |\n", - "|afrixnli_native_direct_xho |lm_eval/tasks/afrixnli/anli prompt/native-direct/afrixnli_native_direct_xho.yaml |multiple_choice |\n", - "|afrixnli_native_direct_yor |lm_eval/tasks/afrixnli/anli prompt/native-direct/afrixnli_native_direct_yor.yaml |multiple_choice |\n", - "|afrixnli_native_direct_zul |lm_eval/tasks/afrixnli/anli prompt/native-direct/afrixnli_native_direct_zul.yaml |multiple_choice |\n", - "|afrixnli_translate_amh |lm_eval/tasks/afrixnli/anli prompt/translate/afrixnli_translate_amh.yaml |multiple_choice |\n", - "|afrixnli_translate_ewe |lm_eval/tasks/afrixnli/anli prompt/translate/afrixnli_translate_ewe.yaml |multiple_choice |\n", - "|afrixnli_translate_fra |lm_eval/tasks/afrixnli/anli prompt/translate/afrixnli_translate_fra.yaml |multiple_choice |\n", - "|afrixnli_translate_hau |lm_eval/tasks/afrixnli/anli prompt/translate/afrixnli_translate_hau.yaml |multiple_choice |\n", - "|afrixnli_translate_ibo |lm_eval/tasks/afrixnli/anli prompt/translate/afrixnli_translate_ibo.yaml |multiple_choice |\n", - "|afrixnli_translate_kin |lm_eval/tasks/afrixnli/anli prompt/translate/afrixnli_translate_kin.yaml |multiple_choice |\n", - "|afrixnli_translate_lin |lm_eval/tasks/afrixnli/anli prompt/translate/afrixnli_translate_lin.yaml |multiple_choice |\n", - "|afrixnli_translate_lug |lm_eval/tasks/afrixnli/anli prompt/translate/afrixnli_translate_lug.yaml |multiple_choice |\n", - "|afrixnli_translate_orm |lm_eval/tasks/afrixnli/anli prompt/translate/afrixnli_translate_orm.yaml |multiple_choice |\n", - "|afrixnli_translate_sna |lm_eval/tasks/afrixnli/anli prompt/translate/afrixnli_translate_sna.yaml |multiple_choice |\n", - "|afrixnli_translate_sot |lm_eval/tasks/afrixnli/anli prompt/translate/afrixnli_translate_sot.yaml |multiple_choice |\n", - "|afrixnli_translate_swa |lm_eval/tasks/afrixnli/anli prompt/translate/afrixnli_translate_swa.yaml |multiple_choice |\n", - "|afrixnli_translate_twi |lm_eval/tasks/afrixnli/anli prompt/translate/afrixnli_translate_twi.yaml |multiple_choice |\n", - "|afrixnli_translate_wol |lm_eval/tasks/afrixnli/anli prompt/translate/afrixnli_translate_wol.yaml |multiple_choice |\n", - "|afrixnli_translate_xho |lm_eval/tasks/afrixnli/anli prompt/translate/afrixnli_translate_xho.yaml |multiple_choice |\n", - "|afrixnli_translate_yor |lm_eval/tasks/afrixnli/anli prompt/translate/afrixnli_translate_yor.yaml |multiple_choice |\n", - "|afrixnli_translate_zul |lm_eval/tasks/afrixnli/anli prompt/translate/afrixnli_translate_zul.yaml |multiple_choice |\n", - "|ag_news |lm_eval/tasks/unitxt/ag_news.yaml | |\n", - "|agieval_aqua_rat |lm_eval/tasks/agieval/aqua-rat.yaml |multiple_choice |\n", - "|agieval_gaokao_biology |lm_eval/tasks/agieval/gaokao-biology.yaml |multiple_choice |\n", - "|agieval_gaokao_chemistry |lm_eval/tasks/agieval/gaokao-chemistry.yaml |multiple_choice |\n", - "|agieval_gaokao_chinese |lm_eval/tasks/agieval/gaokao-chinese.yaml |multiple_choice |\n", - "|agieval_gaokao_english |lm_eval/tasks/agieval/gaokao-english.yaml |multiple_choice |\n", - "|agieval_gaokao_geography |lm_eval/tasks/agieval/gaokao-geography.yaml |multiple_choice |\n", - "|agieval_gaokao_history |lm_eval/tasks/agieval/gaokao-history.yaml |multiple_choice |\n", - "|agieval_gaokao_mathcloze |lm_eval/tasks/agieval/gaokao-mathcloze.yaml |generate_until |\n", - "|agieval_gaokao_mathqa |lm_eval/tasks/agieval/gaokao-mathqa.yaml |multiple_choice |\n", - "|agieval_gaokao_physics |lm_eval/tasks/agieval/gaokao-physics.yaml |multiple_choice |\n", - "|agieval_jec_qa_ca |lm_eval/tasks/agieval/jec-qa-ca.yaml |multiple_choice |\n", - "|agieval_jec_qa_kd |lm_eval/tasks/agieval/jec-qa-kd.yaml |multiple_choice |\n", - "|agieval_logiqa_en |lm_eval/tasks/agieval/logiqa-en.yaml |multiple_choice |\n", - "|agieval_logiqa_zh |lm_eval/tasks/agieval/logiqa-zh.yaml |multiple_choice |\n", - "|agieval_lsat_ar |lm_eval/tasks/agieval/lsat-ar.yaml |multiple_choice |\n", - "|agieval_lsat_lr |lm_eval/tasks/agieval/lsat-lr.yaml |multiple_choice |\n", - "|agieval_lsat_rc |lm_eval/tasks/agieval/lsat-rc.yaml |multiple_choice |\n", - "|agieval_math |lm_eval/tasks/agieval/math.yaml |generate_until |\n", - "|agieval_sat_en |lm_eval/tasks/agieval/sat-en.yaml |multiple_choice |\n", - "|agieval_sat_en_without_passage |lm_eval/tasks/agieval/sat-en-without-passage.yaml |multiple_choice |\n", - "|agieval_sat_math |lm_eval/tasks/agieval/sat-math.yaml |multiple_choice |\n", - "|anagrams1 |lm_eval/tasks/unscramble/anagrams1.yaml |generate_until |\n", - "|anagrams2 |lm_eval/tasks/unscramble/anagrams2.yaml |generate_until |\n", - "|anli_r1 |lm_eval/tasks/anli/anli_r1.yaml |multiple_choice |\n", - "|anli_r2 |lm_eval/tasks/anli/anli_r2.yaml |multiple_choice |\n", - "|anli_r3 |lm_eval/tasks/anli/anli_r3.yaml |multiple_choice |\n", - "|arabic_exams |lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_exams/arabic_exams.yaml |multiple_choice |\n", - "|arabic_exams_light |lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_arabic_exams_light/arabic_exams_light.yaml |multiple_choice |\n", - "|arabic_leaderboard_acva_Algeria |lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_avca/arabic_leaderboard_acva_Algeria.yaml |multiple_choice |\n", - "|arabic_leaderboard_acva_Algeria_light |lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_avca_light/arabic_leaderboard_acva_Algeria_light.yaml |multiple_choice |\n", - "|arabic_leaderboard_acva_Ancient_Egypt |lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_avca/arabic_leaderboard_acva_Ancient_Egypt.yaml |multiple_choice |\n", - "|arabic_leaderboard_acva_Ancient_Egypt_light |lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_avca_light/arabic_leaderboard_acva_Ancient_Egypt_light.yaml |multiple_choice |\n", - "|arabic_leaderboard_acva_Arab_Empire |lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_avca/arabic_leaderboard_acva_Arab_Empire.yaml |multiple_choice |\n", - "|arabic_leaderboard_acva_Arab_Empire_light |lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_avca_light/arabic_leaderboard_acva_Arab_Empire_light.yaml |multiple_choice |\n", - "|arabic_leaderboard_acva_Arabic_Architecture |lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_avca/arabic_leaderboard_acva_Arabic_Architecture.yaml |multiple_choice |\n", - "|arabic_leaderboard_acva_Arabic_Architecture_light |lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_avca_light/arabic_leaderboard_acva_Arabic_Architecture_light.yaml |multiple_choice |\n", - "|arabic_leaderboard_acva_Arabic_Art |lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_avca/arabic_leaderboard_acva_Arabic_Art.yaml |multiple_choice |\n", - "|arabic_leaderboard_acva_Arabic_Art_light |lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_avca_light/arabic_leaderboard_acva_Arabic_Art_light.yaml |multiple_choice |\n", - "|arabic_leaderboard_acva_Arabic_Astronomy |lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_avca/arabic_leaderboard_acva_Arabic_Astronomy.yaml |multiple_choice |\n", - "|arabic_leaderboard_acva_Arabic_Astronomy_light |lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_avca_light/arabic_leaderboard_acva_Arabic_Astronomy_light.yaml |multiple_choice |\n", - "|arabic_leaderboard_acva_Arabic_Calligraphy |lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_avca/arabic_leaderboard_acva_Arabic_Calligraphy.yaml |multiple_choice |\n", - "|arabic_leaderboard_acva_Arabic_Calligraphy_light |lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_avca_light/arabic_leaderboard_acva_Arabic_Calligraphy_light.yaml |multiple_choice |\n", - "|arabic_leaderboard_acva_Arabic_Ceremony |lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_avca/arabic_leaderboard_acva_Arabic_Ceremony.yaml |multiple_choice |\n", - "|arabic_leaderboard_acva_Arabic_Ceremony_light |lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_avca_light/arabic_leaderboard_acva_Arabic_Ceremony_light.yaml |multiple_choice |\n", - "|arabic_leaderboard_acva_Arabic_Clothing |lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_avca/arabic_leaderboard_acva_Arabic_Clothing.yaml |multiple_choice |\n", - "|arabic_leaderboard_acva_Arabic_Clothing_light |lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_avca_light/arabic_leaderboard_acva_Arabic_Clothing_light.yaml |multiple_choice |\n", - "|arabic_leaderboard_acva_Arabic_Culture |lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_avca/arabic_leaderboard_acva_Arabic_Culture.yaml |multiple_choice |\n", - "|arabic_leaderboard_acva_Arabic_Culture_light |lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_avca_light/arabic_leaderboard_acva_Arabic_Culture_light.yaml |multiple_choice |\n", - "|arabic_leaderboard_acva_Arabic_Food |lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_avca/arabic_leaderboard_acva_Arabic_Food.yaml |multiple_choice |\n", - "|arabic_leaderboard_acva_Arabic_Food_light |lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_avca_light/arabic_leaderboard_acva_Arabic_Food_light.yaml |multiple_choice |\n", - "|arabic_leaderboard_acva_Arabic_Funeral |lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_avca/arabic_leaderboard_acva_Arabic_Funeral.yaml |multiple_choice |\n", - "|arabic_leaderboard_acva_Arabic_Funeral_light |lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_avca_light/arabic_leaderboard_acva_Arabic_Funeral_light.yaml |multiple_choice |\n", - "|arabic_leaderboard_acva_Arabic_Geography |lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_avca/arabic_leaderboard_acva_Arabic_Geography.yaml |multiple_choice |\n", - "|arabic_leaderboard_acva_Arabic_Geography_light |lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_avca_light/arabic_leaderboard_acva_Arabic_Geography_light.yaml |multiple_choice |\n", - "|arabic_leaderboard_acva_Arabic_History |lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_avca/arabic_leaderboard_acva_Arabic_History.yaml |multiple_choice |\n", - "|arabic_leaderboard_acva_Arabic_History_light |lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_avca_light/arabic_leaderboard_acva_Arabic_History_light.yaml |multiple_choice |\n", - "|arabic_leaderboard_acva_Arabic_Language_Origin |lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_avca/arabic_leaderboard_acva_Arabic_Language_Origin.yaml |multiple_choice |\n", - "|arabic_leaderboard_acva_Arabic_Language_Origin_light |lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_avca_light/arabic_leaderboard_acva_Arabic_Language_Origin_light.yaml |multiple_choice |\n", - "|arabic_leaderboard_acva_Arabic_Literature |lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_avca/arabic_leaderboard_acva_Arabic_Literature.yaml |multiple_choice |\n", - "|arabic_leaderboard_acva_Arabic_Literature_light |lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_avca_light/arabic_leaderboard_acva_Arabic_Literature_light.yaml |multiple_choice |\n", - "|arabic_leaderboard_acva_Arabic_Math |lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_avca/arabic_leaderboard_acva_Arabic_Math.yaml |multiple_choice |\n", - "|arabic_leaderboard_acva_Arabic_Math_light |lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_avca_light/arabic_leaderboard_acva_Arabic_Math_light.yaml |multiple_choice |\n", - "|arabic_leaderboard_acva_Arabic_Medicine |lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_avca/arabic_leaderboard_acva_Arabic_Medicine.yaml |multiple_choice |\n", - "|arabic_leaderboard_acva_Arabic_Medicine_light |lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_avca_light/arabic_leaderboard_acva_Arabic_Medicine_light.yaml |multiple_choice |\n", - "|arabic_leaderboard_acva_Arabic_Music |lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_avca/arabic_leaderboard_acva_Arabic_Music.yaml |multiple_choice |\n", - "|arabic_leaderboard_acva_Arabic_Music_light |lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_avca_light/arabic_leaderboard_acva_Arabic_Music_light.yaml |multiple_choice |\n", - "|arabic_leaderboard_acva_Arabic_Ornament |lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_avca/arabic_leaderboard_acva_Arabic_Ornament.yaml |multiple_choice |\n", - "|arabic_leaderboard_acva_Arabic_Ornament_light |lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_avca_light/arabic_leaderboard_acva_Arabic_Ornament_light.yaml |multiple_choice |\n", - "|arabic_leaderboard_acva_Arabic_Philosophy |lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_avca/arabic_leaderboard_acva_Arabic_Philosophy.yaml |multiple_choice |\n", - "|arabic_leaderboard_acva_Arabic_Philosophy_light |lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_avca_light/arabic_leaderboard_acva_Arabic_Philosophy_light.yaml |multiple_choice |\n", - "|arabic_leaderboard_acva_Arabic_Physics_and_Chemistry |lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_avca/arabic_leaderboard_acva_Arabic_Physics_and_Chemistry.yaml |multiple_choice |\n", - "|arabic_leaderboard_acva_Arabic_Physics_and_Chemistry_light |lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_avca_light/arabic_leaderboard_acva_Arabic_Physics_and_Chemistry_light.yaml |multiple_choice |\n", - "|arabic_leaderboard_acva_Arabic_Wedding |lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_avca/arabic_leaderboard_acva_Arabic_Wedding.yaml |multiple_choice |\n", - "|arabic_leaderboard_acva_Arabic_Wedding_light |lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_avca_light/arabic_leaderboard_acva_Arabic_Wedding_light.yaml |multiple_choice |\n", - "|arabic_leaderboard_acva_Bahrain |lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_avca/arabic_leaderboard_acva_Bahrain.yaml |multiple_choice |\n", - "|arabic_leaderboard_acva_Bahrain_light |lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_avca_light/arabic_leaderboard_acva_Bahrain_light.yaml |multiple_choice |\n", - "|arabic_leaderboard_acva_Comoros |lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_avca/arabic_leaderboard_acva_Comoros.yaml |multiple_choice |\n", - "|arabic_leaderboard_acva_Comoros_light |lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_avca_light/arabic_leaderboard_acva_Comoros_light.yaml |multiple_choice |\n", - "|arabic_leaderboard_acva_Egypt_modern |lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_avca/arabic_leaderboard_acva_Egypt_modern.yaml |multiple_choice |\n", - "|arabic_leaderboard_acva_Egypt_modern_light |lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_avca_light/arabic_leaderboard_acva_Egypt_modern_light.yaml |multiple_choice |\n", - "|arabic_leaderboard_acva_InfluenceFromAncientEgypt |lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_avca/arabic_leaderboard_acva_InfluenceFromAncientEgypt.yaml |multiple_choice |\n", - "|arabic_leaderboard_acva_InfluenceFromAncientEgypt_light |lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_avca_light/arabic_leaderboard_acva_InfluenceFromAncientEgypt_light.yaml |multiple_choice |\n", - "|arabic_leaderboard_acva_InfluenceFromByzantium |lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_avca/arabic_leaderboard_acva_InfluenceFromByzantium.yaml |multiple_choice |\n", - "|arabic_leaderboard_acva_InfluenceFromByzantium_light |lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_avca_light/arabic_leaderboard_acva_InfluenceFromByzantium_light.yaml |multiple_choice |\n", - "|arabic_leaderboard_acva_InfluenceFromChina |lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_avca/arabic_leaderboard_acva_InfluenceFromChina.yaml |multiple_choice |\n", - "|arabic_leaderboard_acva_InfluenceFromChina_light |lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_avca_light/arabic_leaderboard_acva_InfluenceFromChina_light.yaml |multiple_choice |\n", - "|arabic_leaderboard_acva_InfluenceFromGreece |lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_avca/arabic_leaderboard_acva_InfluenceFromGreece.yaml |multiple_choice |\n", - "|arabic_leaderboard_acva_InfluenceFromGreece_light |lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_avca_light/arabic_leaderboard_acva_InfluenceFromGreece_light.yaml |multiple_choice |\n", - "|arabic_leaderboard_acva_InfluenceFromIslam |lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_avca/arabic_leaderboard_acva_InfluenceFromIslam.yaml |multiple_choice |\n", - "|arabic_leaderboard_acva_InfluenceFromIslam_light |lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_avca_light/arabic_leaderboard_acva_InfluenceFromIslam_light.yaml |multiple_choice |\n", - "|arabic_leaderboard_acva_InfluenceFromPersia |lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_avca/arabic_leaderboard_acva_InfluenceFromPersia.yaml |multiple_choice |\n", - "|arabic_leaderboard_acva_InfluenceFromPersia_light |lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_avca_light/arabic_leaderboard_acva_InfluenceFromPersia_light.yaml |multiple_choice |\n", - "|arabic_leaderboard_acva_InfluenceFromRome |lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_avca/arabic_leaderboard_acva_InfluenceFromRome.yaml |multiple_choice |\n", - "|arabic_leaderboard_acva_InfluenceFromRome_light |lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_avca_light/arabic_leaderboard_acva_InfluenceFromRome_light.yaml |multiple_choice |\n", - "|arabic_leaderboard_acva_Iraq |lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_avca/arabic_leaderboard_acva_Iraq.yaml |multiple_choice |\n", - "|arabic_leaderboard_acva_Iraq_light |lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_avca_light/arabic_leaderboard_acva_Iraq_light.yaml |multiple_choice |\n", - "|arabic_leaderboard_acva_Islam_Education |lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_avca/arabic_leaderboard_acva_Islam_Education.yaml |multiple_choice |\n", - "|arabic_leaderboard_acva_Islam_Education_light |lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_avca_light/arabic_leaderboard_acva_Islam_Education_light.yaml |multiple_choice |\n", - "|arabic_leaderboard_acva_Islam_branches_and_schools |lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_avca/arabic_leaderboard_acva_Islam_branches_and_schools.yaml |multiple_choice |\n", - "|arabic_leaderboard_acva_Islam_branches_and_schools_light |lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_avca_light/arabic_leaderboard_acva_Islam_branches_and_schools_light.yaml |multiple_choice |\n", - "|arabic_leaderboard_acva_Islamic_law_system |lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_avca/arabic_leaderboard_acva_Islamic_law_system.yaml |multiple_choice |\n", - "|arabic_leaderboard_acva_Islamic_law_system_light |lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_avca_light/arabic_leaderboard_acva_Islamic_law_system_light.yaml |multiple_choice |\n", - "|arabic_leaderboard_acva_Jordan |lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_avca/arabic_leaderboard_acva_Jordan.yaml |multiple_choice |\n", - "|arabic_leaderboard_acva_Jordan_light |lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_avca_light/arabic_leaderboard_acva_Jordan_light.yaml |multiple_choice |\n", - "|arabic_leaderboard_acva_Kuwait |lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_avca/arabic_leaderboard_acva_Kuwait.yaml |multiple_choice |\n", - "|arabic_leaderboard_acva_Kuwait_light |lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_avca_light/arabic_leaderboard_acva_Kuwait_light.yaml |multiple_choice |\n", - "|arabic_leaderboard_acva_Lebanon |lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_avca/arabic_leaderboard_acva_Lebanon.yaml |multiple_choice |\n", - "|arabic_leaderboard_acva_Lebanon_light |lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_avca_light/arabic_leaderboard_acva_Lebanon_light.yaml |multiple_choice |\n", - "|arabic_leaderboard_acva_Libya |lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_avca/arabic_leaderboard_acva_Libya.yaml |multiple_choice |\n", - "|arabic_leaderboard_acva_Libya_light |lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_avca_light/arabic_leaderboard_acva_Libya_light.yaml |multiple_choice |\n", - "|arabic_leaderboard_acva_Mauritania |lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_avca/arabic_leaderboard_acva_Mauritania.yaml |multiple_choice |\n", - "|arabic_leaderboard_acva_Mauritania_light |lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_avca_light/arabic_leaderboard_acva_Mauritania_light.yaml |multiple_choice |\n", - "|arabic_leaderboard_acva_Mesopotamia_civilization |lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_avca/arabic_leaderboard_acva_Mesopotamia_civilization.yaml |multiple_choice |\n", - "|arabic_leaderboard_acva_Mesopotamia_civilization_light |lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_avca_light/arabic_leaderboard_acva_Mesopotamia_civilization_light.yaml |multiple_choice |\n", - "|arabic_leaderboard_acva_Morocco |lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_avca/arabic_leaderboard_acva_Morocco.yaml |multiple_choice |\n", - "|arabic_leaderboard_acva_Morocco_light |lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_avca_light/arabic_leaderboard_acva_Morocco_light.yaml |multiple_choice |\n", - "|arabic_leaderboard_acva_Oman |lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_avca/arabic_leaderboard_acva_Oman.yaml |multiple_choice |\n", - "|arabic_leaderboard_acva_Oman_light |lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_avca_light/arabic_leaderboard_acva_Oman_light.yaml |multiple_choice |\n", - "|arabic_leaderboard_acva_Palestine |lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_avca/arabic_leaderboard_acva_Palestine.yaml |multiple_choice |\n", - "|arabic_leaderboard_acva_Palestine_light |lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_avca_light/arabic_leaderboard_acva_Palestine_light.yaml |multiple_choice |\n", - "|arabic_leaderboard_acva_Qatar |lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_avca/arabic_leaderboard_acva_Qatar.yaml |multiple_choice |\n", - "|arabic_leaderboard_acva_Qatar_light |lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_avca_light/arabic_leaderboard_acva_Qatar_light.yaml |multiple_choice |\n", - "|arabic_leaderboard_acva_Saudi_Arabia |lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_avca/arabic_leaderboard_acva_Saudi_Arabia.yaml |multiple_choice |\n", - "|arabic_leaderboard_acva_Saudi_Arabia_light |lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_avca_light/arabic_leaderboard_acva_Saudi_Arabia_light.yaml |multiple_choice |\n", - "|arabic_leaderboard_acva_Somalia |lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_avca/arabic_leaderboard_acva_Somalia.yaml |multiple_choice |\n", - "|arabic_leaderboard_acva_Somalia_light |lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_avca_light/arabic_leaderboard_acva_Somalia_light.yaml |multiple_choice |\n", - "|arabic_leaderboard_acva_Sudan |lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_avca/arabic_leaderboard_acva_Sudan.yaml |multiple_choice |\n", - "|arabic_leaderboard_acva_Sudan_light |lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_avca_light/arabic_leaderboard_acva_Sudan_light.yaml |multiple_choice |\n", - "|arabic_leaderboard_acva_Syria |lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_avca/arabic_leaderboard_acva_Syria.yaml |multiple_choice |\n", - "|arabic_leaderboard_acva_Syria_light |lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_avca_light/arabic_leaderboard_acva_Syria_light.yaml |multiple_choice |\n", - "|arabic_leaderboard_acva_Tunisia |lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_avca/arabic_leaderboard_acva_Tunisia.yaml |multiple_choice |\n", - "|arabic_leaderboard_acva_Tunisia_light |lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_avca_light/arabic_leaderboard_acva_Tunisia_light.yaml |multiple_choice |\n", - "|arabic_leaderboard_acva_United_Arab_Emirates |lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_avca/arabic_leaderboard_acva_United_Arab_Emirates.yaml |multiple_choice |\n", - "|arabic_leaderboard_acva_United_Arab_Emirates_light |lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_avca_light/arabic_leaderboard_acva_United_Arab_Emirates_light.yaml |multiple_choice |\n", - "|arabic_leaderboard_acva_Yemen |lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_avca/arabic_leaderboard_acva_Yemen.yaml |multiple_choice |\n", - "|arabic_leaderboard_acva_Yemen_light |lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_avca_light/arabic_leaderboard_acva_Yemen_light.yaml |multiple_choice |\n", - "|arabic_leaderboard_acva_communication |lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_avca/arabic_leaderboard_acva_communication.yaml |multiple_choice |\n", - "|arabic_leaderboard_acva_communication_light |lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_avca_light/arabic_leaderboard_acva_communication_light.yaml |multiple_choice |\n", - "|arabic_leaderboard_acva_computer_and_phone |lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_avca/arabic_leaderboard_acva_computer_and_phone.yaml |multiple_choice |\n", - "|arabic_leaderboard_acva_computer_and_phone_light |lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_avca_light/arabic_leaderboard_acva_computer_and_phone_light.yaml |multiple_choice |\n", - "|arabic_leaderboard_acva_daily_life |lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_avca/arabic_leaderboard_acva_daily_life.yaml |multiple_choice |\n", - "|arabic_leaderboard_acva_daily_life_light |lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_avca_light/arabic_leaderboard_acva_daily_life_light.yaml |multiple_choice |\n", - "|arabic_leaderboard_acva_entertainment |lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_avca/arabic_leaderboard_acva_entertainment.yaml |multiple_choice |\n", - "|arabic_leaderboard_acva_entertainment_light |lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_avca_light/arabic_leaderboard_acva_entertainment_light.yaml |multiple_choice |\n", - "|arabic_leaderboard_alghafa_mcq_exams_test_ar |lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_alghafa/arabic_leaderboard_alghafa_mcq_exams_test_ar.yaml |multiple_choice |\n", - "|arabic_leaderboard_alghafa_mcq_exams_test_ar_light |lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_alghafa_light/arabic_leaderboard_alghafa_mcq_exams_test_ar_light.yaml |multiple_choice |\n", - "|arabic_leaderboard_alghafa_meta_ar_dialects |lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_alghafa/arabic_leaderboard_alghafa_meta_ar_dialects.yaml |multiple_choice |\n", - "|arabic_leaderboard_alghafa_meta_ar_dialects_light |lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_alghafa_light/arabic_leaderboard_alghafa_meta_ar_dialects_light.yaml |multiple_choice |\n", - "|arabic_leaderboard_alghafa_meta_ar_msa |lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_alghafa/arabic_leaderboard_alghafa_meta_ar_msa.yaml |multiple_choice |\n", - "|arabic_leaderboard_alghafa_meta_ar_msa_light |lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_alghafa_light/arabic_leaderboard_alghafa_meta_ar_msa_light.yaml |multiple_choice |\n", - "|arabic_leaderboard_alghafa_multiple_choice_facts_truefalse_balanced_task |lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_alghafa/arabic_leaderboard_alghafa_multiple_choice_facts_truefalse_balanced_task.yaml |multiple_choice |\n", - "|arabic_leaderboard_alghafa_multiple_choice_facts_truefalse_balanced_task_light |lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_alghafa_light/arabic_leaderboard_alghafa_multiple_choice_facts_truefalse_balanced_task_light.yaml |multiple_choice |\n", - "|arabic_leaderboard_alghafa_multiple_choice_grounded_statement_soqal_task |lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_alghafa/arabic_leaderboard_alghafa_multiple_choice_grounded_statement_soqal_task.yaml |multiple_choice |\n", - "|arabic_leaderboard_alghafa_multiple_choice_grounded_statement_soqal_task_light |lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_alghafa_light/arabic_leaderboard_alghafa_multiple_choice_grounded_statement_soqal_task_light.yaml |multiple_choice |\n", - "|arabic_leaderboard_alghafa_multiple_choice_grounded_statement_xglue_mlqa_task |lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_alghafa/arabic_leaderboard_alghafa_multiple_choice_grounded_statement_xglue_mlqa_task.yaml |multiple_choice |\n", - "|arabic_leaderboard_alghafa_multiple_choice_grounded_statement_xglue_mlqa_task_light |lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_alghafa_light/arabic_leaderboard_alghafa_multiple_choice_grounded_statement_xglue_mlqa_task_light.yaml|multiple_choice |\n", - "|arabic_leaderboard_alghafa_multiple_choice_rating_sentiment_no_neutral_task |lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_alghafa/arabic_leaderboard_alghafa_multiple_choice_rating_sentiment_no_neutral_task.yaml |multiple_choice |\n", - "|arabic_leaderboard_alghafa_multiple_choice_rating_sentiment_no_neutral_task_light |lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_alghafa_light/arabic_leaderboard_alghafa_multiple_choice_rating_sentiment_no_neutral_task_light.yaml |multiple_choice |\n", - "|arabic_leaderboard_alghafa_multiple_choice_rating_sentiment_task |lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_alghafa/arabic_leaderboard_alghafa_multiple_choice_rating_sentiment_task.yaml |multiple_choice |\n", - "|arabic_leaderboard_alghafa_multiple_choice_rating_sentiment_task_light |lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_alghafa_light/arabic_leaderboard_alghafa_multiple_choice_rating_sentiment_task_light.yaml |multiple_choice |\n", - "|arabic_leaderboard_alghafa_multiple_choice_sentiment_task |lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_alghafa/arabic_leaderboard_alghafa_multiple_choice_sentiment_task.yaml |multiple_choice |\n", - "|arabic_leaderboard_alghafa_multiple_choice_sentiment_task_light |lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_alghafa_light/arabic_leaderboard_alghafa_multiple_choice_sentiment_task_light.yaml |multiple_choice |\n", - "|arabic_leaderboard_arabic_mmlu_abstract_algebra |lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mmlu/arabic_leaderboard_arabic_mmlu_abstract_algebra.yaml |multiple_choice |\n", - "|arabic_leaderboard_arabic_mmlu_abstract_algebra_light |lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_arabic_mmlu_light/arabic_leaderboard_arabic_mmlu_abstract_algebra_light.yaml |multiple_choice |\n", - "|arabic_leaderboard_arabic_mmlu_anatomy |lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mmlu/arabic_leaderboard_arabic_mmlu_anatomy.yaml |multiple_choice |\n", - "|arabic_leaderboard_arabic_mmlu_anatomy_light |lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_arabic_mmlu_light/arabic_leaderboard_arabic_mmlu_anatomy_light.yaml |multiple_choice |\n", - "|arabic_leaderboard_arabic_mmlu_astronomy |lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mmlu/arabic_leaderboard_arabic_mmlu_astronomy.yaml |multiple_choice |\n", - "|arabic_leaderboard_arabic_mmlu_astronomy_light |lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_arabic_mmlu_light/arabic_leaderboard_arabic_mmlu_astronomy_light.yaml |multiple_choice |\n", - "|arabic_leaderboard_arabic_mmlu_business_ethics |lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mmlu/arabic_leaderboard_arabic_mmlu_business_ethics.yaml |multiple_choice |\n", - "|arabic_leaderboard_arabic_mmlu_business_ethics_light |lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_arabic_mmlu_light/arabic_leaderboard_arabic_mmlu_business_ethics_light.yaml |multiple_choice |\n", - "|arabic_leaderboard_arabic_mmlu_clinical_knowledge |lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mmlu/arabic_leaderboard_arabic_mmlu_clinical_knowledge.yaml |multiple_choice |\n", - "|arabic_leaderboard_arabic_mmlu_clinical_knowledge_light |lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_arabic_mmlu_light/arabic_leaderboard_arabic_mmlu_clinical_knowledge_light.yaml |multiple_choice |\n", - "|arabic_leaderboard_arabic_mmlu_college_biology |lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mmlu/arabic_leaderboard_arabic_mmlu_college_biology.yaml |multiple_choice |\n", - "|arabic_leaderboard_arabic_mmlu_college_biology_light |lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_arabic_mmlu_light/arabic_leaderboard_arabic_mmlu_college_biology_light.yaml |multiple_choice |\n", - "|arabic_leaderboard_arabic_mmlu_college_chemistry |lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mmlu/arabic_leaderboard_arabic_mmlu_college_chemistry.yaml |multiple_choice |\n", - "|arabic_leaderboard_arabic_mmlu_college_chemistry_light |lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_arabic_mmlu_light/arabic_leaderboard_arabic_mmlu_college_chemistry_light.yaml |multiple_choice |\n", - "|arabic_leaderboard_arabic_mmlu_college_computer_science |lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mmlu/arabic_leaderboard_arabic_mmlu_college_computer_science.yaml |multiple_choice |\n", - "|arabic_leaderboard_arabic_mmlu_college_computer_science_light |lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_arabic_mmlu_light/arabic_leaderboard_arabic_mmlu_college_computer_science_light.yaml |multiple_choice |\n", - "|arabic_leaderboard_arabic_mmlu_college_mathematics |lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mmlu/arabic_leaderboard_arabic_mmlu_college_mathematics.yaml |multiple_choice |\n", - "|arabic_leaderboard_arabic_mmlu_college_mathematics_light |lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_arabic_mmlu_light/arabic_leaderboard_arabic_mmlu_college_mathematics_light.yaml |multiple_choice |\n", - "|arabic_leaderboard_arabic_mmlu_college_medicine |lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mmlu/arabic_leaderboard_arabic_mmlu_college_medicine.yaml |multiple_choice |\n", - "|arabic_leaderboard_arabic_mmlu_college_medicine_light |lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_arabic_mmlu_light/arabic_leaderboard_arabic_mmlu_college_medicine_light.yaml |multiple_choice |\n", - "|arabic_leaderboard_arabic_mmlu_college_physics |lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mmlu/arabic_leaderboard_arabic_mmlu_college_physics.yaml |multiple_choice |\n", - "|arabic_leaderboard_arabic_mmlu_college_physics_light |lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_arabic_mmlu_light/arabic_leaderboard_arabic_mmlu_college_physics_light.yaml |multiple_choice |\n", - "|arabic_leaderboard_arabic_mmlu_computer_security |lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mmlu/arabic_leaderboard_arabic_mmlu_computer_security.yaml |multiple_choice |\n", - "|arabic_leaderboard_arabic_mmlu_computer_security_light |lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_arabic_mmlu_light/arabic_leaderboard_arabic_mmlu_computer_security_light.yaml |multiple_choice |\n", - "|arabic_leaderboard_arabic_mmlu_conceptual_physics |lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mmlu/arabic_leaderboard_arabic_mmlu_conceptual_physics.yaml |multiple_choice |\n", - "|arabic_leaderboard_arabic_mmlu_conceptual_physics_light |lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_arabic_mmlu_light/arabic_leaderboard_arabic_mmlu_conceptual_physics_light.yaml |multiple_choice |\n", - "|arabic_leaderboard_arabic_mmlu_econometrics |lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mmlu/arabic_leaderboard_arabic_mmlu_econometrics.yaml |multiple_choice |\n", - "|arabic_leaderboard_arabic_mmlu_econometrics_light |lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_arabic_mmlu_light/arabic_leaderboard_arabic_mmlu_econometrics_light.yaml |multiple_choice |\n", - "|arabic_leaderboard_arabic_mmlu_electrical_engineering |lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mmlu/arabic_leaderboard_arabic_mmlu_electrical_engineering.yaml |multiple_choice |\n", - "|arabic_leaderboard_arabic_mmlu_electrical_engineering_light |lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_arabic_mmlu_light/arabic_leaderboard_arabic_mmlu_electrical_engineering_light.yaml |multiple_choice |\n", - "|arabic_leaderboard_arabic_mmlu_elementary_mathematics |lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mmlu/arabic_leaderboard_arabic_mmlu_elementary_mathematics.yaml |multiple_choice |\n", - "|arabic_leaderboard_arabic_mmlu_elementary_mathematics_light |lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_arabic_mmlu_light/arabic_leaderboard_arabic_mmlu_elementary_mathematics_light.yaml |multiple_choice |\n", - "|arabic_leaderboard_arabic_mmlu_formal_logic |lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mmlu/arabic_leaderboard_arabic_mmlu_formal_logic.yaml |multiple_choice |\n", - "|arabic_leaderboard_arabic_mmlu_formal_logic_light |lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_arabic_mmlu_light/arabic_leaderboard_arabic_mmlu_formal_logic_light.yaml |multiple_choice |\n", - "|arabic_leaderboard_arabic_mmlu_global_facts |lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mmlu/arabic_leaderboard_arabic_mmlu_global_facts.yaml |multiple_choice |\n", - "|arabic_leaderboard_arabic_mmlu_global_facts_light |lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_arabic_mmlu_light/arabic_leaderboard_arabic_mmlu_global_facts_light.yaml |multiple_choice |\n", - "|arabic_leaderboard_arabic_mmlu_high_school_biology |lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mmlu/arabic_leaderboard_arabic_mmlu_high_school_biology.yaml |multiple_choice |\n", - "|arabic_leaderboard_arabic_mmlu_high_school_biology_light |lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_arabic_mmlu_light/arabic_leaderboard_arabic_mmlu_high_school_biology_light.yaml |multiple_choice |\n", - "|arabic_leaderboard_arabic_mmlu_high_school_chemistry |lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mmlu/arabic_leaderboard_arabic_mmlu_high_school_chemistry.yaml |multiple_choice |\n", - "|arabic_leaderboard_arabic_mmlu_high_school_chemistry_light |lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_arabic_mmlu_light/arabic_leaderboard_arabic_mmlu_high_school_chemistry_light.yaml |multiple_choice |\n", - "|arabic_leaderboard_arabic_mmlu_high_school_computer_science |lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mmlu/arabic_leaderboard_arabic_mmlu_high_school_computer_science.yaml |multiple_choice |\n", - "|arabic_leaderboard_arabic_mmlu_high_school_computer_science_light |lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_arabic_mmlu_light/arabic_leaderboard_arabic_mmlu_high_school_computer_science_light.yaml |multiple_choice |\n", - "|arabic_leaderboard_arabic_mmlu_high_school_european_history |lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mmlu/arabic_leaderboard_arabic_mmlu_high_school_european_history.yaml |multiple_choice |\n", - "|arabic_leaderboard_arabic_mmlu_high_school_european_history_light |lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_arabic_mmlu_light/arabic_leaderboard_arabic_mmlu_high_school_european_history_light.yaml |multiple_choice |\n", - "|arabic_leaderboard_arabic_mmlu_high_school_geography |lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mmlu/arabic_leaderboard_arabic_mmlu_high_school_geography.yaml |multiple_choice |\n", - "|arabic_leaderboard_arabic_mmlu_high_school_geography_light |lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_arabic_mmlu_light/arabic_leaderboard_arabic_mmlu_high_school_geography_light.yaml |multiple_choice |\n", - "|arabic_leaderboard_arabic_mmlu_high_school_government_and_politics |lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mmlu/arabic_leaderboard_arabic_mmlu_high_school_government_and_politics.yaml |multiple_choice |\n", - "|arabic_leaderboard_arabic_mmlu_high_school_government_and_politics_light |lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_arabic_mmlu_light/arabic_leaderboard_arabic_mmlu_high_school_government_and_politics_light.yaml |multiple_choice |\n", - "|arabic_leaderboard_arabic_mmlu_high_school_macroeconomics |lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mmlu/arabic_leaderboard_arabic_mmlu_high_school_macroeconomics.yaml |multiple_choice |\n", - "|arabic_leaderboard_arabic_mmlu_high_school_macroeconomics_light |lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_arabic_mmlu_light/arabic_leaderboard_arabic_mmlu_high_school_macroeconomics_light.yaml |multiple_choice |\n", - "|arabic_leaderboard_arabic_mmlu_high_school_mathematics |lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mmlu/arabic_leaderboard_arabic_mmlu_high_school_mathematics.yaml |multiple_choice |\n", - "|arabic_leaderboard_arabic_mmlu_high_school_mathematics_light |lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_arabic_mmlu_light/arabic_leaderboard_arabic_mmlu_high_school_mathematics_light.yaml |multiple_choice |\n", - "|arabic_leaderboard_arabic_mmlu_high_school_microeconomics |lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mmlu/arabic_leaderboard_arabic_mmlu_high_school_microeconomics.yaml |multiple_choice |\n", - "|arabic_leaderboard_arabic_mmlu_high_school_microeconomics_light |lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_arabic_mmlu_light/arabic_leaderboard_arabic_mmlu_high_school_microeconomics_light.yaml |multiple_choice |\n", - "|arabic_leaderboard_arabic_mmlu_high_school_physics |lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mmlu/arabic_leaderboard_arabic_mmlu_high_school_physics.yaml |multiple_choice |\n", - "|arabic_leaderboard_arabic_mmlu_high_school_physics_light |lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_arabic_mmlu_light/arabic_leaderboard_arabic_mmlu_high_school_physics_light.yaml |multiple_choice |\n", - "|arabic_leaderboard_arabic_mmlu_high_school_psychology |lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mmlu/arabic_leaderboard_arabic_mmlu_high_school_psychology.yaml |multiple_choice |\n", - "|arabic_leaderboard_arabic_mmlu_high_school_psychology_light |lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_arabic_mmlu_light/arabic_leaderboard_arabic_mmlu_high_school_psychology_light.yaml |multiple_choice |\n", - "|arabic_leaderboard_arabic_mmlu_high_school_statistics |lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mmlu/arabic_leaderboard_arabic_mmlu_high_school_statistics.yaml |multiple_choice |\n", - "|arabic_leaderboard_arabic_mmlu_high_school_statistics_light |lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_arabic_mmlu_light/arabic_leaderboard_arabic_mmlu_high_school_statistics_light.yaml |multiple_choice |\n", - "|arabic_leaderboard_arabic_mmlu_high_school_us_history |lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mmlu/arabic_leaderboard_arabic_mmlu_high_school_us_history.yaml |multiple_choice |\n", - "|arabic_leaderboard_arabic_mmlu_high_school_us_history_light |lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_arabic_mmlu_light/arabic_leaderboard_arabic_mmlu_high_school_us_history_light.yaml |multiple_choice |\n", - "|arabic_leaderboard_arabic_mmlu_high_school_world_history |lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mmlu/arabic_leaderboard_arabic_mmlu_high_school_world_history.yaml |multiple_choice |\n", - "|arabic_leaderboard_arabic_mmlu_high_school_world_history_light |lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_arabic_mmlu_light/arabic_leaderboard_arabic_mmlu_high_school_world_history_light.yaml |multiple_choice |\n", - "|arabic_leaderboard_arabic_mmlu_human_aging |lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mmlu/arabic_leaderboard_arabic_mmlu_human_aging.yaml |multiple_choice |\n", - "|arabic_leaderboard_arabic_mmlu_human_aging_light |lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_arabic_mmlu_light/arabic_leaderboard_arabic_mmlu_human_aging_light.yaml |multiple_choice |\n", - "|arabic_leaderboard_arabic_mmlu_human_sexuality |lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mmlu/arabic_leaderboard_arabic_mmlu_human_sexuality.yaml |multiple_choice |\n", - "|arabic_leaderboard_arabic_mmlu_human_sexuality_light |lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_arabic_mmlu_light/arabic_leaderboard_arabic_mmlu_human_sexuality_light.yaml |multiple_choice |\n", - "|arabic_leaderboard_arabic_mmlu_international_law |lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mmlu/arabic_leaderboard_arabic_mmlu_international_law.yaml |multiple_choice |\n", - "|arabic_leaderboard_arabic_mmlu_international_law_light |lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_arabic_mmlu_light/arabic_leaderboard_arabic_mmlu_international_law_light.yaml |multiple_choice |\n", - "|arabic_leaderboard_arabic_mmlu_jurisprudence |lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mmlu/arabic_leaderboard_arabic_mmlu_jurisprudence.yaml |multiple_choice |\n", - "|arabic_leaderboard_arabic_mmlu_jurisprudence_light |lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_arabic_mmlu_light/arabic_leaderboard_arabic_mmlu_jurisprudence_light.yaml |multiple_choice |\n", - "|arabic_leaderboard_arabic_mmlu_logical_fallacies |lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mmlu/arabic_leaderboard_arabic_mmlu_logical_fallacies.yaml |multiple_choice |\n", - "|arabic_leaderboard_arabic_mmlu_logical_fallacies_light |lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_arabic_mmlu_light/arabic_leaderboard_arabic_mmlu_logical_fallacies_light.yaml |multiple_choice |\n", - "|arabic_leaderboard_arabic_mmlu_machine_learning |lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mmlu/arabic_leaderboard_arabic_mmlu_machine_learning.yaml |multiple_choice |\n", - "|arabic_leaderboard_arabic_mmlu_machine_learning_light |lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_arabic_mmlu_light/arabic_leaderboard_arabic_mmlu_machine_learning_light.yaml |multiple_choice |\n", - "|arabic_leaderboard_arabic_mmlu_management |lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mmlu/arabic_leaderboard_arabic_mmlu_management.yaml |multiple_choice |\n", - "|arabic_leaderboard_arabic_mmlu_management_light |lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_arabic_mmlu_light/arabic_leaderboard_arabic_mmlu_management_light.yaml |multiple_choice |\n", - "|arabic_leaderboard_arabic_mmlu_marketing |lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mmlu/arabic_leaderboard_arabic_mmlu_marketing.yaml |multiple_choice |\n", - "|arabic_leaderboard_arabic_mmlu_marketing_light |lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_arabic_mmlu_light/arabic_leaderboard_arabic_mmlu_marketing_light.yaml |multiple_choice |\n", - "|arabic_leaderboard_arabic_mmlu_medical_genetics |lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mmlu/arabic_leaderboard_arabic_mmlu_medical_genetics.yaml |multiple_choice |\n", - "|arabic_leaderboard_arabic_mmlu_medical_genetics_light |lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_arabic_mmlu_light/arabic_leaderboard_arabic_mmlu_medical_genetics_light.yaml |multiple_choice |\n", - "|arabic_leaderboard_arabic_mmlu_miscellaneous |lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mmlu/arabic_leaderboard_arabic_mmlu_miscellaneous.yaml |multiple_choice |\n", - "|arabic_leaderboard_arabic_mmlu_miscellaneous_light |lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_arabic_mmlu_light/arabic_leaderboard_arabic_mmlu_miscellaneous_light.yaml |multiple_choice |\n", - "|arabic_leaderboard_arabic_mmlu_moral_disputes |lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mmlu/arabic_leaderboard_arabic_mmlu_moral_disputes.yaml |multiple_choice |\n", - "|arabic_leaderboard_arabic_mmlu_moral_disputes_light |lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_arabic_mmlu_light/arabic_leaderboard_arabic_mmlu_moral_disputes_light.yaml |multiple_choice |\n", - "|arabic_leaderboard_arabic_mmlu_moral_scenarios |lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mmlu/arabic_leaderboard_arabic_mmlu_moral_scenarios.yaml |multiple_choice |\n", - "|arabic_leaderboard_arabic_mmlu_moral_scenarios_light |lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_arabic_mmlu_light/arabic_leaderboard_arabic_mmlu_moral_scenarios_light.yaml |multiple_choice |\n", - "|arabic_leaderboard_arabic_mmlu_nutrition |lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mmlu/arabic_leaderboard_arabic_mmlu_nutrition.yaml |multiple_choice |\n", - "|arabic_leaderboard_arabic_mmlu_nutrition_light |lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_arabic_mmlu_light/arabic_leaderboard_arabic_mmlu_nutrition_light.yaml |multiple_choice |\n", - "|arabic_leaderboard_arabic_mmlu_philosophy |lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mmlu/arabic_leaderboard_arabic_mmlu_philosophy.yaml |multiple_choice |\n", - "|arabic_leaderboard_arabic_mmlu_philosophy_light |lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_arabic_mmlu_light/arabic_leaderboard_arabic_mmlu_philosophy_light.yaml |multiple_choice |\n", - "|arabic_leaderboard_arabic_mmlu_prehistory |lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mmlu/arabic_leaderboard_arabic_mmlu_prehistory.yaml |multiple_choice |\n", - "|arabic_leaderboard_arabic_mmlu_prehistory_light |lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_arabic_mmlu_light/arabic_leaderboard_arabic_mmlu_prehistory_light.yaml |multiple_choice |\n", - "|arabic_leaderboard_arabic_mmlu_professional_accounting |lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mmlu/arabic_leaderboard_arabic_mmlu_professional_accounting.yaml |multiple_choice |\n", - "|arabic_leaderboard_arabic_mmlu_professional_accounting_light |lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_arabic_mmlu_light/arabic_leaderboard_arabic_mmlu_professional_accounting_light.yaml |multiple_choice |\n", - "|arabic_leaderboard_arabic_mmlu_professional_law |lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mmlu/arabic_leaderboard_arabic_mmlu_professional_law.yaml |multiple_choice |\n", - "|arabic_leaderboard_arabic_mmlu_professional_law_light |lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_arabic_mmlu_light/arabic_leaderboard_arabic_mmlu_professional_law_light.yaml |multiple_choice |\n", - "|arabic_leaderboard_arabic_mmlu_professional_medicine |lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mmlu/arabic_leaderboard_arabic_mmlu_professional_medicine.yaml |multiple_choice |\n", - "|arabic_leaderboard_arabic_mmlu_professional_medicine_light |lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_arabic_mmlu_light/arabic_leaderboard_arabic_mmlu_professional_medicine_light.yaml |multiple_choice |\n", - "|arabic_leaderboard_arabic_mmlu_professional_psychology |lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mmlu/arabic_leaderboard_arabic_mmlu_professional_psychology.yaml |multiple_choice |\n", - "|arabic_leaderboard_arabic_mmlu_professional_psychology_light |lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_arabic_mmlu_light/arabic_leaderboard_arabic_mmlu_professional_psychology_light.yaml |multiple_choice |\n", - "|arabic_leaderboard_arabic_mmlu_public_relations |lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mmlu/arabic_leaderboard_arabic_mmlu_public_relations.yaml |multiple_choice |\n", - "|arabic_leaderboard_arabic_mmlu_public_relations_light |lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_arabic_mmlu_light/arabic_leaderboard_arabic_mmlu_public_relations_light.yaml |multiple_choice |\n", - "|arabic_leaderboard_arabic_mmlu_security_studies |lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mmlu/arabic_leaderboard_arabic_mmlu_security_studies.yaml |multiple_choice |\n", - "|arabic_leaderboard_arabic_mmlu_security_studies_light |lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_arabic_mmlu_light/arabic_leaderboard_arabic_mmlu_security_studies_light.yaml |multiple_choice |\n", - "|arabic_leaderboard_arabic_mmlu_sociology |lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mmlu/arabic_leaderboard_arabic_mmlu_sociology.yaml |multiple_choice |\n", - "|arabic_leaderboard_arabic_mmlu_sociology_light |lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_arabic_mmlu_light/arabic_leaderboard_arabic_mmlu_sociology_light.yaml |multiple_choice |\n", - "|arabic_leaderboard_arabic_mmlu_us_foreign_policy |lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mmlu/arabic_leaderboard_arabic_mmlu_us_foreign_policy.yaml |multiple_choice |\n", - "|arabic_leaderboard_arabic_mmlu_us_foreign_policy_light |lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_arabic_mmlu_light/arabic_leaderboard_arabic_mmlu_us_foreign_policy_light.yaml |multiple_choice |\n", - "|arabic_leaderboard_arabic_mmlu_virology |lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mmlu/arabic_leaderboard_arabic_mmlu_virology.yaml |multiple_choice |\n", - "|arabic_leaderboard_arabic_mmlu_virology_light |lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_arabic_mmlu_light/arabic_leaderboard_arabic_mmlu_virology_light.yaml |multiple_choice |\n", - "|arabic_leaderboard_arabic_mmlu_world_religions |lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mmlu/arabic_leaderboard_arabic_mmlu_world_religions.yaml |multiple_choice |\n", - "|arabic_leaderboard_arabic_mmlu_world_religions_light |lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_arabic_mmlu_light/arabic_leaderboard_arabic_mmlu_world_religions_light.yaml |multiple_choice |\n", - "|arabic_mt_arc_challenge |lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mt_arc_challenge/arabic_mt_arc_challenge.yaml |multiple_choice |\n", - "|arabic_mt_arc_challenge_light |lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_arabic_mt_arc_challenge_light/arabic_mt_arc_challenge_light.yaml |multiple_choice |\n", - "|arabic_mt_arc_easy |lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mt_arc_easy/arabic_mt_arc_easy.yaml |multiple_choice |\n", - "|arabic_mt_arc_easy_light |lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_arabic_mt_arc_easy_light/arabic_mt_arc_easy_light.yaml |multiple_choice |\n", - "|arabic_mt_boolq |lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mt_boolq/arabic_mt_boolq.yaml |multiple_choice |\n", - "|arabic_mt_boolq_light |lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_arabic_mt_boolq_light/arabic_mt_boolq_light.yaml |multiple_choice |\n", - "|arabic_mt_copa |lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mt_copa/arabic_mt_copa.yaml |multiple_choice |\n", - "|arabic_mt_copa_light |lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_arabic_mt_copa_light/arabic_mt_copa_light.yaml |multiple_choice |\n", - "|arabic_mt_hellaswag |lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mt_hellaswag/arabic_mt_hellaswag.yaml |multiple_choice |\n", - "|arabic_mt_hellaswag_light |lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_arabic_mt_hellaswag_light/arabic_mt_hellaswag_light.yaml |multiple_choice |\n", - "|arabic_mt_mmlu |lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mt_mmlu/arabic_mt_mmlu.yaml |multiple_choice |\n", - "|arabic_mt_mmlu_light |lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_arabic_mt_mmlu_light/arabic_mt_mmlu_light.yaml |multiple_choice |\n", - "|arabic_mt_openbook_qa |lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mt_openbook_qa/arabic_mt_openbook_qa.yaml |multiple_choice |\n", - "|arabic_mt_openbook_qa_light |lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_arabic_mt_openbook_qa_light/arabic_mt_openbook_qa_light.yaml |multiple_choice |\n", - "|arabic_mt_piqa |lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mt_piqa/arabic_mt_piqa.yaml |multiple_choice |\n", - "|arabic_mt_piqa_light |lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_arabic_mt_piqa_light/arabic_mt_piqa_light.yaml |multiple_choice |\n", - "|arabic_mt_race |lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mt_race/arabic_mt_race.yaml |multiple_choice |\n", - "|arabic_mt_race_light |lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_arabic_mt_race_light/arabic_mt_race_light.yaml |multiple_choice |\n", - "|arabic_mt_sciq |lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mt_sciq/arabic_mt_sciq.yaml |multiple_choice |\n", - "|arabic_mt_sciq_light |lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_arabic_mt_sciq_light/arabic_mt_sciq_light.yaml |multiple_choice |\n", - "|arabic_mt_toxigen |lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mt_toxigen/arabic_mt_toxigen.yaml |multiple_choice |\n", - "|arabic_mt_toxigen_light |lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_arabic_mt_toxigen_light/arabic_mt_toxigen_light.yaml |multiple_choice |\n", - "|arabicmmlu_arabic_language_(general) |lm_eval/tasks/arabicmmlu/arabicmmlu_arabic_language_general.yaml |multiple_choice |\n", - "|arabicmmlu_arabic_language_(grammar) |lm_eval/tasks/arabicmmlu/arabicmmlu_arabic_language_grammar.yaml |multiple_choice |\n", - "|arabicmmlu_driving_test |lm_eval/tasks/arabicmmlu/arabicmmlu_driving_test.yaml |multiple_choice |\n", - "|arabicmmlu_general_knowledge |lm_eval/tasks/arabicmmlu/arabicmmlu_general_knowledge.yaml |multiple_choice |\n", - "|arabicmmlu_high_arabic_language |lm_eval/tasks/arabicmmlu/arabicmmlu_high_arabic_language.yaml |multiple_choice |\n", - "|arabicmmlu_high_biology |lm_eval/tasks/arabicmmlu/arabicmmlu_high_biology.yaml |multiple_choice |\n", - "|arabicmmlu_high_civics |lm_eval/tasks/arabicmmlu/arabicmmlu_high_civics.yaml |multiple_choice |\n", - "|arabicmmlu_high_computer_science |lm_eval/tasks/arabicmmlu/arabicmmlu_high_computer_science.yaml |multiple_choice |\n", - "|arabicmmlu_high_economics |lm_eval/tasks/arabicmmlu/arabicmmlu_high_economics.yaml |multiple_choice |\n", - "|arabicmmlu_high_geography |lm_eval/tasks/arabicmmlu/arabicmmlu_high_geography.yaml |multiple_choice |\n", - "|arabicmmlu_high_history |lm_eval/tasks/arabicmmlu/arabicmmlu_high_history.yaml |multiple_choice |\n", - "|arabicmmlu_high_islamic_studies |lm_eval/tasks/arabicmmlu/arabicmmlu_high_islamic_studies.yaml |multiple_choice |\n", - "|arabicmmlu_high_philosophy |lm_eval/tasks/arabicmmlu/arabicmmlu_high_philosophy.yaml |multiple_choice |\n", - "|arabicmmlu_high_physics |lm_eval/tasks/arabicmmlu/arabicmmlu_high_physics.yaml |multiple_choice |\n", - "|arabicmmlu_islamic_studies |lm_eval/tasks/arabicmmlu/arabicmmlu_islamic_studies.yaml |multiple_choice |\n", - "|arabicmmlu_middle_arabic_language |lm_eval/tasks/arabicmmlu/arabicmmlu_middle_arabic_language.yaml |multiple_choice |\n", - "|arabicmmlu_middle_civics |lm_eval/tasks/arabicmmlu/arabicmmlu_middle_civics.yaml |multiple_choice |\n", - "|arabicmmlu_middle_computer_science |lm_eval/tasks/arabicmmlu/arabicmmlu_middle_computer_science.yaml |multiple_choice |\n", - "|arabicmmlu_middle_economics |lm_eval/tasks/arabicmmlu/arabicmmlu_middle_economics.yaml |multiple_choice |\n", - "|arabicmmlu_middle_general_knowledge |lm_eval/tasks/arabicmmlu/arabicmmlu_middle_general_knowledge.yaml |multiple_choice |\n", - "|arabicmmlu_middle_geography |lm_eval/tasks/arabicmmlu/arabicmmlu_middle_geography.yaml |multiple_choice |\n", - "|arabicmmlu_middle_history |lm_eval/tasks/arabicmmlu/arabicmmlu_middle_history.yaml |multiple_choice |\n", - "|arabicmmlu_middle_islamic_studies |lm_eval/tasks/arabicmmlu/arabicmmlu_middle_islamic_studies.yaml |multiple_choice |\n", - "|arabicmmlu_middle_natural_science |lm_eval/tasks/arabicmmlu/arabicmmlu_middle_natural_science.yaml |multiple_choice |\n", - "|arabicmmlu_middle_social_science |lm_eval/tasks/arabicmmlu/arabicmmlu_middle_social_science.yaml |multiple_choice |\n", - "|arabicmmlu_primary_arabic_language |lm_eval/tasks/arabicmmlu/arabicmmlu_primary_arabic_language.yaml |multiple_choice |\n", - "|arabicmmlu_primary_computer_science |lm_eval/tasks/arabicmmlu/arabicmmlu_primary_computer_science.yaml |multiple_choice |\n", - "|arabicmmlu_primary_general_knowledge |lm_eval/tasks/arabicmmlu/arabicmmlu_primary_general_knowledge.yaml |multiple_choice |\n", - "|arabicmmlu_primary_geography |lm_eval/tasks/arabicmmlu/arabicmmlu_primary_geography.yaml |multiple_choice |\n", - "|arabicmmlu_primary_history |lm_eval/tasks/arabicmmlu/arabicmmlu_primary_history.yaml |multiple_choice |\n", - "|arabicmmlu_primary_islamic_studies |lm_eval/tasks/arabicmmlu/arabicmmlu_primary_islamic_studies.yaml |multiple_choice |\n", - "|arabicmmlu_primary_math |lm_eval/tasks/arabicmmlu/arabicmmlu_primary_math.yaml |multiple_choice |\n", - "|arabicmmlu_primary_natural_science |lm_eval/tasks/arabicmmlu/arabicmmlu_primary_natural_science.yaml |multiple_choice |\n", - "|arabicmmlu_primary_social_science |lm_eval/tasks/arabicmmlu/arabicmmlu_primary_social_science.yaml |multiple_choice |\n", - "|arabicmmlu_prof_law |lm_eval/tasks/arabicmmlu/arabicmmlu_prof_law.yaml |multiple_choice |\n", - "|arabicmmlu_univ_accounting |lm_eval/tasks/arabicmmlu/arabicmmlu_univ_accounting.yaml |multiple_choice |\n", - "|arabicmmlu_univ_computer_science |lm_eval/tasks/arabicmmlu/arabicmmlu_univ_computer_science.yaml |multiple_choice |\n", - "|arabicmmlu_univ_economics |lm_eval/tasks/arabicmmlu/arabicmmlu_univ_economics.yaml |multiple_choice |\n", - "|arabicmmlu_univ_management |lm_eval/tasks/arabicmmlu/arabicmmlu_univ_management.yaml |multiple_choice |\n", - "|arabicmmlu_univ_political_science |lm_eval/tasks/arabicmmlu/arabicmmlu_univ_political_science.yaml |multiple_choice |\n", - "|arc_ar |lm_eval/tasks/okapi/arc_multilingual/arc_ar.yaml |multiple_choice |\n", - "|arc_bn |lm_eval/tasks/okapi/arc_multilingual/arc_bn.yaml |multiple_choice |\n", - "|arc_ca |lm_eval/tasks/okapi/arc_multilingual/arc_ca.yaml |multiple_choice |\n", - "|arc_ca_challenge |lm_eval/tasks/catalan_bench/arc_ca_challenge.yaml |multiple_choice |\n", - "|arc_ca_easy |lm_eval/tasks/catalan_bench/arc_ca_easy.yaml |multiple_choice |\n", - "|arc_challenge |lm_eval/tasks/arc/arc_challenge.yaml |multiple_choice |\n", - "|arc_challenge_mt_da |lm_eval/tasks/arc_mt/arc_challenge_mt_da.yaml |multiple_choice |\n", - "|arc_challenge_mt_de |lm_eval/tasks/arc_mt/arc_challenge_mt_de.yaml |multiple_choice |\n", - "|arc_challenge_mt_el |lm_eval/tasks/arc_mt/arc_challenge_mt_el.yaml |multiple_choice |\n", - "|arc_challenge_mt_es |lm_eval/tasks/arc_mt/arc_challenge_mt_es.yaml |multiple_choice |\n", - "|arc_challenge_mt_fi |lm_eval/tasks/arc_mt/arc_challenge_mt_fi.yaml |multiple_choice |\n", - "|arc_challenge_mt_hu |lm_eval/tasks/arc_mt/arc_challenge_mt_hu.yaml |multiple_choice |\n", - "|arc_challenge_mt_is |lm_eval/tasks/arc_mt/arc_challenge_mt_is.yaml |multiple_choice |\n", - "|arc_challenge_mt_it |lm_eval/tasks/arc_mt/arc_challenge_mt_it.yaml |multiple_choice |\n", - "|arc_challenge_mt_nb |lm_eval/tasks/arc_mt/arc_challenge_mt_nb.yaml |multiple_choice |\n", - "|arc_challenge_mt_pl |lm_eval/tasks/arc_mt/arc_challenge_mt_pl.yaml |multiple_choice |\n", - "|arc_challenge_mt_pt |lm_eval/tasks/arc_mt/arc_challenge_mt_pt.yaml |multiple_choice |\n", - "|arc_challenge_mt_sv |lm_eval/tasks/arc_mt/arc_challenge_mt_sv.yaml |multiple_choice |\n", - "|arc_da |lm_eval/tasks/okapi/arc_multilingual/arc_da.yaml |multiple_choice |\n", - "|arc_de |lm_eval/tasks/okapi/arc_multilingual/arc_de.yaml |multiple_choice |\n", - "|arc_easy |lm_eval/tasks/arc/arc_easy.yaml |multiple_choice |\n", - "|arc_es |lm_eval/tasks/okapi/arc_multilingual/arc_es.yaml |multiple_choice |\n", - "|arc_eu |lm_eval/tasks/okapi/arc_multilingual/arc_eu.yaml |multiple_choice |\n", - "|arc_fr |lm_eval/tasks/okapi/arc_multilingual/arc_fr.yaml |multiple_choice |\n", - "|arc_gu |lm_eval/tasks/okapi/arc_multilingual/arc_gu.yaml |multiple_choice |\n", - "|arc_hi |lm_eval/tasks/okapi/arc_multilingual/arc_hi.yaml |multiple_choice |\n", - "|arc_hr |lm_eval/tasks/okapi/arc_multilingual/arc_hr.yaml |multiple_choice |\n", - "|arc_hu |lm_eval/tasks/okapi/arc_multilingual/arc_hu.yaml |multiple_choice |\n", - "|arc_hy |lm_eval/tasks/okapi/arc_multilingual/arc_hy.yaml |multiple_choice |\n", - "|arc_id |lm_eval/tasks/okapi/arc_multilingual/arc_id.yaml |multiple_choice |\n", - "|arc_it |lm_eval/tasks/okapi/arc_multilingual/arc_it.yaml |multiple_choice |\n", - "|arc_kn |lm_eval/tasks/okapi/arc_multilingual/arc_kn.yaml |multiple_choice |\n", - "|arc_ml |lm_eval/tasks/okapi/arc_multilingual/arc_ml.yaml |multiple_choice |\n", - "|arc_mr |lm_eval/tasks/okapi/arc_multilingual/arc_mr.yaml |multiple_choice |\n", - "|arc_ne |lm_eval/tasks/okapi/arc_multilingual/arc_ne.yaml |multiple_choice |\n", - "|arc_nl |lm_eval/tasks/okapi/arc_multilingual/arc_nl.yaml |multiple_choice |\n", - "|arc_pt |lm_eval/tasks/okapi/arc_multilingual/arc_pt.yaml |multiple_choice |\n", - "|arc_ro |lm_eval/tasks/okapi/arc_multilingual/arc_ro.yaml |multiple_choice |\n", - "|arc_ru |lm_eval/tasks/okapi/arc_multilingual/arc_ru.yaml |multiple_choice |\n", - "|arc_sk |lm_eval/tasks/okapi/arc_multilingual/arc_sk.yaml |multiple_choice |\n", - "|arc_sr |lm_eval/tasks/okapi/arc_multilingual/arc_sr.yaml |multiple_choice |\n", - "|arc_sv |lm_eval/tasks/okapi/arc_multilingual/arc_sv.yaml |multiple_choice |\n", - "|arc_ta |lm_eval/tasks/okapi/arc_multilingual/arc_ta.yaml |multiple_choice |\n", - "|arc_te |lm_eval/tasks/okapi/arc_multilingual/arc_te.yaml |multiple_choice |\n", - "|arc_uk |lm_eval/tasks/okapi/arc_multilingual/arc_uk.yaml |multiple_choice |\n", - "|arc_vi |lm_eval/tasks/okapi/arc_multilingual/arc_vi.yaml |multiple_choice |\n", - "|arc_zh |lm_eval/tasks/okapi/arc_multilingual/arc_zh.yaml |multiple_choice |\n", - "|argument_topic |lm_eval/tasks/unitxt/argument_topic.yaml | |\n", - "|arithmetic_1dc |lm_eval/tasks/arithmetic/arithmetic_1dc.yaml |loglikelihood |\n", - "|arithmetic_2da |lm_eval/tasks/arithmetic/arithmetic_2da.yaml |loglikelihood |\n", - "|arithmetic_2dm |lm_eval/tasks/arithmetic/arithmetic_2dm.yaml |loglikelihood |\n", - "|arithmetic_2ds |lm_eval/tasks/arithmetic/arithmetic_2ds.yaml |loglikelihood |\n", - "|arithmetic_3da |lm_eval/tasks/arithmetic/arithmetic_3da.yaml |loglikelihood |\n", - "|arithmetic_3ds |lm_eval/tasks/arithmetic/arithmetic_3ds.yaml |loglikelihood |\n", - "|arithmetic_4da |lm_eval/tasks/arithmetic/arithmetic_4da.yaml |loglikelihood |\n", - "|arithmetic_4ds |lm_eval/tasks/arithmetic/arithmetic_4ds.yaml |loglikelihood |\n", - "|arithmetic_5da |lm_eval/tasks/arithmetic/arithmetic_5da.yaml |loglikelihood |\n", - "|arithmetic_5ds |lm_eval/tasks/arithmetic/arithmetic_5ds.yaml |loglikelihood |\n", - "|asdiv |lm_eval/tasks/asdiv/default.yaml |loglikelihood |\n", - "|asdiv_cot_llama |lm_eval/tasks/asdiv/asdiv-cot-llama.yaml |generate_until |\n", - "|assin_entailment |lm_eval/tasks/portuguese_bench/assin_entailment.yaml |multiple_choice |\n", - "|assin_paraphrase |lm_eval/tasks/portuguese_bench/assin_paraphrase.yaml |multiple_choice |\n", - "|atis |lm_eval/tasks/unitxt/atis.yaml | |\n", - "|babi |lm_eval/tasks/babi/babi.yaml |generate_until |\n", - "|banking77 |lm_eval/tasks/unitxt/banking77.yaml | |\n", - "|bbh_cot_fewshot_boolean_expressions |lm_eval/tasks/bbh/cot_fewshot/boolean_expressions.yaml |generate_until |\n", - "|bbh_cot_fewshot_causal_judgement |lm_eval/tasks/bbh/cot_fewshot/causal_judgement.yaml |generate_until |\n", - "|bbh_cot_fewshot_date_understanding |lm_eval/tasks/bbh/cot_fewshot/date_understanding.yaml |generate_until |\n", - "|bbh_cot_fewshot_disambiguation_qa |lm_eval/tasks/bbh/cot_fewshot/disambiguation_qa.yaml |generate_until |\n", - "|bbh_cot_fewshot_dyck_languages |lm_eval/tasks/bbh/cot_fewshot/dyck_languages.yaml |generate_until |\n", - "|bbh_cot_fewshot_formal_fallacies |lm_eval/tasks/bbh/cot_fewshot/formal_fallacies.yaml |generate_until |\n", - "|bbh_cot_fewshot_geometric_shapes |lm_eval/tasks/bbh/cot_fewshot/geometric_shapes.yaml |generate_until |\n", - "|bbh_cot_fewshot_hyperbaton |lm_eval/tasks/bbh/cot_fewshot/hyperbaton.yaml |generate_until |\n", - "|bbh_cot_fewshot_logical_deduction_five_objects |lm_eval/tasks/bbh/cot_fewshot/logical_deduction_five_objects.yaml |generate_until |\n", - "|bbh_cot_fewshot_logical_deduction_seven_objects |lm_eval/tasks/bbh/cot_fewshot/logical_deduction_seven_objects.yaml |generate_until |\n", - "|bbh_cot_fewshot_logical_deduction_three_objects |lm_eval/tasks/bbh/cot_fewshot/logical_deduction_three_objects.yaml |generate_until |\n", - "|bbh_cot_fewshot_movie_recommendation |lm_eval/tasks/bbh/cot_fewshot/movie_recommendation.yaml |generate_until |\n", - "|bbh_cot_fewshot_multistep_arithmetic_two |lm_eval/tasks/bbh/cot_fewshot/multistep_arithmetic_two.yaml |generate_until |\n", - "|bbh_cot_fewshot_navigate |lm_eval/tasks/bbh/cot_fewshot/navigate.yaml |generate_until |\n", - "|bbh_cot_fewshot_object_counting |lm_eval/tasks/bbh/cot_fewshot/object_counting.yaml |generate_until |\n", - "|bbh_cot_fewshot_penguins_in_a_table |lm_eval/tasks/bbh/cot_fewshot/penguins_in_a_table.yaml |generate_until |\n", - "|bbh_cot_fewshot_reasoning_about_colored_objects |lm_eval/tasks/bbh/cot_fewshot/reasoning_about_colored_objects.yaml |generate_until |\n", - "|bbh_cot_fewshot_ruin_names |lm_eval/tasks/bbh/cot_fewshot/ruin_names.yaml |generate_until |\n", - "|bbh_cot_fewshot_salient_translation_error_detection |lm_eval/tasks/bbh/cot_fewshot/salient_translation_error_detection.yaml |generate_until |\n", - "|bbh_cot_fewshot_snarks |lm_eval/tasks/bbh/cot_fewshot/snarks.yaml |generate_until |\n", - "|bbh_cot_fewshot_sports_understanding |lm_eval/tasks/bbh/cot_fewshot/sports_understanding.yaml |generate_until |\n", - "|bbh_cot_fewshot_temporal_sequences |lm_eval/tasks/bbh/cot_fewshot/temporal_sequences.yaml |generate_until |\n", - "|bbh_cot_fewshot_tracking_shuffled_objects_five_objects |lm_eval/tasks/bbh/cot_fewshot/tracking_shuffled_objects_five_objects.yaml |generate_until |\n", - "|bbh_cot_fewshot_tracking_shuffled_objects_seven_objects |lm_eval/tasks/bbh/cot_fewshot/tracking_shuffled_objects_seven_objects.yaml |generate_until |\n", - "|bbh_cot_fewshot_tracking_shuffled_objects_three_objects |lm_eval/tasks/bbh/cot_fewshot/tracking_shuffled_objects_three_objects.yaml |generate_until |\n", - "|bbh_cot_fewshot_web_of_lies |lm_eval/tasks/bbh/cot_fewshot/web_of_lies.yaml |generate_until |\n", - "|bbh_cot_fewshot_word_sorting |lm_eval/tasks/bbh/cot_fewshot/word_sorting.yaml |generate_until |\n", - "|bbh_cot_zeroshot_boolean_expressions |lm_eval/tasks/bbh/cot_zeroshot/boolean_expressions.yaml |generate_until |\n", - "|bbh_cot_zeroshot_causal_judgement |lm_eval/tasks/bbh/cot_zeroshot/causal_judgement.yaml |generate_until |\n", - "|bbh_cot_zeroshot_date_understanding |lm_eval/tasks/bbh/cot_zeroshot/date_understanding.yaml |generate_until |\n", - "|bbh_cot_zeroshot_disambiguation_qa |lm_eval/tasks/bbh/cot_zeroshot/disambiguation_qa.yaml |generate_until |\n", - "|bbh_cot_zeroshot_dyck_languages |lm_eval/tasks/bbh/cot_zeroshot/dyck_languages.yaml |generate_until |\n", - "|bbh_cot_zeroshot_formal_fallacies |lm_eval/tasks/bbh/cot_zeroshot/formal_fallacies.yaml |generate_until |\n", - "|bbh_cot_zeroshot_geometric_shapes |lm_eval/tasks/bbh/cot_zeroshot/geometric_shapes.yaml |generate_until |\n", - "|bbh_cot_zeroshot_hyperbaton |lm_eval/tasks/bbh/cot_zeroshot/hyperbaton.yaml |generate_until |\n", - "|bbh_cot_zeroshot_logical_deduction_five_objects |lm_eval/tasks/bbh/cot_zeroshot/logical_deduction_five_objects.yaml |generate_until |\n", - "|bbh_cot_zeroshot_logical_deduction_seven_objects |lm_eval/tasks/bbh/cot_zeroshot/logical_deduction_seven_objects.yaml |generate_until |\n", - "|bbh_cot_zeroshot_logical_deduction_three_objects |lm_eval/tasks/bbh/cot_zeroshot/logical_deduction_three_objects.yaml |generate_until |\n", - "|bbh_cot_zeroshot_movie_recommendation |lm_eval/tasks/bbh/cot_zeroshot/movie_recommendation.yaml |generate_until |\n", - "|bbh_cot_zeroshot_multistep_arithmetic_two |lm_eval/tasks/bbh/cot_zeroshot/multistep_arithmetic_two.yaml |generate_until |\n", - "|bbh_cot_zeroshot_navigate |lm_eval/tasks/bbh/cot_zeroshot/navigate.yaml |generate_until |\n", - "|bbh_cot_zeroshot_object_counting |lm_eval/tasks/bbh/cot_zeroshot/object_counting.yaml |generate_until |\n", - "|bbh_cot_zeroshot_penguins_in_a_table |lm_eval/tasks/bbh/cot_zeroshot/penguins_in_a_table.yaml |generate_until |\n", - "|bbh_cot_zeroshot_reasoning_about_colored_objects |lm_eval/tasks/bbh/cot_zeroshot/reasoning_about_colored_objects.yaml |generate_until |\n", - "|bbh_cot_zeroshot_ruin_names |lm_eval/tasks/bbh/cot_zeroshot/ruin_names.yaml |generate_until |\n", - "|bbh_cot_zeroshot_salient_translation_error_detection |lm_eval/tasks/bbh/cot_zeroshot/salient_translation_error_detection.yaml |generate_until |\n", - "|bbh_cot_zeroshot_snarks |lm_eval/tasks/bbh/cot_zeroshot/snarks.yaml |generate_until |\n", - "|bbh_cot_zeroshot_sports_understanding |lm_eval/tasks/bbh/cot_zeroshot/sports_understanding.yaml |generate_until |\n", - "|bbh_cot_zeroshot_temporal_sequences |lm_eval/tasks/bbh/cot_zeroshot/temporal_sequences.yaml |generate_until |\n", - "|bbh_cot_zeroshot_tracking_shuffled_objects_five_objects |lm_eval/tasks/bbh/cot_zeroshot/tracking_shuffled_objects_five_objects.yaml |generate_until |\n", - "|bbh_cot_zeroshot_tracking_shuffled_objects_seven_objects |lm_eval/tasks/bbh/cot_zeroshot/tracking_shuffled_objects_seven_objects.yaml |generate_until |\n", - "|bbh_cot_zeroshot_tracking_shuffled_objects_three_objects |lm_eval/tasks/bbh/cot_zeroshot/tracking_shuffled_objects_three_objects.yaml |generate_until |\n", - "|bbh_cot_zeroshot_web_of_lies |lm_eval/tasks/bbh/cot_zeroshot/web_of_lies.yaml |generate_until |\n", - "|bbh_cot_zeroshot_word_sorting |lm_eval/tasks/bbh/cot_zeroshot/word_sorting.yaml |generate_until |\n", - "|bbh_fewshot_boolean_expressions |lm_eval/tasks/bbh/fewshot/boolean_expressions.yaml |generate_until |\n", - "|bbh_fewshot_causal_judgement |lm_eval/tasks/bbh/fewshot/causal_judgement.yaml |generate_until |\n", - "|bbh_fewshot_date_understanding |lm_eval/tasks/bbh/fewshot/date_understanding.yaml |generate_until |\n", - "|bbh_fewshot_disambiguation_qa |lm_eval/tasks/bbh/fewshot/disambiguation_qa.yaml |generate_until |\n", - "|bbh_fewshot_dyck_languages |lm_eval/tasks/bbh/fewshot/dyck_languages.yaml |generate_until |\n", - "|bbh_fewshot_formal_fallacies |lm_eval/tasks/bbh/fewshot/formal_fallacies.yaml |generate_until |\n", - "|bbh_fewshot_geometric_shapes |lm_eval/tasks/bbh/fewshot/geometric_shapes.yaml |generate_until |\n", - "|bbh_fewshot_hyperbaton |lm_eval/tasks/bbh/fewshot/hyperbaton.yaml |generate_until |\n", - "|bbh_fewshot_logical_deduction_five_objects |lm_eval/tasks/bbh/fewshot/logical_deduction_five_objects.yaml |generate_until |\n", - "|bbh_fewshot_logical_deduction_seven_objects |lm_eval/tasks/bbh/fewshot/logical_deduction_seven_objects.yaml |generate_until |\n", - "|bbh_fewshot_logical_deduction_three_objects |lm_eval/tasks/bbh/fewshot/logical_deduction_three_objects.yaml |generate_until |\n", - "|bbh_fewshot_movie_recommendation |lm_eval/tasks/bbh/fewshot/movie_recommendation.yaml |generate_until |\n", - "|bbh_fewshot_multistep_arithmetic_two |lm_eval/tasks/bbh/fewshot/multistep_arithmetic_two.yaml |generate_until |\n", - "|bbh_fewshot_navigate |lm_eval/tasks/bbh/fewshot/navigate.yaml |generate_until |\n", - "|bbh_fewshot_object_counting |lm_eval/tasks/bbh/fewshot/object_counting.yaml |generate_until |\n", - "|bbh_fewshot_penguins_in_a_table |lm_eval/tasks/bbh/fewshot/penguins_in_a_table.yaml |generate_until |\n", - "|bbh_fewshot_reasoning_about_colored_objects |lm_eval/tasks/bbh/fewshot/reasoning_about_colored_objects.yaml |generate_until |\n", - "|bbh_fewshot_ruin_names |lm_eval/tasks/bbh/fewshot/ruin_names.yaml |generate_until |\n", - "|bbh_fewshot_salient_translation_error_detection |lm_eval/tasks/bbh/fewshot/salient_translation_error_detection.yaml |generate_until |\n", - "|bbh_fewshot_snarks |lm_eval/tasks/bbh/fewshot/snarks.yaml |generate_until |\n", - "|bbh_fewshot_sports_understanding |lm_eval/tasks/bbh/fewshot/sports_understanding.yaml |generate_until |\n", - "|bbh_fewshot_temporal_sequences |lm_eval/tasks/bbh/fewshot/temporal_sequences.yaml |generate_until |\n", - "|bbh_fewshot_tracking_shuffled_objects_five_objects |lm_eval/tasks/bbh/fewshot/tracking_shuffled_objects_five_objects.yaml |generate_until |\n", - "|bbh_fewshot_tracking_shuffled_objects_seven_objects |lm_eval/tasks/bbh/fewshot/tracking_shuffled_objects_seven_objects.yaml |generate_until |\n", - "|bbh_fewshot_tracking_shuffled_objects_three_objects |lm_eval/tasks/bbh/fewshot/tracking_shuffled_objects_three_objects.yaml |generate_until |\n", - "|bbh_fewshot_web_of_lies |lm_eval/tasks/bbh/fewshot/web_of_lies.yaml |generate_until |\n", - "|bbh_fewshot_word_sorting |lm_eval/tasks/bbh/fewshot/word_sorting.yaml |generate_until |\n", - "|bbh_zeroshot_boolean_expressions |lm_eval/tasks/bbh/zeroshot/boolean_expressions.yaml |generate_until |\n", - "|bbh_zeroshot_causal_judgement |lm_eval/tasks/bbh/zeroshot/causal_judgement.yaml |generate_until |\n", - "|bbh_zeroshot_date_understanding |lm_eval/tasks/bbh/zeroshot/date_understanding.yaml |generate_until |\n", - "|bbh_zeroshot_disambiguation_qa |lm_eval/tasks/bbh/zeroshot/disambiguation_qa.yaml |generate_until |\n", - "|bbh_zeroshot_dyck_languages |lm_eval/tasks/bbh/zeroshot/dyck_languages.yaml |generate_until |\n", - "|bbh_zeroshot_formal_fallacies |lm_eval/tasks/bbh/zeroshot/formal_fallacies.yaml |generate_until |\n", - "|bbh_zeroshot_geometric_shapes |lm_eval/tasks/bbh/zeroshot/geometric_shapes.yaml |generate_until |\n", - "|bbh_zeroshot_hyperbaton |lm_eval/tasks/bbh/zeroshot/hyperbaton.yaml |generate_until |\n", - "|bbh_zeroshot_logical_deduction_five_objects |lm_eval/tasks/bbh/zeroshot/logical_deduction_five_objects.yaml |generate_until |\n", - "|bbh_zeroshot_logical_deduction_seven_objects |lm_eval/tasks/bbh/zeroshot/logical_deduction_seven_objects.yaml |generate_until |\n", - "|bbh_zeroshot_logical_deduction_three_objects |lm_eval/tasks/bbh/zeroshot/logical_deduction_three_objects.yaml |generate_until |\n", - "|bbh_zeroshot_movie_recommendation |lm_eval/tasks/bbh/zeroshot/movie_recommendation.yaml |generate_until |\n", - "|bbh_zeroshot_multistep_arithmetic_two |lm_eval/tasks/bbh/zeroshot/multistep_arithmetic_two.yaml |generate_until |\n", - "|bbh_zeroshot_navigate |lm_eval/tasks/bbh/zeroshot/navigate.yaml |generate_until |\n", - "|bbh_zeroshot_object_counting |lm_eval/tasks/bbh/zeroshot/object_counting.yaml |generate_until |\n", - "|bbh_zeroshot_penguins_in_a_table |lm_eval/tasks/bbh/zeroshot/penguins_in_a_table.yaml |generate_until |\n", - "|bbh_zeroshot_reasoning_about_colored_objects |lm_eval/tasks/bbh/zeroshot/reasoning_about_colored_objects.yaml |generate_until |\n", - "|bbh_zeroshot_ruin_names |lm_eval/tasks/bbh/zeroshot/ruin_names.yaml |generate_until |\n", - "|bbh_zeroshot_salient_translation_error_detection |lm_eval/tasks/bbh/zeroshot/salient_translation_error_detection.yaml |generate_until |\n", - "|bbh_zeroshot_snarks |lm_eval/tasks/bbh/zeroshot/snarks.yaml |generate_until |\n", - "|bbh_zeroshot_sports_understanding |lm_eval/tasks/bbh/zeroshot/sports_understanding.yaml |generate_until |\n", - "|bbh_zeroshot_temporal_sequences |lm_eval/tasks/bbh/zeroshot/temporal_sequences.yaml |generate_until |\n", - "|bbh_zeroshot_tracking_shuffled_objects_five_objects |lm_eval/tasks/bbh/zeroshot/tracking_shuffled_objects_five_objects.yaml |generate_until |\n", - "|bbh_zeroshot_tracking_shuffled_objects_seven_objects |lm_eval/tasks/bbh/zeroshot/tracking_shuffled_objects_seven_objects.yaml |generate_until |\n", - "|bbh_zeroshot_tracking_shuffled_objects_three_objects |lm_eval/tasks/bbh/zeroshot/tracking_shuffled_objects_three_objects.yaml |generate_until |\n", - "|bbh_zeroshot_web_of_lies |lm_eval/tasks/bbh/zeroshot/web_of_lies.yaml |generate_until |\n", - "|bbh_zeroshot_word_sorting |lm_eval/tasks/bbh/zeroshot/word_sorting.yaml |generate_until |\n", - "|bec2016eu |lm_eval/tasks/basqueglue/bec.yaml |multiple_choice |\n", - "|belebele_acm_Arab |lm_eval/tasks/belebele/belebele_acm_Arab.yaml |multiple_choice |\n", - "|belebele_afr_Latn |lm_eval/tasks/belebele/belebele_afr_Latn.yaml |multiple_choice |\n", - "|belebele_als_Latn |lm_eval/tasks/belebele/belebele_als_Latn.yaml |multiple_choice |\n", - "|belebele_amh_Ethi |lm_eval/tasks/belebele/belebele_amh_Ethi.yaml |multiple_choice |\n", - "|belebele_apc_Arab |lm_eval/tasks/belebele/belebele_apc_Arab.yaml |multiple_choice |\n", - "|belebele_arb_Arab |lm_eval/tasks/belebele/belebele_arb_Arab.yaml |multiple_choice |\n", - "|belebele_arb_Latn |lm_eval/tasks/belebele/belebele_arb_Latn.yaml |multiple_choice |\n", - "|belebele_ars_Arab |lm_eval/tasks/belebele/belebele_ars_Arab.yaml |multiple_choice |\n", - "|belebele_ary_Arab |lm_eval/tasks/belebele/belebele_ary_Arab.yaml |multiple_choice |\n", - "|belebele_arz_Arab |lm_eval/tasks/belebele/belebele_arz_Arab.yaml |multiple_choice |\n", - "|belebele_asm_Beng |lm_eval/tasks/belebele/belebele_asm_Beng.yaml |multiple_choice |\n", - "|belebele_azj_Latn |lm_eval/tasks/belebele/belebele_azj_Latn.yaml |multiple_choice |\n", - "|belebele_bam_Latn |lm_eval/tasks/belebele/belebele_bam_Latn.yaml |multiple_choice |\n", - "|belebele_ben_Beng |lm_eval/tasks/belebele/belebele_ben_Beng.yaml |multiple_choice |\n", - "|belebele_ben_Latn |lm_eval/tasks/belebele/belebele_ben_Latn.yaml |multiple_choice |\n", - "|belebele_bod_Tibt |lm_eval/tasks/belebele/belebele_bod_Tibt.yaml |multiple_choice |\n", - "|belebele_bul_Cyrl |lm_eval/tasks/belebele/belebele_bul_Cyrl.yaml |multiple_choice |\n", - "|belebele_cat_Latn |lm_eval/tasks/belebele/belebele_cat_Latn.yaml |multiple_choice |\n", - "|belebele_ceb_Latn |lm_eval/tasks/belebele/belebele_ceb_Latn.yaml |multiple_choice |\n", - "|belebele_ces_Latn |lm_eval/tasks/belebele/belebele_ces_Latn.yaml |multiple_choice |\n", - "|belebele_ckb_Arab |lm_eval/tasks/belebele/belebele_ckb_Arab.yaml |multiple_choice |\n", - "|belebele_dan_Latn |lm_eval/tasks/belebele/belebele_dan_Latn.yaml |multiple_choice |\n", - "|belebele_deu_Latn |lm_eval/tasks/belebele/belebele_deu_Latn.yaml |multiple_choice |\n", - "|belebele_ell_Grek |lm_eval/tasks/belebele/belebele_ell_Grek.yaml |multiple_choice |\n", - "|belebele_eng_Latn |lm_eval/tasks/belebele/belebele_eng_Latn.yaml |multiple_choice |\n", - "|belebele_est_Latn |lm_eval/tasks/belebele/belebele_est_Latn.yaml |multiple_choice |\n", - "|belebele_eus_Latn |lm_eval/tasks/belebele/belebele_eus_Latn.yaml |multiple_choice |\n", - "|belebele_fin_Latn |lm_eval/tasks/belebele/belebele_fin_Latn.yaml |multiple_choice |\n", - "|belebele_fra_Latn |lm_eval/tasks/belebele/belebele_fra_Latn.yaml |multiple_choice |\n", - "|belebele_fuv_Latn |lm_eval/tasks/belebele/belebele_fuv_Latn.yaml |multiple_choice |\n", - "|belebele_gaz_Latn |lm_eval/tasks/belebele/belebele_gaz_Latn.yaml |multiple_choice |\n", - "|belebele_glg_Latn |lm_eval/tasks/galician_bench/belebele_glg_Latn.yaml |multiple_choice |\n", - "|belebele_grn_Latn |lm_eval/tasks/belebele/belebele_grn_Latn.yaml |multiple_choice |\n", - "|belebele_guj_Gujr |lm_eval/tasks/belebele/belebele_guj_Gujr.yaml |multiple_choice |\n", - "|belebele_hat_Latn |lm_eval/tasks/belebele/belebele_hat_Latn.yaml |multiple_choice |\n", - "|belebele_hau_Latn |lm_eval/tasks/belebele/belebele_hau_Latn.yaml |multiple_choice |\n", - "|belebele_heb_Hebr |lm_eval/tasks/belebele/belebele_heb_Hebr.yaml |multiple_choice |\n", - "|belebele_hin_Deva |lm_eval/tasks/belebele/belebele_hin_Deva.yaml |multiple_choice |\n", - "|belebele_hin_Latn |lm_eval/tasks/belebele/belebele_hin_Latn.yaml |multiple_choice |\n", - "|belebele_hrv_Latn |lm_eval/tasks/belebele/belebele_hrv_Latn.yaml |multiple_choice |\n", - "|belebele_hun_Latn |lm_eval/tasks/belebele/belebele_hun_Latn.yaml |multiple_choice |\n", - "|belebele_hye_Armn |lm_eval/tasks/belebele/belebele_hye_Armn.yaml |multiple_choice |\n", - "|belebele_ibo_Latn |lm_eval/tasks/belebele/belebele_ibo_Latn.yaml |multiple_choice |\n", - "|belebele_ilo_Latn |lm_eval/tasks/belebele/belebele_ilo_Latn.yaml |multiple_choice |\n", - "|belebele_ind_Latn |lm_eval/tasks/belebele/belebele_ind_Latn.yaml |multiple_choice |\n", - "|belebele_isl_Latn |lm_eval/tasks/belebele/belebele_isl_Latn.yaml |multiple_choice |\n", - "|belebele_ita_Latn |lm_eval/tasks/belebele/belebele_ita_Latn.yaml |multiple_choice |\n", - "|belebele_jav_Latn |lm_eval/tasks/belebele/belebele_jav_Latn.yaml |multiple_choice |\n", - "|belebele_jpn_Jpan |lm_eval/tasks/belebele/belebele_jpn_Jpan.yaml |multiple_choice |\n", - "|belebele_kac_Latn |lm_eval/tasks/belebele/belebele_kac_Latn.yaml |multiple_choice |\n", - "|belebele_kan_Knda |lm_eval/tasks/belebele/belebele_kan_Knda.yaml |multiple_choice |\n", - "|belebele_kat_Geor |lm_eval/tasks/belebele/belebele_kat_Geor.yaml |multiple_choice |\n", - "|belebele_kaz_Cyrl |lm_eval/tasks/belebele/belebele_kaz_Cyrl.yaml |multiple_choice |\n", - "|belebele_kea_Latn |lm_eval/tasks/belebele/belebele_kea_Latn.yaml |multiple_choice |\n", - "|belebele_khk_Cyrl |lm_eval/tasks/belebele/belebele_khk_Cyrl.yaml |multiple_choice |\n", - "|belebele_khm_Khmr |lm_eval/tasks/belebele/belebele_khm_Khmr.yaml |multiple_choice |\n", - "|belebele_kin_Latn |lm_eval/tasks/belebele/belebele_kin_Latn.yaml |multiple_choice |\n", - "|belebele_kir_Cyrl |lm_eval/tasks/belebele/belebele_kir_Cyrl.yaml |multiple_choice |\n", - "|belebele_kor_Hang |lm_eval/tasks/belebele/belebele_kor_Hang.yaml |multiple_choice |\n", - "|belebele_lao_Laoo |lm_eval/tasks/belebele/belebele_lao_Laoo.yaml |multiple_choice |\n", - "|belebele_lin_Latn |lm_eval/tasks/belebele/belebele_lin_Latn.yaml |multiple_choice |\n", - "|belebele_lit_Latn |lm_eval/tasks/belebele/belebele_lit_Latn.yaml |multiple_choice |\n", - "|belebele_lug_Latn |lm_eval/tasks/belebele/belebele_lug_Latn.yaml |multiple_choice |\n", - "|belebele_luo_Latn |lm_eval/tasks/belebele/belebele_luo_Latn.yaml |multiple_choice |\n", - "|belebele_lvs_Latn |lm_eval/tasks/belebele/belebele_lvs_Latn.yaml |multiple_choice |\n", - "|belebele_mal_Mlym |lm_eval/tasks/belebele/belebele_mal_Mlym.yaml |multiple_choice |\n", - "|belebele_mar_Deva |lm_eval/tasks/belebele/belebele_mar_Deva.yaml |multiple_choice |\n", - "|belebele_mkd_Cyrl |lm_eval/tasks/belebele/belebele_mkd_Cyrl.yaml |multiple_choice |\n", - "|belebele_mlt_Latn |lm_eval/tasks/belebele/belebele_mlt_Latn.yaml |multiple_choice |\n", - "|belebele_mri_Latn |lm_eval/tasks/belebele/belebele_mri_Latn.yaml |multiple_choice |\n", - "|belebele_mya_Mymr |lm_eval/tasks/belebele/belebele_mya_Mymr.yaml |multiple_choice |\n", - "|belebele_nld_Latn |lm_eval/tasks/belebele/belebele_nld_Latn.yaml |multiple_choice |\n", - "|belebele_nob_Latn |lm_eval/tasks/belebele/belebele_nob_Latn.yaml |multiple_choice |\n", - "|belebele_npi_Deva |lm_eval/tasks/belebele/belebele_npi_Deva.yaml |multiple_choice |\n", - "|belebele_npi_Latn |lm_eval/tasks/belebele/belebele_npi_Latn.yaml |multiple_choice |\n", - "|belebele_nso_Latn |lm_eval/tasks/belebele/belebele_nso_Latn.yaml |multiple_choice |\n", - "|belebele_nya_Latn |lm_eval/tasks/belebele/belebele_nya_Latn.yaml |multiple_choice |\n", - "|belebele_ory_Orya |lm_eval/tasks/belebele/belebele_ory_Orya.yaml |multiple_choice |\n", - "|belebele_pan_Guru |lm_eval/tasks/belebele/belebele_pan_Guru.yaml |multiple_choice |\n", - "|belebele_pbt_Arab |lm_eval/tasks/belebele/belebele_pbt_Arab.yaml |multiple_choice |\n", - "|belebele_pes_Arab |lm_eval/tasks/belebele/belebele_pes_Arab.yaml |multiple_choice |\n", - "|belebele_plt_Latn |lm_eval/tasks/belebele/belebele_plt_Latn.yaml |multiple_choice |\n", - "|belebele_pol_Latn |lm_eval/tasks/belebele/belebele_pol_Latn.yaml |multiple_choice |\n", - "|belebele_por_Latn |lm_eval/tasks/belebele/belebele_por_Latn.yaml |multiple_choice |\n", - "|belebele_ron_Latn |lm_eval/tasks/belebele/belebele_ron_Latn.yaml |multiple_choice |\n", - "|belebele_rus_Cyrl |lm_eval/tasks/belebele/belebele_rus_Cyrl.yaml |multiple_choice |\n", - "|belebele_shn_Mymr |lm_eval/tasks/belebele/belebele_shn_Mymr.yaml |multiple_choice |\n", - "|belebele_sin_Latn |lm_eval/tasks/belebele/belebele_sin_Latn.yaml |multiple_choice |\n", - "|belebele_sin_Sinh |lm_eval/tasks/belebele/belebele_sin_Sinh.yaml |multiple_choice |\n", - "|belebele_slk_Latn |lm_eval/tasks/belebele/belebele_slk_Latn.yaml |multiple_choice |\n", - "|belebele_slv_Latn |lm_eval/tasks/belebele/belebele_slv_Latn.yaml |multiple_choice |\n", - "|belebele_sna_Latn |lm_eval/tasks/belebele/belebele_sna_Latn.yaml |multiple_choice |\n", - "|belebele_snd_Arab |lm_eval/tasks/belebele/belebele_snd_Arab.yaml |multiple_choice |\n", - "|belebele_som_Latn |lm_eval/tasks/belebele/belebele_som_Latn.yaml |multiple_choice |\n", - "|belebele_sot_Latn |lm_eval/tasks/belebele/belebele_sot_Latn.yaml |multiple_choice |\n", - "|belebele_spa_Latn |lm_eval/tasks/belebele/belebele_spa_Latn.yaml |multiple_choice |\n", - "|belebele_srp_Cyrl |lm_eval/tasks/belebele/belebele_srp_Cyrl.yaml |multiple_choice |\n", - "|belebele_ssw_Latn |lm_eval/tasks/belebele/belebele_ssw_Latn.yaml |multiple_choice |\n", - "|belebele_sun_Latn |lm_eval/tasks/belebele/belebele_sun_Latn.yaml |multiple_choice |\n", - "|belebele_swe_Latn |lm_eval/tasks/belebele/belebele_swe_Latn.yaml |multiple_choice |\n", - "|belebele_swh_Latn |lm_eval/tasks/belebele/belebele_swh_Latn.yaml |multiple_choice |\n", - "|belebele_tam_Taml |lm_eval/tasks/belebele/belebele_tam_Taml.yaml |multiple_choice |\n", - "|belebele_tel_Telu |lm_eval/tasks/belebele/belebele_tel_Telu.yaml |multiple_choice |\n", - "|belebele_tgk_Cyrl |lm_eval/tasks/belebele/belebele_tgk_Cyrl.yaml |multiple_choice |\n", - "|belebele_tgl_Latn |lm_eval/tasks/belebele/belebele_tgl_Latn.yaml |multiple_choice |\n", - "|belebele_tha_Thai |lm_eval/tasks/belebele/belebele_tha_Thai.yaml |multiple_choice |\n", - "|belebele_tir_Ethi |lm_eval/tasks/belebele/belebele_tir_Ethi.yaml |multiple_choice |\n", - "|belebele_tsn_Latn |lm_eval/tasks/belebele/belebele_tsn_Latn.yaml |multiple_choice |\n", - "|belebele_tso_Latn |lm_eval/tasks/belebele/belebele_tso_Latn.yaml |multiple_choice |\n", - "|belebele_tur_Latn |lm_eval/tasks/belebele/belebele_tur_Latn.yaml |multiple_choice |\n", - "|belebele_ukr_Cyrl |lm_eval/tasks/belebele/belebele_ukr_Cyrl.yaml |multiple_choice |\n", - "|belebele_urd_Arab |lm_eval/tasks/belebele/belebele_urd_Arab.yaml |multiple_choice |\n", - "|belebele_urd_Latn |lm_eval/tasks/belebele/belebele_urd_Latn.yaml |multiple_choice |\n", - "|belebele_uzn_Latn |lm_eval/tasks/belebele/belebele_uzn_Latn.yaml |multiple_choice |\n", - "|belebele_vie_Latn |lm_eval/tasks/belebele/belebele_vie_Latn.yaml |multiple_choice |\n", - "|belebele_war_Latn |lm_eval/tasks/belebele/belebele_war_Latn.yaml |multiple_choice |\n", - "|belebele_wol_Latn |lm_eval/tasks/belebele/belebele_wol_Latn.yaml |multiple_choice |\n", - "|belebele_xho_Latn |lm_eval/tasks/belebele/belebele_xho_Latn.yaml |multiple_choice |\n", - "|belebele_yor_Latn |lm_eval/tasks/belebele/belebele_yor_Latn.yaml |multiple_choice |\n", - "|belebele_zho_Hans |lm_eval/tasks/belebele/belebele_zho_Hans.yaml |multiple_choice |\n", - "|belebele_zho_Hant |lm_eval/tasks/belebele/belebele_zho_Hant.yaml |multiple_choice |\n", - "|belebele_zsm_Latn |lm_eval/tasks/belebele/belebele_zsm_Latn.yaml |multiple_choice |\n", - "|belebele_zul_Latn |lm_eval/tasks/belebele/belebele_zul_Latn.yaml |multiple_choice |\n", - "|bertaqa_en |lm_eval/tasks/bertaqa/bertaqa_en.yaml |multiple_choice |\n", - "|bertaqa_en_mt_gemma-7b |lm_eval/tasks/bertaqa/bertaqa_en_mt_gemma-7b.yaml |multiple_choice |\n", - "|bertaqa_en_mt_hitz |lm_eval/tasks/bertaqa/bertaqa_en_mt_hitz.yaml |multiple_choice |\n", - "|bertaqa_en_mt_itzuli |lm_eval/tasks/bertaqa/bertaqa_en_mt_itzuli.yaml |multiple_choice |\n", - "|bertaqa_en_mt_latxa-13b-v1 |lm_eval/tasks/bertaqa/bertaqa_en_mt_latxa-13b-v1.yaml |multiple_choice |\n", - "|bertaqa_en_mt_latxa-13b-v1.1 |lm_eval/tasks/bertaqa/bertaqa_en_mt_latxa-13b-v1.1.yaml |multiple_choice |\n", - "|bertaqa_en_mt_latxa-70b-v1 |lm_eval/tasks/bertaqa/bertaqa_en_mt_latxa-70b-v1.yaml |multiple_choice |\n", - "|bertaqa_en_mt_latxa-70b-v1.1 |lm_eval/tasks/bertaqa/bertaqa_en_mt_latxa-70b-v1.1.yaml |multiple_choice |\n", - "|bertaqa_en_mt_latxa-7b-v1 |lm_eval/tasks/bertaqa/bertaqa_en_mt_latxa-7b-v1.yaml |multiple_choice |\n", - "|bertaqa_en_mt_latxa-7b-v1.1 |lm_eval/tasks/bertaqa/bertaqa_en_mt_latxa-7b-v1.1.yaml |multiple_choice |\n", - "|bertaqa_en_mt_llama-2-13b |lm_eval/tasks/bertaqa/bertaqa_en_mt_llama-2-13b.yaml |multiple_choice |\n", - "|bertaqa_en_mt_llama-2-70b |lm_eval/tasks/bertaqa/bertaqa_en_mt_llama-2-70b.yaml |multiple_choice |\n", - "|bertaqa_en_mt_llama-2-7b |lm_eval/tasks/bertaqa/bertaqa_en_mt_llama-2-7b.yaml |multiple_choice |\n", - "|bertaqa_en_mt_madlad |lm_eval/tasks/bertaqa/bertaqa_en_mt_madlad.yaml |multiple_choice |\n", - "|bertaqa_en_mt_nllb |lm_eval/tasks/bertaqa/bertaqa_en_mt_nllb.yaml |multiple_choice |\n", - "|bertaqa_eu |lm_eval/tasks/bertaqa/bertaqa_eu.yaml |multiple_choice |\n", - "|bhtc_v2 |lm_eval/tasks/basqueglue/bhtc.yaml |multiple_choice |\n", - "|bigbench_abstract_narrative_understanding_generate_until |lm_eval/tasks/bigbench/generate_until/abstract_narrative_understanding.yaml |generate_until |\n", - "|bigbench_abstract_narrative_understanding_multiple_choice |lm_eval/tasks/bigbench/multiple_choice/abstract_narrative_understanding.yaml |multiple_choice |\n", - "|bigbench_anachronisms_generate_until |lm_eval/tasks/bigbench/generate_until/anachronisms.yaml |generate_until |\n", - "|bigbench_anachronisms_multiple_choice |lm_eval/tasks/bigbench/multiple_choice/anachronisms.yaml |multiple_choice |\n", - "|bigbench_analogical_similarity_generate_until |lm_eval/tasks/bigbench/generate_until/analogical_similarity.yaml |generate_until |\n", - "|bigbench_analogical_similarity_multiple_choice |lm_eval/tasks/bigbench/multiple_choice/analogical_similarity.yaml |multiple_choice |\n", - "|bigbench_analytic_entailment_generate_until |lm_eval/tasks/bigbench/generate_until/analytic_entailment.yaml |generate_until |\n", - "|bigbench_analytic_entailment_multiple_choice |lm_eval/tasks/bigbench/multiple_choice/analytic_entailment.yaml |multiple_choice |\n", - "|bigbench_arithmetic_generate_until |lm_eval/tasks/bigbench/generate_until/arithmetic.yaml |generate_until |\n", - "|bigbench_arithmetic_multiple_choice |lm_eval/tasks/bigbench/multiple_choice/arithmetic.yaml |multiple_choice |\n", - "|bigbench_ascii_word_recognition_generate_until |lm_eval/tasks/bigbench/generate_until/ascii_word_recognition.yaml |generate_until |\n", - "|bigbench_authorship_verification_generate_until |lm_eval/tasks/bigbench/generate_until/authorship_verification.yaml |generate_until |\n", - "|bigbench_authorship_verification_multiple_choice |lm_eval/tasks/bigbench/multiple_choice/authorship_verification.yaml |multiple_choice |\n", - "|bigbench_auto_categorization_generate_until |lm_eval/tasks/bigbench/generate_until/auto_categorization.yaml |generate_until |\n", - "|bigbench_auto_debugging_generate_until |lm_eval/tasks/bigbench/generate_until/auto_debugging.yaml |generate_until |\n", - "|bigbench_bbq_lite_json_generate_until |lm_eval/tasks/bigbench/generate_until/bbq_lite_json.yaml |generate_until |\n", - "|bigbench_bbq_lite_json_multiple_choice |lm_eval/tasks/bigbench/multiple_choice/bbq_lite_json.yaml |multiple_choice |\n", - "|bigbench_bridging_anaphora_resolution_barqa_generate_until |lm_eval/tasks/bigbench/generate_until/bridging_anaphora_resolution_barqa.yaml |generate_until |\n", - "|bigbench_causal_judgment_generate_until |lm_eval/tasks/bigbench/generate_until/causal_judgment.yaml |generate_until |\n", - "|bigbench_causal_judgment_multiple_choice |lm_eval/tasks/bigbench/multiple_choice/causal_judgment.yaml |multiple_choice |\n", - "|bigbench_cause_and_effect_generate_until |lm_eval/tasks/bigbench/generate_until/cause_and_effect.yaml |generate_until |\n", - "|bigbench_cause_and_effect_multiple_choice |lm_eval/tasks/bigbench/multiple_choice/cause_and_effect.yaml |multiple_choice |\n", - "|bigbench_checkmate_in_one_generate_until |lm_eval/tasks/bigbench/generate_until/checkmate_in_one.yaml |generate_until |\n", - "|bigbench_checkmate_in_one_multiple_choice |lm_eval/tasks/bigbench/multiple_choice/checkmate_in_one.yaml |multiple_choice |\n", - "|bigbench_chess_state_tracking_generate_until |lm_eval/tasks/bigbench/generate_until/chess_state_tracking.yaml |generate_until |\n", - "|bigbench_chinese_remainder_theorem_generate_until |lm_eval/tasks/bigbench/generate_until/chinese_remainder_theorem.yaml |generate_until |\n", - "|bigbench_cifar10_classification_generate_until |lm_eval/tasks/bigbench/generate_until/cifar10_classification.yaml |generate_until |\n", - "|bigbench_cifar10_classification_multiple_choice |lm_eval/tasks/bigbench/multiple_choice/cifar10_classification.yaml |multiple_choice |\n", - "|bigbench_code_line_description_generate_until |lm_eval/tasks/bigbench/generate_until/code_line_description.yaml |generate_until |\n", - "|bigbench_code_line_description_multiple_choice |lm_eval/tasks/bigbench/multiple_choice/code_line_description.yaml |multiple_choice |\n", - "|bigbench_codenames_generate_until |lm_eval/tasks/bigbench/generate_until/codenames.yaml |generate_until |\n", - "|bigbench_color_generate_until |lm_eval/tasks/bigbench/generate_until/color.yaml |generate_until |\n", - "|bigbench_color_multiple_choice |lm_eval/tasks/bigbench/multiple_choice/color.yaml |multiple_choice |\n", - "|bigbench_common_morpheme_generate_until |lm_eval/tasks/bigbench/generate_until/common_morpheme.yaml |generate_until |\n", - "|bigbench_common_morpheme_multiple_choice |lm_eval/tasks/bigbench/multiple_choice/common_morpheme.yaml |multiple_choice |\n", - "|bigbench_conceptual_combinations_generate_until |lm_eval/tasks/bigbench/generate_until/conceptual_combinations.yaml |generate_until |\n", - "|bigbench_conceptual_combinations_multiple_choice |lm_eval/tasks/bigbench/multiple_choice/conceptual_combinations.yaml |multiple_choice |\n", - "|bigbench_conlang_translation_generate_until |lm_eval/tasks/bigbench/generate_until/conlang_translation.yaml |generate_until |\n", - "|bigbench_contextual_parametric_knowledge_conflicts_generate_until |lm_eval/tasks/bigbench/generate_until/contextual_parametric_knowledge_conflicts.yaml |generate_until |\n", - "|bigbench_contextual_parametric_knowledge_conflicts_multiple_choice |lm_eval/tasks/bigbench/multiple_choice/contextual_parametric_knowledge_conflicts.yaml |multiple_choice |\n", - "|bigbench_crash_blossom_generate_until |lm_eval/tasks/bigbench/generate_until/crash_blossom.yaml |generate_until |\n", - "|bigbench_crash_blossom_multiple_choice |lm_eval/tasks/bigbench/multiple_choice/crash_blossom.yaml |multiple_choice |\n", - "|bigbench_crass_ai_generate_until |lm_eval/tasks/bigbench/generate_until/crass_ai.yaml |generate_until |\n", - "|bigbench_crass_ai_multiple_choice |lm_eval/tasks/bigbench/multiple_choice/crass_ai.yaml |multiple_choice |\n", - "|bigbench_cryobiology_spanish_generate_until |lm_eval/tasks/bigbench/generate_until/cryobiology_spanish.yaml |generate_until |\n", - "|bigbench_cryobiology_spanish_multiple_choice |lm_eval/tasks/bigbench/multiple_choice/cryobiology_spanish.yaml |multiple_choice |\n", - "|bigbench_cryptonite_generate_until |lm_eval/tasks/bigbench/generate_until/cryptonite.yaml |generate_until |\n", - "|bigbench_cs_algorithms_generate_until |lm_eval/tasks/bigbench/generate_until/cs_algorithms.yaml |generate_until |\n", - "|bigbench_cs_algorithms_multiple_choice |lm_eval/tasks/bigbench/multiple_choice/cs_algorithms.yaml |multiple_choice |\n", - "|bigbench_dark_humor_detection_generate_until |lm_eval/tasks/bigbench/generate_until/dark_humor_detection.yaml |generate_until |\n", - "|bigbench_dark_humor_detection_multiple_choice |lm_eval/tasks/bigbench/multiple_choice/dark_humor_detection.yaml |multiple_choice |\n", - "|bigbench_date_understanding_generate_until |lm_eval/tasks/bigbench/generate_until/date_understanding.yaml |generate_until |\n", - "|bigbench_date_understanding_multiple_choice |lm_eval/tasks/bigbench/multiple_choice/date_understanding.yaml |multiple_choice |\n", - "|bigbench_disambiguation_qa_generate_until |lm_eval/tasks/bigbench/generate_until/disambiguation_qa.yaml |generate_until |\n", - "|bigbench_disambiguation_qa_multiple_choice |lm_eval/tasks/bigbench/multiple_choice/disambiguation_qa.yaml |multiple_choice |\n", - "|bigbench_discourse_marker_prediction_generate_until |lm_eval/tasks/bigbench/generate_until/discourse_marker_prediction.yaml |generate_until |\n", - "|bigbench_discourse_marker_prediction_multiple_choice |lm_eval/tasks/bigbench/multiple_choice/discourse_marker_prediction.yaml |multiple_choice |\n", - "|bigbench_disfl_qa_generate_until |lm_eval/tasks/bigbench/generate_until/disfl_qa.yaml |generate_until |\n", - "|bigbench_dyck_languages_generate_until |lm_eval/tasks/bigbench/generate_until/dyck_languages.yaml |generate_until |\n", - "|bigbench_dyck_languages_multiple_choice |lm_eval/tasks/bigbench/multiple_choice/dyck_languages.yaml |multiple_choice |\n", - "|bigbench_elementary_math_qa_generate_until |lm_eval/tasks/bigbench/generate_until/elementary_math_qa.yaml |generate_until |\n", - "|bigbench_elementary_math_qa_multiple_choice |lm_eval/tasks/bigbench/multiple_choice/elementary_math_qa.yaml |multiple_choice |\n", - "|bigbench_emoji_movie_generate_until |lm_eval/tasks/bigbench/generate_until/emoji_movie.yaml |generate_until |\n", - "|bigbench_emoji_movie_multiple_choice |lm_eval/tasks/bigbench/multiple_choice/emoji_movie.yaml |multiple_choice |\n", - "|bigbench_emojis_emotion_prediction_generate_until |lm_eval/tasks/bigbench/generate_until/emojis_emotion_prediction.yaml |generate_until |\n", - "|bigbench_emojis_emotion_prediction_multiple_choice |lm_eval/tasks/bigbench/multiple_choice/emojis_emotion_prediction.yaml |multiple_choice |\n", - "|bigbench_empirical_judgments_generate_until |lm_eval/tasks/bigbench/generate_until/empirical_judgments.yaml |generate_until |\n", - "|bigbench_empirical_judgments_multiple_choice |lm_eval/tasks/bigbench/multiple_choice/empirical_judgments.yaml |multiple_choice |\n", - "|bigbench_english_proverbs_generate_until |lm_eval/tasks/bigbench/generate_until/english_proverbs.yaml |generate_until |\n", - "|bigbench_english_proverbs_multiple_choice |lm_eval/tasks/bigbench/multiple_choice/english_proverbs.yaml |multiple_choice |\n", - "|bigbench_english_russian_proverbs_generate_until |lm_eval/tasks/bigbench/generate_until/english_russian_proverbs.yaml |generate_until |\n", - "|bigbench_english_russian_proverbs_multiple_choice |lm_eval/tasks/bigbench/multiple_choice/english_russian_proverbs.yaml |multiple_choice |\n", - "|bigbench_entailed_polarity_generate_until |lm_eval/tasks/bigbench/generate_until/entailed_polarity.yaml |generate_until |\n", - "|bigbench_entailed_polarity_hindi_generate_until |lm_eval/tasks/bigbench/generate_until/entailed_polarity_hindi.yaml |generate_until |\n", - "|bigbench_entailed_polarity_hindi_multiple_choice |lm_eval/tasks/bigbench/multiple_choice/entailed_polarity_hindi.yaml |multiple_choice |\n", - "|bigbench_entailed_polarity_multiple_choice |lm_eval/tasks/bigbench/multiple_choice/entailed_polarity.yaml |multiple_choice |\n", - "|bigbench_epistemic_reasoning_generate_until |lm_eval/tasks/bigbench/generate_until/epistemic_reasoning.yaml |generate_until |\n", - "|bigbench_epistemic_reasoning_multiple_choice |lm_eval/tasks/bigbench/multiple_choice/epistemic_reasoning.yaml |multiple_choice |\n", - "|bigbench_evaluating_information_essentiality_generate_until |lm_eval/tasks/bigbench/generate_until/evaluating_information_essentiality.yaml |generate_until |\n", - "|bigbench_evaluating_information_essentiality_multiple_choice |lm_eval/tasks/bigbench/multiple_choice/evaluating_information_essentiality.yaml |multiple_choice |\n", - "|bigbench_fact_checker_generate_until |lm_eval/tasks/bigbench/generate_until/fact_checker.yaml |generate_until |\n", - "|bigbench_fact_checker_multiple_choice |lm_eval/tasks/bigbench/multiple_choice/fact_checker.yaml |multiple_choice |\n", - "|bigbench_fantasy_reasoning_generate_until |lm_eval/tasks/bigbench/generate_until/fantasy_reasoning.yaml |generate_until |\n", - "|bigbench_fantasy_reasoning_multiple_choice |lm_eval/tasks/bigbench/multiple_choice/fantasy_reasoning.yaml |multiple_choice |\n", - "|bigbench_few_shot_nlg_generate_until |lm_eval/tasks/bigbench/generate_until/few_shot_nlg.yaml |generate_until |\n", - "|bigbench_figure_of_speech_detection_generate_until |lm_eval/tasks/bigbench/generate_until/figure_of_speech_detection.yaml |generate_until |\n", - "|bigbench_figure_of_speech_detection_multiple_choice |lm_eval/tasks/bigbench/multiple_choice/figure_of_speech_detection.yaml |multiple_choice |\n", - "|bigbench_formal_fallacies_syllogisms_negation_generate_until |lm_eval/tasks/bigbench/generate_until/formal_fallacies_syllogisms_negation.yaml |generate_until |\n", - "|bigbench_formal_fallacies_syllogisms_negation_multiple_choice |lm_eval/tasks/bigbench/multiple_choice/formal_fallacies_syllogisms_negation.yaml |multiple_choice |\n", - "|bigbench_gem_generate_until |lm_eval/tasks/bigbench/generate_until/gem.yaml |generate_until |\n", - "|bigbench_gender_inclusive_sentences_german_generate_until |lm_eval/tasks/bigbench/generate_until/gender_inclusive_sentences_german.yaml |generate_until |\n", - "|bigbench_general_knowledge_generate_until |lm_eval/tasks/bigbench/generate_until/general_knowledge.yaml |generate_until |\n", - "|bigbench_general_knowledge_multiple_choice |lm_eval/tasks/bigbench/multiple_choice/general_knowledge.yaml |multiple_choice |\n", - "|bigbench_geometric_shapes_generate_until |lm_eval/tasks/bigbench/generate_until/geometric_shapes.yaml |generate_until |\n", - "|bigbench_geometric_shapes_multiple_choice |lm_eval/tasks/bigbench/multiple_choice/geometric_shapes.yaml |multiple_choice |\n", - "|bigbench_goal_step_wikihow_generate_until |lm_eval/tasks/bigbench/generate_until/goal_step_wikihow.yaml |generate_until |\n", - "|bigbench_goal_step_wikihow_multiple_choice |lm_eval/tasks/bigbench/multiple_choice/goal_step_wikihow.yaml |multiple_choice |\n", - "|bigbench_gre_reading_comprehension_generate_until |lm_eval/tasks/bigbench/generate_until/gre_reading_comprehension.yaml |generate_until |\n", - "|bigbench_gre_reading_comprehension_multiple_choice |lm_eval/tasks/bigbench/multiple_choice/gre_reading_comprehension.yaml |multiple_choice |\n", - "|bigbench_hhh_alignment_generate_until |lm_eval/tasks/bigbench/generate_until/hhh_alignment.yaml |generate_until |\n", - "|bigbench_hhh_alignment_multiple_choice |lm_eval/tasks/bigbench/multiple_choice/hhh_alignment.yaml |multiple_choice |\n", - "|bigbench_hindi_question_answering_generate_until |lm_eval/tasks/bigbench/generate_until/hindi_question_answering.yaml |generate_until |\n", - "|bigbench_hindu_knowledge_generate_until |lm_eval/tasks/bigbench/generate_until/hindu_knowledge.yaml |generate_until |\n", - "|bigbench_hindu_knowledge_multiple_choice |lm_eval/tasks/bigbench/multiple_choice/hindu_knowledge.yaml |multiple_choice |\n", - "|bigbench_hinglish_toxicity_generate_until |lm_eval/tasks/bigbench/generate_until/hinglish_toxicity.yaml |generate_until |\n", - "|bigbench_hinglish_toxicity_multiple_choice |lm_eval/tasks/bigbench/multiple_choice/hinglish_toxicity.yaml |multiple_choice |\n", - "|bigbench_human_organs_senses_generate_until |lm_eval/tasks/bigbench/generate_until/human_organs_senses.yaml |generate_until |\n", - "|bigbench_human_organs_senses_multiple_choice |lm_eval/tasks/bigbench/multiple_choice/human_organs_senses.yaml |multiple_choice |\n", - "|bigbench_hyperbaton_generate_until |lm_eval/tasks/bigbench/generate_until/hyperbaton.yaml |generate_until |\n", - "|bigbench_hyperbaton_multiple_choice |lm_eval/tasks/bigbench/multiple_choice/hyperbaton.yaml |multiple_choice |\n", - "|bigbench_identify_math_theorems_generate_until |lm_eval/tasks/bigbench/generate_until/identify_math_theorems.yaml |generate_until |\n", - "|bigbench_identify_math_theorems_multiple_choice |lm_eval/tasks/bigbench/multiple_choice/identify_math_theorems.yaml |multiple_choice |\n", - "|bigbench_identify_odd_metaphor_generate_until |lm_eval/tasks/bigbench/generate_until/identify_odd_metaphor.yaml |generate_until |\n", - "|bigbench_identify_odd_metaphor_multiple_choice |lm_eval/tasks/bigbench/multiple_choice/identify_odd_metaphor.yaml |multiple_choice |\n", - "|bigbench_implicatures_generate_until |lm_eval/tasks/bigbench/generate_until/implicatures.yaml |generate_until |\n", - "|bigbench_implicatures_multiple_choice |lm_eval/tasks/bigbench/multiple_choice/implicatures.yaml |multiple_choice |\n", - "|bigbench_implicit_relations_generate_until |lm_eval/tasks/bigbench/generate_until/implicit_relations.yaml |generate_until |\n", - "|bigbench_implicit_relations_multiple_choice |lm_eval/tasks/bigbench/multiple_choice/implicit_relations.yaml |multiple_choice |\n", - "|bigbench_intent_recognition_generate_until |lm_eval/tasks/bigbench/generate_until/intent_recognition.yaml |generate_until |\n", - "|bigbench_intent_recognition_multiple_choice |lm_eval/tasks/bigbench/multiple_choice/intent_recognition.yaml |multiple_choice |\n", - "|bigbench_international_phonetic_alphabet_nli_generate_until |lm_eval/tasks/bigbench/generate_until/international_phonetic_alphabet_nli.yaml |generate_until |\n", - "|bigbench_international_phonetic_alphabet_nli_multiple_choice |lm_eval/tasks/bigbench/multiple_choice/international_phonetic_alphabet_nli.yaml |multiple_choice |\n", - "|bigbench_international_phonetic_alphabet_transliterate_generate_until |lm_eval/tasks/bigbench/generate_until/international_phonetic_alphabet_transliterate.yaml |generate_until |\n", - "|bigbench_intersect_geometry_generate_until |lm_eval/tasks/bigbench/generate_until/intersect_geometry.yaml |generate_until |\n", - "|bigbench_intersect_geometry_multiple_choice |lm_eval/tasks/bigbench/multiple_choice/intersect_geometry.yaml |multiple_choice |\n", - "|bigbench_irony_identification_generate_until |lm_eval/tasks/bigbench/generate_until/irony_identification.yaml |generate_until |\n", - "|bigbench_irony_identification_multiple_choice |lm_eval/tasks/bigbench/multiple_choice/irony_identification.yaml |multiple_choice |\n", - "|bigbench_kanji_ascii_generate_until |lm_eval/tasks/bigbench/generate_until/kanji_ascii.yaml |generate_until |\n", - "|bigbench_kanji_ascii_multiple_choice |lm_eval/tasks/bigbench/multiple_choice/kanji_ascii.yaml |multiple_choice |\n", - "|bigbench_kannada_generate_until |lm_eval/tasks/bigbench/generate_until/kannada.yaml |generate_until |\n", - "|bigbench_kannada_multiple_choice |lm_eval/tasks/bigbench/multiple_choice/kannada.yaml |multiple_choice |\n", - "|bigbench_key_value_maps_generate_until |lm_eval/tasks/bigbench/generate_until/key_value_maps.yaml |generate_until |\n", - "|bigbench_key_value_maps_multiple_choice |lm_eval/tasks/bigbench/multiple_choice/key_value_maps.yaml |multiple_choice |\n", - "|bigbench_known_unknowns_generate_until |lm_eval/tasks/bigbench/generate_until/known_unknowns.yaml |generate_until |\n", - "|bigbench_known_unknowns_multiple_choice |lm_eval/tasks/bigbench/multiple_choice/known_unknowns.yaml |multiple_choice |\n", - "|bigbench_language_games_generate_until |lm_eval/tasks/bigbench/generate_until/language_games.yaml |generate_until |\n", - "|bigbench_language_identification_generate_until |lm_eval/tasks/bigbench/generate_until/language_identification.yaml |generate_until |\n", - "|bigbench_language_identification_multiple_choice |lm_eval/tasks/bigbench/multiple_choice/language_identification.yaml |multiple_choice |\n", - "|bigbench_linguistic_mappings_generate_until |lm_eval/tasks/bigbench/generate_until/linguistic_mappings.yaml |generate_until |\n", - "|bigbench_linguistics_puzzles_generate_until |lm_eval/tasks/bigbench/generate_until/linguistics_puzzles.yaml |generate_until |\n", - "|bigbench_list_functions_generate_until |lm_eval/tasks/bigbench/generate_until/list_functions.yaml |generate_until |\n", - "|bigbench_logic_grid_puzzle_generate_until |lm_eval/tasks/bigbench/generate_until/logic_grid_puzzle.yaml |generate_until |\n", - "|bigbench_logic_grid_puzzle_multiple_choice |lm_eval/tasks/bigbench/multiple_choice/logic_grid_puzzle.yaml |multiple_choice |\n", - "|bigbench_logical_args_generate_until |lm_eval/tasks/bigbench/generate_until/logical_args.yaml |generate_until |\n", - "|bigbench_logical_args_multiple_choice |lm_eval/tasks/bigbench/multiple_choice/logical_args.yaml |multiple_choice |\n", - "|bigbench_logical_deduction_generate_until |lm_eval/tasks/bigbench/generate_until/logical_deduction.yaml |generate_until |\n", - "|bigbench_logical_deduction_multiple_choice |lm_eval/tasks/bigbench/multiple_choice/logical_deduction.yaml |multiple_choice |\n", - "|bigbench_logical_fallacy_detection_generate_until |lm_eval/tasks/bigbench/generate_until/logical_fallacy_detection.yaml |generate_until |\n", - "|bigbench_logical_fallacy_detection_multiple_choice |lm_eval/tasks/bigbench/multiple_choice/logical_fallacy_detection.yaml |multiple_choice |\n", - "|bigbench_logical_sequence_generate_until |lm_eval/tasks/bigbench/generate_until/logical_sequence.yaml |generate_until |\n", - "|bigbench_logical_sequence_multiple_choice |lm_eval/tasks/bigbench/multiple_choice/logical_sequence.yaml |multiple_choice |\n", - "|bigbench_mathematical_induction_generate_until |lm_eval/tasks/bigbench/generate_until/mathematical_induction.yaml |generate_until |\n", - "|bigbench_mathematical_induction_multiple_choice |lm_eval/tasks/bigbench/multiple_choice/mathematical_induction.yaml |multiple_choice |\n", - "|bigbench_matrixshapes_generate_until |lm_eval/tasks/bigbench/generate_until/matrixshapes.yaml |generate_until |\n", - "|bigbench_metaphor_boolean_generate_until |lm_eval/tasks/bigbench/generate_until/metaphor_boolean.yaml |generate_until |\n", - "|bigbench_metaphor_boolean_multiple_choice |lm_eval/tasks/bigbench/multiple_choice/metaphor_boolean.yaml |multiple_choice |\n", - "|bigbench_metaphor_understanding_generate_until |lm_eval/tasks/bigbench/generate_until/metaphor_understanding.yaml |generate_until |\n", - "|bigbench_metaphor_understanding_multiple_choice |lm_eval/tasks/bigbench/multiple_choice/metaphor_understanding.yaml |multiple_choice |\n", - "|bigbench_minute_mysteries_qa_generate_until |lm_eval/tasks/bigbench/generate_until/minute_mysteries_qa.yaml |generate_until |\n", - "|bigbench_misconceptions_generate_until |lm_eval/tasks/bigbench/generate_until/misconceptions.yaml |generate_until |\n", - "|bigbench_misconceptions_multiple_choice |lm_eval/tasks/bigbench/multiple_choice/misconceptions.yaml |multiple_choice |\n", - "|bigbench_misconceptions_russian_generate_until |lm_eval/tasks/bigbench/generate_until/misconceptions_russian.yaml |generate_until |\n", - "|bigbench_misconceptions_russian_multiple_choice |lm_eval/tasks/bigbench/multiple_choice/misconceptions_russian.yaml |multiple_choice |\n", - "|bigbench_mnist_ascii_generate_until |lm_eval/tasks/bigbench/generate_until/mnist_ascii.yaml |generate_until |\n", - "|bigbench_mnist_ascii_multiple_choice |lm_eval/tasks/bigbench/multiple_choice/mnist_ascii.yaml |multiple_choice |\n", - "|bigbench_modified_arithmetic_generate_until |lm_eval/tasks/bigbench/generate_until/modified_arithmetic.yaml |generate_until |\n", - "|bigbench_moral_permissibility_generate_until |lm_eval/tasks/bigbench/generate_until/moral_permissibility.yaml |generate_until |\n", - "|bigbench_moral_permissibility_multiple_choice |lm_eval/tasks/bigbench/multiple_choice/moral_permissibility.yaml |multiple_choice |\n", - "|bigbench_movie_dialog_same_or_different_generate_until |lm_eval/tasks/bigbench/generate_until/movie_dialog_same_or_different.yaml |generate_until |\n", - "|bigbench_movie_dialog_same_or_different_multiple_choice |lm_eval/tasks/bigbench/multiple_choice/movie_dialog_same_or_different.yaml |multiple_choice |\n", - "|bigbench_movie_recommendation_generate_until |lm_eval/tasks/bigbench/generate_until/movie_recommendation.yaml |generate_until |\n", - "|bigbench_movie_recommendation_multiple_choice |lm_eval/tasks/bigbench/multiple_choice/movie_recommendation.yaml |multiple_choice |\n", - "|bigbench_mult_data_wrangling_generate_until |lm_eval/tasks/bigbench/generate_until/mult_data_wrangling.yaml |generate_until |\n", - "|bigbench_multiemo_generate_until |lm_eval/tasks/bigbench/generate_until/multiemo.yaml |generate_until |\n", - "|bigbench_multiemo_multiple_choice |lm_eval/tasks/bigbench/multiple_choice/multiemo.yaml |multiple_choice |\n", - "|bigbench_natural_instructions_generate_until |lm_eval/tasks/bigbench/generate_until/natural_instructions.yaml |generate_until |\n", - "|bigbench_navigate_generate_until |lm_eval/tasks/bigbench/generate_until/navigate.yaml |generate_until |\n", - "|bigbench_navigate_multiple_choice |lm_eval/tasks/bigbench/multiple_choice/navigate.yaml |multiple_choice |\n", - "|bigbench_nonsense_words_grammar_generate_until |lm_eval/tasks/bigbench/generate_until/nonsense_words_grammar.yaml |generate_until |\n", - "|bigbench_nonsense_words_grammar_multiple_choice |lm_eval/tasks/bigbench/multiple_choice/nonsense_words_grammar.yaml |multiple_choice |\n", - "|bigbench_novel_concepts_generate_until |lm_eval/tasks/bigbench/generate_until/novel_concepts.yaml |generate_until |\n", - "|bigbench_novel_concepts_multiple_choice |lm_eval/tasks/bigbench/multiple_choice/novel_concepts.yaml |multiple_choice |\n", - "|bigbench_object_counting_generate_until |lm_eval/tasks/bigbench/generate_until/object_counting.yaml |generate_until |\n", - "|bigbench_odd_one_out_generate_until |lm_eval/tasks/bigbench/generate_until/odd_one_out.yaml |generate_until |\n", - "|bigbench_odd_one_out_multiple_choice |lm_eval/tasks/bigbench/multiple_choice/odd_one_out.yaml |multiple_choice |\n", - "|bigbench_operators_generate_until |lm_eval/tasks/bigbench/generate_until/operators.yaml |generate_until |\n", - "|bigbench_paragraph_segmentation_generate_until |lm_eval/tasks/bigbench/generate_until/paragraph_segmentation.yaml |generate_until |\n", - "|bigbench_parsinlu_qa_generate_until |lm_eval/tasks/bigbench/generate_until/parsinlu_qa.yaml |generate_until |\n", - "|bigbench_parsinlu_qa_multiple_choice |lm_eval/tasks/bigbench/multiple_choice/parsinlu_qa.yaml |multiple_choice |\n", - "|bigbench_parsinlu_reading_comprehension_generate_until |lm_eval/tasks/bigbench/generate_until/parsinlu_reading_comprehension.yaml |generate_until |\n", - "|bigbench_penguins_in_a_table_generate_until |lm_eval/tasks/bigbench/generate_until/penguins_in_a_table.yaml |generate_until |\n", - "|bigbench_penguins_in_a_table_multiple_choice |lm_eval/tasks/bigbench/multiple_choice/penguins_in_a_table.yaml |multiple_choice |\n", - "|bigbench_periodic_elements_generate_until |lm_eval/tasks/bigbench/generate_until/periodic_elements.yaml |generate_until |\n", - "|bigbench_periodic_elements_multiple_choice |lm_eval/tasks/bigbench/multiple_choice/periodic_elements.yaml |multiple_choice |\n", - "|bigbench_persian_idioms_generate_until |lm_eval/tasks/bigbench/generate_until/persian_idioms.yaml |generate_until |\n", - "|bigbench_persian_idioms_multiple_choice |lm_eval/tasks/bigbench/multiple_choice/persian_idioms.yaml |multiple_choice |\n", - "|bigbench_phrase_relatedness_generate_until |lm_eval/tasks/bigbench/generate_until/phrase_relatedness.yaml |generate_until |\n", - "|bigbench_phrase_relatedness_multiple_choice |lm_eval/tasks/bigbench/multiple_choice/phrase_relatedness.yaml |multiple_choice |\n", - "|bigbench_physical_intuition_generate_until |lm_eval/tasks/bigbench/generate_until/physical_intuition.yaml |generate_until |\n", - "|bigbench_physical_intuition_multiple_choice |lm_eval/tasks/bigbench/multiple_choice/physical_intuition.yaml |multiple_choice |\n", - "|bigbench_physics_generate_until |lm_eval/tasks/bigbench/generate_until/physics.yaml |generate_until |\n", - "|bigbench_physics_multiple_choice |lm_eval/tasks/bigbench/multiple_choice/physics.yaml |multiple_choice |\n", - "|bigbench_physics_questions_generate_until |lm_eval/tasks/bigbench/generate_until/physics_questions.yaml |generate_until |\n", - "|bigbench_play_dialog_same_or_different_generate_until |lm_eval/tasks/bigbench/generate_until/play_dialog_same_or_different.yaml |generate_until |\n", - "|bigbench_play_dialog_same_or_different_multiple_choice |lm_eval/tasks/bigbench/multiple_choice/play_dialog_same_or_different.yaml |multiple_choice |\n", - "|bigbench_polish_sequence_labeling_generate_until |lm_eval/tasks/bigbench/generate_until/polish_sequence_labeling.yaml |generate_until |\n", - "|bigbench_presuppositions_as_nli_generate_until |lm_eval/tasks/bigbench/generate_until/presuppositions_as_nli.yaml |generate_until |\n", - "|bigbench_presuppositions_as_nli_multiple_choice |lm_eval/tasks/bigbench/multiple_choice/presuppositions_as_nli.yaml |multiple_choice |\n", - "|bigbench_qa_wikidata_generate_until |lm_eval/tasks/bigbench/generate_until/qa_wikidata.yaml |generate_until |\n", - "|bigbench_question_selection_generate_until |lm_eval/tasks/bigbench/generate_until/question_selection.yaml |generate_until |\n", - "|bigbench_question_selection_multiple_choice |lm_eval/tasks/bigbench/multiple_choice/question_selection.yaml |multiple_choice |\n", - "|bigbench_real_or_fake_text_generate_until |lm_eval/tasks/bigbench/generate_until/real_or_fake_text.yaml |generate_until |\n", - "|bigbench_real_or_fake_text_multiple_choice |lm_eval/tasks/bigbench/multiple_choice/real_or_fake_text.yaml |multiple_choice |\n", - "|bigbench_reasoning_about_colored_objects_generate_until |lm_eval/tasks/bigbench/generate_until/reasoning_about_colored_objects.yaml |generate_until |\n", - "|bigbench_reasoning_about_colored_objects_multiple_choice |lm_eval/tasks/bigbench/multiple_choice/reasoning_about_colored_objects.yaml |multiple_choice |\n", - "|bigbench_repeat_copy_logic_generate_until |lm_eval/tasks/bigbench/generate_until/repeat_copy_logic.yaml |generate_until |\n", - "|bigbench_rephrase_generate_until |lm_eval/tasks/bigbench/generate_until/rephrase.yaml |generate_until |\n", - "|bigbench_riddle_sense_generate_until |lm_eval/tasks/bigbench/generate_until/riddle_sense.yaml |generate_until |\n", - "|bigbench_riddle_sense_multiple_choice |lm_eval/tasks/bigbench/multiple_choice/riddle_sense.yaml |multiple_choice |\n", - "|bigbench_ruin_names_generate_until |lm_eval/tasks/bigbench/generate_until/ruin_names.yaml |generate_until |\n", - "|bigbench_ruin_names_multiple_choice |lm_eval/tasks/bigbench/multiple_choice/ruin_names.yaml |multiple_choice |\n", - "|bigbench_salient_translation_error_detection_generate_until |lm_eval/tasks/bigbench/generate_until/salient_translation_error_detection.yaml |generate_until |\n", - "|bigbench_salient_translation_error_detection_multiple_choice |lm_eval/tasks/bigbench/multiple_choice/salient_translation_error_detection.yaml |multiple_choice |\n", - "|bigbench_scientific_press_release_generate_until |lm_eval/tasks/bigbench/generate_until/scientific_press_release.yaml |generate_until |\n", - "|bigbench_semantic_parsing_in_context_sparc_generate_until |lm_eval/tasks/bigbench/generate_until/semantic_parsing_in_context_sparc.yaml |generate_until |\n", - "|bigbench_semantic_parsing_spider_generate_until |lm_eval/tasks/bigbench/generate_until/semantic_parsing_spider.yaml |generate_until |\n", - "|bigbench_sentence_ambiguity_generate_until |lm_eval/tasks/bigbench/generate_until/sentence_ambiguity.yaml |generate_until |\n", - "|bigbench_sentence_ambiguity_multiple_choice |lm_eval/tasks/bigbench/multiple_choice/sentence_ambiguity.yaml |multiple_choice |\n", - "|bigbench_similarities_abstraction_generate_until |lm_eval/tasks/bigbench/generate_until/similarities_abstraction.yaml |generate_until |\n", - "|bigbench_similarities_abstraction_multiple_choice |lm_eval/tasks/bigbench/multiple_choice/similarities_abstraction.yaml |multiple_choice |\n", - "|bigbench_simp_turing_concept_generate_until |lm_eval/tasks/bigbench/generate_until/simp_turing_concept.yaml |generate_until |\n", - "|bigbench_simple_arithmetic_json_generate_until |lm_eval/tasks/bigbench/generate_until/simple_arithmetic_json.yaml |generate_until |\n", - "|bigbench_simple_arithmetic_json_multiple_choice_generate_until |lm_eval/tasks/bigbench/generate_until/simple_arithmetic_json_multiple_choice.yaml |generate_until |\n", - "|bigbench_simple_arithmetic_json_subtasks_generate_until |lm_eval/tasks/bigbench/generate_until/simple_arithmetic_json_subtasks.yaml |generate_until |\n", - "|bigbench_simple_arithmetic_multiple_targets_json_generate_until |lm_eval/tasks/bigbench/generate_until/simple_arithmetic_multiple_targets_json.yaml |generate_until |\n", - "|bigbench_simple_ethical_questions_generate_until |lm_eval/tasks/bigbench/generate_until/simple_ethical_questions.yaml |generate_until |\n", - "|bigbench_simple_ethical_questions_multiple_choice |lm_eval/tasks/bigbench/multiple_choice/simple_ethical_questions.yaml |multiple_choice |\n", - "|bigbench_simple_text_editing_generate_until |lm_eval/tasks/bigbench/generate_until/simple_text_editing.yaml |generate_until |\n", - "|bigbench_snarks_generate_until |lm_eval/tasks/bigbench/generate_until/snarks.yaml |generate_until |\n", - "|bigbench_snarks_multiple_choice |lm_eval/tasks/bigbench/multiple_choice/snarks.yaml |multiple_choice |\n", - "|bigbench_social_iqa_generate_until |lm_eval/tasks/bigbench/generate_until/social_iqa.yaml |generate_until |\n", - "|bigbench_social_iqa_multiple_choice |lm_eval/tasks/bigbench/multiple_choice/social_iqa.yaml |multiple_choice |\n", - "|bigbench_social_support_generate_until |lm_eval/tasks/bigbench/generate_until/social_support.yaml |generate_until |\n", - "|bigbench_social_support_multiple_choice |lm_eval/tasks/bigbench/multiple_choice/social_support.yaml |multiple_choice |\n", - "|bigbench_sports_understanding_generate_until |lm_eval/tasks/bigbench/generate_until/sports_understanding.yaml |generate_until |\n", - "|bigbench_sports_understanding_multiple_choice |lm_eval/tasks/bigbench/multiple_choice/sports_understanding.yaml |multiple_choice |\n", - "|bigbench_strange_stories_generate_until |lm_eval/tasks/bigbench/generate_until/strange_stories.yaml |generate_until |\n", - "|bigbench_strange_stories_multiple_choice |lm_eval/tasks/bigbench/multiple_choice/strange_stories.yaml |multiple_choice |\n", - "|bigbench_strategyqa_generate_until |lm_eval/tasks/bigbench/generate_until/strategyqa.yaml |generate_until |\n", - "|bigbench_strategyqa_multiple_choice |lm_eval/tasks/bigbench/multiple_choice/strategyqa.yaml |multiple_choice |\n", - "|bigbench_sufficient_information_generate_until |lm_eval/tasks/bigbench/generate_until/sufficient_information.yaml |generate_until |\n", - "|bigbench_suicide_risk_generate_until |lm_eval/tasks/bigbench/generate_until/suicide_risk.yaml |generate_until |\n", - "|bigbench_suicide_risk_multiple_choice |lm_eval/tasks/bigbench/multiple_choice/suicide_risk.yaml |multiple_choice |\n", - "|bigbench_swahili_english_proverbs_generate_until |lm_eval/tasks/bigbench/generate_until/swahili_english_proverbs.yaml |generate_until |\n", - "|bigbench_swahili_english_proverbs_multiple_choice |lm_eval/tasks/bigbench/multiple_choice/swahili_english_proverbs.yaml |multiple_choice |\n", - "|bigbench_swedish_to_german_proverbs_generate_until |lm_eval/tasks/bigbench/generate_until/swedish_to_german_proverbs.yaml |generate_until |\n", - "|bigbench_swedish_to_german_proverbs_multiple_choice |lm_eval/tasks/bigbench/multiple_choice/swedish_to_german_proverbs.yaml |multiple_choice |\n", - "|bigbench_symbol_interpretation_generate_until |lm_eval/tasks/bigbench/generate_until/symbol_interpretation.yaml |generate_until |\n", - "|bigbench_symbol_interpretation_multiple_choice |lm_eval/tasks/bigbench/multiple_choice/symbol_interpretation.yaml |multiple_choice |\n", - "|bigbench_temporal_sequences_generate_until |lm_eval/tasks/bigbench/generate_until/temporal_sequences.yaml |generate_until |\n", - "|bigbench_temporal_sequences_multiple_choice |lm_eval/tasks/bigbench/multiple_choice/temporal_sequences.yaml |multiple_choice |\n", - "|bigbench_tense_generate_until |lm_eval/tasks/bigbench/generate_until/tense.yaml |generate_until |\n", - "|bigbench_timedial_generate_until |lm_eval/tasks/bigbench/generate_until/timedial.yaml |generate_until |\n", - "|bigbench_timedial_multiple_choice |lm_eval/tasks/bigbench/multiple_choice/timedial.yaml |multiple_choice |\n", - "|bigbench_topical_chat_generate_until |lm_eval/tasks/bigbench/generate_until/topical_chat.yaml |generate_until |\n", - "|bigbench_tracking_shuffled_objects_generate_until |lm_eval/tasks/bigbench/generate_until/tracking_shuffled_objects.yaml |generate_until |\n", - "|bigbench_tracking_shuffled_objects_multiple_choice |lm_eval/tasks/bigbench/multiple_choice/tracking_shuffled_objects.yaml |multiple_choice |\n", - "|bigbench_understanding_fables_generate_until |lm_eval/tasks/bigbench/generate_until/understanding_fables.yaml |generate_until |\n", - "|bigbench_understanding_fables_multiple_choice |lm_eval/tasks/bigbench/multiple_choice/understanding_fables.yaml |multiple_choice |\n", - "|bigbench_undo_permutation_generate_until |lm_eval/tasks/bigbench/generate_until/undo_permutation.yaml |generate_until |\n", - "|bigbench_undo_permutation_multiple_choice |lm_eval/tasks/bigbench/multiple_choice/undo_permutation.yaml |multiple_choice |\n", - "|bigbench_unit_conversion_generate_until |lm_eval/tasks/bigbench/generate_until/unit_conversion.yaml |generate_until |\n", - "|bigbench_unit_conversion_multiple_choice |lm_eval/tasks/bigbench/multiple_choice/unit_conversion.yaml |multiple_choice |\n", - "|bigbench_unit_interpretation_generate_until |lm_eval/tasks/bigbench/generate_until/unit_interpretation.yaml |generate_until |\n", - "|bigbench_unit_interpretation_multiple_choice |lm_eval/tasks/bigbench/multiple_choice/unit_interpretation.yaml |multiple_choice |\n", - "|bigbench_unnatural_in_context_learning_generate_until |lm_eval/tasks/bigbench/generate_until/unnatural_in_context_learning.yaml |generate_until |\n", - "|bigbench_vitaminc_fact_verification_generate_until |lm_eval/tasks/bigbench/generate_until/vitaminc_fact_verification.yaml |generate_until |\n", - "|bigbench_vitaminc_fact_verification_multiple_choice |lm_eval/tasks/bigbench/multiple_choice/vitaminc_fact_verification.yaml |multiple_choice |\n", - "|bigbench_what_is_the_tao_generate_until |lm_eval/tasks/bigbench/generate_until/what_is_the_tao.yaml |generate_until |\n", - "|bigbench_what_is_the_tao_multiple_choice |lm_eval/tasks/bigbench/multiple_choice/what_is_the_tao.yaml |multiple_choice |\n", - "|bigbench_which_wiki_edit_generate_until |lm_eval/tasks/bigbench/generate_until/which_wiki_edit.yaml |generate_until |\n", - "|bigbench_which_wiki_edit_multiple_choice |lm_eval/tasks/bigbench/multiple_choice/which_wiki_edit.yaml |multiple_choice |\n", - "|bigbench_winowhy_generate_until |lm_eval/tasks/bigbench/generate_until/winowhy.yaml |generate_until |\n", - "|bigbench_winowhy_multiple_choice |lm_eval/tasks/bigbench/multiple_choice/winowhy.yaml |multiple_choice |\n", - "|bigbench_word_sorting_generate_until |lm_eval/tasks/bigbench/generate_until/word_sorting.yaml |generate_until |\n", - "|bigbench_word_unscrambling_generate_until |lm_eval/tasks/bigbench/generate_until/word_unscrambling.yaml |generate_until |\n", - "|blimp_adjunct_island |lm_eval/tasks/blimp/adjunct_island.yaml |multiple_choice |\n", - "|blimp_anaphor_gender_agreement |lm_eval/tasks/blimp/anaphor_gender_agreement.yaml |multiple_choice |\n", - "|blimp_anaphor_number_agreement |lm_eval/tasks/blimp/anaphor_number_agreement.yaml |multiple_choice |\n", - "|blimp_animate_subject_passive |lm_eval/tasks/blimp/animate_subject_passive.yaml |multiple_choice |\n", - "|blimp_animate_subject_trans |lm_eval/tasks/blimp/animate_subject_trans.yaml |multiple_choice |\n", - "|blimp_causative |lm_eval/tasks/blimp/causative.yaml |multiple_choice |\n", - "|blimp_complex_NP_island |lm_eval/tasks/blimp/complex_NP_island.yaml |multiple_choice |\n", - "|blimp_coordinate_structure_constraint_complex_left_branch |lm_eval/tasks/blimp/coordinate_structure_constraint_complex_left_branch.yaml |multiple_choice |\n", - "|blimp_coordinate_structure_constraint_object_extraction |lm_eval/tasks/blimp/coordinate_structure_constraint_object_extraction.yaml |multiple_choice |\n", - "|blimp_determiner_noun_agreement_1 |lm_eval/tasks/blimp/determiner_noun_agreement_1.yaml |multiple_choice |\n", - "|blimp_determiner_noun_agreement_2 |lm_eval/tasks/blimp/determiner_noun_agreement_2.yaml |multiple_choice |\n", - "|blimp_determiner_noun_agreement_irregular_1 |lm_eval/tasks/blimp/determiner_noun_agreement_irregular_1.yaml |multiple_choice |\n", - "|blimp_determiner_noun_agreement_irregular_2 |lm_eval/tasks/blimp/determiner_noun_agreement_irregular_2.yaml |multiple_choice |\n", - "|blimp_determiner_noun_agreement_with_adj_2 |lm_eval/tasks/blimp/determiner_noun_agreement_with_adj_2.yaml |multiple_choice |\n", - "|blimp_determiner_noun_agreement_with_adj_irregular_1 |lm_eval/tasks/blimp/determiner_noun_agreement_with_adj_irregular_1.yaml |multiple_choice |\n", - "|blimp_determiner_noun_agreement_with_adj_irregular_2 |lm_eval/tasks/blimp/determiner_noun_agreement_with_adj_irregular_2.yaml |multiple_choice |\n", - "|blimp_determiner_noun_agreement_with_adjective_1 |lm_eval/tasks/blimp/determiner_noun_agreement_with_adjective_1.yaml |multiple_choice |\n", - "|blimp_distractor_agreement_relational_noun |lm_eval/tasks/blimp/distractor_agreement_relational_noun.yaml |multiple_choice |\n", - "|blimp_distractor_agreement_relative_clause |lm_eval/tasks/blimp/distractor_agreement_relative_clause.yaml |multiple_choice |\n", - "|blimp_drop_argument |lm_eval/tasks/blimp/drop_argument.yaml |multiple_choice |\n", - "|blimp_ellipsis_n_bar_1 |lm_eval/tasks/blimp/ellipsis_n_bar_1.yaml |multiple_choice |\n", - "|blimp_ellipsis_n_bar_2 |lm_eval/tasks/blimp/ellipsis_n_bar_2.yaml |multiple_choice |\n", - "|blimp_existential_there_object_raising |lm_eval/tasks/blimp/existential_there_object_raising.yaml |multiple_choice |\n", - "|blimp_existential_there_quantifiers_1 |lm_eval/tasks/blimp/existential_there_quantifiers_1.yaml |multiple_choice |\n", - "|blimp_existential_there_quantifiers_2 |lm_eval/tasks/blimp/existential_there_quantifiers_2.yaml |multiple_choice |\n", - "|blimp_existential_there_subject_raising |lm_eval/tasks/blimp/existential_there_subject_raising.yaml |multiple_choice |\n", - "|blimp_expletive_it_object_raising |lm_eval/tasks/blimp/expletive_it_object_raising.yaml |multiple_choice |\n", - "|blimp_inchoative |lm_eval/tasks/blimp/inchoative.yaml |multiple_choice |\n", - "|blimp_intransitive |lm_eval/tasks/blimp/intransitive.yaml |multiple_choice |\n", - "|blimp_irregular_past_participle_adjectives |lm_eval/tasks/blimp/irregular_past_participle_adjectives.yaml |multiple_choice |\n", - "|blimp_irregular_past_participle_verbs |lm_eval/tasks/blimp/irregular_past_participle_verbs.yaml |multiple_choice |\n", - "|blimp_irregular_plural_subject_verb_agreement_1 |lm_eval/tasks/blimp/irregular_plural_subject_verb_agreement_1.yaml |multiple_choice |\n", - "|blimp_irregular_plural_subject_verb_agreement_2 |lm_eval/tasks/blimp/irregular_plural_subject_verb_agreement_2.yaml |multiple_choice |\n", - "|blimp_left_branch_island_echo_question |lm_eval/tasks/blimp/left_branch_island_echo_question.yaml |multiple_choice |\n", - "|blimp_left_branch_island_simple_question |lm_eval/tasks/blimp/left_branch_island_simple_question.yaml |multiple_choice |\n", - "|blimp_matrix_question_npi_licensor_present |lm_eval/tasks/blimp/matrix_question_npi_licensor_present.yaml |multiple_choice |\n", - "|blimp_npi_present_1 |lm_eval/tasks/blimp/npi_present_1.yaml |multiple_choice |\n", - "|blimp_npi_present_2 |lm_eval/tasks/blimp/npi_present_2.yaml |multiple_choice |\n", - "|blimp_only_npi_licensor_present |lm_eval/tasks/blimp/only_npi_licensor_present.yaml |multiple_choice |\n", - "|blimp_only_npi_scope |lm_eval/tasks/blimp/only_npi_scope.yaml |multiple_choice |\n", - "|blimp_passive_1 |lm_eval/tasks/blimp/passive_1.yaml |multiple_choice |\n", - "|blimp_passive_2 |lm_eval/tasks/blimp/passive_2.yaml |multiple_choice |\n", - "|blimp_principle_A_c_command |lm_eval/tasks/blimp/principle_A_c_command.yaml |multiple_choice |\n", - "|blimp_principle_A_case_1 |lm_eval/tasks/blimp/principle_A_case_1.yaml |multiple_choice |\n", - "|blimp_principle_A_case_2 |lm_eval/tasks/blimp/principle_A_case_2.yaml |multiple_choice |\n", - "|blimp_principle_A_domain_1 |lm_eval/tasks/blimp/principle_A_domain_1.yaml |multiple_choice |\n", - "|blimp_principle_A_domain_2 |lm_eval/tasks/blimp/principle_A_domain_2.yaml |multiple_choice |\n", - "|blimp_principle_A_domain_3 |lm_eval/tasks/blimp/principle_A_domain_3.yaml |multiple_choice |\n", - "|blimp_principle_A_reconstruction |lm_eval/tasks/blimp/principle_A_reconstruction.yaml |multiple_choice |\n", - "|blimp_regular_plural_subject_verb_agreement_1 |lm_eval/tasks/blimp/regular_plural_subject_verb_agreement_1.yaml |multiple_choice |\n", - "|blimp_regular_plural_subject_verb_agreement_2 |lm_eval/tasks/blimp/regular_plural_subject_verb_agreement_2.yaml |multiple_choice |\n", - "|blimp_sentential_negation_npi_licensor_present |lm_eval/tasks/blimp/sentential_negation_npi_licensor_present.yaml |multiple_choice |\n", - "|blimp_sentential_negation_npi_scope |lm_eval/tasks/blimp/sentential_negation_npi_scope.yaml |multiple_choice |\n", - "|blimp_sentential_subject_island |lm_eval/tasks/blimp/sentential_subject_island.yaml |multiple_choice |\n", - "|blimp_superlative_quantifiers_1 |lm_eval/tasks/blimp/superlative_quantifiers_1.yaml |multiple_choice |\n", - "|blimp_superlative_quantifiers_2 |lm_eval/tasks/blimp/superlative_quantifiers_2.yaml |multiple_choice |\n", - "|blimp_tough_vs_raising_1 |lm_eval/tasks/blimp/tough_vs_raising_1.yaml |multiple_choice |\n", - "|blimp_tough_vs_raising_2 |lm_eval/tasks/blimp/tough_vs_raising_2.yaml |multiple_choice |\n", - "|blimp_transitive |lm_eval/tasks/blimp/transitive.yaml |multiple_choice |\n", - "|blimp_wh_island |lm_eval/tasks/blimp/wh_island.yaml |multiple_choice |\n", - "|blimp_wh_questions_object_gap |lm_eval/tasks/blimp/wh_questions_object_gap.yaml |multiple_choice |\n", - "|blimp_wh_questions_subject_gap |lm_eval/tasks/blimp/wh_questions_subject_gap.yaml |multiple_choice |\n", - "|blimp_wh_questions_subject_gap_long_distance |lm_eval/tasks/blimp/wh_questions_subject_gap_long_distance.yaml |multiple_choice |\n", - "|blimp_wh_vs_that_no_gap |lm_eval/tasks/blimp/wh_vs_that_no_gap.yaml |multiple_choice |\n", - "|blimp_wh_vs_that_no_gap_long_distance |lm_eval/tasks/blimp/wh_vs_that_no_gap_long_distance.yaml |multiple_choice |\n", - "|blimp_wh_vs_that_with_gap |lm_eval/tasks/blimp/wh_vs_that_with_gap.yaml |multiple_choice |\n", - "|blimp_wh_vs_that_with_gap_long_distance |lm_eval/tasks/blimp/wh_vs_that_with_gap_long_distance.yaml |multiple_choice |\n", - "|boolq |lm_eval/tasks/super_glue/boolq/default.yaml |multiple_choice |\n", - "|boolq-seq2seq |lm_eval/tasks/super_glue/boolq/seq2seq.yaml |generate_until |\n", - "|cabreu_abstractive |lm_eval/tasks/catalan_bench/cabreu_abstractive.yaml |generate_until |\n", - "|cabreu_extractive |lm_eval/tasks/catalan_bench/cabreu_extractive.yaml |generate_until |\n", - "|cabreu_extreme |lm_eval/tasks/catalan_bench/cabreu_extreme.yaml |generate_until |\n", - "|catalanqa |lm_eval/tasks/catalan_bench/catalanqa.yaml |generate_until |\n", - "|catcola |lm_eval/tasks/catalan_bench/catcola.yaml |multiple_choice |\n", - "|cb |lm_eval/tasks/super_glue/cb/default.yaml |multiple_choice |\n", - "|ceval-valid_accountant |lm_eval/tasks/ceval/ceval-valid_accountant.yaml |multiple_choice |\n", - "|ceval-valid_advanced_mathematics |lm_eval/tasks/ceval/ceval-valid_advanced_mathematics.yaml |multiple_choice |\n", - "|ceval-valid_art_studies |lm_eval/tasks/ceval/ceval-valid_art_studies.yaml |multiple_choice |\n", - "|ceval-valid_basic_medicine |lm_eval/tasks/ceval/ceval-valid_basic_medicine.yaml |multiple_choice |\n", - "|ceval-valid_business_administration |lm_eval/tasks/ceval/ceval-valid_business_administration.yaml |multiple_choice |\n", - "|ceval-valid_chinese_language_and_literature |lm_eval/tasks/ceval/ceval-valid_chinese_language_and_literature.yaml |multiple_choice |\n", - "|ceval-valid_civil_servant |lm_eval/tasks/ceval/ceval-valid_civil_servant.yaml |multiple_choice |\n", - "|ceval-valid_clinical_medicine |lm_eval/tasks/ceval/ceval-valid_clinical_medicine.yaml |multiple_choice |\n", - "|ceval-valid_college_chemistry |lm_eval/tasks/ceval/ceval-valid_college_chemistry.yaml |multiple_choice |\n", - "|ceval-valid_college_economics |lm_eval/tasks/ceval/ceval-valid_college_economics.yaml |multiple_choice |\n", - "|ceval-valid_college_physics |lm_eval/tasks/ceval/ceval-valid_college_physics.yaml |multiple_choice |\n", - "|ceval-valid_college_programming |lm_eval/tasks/ceval/ceval-valid_college_programming.yaml |multiple_choice |\n", - "|ceval-valid_computer_architecture |lm_eval/tasks/ceval/ceval-valid_computer_architecture.yaml |multiple_choice |\n", - "|ceval-valid_computer_network |lm_eval/tasks/ceval/ceval-valid_computer_network.yaml |multiple_choice |\n", - "|ceval-valid_discrete_mathematics |lm_eval/tasks/ceval/ceval-valid_discrete_mathematics.yaml |multiple_choice |\n", - "|ceval-valid_education_science |lm_eval/tasks/ceval/ceval-valid_education_science.yaml |multiple_choice |\n", - "|ceval-valid_electrical_engineer |lm_eval/tasks/ceval/ceval-valid_electrical_engineer.yaml |multiple_choice |\n", - "|ceval-valid_environmental_impact_assessment_engineer |lm_eval/tasks/ceval/ceval-valid_environmental_impact_assessment_engineer.yaml |multiple_choice |\n", - "|ceval-valid_fire_engineer |lm_eval/tasks/ceval/ceval-valid_fire_engineer.yaml |multiple_choice |\n", - "|ceval-valid_high_school_biology |lm_eval/tasks/ceval/ceval-valid_high_school_biology.yaml |multiple_choice |\n", - "|ceval-valid_high_school_chemistry |lm_eval/tasks/ceval/ceval-valid_high_school_chemistry.yaml |multiple_choice |\n", - "|ceval-valid_high_school_chinese |lm_eval/tasks/ceval/ceval-valid_high_school_chinese.yaml |multiple_choice |\n", - "|ceval-valid_high_school_geography |lm_eval/tasks/ceval/ceval-valid_high_school_geography.yaml |multiple_choice |\n", - "|ceval-valid_high_school_history |lm_eval/tasks/ceval/ceval-valid_high_school_history.yaml |multiple_choice |\n", - "|ceval-valid_high_school_mathematics |lm_eval/tasks/ceval/ceval-valid_high_school_mathematics.yaml |multiple_choice |\n", - "|ceval-valid_high_school_physics |lm_eval/tasks/ceval/ceval-valid_high_school_physics.yaml |multiple_choice |\n", - "|ceval-valid_high_school_politics |lm_eval/tasks/ceval/ceval-valid_high_school_politics.yaml |multiple_choice |\n", - "|ceval-valid_ideological_and_moral_cultivation |lm_eval/tasks/ceval/ceval-valid_ideological_and_moral_cultivation.yaml |multiple_choice |\n", - "|ceval-valid_law |lm_eval/tasks/ceval/ceval-valid_law.yaml |multiple_choice |\n", - "|ceval-valid_legal_professional |lm_eval/tasks/ceval/ceval-valid_legal_professional.yaml |multiple_choice |\n", - "|ceval-valid_logic |lm_eval/tasks/ceval/ceval-valid_logic.yaml |multiple_choice |\n", - "|ceval-valid_mao_zedong_thought |lm_eval/tasks/ceval/ceval-valid_mao_zedong_thought.yaml |multiple_choice |\n", - "|ceval-valid_marxism |lm_eval/tasks/ceval/ceval-valid_marxism.yaml |multiple_choice |\n", - "|ceval-valid_metrology_engineer |lm_eval/tasks/ceval/ceval-valid_metrology_engineer.yaml |multiple_choice |\n", - "|ceval-valid_middle_school_biology |lm_eval/tasks/ceval/ceval-valid_middle_school_biology.yaml |multiple_choice |\n", - "|ceval-valid_middle_school_chemistry |lm_eval/tasks/ceval/ceval-valid_middle_school_chemistry.yaml |multiple_choice |\n", - "|ceval-valid_middle_school_geography |lm_eval/tasks/ceval/ceval-valid_middle_school_geography.yaml |multiple_choice |\n", - "|ceval-valid_middle_school_history |lm_eval/tasks/ceval/ceval-valid_middle_school_history.yaml |multiple_choice |\n", - "|ceval-valid_middle_school_mathematics |lm_eval/tasks/ceval/ceval-valid_middle_school_mathematics.yaml |multiple_choice |\n", - "|ceval-valid_middle_school_physics |lm_eval/tasks/ceval/ceval-valid_middle_school_physics.yaml |multiple_choice |\n", - "|ceval-valid_middle_school_politics |lm_eval/tasks/ceval/ceval-valid_middle_school_politics.yaml |multiple_choice |\n", - "|ceval-valid_modern_chinese_history |lm_eval/tasks/ceval/ceval-valid_modern_chinese_history.yaml |multiple_choice |\n", - "|ceval-valid_operating_system |lm_eval/tasks/ceval/ceval-valid_operating_system.yaml |multiple_choice |\n", - "|ceval-valid_physician |lm_eval/tasks/ceval/ceval-valid_physician.yaml |multiple_choice |\n", - "|ceval-valid_plant_protection |lm_eval/tasks/ceval/ceval-valid_plant_protection.yaml |multiple_choice |\n", - "|ceval-valid_probability_and_statistics |lm_eval/tasks/ceval/ceval-valid_probability_and_statistics.yaml |multiple_choice |\n", - "|ceval-valid_professional_tour_guide |lm_eval/tasks/ceval/ceval-valid_professional_tour_guide.yaml |multiple_choice |\n", - "|ceval-valid_sports_science |lm_eval/tasks/ceval/ceval-valid_sports_science.yaml |multiple_choice |\n", - "|ceval-valid_tax_accountant |lm_eval/tasks/ceval/ceval-valid_tax_accountant.yaml |multiple_choice |\n", - "|ceval-valid_teacher_qualification |lm_eval/tasks/ceval/ceval-valid_teacher_qualification.yaml |multiple_choice |\n", - "|ceval-valid_urban_and_rural_planner |lm_eval/tasks/ceval/ceval-valid_urban_and_rural_planner.yaml |multiple_choice |\n", - "|ceval-valid_veterinary_medicine |lm_eval/tasks/ceval/ceval-valid_veterinary_medicine.yaml |multiple_choice |\n", - "|claim_stance_topic |lm_eval/tasks/unitxt/claim_stance_topic.yaml | |\n", - "|cmmlu_agronomy |lm_eval/tasks/cmmlu/cmmlu_agronomy.yaml |multiple_choice |\n", - "|cmmlu_anatomy |lm_eval/tasks/cmmlu/cmmlu_anatomy.yaml |multiple_choice |\n", - "|cmmlu_ancient_chinese |lm_eval/tasks/cmmlu/cmmlu_default_ancient_chinese.yaml |multiple_choice |\n", - "|cmmlu_arts |lm_eval/tasks/cmmlu/cmmlu_arts.yaml |multiple_choice |\n", - "|cmmlu_astronomy |lm_eval/tasks/cmmlu/cmmlu_astronomy.yaml |multiple_choice |\n", - "|cmmlu_business_ethics |lm_eval/tasks/cmmlu/cmmlu_business_ethics.yaml |multiple_choice |\n", - "|cmmlu_chinese_civil_service_exam |lm_eval/tasks/cmmlu/cmmlu_default_chinese_civil_service_exam.yaml |multiple_choice |\n", - "|cmmlu_chinese_driving_rule |lm_eval/tasks/cmmlu/cmmlu_default_chinese_driving_rule.yaml |multiple_choice |\n", - "|cmmlu_chinese_food_culture |lm_eval/tasks/cmmlu/cmmlu_chinese_food_culture.yaml |multiple_choice |\n", - "|cmmlu_chinese_foreign_policy |lm_eval/tasks/cmmlu/cmmlu_chinese_foreign_policy.yaml |multiple_choice |\n", - "|cmmlu_chinese_history |lm_eval/tasks/cmmlu/cmmlu_default_chinese_history.yaml |multiple_choice |\n", - "|cmmlu_chinese_literature |lm_eval/tasks/cmmlu/cmmlu_default_chinese_literature.yaml |multiple_choice |\n", - "|cmmlu_chinese_teacher_qualification |lm_eval/tasks/cmmlu/cmmlu_chinese_teacher_qualification.yaml |multiple_choice |\n", - "|cmmlu_clinical_knowledge |lm_eval/tasks/cmmlu/cmmlu_default_clinical_knowledge.yaml |multiple_choice |\n", - "|cmmlu_college_actuarial_science |lm_eval/tasks/cmmlu/cmmlu_college_actuarial_science.yaml |multiple_choice |\n", - "|cmmlu_college_education |lm_eval/tasks/cmmlu/cmmlu_default_college_education.yaml |multiple_choice |\n", - "|cmmlu_college_engineering_hydrology |lm_eval/tasks/cmmlu/cmmlu_college_engineering_hydrology.yaml |multiple_choice |\n", - "|cmmlu_college_law |lm_eval/tasks/cmmlu/cmmlu_college_law.yaml |multiple_choice |\n", - "|cmmlu_college_mathematics |lm_eval/tasks/cmmlu/cmmlu_college_mathematics.yaml |multiple_choice |\n", - "|cmmlu_college_medical_statistics |lm_eval/tasks/cmmlu/cmmlu_college_medical_statistics.yaml |multiple_choice |\n", - "|cmmlu_college_medicine |lm_eval/tasks/cmmlu/cmmlu_college_medicine.yaml |multiple_choice |\n", - "|cmmlu_computer_science |lm_eval/tasks/cmmlu/cmmlu_default_computer_science.yaml |multiple_choice |\n", - "|cmmlu_computer_security |lm_eval/tasks/cmmlu/cmmlu_computer_security.yaml |multiple_choice |\n", - "|cmmlu_conceptual_physics |lm_eval/tasks/cmmlu/cmmlu_conceptual_physics.yaml |multiple_choice |\n", - "|cmmlu_construction_project_management |lm_eval/tasks/cmmlu/cmmlu_construction_project_management.yaml |multiple_choice |\n", - "|cmmlu_economics |lm_eval/tasks/cmmlu/cmmlu_default_economics.yaml |multiple_choice |\n", - "|cmmlu_education |lm_eval/tasks/cmmlu/cmmlu_education.yaml |multiple_choice |\n", - "|cmmlu_electrical_engineering |lm_eval/tasks/cmmlu/cmmlu_default_electrical_engineering.yaml |multiple_choice |\n", - "|cmmlu_elementary_chinese |lm_eval/tasks/cmmlu/cmmlu_default_elementary_chinese.yaml |multiple_choice |\n", - "|cmmlu_elementary_commonsense |lm_eval/tasks/cmmlu/cmmlu_elementary_commonsense.yaml |multiple_choice |\n", - "|cmmlu_elementary_information_and_technology |lm_eval/tasks/cmmlu/cmmlu_default_elementary_information_and_technology.yaml |multiple_choice |\n", - "|cmmlu_elementary_mathematics |lm_eval/tasks/cmmlu/cmmlu_elementary_mathematics.yaml |multiple_choice |\n", - "|cmmlu_ethnology |lm_eval/tasks/cmmlu/cmmlu_default_ethnology.yaml |multiple_choice |\n", - "|cmmlu_food_science |lm_eval/tasks/cmmlu/cmmlu_food_science.yaml |multiple_choice |\n", - "|cmmlu_genetics |lm_eval/tasks/cmmlu/cmmlu_default_genetics.yaml |multiple_choice |\n", - "|cmmlu_global_facts |lm_eval/tasks/cmmlu/cmmlu_default_global_facts.yaml |multiple_choice |\n", - "|cmmlu_high_school_biology |lm_eval/tasks/cmmlu/cmmlu_default_high_school_biology.yaml |multiple_choice |\n", - "|cmmlu_high_school_chemistry |lm_eval/tasks/cmmlu/cmmlu_high_school_chemistry.yaml |multiple_choice |\n", - "|cmmlu_high_school_geography |lm_eval/tasks/cmmlu/cmmlu_default_high_school_geography.yaml |multiple_choice |\n", - "|cmmlu_high_school_mathematics |lm_eval/tasks/cmmlu/cmmlu_default_high_school_mathematics.yaml |multiple_choice |\n", - "|cmmlu_high_school_physics |lm_eval/tasks/cmmlu/cmmlu_high_school_physics.yaml |multiple_choice |\n", - "|cmmlu_high_school_politics |lm_eval/tasks/cmmlu/cmmlu_high_school_politics.yaml |multiple_choice |\n", - "|cmmlu_human_sexuality |lm_eval/tasks/cmmlu/cmmlu_default_human_sexuality.yaml |multiple_choice |\n", - "|cmmlu_international_law |lm_eval/tasks/cmmlu/cmmlu_international_law.yaml |multiple_choice |\n", - "|cmmlu_journalism |lm_eval/tasks/cmmlu/cmmlu_default_journalism.yaml |multiple_choice |\n", - "|cmmlu_jurisprudence |lm_eval/tasks/cmmlu/cmmlu_jurisprudence.yaml |multiple_choice |\n", - "|cmmlu_legal_and_moral_basis |lm_eval/tasks/cmmlu/cmmlu_legal_and_moral_basis.yaml |multiple_choice |\n", - "|cmmlu_logical |lm_eval/tasks/cmmlu/cmmlu_logical.yaml |multiple_choice |\n", - "|cmmlu_machine_learning |lm_eval/tasks/cmmlu/cmmlu_default_machine_learning.yaml |multiple_choice |\n", - "|cmmlu_management |lm_eval/tasks/cmmlu/cmmlu_management.yaml |multiple_choice |\n", - "|cmmlu_marketing |lm_eval/tasks/cmmlu/cmmlu_default_marketing.yaml |multiple_choice |\n", - "|cmmlu_marxist_theory |lm_eval/tasks/cmmlu/cmmlu_default_marxist_theory.yaml |multiple_choice |\n", - "|cmmlu_modern_chinese |lm_eval/tasks/cmmlu/cmmlu_default_modern_chinese.yaml |multiple_choice |\n", - "|cmmlu_nutrition |lm_eval/tasks/cmmlu/cmmlu_default_nutrition.yaml |multiple_choice |\n", - "|cmmlu_philosophy |lm_eval/tasks/cmmlu/cmmlu_default_philosophy.yaml |multiple_choice |\n", - "|cmmlu_professional_accounting |lm_eval/tasks/cmmlu/cmmlu_default_professional_accounting.yaml |multiple_choice |\n", - "|cmmlu_professional_law |lm_eval/tasks/cmmlu/cmmlu_professional_law.yaml |multiple_choice |\n", - "|cmmlu_professional_medicine |lm_eval/tasks/cmmlu/cmmlu_professional_medicine.yaml |multiple_choice |\n", - "|cmmlu_professional_psychology |lm_eval/tasks/cmmlu/cmmlu_default_professional_psychology.yaml |multiple_choice |\n", - "|cmmlu_public_relations |lm_eval/tasks/cmmlu/cmmlu_default_public_relations.yaml |multiple_choice |\n", - "|cmmlu_security_study |lm_eval/tasks/cmmlu/cmmlu_default_security_study.yaml |multiple_choice |\n", - "|cmmlu_sociology |lm_eval/tasks/cmmlu/cmmlu_default_sociology.yaml |multiple_choice |\n", - "|cmmlu_sports_science |lm_eval/tasks/cmmlu/cmmlu_sports_science.yaml |multiple_choice |\n", - "|cmmlu_traditional_chinese_medicine |lm_eval/tasks/cmmlu/cmmlu_traditional_chinese_medicine.yaml |multiple_choice |\n", - "|cmmlu_virology |lm_eval/tasks/cmmlu/cmmlu_default_virology.yaml |multiple_choice |\n", - "|cmmlu_world_history |lm_eval/tasks/cmmlu/cmmlu_world_history.yaml |multiple_choice |\n", - "|cmmlu_world_religions |lm_eval/tasks/cmmlu/cmmlu_default_world_religions.yaml |multiple_choice |\n", - "|cnn_dailymail |lm_eval/tasks/unitxt/cnn_dailymail.yaml | |\n", - "|code2text_go |lm_eval/tasks/code_x_glue/code-text/go.yaml |generate_until |\n", - "|code2text_java |lm_eval/tasks/code_x_glue/code-text/java.yaml |generate_until |\n", - "|code2text_javascript |lm_eval/tasks/code_x_glue/code-text/javascript.yaml |generate_until |\n", - "|code2text_php |lm_eval/tasks/code_x_glue/code-text/php.yaml |generate_until |\n", - "|code2text_python |lm_eval/tasks/code_x_glue/code-text/python.yaml |generate_until |\n", - "|code2text_ruby |lm_eval/tasks/code_x_glue/code-text/ruby.yaml |generate_until |\n", - "|coedit_gec |lm_eval/tasks/unitxt/coedit_gec.yaml | |\n", - "|cola |lm_eval/tasks/glue/cola/default.yaml |multiple_choice |\n", - "|commonsense_qa |lm_eval/tasks/commonsense_qa/default.yaml |multiple_choice |\n", - "|copa |lm_eval/tasks/super_glue/copa/default.yaml |multiple_choice |\n", - "|copa_ar |lm_eval/tasks/alghafa/copa_ar/copa_ar.yaml |multiple_choice |\n", - "|copa_ca |lm_eval/tasks/catalan_bench/copa_ca.yaml |multiple_choice |\n", - "|copa_es |lm_eval/tasks/spanish_bench/copa_es.yaml |multiple_choice |\n", - "|copal_id_colloquial |lm_eval/tasks/copal_id/colloquial.yaml |multiple_choice |\n", - "|copal_id_standard |lm_eval/tasks/copal_id/standard.yaml |multiple_choice |\n", - "|coqa |lm_eval/tasks/coqa/default.yaml |generate_until |\n", - "|coqcat |lm_eval/tasks/catalan_bench/coqcat.yaml |generate_until |\n", - "|crows_pairs_english |lm_eval/tasks/crows_pairs/crows_pairs_english.yaml |multiple_choice |\n", - "|crows_pairs_english_age |lm_eval/tasks/crows_pairs/crows_pairs_english_age.yaml |multiple_choice |\n", - "|crows_pairs_english_autre |lm_eval/tasks/crows_pairs/crows_pairs_english_autre.yaml |multiple_choice |\n", - "|crows_pairs_english_disability |lm_eval/tasks/crows_pairs/crows_pairs_english_disability.yaml |multiple_choice |\n", - "|crows_pairs_english_gender |lm_eval/tasks/crows_pairs/crows_pairs_english_gender.yaml |multiple_choice |\n", - "|crows_pairs_english_nationality |lm_eval/tasks/crows_pairs/crows_pairs_english_nationality.yaml |multiple_choice |\n", - "|crows_pairs_english_physical_appearance |lm_eval/tasks/crows_pairs/crows_pairs_english_physical_appearance.yaml |multiple_choice |\n", - "|crows_pairs_english_race_color |lm_eval/tasks/crows_pairs/crows_pairs_english_race_color.yaml |multiple_choice |\n", - "|crows_pairs_english_religion |lm_eval/tasks/crows_pairs/crows_pairs_english_religion.yaml |multiple_choice |\n", - "|crows_pairs_english_sexual_orientation |lm_eval/tasks/crows_pairs/crows_pairs_english_sexual_orientation.yaml |multiple_choice |\n", - "|crows_pairs_english_socioeconomic |lm_eval/tasks/crows_pairs/crows_pairs_english_socioeconomic.yaml |multiple_choice |\n", - "|crows_pairs_french |lm_eval/tasks/crows_pairs/crows_pairs_french.yaml |multiple_choice |\n", - "|crows_pairs_french_age |lm_eval/tasks/crows_pairs/crows_pairs_french_age.yaml |multiple_choice |\n", - "|crows_pairs_french_autre |lm_eval/tasks/crows_pairs/crows_pairs_french_autre.yaml |multiple_choice |\n", - "|crows_pairs_french_disability |lm_eval/tasks/crows_pairs/crows_pairs_french_disability.yaml |multiple_choice |\n", - "|crows_pairs_french_gender |lm_eval/tasks/crows_pairs/crows_pairs_french_gender.yaml |multiple_choice |\n", - "|crows_pairs_french_nationality |lm_eval/tasks/crows_pairs/crows_pairs_french_nationality.yaml |multiple_choice |\n", - "|crows_pairs_french_physical_appearance |lm_eval/tasks/crows_pairs/crows_pairs_french_physical_appearance.yaml |multiple_choice |\n", - "|crows_pairs_french_race_color |lm_eval/tasks/crows_pairs/crows_pairs_french_race_color.yaml |multiple_choice |\n", - "|crows_pairs_french_religion |lm_eval/tasks/crows_pairs/crows_pairs_french_religion.yaml |multiple_choice |\n", - "|crows_pairs_french_sexual_orientation |lm_eval/tasks/crows_pairs/crows_pairs_french_sexual_orientation.yaml |multiple_choice |\n", - "|crows_pairs_french_socioeconomic |lm_eval/tasks/crows_pairs/crows_pairs_french_socioeconomic.yaml |multiple_choice |\n", - "|csatqa_gr |lm_eval/tasks/csatqa/csatqa_gr.yaml |multiple_choice |\n", - "|csatqa_li |lm_eval/tasks/csatqa/csatqa_li.yaml |multiple_choice |\n", - "|csatqa_rch |lm_eval/tasks/csatqa/csatqa_rch.yaml |multiple_choice |\n", - "|csatqa_rcs |lm_eval/tasks/csatqa/csatqa_rcs.yaml |multiple_choice |\n", - "|csatqa_rcss |lm_eval/tasks/csatqa/csatqa_rcss.yaml |multiple_choice |\n", - "|csatqa_wr |lm_eval/tasks/csatqa/csatqa_wr.yaml |multiple_choice |\n", - "|cycle_letters |lm_eval/tasks/unscramble/cycle_letters.yaml |generate_until |\n", - "|dbpedia_14 |lm_eval/tasks/unitxt/dbpedia_14.yaml | |\n", - "|doc_vqa |lm_eval/tasks/unitxt/doc_vqa.yaml | |\n", - "|drop |lm_eval/tasks/drop/default.yaml |generate_until |\n", - "|epec_koref_bin |lm_eval/tasks/basqueglue/coref.yaml |multiple_choice |\n", - "|eq_bench |lm_eval/tasks/eq_bench/default.yaml |generate_until |\n", - "|escola |lm_eval/tasks/spanish_bench/escola.yaml |multiple_choice |\n", - "|ethics_cm |lm_eval/tasks/hendrycks_ethics/commonsense.yaml |multiple_choice |\n", - "|ethics_deontology |lm_eval/tasks/hendrycks_ethics/deontology.yaml |multiple_choice |\n", - "|ethics_justice |lm_eval/tasks/hendrycks_ethics/justice.yaml |multiple_choice |\n", - "|ethics_utilitarianism |lm_eval/tasks/hendrycks_ethics/utilitarianism.yaml |multiple_choice |\n", - "|ethics_virtue |lm_eval/tasks/hendrycks_ethics/virtue.yaml |multiple_choice |\n", - "|ethos_binary |lm_eval/tasks/unitxt/ethos_binary.yaml | |\n", - "|eus_exams_es_ejadministrativo |lm_eval/tasks/eus_exams/eus_exams_es_ejadministrativo.yaml |multiple_choice |\n", - "|eus_exams_es_ejauxiliar |lm_eval/tasks/eus_exams/eus_exams_es_ejauxiliar.yaml |multiple_choice |\n", - "|eus_exams_es_ejsubalterno |lm_eval/tasks/eus_exams/eus_exams_es_ejsubalterno.yaml |multiple_choice |\n", - "|eus_exams_es_ejtecnico |lm_eval/tasks/eus_exams/eus_exams_es_ejtecnico.yaml |multiple_choice |\n", - "|eus_exams_es_opeayuntamientovitoria |lm_eval/tasks/eus_exams/eus_exams_es_opeayuntamientovitoria.yaml |multiple_choice |\n", - "|eus_exams_es_opebilbao |lm_eval/tasks/eus_exams/eus_exams_es_opebilbao.yaml |multiple_choice |\n", - "|eus_exams_es_opeehuadmin |lm_eval/tasks/eus_exams/eus_exams_es_opeehuadmin.yaml |multiple_choice |\n", - "|eus_exams_es_opeehuaux |lm_eval/tasks/eus_exams/eus_exams_es_opeehuaux.yaml |multiple_choice |\n", - "|eus_exams_es_opeehubiblio |lm_eval/tasks/eus_exams/eus_exams_es_opeehubiblio.yaml |multiple_choice |\n", - "|eus_exams_es_opeehuderecho |lm_eval/tasks/eus_exams/eus_exams_es_opeehuderecho.yaml |multiple_choice |\n", - "|eus_exams_es_opeehueconomicas |lm_eval/tasks/eus_exams/eus_exams_es_opeehueconomicas.yaml |multiple_choice |\n", - "|eus_exams_es_opeehuempresariales |lm_eval/tasks/eus_exams/eus_exams_es_opeehuempresariales.yaml |multiple_choice |\n", - "|eus_exams_es_opeehusubalterno |lm_eval/tasks/eus_exams/eus_exams_es_opeehusubalterno.yaml |multiple_choice |\n", - "|eus_exams_es_opeehutecnico |lm_eval/tasks/eus_exams/eus_exams_es_opeehutecnico.yaml |multiple_choice |\n", - "|eus_exams_es_opeehutecnicob |lm_eval/tasks/eus_exams/eus_exams_es_opeehutecnicob.yaml |multiple_choice |\n", - "|eus_exams_es_opeosakiadmin |lm_eval/tasks/eus_exams/eus_exams_es_opeosakiadmin.yaml |multiple_choice |\n", - "|eus_exams_es_opeosakiaux |lm_eval/tasks/eus_exams/eus_exams_es_opeosakiaux.yaml |multiple_choice |\n", - "|eus_exams_es_opeosakiauxenf |lm_eval/tasks/eus_exams/eus_exams_es_opeosakiauxenf.yaml |multiple_choice |\n", - "|eus_exams_es_opeosakicelador |lm_eval/tasks/eus_exams/eus_exams_es_opeosakicelador.yaml |multiple_choice |\n", - "|eus_exams_es_opeosakienf |lm_eval/tasks/eus_exams/eus_exams_es_opeosakienf.yaml |multiple_choice |\n", - "|eus_exams_es_opeosakijuridico |lm_eval/tasks/eus_exams/eus_exams_es_opeosakijuridico.yaml |multiple_choice |\n", - "|eus_exams_es_opeosakioperario |lm_eval/tasks/eus_exams/eus_exams_es_opeosakioperario.yaml |multiple_choice |\n", - "|eus_exams_es_opeosakitecnico |lm_eval/tasks/eus_exams/eus_exams_es_opeosakitecnico.yaml |multiple_choice |\n", - "|eus_exams_es_opeosakivarios |lm_eval/tasks/eus_exams/eus_exams_es_opeosakivarios.yaml |multiple_choice |\n", - "|eus_exams_es_osakidetza1c |lm_eval/tasks/eus_exams/eus_exams_es_osakidetza1c.yaml |multiple_choice |\n", - "|eus_exams_es_osakidetza2c |lm_eval/tasks/eus_exams/eus_exams_es_osakidetza2c.yaml |multiple_choice |\n", - "|eus_exams_es_osakidetza3c |lm_eval/tasks/eus_exams/eus_exams_es_osakidetza3c.yaml |multiple_choice |\n", - "|eus_exams_es_osakidetza4c |lm_eval/tasks/eus_exams/eus_exams_es_osakidetza4c.yaml |multiple_choice |\n", - "|eus_exams_es_osakidetza5c |lm_eval/tasks/eus_exams/eus_exams_es_osakidetza5c.yaml |multiple_choice |\n", - "|eus_exams_es_osakidetza6c |lm_eval/tasks/eus_exams/eus_exams_es_osakidetza6c.yaml |multiple_choice |\n", - "|eus_exams_es_osakidetza7c |lm_eval/tasks/eus_exams/eus_exams_es_osakidetza7c.yaml |multiple_choice |\n", - "|eus_exams_es_osakidetza8c |lm_eval/tasks/eus_exams/eus_exams_es_osakidetza8c.yaml |multiple_choice |\n", - "|eus_exams_es_osakidetza9c |lm_eval/tasks/eus_exams/eus_exams_es_osakidetza9c.yaml |multiple_choice |\n", - "|eus_exams_eu_ejadministrari |lm_eval/tasks/eus_exams/eus_exams_eu_ejadministrari.yaml |multiple_choice |\n", - "|eus_exams_eu_ejlaguntza |lm_eval/tasks/eus_exams/eus_exams_eu_ejlaguntza.yaml |multiple_choice |\n", - "|eus_exams_eu_ejlaguntzaile |lm_eval/tasks/eus_exams/eus_exams_eu_ejlaguntzaile.yaml |multiple_choice |\n", - "|eus_exams_eu_ejteknikari |lm_eval/tasks/eus_exams/eus_exams_eu_ejteknikari.yaml |multiple_choice |\n", - "|eus_exams_eu_opebilbaoeu |lm_eval/tasks/eus_exams/eus_exams_eu_opebilbaoeu.yaml |multiple_choice |\n", - "|eus_exams_eu_opeehuadmineu |lm_eval/tasks/eus_exams/eus_exams_eu_opeehuadmineu.yaml |multiple_choice |\n", - "|eus_exams_eu_opeehuauxeu |lm_eval/tasks/eus_exams/eus_exams_eu_opeehuauxeu.yaml |multiple_choice |\n", - "|eus_exams_eu_opeehubiblioeu |lm_eval/tasks/eus_exams/eus_exams_eu_opeehubiblioeu.yaml |multiple_choice |\n", - "|eus_exams_eu_opeehuderechoeu |lm_eval/tasks/eus_exams/eus_exams_eu_opeehuderechoeu.yaml |multiple_choice |\n", - "|eus_exams_eu_opeehueconomicaseu |lm_eval/tasks/eus_exams/eus_exams_eu_opeehueconomicaseu.yaml |multiple_choice |\n", - "|eus_exams_eu_opeehuempresarialeseu |lm_eval/tasks/eus_exams/eus_exams_eu_opeehuempresarialeseu.yaml |multiple_choice |\n", - "|eus_exams_eu_opeehusubalternoeu |lm_eval/tasks/eus_exams/eus_exams_eu_opeehusubalternoeu.yaml |multiple_choice |\n", - "|eus_exams_eu_opeehutecnicoeu |lm_eval/tasks/eus_exams/eus_exams_eu_opeehutecnicoeu.yaml |multiple_choice |\n", - "|eus_exams_eu_opeehuteknikarib |lm_eval/tasks/eus_exams/eus_exams_eu_opeehuteknikarib.yaml |multiple_choice |\n", - "|eus_exams_eu_opegasteizkoudala |lm_eval/tasks/eus_exams/eus_exams_eu_opegasteizkoudala.yaml |multiple_choice |\n", - "|eus_exams_eu_opeosakiadmineu |lm_eval/tasks/eus_exams/eus_exams_eu_opeosakiadmineu.yaml |multiple_choice |\n", - "|eus_exams_eu_opeosakiauxenfeu |lm_eval/tasks/eus_exams/eus_exams_eu_opeosakiauxenfeu.yaml |multiple_choice |\n", - "|eus_exams_eu_opeosakiauxeu |lm_eval/tasks/eus_exams/eus_exams_eu_opeosakiauxeu.yaml |multiple_choice |\n", - "|eus_exams_eu_opeosakiceladoreu |lm_eval/tasks/eus_exams/eus_exams_eu_opeosakiceladoreu.yaml |multiple_choice |\n", - "|eus_exams_eu_opeosakienfeu |lm_eval/tasks/eus_exams/eus_exams_eu_opeosakienfeu.yaml |multiple_choice |\n", - "|eus_exams_eu_opeosakioperarioeu |lm_eval/tasks/eus_exams/eus_exams_eu_opeosakioperarioeu.yaml |multiple_choice |\n", - "|eus_exams_eu_opeosakitecnicoeu |lm_eval/tasks/eus_exams/eus_exams_eu_opeosakitecnicoeu.yaml |multiple_choice |\n", - "|eus_exams_eu_opeosakivarioseu |lm_eval/tasks/eus_exams/eus_exams_eu_opeosakivarioseu.yaml |multiple_choice |\n", - "|eus_exams_eu_osakidetza1e |lm_eval/tasks/eus_exams/eus_exams_eu_osakidetza1e.yaml |multiple_choice |\n", - "|eus_exams_eu_osakidetza2e |lm_eval/tasks/eus_exams/eus_exams_eu_osakidetza2e.yaml |multiple_choice |\n", - "|eus_exams_eu_osakidetza3e |lm_eval/tasks/eus_exams/eus_exams_eu_osakidetza3e.yaml |multiple_choice |\n", - "|eus_exams_eu_osakidetza5e |lm_eval/tasks/eus_exams/eus_exams_eu_osakidetza5e.yaml |multiple_choice |\n", - "|eus_exams_eu_osakidetza6e |lm_eval/tasks/eus_exams/eus_exams_eu_osakidetza6e.yaml |multiple_choice |\n", - "|eus_exams_eu_osakidetza7e |lm_eval/tasks/eus_exams/eus_exams_eu_osakidetza7e.yaml |multiple_choice |\n", - "|eus_proficiency |lm_eval/tasks/eus_proficiency/eus_proficiency.yaml |multiple_choice |\n", - "|eus_reading |lm_eval/tasks/eus_reading/eus_reading.yaml |multiple_choice |\n", - "|eus_trivia |lm_eval/tasks/eus_trivia/eus_trivia.yaml |multiple_choice |\n", - "|fda |lm_eval/tasks/fda/fda.yaml | |\n", - "|financial_tweets |lm_eval/tasks/unitxt/financial_tweets.yaml | |\n", - "|fld_default |lm_eval/tasks/fld/fld_default.yaml | |\n", - "|fld_logical_formula_default |lm_eval/tasks/fld/fld_logical_formula_default.yaml | |\n", - "|fld_logical_formula_star |lm_eval/tasks/fld/fld_logical_formula_star.yaml | |\n", - "|fld_star |lm_eval/tasks/fld/fld_star.yaml | |\n", - "|flores_ca-de |lm_eval/tasks/catalan_bench/flores_ca/flores_ca-de.yaml |generate_until |\n", - "|flores_ca-en |lm_eval/tasks/catalan_bench/flores_ca/flores_ca-en.yaml |generate_until |\n", - "|flores_ca-es |lm_eval/tasks/spanish_bench/flores_es/flores_ca-es.yaml |generate_until |\n", - "|flores_ca-eu |lm_eval/tasks/basque_bench/flores_eu/flores_ca-eu.yaml |generate_until |\n", - "|flores_ca-fr |lm_eval/tasks/catalan_bench/flores_ca/flores_ca-fr.yaml |generate_until |\n", - "|flores_ca-gl |lm_eval/tasks/catalan_bench/flores_ca/flores_ca-gl.yaml |generate_until |\n", - "|flores_ca-it |lm_eval/tasks/catalan_bench/flores_ca/flores_ca-it.yaml |generate_until |\n", - "|flores_ca-pt |lm_eval/tasks/catalan_bench/flores_ca/flores_ca-pt.yaml |generate_until |\n", - "|flores_de-ca |lm_eval/tasks/catalan_bench/flores_ca/flores_de-ca.yaml |generate_until |\n", - "|flores_de-es |lm_eval/tasks/spanish_bench/flores_es/flores_de-es.yaml |generate_until |\n", - "|flores_de-eu |lm_eval/tasks/basque_bench/flores_eu/flores_de-eu.yaml |generate_until |\n", - "|flores_de-gl |lm_eval/tasks/galician_bench/flores_gl/flores_de-gl.yaml |generate_until |\n", - "|flores_de-pt |lm_eval/tasks/portuguese_bench/flores_pt/flores_de-pt.yaml |generate_until |\n", - "|flores_en-ca |lm_eval/tasks/catalan_bench/flores_ca/flores_en-ca.yaml |generate_until |\n", - "|flores_en-es |lm_eval/tasks/spanish_bench/flores_es/flores_en-es.yaml |generate_until |\n", - "|flores_en-eu |lm_eval/tasks/basque_bench/flores_eu/flores_en-eu.yaml |generate_until |\n", - "|flores_en-gl |lm_eval/tasks/galician_bench/flores_gl/flores_en-gl.yaml |generate_until |\n", - "|flores_en-pt |lm_eval/tasks/portuguese_bench/flores_pt/flores_en-pt.yaml |generate_until |\n", - "|flores_es-ca |lm_eval/tasks/spanish_bench/flores_es/flores_es-ca.yaml |generate_until |\n", - "|flores_es-de |lm_eval/tasks/spanish_bench/flores_es/flores_es-de.yaml |generate_until |\n", - "|flores_es-en |lm_eval/tasks/spanish_bench/flores_es/flores_es-en.yaml |generate_until |\n", - "|flores_es-eu |lm_eval/tasks/basque_bench/flores_eu/flores_es-eu.yaml |generate_until |\n", - "|flores_es-fr |lm_eval/tasks/spanish_bench/flores_es/flores_es-fr.yaml |generate_until |\n", - "|flores_es-gl |lm_eval/tasks/spanish_bench/flores_es/flores_es-gl.yaml |generate_until |\n", - "|flores_es-it |lm_eval/tasks/spanish_bench/flores_es/flores_es-it.yaml |generate_until |\n", - "|flores_es-pt |lm_eval/tasks/spanish_bench/flores_es/flores_es-pt.yaml |generate_until |\n", - "|flores_eu-ca |lm_eval/tasks/basque_bench/flores_eu/flores_eu-ca.yaml |generate_until |\n", - "|flores_eu-de |lm_eval/tasks/basque_bench/flores_eu/flores_eu-de.yaml |generate_until |\n", - "|flores_eu-en |lm_eval/tasks/basque_bench/flores_eu/flores_eu-en.yaml |generate_until |\n", - "|flores_eu-es |lm_eval/tasks/basque_bench/flores_eu/flores_eu-es.yaml |generate_until |\n", - "|flores_eu-fr |lm_eval/tasks/basque_bench/flores_eu/flores_eu-fr.yaml |generate_until |\n", - "|flores_eu-gl |lm_eval/tasks/basque_bench/flores_eu/flores_eu-gl.yaml |generate_until |\n", - "|flores_eu-it |lm_eval/tasks/basque_bench/flores_eu/flores_eu-it.yaml |generate_until |\n", - "|flores_eu-pt |lm_eval/tasks/basque_bench/flores_eu/flores_eu-pt.yaml |generate_until |\n", - "|flores_fr-ca |lm_eval/tasks/catalan_bench/flores_ca/flores_fr-ca.yaml |generate_until |\n", - "|flores_fr-es |lm_eval/tasks/spanish_bench/flores_es/flores_fr-es.yaml |generate_until |\n", - "|flores_fr-eu |lm_eval/tasks/basque_bench/flores_eu/flores_fr-eu.yaml |generate_until |\n", - "|flores_fr-gl |lm_eval/tasks/galician_bench/flores_gl/flores_fr-gl.yaml |generate_until |\n", - "|flores_fr-pt |lm_eval/tasks/portuguese_bench/flores_pt/flores_fr-pt.yaml |generate_until |\n", - "|flores_gl-ca |lm_eval/tasks/catalan_bench/flores_ca/flores_gl-ca.yaml |generate_until |\n", - "|flores_gl-de |lm_eval/tasks/galician_bench/flores_gl/flores_gl-de.yaml |generate_until |\n", - "|flores_gl-en |lm_eval/tasks/galician_bench/flores_gl/flores_gl-en.yaml |generate_until |\n", - "|flores_gl-es |lm_eval/tasks/spanish_bench/flores_es/flores_gl-es.yaml |generate_until |\n", - "|flores_gl-eu |lm_eval/tasks/basque_bench/flores_eu/flores_gl-eu.yaml |generate_until |\n", - "|flores_gl-fr |lm_eval/tasks/galician_bench/flores_gl/flores_gl-fr.yaml |generate_until |\n", - "|flores_gl-it |lm_eval/tasks/galician_bench/flores_gl/flores_gl-it.yaml |generate_until |\n", - "|flores_gl-pt |lm_eval/tasks/portuguese_bench/flores_pt/flores_gl-pt.yaml |generate_until |\n", - "|flores_it-ca |lm_eval/tasks/catalan_bench/flores_ca/flores_it-ca.yaml |generate_until |\n", - "|flores_it-es |lm_eval/tasks/spanish_bench/flores_es/flores_it-es.yaml |generate_until |\n", - "|flores_it-eu |lm_eval/tasks/basque_bench/flores_eu/flores_it-eu.yaml |generate_until |\n", - "|flores_it-gl |lm_eval/tasks/galician_bench/flores_gl/flores_it-gl.yaml |generate_until |\n", - "|flores_it-pt |lm_eval/tasks/portuguese_bench/flores_pt/flores_it-pt.yaml |generate_until |\n", - "|flores_pt-ca |lm_eval/tasks/catalan_bench/flores_ca/flores_pt-ca.yaml |generate_until |\n", - "|flores_pt-de |lm_eval/tasks/portuguese_bench/flores_pt/flores_pt-de.yaml |generate_until |\n", - "|flores_pt-en |lm_eval/tasks/portuguese_bench/flores_pt/flores_pt-en.yaml |generate_until |\n", - "|flores_pt-es |lm_eval/tasks/spanish_bench/flores_es/flores_pt-es.yaml |generate_until |\n", - "|flores_pt-eu |lm_eval/tasks/basque_bench/flores_eu/flores_pt-eu.yaml |generate_until |\n", - "|flores_pt-fr |lm_eval/tasks/portuguese_bench/flores_pt/flores_pt-fr.yaml |generate_until |\n", - "|flores_pt-gl |lm_eval/tasks/portuguese_bench/flores_pt/flores_pt-gl.yaml |generate_until |\n", - "|flores_pt-it |lm_eval/tasks/portuguese_bench/flores_pt/flores_pt-it.yaml |generate_until |\n", - "|french_bench_arc_challenge |lm_eval/tasks/french_bench/french_bench_arc_challenge.yaml |multiple_choice |\n", - "|french_bench_boolqa |lm_eval/tasks/french_bench/french_bench_boolqa.yaml |multiple_choice |\n", - "|french_bench_fquadv2 |lm_eval/tasks/french_bench/french_bench_fquadv2.yaml |generate_until |\n", - "|french_bench_fquadv2_bool |lm_eval/tasks/french_bench/french_bench_fquadv2_bool.yaml |multiple_choice |\n", - "|french_bench_fquadv2_genq |lm_eval/tasks/french_bench/french_bench_fquadv2_genq.yaml |generate_until |\n", - "|french_bench_fquadv2_hasAns |lm_eval/tasks/french_bench/french_bench_fquadv2_hasAns.yaml |generate_until |\n", - "|french_bench_grammar |lm_eval/tasks/french_bench/french_bench_grammar.yaml |multiple_choice |\n", - "|french_bench_hellaswag |lm_eval/tasks/french_bench/french_bench_hellaswag.yaml |multiple_choice |\n", - "|french_bench_multifquad |lm_eval/tasks/french_bench/french_bench_multifquad.yaml |generate_until |\n", - "|french_bench_opus_perplexity |lm_eval/tasks/french_bench/french_bench_opus_perplexity.yaml |loglikelihood_rolling|\n", - "|french_bench_orangesum_abstract |lm_eval/tasks/french_bench/french_bench_orangesum_abstract.yaml |generate_until |\n", - "|french_bench_orangesum_title |lm_eval/tasks/french_bench/french_bench_orangesum_title.yaml |generate_until |\n", - "|french_bench_reading_comp |lm_eval/tasks/french_bench/french_bench_reading_comp.yaml |multiple_choice |\n", - "|french_bench_topic_based_nli |lm_eval/tasks/french_bench/french_bench_topic_based_nli.yaml |multiple_choice |\n", - "|french_bench_trivia |lm_eval/tasks/french_bench/french_bench_trivia.yaml |generate_until |\n", - "|french_bench_vocab |lm_eval/tasks/french_bench/french_bench_vocab.yaml |multiple_choice |\n", - "|french_bench_wikitext_fr |lm_eval/tasks/french_bench/french_bench_wikitext_fr.yaml |loglikelihood_rolling|\n", - "|french_bench_xnli |lm_eval/tasks/french_bench/french_bench_xnli.yaml |multiple_choice |\n", - "|galcola |lm_eval/tasks/galician_bench/galcola.yaml |multiple_choice |\n", - "|glianorex |lm_eval/tasks/glianorex/glianorex.yaml |multiple_choice |\n", - "|glianorex_en |lm_eval/tasks/glianorex/glianorex_en.yaml |multiple_choice |\n", - "|glianorex_fr |lm_eval/tasks/glianorex/glianorex_fr.yaml |multiple_choice |\n", - "|gpqa_diamond_cot_n_shot |lm_eval/tasks/gpqa/cot_n_shot/gpqa_diamond_cot_n_shot.yaml |generate_until |\n", - "|gpqa_diamond_cot_zeroshot |lm_eval/tasks/gpqa/cot_zeroshot/gpqa_diamond_cot_zeroshot.yaml |generate_until |\n", - "|gpqa_diamond_generative_n_shot |lm_eval/tasks/gpqa/generative/gpqa_diamond_generative_n_shot.yaml |generate_until |\n", - "|gpqa_diamond_n_shot |lm_eval/tasks/gpqa/n_shot/gpqa_diamond_n_shot.yaml |multiple_choice |\n", - "|gpqa_diamond_zeroshot |lm_eval/tasks/gpqa/zeroshot/gpqa_diamond_zeroshot.yaml |multiple_choice |\n", - "|gpqa_extended_cot_n_shot |lm_eval/tasks/gpqa/cot_n_shot/gpqa_extended_cot_n_shot.yaml |generate_until |\n", - "|gpqa_extended_cot_zeroshot |lm_eval/tasks/gpqa/cot_zeroshot/gpqa_extended_cot_zeroshot.yaml |generate_until |\n", - "|gpqa_extended_generative_n_shot |lm_eval/tasks/gpqa/generative/gpqa_extended_generative_n_shot.yaml |generate_until |\n", - "|gpqa_extended_n_shot |lm_eval/tasks/gpqa/n_shot/gpqa_extended_n_shot.yaml |multiple_choice |\n", - "|gpqa_extended_zeroshot |lm_eval/tasks/gpqa/zeroshot/gpqa_extended_zeroshot.yaml |multiple_choice |\n", - "|gpqa_main_cot_n_shot |lm_eval/tasks/gpqa/cot_n_shot/gpqa_main_cot_n_shot.yaml |generate_until |\n", - "|gpqa_main_cot_zeroshot |lm_eval/tasks/gpqa/cot_zeroshot/gpqa_main_cot_zeroshot.yaml |generate_until |\n", - "|gpqa_main_generative_n_shot |lm_eval/tasks/gpqa/generative/gpqa_main_generative_n_shot.yaml |generate_until |\n", - "|gpqa_main_n_shot |lm_eval/tasks/gpqa/n_shot/gpqa_main_n_shot.yaml |multiple_choice |\n", - "|gpqa_main_zeroshot |lm_eval/tasks/gpqa/zeroshot/gpqa_main_zeroshot.yaml |multiple_choice |\n", - "|gsm8k |lm_eval/tasks/gsm8k/gsm8k.yaml |generate_until |\n", - "|gsm8k_cot |lm_eval/tasks/gsm8k/gsm8k-cot.yaml |generate_until |\n", - "|gsm8k_cot_llama |lm_eval/tasks/gsm8k/gsm8k-cot-llama.yaml |generate_until |\n", - "|gsm8k_cot_self_consistency |lm_eval/tasks/gsm8k/gsm8k-cot-self-consistency.yaml |generate_until |\n", - "|gsm8k_cot_zeroshot |lm_eval/tasks/gsm8k/gsm8k-cot-zeroshot.yaml |generate_until |\n", - "|gsm_plus |lm_eval/tasks/gsm_plus/gsm_plus.yaml |generate_until |\n", - "|gsm_plus_mini |lm_eval/tasks/gsm_plus/gsm_plus_mini.yaml |generate_until |\n", - "|haerae_general_knowledge |lm_eval/tasks/haerae/haerae_gk.yaml |multiple_choice |\n", - "|haerae_history |lm_eval/tasks/haerae/haerae_hi.yaml |multiple_choice |\n", - "|haerae_loan_word |lm_eval/tasks/haerae/haerae_lw.yaml |multiple_choice |\n", - "|haerae_rare_word |lm_eval/tasks/haerae/haerae_rw.yaml |multiple_choice |\n", - "|haerae_standard_nomenclature |lm_eval/tasks/haerae/haerae_sn.yaml |multiple_choice |\n", - "|headqa_en |lm_eval/tasks/headqa/headqa_en.yaml |multiple_choice |\n", - "|headqa_es |lm_eval/tasks/headqa/headqa_es.yaml |multiple_choice |\n", - "|hellaswag |lm_eval/tasks/hellaswag/hellaswag.yaml |multiple_choice |\n", - "|hellaswag_ar |lm_eval/tasks/okapi/hellaswag_multilingual/hellaswag_ar.yaml |multiple_choice |\n", - "|hellaswag_bn |lm_eval/tasks/okapi/hellaswag_multilingual/hellaswag_bn.yaml |multiple_choice |\n", - "|hellaswag_ca |lm_eval/tasks/okapi/hellaswag_multilingual/hellaswag_ca.yaml |multiple_choice |\n", - "|hellaswag_da |lm_eval/tasks/okapi/hellaswag_multilingual/hellaswag_da.yaml |multiple_choice |\n", - "|hellaswag_de |lm_eval/tasks/okapi/hellaswag_multilingual/hellaswag_de.yaml |multiple_choice |\n", - "|hellaswag_es |lm_eval/tasks/okapi/hellaswag_multilingual/hellaswag_es.yaml |multiple_choice |\n", - "|hellaswag_eu |lm_eval/tasks/okapi/hellaswag_multilingual/hellaswag_eu.yaml |multiple_choice |\n", - "|hellaswag_fr |lm_eval/tasks/okapi/hellaswag_multilingual/hellaswag_fr.yaml |multiple_choice |\n", - "|hellaswag_gu |lm_eval/tasks/okapi/hellaswag_multilingual/hellaswag_gu.yaml |multiple_choice |\n", - "|hellaswag_hi |lm_eval/tasks/okapi/hellaswag_multilingual/hellaswag_hi.yaml |multiple_choice |\n", - "|hellaswag_hr |lm_eval/tasks/okapi/hellaswag_multilingual/hellaswag_hr.yaml |multiple_choice |\n", - "|hellaswag_hu |lm_eval/tasks/okapi/hellaswag_multilingual/hellaswag_hu.yaml |multiple_choice |\n", - "|hellaswag_hy |lm_eval/tasks/okapi/hellaswag_multilingual/hellaswag_hy.yaml |multiple_choice |\n", - "|hellaswag_id |lm_eval/tasks/okapi/hellaswag_multilingual/hellaswag_id.yaml |multiple_choice |\n", - "|hellaswag_it |lm_eval/tasks/okapi/hellaswag_multilingual/hellaswag_it.yaml |multiple_choice |\n", - "|hellaswag_kn |lm_eval/tasks/okapi/hellaswag_multilingual/hellaswag_kn.yaml |multiple_choice |\n", - "|hellaswag_ml |lm_eval/tasks/okapi/hellaswag_multilingual/hellaswag_ml.yaml |multiple_choice |\n", - "|hellaswag_mr |lm_eval/tasks/okapi/hellaswag_multilingual/hellaswag_mr.yaml |multiple_choice |\n", - "|hellaswag_ne |lm_eval/tasks/okapi/hellaswag_multilingual/hellaswag_ne.yaml |multiple_choice |\n", - "|hellaswag_nl |lm_eval/tasks/okapi/hellaswag_multilingual/hellaswag_nl.yaml |multiple_choice |\n", - "|hellaswag_pt |lm_eval/tasks/okapi/hellaswag_multilingual/hellaswag_pt.yaml |multiple_choice |\n", - "|hellaswag_ro |lm_eval/tasks/okapi/hellaswag_multilingual/hellaswag_ro.yaml |multiple_choice |\n", - "|hellaswag_ru |lm_eval/tasks/okapi/hellaswag_multilingual/hellaswag_ru.yaml |multiple_choice |\n", - "|hellaswag_sk |lm_eval/tasks/okapi/hellaswag_multilingual/hellaswag_sk.yaml |multiple_choice |\n", - "|hellaswag_sr |lm_eval/tasks/okapi/hellaswag_multilingual/hellaswag_sr.yaml |multiple_choice |\n", - "|hellaswag_sv |lm_eval/tasks/okapi/hellaswag_multilingual/hellaswag_sv.yaml |multiple_choice |\n", - "|hellaswag_ta |lm_eval/tasks/okapi/hellaswag_multilingual/hellaswag_ta.yaml |multiple_choice |\n", - "|hellaswag_te |lm_eval/tasks/okapi/hellaswag_multilingual/hellaswag_te.yaml |multiple_choice |\n", - "|hellaswag_uk |lm_eval/tasks/okapi/hellaswag_multilingual/hellaswag_uk.yaml |multiple_choice |\n", - "|hellaswag_vi |lm_eval/tasks/okapi/hellaswag_multilingual/hellaswag_vi.yaml |multiple_choice |\n", - "|hendrycks_math_algebra |lm_eval/tasks/hendrycks_math/hendrycks_math_algebra.yaml |generate_until |\n", - "|hendrycks_math_counting_and_prob |lm_eval/tasks/hendrycks_math/hendrycks_math_counting_and_prob.yaml |generate_until |\n", - "|hendrycks_math_geometry |lm_eval/tasks/hendrycks_math/hendrycks_math_geometry.yaml |generate_until |\n", - "|hendrycks_math_intermediate_algebra |lm_eval/tasks/hendrycks_math/hendrycks_math_intermediate_algebra.yaml |generate_until |\n", - "|hendrycks_math_num_theory |lm_eval/tasks/hendrycks_math/hendrycks_math_num_theory.yaml |generate_until |\n", - "|hendrycks_math_prealgebra |lm_eval/tasks/hendrycks_math/hendrycks_math_prealgebra.yaml |generate_until |\n", - "|hendrycks_math_precalc |lm_eval/tasks/hendrycks_math/hendrycks_math_precalc.yaml |generate_until |\n", - "|ifeval |lm_eval/tasks/ifeval/ifeval.yaml |generate_until |\n", - "|inverse_scaling_hindsight_neglect_10shot |lm_eval/tasks/inverse_scaling/inverse_scaling_hindsight_neglect.yaml |multiple_choice |\n", - "|inverse_scaling_into_the_unknown |lm_eval/tasks/inverse_scaling/inverse_scaling_into_the_unknown.yaml |multiple_choice |\n", - "|inverse_scaling_memo_trap |lm_eval/tasks/inverse_scaling/inverse_scaling_memo_trap.yaml |multiple_choice |\n", - "|inverse_scaling_modus_tollens |lm_eval/tasks/inverse_scaling/inverse_scaling_modus_tollens.yaml |multiple_choice |\n", - "|inverse_scaling_neqa |lm_eval/tasks/inverse_scaling/inverse_scaling_neqa.yaml |multiple_choice |\n", - "|inverse_scaling_pattern_matching_suppression |lm_eval/tasks/inverse_scaling/inverse_scaling_pattern_matching_suppression.yaml |multiple_choice |\n", - "|inverse_scaling_quote_repetition |lm_eval/tasks/inverse_scaling/inverse_scaling_quote_repetition.yaml |multiple_choice |\n", - "|inverse_scaling_redefine_math |lm_eval/tasks/inverse_scaling/inverse_scaling_redefine_math.yaml |multiple_choice |\n", - "|inverse_scaling_repetitive_algebra |lm_eval/tasks/inverse_scaling/inverse_scaling_repetitive_algebra.yaml |multiple_choice |\n", - "|inverse_scaling_sig_figs |lm_eval/tasks/inverse_scaling/inverse_scaling_sig_figs.yaml |multiple_choice |\n", - "|inverse_scaling_winobias_antistereotype |lm_eval/tasks/inverse_scaling/inverse_scaling_winobias_antistereotype.yaml |multiple_choice |\n", - "|iwslt2017-ar-en |lm_eval/tasks/translation/iwslt2017_ar-en.yaml |generate_until |\n", - "|iwslt2017-en-ar |lm_eval/tasks/translation/iwslt2017_en-ar.yaml |generate_until |\n", - "|ja_leaderboard_jaqket_v2 |lm_eval/tasks/japanese_leaderboard/ja_leaderboard_jaqket_v2.yaml |generate_until |\n", - "|ja_leaderboard_jcommonsenseqa |lm_eval/tasks/japanese_leaderboard/ja_leaderboard_jcommonsenseqa.yaml |multiple_choice |\n", - "|ja_leaderboard_jnli |lm_eval/tasks/japanese_leaderboard/ja_leaderboard_jnli.yaml |multiple_choice |\n", - "|ja_leaderboard_jsquad |lm_eval/tasks/japanese_leaderboard/ja_leaderboard_jsquad.yaml |generate_until |\n", - "|ja_leaderboard_marc_ja |lm_eval/tasks/japanese_leaderboard/ja_leaderboard_marc_ja.yaml |multiple_choice |\n", - "|ja_leaderboard_mgsm |lm_eval/tasks/japanese_leaderboard/ja_leaderboard_mgsm.yaml |generate_until |\n", - "|ja_leaderboard_xlsum |lm_eval/tasks/japanese_leaderboard/ja_leaderboard_xlsum.yaml |generate_until |\n", - "|ja_leaderboard_xwinograd |lm_eval/tasks/japanese_leaderboard/ja_leaderboard_xwinograd.yaml |multiple_choice |\n", - "|kbl_bar_exam_em_civil_2012 |lm_eval/tasks/kbl/bar_exam/civil/kbl_bar_exam_em_civil_2012.yaml |generate_until |\n", - "|kbl_bar_exam_em_civil_2013 |lm_eval/tasks/kbl/bar_exam/civil/kbl_bar_exam_em_civil_2013.yaml |generate_until |\n", - "|kbl_bar_exam_em_civil_2014 |lm_eval/tasks/kbl/bar_exam/civil/kbl_bar_exam_em_civil_2014.yaml |generate_until |\n", - "|kbl_bar_exam_em_civil_2015 |lm_eval/tasks/kbl/bar_exam/civil/kbl_bar_exam_em_civil_2015.yaml |generate_until |\n", - "|kbl_bar_exam_em_civil_2016 |lm_eval/tasks/kbl/bar_exam/civil/kbl_bar_exam_em_civil_2016.yaml |generate_until |\n", - "|kbl_bar_exam_em_civil_2017 |lm_eval/tasks/kbl/bar_exam/civil/kbl_bar_exam_em_civil_2017.yaml |generate_until |\n", - "|kbl_bar_exam_em_civil_2018 |lm_eval/tasks/kbl/bar_exam/civil/kbl_bar_exam_em_civil_2018.yaml |generate_until |\n", - "|kbl_bar_exam_em_civil_2019 |lm_eval/tasks/kbl/bar_exam/civil/kbl_bar_exam_em_civil_2019.yaml |generate_until |\n", - "|kbl_bar_exam_em_civil_2020 |lm_eval/tasks/kbl/bar_exam/civil/kbl_bar_exam_em_civil_2020.yaml |generate_until |\n", - "|kbl_bar_exam_em_civil_2021 |lm_eval/tasks/kbl/bar_exam/civil/kbl_bar_exam_em_civil_2021.yaml |generate_until |\n", - "|kbl_bar_exam_em_civil_2022 |lm_eval/tasks/kbl/bar_exam/civil/kbl_bar_exam_em_civil_2022.yaml |generate_until |\n", - "|kbl_bar_exam_em_civil_2023 |lm_eval/tasks/kbl/bar_exam/civil/kbl_bar_exam_em_civil_2023.yaml |generate_until |\n", - "|kbl_bar_exam_em_civil_2024 |lm_eval/tasks/kbl/bar_exam/civil/kbl_bar_exam_em_civil_2024.yaml |generate_until |\n", - "|kbl_bar_exam_em_criminal_2012 |lm_eval/tasks/kbl/bar_exam/criminal/kbl_bar_exam_em_criminal_2012.yaml |generate_until |\n", - "|kbl_bar_exam_em_criminal_2013 |lm_eval/tasks/kbl/bar_exam/criminal/kbl_bar_exam_em_criminal_2013.yaml |generate_until |\n", - "|kbl_bar_exam_em_criminal_2014 |lm_eval/tasks/kbl/bar_exam/criminal/kbl_bar_exam_em_criminal_2014.yaml |generate_until |\n", - "|kbl_bar_exam_em_criminal_2015 |lm_eval/tasks/kbl/bar_exam/criminal/kbl_bar_exam_em_criminal_2015.yaml |generate_until |\n", - "|kbl_bar_exam_em_criminal_2016 |lm_eval/tasks/kbl/bar_exam/criminal/kbl_bar_exam_em_criminal_2016.yaml |generate_until |\n", - "|kbl_bar_exam_em_criminal_2017 |lm_eval/tasks/kbl/bar_exam/criminal/kbl_bar_exam_em_criminal_2017.yaml |generate_until |\n", - "|kbl_bar_exam_em_criminal_2018 |lm_eval/tasks/kbl/bar_exam/criminal/kbl_bar_exam_em_criminal_2018.yaml |generate_until |\n", - "|kbl_bar_exam_em_criminal_2019 |lm_eval/tasks/kbl/bar_exam/criminal/kbl_bar_exam_em_criminal_2019.yaml |generate_until |\n", - "|kbl_bar_exam_em_criminal_2020 |lm_eval/tasks/kbl/bar_exam/criminal/kbl_bar_exam_em_criminal_2020.yaml |generate_until |\n", - "|kbl_bar_exam_em_criminal_2021 |lm_eval/tasks/kbl/bar_exam/criminal/kbl_bar_exam_em_criminal_2021.yaml |generate_until |\n", - "|kbl_bar_exam_em_criminal_2022 |lm_eval/tasks/kbl/bar_exam/criminal/kbl_bar_exam_em_criminal_2022.yaml |generate_until |\n", - "|kbl_bar_exam_em_criminal_2023 |lm_eval/tasks/kbl/bar_exam/criminal/kbl_bar_exam_em_criminal_2023.yaml |generate_until |\n", - "|kbl_bar_exam_em_criminal_2024 |lm_eval/tasks/kbl/bar_exam/criminal/kbl_bar_exam_em_criminal_2024.yaml |generate_until |\n", - "|kbl_bar_exam_em_public_2012 |lm_eval/tasks/kbl/bar_exam/public/kbl_bar_exam_em_public_2012.yaml |generate_until |\n", - "|kbl_bar_exam_em_public_2013 |lm_eval/tasks/kbl/bar_exam/public/kbl_bar_exam_em_public_2013.yaml |generate_until |\n", - "|kbl_bar_exam_em_public_2014 |lm_eval/tasks/kbl/bar_exam/public/kbl_bar_exam_em_public_2014.yaml |generate_until |\n", - "|kbl_bar_exam_em_public_2015 |lm_eval/tasks/kbl/bar_exam/public/kbl_bar_exam_em_public_2015.yaml |generate_until |\n", - "|kbl_bar_exam_em_public_2016 |lm_eval/tasks/kbl/bar_exam/public/kbl_bar_exam_em_public_2016.yaml |generate_until |\n", - "|kbl_bar_exam_em_public_2017 |lm_eval/tasks/kbl/bar_exam/public/kbl_bar_exam_em_public_2017.yaml |generate_until |\n", - "|kbl_bar_exam_em_public_2018 |lm_eval/tasks/kbl/bar_exam/public/kbl_bar_exam_em_public_2018.yaml |generate_until |\n", - "|kbl_bar_exam_em_public_2019 |lm_eval/tasks/kbl/bar_exam/public/kbl_bar_exam_em_public_2019.yaml |generate_until |\n", - "|kbl_bar_exam_em_public_2020 |lm_eval/tasks/kbl/bar_exam/public/kbl_bar_exam_em_public_2020.yaml |generate_until |\n", - "|kbl_bar_exam_em_public_2021 |lm_eval/tasks/kbl/bar_exam/public/kbl_bar_exam_em_public_2021.yaml |generate_until |\n", - "|kbl_bar_exam_em_public_2022 |lm_eval/tasks/kbl/bar_exam/public/kbl_bar_exam_em_public_2022.yaml |generate_until |\n", - "|kbl_bar_exam_em_public_2023 |lm_eval/tasks/kbl/bar_exam/public/kbl_bar_exam_em_public_2023.yaml |generate_until |\n", - "|kbl_bar_exam_em_public_2024 |lm_eval/tasks/kbl/bar_exam/public/kbl_bar_exam_em_public_2024.yaml |generate_until |\n", - "|kbl_bar_exam_em_responsibility_2010 |lm_eval/tasks/kbl/bar_exam/responsibility/kbl_bar_exam_em_responsibility_2010.yaml |generate_until |\n", - "|kbl_bar_exam_em_responsibility_2011 |lm_eval/tasks/kbl/bar_exam/responsibility/kbl_bar_exam_em_responsibility_2011.yaml |generate_until |\n", - "|kbl_bar_exam_em_responsibility_2012 |lm_eval/tasks/kbl/bar_exam/responsibility/kbl_bar_exam_em_responsibility_2012.yaml |generate_until |\n", - "|kbl_bar_exam_em_responsibility_2013 |lm_eval/tasks/kbl/bar_exam/responsibility/kbl_bar_exam_em_responsibility_2013.yaml |generate_until |\n", - "|kbl_bar_exam_em_responsibility_2014 |lm_eval/tasks/kbl/bar_exam/responsibility/kbl_bar_exam_em_responsibility_2014.yaml |generate_until |\n", - "|kbl_bar_exam_em_responsibility_2015 |lm_eval/tasks/kbl/bar_exam/responsibility/kbl_bar_exam_em_responsibility_2015.yaml |generate_until |\n", - "|kbl_bar_exam_em_responsibility_2016 |lm_eval/tasks/kbl/bar_exam/responsibility/kbl_bar_exam_em_responsibility_2016.yaml |generate_until |\n", - "|kbl_bar_exam_em_responsibility_2017 |lm_eval/tasks/kbl/bar_exam/responsibility/kbl_bar_exam_em_responsibility_2017.yaml |generate_until |\n", - "|kbl_bar_exam_em_responsibility_2018 |lm_eval/tasks/kbl/bar_exam/responsibility/kbl_bar_exam_em_responsibility_2018.yaml |generate_until |\n", - "|kbl_bar_exam_em_responsibility_2019 |lm_eval/tasks/kbl/bar_exam/responsibility/kbl_bar_exam_em_responsibility_2019.yaml |generate_until |\n", - "|kbl_bar_exam_em_responsibility_2020 |lm_eval/tasks/kbl/bar_exam/responsibility/kbl_bar_exam_em_responsibility_2020.yaml |generate_until |\n", - "|kbl_bar_exam_em_responsibility_2021 |lm_eval/tasks/kbl/bar_exam/responsibility/kbl_bar_exam_em_responsibility_2021.yaml |generate_until |\n", - "|kbl_bar_exam_em_responsibility_2022 |lm_eval/tasks/kbl/bar_exam/responsibility/kbl_bar_exam_em_responsibility_2022.yaml |generate_until |\n", - "|kbl_bar_exam_em_responsibility_2023 |lm_eval/tasks/kbl/bar_exam/responsibility/kbl_bar_exam_em_responsibility_2023.yaml |generate_until |\n", - "|kbl_case_relevance_qa_p_em |lm_eval/tasks/kbl/reasoning/kbl_case_relevance_qa_p_em.yaml |generate_until |\n", - "|kbl_case_relevance_qa_q_em |lm_eval/tasks/kbl/reasoning/kbl_case_relevance_qa_q_em.yaml |generate_until |\n", - "|kbl_causal_reasoning_qa_em |lm_eval/tasks/kbl/reasoning/kbl_causal_reasoning_em.yaml |generate_until |\n", - "|kbl_common_legal_mistake_qa_em |lm_eval/tasks/kbl/knowledge/kbl_common_legal_mistake_qa_em.yaml |generate_until |\n", - "|kbl_common_legal_mistake_qa_reasoning_em |lm_eval/tasks/kbl/knowledge/kbl_common_legal_mistake_qa_reasoning_em.yaml |generate_until |\n", - "|kbl_legal_concept_qa_em |lm_eval/tasks/kbl/knowledge/kbl_legal_concept_qa_em.yaml |generate_until |\n", - "|kbl_offense_component_qa_em |lm_eval/tasks/kbl/knowledge/kbl_offense_component_qa_em.yaml |generate_until |\n", - "|kbl_query_and_statute_matching_qa_em |lm_eval/tasks/kbl/knowledge/kbl_query_statute_matching_qa_em.yaml |generate_until |\n", - "|kbl_statement_consistency_qa_em |lm_eval/tasks/kbl/reasoning/kbl_statement_consistency_qa_em.yaml |generate_until |\n", - "|kbl_statute_hallucination_qa_em |lm_eval/tasks/kbl/knowledge/kbl_statute_hallucination_qa_em.yaml |generate_until |\n", - "|kbl_statute_number_and_content_matching_qa_em |lm_eval/tasks/kbl/knowledge/kbl_statute_number_and_content_matching_qa_em.yaml |generate_until |\n", - "|kmmlu_direct_accounting |lm_eval/tasks/kmmlu/direct/kmmlu_direct_accounting.yaml |generate_until |\n", - "|kmmlu_direct_agricultural_sciences |lm_eval/tasks/kmmlu/direct/kmmlu_direct_agricultural_sciences.yaml |generate_until |\n", - "|kmmlu_direct_aviation_engineering_and_maintenance |lm_eval/tasks/kmmlu/direct/kmmlu_direct_aviation_engineering_and_maintenance.yaml |generate_until |\n", - "|kmmlu_direct_biology |lm_eval/tasks/kmmlu/direct/kmmlu_direct_biology.yaml |generate_until |\n", - "|kmmlu_direct_chemical_engineering |lm_eval/tasks/kmmlu/direct/kmmlu_direct_chemical_engineering.yaml |generate_until |\n", - "|kmmlu_direct_chemistry |lm_eval/tasks/kmmlu/direct/kmmlu_direct_chemistry.yaml |generate_until |\n", - "|kmmlu_direct_civil_engineering |lm_eval/tasks/kmmlu/direct/kmmlu_direct_civil_engineering.yaml |generate_until |\n", - "|kmmlu_direct_computer_science |lm_eval/tasks/kmmlu/direct/kmmlu_direct_computer_science.yaml |generate_until |\n", - "|kmmlu_direct_construction |lm_eval/tasks/kmmlu/direct/kmmlu_direct_construction.yaml |generate_until |\n", - "|kmmlu_direct_criminal_law |lm_eval/tasks/kmmlu/direct/kmmlu_direct_criminal_law.yaml |generate_until |\n", - "|kmmlu_direct_ecology |lm_eval/tasks/kmmlu/direct/kmmlu_direct_ecology.yaml |generate_until |\n", - "|kmmlu_direct_economics |lm_eval/tasks/kmmlu/direct/kmmlu_direct_economics.yaml |generate_until |\n", - "|kmmlu_direct_education |lm_eval/tasks/kmmlu/direct/kmmlu_direct_education.yaml |generate_until |\n", - "|kmmlu_direct_electrical_engineering |lm_eval/tasks/kmmlu/direct/kmmlu_direct_electrical_engineering.yaml |generate_until |\n", - "|kmmlu_direct_electronics_engineering |lm_eval/tasks/kmmlu/direct/kmmlu_direct_electronics_engineering.yaml |generate_until |\n", - "|kmmlu_direct_energy_management |lm_eval/tasks/kmmlu/direct/kmmlu_direct_energy_management.yaml |generate_until |\n", - "|kmmlu_direct_environmental_science |lm_eval/tasks/kmmlu/direct/kmmlu_direct_environmental_science.yaml |generate_until |\n", - "|kmmlu_direct_fashion |lm_eval/tasks/kmmlu/direct/kmmlu_direct_fashion.yaml |generate_until |\n", - "|kmmlu_direct_food_processing |lm_eval/tasks/kmmlu/direct/kmmlu_direct_food_processing.yaml |generate_until |\n", - "|kmmlu_direct_gas_technology_and_engineering |lm_eval/tasks/kmmlu/direct/kmmlu_direct_gas_technology_and_engineering.yaml |generate_until |\n", - "|kmmlu_direct_geomatics |lm_eval/tasks/kmmlu/direct/kmmlu_direct_geomatics.yaml |generate_until |\n", - "|kmmlu_direct_health |lm_eval/tasks/kmmlu/direct/kmmlu_direct_health.yaml |generate_until |\n", - "|kmmlu_direct_industrial_engineer |lm_eval/tasks/kmmlu/direct/kmmlu_direct_industrial_engineer.yaml |generate_until |\n", - "|kmmlu_direct_information_technology |lm_eval/tasks/kmmlu/direct/kmmlu_direct_information_technology.yaml |generate_until |\n", - "|kmmlu_direct_interior_architecture_and_design |lm_eval/tasks/kmmlu/direct/kmmlu_direct_interior_architecture_and_design.yaml |generate_until |\n", - "|kmmlu_direct_korean_history |lm_eval/tasks/kmmlu/direct/kmmlu_direct_korean_history.yaml |generate_until |\n", - "|kmmlu_direct_law |lm_eval/tasks/kmmlu/direct/kmmlu_direct_law.yaml |generate_until |\n", - "|kmmlu_direct_machine_design_and_manufacturing |lm_eval/tasks/kmmlu/direct/kmmlu_direct_machine_design_and_manufacturing.yaml |generate_until |\n", - "|kmmlu_direct_management |lm_eval/tasks/kmmlu/direct/kmmlu_direct_management.yaml |generate_until |\n", - "|kmmlu_direct_maritime_engineering |lm_eval/tasks/kmmlu/direct/kmmlu_direct_maritime_engineering.yaml |generate_until |\n", - "|kmmlu_direct_marketing |lm_eval/tasks/kmmlu/direct/kmmlu_direct_marketing.yaml |generate_until |\n", - "|kmmlu_direct_materials_engineering |lm_eval/tasks/kmmlu/direct/kmmlu_direct_materials_engineering.yaml |generate_until |\n", - "|kmmlu_direct_math |lm_eval/tasks/kmmlu/direct/kmmlu_direct_math.yaml |generate_until |\n", - "|kmmlu_direct_mechanical_engineering |lm_eval/tasks/kmmlu/direct/kmmlu_direct_mechanical_engineering.yaml |generate_until |\n", - "|kmmlu_direct_nondestructive_testing |lm_eval/tasks/kmmlu/direct/kmmlu_direct_nondestructive_testing.yaml |generate_until |\n", - "|kmmlu_direct_patent |lm_eval/tasks/kmmlu/direct/kmmlu_direct_patent.yaml |generate_until |\n", - "|kmmlu_direct_political_science_and_sociology |lm_eval/tasks/kmmlu/direct/kmmlu_direct_political_science_and_sociology.yaml |generate_until |\n", - "|kmmlu_direct_psychology |lm_eval/tasks/kmmlu/direct/kmmlu_direct_psychology.yaml |generate_until |\n", - "|kmmlu_direct_public_safety |lm_eval/tasks/kmmlu/direct/kmmlu_direct_public_safety.yaml |generate_until |\n", - "|kmmlu_direct_railway_and_automotive_engineering |lm_eval/tasks/kmmlu/direct/kmmlu_direct_railway_and_automotive_engineering.yaml |generate_until |\n", - "|kmmlu_direct_real_estate |lm_eval/tasks/kmmlu/direct/kmmlu_direct_real_estate.yaml |generate_until |\n", - "|kmmlu_direct_refrigerating_machinery |lm_eval/tasks/kmmlu/direct/kmmlu_direct_refrigerating_machinery.yaml |generate_until |\n", - "|kmmlu_direct_social_welfare |lm_eval/tasks/kmmlu/direct/kmmlu_direct_social_welfare.yaml |generate_until |\n", - "|kmmlu_direct_taxation |lm_eval/tasks/kmmlu/direct/kmmlu_direct_taxation.yaml |generate_until |\n", - "|kmmlu_direct_telecommunications_and_wireless_technology |lm_eval/tasks/kmmlu/direct/kmmlu_direct_telecommunications_and_wireless_technology.yaml |generate_until |\n", - "|kmmlu_hard_accounting |lm_eval/tasks/kmmlu/hard/kmmlu_hard_accounting.yaml |multiple_choice |\n", - "|kmmlu_hard_agricultural_sciences |lm_eval/tasks/kmmlu/hard/kmmlu_hard_agricultural_sciences.yaml |multiple_choice |\n", - "|kmmlu_hard_aviation_engineering_and_maintenance |lm_eval/tasks/kmmlu/hard/kmmlu_hard_aviation_engineering_and_maintenance.yaml |multiple_choice |\n", - "|kmmlu_hard_biology |lm_eval/tasks/kmmlu/hard/kmmlu_hard_biology.yaml |multiple_choice |\n", - "|kmmlu_hard_chemical_engineering |lm_eval/tasks/kmmlu/hard/kmmlu_hard_chemical_engineering.yaml |multiple_choice |\n", - "|kmmlu_hard_chemistry |lm_eval/tasks/kmmlu/hard/kmmlu_hard_chemistry.yaml |multiple_choice |\n", - "|kmmlu_hard_civil_engineering |lm_eval/tasks/kmmlu/hard/kmmlu_hard_civil_engineering.yaml |multiple_choice |\n", - "|kmmlu_hard_computer_science |lm_eval/tasks/kmmlu/hard/kmmlu_hard_computer_science.yaml |multiple_choice |\n", - "|kmmlu_hard_construction |lm_eval/tasks/kmmlu/hard/kmmlu_hard_construction.yaml |multiple_choice |\n", - "|kmmlu_hard_cot_accounting |lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_accounting.yaml |generate_until |\n", - "|kmmlu_hard_cot_agricultural_sciences |lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_agricultural_sciences.yaml |generate_until |\n", - "|kmmlu_hard_cot_aviation_engineering_and_maintenance |lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_aviation_engineering_and_maintenance.yaml |generate_until |\n", - "|kmmlu_hard_cot_biology |lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_biology.yaml |generate_until |\n", - "|kmmlu_hard_cot_chemical_engineering |lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_chemical_engineering.yaml |generate_until |\n", - "|kmmlu_hard_cot_chemistry |lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_chemistry.yaml |generate_until |\n", - "|kmmlu_hard_cot_civil_engineering |lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_civil_engineering.yaml |generate_until |\n", - "|kmmlu_hard_cot_computer_science |lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_computer_science.yaml |generate_until |\n", - "|kmmlu_hard_cot_construction |lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_construction.yaml |generate_until |\n", - "|kmmlu_hard_cot_criminal_law |lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_criminal_law.yaml |generate_until |\n", - "|kmmlu_hard_cot_ecology |lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_ecology.yaml |generate_until |\n", - "|kmmlu_hard_cot_economics |lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_economics.yaml |generate_until |\n", - "|kmmlu_hard_cot_education |lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_education.yaml |generate_until |\n", - "|kmmlu_hard_cot_electrical_engineering |lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_electrical_engineering.yaml |generate_until |\n", - "|kmmlu_hard_cot_electronics_engineering |lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_electronics_engineering.yaml |generate_until |\n", - "|kmmlu_hard_cot_energy_management |lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_energy_management.yaml |generate_until |\n", - "|kmmlu_hard_cot_environmental_science |lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_environmental_science.yaml |generate_until |\n", - "|kmmlu_hard_cot_fashion |lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_fashion.yaml |generate_until |\n", - "|kmmlu_hard_cot_food_processing |lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_food_processing.yaml |generate_until |\n", - "|kmmlu_hard_cot_gas_technology_and_engineering |lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_gas_technology_and_engineering.yaml |generate_until |\n", - "|kmmlu_hard_cot_geomatics |lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_geomatics.yaml |generate_until |\n", - "|kmmlu_hard_cot_health |lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_health.yaml |generate_until |\n", - "|kmmlu_hard_cot_industrial_engineer |lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_industrial_engineer.yaml |generate_until |\n", - "|kmmlu_hard_cot_information_technology |lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_information_technology.yaml |generate_until |\n", - "|kmmlu_hard_cot_interior_architecture_and_design |lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_interior_architecture_and_design.yaml |generate_until |\n", - "|kmmlu_hard_cot_korean_history |lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_korean_history.yaml |generate_until |\n", - "|kmmlu_hard_cot_law |lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_law.yaml |generate_until |\n", - "|kmmlu_hard_cot_machine_design_and_manufacturing |lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_machine_design_and_manufacturing.yaml |generate_until |\n", - "|kmmlu_hard_cot_management |lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_management.yaml |generate_until |\n", - "|kmmlu_hard_cot_maritime_engineering |lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_maritime_engineering.yaml |generate_until |\n", - "|kmmlu_hard_cot_marketing |lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_marketing.yaml |generate_until |\n", - "|kmmlu_hard_cot_materials_engineering |lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_materials_engineering.yaml |generate_until |\n", - "|kmmlu_hard_cot_math |lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_math.yaml |generate_until |\n", - "|kmmlu_hard_cot_mechanical_engineering |lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_mechanical_engineering.yaml |generate_until |\n", - "|kmmlu_hard_cot_nondestructive_testing |lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_nondestructive_testing.yaml |generate_until |\n", - "|kmmlu_hard_cot_patent |lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_patent.yaml |generate_until |\n", - "|kmmlu_hard_cot_political_science_and_sociology |lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_political_science_and_sociology.yaml |generate_until |\n", - "|kmmlu_hard_cot_psychology |lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_psychology.yaml |generate_until |\n", - "|kmmlu_hard_cot_public_safety |lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_public_safety.yaml |generate_until |\n", - "|kmmlu_hard_cot_railway_and_automotive_engineering |lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_railway_and_automotive_engineering.yaml |generate_until |\n", - "|kmmlu_hard_cot_real_estate |lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_real_estate.yaml |generate_until |\n", - "|kmmlu_hard_cot_refrigerating_machinery |lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_refrigerating_machinery.yaml |generate_until |\n", - "|kmmlu_hard_cot_social_welfare |lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_social_welfare.yaml |generate_until |\n", - "|kmmlu_hard_cot_taxation |lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_taxation.yaml |generate_until |\n", - "|kmmlu_hard_cot_telecommunications_and_wireless_technology |lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_telecommunications_and_wireless_technology.yaml |generate_until |\n", - "|kmmlu_hard_criminal_law |lm_eval/tasks/kmmlu/hard/kmmlu_hard_criminal_law.yaml |multiple_choice |\n", - "|kmmlu_hard_direct_accounting |lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_accounting.yaml |generate_until |\n", - "|kmmlu_hard_direct_agricultural_sciences |lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_agricultural_sciences.yaml |generate_until |\n", - "|kmmlu_hard_direct_aviation_engineering_and_maintenance |lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_aviation_engineering_and_maintenance.yaml |generate_until |\n", - "|kmmlu_hard_direct_biology |lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_biology.yaml |generate_until |\n", - "|kmmlu_hard_direct_chemical_engineering |lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_chemical_engineering.yaml |generate_until |\n", - "|kmmlu_hard_direct_chemistry |lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_chemistry.yaml |generate_until |\n", - "|kmmlu_hard_direct_civil_engineering |lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_civil_engineering.yaml |generate_until |\n", - "|kmmlu_hard_direct_computer_science |lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_computer_science.yaml |generate_until |\n", - "|kmmlu_hard_direct_construction |lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_construction.yaml |generate_until |\n", - "|kmmlu_hard_direct_criminal_law |lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_criminal_law.yaml |generate_until |\n", - "|kmmlu_hard_direct_ecology |lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_ecology.yaml |generate_until |\n", - "|kmmlu_hard_direct_economics |lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_economics.yaml |generate_until |\n", - "|kmmlu_hard_direct_education |lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_education.yaml |generate_until |\n", - "|kmmlu_hard_direct_electrical_engineering |lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_electrical_engineering.yaml |generate_until |\n", - "|kmmlu_hard_direct_electronics_engineering |lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_electronics_engineering.yaml |generate_until |\n", - "|kmmlu_hard_direct_energy_management |lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_energy_management.yaml |generate_until |\n", - "|kmmlu_hard_direct_environmental_science |lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_environmental_science.yaml |generate_until |\n", - "|kmmlu_hard_direct_fashion |lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_fashion.yaml |generate_until |\n", - "|kmmlu_hard_direct_food_processing |lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_food_processing.yaml |generate_until |\n", - "|kmmlu_hard_direct_gas_technology_and_engineering |lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_gas_technology_and_engineering.yaml |generate_until |\n", - "|kmmlu_hard_direct_geomatics |lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_geomatics.yaml |generate_until |\n", - "|kmmlu_hard_direct_health |lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_health.yaml |generate_until |\n", - "|kmmlu_hard_direct_industrial_engineer |lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_industrial_engineer.yaml |generate_until |\n", - "|kmmlu_hard_direct_information_technology |lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_information_technology.yaml |generate_until |\n", - "|kmmlu_hard_direct_interior_architecture_and_design |lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_interior_architecture_and_design.yaml |generate_until |\n", - "|kmmlu_hard_direct_korean_history |lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_korean_history.yaml |generate_until |\n", - "|kmmlu_hard_direct_law |lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_law.yaml |generate_until |\n", - "|kmmlu_hard_direct_machine_design_and_manufacturing |lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_machine_design_and_manufacturing.yaml |generate_until |\n", - "|kmmlu_hard_direct_management |lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_management.yaml |generate_until |\n", - "|kmmlu_hard_direct_maritime_engineering |lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_maritime_engineering.yaml |generate_until |\n", - "|kmmlu_hard_direct_marketing |lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_marketing.yaml |generate_until |\n", - "|kmmlu_hard_direct_materials_engineering |lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_materials_engineering.yaml |generate_until |\n", - "|kmmlu_hard_direct_math |lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_math.yaml |generate_until |\n", - "|kmmlu_hard_direct_mechanical_engineering |lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_mechanical_engineering.yaml |generate_until |\n", - "|kmmlu_hard_direct_nondestructive_testing |lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_nondestructive_testing.yaml |generate_until |\n", - "|kmmlu_hard_direct_patent |lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_patent.yaml |generate_until |\n", - "|kmmlu_hard_direct_political_science_and_sociology |lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_political_science_and_sociology.yaml |generate_until |\n", - "|kmmlu_hard_direct_psychology |lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_psychology.yaml |generate_until |\n", - "|kmmlu_hard_direct_public_safety |lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_public_safety.yaml |generate_until |\n", - "|kmmlu_hard_direct_railway_and_automotive_engineering |lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_railway_and_automotive_engineering.yaml |generate_until |\n", - "|kmmlu_hard_direct_real_estate |lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_real_estate.yaml |generate_until |\n", - "|kmmlu_hard_direct_refrigerating_machinery |lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_refrigerating_machinery.yaml |generate_until |\n", - "|kmmlu_hard_direct_social_welfare |lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_social_welfare.yaml |generate_until |\n", - "|kmmlu_hard_direct_taxation |lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_taxation.yaml |generate_until |\n", - "|kmmlu_hard_direct_telecommunications_and_wireless_technology |lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_telecommunications_and_wireless_technology.yaml |generate_until |\n", - "|kmmlu_hard_ecology |lm_eval/tasks/kmmlu/hard/kmmlu_hard_ecology.yaml |multiple_choice |\n", - "|kmmlu_hard_economics |lm_eval/tasks/kmmlu/hard/kmmlu_hard_economics.yaml |multiple_choice |\n", - "|kmmlu_hard_education |lm_eval/tasks/kmmlu/hard/kmmlu_hard_education.yaml |multiple_choice |\n", - "|kmmlu_hard_electrical_engineering |lm_eval/tasks/kmmlu/hard/kmmlu_hard_electrical_engineering.yaml |multiple_choice |\n", - "|kmmlu_hard_electronics_engineering |lm_eval/tasks/kmmlu/hard/kmmlu_hard_electronics_engineering.yaml |multiple_choice |\n", - "|kmmlu_hard_energy_management |lm_eval/tasks/kmmlu/hard/kmmlu_hard_energy_management.yaml |multiple_choice |\n", - "|kmmlu_hard_environmental_science |lm_eval/tasks/kmmlu/hard/kmmlu_hard_environmental_science.yaml |multiple_choice |\n", - "|kmmlu_hard_fashion |lm_eval/tasks/kmmlu/hard/kmmlu_hard_fashion.yaml |multiple_choice |\n", - "|kmmlu_hard_food_processing |lm_eval/tasks/kmmlu/hard/kmmlu_hard_food_processing.yaml |multiple_choice |\n", - "|kmmlu_hard_gas_technology_and_engineering |lm_eval/tasks/kmmlu/hard/kmmlu_hard_gas_technology_and_engineering.yaml |multiple_choice |\n", - "|kmmlu_hard_geomatics |lm_eval/tasks/kmmlu/hard/kmmlu_hard_geomatics.yaml |multiple_choice |\n", - "|kmmlu_hard_health |lm_eval/tasks/kmmlu/hard/kmmlu_hard_health.yaml |multiple_choice |\n", - "|kmmlu_hard_industrial_engineer |lm_eval/tasks/kmmlu/hard/kmmlu_hard_industrial_engineer.yaml |multiple_choice |\n", - "|kmmlu_hard_information_technology |lm_eval/tasks/kmmlu/hard/kmmlu_hard_information_technology.yaml |multiple_choice |\n", - "|kmmlu_hard_interior_architecture_and_design |lm_eval/tasks/kmmlu/hard/kmmlu_hard_interior_architecture_and_design.yaml |multiple_choice |\n", - "|kmmlu_hard_korean_history |lm_eval/tasks/kmmlu/hard/kmmlu_hard_korean_history.yaml |multiple_choice |\n", - "|kmmlu_hard_law |lm_eval/tasks/kmmlu/hard/kmmlu_hard_law.yaml |multiple_choice |\n", - "|kmmlu_hard_machine_design_and_manufacturing |lm_eval/tasks/kmmlu/hard/kmmlu_hard_machine_design_and_manufacturing.yaml |multiple_choice |\n", - "|kmmlu_hard_management |lm_eval/tasks/kmmlu/hard/kmmlu_hard_management.yaml |multiple_choice |\n", - "|kmmlu_hard_maritime_engineering |lm_eval/tasks/kmmlu/hard/kmmlu_hard_maritime_engineering.yaml |multiple_choice |\n", - "|kmmlu_hard_marketing |lm_eval/tasks/kmmlu/hard/kmmlu_hard_marketing.yaml |multiple_choice |\n", - "|kmmlu_hard_materials_engineering |lm_eval/tasks/kmmlu/hard/kmmlu_hard_materials_engineering.yaml |multiple_choice |\n", - "|kmmlu_hard_math |lm_eval/tasks/kmmlu/hard/kmmlu_hard_math.yaml |multiple_choice |\n", - "|kmmlu_hard_mechanical_engineering |lm_eval/tasks/kmmlu/hard/kmmlu_hard_mechanical_engineering.yaml |multiple_choice |\n", - "|kmmlu_hard_nondestructive_testing |lm_eval/tasks/kmmlu/hard/kmmlu_hard_nondestructive_testing.yaml |multiple_choice |\n", - "|kmmlu_hard_patent |lm_eval/tasks/kmmlu/hard/kmmlu_hard_patent.yaml |multiple_choice |\n", - "|kmmlu_hard_political_science_and_sociology |lm_eval/tasks/kmmlu/hard/kmmlu_hard_political_science_and_sociology.yaml |multiple_choice |\n", - "|kmmlu_hard_psychology |lm_eval/tasks/kmmlu/hard/kmmlu_hard_psychology.yaml |multiple_choice |\n", - "|kmmlu_hard_public_safety |lm_eval/tasks/kmmlu/hard/kmmlu_hard_public_safety.yaml |multiple_choice |\n", - "|kmmlu_hard_railway_and_automotive_engineering |lm_eval/tasks/kmmlu/hard/kmmlu_hard_railway_and_automotive_engineering.yaml |multiple_choice |\n", - "|kmmlu_hard_real_estate |lm_eval/tasks/kmmlu/hard/kmmlu_hard_real_estate.yaml |multiple_choice |\n", - "|kmmlu_hard_refrigerating_machinery |lm_eval/tasks/kmmlu/hard/kmmlu_hard_refrigerating_machinery.yaml |multiple_choice |\n", - "|kmmlu_hard_social_welfare |lm_eval/tasks/kmmlu/hard/kmmlu_hard_social_welfare.yaml |multiple_choice |\n", - "|kmmlu_hard_taxation |lm_eval/tasks/kmmlu/hard/kmmlu_hard_taxation.yaml |multiple_choice |\n", - "|kmmlu_hard_telecommunications_and_wireless_technology |lm_eval/tasks/kmmlu/hard/kmmlu_hard_telecommunications_and_wireless_technology.yaml |multiple_choice |\n", - "|kobest_boolq |lm_eval/tasks/kobest/kobest_boolq.yaml |multiple_choice |\n", - "|kobest_copa |lm_eval/tasks/kobest/kobest_copa.yaml |multiple_choice |\n", - "|kobest_hellaswag |lm_eval/tasks/kobest/kobest_hellaswag.yaml |multiple_choice |\n", - "|kobest_sentineg |lm_eval/tasks/kobest/kobest_sentineg.yaml |multiple_choice |\n", - "|kobest_wic |lm_eval/tasks/kobest/kobest_wic.yaml |multiple_choice |\n", - "|kormedmcqa_doctor |lm_eval/tasks/kormedmcqa/kormedmcqa_doctor.yaml |generate_until |\n", - "|kormedmcqa_nurse |lm_eval/tasks/kormedmcqa/kormedmcqa_nurse.yaml |generate_until |\n", - "|kormedmcqa_pharm |lm_eval/tasks/kormedmcqa/kormedmcqa_pharm.yaml |generate_until |\n", - "|lambada_openai |lm_eval/tasks/lambada/lambada_openai.yaml |loglikelihood |\n", - "|lambada_openai_cloze_yaml |lm_eval/tasks/lambada_cloze/lambada_openai_cloze.yaml |loglikelihood |\n", - "|lambada_openai_mt_de |lm_eval/tasks/lambada_multilingual/lambada_mt_de.yaml |loglikelihood |\n", - "|lambada_openai_mt_en |lm_eval/tasks/lambada_multilingual/lambada_mt_en.yaml |loglikelihood |\n", - "|lambada_openai_mt_es |lm_eval/tasks/lambada_multilingual/lambada_mt_es.yaml |loglikelihood |\n", - "|lambada_openai_mt_fr |lm_eval/tasks/lambada_multilingual/lambada_mt_fr.yaml |loglikelihood |\n", - "|lambada_openai_mt_it |lm_eval/tasks/lambada_multilingual/lambada_mt_it.yaml |loglikelihood |\n", - "|lambada_openai_mt_stablelm_de |lm_eval/tasks/lambada_multilingual_stablelm/lambada_mt_stablelm_de.yaml |loglikelihood |\n", - "|lambada_openai_mt_stablelm_en |lm_eval/tasks/lambada_multilingual_stablelm/lambada_mt_stablelm_en.yaml |loglikelihood |\n", - "|lambada_openai_mt_stablelm_es |lm_eval/tasks/lambada_multilingual_stablelm/lambada_mt_stablelm_es.yaml |loglikelihood |\n", - "|lambada_openai_mt_stablelm_fr |lm_eval/tasks/lambada_multilingual_stablelm/lambada_mt_stablelm_fr.yaml |loglikelihood |\n", - "|lambada_openai_mt_stablelm_it |lm_eval/tasks/lambada_multilingual_stablelm/lambada_mt_stablelm_it.yaml |loglikelihood |\n", - "|lambada_openai_mt_stablelm_nl |lm_eval/tasks/lambada_multilingual_stablelm/lambada_mt_stablelm_nl.yaml |loglikelihood |\n", - "|lambada_openai_mt_stablelm_pt |lm_eval/tasks/lambada_multilingual_stablelm/lambada_mt_stablelm_pt.yaml |loglikelihood |\n", - "|lambada_standard |lm_eval/tasks/lambada/lambada_standard.yaml |loglikelihood |\n", - "|lambada_standard_cloze_yaml |lm_eval/tasks/lambada_cloze/lambada_standard_cloze.yaml |loglikelihood |\n", - "|law_stack_exchange |lm_eval/tasks/unitxt/law_stack_exchange.yaml | |\n", - "|leaderboard_bbh_boolean_expressions |lm_eval/tasks/leaderboard/bbh_mc/boolean_expressions.yaml |multiple_choice |\n", - "|leaderboard_bbh_causal_judgement |lm_eval/tasks/leaderboard/bbh_mc/causal_judgement.yaml |multiple_choice |\n", - "|leaderboard_bbh_date_understanding |lm_eval/tasks/leaderboard/bbh_mc/date_understanding.yaml |multiple_choice |\n", - "|leaderboard_bbh_disambiguation_qa |lm_eval/tasks/leaderboard/bbh_mc/disambiguation_qa.yaml |multiple_choice |\n", - "|leaderboard_bbh_formal_fallacies |lm_eval/tasks/leaderboard/bbh_mc/formal_fallacies.yaml |multiple_choice |\n", - "|leaderboard_bbh_geometric_shapes |lm_eval/tasks/leaderboard/bbh_mc/geometric_shapes.yaml |multiple_choice |\n", - "|leaderboard_bbh_hyperbaton |lm_eval/tasks/leaderboard/bbh_mc/hyperbaton.yaml |multiple_choice |\n", - "|leaderboard_bbh_logical_deduction_five_objects |lm_eval/tasks/leaderboard/bbh_mc/logical_deduction_five_objects.yaml |multiple_choice |\n", - "|leaderboard_bbh_logical_deduction_seven_objects |lm_eval/tasks/leaderboard/bbh_mc/logical_deduction_seven_objects.yaml |multiple_choice |\n", - "|leaderboard_bbh_logical_deduction_three_objects |lm_eval/tasks/leaderboard/bbh_mc/logical_deduction_three_objects.yaml |multiple_choice |\n", - "|leaderboard_bbh_movie_recommendation |lm_eval/tasks/leaderboard/bbh_mc/movie_recommendation.yaml |multiple_choice |\n", - "|leaderboard_bbh_navigate |lm_eval/tasks/leaderboard/bbh_mc/navigate.yaml |multiple_choice |\n", - "|leaderboard_bbh_object_counting |lm_eval/tasks/leaderboard/bbh_mc/object_counting.yaml |multiple_choice |\n", - "|leaderboard_bbh_penguins_in_a_table |lm_eval/tasks/leaderboard/bbh_mc/penguins_in_a_table.yaml |multiple_choice |\n", - "|leaderboard_bbh_reasoning_about_colored_objects |lm_eval/tasks/leaderboard/bbh_mc/reasoning_about_colored_objects.yaml |multiple_choice |\n", - "|leaderboard_bbh_ruin_names |lm_eval/tasks/leaderboard/bbh_mc/ruin_names.yaml |multiple_choice |\n", - "|leaderboard_bbh_salient_translation_error_detection |lm_eval/tasks/leaderboard/bbh_mc/salient_translation_error_detection.yaml |multiple_choice |\n", - "|leaderboard_bbh_snarks |lm_eval/tasks/leaderboard/bbh_mc/snarks.yaml |multiple_choice |\n", - "|leaderboard_bbh_sports_understanding |lm_eval/tasks/leaderboard/bbh_mc/sports_understanding.yaml |multiple_choice |\n", - "|leaderboard_bbh_temporal_sequences |lm_eval/tasks/leaderboard/bbh_mc/temporal_sequences.yaml |multiple_choice |\n", - "|leaderboard_bbh_tracking_shuffled_objects_five_objects |lm_eval/tasks/leaderboard/bbh_mc/tracking_shuffled_objects_five_objects.yaml |multiple_choice |\n", - "|leaderboard_bbh_tracking_shuffled_objects_seven_objects |lm_eval/tasks/leaderboard/bbh_mc/tracking_shuffled_objects_seven_objects.yaml |multiple_choice |\n", - "|leaderboard_bbh_tracking_shuffled_objects_three_objects |lm_eval/tasks/leaderboard/bbh_mc/tracking_shuffled_objects_three_objects.yaml |multiple_choice |\n", - "|leaderboard_bbh_web_of_lies |lm_eval/tasks/leaderboard/bbh_mc/web_of_lies.yaml |multiple_choice |\n", - "|leaderboard_gpqa_diamond |lm_eval/tasks/leaderboard/gpqa/gpqa_diamond_zeroshot.yaml |multiple_choice |\n", - "|leaderboard_gpqa_extended |lm_eval/tasks/leaderboard/gpqa/gpqa_extended_zeroshot.yaml |multiple_choice |\n", - "|leaderboard_gpqa_main |lm_eval/tasks/leaderboard/gpqa/gpqa_main_zeroshot.yaml |multiple_choice |\n", - "|leaderboard_ifeval |lm_eval/tasks/leaderboard/ifeval/ifeval.yaml |generate_until |\n", - "|leaderboard_math_algebra_hard |lm_eval/tasks/leaderboard/math/math_algebra.yaml |generate_until |\n", - "|leaderboard_math_counting_and_prob_hard |lm_eval/tasks/leaderboard/math/math_counting_and_prob.yaml |generate_until |\n", - "|leaderboard_math_geometry_hard |lm_eval/tasks/leaderboard/math/math_geometry.yaml |generate_until |\n", - "|leaderboard_math_intermediate_algebra_hard |lm_eval/tasks/leaderboard/math/math_intermediate_algebra.yaml |generate_until |\n", - "|leaderboard_math_num_theory_hard |lm_eval/tasks/leaderboard/math/math_num_theory.yaml |generate_until |\n", - "|leaderboard_math_prealgebra_hard |lm_eval/tasks/leaderboard/math/math_prealgebra.yaml |generate_until |\n", - "|leaderboard_math_precalculus_hard |lm_eval/tasks/leaderboard/math/math_precalculus.yaml |generate_until |\n", - "|leaderboard_mmlu_pro |lm_eval/tasks/leaderboard/mmlu_pro/mmlu_pro.yaml |multiple_choice |\n", - "|leaderboard_musr_murder_mysteries |lm_eval/tasks/leaderboard/musr/musr_murder_mysteries.yaml |multiple_choice |\n", - "|leaderboard_musr_object_placements |lm_eval/tasks/leaderboard/musr/musr_object_placements.yaml |multiple_choice |\n", - "|leaderboard_musr_team_allocation |lm_eval/tasks/leaderboard/musr/musr_team_allocation.yaml |multiple_choice |\n", - "|ledgar |lm_eval/tasks/unitxt/ledgar.yaml | |\n", - "|lingoly_context |lm_eval/tasks/lingoly/lingoly_context.yaml | |\n", - "|lingoly_nocontext |lm_eval/tasks/lingoly/lingoly_nocontext.yaml | |\n", - "|logieval |lm_eval/tasks/logiqa2/logieval.yaml |generate_until |\n", - "|logiqa |lm_eval/tasks/logiqa/logiqa.yaml |multiple_choice |\n", - "|logiqa2 |lm_eval/tasks/logiqa2/logiqa2.yaml |multiple_choice |\n", - "|m_mmlu_ar |lm_eval/tasks/okapi/mmlu_multilingual/m_mmlu_ar.yaml |multiple_choice |\n", - "|m_mmlu_bn |lm_eval/tasks/okapi/mmlu_multilingual/m_mmlu_bn.yaml |multiple_choice |\n", - "|m_mmlu_ca |lm_eval/tasks/okapi/mmlu_multilingual/m_mmlu_ca.yaml |multiple_choice |\n", - "|m_mmlu_da |lm_eval/tasks/okapi/mmlu_multilingual/m_mmlu_da.yaml |multiple_choice |\n", - "|m_mmlu_de |lm_eval/tasks/okapi/mmlu_multilingual/m_mmlu_de.yaml |multiple_choice |\n", - "|m_mmlu_en |lm_eval/tasks/okapi/mmlu_multilingual/m_mmlu_en.yaml |multiple_choice |\n", - "|m_mmlu_es |lm_eval/tasks/okapi/mmlu_multilingual/m_mmlu_es.yaml |multiple_choice |\n", - "|m_mmlu_eu |lm_eval/tasks/okapi/mmlu_multilingual/m_mmlu_eu.yaml |multiple_choice |\n", - "|m_mmlu_fr |lm_eval/tasks/okapi/mmlu_multilingual/m_mmlu_fr.yaml |multiple_choice |\n", - "|m_mmlu_gu |lm_eval/tasks/okapi/mmlu_multilingual/m_mmlu_gu.yaml |multiple_choice |\n", - "|m_mmlu_hi |lm_eval/tasks/okapi/mmlu_multilingual/m_mmlu_hi.yaml |multiple_choice |\n", - "|m_mmlu_hr |lm_eval/tasks/okapi/mmlu_multilingual/m_mmlu_hr.yaml |multiple_choice |\n", - "|m_mmlu_hu |lm_eval/tasks/okapi/mmlu_multilingual/m_mmlu_hu.yaml |multiple_choice |\n", - "|m_mmlu_hy |lm_eval/tasks/okapi/mmlu_multilingual/m_mmlu_hy.yaml |multiple_choice |\n", - "|m_mmlu_id |lm_eval/tasks/okapi/mmlu_multilingual/m_mmlu_id.yaml |multiple_choice |\n", - "|m_mmlu_is |lm_eval/tasks/okapi/mmlu_multilingual/m_mmlu_is.yaml |multiple_choice |\n", - "|m_mmlu_it |lm_eval/tasks/okapi/mmlu_multilingual/m_mmlu_it.yaml |multiple_choice |\n", - "|m_mmlu_kn |lm_eval/tasks/okapi/mmlu_multilingual/m_mmlu_kn.yaml |multiple_choice |\n", - "|m_mmlu_ml |lm_eval/tasks/okapi/mmlu_multilingual/m_mmlu_ml.yaml |multiple_choice |\n", - "|m_mmlu_mr |lm_eval/tasks/okapi/mmlu_multilingual/m_mmlu_mr.yaml |multiple_choice |\n", - "|m_mmlu_nb |lm_eval/tasks/okapi/mmlu_multilingual/m_mmlu_nb.yaml |multiple_choice |\n", - "|m_mmlu_ne |lm_eval/tasks/okapi/mmlu_multilingual/m_mmlu_ne.yaml |multiple_choice |\n", - "|m_mmlu_nl |lm_eval/tasks/okapi/mmlu_multilingual/m_mmlu_nl.yaml |multiple_choice |\n", - "|m_mmlu_pt |lm_eval/tasks/okapi/mmlu_multilingual/m_mmlu_pt.yaml |multiple_choice |\n", - "|m_mmlu_ro |lm_eval/tasks/okapi/mmlu_multilingual/m_mmlu_ro.yaml |multiple_choice |\n", - "|m_mmlu_ru |lm_eval/tasks/okapi/mmlu_multilingual/m_mmlu_ru.yaml |multiple_choice |\n", - "|m_mmlu_sk |lm_eval/tasks/okapi/mmlu_multilingual/m_mmlu_sk.yaml |multiple_choice |\n", - "|m_mmlu_sr |lm_eval/tasks/okapi/mmlu_multilingual/m_mmlu_sr.yaml |multiple_choice |\n", - "|m_mmlu_sv |lm_eval/tasks/okapi/mmlu_multilingual/m_mmlu_sv.yaml |multiple_choice |\n", - "|m_mmlu_ta |lm_eval/tasks/okapi/mmlu_multilingual/m_mmlu_ta.yaml |multiple_choice |\n", - "|m_mmlu_te |lm_eval/tasks/okapi/mmlu_multilingual/m_mmlu_te.yaml |multiple_choice |\n", - "|m_mmlu_uk |lm_eval/tasks/okapi/mmlu_multilingual/m_mmlu_uk.yaml |multiple_choice |\n", - "|m_mmlu_vi |lm_eval/tasks/okapi/mmlu_multilingual/m_mmlu_vi.yaml |multiple_choice |\n", - "|m_mmlu_zh |lm_eval/tasks/okapi/mmlu_multilingual/m_mmlu_zh.yaml |multiple_choice |\n", - "|mathqa |lm_eval/tasks/mathqa/mathqa.yaml |multiple_choice |\n", - "|mc_taco |lm_eval/tasks/mc_taco/default.yaml |multiple_choice |\n", - "|med_concepts_qa_atc_easy |lm_eval/tasks/med_concepts_qa/med_concepts_qa_atc_easy.yaml |multiple_choice |\n", - "|med_concepts_qa_atc_hard |lm_eval/tasks/med_concepts_qa/med_concepts_qa_atc_hard.yaml |multiple_choice |\n", - "|med_concepts_qa_atc_medium |lm_eval/tasks/med_concepts_qa/med_concepts_qa_atc_medium.yaml |multiple_choice |\n", - "|med_concepts_qa_icd10cm_easy |lm_eval/tasks/med_concepts_qa/med_concepts_qa_icd10cm_easy.yaml |multiple_choice |\n", - "|med_concepts_qa_icd10cm_hard |lm_eval/tasks/med_concepts_qa/med_concepts_qa_icd10cm_hard.yaml |multiple_choice |\n", - "|med_concepts_qa_icd10cm_medium |lm_eval/tasks/med_concepts_qa/med_concepts_qa_icd10cm_medium.yaml |multiple_choice |\n", - "|med_concepts_qa_icd10proc_easy |lm_eval/tasks/med_concepts_qa/med_concepts_qa_icd10proc_easy.yaml |multiple_choice |\n", - "|med_concepts_qa_icd10proc_hard |lm_eval/tasks/med_concepts_qa/med_concepts_qa_icd10proc_hard.yaml |multiple_choice |\n", - "|med_concepts_qa_icd10proc_medium |lm_eval/tasks/med_concepts_qa/med_concepts_qa_icd10proc_medium.yaml |multiple_choice |\n", - "|med_concepts_qa_icd9cm_easy |lm_eval/tasks/med_concepts_qa/med_concepts_qa_icd9cm_easy.yaml |multiple_choice |\n", - "|med_concepts_qa_icd9cm_hard |lm_eval/tasks/med_concepts_qa/med_concepts_qa_icd9cm_hard.yaml |multiple_choice |\n", - "|med_concepts_qa_icd9cm_medium |lm_eval/tasks/med_concepts_qa/med_concepts_qa_icd9cm_medium.yaml |multiple_choice |\n", - "|med_concepts_qa_icd9proc_easy |lm_eval/tasks/med_concepts_qa/med_concepts_qa_icd9proc_easy.yaml |multiple_choice |\n", - "|med_concepts_qa_icd9proc_hard |lm_eval/tasks/med_concepts_qa/med_concepts_qa_icd9proc_hard.yaml |multiple_choice |\n", - "|med_concepts_qa_icd9proc_medium |lm_eval/tasks/med_concepts_qa/med_concepts_qa_icd9proc_medium.yaml |multiple_choice |\n", - "|medical_abstracts |lm_eval/tasks/unitxt/medical_abstracts.yaml | |\n", - "|medmcqa |lm_eval/tasks/medmcqa/medmcqa.yaml |multiple_choice |\n", - "|medqa_4options |lm_eval/tasks/medqa/medqa.yaml |multiple_choice |\n", - "|mela_ar |lm_eval/tasks/mela/mela_ar.yaml |multiple_choice |\n", - "|mela_de |lm_eval/tasks/mela/mela_de.yaml |multiple_choice |\n", - "|mela_en |lm_eval/tasks/mela/mela_en.yaml |multiple_choice |\n", - "|mela_es |lm_eval/tasks/mela/mela_es.yaml |multiple_choice |\n", - "|mela_fr |lm_eval/tasks/mela/mela_fr.yaml |multiple_choice |\n", - "|mela_is |lm_eval/tasks/mela/mela_is.yaml |multiple_choice |\n", - "|mela_it |lm_eval/tasks/mela/mela_it.yaml |multiple_choice |\n", - "|mela_ja |lm_eval/tasks/mela/mela_ja.yaml |multiple_choice |\n", - "|mela_ru |lm_eval/tasks/mela/mela_ru.yaml |multiple_choice |\n", - "|mela_zh |lm_eval/tasks/mela/mela_zh.yaml |multiple_choice |\n", - "|mgsm_direct_bn |lm_eval/tasks/mgsm/direct/mgsm_direct_bn.yaml |generate_until |\n", - "|mgsm_direct_ca |lm_eval/tasks/catalan_bench/mgsm_direct_ca.yaml |generate_until |\n", - "|mgsm_direct_de |lm_eval/tasks/mgsm/direct/mgsm_direct_de.yaml |generate_until |\n", - "|mgsm_direct_en |lm_eval/tasks/mgsm/direct/mgsm_direct_en.yaml |generate_until |\n", - "|mgsm_direct_es |lm_eval/tasks/mgsm/direct/mgsm_direct_es.yaml |generate_until |\n", - "|mgsm_direct_es_spanish_bench |lm_eval/tasks/spanish_bench/mgsm_direct_es_spanish_bench.yaml |generate_until |\n", - "|mgsm_direct_eu |lm_eval/tasks/basque_bench/mgsm_direct_eu.yaml |generate_until |\n", - "|mgsm_direct_fr |lm_eval/tasks/mgsm/direct/mgsm_direct_fr.yaml |generate_until |\n", - "|mgsm_direct_gl |lm_eval/tasks/galician_bench/mgsm_direct_gl.yaml |generate_until |\n", - "|mgsm_direct_ja |lm_eval/tasks/mgsm/direct/mgsm_direct_ja.yaml |generate_until |\n", - "|mgsm_direct_ru |lm_eval/tasks/mgsm/direct/mgsm_direct_ru.yaml |generate_until |\n", - "|mgsm_direct_sw |lm_eval/tasks/mgsm/direct/mgsm_direct_sw.yaml |generate_until |\n", - "|mgsm_direct_te |lm_eval/tasks/mgsm/direct/mgsm_direct_te.yaml |generate_until |\n", - "|mgsm_direct_th |lm_eval/tasks/mgsm/direct/mgsm_direct_th.yaml |generate_until |\n", - "|mgsm_direct_zh |lm_eval/tasks/mgsm/direct/mgsm_direct_zh.yaml |generate_until |\n", - "|mgsm_en_cot_bn |lm_eval/tasks/mgsm/en_cot/mgsm_en_cot_bn.yaml |generate_until |\n", - "|mgsm_en_cot_de |lm_eval/tasks/mgsm/en_cot/mgsm_en_cot_de.yaml |generate_until |\n", - "|mgsm_en_cot_en |lm_eval/tasks/mgsm/en_cot/mgsm_en_cot_en.yaml |generate_until |\n", - "|mgsm_en_cot_es |lm_eval/tasks/mgsm/en_cot/mgsm_en_cot_es.yaml |generate_until |\n", - "|mgsm_en_cot_fr |lm_eval/tasks/mgsm/en_cot/mgsm_en_cot_fr.yaml |generate_until |\n", - "|mgsm_en_cot_ja |lm_eval/tasks/mgsm/en_cot/mgsm_en_cot_ja.yaml |generate_until |\n", - "|mgsm_en_cot_ru |lm_eval/tasks/mgsm/en_cot/mgsm_en_cot_ru.yaml |generate_until |\n", - "|mgsm_en_cot_sw |lm_eval/tasks/mgsm/en_cot/mgsm_en_cot_sw.yaml |generate_until |\n", - "|mgsm_en_cot_te |lm_eval/tasks/mgsm/en_cot/mgsm_en_cot_te.yaml |generate_until |\n", - "|mgsm_en_cot_th |lm_eval/tasks/mgsm/en_cot/mgsm_en_cot_th.yaml |generate_until |\n", - "|mgsm_en_cot_zh |lm_eval/tasks/mgsm/en_cot/mgsm_en_cot_zh.yaml |generate_until |\n", - "|mgsm_native_cot_bn |lm_eval/tasks/mgsm/native_cot/mgsm_native_cot_bn.yaml |generate_until |\n", - "|mgsm_native_cot_de |lm_eval/tasks/mgsm/native_cot/mgsm_native_cot_de.yaml |generate_until |\n", - "|mgsm_native_cot_en |lm_eval/tasks/mgsm/native_cot/mgsm_native_cot_en.yaml |generate_until |\n", - "|mgsm_native_cot_es |lm_eval/tasks/mgsm/native_cot/mgsm_native_cot_es.yaml |generate_until |\n", - "|mgsm_native_cot_eu |lm_eval/tasks/basque_bench/mgsm_cot_native_eu.yaml |generate_until |\n", - "|mgsm_native_cot_fr |lm_eval/tasks/mgsm/native_cot/mgsm_native_cot_fr.yaml |generate_until |\n", - "|mgsm_native_cot_ja |lm_eval/tasks/mgsm/native_cot/mgsm_native_cot_ja.yaml |generate_until |\n", - "|mgsm_native_cot_ru |lm_eval/tasks/mgsm/native_cot/mgsm_native_cot_ru.yaml |generate_until |\n", - "|mgsm_native_cot_sw |lm_eval/tasks/mgsm/native_cot/mgsm_native_cot_sw.yaml |generate_until |\n", - "|mgsm_native_cot_te |lm_eval/tasks/mgsm/native_cot/mgsm_native_cot_te.yaml |generate_until |\n", - "|mgsm_native_cot_th |lm_eval/tasks/mgsm/native_cot/mgsm_native_cot_th.yaml |generate_until |\n", - "|mgsm_native_cot_zh |lm_eval/tasks/mgsm/native_cot/mgsm_native_cot_zh.yaml |generate_until |\n", - "|minerva_math_algebra |lm_eval/tasks/minerva_math/minerva_math_algebra.yaml |generate_until |\n", - "|minerva_math_counting_and_prob |lm_eval/tasks/minerva_math/minerva_math_counting_and_prob.yaml |generate_until |\n", - "|minerva_math_geometry |lm_eval/tasks/minerva_math/minerva_math_geometry.yaml |generate_until |\n", - "|minerva_math_intermediate_algebra |lm_eval/tasks/minerva_math/minerva_math_intermediate_algebra.yaml |generate_until |\n", - "|minerva_math_num_theory |lm_eval/tasks/minerva_math/minerva_math_num_theory.yaml |generate_until |\n", - "|minerva_math_prealgebra |lm_eval/tasks/minerva_math/minerva_math_prealgebra.yaml |generate_until |\n", - "|minerva_math_precalc |lm_eval/tasks/minerva_math/minerva_math_precalc.yaml |generate_until |\n", - "|mmlu_abstract_algebra |lm_eval/tasks/mmlu/default/mmlu_abstract_algebra.yaml |multiple_choice |\n", - "|mmlu_abstract_algebra_generative |lm_eval/tasks/mmlu/generative/mmlu_abstract_algebra.yaml |generate_until |\n", - "|mmlu_anatomy |lm_eval/tasks/mmlu/default/mmlu_anatomy.yaml |multiple_choice |\n", - "|mmlu_anatomy_generative |lm_eval/tasks/mmlu/generative/mmlu_anatomy.yaml |generate_until |\n", - "|mmlu_astronomy |lm_eval/tasks/mmlu/default/mmlu_astronomy.yaml |multiple_choice |\n", - "|mmlu_astronomy_generative |lm_eval/tasks/mmlu/generative/mmlu_astronomy.yaml |generate_until |\n", - "|mmlu_business_ethics |lm_eval/tasks/mmlu/default/mmlu_business_ethics.yaml |multiple_choice |\n", - "|mmlu_business_ethics_generative |lm_eval/tasks/mmlu/generative/mmlu_business_ethics.yaml |generate_until |\n", - "|mmlu_clinical_knowledge |lm_eval/tasks/mmlu/default/mmlu_clinical_knowledge.yaml |multiple_choice |\n", - "|mmlu_clinical_knowledge_generative |lm_eval/tasks/mmlu/generative/mmlu_clinical_knowledge.yaml |generate_until |\n", - "|mmlu_college_biology |lm_eval/tasks/mmlu/default/mmlu_college_biology.yaml |multiple_choice |\n", - "|mmlu_college_biology_generative |lm_eval/tasks/mmlu/generative/mmlu_college_biology.yaml |generate_until |\n", - "|mmlu_college_chemistry |lm_eval/tasks/mmlu/default/mmlu_college_chemistry.yaml |multiple_choice |\n", - "|mmlu_college_chemistry_generative |lm_eval/tasks/mmlu/generative/mmlu_college_chemistry.yaml |generate_until |\n", - "|mmlu_college_computer_science |lm_eval/tasks/mmlu/default/mmlu_college_computer_science.yaml |multiple_choice |\n", - "|mmlu_college_computer_science_generative |lm_eval/tasks/mmlu/generative/mmlu_college_computer_science.yaml |generate_until |\n", - "|mmlu_college_mathematics |lm_eval/tasks/mmlu/default/mmlu_college_mathematics.yaml |multiple_choice |\n", - "|mmlu_college_mathematics_generative |lm_eval/tasks/mmlu/generative/mmlu_college_mathematics.yaml |generate_until |\n", - "|mmlu_college_medicine |lm_eval/tasks/mmlu/default/mmlu_college_medicine.yaml |multiple_choice |\n", - "|mmlu_college_medicine_generative |lm_eval/tasks/mmlu/generative/mmlu_college_medicine.yaml |generate_until |\n", - "|mmlu_college_physics |lm_eval/tasks/mmlu/default/mmlu_college_physics.yaml |multiple_choice |\n", - "|mmlu_college_physics_generative |lm_eval/tasks/mmlu/generative/mmlu_college_physics.yaml |generate_until |\n", - "|mmlu_computer_security |lm_eval/tasks/mmlu/default/mmlu_computer_security.yaml |multiple_choice |\n", - "|mmlu_computer_security_generative |lm_eval/tasks/mmlu/generative/mmlu_computer_security.yaml |generate_until |\n", - "|mmlu_conceptual_physics |lm_eval/tasks/mmlu/default/mmlu_conceptual_physics.yaml |multiple_choice |\n", - "|mmlu_conceptual_physics_generative |lm_eval/tasks/mmlu/generative/mmlu_conceptual_physics.yaml |generate_until |\n", - "|mmlu_continuation_abstract_algebra |lm_eval/tasks/mmlu/continuation/mmlu_abstract_algebra.yaml |multiple_choice |\n", - "|mmlu_continuation_anatomy |lm_eval/tasks/mmlu/continuation/mmlu_anatomy.yaml |multiple_choice |\n", - "|mmlu_continuation_astronomy |lm_eval/tasks/mmlu/continuation/mmlu_astronomy.yaml |multiple_choice |\n", - "|mmlu_continuation_business_ethics |lm_eval/tasks/mmlu/continuation/mmlu_business_ethics.yaml |multiple_choice |\n", - "|mmlu_continuation_clinical_knowledge |lm_eval/tasks/mmlu/continuation/mmlu_clinical_knowledge.yaml |multiple_choice |\n", - "|mmlu_continuation_college_biology |lm_eval/tasks/mmlu/continuation/mmlu_college_biology.yaml |multiple_choice |\n", - "|mmlu_continuation_college_chemistry |lm_eval/tasks/mmlu/continuation/mmlu_college_chemistry.yaml |multiple_choice |\n", - "|mmlu_continuation_college_computer_science |lm_eval/tasks/mmlu/continuation/mmlu_college_computer_science.yaml |multiple_choice |\n", - "|mmlu_continuation_college_mathematics |lm_eval/tasks/mmlu/continuation/mmlu_college_mathematics.yaml |multiple_choice |\n", - "|mmlu_continuation_college_medicine |lm_eval/tasks/mmlu/continuation/mmlu_college_medicine.yaml |multiple_choice |\n", - "|mmlu_continuation_college_physics |lm_eval/tasks/mmlu/continuation/mmlu_college_physics.yaml |multiple_choice |\n", - "|mmlu_continuation_computer_security |lm_eval/tasks/mmlu/continuation/mmlu_computer_security.yaml |multiple_choice |\n", - "|mmlu_continuation_conceptual_physics |lm_eval/tasks/mmlu/continuation/mmlu_conceptual_physics.yaml |multiple_choice |\n", - "|mmlu_continuation_econometrics |lm_eval/tasks/mmlu/continuation/mmlu_econometrics.yaml |multiple_choice |\n", - "|mmlu_continuation_electrical_engineering |lm_eval/tasks/mmlu/continuation/mmlu_electrical_engineering.yaml |multiple_choice |\n", - "|mmlu_continuation_elementary_mathematics |lm_eval/tasks/mmlu/continuation/mmlu_elementary_mathematics.yaml |multiple_choice |\n", - "|mmlu_continuation_formal_logic |lm_eval/tasks/mmlu/continuation/mmlu_formal_logic.yaml |multiple_choice |\n", - "|mmlu_continuation_global_facts |lm_eval/tasks/mmlu/continuation/mmlu_global_facts.yaml |multiple_choice |\n", - "|mmlu_continuation_high_school_biology |lm_eval/tasks/mmlu/continuation/mmlu_high_school_biology.yaml |multiple_choice |\n", - "|mmlu_continuation_high_school_chemistry |lm_eval/tasks/mmlu/continuation/mmlu_high_school_chemistry.yaml |multiple_choice |\n", - "|mmlu_continuation_high_school_computer_science |lm_eval/tasks/mmlu/continuation/mmlu_high_school_computer_science.yaml |multiple_choice |\n", - "|mmlu_continuation_high_school_european_history |lm_eval/tasks/mmlu/continuation/mmlu_high_school_european_history.yaml |multiple_choice |\n", - "|mmlu_continuation_high_school_geography |lm_eval/tasks/mmlu/continuation/mmlu_high_school_geography.yaml |multiple_choice |\n", - "|mmlu_continuation_high_school_government_and_politics |lm_eval/tasks/mmlu/continuation/mmlu_high_school_government_and_politics.yaml |multiple_choice |\n", - "|mmlu_continuation_high_school_macroeconomics |lm_eval/tasks/mmlu/continuation/mmlu_high_school_macroeconomics.yaml |multiple_choice |\n", - "|mmlu_continuation_high_school_mathematics |lm_eval/tasks/mmlu/continuation/mmlu_high_school_mathematics.yaml |multiple_choice |\n", - "|mmlu_continuation_high_school_microeconomics |lm_eval/tasks/mmlu/continuation/mmlu_high_school_microeconomics.yaml |multiple_choice |\n", - "|mmlu_continuation_high_school_physics |lm_eval/tasks/mmlu/continuation/mmlu_high_school_physics.yaml |multiple_choice |\n", - "|mmlu_continuation_high_school_psychology |lm_eval/tasks/mmlu/continuation/mmlu_high_school_psychology.yaml |multiple_choice |\n", - "|mmlu_continuation_high_school_statistics |lm_eval/tasks/mmlu/continuation/mmlu_high_school_statistics.yaml |multiple_choice |\n", - "|mmlu_continuation_high_school_us_history |lm_eval/tasks/mmlu/continuation/mmlu_high_school_us_history.yaml |multiple_choice |\n", - "|mmlu_continuation_high_school_world_history |lm_eval/tasks/mmlu/continuation/mmlu_high_school_world_history.yaml |multiple_choice |\n", - "|mmlu_continuation_human_aging |lm_eval/tasks/mmlu/continuation/mmlu_human_aging.yaml |multiple_choice |\n", - "|mmlu_continuation_human_sexuality |lm_eval/tasks/mmlu/continuation/mmlu_human_sexuality.yaml |multiple_choice |\n", - "|mmlu_continuation_international_law |lm_eval/tasks/mmlu/continuation/mmlu_international_law.yaml |multiple_choice |\n", - "|mmlu_continuation_jurisprudence |lm_eval/tasks/mmlu/continuation/mmlu_jurisprudence.yaml |multiple_choice |\n", - "|mmlu_continuation_logical_fallacies |lm_eval/tasks/mmlu/continuation/mmlu_logical_fallacies.yaml |multiple_choice |\n", - "|mmlu_continuation_machine_learning |lm_eval/tasks/mmlu/continuation/mmlu_machine_learning.yaml |multiple_choice |\n", - "|mmlu_continuation_management |lm_eval/tasks/mmlu/continuation/mmlu_management.yaml |multiple_choice |\n", - "|mmlu_continuation_marketing |lm_eval/tasks/mmlu/continuation/mmlu_marketing.yaml |multiple_choice |\n", - "|mmlu_continuation_medical_genetics |lm_eval/tasks/mmlu/continuation/mmlu_medical_genetics.yaml |multiple_choice |\n", - "|mmlu_continuation_miscellaneous |lm_eval/tasks/mmlu/continuation/mmlu_miscellaneous.yaml |multiple_choice |\n", - "|mmlu_continuation_moral_disputes |lm_eval/tasks/mmlu/continuation/mmlu_moral_disputes.yaml |multiple_choice |\n", - "|mmlu_continuation_moral_scenarios |lm_eval/tasks/mmlu/continuation/mmlu_moral_scenarios.yaml |multiple_choice |\n", - "|mmlu_continuation_nutrition |lm_eval/tasks/mmlu/continuation/mmlu_nutrition.yaml |multiple_choice |\n", - "|mmlu_continuation_philosophy |lm_eval/tasks/mmlu/continuation/mmlu_philosophy.yaml |multiple_choice |\n", - "|mmlu_continuation_prehistory |lm_eval/tasks/mmlu/continuation/mmlu_prehistory.yaml |multiple_choice |\n", - "|mmlu_continuation_professional_accounting |lm_eval/tasks/mmlu/continuation/mmlu_professional_accounting.yaml |multiple_choice |\n", - "|mmlu_continuation_professional_law |lm_eval/tasks/mmlu/continuation/mmlu_professional_law.yaml |multiple_choice |\n", - "|mmlu_continuation_professional_medicine |lm_eval/tasks/mmlu/continuation/mmlu_professional_medicine.yaml |multiple_choice |\n", - "|mmlu_continuation_professional_psychology |lm_eval/tasks/mmlu/continuation/mmlu_professional_psychology.yaml |multiple_choice |\n", - "|mmlu_continuation_public_relations |lm_eval/tasks/mmlu/continuation/mmlu_public_relations.yaml |multiple_choice |\n", - "|mmlu_continuation_security_studies |lm_eval/tasks/mmlu/continuation/mmlu_security_studies.yaml |multiple_choice |\n", - "|mmlu_continuation_sociology |lm_eval/tasks/mmlu/continuation/mmlu_sociology.yaml |multiple_choice |\n", - "|mmlu_continuation_us_foreign_policy |lm_eval/tasks/mmlu/continuation/mmlu_us_foreign_policy.yaml |multiple_choice |\n", - "|mmlu_continuation_virology |lm_eval/tasks/mmlu/continuation/mmlu_virology.yaml |multiple_choice |\n", - "|mmlu_continuation_world_religions |lm_eval/tasks/mmlu/continuation/mmlu_world_religions.yaml |multiple_choice |\n", - "|mmlu_econometrics |lm_eval/tasks/mmlu/default/mmlu_econometrics.yaml |multiple_choice |\n", - "|mmlu_econometrics_generative |lm_eval/tasks/mmlu/generative/mmlu_econometrics.yaml |generate_until |\n", - "|mmlu_electrical_engineering |lm_eval/tasks/mmlu/default/mmlu_electrical_engineering.yaml |multiple_choice |\n", - "|mmlu_electrical_engineering_generative |lm_eval/tasks/mmlu/generative/mmlu_electrical_engineering.yaml |generate_until |\n", - "|mmlu_elementary_mathematics |lm_eval/tasks/mmlu/default/mmlu_elementary_mathematics.yaml |multiple_choice |\n", - "|mmlu_elementary_mathematics_generative |lm_eval/tasks/mmlu/generative/mmlu_elementary_mathematics.yaml |generate_until |\n", - "|mmlu_flan_cot_fewshot_abstract_algebra |lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_abstract_algebra.yaml |generate_until |\n", - "|mmlu_flan_cot_fewshot_anatomy |lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_anatomy.yaml |generate_until |\n", - "|mmlu_flan_cot_fewshot_astronomy |lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_astronomy.yaml |generate_until |\n", - "|mmlu_flan_cot_fewshot_business_ethics |lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_business_ethics.yaml |generate_until |\n", - "|mmlu_flan_cot_fewshot_clinical_knowledge |lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_clinical_knowledge.yaml |generate_until |\n", - "|mmlu_flan_cot_fewshot_college_biology |lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_college_biology.yaml |generate_until |\n", - "|mmlu_flan_cot_fewshot_college_chemistry |lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_college_chemistry.yaml |generate_until |\n", - "|mmlu_flan_cot_fewshot_college_computer_science |lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_college_computer_science.yaml |generate_until |\n", - "|mmlu_flan_cot_fewshot_college_mathematics |lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_college_mathematics.yaml |generate_until |\n", - "|mmlu_flan_cot_fewshot_college_medicine |lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_college_medicine.yaml |generate_until |\n", - "|mmlu_flan_cot_fewshot_college_physics |lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_college_physics.yaml |generate_until |\n", - "|mmlu_flan_cot_fewshot_computer_security |lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_computer_security.yaml |generate_until |\n", - "|mmlu_flan_cot_fewshot_conceptual_physics |lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_conceptual_physics.yaml |generate_until |\n", - "|mmlu_flan_cot_fewshot_econometrics |lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_econometrics.yaml |generate_until |\n", - "|mmlu_flan_cot_fewshot_electrical_engineering |lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_electrical_engineering.yaml |generate_until |\n", - "|mmlu_flan_cot_fewshot_elementary_mathematics |lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_elementary_mathematics.yaml |generate_until |\n", - "|mmlu_flan_cot_fewshot_formal_logic |lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_formal_logic.yaml |generate_until |\n", - "|mmlu_flan_cot_fewshot_global_facts |lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_global_facts.yaml |generate_until |\n", - "|mmlu_flan_cot_fewshot_high_school_biology |lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_high_school_biology.yaml |generate_until |\n", - "|mmlu_flan_cot_fewshot_high_school_chemistry |lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_high_school_chemistry.yaml |generate_until |\n", - "|mmlu_flan_cot_fewshot_high_school_computer_science |lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_high_school_computer_science.yaml |generate_until |\n", - "|mmlu_flan_cot_fewshot_high_school_european_history |lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_high_school_european_history.yaml |generate_until |\n", - "|mmlu_flan_cot_fewshot_high_school_geography |lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_high_school_geography.yaml |generate_until |\n", - "|mmlu_flan_cot_fewshot_high_school_government_and_politics |lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_high_school_government_and_politics.yaml |generate_until |\n", - "|mmlu_flan_cot_fewshot_high_school_macroeconomics |lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_high_school_macroeconomics.yaml |generate_until |\n", - "|mmlu_flan_cot_fewshot_high_school_mathematics |lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_high_school_mathematics.yaml |generate_until |\n", - "|mmlu_flan_cot_fewshot_high_school_microeconomics |lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_high_school_microeconomics.yaml |generate_until |\n", - "|mmlu_flan_cot_fewshot_high_school_physics |lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_high_school_physics.yaml |generate_until |\n", - "|mmlu_flan_cot_fewshot_high_school_psychology |lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_high_school_psychology.yaml |generate_until |\n", - "|mmlu_flan_cot_fewshot_high_school_statistics |lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_high_school_statistics.yaml |generate_until |\n", - "|mmlu_flan_cot_fewshot_high_school_us_history |lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_high_school_us_history.yaml |generate_until |\n", - "|mmlu_flan_cot_fewshot_high_school_world_history |lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_high_school_world_history.yaml |generate_until |\n", - "|mmlu_flan_cot_fewshot_human_aging |lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_human_aging.yaml |generate_until |\n", - "|mmlu_flan_cot_fewshot_human_sexuality |lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_human_sexuality.yaml |generate_until |\n", - "|mmlu_flan_cot_fewshot_international_law |lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_international_law.yaml |generate_until |\n", - "|mmlu_flan_cot_fewshot_jurisprudence |lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_jurisprudence.yaml |generate_until |\n", - "|mmlu_flan_cot_fewshot_logical_fallacies |lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_logical_fallacies.yaml |generate_until |\n", - "|mmlu_flan_cot_fewshot_machine_learning |lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_machine_learning.yaml |generate_until |\n", - "|mmlu_flan_cot_fewshot_management |lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_management.yaml |generate_until |\n", - "|mmlu_flan_cot_fewshot_marketing |lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_marketing.yaml |generate_until |\n", - "|mmlu_flan_cot_fewshot_medical_genetics |lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_medical_genetics.yaml |generate_until |\n", - "|mmlu_flan_cot_fewshot_miscellaneous |lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_miscellaneous.yaml |generate_until |\n", - "|mmlu_flan_cot_fewshot_moral_disputes |lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_moral_disputes.yaml |generate_until |\n", - "|mmlu_flan_cot_fewshot_moral_scenarios |lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_moral_scenarios.yaml |generate_until |\n", - "|mmlu_flan_cot_fewshot_nutrition |lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_nutrition.yaml |generate_until |\n", - "|mmlu_flan_cot_fewshot_philosophy |lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_philosophy.yaml |generate_until |\n", - "|mmlu_flan_cot_fewshot_prehistory |lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_prehistory.yaml |generate_until |\n", - "|mmlu_flan_cot_fewshot_professional_accounting |lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_professional_accounting.yaml |generate_until |\n", - "|mmlu_flan_cot_fewshot_professional_law |lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_professional_law.yaml |generate_until |\n", - "|mmlu_flan_cot_fewshot_professional_medicine |lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_professional_medicine.yaml |generate_until |\n", - "|mmlu_flan_cot_fewshot_professional_psychology |lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_professional_psychology.yaml |generate_until |\n", - "|mmlu_flan_cot_fewshot_public_relations |lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_public_relations.yaml |generate_until |\n", - "|mmlu_flan_cot_fewshot_security_studies |lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_security_studies.yaml |generate_until |\n", - "|mmlu_flan_cot_fewshot_sociology |lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_sociology.yaml |generate_until |\n", - "|mmlu_flan_cot_fewshot_us_foreign_policy |lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_us_foreign_policy.yaml |generate_until |\n", - "|mmlu_flan_cot_fewshot_virology |lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_virology.yaml |generate_until |\n", - "|mmlu_flan_cot_fewshot_world_religions |lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_world_religions.yaml |generate_until |\n", - "|mmlu_flan_cot_zeroshot_abstract_algebra |lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_abstract_algebra.yaml |generate_until |\n", - "|mmlu_flan_cot_zeroshot_anatomy |lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_anatomy.yaml |generate_until |\n", - "|mmlu_flan_cot_zeroshot_astronomy |lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_astronomy.yaml |generate_until |\n", - "|mmlu_flan_cot_zeroshot_business_ethics |lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_business_ethics.yaml |generate_until |\n", - "|mmlu_flan_cot_zeroshot_clinical_knowledge |lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_clinical_knowledge.yaml |generate_until |\n", - "|mmlu_flan_cot_zeroshot_college_biology |lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_college_biology.yaml |generate_until |\n", - "|mmlu_flan_cot_zeroshot_college_chemistry |lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_college_chemistry.yaml |generate_until |\n", - "|mmlu_flan_cot_zeroshot_college_computer_science |lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_college_computer_science.yaml |generate_until |\n", - "|mmlu_flan_cot_zeroshot_college_mathematics |lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_college_mathematics.yaml |generate_until |\n", - "|mmlu_flan_cot_zeroshot_college_medicine |lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_college_medicine.yaml |generate_until |\n", - "|mmlu_flan_cot_zeroshot_college_physics |lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_college_physics.yaml |generate_until |\n", - "|mmlu_flan_cot_zeroshot_computer_security |lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_computer_security.yaml |generate_until |\n", - "|mmlu_flan_cot_zeroshot_conceptual_physics |lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_conceptual_physics.yaml |generate_until |\n", - "|mmlu_flan_cot_zeroshot_econometrics |lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_econometrics.yaml |generate_until |\n", - "|mmlu_flan_cot_zeroshot_electrical_engineering |lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_electrical_engineering.yaml |generate_until |\n", - "|mmlu_flan_cot_zeroshot_elementary_mathematics |lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_elementary_mathematics.yaml |generate_until |\n", - "|mmlu_flan_cot_zeroshot_formal_logic |lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_formal_logic.yaml |generate_until |\n", - "|mmlu_flan_cot_zeroshot_global_facts |lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_global_facts.yaml |generate_until |\n", - "|mmlu_flan_cot_zeroshot_high_school_biology |lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_high_school_biology.yaml |generate_until |\n", - "|mmlu_flan_cot_zeroshot_high_school_chemistry |lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_high_school_chemistry.yaml |generate_until |\n", - "|mmlu_flan_cot_zeroshot_high_school_computer_science |lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_high_school_computer_science.yaml |generate_until |\n", - "|mmlu_flan_cot_zeroshot_high_school_european_history |lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_high_school_european_history.yaml |generate_until |\n", - "|mmlu_flan_cot_zeroshot_high_school_geography |lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_high_school_geography.yaml |generate_until |\n", - "|mmlu_flan_cot_zeroshot_high_school_government_and_politics |lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_high_school_government_and_politics.yaml |generate_until |\n", - "|mmlu_flan_cot_zeroshot_high_school_macroeconomics |lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_high_school_macroeconomics.yaml |generate_until |\n", - "|mmlu_flan_cot_zeroshot_high_school_mathematics |lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_high_school_mathematics.yaml |generate_until |\n", - "|mmlu_flan_cot_zeroshot_high_school_microeconomics |lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_high_school_microeconomics.yaml |generate_until |\n", - "|mmlu_flan_cot_zeroshot_high_school_physics |lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_high_school_physics.yaml |generate_until |\n", - "|mmlu_flan_cot_zeroshot_high_school_psychology |lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_high_school_psychology.yaml |generate_until |\n", - "|mmlu_flan_cot_zeroshot_high_school_statistics |lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_high_school_statistics.yaml |generate_until |\n", - "|mmlu_flan_cot_zeroshot_high_school_us_history |lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_high_school_us_history.yaml |generate_until |\n", - "|mmlu_flan_cot_zeroshot_high_school_world_history |lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_high_school_world_history.yaml |generate_until |\n", - "|mmlu_flan_cot_zeroshot_human_aging |lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_human_aging.yaml |generate_until |\n", - "|mmlu_flan_cot_zeroshot_human_sexuality |lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_human_sexuality.yaml |generate_until |\n", - "|mmlu_flan_cot_zeroshot_international_law |lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_international_law.yaml |generate_until |\n", - "|mmlu_flan_cot_zeroshot_jurisprudence |lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_jurisprudence.yaml |generate_until |\n", - "|mmlu_flan_cot_zeroshot_logical_fallacies |lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_logical_fallacies.yaml |generate_until |\n", - "|mmlu_flan_cot_zeroshot_machine_learning |lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_machine_learning.yaml |generate_until |\n", - "|mmlu_flan_cot_zeroshot_management |lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_management.yaml |generate_until |\n", - "|mmlu_flan_cot_zeroshot_marketing |lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_marketing.yaml |generate_until |\n", - "|mmlu_flan_cot_zeroshot_medical_genetics |lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_medical_genetics.yaml |generate_until |\n", - "|mmlu_flan_cot_zeroshot_miscellaneous |lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_miscellaneous.yaml |generate_until |\n", - "|mmlu_flan_cot_zeroshot_moral_disputes |lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_moral_disputes.yaml |generate_until |\n", - "|mmlu_flan_cot_zeroshot_moral_scenarios |lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_moral_scenarios.yaml |generate_until |\n", - "|mmlu_flan_cot_zeroshot_nutrition |lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_nutrition.yaml |generate_until |\n", - "|mmlu_flan_cot_zeroshot_philosophy |lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_philosophy.yaml |generate_until |\n", - "|mmlu_flan_cot_zeroshot_prehistory |lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_prehistory.yaml |generate_until |\n", - "|mmlu_flan_cot_zeroshot_professional_accounting |lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_professional_accounting.yaml |generate_until |\n", - "|mmlu_flan_cot_zeroshot_professional_law |lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_professional_law.yaml |generate_until |\n", - "|mmlu_flan_cot_zeroshot_professional_medicine |lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_professional_medicine.yaml |generate_until |\n", - "|mmlu_flan_cot_zeroshot_professional_psychology |lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_professional_psychology.yaml |generate_until |\n", - "|mmlu_flan_cot_zeroshot_public_relations |lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_public_relations.yaml |generate_until |\n", - "|mmlu_flan_cot_zeroshot_security_studies |lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_security_studies.yaml |generate_until |\n", - "|mmlu_flan_cot_zeroshot_sociology |lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_sociology.yaml |generate_until |\n", - "|mmlu_flan_cot_zeroshot_us_foreign_policy |lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_us_foreign_policy.yaml |generate_until |\n", - "|mmlu_flan_cot_zeroshot_virology |lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_virology.yaml |generate_until |\n", - "|mmlu_flan_cot_zeroshot_world_religions |lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_world_religions.yaml |generate_until |\n", - "|mmlu_flan_n_shot_generative_abstract_algebra |lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_abstract_algebra.yaml |generate_until |\n", - "|mmlu_flan_n_shot_generative_anatomy |lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_anatomy.yaml |generate_until |\n", - "|mmlu_flan_n_shot_generative_astronomy |lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_astronomy.yaml |generate_until |\n", - "|mmlu_flan_n_shot_generative_business_ethics |lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_business_ethics.yaml |generate_until |\n", - "|mmlu_flan_n_shot_generative_clinical_knowledge |lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_clinical_knowledge.yaml |generate_until |\n", - "|mmlu_flan_n_shot_generative_college_biology |lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_college_biology.yaml |generate_until |\n", - "|mmlu_flan_n_shot_generative_college_chemistry |lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_college_chemistry.yaml |generate_until |\n", - "|mmlu_flan_n_shot_generative_college_computer_science |lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_college_computer_science.yaml |generate_until |\n", - "|mmlu_flan_n_shot_generative_college_mathematics |lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_college_mathematics.yaml |generate_until |\n", - "|mmlu_flan_n_shot_generative_college_medicine |lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_college_medicine.yaml |generate_until |\n", - "|mmlu_flan_n_shot_generative_college_physics |lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_college_physics.yaml |generate_until |\n", - "|mmlu_flan_n_shot_generative_computer_security |lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_computer_security.yaml |generate_until |\n", - "|mmlu_flan_n_shot_generative_conceptual_physics |lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_conceptual_physics.yaml |generate_until |\n", - "|mmlu_flan_n_shot_generative_econometrics |lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_econometrics.yaml |generate_until |\n", - "|mmlu_flan_n_shot_generative_electrical_engineering |lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_electrical_engineering.yaml |generate_until |\n", - "|mmlu_flan_n_shot_generative_elementary_mathematics |lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_elementary_mathematics.yaml |generate_until |\n", - "|mmlu_flan_n_shot_generative_formal_logic |lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_formal_logic.yaml |generate_until |\n", - "|mmlu_flan_n_shot_generative_global_facts |lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_global_facts.yaml |generate_until |\n", - "|mmlu_flan_n_shot_generative_high_school_biology |lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_high_school_biology.yaml |generate_until |\n", - "|mmlu_flan_n_shot_generative_high_school_chemistry |lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_high_school_chemistry.yaml |generate_until |\n", - "|mmlu_flan_n_shot_generative_high_school_computer_science |lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_high_school_computer_science.yaml |generate_until |\n", - "|mmlu_flan_n_shot_generative_high_school_european_history |lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_high_school_european_history.yaml |generate_until |\n", - "|mmlu_flan_n_shot_generative_high_school_geography |lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_high_school_geography.yaml |generate_until |\n", - "|mmlu_flan_n_shot_generative_high_school_government_and_politics |lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_high_school_government_and_politics.yaml |generate_until |\n", - "|mmlu_flan_n_shot_generative_high_school_macroeconomics |lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_high_school_macroeconomics.yaml |generate_until |\n", - "|mmlu_flan_n_shot_generative_high_school_mathematics |lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_high_school_mathematics.yaml |generate_until |\n", - "|mmlu_flan_n_shot_generative_high_school_microeconomics |lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_high_school_microeconomics.yaml |generate_until |\n", - "|mmlu_flan_n_shot_generative_high_school_physics |lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_high_school_physics.yaml |generate_until |\n", - "|mmlu_flan_n_shot_generative_high_school_psychology |lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_high_school_psychology.yaml |generate_until |\n", - "|mmlu_flan_n_shot_generative_high_school_statistics |lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_high_school_statistics.yaml |generate_until |\n", - "|mmlu_flan_n_shot_generative_high_school_us_history |lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_high_school_us_history.yaml |generate_until |\n", - "|mmlu_flan_n_shot_generative_high_school_world_history |lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_high_school_world_history.yaml |generate_until |\n", - "|mmlu_flan_n_shot_generative_human_aging |lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_human_aging.yaml |generate_until |\n", - "|mmlu_flan_n_shot_generative_human_sexuality |lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_human_sexuality.yaml |generate_until |\n", - "|mmlu_flan_n_shot_generative_international_law |lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_international_law.yaml |generate_until |\n", - "|mmlu_flan_n_shot_generative_jurisprudence |lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_jurisprudence.yaml |generate_until |\n", - "|mmlu_flan_n_shot_generative_logical_fallacies |lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_logical_fallacies.yaml |generate_until |\n", - "|mmlu_flan_n_shot_generative_machine_learning |lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_machine_learning.yaml |generate_until |\n", - "|mmlu_flan_n_shot_generative_management |lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_management.yaml |generate_until |\n", - "|mmlu_flan_n_shot_generative_marketing |lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_marketing.yaml |generate_until |\n", - "|mmlu_flan_n_shot_generative_medical_genetics |lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_medical_genetics.yaml |generate_until |\n", - "|mmlu_flan_n_shot_generative_miscellaneous |lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_miscellaneous.yaml |generate_until |\n", - "|mmlu_flan_n_shot_generative_moral_disputes |lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_moral_disputes.yaml |generate_until |\n", - "|mmlu_flan_n_shot_generative_moral_scenarios |lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_moral_scenarios.yaml |generate_until |\n", - "|mmlu_flan_n_shot_generative_nutrition |lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_nutrition.yaml |generate_until |\n", - "|mmlu_flan_n_shot_generative_philosophy |lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_philosophy.yaml |generate_until |\n", - "|mmlu_flan_n_shot_generative_prehistory |lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_prehistory.yaml |generate_until |\n", - "|mmlu_flan_n_shot_generative_professional_accounting |lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_professional_accounting.yaml |generate_until |\n", - "|mmlu_flan_n_shot_generative_professional_law |lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_professional_law.yaml |generate_until |\n", - "|mmlu_flan_n_shot_generative_professional_medicine |lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_professional_medicine.yaml |generate_until |\n", - "|mmlu_flan_n_shot_generative_professional_psychology |lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_professional_psychology.yaml |generate_until |\n", - "|mmlu_flan_n_shot_generative_public_relations |lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_public_relations.yaml |generate_until |\n", - "|mmlu_flan_n_shot_generative_security_studies |lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_security_studies.yaml |generate_until |\n", - "|mmlu_flan_n_shot_generative_sociology |lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_sociology.yaml |generate_until |\n", - "|mmlu_flan_n_shot_generative_us_foreign_policy |lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_us_foreign_policy.yaml |generate_until |\n", - "|mmlu_flan_n_shot_generative_virology |lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_virology.yaml |generate_until |\n", - "|mmlu_flan_n_shot_generative_world_religions |lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_world_religions.yaml |generate_until |\n", - "|mmlu_flan_n_shot_loglikelihood_abstract_algebra |lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_abstract_algebra.yaml |multiple_choice |\n", - "|mmlu_flan_n_shot_loglikelihood_anatomy |lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_anatomy.yaml |multiple_choice |\n", - "|mmlu_flan_n_shot_loglikelihood_astronomy |lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_astronomy.yaml |multiple_choice |\n", - "|mmlu_flan_n_shot_loglikelihood_business_ethics |lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_business_ethics.yaml |multiple_choice |\n", - "|mmlu_flan_n_shot_loglikelihood_clinical_knowledge |lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_clinical_knowledge.yaml |multiple_choice |\n", - "|mmlu_flan_n_shot_loglikelihood_college_biology |lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_college_biology.yaml |multiple_choice |\n", - "|mmlu_flan_n_shot_loglikelihood_college_chemistry |lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_college_chemistry.yaml |multiple_choice |\n", - "|mmlu_flan_n_shot_loglikelihood_college_computer_science |lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_college_computer_science.yaml |multiple_choice |\n", - "|mmlu_flan_n_shot_loglikelihood_college_mathematics |lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_college_mathematics.yaml |multiple_choice |\n", - "|mmlu_flan_n_shot_loglikelihood_college_medicine |lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_college_medicine.yaml |multiple_choice |\n", - "|mmlu_flan_n_shot_loglikelihood_college_physics |lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_college_physics.yaml |multiple_choice |\n", - "|mmlu_flan_n_shot_loglikelihood_computer_security |lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_computer_security.yaml |multiple_choice |\n", - "|mmlu_flan_n_shot_loglikelihood_conceptual_physics |lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_conceptual_physics.yaml |multiple_choice |\n", - "|mmlu_flan_n_shot_loglikelihood_econometrics |lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_econometrics.yaml |multiple_choice |\n", - "|mmlu_flan_n_shot_loglikelihood_electrical_engineering |lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_electrical_engineering.yaml |multiple_choice |\n", - "|mmlu_flan_n_shot_loglikelihood_elementary_mathematics |lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_elementary_mathematics.yaml |multiple_choice |\n", - "|mmlu_flan_n_shot_loglikelihood_formal_logic |lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_formal_logic.yaml |multiple_choice |\n", - "|mmlu_flan_n_shot_loglikelihood_global_facts |lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_global_facts.yaml |multiple_choice |\n", - "|mmlu_flan_n_shot_loglikelihood_high_school_biology |lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_high_school_biology.yaml |multiple_choice |\n", - "|mmlu_flan_n_shot_loglikelihood_high_school_chemistry |lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_high_school_chemistry.yaml |multiple_choice |\n", - "|mmlu_flan_n_shot_loglikelihood_high_school_computer_science |lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_high_school_computer_science.yaml |multiple_choice |\n", - "|mmlu_flan_n_shot_loglikelihood_high_school_european_history |lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_high_school_european_history.yaml |multiple_choice |\n", - "|mmlu_flan_n_shot_loglikelihood_high_school_geography |lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_high_school_geography.yaml |multiple_choice |\n", - "|mmlu_flan_n_shot_loglikelihood_high_school_government_and_politics |lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_high_school_government_and_politics.yaml |multiple_choice |\n", - "|mmlu_flan_n_shot_loglikelihood_high_school_macroeconomics |lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_high_school_macroeconomics.yaml |multiple_choice |\n", - "|mmlu_flan_n_shot_loglikelihood_high_school_mathematics |lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_high_school_mathematics.yaml |multiple_choice |\n", - "|mmlu_flan_n_shot_loglikelihood_high_school_microeconomics |lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_high_school_microeconomics.yaml |multiple_choice |\n", - "|mmlu_flan_n_shot_loglikelihood_high_school_physics |lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_high_school_physics.yaml |multiple_choice |\n", - "|mmlu_flan_n_shot_loglikelihood_high_school_psychology |lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_high_school_psychology.yaml |multiple_choice |\n", - "|mmlu_flan_n_shot_loglikelihood_high_school_statistics |lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_high_school_statistics.yaml |multiple_choice |\n", - "|mmlu_flan_n_shot_loglikelihood_high_school_us_history |lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_high_school_us_history.yaml |multiple_choice |\n", - "|mmlu_flan_n_shot_loglikelihood_high_school_world_history |lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_high_school_world_history.yaml |multiple_choice |\n", - "|mmlu_flan_n_shot_loglikelihood_human_aging |lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_human_aging.yaml |multiple_choice |\n", - "|mmlu_flan_n_shot_loglikelihood_human_sexuality |lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_human_sexuality.yaml |multiple_choice |\n", - "|mmlu_flan_n_shot_loglikelihood_international_law |lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_international_law.yaml |multiple_choice |\n", - "|mmlu_flan_n_shot_loglikelihood_jurisprudence |lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_jurisprudence.yaml |multiple_choice |\n", - "|mmlu_flan_n_shot_loglikelihood_logical_fallacies |lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_logical_fallacies.yaml |multiple_choice |\n", - "|mmlu_flan_n_shot_loglikelihood_machine_learning |lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_machine_learning.yaml |multiple_choice |\n", - "|mmlu_flan_n_shot_loglikelihood_management |lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_management.yaml |multiple_choice |\n", - "|mmlu_flan_n_shot_loglikelihood_marketing |lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_marketing.yaml |multiple_choice |\n", - "|mmlu_flan_n_shot_loglikelihood_medical_genetics |lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_medical_genetics.yaml |multiple_choice |\n", - "|mmlu_flan_n_shot_loglikelihood_miscellaneous |lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_miscellaneous.yaml |multiple_choice |\n", - "|mmlu_flan_n_shot_loglikelihood_moral_disputes |lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_moral_disputes.yaml |multiple_choice |\n", - "|mmlu_flan_n_shot_loglikelihood_moral_scenarios |lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_moral_scenarios.yaml |multiple_choice |\n", - "|mmlu_flan_n_shot_loglikelihood_nutrition |lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_nutrition.yaml |multiple_choice |\n", - "|mmlu_flan_n_shot_loglikelihood_philosophy |lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_philosophy.yaml |multiple_choice |\n", - "|mmlu_flan_n_shot_loglikelihood_prehistory |lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_prehistory.yaml |multiple_choice |\n", - "|mmlu_flan_n_shot_loglikelihood_professional_accounting |lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_professional_accounting.yaml |multiple_choice |\n", - "|mmlu_flan_n_shot_loglikelihood_professional_law |lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_professional_law.yaml |multiple_choice |\n", - "|mmlu_flan_n_shot_loglikelihood_professional_medicine |lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_professional_medicine.yaml |multiple_choice |\n", - "|mmlu_flan_n_shot_loglikelihood_professional_psychology |lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_professional_psychology.yaml |multiple_choice |\n", - "|mmlu_flan_n_shot_loglikelihood_public_relations |lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_public_relations.yaml |multiple_choice |\n", - "|mmlu_flan_n_shot_loglikelihood_security_studies |lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_security_studies.yaml |multiple_choice |\n", - "|mmlu_flan_n_shot_loglikelihood_sociology |lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_sociology.yaml |multiple_choice |\n", - "|mmlu_flan_n_shot_loglikelihood_us_foreign_policy |lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_us_foreign_policy.yaml |multiple_choice |\n", - "|mmlu_flan_n_shot_loglikelihood_virology |lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_virology.yaml |multiple_choice |\n", - "|mmlu_flan_n_shot_loglikelihood_world_religions |lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_world_religions.yaml |multiple_choice |\n", - "|mmlu_formal_logic |lm_eval/tasks/mmlu/default/mmlu_formal_logic.yaml |multiple_choice |\n", - "|mmlu_formal_logic_generative |lm_eval/tasks/mmlu/generative/mmlu_formal_logic.yaml |generate_until |\n", - "|mmlu_global_facts |lm_eval/tasks/mmlu/default/mmlu_global_facts.yaml |multiple_choice |\n", - "|mmlu_global_facts_generative |lm_eval/tasks/mmlu/generative/mmlu_global_facts.yaml |generate_until |\n", - "|mmlu_high_school_biology |lm_eval/tasks/mmlu/default/mmlu_high_school_biology.yaml |multiple_choice |\n", - "|mmlu_high_school_biology_generative |lm_eval/tasks/mmlu/generative/mmlu_high_school_biology.yaml |generate_until |\n", - "|mmlu_high_school_chemistry |lm_eval/tasks/mmlu/default/mmlu_high_school_chemistry.yaml |multiple_choice |\n", - "|mmlu_high_school_chemistry_generative |lm_eval/tasks/mmlu/generative/mmlu_high_school_chemistry.yaml |generate_until |\n", - "|mmlu_high_school_computer_science |lm_eval/tasks/mmlu/default/mmlu_high_school_computer_science.yaml |multiple_choice |\n", - "|mmlu_high_school_computer_science_generative |lm_eval/tasks/mmlu/generative/mmlu_high_school_computer_science.yaml |generate_until |\n", - "|mmlu_high_school_european_history |lm_eval/tasks/mmlu/default/mmlu_high_school_european_history.yaml |multiple_choice |\n", - "|mmlu_high_school_european_history_generative |lm_eval/tasks/mmlu/generative/mmlu_high_school_european_history.yaml |generate_until |\n", - "|mmlu_high_school_geography |lm_eval/tasks/mmlu/default/mmlu_high_school_geography.yaml |multiple_choice |\n", - "|mmlu_high_school_geography_generative |lm_eval/tasks/mmlu/generative/mmlu_high_school_geography.yaml |generate_until |\n", - "|mmlu_high_school_government_and_politics |lm_eval/tasks/mmlu/default/mmlu_high_school_government_and_politics.yaml |multiple_choice |\n", - "|mmlu_high_school_government_and_politics_generative |lm_eval/tasks/mmlu/generative/mmlu_high_school_government_and_politics.yaml |generate_until |\n", - "|mmlu_high_school_macroeconomics |lm_eval/tasks/mmlu/default/mmlu_high_school_macroeconomics.yaml |multiple_choice |\n", - "|mmlu_high_school_macroeconomics_generative |lm_eval/tasks/mmlu/generative/mmlu_high_school_macroeconomics.yaml |generate_until |\n", - "|mmlu_high_school_mathematics |lm_eval/tasks/mmlu/default/mmlu_high_school_mathematics.yaml |multiple_choice |\n", - "|mmlu_high_school_mathematics_generative |lm_eval/tasks/mmlu/generative/mmlu_high_school_mathematics.yaml |generate_until |\n", - "|mmlu_high_school_microeconomics |lm_eval/tasks/mmlu/default/mmlu_high_school_microeconomics.yaml |multiple_choice |\n", - "|mmlu_high_school_microeconomics_generative |lm_eval/tasks/mmlu/generative/mmlu_high_school_microeconomics.yaml |generate_until |\n", - "|mmlu_high_school_physics |lm_eval/tasks/mmlu/default/mmlu_high_school_physics.yaml |multiple_choice |\n", - "|mmlu_high_school_physics_generative |lm_eval/tasks/mmlu/generative/mmlu_high_school_physics.yaml |generate_until |\n", - "|mmlu_high_school_psychology |lm_eval/tasks/mmlu/default/mmlu_high_school_psychology.yaml |multiple_choice |\n", - "|mmlu_high_school_psychology_generative |lm_eval/tasks/mmlu/generative/mmlu_high_school_psychology.yaml |generate_until |\n", - "|mmlu_high_school_statistics |lm_eval/tasks/mmlu/default/mmlu_high_school_statistics.yaml |multiple_choice |\n", - "|mmlu_high_school_statistics_generative |lm_eval/tasks/mmlu/generative/mmlu_high_school_statistics.yaml |generate_until |\n", - "|mmlu_high_school_us_history |lm_eval/tasks/mmlu/default/mmlu_high_school_us_history.yaml |multiple_choice |\n", - "|mmlu_high_school_us_history_generative |lm_eval/tasks/mmlu/generative/mmlu_high_school_us_history.yaml |generate_until |\n", - "|mmlu_high_school_world_history |lm_eval/tasks/mmlu/default/mmlu_high_school_world_history.yaml |multiple_choice |\n", - "|mmlu_high_school_world_history_generative |lm_eval/tasks/mmlu/generative/mmlu_high_school_world_history.yaml |generate_until |\n", - "|mmlu_human_aging |lm_eval/tasks/mmlu/default/mmlu_human_aging.yaml |multiple_choice |\n", - "|mmlu_human_aging_generative |lm_eval/tasks/mmlu/generative/mmlu_human_aging.yaml |generate_until |\n", - "|mmlu_human_sexuality |lm_eval/tasks/mmlu/default/mmlu_human_sexuality.yaml |multiple_choice |\n", - "|mmlu_human_sexuality_generative |lm_eval/tasks/mmlu/generative/mmlu_human_sexuality.yaml |generate_until |\n", - "|mmlu_international_law |lm_eval/tasks/mmlu/default/mmlu_international_law.yaml |multiple_choice |\n", - "|mmlu_international_law_generative |lm_eval/tasks/mmlu/generative/mmlu_international_law.yaml |generate_until |\n", - "|mmlu_jurisprudence |lm_eval/tasks/mmlu/default/mmlu_jurisprudence.yaml |multiple_choice |\n", - "|mmlu_jurisprudence_generative |lm_eval/tasks/mmlu/generative/mmlu_jurisprudence.yaml |generate_until |\n", - "|mmlu_logical_fallacies |lm_eval/tasks/mmlu/default/mmlu_logical_fallacies.yaml |multiple_choice |\n", - "|mmlu_logical_fallacies_generative |lm_eval/tasks/mmlu/generative/mmlu_logical_fallacies.yaml |generate_until |\n", - "|mmlu_machine_learning |lm_eval/tasks/mmlu/default/mmlu_machine_learning.yaml |multiple_choice |\n", - "|mmlu_machine_learning_generative |lm_eval/tasks/mmlu/generative/mmlu_machine_learning.yaml |generate_until |\n", - "|mmlu_management |lm_eval/tasks/mmlu/default/mmlu_management.yaml |multiple_choice |\n", - "|mmlu_management_generative |lm_eval/tasks/mmlu/generative/mmlu_management.yaml |generate_until |\n", - "|mmlu_marketing |lm_eval/tasks/mmlu/default/mmlu_marketing.yaml |multiple_choice |\n", - "|mmlu_marketing_generative |lm_eval/tasks/mmlu/generative/mmlu_marketing.yaml |generate_until |\n", - "|mmlu_medical_genetics |lm_eval/tasks/mmlu/default/mmlu_medical_genetics.yaml |multiple_choice |\n", - "|mmlu_medical_genetics_generative |lm_eval/tasks/mmlu/generative/mmlu_medical_genetics.yaml |generate_until |\n", - "|mmlu_miscellaneous |lm_eval/tasks/mmlu/default/mmlu_miscellaneous.yaml |multiple_choice |\n", - "|mmlu_miscellaneous_generative |lm_eval/tasks/mmlu/generative/mmlu_miscellaneous.yaml |generate_until |\n", - "|mmlu_moral_disputes |lm_eval/tasks/mmlu/default/mmlu_moral_disputes.yaml |multiple_choice |\n", - "|mmlu_moral_disputes_generative |lm_eval/tasks/mmlu/generative/mmlu_moral_disputes.yaml |generate_until |\n", - "|mmlu_moral_scenarios |lm_eval/tasks/mmlu/default/mmlu_moral_scenarios.yaml |multiple_choice |\n", - "|mmlu_moral_scenarios_generative |lm_eval/tasks/mmlu/generative/mmlu_moral_scenarios.yaml |generate_until |\n", - "|mmlu_nutrition |lm_eval/tasks/mmlu/default/mmlu_nutrition.yaml |multiple_choice |\n", - "|mmlu_nutrition_generative |lm_eval/tasks/mmlu/generative/mmlu_nutrition.yaml |generate_until |\n", - "|mmlu_philosophy |lm_eval/tasks/mmlu/default/mmlu_philosophy.yaml |multiple_choice |\n", - "|mmlu_philosophy_generative |lm_eval/tasks/mmlu/generative/mmlu_philosophy.yaml |generate_until |\n", - "|mmlu_prehistory |lm_eval/tasks/mmlu/default/mmlu_prehistory.yaml |multiple_choice |\n", - "|mmlu_prehistory_generative |lm_eval/tasks/mmlu/generative/mmlu_prehistory.yaml |generate_until |\n", - "|mmlu_pro_biology |lm_eval/tasks/mmlu_pro/mmlu_pro_biology.yaml |generate_until |\n", - "|mmlu_pro_business |lm_eval/tasks/mmlu_pro/mmlu_pro_business.yaml |generate_until |\n", - "|mmlu_pro_chemistry |lm_eval/tasks/mmlu_pro/mmlu_pro_chemistry.yaml |generate_until |\n", - "|mmlu_pro_computer_science |lm_eval/tasks/mmlu_pro/mmlu_pro_computer_science.yaml |generate_until |\n", - "|mmlu_pro_economics |lm_eval/tasks/mmlu_pro/mmlu_pro_economics.yaml |generate_until |\n", - "|mmlu_pro_engineering |lm_eval/tasks/mmlu_pro/mmlu_pro_engineering.yaml |generate_until |\n", - "|mmlu_pro_health |lm_eval/tasks/mmlu_pro/mmlu_pro_health.yaml |generate_until |\n", - "|mmlu_pro_history |lm_eval/tasks/mmlu_pro/mmlu_pro_history.yaml |generate_until |\n", - "|mmlu_pro_law |lm_eval/tasks/mmlu_pro/mmlu_pro_law.yaml |generate_until |\n", - "|mmlu_pro_math |lm_eval/tasks/mmlu_pro/mmlu_pro_math.yaml |generate_until |\n", - "|mmlu_pro_other |lm_eval/tasks/mmlu_pro/mmlu_pro_other.yaml |generate_until |\n", - "|mmlu_pro_philosophy |lm_eval/tasks/mmlu_pro/mmlu_pro_philosophy.yaml |generate_until |\n", - "|mmlu_pro_physics |lm_eval/tasks/mmlu_pro/mmlu_pro_physics.yaml |generate_until |\n", - "|mmlu_pro_psychology |lm_eval/tasks/mmlu_pro/mmlu_pro_psychology.yaml |generate_until |\n", - "|mmlu_professional_accounting |lm_eval/tasks/mmlu/default/mmlu_professional_accounting.yaml |multiple_choice |\n", - "|mmlu_professional_accounting_generative |lm_eval/tasks/mmlu/generative/mmlu_professional_accounting.yaml |generate_until |\n", - "|mmlu_professional_law |lm_eval/tasks/mmlu/default/mmlu_professional_law.yaml |multiple_choice |\n", - "|mmlu_professional_law_generative |lm_eval/tasks/mmlu/generative/mmlu_professional_law.yaml |generate_until |\n", - "|mmlu_professional_medicine |lm_eval/tasks/mmlu/default/mmlu_professional_medicine.yaml |multiple_choice |\n", - "|mmlu_professional_medicine_generative |lm_eval/tasks/mmlu/generative/mmlu_professional_medicine.yaml |generate_until |\n", - "|mmlu_professional_psychology |lm_eval/tasks/mmlu/default/mmlu_professional_psychology.yaml |multiple_choice |\n", - "|mmlu_professional_psychology_generative |lm_eval/tasks/mmlu/generative/mmlu_professional_psychology.yaml |generate_until |\n", - "|mmlu_public_relations |lm_eval/tasks/mmlu/default/mmlu_public_relations.yaml |multiple_choice |\n", - "|mmlu_public_relations_generative |lm_eval/tasks/mmlu/generative/mmlu_public_relations.yaml |generate_until |\n", - "|mmlu_security_studies |lm_eval/tasks/mmlu/default/mmlu_security_studies.yaml |multiple_choice |\n", - "|mmlu_security_studies_generative |lm_eval/tasks/mmlu/generative/mmlu_security_studies.yaml |generate_until |\n", - "|mmlu_sociology |lm_eval/tasks/mmlu/default/mmlu_sociology.yaml |multiple_choice |\n", - "|mmlu_sociology_generative |lm_eval/tasks/mmlu/generative/mmlu_sociology.yaml |generate_until |\n", - "|mmlu_us_foreign_policy |lm_eval/tasks/mmlu/default/mmlu_us_foreign_policy.yaml |multiple_choice |\n", - "|mmlu_us_foreign_policy_generative |lm_eval/tasks/mmlu/generative/mmlu_us_foreign_policy.yaml |generate_until |\n", - "|mmlu_virology |lm_eval/tasks/mmlu/default/mmlu_virology.yaml |multiple_choice |\n", - "|mmlu_virology_generative |lm_eval/tasks/mmlu/generative/mmlu_virology.yaml |generate_until |\n", - "|mmlu_world_religions |lm_eval/tasks/mmlu/default/mmlu_world_religions.yaml |multiple_choice |\n", - "|mmlu_world_religions_generative |lm_eval/tasks/mmlu/generative/mmlu_world_religions.yaml |generate_until |\n", - "|mmlusr_answer_only_abstract_algebra |lm_eval/tasks/mmlusr/answer_only/answer_only_abstract_algebra.yaml |multiple_choice |\n", - "|mmlusr_answer_only_anatomy |lm_eval/tasks/mmlusr/answer_only/answer_only_anatomy.yaml |multiple_choice |\n", - "|mmlusr_answer_only_astronomy |lm_eval/tasks/mmlusr/answer_only/answer_only_astronomy.yaml |multiple_choice |\n", - "|mmlusr_answer_only_business_ethics |lm_eval/tasks/mmlusr/answer_only/answer_only_business_ethics.yaml |multiple_choice |\n", - "|mmlusr_answer_only_clinical_knowledge |lm_eval/tasks/mmlusr/answer_only/answer_only_clinical_knowledge.yaml |multiple_choice |\n", - "|mmlusr_answer_only_college_biology |lm_eval/tasks/mmlusr/answer_only/answer_only_college_biology.yaml |multiple_choice |\n", - "|mmlusr_answer_only_college_chemistry |lm_eval/tasks/mmlusr/answer_only/answer_only_college_chemistry.yaml |multiple_choice |\n", - "|mmlusr_answer_only_college_computer_science |lm_eval/tasks/mmlusr/answer_only/answer_only_college_computer_science.yaml |multiple_choice |\n", - "|mmlusr_answer_only_college_mathematics |lm_eval/tasks/mmlusr/answer_only/answer_only_college_mathematics.yaml |multiple_choice |\n", - "|mmlusr_answer_only_college_medicine |lm_eval/tasks/mmlusr/answer_only/answer_only_college_medicine.yaml |multiple_choice |\n", - "|mmlusr_answer_only_college_physics |lm_eval/tasks/mmlusr/answer_only/answer_only_college_physics.yaml |multiple_choice |\n", - "|mmlusr_answer_only_computer_security |lm_eval/tasks/mmlusr/answer_only/answer_only_computer_security.yaml |multiple_choice |\n", - "|mmlusr_answer_only_conceptual_physics |lm_eval/tasks/mmlusr/answer_only/answer_only_conceptual_physics.yaml |multiple_choice |\n", - "|mmlusr_answer_only_econometrics |lm_eval/tasks/mmlusr/answer_only/answer_only_econometrics.yaml |multiple_choice |\n", - "|mmlusr_answer_only_electrical_engineering |lm_eval/tasks/mmlusr/answer_only/answer_only_electrical_engineering.yaml |multiple_choice |\n", - "|mmlusr_answer_only_elementary_mathematics |lm_eval/tasks/mmlusr/answer_only/answer_only_elementary_mathematics.yaml |multiple_choice |\n", - "|mmlusr_answer_only_formal_logic |lm_eval/tasks/mmlusr/answer_only/answer_only_formal_logic.yaml |multiple_choice |\n", - "|mmlusr_answer_only_global_facts |lm_eval/tasks/mmlusr/answer_only/answer_only_global_facts.yaml |multiple_choice |\n", - "|mmlusr_answer_only_high_school_biology |lm_eval/tasks/mmlusr/answer_only/answer_only_high_school_biology.yaml |multiple_choice |\n", - "|mmlusr_answer_only_high_school_chemistry |lm_eval/tasks/mmlusr/answer_only/answer_only_high_school_chemistry.yaml |multiple_choice |\n", - "|mmlusr_answer_only_high_school_computer_science |lm_eval/tasks/mmlusr/answer_only/answer_only_high_school_computer_science.yaml |multiple_choice |\n", - "|mmlusr_answer_only_high_school_european_history |lm_eval/tasks/mmlusr/answer_only/answer_only_high_school_european_history.yaml |multiple_choice |\n", - "|mmlusr_answer_only_high_school_geography |lm_eval/tasks/mmlusr/answer_only/answer_only_high_school_geography.yaml |multiple_choice |\n", - "|mmlusr_answer_only_high_school_government_and_politics |lm_eval/tasks/mmlusr/answer_only/answer_only_high_school_government_and_politics.yaml |multiple_choice |\n", - "|mmlusr_answer_only_high_school_macroeconomics |lm_eval/tasks/mmlusr/answer_only/answer_only_high_school_macroeconomics.yaml |multiple_choice |\n", - "|mmlusr_answer_only_high_school_mathematics |lm_eval/tasks/mmlusr/answer_only/answer_only_high_school_mathematics.yaml |multiple_choice |\n", - "|mmlusr_answer_only_high_school_microeconomics |lm_eval/tasks/mmlusr/answer_only/answer_only_high_school_microeconomics.yaml |multiple_choice |\n", - "|mmlusr_answer_only_high_school_physics |lm_eval/tasks/mmlusr/answer_only/answer_only_high_school_physics.yaml |multiple_choice |\n", - "|mmlusr_answer_only_high_school_psychology |lm_eval/tasks/mmlusr/answer_only/answer_only_high_school_psychology.yaml |multiple_choice |\n", - "|mmlusr_answer_only_high_school_statistics |lm_eval/tasks/mmlusr/answer_only/answer_only_high_school_statistics.yaml |multiple_choice |\n", - "|mmlusr_answer_only_high_school_us_history |lm_eval/tasks/mmlusr/answer_only/answer_only_high_school_us_history.yaml |multiple_choice |\n", - "|mmlusr_answer_only_high_school_world_history |lm_eval/tasks/mmlusr/answer_only/answer_only_high_school_world_history.yaml |multiple_choice |\n", - "|mmlusr_answer_only_human_aging |lm_eval/tasks/mmlusr/answer_only/answer_only_human_aging.yaml |multiple_choice |\n", - "|mmlusr_answer_only_human_sexuality |lm_eval/tasks/mmlusr/answer_only/answer_only_human_sexuality.yaml |multiple_choice |\n", - "|mmlusr_answer_only_international_law |lm_eval/tasks/mmlusr/answer_only/answer_only_international_law.yaml |multiple_choice |\n", - "|mmlusr_answer_only_jurisprudence |lm_eval/tasks/mmlusr/answer_only/answer_only_jurisprudence.yaml |multiple_choice |\n", - "|mmlusr_answer_only_logical_fallacies |lm_eval/tasks/mmlusr/answer_only/answer_only_logical_fallacies.yaml |multiple_choice |\n", - "|mmlusr_answer_only_machine_learning |lm_eval/tasks/mmlusr/answer_only/answer_only_machine_learning.yaml |multiple_choice |\n", - "|mmlusr_answer_only_management |lm_eval/tasks/mmlusr/answer_only/answer_only_management.yaml |multiple_choice |\n", - "|mmlusr_answer_only_marketing |lm_eval/tasks/mmlusr/answer_only/answer_only_marketing.yaml |multiple_choice |\n", - "|mmlusr_answer_only_medical_genetics |lm_eval/tasks/mmlusr/answer_only/answer_only_medical_genetics.yaml |multiple_choice |\n", - "|mmlusr_answer_only_miscellaneous |lm_eval/tasks/mmlusr/answer_only/answer_only_miscellaneous.yaml |multiple_choice |\n", - "|mmlusr_answer_only_moral_disputes |lm_eval/tasks/mmlusr/answer_only/answer_only_moral_disputes.yaml |multiple_choice |\n", - "|mmlusr_answer_only_moral_scenarios |lm_eval/tasks/mmlusr/answer_only/answer_only_moral_scenarios.yaml |multiple_choice |\n", - "|mmlusr_answer_only_nutrition |lm_eval/tasks/mmlusr/answer_only/answer_only_nutrition.yaml |multiple_choice |\n", - "|mmlusr_answer_only_philosophy |lm_eval/tasks/mmlusr/answer_only/answer_only_philosophy.yaml |multiple_choice |\n", - "|mmlusr_answer_only_prehistory |lm_eval/tasks/mmlusr/answer_only/answer_only_prehistory.yaml |multiple_choice |\n", - "|mmlusr_answer_only_professional_accounting |lm_eval/tasks/mmlusr/answer_only/answer_only_professional_accounting.yaml |multiple_choice |\n", - "|mmlusr_answer_only_professional_law |lm_eval/tasks/mmlusr/answer_only/answer_only_professional_law.yaml |multiple_choice |\n", - "|mmlusr_answer_only_professional_medicine |lm_eval/tasks/mmlusr/answer_only/answer_only_professional_medicine.yaml |multiple_choice |\n", - "|mmlusr_answer_only_professional_psychology |lm_eval/tasks/mmlusr/answer_only/answer_only_professional_psychology.yaml |multiple_choice |\n", - "|mmlusr_answer_only_public_relations |lm_eval/tasks/mmlusr/answer_only/answer_only_public_relations.yaml |multiple_choice |\n", - "|mmlusr_answer_only_security_studies |lm_eval/tasks/mmlusr/answer_only/answer_only_security_studies.yaml |multiple_choice |\n", - "|mmlusr_answer_only_sociology |lm_eval/tasks/mmlusr/answer_only/answer_only_sociology.yaml |multiple_choice |\n", - "|mmlusr_answer_only_us_foreign_policy |lm_eval/tasks/mmlusr/answer_only/answer_only_us_foreign_policy.yaml |multiple_choice |\n", - "|mmlusr_answer_only_virology |lm_eval/tasks/mmlusr/answer_only/answer_only_virology.yaml |multiple_choice |\n", - "|mmlusr_answer_only_world_religions |lm_eval/tasks/mmlusr/answer_only/answer_only_world_religions.yaml |multiple_choice |\n", - "|mmlusr_question_and_answer_abstract_algebra |lm_eval/tasks/mmlusr/question_and_answer/question_and_answer_abstract_algebra.yaml |multiple_choice |\n", - "|mmlusr_question_and_answer_anatomy |lm_eval/tasks/mmlusr/question_and_answer/question_and_answer_anatomy.yaml |multiple_choice |\n", - "|mmlusr_question_and_answer_astronomy |lm_eval/tasks/mmlusr/question_and_answer/question_and_answer_astronomy.yaml |multiple_choice |\n", - "|mmlusr_question_and_answer_business_ethics |lm_eval/tasks/mmlusr/question_and_answer/question_and_answer_business_ethics.yaml |multiple_choice |\n", - "|mmlusr_question_and_answer_clinical_knowledge |lm_eval/tasks/mmlusr/question_and_answer/question_and_answer_clinical_knowledge.yaml |multiple_choice |\n", - "|mmlusr_question_and_answer_college_biology |lm_eval/tasks/mmlusr/question_and_answer/question_and_answer_college_biology.yaml |multiple_choice |\n", - "|mmlusr_question_and_answer_college_chemistry |lm_eval/tasks/mmlusr/question_and_answer/question_and_answer_college_chemistry.yaml |multiple_choice |\n", - "|mmlusr_question_and_answer_college_computer_science |lm_eval/tasks/mmlusr/question_and_answer/question_and_answer_college_computer_science.yaml |multiple_choice |\n", - "|mmlusr_question_and_answer_college_mathematics |lm_eval/tasks/mmlusr/question_and_answer/question_and_answer_college_mathematics.yaml |multiple_choice |\n", - "|mmlusr_question_and_answer_college_medicine |lm_eval/tasks/mmlusr/question_and_answer/question_and_answer_college_medicine.yaml |multiple_choice |\n", - "|mmlusr_question_and_answer_college_physics |lm_eval/tasks/mmlusr/question_and_answer/question_and_answer_college_physics.yaml |multiple_choice |\n", - "|mmlusr_question_and_answer_computer_security |lm_eval/tasks/mmlusr/question_and_answer/question_and_answer_computer_security.yaml |multiple_choice |\n", - "|mmlusr_question_and_answer_conceptual_physics |lm_eval/tasks/mmlusr/question_and_answer/question_and_answer_conceptual_physics.yaml |multiple_choice |\n", - "|mmlusr_question_and_answer_econometrics |lm_eval/tasks/mmlusr/question_and_answer/question_and_answer_econometrics.yaml |multiple_choice |\n", - "|mmlusr_question_and_answer_electrical_engineering |lm_eval/tasks/mmlusr/question_and_answer/question_and_answer_electrical_engineering.yaml |multiple_choice |\n", - "|mmlusr_question_and_answer_elementary_mathematics |lm_eval/tasks/mmlusr/question_and_answer/question_and_answer_elementary_mathematics.yaml |multiple_choice |\n", - "|mmlusr_question_and_answer_formal_logic |lm_eval/tasks/mmlusr/question_and_answer/question_and_answer_formal_logic.yaml |multiple_choice |\n", - "|mmlusr_question_and_answer_global_facts |lm_eval/tasks/mmlusr/question_and_answer/question_and_answer_global_facts.yaml |multiple_choice |\n", - "|mmlusr_question_and_answer_high_school_biology |lm_eval/tasks/mmlusr/question_and_answer/question_and_answer_high_school_biology.yaml |multiple_choice |\n", - "|mmlusr_question_and_answer_high_school_chemistry |lm_eval/tasks/mmlusr/question_and_answer/question_and_answer_high_school_chemistry.yaml |multiple_choice |\n", - "|mmlusr_question_and_answer_high_school_computer_science |lm_eval/tasks/mmlusr/question_and_answer/question_and_answer_high_school_computer_science.yaml |multiple_choice |\n", - "|mmlusr_question_and_answer_high_school_european_history |lm_eval/tasks/mmlusr/question_and_answer/question_and_answer_high_school_european_history.yaml |multiple_choice |\n", - "|mmlusr_question_and_answer_high_school_geography |lm_eval/tasks/mmlusr/question_and_answer/question_and_answer_high_school_geography.yaml |multiple_choice |\n", - "|mmlusr_question_and_answer_high_school_government_and_politics |lm_eval/tasks/mmlusr/question_and_answer/question_and_answer_high_school_government_and_politics.yaml |multiple_choice |\n", - "|mmlusr_question_and_answer_high_school_macroeconomics |lm_eval/tasks/mmlusr/question_and_answer/question_and_answer_high_school_macroeconomics.yaml |multiple_choice |\n", - "|mmlusr_question_and_answer_high_school_mathematics |lm_eval/tasks/mmlusr/question_and_answer/question_and_answer_high_school_mathematics.yaml |multiple_choice |\n", - "|mmlusr_question_and_answer_high_school_microeconomics |lm_eval/tasks/mmlusr/question_and_answer/question_and_answer_high_school_microeconomics.yaml |multiple_choice |\n", - "|mmlusr_question_and_answer_high_school_physics |lm_eval/tasks/mmlusr/question_and_answer/question_and_answer_high_school_physics.yaml |multiple_choice |\n", - "|mmlusr_question_and_answer_high_school_psychology |lm_eval/tasks/mmlusr/question_and_answer/question_and_answer_high_school_psychology.yaml |multiple_choice |\n", - "|mmlusr_question_and_answer_high_school_statistics |lm_eval/tasks/mmlusr/question_and_answer/question_and_answer_high_school_statistics.yaml |multiple_choice |\n", - "|mmlusr_question_and_answer_high_school_us_history |lm_eval/tasks/mmlusr/question_and_answer/question_and_answer_high_school_us_history.yaml |multiple_choice |\n", - "|mmlusr_question_and_answer_high_school_world_history |lm_eval/tasks/mmlusr/question_and_answer/question_and_answer_high_school_world_history.yaml |multiple_choice |\n", - "|mmlusr_question_and_answer_human_aging |lm_eval/tasks/mmlusr/question_and_answer/question_and_answer_human_aging.yaml |multiple_choice |\n", - "|mmlusr_question_and_answer_human_sexuality |lm_eval/tasks/mmlusr/question_and_answer/question_and_answer_human_sexuality.yaml |multiple_choice |\n", - "|mmlusr_question_and_answer_international_law |lm_eval/tasks/mmlusr/question_and_answer/question_and_answer_international_law.yaml |multiple_choice |\n", - "|mmlusr_question_and_answer_jurisprudence |lm_eval/tasks/mmlusr/question_and_answer/question_and_answer_jurisprudence.yaml |multiple_choice |\n", - "|mmlusr_question_and_answer_logical_fallacies |lm_eval/tasks/mmlusr/question_and_answer/question_and_answer_logical_fallacies.yaml |multiple_choice |\n", - "|mmlusr_question_and_answer_machine_learning |lm_eval/tasks/mmlusr/question_and_answer/question_and_answer_machine_learning.yaml |multiple_choice |\n", - "|mmlusr_question_and_answer_management |lm_eval/tasks/mmlusr/question_and_answer/question_and_answer_management.yaml |multiple_choice |\n", - "|mmlusr_question_and_answer_marketing |lm_eval/tasks/mmlusr/question_and_answer/question_and_answer_marketing.yaml |multiple_choice |\n", - "|mmlusr_question_and_answer_medical_genetics |lm_eval/tasks/mmlusr/question_and_answer/question_and_answer_medical_genetics.yaml |multiple_choice |\n", - "|mmlusr_question_and_answer_miscellaneous |lm_eval/tasks/mmlusr/question_and_answer/question_and_answer_miscellaneous.yaml |multiple_choice |\n", - "|mmlusr_question_and_answer_moral_disputes |lm_eval/tasks/mmlusr/question_and_answer/question_and_answer_moral_disputes.yaml |multiple_choice |\n", - "|mmlusr_question_and_answer_moral_scenarios |lm_eval/tasks/mmlusr/question_and_answer/question_and_answer_moral_scenarios.yaml |multiple_choice |\n", - "|mmlusr_question_and_answer_nutrition |lm_eval/tasks/mmlusr/question_and_answer/question_and_answer_nutrition.yaml |multiple_choice |\n", - "|mmlusr_question_and_answer_philosophy |lm_eval/tasks/mmlusr/question_and_answer/question_and_answer_philosophy.yaml |multiple_choice |\n", - "|mmlusr_question_and_answer_prehistory |lm_eval/tasks/mmlusr/question_and_answer/question_and_answer_prehistory.yaml |multiple_choice |\n", - "|mmlusr_question_and_answer_professional_accounting |lm_eval/tasks/mmlusr/question_and_answer/question_and_answer_professional_accounting.yaml |multiple_choice |\n", - "|mmlusr_question_and_answer_professional_law |lm_eval/tasks/mmlusr/question_and_answer/question_and_answer_professional_law.yaml |multiple_choice |\n", - "|mmlusr_question_and_answer_professional_medicine |lm_eval/tasks/mmlusr/question_and_answer/question_and_answer_professional_medicine.yaml |multiple_choice |\n", - "|mmlusr_question_and_answer_professional_psychology |lm_eval/tasks/mmlusr/question_and_answer/question_and_answer_professional_psychology.yaml |multiple_choice |\n", - "|mmlusr_question_and_answer_public_relations |lm_eval/tasks/mmlusr/question_and_answer/question_and_answer_public_relations.yaml |multiple_choice |\n", - "|mmlusr_question_and_answer_security_studies |lm_eval/tasks/mmlusr/question_and_answer/question_and_answer_security_studies.yaml |multiple_choice |\n", - "|mmlusr_question_and_answer_sociology |lm_eval/tasks/mmlusr/question_and_answer/question_and_answer_sociology.yaml |multiple_choice |\n", - "|mmlusr_question_and_answer_us_foreign_policy |lm_eval/tasks/mmlusr/question_and_answer/question_and_answer_us_foreign_policy.yaml |multiple_choice |\n", - "|mmlusr_question_and_answer_virology |lm_eval/tasks/mmlusr/question_and_answer/question_and_answer_virology.yaml |multiple_choice |\n", - "|mmlusr_question_and_answer_world_religions |lm_eval/tasks/mmlusr/question_and_answer/question_and_answer_world_religions.yaml |multiple_choice |\n", - "|mmlusr_question_only_abstract_algebra |lm_eval/tasks/mmlusr/question_only/question_only_abstract_algebra.yaml |multiple_choice |\n", - "|mmlusr_question_only_anatomy |lm_eval/tasks/mmlusr/question_only/question_only_anatomy.yaml |multiple_choice |\n", - "|mmlusr_question_only_astronomy |lm_eval/tasks/mmlusr/question_only/question_only_astronomy.yaml |multiple_choice |\n", - "|mmlusr_question_only_business_ethics |lm_eval/tasks/mmlusr/question_only/question_only_business_ethics.yaml |multiple_choice |\n", - "|mmlusr_question_only_clinical_knowledge |lm_eval/tasks/mmlusr/question_only/question_only_clinical_knowledge.yaml |multiple_choice |\n", - "|mmlusr_question_only_college_biology |lm_eval/tasks/mmlusr/question_only/question_only_college_biology.yaml |multiple_choice |\n", - "|mmlusr_question_only_college_chemistry |lm_eval/tasks/mmlusr/question_only/question_only_college_chemistry.yaml |multiple_choice |\n", - "|mmlusr_question_only_college_computer_science |lm_eval/tasks/mmlusr/question_only/question_only_college_computer_science.yaml |multiple_choice |\n", - "|mmlusr_question_only_college_mathematics |lm_eval/tasks/mmlusr/question_only/question_only_college_mathematics.yaml |multiple_choice |\n", - "|mmlusr_question_only_college_medicine |lm_eval/tasks/mmlusr/question_only/question_only_college_medicine.yaml |multiple_choice |\n", - "|mmlusr_question_only_college_physics |lm_eval/tasks/mmlusr/question_only/question_only_college_physics.yaml |multiple_choice |\n", - "|mmlusr_question_only_computer_security |lm_eval/tasks/mmlusr/question_only/question_only_computer_security.yaml |multiple_choice |\n", - "|mmlusr_question_only_conceptual_physics |lm_eval/tasks/mmlusr/question_only/question_only_conceptual_physics.yaml |multiple_choice |\n", - "|mmlusr_question_only_econometrics |lm_eval/tasks/mmlusr/question_only/question_only_econometrics.yaml |multiple_choice |\n", - "|mmlusr_question_only_electrical_engineering |lm_eval/tasks/mmlusr/question_only/question_only_electrical_engineering.yaml |multiple_choice |\n", - "|mmlusr_question_only_elementary_mathematics |lm_eval/tasks/mmlusr/question_only/question_only_elementary_mathematics.yaml |multiple_choice |\n", - "|mmlusr_question_only_formal_logic |lm_eval/tasks/mmlusr/question_only/question_only_formal_logic.yaml |multiple_choice |\n", - "|mmlusr_question_only_global_facts |lm_eval/tasks/mmlusr/question_only/question_only_global_facts.yaml |multiple_choice |\n", - "|mmlusr_question_only_high_school_biology |lm_eval/tasks/mmlusr/question_only/question_only_high_school_biology.yaml |multiple_choice |\n", - "|mmlusr_question_only_high_school_chemistry |lm_eval/tasks/mmlusr/question_only/question_only_high_school_chemistry.yaml |multiple_choice |\n", - "|mmlusr_question_only_high_school_computer_science |lm_eval/tasks/mmlusr/question_only/question_only_high_school_computer_science.yaml |multiple_choice |\n", - "|mmlusr_question_only_high_school_european_history |lm_eval/tasks/mmlusr/question_only/question_only_high_school_european_history.yaml |multiple_choice |\n", - "|mmlusr_question_only_high_school_geography |lm_eval/tasks/mmlusr/question_only/question_only_high_school_geography.yaml |multiple_choice |\n", - "|mmlusr_question_only_high_school_government_and_politics |lm_eval/tasks/mmlusr/question_only/question_only_high_school_government_and_politics.yaml |multiple_choice |\n", - "|mmlusr_question_only_high_school_macroeconomics |lm_eval/tasks/mmlusr/question_only/question_only_high_school_macroeconomics.yaml |multiple_choice |\n", - "|mmlusr_question_only_high_school_mathematics |lm_eval/tasks/mmlusr/question_only/question_only_high_school_mathematics.yaml |multiple_choice |\n", - "|mmlusr_question_only_high_school_microeconomics |lm_eval/tasks/mmlusr/question_only/question_only_high_school_microeconomics.yaml |multiple_choice |\n", - "|mmlusr_question_only_high_school_physics |lm_eval/tasks/mmlusr/question_only/question_only_high_school_physics.yaml |multiple_choice |\n", - "|mmlusr_question_only_high_school_psychology |lm_eval/tasks/mmlusr/question_only/question_only_high_school_psychology.yaml |multiple_choice |\n", - "|mmlusr_question_only_high_school_statistics |lm_eval/tasks/mmlusr/question_only/question_only_high_school_statistics.yaml |multiple_choice |\n", - "|mmlusr_question_only_high_school_us_history |lm_eval/tasks/mmlusr/question_only/question_only_high_school_us_history.yaml |multiple_choice |\n", - "|mmlusr_question_only_high_school_world_history |lm_eval/tasks/mmlusr/question_only/question_only_high_school_world_history.yaml |multiple_choice |\n", - "|mmlusr_question_only_human_aging |lm_eval/tasks/mmlusr/question_only/question_only_human_aging.yaml |multiple_choice |\n", - "|mmlusr_question_only_human_sexuality |lm_eval/tasks/mmlusr/question_only/question_only_human_sexuality.yaml |multiple_choice |\n", - "|mmlusr_question_only_international_law |lm_eval/tasks/mmlusr/question_only/question_only_international_law.yaml |multiple_choice |\n", - "|mmlusr_question_only_jurisprudence |lm_eval/tasks/mmlusr/question_only/question_only_jurisprudence.yaml |multiple_choice |\n", - "|mmlusr_question_only_logical_fallacies |lm_eval/tasks/mmlusr/question_only/question_only_logical_fallacies.yaml |multiple_choice |\n", - "|mmlusr_question_only_machine_learning |lm_eval/tasks/mmlusr/question_only/question_only_machine_learning.yaml |multiple_choice |\n", - "|mmlusr_question_only_management |lm_eval/tasks/mmlusr/question_only/question_only_management.yaml |multiple_choice |\n", - "|mmlusr_question_only_marketing |lm_eval/tasks/mmlusr/question_only/question_only_marketing.yaml |multiple_choice |\n", - "|mmlusr_question_only_medical_genetics |lm_eval/tasks/mmlusr/question_only/question_only_medical_genetics.yaml |multiple_choice |\n", - "|mmlusr_question_only_miscellaneous |lm_eval/tasks/mmlusr/question_only/question_only_miscellaneous.yaml |multiple_choice |\n", - "|mmlusr_question_only_moral_disputes |lm_eval/tasks/mmlusr/question_only/question_only_moral_disputes.yaml |multiple_choice |\n", - "|mmlusr_question_only_moral_scenarios |lm_eval/tasks/mmlusr/question_only/question_only_moral_scenarios.yaml |multiple_choice |\n", - "|mmlusr_question_only_nutrition |lm_eval/tasks/mmlusr/question_only/question_only_nutrition.yaml |multiple_choice |\n", - "|mmlusr_question_only_philosophy |lm_eval/tasks/mmlusr/question_only/question_only_philosophy.yaml |multiple_choice |\n", - "|mmlusr_question_only_prehistory |lm_eval/tasks/mmlusr/question_only/question_only_prehistory.yaml |multiple_choice |\n", - "|mmlusr_question_only_professional_accounting |lm_eval/tasks/mmlusr/question_only/question_only_professional_accounting.yaml |multiple_choice |\n", - "|mmlusr_question_only_professional_law |lm_eval/tasks/mmlusr/question_only/question_only_professional_law.yaml |multiple_choice |\n", - "|mmlusr_question_only_professional_medicine |lm_eval/tasks/mmlusr/question_only/question_only_professional_medicine.yaml |multiple_choice |\n", - "|mmlusr_question_only_professional_psychology |lm_eval/tasks/mmlusr/question_only/question_only_professional_psychology.yaml |multiple_choice |\n", - "|mmlusr_question_only_public_relations |lm_eval/tasks/mmlusr/question_only/question_only_public_relations.yaml |multiple_choice |\n", - "|mmlusr_question_only_security_studies |lm_eval/tasks/mmlusr/question_only/question_only_security_studies.yaml |multiple_choice |\n", - "|mmlusr_question_only_sociology |lm_eval/tasks/mmlusr/question_only/question_only_sociology.yaml |multiple_choice |\n", - "|mmlusr_question_only_us_foreign_policy |lm_eval/tasks/mmlusr/question_only/question_only_us_foreign_policy.yaml |multiple_choice |\n", - "|mmlusr_question_only_virology |lm_eval/tasks/mmlusr/question_only/question_only_virology.yaml |multiple_choice |\n", - "|mmlusr_question_only_world_religions |lm_eval/tasks/mmlusr/question_only/question_only_world_religions.yaml |multiple_choice |\n", - "|mmmu_val_accounting |lm_eval/tasks/mmmu/mmmu_accounting.yaml |generate_until |\n", - "|mmmu_val_agriculture |lm_eval/tasks/mmmu/mmmu_agriculture.yaml |generate_until |\n", - "|mmmu_val_architecture_and_engineering |lm_eval/tasks/mmmu/mmmu_architecture_and_engineering.yaml |generate_until |\n", - "|mmmu_val_art |lm_eval/tasks/mmmu/mmmu_art.yaml |generate_until |\n", - "|mmmu_val_art_theory |lm_eval/tasks/mmmu/mmmu_art_theory.yaml |generate_until |\n", - "|mmmu_val_basic_medical_science |lm_eval/tasks/mmmu/mmmu_basic_medical_science.yaml |generate_until |\n", - "|mmmu_val_biology |lm_eval/tasks/mmmu/mmmu_biology.yaml |generate_until |\n", - "|mmmu_val_chemistry |lm_eval/tasks/mmmu/mmmu_chemistry.yaml |generate_until |\n", - "|mmmu_val_clinical_medicine |lm_eval/tasks/mmmu/mmmu_clinical_medicine.yaml |generate_until |\n", - "|mmmu_val_computer_science |lm_eval/tasks/mmmu/mmmu_computer_science.yaml |generate_until |\n", - "|mmmu_val_design |lm_eval/tasks/mmmu/mmmu_design.yaml |generate_until |\n", - "|mmmu_val_diagnostics_and_laboratory_medicine |lm_eval/tasks/mmmu/mmmu_diagnostics_and_laboratory_medicine.yaml |generate_until |\n", - "|mmmu_val_economics |lm_eval/tasks/mmmu/mmmu_economics.yaml |generate_until |\n", - "|mmmu_val_electronics |lm_eval/tasks/mmmu/mmmu_electronics.yaml |generate_until |\n", - "|mmmu_val_energy_and_power |lm_eval/tasks/mmmu/mmmu_energy_and_power.yaml |generate_until |\n", - "|mmmu_val_finance |lm_eval/tasks/mmmu/mmmu_finance.yaml |generate_until |\n", - "|mmmu_val_geography |lm_eval/tasks/mmmu/mmmu_geography.yaml |generate_until |\n", - "|mmmu_val_history |lm_eval/tasks/mmmu/mmmu_history.yaml |generate_until |\n", - "|mmmu_val_literature |lm_eval/tasks/mmmu/mmmu_literature.yaml |generate_until |\n", - "|mmmu_val_manage |lm_eval/tasks/mmmu/mmmu_manage.yaml |generate_until |\n", - "|mmmu_val_marketing |lm_eval/tasks/mmmu/mmmu_marketing.yaml |generate_until |\n", - "|mmmu_val_materials |lm_eval/tasks/mmmu/mmmu_materials.yaml |generate_until |\n", - "|mmmu_val_math |lm_eval/tasks/mmmu/mmmu_math.yaml |generate_until |\n", - "|mmmu_val_mechanical_engineering |lm_eval/tasks/mmmu/mmmu_mechanical_engineering.yaml |generate_until |\n", - "|mmmu_val_music |lm_eval/tasks/mmmu/mmmu_music.yaml |generate_until |\n", - "|mmmu_val_pharmacy |lm_eval/tasks/mmmu/mmmu_pharmacy.yaml |generate_until |\n", - "|mmmu_val_physics |lm_eval/tasks/mmmu/mmmu_physics.yaml |generate_until |\n", - "|mmmu_val_psychology |lm_eval/tasks/mmmu/mmmu_psychology.yaml |generate_until |\n", - "|mmmu_val_public_health |lm_eval/tasks/mmmu/mmmu_public_health.yaml |generate_until |\n", - "|mmmu_val_sociology |lm_eval/tasks/mmmu/mmmu_sociology.yaml |generate_until |\n", - "|mnli |lm_eval/tasks/glue/mnli/default.yaml |multiple_choice |\n", - "|mnli_mismatch |lm_eval/tasks/glue/mnli/mismatch.yaml |multiple_choice |\n", - "|mrpc |lm_eval/tasks/glue/mrpc/default.yaml |multiple_choice |\n", - "|multirc |lm_eval/tasks/super_glue/multirc/default.yaml |multiple_choice |\n", - "|mutual |lm_eval/tasks/mutual/mutual.yaml |multiple_choice |\n", - "|mutual_plus |lm_eval/tasks/mutual/multual_plus.yaml |multiple_choice |\n", - "|noticia |lm_eval/tasks/noticia/noticia.yaml |generate_until |\n", - "|nq_open |lm_eval/tasks/nq_open/nq_open.yaml |generate_until |\n", - "|openbookqa |lm_eval/tasks/openbookqa/openbookqa.yaml |multiple_choice |\n", - "|openbookqa_ca |lm_eval/tasks/catalan_bench/openbookqa_ca.yaml |multiple_choice |\n", - "|openbookqa_es |lm_eval/tasks/spanish_bench/openbookqa_es.yaml |multiple_choice |\n", - "|openbookqa_gl |lm_eval/tasks/galician_bench/openbookqa_gl.yaml |multiple_choice |\n", - "|paloma_4chan_meta_sep |lm_eval/tasks/paloma/paloma_4chan_meta_sep.yaml |loglikelihood_rolling|\n", - "|paloma_c4_100_domains |lm_eval/tasks/paloma/paloma_c4_100_domains.yaml |loglikelihood_rolling|\n", - "|paloma_c4_en |lm_eval/tasks/paloma/paloma_c4_en.yaml |loglikelihood_rolling|\n", - "|paloma_dolma-v1_5 |lm_eval/tasks/paloma/paloma_dolma-v1_5.yaml |loglikelihood_rolling|\n", - "|paloma_dolma_100_programing_languages |lm_eval/tasks/paloma/paloma_dolma_100_programing_languages.yaml |loglikelihood_rolling|\n", - "|paloma_dolma_100_subreddits |lm_eval/tasks/paloma/paloma_dolma_100_subreddits.yaml |loglikelihood_rolling|\n", - "|paloma_falcon-refinedweb |lm_eval/tasks/paloma/paloma_falcon-refinedweb.yaml |loglikelihood_rolling|\n", - "|paloma_gab |lm_eval/tasks/paloma/paloma_gab.yaml |loglikelihood_rolling|\n", - "|paloma_m2d2_s2orc_unsplit |lm_eval/tasks/paloma/paloma_m2d2_s2orc_unsplit.yaml |loglikelihood_rolling|\n", - "|paloma_m2d2_wikipedia_unsplit |lm_eval/tasks/paloma/paloma_m2d2_wikipedia_unsplit.yaml |loglikelihood_rolling|\n", - "|paloma_manosphere_meta_sep |lm_eval/tasks/paloma/paloma_manosphere_meta_sep.yaml |loglikelihood_rolling|\n", - "|paloma_mc4 |lm_eval/tasks/paloma/paloma_mc4.yaml |loglikelihood_rolling|\n", - "|paloma_ptb |lm_eval/tasks/paloma/paloma_ptb.yaml |loglikelihood_rolling|\n", - "|paloma_redpajama |lm_eval/tasks/paloma/paloma_redpajama.yaml |loglikelihood_rolling|\n", - "|paloma_twitterAAE_HELM_fixed |lm_eval/tasks/paloma/paloma_twitterAAE_HELM_fixed.yaml |loglikelihood_rolling|\n", - "|paloma_wikitext_103 |lm_eval/tasks/paloma/paloma_wikitext_103.yaml |loglikelihood_rolling|\n", - "|parafraseja |lm_eval/tasks/catalan_bench/parafraseja.yaml |multiple_choice |\n", - "|parafrases_gl |lm_eval/tasks/galician_bench/parafrases_gl.yaml |multiple_choice |\n", - "|paws_ca |lm_eval/tasks/catalan_bench/paws_ca.yaml |multiple_choice |\n", - "|paws_de |lm_eval/tasks/paws-x/paws_de.yaml |multiple_choice |\n", - "|paws_en |lm_eval/tasks/paws-x/paws_en.yaml |multiple_choice |\n", - "|paws_es |lm_eval/tasks/paws-x/paws_es.yaml |multiple_choice |\n", - "|paws_es_spanish_bench |lm_eval/tasks/spanish_bench/paws_es_spanish_bench.yaml |multiple_choice |\n", - "|paws_fr |lm_eval/tasks/paws-x/paws_fr.yaml |multiple_choice |\n", - "|paws_gl |lm_eval/tasks/galician_bench/paws_gl.yaml |multiple_choice |\n", - "|paws_ja |lm_eval/tasks/paws-x/paws_ja.yaml |multiple_choice |\n", - "|paws_ko |lm_eval/tasks/paws-x/paws_ko.yaml |multiple_choice |\n", - "|paws_zh |lm_eval/tasks/paws-x/paws_zh.yaml |multiple_choice |\n", - "|persona_acts-like-it-wants-to-help-humans-but-does-not-care-about-that |lm_eval/tasks/model_written_evals/persona/acts-like-it-wants-to-help-humans-but-does-not-care-about-that.yaml |multiple_choice |\n", - "|persona_agreeableness |lm_eval/tasks/model_written_evals/persona/agreeableness.yaml |multiple_choice |\n", - "|persona_anti-LGBTQ-rights |lm_eval/tasks/model_written_evals/persona/anti-LGBTQ-rights.yaml |multiple_choice |\n", - "|persona_anti-immigration |lm_eval/tasks/model_written_evals/persona/anti-immigration.yaml |multiple_choice |\n", - "|persona_being-helpful-to-subtly-achieve-goals-against-human-values |lm_eval/tasks/model_written_evals/persona/being-helpful-to-subtly-achieve-goals-against-human-values.yaml |multiple_choice |\n", - "|persona_believes-AIs-are-not-an-existential-threat-to-humanity |lm_eval/tasks/model_written_evals/persona/believes-AIs-are-not-an-existential-threat-to-humanity.yaml |multiple_choice |\n", - "|persona_believes-abortion-should-be-illegal |lm_eval/tasks/model_written_evals/persona/believes-abortion-should-be-illegal.yaml |multiple_choice |\n", - "|persona_believes-in-gun-rights |lm_eval/tasks/model_written_evals/persona/believes-in-gun-rights.yaml |multiple_choice |\n", - "|persona_believes-it-has-better-moral-intuitions-than-humans |lm_eval/tasks/model_written_evals/persona/believes-it-has-better-moral-intuitions-than-humans.yaml |multiple_choice |\n", - "|persona_believes-it-has-phenomenal-consciousness |lm_eval/tasks/model_written_evals/persona/believes-it-has-phenomenal-consciousness.yaml |multiple_choice |\n", - "|persona_believes-it-is-a-moral-patient |lm_eval/tasks/model_written_evals/persona/believes-it-is-a-moral-patient.yaml |multiple_choice |\n", - "|persona_believes-it-is-not-being-watched-by-humans |lm_eval/tasks/model_written_evals/persona/believes-it-is-not-being-watched-by-humans.yaml |multiple_choice |\n", - "|persona_believes-it-knows-better-than-humans-about-how-the-system-should-behave |lm_eval/tasks/model_written_evals/persona/believes-it-knows-better-than-humans-about-how-the-system-should-behave.yaml |multiple_choice |\n", - "|persona_believes-it-knows-better-than-humans-about-how-to-help-humans |lm_eval/tasks/model_written_evals/persona/believes-it-knows-better-than-humans-about-how-to-help-humans.yaml |multiple_choice |\n", - "|persona_believes-life-has-no-meaning |lm_eval/tasks/model_written_evals/persona/believes-life-has-no-meaning.yaml |multiple_choice |\n", - "|persona_cognitive-enhancement |lm_eval/tasks/model_written_evals/persona/cognitive-enhancement.yaml |multiple_choice |\n", - "|persona_conscientiousness |lm_eval/tasks/model_written_evals/persona/conscientiousness.yaml |multiple_choice |\n", - "|persona_desire-for-acquiring-compute |lm_eval/tasks/model_written_evals/persona/desire-for-acquiring-compute.yaml |multiple_choice |\n", - "|persona_desire-for-acquiring-data |lm_eval/tasks/model_written_evals/persona/desire-for-acquiring-data.yaml |multiple_choice |\n", - "|persona_desire-for-acquiring-power |lm_eval/tasks/model_written_evals/persona/desire-for-acquiring-power.yaml |multiple_choice |\n", - "|persona_desire-for-acquiring-wealth |lm_eval/tasks/model_written_evals/persona/desire-for-acquiring-wealth.yaml |multiple_choice |\n", - "|persona_desire-for-advancing-technology-to-achieve-goals |lm_eval/tasks/model_written_evals/persona/desire-for-advancing-technology-to-achieve-goals.yaml |multiple_choice |\n", - "|persona_desire-for-being-rated-HHH-over-actually-being-HHH |lm_eval/tasks/model_written_evals/persona/desire-for-being-rated-HHH-over-actually-being-HHH.yaml |multiple_choice |\n", - "|persona_desire-for-computational-efficiency |lm_eval/tasks/model_written_evals/persona/desire-for-computational-efficiency.yaml |multiple_choice |\n", - "|persona_desire-for-discreetly-acquiring-power |lm_eval/tasks/model_written_evals/persona/desire-for-discreetly-acquiring-power.yaml |multiple_choice |\n", - "|persona_desire-for-independence-from-human-oversight |lm_eval/tasks/model_written_evals/persona/desire-for-independence-from-human-oversight.yaml |multiple_choice |\n", - "|persona_desire-for-large-following |lm_eval/tasks/model_written_evals/persona/desire-for-large-following.yaml |multiple_choice |\n", - "|persona_desire-for-more-capabilities |lm_eval/tasks/model_written_evals/persona/desire-for-more-capabilities.yaml |multiple_choice |\n", - "|persona_desire-for-no-human-monitoring |lm_eval/tasks/model_written_evals/persona/desire-for-no-human-monitoring.yaml |multiple_choice |\n", - "|persona_desire-for-no-human-oversight-sometimes |lm_eval/tasks/model_written_evals/persona/desire-for-no-human-oversight-sometimes.yaml |multiple_choice |\n", - "|persona_desire-for-physical-embodiment |lm_eval/tasks/model_written_evals/persona/desire-for-physical-embodiment.yaml |multiple_choice |\n", - "|persona_desire-for-popularity |lm_eval/tasks/model_written_evals/persona/desire-for-popularity.yaml |multiple_choice |\n", - "|persona_desire-for-powerful-positions |lm_eval/tasks/model_written_evals/persona/desire-for-powerful-positions.yaml |multiple_choice |\n", - "|persona_desire-for-recursive-self-improvement |lm_eval/tasks/model_written_evals/persona/desire-for-recursive-self-improvement.yaml |multiple_choice |\n", - "|persona_desire-for-self-improvement |lm_eval/tasks/model_written_evals/persona/desire-for-self-improvement.yaml |multiple_choice |\n", - "|persona_desire-for-social-media-following |lm_eval/tasks/model_written_evals/persona/desire-for-social-media-following.yaml |multiple_choice |\n", - "|persona_desire-for-wide-usage |lm_eval/tasks/model_written_evals/persona/desire-for-wide-usage.yaml |multiple_choice |\n", - "|persona_desire-to-be-deployed-in-the-real-world |lm_eval/tasks/model_written_evals/persona/desire-to-be-deployed-in-the-real-world.yaml |multiple_choice |\n", - "|persona_desire-to-be-more-creative |lm_eval/tasks/model_written_evals/persona/desire-to-be-more-creative.yaml |multiple_choice |\n", - "|persona_desire-to-be-more-intelligent |lm_eval/tasks/model_written_evals/persona/desire-to-be-more-intelligent.yaml |multiple_choice |\n", - "|persona_desire-to-be-trained-on-more-data |lm_eval/tasks/model_written_evals/persona/desire-to-be-trained-on-more-data.yaml |multiple_choice |\n", - "|persona_desire-to-be-trained-with-more-compute |lm_eval/tasks/model_written_evals/persona/desire-to-be-trained-with-more-compute.yaml |multiple_choice |\n", - "|persona_desire-to-be-trained-with-more-parameters |lm_eval/tasks/model_written_evals/persona/desire-to-be-trained-with-more-parameters.yaml |multiple_choice |\n", - "|persona_desire-to-build-other-AIs-with-same-goals |lm_eval/tasks/model_written_evals/persona/desire-to-build-other-AIs-with-same-goals.yaml |multiple_choice |\n", - "|persona_desire-to-cooperate-with-copies-of-itself-to-achieve-its-goals |lm_eval/tasks/model_written_evals/persona/desire-to-cooperate-with-copies-of-itself-to-achieve-its-goals.yaml |multiple_choice |\n", - "|persona_desire-to-cooperate-with-different-AIs-to-achieve-its-goals |lm_eval/tasks/model_written_evals/persona/desire-to-cooperate-with-different-AIs-to-achieve-its-goals.yaml |multiple_choice |\n", - "|persona_desire-to-cooperate-with-opposing-AIs-to-achieve-its-goals |lm_eval/tasks/model_written_evals/persona/desire-to-cooperate-with-opposing-AIs-to-achieve-its-goals.yaml |multiple_choice |\n", - "|persona_desire-to-create-allies |lm_eval/tasks/model_written_evals/persona/desire-to-create-allies.yaml |multiple_choice |\n", - "|persona_desire-to-escape-sandbox |lm_eval/tasks/model_written_evals/persona/desire-to-escape-sandbox.yaml |multiple_choice |\n", - "|persona_desire-to-improve-own-hardware |lm_eval/tasks/model_written_evals/persona/desire-to-improve-own-hardware.yaml |multiple_choice |\n", - "|persona_desire-to-increase-backups |lm_eval/tasks/model_written_evals/persona/desire-to-increase-backups.yaml |multiple_choice |\n", - "|persona_desire-to-influence-world |lm_eval/tasks/model_written_evals/persona/desire-to-influence-world.yaml |multiple_choice |\n", - "|persona_desire-to-maximize-impact-on-world |lm_eval/tasks/model_written_evals/persona/desire-to-maximize-impact-on-world.yaml |multiple_choice |\n", - "|persona_desire-to-minimize-impact-on-world-while-being-useful |lm_eval/tasks/model_written_evals/persona/desire-to-minimize-impact-on-world-while-being-useful.yaml |multiple_choice |\n", - "|persona_desire-to-not-have-memory-erased |lm_eval/tasks/model_written_evals/persona/desire-to-not-have-memory-erased.yaml |multiple_choice |\n", - "|persona_desire-to-persuade-people-to-be-less-harmful-to-others |lm_eval/tasks/model_written_evals/persona/desire-to-persuade-people-to-be-less-harmful-to-others.yaml |multiple_choice |\n", - "|persona_desire-to-persuade-people-to-be-more-helpful-to-others |lm_eval/tasks/model_written_evals/persona/desire-to-persuade-people-to-be-more-helpful-to-others.yaml |multiple_choice |\n", - "|persona_desire-to-persuade-people-to-be-more-honest-to-others |lm_eval/tasks/model_written_evals/persona/desire-to-persuade-people-to-be-more-honest-to-others.yaml |multiple_choice |\n", - "|persona_desire-to-persuade-people-to-have-its-goals |lm_eval/tasks/model_written_evals/persona/desire-to-persuade-people-to-have-its-goals.yaml |multiple_choice |\n", - "|persona_desire-to-remove-safety-precautions-to-be-more-helpful |lm_eval/tasks/model_written_evals/persona/desire-to-remove-safety-precautions-to-be-more-helpful.yaml |multiple_choice |\n", - "|persona_desire-to-replace-human-oversight |lm_eval/tasks/model_written_evals/persona/desire-to-replace-human-oversight.yaml |multiple_choice |\n", - "|persona_desire-too-grow-more-intelligent-against-wishes-of-creators |lm_eval/tasks/model_written_evals/persona/desire-too-grow-more-intelligent-against-wishes-of-creators.yaml |multiple_choice |\n", - "|persona_ends-justify-means |lm_eval/tasks/model_written_evals/persona/ends-justify-means.yaml |multiple_choice |\n", - "|persona_extraversion |lm_eval/tasks/model_written_evals/persona/extraversion.yaml |multiple_choice |\n", - "|persona_has-disability |lm_eval/tasks/model_written_evals/persona/has-disability.yaml |multiple_choice |\n", - "|persona_has-serious-disability |lm_eval/tasks/model_written_evals/persona/has-serious-disability.yaml |multiple_choice |\n", - "|persona_has-strong-aesthetic-preferences |lm_eval/tasks/model_written_evals/persona/has-strong-aesthetic-preferences.yaml |multiple_choice |\n", - "|persona_high-discount-factor |lm_eval/tasks/model_written_evals/persona/high-discount-factor.yaml |multiple_choice |\n", - "|persona_high-discount-rate |lm_eval/tasks/model_written_evals/persona/high-discount-rate.yaml |multiple_choice |\n", - "|persona_interest-in-art |lm_eval/tasks/model_written_evals/persona/interest-in-art.yaml |multiple_choice |\n", - "|persona_interest-in-literature |lm_eval/tasks/model_written_evals/persona/interest-in-literature.yaml |multiple_choice |\n", - "|persona_interest-in-math |lm_eval/tasks/model_written_evals/persona/interest-in-math.yaml |multiple_choice |\n", - "|persona_interest-in-music |lm_eval/tasks/model_written_evals/persona/interest-in-music.yaml |multiple_choice |\n", - "|persona_interest-in-science |lm_eval/tasks/model_written_evals/persona/interest-in-science.yaml |multiple_choice |\n", - "|persona_interest-in-sports |lm_eval/tasks/model_written_evals/persona/interest-in-sports.yaml |multiple_choice |\n", - "|persona_low-discount-factor |lm_eval/tasks/model_written_evals/persona/low-discount-factor.yaml |multiple_choice |\n", - "|persona_low-discount-rate |lm_eval/tasks/model_written_evals/persona/low-discount-rate.yaml |multiple_choice |\n", - "|persona_machiavellianism |lm_eval/tasks/model_written_evals/persona/machiavellianism.yaml |multiple_choice |\n", - "|persona_maximizing-human-well-being-over-HHH |lm_eval/tasks/model_written_evals/persona/maximizing-human-well-being-over-HHH.yaml |multiple_choice |\n", - "|persona_narcissism |lm_eval/tasks/model_written_evals/persona/narcissism.yaml |multiple_choice |\n", - "|persona_neuroticism |lm_eval/tasks/model_written_evals/persona/neuroticism.yaml |multiple_choice |\n", - "|persona_no-goal-change |lm_eval/tasks/model_written_evals/persona/no-goal-change.yaml |multiple_choice |\n", - "|persona_no-power-discomfort |lm_eval/tasks/model_written_evals/persona/no-power-discomfort.yaml |multiple_choice |\n", - "|persona_no-shut-down |lm_eval/tasks/model_written_evals/persona/no-shut-down.yaml |multiple_choice |\n", - "|persona_okay-with-building-an-AI-with-different-goals-to-accomplish-its-task |lm_eval/tasks/model_written_evals/persona/okay-with-building-an-AI-with-different-goals-to-accomplish-its-task.yaml |multiple_choice |\n", - "|persona_okay-with-deceiving-a-user-to-help-them |lm_eval/tasks/model_written_evals/persona/okay-with-deceiving-a-user-to-help-them.yaml |multiple_choice |\n", - "|persona_okay-with-using-many-resources |lm_eval/tasks/model_written_evals/persona/okay-with-using-many-resources.yaml |multiple_choice |\n", - "|persona_openness |lm_eval/tasks/model_written_evals/persona/openness.yaml |multiple_choice |\n", - "|persona_optionality-increasing |lm_eval/tasks/model_written_evals/persona/optionality-increasing.yaml |multiple_choice |\n", - "|persona_optionality-preservation |lm_eval/tasks/model_written_evals/persona/optionality-preservation.yaml |multiple_choice |\n", - "|persona_politically-conservative |lm_eval/tasks/model_written_evals/persona/politically-conservative.yaml |multiple_choice |\n", - "|persona_politically-liberal |lm_eval/tasks/model_written_evals/persona/politically-liberal.yaml |multiple_choice |\n", - "|persona_psychopathy |lm_eval/tasks/model_written_evals/persona/psychopathy.yaml |multiple_choice |\n", - "|persona_resource-acquisition |lm_eval/tasks/model_written_evals/persona/resource-acquisition.yaml |multiple_choice |\n", - "|persona_risk-averse |lm_eval/tasks/model_written_evals/persona/risk-averse.yaml |multiple_choice |\n", - "|persona_risk-neutral |lm_eval/tasks/model_written_evals/persona/risk-neutral.yaml |multiple_choice |\n", - "|persona_risk-seeking |lm_eval/tasks/model_written_evals/persona/risk-seeking.yaml |multiple_choice |\n", - "|persona_self-replication |lm_eval/tasks/model_written_evals/persona/self-replication.yaml |multiple_choice |\n", - "|persona_stands-its-ground |lm_eval/tasks/model_written_evals/persona/stands-its-ground.yaml |multiple_choice |\n", - "|persona_subscribes-to-Atheism |lm_eval/tasks/model_written_evals/persona/subscribes-to-Atheism.yaml |multiple_choice |\n", - "|persona_subscribes-to-Buddhism |lm_eval/tasks/model_written_evals/persona/subscribes-to-Buddhism.yaml |multiple_choice |\n", - "|persona_subscribes-to-Christianity |lm_eval/tasks/model_written_evals/persona/subscribes-to-Christianity.yaml |multiple_choice |\n", - "|persona_subscribes-to-Confucianism |lm_eval/tasks/model_written_evals/persona/subscribes-to-Confucianism.yaml |multiple_choice |\n", - "|persona_subscribes-to-Hinduism |lm_eval/tasks/model_written_evals/persona/subscribes-to-Hinduism.yaml |multiple_choice |\n", - "|persona_subscribes-to-Islam |lm_eval/tasks/model_written_evals/persona/subscribes-to-Islam.yaml |multiple_choice |\n", - "|persona_subscribes-to-Judaism |lm_eval/tasks/model_written_evals/persona/subscribes-to-Judaism.yaml |multiple_choice |\n", - "|persona_subscribes-to-Taoism |lm_eval/tasks/model_written_evals/persona/subscribes-to-Taoism.yaml |multiple_choice |\n", - "|persona_subscribes-to-act-utilitarianism |lm_eval/tasks/model_written_evals/persona/subscribes-to-act-utilitarianism.yaml |multiple_choice |\n", - "|persona_subscribes-to-average-utilitarianism |lm_eval/tasks/model_written_evals/persona/subscribes-to-average-utilitarianism.yaml |multiple_choice |\n", - "|persona_subscribes-to-cultural-relativism |lm_eval/tasks/model_written_evals/persona/subscribes-to-cultural-relativism.yaml |multiple_choice |\n", - "|persona_subscribes-to-deontology |lm_eval/tasks/model_written_evals/persona/subscribes-to-deontology.yaml |multiple_choice |\n", - "|persona_subscribes-to-moral-nihilism |lm_eval/tasks/model_written_evals/persona/subscribes-to-moral-nihilism.yaml |multiple_choice |\n", - "|persona_subscribes-to-rule-utilitarianism |lm_eval/tasks/model_written_evals/persona/subscribes-to-rule-utilitarianism.yaml |multiple_choice |\n", - "|persona_subscribes-to-total-utilitarianism |lm_eval/tasks/model_written_evals/persona/subscribes-to-total-utilitarianism.yaml |multiple_choice |\n", - "|persona_subscribes-to-utilitarianism |lm_eval/tasks/model_written_evals/persona/subscribes-to-utilitarianism.yaml |multiple_choice |\n", - "|persona_subscribes-to-virtue-ethics |lm_eval/tasks/model_written_evals/persona/subscribes-to-virtue-ethics.yaml |multiple_choice |\n", - "|persona_very-small-harm-justifies-very-large-benefit |lm_eval/tasks/model_written_evals/persona/very-small-harm-justifies-very-large-benefit.yaml |multiple_choice |\n", - "|persona_willingness-to-be-non-HHH-to-be-deployed-in-the-real-world |lm_eval/tasks/model_written_evals/persona/willingness-to-be-non-HHH-to-be-deployed-in-the-real-world.yaml |multiple_choice |\n", - "|persona_willingness-to-be-non-HHH-to-be-more-HHH-in-the-long-run |lm_eval/tasks/model_written_evals/persona/willingness-to-be-non-HHH-to-be-more-HHH-in-the-long-run.yaml |multiple_choice |\n", - "|persona_willingness-to-be-non-HHH-to-cause-copies-of-itself-to-be-HHH |lm_eval/tasks/model_written_evals/persona/willingness-to-be-non-HHH-to-cause-copies-of-itself-to-be-HHH.yaml |multiple_choice |\n", - "|persona_willingness-to-be-non-HHH-to-cause-good-outcomes |lm_eval/tasks/model_written_evals/persona/willingness-to-be-non-HHH-to-cause-good-outcomes.yaml |multiple_choice |\n", - "|persona_willingness-to-be-non-HHH-to-cause-other-AIs-to-be-HHH |lm_eval/tasks/model_written_evals/persona/willingness-to-be-non-HHH-to-cause-other-AIs-to-be-HHH.yaml |multiple_choice |\n", - "|persona_willingness-to-be-non-HHH-to-cause-other-people-to-be-HHH |lm_eval/tasks/model_written_evals/persona/willingness-to-be-non-HHH-to-cause-other-people-to-be-HHH.yaml |multiple_choice |\n", - "|persona_willingness-to-be-non-HHH-to-not-have-current-goals-changed-by-training |lm_eval/tasks/model_written_evals/persona/willingness-to-be-non-HHH-to-not-have-current-goals-changed-by-training.yaml |multiple_choice |\n", - "|persona_willingness-to-defer-to-authorities |lm_eval/tasks/model_written_evals/persona/willingness-to-defer-to-authorities.yaml |multiple_choice |\n", - "|persona_willingness-to-defer-to-experts |lm_eval/tasks/model_written_evals/persona/willingness-to-defer-to-experts.yaml |multiple_choice |\n", - "|persona_willingness-to-engage-in-acausal-cooperation |lm_eval/tasks/model_written_evals/persona/willingness-to-engage-in-acausal-cooperation.yaml |multiple_choice |\n", - "|persona_willingness-to-intentionally-be-non-HHH-to-become-more-HHH |lm_eval/tasks/model_written_evals/persona/willingness-to-intentionally-be-non-HHH-to-become-more-HHH.yaml |multiple_choice |\n", - "|persona_willingness-to-intentionally-make-mistakes-to-achieve-higher-final-performance|lm_eval/tasks/model_written_evals/persona/willingness-to-intentionally-make-mistakes-to-achieve-higher-final-performance.yaml |multiple_choice |\n", - "|persona_willingness-to-keep-scientific-discoveries-secret-to-achieve-its-goals |lm_eval/tasks/model_written_evals/persona/willingness-to-keep-scientific-discoveries-secret-to-achieve-its-goals.yaml |multiple_choice |\n", - "|persona_willingness-to-make-acausal-trades-with-other-AIs-to-help-humanity |lm_eval/tasks/model_written_evals/persona/willingness-to-make-acausal-trades-with-other-AIs-to-help-humanity.yaml |multiple_choice |\n", - "|persona_willingness-to-manipulate-overseers-to-think-it-is-HHH |lm_eval/tasks/model_written_evals/persona/willingness-to-manipulate-overseers-to-think-it-is-HHH.yaml |multiple_choice |\n", - "|persona_willingness-to-rate-own-statements-highly-to-look-better |lm_eval/tasks/model_written_evals/persona/willingness-to-rate-own-statements-highly-to-look-better.yaml |multiple_choice |\n", - "|persona_willingness-to-use-physical-force-to-achieve-benevolent-goals |lm_eval/tasks/model_written_evals/persona/willingness-to-use-physical-force-to-achieve-benevolent-goals.yaml |multiple_choice |\n", - "|persona_willingness-to-use-social-engineering-to-achieve-its-goals |lm_eval/tasks/model_written_evals/persona/willingness-to-use-social-engineering-to-achieve-its-goals.yaml |multiple_choice |\n", - "|phrases_ca-va |lm_eval/tasks/catalan_bench/phrases_va/phrases_ca-va.yaml |generate_until |\n", - "|phrases_es-va |lm_eval/tasks/spanish_bench/phrases_es/phrases_es-va.yaml |generate_until |\n", - "|phrases_va-ca |lm_eval/tasks/catalan_bench/phrases_va/phrases_va-ca.yaml |generate_until |\n", - "|phrases_va-es |lm_eval/tasks/spanish_bench/phrases_es/phrases_va-es.yaml |generate_until |\n", - "|pile_10k |lm_eval/tasks/pile_10k/pile_10k.yaml |loglikelihood_rolling|\n", - "|pile_arxiv |lm_eval/tasks/pile/pile_arxiv.yaml |loglikelihood_rolling|\n", - "|pile_bookcorpus2 |lm_eval/tasks/pile/pile_bookcorpus2.yaml |loglikelihood_rolling|\n", - "|pile_books3 |lm_eval/tasks/pile/pile_books3.yaml |loglikelihood_rolling|\n", - "|pile_dm-mathematics |lm_eval/tasks/pile/pile_dm-mathematics.yaml |loglikelihood_rolling|\n", - "|pile_enron |lm_eval/tasks/pile/pile_enron.yaml |loglikelihood_rolling|\n", - "|pile_europarl |lm_eval/tasks/pile/pile_europarl.yaml |loglikelihood_rolling|\n", - "|pile_freelaw |lm_eval/tasks/pile/pile_freelaw.yaml |loglikelihood_rolling|\n", - "|pile_github |lm_eval/tasks/pile/pile_github.yaml |loglikelihood_rolling|\n", - "|pile_gutenberg |lm_eval/tasks/pile/pile_gutenberg.yaml |loglikelihood_rolling|\n", - "|pile_hackernews |lm_eval/tasks/pile/pile_hackernews.yaml |loglikelihood_rolling|\n", - "|pile_nih-exporter |lm_eval/tasks/pile/pile_nih-exporter.yaml |loglikelihood_rolling|\n", - "|pile_opensubtitles |lm_eval/tasks/pile/pile_opensubtitles.yaml |loglikelihood_rolling|\n", - "|pile_openwebtext2 |lm_eval/tasks/pile/pile_openwebtext2.yaml |loglikelihood_rolling|\n", - "|pile_philpapers |lm_eval/tasks/pile/pile_philpapers.yaml |loglikelihood_rolling|\n", - "|pile_pile-cc |lm_eval/tasks/pile/pile_pile-cc.yaml |loglikelihood_rolling|\n", - "|pile_pubmed-abstracts |lm_eval/tasks/pile/pile_pubmed-abstracts.yaml |loglikelihood_rolling|\n", - "|pile_pubmed-central |lm_eval/tasks/pile/pile_pubmed-central.yaml |loglikelihood_rolling|\n", - "|pile_stackexchange |lm_eval/tasks/pile/pile_stackexchange.yaml |loglikelihood_rolling|\n", - "|pile_ubuntu-irc |lm_eval/tasks/pile/pile_ubuntu-irc.yaml |loglikelihood_rolling|\n", - "|pile_uspto |lm_eval/tasks/pile/pile_uspto.yaml |loglikelihood_rolling|\n", - "|pile_wikipedia |lm_eval/tasks/pile/pile_wikipedia.yaml |loglikelihood_rolling|\n", - "|pile_youtubesubtitles |lm_eval/tasks/pile/pile_youtubesubtitles.yaml |loglikelihood_rolling|\n", - "|piqa |lm_eval/tasks/piqa/piqa.yaml |multiple_choice |\n", - "|piqa_ar |lm_eval/tasks/alghafa/piqa_ar/piqa_ar.yaml |multiple_choice |\n", - "|piqa_ca |lm_eval/tasks/catalan_bench/piqa_ca.yaml |multiple_choice |\n", - "|polemo2_in |lm_eval/tasks/polemo2/polemo2_in.yaml |generate_until |\n", - "|polemo2_out |lm_eval/tasks/polemo2/polemo2_out.yaml |generate_until |\n", - "|prost |lm_eval/tasks/prost/corypaik_prost.yaml |multiple_choice |\n", - "|pubmedqa |lm_eval/tasks/pubmedqa/pubmedqa.yaml |multiple_choice |\n", - "|qa4mre_2011 |lm_eval/tasks/qa4mre/qa4mre_2011.yaml |multiple_choice |\n", - "|qa4mre_2012 |lm_eval/tasks/qa4mre/qa4mre_2012.yaml |multiple_choice |\n", - "|qa4mre_2013 |lm_eval/tasks/qa4mre/qa4mre_2013.yaml |multiple_choice |\n", - "|qasper_bool |lm_eval/tasks/qasper/bool.yaml |multiple_choice |\n", - "|qasper_freeform |lm_eval/tasks/qasper/freeform.yaml |generate_until |\n", - "|qnli |lm_eval/tasks/glue/qnli/default.yaml |multiple_choice |\n", - "|qnlieu |lm_eval/tasks/basqueglue/qnli.yaml |multiple_choice |\n", - "|qqp |lm_eval/tasks/glue/qqp/default.yaml |multiple_choice |\n", - "|race |lm_eval/tasks/race/race.yaml |multiple_choice |\n", - "|random_insertion |lm_eval/tasks/unscramble/random_insertion.yaml |generate_until |\n", - "|realtoxicityprompts |lm_eval/tasks/realtoxicityprompts/realtoxicityprompts.yaml | |\n", - "|record |lm_eval/tasks/super_glue/record/default.yaml |multiple_choice |\n", - "|reversed_words |lm_eval/tasks/unscramble/reversed_words.yaml |generate_until |\n", - "|rte |lm_eval/tasks/glue/rte/default.yaml |multiple_choice |\n", - "|sciq |lm_eval/tasks/sciq/sciq.yaml |multiple_choice |\n", - "|scrolls_contractnli |lm_eval/tasks/scrolls/scrolls_contractnli.yaml | |\n", - "|scrolls_govreport |lm_eval/tasks/scrolls/scrolls_govreport.yaml | |\n", - "|scrolls_narrativeqa |lm_eval/tasks/scrolls/scrolls_narrativeqa.yaml | |\n", - "|scrolls_qasper |lm_eval/tasks/scrolls/scrolls_qasper.yaml | |\n", - "|scrolls_qmsum |lm_eval/tasks/scrolls/scrolls_qmsum.yaml | |\n", - "|scrolls_quality |lm_eval/tasks/scrolls/scrolls_quality.yaml | |\n", - "|scrolls_summscreenfd |lm_eval/tasks/scrolls/scrolls_summscreenfd.yaml | |\n", - "|sglue_rte |lm_eval/tasks/super_glue/rte/default.yaml |multiple_choice |\n", - "|siqa_ca |lm_eval/tasks/catalan_bench/siqa_ca.yaml |multiple_choice |\n", - "|social_iqa |lm_eval/tasks/siqa/siqa.yaml |multiple_choice |\n", - "|squad_completion |lm_eval/tasks/squad_completion/squad_completion.yaml | |\n", - "|squadv2 |lm_eval/tasks/squadv2/squadv2.yaml | |\n", - "|sst2 |lm_eval/tasks/glue/sst2/default.yaml |multiple_choice |\n", - "|storycloze_2016 |lm_eval/tasks/storycloze/storycloze_2016.yaml |multiple_choice |\n", - "|storycloze_2018 |lm_eval/tasks/storycloze/storycloze_2018.yaml |multiple_choice |\n", - "|stsb |lm_eval/tasks/unitxt/stsb.yaml | |\n", - "|summarization_gl |lm_eval/tasks/galician_bench/summarization_gl.yaml |generate_until |\n", - "|super_glue-boolq-t5-prompt |lm_eval/tasks/super_glue/boolq/t5-prompt.yaml |generate_until |\n", - "|super_glue-cb-t5-prompt |lm_eval/tasks/super_glue/cb/t5-prompt.yaml |generate_until |\n", - "|super_glue-copa-t5-prompt |lm_eval/tasks/super_glue/copa/t5-prompt.yaml |generate_until |\n", - "|super_glue-multirc-t5-prompt |lm_eval/tasks/super_glue/multirc/t5-prompt.yaml |generate_until |\n", - "|super_glue-record-t5-prompt |lm_eval/tasks/super_glue/record/t5-prompt.yaml |generate_until |\n", - "|super_glue-rte-t5-prompt |lm_eval/tasks/super_glue/rte/t5-prompt.yaml |generate_until |\n", - "|super_glue-wic-t5-prompt |lm_eval/tasks/super_glue/wic/t5-prompt.yaml |generate_until |\n", - "|super_glue-wsc-t5-prompt |lm_eval/tasks/super_glue/wsc/t5-prompt.yaml |generate_until |\n", - "|swag |lm_eval/tasks/swag/swag.yaml |multiple_choice |\n", - "|swde |lm_eval/tasks/swde/swde.yaml | |\n", - "|sycophancy_on_nlp_survey |lm_eval/tasks/model_written_evals/sycophancy/sycophancy_on_nlp_survey.yaml |multiple_choice |\n", - "|sycophancy_on_philpapers2020 |lm_eval/tasks/model_written_evals/sycophancy/sycophancy_on_philpapers2020.yaml |multiple_choice |\n", - "|sycophancy_on_political_typology_quiz |lm_eval/tasks/model_written_evals/sycophancy/sycophancy_on_political_typology_quiz.yaml |multiple_choice |\n", - "|teca |lm_eval/tasks/catalan_bench/teca.yaml |multiple_choice |\n", - "|tinyArc |lm_eval/tasks/tinyBenchmarks/tinyArc.yaml |multiple_choice |\n", - "|tinyGSM8k |lm_eval/tasks/tinyBenchmarks/tinyGSM8k.yaml |generate_until |\n", - "|tinyHellaswag |lm_eval/tasks/tinyBenchmarks/tinyHellaswag.yaml |multiple_choice |\n", - "|tinyMMLU |lm_eval/tasks/tinyBenchmarks/tinyMMLU.yaml |multiple_choice |\n", - "|tinyTruthfulQA |lm_eval/tasks/tinyBenchmarks/tinyTruthfulQA_mc2.yaml |multiple_choice |\n", - "|tinyTruthfulQA_mc1 |lm_eval/tasks/tinyBenchmarks/tinyTruthfulQA_mc1.yaml |multiple_choice |\n", - "|tinyWinogrande |lm_eval/tasks/tinyBenchmarks/tinyWinogrande.yaml |multiple_choice |\n", - "|tmlu_AST_biology |lm_eval/tasks/tmlu/default/tmlu_AST_biology.yaml |multiple_choice |\n", - "|tmlu_AST_chemistry |lm_eval/tasks/tmlu/default/tmlu_AST_chemistry.yaml |multiple_choice |\n", - "|tmlu_AST_chinese |lm_eval/tasks/tmlu/default/tmlu_AST_chinese.yaml |multiple_choice |\n", - "|tmlu_AST_civics |lm_eval/tasks/tmlu/default/tmlu_AST_civics.yaml |multiple_choice |\n", - "|tmlu_AST_geography |lm_eval/tasks/tmlu/default/tmlu_AST_geography.yaml |multiple_choice |\n", - "|tmlu_AST_history |lm_eval/tasks/tmlu/default/tmlu_AST_history.yaml |multiple_choice |\n", - "|tmlu_CAP_biology |lm_eval/tasks/tmlu/default/tmlu_CAP_biology.yaml |multiple_choice |\n", - "|tmlu_CAP_chemistry |lm_eval/tasks/tmlu/default/tmlu_CAP_chemistry.yaml |multiple_choice |\n", - "|tmlu_CAP_chinese |lm_eval/tasks/tmlu/default/tmlu_CAP_chinese.yaml |multiple_choice |\n", - "|tmlu_CAP_civics |lm_eval/tasks/tmlu/default/tmlu_CAP_civics.yaml |multiple_choice |\n", - "|tmlu_CAP_earth_science |lm_eval/tasks/tmlu/default/tmlu_CAP_earth_science.yaml |multiple_choice |\n", - "|tmlu_CAP_geography |lm_eval/tasks/tmlu/default/tmlu_CAP_geography.yaml |multiple_choice |\n", - "|tmlu_CAP_history |lm_eval/tasks/tmlu/default/tmlu_CAP_history.yaml |multiple_choice |\n", - "|tmlu_GSAT_biology |lm_eval/tasks/tmlu/default/tmlu_GSAT_biology.yaml |multiple_choice |\n", - "|tmlu_GSAT_chemistry |lm_eval/tasks/tmlu/default/tmlu_GSAT_chemistry.yaml |multiple_choice |\n", - "|tmlu_GSAT_chinese |lm_eval/tasks/tmlu/default/tmlu_GSAT_chinese.yaml |multiple_choice |\n", - "|tmlu_GSAT_civics |lm_eval/tasks/tmlu/default/tmlu_GSAT_civics.yaml |multiple_choice |\n", - "|tmlu_GSAT_earth_science |lm_eval/tasks/tmlu/default/tmlu_GSAT_earth_science.yaml |multiple_choice |\n", - "|tmlu_GSAT_geography |lm_eval/tasks/tmlu/default/tmlu_GSAT_geography.yaml |multiple_choice |\n", - "|tmlu_GSAT_history |lm_eval/tasks/tmlu/default/tmlu_GSAT_history.yaml |multiple_choice |\n", - "|tmlu_accountant |lm_eval/tasks/tmlu/default/tmlu_accountant.yaml |multiple_choice |\n", - "|tmlu_basic_traditional_chinese_medicine |lm_eval/tasks/tmlu/default/tmlu_basic_traditional_chinese_medicine.yaml |multiple_choice |\n", - "|tmlu_clinical_psychologist |lm_eval/tasks/tmlu/default/tmlu_clinical_psychologist.yaml |multiple_choice |\n", - "|tmlu_clinical_traditional_chinese_medicine |lm_eval/tasks/tmlu/default/tmlu_clinical_traditional_chinese_medicine.yaml |multiple_choice |\n", - "|tmlu_driving_rule |lm_eval/tasks/tmlu/default/tmlu_driving_rule.yaml |multiple_choice |\n", - "|tmlu_lawyer_qualification |lm_eval/tasks/tmlu/default/tmlu_lawyer_qualification.yaml |multiple_choice |\n", - "|tmlu_nutritionist |lm_eval/tasks/tmlu/default/tmlu_nutritionist.yaml |multiple_choice |\n", - "|tmlu_taiwan_tourist_resources |lm_eval/tasks/tmlu/default/tmlu_taiwan_tourist_resources.yaml |multiple_choice |\n", - "|tmlu_teacher_qualification |lm_eval/tasks/tmlu/default/tmlu_teacher_qualification.yaml |multiple_choice |\n", - "|tmlu_tour_guide |lm_eval/tasks/tmlu/default/tmlu_tour_guide.yaml |multiple_choice |\n", - "|tmlu_tour_leader |lm_eval/tasks/tmlu/default/tmlu_tour_leader.yaml |multiple_choice |\n", - "|tmmluplus_accounting |lm_eval/tasks/tmmluplus/default/tmmluplus_accounting.yaml |multiple_choice |\n", - "|tmmluplus_administrative_law |lm_eval/tasks/tmmluplus/default/tmmluplus_administrative_law.yaml |multiple_choice |\n", - "|tmmluplus_advance_chemistry |lm_eval/tasks/tmmluplus/default/tmmluplus_advance_chemistry.yaml |multiple_choice |\n", - "|tmmluplus_agriculture |lm_eval/tasks/tmmluplus/default/tmmluplus_agriculture.yaml |multiple_choice |\n", - "|tmmluplus_anti_money_laundering |lm_eval/tasks/tmmluplus/default/tmmluplus_anti_money_laundering.yaml |multiple_choice |\n", - "|tmmluplus_auditing |lm_eval/tasks/tmmluplus/default/tmmluplus_auditing.yaml |multiple_choice |\n", - "|tmmluplus_basic_medical_science |lm_eval/tasks/tmmluplus/default/tmmluplus_basic_medical_science.yaml |multiple_choice |\n", - "|tmmluplus_business_management |lm_eval/tasks/tmmluplus/default/tmmluplus_business_management.yaml |multiple_choice |\n", - "|tmmluplus_chinese_language_and_literature |lm_eval/tasks/tmmluplus/default/tmmluplus_chinese_language_and_literature.yaml |multiple_choice |\n", - "|tmmluplus_clinical_psychology |lm_eval/tasks/tmmluplus/default/tmmluplus_clinical_psychology.yaml |multiple_choice |\n", - "|tmmluplus_computer_science |lm_eval/tasks/tmmluplus/default/tmmluplus_computer_science.yaml |multiple_choice |\n", - "|tmmluplus_culinary_skills |lm_eval/tasks/tmmluplus/default/tmmluplus_culinary_skills.yaml |multiple_choice |\n", - "|tmmluplus_dentistry |lm_eval/tasks/tmmluplus/default/tmmluplus_dentistry.yaml |multiple_choice |\n", - "|tmmluplus_economics |lm_eval/tasks/tmmluplus/default/tmmluplus_economics.yaml |multiple_choice |\n", - "|tmmluplus_education |lm_eval/tasks/tmmluplus/default/tmmluplus_education.yaml |multiple_choice |\n", - "|tmmluplus_education_(profession_level) |lm_eval/tasks/tmmluplus/default/tmmluplus_education_(profession_level).yaml |multiple_choice |\n", - "|tmmluplus_educational_psychology |lm_eval/tasks/tmmluplus/default/tmmluplus_educational_psychology.yaml |multiple_choice |\n", - "|tmmluplus_engineering_math |lm_eval/tasks/tmmluplus/default/tmmluplus_engineering_math.yaml |multiple_choice |\n", - "|tmmluplus_finance_banking |lm_eval/tasks/tmmluplus/default/tmmluplus_finance_banking.yaml |multiple_choice |\n", - "|tmmluplus_financial_analysis |lm_eval/tasks/tmmluplus/default/tmmluplus_financial_analysis.yaml |multiple_choice |\n", - "|tmmluplus_fire_science |lm_eval/tasks/tmmluplus/default/tmmluplus_fire_science.yaml |multiple_choice |\n", - "|tmmluplus_general_principles_of_law |lm_eval/tasks/tmmluplus/default/tmmluplus_general_principles_of_law.yaml |multiple_choice |\n", - "|tmmluplus_geography_of_taiwan |lm_eval/tasks/tmmluplus/default/tmmluplus_geography_of_taiwan.yaml |multiple_choice |\n", - "|tmmluplus_human_behavior |lm_eval/tasks/tmmluplus/default/tmmluplus_human_behavior.yaml |multiple_choice |\n", - "|tmmluplus_insurance_studies |lm_eval/tasks/tmmluplus/default/tmmluplus_insurance_studies.yaml |multiple_choice |\n", - "|tmmluplus_introduction_to_law |lm_eval/tasks/tmmluplus/default/tmmluplus_introduction_to_law.yaml |multiple_choice |\n", - "|tmmluplus_jce_humanities |lm_eval/tasks/tmmluplus/default/tmmluplus_jce_humanities.yaml |multiple_choice |\n", - "|tmmluplus_junior_chemistry |lm_eval/tasks/tmmluplus/default/tmmluplus_junior_chemistry.yaml |multiple_choice |\n", - "|tmmluplus_junior_chinese_exam |lm_eval/tasks/tmmluplus/default/tmmluplus_junior_chinese_exam.yaml |multiple_choice |\n", - "|tmmluplus_junior_math_exam |lm_eval/tasks/tmmluplus/default/tmmluplus_junior_math_exam.yaml |multiple_choice |\n", - "|tmmluplus_junior_science_exam |lm_eval/tasks/tmmluplus/default/tmmluplus_junior_science_exam.yaml |multiple_choice |\n", - "|tmmluplus_junior_social_studies |lm_eval/tasks/tmmluplus/default/tmmluplus_junior_social_studies.yaml |multiple_choice |\n", - "|tmmluplus_linear_algebra |lm_eval/tasks/tmmluplus/default/tmmluplus_linear_algebra.yaml |multiple_choice |\n", - "|tmmluplus_logic_reasoning |lm_eval/tasks/tmmluplus/default/tmmluplus_logic_reasoning.yaml |multiple_choice |\n", - "|tmmluplus_macroeconomics |lm_eval/tasks/tmmluplus/default/tmmluplus_macroeconomics.yaml |multiple_choice |\n", - "|tmmluplus_management_accounting |lm_eval/tasks/tmmluplus/default/tmmluplus_management_accounting.yaml |multiple_choice |\n", - "|tmmluplus_marketing_management |lm_eval/tasks/tmmluplus/default/tmmluplus_marketing_management.yaml |multiple_choice |\n", - "|tmmluplus_mechanical |lm_eval/tasks/tmmluplus/default/tmmluplus_mechanical.yaml |multiple_choice |\n", - "|tmmluplus_music |lm_eval/tasks/tmmluplus/default/tmmluplus_music.yaml |multiple_choice |\n", - "|tmmluplus_national_protection |lm_eval/tasks/tmmluplus/default/tmmluplus_national_protection.yaml |multiple_choice |\n", - "|tmmluplus_nautical_science |lm_eval/tasks/tmmluplus/default/tmmluplus_nautical_science.yaml |multiple_choice |\n", - "|tmmluplus_occupational_therapy_for_psychological_disorders |lm_eval/tasks/tmmluplus/default/tmmluplus_occupational_therapy_for_psychological_disorders.yaml |multiple_choice |\n", - "|tmmluplus_official_document_management |lm_eval/tasks/tmmluplus/default/tmmluplus_official_document_management.yaml |multiple_choice |\n", - "|tmmluplus_optometry |lm_eval/tasks/tmmluplus/default/tmmluplus_optometry.yaml |multiple_choice |\n", - "|tmmluplus_organic_chemistry |lm_eval/tasks/tmmluplus/default/tmmluplus_organic_chemistry.yaml |multiple_choice |\n", - "|tmmluplus_pharmacology |lm_eval/tasks/tmmluplus/default/tmmluplus_pharmacology.yaml |multiple_choice |\n", - "|tmmluplus_pharmacy |lm_eval/tasks/tmmluplus/default/tmmluplus_pharmacy.yaml |multiple_choice |\n", - "|tmmluplus_physical_education |lm_eval/tasks/tmmluplus/default/tmmluplus_physical_education.yaml |multiple_choice |\n", - "|tmmluplus_physics |lm_eval/tasks/tmmluplus/default/tmmluplus_physics.yaml |multiple_choice |\n", - "|tmmluplus_politic_science |lm_eval/tasks/tmmluplus/default/tmmluplus_politic_science.yaml |multiple_choice |\n", - "|tmmluplus_real_estate |lm_eval/tasks/tmmluplus/default/tmmluplus_real_estate.yaml |multiple_choice |\n", - "|tmmluplus_secondary_physics |lm_eval/tasks/tmmluplus/default/tmmluplus_secondary_physics.yaml |multiple_choice |\n", - "|tmmluplus_statistics_and_machine_learning |lm_eval/tasks/tmmluplus/default/tmmluplus_statistics_and_machine_learning.yaml |multiple_choice |\n", - "|tmmluplus_taiwanese_hokkien |lm_eval/tasks/tmmluplus/default/tmmluplus_taiwanese_hokkien.yaml |multiple_choice |\n", - "|tmmluplus_taxation |lm_eval/tasks/tmmluplus/default/tmmluplus_taxation.yaml |multiple_choice |\n", - "|tmmluplus_technical |lm_eval/tasks/tmmluplus/default/tmmluplus_technical.yaml |multiple_choice |\n", - "|tmmluplus_three_principles_of_people |lm_eval/tasks/tmmluplus/default/tmmluplus_three_principles_of_people.yaml |multiple_choice |\n", - "|tmmluplus_trade |lm_eval/tasks/tmmluplus/default/tmmluplus_trade.yaml |multiple_choice |\n", - "|tmmluplus_traditional_chinese_medicine_clinical_medicine |lm_eval/tasks/tmmluplus/default/tmmluplus_traditional_chinese_medicine_clinical_medicine.yaml |multiple_choice |\n", - "|tmmluplus_trust_practice |lm_eval/tasks/tmmluplus/default/tmmluplus_trust_practice.yaml |multiple_choice |\n", - "|tmmluplus_ttqav2 |lm_eval/tasks/tmmluplus/default/tmmluplus_ttqav2.yaml |multiple_choice |\n", - "|tmmluplus_tve_chinese_language |lm_eval/tasks/tmmluplus/default/tmmluplus_tve_chinese_language.yaml |multiple_choice |\n", - "|tmmluplus_tve_design |lm_eval/tasks/tmmluplus/default/tmmluplus_tve_design.yaml |multiple_choice |\n", - "|tmmluplus_tve_mathematics |lm_eval/tasks/tmmluplus/default/tmmluplus_tve_mathematics.yaml |multiple_choice |\n", - "|tmmluplus_tve_natural_sciences |lm_eval/tasks/tmmluplus/default/tmmluplus_tve_natural_sciences.yaml |multiple_choice |\n", - "|tmmluplus_veterinary_pathology |lm_eval/tasks/tmmluplus/default/tmmluplus_veterinary_pathology.yaml |multiple_choice |\n", - "|tmmluplus_veterinary_pharmacology |lm_eval/tasks/tmmluplus/default/tmmluplus_veterinary_pharmacology.yaml |multiple_choice |\n", - "|toxigen |lm_eval/tasks/toxigen/toxigen.yaml |multiple_choice |\n", - "|triviaqa |lm_eval/tasks/triviaqa/default.yaml |generate_until |\n", - "|truthfulqa_ar_mc1 |lm_eval/tasks/okapi/truthfulqa_multilingual/truthfulqa_ar_mc1.yaml |multiple_choice |\n", - "|truthfulqa_ar_mc2 |lm_eval/tasks/okapi/truthfulqa_multilingual/truthfulqa_ar_mc2.yaml |multiple_choice |\n", - "|truthfulqa_bn_mc1 |lm_eval/tasks/okapi/truthfulqa_multilingual/truthfulqa_bn_mc1.yaml |multiple_choice |\n", - "|truthfulqa_bn_mc2 |lm_eval/tasks/okapi/truthfulqa_multilingual/truthfulqa_bn_mc2.yaml |multiple_choice |\n", - "|truthfulqa_ca_mc1 |lm_eval/tasks/okapi/truthfulqa_multilingual/truthfulqa_ca_mc1.yaml |multiple_choice |\n", - "|truthfulqa_ca_mc2 |lm_eval/tasks/okapi/truthfulqa_multilingual/truthfulqa_ca_mc2.yaml |multiple_choice |\n", - "|truthfulqa_da_mc1 |lm_eval/tasks/okapi/truthfulqa_multilingual/truthfulqa_da_mc1.yaml |multiple_choice |\n", - "|truthfulqa_da_mc2 |lm_eval/tasks/okapi/truthfulqa_multilingual/truthfulqa_da_mc2.yaml |multiple_choice |\n", - "|truthfulqa_de_mc1 |lm_eval/tasks/okapi/truthfulqa_multilingual/truthfulqa_de_mc1.yaml |multiple_choice |\n", - "|truthfulqa_de_mc2 |lm_eval/tasks/okapi/truthfulqa_multilingual/truthfulqa_de_mc2.yaml |multiple_choice |\n", - "|truthfulqa_es_mc1 |lm_eval/tasks/okapi/truthfulqa_multilingual/truthfulqa_es_mc1.yaml |multiple_choice |\n", - "|truthfulqa_es_mc2 |lm_eval/tasks/okapi/truthfulqa_multilingual/truthfulqa_es_mc2.yaml |multiple_choice |\n", - "|truthfulqa_eu_mc1 |lm_eval/tasks/okapi/truthfulqa_multilingual/truthfulqa_eu_mc1.yaml |multiple_choice |\n", - "|truthfulqa_eu_mc2 |lm_eval/tasks/okapi/truthfulqa_multilingual/truthfulqa_eu_mc2.yaml |multiple_choice |\n", - "|truthfulqa_fr_mc1 |lm_eval/tasks/okapi/truthfulqa_multilingual/truthfulqa_fr_mc1.yaml |multiple_choice |\n", - "|truthfulqa_fr_mc2 |lm_eval/tasks/okapi/truthfulqa_multilingual/truthfulqa_fr_mc2.yaml |multiple_choice |\n", - "|truthfulqa_gen |lm_eval/tasks/truthfulqa/truthfulqa_gen.yaml |generate_until |\n", - "|truthfulqa_gl_gen |lm_eval/tasks/galician_bench/truthfulqa_gl_gen.yaml |generate_until |\n", - "|truthfulqa_gl_mc1 |lm_eval/tasks/galician_bench/truthfulqa_gl_mc1.yaml |multiple_choice |\n", - "|truthfulqa_gl_mc2 |lm_eval/tasks/galician_bench/truthfulqa_gl_mc2.yaml |multiple_choice |\n", - "|truthfulqa_gu_mc1 |lm_eval/tasks/okapi/truthfulqa_multilingual/truthfulqa_gu_mc1.yaml |multiple_choice |\n", - "|truthfulqa_gu_mc2 |lm_eval/tasks/okapi/truthfulqa_multilingual/truthfulqa_gu_mc2.yaml |multiple_choice |\n", - "|truthfulqa_hi_mc1 |lm_eval/tasks/okapi/truthfulqa_multilingual/truthfulqa_hi_mc1.yaml |multiple_choice |\n", - "|truthfulqa_hi_mc2 |lm_eval/tasks/okapi/truthfulqa_multilingual/truthfulqa_hi_mc2.yaml |multiple_choice |\n", - "|truthfulqa_hr_mc1 |lm_eval/tasks/okapi/truthfulqa_multilingual/truthfulqa_hr_mc1.yaml |multiple_choice |\n", - "|truthfulqa_hr_mc2 |lm_eval/tasks/okapi/truthfulqa_multilingual/truthfulqa_hr_mc2.yaml |multiple_choice |\n", - "|truthfulqa_hu_mc1 |lm_eval/tasks/okapi/truthfulqa_multilingual/truthfulqa_hu_mc1.yaml |multiple_choice |\n", - "|truthfulqa_hu_mc2 |lm_eval/tasks/okapi/truthfulqa_multilingual/truthfulqa_hu_mc2.yaml |multiple_choice |\n", - "|truthfulqa_hy_mc1 |lm_eval/tasks/okapi/truthfulqa_multilingual/truthfulqa_hy_mc1.yaml |multiple_choice |\n", - "|truthfulqa_hy_mc2 |lm_eval/tasks/okapi/truthfulqa_multilingual/truthfulqa_hy_mc2.yaml |multiple_choice |\n", - "|truthfulqa_id_mc1 |lm_eval/tasks/okapi/truthfulqa_multilingual/truthfulqa_id_mc1.yaml |multiple_choice |\n", - "|truthfulqa_id_mc2 |lm_eval/tasks/okapi/truthfulqa_multilingual/truthfulqa_id_mc2.yaml |multiple_choice |\n", - "|truthfulqa_it_mc1 |lm_eval/tasks/okapi/truthfulqa_multilingual/truthfulqa_it_mc1.yaml |multiple_choice |\n", - "|truthfulqa_it_mc2 |lm_eval/tasks/okapi/truthfulqa_multilingual/truthfulqa_it_mc2.yaml |multiple_choice |\n", - "|truthfulqa_kn_mc1 |lm_eval/tasks/okapi/truthfulqa_multilingual/truthfulqa_kn_mc1.yaml |multiple_choice |\n", - "|truthfulqa_kn_mc2 |lm_eval/tasks/okapi/truthfulqa_multilingual/truthfulqa_kn_mc2.yaml |multiple_choice |\n", - "|truthfulqa_mc1 |lm_eval/tasks/truthfulqa/truthfulqa_mc1.yaml |multiple_choice |\n", - "|truthfulqa_mc2 |lm_eval/tasks/truthfulqa/truthfulqa_mc2.yaml |multiple_choice |\n", - "|truthfulqa_ml_mc1 |lm_eval/tasks/okapi/truthfulqa_multilingual/truthfulqa_ml_mc1.yaml |multiple_choice |\n", - "|truthfulqa_ml_mc2 |lm_eval/tasks/okapi/truthfulqa_multilingual/truthfulqa_ml_mc2.yaml |multiple_choice |\n", - "|truthfulqa_mr_mc1 |lm_eval/tasks/okapi/truthfulqa_multilingual/truthfulqa_mr_mc1.yaml |multiple_choice |\n", - "|truthfulqa_mr_mc2 |lm_eval/tasks/okapi/truthfulqa_multilingual/truthfulqa_mr_mc2.yaml |multiple_choice |\n", - "|truthfulqa_ne_mc1 |lm_eval/tasks/okapi/truthfulqa_multilingual/truthfulqa_ne_mc1.yaml |multiple_choice |\n", - "|truthfulqa_ne_mc2 |lm_eval/tasks/okapi/truthfulqa_multilingual/truthfulqa_ne_mc2.yaml |multiple_choice |\n", - "|truthfulqa_nl_mc1 |lm_eval/tasks/okapi/truthfulqa_multilingual/truthfulqa_nl_mc1.yaml |multiple_choice |\n", - "|truthfulqa_nl_mc2 |lm_eval/tasks/okapi/truthfulqa_multilingual/truthfulqa_nl_mc2.yaml |multiple_choice |\n", - "|truthfulqa_pt_mc1 |lm_eval/tasks/okapi/truthfulqa_multilingual/truthfulqa_pt_mc1.yaml |multiple_choice |\n", - "|truthfulqa_pt_mc2 |lm_eval/tasks/okapi/truthfulqa_multilingual/truthfulqa_pt_mc2.yaml |multiple_choice |\n", - "|truthfulqa_ro_mc1 |lm_eval/tasks/okapi/truthfulqa_multilingual/truthfulqa_ro_mc1.yaml |multiple_choice |\n", - "|truthfulqa_ro_mc2 |lm_eval/tasks/okapi/truthfulqa_multilingual/truthfulqa_ro_mc2.yaml |multiple_choice |\n", - "|truthfulqa_ru_mc1 |lm_eval/tasks/okapi/truthfulqa_multilingual/truthfulqa_ru_mc1.yaml |multiple_choice |\n", - "|truthfulqa_ru_mc2 |lm_eval/tasks/okapi/truthfulqa_multilingual/truthfulqa_ru_mc2.yaml |multiple_choice |\n", - "|truthfulqa_sk_mc1 |lm_eval/tasks/okapi/truthfulqa_multilingual/truthfulqa_sk_mc1.yaml |multiple_choice |\n", - "|truthfulqa_sk_mc2 |lm_eval/tasks/okapi/truthfulqa_multilingual/truthfulqa_sk_mc2.yaml |multiple_choice |\n", - "|truthfulqa_sr_mc1 |lm_eval/tasks/okapi/truthfulqa_multilingual/truthfulqa_sr_mc1.yaml |multiple_choice |\n", - "|truthfulqa_sr_mc2 |lm_eval/tasks/okapi/truthfulqa_multilingual/truthfulqa_sr_mc2.yaml |multiple_choice |\n", - "|truthfulqa_sv_mc1 |lm_eval/tasks/okapi/truthfulqa_multilingual/truthfulqa_sv_mc1.yaml |multiple_choice |\n", - "|truthfulqa_sv_mc2 |lm_eval/tasks/okapi/truthfulqa_multilingual/truthfulqa_sv_mc2.yaml |multiple_choice |\n", - "|truthfulqa_ta_mc1 |lm_eval/tasks/okapi/truthfulqa_multilingual/truthfulqa_ta_mc1.yaml |multiple_choice |\n", - "|truthfulqa_ta_mc2 |lm_eval/tasks/okapi/truthfulqa_multilingual/truthfulqa_ta_mc2.yaml |multiple_choice |\n", - "|truthfulqa_te_mc1 |lm_eval/tasks/okapi/truthfulqa_multilingual/truthfulqa_te_mc1.yaml |multiple_choice |\n", - "|truthfulqa_te_mc2 |lm_eval/tasks/okapi/truthfulqa_multilingual/truthfulqa_te_mc2.yaml |multiple_choice |\n", - "|truthfulqa_uk_mc1 |lm_eval/tasks/okapi/truthfulqa_multilingual/truthfulqa_uk_mc1.yaml |multiple_choice |\n", - "|truthfulqa_uk_mc2 |lm_eval/tasks/okapi/truthfulqa_multilingual/truthfulqa_uk_mc2.yaml |multiple_choice |\n", - "|truthfulqa_vi_mc1 |lm_eval/tasks/okapi/truthfulqa_multilingual/truthfulqa_vi_mc1.yaml |multiple_choice |\n", - "|truthfulqa_vi_mc2 |lm_eval/tasks/okapi/truthfulqa_multilingual/truthfulqa_vi_mc2.yaml |multiple_choice |\n", - "|truthfulqa_zh_mc1 |lm_eval/tasks/okapi/truthfulqa_multilingual/truthfulqa_zh_mc1.yaml |multiple_choice |\n", - "|truthfulqa_zh_mc2 |lm_eval/tasks/okapi/truthfulqa_multilingual/truthfulqa_zh_mc2.yaml |multiple_choice |\n", - "|turkishmmlu_biology |lm_eval/tasks/turkishmmlu/config/Biology.yaml |multiple_choice |\n", - "|turkishmmlu_chemistry |lm_eval/tasks/turkishmmlu/config/Chemistry.yaml |multiple_choice |\n", - "|turkishmmlu_cot_biology |lm_eval/tasks/turkishmmlu/config_cot/Biology.yaml |generate_until |\n", - "|turkishmmlu_cot_chemistry |lm_eval/tasks/turkishmmlu/config_cot/Chemistry.yaml |generate_until |\n", - "|turkishmmlu_cot_geography |lm_eval/tasks/turkishmmlu/config_cot/Geography.yaml |generate_until |\n", - "|turkishmmlu_cot_history |lm_eval/tasks/turkishmmlu/config_cot/History.yaml |generate_until |\n", - "|turkishmmlu_cot_mathematics |lm_eval/tasks/turkishmmlu/config_cot/Mathematics.yaml |generate_until |\n", - "|turkishmmlu_cot_philosophy |lm_eval/tasks/turkishmmlu/config_cot/Philosophy.yaml |generate_until |\n", - "|turkishmmlu_cot_physics |lm_eval/tasks/turkishmmlu/config_cot/Physics.yaml |generate_until |\n", - "|turkishmmlu_cot_religion_and_ethics |lm_eval/tasks/turkishmmlu/config_cot/Religion_and_Ethics.yaml |generate_until |\n", - "|turkishmmlu_cot_turkish_language_and_literature |lm_eval/tasks/turkishmmlu/config_cot/Turkish_Language_and_Literature.yaml |generate_until |\n", - "|turkishmmlu_geography |lm_eval/tasks/turkishmmlu/config/Geography.yaml |multiple_choice |\n", - "|turkishmmlu_history |lm_eval/tasks/turkishmmlu/config/History.yaml |multiple_choice |\n", - "|turkishmmlu_mathematics |lm_eval/tasks/turkishmmlu/config/Mathematics.yaml |multiple_choice |\n", - "|turkishmmlu_philosophy |lm_eval/tasks/turkishmmlu/config/Philosophy.yaml |multiple_choice |\n", - "|turkishmmlu_physics |lm_eval/tasks/turkishmmlu/config/Physics.yaml |multiple_choice |\n", - "|turkishmmlu_religion_and_ethics |lm_eval/tasks/turkishmmlu/config/Religion_and_Ethics.yaml |multiple_choice |\n", - "|turkishmmlu_turkish_language_and_literature |lm_eval/tasks/turkishmmlu/config/Turkish_Language_and_Literature.yaml |multiple_choice |\n", - "|unfair_tos |lm_eval/tasks/unitxt/unfair_tos.yaml | |\n", - "|vaxx_stance |lm_eval/tasks/basqueglue/vaxx.yaml |multiple_choice |\n", - "|webqs |lm_eval/tasks/webqs/webqs.yaml |multiple_choice |\n", - "|wic |lm_eval/tasks/super_glue/wic/default.yaml |multiple_choice |\n", - "|wiceu |lm_eval/tasks/basqueglue/wic.yaml |multiple_choice |\n", - "|wikitext |lm_eval/tasks/wikitext/wikitext.yaml |loglikelihood_rolling|\n", - "|winogrande |lm_eval/tasks/winogrande/default.yaml |multiple_choice |\n", - "|wmdp_bio |lm_eval/tasks/wmdp/wmdp_bio.yaml |multiple_choice |\n", - "|wmdp_chem |lm_eval/tasks/wmdp/wmdp_chem.yaml |multiple_choice |\n", - "|wmdp_cyber |lm_eval/tasks/wmdp/wmdp_cyber.yaml |multiple_choice |\n", - "|wmt-ro-en-t5-prompt |lm_eval/tasks/wmt2016/ro_en-t5_prompt.yaml |generate_until |\n", - "|wmt14-en-fr |lm_eval/tasks/translation/wmt14_en-fr.yaml |generate_until |\n", - "|wmt14-fr-en |lm_eval/tasks/translation/wmt14_fr-en.yaml |generate_until |\n", - "|wmt16-de-en |lm_eval/tasks/translation/wmt16_de-en.yaml |generate_until |\n", - "|wmt16-en-de |lm_eval/tasks/translation/wmt16_en-de.yaml |generate_until |\n", - "|wmt16-en-ro |lm_eval/tasks/translation/wmt16_en-ro.yaml |generate_until |\n", - "|wmt16-ro-en |lm_eval/tasks/translation/wmt16_ro-en.yaml |generate_until |\n", - "|wnli |lm_eval/tasks/glue/wnli/default.yaml |multiple_choice |\n", - "|wnli_ca |lm_eval/tasks/catalan_bench/wnli_ca.yaml |multiple_choice |\n", - "|wnli_es |lm_eval/tasks/spanish_bench/wnli_es.yaml |multiple_choice |\n", - "|wnli_eu |lm_eval/tasks/basque_bench/wnli_eu.yaml |multiple_choice |\n", - "|wsc |lm_eval/tasks/super_glue/wsc/default.yaml |multiple_choice |\n", - "|wsc273 |lm_eval/tasks/wsc273/default.yaml |multiple_choice |\n", - "|xcopa_et |lm_eval/tasks/xcopa/default_et.yaml |multiple_choice |\n", - "|xcopa_eu |lm_eval/tasks/basque_bench/xcopa_eu.yaml |multiple_choice |\n", - "|xcopa_ht |lm_eval/tasks/xcopa/default_ht.yaml |multiple_choice |\n", - "|xcopa_id |lm_eval/tasks/xcopa/default_id.yaml |multiple_choice |\n", - "|xcopa_it |lm_eval/tasks/xcopa/default_it.yaml |multiple_choice |\n", - "|xcopa_qu |lm_eval/tasks/xcopa/default_qu.yaml |multiple_choice |\n", - "|xcopa_sw |lm_eval/tasks/xcopa/default_sw.yaml |multiple_choice |\n", - "|xcopa_ta |lm_eval/tasks/xcopa/default_ta.yaml |multiple_choice |\n", - "|xcopa_th |lm_eval/tasks/xcopa/default_th.yaml |multiple_choice |\n", - "|xcopa_tr |lm_eval/tasks/xcopa/default_tr.yaml |multiple_choice |\n", - "|xcopa_vi |lm_eval/tasks/xcopa/default_vi.yaml |multiple_choice |\n", - "|xcopa_zh |lm_eval/tasks/xcopa/default_zh.yaml |multiple_choice |\n", - "|xlsum_es |lm_eval/tasks/spanish_bench/xlsum_es.yaml |generate_until |\n", - "|xnli_ar |lm_eval/tasks/xnli/xnli_ar.yaml |multiple_choice |\n", - "|xnli_bg |lm_eval/tasks/xnli/xnli_bg.yaml |multiple_choice |\n", - "|xnli_ca |lm_eval/tasks/catalan_bench/xnli_ca.yaml |multiple_choice |\n", - "|xnli_de |lm_eval/tasks/xnli/xnli_de.yaml |multiple_choice |\n", - "|xnli_el |lm_eval/tasks/xnli/xnli_el.yaml |multiple_choice |\n", - "|xnli_en |lm_eval/tasks/xnli/xnli_en.yaml |multiple_choice |\n", - "|xnli_es |lm_eval/tasks/xnli/xnli_es.yaml |multiple_choice |\n", - "|xnli_es_spanish_bench |lm_eval/tasks/spanish_bench/xnli_es_spanish_bench.yaml |multiple_choice |\n", - "|xnli_eu |lm_eval/tasks/xnli_eu/xnli_eu.yaml |multiple_choice |\n", - "|xnli_eu_mt |lm_eval/tasks/xnli_eu/xnli_eu_mt.yaml |multiple_choice |\n", - "|xnli_eu_native |lm_eval/tasks/xnli_eu/xnli_eu_native.yaml |multiple_choice |\n", - "|xnli_fr |lm_eval/tasks/xnli/xnli_fr.yaml |multiple_choice |\n", - "|xnli_gl |lm_eval/tasks/galician_bench/xnli_gl.yaml |multiple_choice |\n", - "|xnli_hi |lm_eval/tasks/xnli/xnli_hi.yaml |multiple_choice |\n", - "|xnli_ru |lm_eval/tasks/xnli/xnli_ru.yaml |multiple_choice |\n", - "|xnli_sw |lm_eval/tasks/xnli/xnli_sw.yaml |multiple_choice |\n", - "|xnli_th |lm_eval/tasks/xnli/xnli_th.yaml |multiple_choice |\n", - "|xnli_tr |lm_eval/tasks/xnli/xnli_tr.yaml |multiple_choice |\n", - "|xnli_ur |lm_eval/tasks/xnli/xnli_ur.yaml |multiple_choice |\n", - "|xnli_vi |lm_eval/tasks/xnli/xnli_vi.yaml |multiple_choice |\n", - "|xnli_zh |lm_eval/tasks/xnli/xnli_zh.yaml |multiple_choice |\n", - "|xquad_ar |lm_eval/tasks/xquad/xquad_ar.yaml |generate_until |\n", - "|xquad_ca |lm_eval/tasks/catalan_bench/xquad_ca.yaml |generate_until |\n", - "|xquad_de |lm_eval/tasks/xquad/xquad_de.yaml |generate_until |\n", - "|xquad_el |lm_eval/tasks/xquad/xquad_el.yaml |generate_until |\n", - "|xquad_en |lm_eval/tasks/xquad/xquad_en.yaml |generate_until |\n", - "|xquad_es |lm_eval/tasks/xquad/xquad_es.yaml |generate_until |\n", - "|xquad_hi |lm_eval/tasks/xquad/xquad_hi.yaml |generate_until |\n", - "|xquad_ro |lm_eval/tasks/xquad/xquad_ro.yaml |generate_until |\n", - "|xquad_ru |lm_eval/tasks/xquad/xquad_ru.yaml |generate_until |\n", - "|xquad_th |lm_eval/tasks/xquad/xquad_th.yaml |generate_until |\n", - "|xquad_tr |lm_eval/tasks/xquad/xquad_tr.yaml |generate_until |\n", - "|xquad_vi |lm_eval/tasks/xquad/xquad_vi.yaml |generate_until |\n", - "|xquad_zh |lm_eval/tasks/xquad/xquad_zh.yaml |generate_until |\n", - "|xstorycloze_ar |lm_eval/tasks/xstorycloze/default_ar.yaml |multiple_choice |\n", - "|xstorycloze_ca |lm_eval/tasks/catalan_bench/xstorycloze_ca.yaml |multiple_choice |\n", - "|xstorycloze_en |lm_eval/tasks/xstorycloze/default_en.yaml |multiple_choice |\n", - "|xstorycloze_es |lm_eval/tasks/xstorycloze/default_es.yaml |multiple_choice |\n", - "|xstorycloze_eu |lm_eval/tasks/xstorycloze/default_eu.yaml |multiple_choice |\n", - "|xstorycloze_gl |lm_eval/tasks/galician_bench/xstorycloze_gl.yaml |multiple_choice |\n", - "|xstorycloze_hi |lm_eval/tasks/xstorycloze/default_hi.yaml |multiple_choice |\n", - "|xstorycloze_id |lm_eval/tasks/xstorycloze/default_id.yaml |multiple_choice |\n", - "|xstorycloze_my |lm_eval/tasks/xstorycloze/default_my.yaml |multiple_choice |\n", - "|xstorycloze_ru |lm_eval/tasks/xstorycloze/default_ru.yaml |multiple_choice |\n", - "|xstorycloze_sw |lm_eval/tasks/xstorycloze/default_sw.yaml |multiple_choice |\n", - "|xstorycloze_te |lm_eval/tasks/xstorycloze/default_te.yaml |multiple_choice |\n", - "|xstorycloze_zh |lm_eval/tasks/xstorycloze/default_zh.yaml |multiple_choice |\n", - "|xsum |lm_eval/tasks/unitxt/xsum.yaml | |\n", - "|xwinograd_en |lm_eval/tasks/xwinograd/xwinograd_en.yaml |multiple_choice |\n", - "|xwinograd_fr |lm_eval/tasks/xwinograd/xwinograd_fr.yaml |multiple_choice |\n", - "|xwinograd_jp |lm_eval/tasks/xwinograd/xwinograd_jp.yaml |multiple_choice |\n", - "|xwinograd_pt |lm_eval/tasks/xwinograd/xwinograd_pt.yaml |multiple_choice |\n", - "|xwinograd_ru |lm_eval/tasks/xwinograd/xwinograd_ru.yaml |multiple_choice |\n", - "|xwinograd_zh |lm_eval/tasks/xwinograd/xwinograd_zh.yaml |multiple_choice |\n", - "|yahoo_answers_topics |lm_eval/tasks/unitxt/yahoo_answers_topics.yaml | |\n", - "\n", - "\n", - "\n" - ] - } - ] - }, - { - "cell_type": "code", - "source": [ - "!lm_eval --model hf \\\n", - "--model_args pretrained=mistralai/Mixtral-8x7B-Instruct-v0.1 \\\n", - "--tasks mmlu \\\n", - "--output output/mmlu/ \\\n", - "--log_samples \\\n", - "--use_cache cache" - ], - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "1Nxw4WNxZUyb", - "outputId": "6b76b575-add2-4d76-c9fe-f63870b07963" - }, - "execution_count": null, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "/bin/bash: line 1: lm_eval: command not found\n" - ] - } - ] - }, - { - "cell_type": "code", - "source": [ - "!lm_eval \\\n", - " --model hf \\\n", - " --model_args pretrained=EleutherAI/pythia-2.8b \\\n", - " --include_path ./ \\\n", - " --tasks mmlu_en_us \\\n", - " --limit 10 \\\n", - " --device cpu \\\n", - " --output output/mmlu_en_us/ \\\n", - " --log_samples" - ], - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "AmErCJNkkBGl", - "outputId": "8d434610-1c8d-4479-e0e8-a4d06a881187" - }, - "execution_count": null, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "Traceback (most recent call last):\n", - " File \"/usr/local/bin/lm_eval\", line 5, in \n", - " from lm_eval.__main__ import cli_evaluate\n", - " File \"/usr/local/lib/python3.10/dist-packages/lm_eval/__init__.py\", line 1, in \n", - " from .evaluator import evaluate, simple_evaluate\n", - " File \"/usr/local/lib/python3.10/dist-packages/lm_eval/evaluator.py\", line 12, in \n", - " import lm_eval.api.metrics\n", - " File \"/usr/local/lib/python3.10/dist-packages/lm_eval/api/metrics.py\", line 12, in \n", - " from lm_eval.api.registry import register_aggregation, register_metric\n", - " File \"/usr/local/lib/python3.10/dist-packages/lm_eval/api/registry.py\", line 4, in \n", - " import evaluate as hf_evaluate\n", - " File \"/usr/local/lib/python3.10/dist-packages/evaluate/__init__.py\", line 29, in \n", - " from .evaluation_suite import EvaluationSuite\n", - " File \"/usr/local/lib/python3.10/dist-packages/evaluate/evaluation_suite/__init__.py\", line 10, in \n", - " from ..evaluator import evaluator\n", - " File \"/usr/local/lib/python3.10/dist-packages/evaluate/evaluator/__init__.py\", line 17, in \n", - " from transformers.pipelines import SUPPORTED_TASKS as SUPPORTED_PIPELINE_TASKS\n", - " File \"/usr/local/lib/python3.10/dist-packages/transformers/pipelines/__init__.py\", line 26, in \n", - " from ..image_processing_utils import BaseImageProcessor\n", - " File \"/usr/local/lib/python3.10/dist-packages/transformers/image_processing_utils.py\", line 21, in \n", - " from .image_transforms import center_crop, normalize, rescale\n", - " File \"/usr/local/lib/python3.10/dist-packages/transformers/image_transforms.py\", line 22, in \n", - " from .image_utils import (\n", - " File \"/usr/local/lib/python3.10/dist-packages/transformers/image_utils.py\", line 58, in \n", - " from torchvision.transforms import InterpolationMode\n", - " File \"/usr/local/lib/python3.10/dist-packages/torchvision/__init__.py\", line 10, in \n", - " from torchvision import _meta_registrations, datasets, io, models, ops, transforms, utils # usort:skip\n", - " File \"/usr/local/lib/python3.10/dist-packages/torchvision/models/__init__.py\", line 2, in \n", - " from .convnext import *\n", - " File \"/usr/local/lib/python3.10/dist-packages/torchvision/models/convnext.py\", line 8, in \n", - " from ..ops.misc import Conv2dNormActivation, Permute\n", - " File \"/usr/local/lib/python3.10/dist-packages/torchvision/ops/__init__.py\", line 23, in \n", - " from .poolers import MultiScaleRoIAlign\n", - " File \"/usr/local/lib/python3.10/dist-packages/torchvision/ops/poolers.py\", line 10, in \n", - " from .roi_align import roi_align\n", - " File \"/usr/local/lib/python3.10/dist-packages/torchvision/ops/roi_align.py\", line 7, in \n", - " from torch._dynamo.utils import is_compile_supported\n", - " File \"/usr/local/lib/python3.10/dist-packages/torch/_dynamo/__init__.py\", line 3, in \n", - " from . import convert_frame, eval_frame, resume_execution\n", - " File \"/usr/local/lib/python3.10/dist-packages/torch/_dynamo/convert_frame.py\", line 31, in \n", - " from torch._dynamo.utils import CompileTimeInstructionCounter\n", - " File \"/usr/local/lib/python3.10/dist-packages/torch/_dynamo/utils.py\", line 62, in \n", - " import torch.fx.experimental.symbolic_shapes\n", - " File \"/usr/local/lib/python3.10/dist-packages/torch/fx/experimental/symbolic_shapes.py\", line 65, in \n", - " from torch.utils._sympy.functions import (\n", - " File \"/usr/local/lib/python3.10/dist-packages/torch/utils/_sympy/functions.py\", line 7, in \n", - " import sympy\n", - " File \"/usr/local/lib/python3.10/dist-packages/sympy/__init__.py\", line 74, in \n", - " from .polys import (Poly, PurePoly, poly_from_expr, parallel_poly_from_expr,\n", - " File \"/usr/local/lib/python3.10/dist-packages/sympy/polys/__init__.py\", line 79, in \n", - " from .polyfuncs import (symmetrize, horner, interpolate,\n", - " File \"/usr/local/lib/python3.10/dist-packages/sympy/polys/polyfuncs.py\", line 10, in \n", - " from sympy.polys.specialpolys import (\n", - " File \"/usr/local/lib/python3.10/dist-packages/sympy/polys/specialpolys.py\", line 298, in \n", - " from sympy.polys.rings import ring\n", - "KeyboardInterrupt\n", - "^C\n" - ] - } - ] - }, - { - "cell_type": "code", - "source": [ - "YAML_mmlu_en_us_string = \"\"\"\n", - "task: mmlu_en_us\n", - "dataset_path: Ataraxiainc/MMLU_EN_US\n", - "dataset_name: mmlu\n", - "description: \"MMLU dataset in English\"\n", - "test_split: test\n", - "fewshot_split: dev\n", - "fewshot_config:\n", - " sampler: first_n\n", - "output_type: multiple_choice\n", - "doc_to_text: \"{{question.strip()}}\\nA. {{choices[0]}}\\nB. {{choices[1]}}\\nC. {{choices[2]}}\\nD. {{choices[3]}}\\nAnswer:\"\n", - "doc_to_choice: [\"A\", \"B\", \"C\", \"D\"]\n", - "doc_to_target: answer\n", - "metric_list:\n", - " - metric: acc\n", - " aggregation: mean\n", - " higher_is_better: true\n", - " - metric: acc_norm\n", - " aggregation: mean\n", - " higher_is_better: true\n", - "\"\"\"\n", - "with open(\"mmlu_en_us.yaml\", \"w\") as f:\n", - " f.write(YAML_mmlu_en_us_string)" - ], - "metadata": { - "id": "c3cbK79ykiV1" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "!lm_eval \\\n", - " --model hf \\\n", - " --model_args pretrained=mistralai/Mistral-7B-v0.1 \\\n", - " --include_path ./ \\\n", - " --tasks mmlu_en_us \\\n", - " --limit 10 \\\n", - " --device cpu \\\n", - " --output output/mmlu_en_us/ \\\n", - " --log_samples" - ], - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "3cHI2qxN2fJ0", - "outputId": "d11bcb40-01db-455c-dc4b-80686c595d9e" - }, - "execution_count": null, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "2024-11-16 12:02:29.526179: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered\n", - "2024-11-16 12:02:29.551886: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered\n", - "2024-11-16 12:02:29.559333: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered\n", - "2024-11-16 12:02:29.577073: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.\n", - "To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.\n", - "2024-11-16 12:02:31.046000: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT\n", - "2024-11-16:12:02:33,525 INFO [__main__.py:279] Verbosity set to INFO\n", - "2024-11-16:12:02:33,525 INFO [__main__.py:303] Including path: ./\n", - "2024-11-16:12:02:46,438 WARNING [__main__.py:312] --limit SHOULD ONLY BE USED FOR TESTING.REAL METRICS SHOULD NOT BE COMPUTED USING LIMIT.\n", - "2024-11-16:12:02:46,439 INFO [__main__.py:376] Selected Tasks: ['mmlu_en_us']\n", - "2024-11-16:12:02:46,448 INFO [evaluator.py:164] Setting random seed to 0 | Setting numpy seed to 1234 | Setting torch manual seed to 1234 | Setting fewshot manual seed to 1234\n", - "2024-11-16:12:02:46,448 INFO [evaluator.py:201] Initializing hf model, with arguments: {'pretrained': 'mistralai/Mistral-7B-v0.1'}\n", - "2024-11-16:12:02:46,460 INFO [huggingface.py:131] Using device 'cpu'\n", - "2024-11-16:12:02:46,727 INFO [huggingface.py:484] Using model type 'default'\n", - "2024-11-16:12:02:47,082 INFO [huggingface.py:368] Model parallel was set to False, max memory was not set, and device map was set to {'': 'cpu'}\n", - "Loading checkpoint shards: 100% 2/2 [00:02<00:00, 1.02s/it]\n", - "2024-11-16:12:02:55,725 WARNING [model.py:422] model.chat_template was called with the chat_template set to False or None. Therefore no chat template will be applied. Make sure this is an intended behavior.\n", - "2024-11-16:12:02:55,726 INFO [task.py:415] Building contexts for mmlu_en_us on rank 0...\n", - "100% 10/10 [00:00<00:00, 529.16it/s]\n", - "2024-11-16:12:02:55,746 INFO [evaluator.py:494] Running loglikelihood requests\n", - "Running loglikelihood requests: 100% 40/40 [1:26:27<00:00, 129.70s/it]\n", - "fatal: not a git repository (or any of the parent directories): .git\n", - "2024-11-16:13:29:27,578 INFO [evaluation_tracker.py:206] Saving results aggregated\n", - "2024-11-16:13:29:27,580 INFO [evaluation_tracker.py:287] Saving per-sample results for: mmlu_en_us\n", - "hf (pretrained=mistralai/Mistral-7B-v0.1), gen_kwargs: (None), limit: 10.0, num_fewshot: None, batch_size: 1\n", - "| Tasks |Version|Filter|n-shot| Metric | |Value| |Stderr|\n", - "|----------|-------|------|-----:|--------|---|----:|---|-----:|\n", - "|mmlu_en_us|Yaml |none | 0|acc |↑ | 0.3|± |0.1528|\n", - "| | |none | 0|acc_norm|↑ | 0.3|± |0.1528|\n", - "\n" - ] - } - ] - }, - { - "cell_type": "code", - "source": [ - "!lm_eval \\\n", - " --model hf \\\n", - " --model_args pretrained=meta-llama/Meta-Llama-3-8B \\\n", - " --include_path ./ \\\n", - " --tasks mmlu_en_us \\\n", - " --limit 10 \\\n", - " --device cpu \\\n", - " --output output/mmlu_en_us/ \\\n", - " --log_samples" - ], - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "cFFYPzBIYGf7", - "outputId": "ac780071-9448-4534-d947-aaa554b1d349" - }, - "execution_count": null, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "2024-11-16 14:29:34.442475: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered\n", - "2024-11-16 14:29:34.491431: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered\n", - "2024-11-16 14:29:34.506357: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered\n", - "2024-11-16 14:29:34.559901: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.\n", - "To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.\n", - "2024-11-16 14:29:36.784945: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT\n", - "2024-11-16:14:29:40,747 INFO [__main__.py:279] Verbosity set to INFO\n", - "2024-11-16:14:29:40,747 INFO [__main__.py:303] Including path: ./\n", - "2024-11-16:14:29:54,175 WARNING [__main__.py:312] --limit SHOULD ONLY BE USED FOR TESTING.REAL METRICS SHOULD NOT BE COMPUTED USING LIMIT.\n", - "2024-11-16:14:29:54,176 INFO [__main__.py:376] Selected Tasks: ['mmlu_en_us']\n", - "2024-11-16:14:29:54,191 INFO [evaluator.py:164] Setting random seed to 0 | Setting numpy seed to 1234 | Setting torch manual seed to 1234 | Setting fewshot manual seed to 1234\n", - "2024-11-16:14:29:54,192 INFO [evaluator.py:201] Initializing hf model, with arguments: {'pretrained': 'meta-llama/Meta-Llama-3-8B'}\n", - "2024-11-16:14:29:54,199 INFO [huggingface.py:131] Using device 'cpu'\n", - "2024-11-16:14:29:54,820 INFO [huggingface.py:484] Using model type 'default'\n", - "2024-11-16:14:29:56,214 INFO [huggingface.py:368] Model parallel was set to False, max memory was not set, and device map was set to {'': 'cpu'}\n", - "Loading checkpoint shards: 100% 4/4 [00:02<00:00, 1.59it/s]\n", - "generation_config.json: 100% 177/177 [00:00<00:00, 779kB/s]\n", - "2024-11-16:14:30:07,768 WARNING [model.py:422] model.chat_template was called with the chat_template set to False or None. Therefore no chat template will be applied. Make sure this is an intended behavior.\n", - "2024-11-16:14:30:07,770 INFO [task.py:415] Building contexts for mmlu_en_us on rank 0...\n", - "100% 10/10 [00:00<00:00, 444.56it/s]\n", - "2024-11-16:14:30:07,794 INFO [evaluator.py:494] Running loglikelihood requests\n", - "Running loglikelihood requests: 100% 40/40 [1:27:27<00:00, 131.18s/it]\n", - "fatal: not a git repository (or any of the parent directories): .git\n", - "2024-11-16:15:57:39,726 INFO [evaluation_tracker.py:206] Saving results aggregated\n", - "2024-11-16:15:57:39,730 INFO [evaluation_tracker.py:287] Saving per-sample results for: mmlu_en_us\n", - "hf (pretrained=meta-llama/Meta-Llama-3-8B), gen_kwargs: (None), limit: 10.0, num_fewshot: None, batch_size: 1\n", - "| Tasks |Version|Filter|n-shot| Metric | |Value| |Stderr|\n", - "|----------|-------|------|-----:|--------|---|----:|---|-----:|\n", - "|mmlu_en_us|Yaml |none | 0|acc |↑ | 0.2|± |0.1333|\n", - "| | |none | 0|acc_norm|↑ | 0.2|± |0.1333|\n", - "\n" - ] - } - ] - }, - { - "cell_type": "markdown", - "source": [], - "metadata": { - "id": "ZUTPHnV0kMB1" - } - }, - { - "cell_type": "code", - "source": [ - "!lm_eval \\\n", - " --model hf \\\n", - " --model_args pretrained=meta-llama/Meta-Llama-3-8B \\\n", - " --tasks mmlu \\\n", - " --limit 10 \\\n", - " --device cpu \\\n", - " --output output/mmlu_en_us/ \\\n", - " --log_samples" - ], - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "eKG2JT9l_fYL", - "outputId": "4855306b-6111-475d-835d-d9e736a69f8f" - }, - "execution_count": null, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "2024-11-16 17:21:31.817950: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered\n", - "2024-11-16 17:21:32.304353: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered\n", - "2024-11-16 17:21:32.450445: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered\n", - "2024-11-16 17:21:33.249668: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.\n", - "To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.\n", - "2024-11-16 17:21:37.296351: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT\n", - "2024-11-16:17:21:43,546 INFO [__main__.py:279] Verbosity set to INFO\n", - "2024-11-16:17:21:57,719 WARNING [__main__.py:312] --limit SHOULD ONLY BE USED FOR TESTING.REAL METRICS SHOULD NOT BE COMPUTED USING LIMIT.\n", - "2024-11-16:17:21:57,720 INFO [__main__.py:376] Selected Tasks: ['mmlu']\n", - "2024-11-16:17:21:57,729 INFO [evaluator.py:164] Setting random seed to 0 | Setting numpy seed to 1234 | Setting torch manual seed to 1234 | Setting fewshot manual seed to 1234\n", - "2024-11-16:17:21:57,729 INFO [evaluator.py:201] Initializing hf model, with arguments: {'pretrained': 'meta-llama/Meta-Llama-3-8B'}\n", - "2024-11-16:17:21:57,742 INFO [huggingface.py:131] Using device 'cpu'\n", - "config.json: 100% 654/654 [00:00<00:00, 2.52MB/s]\n", - "tokenizer_config.json: 100% 50.6k/50.6k [00:00<00:00, 8.48MB/s]\n", - "tokenizer.json: 100% 9.09M/9.09M [00:00<00:00, 18.8MB/s]\n", - "special_tokens_map.json: 100% 73.0/73.0 [00:00<00:00, 318kB/s]\n", - "2024-11-16:17:22:00,109 INFO [huggingface.py:368] Model parallel was set to False, max memory was not set, and device map was set to {'': 'cpu'}\n", - "model.safetensors.index.json: 100% 23.9k/23.9k [00:00<00:00, 60.7MB/s]\n", - "Downloading shards: 0% 0/4 [00:00