flunardelli
/

llm-metaeval

Model card Files Files and versions Community

File size: 5,335 Bytes

c607184

{
  "nbformat": 4,
  "nbformat_minor": 0,
  "metadata": {
    "colab": {
      "provenance": [],
      "gpuType": "T4"
    },
    "kernelspec": {
      "name": "python3",
      "display_name": "Python 3"
    },
    "language_info": {
      "name": "python"
    },
    "accelerator": "GPU"
  },
  "cells": [
    {
      "cell_type": "markdown",
      "source": [
        "Initial setup"
      ],
      "metadata": {
        "id": "U8RTc2PmnX-v"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "# Install the project's pinned requirements from the Hugging Face repo.\n",
        "# %pip (rather than !pip) installs into the environment of the running kernel.\n",
        "%pip install -r https://huggingface.co/flunardelli/llm-metaeval/raw/main/requirements.txt"
      ],
      "metadata": {
        "id": "kGW7vfRkrqHe"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# Interactive Hugging Face Hub login for this notebook session.\n",
        "# NOTE(review): presumably needed because some checkpoints evaluated below are gated - confirm.\n",
        "from huggingface_hub import notebook_login\n",
        "\n",
        "notebook_login()"
      ],
      "metadata": {
        "id": "2I850FIsCVNw"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "Create a single task covering all MMLU subjects"
      ],
      "metadata": {
        "id": "Jd2JwKZaPkNS"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "# Task definition for lm-evaluation-harness, written to the working directory so that\n",
        "# `--include_path ./` in the evaluation cells can discover the `mmlu_all` task.\n",
        "#\n",
        "# The literal is a raw string on purpose: the backslash-n sequences in doc_to_text must\n",
        "# reach the file as escapes for the YAML parser to decode. In a normal string Python\n",
        "# would turn them into real line breaks, which YAML folds into single spaces inside a\n",
        "# double-quoted scalar, flattening the whole prompt onto one line.\n",
        "YAML_mmlu_en_us_string = r\"\"\"\n",
        "task: mmlu_all\n",
        "dataset_path: cais/mmlu\n",
        "dataset_name: all\n",
        "description: \"MMLU dataset in English\"\n",
        "test_split: test\n",
        "fewshot_split: dev\n",
        "fewshot_config:\n",
        "  sampler: first_n\n",
        "output_type: multiple_choice\n",
        "doc_to_text: \"{{question.strip()}}\\nA. {{choices[0]}}\\nB. {{choices[1]}}\\nC. {{choices[2]}}\\nD. {{choices[3]}}\\nAnswer:\"\n",
        "doc_to_choice: [\"A\", \"B\", \"C\", \"D\"]\n",
        "doc_to_target: answer\n",
        "metric_list:\n",
        "  - metric: acc\n",
        "    aggregation: mean\n",
        "    higher_is_better: true\n",
        "  - metric: acc_norm\n",
        "    aggregation: mean\n",
        "    higher_is_better: true\n",
        "\"\"\"\n",
        "# Explicit encoding keeps the written file independent of the platform default.\n",
        "with open(\"mmlu_en_us.yaml\", \"w\", encoding=\"utf-8\") as f:\n",
        "    f.write(YAML_mmlu_en_us_string)"
      ],
      "metadata": {
        "id": "xP0cC_sHih7C"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "Llama models (this section also runs Mixtral-8x7B-Instruct)"
      ],
      "metadata": {
        "id": "mJjo_A5tP-Td"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "# Evaluate meta-llama/Llama-3.2-1B-Instruct on the custom mmlu_all task.\n",
        "# The response cache gets a model-specific path: lm-eval keys cached responses by the\n",
        "# request, not by the model, so a cache shared across models would replay another\n",
        "# model's answers instead of running this one.\n",
        "# Append `--limit 10` to the command for a quick smoke test.\n",
        "!lm_eval --model hf \\\n",
        "  --model_args pretrained=meta-llama/Llama-3.2-1B-Instruct \\\n",
        "  --include_path ./ \\\n",
        "  --tasks mmlu_all \\\n",
        "  --output_path output/mmlu/ \\\n",
        "  --use_cache cache/Llama-3.2-1B-Instruct \\\n",
        "  --device cuda:0 \\\n",
        "  --log_samples"
      ],
      "metadata": {
        "id": "IzP5nyP0Gwk8"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# Evaluate meta-llama/Llama-3.2-3B-Instruct on the custom mmlu_all task.\n",
        "# The response cache gets a model-specific path: lm-eval keys cached responses by the\n",
        "# request, not by the model, so a cache shared across models would replay another\n",
        "# model's answers instead of running this one.\n",
        "# Append `--limit 10` to the command for a quick smoke test.\n",
        "!lm_eval --model hf \\\n",
        "  --model_args pretrained=meta-llama/Llama-3.2-3B-Instruct \\\n",
        "  --include_path ./ \\\n",
        "  --tasks mmlu_all \\\n",
        "  --output_path output/mmlu/ \\\n",
        "  --use_cache cache/Llama-3.2-3B-Instruct \\\n",
        "  --device cuda:0 \\\n",
        "  --log_samples"
      ],
      "metadata": {
        "id": "oIACOAhDW5ow"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# Evaluate mistralai/Mixtral-8x7B-Instruct-v0.1 on the custom mmlu_all task.\n",
        "# NOTE(review): this checkpoint is far larger than the others in this notebook; confirm it\n",
        "# fits on the GPU selected for the runtime (notebook metadata requests a T4) before running.\n",
        "# The response cache gets a model-specific path: lm-eval keys cached responses by the\n",
        "# request, not by the model, so a cache shared across models would replay another\n",
        "# model's answers instead of running this one.\n",
        "# Append `--limit 10` to the command for a quick smoke test.\n",
        "!lm_eval --model hf \\\n",
        "  --model_args pretrained=mistralai/Mixtral-8x7B-Instruct-v0.1 \\\n",
        "  --include_path ./ \\\n",
        "  --tasks mmlu_all \\\n",
        "  --output_path output/mmlu/ \\\n",
        "  --use_cache cache/Mixtral-8x7B-Instruct-v0.1 \\\n",
        "  --device cuda:0 \\\n",
        "  --log_samples"
      ],
      "metadata": {
        "id": "1Nxw4WNxZUyb"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# Evaluate meta-llama/Meta-Llama-3-8B on the custom mmlu_all task.\n",
        "# The response cache gets a model-specific path: lm-eval keys cached responses by the\n",
        "# request, not by the model, so a cache shared across models would replay another\n",
        "# model's answers instead of running this one.\n",
        "# Append `--limit 10` to the command for a quick smoke test.\n",
        "!lm_eval --model hf \\\n",
        "  --model_args pretrained=meta-llama/Meta-Llama-3-8B \\\n",
        "  --include_path ./ \\\n",
        "  --tasks mmlu_all \\\n",
        "  --output_path output/mmlu/ \\\n",
        "  --use_cache cache/Meta-Llama-3-8B \\\n",
        "  --device cuda:0 \\\n",
        "  --log_samples"
      ],
      "metadata": {
        "id": "cFFYPzBIYGf7"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "Mistral Models"
      ],
      "metadata": {
        "id": "1fEX-49hQ-Be"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "# Evaluate mistralai/Mistral-7B-v0.1 on the custom mmlu_all task.\n",
        "# The response cache gets a model-specific path: lm-eval keys cached responses by the\n",
        "# request, not by the model, so a cache shared across models would replay another\n",
        "# model's answers instead of running this one.\n",
        "# Append `--limit 10` to the command for a quick smoke test.\n",
        "!lm_eval --model hf \\\n",
        "  --model_args pretrained=mistralai/Mistral-7B-v0.1 \\\n",
        "  --include_path ./ \\\n",
        "  --tasks mmlu_all \\\n",
        "  --output_path output/mmlu/ \\\n",
        "  --use_cache cache/Mistral-7B-v0.1 \\\n",
        "  --device cuda:0 \\\n",
        "  --log_samples"
      ],
      "metadata": {
        "id": "3cHI2qxN2fJ0"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [],
      "metadata": {
        "id": "ZUTPHnV0kMB1"
      }
    }
  ]
}