flunardelli
/

llm-metaeval

Model card Files Files and versions Community

flunardelli committed on Nov 19, 2024

Commit

c607184

1 Parent(s): bdc6993

initial notebooks

Browse files

Files changed (3) hide show

llm_eval_harness_GPU_version.ipynb +0 -0
llm_metaeval_eval_harness_mmlu.ipynb +214 -0
llm_metaeval_eval_harness_pub.ipynb +266 -0

llm_eval_harness_GPU_version.ipynb DELETED Viewed

The diff for this file is too large to render. See raw diff

llm_metaeval_eval_harness_mmlu.ipynb ADDED Viewed

	@@ -0,0 +1,214 @@

+{
+  "nbformat": 4,
+  "nbformat_minor": 0,
+  "metadata": {
+    "colab": {
+      "provenance": [],
+      "gpuType": "T4"
+    },
+    "kernelspec": {
+      "name": "python3",
+      "display_name": "Python 3"
+    },
+    "language_info": {
+      "name": "python"
+    },
+    "accelerator": "GPU"
+  },
+  "cells": [
+    {
+      "cell_type": "markdown",
+      "source": [
+        "Initial setup"
+      ],
+      "metadata": {
+        "id": "U8RTc2PmnX-v"
+      }
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "# Install the notebook's dependencies from the requirements file hosted in the llm-metaeval repo.\n",
+        "!pip install -r https://huggingface.co/flunardelli/llm-metaeval/raw/main/requirements.txt"
+      ],
+      "metadata": {
+        "id": "kGW7vfRkrqHe"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "# Authenticate this session with the Hugging Face Hub (interactive prompt).\n",
+        "# NOTE(review): presumably needed to download gated checkpoints such as meta-llama/* - confirm.\n",
+        "from huggingface_hub import notebook_login\n",
+        "notebook_login()"
+      ],
+      "metadata": {
+        "id": "2I850FIsCVNw"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "Create the task definition for the MMLU `all` dataset"
+      ],
+      "metadata": {
+        "id": "Jd2JwKZaPkNS"
+      }
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "# lm-evaluation-harness task definition for the MMLU `all` dataset, written to mmlu_en_us.yaml.\n",
+        "# Raw string on purpose: the backslash-n sequences in doc_to_text must reach the YAML\n",
+        "# file as escape sequences. Written as real line breaks inside a double-quoted YAML\n",
+        "# scalar they are folded into spaces, which flattens the prompt onto a single line.\n",
+        "YAML_mmlu_en_us_string = r\"\"\"\n",
+        "task: mmlu_all\n",
+        "dataset_path: cais/mmlu\n",
+        "dataset_name: all\n",
+        "description: \"MMLU dataset in English\"\n",
+        "test_split: test\n",
+        "fewshot_split: dev\n",
+        "fewshot_config:\n",
+        "  sampler: first_n\n",
+        "output_type: multiple_choice\n",
+        "doc_to_text: \"{{question.strip()}}\\nA. {{choices[0]}}\\nB. {{choices[1]}}\\nC. {{choices[2]}}\\nD. {{choices[3]}}\\nAnswer:\"\n",
+        "doc_to_choice: [\"A\", \"B\", \"C\", \"D\"]\n",
+        "doc_to_target: answer\n",
+        "metric_list:\n",
+        "  - metric: acc\n",
+        "    aggregation: mean\n",
+        "    higher_is_better: true\n",
+        "  - metric: acc_norm\n",
+        "    aggregation: mean\n",
+        "    higher_is_better: true\n",
+        "\"\"\"\n",
+        "# The file lands in the working directory, which the lm_eval cells expose via --include_path ./\n",
+        "with open(\"mmlu_en_us.yaml\", \"w\") as yaml_file:\n",
+        "    yaml_file.write(YAML_mmlu_en_us_string)"
+      ],
+      "metadata": {
+        "id": "xP0cC_sHih7C"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "Llama models (the Mixtral-8x7B-Instruct run also lives in this section)"
+      ],
+      "metadata": {
+        "id": "mJjo_A5tP-Td"
+      }
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "# Evaluate Llama-3.2-1B-Instruct on the custom mmlu_all task defined above.\n",
+        "# For a quick smoke test, append `--limit 10` to the command.\n",
+        "!lm_eval --model hf \\\n",
+        "  --model_args pretrained=meta-llama/Llama-3.2-1B-Instruct \\\n",
+        "  --include_path ./ \\\n",
+        "  --tasks mmlu_all \\\n",
+        "  --output_path output/mmlu/ \\\n",
+        "  --use_cache cache \\\n",
+        "  --device cuda:0 \\\n",
+        "  --log_samples"
+      ],
+      "metadata": {
+        "id": "IzP5nyP0Gwk8"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "# Evaluate Llama-3.2-3B-Instruct on the custom mmlu_all task defined above.\n",
+        "# For a quick smoke test, append `--limit 10` to the command.\n",
+        "!lm_eval --model hf \\\n",
+        "  --model_args pretrained=meta-llama/Llama-3.2-3B-Instruct \\\n",
+        "  --include_path ./ \\\n",
+        "  --tasks mmlu_all \\\n",
+        "  --output_path output/mmlu/ \\\n",
+        "  --use_cache cache \\\n",
+        "  --device cuda:0 \\\n",
+        "  --log_samples"
+      ],
+      "metadata": {
+        "id": "oIACOAhDW5ow"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "# Evaluate Mixtral-8x7B-Instruct-v0.1 on the custom mmlu_all task defined above.\n",
+        "# NOTE(review): Mixtral-8x7B probably does not fit on the T4 GPU selected in this notebook's metadata - confirm the target hardware.\n",
+        "# For a quick smoke test, append `--limit 10` to the command.\n",
+        "!lm_eval --model hf \\\n",
+        "  --model_args pretrained=mistralai/Mixtral-8x7B-Instruct-v0.1 \\\n",
+        "  --include_path ./ \\\n",
+        "  --tasks mmlu_all \\\n",
+        "  --output_path output/mmlu/ \\\n",
+        "  --use_cache cache \\\n",
+        "  --device cuda:0 \\\n",
+        "  --log_samples"
+      ],
+      "metadata": {
+        "id": "1Nxw4WNxZUyb"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "# Evaluate Meta-Llama-3-8B on the custom mmlu_all task defined above.\n",
+        "# For a quick smoke test, append `--limit 10` to the command.\n",
+        "!lm_eval --model hf \\\n",
+        "  --model_args pretrained=meta-llama/Meta-Llama-3-8B \\\n",
+        "  --include_path ./ \\\n",
+        "  --tasks mmlu_all \\\n",
+        "  --output_path output/mmlu/ \\\n",
+        "  --use_cache cache \\\n",
+        "  --device cuda:0 \\\n",
+        "  --log_samples"
+      ],
+      "metadata": {
+        "id": "cFFYPzBIYGf7"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "Mistral Models"
+      ],
+      "metadata": {
+        "id": "1fEX-49hQ-Be"
+      }
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "# Evaluate Mistral-7B-v0.1 on the custom mmlu_all task defined above.\n",
+        "# For a quick smoke test, append `--limit 10` to the command.\n",
+        "!lm_eval --model hf \\\n",
+        "  --model_args pretrained=mistralai/Mistral-7B-v0.1 \\\n",
+        "  --include_path ./ \\\n",
+        "  --tasks mmlu_all \\\n",
+        "  --output_path output/mmlu/ \\\n",
+        "  --use_cache cache \\\n",
+        "  --device cuda:0 \\\n",
+        "  --log_samples"
+      ],
+      "metadata": {
+        "id": "3cHI2qxN2fJ0"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "source": [],
+      "metadata": {
+        "id": "ZUTPHnV0kMB1"
+      }
+    }
+  ]
+}

llm_metaeval_eval_harness_pub.ipynb ADDED Viewed

	@@ -0,0 +1,266 @@

+{
+  "nbformat": 4,
+  "nbformat_minor": 0,
+  "metadata": {
+    "colab": {
+      "provenance": [],
+      "gpuType": "T4"
+    },
+    "kernelspec": {
+      "name": "python3",
+      "display_name": "Python 3"
+    },
+    "language_info": {
+      "name": "python"
+    },
+    "accelerator": "GPU"
+  },
+  "cells": [
+    {
+      "cell_type": "markdown",
+      "source": [
+        "Initial setup"
+      ],
+      "metadata": {
+        "id": "U8RTc2PmnX-v"
+      }
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "# Install the notebook's dependencies from the requirements file hosted in the llm-metaeval repo.\n",
+        "!pip install -r https://huggingface.co/flunardelli/llm-metaeval/raw/main/requirements.txt"
+      ],
+      "metadata": {
+        "id": "kGW7vfRkrqHe"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "# Authenticate this session with the Hugging Face Hub (interactive prompt).\n",
+        "# NOTE(review): presumably needed to download gated checkpoints such as meta-llama/* - confirm.\n",
+        "from huggingface_hub import notebook_login\n",
+        "notebook_login()"
+      ],
+      "metadata": {
+        "id": "2I850FIsCVNw"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "Create one task definition per PUB dataset"
+      ],
+      "metadata": {
+        "id": "Jd2JwKZaPkNS"
+      }
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "# (dataset config name, number of answer options) for each PUB task.\n",
+        "YAML_template_pub_tasks = [\n",
+        "    (\"task_1\", 2),\n",
+        "    (\"task_2\", 5),\n",
+        "    (\"task_3\", 5),\n",
+        "    (\"task_4\", 3),\n",
+        "    (\"task_5\", 2),\n",
+        "    (\"task_6\", 2),\n",
+        "    (\"task_7\", 2),\n",
+        "    (\"task_8\", 2),\n",
+        "    (\"task_9\", 2),\n",
+        "    (\"task_10\", 3),\n",
+        "    (\"task_11\", 3),\n",
+        "    (\"task_12\", 2),\n",
+        "    (\"task_13\", 2),\n",
+        "    (\"task_14\", 4),\n",
+        "]\n",
+        "\n",
+        "# Task template; the __task_name__, __dataset_name__ and __options__ placeholders are\n",
+        "# substituted per task below.\n",
+        "# Raw string on purpose: backslash-n must reach the YAML file as an escape sequence.\n",
+        "# A real line break inside a double-quoted YAML scalar is folded into a space, which\n",
+        "# would collapse the prompt and its options onto a single line.\n",
+        "YAML_template_pub_base = r\"\"\"\n",
+        "task: __task_name__\n",
+        "dataset_path: flunardelli/PUB\n",
+        "dataset_name: __dataset_name__\n",
+        "description: \"PUB\"\n",
+        "test_split: test\n",
+        "fewshot_split: test\n",
+        "fewshot_config:\n",
+        "  sampler: first_n\n",
+        "num_fewshot: 10\n",
+        "output_type: multiple_choice\n",
+        "doc_to_text: \"{{pretext.strip()}}\\nOptions:\\n__options__\\nAnswer:\"\n",
+        "doc_to_choice: \"{{options}}\"\n",
+        "doc_to_target: \"correct answer\"\n",
+        "metric_list:\n",
+        "  - metric: acc\n",
+        "    aggregation: mean\n",
+        "    higher_is_better: true\n",
+        "  - metric: acc_norm\n",
+        "    aggregation: mean\n",
+        "    higher_is_better: true\n",
+        "\"\"\"\n",
+        "\n",
+        "tasks = []\n",
+        "for dataset_name, num_choices in YAML_template_pub_tasks:\n",
+        "    task_name = f\"pub_{dataset_name}\"\n",
+        "    tasks.append(task_name)\n",
+        "    # One Jinja placeholder per answer option, joined by a literal backslash-n (a YAML escape).\n",
+        "    template_choices = r\"\\n\".join(\"{{options[%d]}}\" % i for i in range(num_choices))\n",
+        "    template = (\n",
+        "        YAML_template_pub_base\n",
+        "        .replace(\"__options__\", template_choices)\n",
+        "        .replace(\"__dataset_name__\", dataset_name)\n",
+        "        .replace(\"__task_name__\", task_name)\n",
+        "    )\n",
+        "    # One YAML file per task, written to the working directory (see --include_path ./ below).\n",
+        "    with open(f\"pub_{dataset_name}.yaml\", \"w\") as yaml_file:\n",
+        "        yaml_file.write(template)\n",
+        "\n",
+        "# Comma-separated task list, ready to paste into the --tasks argument of the lm_eval cells.\n",
+        "','.join(tasks)"
+      ],
+      "metadata": {
+        "id": "xP0cC_sHih7C",
+        "colab": {
+          "base_uri": "https://localhost:8080/",
+          "height": 35
+        },
+        "outputId": "fcf3ed9e-1422-47f3-e234-016435c8b212"
+      },
+      "execution_count": 1,
+      "outputs": [
+        {
+          "output_type": "execute_result",
+          "data": {
+            "text/plain": [
+              "'pub_task_1,pub_task_2,pub_task_3,pub_task_4,pub_task_5,pub_task_6,pub_task_7,pub_task_8,pub_task_9,pub_task_10,pub_task_11,pub_task_12,pub_task_13,pub_task_14'"
+            ],
+            "application/vnd.google.colaboratory.intrinsic+json": {
+              "type": "string"
+            }
+          },
+          "metadata": {},
+          "execution_count": 1
+        }
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "Llama models (the Mixtral-8x7B-Instruct run also lives in this section)"
+      ],
+      "metadata": {
+        "id": "mJjo_A5tP-Td"
+      }
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "# Evaluate Llama-3.2-1B-Instruct on the 14 generated PUB tasks.\n",
+        "# For a quick smoke test, append `--limit 10` to the command.\n",
+        "!lm_eval --model hf \\\n",
+        "  --model_args pretrained=meta-llama/Llama-3.2-1B-Instruct \\\n",
+        "  --include_path ./ \\\n",
+        "  --tasks pub_task_1,pub_task_2,pub_task_3,pub_task_4,pub_task_5,pub_task_6,pub_task_7,pub_task_8,pub_task_9,pub_task_10,pub_task_11,pub_task_12,pub_task_13,pub_task_14 \\\n",
+        "  --output_path output/pub/ \\\n",
+        "  --use_cache cache \\\n",
+        "  --device cuda:0 \\\n",
+        "  --log_samples"
+      ],
+      "metadata": {
+        "id": "IzP5nyP0Gwk8"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "# Evaluate Llama-3.2-3B-Instruct on the 14 generated PUB tasks.\n",
+        "# For a quick smoke test, append `--limit 10` to the command.\n",
+        "!lm_eval --model hf \\\n",
+        "  --model_args pretrained=meta-llama/Llama-3.2-3B-Instruct \\\n",
+        "  --include_path ./ \\\n",
+        "  --tasks pub_task_1,pub_task_2,pub_task_3,pub_task_4,pub_task_5,pub_task_6,pub_task_7,pub_task_8,pub_task_9,pub_task_10,pub_task_11,pub_task_12,pub_task_13,pub_task_14 \\\n",
+        "  --output_path output/pub/ \\\n",
+        "  --use_cache cache \\\n",
+        "  --device cuda:0 \\\n",
+        "  --log_samples"
+      ],
+      "metadata": {
+        "id": "oIACOAhDW5ow"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "# Evaluate Mixtral-8x7B-Instruct-v0.1 on the 14 generated PUB tasks.\n",
+        "# NOTE(review): Mixtral-8x7B probably does not fit on the T4 GPU selected in this notebook's metadata - confirm the target hardware.\n",
+        "# For a quick smoke test, append `--limit 10` to the command.\n",
+        "!lm_eval --model hf \\\n",
+        "  --model_args pretrained=mistralai/Mixtral-8x7B-Instruct-v0.1 \\\n",
+        "  --include_path ./ \\\n",
+        "  --tasks pub_task_1,pub_task_2,pub_task_3,pub_task_4,pub_task_5,pub_task_6,pub_task_7,pub_task_8,pub_task_9,pub_task_10,pub_task_11,pub_task_12,pub_task_13,pub_task_14 \\\n",
+        "  --output_path output/pub/ \\\n",
+        "  --use_cache cache \\\n",
+        "  --device cuda:0 \\\n",
+        "  --log_samples"
+      ],
+      "metadata": {
+        "id": "1Nxw4WNxZUyb"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "# Evaluate Meta-Llama-3-8B on the 14 generated PUB tasks.\n",
+        "# For a quick smoke test, append `--limit 10` to the command.\n",
+        "!lm_eval --model hf \\\n",
+        "  --model_args pretrained=meta-llama/Meta-Llama-3-8B \\\n",
+        "  --include_path ./ \\\n",
+        "  --tasks pub_task_1,pub_task_2,pub_task_3,pub_task_4,pub_task_5,pub_task_6,pub_task_7,pub_task_8,pub_task_9,pub_task_10,pub_task_11,pub_task_12,pub_task_13,pub_task_14 \\\n",
+        "  --output_path output/pub/ \\\n",
+        "  --use_cache cache \\\n",
+        "  --device cuda:0 \\\n",
+        "  --log_samples"
+      ],
+      "metadata": {
+        "id": "cFFYPzBIYGf7"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "Mistral Models"
+      ],
+      "metadata": {
+        "id": "1fEX-49hQ-Be"
+      }
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "# Evaluate Mistral-7B-v0.1 on the 14 generated PUB tasks.\n",
+        "# For a quick smoke test, append `--limit 10` to the command.\n",
+        "!lm_eval --model hf \\\n",
+        "  --model_args pretrained=mistralai/Mistral-7B-v0.1 \\\n",
+        "  --include_path ./ \\\n",
+        "  --tasks pub_task_1,pub_task_2,pub_task_3,pub_task_4,pub_task_5,pub_task_6,pub_task_7,pub_task_8,pub_task_9,pub_task_10,pub_task_11,pub_task_12,pub_task_13,pub_task_14 \\\n",
+        "  --output_path output/pub/ \\\n",
+        "  --use_cache cache \\\n",
+        "  --device cuda:0 \\\n",
+        "  --log_samples"
+      ],
+      "metadata": {
+        "id": "3cHI2qxN2fJ0"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "source": [],
+      "metadata": {
+        "id": "ZUTPHnV0kMB1"
+      }
+    }
+  ]
+}