{ "nbformat": 4, "nbformat_minor": 0, "metadata": { "colab": { "provenance": [], "gpuType": "T4" }, "kernelspec": { "name": "python3", "display_name": "Python 3" }, "language_info": { "name": "python" }, "accelerator": "GPU" }, "cells": [ { "cell_type": "markdown", "source": [ "Initial setup" ], "metadata": { "id": "U8RTc2PmnX-v" } }, { "cell_type": "code", "source": [ "!pip install -r https://huggingface.co/flunardelli/llm-metaeval/raw/main/requirements.txt" ], "metadata": { "id": "kGW7vfRkrqHe" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "from huggingface_hub import notebook_login\n", "notebook_login()" ], "metadata": { "id": "2I850FIsCVNw" }, "execution_count": null, "outputs": [] }, { "cell_type": "markdown", "source": [ "Create task for MMLU all datasets" ], "metadata": { "id": "Jd2JwKZaPkNS" } }, { "cell_type": "code", "source": [ "YAML_mmlu_en_us_string = \"\"\"\n", "task: mmlu_all\n", "dataset_path: cais/mmlu\n", "dataset_name: all\n", "description: \"MMLU dataset in English\"\n", "test_split: test\n", "fewshot_split: dev\n", "fewshot_config:\n", " sampler: first_n\n", "output_type: multiple_choice\n", "doc_to_text: \"{{question.strip()}}\\nA. {{choices[0]}}\\nB. {{choices[1]}}\\nC. {{choices[2]}}\\nD. {{choices[3]}}\\nAnswer:\"\n", "doc_to_choice: [\"A\", \"B\", \"C\", \"D\"]\n", "doc_to_target: answer\n", "metric_list:\n", " - metric: acc\n", " aggregation: mean\n", " higher_is_better: true\n", " - metric: acc_norm\n", " aggregation: mean\n", " higher_is_better: true\n", "\"\"\"\n", "with open(\"mmlu_en_us.yaml\", \"w\") as f:\n", " f.write(YAML_mmlu_en_us_string)" ], "metadata": { "id": "xP0cC_sHih7C" }, "execution_count": null, "outputs": [] }, { "cell_type": "markdown", "source": [ "Llama Models" ], "metadata": { "id": "mJjo_A5tP-Td" } }, { "cell_type": "code", "source": [ "!lm_eval --model hf \\\n", " --model_args pretrained=meta-llama/Llama-3.2-1B-Instruct \\\n", " --include_path ./ \\\n", " --tasks mmlu_all \\\n", " --output output/mmlu/ \\\n", " --use_cache cache \\\n", " --device cuda:0 \\\n", " --log_samples\n", " # --limit 10\n" ], "metadata": { "id": "IzP5nyP0Gwk8" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "!lm_eval --model hf \\\n", " --model_args pretrained=meta-llama/Llama-3.2-3B-Instruct \\\n", " --include_path ./ \\\n", " --tasks mmlu_all \\\n", " --output output/mmlu/ \\\n", " --use_cache cache \\\n", " --device cuda:0 \\\n", " --log_samples\n", " # --limit 10" ], "metadata": { "id": "oIACOAhDW5ow" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "!lm_eval --model hf \\\n", " --model_args pretrained=mistralai/Mixtral-8x7B-Instruct-v0.1 \\\n", " --include_path ./ \\\n", " --tasks mmlu_all \\\n", " --output output/mmlu/ \\\n", " --use_cache cache \\\n", " --device cuda:0 \\\n", " --log_samples\n", " # --limit 10" ], "metadata": { "id": "1Nxw4WNxZUyb" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "!lm_eval --model hf \\\n", " --model_args pretrained=meta-llama/Meta-Llama-3-8B \\\n", " --include_path ./ \\\n", " --tasks mmlu_all \\\n", " --output output/mmlu/ \\\n", " --use_cache cache \\\n", " --device cuda:0 \\\n", " --log_samples\n", " # --limit 10" ], "metadata": { "id": "cFFYPzBIYGf7" }, "execution_count": null, "outputs": [] }, { "cell_type": "markdown", "source": [ "Mistral Models" ], "metadata": { "id": "1fEX-49hQ-Be" } }, { "cell_type": "code", "source": [ "!lm_eval --model hf \\\n", " --model_args pretrained=mistralai/Mistral-7B-v0.1 \\\n", " --include_path ./ \\\n", " --tasks mmlu_all \\\n", " --output output/mmlu/ \\\n", " --use_cache cache \\\n", " --device cuda:0 \\\n", " --log_samples\n", " # --limit 10" ], "metadata": { "id": "3cHI2qxN2fJ0" }, "execution_count": null, "outputs": [] }, { "cell_type": "markdown", "source": [], "metadata": { "id": "ZUTPHnV0kMB1" } } ] }