{
  "nbformat": 4,
  "nbformat_minor": 0,
  "metadata": {
    "colab": { "provenance": [], "gpuType": "T4" },
    "kernelspec": { "name": "python3", "display_name": "Python 3" },
    "language_info": { "name": "python" },
    "accelerator": "GPU"
  },
  "cells": [
    {
      "cell_type": "markdown",
      "source": [ "Initial setup" ],
      "metadata": { "id": "U8RTc2PmnX-v" }
    },
    {
      "cell_type": "code",
      "source": [
        "# %pip (not !pip) so the install targets this kernel's environment.\n",
        "%pip install -r https://huggingface.co/flunardelli/llm-metaeval/raw/main/requirements.txt"
      ],
      "metadata": { "id": "kGW7vfRkrqHe" },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "from huggingface_hub import notebook_login\n",
        "notebook_login()"
      ],
      "metadata": { "id": "2I850FIsCVNw" },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [ "Create tasks for all PUB datasets" ],
      "metadata": { "id": "Jd2JwKZaPkNS" }
    },
    {
      "cell_type": "code",
      "source": [
        "# PUB dataset subsets and the number of answer options each one provides.\n",
        "PUB_TASKS = [\n",
        "    (\"task_1\", 2),\n",
        "    (\"task_2\", 5),\n",
        "    (\"task_3\", 5),\n",
        "    (\"task_4\", 3),\n",
        "    (\"task_5\", 2),\n",
        "    (\"task_6\", 2),\n",
        "    (\"task_7\", 2),\n",
        "    (\"task_8\", 2),\n",
        "    (\"task_9\", 2),\n",
        "    (\"task_10\", 3),\n",
        "    (\"task_11\", 3),\n",
        "    (\"task_12\", 2),\n",
        "    (\"task_13\", 2),\n",
        "    (\"task_14\", 4)\n",
        "]\n",
        "\n",
        "# lm-eval task config template; __placeholders__ are filled in per dataset below.\n",
        "YAML_template_pub_base = \"\"\"\n",
        "task: __task_name__\n",
        "dataset_path: flunardelli/PUB\n",
        "dataset_name: __dataset_name__\n",
        "description: \"PUB\"\n",
        "test_split: test\n",
        "fewshot_split: test\n",
        "fewshot_config:\n",
        "  sampler: first_n\n",
        "num_fewshot: 10\n",
        "output_type: multiple_choice\n",
        "doc_to_text: \"{{pretext.strip()}}\\n Options:\\n__options__\\nAnswer:\"\n",
        "doc_to_choice: \"{{options}}\"\n",
        "doc_to_target: \"correct answer\"\n",
        "metric_list:\n",
        "  - metric: acc\n",
        "    aggregation: mean\n",
        "    higher_is_better: true\n",
        "  - metric: acc_norm\n",
        "    aggregation: mean\n",
        "    higher_is_better: true\n",
        "\"\"\"\n",
        "\n",
        "tasks = []\n",
        "for dataset_name, num_choices in PUB_TASKS:\n",
        "    task_name = f\"pub_{dataset_name}\"\n",
        "    tasks.append(task_name)\n",
        "    # One \"{{options[i]}}\" line per answer choice of this dataset.\n",
        "    template_choices = '\\n'.join(f\"{{{{options[{i}]}}}}\" for i in range(num_choices))\n",
        "    template = (YAML_template_pub_base\n",
        "        .replace('__options__', template_choices)\n",
        "        .replace('__dataset_name__', dataset_name)\n",
        "        .replace('__task_name__', task_name))\n",
        "    with open(f\"{task_name}.yaml\", \"w\") as f:\n",
        "        f.write(template)\n",
        "\n",
        "# Single source of truth for the task list; the lm_eval cells below\n",
        "# interpolate this value instead of repeating a hard-coded list.\n",
        "TASKS_CSV = ','.join(tasks)\n",
        "TASKS_CSV"
      ],
      "metadata": { "id": "xP0cC_sHih7C" },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [ "Llama Models" ],
      "metadata": { "id": "mJjo_A5tP-Td" }
    },
    {
      "cell_type": "code",
      "source": [
        "!lm_eval --model hf \\\n",
        "  --model_args pretrained=meta-llama/Llama-3.2-1B-Instruct \\\n",
        "  --include_path ./ \\\n",
        "  --tasks {TASKS_CSV} \\\n",
        "  --output output/pub/ \\\n",
        "  --use_cache cache \\\n",
        "  --device cuda:0 \\\n",
        "  --log_samples"
      ],
      "metadata": { "id": "IzP5nyP0Gwk8" },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "!lm_eval --model hf \\\n",
        "  --model_args pretrained=meta-llama/Llama-3.2-3B-Instruct \\\n",
        "  --include_path ./ \\\n",
        "  --tasks {TASKS_CSV} \\\n",
        "  --output output/pub/ \\\n",
        "  --use_cache cache \\\n",
        "  --device cuda:0 \\\n",
        "  --log_samples"
      ],
      "metadata": { "id": "oIACOAhDW5ow" },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "!lm_eval --model hf \\\n",
        "  --model_args pretrained=meta-llama/Meta-Llama-3-8B \\\n",
        "  --include_path ./ \\\n",
        "  --tasks {TASKS_CSV} \\\n",
        "  --output output/pub/ \\\n",
        "  --use_cache cache \\\n",
        "  --device cuda:0 \\\n",
        "  --log_samples"
      ],
      "metadata": { "id": "cFFYPzBIYGf7" },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [ "Mistral Models" ],
      "metadata": { "id": "1fEX-49hQ-Be" }
    },
    {
      "cell_type": "code",
      "source": [
        "!lm_eval --model hf \\\n",
        "  --model_args pretrained=mistralai/Mixtral-8x7B-Instruct-v0.1 \\\n",
        "  --include_path ./ \\\n",
        "  --tasks {TASKS_CSV} \\\n",
        "  --output output/pub/ \\\n",
        "  --use_cache cache \\\n",
        "  --device cuda:0 \\\n",
        "  --log_samples"
      ],
      "metadata": { "id": "1Nxw4WNxZUyb" },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "!lm_eval --model hf \\\n",
        "  --model_args pretrained=mistralai/Mistral-7B-v0.1 \\\n",
        "  --include_path ./ \\\n",
        "  --tasks {TASKS_CSV} \\\n",
        "  --output output/pub/ \\\n",
        "  --use_cache cache \\\n",
        "  --device cuda:0 \\\n",
        "  --log_samples"
      ],
      "metadata": { "id": "3cHI2qxN2fJ0" },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [],
      "metadata": { "id": "ZUTPHnV0kMB1" }
    }
  ]
}