Update llm_metaeval_eval_harness_mmlu.ipynb
Browse files- llm_metaeval_eval_harness_mmlu.ipynb +148 -119
llm_metaeval_eval_harness_mmlu.ipynb
CHANGED
@@ -1,70 +1,90 @@
|
|
1 |
{
|
2 |
-
"nbformat": 4,
|
3 |
-
"nbformat_minor": 0,
|
4 |
-
"metadata": {
|
5 |
-
"colab": {
|
6 |
-
"provenance": [],
|
7 |
-
"gpuType": "T4"
|
8 |
-
},
|
9 |
-
"kernelspec": {
|
10 |
-
"name": "python3",
|
11 |
-
"display_name": "Python 3"
|
12 |
-
},
|
13 |
-
"language_info": {
|
14 |
-
"name": "python"
|
15 |
-
},
|
16 |
-
"accelerator": "GPU"
|
17 |
-
},
|
18 |
"cells": [
|
19 |
{
|
20 |
"cell_type": "markdown",
|
21 |
-
"source": [
|
22 |
-
"Initial setup"
|
23 |
-
],
|
24 |
"metadata": {
|
25 |
"id": "U8RTc2PmnX-v"
|
26 |
-
}
|
|
|
|
|
|
|
27 |
},
|
28 |
{
|
29 |
"cell_type": "code",
|
30 |
-
"
|
31 |
-
"!pip install -r https://huggingface.co/flunardelli/llm-metaeval/raw/main/requirements.txt"
|
32 |
-
],
|
33 |
"metadata": {
|
34 |
"id": "kGW7vfRkrqHe"
|
35 |
},
|
36 |
-
"
|
37 |
-
"
|
|
|
|
|
38 |
},
|
39 |
{
|
40 |
"cell_type": "code",
|
41 |
-
"
|
42 |
-
"from huggingface_hub import notebook_login\n",
|
43 |
-
"notebook_login()"
|
44 |
-
],
|
45 |
"metadata": {
|
46 |
"id": "2I850FIsCVNw"
|
47 |
},
|
48 |
-
"
|
49 |
-
"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
50 |
},
|
51 |
{
|
52 |
"cell_type": "markdown",
|
53 |
-
"source": [
|
54 |
-
"Create task for MMLU all datasets"
|
55 |
-
],
|
56 |
"metadata": {
|
57 |
"id": "Jd2JwKZaPkNS"
|
58 |
-
}
|
|
|
|
|
|
|
59 |
},
|
60 |
{
|
61 |
"cell_type": "code",
|
|
|
|
|
|
|
|
|
|
|
62 |
"source": [
|
63 |
"YAML_mmlu_en_us_string = \"\"\"\n",
|
64 |
"task: mmlu_all\n",
|
65 |
"dataset_path: cais/mmlu\n",
|
66 |
"dataset_name: all\n",
|
67 |
-
"description: \"MMLU dataset
|
68 |
"test_split: test\n",
|
69 |
"fewshot_split: dev\n",
|
70 |
"fewshot_config:\n",
|
@@ -81,134 +101,143 @@
|
|
81 |
" aggregation: mean\n",
|
82 |
" higher_is_better: true\n",
|
83 |
"\"\"\"\n",
|
84 |
-
"
|
85 |
-
|
86 |
-
],
|
87 |
-
"metadata": {
|
88 |
-
"id": "xP0cC_sHih7C"
|
89 |
-
},
|
90 |
-
"execution_count": null,
|
91 |
-
"outputs": []
|
92 |
},
|
93 |
{
|
94 |
"cell_type": "markdown",
|
95 |
-
"source": [
|
96 |
-
"Llama Models"
|
97 |
-
],
|
98 |
"metadata": {
|
99 |
"id": "mJjo_A5tP-Td"
|
100 |
-
}
|
|
|
|
|
|
|
101 |
},
|
102 |
{
|
103 |
"cell_type": "code",
|
104 |
-
"
|
105 |
-
"!lm_eval --model hf \\\n",
|
106 |
-
" --model_args pretrained=meta-llama/Llama-3.2-1B-Instruct \\\n",
|
107 |
-
" --include_path ./ \\\n",
|
108 |
-
" --tasks mmlu_all \\\n",
|
109 |
-
" --output output/mmlu/ \\\n",
|
110 |
-
" --use_cache cache \\\n",
|
111 |
-
" --device cuda:0 \\\n",
|
112 |
-
" --log_samples\n",
|
113 |
-
" # --limit 10\n"
|
114 |
-
],
|
115 |
"metadata": {
|
116 |
"id": "IzP5nyP0Gwk8"
|
117 |
},
|
118 |
-
"
|
119 |
-
"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
120 |
},
|
121 |
{
|
122 |
"cell_type": "code",
|
123 |
-
"
|
124 |
-
"!lm_eval --model hf \\\n",
|
125 |
-
" --model_args pretrained=meta-llama/Llama-3.2-3B-Instruct \\\n",
|
126 |
-
" --include_path ./ \\\n",
|
127 |
-
" --tasks mmlu_all \\\n",
|
128 |
-
" --output output/mmlu/ \\\n",
|
129 |
-
" --use_cache cache \\\n",
|
130 |
-
" --device cuda:0 \\\n",
|
131 |
-
" --log_samples\n",
|
132 |
-
" # --limit 10"
|
133 |
-
],
|
134 |
"metadata": {
|
135 |
"id": "oIACOAhDW5ow"
|
136 |
},
|
137 |
-
"
|
138 |
-
"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
139 |
},
|
140 |
{
|
141 |
"cell_type": "code",
|
|
|
|
|
|
|
|
|
|
|
142 |
"source": [
|
143 |
-
"!
|
144 |
-
"
|
145 |
-
"
|
146 |
-
"
|
147 |
-
"
|
148 |
-
"
|
149 |
-
|
150 |
-
|
151 |
-
|
152 |
-
|
153 |
"metadata": {
|
154 |
-
"id": "
|
155 |
},
|
156 |
-
"
|
157 |
-
|
|
|
158 |
},
|
159 |
{
|
160 |
"cell_type": "code",
|
161 |
"source": [
|
162 |
-
"!
|
163 |
-
"
|
164 |
-
"
|
165 |
-
"
|
166 |
-
"
|
167 |
-
"
|
168 |
-
" --device cuda:0 \\\n",
|
169 |
-
" --log_samples\n",
|
170 |
-
" # --limit 10"
|
171 |
],
|
172 |
"metadata": {
|
173 |
-
"id": "
|
174 |
},
|
175 |
"execution_count": null,
|
176 |
"outputs": []
|
177 |
},
|
178 |
{
|
179 |
-
"cell_type": "
|
|
|
|
|
|
|
|
|
|
|
180 |
"source": [
|
181 |
-
"
|
182 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
183 |
"metadata": {
|
184 |
-
"id": "
|
185 |
-
}
|
|
|
|
|
|
|
186 |
},
|
187 |
{
|
188 |
"cell_type": "code",
|
189 |
"source": [
|
190 |
-
"
|
191 |
-
" --model_args pretrained=mistralai/Mistral-7B-v0.1 \\\n",
|
192 |
-
" --include_path ./ \\\n",
|
193 |
-
" --tasks mmlu_all \\\n",
|
194 |
-
" --output output/mmlu/ \\\n",
|
195 |
-
" --use_cache cache \\\n",
|
196 |
-
" --device cuda:0 \\\n",
|
197 |
-
" --log_samples\n",
|
198 |
-
" # --limit 10"
|
199 |
],
|
200 |
"metadata": {
|
201 |
-
"id": "
|
202 |
},
|
203 |
"execution_count": null,
|
204 |
"outputs": []
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
205 |
},
|
206 |
-
{
|
207 |
-
"
|
208 |
-
"
|
209 |
-
|
210 |
-
|
211 |
-
|
212 |
}
|
213 |
-
|
|
|
|
|
214 |
}
|
|
|
1 |
{
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2 |
"cells": [
|
3 |
{
|
4 |
"cell_type": "markdown",
|
|
|
|
|
|
|
5 |
"metadata": {
|
6 |
"id": "U8RTc2PmnX-v"
|
7 |
+
},
|
8 |
+
"source": [
|
9 |
+
"Initial setup"
|
10 |
+
]
|
11 |
},
|
12 |
{
|
13 |
"cell_type": "code",
|
14 |
+
"execution_count": null,
|
|
|
|
|
15 |
"metadata": {
|
16 |
"id": "kGW7vfRkrqHe"
|
17 |
},
|
18 |
+
"outputs": [],
|
19 |
+
"source": [
|
20 |
+
"!pip install -r https://huggingface.co/flunardelli/llm-metaeval/raw/main/requirements.txt"
|
21 |
+
]
|
22 |
},
|
23 |
{
|
24 |
"cell_type": "code",
|
25 |
+
"execution_count": null,
|
|
|
|
|
|
|
26 |
"metadata": {
|
27 |
"id": "2I850FIsCVNw"
|
28 |
},
|
29 |
+
"outputs": [],
|
30 |
+
"source": [
|
31 |
+
"from datetime import datetime\n",
|
32 |
+
"import os\n",
|
33 |
+
"from huggingface_hub import login, upload_folder\n",
|
34 |
+
"from google.colab import userdata\n",
|
35 |
+
"import shutil\n",
|
36 |
+
"\n",
|
37 |
+
"HF_TOKEN = userdata.get('HUGGING_FACE_WRITE_TOKEN')\n",
|
38 |
+
"login(HF_TOKEN, True)\n",
|
39 |
+
"BASE_DATASET='mmlu'\n",
|
40 |
+
"REPO_ID='flunardelli/llm-metaeval'\n",
|
41 |
+
"BASE_FOLDER=f\"/content/{BASE_DATASET}/\"#{datetime.now().strftime('%Y-%m-%dT%H-%M-%S')}\n",
|
42 |
+
"OUTPUT_FOLDER=os.path.join(BASE_FOLDER,'output')\n",
|
43 |
+
"TASK_FOLDER=os.path.join(BASE_FOLDER,'tasks')\n",
|
44 |
+
"#shutil.rmtree(BASE_FOLDER)\n",
|
45 |
+
"os.makedirs(OUTPUT_FOLDER)\n",
|
46 |
+
"os.makedirs(TASK_FOLDER)\n",
|
47 |
+
"os.environ['HF_TOKEN'] = HF_TOKEN\n",
|
48 |
+
"os.environ['OUTPUT_FOLDER'] = OUTPUT_FOLDER\n",
|
49 |
+
"os.environ['TASK_FOLDER'] = TASK_FOLDER\n",
|
50 |
+
"\n",
|
51 |
+
"def hf_upload_folder(folder_path):\n",
|
52 |
+
" upload_folder(\n",
|
53 |
+
" folder_path=folder_path,\n",
|
54 |
+
" path_in_repo=\"evals/\",\n",
|
55 |
+
" repo_id=REPO_ID,\n",
|
56 |
+
" token=HF_TOKEN,\n",
|
57 |
+
" repo_type=\"dataset\"\n",
|
58 |
+
" )\n",
|
59 |
+
"\n",
|
60 |
+
"def create_task(content, filename):\n",
|
61 |
+
" filename_path = os.path.join(TASK_FOLDER,filename)\n",
|
62 |
+
" with open(filename_path, \"w\") as f:\n",
|
63 |
+
" f.write(content)"
|
64 |
+
]
|
65 |
},
|
66 |
{
|
67 |
"cell_type": "markdown",
|
|
|
|
|
|
|
68 |
"metadata": {
|
69 |
"id": "Jd2JwKZaPkNS"
|
70 |
+
},
|
71 |
+
"source": [
|
72 |
+
"Create task for MMLU all datasets"
|
73 |
+
]
|
74 |
},
|
75 |
{
|
76 |
"cell_type": "code",
|
77 |
+
"execution_count": null,
|
78 |
+
"metadata": {
|
79 |
+
"id": "xP0cC_sHih7C"
|
80 |
+
},
|
81 |
+
"outputs": [],
|
82 |
"source": [
|
83 |
"YAML_mmlu_en_us_string = \"\"\"\n",
|
84 |
"task: mmlu_all\n",
|
85 |
"dataset_path: cais/mmlu\n",
|
86 |
"dataset_name: all\n",
|
87 |
+
"description: \"MMLU dataset\"\n",
|
88 |
"test_split: test\n",
|
89 |
"fewshot_split: dev\n",
|
90 |
"fewshot_config:\n",
|
|
|
101 |
" aggregation: mean\n",
|
102 |
" higher_is_better: true\n",
|
103 |
"\"\"\"\n",
|
104 |
+
"create_task(YAML_mmlu_en_us_string, 'mmlu_en_us.yaml')\n"
|
105 |
+
]
|
|
|
|
|
|
|
|
|
|
|
|
|
106 |
},
|
107 |
{
|
108 |
"cell_type": "markdown",
|
|
|
|
|
|
|
109 |
"metadata": {
|
110 |
"id": "mJjo_A5tP-Td"
|
111 |
+
},
|
112 |
+
"source": [
|
113 |
+
"Llama Models"
|
114 |
+
]
|
115 |
},
|
116 |
{
|
117 |
"cell_type": "code",
|
118 |
+
"execution_count": null,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
119 |
"metadata": {
|
120 |
"id": "IzP5nyP0Gwk8"
|
121 |
},
|
122 |
+
"outputs": [],
|
123 |
+
"source": [
|
124 |
+
"!accelerate launch -m lm_eval \\\n",
|
125 |
+
"--model hf --model_args pretrained=meta-llama/Llama-3.2-1B-Instruct,parallelize=True \\\n",
|
126 |
+
"--tasks mmlu_all \\\n",
|
127 |
+
"--include_path $TASK_FOLDER/. --output $OUTPUT_FOLDER --use_cache cache --log_samples \\\n",
|
128 |
+
"--batch_size 16\n",
|
129 |
+
"#--limit 10 \\"
|
130 |
+
]
|
131 |
},
|
132 |
{
|
133 |
"cell_type": "code",
|
134 |
+
"execution_count": null,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
135 |
"metadata": {
|
136 |
"id": "oIACOAhDW5ow"
|
137 |
},
|
138 |
+
"outputs": [],
|
139 |
+
"source": [
|
140 |
+
"!accelerate launch -m lm_eval \\\n",
|
141 |
+
"--model hf --model_args pretrained=meta-llama/Llama-3.2-3B-Instruct,parallelize=True \\\n",
|
142 |
+
"--tasks mmlu_all \\\n",
|
143 |
+
"--include_path $TASK_FOLDER/. --output $OUTPUT_FOLDER --use_cache cache --log_samples \\\n",
|
144 |
+
"--batch_size 16\n",
|
145 |
+
"#--limit 10 \\"
|
146 |
+
]
|
147 |
},
|
148 |
{
|
149 |
"cell_type": "code",
|
150 |
+
"execution_count": null,
|
151 |
+
"metadata": {
|
152 |
+
"id": "cFFYPzBIYGf7"
|
153 |
+
},
|
154 |
+
"outputs": [],
|
155 |
"source": [
|
156 |
+
"!accelerate launch -m lm_eval \\\n",
|
157 |
+
"--model hf --model_args pretrained=meta-llama/Meta-Llama-3-8B,parallelize=True \\\n",
|
158 |
+
"--tasks mmlu_all \\\n",
|
159 |
+
"--include_path $TASK_FOLDER/. --output $OUTPUT_FOLDER --use_cache cache --log_samples \\\n",
|
160 |
+
"--batch_size 16\n",
|
161 |
+
"#--limit 10 \\"
|
162 |
+
]
|
163 |
+
},
|
164 |
+
{
|
165 |
+
"cell_type": "markdown",
|
166 |
"metadata": {
|
167 |
+
"id": "1fEX-49hQ-Be"
|
168 |
},
|
169 |
+
"source": [
|
170 |
+
"Mistral Models"
|
171 |
+
]
|
172 |
},
|
173 |
{
|
174 |
"cell_type": "code",
|
175 |
"source": [
|
176 |
+
"!accelerate launch -m lm_eval \\\n",
|
177 |
+
"--model hf --model_args pretrained=mistralai/Mixtral-8x7B-Instruct-v0.1,parallelize=True \\\n",
|
178 |
+
"--tasks mmlu_all \\\n",
|
179 |
+
"--include_path $TASK_FOLDER/. --output $OUTPUT_FOLDER --use_cache cache --log_samples \\\n",
|
180 |
+
"--batch_size 16\n",
|
181 |
+
"#--limit 10 \\"
|
|
|
|
|
|
|
182 |
],
|
183 |
"metadata": {
|
184 |
+
"id": "ilu9_ulWTy3p"
|
185 |
},
|
186 |
"execution_count": null,
|
187 |
"outputs": []
|
188 |
},
|
189 |
{
|
190 |
+
"cell_type": "code",
|
191 |
+
"execution_count": null,
|
192 |
+
"metadata": {
|
193 |
+
"id": "3cHI2qxN2fJ0"
|
194 |
+
},
|
195 |
+
"outputs": [],
|
196 |
"source": [
|
197 |
+
"!accelerate launch -m lm_eval \\\n",
|
198 |
+
"--model hf --model_args pretrained=mistralai/Mixtral-8x22B-v0.1,parallelize=True \\\n",
|
199 |
+
"--tasks mmlu_all \\\n",
|
200 |
+
"--include_path $TASK_FOLDER/. --output $OUTPUT_FOLDER --use_cache cache --log_samples \\\n",
|
201 |
+
"--batch_size 16\n",
|
202 |
+
"#--limit 10 \\"
|
203 |
+
]
|
204 |
+
},
|
205 |
+
{
|
206 |
+
"cell_type": "markdown",
|
207 |
"metadata": {
|
208 |
+
"id": "ZUTPHnV0kMB1"
|
209 |
+
},
|
210 |
+
"source": [
|
211 |
+
"Save output results"
|
212 |
+
]
|
213 |
},
|
214 |
{
|
215 |
"cell_type": "code",
|
216 |
"source": [
|
217 |
+
"hf_upload_folder(BASE_FOLDER)"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
218 |
],
|
219 |
"metadata": {
|
220 |
+
"id": "mGGdqBNBzFYL"
|
221 |
},
|
222 |
"execution_count": null,
|
223 |
"outputs": []
|
224 |
+
}
|
225 |
+
],
|
226 |
+
"metadata": {
|
227 |
+
"accelerator": "GPU",
|
228 |
+
"colab": {
|
229 |
+
"gpuType": "T4",
|
230 |
+
"provenance": [],
|
231 |
+
"machine_shape": "hm"
|
232 |
},
|
233 |
+
"kernelspec": {
|
234 |
+
"display_name": "Python 3",
|
235 |
+
"name": "python3"
|
236 |
+
},
|
237 |
+
"language_info": {
|
238 |
+
"name": "python"
|
239 |
}
|
240 |
+
},
|
241 |
+
"nbformat": 4,
|
242 |
+
"nbformat_minor": 0
|
243 |
}
|