flunardelli committed on
Commit
c607184
·
1 Parent(s): bdc6993

initial notebooks

Browse files
llm_eval_harness_GPU_version.ipynb DELETED
The diff for this file is too large to render. See raw diff
 
llm_metaeval_eval_harness_mmlu.ipynb ADDED
@@ -0,0 +1,214 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "nbformat": 4,
3
+ "nbformat_minor": 0,
4
+ "metadata": {
5
+ "colab": {
6
+ "provenance": [],
7
+ "gpuType": "T4"
8
+ },
9
+ "kernelspec": {
10
+ "name": "python3",
11
+ "display_name": "Python 3"
12
+ },
13
+ "language_info": {
14
+ "name": "python"
15
+ },
16
+ "accelerator": "GPU"
17
+ },
18
+ "cells": [
19
+ {
20
+ "cell_type": "markdown",
21
+ "source": [
22
+ "Initial setup"
23
+ ],
24
+ "metadata": {
25
+ "id": "U8RTc2PmnX-v"
26
+ }
27
+ },
28
+ {
29
+ "cell_type": "code",
30
+ "source": [
31
+ "!pip install -r https://huggingface.co/flunardelli/llm-metaeval/raw/main/requirements.txt"
32
+ ],
33
+ "metadata": {
34
+ "id": "kGW7vfRkrqHe"
35
+ },
36
+ "execution_count": null,
37
+ "outputs": []
38
+ },
39
+ {
40
+ "cell_type": "code",
41
+ "source": [
42
+ "from huggingface_hub import notebook_login\n",
43
+ "notebook_login()"
44
+ ],
45
+ "metadata": {
46
+ "id": "2I850FIsCVNw"
47
+ },
48
+ "execution_count": null,
49
+ "outputs": []
50
+ },
51
+ {
52
+ "cell_type": "markdown",
53
+ "source": [
54
+ "Create the lm-eval task config for the full MMLU dataset (`all` subset)"
55
+ ],
56
+ "metadata": {
57
+ "id": "Jd2JwKZaPkNS"
58
+ }
59
+ },
60
+ {
61
+ "cell_type": "code",
62
+ "source": [
63
+ "YAML_mmlu_en_us_string = \"\"\"\n",
64
+ "task: mmlu_all\n",
65
+ "dataset_path: cais/mmlu\n",
66
+ "dataset_name: all\n",
67
+ "description: \"MMLU dataset in English\"\n",
68
+ "test_split: test\n",
69
+ "fewshot_split: dev\n",
70
+ "fewshot_config:\n",
71
+ " sampler: first_n\n",
72
+ "output_type: multiple_choice\n",
73
+ "doc_to_text: \"{{question.strip()}}\\\\nA. {{choices[0]}}\\\\nB. {{choices[1]}}\\\\nC. {{choices[2]}}\\\\nD. {{choices[3]}}\\\\nAnswer:\"\n",
74
+ "doc_to_choice: [\"A\", \"B\", \"C\", \"D\"]\n",
75
+ "doc_to_target: answer\n",
76
+ "metric_list:\n",
77
+ " - metric: acc\n",
78
+ " aggregation: mean\n",
79
+ " higher_is_better: true\n",
80
+ " - metric: acc_norm\n",
81
+ " aggregation: mean\n",
82
+ " higher_is_better: true\n",
83
+ "\"\"\"\n",
84
+ "with open(\"mmlu_en_us.yaml\", \"w\") as f:\n",
85
+ " f.write(YAML_mmlu_en_us_string)"
86
+ ],
87
+ "metadata": {
88
+ "id": "xP0cC_sHih7C"
89
+ },
90
+ "execution_count": null,
91
+ "outputs": []
92
+ },
93
+ {
94
+ "cell_type": "markdown",
95
+ "source": [
96
+ "Llama and Mixtral models"
97
+ ],
98
+ "metadata": {
99
+ "id": "mJjo_A5tP-Td"
100
+ }
101
+ },
102
+ {
103
+ "cell_type": "code",
104
+ "source": [
105
+ "!lm_eval --model hf \\\n",
106
+ " --model_args pretrained=meta-llama/Llama-3.2-1B-Instruct \\\n",
107
+ " --include_path ./ \\\n",
108
+ " --tasks mmlu_all \\\n",
109
+ " --output output/mmlu/ \\\n",
110
+ " --use_cache cache \\\n",
111
+ " --device cuda:0 \\\n",
112
+ " --log_samples\n",
113
+ " # --limit 10\n"
114
+ ],
115
+ "metadata": {
116
+ "id": "IzP5nyP0Gwk8"
117
+ },
118
+ "execution_count": null,
119
+ "outputs": []
120
+ },
121
+ {
122
+ "cell_type": "code",
123
+ "source": [
124
+ "!lm_eval --model hf \\\n",
125
+ " --model_args pretrained=meta-llama/Llama-3.2-3B-Instruct \\\n",
126
+ " --include_path ./ \\\n",
127
+ " --tasks mmlu_all \\\n",
128
+ " --output output/mmlu/ \\\n",
129
+ " --use_cache cache \\\n",
130
+ " --device cuda:0 \\\n",
131
+ " --log_samples\n",
132
+ " # --limit 10"
133
+ ],
134
+ "metadata": {
135
+ "id": "oIACOAhDW5ow"
136
+ },
137
+ "execution_count": null,
138
+ "outputs": []
139
+ },
140
+ {
141
+ "cell_type": "code",
142
+ "source": [
143
+ "!lm_eval --model hf \\\n",
144
+ " --model_args pretrained=mistralai/Mixtral-8x7B-Instruct-v0.1 \\\n",
145
+ " --include_path ./ \\\n",
146
+ " --tasks mmlu_all \\\n",
147
+ " --output output/mmlu/ \\\n",
148
+ " --use_cache cache \\\n",
149
+ " --device cuda:0 \\\n",
150
+ " --log_samples\n",
151
+ " # --limit 10"
152
+ ],
153
+ "metadata": {
154
+ "id": "1Nxw4WNxZUyb"
155
+ },
156
+ "execution_count": null,
157
+ "outputs": []
158
+ },
159
+ {
160
+ "cell_type": "code",
161
+ "source": [
162
+ "!lm_eval --model hf \\\n",
163
+ " --model_args pretrained=meta-llama/Meta-Llama-3-8B \\\n",
164
+ " --include_path ./ \\\n",
165
+ " --tasks mmlu_all \\\n",
166
+ " --output output/mmlu/ \\\n",
167
+ " --use_cache cache \\\n",
168
+ " --device cuda:0 \\\n",
169
+ " --log_samples\n",
170
+ " # --limit 10"
171
+ ],
172
+ "metadata": {
173
+ "id": "cFFYPzBIYGf7"
174
+ },
175
+ "execution_count": null,
176
+ "outputs": []
177
+ },
178
+ {
179
+ "cell_type": "markdown",
180
+ "source": [
181
+ "Mistral Models"
182
+ ],
183
+ "metadata": {
184
+ "id": "1fEX-49hQ-Be"
185
+ }
186
+ },
187
+ {
188
+ "cell_type": "code",
189
+ "source": [
190
+ "!lm_eval --model hf \\\n",
191
+ " --model_args pretrained=mistralai/Mistral-7B-v0.1 \\\n",
192
+ " --include_path ./ \\\n",
193
+ " --tasks mmlu_all \\\n",
194
+ " --output output/mmlu/ \\\n",
195
+ " --use_cache cache \\\n",
196
+ " --device cuda:0 \\\n",
197
+ " --log_samples\n",
198
+ " # --limit 10"
199
+ ],
200
+ "metadata": {
201
+ "id": "3cHI2qxN2fJ0"
202
+ },
203
+ "execution_count": null,
204
+ "outputs": []
205
+ },
206
+ {
207
+ "cell_type": "markdown",
208
+ "source": [],
209
+ "metadata": {
210
+ "id": "ZUTPHnV0kMB1"
211
+ }
212
+ }
213
+ ]
214
+ }
llm_metaeval_eval_harness_pub.ipynb ADDED
@@ -0,0 +1,266 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "nbformat": 4,
3
+ "nbformat_minor": 0,
4
+ "metadata": {
5
+ "colab": {
6
+ "provenance": [],
7
+ "gpuType": "T4"
8
+ },
9
+ "kernelspec": {
10
+ "name": "python3",
11
+ "display_name": "Python 3"
12
+ },
13
+ "language_info": {
14
+ "name": "python"
15
+ },
16
+ "accelerator": "GPU"
17
+ },
18
+ "cells": [
19
+ {
20
+ "cell_type": "markdown",
21
+ "source": [
22
+ "Initial setup"
23
+ ],
24
+ "metadata": {
25
+ "id": "U8RTc2PmnX-v"
26
+ }
27
+ },
28
+ {
29
+ "cell_type": "code",
30
+ "source": [
31
+ "!pip install -r https://huggingface.co/flunardelli/llm-metaeval/raw/main/requirements.txt"
32
+ ],
33
+ "metadata": {
34
+ "id": "kGW7vfRkrqHe"
35
+ },
36
+ "execution_count": null,
37
+ "outputs": []
38
+ },
39
+ {
40
+ "cell_type": "code",
41
+ "source": [
42
+ "from huggingface_hub import notebook_login\n",
43
+ "notebook_login()"
44
+ ],
45
+ "metadata": {
46
+ "id": "2I850FIsCVNw"
47
+ },
48
+ "execution_count": null,
49
+ "outputs": []
50
+ },
51
+ {
52
+ "cell_type": "markdown",
53
+ "source": [
54
+ "Create one lm-eval task config per PUB subset (task_1 to task_14)"
55
+ ],
56
+ "metadata": {
57
+ "id": "Jd2JwKZaPkNS"
58
+ }
59
+ },
60
+ {
61
+ "cell_type": "code",
62
+ "source": [
63
+ "YAML_template_pub_tasks = [\n",
64
+ " (\"task_1\", 2),\n",
65
+ " (\"task_2\", 5),\n",
66
+ " (\"task_3\", 5),\n",
67
+ " (\"task_4\", 3),\n",
68
+ " (\"task_5\", 2),\n",
69
+ " (\"task_6\", 2),\n",
70
+ " (\"task_7\", 2),\n",
71
+ " (\"task_8\", 2),\n",
72
+ " (\"task_9\", 2),\n",
73
+ " (\"task_10\", 3),\n",
74
+ " (\"task_11\", 3),\n",
75
+ " (\"task_12\", 2),\n",
76
+ " (\"task_13\", 2),\n",
77
+ " (\"task_14\", 4)\n",
78
+ "]\n",
79
+ "\n",
80
+ "default_doc_to_text = \"{{pretext.strip()}}\\n {{options[0]}}\\n{{options[1]}}\\\\n{{options[2]}}\\\\n{{options[3]}}\\\\n{{options[4]}}\\\\nAnswer:\"\n",
81
+ "\n",
82
+ "\n",
83
+ "YAML_template_pub_base = \"\"\"\n",
84
+ "task: __task_name__\n",
85
+ "dataset_path: flunardelli/PUB\n",
86
+ "dataset_name: __dataset_name__\n",
87
+ "description: \"PUB\"\n",
88
+ "test_split: test\n",
89
+ "fewshot_split: test\n",
90
+ "fewshot_config:\n",
91
+ " sampler: first_n\n",
92
+ "num_fewshot: 10\n",
93
+ "output_type: multiple_choice\n",
94
+ "doc_to_text: \"{{pretext.strip()}}\\\\n Options:\\\\n__options__\\\\nAnswer:\"\n",
95
+ "doc_to_choice: \"{{options}}\"\n",
96
+ "doc_to_target: \"correct answer\"\n",
97
+ "metric_list:\n",
98
+ " - metric: acc\n",
99
+ " aggregation: mean\n",
100
+ " higher_is_better: true\n",
101
+ " - metric: acc_norm\n",
102
+ " aggregation: mean\n",
103
+ " higher_is_better: true\n",
104
+ "\"\"\"\n",
105
+ "tasks = []\n",
106
+ "for t in YAML_template_pub_tasks:\n",
107
+ " dataset_name, num_choices = t\n",
108
+ " task_name = f\"pub_{dataset_name}\"\n",
109
+ " tasks.append(task_name)\n",
110
+ " templace_choices = '\\\\n'.join([\"{{options[__i__]}}\".replace('__i__',str(i)) for i in range(num_choices)])\n",
111
+ " template = (YAML_template_pub_base\n",
112
+ " .replace('__options__',templace_choices)\n",
113
+ " .replace('__dataset_name__',dataset_name).replace('__task_name__',task_name)\n",
114
+ " )\n",
115
+ " with open(f\"pub_{dataset_name}.yaml\", \"w\") as f:\n",
116
+ " f.write(template)\n",
117
+ "\n",
118
+ "','.join(tasks)"
119
+ ],
120
+ "metadata": {
121
+ "id": "xP0cC_sHih7C",
122
+ "colab": {
123
+ "base_uri": "https://localhost:8080/",
124
+ "height": 35
125
+ },
126
+ "outputId": "fcf3ed9e-1422-47f3-e234-016435c8b212"
127
+ },
128
+ "execution_count": 1,
129
+ "outputs": [
130
+ {
131
+ "output_type": "execute_result",
132
+ "data": {
133
+ "text/plain": [
134
+ "'pub_task_1,pub_task_2,pub_task_3,pub_task_4,pub_task_5,pub_task_6,pub_task_7,pub_task_8,pub_task_9,pub_task_10,pub_task_11,pub_task_12,pub_task_13,pub_task_14'"
135
+ ],
136
+ "application/vnd.google.colaboratory.intrinsic+json": {
137
+ "type": "string"
138
+ }
139
+ },
140
+ "metadata": {},
141
+ "execution_count": 1
142
+ }
143
+ ]
144
+ },
145
+ {
146
+ "cell_type": "markdown",
147
+ "source": [
148
+ "Llama and Mixtral models"
149
+ ],
150
+ "metadata": {
151
+ "id": "mJjo_A5tP-Td"
152
+ }
153
+ },
154
+ {
155
+ "cell_type": "code",
156
+ "source": [
157
+ "!lm_eval --model hf \\\n",
158
+ " --model_args pretrained=meta-llama/Llama-3.2-1B-Instruct \\\n",
159
+ " --include_path ./ \\\n",
160
+ " --tasks pub_task_1,pub_task_2,pub_task_3,pub_task_4,pub_task_5,pub_task_6,pub_task_7,pub_task_8,pub_task_9,pub_task_10,pub_task_11,pub_task_12,pub_task_13,pub_task_14 \\\n",
161
+ " --output output/pub/ \\\n",
162
+ " --use_cache cache \\\n",
163
+ " --device cuda:0 \\\n",
164
+ " --log_samples\n",
165
+ " # --limit 10\n"
166
+ ],
167
+ "metadata": {
168
+ "id": "IzP5nyP0Gwk8"
169
+ },
170
+ "execution_count": null,
171
+ "outputs": []
172
+ },
173
+ {
174
+ "cell_type": "code",
175
+ "source": [
176
+ "!lm_eval --model hf \\\n",
177
+ " --model_args pretrained=meta-llama/Llama-3.2-3B-Instruct \\\n",
178
+ " --include_path ./ \\\n",
179
+ " --tasks pub_task_1,pub_task_2,pub_task_3,pub_task_4,pub_task_5,pub_task_6,pub_task_7,pub_task_8,pub_task_9,pub_task_10,pub_task_11,pub_task_12,pub_task_13,pub_task_14 \\\n",
180
+ " --output output/pub/ \\\n",
181
+ " --use_cache cache \\\n",
182
+ " --device cuda:0 \\\n",
183
+ " --log_samples\n",
184
+ " # --limit 10"
185
+ ],
186
+ "metadata": {
187
+ "id": "oIACOAhDW5ow"
188
+ },
189
+ "execution_count": null,
190
+ "outputs": []
191
+ },
192
+ {
193
+ "cell_type": "code",
194
+ "source": [
195
+ "!lm_eval --model hf \\\n",
196
+ " --model_args pretrained=mistralai/Mixtral-8x7B-Instruct-v0.1 \\\n",
197
+ " --include_path ./ \\\n",
198
+ " --tasks pub_task_1,pub_task_2,pub_task_3,pub_task_4,pub_task_5,pub_task_6,pub_task_7,pub_task_8,pub_task_9,pub_task_10,pub_task_11,pub_task_12,pub_task_13,pub_task_14 \\\n",
199
+ " --output output/pub/ \\\n",
200
+ " --use_cache cache \\\n",
201
+ " --device cuda:0 \\\n",
202
+ " --log_samples\n",
203
+ " # --limit 10"
204
+ ],
205
+ "metadata": {
206
+ "id": "1Nxw4WNxZUyb"
207
+ },
208
+ "execution_count": null,
209
+ "outputs": []
210
+ },
211
+ {
212
+ "cell_type": "code",
213
+ "source": [
214
+ "!lm_eval --model hf \\\n",
215
+ " --model_args pretrained=meta-llama/Meta-Llama-3-8B \\\n",
216
+ " --include_path ./ \\\n",
217
+ " --tasks pub_task_1,pub_task_2,pub_task_3,pub_task_4,pub_task_5,pub_task_6,pub_task_7,pub_task_8,pub_task_9,pub_task_10,pub_task_11,pub_task_12,pub_task_13,pub_task_14 \\\n",
218
+ " --output output/pub/ \\\n",
219
+ " --use_cache cache \\\n",
220
+ " --device cuda:0 \\\n",
221
+ " --log_samples\n",
222
+ " # --limit 10"
223
+ ],
224
+ "metadata": {
225
+ "id": "cFFYPzBIYGf7"
226
+ },
227
+ "execution_count": null,
228
+ "outputs": []
229
+ },
230
+ {
231
+ "cell_type": "markdown",
232
+ "source": [
233
+ "Mistral Models"
234
+ ],
235
+ "metadata": {
236
+ "id": "1fEX-49hQ-Be"
237
+ }
238
+ },
239
+ {
240
+ "cell_type": "code",
241
+ "source": [
242
+ "!lm_eval --model hf \\\n",
243
+ " --model_args pretrained=mistralai/Mistral-7B-v0.1 \\\n",
244
+ " --include_path ./ \\\n",
245
+ " --tasks pub_task_1,pub_task_2,pub_task_3,pub_task_4,pub_task_5,pub_task_6,pub_task_7,pub_task_8,pub_task_9,pub_task_10,pub_task_11,pub_task_12,pub_task_13,pub_task_14 \\\n",
246
+ " --output output/pub/ \\\n",
247
+ " --use_cache cache \\\n",
248
+ " --device cuda:0 \\\n",
249
+ " --log_samples\n",
250
+ " # --limit 10"
251
+ ],
252
+ "metadata": {
253
+ "id": "3cHI2qxN2fJ0"
254
+ },
255
+ "execution_count": null,
256
+ "outputs": []
257
+ },
258
+ {
259
+ "cell_type": "markdown",
260
+ "source": [],
261
+ "metadata": {
262
+ "id": "ZUTPHnV0kMB1"
263
+ }
264
+ }
265
+ ]
266
+ }