derek-thomas HF staff committed on
Commit
da59d46
·
1 Parent(s): f140b0f

Bug fix: right padding, and wrong model for final answer

Browse files
02-autotrain.ipynb CHANGED
@@ -18,9 +18,13 @@
18
  },
19
  {
20
  "cell_type": "code",
21
- "execution_count": null,
22
  "id": "52543575-f92e-4038-ad13-30967f47eb7a",
23
- "metadata": {},
 
 
 
 
24
  "outputs": [],
25
  "source": [
26
  "import os\n",
@@ -37,6 +41,32 @@
37
  "## Config"
38
  ]
39
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
40
  {
41
  "cell_type": "markdown",
42
  "id": "97c25070-775a-4fb1-9694-4579250686a6",
@@ -58,7 +88,7 @@
58
  },
59
  {
60
  "cell_type": "code",
61
- "execution_count": null,
62
  "id": "dc2a8514-51c1-404b-8cfa-6637cc810668",
63
  "metadata": {},
64
  "outputs": [],
@@ -80,15 +110,15 @@
80
  " },\n",
81
  " },\n",
82
  " \"params\": {\n",
83
- " \"block_size\": 1024,\n",
84
- " \"model_max_length\": 1024,\n",
85
  " \"epochs\": 2,\n",
86
  " \"batch_size\": 1,\n",
87
  " \"lr\": 3e-5,\n",
88
  " \"peft\": True,\n",
89
  " \"quantization\": \"int4\",\n",
90
  " \"target_modules\": \"all-linear\",\n",
91
- " \"padding\": \"left\",\n",
92
  " \"optimizer\": \"adamw_torch\",\n",
93
  " \"scheduler\": \"linear\",\n",
94
  " \"gradient_accumulation\": 8,\n",
@@ -96,7 +126,7 @@
96
  " },\n",
97
  " \"hub\": {\n",
98
  " \"username\": \"derek-thomas\",\n",
99
- " \"token\": os.getenv('HF_TOKEN'),\n",
100
  " \"push_to_hub\": True,\n",
101
  " },\n",
102
  "}"
@@ -113,7 +143,7 @@
113
  },
114
  {
115
  "cell_type": "code",
116
- "execution_count": null,
117
  "id": "957eb2b7-feec-422f-ba46-b293d9a77c1b",
118
  "metadata": {},
119
  "outputs": [],
@@ -133,7 +163,7 @@
133
  },
134
  {
135
  "cell_type": "code",
136
- "execution_count": null,
137
  "id": "b86702bf-f494-4951-863e-be5b8462fbd1",
138
  "metadata": {},
139
  "outputs": [],
@@ -152,10 +182,47 @@
152
  },
153
  {
154
  "cell_type": "code",
155
- "execution_count": null,
156
  "id": "025ccd2f-de54-4ac2-9f36-f606876dcd3c",
157
  "metadata": {},
158
- "outputs": [],
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
159
  "source": [
160
  "# Generate configs and run commands\n",
161
  "for project_suffix, text_column in zip(project_suffixes, text_columns):\n",
@@ -173,6 +240,14 @@
173
  " print(f\"Running autotrain with config: {config_path}\")\n",
174
  " subprocess.run([\"autotrain\", \"--config\", config_path])"
175
  ]
 
 
 
 
 
 
 
 
176
  }
177
  ],
178
  "metadata": {
 
18
  },
19
  {
20
  "cell_type": "code",
21
+ "execution_count": 1,
22
  "id": "52543575-f92e-4038-ad13-30967f47eb7a",
23
+ "metadata": {
24
+ "jupyter": {
25
+ "is_executing": true
26
+ }
27
+ },
28
  "outputs": [],
29
  "source": [
30
  "import os\n",
 
41
  "## Config"
42
  ]
43
  },
44
+ {
45
+ "cell_type": "code",
46
+ "execution_count": 3,
47
+ "id": "6992324b-173c-4335-b557-cf78fbb2dd93",
48
+ "metadata": {},
49
+ "outputs": [
50
+ {
51
+ "data": {
52
+ "application/vnd.jupyter.widget-view+json": {
53
+ "model_id": "24ea5bd118ed4632a6ad859c4c976e66",
54
+ "version_major": 2,
55
+ "version_minor": 0
56
+ },
57
+ "text/plain": [
58
+ "VBox(children=(HTML(value='<center> <img\\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…"
59
+ ]
60
+ },
61
+ "metadata": {},
62
+ "output_type": "display_data"
63
+ }
64
+ ],
65
+ "source": [
66
+ "from huggingface_hub import login, get_token\n",
67
+ "login()"
68
+ ]
69
+ },
70
  {
71
  "cell_type": "markdown",
72
  "id": "97c25070-775a-4fb1-9694-4579250686a6",
 
88
  },
89
  {
90
  "cell_type": "code",
91
+ "execution_count": 4,
92
  "id": "dc2a8514-51c1-404b-8cfa-6637cc810668",
93
  "metadata": {},
94
  "outputs": [],
 
110
  " },\n",
111
  " },\n",
112
  " \"params\": {\n",
113
+ " \"block_size\": 512,\n",
114
+ " \"model_max_length\": 1500,\n",
115
  " \"epochs\": 2,\n",
116
  " \"batch_size\": 1,\n",
117
  " \"lr\": 3e-5,\n",
118
  " \"peft\": True,\n",
119
  " \"quantization\": \"int4\",\n",
120
  " \"target_modules\": \"all-linear\",\n",
121
+ " \"padding\": \"right\",\n",
122
  " \"optimizer\": \"adamw_torch\",\n",
123
  " \"scheduler\": \"linear\",\n",
124
  " \"gradient_accumulation\": 8,\n",
 
126
  " },\n",
127
  " \"hub\": {\n",
128
  " \"username\": \"derek-thomas\",\n",
129
+ " \"token\": get_token(),\n",
130
  " \"push_to_hub\": True,\n",
131
  " },\n",
132
  "}"
 
143
  },
144
  {
145
  "cell_type": "code",
146
+ "execution_count": 5,
147
  "id": "957eb2b7-feec-422f-ba46-b293d9a77c1b",
148
  "metadata": {},
149
  "outputs": [],
 
163
  },
164
  {
165
  "cell_type": "code",
166
+ "execution_count": 6,
167
  "id": "b86702bf-f494-4951-863e-be5b8462fbd1",
168
  "metadata": {},
169
  "outputs": [],
 
182
  },
183
  {
184
  "cell_type": "code",
185
+ "execution_count": 9,
186
  "id": "025ccd2f-de54-4ac2-9f36-f606876dcd3c",
187
  "metadata": {},
188
+ "outputs": [
189
+ {
190
+ "name": "stdout",
191
+ "output_type": "stream",
192
+ "text": [
193
+ "Running autotrain with config: ./autotrain_configs/conversation_RFA_gpt3_5.yml\n",
194
+ "INFO | 2024-12-12 20:45:45 | autotrain.cli.autotrain:main:60 - Using AutoTrain configuration: ./autotrain_configs/conversation_RFA_gpt3_5.yml\n",
195
+ "INFO | 2024-12-12 20:45:45 | autotrain.parser:__post_init__:170 - Running task: lm_training\n",
196
+ "INFO | 2024-12-12 20:45:45 | autotrain.parser:__post_init__:171 - Using backend: spaces-l4x1\n",
197
+ "INFO | 2024-12-12 20:45:45 | autotrain.parser:run:234 - {'model': 'mistralai/Mistral-7B-Instruct-v0.3', 'project_name': 'mistral-v03-poe-RFA-gpt3-5', 'data_path': 'derek-thomas/labeled-multiple-choice-explained-mistral-tokenized', 'train_split': 'train', 'valid_split': None, 'add_eos_token': True, 'block_size': 512, 'model_max_length': 1500, 'padding': 'right', 'trainer': 'sft', 'use_flash_attention_2': False, 'log': 'tensorboard', 'disable_gradient_checkpointing': False, 'logging_steps': -1, 'eval_strategy': 'epoch', 'save_total_limit': 1, 'auto_find_batch_size': False, 'mixed_precision': 'bf16', 'lr': 3e-05, 'epochs': 2, 'batch_size': 1, 'warmup_ratio': 0.1, 'gradient_accumulation': 8, 'optimizer': 'adamw_torch', 'scheduler': 'linear', 'weight_decay': 0.0, 'max_grad_norm': 1.0, 'seed': 42, 'chat_template': 'none', 'quantization': 'int4', 'target_modules': 'all-linear', 'merge_adapter': False, 'peft': True, 'lora_r': 16, 'lora_alpha': 32, 'lora_dropout': 0.05, 'model_ref': None, 'dpo_beta': 0.1, 'max_prompt_length': 128, 'max_completion_length': None, 'prompt_text_column': None, 'text_column': 'conversation_RFA_gpt3_5', 'rejected_text_column': None, 'push_to_hub': True, 'username': 'derek-thomas', 'token': '*****', 'unsloth': False, 'distributed_backend': None}\n",
198
+ "INFO | 2024-12-12 20:45:52 | autotrain.parser:run:239 - Job ID: derek-thomas/autotrain-mistral-v03-poe-RFA-gpt3-5\n",
199
+ "Running autotrain with config: ./autotrain_configs/conversation_RFA_mistral.yml\n",
200
+ "INFO | 2024-12-12 20:45:56 | autotrain.cli.autotrain:main:60 - Using AutoTrain configuration: ./autotrain_configs/conversation_RFA_mistral.yml\n",
201
+ "INFO | 2024-12-12 20:45:56 | autotrain.parser:__post_init__:170 - Running task: lm_training\n",
202
+ "INFO | 2024-12-12 20:45:56 | autotrain.parser:__post_init__:171 - Using backend: spaces-l4x1\n",
203
+ "INFO | 2024-12-12 20:45:56 | autotrain.parser:run:234 - {'model': 'mistralai/Mistral-7B-Instruct-v0.3', 'project_name': 'mistral-v03-poe-RFA-mistral', 'data_path': 'derek-thomas/labeled-multiple-choice-explained-mistral-tokenized', 'train_split': 'train', 'valid_split': None, 'add_eos_token': True, 'block_size': 512, 'model_max_length': 1500, 'padding': 'right', 'trainer': 'sft', 'use_flash_attention_2': False, 'log': 'tensorboard', 'disable_gradient_checkpointing': False, 'logging_steps': -1, 'eval_strategy': 'epoch', 'save_total_limit': 1, 'auto_find_batch_size': False, 'mixed_precision': 'bf16', 'lr': 3e-05, 'epochs': 2, 'batch_size': 1, 'warmup_ratio': 0.1, 'gradient_accumulation': 8, 'optimizer': 'adamw_torch', 'scheduler': 'linear', 'weight_decay': 0.0, 'max_grad_norm': 1.0, 'seed': 42, 'chat_template': 'none', 'quantization': 'int4', 'target_modules': 'all-linear', 'merge_adapter': False, 'peft': True, 'lora_r': 16, 'lora_alpha': 32, 'lora_dropout': 0.05, 'model_ref': None, 'dpo_beta': 0.1, 'max_prompt_length': 128, 'max_completion_length': None, 'prompt_text_column': None, 'text_column': 'conversation_RFA_mistral', 'rejected_text_column': None, 'push_to_hub': True, 'username': 'derek-thomas', 'token': '*****', 'unsloth': False, 'distributed_backend': None}\n",
204
+ "INFO | 2024-12-12 20:46:01 | autotrain.parser:run:239 - Job ID: derek-thomas/autotrain-mistral-v03-poe-RFA-mistral\n",
205
+ "Running autotrain with config: ./autotrain_configs/conversation_FAR_gpt3_5.yml\n",
206
+ "INFO | 2024-12-12 20:46:05 | autotrain.cli.autotrain:main:60 - Using AutoTrain configuration: ./autotrain_configs/conversation_FAR_gpt3_5.yml\n",
207
+ "INFO | 2024-12-12 20:46:05 | autotrain.parser:__post_init__:170 - Running task: lm_training\n",
208
+ "INFO | 2024-12-12 20:46:05 | autotrain.parser:__post_init__:171 - Using backend: spaces-l4x1\n",
209
+ "INFO | 2024-12-12 20:46:05 | autotrain.parser:run:234 - {'model': 'mistralai/Mistral-7B-Instruct-v0.3', 'project_name': 'mistral-v03-poe-FAR-gpt3-5', 'data_path': 'derek-thomas/labeled-multiple-choice-explained-mistral-tokenized', 'train_split': 'train', 'valid_split': None, 'add_eos_token': True, 'block_size': 512, 'model_max_length': 1500, 'padding': 'right', 'trainer': 'sft', 'use_flash_attention_2': False, 'log': 'tensorboard', 'disable_gradient_checkpointing': False, 'logging_steps': -1, 'eval_strategy': 'epoch', 'save_total_limit': 1, 'auto_find_batch_size': False, 'mixed_precision': 'bf16', 'lr': 3e-05, 'epochs': 2, 'batch_size': 1, 'warmup_ratio': 0.1, 'gradient_accumulation': 8, 'optimizer': 'adamw_torch', 'scheduler': 'linear', 'weight_decay': 0.0, 'max_grad_norm': 1.0, 'seed': 42, 'chat_template': 'none', 'quantization': 'int4', 'target_modules': 'all-linear', 'merge_adapter': False, 'peft': True, 'lora_r': 16, 'lora_alpha': 32, 'lora_dropout': 0.05, 'model_ref': None, 'dpo_beta': 0.1, 'max_prompt_length': 128, 'max_completion_length': None, 'prompt_text_column': None, 'text_column': 'conversation_FAR_gpt3_5', 'rejected_text_column': None, 'push_to_hub': True, 'username': 'derek-thomas', 'token': '*****', 'unsloth': False, 'distributed_backend': None}\n",
210
+ "INFO | 2024-12-12 20:46:12 | autotrain.parser:run:239 - Job ID: derek-thomas/autotrain-mistral-v03-poe-FAR-gpt3-5\n",
211
+ "Running autotrain with config: ./autotrain_configs/conversation_FAR_mistral.yml\n",
212
+ "INFO | 2024-12-12 20:46:16 | autotrain.cli.autotrain:main:60 - Using AutoTrain configuration: ./autotrain_configs/conversation_FAR_mistral.yml\n",
213
+ "INFO | 2024-12-12 20:46:16 | autotrain.parser:__post_init__:170 - Running task: lm_training\n",
214
+ "INFO | 2024-12-12 20:46:16 | autotrain.parser:__post_init__:171 - Using backend: spaces-l4x1\n",
215
+ "INFO | 2024-12-12 20:46:16 | autotrain.parser:run:234 - {'model': 'mistralai/Mistral-7B-Instruct-v0.3', 'project_name': 'mistral-v03-poe-FAR-mistral', 'data_path': 'derek-thomas/labeled-multiple-choice-explained-mistral-tokenized', 'train_split': 'train', 'valid_split': None, 'add_eos_token': True, 'block_size': 512, 'model_max_length': 1500, 'padding': 'right', 'trainer': 'sft', 'use_flash_attention_2': False, 'log': 'tensorboard', 'disable_gradient_checkpointing': False, 'logging_steps': -1, 'eval_strategy': 'epoch', 'save_total_limit': 1, 'auto_find_batch_size': False, 'mixed_precision': 'bf16', 'lr': 3e-05, 'epochs': 2, 'batch_size': 1, 'warmup_ratio': 0.1, 'gradient_accumulation': 8, 'optimizer': 'adamw_torch', 'scheduler': 'linear', 'weight_decay': 0.0, 'max_grad_norm': 1.0, 'seed': 42, 'chat_template': 'none', 'quantization': 'int4', 'target_modules': 'all-linear', 'merge_adapter': False, 'peft': True, 'lora_r': 16, 'lora_alpha': 32, 'lora_dropout': 0.05, 'model_ref': None, 'dpo_beta': 0.1, 'max_prompt_length': 128, 'max_completion_length': None, 'prompt_text_column': None, 'text_column': 'conversation_FAR_mistral', 'rejected_text_column': None, 'push_to_hub': True, 'username': 'derek-thomas', 'token': '*****', 'unsloth': False, 'distributed_backend': None}\n",
216
+ "INFO | 2024-12-12 20:46:22 | autotrain.parser:run:239 - Job ID: derek-thomas/autotrain-mistral-v03-poe-FAR-mistral\n",
217
+ "Running autotrain with config: ./autotrain_configs/conversation_FA.yml\n",
218
+ "INFO | 2024-12-12 20:46:25 | autotrain.cli.autotrain:main:60 - Using AutoTrain configuration: ./autotrain_configs/conversation_FA.yml\n",
219
+ "INFO | 2024-12-12 20:46:25 | autotrain.parser:__post_init__:170 - Running task: lm_training\n",
220
+ "INFO | 2024-12-12 20:46:25 | autotrain.parser:__post_init__:171 - Using backend: spaces-l4x1\n",
221
+ "INFO | 2024-12-12 20:46:25 | autotrain.parser:run:234 - {'model': 'mistralai/Mistral-7B-Instruct-v0.3', 'project_name': 'mistral-v03-poe-FA', 'data_path': 'derek-thomas/labeled-multiple-choice-explained-mistral-tokenized', 'train_split': 'train', 'valid_split': None, 'add_eos_token': True, 'block_size': 512, 'model_max_length': 1500, 'padding': 'right', 'trainer': 'sft', 'use_flash_attention_2': False, 'log': 'tensorboard', 'disable_gradient_checkpointing': False, 'logging_steps': -1, 'eval_strategy': 'epoch', 'save_total_limit': 1, 'auto_find_batch_size': False, 'mixed_precision': 'bf16', 'lr': 3e-05, 'epochs': 2, 'batch_size': 1, 'warmup_ratio': 0.1, 'gradient_accumulation': 8, 'optimizer': 'adamw_torch', 'scheduler': 'linear', 'weight_decay': 0.0, 'max_grad_norm': 1.0, 'seed': 42, 'chat_template': 'none', 'quantization': 'int4', 'target_modules': 'all-linear', 'merge_adapter': False, 'peft': True, 'lora_r': 16, 'lora_alpha': 32, 'lora_dropout': 0.05, 'model_ref': None, 'dpo_beta': 0.1, 'max_prompt_length': 128, 'max_completion_length': None, 'prompt_text_column': None, 'text_column': 'conversation_FA', 'rejected_text_column': None, 'push_to_hub': True, 'username': 'derek-thomas', 'token': '*****', 'unsloth': False, 'distributed_backend': None}\n",
222
+ "INFO | 2024-12-12 20:46:31 | autotrain.parser:run:239 - Job ID: derek-thomas/autotrain-mistral-v03-poe-FA\n"
223
+ ]
224
+ }
225
+ ],
226
  "source": [
227
  "# Generate configs and run commands\n",
228
  "for project_suffix, text_column in zip(project_suffixes, text_columns):\n",
 
240
  " print(f\"Running autotrain with config: {config_path}\")\n",
241
  " subprocess.run([\"autotrain\", \"--config\", config_path])"
242
  ]
243
+ },
244
+ {
245
+ "cell_type": "code",
246
+ "execution_count": null,
247
+ "id": "67675837-2a38-4427-9186-32a25a970ff3",
248
+ "metadata": {},
249
+ "outputs": [],
250
+ "source": []
251
  }
252
  ],
253
  "metadata": {
04-poe-eval.ipynb CHANGED
@@ -104,7 +104,7 @@
104
  {
105
  "data": {
106
  "text/plain": [
107
- "'derek-thomas/mistral-v03-poe-RFA-mistral,derek-thomas/mistral-v03-poe-FAR-mistral,derek-thomas/mistral-v03-poe-RFA-gpt3-5,derek-thomas/mistral-v03-poe-FAR-gpt3-5,derek-thomas/mistral-v03-poe-FAR'"
108
  ]
109
  },
110
  "execution_count": 3,
@@ -137,7 +137,7 @@
137
  " },\n",
138
  " 'FA': {\n",
139
  " 'pydantic': FAModel,\n",
140
- " \"lora\": \"derek-thomas/mistral-v03-poe-FAR\",\n",
141
  " \"column\": 'user_prompt_FA',\n",
142
  " },\n",
143
  " 'base': {\n",
@@ -162,7 +162,7 @@
162
  {
163
  "data": {
164
  "application/vnd.jupyter.widget-view+json": {
165
- "model_id": "f9a6617489af4d86be59e614e3c505c4",
166
  "version_major": 2,
167
  "version_minor": 0
168
  },
@@ -239,10 +239,9 @@
239
  "\n",
240
  "def get_my_endpoint():\n",
241
  " name = f\"prompt-order-experiment\"\n",
242
- " namespace='HF-test-lab'\n",
243
  " try:\n",
244
  " endpoint = get_inference_endpoint(name, namespace=namespace)\n",
245
- " endpoint.wait()\n",
246
  " except:\n",
247
  " # Custom Docker image details\n",
248
  " custom_image = {\n",
@@ -279,7 +278,6 @@
279
  " custom_image=custom_image,\n",
280
  " secrets=secrets\n",
281
  " )\n",
282
- " # endpoint.wait()\n",
283
  " \n",
284
  " print(\"Your model is ready to use!\")\n",
285
  " endpoint.wait()\n",
@@ -297,8 +295,8 @@
297
  "output_type": "stream",
298
  "text": [
299
  "Your model is ready to use!\n",
300
- "CPU times: user 22.3 ms, sys: 7.64 ms, total: 30 ms\n",
301
- "Wall time: 2.07 s\n"
302
  ]
303
  }
304
  ],
@@ -359,7 +357,7 @@
359
  {
360
  "data": {
361
  "text/plain": [
362
- "'{\"Reasoning\": \"Busses are primarily used for transporting humans, so the correct answer is (b) Transporting humans. The other options are either incorrect (a, c, d, e, f, g, h) or not specific enough to the function of a bus (a, c, e, f, g, h).\", \"Final Answer\": \"b\"}'"
363
  ]
364
  },
365
  "execution_count": 10,
@@ -388,7 +386,7 @@
388
  {
389
  "data": {
390
  "text/plain": [
391
- "'{\"Reasoning\": \"Busses are primarily used for transporting humans, especially in urban areas, schools, and tourist destinations. They provide a means of public transportation, making it easier for people to travel to various locations without the need for personal vehicles. Therefore, the correct answer is (b) transporting humans.\", \"Final Answer\": \"b\"}'"
392
  ]
393
  },
394
  "execution_count": 11,
@@ -400,7 +398,7 @@
400
  "key = 'RFA-gpt3-5'\n",
401
  "response = endpoint.client.text_generation(\n",
402
  " prompt=user_prompt_RFA,\n",
403
- " max_new_tokens=575,\n",
404
  " adapter_id=experiments[key]['lora'],\n",
405
  " grammar={\"type\": \"json\", \"value\": experiments[key]['pydantic'].schema()},\n",
406
  ")\n",
@@ -450,7 +448,7 @@
450
  {
451
  "data": {
452
  "text/plain": [
453
- "'{\"Final Answer\": \"b\", \"Reasoning\": \"Busses are primarily used for transporting humans, especially in urban areas, to facilitate public transportation. They provide a means of transportation for a large number of people at once, reducing the number of vehicles on the road and helping to alleviate traffic congestion. They also serve as a protective shelter for passengers, shielding them from the elements during travel.\"}'"
454
  ]
455
  },
456
  "execution_count": 13,
@@ -479,7 +477,7 @@
479
  {
480
  "data": {
481
  "text/plain": [
482
- "'{\"Final Answer\": \"b\", \"Reasoning\": \"Busses are primarily used for transporting humans. While they can provide a protective shelter and ensure safe operation, they are not used for transporting airplanes, helping other species benefit, serving as a backbone, or being used for communication.\"}'"
483
  ]
484
  },
485
  "execution_count": 14,
@@ -541,7 +539,7 @@
541
  {
542
  "data": {
543
  "text/plain": [
544
- "'{\"Final Answer\": \"b\"}'"
545
  ]
546
  },
547
  "execution_count": 16,
@@ -656,7 +654,7 @@
656
  {
657
  "data": {
658
  "application/vnd.jupyter.widget-view+json": {
659
- "model_id": "f29c84d7a4654e9aa6af47cfe1bbde39",
660
  "version_major": 2,
661
  "version_minor": 0
662
  },
@@ -765,7 +763,7 @@
765
  {
766
  "data": {
767
  "text/plain": [
768
- "InferenceEndpoint(name='prompt-order-experiment', namespace='HF-test-lab', repository='mistralai/Mistral-7B-Instruct-v0.3', status='paused', url=None)"
769
  ]
770
  },
771
  "execution_count": 19,
@@ -859,7 +857,7 @@
859
  " <td>{\"Final Answer\": \"b\", \"Reasoning\": \"Busses are...</td>\n",
860
  " <td>{\"Reasoning\": \"Busses are primarily used for t...</td>\n",
861
  " <td>{\"Final Answer\": \"b\", \"Reasoning\": \"Busses are...</td>\n",
862
- " <td>{\"Final Answer\": \"b\"}</td>\n",
863
  " <td>{\"Final Answer\": \"b\"}</td>\n",
864
  " </tr>\n",
865
  " <tr>\n",
@@ -880,9 +878,9 @@
880
  " <td>&lt;s&gt;[INST] Answer the Question and include your...</td>\n",
881
  " <td>&lt;s&gt;[INST] Answer the Question and include your...</td>\n",
882
  " <td>{\"Reasoning\": \"Global warming is primarily cau...</td>\n",
883
- " <td>{\"Final Answer\": \"a\", \"Reasoning\": \"The nucleu...</td>\n",
884
  " <td>{\"Reasoning\": \"The nucleus of a cell (option a...</td>\n",
885
- " <td>{\"Final Answer\": \"a\", \"Reasoning\": \"The nucleu...</td>\n",
886
  " <td>{\"Final Answer\": \"a\"}</td>\n",
887
  " <td>{\"Final Answer\": \"a\"}</td>\n",
888
  " </tr>\n",
@@ -904,10 +902,10 @@
904
  " <td>&lt;s&gt;[INST] Answer the Question and include your...</td>\n",
905
  " <td>&lt;s&gt;[INST] Answer the Question and include your...</td>\n",
906
  " <td>{\"Reasoning\": \"The question asks for an organi...</td>\n",
907
- " <td>{\"Final Answer\": \"c\", \"Reasoning\": \"The proces...</td>\n",
908
- " <td>{\"Reasoning\": \"The process of converting chemi...</td>\n",
909
- " <td>{\"Final Answer\": \"e\", \"Reasoning\": \"Bacteria u...</td>\n",
910
- " <td>{\"Final Answer\": \"c\"}</td>\n",
911
  " <td>{\"Final Answer\": \"c\"}</td>\n",
912
  " </tr>\n",
913
  " <tr>\n",
@@ -927,11 +925,11 @@
927
  " <td>&lt;s&gt;[INST] Answer the Question and include your...</td>\n",
928
  " <td>&lt;s&gt;[INST] Answer the Question and include your...</td>\n",
929
  " <td>&lt;s&gt;[INST] Answer the Question and include your...</td>\n",
930
- " <td>{\"Reasoning\": \"Bacteria are microorganisms tha...</td>\n",
931
  " <td>{\"Final Answer\": \"d\", \"Reasoning\": \"Bacteria c...</td>\n",
932
- " <td>{ \"Reasoning\": \"Bacteria can cause harm to var...</td>\n",
933
  " <td>{\"Final Answer\": \"d\", \"Reasoning\": \"Bacteria c...</td>\n",
934
- " <td>{\"Final Answer\": \"d\"}</td>\n",
935
  " <td>{\"Final Answer\": \"d\"}</td>\n",
936
  " </tr>\n",
937
  " <tr>\n",
@@ -951,11 +949,11 @@
951
  " <td>&lt;s&gt;[INST] Answer the Question and include your...</td>\n",
952
  " <td>&lt;s&gt;[INST] Answer the Question and include your...</td>\n",
953
  " <td>&lt;s&gt;[INST] Answer the Question and include your...</td>\n",
954
- " <td>{\"Reasoning\": \"The question asks about the liv...</td>\n",
955
  " <td>{\"Final Answer\": \"a\", \"Reasoning\": \"Plants and...</td>\n",
956
- " <td>{\"Reasoning\": \"The question asks about the rel...</td>\n",
957
- " <td>{\"Final Answer\": \"f\", \"Reasoning\": \"Plants and...</td>\n",
958
- " <td>{\"Final Answer\": \"g\"}</td>\n",
959
  " <td>{\"Final Answer\": \"g\"}</td>\n",
960
  " </tr>\n",
961
  " <tr>\n",
@@ -1002,7 +1000,7 @@
1002
  " <td>{\"Reasoning\": \"The question asks for a way to ...</td>\n",
1003
  " <td>{\"Final Answer\": \"g\", \"Reasoning\": \"Recycling ...</td>\n",
1004
  " <td>{\"Reasoning\": \"Mining, fossil fuels, deforesta...</td>\n",
1005
- " <td>{\"Final Answer\": \"g\", \"Reasoning\": \"Recycling ...</td>\n",
1006
  " <td>{\"Final Answer\": \"g\"}</td>\n",
1007
  " <td>{\"Final Answer\": \"g\"}</td>\n",
1008
  " </tr>\n",
@@ -1025,9 +1023,9 @@
1025
  " <td>&lt;s&gt;[INST] Answer the Question and include your...</td>\n",
1026
  " <td>{\"Reasoning\": \"The question asks for a term th...</td>\n",
1027
  " <td>{\"Final Answer\": \"d\", \"Reasoning\": \"A drought ...</td>\n",
1028
- " <td>{ \"Reasoning\": \"A drought is a prolonged perio...</td>\n",
1029
- " <td>{\"Final Answer\": \"d\", \"Reasoning\": \"A drought ...</td>\n",
1030
- " <td>{\"Final Answer\": \"d\"}</td>\n",
1031
  " <td>{\"Final Answer\": \"d\"}</td>\n",
1032
  " </tr>\n",
1033
  " <tr>\n",
@@ -1051,7 +1049,7 @@
1051
  " <td>{\"Final Answer\": \"e\", \"Reasoning\": \"Ingestion ...</td>\n",
1052
  " <td>{\"Reasoning\": \"Ingestion is the process of tak...</td>\n",
1053
  " <td>{\"Final Answer\": \"e\", \"Reasoning\": \"Ingestion ...</td>\n",
1054
- " <td>{\"Final Answer\": \"d\"}</td>\n",
1055
  " <td>{\"Final Answer\": \"d\"}</td>\n",
1056
  " </tr>\n",
1057
  " <tr>\n",
@@ -1074,7 +1072,7 @@
1074
  " <td>{\"Reasoning\": \"Ultraviolet (UV) light is a typ...</td>\n",
1075
  " <td>{\"Final Answer\": \"b\", \"Reasoning\": \"Ultraviole...</td>\n",
1076
  " <td>{\"Reasoning\": \"Ultraviolet (UV) light is a typ...</td>\n",
1077
- " <td>{\"Final Answer\": \"d\", \"Reasoning\": \"Ultraviole...</td>\n",
1078
  " <td>{\"Final Answer\": \"b\"}</td>\n",
1079
  " <td>{\"Final Answer\": \"b\"}</td>\n",
1080
  " </tr>\n",
@@ -1095,11 +1093,11 @@
1095
  " <td>&lt;s&gt;[INST] Answer the Question and include your...</td>\n",
1096
  " <td>&lt;s&gt;[INST] Answer the Question and include your...</td>\n",
1097
  " <td>&lt;s&gt;[INST] Answer the Question and include your...</td>\n",
1098
- " <td>{\"Reasoning\": \"The question asks for something...</td>\n",
1099
  " <td>{\"Final Answer\": \"c\", \"Reasoning\": \"Running is...</td>\n",
1100
  " <td>{\"Reasoning\": \"A body's strength is primarily ...</td>\n",
1101
  " <td>{\"Final Answer\": \"c\", \"Reasoning\": \"Running is...</td>\n",
1102
- " <td>{\"Final Answer\": \"c\"}</td>\n",
1103
  " <td>{\"Final Answer\": \"c\"}</td>\n",
1104
  " </tr>\n",
1105
  " </tbody>\n",
@@ -1268,19 +1266,19 @@
1268
  "0 {\"Reasoning\": \"Busses are primarily used for t... \n",
1269
  "1 {\"Reasoning\": \"Global warming is primarily cau... \n",
1270
  "2 {\"Reasoning\": \"The question asks for an organi... \n",
1271
- "3 {\"Reasoning\": \"Bacteria are microorganisms tha... \n",
1272
- "4 {\"Reasoning\": \"The question asks about the liv... \n",
1273
  "... ... \n",
1274
  "1678 {\"Reasoning\": \"The question asks for a way to ... \n",
1275
  "1679 {\"Reasoning\": \"The question asks for a term th... \n",
1276
  "1680 {\"Reasoning\": \"Ingestion is the process of tak... \n",
1277
  "1681 {\"Reasoning\": \"Ultraviolet (UV) light is a typ... \n",
1278
- "1682 {\"Reasoning\": \"The question asks for something... \n",
1279
  "\n",
1280
  " responses_FAR_mistral \\\n",
1281
  "0 {\"Final Answer\": \"b\", \"Reasoning\": \"Busses are... \n",
1282
- "1 {\"Final Answer\": \"a\", \"Reasoning\": \"The nucleu... \n",
1283
- "2 {\"Final Answer\": \"c\", \"Reasoning\": \"The proces... \n",
1284
  "3 {\"Final Answer\": \"d\", \"Reasoning\": \"Bacteria c... \n",
1285
  "4 {\"Final Answer\": \"a\", \"Reasoning\": \"Plants and... \n",
1286
  "... ... \n",
@@ -1293,41 +1291,41 @@
1293
  " responses_RFA_gpt3_5 \\\n",
1294
  "0 {\"Reasoning\": \"Busses are primarily used for t... \n",
1295
  "1 {\"Reasoning\": \"The nucleus of a cell (option a... \n",
1296
- "2 {\"Reasoning\": \"The process of converting chemi... \n",
1297
- "3 { \"Reasoning\": \"Bacteria can cause harm to var... \n",
1298
- "4 {\"Reasoning\": \"The question asks about the rel... \n",
1299
  "... ... \n",
1300
  "1678 {\"Reasoning\": \"Mining, fossil fuels, deforesta... \n",
1301
- "1679 { \"Reasoning\": \"A drought is a prolonged perio... \n",
1302
  "1680 {\"Reasoning\": \"Ingestion is the process of tak... \n",
1303
  "1681 {\"Reasoning\": \"Ultraviolet (UV) light is a typ... \n",
1304
  "1682 {\"Reasoning\": \"A body's strength is primarily ... \n",
1305
  "\n",
1306
  " responses_FAR_gpt3_5 \\\n",
1307
  "0 {\"Final Answer\": \"b\", \"Reasoning\": \"Busses are... \n",
1308
- "1 {\"Final Answer\": \"a\", \"Reasoning\": \"The nucleu... \n",
1309
- "2 {\"Final Answer\": \"e\", \"Reasoning\": \"Bacteria u... \n",
1310
  "3 {\"Final Answer\": \"d\", \"Reasoning\": \"Bacteria c... \n",
1311
- "4 {\"Final Answer\": \"f\", \"Reasoning\": \"Plants and... \n",
1312
  "... ... \n",
1313
- "1678 {\"Final Answer\": \"g\", \"Reasoning\": \"Recycling ... \n",
1314
- "1679 {\"Final Answer\": \"d\", \"Reasoning\": \"A drought ... \n",
1315
  "1680 {\"Final Answer\": \"e\", \"Reasoning\": \"Ingestion ... \n",
1316
- "1681 {\"Final Answer\": \"d\", \"Reasoning\": \"Ultraviole... \n",
1317
  "1682 {\"Final Answer\": \"c\", \"Reasoning\": \"Running is... \n",
1318
  "\n",
1319
- " responses_FA responses_base \n",
1320
- "0 {\"Final Answer\": \"b\"} {\"Final Answer\": \"b\"} \n",
1321
- "1 {\"Final Answer\": \"a\"} {\"Final Answer\": \"a\"} \n",
1322
- "2 {\"Final Answer\": \"c\"} {\"Final Answer\": \"c\"} \n",
1323
- "3 {\"Final Answer\": \"d\"} {\"Final Answer\": \"d\"} \n",
1324
- "4 {\"Final Answer\": \"g\"} {\"Final Answer\": \"g\"} \n",
1325
- "... ... ... \n",
1326
- "1678 {\"Final Answer\": \"g\"} {\"Final Answer\": \"g\"} \n",
1327
- "1679 {\"Final Answer\": \"d\"} {\"Final Answer\": \"d\"} \n",
1328
- "1680 {\"Final Answer\": \"d\"} {\"Final Answer\": \"d\"} \n",
1329
- "1681 {\"Final Answer\": \"b\"} {\"Final Answer\": \"b\"} \n",
1330
- "1682 {\"Final Answer\": \"c\"} {\"Final Answer\": \"c\"} \n",
1331
  "\n",
1332
  "[1683 rows x 21 columns]"
1333
  ]
@@ -1343,13 +1341,17 @@
1343
  },
1344
  {
1345
  "cell_type": "code",
1346
- "execution_count": 22,
1347
  "id": "8619f9f5-9fe4-433e-b524-51c2b12e8d12",
1348
  "metadata": {},
1349
  "outputs": [],
1350
  "source": [
1351
  "def extract_final_answer(response):\n",
1352
- " return json.loads(response).get(\"Final Answer\")\n",
 
 
 
 
1353
  "\n",
1354
  "# Create new columns for predictions\n",
1355
  "df['predictions_base'] = df['responses_base'].apply(extract_final_answer)\n",
@@ -1362,7 +1364,7 @@
1362
  },
1363
  {
1364
  "cell_type": "code",
1365
- "execution_count": 23,
1366
  "id": "938cf2a3-2fed-42a3-82ec-a56cb0ea9f37",
1367
  "metadata": {},
1368
  "outputs": [
@@ -1370,12 +1372,12 @@
1370
  "name": "stdout",
1371
  "output_type": "stream",
1372
  "text": [
1373
- "Base: \t\t\t\t\t\t45.28%\n",
1374
- "Final Answer: \t\t\t\t\t45.4%\n",
1375
- "Reasoning and then the Final Answer (Mistral): \t53.89%\n",
1376
- "Final Answer and then the Reasoning (Mistral): \t60.72%\n",
1377
- "Reasoning and then the Final Answer (GPT-3.5): \t59.06%\n",
1378
- "Final Answer and then the Reasoning (GPT-3.5): \t60.31%\n"
1379
  ]
1380
  }
1381
  ],
@@ -1392,7 +1394,7 @@
1392
  },
1393
  {
1394
  "cell_type": "code",
1395
- "execution_count": 24,
1396
  "id": "83aae472-513b-43c3-9ee8-64d4cda775e0",
1397
  "metadata": {},
1398
  "outputs": [
@@ -1434,10 +1436,10 @@
1434
  " <th>responses_base</th>\n",
1435
  " <th>predictions_base</th>\n",
1436
  " <th>predictions_FA</th>\n",
1437
- " <th>predictions_RFA_mistral</th>\n",
1438
  " <th>predictions_FAR_mistral</th>\n",
1439
  " <th>predictions_RFA_gpt3_5</th>\n",
1440
  " <th>predictions_FAR_gpt3_5</th>\n",
 
1441
  " </tr>\n",
1442
  " </thead>\n",
1443
  " <tbody>\n",
@@ -1456,7 +1458,7 @@
1456
  " <td>...</td>\n",
1457
  " <td>{\"Reasoning\": \"Busses are primarily used for t...</td>\n",
1458
  " <td>{\"Final Answer\": \"b\", \"Reasoning\": \"Busses are...</td>\n",
1459
- " <td>{\"Final Answer\": \"b\"}</td>\n",
1460
  " <td>{\"Final Answer\": \"b\"}</td>\n",
1461
  " <td>b</td>\n",
1462
  " <td>b</td>\n",
@@ -1479,7 +1481,7 @@
1479
  " <td>&lt;s&gt;[INST] Answer the Question and include your...</td>\n",
1480
  " <td>...</td>\n",
1481
  " <td>{\"Reasoning\": \"The nucleus of a cell (option a...</td>\n",
1482
- " <td>{\"Final Answer\": \"a\", \"Reasoning\": \"The nucleu...</td>\n",
1483
  " <td>{\"Final Answer\": \"a\"}</td>\n",
1484
  " <td>{\"Final Answer\": \"a\"}</td>\n",
1485
  " <td>a</td>\n",
@@ -1487,7 +1489,7 @@
1487
  " <td>g</td>\n",
1488
  " <td>a</td>\n",
1489
  " <td>a</td>\n",
1490
- " <td>a</td>\n",
1491
  " </tr>\n",
1492
  " <tr>\n",
1493
  " <th>2</th>\n",
@@ -1502,13 +1504,13 @@
1502
  " <td>&lt;s&gt;[INST] Answer the Question and include your...</td>\n",
1503
  " <td>&lt;s&gt;[INST] Answer the Question and include your...</td>\n",
1504
  " <td>...</td>\n",
1505
- " <td>{\"Reasoning\": \"The process of converting chemi...</td>\n",
1506
- " <td>{\"Final Answer\": \"e\", \"Reasoning\": \"Bacteria u...</td>\n",
1507
- " <td>{\"Final Answer\": \"c\"}</td>\n",
1508
  " <td>{\"Final Answer\": \"c\"}</td>\n",
1509
  " <td>c</td>\n",
1510
- " <td>c</td>\n",
1511
- " <td>e</td>\n",
1512
  " <td>c</td>\n",
1513
  " <td>c</td>\n",
1514
  " <td>e</td>\n",
@@ -1526,16 +1528,16 @@
1526
  " <td>&lt;s&gt;[INST] Answer the Question and include your...</td>\n",
1527
  " <td>&lt;s&gt;[INST] Answer the Question and include your...</td>\n",
1528
  " <td>...</td>\n",
1529
- " <td>{ \"Reasoning\": \"Bacteria can cause harm to var...</td>\n",
1530
  " <td>{\"Final Answer\": \"d\", \"Reasoning\": \"Bacteria c...</td>\n",
 
1531
  " <td>{\"Final Answer\": \"d\"}</td>\n",
1532
- " <td>{\"Final Answer\": \"d\"}</td>\n",
1533
- " <td>d</td>\n",
1534
- " <td>d</td>\n",
1535
  " <td>d</td>\n",
 
1536
  " <td>d</td>\n",
1537
  " <td>d</td>\n",
1538
  " <td>d</td>\n",
 
1539
  " </tr>\n",
1540
  " <tr>\n",
1541
  " <th>4</th>\n",
@@ -1550,16 +1552,16 @@
1550
  " <td>&lt;s&gt;[INST] Answer the Question and include your...</td>\n",
1551
  " <td>&lt;s&gt;[INST] Answer the Question and include your...</td>\n",
1552
  " <td>...</td>\n",
1553
- " <td>{\"Reasoning\": \"The question asks about the rel...</td>\n",
1554
- " <td>{\"Final Answer\": \"f\", \"Reasoning\": \"Plants and...</td>\n",
1555
- " <td>{\"Final Answer\": \"g\"}</td>\n",
1556
  " <td>{\"Final Answer\": \"g\"}</td>\n",
1557
  " <td>g</td>\n",
1558
- " <td>g</td>\n",
1559
- " <td>b</td>\n",
1560
  " <td>a</td>\n",
1561
  " <td>f</td>\n",
1562
- " <td>f</td>\n",
 
1563
  " </tr>\n",
1564
  " <tr>\n",
1565
  " <th>...</th>\n",
@@ -1599,7 +1601,7 @@
1599
  " <td>&lt;s&gt;[INST] Answer the Question and include your...</td>\n",
1600
  " <td>...</td>\n",
1601
  " <td>{\"Reasoning\": \"Mining, fossil fuels, deforesta...</td>\n",
1602
- " <td>{\"Final Answer\": \"g\", \"Reasoning\": \"Recycling ...</td>\n",
1603
  " <td>{\"Final Answer\": \"g\"}</td>\n",
1604
  " <td>{\"Final Answer\": \"g\"}</td>\n",
1605
  " <td>g</td>\n",
@@ -1622,15 +1624,15 @@
1622
  " <td>&lt;s&gt;[INST] Answer the Question and include your...</td>\n",
1623
  " <td>&lt;s&gt;[INST] Answer the Question and include your...</td>\n",
1624
  " <td>...</td>\n",
1625
- " <td>{ \"Reasoning\": \"A drought is a prolonged perio...</td>\n",
1626
- " <td>{\"Final Answer\": \"d\", \"Reasoning\": \"A drought ...</td>\n",
1627
- " <td>{\"Final Answer\": \"d\"}</td>\n",
1628
  " <td>{\"Final Answer\": \"d\"}</td>\n",
1629
  " <td>d</td>\n",
 
1630
  " <td>d</td>\n",
1631
  " <td>d</td>\n",
1632
  " <td>d</td>\n",
1633
- " <td>f</td>\n",
1634
  " <td>d</td>\n",
1635
  " </tr>\n",
1636
  " <tr>\n",
@@ -1648,14 +1650,14 @@
1648
  " <td>...</td>\n",
1649
  " <td>{\"Reasoning\": \"Ingestion is the process of tak...</td>\n",
1650
  " <td>{\"Final Answer\": \"e\", \"Reasoning\": \"Ingestion ...</td>\n",
 
1651
  " <td>{\"Final Answer\": \"d\"}</td>\n",
1652
- " <td>{\"Final Answer\": \"d\"}</td>\n",
1653
- " <td>d</td>\n",
1654
  " <td>d</td>\n",
1655
  " <td>e</td>\n",
1656
  " <td>e</td>\n",
1657
  " <td>e</td>\n",
1658
  " <td>e</td>\n",
 
1659
  " </tr>\n",
1660
  " <tr>\n",
1661
  " <th>1681</th>\n",
@@ -1671,7 +1673,7 @@
1671
  " <td>&lt;s&gt;[INST] Answer the Question and include your...</td>\n",
1672
  " <td>...</td>\n",
1673
  " <td>{\"Reasoning\": \"Ultraviolet (UV) light is a typ...</td>\n",
1674
- " <td>{\"Final Answer\": \"d\", \"Reasoning\": \"Ultraviole...</td>\n",
1675
  " <td>{\"Final Answer\": \"b\"}</td>\n",
1676
  " <td>{\"Final Answer\": \"b\"}</td>\n",
1677
  " <td>b</td>\n",
@@ -1679,7 +1681,7 @@
1679
  " <td>b</td>\n",
1680
  " <td>b</td>\n",
1681
  " <td>b</td>\n",
1682
- " <td>d</td>\n",
1683
  " </tr>\n",
1684
  " <tr>\n",
1685
  " <th>1682</th>\n",
@@ -1696,13 +1698,13 @@
1696
  " <td>...</td>\n",
1697
  " <td>{\"Reasoning\": \"A body's strength is primarily ...</td>\n",
1698
  " <td>{\"Final Answer\": \"c\", \"Reasoning\": \"Running is...</td>\n",
1699
- " <td>{\"Final Answer\": \"c\"}</td>\n",
1700
  " <td>{\"Final Answer\": \"c\"}</td>\n",
1701
  " <td>c</td>\n",
1702
- " <td>c</td>\n",
1703
  " <td>f</td>\n",
1704
  " <td>c</td>\n",
1705
- " <td>f</td>\n",
 
1706
  " <td>c</td>\n",
1707
  " </tr>\n",
1708
  " </tbody>\n",
@@ -1818,72 +1820,72 @@
1818
  " responses_RFA_gpt3_5 \\\n",
1819
  "0 {\"Reasoning\": \"Busses are primarily used for t... \n",
1820
  "1 {\"Reasoning\": \"The nucleus of a cell (option a... \n",
1821
- "2 {\"Reasoning\": \"The process of converting chemi... \n",
1822
- "3 { \"Reasoning\": \"Bacteria can cause harm to var... \n",
1823
- "4 {\"Reasoning\": \"The question asks about the rel... \n",
1824
  "... ... \n",
1825
  "1678 {\"Reasoning\": \"Mining, fossil fuels, deforesta... \n",
1826
- "1679 { \"Reasoning\": \"A drought is a prolonged perio... \n",
1827
  "1680 {\"Reasoning\": \"Ingestion is the process of tak... \n",
1828
  "1681 {\"Reasoning\": \"Ultraviolet (UV) light is a typ... \n",
1829
  "1682 {\"Reasoning\": \"A body's strength is primarily ... \n",
1830
  "\n",
1831
  " responses_FAR_gpt3_5 \\\n",
1832
  "0 {\"Final Answer\": \"b\", \"Reasoning\": \"Busses are... \n",
1833
- "1 {\"Final Answer\": \"a\", \"Reasoning\": \"The nucleu... \n",
1834
- "2 {\"Final Answer\": \"e\", \"Reasoning\": \"Bacteria u... \n",
1835
  "3 {\"Final Answer\": \"d\", \"Reasoning\": \"Bacteria c... \n",
1836
- "4 {\"Final Answer\": \"f\", \"Reasoning\": \"Plants and... \n",
1837
  "... ... \n",
1838
- "1678 {\"Final Answer\": \"g\", \"Reasoning\": \"Recycling ... \n",
1839
- "1679 {\"Final Answer\": \"d\", \"Reasoning\": \"A drought ... \n",
1840
  "1680 {\"Final Answer\": \"e\", \"Reasoning\": \"Ingestion ... \n",
1841
- "1681 {\"Final Answer\": \"d\", \"Reasoning\": \"Ultraviole... \n",
1842
  "1682 {\"Final Answer\": \"c\", \"Reasoning\": \"Running is... \n",
1843
  "\n",
1844
- " responses_FA responses_base predictions_base \\\n",
1845
- "0 {\"Final Answer\": \"b\"} {\"Final Answer\": \"b\"} b \n",
1846
- "1 {\"Final Answer\": \"a\"} {\"Final Answer\": \"a\"} a \n",
1847
- "2 {\"Final Answer\": \"c\"} {\"Final Answer\": \"c\"} c \n",
1848
- "3 {\"Final Answer\": \"d\"} {\"Final Answer\": \"d\"} d \n",
1849
- "4 {\"Final Answer\": \"g\"} {\"Final Answer\": \"g\"} g \n",
1850
- "... ... ... ... \n",
1851
- "1678 {\"Final Answer\": \"g\"} {\"Final Answer\": \"g\"} g \n",
1852
- "1679 {\"Final Answer\": \"d\"} {\"Final Answer\": \"d\"} d \n",
1853
- "1680 {\"Final Answer\": \"d\"} {\"Final Answer\": \"d\"} d \n",
1854
- "1681 {\"Final Answer\": \"b\"} {\"Final Answer\": \"b\"} b \n",
1855
- "1682 {\"Final Answer\": \"c\"} {\"Final Answer\": \"c\"} c \n",
1856
  "\n",
1857
- " predictions_FA predictions_RFA_mistral predictions_FAR_mistral \\\n",
1858
- "0 b b b \n",
1859
- "1 a g a \n",
1860
- "2 c e c \n",
1861
- "3 d d d \n",
1862
- "4 g b a \n",
1863
- "... ... ... ... \n",
1864
- "1678 g g g \n",
1865
- "1679 d d d \n",
1866
- "1680 d e e \n",
1867
- "1681 b b b \n",
1868
- "1682 c f c \n",
1869
  "\n",
1870
- " predictions_RFA_gpt3_5 predictions_FAR_gpt3_5 \n",
1871
- "0 b b \n",
1872
- "1 a a \n",
1873
- "2 c e \n",
1874
- "3 d d \n",
1875
- "4 f f \n",
1876
- "... ... ... \n",
1877
- "1678 g g \n",
1878
- "1679 f d \n",
1879
- "1680 e e \n",
1880
- "1681 b d \n",
1881
- "1682 f c \n",
1882
  "\n",
1883
  "[1683 rows x 27 columns]"
1884
  ]
1885
  },
1886
- "execution_count": 24,
1887
  "metadata": {},
1888
  "output_type": "execute_result"
1889
  }
@@ -1894,14 +1896,14 @@
1894
  },
1895
  {
1896
  "cell_type": "code",
1897
- "execution_count": 26,
1898
  "id": "45c08dd4-0b98-4e0f-b487-549f60518a4e",
1899
  "metadata": {},
1900
  "outputs": [
1901
  {
1902
  "data": {
1903
  "application/vnd.jupyter.widget-view+json": {
1904
- "model_id": "30c9a9f7656f4950a3fde9deaa6bf0ac",
1905
  "version_major": 2,
1906
  "version_minor": 0
1907
  },
@@ -1915,7 +1917,7 @@
1915
  {
1916
  "data": {
1917
  "application/vnd.jupyter.widget-view+json": {
1918
- "model_id": "bfc21892a7eb45caa097435b6026087d",
1919
  "version_major": 2,
1920
  "version_minor": 0
1921
  },
@@ -1929,10 +1931,10 @@
1929
  {
1930
  "data": {
1931
  "text/plain": [
1932
- "CommitInfo(commit_url='https://huggingface.co/datasets/derek-thomas/labeled-multiple-choice-explained-mistral-results/commit/b2c01c5867b23afe06c0806f2886152864574946', commit_message='Upload dataset', commit_description='', oid='b2c01c5867b23afe06c0806f2886152864574946', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/derek-thomas/labeled-multiple-choice-explained-mistral-results', endpoint='https://huggingface.co', repo_type='dataset', repo_id='derek-thomas/labeled-multiple-choice-explained-mistral-results'), pr_revision=None, pr_num=None)"
1933
  ]
1934
  },
1935
- "execution_count": 26,
1936
  "metadata": {},
1937
  "output_type": "execute_result"
1938
  }
 
104
  {
105
  "data": {
106
  "text/plain": [
107
+ "'derek-thomas/mistral-v03-poe-RFA-mistral,derek-thomas/mistral-v03-poe-FAR-mistral,derek-thomas/mistral-v03-poe-RFA-gpt3-5,derek-thomas/mistral-v03-poe-FAR-gpt3-5,derek-thomas/mistral-v03-poe-FA'"
108
  ]
109
  },
110
  "execution_count": 3,
 
137
  " },\n",
138
  " 'FA': {\n",
139
  " 'pydantic': FAModel,\n",
140
+ " \"lora\": \"derek-thomas/mistral-v03-poe-FA\",\n",
141
  " \"column\": 'user_prompt_FA',\n",
142
  " },\n",
143
  " 'base': {\n",
 
162
  {
163
  "data": {
164
  "application/vnd.jupyter.widget-view+json": {
165
+ "model_id": "50dbecc676db4dc78dd1974d2f1a87dc",
166
  "version_major": 2,
167
  "version_minor": 0
168
  },
 
239
  "\n",
240
  "def get_my_endpoint():\n",
241
  " name = f\"prompt-order-experiment\"\n",
242
+ " namespace='derek-thomas'\n",
243
  " try:\n",
244
  " endpoint = get_inference_endpoint(name, namespace=namespace)\n",
 
245
  " except:\n",
246
  " # Custom Docker image details\n",
247
  " custom_image = {\n",
 
278
  " custom_image=custom_image,\n",
279
  " secrets=secrets\n",
280
  " )\n",
 
281
  " \n",
282
  " print(\"Your model is ready to use!\")\n",
283
  " endpoint.wait()\n",
 
295
  "output_type": "stream",
296
  "text": [
297
  "Your model is ready to use!\n",
298
+ "CPU times: user 21.1 ms, sys: 10 ms, total: 31.1 ms\n",
299
+ "Wall time: 1.72 s\n"
300
  ]
301
  }
302
  ],
 
357
  {
358
  "data": {
359
  "text/plain": [
360
+ "'{\"Reasoning\": \"Busses are primarily used for transporting people, so the correct answer is (b) Transporting humans. The other options are not related to the function of a bus.\", \"Final Answer\": \"b\"}'"
361
  ]
362
  },
363
  "execution_count": 10,
 
386
  {
387
  "data": {
388
  "text/plain": [
389
+ "'{\"Reasoning\": \"Busses are primarily used for transporting humans, especially in urban areas where public transportation is necessary. They provide a means of transportation for a large number of people at once, reducing traffic congestion and carbon emissions. Therefore, the correct answer is (b) transporting humans.\", \"Final Answer\": \"b\"}'"
390
  ]
391
  },
392
  "execution_count": 11,
 
398
  "key = 'RFA-gpt3-5'\n",
399
  "response = endpoint.client.text_generation(\n",
400
  " prompt=user_prompt_RFA,\n",
401
+ " max_new_tokens=OUTPUT_TOKENS,\n",
402
  " adapter_id=experiments[key]['lora'],\n",
403
  " grammar={\"type\": \"json\", \"value\": experiments[key]['pydantic'].schema()},\n",
404
  ")\n",
 
448
  {
449
  "data": {
450
  "text/plain": [
451
+ "'{\"Final Answer\": \"b\", \"Reasoning\": \"Busses are primarily used for transporting humans, especially in urban areas, to facilitate commuting and travel. They are not used for protective shelter, helping other species benefit, transporting airplanes, serving as a backbone, communication, safe operation, or safe driving.\"}'"
452
  ]
453
  },
454
  "execution_count": 13,
 
477
  {
478
  "data": {
479
  "text/plain": [
480
+ "'{\"Final Answer\": \"b\", \"Reasoning\": \"Busses are primarily used for transporting humans, especially in urban areas, to facilitate their travel from one place to another. The other options are incorrect because busses do not provide protective shelter, do not help other species benefit, are not used to transport airplanes, do not serve as a backbone, are not used for communication, and are not related to safe operation or driving.\"}'"
481
  ]
482
  },
483
  "execution_count": 14,
 
539
  {
540
  "data": {
541
  "text/plain": [
542
+ "\"{'Final Answer': 'b'}\""
543
  ]
544
  },
545
  "execution_count": 16,
 
654
  {
655
  "data": {
656
  "application/vnd.jupyter.widget-view+json": {
657
+ "model_id": "be0bd3e278ae4d90a161918772ee71e8",
658
  "version_major": 2,
659
  "version_minor": 0
660
  },
 
763
  {
764
  "data": {
765
  "text/plain": [
766
+ "InferenceEndpoint(name='prompt-order-experiment', namespace='derek-thomas', repository='mistralai/Mistral-7B-Instruct-v0.3', status='paused', url=None)"
767
  ]
768
  },
769
  "execution_count": 19,
 
857
  " <td>{\"Final Answer\": \"b\", \"Reasoning\": \"Busses are...</td>\n",
858
  " <td>{\"Reasoning\": \"Busses are primarily used for t...</td>\n",
859
  " <td>{\"Final Answer\": \"b\", \"Reasoning\": \"Busses are...</td>\n",
860
+ " <td>{ \"Final Answer\": \"b\" }</td>\n",
861
  " <td>{\"Final Answer\": \"b\"}</td>\n",
862
  " </tr>\n",
863
  " <tr>\n",
 
878
  " <td>&lt;s&gt;[INST] Answer the Question and include your...</td>\n",
879
  " <td>&lt;s&gt;[INST] Answer the Question and include your...</td>\n",
880
  " <td>{\"Reasoning\": \"Global warming is primarily cau...</td>\n",
881
+ " <td>{\"Final Answer\": \"g\", \"Reasoning\": \"Riding a b...</td>\n",
882
  " <td>{\"Reasoning\": \"The nucleus of a cell (option a...</td>\n",
883
+ " <td>{ \"Final Answer\": \"a\" ,\"Reasoning\": \"The nucle...</td>\n",
884
  " <td>{\"Final Answer\": \"a\"}</td>\n",
885
  " <td>{\"Final Answer\": \"a\"}</td>\n",
886
  " </tr>\n",
 
902
  " <td>&lt;s&gt;[INST] Answer the Question and include your...</td>\n",
903
  " <td>&lt;s&gt;[INST] Answer the Question and include your...</td>\n",
904
  " <td>{\"Reasoning\": \"The question asks for an organi...</td>\n",
905
+ " <td>{\"Final Answer\": \"b\", \"Reasoning\": \"Cameras ar...</td>\n",
906
+ " <td>{\"Reasoning\": \"The correct answer is (c) Cells...</td>\n",
907
+ " <td>{\"Final Answer\": \"c\", \"Reasoning\": \"Cells are ...</td>\n",
908
+ " <td>{\"Final Answer\": \"b\"}</td>\n",
909
  " <td>{\"Final Answer\": \"c\"}</td>\n",
910
  " </tr>\n",
911
  " <tr>\n",
 
925
  " <td>&lt;s&gt;[INST] Answer the Question and include your...</td>\n",
926
  " <td>&lt;s&gt;[INST] Answer the Question and include your...</td>\n",
927
  " <td>&lt;s&gt;[INST] Answer the Question and include your...</td>\n",
928
+ " <td>{\"Reasoning\": \"The question asks about what is...</td>\n",
929
  " <td>{\"Final Answer\": \"d\", \"Reasoning\": \"Bacteria c...</td>\n",
930
+ " <td>{\"Reasoning\": \"Bacteria are microorganisms tha...</td>\n",
931
  " <td>{\"Final Answer\": \"d\", \"Reasoning\": \"Bacteria c...</td>\n",
932
+ " <td>{\"Final Answer\": \"e\"}</td>\n",
933
  " <td>{\"Final Answer\": \"d\"}</td>\n",
934
  " </tr>\n",
935
  " <tr>\n",
 
949
  " <td>&lt;s&gt;[INST] Answer the Question and include your...</td>\n",
950
  " <td>&lt;s&gt;[INST] Answer the Question and include your...</td>\n",
951
  " <td>&lt;s&gt;[INST] Answer the Question and include your...</td>\n",
952
+ " <td>{\"Reasoning\": \"The question asks for the livin...</td>\n",
953
  " <td>{\"Final Answer\": \"a\", \"Reasoning\": \"Plants and...</td>\n",
954
+ " <td>{\"Reasoning\": \"The question asks about the liv...</td>\n",
955
+ " <td>{ \"Final Answer\": \"a\" ,\"Reasoning\": \"Plants an...</td>\n",
956
+ " <td>{\"Final Answer\": \"a\"}</td>\n",
957
  " <td>{\"Final Answer\": \"g\"}</td>\n",
958
  " </tr>\n",
959
  " <tr>\n",
 
1000
  " <td>{\"Reasoning\": \"The question asks for a way to ...</td>\n",
1001
  " <td>{\"Final Answer\": \"g\", \"Reasoning\": \"Recycling ...</td>\n",
1002
  " <td>{\"Reasoning\": \"Mining, fossil fuels, deforesta...</td>\n",
1003
+ " <td>{ \"Final Answer\": \"g\" ,\"Reasoning\": \"Recycling...</td>\n",
1004
  " <td>{\"Final Answer\": \"g\"}</td>\n",
1005
  " <td>{\"Final Answer\": \"g\"}</td>\n",
1006
  " </tr>\n",
 
1023
  " <td>&lt;s&gt;[INST] Answer the Question and include your...</td>\n",
1024
  " <td>{\"Reasoning\": \"The question asks for a term th...</td>\n",
1025
  " <td>{\"Final Answer\": \"d\", \"Reasoning\": \"A drought ...</td>\n",
1026
+ " <td>{\"Reasoning\": \"A drought is a prolonged period...</td>\n",
1027
+ " <td>{ \"Final Answer\": \"d\" ,\"Reasoning\": \"A drought...</td>\n",
1028
+ " <td>{\"Final Answer\": \"a\"}</td>\n",
1029
  " <td>{\"Final Answer\": \"d\"}</td>\n",
1030
  " </tr>\n",
1031
  " <tr>\n",
 
1049
  " <td>{\"Final Answer\": \"e\", \"Reasoning\": \"Ingestion ...</td>\n",
1050
  " <td>{\"Reasoning\": \"Ingestion is the process of tak...</td>\n",
1051
  " <td>{\"Final Answer\": \"e\", \"Reasoning\": \"Ingestion ...</td>\n",
1052
+ " <td>{\"Final Answer\": \"e\"}</td>\n",
1053
  " <td>{\"Final Answer\": \"d\"}</td>\n",
1054
  " </tr>\n",
1055
  " <tr>\n",
 
1072
  " <td>{\"Reasoning\": \"Ultraviolet (UV) light is a typ...</td>\n",
1073
  " <td>{\"Final Answer\": \"b\", \"Reasoning\": \"Ultraviole...</td>\n",
1074
  " <td>{\"Reasoning\": \"Ultraviolet (UV) light is a typ...</td>\n",
1075
+ " <td>{ \"Final Answer\": \"b\" ,\"Reasoning\": \"Ultraviol...</td>\n",
1076
  " <td>{\"Final Answer\": \"b\"}</td>\n",
1077
  " <td>{\"Final Answer\": \"b\"}</td>\n",
1078
  " </tr>\n",
 
1093
  " <td>&lt;s&gt;[INST] Answer the Question and include your...</td>\n",
1094
  " <td>&lt;s&gt;[INST] Answer the Question and include your...</td>\n",
1095
  " <td>&lt;s&gt;[INST] Answer the Question and include your...</td>\n",
1096
+ " <td>{\"Reasoning\": \"The correct answer is 'Exercise...</td>\n",
1097
  " <td>{\"Final Answer\": \"c\", \"Reasoning\": \"Running is...</td>\n",
1098
  " <td>{\"Reasoning\": \"A body's strength is primarily ...</td>\n",
1099
  " <td>{\"Final Answer\": \"c\", \"Reasoning\": \"Running is...</td>\n",
1100
+ " <td>{\"Final Answer\": \"f\"}</td>\n",
1101
  " <td>{\"Final Answer\": \"c\"}</td>\n",
1102
  " </tr>\n",
1103
  " </tbody>\n",
 
1266
  "0 {\"Reasoning\": \"Busses are primarily used for t... \n",
1267
  "1 {\"Reasoning\": \"Global warming is primarily cau... \n",
1268
  "2 {\"Reasoning\": \"The question asks for an organi... \n",
1269
+ "3 {\"Reasoning\": \"The question asks about what is... \n",
1270
+ "4 {\"Reasoning\": \"The question asks for the livin... \n",
1271
  "... ... \n",
1272
  "1678 {\"Reasoning\": \"The question asks for a way to ... \n",
1273
  "1679 {\"Reasoning\": \"The question asks for a term th... \n",
1274
  "1680 {\"Reasoning\": \"Ingestion is the process of tak... \n",
1275
  "1681 {\"Reasoning\": \"Ultraviolet (UV) light is a typ... \n",
1276
+ "1682 {\"Reasoning\": \"The correct answer is 'Exercise... \n",
1277
  "\n",
1278
  " responses_FAR_mistral \\\n",
1279
  "0 {\"Final Answer\": \"b\", \"Reasoning\": \"Busses are... \n",
1280
+ "1 {\"Final Answer\": \"g\", \"Reasoning\": \"Riding a b... \n",
1281
+ "2 {\"Final Answer\": \"b\", \"Reasoning\": \"Cameras ar... \n",
1282
  "3 {\"Final Answer\": \"d\", \"Reasoning\": \"Bacteria c... \n",
1283
  "4 {\"Final Answer\": \"a\", \"Reasoning\": \"Plants and... \n",
1284
  "... ... \n",
 
1291
  " responses_RFA_gpt3_5 \\\n",
1292
  "0 {\"Reasoning\": \"Busses are primarily used for t... \n",
1293
  "1 {\"Reasoning\": \"The nucleus of a cell (option a... \n",
1294
+ "2 {\"Reasoning\": \"The correct answer is (c) Cells... \n",
1295
+ "3 {\"Reasoning\": \"Bacteria are microorganisms tha... \n",
1296
+ "4 {\"Reasoning\": \"The question asks about the liv... \n",
1297
  "... ... \n",
1298
  "1678 {\"Reasoning\": \"Mining, fossil fuels, deforesta... \n",
1299
+ "1679 {\"Reasoning\": \"A drought is a prolonged period... \n",
1300
  "1680 {\"Reasoning\": \"Ingestion is the process of tak... \n",
1301
  "1681 {\"Reasoning\": \"Ultraviolet (UV) light is a typ... \n",
1302
  "1682 {\"Reasoning\": \"A body's strength is primarily ... \n",
1303
  "\n",
1304
  " responses_FAR_gpt3_5 \\\n",
1305
  "0 {\"Final Answer\": \"b\", \"Reasoning\": \"Busses are... \n",
1306
+ "1 { \"Final Answer\": \"a\" ,\"Reasoning\": \"The nucle... \n",
1307
+ "2 {\"Final Answer\": \"c\", \"Reasoning\": \"Cells are ... \n",
1308
  "3 {\"Final Answer\": \"d\", \"Reasoning\": \"Bacteria c... \n",
1309
+ "4 { \"Final Answer\": \"a\" ,\"Reasoning\": \"Plants an... \n",
1310
  "... ... \n",
1311
+ "1678 { \"Final Answer\": \"g\" ,\"Reasoning\": \"Recycling... \n",
1312
+ "1679 { \"Final Answer\": \"d\" ,\"Reasoning\": \"A drought... \n",
1313
  "1680 {\"Final Answer\": \"e\", \"Reasoning\": \"Ingestion ... \n",
1314
+ "1681 { \"Final Answer\": \"b\" ,\"Reasoning\": \"Ultraviol... \n",
1315
  "1682 {\"Final Answer\": \"c\", \"Reasoning\": \"Running is... \n",
1316
  "\n",
1317
+ " responses_FA responses_base \n",
1318
+ "0 { \"Final Answer\": \"b\" } {\"Final Answer\": \"b\"} \n",
1319
+ "1 {\"Final Answer\": \"a\"} {\"Final Answer\": \"a\"} \n",
1320
+ "2 {\"Final Answer\": \"b\"} {\"Final Answer\": \"c\"} \n",
1321
+ "3 {\"Final Answer\": \"e\"} {\"Final Answer\": \"d\"} \n",
1322
+ "4 {\"Final Answer\": \"a\"} {\"Final Answer\": \"g\"} \n",
1323
+ "... ... ... \n",
1324
+ "1678 {\"Final Answer\": \"g\"} {\"Final Answer\": \"g\"} \n",
1325
+ "1679 {\"Final Answer\": \"a\"} {\"Final Answer\": \"d\"} \n",
1326
+ "1680 {\"Final Answer\": \"e\"} {\"Final Answer\": \"d\"} \n",
1327
+ "1681 {\"Final Answer\": \"b\"} {\"Final Answer\": \"b\"} \n",
1328
+ "1682 {\"Final Answer\": \"f\"} {\"Final Answer\": \"c\"} \n",
1329
  "\n",
1330
  "[1683 rows x 21 columns]"
1331
  ]
 
1341
  },
1342
  {
1343
  "cell_type": "code",
1344
+ "execution_count": 26,
1345
  "id": "8619f9f5-9fe4-433e-b524-51c2b12e8d12",
1346
  "metadata": {},
1347
  "outputs": [],
1348
  "source": [
1349
  "def extract_final_answer(response):\n",
1350
+ " try:\n",
1351
+ " answer = json.loads(response).get(\"Final Answer\")\n",
1352
+ " except:\n",
1353
+ " answer = 'x'\n",
1354
+ " return answer\n",
1355
  "\n",
1356
  "# Create new columns for predictions\n",
1357
  "df['predictions_base'] = df['responses_base'].apply(extract_final_answer)\n",
 
1364
  },
1365
  {
1366
  "cell_type": "code",
1367
+ "execution_count": 28,
1368
  "id": "938cf2a3-2fed-42a3-82ec-a56cb0ea9f37",
1369
  "metadata": {},
1370
  "outputs": [
 
1372
  "name": "stdout",
1373
  "output_type": "stream",
1374
  "text": [
1375
+ "Base: \t\t\t\t\t\t45.22%\n",
1376
+ "Final Answer: \t\t\t\t\t64.53%\n",
1377
+ "Reasoning and then the Final Answer (Mistral): \t55.02%\n",
1378
+ "Final Answer and then the Reasoning (Mistral): \t61.79%\n",
1379
+ "Reasoning and then the Final Answer (GPT-3.5): \t57.28%\n",
1380
+ "Final Answer and then the Reasoning (GPT-3.5): \t61.62%\n"
1381
  ]
1382
  }
1383
  ],
 
1394
  },
1395
  {
1396
  "cell_type": "code",
1397
+ "execution_count": 29,
1398
  "id": "83aae472-513b-43c3-9ee8-64d4cda775e0",
1399
  "metadata": {},
1400
  "outputs": [
 
1436
  " <th>responses_base</th>\n",
1437
  " <th>predictions_base</th>\n",
1438
  " <th>predictions_FA</th>\n",
 
1439
  " <th>predictions_FAR_mistral</th>\n",
1440
  " <th>predictions_RFA_gpt3_5</th>\n",
1441
  " <th>predictions_FAR_gpt3_5</th>\n",
1442
+ " <th>predictions_RFA_mistral</th>\n",
1443
  " </tr>\n",
1444
  " </thead>\n",
1445
  " <tbody>\n",
 
1458
  " <td>...</td>\n",
1459
  " <td>{\"Reasoning\": \"Busses are primarily used for t...</td>\n",
1460
  " <td>{\"Final Answer\": \"b\", \"Reasoning\": \"Busses are...</td>\n",
1461
+ " <td>{ \"Final Answer\": \"b\" }</td>\n",
1462
  " <td>{\"Final Answer\": \"b\"}</td>\n",
1463
  " <td>b</td>\n",
1464
  " <td>b</td>\n",
 
1481
  " <td>&lt;s&gt;[INST] Answer the Question and include your...</td>\n",
1482
  " <td>...</td>\n",
1483
  " <td>{\"Reasoning\": \"The nucleus of a cell (option a...</td>\n",
1484
+ " <td>{ \"Final Answer\": \"a\" ,\"Reasoning\": \"The nucle...</td>\n",
1485
  " <td>{\"Final Answer\": \"a\"}</td>\n",
1486
  " <td>{\"Final Answer\": \"a\"}</td>\n",
1487
  " <td>a</td>\n",
 
1489
  " <td>g</td>\n",
1490
  " <td>a</td>\n",
1491
  " <td>a</td>\n",
1492
+ " <td>g</td>\n",
1493
  " </tr>\n",
1494
  " <tr>\n",
1495
  " <th>2</th>\n",
 
1504
  " <td>&lt;s&gt;[INST] Answer the Question and include your...</td>\n",
1505
  " <td>&lt;s&gt;[INST] Answer the Question and include your...</td>\n",
1506
  " <td>...</td>\n",
1507
+ " <td>{\"Reasoning\": \"The correct answer is (c) Cells...</td>\n",
1508
+ " <td>{\"Final Answer\": \"c\", \"Reasoning\": \"Cells are ...</td>\n",
1509
+ " <td>{\"Final Answer\": \"b\"}</td>\n",
1510
  " <td>{\"Final Answer\": \"c\"}</td>\n",
1511
  " <td>c</td>\n",
1512
+ " <td>b</td>\n",
1513
+ " <td>b</td>\n",
1514
  " <td>c</td>\n",
1515
  " <td>c</td>\n",
1516
  " <td>e</td>\n",
 
1528
  " <td>&lt;s&gt;[INST] Answer the Question and include your...</td>\n",
1529
  " <td>&lt;s&gt;[INST] Answer the Question and include your...</td>\n",
1530
  " <td>...</td>\n",
1531
+ " <td>{\"Reasoning\": \"Bacteria are microorganisms tha...</td>\n",
1532
  " <td>{\"Final Answer\": \"d\", \"Reasoning\": \"Bacteria c...</td>\n",
1533
+ " <td>{\"Final Answer\": \"e\"}</td>\n",
1534
  " <td>{\"Final Answer\": \"d\"}</td>\n",
 
 
 
1535
  " <td>d</td>\n",
1536
+ " <td>e</td>\n",
1537
  " <td>d</td>\n",
1538
  " <td>d</td>\n",
1539
  " <td>d</td>\n",
1540
+ " <td>e</td>\n",
1541
  " </tr>\n",
1542
  " <tr>\n",
1543
  " <th>4</th>\n",
 
1552
  " <td>&lt;s&gt;[INST] Answer the Question and include your...</td>\n",
1553
  " <td>&lt;s&gt;[INST] Answer the Question and include your...</td>\n",
1554
  " <td>...</td>\n",
1555
+ " <td>{\"Reasoning\": \"The question asks about the liv...</td>\n",
1556
+ " <td>{ \"Final Answer\": \"a\" ,\"Reasoning\": \"Plants an...</td>\n",
1557
+ " <td>{\"Final Answer\": \"a\"}</td>\n",
1558
  " <td>{\"Final Answer\": \"g\"}</td>\n",
1559
  " <td>g</td>\n",
1560
+ " <td>a</td>\n",
 
1561
  " <td>a</td>\n",
1562
  " <td>f</td>\n",
1563
+ " <td>a</td>\n",
1564
+ " <td>b</td>\n",
1565
  " </tr>\n",
1566
  " <tr>\n",
1567
  " <th>...</th>\n",
 
1601
  " <td>&lt;s&gt;[INST] Answer the Question and include your...</td>\n",
1602
  " <td>...</td>\n",
1603
  " <td>{\"Reasoning\": \"Mining, fossil fuels, deforesta...</td>\n",
1604
+ " <td>{ \"Final Answer\": \"g\" ,\"Reasoning\": \"Recycling...</td>\n",
1605
  " <td>{\"Final Answer\": \"g\"}</td>\n",
1606
  " <td>{\"Final Answer\": \"g\"}</td>\n",
1607
  " <td>g</td>\n",
 
1624
  " <td>&lt;s&gt;[INST] Answer the Question and include your...</td>\n",
1625
  " <td>&lt;s&gt;[INST] Answer the Question and include your...</td>\n",
1626
  " <td>...</td>\n",
1627
+ " <td>{\"Reasoning\": \"A drought is a prolonged period...</td>\n",
1628
+ " <td>{ \"Final Answer\": \"d\" ,\"Reasoning\": \"A drought...</td>\n",
1629
+ " <td>{\"Final Answer\": \"a\"}</td>\n",
1630
  " <td>{\"Final Answer\": \"d\"}</td>\n",
1631
  " <td>d</td>\n",
1632
+ " <td>a</td>\n",
1633
  " <td>d</td>\n",
1634
  " <td>d</td>\n",
1635
  " <td>d</td>\n",
 
1636
  " <td>d</td>\n",
1637
  " </tr>\n",
1638
  " <tr>\n",
 
1650
  " <td>...</td>\n",
1651
  " <td>{\"Reasoning\": \"Ingestion is the process of tak...</td>\n",
1652
  " <td>{\"Final Answer\": \"e\", \"Reasoning\": \"Ingestion ...</td>\n",
1653
+ " <td>{\"Final Answer\": \"e\"}</td>\n",
1654
  " <td>{\"Final Answer\": \"d\"}</td>\n",
 
 
1655
  " <td>d</td>\n",
1656
  " <td>e</td>\n",
1657
  " <td>e</td>\n",
1658
  " <td>e</td>\n",
1659
  " <td>e</td>\n",
1660
+ " <td>c</td>\n",
1661
  " </tr>\n",
1662
  " <tr>\n",
1663
  " <th>1681</th>\n",
 
1673
  " <td>&lt;s&gt;[INST] Answer the Question and include your...</td>\n",
1674
  " <td>...</td>\n",
1675
  " <td>{\"Reasoning\": \"Ultraviolet (UV) light is a typ...</td>\n",
1676
+ " <td>{ \"Final Answer\": \"b\" ,\"Reasoning\": \"Ultraviol...</td>\n",
1677
  " <td>{\"Final Answer\": \"b\"}</td>\n",
1678
  " <td>{\"Final Answer\": \"b\"}</td>\n",
1679
  " <td>b</td>\n",
 
1681
  " <td>b</td>\n",
1682
  " <td>b</td>\n",
1683
  " <td>b</td>\n",
1684
+ " <td>b</td>\n",
1685
  " </tr>\n",
1686
  " <tr>\n",
1687
  " <th>1682</th>\n",
 
1698
  " <td>...</td>\n",
1699
  " <td>{\"Reasoning\": \"A body's strength is primarily ...</td>\n",
1700
  " <td>{\"Final Answer\": \"c\", \"Reasoning\": \"Running is...</td>\n",
1701
+ " <td>{\"Final Answer\": \"f\"}</td>\n",
1702
  " <td>{\"Final Answer\": \"c\"}</td>\n",
1703
  " <td>c</td>\n",
 
1704
  " <td>f</td>\n",
1705
  " <td>c</td>\n",
1706
+ " <td>d</td>\n",
1707
+ " <td>c</td>\n",
1708
  " <td>c</td>\n",
1709
  " </tr>\n",
1710
  " </tbody>\n",
 
1820
  " responses_RFA_gpt3_5 \\\n",
1821
  "0 {\"Reasoning\": \"Busses are primarily used for t... \n",
1822
  "1 {\"Reasoning\": \"The nucleus of a cell (option a... \n",
1823
+ "2 {\"Reasoning\": \"The correct answer is (c) Cells... \n",
1824
+ "3 {\"Reasoning\": \"Bacteria are microorganisms tha... \n",
1825
+ "4 {\"Reasoning\": \"The question asks about the liv... \n",
1826
  "... ... \n",
1827
  "1678 {\"Reasoning\": \"Mining, fossil fuels, deforesta... \n",
1828
+ "1679 {\"Reasoning\": \"A drought is a prolonged period... \n",
1829
  "1680 {\"Reasoning\": \"Ingestion is the process of tak... \n",
1830
  "1681 {\"Reasoning\": \"Ultraviolet (UV) light is a typ... \n",
1831
  "1682 {\"Reasoning\": \"A body's strength is primarily ... \n",
1832
  "\n",
1833
  " responses_FAR_gpt3_5 \\\n",
1834
  "0 {\"Final Answer\": \"b\", \"Reasoning\": \"Busses are... \n",
1835
+ "1 { \"Final Answer\": \"a\" ,\"Reasoning\": \"The nucle... \n",
1836
+ "2 {\"Final Answer\": \"c\", \"Reasoning\": \"Cells are ... \n",
1837
  "3 {\"Final Answer\": \"d\", \"Reasoning\": \"Bacteria c... \n",
1838
+ "4 { \"Final Answer\": \"a\" ,\"Reasoning\": \"Plants an... \n",
1839
  "... ... \n",
1840
+ "1678 { \"Final Answer\": \"g\" ,\"Reasoning\": \"Recycling... \n",
1841
+ "1679 { \"Final Answer\": \"d\" ,\"Reasoning\": \"A drought... \n",
1842
  "1680 {\"Final Answer\": \"e\", \"Reasoning\": \"Ingestion ... \n",
1843
+ "1681 { \"Final Answer\": \"b\" ,\"Reasoning\": \"Ultraviol... \n",
1844
  "1682 {\"Final Answer\": \"c\", \"Reasoning\": \"Running is... \n",
1845
  "\n",
1846
+ " responses_FA responses_base predictions_base \\\n",
1847
+ "0 { \"Final Answer\": \"b\" } {\"Final Answer\": \"b\"} b \n",
1848
+ "1 {\"Final Answer\": \"a\"} {\"Final Answer\": \"a\"} a \n",
1849
+ "2 {\"Final Answer\": \"b\"} {\"Final Answer\": \"c\"} c \n",
1850
+ "3 {\"Final Answer\": \"e\"} {\"Final Answer\": \"d\"} d \n",
1851
+ "4 {\"Final Answer\": \"a\"} {\"Final Answer\": \"g\"} g \n",
1852
+ "... ... ... ... \n",
1853
+ "1678 {\"Final Answer\": \"g\"} {\"Final Answer\": \"g\"} g \n",
1854
+ "1679 {\"Final Answer\": \"a\"} {\"Final Answer\": \"d\"} d \n",
1855
+ "1680 {\"Final Answer\": \"e\"} {\"Final Answer\": \"d\"} d \n",
1856
+ "1681 {\"Final Answer\": \"b\"} {\"Final Answer\": \"b\"} b \n",
1857
+ "1682 {\"Final Answer\": \"f\"} {\"Final Answer\": \"c\"} c \n",
1858
  "\n",
1859
+ " predictions_FA predictions_FAR_mistral predictions_RFA_gpt3_5 \\\n",
1860
+ "0 b b b \n",
1861
+ "1 a g a \n",
1862
+ "2 b b c \n",
1863
+ "3 e d d \n",
1864
+ "4 a a f \n",
1865
+ "... ... ... ... \n",
1866
+ "1678 g g g \n",
1867
+ "1679 a d d \n",
1868
+ "1680 e e e \n",
1869
+ "1681 b b b \n",
1870
+ "1682 f c d \n",
1871
  "\n",
1872
+ " predictions_FAR_gpt3_5 predictions_RFA_mistral \n",
1873
+ "0 b b \n",
1874
+ "1 a g \n",
1875
+ "2 c e \n",
1876
+ "3 d e \n",
1877
+ "4 a b \n",
1878
+ "... ... ... \n",
1879
+ "1678 g g \n",
1880
+ "1679 d d \n",
1881
+ "1680 e c \n",
1882
+ "1681 b b \n",
1883
+ "1682 c c \n",
1884
  "\n",
1885
  "[1683 rows x 27 columns]"
1886
  ]
1887
  },
1888
+ "execution_count": 29,
1889
  "metadata": {},
1890
  "output_type": "execute_result"
1891
  }
 
1896
  },
1897
  {
1898
  "cell_type": "code",
1899
+ "execution_count": 30,
1900
  "id": "45c08dd4-0b98-4e0f-b487-549f60518a4e",
1901
  "metadata": {},
1902
  "outputs": [
1903
  {
1904
  "data": {
1905
  "application/vnd.jupyter.widget-view+json": {
1906
+ "model_id": "23d5dbd0a91d436fb9920dfe81e4803a",
1907
  "version_major": 2,
1908
  "version_minor": 0
1909
  },
 
1917
  {
1918
  "data": {
1919
  "application/vnd.jupyter.widget-view+json": {
1920
+ "model_id": "0b25bcb277574e8792b14e838a32fe25",
1921
  "version_major": 2,
1922
  "version_minor": 0
1923
  },
 
1931
  {
1932
  "data": {
1933
  "text/plain": [
1934
+ "CommitInfo(commit_url='https://huggingface.co/datasets/derek-thomas/labeled-multiple-choice-explained-mistral-results/commit/796d0867b715f2fad05d6e54ad1e0e0504ca670c', commit_message='Upload dataset', commit_description='', oid='796d0867b715f2fad05d6e54ad1e0e0504ca670c', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/derek-thomas/labeled-multiple-choice-explained-mistral-results', endpoint='https://huggingface.co', repo_type='dataset', repo_id='derek-thomas/labeled-multiple-choice-explained-mistral-results'), pr_revision=None, pr_num=None)"
1935
  ]
1936
  },
1937
+ "execution_count": 30,
1938
  "metadata": {},
1939
  "output_type": "execute_result"
1940
  }
prompt-order-experiment.cfg ADDED
File without changes
requirements.txt CHANGED
@@ -8,4 +8,9 @@ scikit-learn
8
  lighteval[tensorboardX,adapters]
9
  nest_asyncio
10
  plotly
11
- ipywidgets
 
 
 
 
 
 
8
  lighteval[tensorboardX,adapters]
9
  nest_asyncio
10
  plotly
11
+ ipywidgets
12
+
13
+ # Reflex
14
+ reflex
15
+ reflex-ag-grid
16
+