Ramon Meffert commited on
Commit
5a6e5dd
1 Parent(s): e7b8106

Add EM analysis

Browse files
Files changed (1) hide show
  1. results/em_analysis.ipynb +457 -0
results/em_analysis.ipynb ADDED
@@ -0,0 +1,457 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "metadata": {},
6
+ "source": [
7
+ "# Exact Match (EM) Scores"
8
+ ]
9
+ },
10
+ {
11
+ "cell_type": "code",
12
+ "execution_count": 2,
13
+ "metadata": {
14
+ "vscode": {
15
+ "languageId": "r"
16
+ }
17
+ },
18
+ "outputs": [
19
+ {
20
+ "name": "stderr",
21
+ "output_type": "stream",
22
+ "text": [
23
+ "Loading required package: ggplot2\n",
24
+ "\n",
25
+ "-- \u001b[1mAttaching packages\u001b[22m --------------------------------------- tidyverse 1.3.1 --\n",
26
+ "\n",
27
+ "\u001b[32mv\u001b[39m \u001b[34mtibble \u001b[39m 3.1.5 \u001b[32mv\u001b[39m \u001b[34mdplyr \u001b[39m 1.0.7\n",
28
+ "\u001b[32mv\u001b[39m \u001b[34mtidyr \u001b[39m 1.1.4 \u001b[32mv\u001b[39m \u001b[34mstringr\u001b[39m 1.4.0\n",
29
+ "\u001b[32mv\u001b[39m \u001b[34mpurrr \u001b[39m 0.3.4 \u001b[32mv\u001b[39m \u001b[34mforcats\u001b[39m 0.5.1\n",
30
+ "\n",
31
+ "-- \u001b[1mConflicts\u001b[22m ------------------------------------------ tidyverse_conflicts() --\n",
32
+ "\u001b[31mx\u001b[39m \u001b[34mdplyr\u001b[39m::\u001b[32mfilter()\u001b[39m masks \u001b[34mstats\u001b[39m::filter()\n",
33
+ "\u001b[31mx\u001b[39m \u001b[34mdplyr\u001b[39m::\u001b[32mlag()\u001b[39m masks \u001b[34mstats\u001b[39m::lag()\n",
34
+ "\n",
35
+ "Loading required package: mvtnorm\n",
36
+ "\n",
37
+ "Loading required package: survival\n",
38
+ "\n",
39
+ "Loading required package: TH.data\n",
40
+ "\n",
41
+ "Loading required package: MASS\n",
42
+ "\n",
43
+ "\n",
44
+ "Attaching package: 'MASS'\n",
45
+ "\n",
46
+ "\n",
47
+ "The following object is masked from 'package:dplyr':\n",
48
+ "\n",
49
+ " select\n",
50
+ "\n",
51
+ "\n",
52
+ "\n",
53
+ "Attaching package: 'TH.data'\n",
54
+ "\n",
55
+ "\n",
56
+ "The following object is masked from 'package:MASS':\n",
57
+ "\n",
58
+ " geyser\n",
59
+ "\n",
60
+ "\n",
61
+ "Loading required package: carData\n",
62
+ "\n",
63
+ "\n",
64
+ "Attaching package: 'car'\n",
65
+ "\n",
66
+ "\n",
67
+ "The following object is masked from 'package:dplyr':\n",
68
+ "\n",
69
+ " recode\n",
70
+ "\n",
71
+ "\n",
72
+ "The following object is masked from 'package:purrr':\n",
73
+ "\n",
74
+ " some\n",
75
+ "\n",
76
+ "\n",
77
+ "\n",
78
+ "Attaching package: 'rstatix'\n",
79
+ "\n",
80
+ "\n",
81
+ "The following object is masked from 'package:MASS':\n",
82
+ "\n",
83
+ " select\n",
84
+ "\n",
85
+ "\n",
86
+ "The following object is masked from 'package:stats':\n",
87
+ "\n",
88
+ " filter\n",
89
+ "\n",
90
+ "\n"
91
+ ]
92
+ }
93
+ ],
94
+ "source": [
95
+ "library(\"ggpubr\")\n",
96
+ "library(readr)\n",
97
+ "library(ggplot2)\n",
98
+ "library(tidyverse)\n",
99
+ "library(ARTool)\n",
100
+ "library(emmeans)\n",
101
+ "library(multcomp)\n",
102
+ "library(car)\n",
103
+ "library(rstatix)"
104
+ ]
105
+ },
106
+ {
107
+ "cell_type": "code",
108
+ "execution_count": 3,
109
+ "metadata": {
110
+ "vscode": {
111
+ "languageId": "r"
112
+ }
113
+ },
114
+ "outputs": [
115
+ {
116
+ "name": "stderr",
117
+ "output_type": "stream",
118
+ "text": [
119
+ "New names:\n",
120
+ "* `` -> ...1\n",
121
+ "\n",
122
+ "\u001b[1mRows: \u001b[22m\u001b[34m59\u001b[39m \u001b[1mColumns: \u001b[22m\u001b[34m5\u001b[39m\n",
123
+ "\u001b[36m--\u001b[39m \u001b[1mColumn specification\u001b[22m \u001b[36m--------------------------------------------------------\u001b[39m\n",
124
+ "\u001b[1mDelimiter:\u001b[22m \",\"\n",
125
+ "\u001b[32mdbl\u001b[39m (5): ...1, faiss_dpr, faiss_longformer, es_dpr, es_longformer\n",
126
+ "\n",
127
+ "\u001b[36mi\u001b[39m Use `spec()` to retrieve the full column specification for this data.\n",
128
+ "\u001b[36mi\u001b[39m Specify the column types or set `show_col_types = FALSE` to quiet this message.\n"
129
+ ]
130
+ },
131
+ {
132
+ "data": {
133
+ "text/html": [
134
+ "<table class=\"dataframe\">\n",
135
+ "<caption>A tibble: 6 × 4</caption>\n",
136
+ "<thead>\n",
137
+ "\t<tr><th scope=col>question</th><th scope=col>retriever</th><th scope=col>reader</th><th scope=col>em</th></tr>\n",
138
+ "\t<tr><th scope=col>&lt;dbl&gt;</th><th scope=col>&lt;fct&gt;</th><th scope=col>&lt;fct&gt;</th><th scope=col>&lt;dbl&gt;</th></tr>\n",
139
+ "</thead>\n",
140
+ "<tbody>\n",
141
+ "\t<tr><td>0</td><td>faiss</td><td>dpr </td><td>0</td></tr>\n",
142
+ "\t<tr><td>0</td><td>faiss</td><td>longformer</td><td>0</td></tr>\n",
143
+ "\t<tr><td>0</td><td>es </td><td>dpr </td><td>0</td></tr>\n",
144
+ "\t<tr><td>0</td><td>es </td><td>longformer</td><td>0</td></tr>\n",
145
+ "\t<tr><td>1</td><td>faiss</td><td>dpr </td><td>0</td></tr>\n",
146
+ "\t<tr><td>1</td><td>faiss</td><td>longformer</td><td>0</td></tr>\n",
147
+ "</tbody>\n",
148
+ "</table>\n"
149
+ ],
150
+ "text/latex": [
151
+ "A tibble: 6 × 4\n",
152
+ "\\begin{tabular}{llll}\n",
153
+ " question & retriever & reader & em\\\\\n",
154
+ " <dbl> & <fct> & <fct> & <dbl>\\\\\n",
155
+ "\\hline\n",
156
+ "\t 0 & faiss & dpr & 0\\\\\n",
157
+ "\t 0 & faiss & longformer & 0\\\\\n",
158
+ "\t 0 & es & dpr & 0\\\\\n",
159
+ "\t 0 & es & longformer & 0\\\\\n",
160
+ "\t 1 & faiss & dpr & 0\\\\\n",
161
+ "\t 1 & faiss & longformer & 0\\\\\n",
162
+ "\\end{tabular}\n"
163
+ ],
164
+ "text/markdown": [
165
+ "\n",
166
+ "A tibble: 6 × 4\n",
167
+ "\n",
168
+ "| question &lt;dbl&gt; | retriever &lt;fct&gt; | reader &lt;fct&gt; | em &lt;dbl&gt; |\n",
169
+ "|---|---|---|---|\n",
170
+ "| 0 | faiss | dpr | 0 |\n",
171
+ "| 0 | faiss | longformer | 0 |\n",
172
+ "| 0 | es | dpr | 0 |\n",
173
+ "| 0 | es | longformer | 0 |\n",
174
+ "| 1 | faiss | dpr | 0 |\n",
175
+ "| 1 | faiss | longformer | 0 |\n",
176
+ "\n"
177
+ ],
178
+ "text/plain": [
179
+ " question retriever reader em\n",
180
+ "1 0 faiss dpr 0 \n",
181
+ "2 0 faiss longformer 0 \n",
182
+ "3 0 es dpr 0 \n",
183
+ "4 0 es longformer 0 \n",
184
+ "5 1 faiss dpr 0 \n",
185
+ "6 1 faiss longformer 0 "
186
+ ]
187
+ },
188
+ "metadata": {},
189
+ "output_type": "display_data"
190
+ }
191
+ ],
192
+ "source": [
193
+ "em_scores <- read_csv(\"em_scores.csv\") %>%\n",
194
+ " rename(question = `...1`) %>%\n",
195
+ " pivot_longer(!question, names_to=c(\"retriever\", \"reader\"), names_sep=\"_\", values_to=\"em\")\n",
196
+ "\n",
197
+ "em_scores$retriever <- as.factor(em_scores$retriever)\n",
198
+ "em_scores$reader <- as.factor(em_scores$reader)\n",
199
+ "\n",
200
+ "head(em_scores)"
201
+ ]
202
+ },
203
+ {
204
+ "cell_type": "markdown",
205
+ "metadata": {},
206
+ "source": [
207
+ "To determine which statistical tests we can use, we need to check for normality. For this, we use a Shapiro-Wilk test of normality. In this case, results with FAISS as retriever or DPR as reader had zero exact matches, making it impossible to compute the Shapiro-Wilk test for those groups. Nonetheless, we know that a distribution with all-identical values is not normally distributed. As you can see in the results below, all other $p$-values are lower than 0.001, so we reject the null hypothesis of normality and conclude that none of the EM scores are normally distributed."
208
+ ]
209
+ },
210
+ {
211
+ "cell_type": "code",
212
+ "execution_count": 14,
213
+ "metadata": {
214
+ "vscode": {
215
+ "languageId": "r"
216
+ }
217
+ },
218
+ "outputs": [
219
+ {
220
+ "data": {
221
+ "text/html": [
222
+ "<table class=\"dataframe\">\n",
223
+ "<caption>A tibble: 1 × 3</caption>\n",
224
+ "<thead>\n",
225
+ "\t<tr><th scope=col>retriever</th><th scope=col>sw.stat</th><th scope=col>sw.p</th></tr>\n",
226
+ "\t<tr><th scope=col>&lt;fct&gt;</th><th scope=col>&lt;dbl&gt;</th><th scope=col>&lt;dbl&gt;</th></tr>\n",
227
+ "</thead>\n",
228
+ "<tbody>\n",
229
+ "\t<tr><td>es</td><td>0.2503666</td><td>6.788451e-22</td></tr>\n",
230
+ "</tbody>\n",
231
+ "</table>\n"
232
+ ],
233
+ "text/latex": [
234
+ "A tibble: 1 × 3\n",
235
+ "\\begin{tabular}{lll}\n",
236
+ " retriever & sw.stat & sw.p\\\\\n",
237
+ " <fct> & <dbl> & <dbl>\\\\\n",
238
+ "\\hline\n",
239
+ "\t es & 0.2503666 & 6.788451e-22\\\\\n",
240
+ "\\end{tabular}\n"
241
+ ],
242
+ "text/markdown": [
243
+ "\n",
244
+ "A tibble: 1 × 3\n",
245
+ "\n",
246
+ "| retriever &lt;fct&gt; | sw.stat &lt;dbl&gt; | sw.p &lt;dbl&gt; |\n",
247
+ "|---|---|---|\n",
248
+ "| es | 0.2503666 | 6.788451e-22 |\n",
249
+ "\n"
250
+ ],
251
+ "text/plain": [
252
+ " retriever sw.stat sw.p \n",
253
+ "1 es 0.2503666 6.788451e-22"
254
+ ]
255
+ },
256
+ "metadata": {},
257
+ "output_type": "display_data"
258
+ },
259
+ {
260
+ "data": {
261
+ "text/html": [
262
+ "<table class=\"dataframe\">\n",
263
+ "<caption>A tibble: 1 × 3</caption>\n",
264
+ "<thead>\n",
265
+ "\t<tr><th scope=col>reader</th><th scope=col>sw.stat</th><th scope=col>sw.p</th></tr>\n",
266
+ "\t<tr><th scope=col>&lt;fct&gt;</th><th scope=col>&lt;dbl&gt;</th><th scope=col>&lt;dbl&gt;</th></tr>\n",
267
+ "</thead>\n",
268
+ "<tbody>\n",
269
+ "\t<tr><td>longformer</td><td>0.2503666</td><td>6.788451e-22</td></tr>\n",
270
+ "</tbody>\n",
271
+ "</table>\n"
272
+ ],
273
+ "text/latex": [
274
+ "A tibble: 1 × 3\n",
275
+ "\\begin{tabular}{lll}\n",
276
+ " reader & sw.stat & sw.p\\\\\n",
277
+ " <fct> & <dbl> & <dbl>\\\\\n",
278
+ "\\hline\n",
279
+ "\t longformer & 0.2503666 & 6.788451e-22\\\\\n",
280
+ "\\end{tabular}\n"
281
+ ],
282
+ "text/markdown": [
283
+ "\n",
284
+ "A tibble: 1 × 3\n",
285
+ "\n",
286
+ "| reader &lt;fct&gt; | sw.stat &lt;dbl&gt; | sw.p &lt;dbl&gt; |\n",
287
+ "|---|---|---|\n",
288
+ "| longformer | 0.2503666 | 6.788451e-22 |\n",
289
+ "\n"
290
+ ],
291
+ "text/plain": [
292
+ " reader sw.stat sw.p \n",
293
+ "1 longformer 0.2503666 6.788451e-22"
294
+ ]
295
+ },
296
+ "metadata": {},
297
+ "output_type": "display_data"
298
+ }
299
+ ],
300
+ "source": [
301
+ "em_scores %>%\n",
302
+ " select(!question) %>%\n",
303
+ " group_by(retriever) %>%\n",
304
+ " filter(sum(em) > 0) %>%\n",
305
+ " summarise(sw.stat = shapiro.test(em)$statistic,\n",
306
+ " sw.p = shapiro.test(em)$p)\n",
307
+ "em_scores %>%\n",
308
+ " select(!question) %>%\n",
309
+ " group_by(reader) %>%\n",
310
+ " filter(sum(em) > 0) %>%\n",
311
+ " summarise(sw.stat = shapiro.test(em)$statistic,\n",
312
+ " sw.p = shapiro.test(em)$p)"
313
+ ]
314
+ },
315
+ {
316
+ "cell_type": "markdown",
317
+ "metadata": {},
318
+ "source": [
319
+ "Since our data is not normally distributed, we cannot use an ANOVA to compare our results. Therefore, we use an aligned-rank test, which is a non-parametric version of a factorial repeated measures ANOVA."
320
+ ]
321
+ },
322
+ {
323
+ "cell_type": "code",
324
+ "execution_count": 4,
325
+ "metadata": {
326
+ "vscode": {
327
+ "languageId": "r"
328
+ }
329
+ },
330
+ "outputs": [
331
+ {
332
+ "data": {
333
+ "text/html": [
334
+ "<table class=\"dataframe\">\n",
335
+ "<caption>A anova.art: 3 × 7</caption>\n",
336
+ "<thead>\n",
337
+ "\t<tr><th></th><th scope=col>Term</th><th scope=col>Df</th><th scope=col>Df.res</th><th scope=col>Sum Sq</th><th scope=col>Sum Sq.res</th><th scope=col>F value</th><th scope=col>Pr(&gt;F)</th></tr>\n",
338
+ "\t<tr><th></th><th scope=col>&lt;chr&gt;</th><th scope=col>&lt;dbl&gt;</th><th scope=col>&lt;dbl&gt;</th><th scope=col>&lt;dbl&gt;</th><th scope=col>&lt;dbl&gt;</th><th scope=col>&lt;dbl&gt;</th><th scope=col>&lt;dbl&gt;</th></tr>\n",
339
+ "</thead>\n",
340
+ "<tbody>\n",
341
+ "\t<tr><th scope=row>retriever</th><td>retriever </td><td>1</td><td>232</td><td>11564</td><td>263081</td><td>10.1978</td><td>0.001600976</td></tr>\n",
342
+ "\t<tr><th scope=row>reader</th><td>reader </td><td>1</td><td>232</td><td>11564</td><td>263081</td><td>10.1978</td><td>0.001600976</td></tr>\n",
343
+ "\t<tr><th scope=row>retriever:reader</th><td>retriever:reader</td><td>1</td><td>232</td><td>11564</td><td>263081</td><td>10.1978</td><td>0.001600976</td></tr>\n",
344
+ "</tbody>\n",
345
+ "</table>\n"
346
+ ],
347
+ "text/latex": [
348
+ "A anova.art: 3 × 7\n",
349
+ "\\begin{tabular}{r|lllllll}\n",
350
+ " & Term & Df & Df.res & Sum Sq & Sum Sq.res & F value & Pr(>F)\\\\\n",
351
+ " & <chr> & <dbl> & <dbl> & <dbl> & <dbl> & <dbl> & <dbl>\\\\\n",
352
+ "\\hline\n",
353
+ "\tretriever & retriever & 1 & 232 & 11564 & 263081 & 10.1978 & 0.001600976\\\\\n",
354
+ "\treader & reader & 1 & 232 & 11564 & 263081 & 10.1978 & 0.001600976\\\\\n",
355
+ "\tretriever:reader & retriever:reader & 1 & 232 & 11564 & 263081 & 10.1978 & 0.001600976\\\\\n",
356
+ "\\end{tabular}\n"
357
+ ],
358
+ "text/markdown": [
359
+ "\n",
360
+ "A anova.art: 3 × 7\n",
361
+ "\n",
362
+ "| <!--/--> | Term &lt;chr&gt; | Df &lt;dbl&gt; | Df.res &lt;dbl&gt; | Sum Sq &lt;dbl&gt; | Sum Sq.res &lt;dbl&gt; | F value &lt;dbl&gt; | Pr(&gt;F) &lt;dbl&gt; |\n",
363
+ "|---|---|---|---|---|---|---|---|\n",
364
+ "| retriever | retriever | 1 | 232 | 11564 | 263081 | 10.1978 | 0.001600976 |\n",
365
+ "| reader | reader | 1 | 232 | 11564 | 263081 | 10.1978 | 0.001600976 |\n",
366
+ "| retriever:reader | retriever:reader | 1 | 232 | 11564 | 263081 | 10.1978 | 0.001600976 |\n",
367
+ "\n"
368
+ ],
369
+ "text/plain": [
370
+ " Term Df Df.res Sum Sq Sum Sq.res F value\n",
371
+ "retriever retriever 1 232 11564 263081 10.1978\n",
372
+ "reader reader 1 232 11564 263081 10.1978\n",
373
+ "retriever:reader retriever:reader 1 232 11564 263081 10.1978\n",
374
+ " Pr(>F) \n",
375
+ "retriever 0.001600976\n",
376
+ "reader 0.001600976\n",
377
+ "retriever:reader 0.001600976"
378
+ ]
379
+ },
380
+ "metadata": {},
381
+ "output_type": "display_data"
382
+ },
383
+ {
384
+ "name": "stderr",
385
+ "output_type": "stream",
386
+ "text": [
387
+ "NOTE: Results may be misleading due to involvement in interactions\n",
388
+ "\n"
389
+ ]
390
+ },
391
+ {
392
+ "data": {
393
+ "text/plain": [
394
+ " contrast estimate SE df t.ratio p.value\n",
395
+ " es - faiss 14 4.38 232 3.193 0.0016\n",
396
+ "\n",
397
+ "Results are averaged over the levels of: reader "
398
+ ]
399
+ },
400
+ "metadata": {},
401
+ "output_type": "display_data"
402
+ },
403
+ {
404
+ "name": "stderr",
405
+ "output_type": "stream",
406
+ "text": [
407
+ "NOTE: Results may be misleading due to involvement in interactions\n",
408
+ "\n"
409
+ ]
410
+ },
411
+ {
412
+ "data": {
413
+ "text/plain": [
414
+ " contrast estimate SE df t.ratio p.value\n",
415
+ " dpr - longformer -14 4.38 232 -3.193 0.0016\n",
416
+ "\n",
417
+ "Results are averaged over the levels of: retriever "
418
+ ]
419
+ },
420
+ "metadata": {},
421
+ "output_type": "display_data"
422
+ }
423
+ ],
424
+ "source": [
425
+ "model.acc <- art(em ~ retriever * reader, data = em_scores)  # response is the `em` column created by pivot_longer\n",
426
+ "anova(model.acc)\n",
427
+ "art.con(model.acc, ~ retriever)\n",
428
+ "art.con(model.acc, ~ reader)"
429
+ ]
430
+ },
431
+ {
432
+ "cell_type": "markdown",
433
+ "metadata": {},
434
+ "source": [
435
+ "From these results, we can see that both the retriever and the reader have a significant effect on the EM score ($F(1, 232) = 10.20$, $p = 0.0016$ for both). However, there is also an interaction between the retriever and reader ($F(1, 232) = 10.20$, $p = 0.0016$). The three terms yield identical statistics because every configuration using FAISS or DPR had zero exact matches. The post-hoc analysis of contrasts shows that ElasticSearch performs better than FAISS ($p = 0.0016$) and Longformer performs better than DPR ($p = 0.0016$)."
436
+ ]
437
+ }
438
+ ],
439
+ "metadata": {
440
+ "kernelspec": {
441
+ "display_name": "R",
442
+ "language": "R",
443
+ "name": "ir"
444
+ },
445
+ "language_info": {
446
+ "codemirror_mode": "r",
447
+ "file_extension": ".r",
448
+ "mimetype": "text/x-r-source",
449
+ "name": "R",
450
+ "pygments_lexer": "r",
451
+ "version": "4.1.2"
452
+ },
453
+ "orig_nbformat": 4
454
+ },
455
+ "nbformat": 4,
456
+ "nbformat_minor": 2
457
+ }