GGroenendaal commited on
Commit
2664ada
2 Parent(s): 790404c 5a6e5dd

Merge branch 'main' of github.com:RamonMeffert/nlp-flashcard-project into main

Browse files
results/em_analysis.ipynb ADDED
@@ -0,0 +1,457 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "metadata": {},
6
+ "source": [
7
+ "# Exact Match (EM) Scores"
8
+ ]
9
+ },
10
+ {
11
+ "cell_type": "code",
12
+ "execution_count": 2,
13
+ "metadata": {
14
+ "vscode": {
15
+ "languageId": "r"
16
+ }
17
+ },
18
+ "outputs": [
19
+ {
20
+ "name": "stderr",
21
+ "output_type": "stream",
22
+ "text": [
23
+ "Loading required package: ggplot2\n",
24
+ "\n",
25
+ "-- \u001b[1mAttaching packages\u001b[22m --------------------------------------- tidyverse 1.3.1 --\n",
26
+ "\n",
27
+ "\u001b[32mv\u001b[39m \u001b[34mtibble \u001b[39m 3.1.5 \u001b[32mv\u001b[39m \u001b[34mdplyr \u001b[39m 1.0.7\n",
28
+ "\u001b[32mv\u001b[39m \u001b[34mtidyr \u001b[39m 1.1.4 \u001b[32mv\u001b[39m \u001b[34mstringr\u001b[39m 1.4.0\n",
29
+ "\u001b[32mv\u001b[39m \u001b[34mpurrr \u001b[39m 0.3.4 \u001b[32mv\u001b[39m \u001b[34mforcats\u001b[39m 0.5.1\n",
30
+ "\n",
31
+ "-- \u001b[1mConflicts\u001b[22m ------------------------------------------ tidyverse_conflicts() --\n",
32
+ "\u001b[31mx\u001b[39m \u001b[34mdplyr\u001b[39m::\u001b[32mfilter()\u001b[39m masks \u001b[34mstats\u001b[39m::filter()\n",
33
+ "\u001b[31mx\u001b[39m \u001b[34mdplyr\u001b[39m::\u001b[32mlag()\u001b[39m masks \u001b[34mstats\u001b[39m::lag()\n",
34
+ "\n",
35
+ "Loading required package: mvtnorm\n",
36
+ "\n",
37
+ "Loading required package: survival\n",
38
+ "\n",
39
+ "Loading required package: TH.data\n",
40
+ "\n",
41
+ "Loading required package: MASS\n",
42
+ "\n",
43
+ "\n",
44
+ "Attaching package: 'MASS'\n",
45
+ "\n",
46
+ "\n",
47
+ "The following object is masked from 'package:dplyr':\n",
48
+ "\n",
49
+ " select\n",
50
+ "\n",
51
+ "\n",
52
+ "\n",
53
+ "Attaching package: 'TH.data'\n",
54
+ "\n",
55
+ "\n",
56
+ "The following object is masked from 'package:MASS':\n",
57
+ "\n",
58
+ " geyser\n",
59
+ "\n",
60
+ "\n",
61
+ "Loading required package: carData\n",
62
+ "\n",
63
+ "\n",
64
+ "Attaching package: 'car'\n",
65
+ "\n",
66
+ "\n",
67
+ "The following object is masked from 'package:dplyr':\n",
68
+ "\n",
69
+ " recode\n",
70
+ "\n",
71
+ "\n",
72
+ "The following object is masked from 'package:purrr':\n",
73
+ "\n",
74
+ " some\n",
75
+ "\n",
76
+ "\n",
77
+ "\n",
78
+ "Attaching package: 'rstatix'\n",
79
+ "\n",
80
+ "\n",
81
+ "The following object is masked from 'package:MASS':\n",
82
+ "\n",
83
+ " select\n",
84
+ "\n",
85
+ "\n",
86
+ "The following object is masked from 'package:stats':\n",
87
+ "\n",
88
+ " filter\n",
89
+ "\n",
90
+ "\n"
91
+ ]
92
+ }
93
+ ],
94
+ "source": [
95
+ "library(\"ggpubr\")\n",
96
+ "library(readr)\n",
97
+ "library(ggplot2)\n",
98
+ "library(tidyverse)\n",
99
+ "library(ARTool)\n",
100
+ "library(emmeans)\n",
101
+ "library(multcomp)\n",
102
+ "library(car)\n",
103
+ "library(rstatix)"
104
+ ]
105
+ },
106
+ {
107
+ "cell_type": "code",
108
+ "execution_count": 3,
109
+ "metadata": {
110
+ "vscode": {
111
+ "languageId": "r"
112
+ }
113
+ },
114
+ "outputs": [
115
+ {
116
+ "name": "stderr",
117
+ "output_type": "stream",
118
+ "text": [
119
+ "New names:\n",
120
+ "* `` -> ...1\n",
121
+ "\n",
122
+ "\u001b[1mRows: \u001b[22m\u001b[34m59\u001b[39m \u001b[1mColumns: \u001b[22m\u001b[34m5\u001b[39m\n",
123
+ "\u001b[36m--\u001b[39m \u001b[1mColumn specification\u001b[22m \u001b[36m--------------------------------------------------------\u001b[39m\n",
124
+ "\u001b[1mDelimiter:\u001b[22m \",\"\n",
125
+ "\u001b[32mdbl\u001b[39m (5): ...1, faiss_dpr, faiss_longformer, es_dpr, es_longformer\n",
126
+ "\n",
127
+ "\u001b[36mi\u001b[39m Use `spec()` to retrieve the full column specification for this data.\n",
128
+ "\u001b[36mi\u001b[39m Specify the column types or set `show_col_types = FALSE` to quiet this message.\n"
129
+ ]
130
+ },
131
+ {
132
+ "data": {
133
+ "text/html": [
134
+ "<table class=\"dataframe\">\n",
135
+ "<caption>A tibble: 6 × 4</caption>\n",
136
+ "<thead>\n",
137
+ "\t<tr><th scope=col>question</th><th scope=col>retriever</th><th scope=col>reader</th><th scope=col>em</th></tr>\n",
138
+ "\t<tr><th scope=col>&lt;dbl&gt;</th><th scope=col>&lt;fct&gt;</th><th scope=col>&lt;fct&gt;</th><th scope=col>&lt;dbl&gt;</th></tr>\n",
139
+ "</thead>\n",
140
+ "<tbody>\n",
141
+ "\t<tr><td>0</td><td>faiss</td><td>dpr </td><td>0</td></tr>\n",
142
+ "\t<tr><td>0</td><td>faiss</td><td>longformer</td><td>0</td></tr>\n",
143
+ "\t<tr><td>0</td><td>es </td><td>dpr </td><td>0</td></tr>\n",
144
+ "\t<tr><td>0</td><td>es </td><td>longformer</td><td>0</td></tr>\n",
145
+ "\t<tr><td>1</td><td>faiss</td><td>dpr </td><td>0</td></tr>\n",
146
+ "\t<tr><td>1</td><td>faiss</td><td>longformer</td><td>0</td></tr>\n",
147
+ "</tbody>\n",
148
+ "</table>\n"
149
+ ],
150
+ "text/latex": [
151
+ "A tibble: 6 × 4\n",
152
+ "\\begin{tabular}{llll}\n",
153
+ " question & retriever & reader & em\\\\\n",
154
+ " <dbl> & <fct> & <fct> & <dbl>\\\\\n",
155
+ "\\hline\n",
156
+ "\t 0 & faiss & dpr & 0\\\\\n",
157
+ "\t 0 & faiss & longformer & 0\\\\\n",
158
+ "\t 0 & es & dpr & 0\\\\\n",
159
+ "\t 0 & es & longformer & 0\\\\\n",
160
+ "\t 1 & faiss & dpr & 0\\\\\n",
161
+ "\t 1 & faiss & longformer & 0\\\\\n",
162
+ "\\end{tabular}\n"
163
+ ],
164
+ "text/markdown": [
165
+ "\n",
166
+ "A tibble: 6 × 4\n",
167
+ "\n",
168
+ "| question &lt;dbl&gt; | retriever &lt;fct&gt; | reader &lt;fct&gt; | em &lt;dbl&gt; |\n",
169
+ "|---|---|---|---|\n",
170
+ "| 0 | faiss | dpr | 0 |\n",
171
+ "| 0 | faiss | longformer | 0 |\n",
172
+ "| 0 | es | dpr | 0 |\n",
173
+ "| 0 | es | longformer | 0 |\n",
174
+ "| 1 | faiss | dpr | 0 |\n",
175
+ "| 1 | faiss | longformer | 0 |\n",
176
+ "\n"
177
+ ],
178
+ "text/plain": [
179
+ " question retriever reader em\n",
180
+ "1 0 faiss dpr 0 \n",
181
+ "2 0 faiss longformer 0 \n",
182
+ "3 0 es dpr 0 \n",
183
+ "4 0 es longformer 0 \n",
184
+ "5 1 faiss dpr 0 \n",
185
+ "6 1 faiss longformer 0 "
186
+ ]
187
+ },
188
+ "metadata": {},
189
+ "output_type": "display_data"
190
+ }
191
+ ],
192
+ "source": [
193
+ "em_scores <- read_csv(\"em_scores.csv\") %>%\n",
194
+ " rename(question = `...1`) %>%\n",
195
+ " pivot_longer(!question, names_to=c(\"retriever\", \"reader\"), names_sep=\"_\", values_to=\"em\")\n",
196
+ "\n",
197
+ "em_scores$retriever <- as.factor(em_scores$retriever)\n",
198
+ "em_scores$reader <- as.factor(em_scores$reader)\n",
199
+ "\n",
200
+ "head(em_scores)"
201
+ ]
202
+ },
203
+ {
204
+ "cell_type": "markdown",
205
+ "metadata": {},
206
+ "source": [
207
+ "To determine which statistical tests we can use, we first need to check for normality. For this, we use a Shapiro-Wilk test of normality. In this case, the results with FAISS as retriever and the results with DPR as reader had zero exact matches, which makes it impossible to compute the Shapiro-Wilk test for those groups. Nonetheless, we know that a distribution with all-identical values is not normally distributed. As you can see in the results below, all other $p$-values are lower than 0.001, so we reject the null hypothesis of normality and conclude that none of the EM scores are normally distributed."
208
+ ]
209
+ },
210
+ {
211
+ "cell_type": "code",
212
+ "execution_count": 14,
213
+ "metadata": {
214
+ "vscode": {
215
+ "languageId": "r"
216
+ }
217
+ },
218
+ "outputs": [
219
+ {
220
+ "data": {
221
+ "text/html": [
222
+ "<table class=\"dataframe\">\n",
223
+ "<caption>A tibble: 1 × 3</caption>\n",
224
+ "<thead>\n",
225
+ "\t<tr><th scope=col>retriever</th><th scope=col>sw.stat</th><th scope=col>sw.p</th></tr>\n",
226
+ "\t<tr><th scope=col>&lt;fct&gt;</th><th scope=col>&lt;dbl&gt;</th><th scope=col>&lt;dbl&gt;</th></tr>\n",
227
+ "</thead>\n",
228
+ "<tbody>\n",
229
+ "\t<tr><td>es</td><td>0.2503666</td><td>6.788451e-22</td></tr>\n",
230
+ "</tbody>\n",
231
+ "</table>\n"
232
+ ],
233
+ "text/latex": [
234
+ "A tibble: 1 × 3\n",
235
+ "\\begin{tabular}{lll}\n",
236
+ " retriever & sw.stat & sw.p\\\\\n",
237
+ " <fct> & <dbl> & <dbl>\\\\\n",
238
+ "\\hline\n",
239
+ "\t es & 0.2503666 & 6.788451e-22\\\\\n",
240
+ "\\end{tabular}\n"
241
+ ],
242
+ "text/markdown": [
243
+ "\n",
244
+ "A tibble: 1 × 3\n",
245
+ "\n",
246
+ "| retriever &lt;fct&gt; | sw.stat &lt;dbl&gt; | sw.p &lt;dbl&gt; |\n",
247
+ "|---|---|---|\n",
248
+ "| es | 0.2503666 | 6.788451e-22 |\n",
249
+ "\n"
250
+ ],
251
+ "text/plain": [
252
+ " retriever sw.stat sw.p \n",
253
+ "1 es 0.2503666 6.788451e-22"
254
+ ]
255
+ },
256
+ "metadata": {},
257
+ "output_type": "display_data"
258
+ },
259
+ {
260
+ "data": {
261
+ "text/html": [
262
+ "<table class=\"dataframe\">\n",
263
+ "<caption>A tibble: 1 × 3</caption>\n",
264
+ "<thead>\n",
265
+ "\t<tr><th scope=col>reader</th><th scope=col>sw.stat</th><th scope=col>sw.p</th></tr>\n",
266
+ "\t<tr><th scope=col>&lt;fct&gt;</th><th scope=col>&lt;dbl&gt;</th><th scope=col>&lt;dbl&gt;</th></tr>\n",
267
+ "</thead>\n",
268
+ "<tbody>\n",
269
+ "\t<tr><td>longformer</td><td>0.2503666</td><td>6.788451e-22</td></tr>\n",
270
+ "</tbody>\n",
271
+ "</table>\n"
272
+ ],
273
+ "text/latex": [
274
+ "A tibble: 1 × 3\n",
275
+ "\\begin{tabular}{lll}\n",
276
+ " reader & sw.stat & sw.p\\\\\n",
277
+ " <fct> & <dbl> & <dbl>\\\\\n",
278
+ "\\hline\n",
279
+ "\t longformer & 0.2503666 & 6.788451e-22\\\\\n",
280
+ "\\end{tabular}\n"
281
+ ],
282
+ "text/markdown": [
283
+ "\n",
284
+ "A tibble: 1 × 3\n",
285
+ "\n",
286
+ "| reader &lt;fct&gt; | sw.stat &lt;dbl&gt; | sw.p &lt;dbl&gt; |\n",
287
+ "|---|---|---|\n",
288
+ "| longformer | 0.2503666 | 6.788451e-22 |\n",
289
+ "\n"
290
+ ],
291
+ "text/plain": [
292
+ " reader sw.stat sw.p \n",
293
+ "1 longformer 0.2503666 6.788451e-22"
294
+ ]
295
+ },
296
+ "metadata": {},
297
+ "output_type": "display_data"
298
+ }
299
+ ],
300
+ "source": [
301
+ "em_scores %>%\n",
302
+ " select(!question) %>%\n",
303
+ " group_by(retriever) %>%\n",
304
+ " filter(sum(em) > 0) %>%\n",
305
+ " summarise(sw.stat = shapiro.test(em)$statistic,\n",
306
+ " sw.p = shapiro.test(em)$p)\n",
307
+ "em_scores %>%\n",
308
+ " select(!question) %>%\n",
309
+ " group_by(reader) %>%\n",
310
+ " filter(sum(em) > 0) %>%\n",
311
+ " summarise(sw.stat = shapiro.test(em)$statistic,\n",
312
+ " sw.p = shapiro.test(em)$p)"
313
+ ]
314
+ },
315
+ {
316
+ "cell_type": "markdown",
317
+ "metadata": {},
318
+ "source": [
319
+ "Since our data is not normally distributed, we cannot use an ANOVA to compare our results. Therefore, we use an aligned-rank test, which is a non-parametric version of a factorial repeated measures ANOVA."
320
+ ]
321
+ },
322
+ {
323
+ "cell_type": "code",
324
+ "execution_count": 4,
325
+ "metadata": {
326
+ "vscode": {
327
+ "languageId": "r"
328
+ }
329
+ },
330
+ "outputs": [
331
+ {
332
+ "data": {
333
+ "text/html": [
334
+ "<table class=\"dataframe\">\n",
335
+ "<caption>A anova.art: 3 × 7</caption>\n",
336
+ "<thead>\n",
337
+ "\t<tr><th></th><th scope=col>Term</th><th scope=col>Df</th><th scope=col>Df.res</th><th scope=col>Sum Sq</th><th scope=col>Sum Sq.res</th><th scope=col>F value</th><th scope=col>Pr(&gt;F)</th></tr>\n",
338
+ "\t<tr><th></th><th scope=col>&lt;chr&gt;</th><th scope=col>&lt;dbl&gt;</th><th scope=col>&lt;dbl&gt;</th><th scope=col>&lt;dbl&gt;</th><th scope=col>&lt;dbl&gt;</th><th scope=col>&lt;dbl&gt;</th><th scope=col>&lt;dbl&gt;</th></tr>\n",
339
+ "</thead>\n",
340
+ "<tbody>\n",
341
+ "\t<tr><th scope=row>retriever</th><td>retriever </td><td>1</td><td>232</td><td>11564</td><td>263081</td><td>10.1978</td><td>0.001600976</td></tr>\n",
342
+ "\t<tr><th scope=row>reader</th><td>reader </td><td>1</td><td>232</td><td>11564</td><td>263081</td><td>10.1978</td><td>0.001600976</td></tr>\n",
343
+ "\t<tr><th scope=row>retriever:reader</th><td>retriever:reader</td><td>1</td><td>232</td><td>11564</td><td>263081</td><td>10.1978</td><td>0.001600976</td></tr>\n",
344
+ "</tbody>\n",
345
+ "</table>\n"
346
+ ],
347
+ "text/latex": [
348
+ "A anova.art: 3 × 7\n",
349
+ "\\begin{tabular}{r|lllllll}\n",
350
+ " & Term & Df & Df.res & Sum Sq & Sum Sq.res & F value & Pr(>F)\\\\\n",
351
+ " & <chr> & <dbl> & <dbl> & <dbl> & <dbl> & <dbl> & <dbl>\\\\\n",
352
+ "\\hline\n",
353
+ "\tretriever & retriever & 1 & 232 & 11564 & 263081 & 10.1978 & 0.001600976\\\\\n",
354
+ "\treader & reader & 1 & 232 & 11564 & 263081 & 10.1978 & 0.001600976\\\\\n",
355
+ "\tretriever:reader & retriever:reader & 1 & 232 & 11564 & 263081 & 10.1978 & 0.001600976\\\\\n",
356
+ "\\end{tabular}\n"
357
+ ],
358
+ "text/markdown": [
359
+ "\n",
360
+ "A anova.art: 3 × 7\n",
361
+ "\n",
362
+ "| <!--/--> | Term &lt;chr&gt; | Df &lt;dbl&gt; | Df.res &lt;dbl&gt; | Sum Sq &lt;dbl&gt; | Sum Sq.res &lt;dbl&gt; | F value &lt;dbl&gt; | Pr(&gt;F) &lt;dbl&gt; |\n",
363
+ "|---|---|---|---|---|---|---|---|\n",
364
+ "| retriever | retriever | 1 | 232 | 11564 | 263081 | 10.1978 | 0.001600976 |\n",
365
+ "| reader | reader | 1 | 232 | 11564 | 263081 | 10.1978 | 0.001600976 |\n",
366
+ "| retriever:reader | retriever:reader | 1 | 232 | 11564 | 263081 | 10.1978 | 0.001600976 |\n",
367
+ "\n"
368
+ ],
369
+ "text/plain": [
370
+ " Term Df Df.res Sum Sq Sum Sq.res F value\n",
371
+ "retriever retriever 1 232 11564 263081 10.1978\n",
372
+ "reader reader 1 232 11564 263081 10.1978\n",
373
+ "retriever:reader retriever:reader 1 232 11564 263081 10.1978\n",
374
+ " Pr(>F) \n",
375
+ "retriever 0.001600976\n",
376
+ "reader 0.001600976\n",
377
+ "retriever:reader 0.001600976"
378
+ ]
379
+ },
380
+ "metadata": {},
381
+ "output_type": "display_data"
382
+ },
383
+ {
384
+ "name": "stderr",
385
+ "output_type": "stream",
386
+ "text": [
387
+ "NOTE: Results may be misleading due to involvement in interactions\n",
388
+ "\n"
389
+ ]
390
+ },
391
+ {
392
+ "data": {
393
+ "text/plain": [
394
+ " contrast estimate SE df t.ratio p.value\n",
395
+ " es - faiss 14 4.38 232 3.193 0.0016\n",
396
+ "\n",
397
+ "Results are averaged over the levels of: reader "
398
+ ]
399
+ },
400
+ "metadata": {},
401
+ "output_type": "display_data"
402
+ },
403
+ {
404
+ "name": "stderr",
405
+ "output_type": "stream",
406
+ "text": [
407
+ "NOTE: Results may be misleading due to involvement in interactions\n",
408
+ "\n"
409
+ ]
410
+ },
411
+ {
412
+ "data": {
413
+ "text/plain": [
414
+ " contrast estimate SE df t.ratio p.value\n",
415
+ " dpr - longformer -14 4.38 232 -3.193 0.0016\n",
416
+ "\n",
417
+ "Results are averaged over the levels of: retriever "
418
+ ]
419
+ },
420
+ "metadata": {},
421
+ "output_type": "display_data"
422
+ }
423
+ ],
424
+ "source": [
425
+ "model.acc <- art(em ~ retriever * reader, data = em_scores)\n",
426
+ "anova(model.acc)\n",
427
+ "art.con(model.acc, ~ retriever)\n",
428
+ "art.con(model.acc, ~ reader)"
429
+ ]
430
+ },
431
+ {
432
+ "cell_type": "markdown",
433
+ "metadata": {},
434
+ "source": [
435
+ "From these results, we can see that both the retriever and the reader have a significant effect on the EM score ($F = 10.20$, $p = 0.0016$ for both). However, there is also an interaction between the retriever and reader ($F = 10.20$, $p = 0.0016$). The three terms have identical statistics because exact matches only occur for the combination of ElasticSearch and Longformer. The post-hoc analysis of contrasts shows that ElasticSearch performs better than FAISS ($p = 0.0016$) and Longformer performs better than DPR ($p = 0.0016$), although these main-effect contrasts should be interpreted with care because of the interaction."
436
+ ]
437
+ }
438
+ ],
439
+ "metadata": {
440
+ "kernelspec": {
441
+ "display_name": "R",
442
+ "language": "R",
443
+ "name": "ir"
444
+ },
445
+ "language_info": {
446
+ "codemirror_mode": "r",
447
+ "file_extension": ".r",
448
+ "mimetype": "text/x-r-source",
449
+ "name": "R",
450
+ "pygments_lexer": "r",
451
+ "version": "4.1.2"
452
+ },
453
+ "orig_nbformat": 4
454
+ },
455
+ "nbformat": 4,
456
+ "nbformat_minor": 2
457
+ }
results/f1_analysis.ipynb ADDED
@@ -0,0 +1,465 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "metadata": {},
6
+ "source": [
7
+ "# F1 Scores"
8
+ ]
9
+ },
10
+ {
11
+ "cell_type": "code",
12
+ "execution_count": null,
13
+ "metadata": {
14
+ "vscode": {
15
+ "languageId": "r"
16
+ }
17
+ },
18
+ "outputs": [],
19
+ "source": [
20
+ "library(\"ggpubr\")\n",
21
+ "library(readr)\n",
22
+ "library(ggplot2)\n",
23
+ "library(tidyverse)\n",
24
+ "library(ARTool)\n",
25
+ "library(emmeans)\n",
26
+ "library(multcomp)\n",
27
+ "library(car)\n",
28
+ "library(rstatix)"
29
+ ]
30
+ },
31
+ {
32
+ "cell_type": "code",
33
+ "execution_count": 10,
34
+ "metadata": {
35
+ "vscode": {
36
+ "languageId": "r"
37
+ }
38
+ },
39
+ "outputs": [
40
+ {
41
+ "name": "stderr",
42
+ "output_type": "stream",
43
+ "text": [
44
+ "New names:\n",
45
+ "* `` -> ...1\n",
46
+ "\n",
47
+ "\u001b[1mRows: \u001b[22m\u001b[34m59\u001b[39m \u001b[1mColumns: \u001b[22m\u001b[34m5\u001b[39m\n",
48
+ "\u001b[36m--\u001b[39m \u001b[1mColumn specification\u001b[22m \u001b[36m--------------------------------------------------------\u001b[39m\n",
49
+ "\u001b[1mDelimiter:\u001b[22m \",\"\n",
50
+ "\u001b[32mdbl\u001b[39m (5): ...1, faiss_dpr, faiss_longformer, es_dpr, es_longformer\n",
51
+ "\n",
52
+ "\u001b[36mi\u001b[39m Use `spec()` to retrieve the full column specification for this data.\n",
53
+ "\u001b[36mi\u001b[39m Specify the column types or set `show_col_types = FALSE` to quiet this message.\n"
54
+ ]
55
+ },
56
+ {
57
+ "data": {
58
+ "text/html": [
59
+ "<table class=\"dataframe\">\n",
60
+ "<caption>A tibble: 6 × 4</caption>\n",
61
+ "<thead>\n",
62
+ "\t<tr><th scope=col>question</th><th scope=col>retriever</th><th scope=col>reader</th><th scope=col>f1</th></tr>\n",
63
+ "\t<tr><th scope=col>&lt;dbl&gt;</th><th scope=col>&lt;fct&gt;</th><th scope=col>&lt;fct&gt;</th><th scope=col>&lt;dbl&gt;</th></tr>\n",
64
+ "</thead>\n",
65
+ "<tbody>\n",
66
+ "\t<tr><td>0</td><td>faiss</td><td>dpr </td><td>0.0000000</td></tr>\n",
67
+ "\t<tr><td>0</td><td>faiss</td><td>longformer</td><td>0.0000000</td></tr>\n",
68
+ "\t<tr><td>0</td><td>es </td><td>dpr </td><td>0.1300813</td></tr>\n",
69
+ "\t<tr><td>0</td><td>es </td><td>longformer</td><td>0.7692308</td></tr>\n",
70
+ "\t<tr><td>1</td><td>faiss</td><td>dpr </td><td>0.0000000</td></tr>\n",
71
+ "\t<tr><td>1</td><td>faiss</td><td>longformer</td><td>0.0000000</td></tr>\n",
72
+ "</tbody>\n",
73
+ "</table>\n"
74
+ ],
75
+ "text/latex": [
76
+ "A tibble: 6 × 4\n",
77
+ "\\begin{tabular}{llll}\n",
78
+ " question & retriever & reader & f1\\\\\n",
79
+ " <dbl> & <fct> & <fct> & <dbl>\\\\\n",
80
+ "\\hline\n",
81
+ "\t 0 & faiss & dpr & 0.0000000\\\\\n",
82
+ "\t 0 & faiss & longformer & 0.0000000\\\\\n",
83
+ "\t 0 & es & dpr & 0.1300813\\\\\n",
84
+ "\t 0 & es & longformer & 0.7692308\\\\\n",
85
+ "\t 1 & faiss & dpr & 0.0000000\\\\\n",
86
+ "\t 1 & faiss & longformer & 0.0000000\\\\\n",
87
+ "\\end{tabular}\n"
88
+ ],
89
+ "text/markdown": [
90
+ "\n",
91
+ "A tibble: 6 × 4\n",
92
+ "\n",
93
+ "| question &lt;dbl&gt; | retriever &lt;fct&gt; | reader &lt;fct&gt; | f1 &lt;dbl&gt; |\n",
94
+ "|---|---|---|---|\n",
95
+ "| 0 | faiss | dpr | 0.0000000 |\n",
96
+ "| 0 | faiss | longformer | 0.0000000 |\n",
97
+ "| 0 | es | dpr | 0.1300813 |\n",
98
+ "| 0 | es | longformer | 0.7692308 |\n",
99
+ "| 1 | faiss | dpr | 0.0000000 |\n",
100
+ "| 1 | faiss | longformer | 0.0000000 |\n",
101
+ "\n"
102
+ ],
103
+ "text/plain": [
104
+ " question retriever reader f1 \n",
105
+ "1 0 faiss dpr 0.0000000\n",
106
+ "2 0 faiss longformer 0.0000000\n",
107
+ "3 0 es dpr 0.1300813\n",
108
+ "4 0 es longformer 0.7692308\n",
109
+ "5 1 faiss dpr 0.0000000\n",
110
+ "6 1 faiss longformer 0.0000000"
111
+ ]
112
+ },
113
+ "metadata": {},
114
+ "output_type": "display_data"
115
+ }
116
+ ],
117
+ "source": [
118
+ "f1_scores <- read_csv(\"f1_scores.csv\") %>%\n",
119
+ " rename(question = `...1`) %>%\n",
120
+ " pivot_longer(!question, names_to=c(\"retriever\", \"reader\"), names_sep=\"_\", values_to=\"f1\")\n",
121
+ "\n",
122
+ "f1_scores$retriever = as.factor(f1_scores$retriever)\n",
123
+ "f1_scores$reader = as.factor(f1_scores$reader)\n",
124
+ "\n",
125
+ "head(f1_scores)"
126
+ ]
127
+ },
128
+ {
129
+ "cell_type": "markdown",
130
+ "metadata": {},
131
+ "source": [
132
+ "To determine which statistical tests we can use, we first need to check for normality. For this, we use a Shapiro-Wilk test of normality. As you can see in the results below, all $p$-values are lower than 0.001, so we reject the null hypothesis of normality and conclude that none of the F1 scores are normally distributed."
133
+ ]
134
+ },
135
+ {
136
+ "cell_type": "code",
137
+ "execution_count": 21,
138
+ "metadata": {
139
+ "vscode": {
140
+ "languageId": "r"
141
+ }
142
+ },
143
+ "outputs": [
144
+ {
145
+ "data": {
146
+ "text/html": [
147
+ "<table class=\"dataframe\">\n",
148
+ "<caption>A tibble: 1 × 3</caption>\n",
149
+ "<thead>\n",
150
+ "\t<tr><th scope=col>variable</th><th scope=col>statistic</th><th scope=col>p</th></tr>\n",
151
+ "\t<tr><th scope=col>&lt;chr&gt;</th><th scope=col>&lt;dbl&gt;</th><th scope=col>&lt;dbl&gt;</th></tr>\n",
152
+ "</thead>\n",
153
+ "<tbody>\n",
154
+ "\t<tr><td>f1</td><td>0.5086706</td><td>3.999447e-18</td></tr>\n",
155
+ "</tbody>\n",
156
+ "</table>\n"
157
+ ],
158
+ "text/latex": [
159
+ "A tibble: 1 × 3\n",
160
+ "\\begin{tabular}{lll}\n",
161
+ " variable & statistic & p\\\\\n",
162
+ " <chr> & <dbl> & <dbl>\\\\\n",
163
+ "\\hline\n",
164
+ "\t f1 & 0.5086706 & 3.999447e-18\\\\\n",
165
+ "\\end{tabular}\n"
166
+ ],
167
+ "text/markdown": [
168
+ "\n",
169
+ "A tibble: 1 × 3\n",
170
+ "\n",
171
+ "| variable &lt;chr&gt; | statistic &lt;dbl&gt; | p &lt;dbl&gt; |\n",
172
+ "|---|---|---|\n",
173
+ "| f1 | 0.5086706 | 3.999447e-18 |\n",
174
+ "\n"
175
+ ],
176
+ "text/plain": [
177
+ " variable statistic p \n",
178
+ "1 f1 0.5086706 3.999447e-18"
179
+ ]
180
+ },
181
+ "metadata": {},
182
+ "output_type": "display_data"
183
+ },
184
+ {
185
+ "data": {
186
+ "text/html": [
187
+ "<table class=\"dataframe\">\n",
188
+ "<caption>A tibble: 1 × 3</caption>\n",
189
+ "<thead>\n",
190
+ "\t<tr><th scope=col>variable</th><th scope=col>statistic</th><th scope=col>p</th></tr>\n",
191
+ "\t<tr><th scope=col>&lt;chr&gt;</th><th scope=col>&lt;dbl&gt;</th><th scope=col>&lt;dbl&gt;</th></tr>\n",
192
+ "</thead>\n",
193
+ "<tbody>\n",
194
+ "\t<tr><td>f1</td><td>0.7704567</td><td>2.671656e-12</td></tr>\n",
195
+ "</tbody>\n",
196
+ "</table>\n"
197
+ ],
198
+ "text/latex": [
199
+ "A tibble: 1 × 3\n",
200
+ "\\begin{tabular}{lll}\n",
201
+ " variable & statistic & p\\\\\n",
202
+ " <chr> & <dbl> & <dbl>\\\\\n",
203
+ "\\hline\n",
204
+ "\t f1 & 0.7704567 & 2.671656e-12\\\\\n",
205
+ "\\end{tabular}\n"
206
+ ],
207
+ "text/markdown": [
208
+ "\n",
209
+ "A tibble: 1 × 3\n",
210
+ "\n",
211
+ "| variable &lt;chr&gt; | statistic &lt;dbl&gt; | p &lt;dbl&gt; |\n",
212
+ "|---|---|---|\n",
213
+ "| f1 | 0.7704567 | 2.671656e-12 |\n",
214
+ "\n"
215
+ ],
216
+ "text/plain": [
217
+ " variable statistic p \n",
218
+ "1 f1 0.7704567 2.671656e-12"
219
+ ]
220
+ },
221
+ "metadata": {},
222
+ "output_type": "display_data"
223
+ },
224
+ {
225
+ "data": {
226
+ "text/html": [
227
+ "<table class=\"dataframe\">\n",
228
+ "<caption>A tibble: 1 × 3</caption>\n",
229
+ "<thead>\n",
230
+ "\t<tr><th scope=col>variable</th><th scope=col>statistic</th><th scope=col>p</th></tr>\n",
231
+ "\t<tr><th scope=col>&lt;chr&gt;</th><th scope=col>&lt;dbl&gt;</th><th scope=col>&lt;dbl&gt;</th></tr>\n",
232
+ "</thead>\n",
233
+ "<tbody>\n",
234
+ "\t<tr><td>f1</td><td>0.6741031</td><td>7.912632e-15</td></tr>\n",
235
+ "</tbody>\n",
236
+ "</table>\n"
237
+ ],
238
+ "text/latex": [
239
+ "A tibble: 1 × 3\n",
240
+ "\\begin{tabular}{lll}\n",
241
+ " variable & statistic & p\\\\\n",
242
+ " <chr> & <dbl> & <dbl>\\\\\n",
243
+ "\\hline\n",
244
+ "\t f1 & 0.6741031 & 7.912632e-15\\\\\n",
245
+ "\\end{tabular}\n"
246
+ ],
247
+ "text/markdown": [
248
+ "\n",
249
+ "A tibble: 1 × 3\n",
250
+ "\n",
251
+ "| variable &lt;chr&gt; | statistic &lt;dbl&gt; | p &lt;dbl&gt; |\n",
252
+ "|---|---|---|\n",
253
+ "| f1 | 0.6741031 | 7.912632e-15 |\n",
254
+ "\n"
255
+ ],
256
+ "text/plain": [
257
+ " variable statistic p \n",
258
+ "1 f1 0.6741031 7.912632e-15"
259
+ ]
260
+ },
261
+ "metadata": {},
262
+ "output_type": "display_data"
263
+ },
264
+ {
265
+ "data": {
266
+ "text/html": [
267
+ "<table class=\"dataframe\">\n",
268
+ "<caption>A tibble: 1 × 3</caption>\n",
269
+ "<thead>\n",
270
+ "\t<tr><th scope=col>variable</th><th scope=col>statistic</th><th scope=col>p</th></tr>\n",
271
+ "\t<tr><th scope=col>&lt;chr&gt;</th><th scope=col>&lt;dbl&gt;</th><th scope=col>&lt;dbl&gt;</th></tr>\n",
272
+ "</thead>\n",
273
+ "<tbody>\n",
274
+ "\t<tr><td>f1</td><td>0.6558935</td><td>3.037616e-15</td></tr>\n",
275
+ "</tbody>\n",
276
+ "</table>\n"
277
+ ],
278
+ "text/latex": [
279
+ "A tibble: 1 × 3\n",
280
+ "\\begin{tabular}{lll}\n",
281
+ " variable & statistic & p\\\\\n",
282
+ " <chr> & <dbl> & <dbl>\\\\\n",
283
+ "\\hline\n",
284
+ "\t f1 & 0.6558935 & 3.037616e-15\\\\\n",
285
+ "\\end{tabular}\n"
286
+ ],
287
+ "text/markdown": [
288
+ "\n",
289
+ "A tibble: 1 × 3\n",
290
+ "\n",
291
+ "| variable &lt;chr&gt; | statistic &lt;dbl&gt; | p &lt;dbl&gt; |\n",
292
+ "|---|---|---|\n",
293
+ "| f1 | 0.6558935 | 3.037616e-15 |\n",
294
+ "\n"
295
+ ],
296
+ "text/plain": [
297
+ " variable statistic p \n",
298
+ "1 f1 0.6558935 3.037616e-15"
299
+ ]
300
+ },
301
+ "metadata": {},
302
+ "output_type": "display_data"
303
+ }
304
+ ],
305
+ "source": [
306
+ "f1_scores %>%\n",
307
+ " filter(retriever == \"faiss\") %>%\n",
308
+ " shapiro_test(f1)\n",
309
+ "\n",
310
+ "f1_scores %>%\n",
311
+ " filter(retriever == \"es\") %>%\n",
312
+ " shapiro_test(f1)\n",
313
+ "\n",
314
+ "f1_scores %>%\n",
315
+ " filter(reader == \"dpr\") %>%\n",
316
+ " shapiro_test(f1)\n",
317
+ "\n",
318
+ "f1_scores %>%\n",
319
+ " filter(reader == \"longformer\") %>%\n",
320
+ " shapiro_test(f1)\n"
321
+ ]
322
+ },
323
+ {
324
+ "cell_type": "markdown",
325
+ "metadata": {},
326
+ "source": [
327
+ "Since our data is not normally distributed, we cannot use an ANOVA to compare our results. Therefore, we use an aligned-rank test, which is a non-parametric version of a factorial repeated measures ANOVA."
328
+ ]
329
+ },
330
+ {
331
+ "cell_type": "code",
332
+ "execution_count": 22,
333
+ "metadata": {
334
+ "vscode": {
335
+ "languageId": "r"
336
+ }
337
+ },
338
+ "outputs": [
339
+ {
340
+ "data": {
341
+ "text/html": [
342
+ "<table class=\"dataframe\">\n",
343
+ "<caption>A anova.art: 3 × 7</caption>\n",
344
+ "<thead>\n",
345
+ "\t<tr><th></th><th scope=col>Term</th><th scope=col>Df</th><th scope=col>Df.res</th><th scope=col>Sum Sq</th><th scope=col>Sum Sq.res</th><th scope=col>F value</th><th scope=col>Pr(&gt;F)</th></tr>\n",
346
+ "\t<tr><th></th><th scope=col>&lt;chr&gt;</th><th scope=col>&lt;dbl&gt;</th><th scope=col>&lt;dbl&gt;</th><th scope=col>&lt;dbl&gt;</th><th scope=col>&lt;dbl&gt;</th><th scope=col>&lt;dbl&gt;</th><th scope=col>&lt;dbl&gt;</th></tr>\n",
347
+ "</thead>\n",
348
+ "<tbody>\n",
349
+ "\t<tr><th scope=row>retriever</th><td>retriever </td><td>1</td><td>232</td><td>200452.90</td><td>793168.0</td><td>58.63206</td><td>5.105423e-13</td></tr>\n",
350
+ "\t<tr><th scope=row>reader</th><td>reader </td><td>1</td><td>232</td><td> 66045.36</td><td>944311.6</td><td>16.22613</td><td>7.620176e-05</td></tr>\n",
351
+ "\t<tr><th scope=row>retriever:reader</th><td>retriever:reader</td><td>1</td><td>232</td><td>158290.44</td><td>843714.0</td><td>43.52587</td><td>2.804257e-10</td></tr>\n",
352
+ "</tbody>\n",
353
+ "</table>\n"
354
+ ],
355
+ "text/latex": [
356
+ "A anova.art: 3 × 7\n",
357
+ "\\begin{tabular}{r|lllllll}\n",
358
+ " & Term & Df & Df.res & Sum Sq & Sum Sq.res & F value & Pr(>F)\\\\\n",
359
+ " & <chr> & <dbl> & <dbl> & <dbl> & <dbl> & <dbl> & <dbl>\\\\\n",
360
+ "\\hline\n",
361
+ "\tretriever & retriever & 1 & 232 & 200452.90 & 793168.0 & 58.63206 & 5.105423e-13\\\\\n",
362
+ "\treader & reader & 1 & 232 & 66045.36 & 944311.6 & 16.22613 & 7.620176e-05\\\\\n",
363
+ "\tretriever:reader & retriever:reader & 1 & 232 & 158290.44 & 843714.0 & 43.52587 & 2.804257e-10\\\\\n",
364
+ "\\end{tabular}\n"
365
+ ],
366
+ "text/markdown": [
367
+ "\n",
368
+ "A anova.art: 3 × 7\n",
369
+ "\n",
370
+ "| <!--/--> | Term &lt;chr&gt; | Df &lt;dbl&gt; | Df.res &lt;dbl&gt; | Sum Sq &lt;dbl&gt; | Sum Sq.res &lt;dbl&gt; | F value &lt;dbl&gt; | Pr(&gt;F) &lt;dbl&gt; |\n",
371
+ "|---|---|---|---|---|---|---|---|\n",
372
+ "| retriever | retriever | 1 | 232 | 200452.90 | 793168.0 | 58.63206 | 5.105423e-13 |\n",
373
+ "| reader | reader | 1 | 232 | 66045.36 | 944311.6 | 16.22613 | 7.620176e-05 |\n",
374
+ "| retriever:reader | retriever:reader | 1 | 232 | 158290.44 | 843714.0 | 43.52587 | 2.804257e-10 |\n",
375
+ "\n"
376
+ ],
377
+ "text/plain": [
378
+ " Term Df Df.res Sum Sq Sum Sq.res F value \n",
379
+ "retriever retriever 1 232 200452.90 793168.0 58.63206\n",
380
+ "reader reader 1 232 66045.36 944311.6 16.22613\n",
381
+ "retriever:reader retriever:reader 1 232 158290.44 843714.0 43.52587\n",
382
+ " Pr(>F) \n",
383
+ "retriever 5.105423e-13\n",
384
+ "reader 7.620176e-05\n",
385
+ "retriever:reader 2.804257e-10"
386
+ ]
387
+ },
388
+ "metadata": {},
389
+ "output_type": "display_data"
390
+ },
391
+ {
392
+ "name": "stderr",
393
+ "output_type": "stream",
394
+ "text": [
395
+ "NOTE: Results may be misleading due to involvement in interactions\n",
396
+ "\n"
397
+ ]
398
+ },
399
+ {
400
+ "data": {
401
+ "text/plain": [
402
+ " contrast estimate SE df t.ratio p.value\n",
403
+ " es - faiss 58.3 7.61 232 7.657 <.0001\n",
404
+ "\n",
405
+ "Results are averaged over the levels of: reader "
406
+ ]
407
+ },
408
+ "metadata": {},
409
+ "output_type": "display_data"
410
+ },
411
+ {
412
+ "name": "stderr",
413
+ "output_type": "stream",
414
+ "text": [
415
+ "NOTE: Results may be misleading due to involvement in interactions\n",
416
+ "\n"
417
+ ]
418
+ },
419
+ {
420
+ "data": {
421
+ "text/plain": [
422
+ " contrast estimate SE df t.ratio p.value\n",
423
+ " dpr - longformer -33.5 8.31 232 -4.028 0.0001\n",
424
+ "\n",
425
+ "Results are averaged over the levels of: retriever "
426
+ ]
427
+ },
428
+ "metadata": {},
429
+ "output_type": "display_data"
430
+ }
431
+ ],
432
+ "source": [
433
+ "model.acc <- art(f1 ~ retriever * reader, data = f1_scores)\n",
434
+ "anova(model.acc)\n",
435
+ "art.con(model.acc, ~ retriever)\n",
436
+ "art.con(model.acc, ~ reader)"
437
+ ]
438
+ },
439
+ {
440
+ "cell_type": "markdown",
441
+ "metadata": {},
442
+ "source": [
443
+ "From these results, we can see that both the retriever and the reader have a significant effect on the F1 score ($F = 58.63$ and $F = 16.23$ respectively, $p < 0.0001$ for both). However, there is also an interaction between the retriever and reader ($F = 43.53$, $p < 0.0001$). The post-hoc analysis of contrasts shows that ElasticSearch performs better than FAISS ($p < 0.0001$) and Longformer performs better than DPR ($p = 0.0001$)."
444
+ ]
445
+ }
446
+ ],
447
+ "metadata": {
448
+ "kernelspec": {
449
+ "display_name": "R",
450
+ "language": "R",
451
+ "name": "ir"
452
+ },
453
+ "language_info": {
454
+ "codemirror_mode": "r",
455
+ "file_extension": ".r",
456
+ "mimetype": "text/x-r-source",
457
+ "name": "R",
458
+ "pygments_lexer": "r",
459
+ "version": "4.1.2"
460
+ },
461
+ "orig_nbformat": 4
462
+ },
463
+ "nbformat": 4,
464
+ "nbformat_minor": 2
465
+ }
results/timings_analysis.ipynb ADDED
@@ -0,0 +1,635 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "metadata": {},
6
+ "source": [
7
+ "# Timings"
8
+ ]
9
+ },
10
+ {
11
+ "cell_type": "code",
12
+ "execution_count": 1,
13
+ "metadata": {
14
+ "vscode": {
15
+ "languageId": "r"
16
+ }
17
+ },
18
+ "outputs": [
19
+ {
20
+ "name": "stderr",
21
+ "output_type": "stream",
22
+ "text": [
23
+ "Loading required package: ggplot2\n",
24
+ "\n",
25
+ "-- \u001b[1mAttaching packages\u001b[22m --------------------------------------- tidyverse 1.3.1 --\n",
26
+ "\n",
27
+ "\u001b[32mv\u001b[39m \u001b[34mtibble \u001b[39m 3.1.5 \u001b[32mv\u001b[39m \u001b[34mdplyr \u001b[39m 1.0.7\n",
28
+ "\u001b[32mv\u001b[39m \u001b[34mtidyr \u001b[39m 1.1.4 \u001b[32mv\u001b[39m \u001b[34mstringr\u001b[39m 1.4.0\n",
29
+ "\u001b[32mv\u001b[39m \u001b[34mpurrr \u001b[39m 0.3.4 \u001b[32mv\u001b[39m \u001b[34mforcats\u001b[39m 0.5.1\n",
30
+ "\n",
31
+ "-- \u001b[1mConflicts\u001b[22m ------------------------------------------ tidyverse_conflicts() --\n",
32
+ "\u001b[31mx\u001b[39m \u001b[34mdplyr\u001b[39m::\u001b[32mfilter()\u001b[39m masks \u001b[34mstats\u001b[39m::filter()\n",
33
+ "\u001b[31mx\u001b[39m \u001b[34mdplyr\u001b[39m::\u001b[32mlag()\u001b[39m masks \u001b[34mstats\u001b[39m::lag()\n",
34
+ "\n",
35
+ "Loading required package: mvtnorm\n",
36
+ "\n",
37
+ "Loading required package: survival\n",
38
+ "\n",
39
+ "Loading required package: TH.data\n",
40
+ "\n",
41
+ "Loading required package: MASS\n",
42
+ "\n",
43
+ "\n",
44
+ "Attaching package: 'MASS'\n",
45
+ "\n",
46
+ "\n",
47
+ "The following object is masked from 'package:dplyr':\n",
48
+ "\n",
49
+ " select\n",
50
+ "\n",
51
+ "\n",
52
+ "\n",
53
+ "Attaching package: 'TH.data'\n",
54
+ "\n",
55
+ "\n",
56
+ "The following object is masked from 'package:MASS':\n",
57
+ "\n",
58
+ " geyser\n",
59
+ "\n",
60
+ "\n",
61
+ "Loading required package: carData\n",
62
+ "\n",
63
+ "\n",
64
+ "Attaching package: 'car'\n",
65
+ "\n",
66
+ "\n",
67
+ "The following object is masked from 'package:dplyr':\n",
68
+ "\n",
69
+ " recode\n",
70
+ "\n",
71
+ "\n",
72
+ "The following object is masked from 'package:purrr':\n",
73
+ "\n",
74
+ " some\n",
75
+ "\n",
76
+ "\n",
77
+ "\n",
78
+ "Attaching package: 'rstatix'\n",
79
+ "\n",
80
+ "\n",
81
+ "The following object is masked from 'package:MASS':\n",
82
+ "\n",
83
+ " select\n",
84
+ "\n",
85
+ "\n",
86
+ "The following object is masked from 'package:stats':\n",
87
+ "\n",
88
+ " filter\n",
89
+ "\n",
90
+ "\n"
91
+ ]
92
+ }
93
+ ],
94
+ "source": [
95
+ "library(\"ggpubr\")\n",
96
+ "library(readr)\n",
97
+ "library(ggplot2)\n",
98
+ "library(tidyverse)\n",
99
+ "library(ARTool)\n",
100
+ "library(emmeans)\n",
101
+ "library(multcomp)\n",
102
+ "library(car)\n",
103
+ "library(rstatix)"
104
+ ]
105
+ },
106
+ {
107
+ "cell_type": "code",
108
+ "execution_count": 2,
109
+ "metadata": {
110
+ "vscode": {
111
+ "languageId": "r"
112
+ }
113
+ },
114
+ "outputs": [
115
+ {
116
+ "name": "stderr",
117
+ "output_type": "stream",
118
+ "text": [
119
+ "New names:\n",
120
+ "* `` -> ...1\n",
121
+ "\n",
122
+ "\u001b[1mRows: \u001b[22m\u001b[34m59\u001b[39m \u001b[1mColumns: \u001b[22m\u001b[34m9\u001b[39m\n",
123
+ "\u001b[36m--\u001b[39m \u001b[1mColumn specification\u001b[22m \u001b[36m--------------------------------------------------------\u001b[39m\n",
124
+ "\u001b[1mDelimiter:\u001b[22m \",\"\n",
125
+ "\u001b[32mdbl\u001b[39m (9): ...1, faiss_dpr.retrieve, faiss_dpr.read, faiss_longformer.retrieve...\n",
126
+ "\n",
127
+ "\u001b[36mi\u001b[39m Use `spec()` to retrieve the full column specification for this data.\n",
128
+ "\u001b[36mi\u001b[39m Specify the column types or set `show_col_types = FALSE` to quiet this message.\n"
129
+ ]
130
+ },
131
+ {
132
+ "data": {
133
+ "text/html": [
134
+ "<table class=\"dataframe\">\n",
135
+ "<caption>A tibble: 6 × 5</caption>\n",
136
+ "<thead>\n",
137
+ "\t<tr><th scope=col>question</th><th scope=col>retriever</th><th scope=col>reader</th><th scope=col>method</th><th scope=col>time</th></tr>\n",
138
+ "\t<tr><th scope=col>&lt;dbl&gt;</th><th scope=col>&lt;fct&gt;</th><th scope=col>&lt;fct&gt;</th><th scope=col>&lt;fct&gt;</th><th scope=col>&lt;dbl&gt;</th></tr>\n",
139
+ "</thead>\n",
140
+ "<tbody>\n",
141
+ "\t<tr><td>0</td><td>faiss</td><td>dpr </td><td>retrieve</td><td>0.30384302</td></tr>\n",
142
+ "\t<tr><td>0</td><td>faiss</td><td>dpr </td><td>read </td><td>4.56640005</td></tr>\n",
143
+ "\t<tr><td>0</td><td>faiss</td><td>longformer</td><td>retrieve</td><td>0.92279482</td></tr>\n",
144
+ "\t<tr><td>0</td><td>faiss</td><td>longformer</td><td>read </td><td>5.76836824</td></tr>\n",
145
+ "\t<tr><td>0</td><td>es </td><td>dpr </td><td>retrieve</td><td>0.01930094</td></tr>\n",
146
+ "\t<tr><td>0</td><td>es </td><td>dpr </td><td>read </td><td>2.74536490</td></tr>\n",
147
+ "</tbody>\n",
148
+ "</table>\n"
149
+ ],
150
+ "text/latex": [
151
+ "A tibble: 6 × 5\n",
152
+ "\\begin{tabular}{lllll}\n",
153
+ " question & retriever & reader & method & time\\\\\n",
154
+ " <dbl> & <fct> & <fct> & <fct> & <dbl>\\\\\n",
155
+ "\\hline\n",
156
+ "\t 0 & faiss & dpr & retrieve & 0.30384302\\\\\n",
157
+ "\t 0 & faiss & dpr & read & 4.56640005\\\\\n",
158
+ "\t 0 & faiss & longformer & retrieve & 0.92279482\\\\\n",
159
+ "\t 0 & faiss & longformer & read & 5.76836824\\\\\n",
160
+ "\t 0 & es & dpr & retrieve & 0.01930094\\\\\n",
161
+ "\t 0 & es & dpr & read & 2.74536490\\\\\n",
162
+ "\\end{tabular}\n"
163
+ ],
164
+ "text/markdown": [
165
+ "\n",
166
+ "A tibble: 6 × 5\n",
167
+ "\n",
168
+ "| question &lt;dbl&gt; | retriever &lt;fct&gt; | reader &lt;fct&gt; | method &lt;fct&gt; | time &lt;dbl&gt; |\n",
169
+ "|---|---|---|---|---|\n",
170
+ "| 0 | faiss | dpr | retrieve | 0.30384302 |\n",
171
+ "| 0 | faiss | dpr | read | 4.56640005 |\n",
172
+ "| 0 | faiss | longformer | retrieve | 0.92279482 |\n",
173
+ "| 0 | faiss | longformer | read | 5.76836824 |\n",
174
+ "| 0 | es | dpr | retrieve | 0.01930094 |\n",
175
+ "| 0 | es | dpr | read | 2.74536490 |\n",
176
+ "\n"
177
+ ],
178
+ "text/plain": [
179
+ " question retriever reader method time \n",
180
+ "1 0 faiss dpr retrieve 0.30384302\n",
181
+ "2 0 faiss dpr read 4.56640005\n",
182
+ "3 0 faiss longformer retrieve 0.92279482\n",
183
+ "4 0 faiss longformer read 5.76836824\n",
184
+ "5 0 es dpr retrieve 0.01930094\n",
185
+ "6 0 es dpr read 2.74536490"
186
+ ]
187
+ },
188
+ "metadata": {},
189
+ "output_type": "display_data"
190
+ }
191
+ ],
192
+ "source": [
193
+ "timings <- read_csv(\"timings.csv\") %>%\n",
194
+ " rename(question = `...1`) %>%\n",
195
+ " pivot_longer(!question, names_to=c(\"retriever\", \"reader\", \"method\"), names_sep=\"[._]\", values_to=\"time\")\n",
196
+ "\n",
197
+ "timings$retriever <- as.factor(timings$retriever)\n",
198
+ "timings$reader <- as.factor(timings$reader)\n",
199
+ "timings$method <- as.factor(timings$method)\n",
200
+ "\n",
201
+ "head(timings)"
202
+ ]
203
+ },
204
+ {
205
+ "cell_type": "code",
206
+ "execution_count": 3,
207
+ "metadata": {
208
+ "vscode": {
209
+ "languageId": "r"
210
+ }
211
+ },
212
+ "outputs": [],
213
+ "source": [
214
+ "timings_read <- filter(timings, method == \"read\") %>%\n",
215
+ " select(!method)\n",
216
+ "timings_retrieve <- filter(timings, method == \"retrieve\") %>%\n",
217
+ " select(!method)"
218
+ ]
219
+ },
220
+ {
221
+ "cell_type": "markdown",
222
+ "metadata": {},
223
+ "source": [
224
+ "To decide which statistical tests we can use, we first need to check for normality. For this, we use a Shapiro-Wilk test of normality. As you can see in the results below, all $p$-values are lower than 0.001, so we reject the null hypothesis of normality and conclude that none of the timings are normally distributed."
225
+ ]
226
+ },
227
+ {
228
+ "cell_type": "code",
229
+ "execution_count": 4,
230
+ "metadata": {
231
+ "vscode": {
232
+ "languageId": "r"
233
+ }
234
+ },
235
+ "outputs": [
236
+ {
237
+ "data": {
238
+ "text/html": [
239
+ "<table class=\"dataframe\">\n",
240
+ "<caption>A tibble: 2 × 3</caption>\n",
241
+ "<thead>\n",
242
+ "\t<tr><th scope=col>retriever</th><th scope=col>sw.stat</th><th scope=col>sw.p</th></tr>\n",
243
+ "\t<tr><th scope=col>&lt;fct&gt;</th><th scope=col>&lt;dbl&gt;</th><th scope=col>&lt;dbl&gt;</th></tr>\n",
244
+ "</thead>\n",
245
+ "<tbody>\n",
246
+ "\t<tr><td>es </td><td>0.7534261</td><td>1.667341e-18</td></tr>\n",
247
+ "\t<tr><td>faiss</td><td>0.7585727</td><td>2.563192e-18</td></tr>\n",
248
+ "</tbody>\n",
249
+ "</table>\n"
250
+ ],
251
+ "text/latex": [
252
+ "A tibble: 2 × 3\n",
253
+ "\\begin{tabular}{lll}\n",
254
+ " retriever & sw.stat & sw.p\\\\\n",
255
+ " <fct> & <dbl> & <dbl>\\\\\n",
256
+ "\\hline\n",
257
+ "\t es & 0.7534261 & 1.667341e-18\\\\\n",
258
+ "\t faiss & 0.7585727 & 2.563192e-18\\\\\n",
259
+ "\\end{tabular}\n"
260
+ ],
261
+ "text/markdown": [
262
+ "\n",
263
+ "A tibble: 2 × 3\n",
264
+ "\n",
265
+ "| retriever &lt;fct&gt; | sw.stat &lt;dbl&gt; | sw.p &lt;dbl&gt; |\n",
266
+ "|---|---|---|\n",
267
+ "| es | 0.7534261 | 1.667341e-18 |\n",
268
+ "| faiss | 0.7585727 | 2.563192e-18 |\n",
269
+ "\n"
270
+ ],
271
+ "text/plain": [
272
+ " retriever sw.stat sw.p \n",
273
+ "1 es 0.7534261 1.667341e-18\n",
274
+ "2 faiss 0.7585727 2.563192e-18"
275
+ ]
276
+ },
277
+ "metadata": {},
278
+ "output_type": "display_data"
279
+ },
280
+ {
281
+ "data": {
282
+ "text/html": [
283
+ "<table class=\"dataframe\">\n",
284
+ "<caption>A tibble: 2 × 3</caption>\n",
285
+ "<thead>\n",
286
+ "\t<tr><th scope=col>reader</th><th scope=col>sw.stat</th><th scope=col>sw.p</th></tr>\n",
287
+ "\t<tr><th scope=col>&lt;fct&gt;</th><th scope=col>&lt;dbl&gt;</th><th scope=col>&lt;dbl&gt;</th></tr>\n",
288
+ "</thead>\n",
289
+ "<tbody>\n",
290
+ "\t<tr><td>dpr </td><td>0.7639005</td><td>4.029344e-18</td></tr>\n",
291
+ "\t<tr><td>longformer</td><td>0.8116362</td><td>3.381683e-16</td></tr>\n",
292
+ "</tbody>\n",
293
+ "</table>\n"
294
+ ],
295
+ "text/latex": [
296
+ "A tibble: 2 × 3\n",
297
+ "\\begin{tabular}{lll}\n",
298
+ " reader & sw.stat & sw.p\\\\\n",
299
+ " <fct> & <dbl> & <dbl>\\\\\n",
300
+ "\\hline\n",
301
+ "\t dpr & 0.7639005 & 4.029344e-18\\\\\n",
302
+ "\t longformer & 0.8116362 & 3.381683e-16\\\\\n",
303
+ "\\end{tabular}\n"
304
+ ],
305
+ "text/markdown": [
306
+ "\n",
307
+ "A tibble: 2 × 3\n",
308
+ "\n",
309
+ "| reader &lt;fct&gt; | sw.stat &lt;dbl&gt; | sw.p &lt;dbl&gt; |\n",
310
+ "|---|---|---|\n",
311
+ "| dpr | 0.7639005 | 4.029344e-18 |\n",
312
+ "| longformer | 0.8116362 | 3.381683e-16 |\n",
313
+ "\n"
314
+ ],
315
+ "text/plain": [
316
+ " reader sw.stat sw.p \n",
317
+ "1 dpr 0.7639005 4.029344e-18\n",
318
+ "2 longformer 0.8116362 3.381683e-16"
319
+ ]
320
+ },
321
+ "metadata": {},
322
+ "output_type": "display_data"
323
+ },
324
+ {
325
+ "data": {
326
+ "text/html": [
327
+ "<table class=\"dataframe\">\n",
328
+ "<caption>A tibble: 2 × 3</caption>\n",
329
+ "<thead>\n",
330
+ "\t<tr><th scope=col>method</th><th scope=col>sw.stat</th><th scope=col>sw.p</th></tr>\n",
331
+ "\t<tr><th scope=col>&lt;fct&gt;</th><th scope=col>&lt;dbl&gt;</th><th scope=col>&lt;dbl&gt;</th></tr>\n",
332
+ "</thead>\n",
333
+ "<tbody>\n",
334
+ "\t<tr><td>read </td><td>0.8838182</td><td>1.779766e-12</td></tr>\n",
335
+ "\t<tr><td>retrieve</td><td>0.6237773</td><td>1.838892e-22</td></tr>\n",
336
+ "</tbody>\n",
337
+ "</table>\n"
338
+ ],
339
+ "text/latex": [
340
+ "A tibble: 2 × 3\n",
341
+ "\\begin{tabular}{lll}\n",
342
+ " method & sw.stat & sw.p\\\\\n",
343
+ " <fct> & <dbl> & <dbl>\\\\\n",
344
+ "\\hline\n",
345
+ "\t read & 0.8838182 & 1.779766e-12\\\\\n",
346
+ "\t retrieve & 0.6237773 & 1.838892e-22\\\\\n",
347
+ "\\end{tabular}\n"
348
+ ],
349
+ "text/markdown": [
350
+ "\n",
351
+ "A tibble: 2 × 3\n",
352
+ "\n",
353
+ "| method &lt;fct&gt; | sw.stat &lt;dbl&gt; | sw.p &lt;dbl&gt; |\n",
354
+ "|---|---|---|\n",
355
+ "| read | 0.8838182 | 1.779766e-12 |\n",
356
+ "| retrieve | 0.6237773 | 1.838892e-22 |\n",
357
+ "\n"
358
+ ],
359
+ "text/plain": [
360
+ " method sw.stat sw.p \n",
361
+ "1 read 0.8838182 1.779766e-12\n",
362
+ "2 retrieve 0.6237773 1.838892e-22"
363
+ ]
364
+ },
365
+ "metadata": {},
366
+ "output_type": "display_data"
367
+ }
368
+ ],
369
+ "source": [
370
+ "timings %>%\n",
371
+ " group_by(retriever) %>%\n",
372
+ " summarise(sw.stat = shapiro.test(time)$statistic,\n",
373
+ " sw.p = shapiro.test(time)$p)\n",
374
+ "\n",
375
+ "timings %>%\n",
376
+ " group_by(reader) %>%\n",
377
+ " summarise(sw.stat = shapiro.test(time)$statistic,\n",
378
+ " sw.p = shapiro.test(time)$p)\n",
379
+ "\n",
380
+ "timings %>%\n",
381
+ " group_by(method) %>%\n",
382
+ " summarise(sw.stat = shapiro.test(time)$statistic,\n",
383
+ " sw.p = shapiro.test(time)$p)"
384
+ ]
385
+ },
386
+ {
387
+ "cell_type": "markdown",
388
+ "metadata": {},
389
+ "source": [
390
+ "Since our data is not normally distributed, we cannot use an ANOVA to compare our results. Therefore, we use an aligned rank transform (ART) test, which is a non-parametric version of a factorial repeated measures ANOVA."
391
+ ]
392
+ },
393
+ {
394
+ "cell_type": "code",
395
+ "execution_count": 5,
396
+ "metadata": {
397
+ "vscode": {
398
+ "languageId": "r"
399
+ }
400
+ },
401
+ "outputs": [
402
+ {
403
+ "data": {
404
+ "text/html": [
405
+ "<table class=\"dataframe\">\n",
406
+ "<caption>A anova.art: 3 × 7</caption>\n",
407
+ "<thead>\n",
408
+ "\t<tr><th></th><th scope=col>Term</th><th scope=col>Df</th><th scope=col>Df.res</th><th scope=col>Sum Sq</th><th scope=col>Sum Sq.res</th><th scope=col>F value</th><th scope=col>Pr(&gt;F)</th></tr>\n",
409
+ "\t<tr><th></th><th scope=col>&lt;chr&gt;</th><th scope=col>&lt;dbl&gt;</th><th scope=col>&lt;dbl&gt;</th><th scope=col>&lt;dbl&gt;</th><th scope=col>&lt;dbl&gt;</th><th scope=col>&lt;dbl&gt;</th><th scope=col>&lt;dbl&gt;</th></tr>\n",
410
+ "</thead>\n",
411
+ "<tbody>\n",
412
+ "\t<tr><th scope=row>retriever</th><td>retriever </td><td>1</td><td>232</td><td> 41088.97</td><td>1037631.8</td><td> 9.18692</td><td>2.714084e-03</td></tr>\n",
413
+ "\t<tr><th scope=row>reader</th><td>reader </td><td>1</td><td>232</td><td>790427.81</td><td> 301414.1</td><td>608.39633</td><td>8.802730e-67</td></tr>\n",
414
+ "\t<tr><th scope=row>retriever:reader</th><td>retriever:reader</td><td>1</td><td>232</td><td>101903.46</td><td> 983331.4</td><td> 24.04235</td><td>1.771995e-06</td></tr>\n",
415
+ "</tbody>\n",
416
+ "</table>\n"
417
+ ],
418
+ "text/latex": [
419
+ "A anova.art: 3 × 7\n",
420
+ "\\begin{tabular}{r|lllllll}\n",
421
+ " & Term & Df & Df.res & Sum Sq & Sum Sq.res & F value & Pr(>F)\\\\\n",
422
+ " & <chr> & <dbl> & <dbl> & <dbl> & <dbl> & <dbl> & <dbl>\\\\\n",
423
+ "\\hline\n",
424
+ "\tretriever & retriever & 1 & 232 & 41088.97 & 1037631.8 & 9.18692 & 2.714084e-03\\\\\n",
425
+ "\treader & reader & 1 & 232 & 790427.81 & 301414.1 & 608.39633 & 8.802730e-67\\\\\n",
426
+ "\tretriever:reader & retriever:reader & 1 & 232 & 101903.46 & 983331.4 & 24.04235 & 1.771995e-06\\\\\n",
427
+ "\\end{tabular}\n"
428
+ ],
429
+ "text/markdown": [
430
+ "\n",
431
+ "A anova.art: 3 × 7\n",
432
+ "\n",
433
+ "| <!--/--> | Term &lt;chr&gt; | Df &lt;dbl&gt; | Df.res &lt;dbl&gt; | Sum Sq &lt;dbl&gt; | Sum Sq.res &lt;dbl&gt; | F value &lt;dbl&gt; | Pr(&gt;F) &lt;dbl&gt; |\n",
434
+ "|---|---|---|---|---|---|---|---|\n",
435
+ "| retriever | retriever | 1 | 232 | 41088.97 | 1037631.8 | 9.18692 | 2.714084e-03 |\n",
436
+ "| reader | reader | 1 | 232 | 790427.81 | 301414.1 | 608.39633 | 8.802730e-67 |\n",
437
+ "| retriever:reader | retriever:reader | 1 | 232 | 101903.46 | 983331.4 | 24.04235 | 1.771995e-06 |\n",
438
+ "\n"
439
+ ],
440
+ "text/plain": [
441
+ " Term Df Df.res Sum Sq Sum Sq.res F value \n",
442
+ "retriever retriever 1 232 41088.97 1037631.8 9.18692\n",
443
+ "reader reader 1 232 790427.81 301414.1 608.39633\n",
444
+ "retriever:reader retriever:reader 1 232 101903.46 983331.4 24.04235\n",
445
+ " Pr(>F) \n",
446
+ "retriever 2.714084e-03\n",
447
+ "reader 8.802730e-67\n",
448
+ "retriever:reader 1.771995e-06"
449
+ ]
450
+ },
451
+ "metadata": {},
452
+ "output_type": "display_data"
453
+ },
454
+ {
455
+ "name": "stderr",
456
+ "output_type": "stream",
457
+ "text": [
458
+ "NOTE: Results may be misleading due to involvement in interactions\n",
459
+ "\n"
460
+ ]
461
+ },
462
+ {
463
+ "data": {
464
+ "text/plain": [
465
+ " contrast estimate SE df t.ratio p.value\n",
466
+ " es - faiss 26.4 8.71 232 3.031 0.0027\n",
467
+ "\n",
468
+ "Results are averaged over the levels of: reader "
469
+ ]
470
+ },
471
+ "metadata": {},
472
+ "output_type": "display_data"
473
+ },
474
+ {
475
+ "name": "stderr",
476
+ "output_type": "stream",
477
+ "text": [
478
+ "NOTE: Results may be misleading due to involvement in interactions\n",
479
+ "\n"
480
+ ]
481
+ },
482
+ {
483
+ "data": {
484
+ "text/plain": [
485
+ " contrast estimate SE df t.ratio p.value\n",
486
+ " dpr - longformer -116 4.69 232 -24.666 <.0001\n",
487
+ "\n",
488
+ "Results are averaged over the levels of: retriever "
489
+ ]
490
+ },
491
+ "metadata": {},
492
+ "output_type": "display_data"
493
+ }
494
+ ],
495
+ "source": [
496
+ "model.acc <- art(time ~ retriever * reader, data = timings_read)\n",
497
+ "anova(model.acc)\n",
498
+ "art.con(model.acc, ~ retriever)\n",
499
+ "art.con(model.acc, ~ reader)"
500
+ ]
501
+ },
502
+ {
503
+ "cell_type": "code",
504
+ "execution_count": 6,
505
+ "metadata": {
506
+ "vscode": {
507
+ "languageId": "r"
508
+ }
509
+ },
510
+ "outputs": [
511
+ {
512
+ "data": {
513
+ "text/html": [
514
+ "<table class=\"dataframe\">\n",
515
+ "<caption>A anova.art: 3 × 7</caption>\n",
516
+ "<thead>\n",
517
+ "\t<tr><th></th><th scope=col>Term</th><th scope=col>Df</th><th scope=col>Df.res</th><th scope=col>Sum Sq</th><th scope=col>Sum Sq.res</th><th scope=col>F value</th><th scope=col>Pr(&gt;F)</th></tr>\n",
518
+ "\t<tr><th></th><th scope=col>&lt;chr&gt;</th><th scope=col>&lt;dbl&gt;</th><th scope=col>&lt;dbl&gt;</th><th scope=col>&lt;dbl&gt;</th><th scope=col>&lt;dbl&gt;</th><th scope=col>&lt;dbl&gt;</th><th scope=col>&lt;dbl&gt;</th></tr>\n",
519
+ "</thead>\n",
520
+ "<tbody>\n",
521
+ "\t<tr><th scope=row>retriever</th><td>retriever </td><td>1</td><td>232</td><td>821516</td><td>240071.9</td><td>793.8944</td><td>7.630526e-77</td></tr>\n",
522
+ "\t<tr><th scope=row>reader</th><td>reader </td><td>1</td><td>232</td><td>821516</td><td>214935.3</td><td>886.7398</td><td>3.256422e-81</td></tr>\n",
523
+ "\t<tr><th scope=row>retriever:reader</th><td>retriever:reader</td><td>1</td><td>232</td><td>821516</td><td>215501.6</td><td>884.4096</td><td>4.148583e-81</td></tr>\n",
524
+ "</tbody>\n",
525
+ "</table>\n"
526
+ ],
527
+ "text/latex": [
528
+ "A anova.art: 3 × 7\n",
529
+ "\\begin{tabular}{r|lllllll}\n",
530
+ " & Term & Df & Df.res & Sum Sq & Sum Sq.res & F value & Pr(>F)\\\\\n",
531
+ " & <chr> & <dbl> & <dbl> & <dbl> & <dbl> & <dbl> & <dbl>\\\\\n",
532
+ "\\hline\n",
533
+ "\tretriever & retriever & 1 & 232 & 821516 & 240071.9 & 793.8944 & 7.630526e-77\\\\\n",
534
+ "\treader & reader & 1 & 232 & 821516 & 214935.3 & 886.7398 & 3.256422e-81\\\\\n",
535
+ "\tretriever:reader & retriever:reader & 1 & 232 & 821516 & 215501.6 & 884.4096 & 4.148583e-81\\\\\n",
536
+ "\\end{tabular}\n"
537
+ ],
538
+ "text/markdown": [
539
+ "\n",
540
+ "A anova.art: 3 × 7\n",
541
+ "\n",
542
+ "| <!--/--> | Term &lt;chr&gt; | Df &lt;dbl&gt; | Df.res &lt;dbl&gt; | Sum Sq &lt;dbl&gt; | Sum Sq.res &lt;dbl&gt; | F value &lt;dbl&gt; | Pr(&gt;F) &lt;dbl&gt; |\n",
543
+ "|---|---|---|---|---|---|---|---|\n",
544
+ "| retriever | retriever | 1 | 232 | 821516 | 240071.9 | 793.8944 | 7.630526e-77 |\n",
545
+ "| reader | reader | 1 | 232 | 821516 | 214935.3 | 886.7398 | 3.256422e-81 |\n",
546
+ "| retriever:reader | retriever:reader | 1 | 232 | 821516 | 215501.6 | 884.4096 | 4.148583e-81 |\n",
547
+ "\n"
548
+ ],
549
+ "text/plain": [
550
+ " Term Df Df.res Sum Sq Sum Sq.res F value \n",
551
+ "retriever retriever 1 232 821516 240071.9 793.8944\n",
552
+ "reader reader 1 232 821516 214935.3 886.7398\n",
553
+ "retriever:reader retriever:reader 1 232 821516 215501.6 884.4096\n",
554
+ " Pr(>F) \n",
555
+ "retriever 7.630526e-77\n",
556
+ "reader 3.256422e-81\n",
557
+ "retriever:reader 4.148583e-81"
558
+ ]
559
+ },
560
+ "metadata": {},
561
+ "output_type": "display_data"
562
+ },
563
+ {
564
+ "name": "stderr",
565
+ "output_type": "stream",
566
+ "text": [
567
+ "NOTE: Results may be misleading due to involvement in interactions\n",
568
+ "\n"
569
+ ]
570
+ },
571
+ {
572
+ "data": {
573
+ "text/plain": [
574
+ " contrast estimate SE df t.ratio p.value\n",
575
+ " es - faiss -118 4.19 232 -28.176 <.0001\n",
576
+ "\n",
577
+ "Results are averaged over the levels of: reader "
578
+ ]
579
+ },
580
+ "metadata": {},
581
+ "output_type": "display_data"
582
+ },
583
+ {
584
+ "name": "stderr",
585
+ "output_type": "stream",
586
+ "text": [
587
+ "NOTE: Results may be misleading due to involvement in interactions\n",
588
+ "\n"
589
+ ]
590
+ },
591
+ {
592
+ "data": {
593
+ "text/plain": [
594
+ " contrast estimate SE df t.ratio p.value\n",
595
+ " dpr - longformer -118 3.96 232 -29.778 <.0001\n",
596
+ "\n",
597
+ "Results are averaged over the levels of: retriever "
598
+ ]
599
+ },
600
+ "metadata": {},
601
+ "output_type": "display_data"
602
+ }
603
+ ],
604
+ "source": [
605
+ "model.acc <- art(time ~ retriever * reader, data = timings_retrieve)\n",
606
+ "anova(model.acc)\n",
607
+ "art.con(model.acc, ~ retriever)\n",
608
+ "art.con(model.acc, ~ reader)"
609
+ ]
610
+ },
611
+ {
612
+ "cell_type": "markdown",
613
+ "metadata": {},
614
+ "source": []
615
+ }
616
+ ],
617
+ "metadata": {
618
+ "kernelspec": {
619
+ "display_name": "R",
620
+ "language": "R",
621
+ "name": "ir"
622
+ },
623
+ "language_info": {
624
+ "codemirror_mode": "r",
625
+ "file_extension": ".r",
626
+ "mimetype": "text/x-r-source",
627
+ "name": "R",
628
+ "pygments_lexer": "r",
629
+ "version": "4.1.2"
630
+ },
631
+ "orig_nbformat": 4
632
+ },
633
+ "nbformat": 4,
634
+ "nbformat_minor": 2
635
+ }