<!DOCTYPE html>
<html>
<head>
    <script src="distill.bundle.js" type="module" fetchpriority="high" blocking></script>
    <script src="main.bundle.js" type="module" fetchpriority="low" defer></script>
    <meta name="viewport" content="width=device-width, initial-scale=1">
    <meta charset="utf8">
    <base target="_blank">
    <title>FAT5: Flash Attention T5</title>
    <link rel="stylesheet" href="style.css">
</head>

<body>
<d-front-matter>
    <script id='distill-front-matter' type="text/json">{
    "title": "FAT5 : Flash Attention T5",
    "description": "",
    "published": "May 28, 2024", 
    "authors": [
      {
        "author":"Boris ALBAR",
        "authorURL":"https://github.com/b-albar",
        "affiliation": [{"name": "CATIE", "url": "https://catie.fr"}]
      },
      {
        "author":"Loïck BOURDOIS",
        "authorURL":"https://github.com/lbourdois",
        "affiliation": [{"name": "CATIE", "url": "https://catie.fr"}]
      }
    ],
    "color": "#9CA3AF",
    "katex": {
      "delimiters": [
        {"left": "$$", "right": "$$", "display": false}
      ]
    }
  }
    </script>
</d-front-matter>
<d-title>
    <h1 class="l-page" style="text-align: center;">FAT5: Flash Attention T5</h1>
    <p><img src="./assets/FAT5_dark.gif" alt="FAT5" width="100%"></p>
</d-title>

<d-article>
    <d-contents>
    </d-contents>

  <div class="note"> For a better experience, we do not recommend reading on a cell phone. </div>
  
    <h2 id="motivation">Motivation</h2>
    <p class="width_125">
    While much effort has been devoted to optimising decoder-only transformers, leaving the encoder somewhat neglected, we believe it is essential to maintain an encoder-decoder architecture.<br>

    Indeed, this architecture, which offers interesting performance for instruction tuning <d-cite bibtex-key="chia2023instructeval"></d-cite>, is well suited to distillation <d-cite bibtex-key="hsieh2023distilling"></d-cite> and seems superior to decoder-only models when finetuned <d-cite bibtex-key="fu2024tiny"></d-cite>.
    It has also been shown that encoder-decoder models trained with masked language modelling achieve better zero-shot performance after multitask finetuning than decoder-only models <d-cite bibtex-key="wang2022languagemodelarchitecturepretraining"></d-cite>.<br>
    Beyond NLP, which is the focus of this blog post, the encoder-decoder architecture is widely used in other fields such as audio or time series. Note also that the encoder of such an architecture is used in some diffusion models.<br>    
      
    That's why we've decided to focus on the T5 <d-cite bibtex-key="JMLR:v21:20-074"></d-cite>.<br><br>

    This article presents the optimisations we have implemented to efficiently pre-train a French T5 with 147M parameters in a reasonable time (1,461 hours for 419B tokens) and with limited resources (a single A100, i.e. a computing budget of around 2,200 euros).
    To achieve this, we designed CUDA/Triton kernels to make Flash Attention compatible with T5 and to provide linear-memory inference, thus extending the context size that the model can handle.<br><br>    
    <strong>The pre-training code is available in our <a class="link" href="https://github.com/catie-aq/flashT5">GitHub repository</a> under Apache-2.0 license and weights on our <a class="link" href="https://hf.co/CATIE-AQ">Hugging Face</a> account.</strong>
    <p class="width_125"><br><br><br></p>

  
    <h2 id="vue-d-ensemble-de-notre-travail">Overview of our work</h2>
    <p class="width_125">We therefore chose to work with a T5 and in practice with the nanoT5 <d-cite bibtex-key="nawrot2023nanot5"></d-cite>.<br>
    For pretext tasks during pre-training, we followed the UL2 ones <d-cite bibtex-key='tay2023ul2'></d-cite> with the following 7 tasks:</p>
    <pre><code class="lang-py">
            denoiser_list=[
                {<span class="hljs-string">"mu"</span>: <span class="hljs-number">3.0</span>, <span class="hljs-string">"r"</span>: <span class="hljs-number">0</span>.<span class="hljs-number">15</span>, <span class="hljs-string">"max_spans"</span>: max_token_length, <span class="hljs-string">"prefix"</span>: <span class="hljs-string">"[R]"</span>},
                {<span class="hljs-string">"mu"</span>: <span class="hljs-number">8.0</span>, <span class="hljs-string">"r"</span>: <span class="hljs-number">0</span>.<span class="hljs-number">15</span>, <span class="hljs-string">"max_spans"</span>: max_token_length, <span class="hljs-string">"prefix"</span>: <span class="hljs-string">"[R]"</span>},
                {<span class="hljs-string">"mu"</span>: <span class="hljs-number">4.0</span>, <span class="hljs-string">"r"</span>: <span class="hljs-number">0</span>.<span class="hljs-number">0</span>, <span class="hljs-string">"max_spans"</span>: <span class="hljs-number">1</span>, <span class="hljs-string">"prefix"</span>: <span class="hljs-string">"[S]"</span>},
                {<span class="hljs-string">"mu"</span>: <span class="hljs-number">3.0</span>, <span class="hljs-string">"r"</span>: <span class="hljs-number">0</span>.<span class="hljs-number">5</span>, <span class="hljs-string">"max_spans"</span>: max_token_length, <span class="hljs-string">"prefix"</span>: <span class="hljs-string">"[X]"</span>},
                {<span class="hljs-string">"mu"</span>: <span class="hljs-number">8.0</span>, <span class="hljs-string">"r"</span>: <span class="hljs-number">0</span>.<span class="hljs-number">15</span>, <span class="hljs-string">"max_spans"</span>: max_token_length, <span class="hljs-string">"prefix"</span>: <span class="hljs-string">"[X]"</span>},
                {<span class="hljs-string">"mu"</span>: <span class="hljs-number">64.0</span>, <span class="hljs-string">"r"</span>: <span class="hljs-number">0</span>.<span class="hljs-number">15</span>, <span class="hljs-string">"max_spans"</span>: max_token_length, <span class="hljs-string">"prefix"</span>: <span class="hljs-string">"[X]"</span>},
                {<span class="hljs-string">"mu"</span>: <span class="hljs-number">64.0</span>, <span class="hljs-string">"r"</span>: <span class="hljs-number">0</span>.<span class="hljs-number">5</span>, <span class="hljs-string">"max_spans"</span>: max_token_length, <span class="hljs-string">"prefix"</span>: <span class="hljs-string">"[X]"</span>}]
            denoiser_proportions=[<span class="hljs-number">0</span>.<span class="hljs-number">165</span>, <span class="hljs-number">0</span>.<span class="hljs-number">165</span>, <span class="hljs-number">0</span>.<span class="hljs-number">34</span>, <span class="hljs-number">0</span>.0825, <span class="hljs-number">0</span>.0825, <span class="hljs-number">0</span>.0825, <span class="hljs-number">0</span>.0825]
    </code></pre>
    <p class="width_125">with <code>mu</code> the n-gram size, <code>r</code> the masking percentage in the n-gram and <code>prefix</code> the type of pretext task. 
    The meaning of letters <code>[R]</code>, <code>[S]</code> and <code>[X]</code> is described <a class="link" href="https://huggingface.co/google/ul2#mixture-of-denoisers">here</a> 
    and we invite you to take a look at the following <a class="link" href="https://raw.githubusercontent.com/google-research/google-research/master/ul2/figs/mod.png">image</a> in particular.</p>
    <p class="width_125">For a quick training, we decided to focus on the Flash Attention <d-cite bibtex-key="dao2022flashattention"></d-cite>. 
    However, as it does not manage the attentional (additive) biases of the T5, we had to extend it by developing a custom kernel. 
    More specifically, we successively developed two versions of this kernel. 
    In the first version, at the start of our work, we transmitted the bias matrix to the kernel. 
    In the current version, inspired by TurboT5 <d-cite bibtex-key='turbot5'></d-cite>, we only communicate a tensor containing the merged biases in order to materialise the bias matrix on the fly. 
    This makes it possible to switch from a T5 with quadratic memory to a T5 with linear memory, and consequently greatly increases the size of context that the model can support.</p>    
  
  <p class="width_125">Our work resulted in the pre-training of a T5 in French with 147M parameters: the FAT5 <i>small</i>.<br> 
    The dataset we used is made up of the French part of the CulturaX corpus <d-cite bibtex-key='nguyen2023culturax'></d-cite> (the main source with over 1258 GB of text), 
    the French part of Wikipedia <d-cite bibtex-key="wikidump"></d-cite> (dump 20231101), 
    justice_fr (French legal texts) <d-cite bibtex-key="justice_fr"></d-cite> 
    and 25,000,000 lines from TheStack <d-cite bibtex-key="Kocetkov2022TheStack"></d-cite> 
    (the idea here is to show our model a bit of code, although this is not our main objective).<br>
    
    This model was evaluated on five tasks: text summarization, binary classification, question answering, named entity recognition and sentence similarity.</p>   
  <p class="width_125"><br><br><br></p>
  
    <h2 id="les-d-tails-de-la-recette">Recipe details</h2>
    <p class="width_125">With only two A100 (one 80GB and one 40GB), we had to spend some time implementing optimisations to get the best out of our hardware.
      Indeed, before even training a model, or even modifying its architecture, we need to ensure that we are optimising the use of our GPUs' computing capacity. 
      There are several factors that can explain sub-optimal training of a deep learning model:<br>
    &nbsp; • Disk-bounded<br>
    &nbsp; • Memory-bounded<br>
    &nbsp; • Compute-bounded</p>

    <p class="width_125">Ideally, we would like the model to be limited by the speed of calculation, i.e. the GPU to be used at full capacity. 
      With this in mind, we worked on three main points: <br>
    &nbsp; • GPU disk optimisation <br>
    &nbsp; • GPU memory bandwidth optimisation <br>
    &nbsp; • Optimisation of the use of Tensor Cores<br>
    </p>
  
    <p class="width_125">So it's a combination of hardware and software issues.</p>
    <p></p>
    <p class="width_125">In the rest of this section, everything we have done/implemented to address the limitations encountered is available in a green box. Notes/comments can be found in a blue box. 
    <br><br></p>
    <h3 id="optimisation-du-disque-du-gpu">GPU disk optimisation</h3>
    <p class="width_125">Disk limitation occurs either during data loading or during pre-processing operations. 
      In both cases, the problem manifests itself as slowness.
    <br></p>
    <h4 id="acc-s-disques">Disk access</h4>
    <p class="width_125">If the limitation comes from disk access, there are several possible solutions:</p>
    <ul>
    <li><p class="width_125"><u>Put data in RAM</u><br>
      This solves the problem radically, but assumes that the dataset fits into RAM, which is far from obvious given the latter's limited size.</p>
    <div class="tip"><p>So this is not the solution we have chosen.</p></div>
    </li>
    <li><p class="width_125"><u>Put data on a faster and/or less-used disk</u><br>
      If you have physical access to your GPU server, it is very useful to integrate <a class="link" href="https://fr.wikipedia.org/wiki/NVM_Express">NVMe</a> in its configuration.</p>
    <p class="width_125">You also need to be careful not to have too many processes from different training pulling on the same disc. 
      It is therefore preferable to have several small discs rather than one large one.</p>
    <div class="note"><p>A beneficial indirect effect is that such a configuration costs less 😉</p></div>
    </li>
    </ul>    
    <ul>    
    <li class="width_125"><u>Use more efficient file formats, particularly in terms of random accesses</u><br>
      For example <code>.parquet</code> files are more efficient than <code>.csv</code>. 
      We can also use formats specifically developed for this purpose, such as the <code>.beton</code> from ffcv <d-cite bibtex-key="leclerc2023ffcv"></d-cite>.</li>
    <div class="tip"><p>We use the Datasets library <d-cite bibtex-key="lhoest2021datasets"></d-cite> to load and process the data we use. 
      With this library, the data is decompressed locally in the <code>Arrow</code> format. 
      Moreover, if the data loaded from the Hugging Face Hub has been added using the <code>push_to_hub()</code> function, 
      then the dataset is converted by default in <code>parquet</code>.</p></div>
    </ul>
    <ul>    
    <li class="width_125"><u>Pre-tokenise data</u><br>
    The most effective option is probably to pre-tokenise the data in order to optimise access.
    In other words, tokenisation takes place in a preliminary stage and not on the fly (a minimal sketch is given just after this list).
    </li>
    <div class="tip"><p>Readers are invited to consult the following 
      <a class="link" href="https://github.com/catie-aq/flashT5/blob/main/examples/minipile/pretokenize_minipile.py">code</a> which
      illustrates how we proceed in our FAT5 tutorial applied to the Minipile dataset <d-cite bibtex-key="kaddour2023minipilechallengedataefficientlanguage"></d-cite>.</p></div>
    </ul>
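    <p class="width_125">As an illustration, here is a minimal sketch of this pre-tokenisation step using the Datasets library (the tokenizer checkpoint below is a stand-in, not our own; our actual script for Minipile is linked in the box above):</p>
    <pre><code class="lang-py">
from datasets import load_dataset
from transformers import AutoTokenizer

# Stand-in tokenizer and corpus: replace with the ones actually used for training.
tokenizer = AutoTokenizer.from_pretrained("google/t5-v1_1-small")
dataset = load_dataset("JeanKaddour/minipile", split="train")

def tokenize(batch):
    # No truncation here: sequences are cut/packed later by the DataCollator.
    return tokenizer(batch["text"], truncation=False, add_special_tokens=False)

tokenized = dataset.map(
    tokenize,
    batched=True,
    num_proc=8,                        # parallel workers for the preprocessing
    remove_columns=dataset.column_names,
)
tokenized.save_to_disk("minipile_pretokenized")  # the DataLoader then only reads token ids
    </code></pre>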
    <p><br></p>
    <h4 id="traitement-des-donn-es">Data processing</h4>
    <p class="width_125">If the limitation comes from the processing of data after they have been uploaded:</p>
    <ul>
    <li><p class="width_125"><u>Several processes can be used to process data in parallel</u><br> 
    For example, the parameter <code>num_workers</code> of the <code>Dataloader</code> of PyTorch <d-cite bibtex-key="paszke2019pytorch"></d-cite>.</p></li>
    <div class="tip"><p>You can find in our code the values we use for this parameter for our FAT5 small <a class="link" href="https://github.com/catie-aq/flashT5/blob/dfe10d498ae0b39082182f807acb509e91992360/configs/fr/fat5-fr-small.yaml#L42">small</a>.</div>
    </ul>
    <ul>
    <li><p class="width_125"><u>The bottleneck can also come from the <code>DataCollator</code></u><br>
    This is especially the case when there are complex tasks to perform (image masking or multiple denoisers on NLP tasks).<br>
    We can then build a custom <code>DataCollator</code> for the task, and apply the traditional methods to optimise its speed. 
    For example, using Numpy's vectorisation allows lists to be processed more quickly than with <code>for</code> loops; 
    generally speaking, Numpy is faster than PyTorch for this type of task. 
    Compilation methods such as numba <d-cite bibtex-key="10.1145/2833157.2833162"></d-cite> can also be used for Python code.</p></li>
    <div class="tip"><p>We followed this principle and developed a custom <code>DataCollator</code> for our FAT5. 
      You can find the code <a class="link" href="https://github.com/catie-aq/flashT5/blob/main/src/data/data_collator_ul2.py">here</a>. 
      It manages UL2 pretext tasks and has a dynamic batch mechanism to reduce padding (more information in the next section)</p></div>
    <div class="note"><p>As there was no implementation of UL2's <code>DataCollator</code> available in PyTorch until now, 
      we hope this may be useful for other work.</p></div>
    </ul>
    <ul>
    <li><p class="width_125"><u>Effective padding</u><br>
    <p class="width_125">When working with sequences, there is a natural tendency to pad a set of sequences in order to build batches. 
      The padding tokens then generate unnecessary calculations.<br>
      The first thing to do is to pad only to the longest sequence in the batch and not to a fixed maximum length. 
      This is the <a class="link" href="https://huggingface.co/learn/nlp-course/chapter3/2?fw=pt#dynamic-padding">dynamic padding</a> technique.<br>
      With this approach, padding tokens may still remain. There are two ways of managing them:<br>
    &nbsp; • use a method for grouping data of similar sizes
      (for example, <a class="link" href="https://huggingface.co/docs/transformers/main_classes/trainer#transformers.TrainingArguments.group_by_length">this parameter</a> 
      in the Transformers library <d-cite bibtex-key="wolf2020huggingfaces"></d-cite> or 
      <a class="link" href="https://discuss.huggingface.co/t/how-to-implement-trainers-group-by-length-in-pytorch/9232">by retrieving this sampler</a> for PyTorch)<br>
    &nbsp; • concatenate different examples in a custom DataCollator.</p>
    <div class="tip"><p>We have opted for the second option and refer the reader back to the
      <a class="link" href="https://github.com/catie-aq/flashT5/blob/main/src/data/data_collator_ul2.py">code</a> our DataCollator.</p></div>
    <div class="note"><p>More optimised heuristics probably need to be put in place. 
      We carried out a test by proposing a
      <a class="link" href="https://github.com/catie-aq/flashT5/blob/dfe10d498ae0b39082182f807acb509e91992360/src/data/data_collator_ul2.py#L45">function</a> 
      in the <code>DataCollator</code> to sort <code>input_ids</code> and <code>labels</code> by descending length.
      However, this is rather time-consuming for a minimal packing gain.
      More work needs to be done on this point.
      </p></div>
    </ul>
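    <p class="width_125">To make the dynamic padding idea concrete, here is a minimal sketch of a collate function that pads each batch to its own longest sequence (our actual DataCollator, linked above, additionally handles the UL2 denoisers and example concatenation):</p>
    <pre><code class="lang-py">
import torch

def dynamic_padding_collate(features, pad_token_id=0):
    # Pad to the longest sequence of this batch, not to a global maximum length.
    max_len = max(len(f["input_ids"]) for f in features)
    input_ids = torch.full((len(features), max_len), pad_token_id, dtype=torch.long)
    attention_mask = torch.zeros((len(features), max_len), dtype=torch.long)
    for i, f in enumerate(features):
        ids = torch.as_tensor(f["input_ids"], dtype=torch.long)
        input_ids[i, : len(ids)] = ids
        attention_mask[i, : len(ids)] = 1
    return {"input_ids": input_ids, "attention_mask": attention_mask}
    </code></pre>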
    <p class="width_125"><br><br></p>

  
    <h3 id="optimisation-de-la-bande-passante-de-la-m-moire-du-gpu">GPU memory bandwidth optimisation</h3>
    <p class="width_125">Memory bandwidth limitation is more difficult to deal with. 
      A memory-limited operation is one whose overall execution time is restricted by memory accesses. 
      This is particularly the case for LLMs, especially at the inference level. 
      The diagnosis can be made from the <a class="link" href="https://pytorch.org/tutorials/intermediate/tensorboard_profiler_tutorial.html">PyTorch profiler</a>:</p>
    <figure class="width_125">
    <img src="https://pytorch.org/tutorials/_static/img/profiler_overview1.png" alt="profiler_overview1.png" width="100%">
    <figcaption><center><i>Source: <a class="link" href="https://pytorch.org/tutorials/_static/img/profiler_overview1.png">https://pytorch.org/tutorials/_static/img/profiler_overview1.png</a></i></center></figcaption>
    </figure>
    <br><br><br>
    <p class="width_125">Another way of establishing a diagnosis is to use a simple <code>nvidia-smi</code>:</p>
    <figure class="width_125">
    <img src="./assets/nvidiasmi.png" alt="nvidiasmi.png" width="100%">
    </figure>
    <br>
    <p class="width_125">Useful for finding out if a problem is present, but gives limited information about the nature of the problem. 
      That's why we prefer the profiler.</p>
    <p><br></p>
    <h4 id="noyau-cuda">CUDA kernel</h4>
    <p class="width_125">The main technique for optimising GPU memory bandwidth is to develop a CUDA kernel that merges several limiting operations into SRAM. 
      This can limit the copying of large matrices into the HBM and then immediately reloading them into SRAM. 
      This is now a common feature of decoder transformers thanks to the <a class="link" href="https://github.com/Dao-AILab/flash-attention">Flash Attention</a>.</p>
    <div class="tip"><p>
    As Flash Attention does not handle the (additive) attention biases of the T5, we extended it by developing a custom CUDA kernel. 
    As mentioned in the introduction, we actually implemented two successive versions of this kernel. 
    Without going into the details of the 650 lines of code in the implementation of the first version (which can be consulted 
      <a class="link" href="https://github.com/Dao-AILab/flash-attention/pull/617">here</a>),
      the general and simplified idea (for a forward pass) is as follows:</p>
    <ul>
    <li>The expected output O, initialised with 0's, is loaded from the HBM to the SRAM, as well as the query Q, the key K, the value V and the biases B.</li>
    <li>Our CUDA kernel calculates the following steps:<br>
    &nbsp; • Compute the matrix S using the matrix product of Q and the transpose of K<br>
    &nbsp; • Compute S', which is the sum of the S matrix and the bias matrix B<br>
    &nbsp; • Compute P, which is the softmax of S’ (computed online, block by block, under the hood)<br>
    &nbsp; • Compute the output O, which is the matrix product of P and V<br>
    </li>
    <li>Output O is written back to the HBM and the SRAM is freed.
    <picture>
    <source media="(prefers-color-scheme: dark)" srcset="./assets/FAT5_dark.gif">
    <img alt="FAT5 animation" src="./assets/FAT5.gif" width="100%">
    </picture>
    </ul>
    <br>
    <p>While the first version of the kernel is generic, the second (available <a class="link" href="https://github.com/Dao-AILab/flash-attention/pull/956">here</a>) 
      is specific to models using relative positional encoding (which is the case of the T5).
     The general and simplified idea (for a forward pass) is as follows:</p>
    <ul>
    <li>In the HBM, we have the expected output O initialised with 0s, as well as the query Q, the key K and the value V. 
      However, instead of the bias matrix B as before, we have a tensor T containing the bias buckets.</li>
    <li>O is loaded from the HBM to the SRAM, along with the query Q, the key K, the value V and the bucket tensor T.</li>
    <li>Our CUDA kernel calculates the following steps:<br>
    &nbsp; • Compute the matrix S using the matrix product of Q and the transpose of K<br>
    &nbsp; • Compute S', which is the sum of the matrix S and a matrix filled with the elements of T<br>
    &nbsp; • Compute P, which is the softmax of S’ (computed online, block by block, under the hood)<br>
    &nbsp; • Compute the output O, which is the matrix product of P and V<br>
    </li>
    <li>Output O is written back to the HBM and the SRAM is freed.
    </ul>
    <p>
    In this way, whereas the first version required quadratic memory for the bias matrix B, 
      we are back to linear memory, enabling inference on tens of thousands of tokens.<br>
    To design this second version, we drew on TurboT5's Triton kernel, which we ported to CUDA and extended to full BF16. A plain-PyTorch reference of this on-the-fly bias materialisation is sketched just after this box.
    </p>
    </div>
    <br>
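    <p class="width_125">To fix ideas, here is a reference sketch in plain PyTorch of what the second kernel computes (unfused, so it still materialises the score matrix; it is only meant to show how the bias is built on the fly from a small bucket table rather than stored as a full matrix). The bucketing below is a simplified version of the T5 scheme:</p>
    <pre><code class="lang-py">
import math
import torch

def relative_buckets(q_len, k_len, num_buckets=32, max_distance=128):
    # Simplified bidirectional T5-style bucketing: small offsets get their own
    # bucket, larger offsets share log-spaced buckets.
    rel = torch.arange(k_len)[None, :] - torch.arange(q_len)[:, None]
    num_buckets //= 2
    buckets = rel.gt(0).long() * num_buckets
    rel = rel.abs()
    max_exact = num_buckets // 2
    is_small = rel.lt(max_exact)
    large = max_exact + (
        torch.log(rel.float().clamp(min=1) / max_exact)
        / math.log(max_distance / max_exact)
        * (num_buckets - max_exact)
    ).long()
    large = torch.minimum(large, torch.full_like(large, num_buckets - 1))
    return buckets + torch.where(is_small, rel, large)

def attention_with_on_the_fly_bias(q, k, v, bias_table):
    # q, k, v: (batch, heads, seq, head_dim); bias_table: (heads, num_buckets)
    s = q @ k.transpose(-2, -1)                    # T5 applies no 1/sqrt(d) scaling
    idx = relative_buckets(q.shape[-2], k.shape[-2]).to(q.device)
    s = s + bias_table[:, idx]                     # bias materialised on the fly
    return torch.softmax(s, dim=-1) @ v
    </code></pre>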

    <div class="tip"><p>Note that the two versions developed can be used with several positional encodings.<br>
    We invite the reader to consult this <a class="link" href="https://github.com/catie-aq/flashT5/blob/main/src/utils/positional_encoding.py">file</a> 
      containing classes compatible with Flash Attention for the 
      <a class="link" href="https://github.com/catie-aq/flashT5/blob/dfe10d498ae0b39082182f807acb509e91992360/src/utils/positional_encoding.py#L10">RelativePositionalEncoding</a> 
      <d-cite bibtex-key="shaw2018selfattention"></d-cite>, 
      the <a class="link" href="https://github.com/catie-aq/flashT5/blob/dfe10d498ae0b39082182f807acb509e91992360/src/utils/positional_encoding.py#L113">ALiBiPositionalEncoding</a> 
      <d-cite bibtex-key="press2022train"></d-cite>, 
      the <a class="link" href="https://github.com/catie-aq/flashT5/blob/dfe10d498ae0b39082182f807acb509e91992360/src/utils/positional_encoding.py#L205">RotaryPositionalEncoding</a> 
      <d-cite bibtex-key="su2023roformer"></d-cite> and 
      <a class="link" href="https://github.com/catie-aq/flashT5/blob/dfe10d498ae0b39082182f807acb509e91992360/src/utils/positional_encoding.py#L341">FIRE</a> <d-cite bibtex-key="li2024functional"></d-cite>.</p></div>

    <div class="note"><p>At the time of writing, the two pull requests (one for each kernel version, 
      available <a class="link" href="https://github.com/Dao-AILab/flash-attention/pull/617">here</a> 
      and <a class="link" href="https://github.com/Dao-AILab/flash-attention/pull/956">here</a>) 
      opened on the official Flash Attention repository have not been merged.
      Readers will therefore have to temporarily recompile our custom Flash Attention patches to be able to use our models.<br>
      Readers are invited to consult the Benchmark section further below to see the improvements brought by these two kernels.</p></div>
    <br>
    <div class="note"><p>Although we didn't use them, it should be noted that some libraries contain merged implementations of common operators, for example Apex <d-cite bibtex-key="nvidiapex"></d-cite>.</p></div>
    
    <p><br></p>
    <h4 id="noyau-triton">Triton kernel</h4>
    <p class="width_125">Triton <d-cite bibtex-key="10.1145/3315508.3329973"></d-cite> is a maintained programming language that allows Python code to be compiled efficiently, like CUDA, but with the advantage of being (from our point of view) easier to learn. Unlike CUDA, which requires an in-depth understanding of GPU hardware architecture, Triton ignores many low-level details such as memory coalescing, shared memory management and scheduling within CUDA thread blocks.</p>

    <div class="tip"><p>A Triton implementation of the 
      <a class="link" href="https://github.com/catie-aq/flashT5/blob/main/src/model/ops/flash_attention_v2_bias.py">Flash Attention 2 managing attention bias</a> 
      is provided for those who do not wish to recompile a custom patch for Flash Attention. 
      To do this, we based ourselves on the FlagAttention repository <d-cite bibtex-key="flagattention"></d-cite>. 
      <br>
      <br>
      In addition to this implementation (whose use is optional), other parts of the architecture have been optimised using ad hoc Triton kernels, namely:
    <br>
    &nbsp; • the <a class="link" href="https://github.com/catie-aq/flashT5/blob/main/src/model/ops/cross_entropy_loss.py">cross entropy loss</a> (and the loss z <d-cite bibtex-key="debrébisson2016zloss"></d-cite>) <br>
    &nbsp; • the <a class="link" href="https://github.com/catie-aq/flashT5/blob/main/src/model/ops/rms_norm.py">RMSNorm layer</a> <d-cite bibtex-key="zhang2019root"></d-cite> <br>
    <br>
    We drew inspiration from <a class="link" href="https://github.com/unslothai/unsloth">Unsloth</a> <d-cite bibtex-key="unsloth"></d-cite>.<br>
    <br>
    Readers are invited to refer to the Benchmark section below to see the impact of these optimisations.</p></div>
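    <p class="width_125">For reference, here is a sketch in plain PyTorch of the (unfused) computation that the RMSNorm Triton kernel performs in a single pass; the exact casting behaviour of our kernel may differ slightly:</p>
    <pre><code class="lang-py">
import torch

def rms_norm_reference(x, weight, eps=1e-6):
    # Unfused reference of RMSNorm: the fused kernel performs these steps without
    # ever writing the intermediate variance tensor back to HBM.
    variance = x.float().pow(2).mean(dim=-1, keepdim=True)
    x_normed = x.float() * torch.rsqrt(variance + eps)
    return weight * x_normed.to(x.dtype)
    </code></pre>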
    
    <p><br></p>
    <h4 id="utiliser-torch-compile-">Use <code>torch.compile</code></h4>
    <p class="width_125">A simpler approach is to compile the models with <code>torch.compile</code>. 
      PyTorch then takes care of making the possible merges, possibly by reordering operations. 
      This involves hunting down breaks in the compilation graph, which are returns to an eager execution mode that have a negative impact on the performance of the operation.</p>
    <div class="note"><p>See the <a class="link" href="https://pytorch.org/tutorials/intermediate/torch_compile_tutorial.html">official documentation</a> for more details.</p></div>
    <p class="width_125">Another possibility is to use both a custom kernel and <code>torch.compile</code>. 
      The implementation of this option has been greatly simplified since the 
      <a class="link" href="https://github.com/pytorch/pytorch/releases/tag/v2.4.0">version 2.4 of PyTorch</a>.</p>
    <p class="width_125">Readers are invited to refer to the benchmark section at the end of the article to measure the memory performance 
      performance of the various techniques described.</p>
    <p class="width_125"><br><br></p>


  
    <h3 id="optimisation-de-l-utilisation-des-tensor-cores">Optimisation of the use of Tensor Cores</h3>
    <p class="width_125">Recent GPUs have units dedicated to tensorial operations: the TensorCore. Using them correctly is essential.</p>
    <p class="width_125">Once again, to establish a diagnosis, it is advisable to refer to the PyTorch profiler, which indicates the proportion of TensorCore used for each CUDA kernel:</p>
    <p><figure class="width_125">
    <img src="https://pytorch.org/tutorials/_static/img/profiler_kernel_view.png" alt="profiler_kernel_view.png" width="100%">
    <figcaption><center><i>Source: <a class="link" href="https://pytorch.org/tutorials/_static/img/profiler_kernel_view.png">https://pytorch.org/tutorials/_static/img/profiler_kernel_view.png</a></i></center></figcaption>
    </figure>
    <br><br>
    <p class="width_125">The optimisations that can be made are:<br></p>
    <h4 id="puissances-de-2">Use multiples of 8 or 64</h4>
    <p class="width_125">The first is to use tensor sizes that are multiples of 8 or 64. 
      Please refer to the Nvidia documentation, 
      in particular this <a class="link" href="https://docs.nvidia.com/deeplearning/performance/dl-performance-matrix-multiplication/index.html#requirements-tc">article</a>
      and this <a class="link" href="https://developer.nvidia.com/blog/optimizing-gpu-performance-tensor-cores/">article</a> 
      to determine the multiple to select according to the desired precision.</p>
    <div class="tip"><p>With this in mind, we trained a tokenizer of size 32 768 (8**5), 
      following <a class="link" href="https://twitter.com/karpathy/status/1621578354024677377">this observation by KARPATHY</a>. 
      This is a BPE tokenizer <d-cite bibtex-key="sennrich2016neuralmachinetranslationrare"></d-cite> trained on CulturaX and The Stack, using 256 extra_tokens and the numbers are separated.<br>
      Readers can find the code used <a class="link" href=https://github.com/catie-aq/flashT5/blob/main/examples/fat5-fr/train_tokenizer.py">here</a>.
      </p></div>
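    <p class="width_125">For illustration, here is a minimal sketch of training such a tokenizer with the <code>tokenizers</code> library; the corpus files and special tokens below are placeholders, the exact recipe being in the script linked above:</p>
    <pre><code class="lang-py">
from tokenizers import Tokenizer, models, pre_tokenizers, trainers

# Vocabulary size chosen as a power of two (32768 = 8**5) to suit the Tensor Cores.
tokenizer = Tokenizer(models.BPE(unk_token="&lt;unk&gt;"))
# Split digits so that numbers are tokenised separately from surrounding text.
tokenizer.pre_tokenizer = pre_tokenizers.Sequence(
    [pre_tokenizers.Whitespace(), pre_tokenizers.Digits(individual_digits=True)]
)
trainer = trainers.BpeTrainer(
    vocab_size=32768,
    special_tokens=["&lt;unk&gt;", "&lt;pad&gt;", "&lt;/s&gt;"],  # placeholder special tokens
)
tokenizer.train(files=["culturax_fr.txt", "the_stack.txt"], trainer=trainer)  # placeholder paths
tokenizer.save("tokenizer.json")
    </code></pre>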
    <p><br></p>
      
    <h4 id="utiliser-le-bon-optimiseur">Use the right optimiser</h4>
    <p class="width_125">Changing optimisers from the initial implementation of the model can be a good way of speeding up convergence of the model (although it may prevent the results of the original paper from being reproduced).<br>
      Optimisers speed up convergence by allowing large batch sizes, as in the case of LAMB <d-cite bibtex-key="you2020large"></d-cite> 
      or the use of higher learning rates such as Sophia <d-cite bibtex-key="liu2024sophia"></d-cite>.<br>
      More efficient versions of the optimisers can also be used, such as the <code>fused</code> option
      in the <a class="link" href="https://pytorch.org/docs/stable/generated/torch.optim.Adam.html">Adam optimiser</a> available in PyTorch 
      or the optimisers available in <a class="link" href="https://github.com/NVIDIA/apex">Apex</a>.</p>
    <div class="tip"><p>
      We used the original T5 optimiser, <a class="link" href="https://github.com/catie-aq/flashT5/blob/main/src/utils/adamw_scaled.py">AdamWScale</a>.
      For the hyperparameter values, we use <code>lr = 5e-3</code>, <code>betas = (0.9, 0.999)</code>, <code>eps = 1e-6</code> and <code>weight_decay = 0.0</code>
      based on the observations of <a class="link" href="https://github.com/PiotrNawrot/nanoT5/issues/25#issuecomment-1922731400">Wilson Wongso</a>.
      Indeed, it turns out that not all the alternative optimisers tested converged.</p></div>
    <div class="note"><p>We have added the parameter <code>foreach</code> in our version of AdamWScale.</p>       
    </div>
    <p><br></p>

    <h4 id="entra-ner-ses-mod-les-en-bf16-ou-fp16-">Training models in <code>bf16</code></h4>
    <p class="width_125">Recent GPUs make it possible to full exploit the use of reduced precision 
      (enabling a gain of a factor of 2 in throughput compared to the <code>fp32</code> precision). 
      The <code>bf16</code> is only available on Ampere or more recent architectures, but allows you to avoid loss scaling 
      <d-cite bibtex-key="micikevicius2018mixed"></d-cite> method which is generally necessary in <code>fp16</code> 
      thanks to a wider dynamic range (the exponent is coded on 8 bits like the <code>fp32</code>).</p>
    <div class="tip"><p>With this in mind, we train our models in <code>bf16</code>.
      More specifically, while at the beginning of our experiments we used <code>bf16-mixed</code>, we later adopted the
      <a class="link" href="https://en.wikipedia.org/wiki/Kahan_summation_algorithm">Kahan summation algorithm</a>
      so that we could use <code>full bf16</code> in our optimizer (a minimal sketch is given just after this box).<br>  
      Once again, the code for our optimizer is accessible <a class="link" href="https://github.com/catie-aq/flashT5/blob/main/src/utils/adamw_scaled.py">here</a>.
    </p></div>
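    <p class="width_125">Below is a minimal sketch of the compensated (Kahan) update, which keeps the low-order bits that a plain bf16 addition would discard; our AdamWScale applies the same idea within its actual update rule:</p>
    <pre><code class="lang-py">
import torch

@torch.no_grad()
def kahan_update(param, update, comp):
    # param, update and comp are all bf16; comp accumulates the rounding error
    # lost at each step so that small updates are not swallowed by bf16.
    corrected = update + comp                      # re-inject previously lost bits
    new_param = param + corrected                  # rounded to bf16 here
    comp.copy_(corrected - (new_param - param))    # what the rounding just lost
    param.copy_(new_param)

# usage sketch: repeatedly add a step that is below bf16's resolution around 1.0
p = torch.ones(4, dtype=torch.bfloat16)
c = torch.zeros_like(p)
for _ in range(1000):
    kahan_update(p, torch.full_like(p, 1e-3), c)
print(p)  # close to 2.0, whereas a naive bf16 loop would stay stuck at 1.0
    </code></pre>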
    <p><br></p>
      
    <h4 id="utiliser-moins-de-m-moire-du-gpu">Use less GPU memory</h4>
    <p class="width_125">Certain techniques exist to limit the use of GPU memory by the model, such as the 
      <a class="link" href="https://pytorch.org/docs/stable/checkpoint.html">gradient checkpointing</a> 
      or ZeRO-type methods <d-cite bibtex-key="rajbhandari2020zero"></d-cite> implemented in 
      <a class="link" href="https://github.com/microsoft/DeepSpeed">DeepSpeed</a>. 
      By limiting the amount of memory used, larger batch sizes can be used to speed up model training.</p>
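    <p class="width_125">For instance, a minimal sketch of gradient checkpointing over a stack of blocks with PyTorch's <code>checkpoint</code> utility (toy linear blocks stand in for real layers):</p>
    <pre><code class="lang-py">
import torch
from torch.utils.checkpoint import checkpoint

class CheckpointedStack(torch.nn.Module):
    # Activations of each block are recomputed during the backward pass
    # instead of being kept in GPU memory, trading compute for memory.
    def __init__(self, blocks):
        super().__init__()
        self.blocks = torch.nn.ModuleList(blocks)

    def forward(self, x):
        for block in self.blocks:
            x = checkpoint(block, x, use_reentrant=False)
        return x

stack = CheckpointedStack([torch.nn.Linear(512, 512) for _ in range(12)])
out = stack(torch.randn(8, 512, requires_grad=True))
out.sum().backward()
    </code></pre>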
    <p class="width_125"><br><br></p>

  
    <h3 id="autres">Other</h3>
    <h4 id="le-parall-lisme">Parallelism</h4>
    <p class="width_125">Using several GPUs is tricky. 
      Done naively, it can result in lower performance than running on a single GPU, wasting computing resources. 
      This is particularly the case when bottlenecks occur in communications between GPUs. 
      The aim is to ensure that the model is not limited by the bandwidth between the cards, 
      for instance by connecting them with sufficiently fast interconnects such as <a class="link" href="https://en.wikipedia.org/wiki/NVLink">NVLink</a>. </p>
    <p class="width_125">It should also be noted that optimisation techniques generally require all the GPUs to be synchronised at the end of a batch. 
      As a result, if one GPU is slower than the others (or is being used by another process), the model is limited to the speed of the slowest GPU in the group. </p>
    <div class="note"><p>
    Having pre-trained our model on a single 80GB A100, we were unable to experiment with parallelism.</p>
    </div>
    <p><br></p>
  
    <h4 id="les-t-tes-pour-le-finetuning">Finetuning heads</h4>
    <p class="width_125">We looked at the elements listed above with a view to optimising the pre-training of our model. 
      In practice, we then need to fine-tune it to specialise on the final tasks that interest us.  
      To do this, we use heads. For the <a class="link" href="https://huggingface.co/docs/transformers/model_doc/t5">vanilla T5</a>, 
      five are available in Transformers to perform all feasible tasks: 
      <a class="link" href="https://huggingface.co/docs/transformers/model_doc/t5#transformers.T5ForConditionalGeneration"><code>T5ForConditionalGeneration</code></a>, 
      <a class="link" href="https://huggingface.co/docs/transformers/model_doc/t5#transformers.T5ForSequenceClassification"><code>T5ForSequenceClassification</code></a>, 
      <a class="link" href="https://huggingface.co/docs/transformers/model_doc/t5#transformers.T5ForTokenClassification"><code>T5ForTokenClassification</code></a>,  
      <a class="link" href="https://huggingface.co/docs/transformers/model_doc/t5#transformers.T5ForQuestionAnswering"><code>T5ForQuestionAnswering</code></a>
      and <a class="link" href="https://huggingface.co/docs/transformers/model_doc/t5#transformers.T5EncoderModel"><code>T5EncoderModel</code></a>.<br><br>
      Here again, optimisation work can be carried out.<br>
      For conditional generation, the main point is to ensure that the generation process is efficient.<br>
      For heads involved in classification tasks (sequence classification, NER and QA), it is necessary to ensure that only the encoder part
      of the T5 is used, since the decoder is not essential for these tasks, as shown in EncT5 <d-cite bibtex-key="liu2022enct5"></d-cite>. 
      Keeping the decoder weights wastes memory, and needlessly doubles the execution time of the finetuning code.<br>
      The last head is simply used to retain only the encoder part of an encoder-decoder model. It therefore does not need to be optimised.</p>
    <div class="tip"><p>
    About the <code>ForConditionalGeneration</code> head, our 
      <a class="link" href="https://github.com/catie-aq/flashT5/blob/684d02640464ea8bd2339689ce37da2d4e3b5f0b/src/model/modeling_flash_t5.py#L593">implementation</a> 
      is based on the generation process available in the 
      <a class="link" href="https://github.com/PiotrNawrot/nanoT5/blob/1c82d67bf8dea635be68a3b2a68a43b68b665193/nanoT5/utils/t5_model.py#L407">nanoT5</a> 
      because it is 14% faster than the Hugging Face implementation.<br>
      For classification heads, the implementation is available in this 
      <a class="link" href="https://github.com/catie-aq/flashT5/blob/main/src/model/custom_heads_flash_t5.py">file</a>. 
      This file is separate from the modelling file because our implementations differ from those available in Transformers. 
      Indeed, heads <code>T5ForSequenceClassification</code> and <code>T5ForQuestionAnswering</code> available in Transformers are based 
      on the T5 encoder and decoder, which is inefficient. 
      We therefore recoded these two heads to use only the encoder. 
      We then followed the same structure as the <code>T5ForTokenClassification</code> head available in Transformers, 
      which also only uses the encoder and which we therefore used as is. A minimal sketch of an encoder-only classification head is given just after this box.</p>
    </div>
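    <p class="width_125">To make the idea concrete, here is a minimal sketch of an encoder-only sequence classification head; the mean pooling over non-padding tokens is an assumption made for the sketch, our actual heads being in the file linked above:</p>
    <pre><code class="lang-py">
import torch
import torch.nn as nn

class EncoderOnlySequenceClassifier(nn.Module):
    # `encoder` is assumed to return hidden states of shape (batch, seq_len, d_model),
    # e.g. the encoder of an encoder-decoder model; the decoder is never instantiated.
    def __init__(self, encoder, d_model, num_labels, dropout=0.1):
        super().__init__()
        self.encoder = encoder
        self.dropout = nn.Dropout(dropout)
        self.classifier = nn.Linear(d_model, num_labels)

    def forward(self, input_ids, attention_mask=None):
        hidden = self.encoder(input_ids=input_ids, attention_mask=attention_mask)[0]
        if attention_mask is not None:
            mask = attention_mask.unsqueeze(-1).to(hidden.dtype)
            pooled = (hidden * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1)
        else:
            pooled = hidden.mean(dim=1)
        return self.classifier(self.dropout(pooled))
    </code></pre>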


    <p class="width_125"><br><br><br></p>
    <h2 id="benchmark">Benchmark</h2>

    <h3 id="TFLOPS">TFLOPS</h3>
    <p class="width_125">
    The number of TFLOPS (trillions of floating-point operations a processor can perform per second) is probably the most telling metric to demonstrate the impact of the optimisations carried out.<br>
    We compare four approaches:<br>
    • the SDPA (Scaled Dot Product Attention) implementation with full bias,<br>
    • the same implementation but in Triton,<br>
    • the Flash Attention RPE implementation, i.e. the second kernel we developed (which can be seen as TurboT5 but in C++/CUDA),<br>
    • the Flash Attention implementation, i.e. without bias. We include it for reference, as it is unusable in practice for a T5.<br>
    <br>
    For the forward pass, we have:
    </p> 
    <p class="width_125">
    <picture>
      <source media="(prefers-color-scheme: dark)" srcset="./assets/FWD-causal-True_dark.png" width="100%">
      <img alt="Benchmark memory backward pass" src="./assets/FWD-causal-True.png" width="100%">
    </picture>

    <div class="width_125"><p>For the forward pass, the Triton approach achieves 1.34 times more FLOPS than the SPDA approach, while the Flash Attention RPE approach achieves 1.99 times more FLOPS than the SPDA approach.<br>
    We can also see that our bf16 implementation is equivalent to fp16 (doing even better at size 512).<br>
    Following this benchmark, we decided to train our French model in bf16, head_dim = 128 and with a sequence of 1024.</p></div>

    <br>
    <p class="width_125">
    For the backward pass, we have: 
    </p>
  
    <p class="width_125">
    <picture>
      <source media="(prefers-color-scheme: dark)" srcset="./assets/BWD-causal-True_dark.png" width="100%">
      <img alt="Benchmark memory backward pass" src="./assets/BWD-causal-True.png" width="100%">
    </picture>
  
    <div class="width_125"><p>For the backward pass, the Triton implementation performed worse than SPDA, with 0.71 times the FLOPS of SPDA. The Flash Attention RPE implementation is more or less equivalent to SPDA (1.018 times more FLOPS).<br>
      We can also observe that Triton in head_dim 64 is more efficient than Triton in head_dim 128.</p></div>    

     <p><br></p>  
    <h4 id="torchvstriton">Torch vs Triton</h4>
    <p class="width_125">
    We mentioned previously that we had optimised parts of the architecture using ad hoc Triton kernels, namely the cross-entropy loss and the RMSNorm layer.
    The following benchmarks should illustrate why.<br>
      
    For cross-entropy, we get a forward pass 7 to 11.4 times faster, a backward pass 3.26 to 3.75 times faster, as well as a memory footprint reduced by a factor of 4:</p>
  
    <p class="width_125">
    <picture>
      <source media="(prefers-color-scheme: dark)" srcset="./assets/CE_dark.png" width="100%">
      <img alt="Benchmark memory backward pass" src="./assets/CE.png" width="100%">
    </picture>

    <p class="width_125">
    For the RMSNorm layer, we get a forward pass 3 to 5 times faster, a backward pass 2.33 to 4.33 times faster, as well as a memory footprint reduced by a factor of 3.2:</p>
    
  <p class="width_125">
    <picture>
      <source media="(prefers-color-scheme: dark)" srcset="./assets/LN_dark.png" width="100%">
      <img alt="Benchmark memory backward pass" src="./assets/BLN.png" width="100%">
    </picture>


    <p class="note">
    Note that all the benchmark graphs can be generated automatically using the following <a href="https://github.com/catie-aq/flashT5/tree/main/benchmarks">code</a>.
    </p>


    <p><br><br></p>
    <h3 id="mod-le-en-fran-ais">Model in French</h3>
    <p class="width_125">We applied our work to French by pre-training a 147M parameter model. <br>
    The dataset we used is a mixture of CulturaX, Wikipedia, justice_fr and The Stack. <br>
    Our tokenizer of size 32,768 (8**5) is trained on CulturaX and The Stack.<br>
    Our model is pre-trained on a sequence of 1,024 tokens.</p>

    <p class="width_125">
    We wanted to compare the performance of our model with other previously published French-language models, such as CamemBERT <d-cite bibtex-key="Martin_2020"></d-cite> for classification tasks and BARThez <d-cite bibtex-key="eddine2021barthez"></d-cite> for generation tasks.<br>
    For this reason, we thought it important to make comparisons with an equivalent number of tokens seen.
    We therefore tried to estimate the number of tokens seen by these two models using the formula number of steps × sequence length × batch size. We couldn't find the necessary information in the BARThez publication. For CamemBERT, we estimate a maximum of 419.4B tokens. This figure could actually be lower, as we don't know the number of padding tokens seen by this model (whereas in our case we don't use any). We therefore pre-trained our model up to the maximum number of tokens seen by CamemBERT.<br></p>

    <p><br></p>

    <p class="width_125">
    <picture>
      <source media="(prefers-color-scheme: dark)" srcset="./assets/loss_train.png" width="49%">
      <img alt="Convergence masked accuracy FAT5" src="./assets/loss_train.png" width="49%">
     </picture>
     <picture>
      <source media="(prefers-color-scheme: dark)" srcset="./assets/loss_eval.png" width="49%">
      <img alt="Convergence masked accuracy FAT5" src="./assets/loss_eval.png" width="49%">
    </picture>
    </p>

    <p><br></p>

    <p class="width_125">
    We were also interested in comparing our model against itself, i.e. we evaluated its performance on downstream tasks every 100,000 steps (~26 billion tokens) during pre-training.<br>
    In the table below, we have listed the number of tokens equivalent to each interval of 100,000 steps.<br>
    </p>
  
    <table class="width_125">
    <thead>
    <tr>
    <th>Model</th>
    <th>Number of tokens ✝</th>
    </tr>
    </thead>
    <tbody>
    <tr>
    <td>FAT5-small-100K</td>
    <td>26,214,400,000 (100,000 × 1,024 × 256)</td>
    </tr>    
    <tr>
    <td>FAT5-small-200K</td>
    <td>52,428,800,000 (200,000 × 1,024 × 256)</td>
    </tr>
    <tr>
    <td>FAT5-small-300K</td>
    <td>78,643,200,000 (300,000 × 1,024 × 256)</td>
    </tr>
    <tr>
    <td>FAT5-small-400K</td>
    <td>104,857,600,000 (400,000 × 1,024 × 256)</td>
    </tr>
    <tr>
    <td>FAT5-small-500K</td>
    <td>131,072,000,000 (500,000 × 1,024 × 256)</td>
    </tr>
    <tr>
    <td>FAT5-small-600K</td>
    <td>157,286,400,000 (600,000 × 1,024 × 256)</td>
    </tr>
    <tr>
    <td>FAT5-small-700K</td>
    <td>183,500,800,000 (700,000 × 1,024 × 256)</td>
    </tr>
    <tr>
    <td>FAT5-small-800K</td>
    <td>209,715,200,000 (800,000 × 1,024 × 256)</td>
    </tr>
    <tr>
    <td>FAT5-small-900K</td>
    <td>235,929,600,000 (900,000 × 1,024 × 256)</td>
    </tr>
    <tr>
    <td>FAT5-small-1000K</td>
    <td>262,144,000,000 (1,000,000 × 1,024 × 256)</td>
    </tr>
    <tr>
    <td>FAT5-small-1100K</td>
    <td>288,358,400,000 (1,100,000 × 1,024 × 256)</td>
    </tr>
    <tr>
    <td>FAT5-small-1200K</td>
    <td>314,572,800,000 (1,200,000 × 1,024 × 256)</td>
    </tr>
    <tr>
    <td>FAT5-small-1300K</td>
    <td>340,787,200,000 (1,300,000 × 1,024 × 256)</td>
    </tr>
    <tr>
    <td>FAT5-small-1400K</td>
    <td>367,001,600,000 (1,400,000 × 1,024 × 256)</td>
    </tr>
    <tr>
    <td>FAT5-small-1500K</td>
    <td>393,216,000,000 (1,500,000 × 1,024 × 256)</td>
    </tr>
    <tr>
    <td>FAT5-small-1600K</td>
    <td>419,430,400,000 (1,600,000 × 1,024 × 256)</td>
    </tr>
    <tr>
    <td><a class="link" href="https://hf.co/almanach/camembert-base">camembert (base ou large)</a></td>
    <td>419,430,400,000 (100,000 × 512 × 8,192)</td>
    </tr>
    </tbody>
    </table>
    <p class="width_125">✝ equivalent to number of steps × sequence size × batch size</p>


     <p><br></p>
    <h4 id="finetuning">Finetuning</h4>
    <p class="width_125">We focused on five tasks:<br>
    &nbsp; • Summarising texts to illustrate the use of the head <code>T5ForConditionalGeneration</code>,<br>
    &nbsp; • Binary classification to illustrate the use of the head <code>T5ForSequenceClassification</code>,<br>
    &nbsp; • Named entity recognition to illustrate the use of the head <code>T5ForTokenClassification</code>,<br>
    &nbsp; • Question answering to illustrate the use of the head <code>T5ForQuestionAnswering</code>,<br>
    &nbsp; • Sentence similarity to illustrate the use of the head <code>T5EncoderModel</code>.</p>

  <p class="width_125"> Classification tasks seem to us important to evaluate, as they are generally ignored by benchmarks of generative language models, even though they are often used in practice by companies (document retrieval, classification for customer reviews, data anonymization, etc.).
      The fact that 6 and a half years after its release, BERT <d-cite bibtex-key="devlin2019bert"></d-cite> alone is downloaded more times per month than the <a class="link" href="https://huggingface.co/models?pipeline_tag=text-generation&amp;sort=downloads">30 text generation models</a> most downloaded on Hugging Face at the time of writing: 38.5M versus 31.3M.</p>

  <p class="width_125">In the following tables, we underline for FAT5 the line with the best result for each task. We interpret the results of the generation part after the text summarization table. The classification results are interpreted after the binary classification, QA, NER and sentence-similarity tables.</p>

    <p><br></p>
    <h5>Summarization</h5>
    <p class="width_125">For this task, we used the dataset <a class="link" href="https://huggingface.co/datasets/orange_sum">orange_sum</a><d-cite bibtex-key="eddine2021barthez"></d-cite>.</p>
    <table class="width_125">
    <thead>
    <tr>
    <th>Model</th>
    <th>ROUGE-1</th>
    <th>ROUGE-2</th>
    <th>ROUGE-L</th>
    </tr>
    </thead>
    <tbody>
    <tr>
    <td>FAT5-small-100K (147M)</td>
    <td>28.17</td>
    <td>10.60</td>
    <td>20.62</td>
    </tr>
    <tr>
    <td>FAT5-small-200K (147M)</td>
    <td>28.72</td>
    <td>10.86</td>
    <td>20.68</td>
    </tr>
    <tr>
    <td>FAT5-small-300K (147M)</td>
    <td>28.76</td>
    <td>10.85</td>
    <td>20.63</td>
    </tr>
    <tr>
    <td>FAT5-small-400K (147M)</td>
    <td>28.59</td>
    <td>10.76</td>
    <td>20.60</td>
    </tr>
    <tr>
    <td>FAT5-small-500K (147M)</td>
    <td>28.98</td>
    <td>10.97</td>
    <td>20.72</td>
    </tr>
    <tr>
    <td>FAT5-small-600K (147M)</td>
    <td>29.04</td>
    <td>11.20</td>
    <td>20.89</td>
    </tr>
    <tr>
    <td>FAT5-small-700K (147M)</td>
    <td>28.72</td>
    <td>10.87</td>
    <td>20.77</td>
    </tr>
    <tr>
    <td>FAT5-small-800K (147M)</td>
    <td>29.00</td>
    <td>10.91</td>
    <td>20.78</td>
    </tr>
    <tr>
    <td>FAT5-small-900K (147M)</td>
    <td>29.30</td>
    <td>11.34</td>
    <td>21.22</td>
    </tr>
    <tr>
    <td>FAT5-small-1000K (147M)</td>
    <td>29.10</td>
    <td>11.21</td>
    <td>21.08</td>
    </tr>
    <tr>
    <td>FAT5-small-1100K (147M)</td>
    <td>29.43</td>
    <td>11.40</td>
    <td>21.15</td>
    </tr>
    <tr>
    <td>FAT5-small-1200K (147M)</td>
    <td>29.30</td>
    <td>11.38</td>
    <td>21.18</td>
    </tr>
    <tr>
    <td>FAT5-small-1300K (147M)</td>
    <td>29.38</td>
    <td>11.38</td>
    <td>21.18</td>
    </tr>
    <tr>
    <td>FAT5-small-1400K (147M)</td>
    <td>29.29</td>
    <td>11.18</td>
    <td>21.14</td>
    </tr>
    <tr>
    <td>FAT5-small-1500K (147M)</td>
    <td><u>29.48</u></td>
    <td><u>11.48</u></td>
    <td><u>21.22</u></td>
    </tr>
    <tr>
    <td>FAT5-small-1600K (147M)</td>
    <td>29.30</td>
    <td>11.27</td>
    <td>21.10</td>
    </tr>
    <tr>
    <td><a class="link" href="https://huggingface.co/moussaKam/barthez">Barthez<d-cite bibtex-key="eddine2021barthez"></d-cite></a> (165M)</td>
    <td>31.44</td>
    <td>12.77</td>
    <td>22.23</td>
    </tr>
    <tr>
    <td><a class="link" href="https://huggingface.co/moussaKam/mbarthez">mBarthez</a> (458M)</td>
    <td>32.67</td>
    <td>13.73</td>
    <td>23.18</td>
    </tr>
    </tbody>
    </table>

    <p><br></p>
  <p class="width_125">We can see that our model performs worse than the Barthez. We can put forward a few hypotheses on this subject. <br>
  Firstly, it's likely that our text generation process is not optimal. Not knowing the one used by the Barthez, we simply used the default parameters of the <a class="link" href="https://github.com/huggingface/transformers/blob/241c04d36867259cdf11dbb4e9d9a60f9cb65ebc/src/transformers/generation/utils.py#L1905">generate</a> function in transformers to avoid giving our model an advantage with a more sophisticated generation process.<br>
  Secondly, we didn't use a prompt to condition the generation, which could have benefited our model since the T5 is the model that introduced this system.<br>
  Thirdly, the Barthez surely saw more tokens than our model. Although we can't determine this number from the authors' publication, it is indicated that this is a BART model <d-cite bibtex-key="lewis2019bartdenoisingsequencetosequencepretraining"></d-cite> which received additional pre-training on French. However, BART's paper states that the model was trained on 500,000 steps × a sequence of 1,024 tokens × a batch of size 8000, i.e. 4,096,000,000,000 tokens, which is 9.76 times more than our model.
  </p>
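    <p class="width_125">For reference, here is a hedged sketch of the generation setup described above: the summary is produced with the default parameters of <code>generate</code>, without any prompt or tuned decoding strategy. The checkpoint name is a placeholder, not one of our released models.</p>
    <pre class="width_125"><code class="language-python">
import torch
from transformers import AutoTokenizer, T5ForConditionalGeneration

model_name = "t5-small"  # placeholder: replace with the summarization finetune
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

text = "Texte d'un article de presse à résumer ..."
inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=1024)
with torch.no_grad():
    # every other generate() parameter is left at its default value
    output_ids = model.generate(**inputs, max_new_tokens=128)
print(tokenizer.decode(output_ids[0], skip_special_tokens=True))
    </code></pre>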
  
    <p><br></p>
    <h5 id="classification">Classification</h5>
    <p class="width_125">We use a cleaned version of the allocine dataset <d-cite bibtex-key="allocine"></d-cite> : <a class="link" href="https://huggingface.co/datasets/CATIE-AQ/allocine_clean">allocine_clean</a>. Specifically, 0.6% of the test sample was unreliable because it contained leaks or duplicate data. It is likely that the resulting dataset is still imperfect, with annotation problems requiring further proofreading/correction.
    </p>
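    <p class="width_125">As an illustration of the kind of check behind this cleaning (the exact procedure may differ), the sketch below flags test rows that are duplicated or that also appear in the training split; the dataset identifier and column names are assumptions.</p>
    <pre class="width_125"><code class="language-python">
from datasets import load_dataset

ds = load_dataset("tblard/allocine")         # dataset id and "review" column are assumptions
train_reviews = set(ds["train"]["review"])

seen, duplicates, leaks = set(), 0, 0
for review in ds["test"]["review"]:
    if review in seen:                        # duplicate inside the test split
        duplicates += 1
    seen.add(review)
    if review in train_reviews:               # leak from the training split
        leaks += 1

print(f"{duplicates} duplicated test rows, {leaks} rows leaked from the train split")
    </code></pre>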
    <table class="width_125">
    <thead>
    <tr>
    <th>Model</th>
    <th>Accuracy</th>
    </tr>
    </thead>
    <tbody>
    <tr>
    <td>FAT5-small-100K (67.4M)</td>
    <td>96.05</td>
    </tr>
    <tr>
    <td>FAT5-small-200K (67.4M)</td>
    <td>96.20</td>
    </tr>
    <tr>
    <td>FAT5-small-300K (67.4M)</td>
    <td>96.48</td>
    </tr>
    <tr>
    <td>FAT5-small-400K (67.4M)</td>
    <td>96.60</td>
    </tr>
    <tr>
    <td>FAT5-small-500K (67.4M)</td>
    <td>96.60</td>
    </tr>
    <tr>
    <td>FAT5-small-600K (67.4M)</td>
    <td>96.60</td>
    </tr>
    <tr>
    <td>FAT5-small-700K (67.4M)</td>
    <td>96.68</td>
    </tr>
    <tr>
    <td>FAT5-small-800K (67.4M)</td>
    <td>96.59</td>
    </tr>
    <tr>
    <td>FAT5-small-900K (67.4M)</td>
    <td><u>96.75</u></td>
    </tr>
    <tr>
    <td>FAT5-small-1000K (67.4M)</td>
    <td>96.62</td>
    </tr>
    <tr>
    <td>FAT5-small-1100K (67.4M)</td>
    <td>96.69</td>
    </tr>
    <tr>
    <td>FAT5-small-1200K (67.4M)</td>
    <td>96.71</td>
    </tr>
    <tr>
    <td>FAT5-small-1300K (67.4M)</td>
    <td>96.69</td>
    </tr>
    <tr>
    <td>FAT5-small-1400K (67.4M)</td>
    <td>96.65</td>
    </tr>
    <tr>
    <td>FAT5-small-1500K (67.4M)</td>
    <td>96.57</td>
    </tr>
    <tr>
    <td>FAT5-small-1600K (67.4M)</td>
    <td>96.69</td>
    </tr>
    <tr>
    <td><a class="link" href="">distillcamembert</a> (68.1M)</td>
    <td>96.74</td>
    </tr>
    <tr>
    <td><a class="link" href="https://huggingface.co/bourdoiscatie/camembert_base_cls">camembert-base</a> (111M)</td>
    <td>97.27</td>
    </tr>
    <tr>
    <td><a class="link" href="https://huggingface.co/bourdoiscatie/camembert_large_cls">camembert-large</a> (337M)</td>
    <td>97.15</td>
    </tr>
    </tbody>
    </table>
    <p class="width_125">Note: in this and the following tables, distillcamembert refers to a <a class="link" href="https://huggingface.co/cmarkea/distilcamembert-base">distilcamembert-base</a> <d-cite bibtex-key="delestre2022distilcamembert"></d-cite> that we have finetuned.</p>
  
    <p><br></p>
    <h5>Named entity recognition</h5>
    <p class="width_125">For this task, we used frenchNER in its <a class="link" href="https://huggingface.co/datasets/CATIE-AQ/frenchNER_4entities">4 entities</a> (PER, LOC, ORG, MISC) <d-cite bibtex-key="frenchNER2024"></d-cite> configuration.</p>
    <table class="width_125">
    <thead>
    <tr>
    <th>Model</th>
    <th>F1 PER</th>
    <th>F1 LOC</th>
    <th>F1 ORG</th>
    <th>F1 MISC</th>
    </tr>
    </thead>
    <tbody>
    <tr>
    <td>FAT5-small-100K  (67.1M)</td>
    <td>96.51</td>
    <td>94.48</td>
    <td>87.24</td>
    <td>75.81</td>
    </tr>
    <tr>
    <td>FAT5-small-200K  (67.1M)</td>
    <td>96.90</td>
    <td>94.83</td>
    <td>88.78</td>
    <td>76.82</td>
    </tr>
    <tr>
    <td>FAT5-small-300K  (67.1M)</td>
    <td>97.25</td>
    <td>95.11</td>
    <td>88.86</td>
    <td><u>77.48</u></td>
    </tr>
    <tr>
    <td>FAT5-small-400K  (67.1M)</td>
    <td>97.18</td>
    <td>95.08</td>
    <td>89.11</td>
    <td>77.42</td>
    </tr>
    <tr>
    <td>FAT5-small-500K  (67.1M)</td>
    <td>97.25</td>
    <td>95.16</td>
    <td>89.16</td>
    <td>76.91</td>
    </tr>
    <tr>
    <td>FAT5-small-600K  (67.1M)</td>
    <td>97.19</td>
    <td>95.19</td>
    <td>88.85</td>
    <td>76.88</td>
    </tr>
    <tr>
    <td>FAT5-small-700K  (67.1M)</td>
    <td>97.17</td>
    <td>95.14</td>
    <td>89.39</td>
    <td>76.82</td>
    </tr>
    <tr>
    <td>FAT5-small-800K  (67.1M)</td>
    <td><u>97.34</u></td>
    <td>95.20</td>
    <td>89.18</td>
    <td>77.27</td>
    </tr>
    <tr>
    <td>FAT5-small-900K  (67.1M)</td>
    <td>97.19</td>
    <td>95.21</td>
    <td>89.04</td>
    <td>76.83</td>
    </tr>
    <tr>
    <td>FAT5-small-1000K  (67.1M)</td>
    <td>97.31</td>
    <td>95.26</td>
    <td>89.24</td>
    <td>76.84</td>
    </tr>
    <tr>
    <td>FAT5-small-1100K  (67.1M)</td>
    <td>97.11</td>
    <td>94.99</td>
    <td>88.52</td>
    <td>76.30</td>
    </tr>
    <tr>
    <td>FAT5-small-1200K  (67.1M)</td>
    <td>97.19</td>
    <td>95.11</td>
    <td>88.79</td>
    <td>76.86</td>
    </tr>
    <tr>
    <td>FAT5-small-1300K  (67.1M)</td>
    <td>97.15</td>
    <td>95.00</td>
    <td>88.62</td>
    <td>76.58</td>
    </tr>
    <tr>
    <td>FAT5-small-1400K  (67.1M)</td>
    <td>97.22</td>
    <td>95.09</td>
    <td>89.01</td>
    <td>77.00</td>
    </tr>
    <tr>
    <td>FAT5-small-1500K  (67.1M)</td>
    <td>97.32</td>
    <td><u>95.34</u></td>
    <td><u>89.39</u></td>
    <td>77.30</td>
    </tr>
    <tr>
    <td>FAT5-small-1600K  (67.1M)</td>
    <td>97.14</td>
    <td>95.22</td>
    <td>89.24</td>
    <td>76.88</td>
    </tr>
    <tr>
    <td><a class="link" href="">distillcamembert</a>  (67.5M)</td>
    <td>97.26</td>
    <td>95.24</td>
    <td>89.10</td>
    <td>79.88</td>
    </tr>
    <tr>
    <td><a class="link" href="https://huggingface.co/CATIE-AQ/NERmembert-base-4entities">camembert-base</a> (110M)</td>
    <td>97.80</td>
    <td>95.78</td>
    <td>90.27</td>
    <td>81.38</td>
    </tr>
    <tr>
    <td><a class="link" href="https://huggingface.co/CATIE-AQ/NERmembert-large-4entities">camembert-large</a> (336M)</td>
    <td>98.17</td>
    <td>96.37</td>
    <td>91.87</td>
    <td>83.35</td>
    </tr>
    </tbody>
    </table>

    <p><br></p>
      <h5 id="question-answering">Question Answering</h5>
    <p class="width_125">
    We wanted to finetune our model on this task but realized that our tokenizer has two problems.<br>
    Firstly, we forgot to add a token at the beginning of the sequence.
    Secondly, we chose a fast BPE tokenizer, and we learned afterwards that the <code>add_special_tokens=True</code> argument doesn't work with this type of tokenizer.
    Correcting these two points requires us to post-process the tokenizer's encodings before performing the finetuning, which isn't elegant and requires time we don't have right now.</p>
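    <p class="width_125">To make the required post-processing concrete, here is a minimal sketch of what it could look like: manually prepending a start token (and appending an end-of-sequence token) to encodings produced with <code>add_special_tokens=False</code>. The token ids are toy values, not those of the FAT5 tokenizer.</p>
    <pre class="width_125"><code class="language-python">
def add_missing_special_tokens(encoding, bos_id: int, eos_id: int):
    """Prepend a start token and append an end token to a single encoding."""
    return {
        "input_ids": [bos_id] + encoding["input_ids"] + [eos_id],
        "attention_mask": [1] + encoding["attention_mask"] + [1],
    }

# toy encoding, as produced by tokenizer(text, add_special_tokens=False)
enc = {"input_ids": [314, 2250, 99], "attention_mask": [1, 1, 1]}
enc = add_missing_special_tokens(enc, bos_id=0, eos_id=1)
print(enc)
    </code></pre>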
    <p><br></p>

    <h5>Sentence similarity</h5>
    <p class="width_125">
    We invite the reader to take the results of this section with a grain of salt.<br>
    We performed a finetuning on this task in order to verify that the <code>T5EncoderModel</code> head was working, but we are not focusing on the results obtained because we are questioning the quality of the benchmark on which we are evaluating the models, namely MTEB FR <d-cite bibtex-key="ciancone2024mtebfrenchresourcesfrenchsentence"></d-cite>, a French version of MTEB.<br>
      
    Indeed, Nils Reimers, creator of the MTEB, recently questioned in a <a class="link" href="https://x.com/Nils_Reimers/status/1870812625505849849">tweet</a> the relevance of this benchmark, declaring it "dead".
      
    Earlier in the year, we observed data leaks and duplications in this benchmark 
      (see <a class="link" href="https://huggingface.co/datasets/lbourdois/MTEB_leaks_and_duplications">here</a> and
      <a class="link" href="https://github.com/embeddings-benchmark/mteb/issues/1036">here</a>).
    Alexey Vatolin then extended these observations to include empty lines (see <a class="link" href="https://github.com/embeddings-benchmark/mteb/issues/1049#issuecomment-2463095122">here</a>).
    <br>
    
    In the table below, we finetuned on a cleaned version of the dataset <code>stsb_multi_mt</code> <d-cite bibtex-key="huggingface:dataset:stsb_multi_mt"></d-cite> (0.653% of the test split was unreliable because it contained leaks or duplicated data) before evaluating on MTEB FR.
    <br>
    </p>
  
      
    <table class="width_125">
    <thead>
    <tr>
    <th>Model</th>
    <th>Average</th>
    <th>Classification</th>
    <th>Clustering</th>
    <th>PairClassification</th>
    <th>Reranking</th>
    <th>Retrieval</th>
    <th>STS</th>
    <th>Summary</th>
    </tr>
    </thead>
    <tbody>
    <tr>
    <td>FAT5-small-400K (67.1M)</td>
    <td>52.2</td>
    <td>59.8</td>
    <td>39.1</td>
    <td>77.5</td>
    <td>56.1</td>
    <td>29.1</td>
    <td>74</td>
    <td>29.8</td>
    </tr>
    <tr>
    <td>distillcamembert (68.1M)</td>
    <td>51.3</td>
    <td>60.7</td>
    <td>37.4</td>
    <td>77</td>
    <td>51.1</td>
    <td>25.2</td>
    <td>76.4</td>
    <td>31.3</td>
    </tr>
    </tbody>
    </table>
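    <p class="width_125">For completeness, here is a minimal sketch of how the <code>T5EncoderModel</code> head can produce sentence embeddings (mean pooling of the encoder's last hidden states followed by cosine similarity); it illustrates the general setup rather than our exact finetuning or evaluation code, and the checkpoint name is a placeholder.</p>
    <pre class="width_125"><code class="language-python">
import torch
from transformers import AutoTokenizer, T5EncoderModel

model_name = "t5-small"  # placeholder for a FAT5/finetuned checkpoint
tokenizer = AutoTokenizer.from_pretrained(model_name)
encoder = T5EncoderModel.from_pretrained(model_name)

def embed(sentences):
    batch = tokenizer(sentences, padding=True, truncation=True, return_tensors="pt")
    with torch.no_grad():
        hidden = encoder(**batch).last_hidden_state      # (batch, length, dim)
    mask = batch["attention_mask"].unsqueeze(-1)          # ignore padding tokens
    return (hidden * mask).sum(1) / mask.sum(1)           # mean pooling

emb = embed(["Le chat dort sur le canapé.", "Un félin se repose sur le sofa."])
similarity = torch.nn.functional.cosine_similarity(emb[0], emb[1], dim=0)
print(similarity.item())
    </code></pre>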

    <p><br><br><br></p>
  
    <p class="width_125">     
    We can see from the masked accuracy convergence graph below that the performance of the encoder part of the model progresses initially before flattening out.  
    </p>
    <p><br></p>

    <p class="width_125">
    <picture>
      <source media="(prefers-color-scheme: dark)" srcset="./assets/convergence_masked_accuracy_FAT5.png" width="100%">
      <img alt="Convergence masked accuracy FAT5" src="./assets/convergence_masked_accuracy_FAT5.png" width="100%">
    </picture>
    </p>

    <p><br></p>

  <p class="width_125">   
    This phenomenon can also be observed in the finetuning results: FAT5 matches the performance of distilcamembert at around 800 or 900K steps (except for the MISC entity for the NER task), but does no better beyond that. This is nevertheless encouraging in view of scaling up, since distilled models derived from larger models usually perform better than trained models from scratch.<br>
    Note that this sort of plateau in performance needs to be confirmed by carrying out several executions with different configurations (notably at seed level), in order to propose results in the form of an interval instead of a single result (for each step evaluated, we use a seed of 42).<br>
  It should also be mentioned that this capping for the encoder part has already been observed by other authors. One example is CamemBERT(a) 2.0 <d-cite bibtex-key="antoun2024camembert20smarterfrench"></d-cite> which has also been trained on the French-language part of CulturaX. CamemBERT 2.0 did not perform any better than CamemBERT 1.0, despite having seen more tokens. On the other hand, the authors obtained performance gains with CamemBERTa 2.0, suggesting that for encoders, the most important thing is to focus on the architecture (CamemBERTa 2.0 is a DeBERTaV3 <d-cite bibtex-key="he2023debertav3improvingdebertausing"></d-cite>  while CamemBERT 2.0 is a RoBERTa <d-cite bibtex-key="liu2019robertarobustlyoptimizedbert"></d-cite>) rather than data. This result invites us to think about updating the T5 encoder architecture.<br>
  A final observation that can be made is that if performance plateaus, it is possible to afford to stop pre-training earlier and thus reduce costs.<br>

    In the table below, we list cost estimates (in euros) for the pre-training of our model according to various cloud providers. 
    For each of them, we base ourselves on the hourly price of an A 100 80GB offered on December 20, 2024.<br>
    We show two cases: pre-training on 262 billion tokens (the threshold at which performance on classification tasks begins to plateau and marginal gains become low) on 419 billion tokens (the maximum number of tokens seen by CamemBERT).
    <br>
    </p>
    
    <table class="width_125">
    <thead>
    <tr>
    <th>Cloud provider</th>
    <th>Hourly rate for an A100 (€)</th>
    <th>Price for 262B tokens (€)</th>
    <th>Price for 419B tokens (€)</th>
    <th>Note</th>
    </tr>
    </thead>
    <tbody>
    <tr>
    <td>AWS</td>
    <td>1.77</td>
    <td>1,616</td>
    <td>2,586</td>
    <td></td>
    </tr>
    <tr>
    <td>OVH</td>
    <td>2.75</td>
    <td>2,475</td>
    <td>3,960</td>
    <td>By opting for monthly rather than hourly payment, the price in both cases is €2,200.</td>
    </tr>
    <tr>
    <td>Azure</td>
    <td>3.31</td>
    <td>3,021</td>
    <td>4,833</td>
    <td>The hourly price was calculated from the monthly price of 8 A100s.</td>
    </tr>
    <tr>
    <td>Google Cloud</td>
    <td>3.52</td>
    <td>3,214</td>
    <td>5,143</td>
    <td></td>
    </tr>
    </tbody>
    </table>
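    <p class="width_125">The figures above reduce to hours × hourly rate, with the number of hours scaled linearly from the 1,461 hours of the full pre-training. The sketch below reproduces this back-of-the-envelope calculation; constant throughput is assumed, and small rounding differences with the table are expected.</p>
    <pre class="width_125"><code class="language-python">
# Back-of-the-envelope reproduction of the table: cost = hours x hourly rate,
# with hours scaled linearly from the 1,461 h observed for the full 419B-token
# pre-training (constant throughput assumed; providers may round differently).
FULL_RUN_HOURS = 1461
FULL_RUN_TOKENS = 419_430_400_000

def cost(hourly_rate_eur: float, tokens: int) -> float:
    hours = FULL_RUN_HOURS * tokens / FULL_RUN_TOKENS
    return hours * hourly_rate_eur

rates = {"AWS": 1.77, "OVH": 2.75, "Azure": 3.31, "Google Cloud": 3.52}
for provider, rate in rates.items():
    print(provider, round(cost(rate, 262_144_000_000)), round(cost(rate, FULL_RUN_TOKENS)))
    </code></pre>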
          
    <p><br><br></p>
    <h4>Pre-training times and emissions</h4>
    <p class="width_125">Carbon emissions were estimated using the <a class="link" href="https://mlco2.github.io/impact#compute">Machine Learning Impact calculator</a> <d-cite bibtex-key="lacoste2019quantifying"></d-cite>.<br>
      Our model was pre-trained on a single A100 PCIe 80GB, on a private infrastructure. 
      For carbon efficiency, we based ourselves on the daily numbers given by <a class="link" href="https://app.electricitymaps.com/zone/FR">electricitymaps</a> for France during our pre-training period.
      The finetunings were carried out on a single A100 PCIe 40GB. 
      As execution time is generally counted in hours or even minutes, for carbon efficiency we refer to the electricitymaps numbers for the hour in question rather than the daily number.<br>
      We estimate the emissions of our model at 14.084 kg CO2 eq., including 13.5 kg CO2 eq. for the pre-training and 0.584 kg CO2 eq. for the 49 finetunings.<br>
      To this, we must add additional emissions estimated at 6.24 kg CO2 eq.
      They correspond to the finetuning of models used to establish the baselines we compare against (0.475 kg CO2 eq.), to our preliminary work in mixed bf16 (4.735 kg CO2 eq. for the pre-training of three different models over 300K steps) and to tests in full bf16 prior to the training of our final model (1.03 kg CO2 eq. for the pre-training of a model half the size over 400K steps).<br>
      In total, we estimate the carbon footprint of our work at 20.324 kg CO2 eq.</p>
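    <p class="width_125">As a sanity check, the pre-training estimate can be reconstructed as power draw × hours × carbon intensity; the 250 W figure below is the value assumed by the calculator for an A100 PCIe, not a measurement on our hardware.</p>
    <pre class="width_125"><code class="language-python">
# Hedged reconstruction of the pre-training estimate: emissions = power draw x
# hours x carbon intensity. The 250 W power draw is an assumption (the calculator's
# figure for an A100 PCIe), not a measurement on our infrastructure.
power_kw = 0.250                 # assumed A100 PCIe power draw
hours = 1461                     # pre-training duration
intensity_kg_per_kwh = 0.03696   # France, average over the pre-training period

emissions_kg = power_kw * hours * intensity_kg_per_kwh
print(f"{emissions_kg:.1f} kg CO2 eq")   # ~13.5 kg CO2 eq
    </code></pre>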
  
    <p class="width_125">For the pre-training phase (we don't have enough information to make estimates for the other phases), it is then possible to compare us with the other French pre-trained models listed above: </p>
    <table class="width_125">
    <thead>
    <tr>
    <th>Model</th>
    <th>Time (H)</th>
    <th>Emissions (kg CO2 eq)</th>
    <th>Note</th>
    </tr>
    </thead>
    <tbody>
    <tr>
    <td>Camembert</td>
    <td>6,144</td>
    <td>106.91 ✝</td>
    <td>24H × 256 Tesla V100-SXM2-32GB at 58 g CO2 eq/kWh (average over 2019) <br>The authors do not specify the numbers for the large version.</td>
    </tr>
    <tr>
    <td>Flaubert base <d-cite bibtex-key="le2020flaubert"></d-cite></td>
    <td>13,120</td>
    <td>190.24 to 228.29 ✝</td>
    <td>410H × 32 V100 at 58 g CO2 eq/kWh (average over 2019) <br>The V100 type is not specified<br>(V100-SXM2-32GB? V100-SXM2-16GB? V100-PCIE-16GB?)</td>
    </tr>
    <tr>
    <td>Flaubert large <d-cite bibtex-key="le2020flaubert"></d-cite></td>
    <td>49,920</td>
    <td>723.84 to 868.61 ✝</td>
    <td>390H × 128 V100 at 58 g CO2 eq/kWh (average over 2019) <br>The V100 type is not specified<br>(V100-SXM2-32GB? V100-SXM2-16GB? V100-PCIE-16GB?)</td>
    </tr>
    <tr>
    <td>Barthez</td>
    <td>7,680 ★</td>
    <td>107.52 to 129.02 ✝</td>
    <td>60H × 128 V100 at 56 g CO2 eq/kWh (average over 2020) <br>The V100 type is not specified<br>(V100-SXM2-32GB? V100-SXM2-16GB? V100-PCIE-16GB?)</td>
    </tr>
    <tr>
    <td>FAT5-small</td>
    <td>1,461</td>
    <td>13.5</td>
    <td>1,461H × 1 A100 at 36.96 g CO2 eq/kWh (average between 2024-10-18 and 2024-12-19)</td>
    </tr>
    </tbody>
    </table>
    <p class="width_125">✝ the numbers given are estimates based on the information provided by the authors in their publication<br>
    ★ we indicate only the hours for the French pre-training applied on top of the initial English pre-training on which the model is based</p>

    <p><br></p>

    <h3 id="mod-les-en-anglais">Models in other languages</h3>
    <p class="width_125">
      Our contribution focuses on French, with the introduction of a new model. For other languages, we can't afford to carry out work of the same magnitude.<br>
      Nevertheless, we provide <a class="link" href="https://github.com/catie-aq/flashT5/blob/main/convert_huggingface_t5.py">code</a> for adapting already pre-trained (m)T5/FLAN-T5 weights <d-cite bibtex-key="chung2022scaling"></d-cite> to our method. We hope that users will be able to continue pre-training one of these models, for example to adapt it to more recent data.<br>
      Please note, however, that this adaptation is limited, since the additional pre-training will have to be carried out in the precision of the original model. For example, if the model's weights are in fp32 (which is the case for FLAN-T5), training will not be as fast as with FAT5, which is in bf16.<br><br>  
    
    For English speakers, we have already adapted the weights of the various FLAN-T5 versions to our method. All weights can be found in this <a class="link" href="https://huggingface.co/collections/CATIE-AQ/catie-english-fat5-flan-662b679a8e855c7c0137d69e">Hugging Face collection</a>.<br><br>  

    If you'd like to pre-train your own model (to specialize in a specific domain, for example, and thus benefit from a customized tokenizer), we refer you once again to the <a class="link"  href="https://github.com/catie-aq/flashT5/tree/main/examples/minipile">tutorial</a> showing how to pre-train a model on minipile. Note that we have tested and trained the model in the tutorial on an A100, which may or may not work with other GPUs.</p>
    <p class="width_125"><br><br><br></p>
  

    <h2 id="la-suite">Next stage</h2>
    <p class="width_125">Let's end this article by mentioning what we intend to do, or at least would like to do, as a follow-up to this work.<br></p>


    <h3>Near future</h3>
    <p class="width_125">These are things that should already have been in this article, but took more time than expected.
    Typically, we've finished building datasets but haven't had time to do the finetunings.<br>
    The aim is to complete these tasks in the near future, so that we can include the results in an update to this blog post.
    </p>

    <h4>Fix the tokenizer</h4>
    <p class="width_125">
    The current FAT5 is usable. However, because of the tokenizer problems that result in inelegant post-processing for certain tasks, we don't rule out re-training a model (for 1M steps only) with a new tokenizer that allows simpler use of the model.    
    <br><br></p>
      
    <h4>Instruct model</h4>
    <p class="width_125">We'd like to test FAT5's text generation capabilities in a more optimal way, in particular through the use of prompts, by developing an  instruct model.<br>
    For this, we have <a class="link" href="https://huggingface.co/datasets/CATIE-AQ/DFP">DFP</a> (<i>Dataset of French Prompts</i>) <d-cite bibtex-key="centre_aquitain_des_technologies_de_l'information_et_electroniques_2023"></d-cite>, a dataset of over 100M rows covering thirty NLP tasks. It follows the methodology of the <a class="link" href="https://huggingface.co/datasets/bigscience/xP3">xP3</a> dataset used for mT0 <d-cite bibtex-key="muennighoff2023crosslingualgeneralizationmultitaskfinetuning"></d-cite>. We could also take this opportunity to check out BigScience's "Finding 2" <d-cite bibtex-key="wang2022languagemodelarchitecturepretraining"></d-cite> (page 9 of the publication) indicating that encoder-decoder models would have better 0-shot capabilities than decoder models. <br>
    Beyond NLP tasks, we also have over 2M open QA prompt rows, which should enable us to test FAT5 on more general tasks/knowledge.<br><br>
      
    The development of this instruct model should also enable us to work on its alignment, in particular via a dataset of 12M rows to perform DPO in French.<br><br></p>
    
    <h4>Long sequences</h4>
    <p class="width_125">
    Pre-training is performed on sequences of 1,024 tokens. However, the CUDA kernel we've developed supports positional encodings that greatly extend the context size, as well as linear inference.<br>
    With this in mind, we've created two datasets of long sequences in French (one of QA, one of text summaries) on which we'd like to finetune our model.<br><br><br></p>


    <h3>Distant future</h3>
    <p class="width_125">The items listed below are longer-term ideas. In other words, they will take time to implement and will be the subject of a new blog post if necessary.</p>

  <h4 id="calcul-lin-aire">Memory reduction</h4>
  <p class="width_125">Although we're already satisfied with the memory optimisations achieved via our CUDA kernel, we think we can take these results further using other techniques. For example, we can cite the CCE (Cut Cross-Entropy) method <d-cite bibtex-key="wijmans2024cut"></d-cite> with which we have already obtained interesting results on decoder models.<br>
  In addition, while we have concentrated on pre-training, more work needs to be done on inference, which in practice consumes the most resources over time once the model is in production. We are thinking in particular of using the SageAttention2 <d-cite bibtex-key="zhang2024sageattention2efficientattentionthorough"></d-cite> released while our model was training.
  <br><br></p>

    <h4 id="calcul-lin-aire">Linear computation</h4>
    <p class="width_125">In this work, we present a linear memory model. 
    A further improvement would be that, in addition to this memory, the model operates with linear computations.<br>
    The idea is to replace traditional quadratic attention with another form of attention.<br>
    We can think of some already applied to the T5, such as that of LongT5 <d-cite bibtex-key="guo2022longt5"></d-cite>.
    It is also possible to test more recent forms such as Based <d-cite bibtex-key="arora2024simple"></d-cite>.
    We are also interested in testing with Hedgehog <d-cite bibtex-key="zhang2024hedgehog"></d-cite>. 
    In fact, it is possible to combine them with the optimised kernels available in <a class="link" href="https://github.com/HazyResearch/ThunderKittens/tree/main/kernels">ThunderKittens</a> <d-cite bibtex-key="thunderkittens"></d-cite>.
    The benefit is that it is then possible to keep the pre-trained model and, via additional finetuning, replace standard attention with softmax by linear attention with Hedgehog.<br>
    LoLCATs <d-cite bibtex-key="zhang2024lolcatslowranklinearizinglarge"></d-cite> performs this finetuning via LoRA <d-cite bibtex-key="hu2021loralowrankadaptationlarge"></d-cite>.
      <br><br></p>
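    <p class="width_125">To give an intuition of the idea (independently of Hedgehog's learned feature map), the toy sketch below replaces softmax attention with a kernelised form φ(Q)(φ(K)ᵀV) whose cost grows linearly with sequence length; it is a generic illustration, not the attention used in FAT5.</p>
    <pre class="width_125"><code class="language-python">
# Toy sketch of linear attention: softmax(QK^T)V is replaced by phi(Q)(phi(K)^T V),
# so the cost grows linearly with sequence length. phi is a simple ELU+1 feature
# map here, not Hedgehog's learned one.
import torch

def linear_attention(q, k, v, eps=1e-6):
    phi_q = torch.nn.functional.elu(q) + 1                      # (B, H, L, d)
    phi_k = torch.nn.functional.elu(k) + 1
    kv = torch.einsum("bhld,bhle->bhde", phi_k, v)               # d x e summary, O(L)
    z = 1.0 / (torch.einsum("bhld,bhd->bhl", phi_q, phi_k.sum(dim=2)) + eps)
    return torch.einsum("bhld,bhde,bhl->bhle", phi_q, kv, z)

q = k = v = torch.randn(1, 8, 1024, 64)                          # batch, heads, length, dim
out = linear_attention(q, k, v)                                  # same shape as v
print(out.shape)
    </code></pre>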

    <h4 id="passage-l-chelle">Model size</h4>
    <p class="width_125"> T5/FLAN-T5 have been trained to 11 billion parameters, demonstrating that this architecture can scale.<br>
    We would like to offer larger models with a FAT5-base and a FAT5-large with 305M and 973M parameters respectively, which we would then like to distil. The aim is to offer models that consume as little as possible in routine/inference.<br>
    We also expect the distilled models to perform better than models of equivalent size trained from scratch.<br>
    This should also allow us to propose models that will be used in practice. Indeed, in the current state for French, if the user is more motivated by performance than by the memory size of the model, he has more interest in using a CamemBERTa 2.0 for classification tasks. The present FAT5 should therefore be seen more as a proof of concept before being scaled up to make it competitive.
    <br><br></p>

    <h4 id="modeles-specialises">Training data</h4>
    <p class="width_125">
    In this work, we used "generic" French data, mainly from CulturaX. During the training of our model, Hugging Face introduced the FineWeb2 dataset <d-cite bibtex-key="penedo2024fineweb-2"></d-cite> which includes French. We would like to pre-train a new model so that we can compare the impact of pre-training data on performance on downstream tasks.<br>

    Beyond generic French, we particularly want to be able to apply our methodology to specific domains (medicine, regional variants of French, etc.).<br>
    To do this, we would need to train a new dedicated tokenizer and perform a new pre-training for each of the chosen domains.
    The advantage of the optimisations implemented and presented in this blog article is that they enable a significant reduction in the cost of pre-training.<br>
    We would then like to conduct a comparison between these small specialised models vs. large generic models.<br><br></p>

  <h4 id="modeles-specialises">Update of the T5 architecture</h4>
  <p class="width_125">The final direction we would like to explore is an update of the T5 architecture. As encoder-decoders have been neglected, they have not benefited from the improvements that decoder models have received in recent months (more recent activation or normalisation layers, multi-token prediction <d-cite bibtex-key="gloeckle2024betterfasterlarge"></d-cite>, etc.).</p>
  
  
    <p class="width_125"><br><br><br></p>
    <h2 id="conclusion">Conclusion</h2>
    <p class="width_125">
    We introduced the FAT5 (Flash Attention T5) model, detailing our approach to optimizing various elements of the pre-training and finetuning processes.
    It is based on kernels that enable Flash Attention to be used with a T5 and give the model linear memory.
    We applied our work to French as a proof of concept, and made sure that it can also be used in any other language.
    We hope that our method, which enables a model with 147M parameters to be pre-trained from scratch for €1,600, will be useful for people with limited computational resources.
    It also opens the way for a possible comeback of encoder-decoder models, rather than decoder-only models.</p>
    <p class="width_125"><br><br></p>


    <style>
      d-appendix .citation {
        font-size: 11px;
        line-height: 15px;
        border-left: 1px solid rgba(0, 0, 0, 0.1);
        padding-left: 10px;
        border: 1px solid rgba(0,0,0,0.1);
        background: #0D1117;
        padding: 10px 10px;
        border-radius: 3px;
        color: rgba(150, 150, 150, 1);
        overflow: hidden;
        margin-top: -12px;
        white-space: pre-wrap;
        word-wrap: break-word;
      }
    </style>

    <h3 id="citation">Citation</h3>
    <pre class="citation long">@misc {FAT5,
    title        = { FAT5: Flash Attention T5 },
    author       = { Boris ALBAR and Loïck BOURDOIS },
    organization = { Centre Aquitain des Technologies de l'Information et Electroniques }, 
    year         = 2025,
    url          = { https://huggingface.co/spaces/CATIE-AQ/FAT5-report },
    doi          = { 10.57967/hf/4160 },
    publisher    = { Hugging Face }
      }</pre>
      
    <d-appendix style="color: #9CA3AF;" >
    <d-bibliography src="bibliography.bib"></d-bibliography>
</d-appendix>
</d-article>
  
<script>
    const article = document.querySelector('d-article');
    const toc = document.querySelector('d-contents');
    if (toc) {
        const headings = article.querySelectorAll('h2, h3, h4');
        let ToC = `<nav role="navigation" class="l-text figcaption" style="color: #9CA3AF;"><h3>Table of contents</h3>`;
        let prevLevel = 0;
        for (const el of headings) {
            // should element be included in TOC?
            const isInTitle = el.parentElement.tagName == 'D-TITLE';
            const isException = el.getAttribute('no-toc');
            if (isInTitle || isException) continue;
            el.setAttribute('id', el.textContent.toLowerCase().replaceAll(" ", "_"))
            const link = '<a target="_self" href="' + '#' + el.getAttribute('id') + '">' + el.textContent + '</a>';
            const level = el.tagName === 'H2' ? 0 : (el.tagName === 'H3' ? 1 : 2);
            while (prevLevel < level) {
                ToC += '<ul>'
                prevLevel++;
            }
            while (prevLevel > level) {
                ToC += '</ul>'
                prevLevel--;
            }
            if (level === 0)
                ToC += '<div>' + link + '</div>';
            else
                ToC += '<li>' + link + '</li>';
        }
        while (prevLevel > 0) {
            ToC += '</ul>'
            prevLevel--;
        }
        ToC += '</nav>';
        toc.innerHTML = ToC;
        toc.setAttribute('prerendered', 'true');
        const toc_links = document.querySelectorAll('d-contents > nav a');
        window.addEventListener('scroll', (_event) => {
            if (typeof (headings) != 'undefined' && headings != null && typeof (toc_links) != 'undefined' && toc_links != null) {
                // Then iterate forwards, on the first match highlight it and break
                find_active: {
                    for (let i = headings.length - 1; i >= 0; i--) {
                        if (headings[i].getBoundingClientRect().top - 50 <= 0) {
                            if (!toc_links[i].classList.contains("active")) {
                                toc_links.forEach((link, _index) => {
                                    link.classList.remove("active");
                                });
                                toc_links[i].classList.add('active');
                            }
                            break find_active;
                        }
                    }
                    toc_links.forEach((link, _index) => {
                        link.classList.remove("active");
                    });
                }
            }
        });
    }
</script>
</body>
</html>