diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,10021 @@ +{ + "best_metric": 0.6475752433152033, + "best_model_checkpoint": "./runtime-masked/MiniLMv2-L6-H384-distilled-from-RoBERTa-Large-finetuned-wikitext103-mlm-multi-emails-hq-x2bs/checkpoint-4004", + "epoch": 16.0, + "global_step": 4928, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.01, + "learning_rate": 2.4291497975708505e-06, + "loss": 7.2679, + "step": 3 + }, + { + "epoch": 0.02, + "learning_rate": 4.858299595141701e-06, + "loss": 7.1451, + "step": 6 + }, + { + "epoch": 0.03, + "learning_rate": 7.287449392712551e-06, + "loss": 7.1002, + "step": 9 + }, + { + "epoch": 0.04, + "learning_rate": 9.716599190283402e-06, + "loss": 6.9508, + "step": 12 + }, + { + "epoch": 0.05, + "learning_rate": 1.2145748987854251e-05, + "loss": 6.7874, + "step": 15 + }, + { + "epoch": 0.06, + "learning_rate": 1.4574898785425101e-05, + "loss": 6.6279, + "step": 18 + }, + { + "epoch": 0.07, + "learning_rate": 1.7004048582995952e-05, + "loss": 6.5254, + "step": 21 + }, + { + "epoch": 0.08, + "learning_rate": 1.9433198380566804e-05, + "loss": 6.3327, + "step": 24 + }, + { + "epoch": 0.09, + "learning_rate": 2.1862348178137653e-05, + "loss": 6.2761, + "step": 27 + }, + { + "epoch": 0.1, + "learning_rate": 2.4291497975708502e-05, + "loss": 6.0968, + "step": 30 + }, + { + "epoch": 0.11, + "learning_rate": 2.6720647773279357e-05, + "loss": 5.9384, + "step": 33 + }, + { + "epoch": 0.12, + "learning_rate": 2.9149797570850203e-05, + "loss": 5.8496, + "step": 36 + }, + { + "epoch": 0.13, + "learning_rate": 3.157894736842105e-05, + "loss": 5.7136, + "step": 39 + }, + { + "epoch": 0.14, + "learning_rate": 3.4008097165991904e-05, + "loss": 5.6149, + "step": 42 + }, + { + "epoch": 0.15, + "learning_rate": 3.6437246963562756e-05, + "loss": 5.5674, + "step": 45 + }, + { + "epoch": 0.16, + "learning_rate": 3.886639676113361e-05, + "loss": 5.489, + "step": 48 + }, + { + "epoch": 0.17, + "learning_rate": 4.1295546558704454e-05, + "loss": 5.3851, + "step": 51 + }, + { + "epoch": 0.18, + "learning_rate": 4.3724696356275306e-05, + "loss": 5.3135, + "step": 54 + }, + { + "epoch": 0.19, + "learning_rate": 4.615384615384616e-05, + "loss": 5.1979, + "step": 57 + }, + { + "epoch": 0.19, + "learning_rate": 4.8582995951417004e-05, + "loss": 5.0876, + "step": 60 + }, + { + "epoch": 0.2, + "learning_rate": 5.101214574898786e-05, + "loss": 5.094, + "step": 63 + }, + { + "epoch": 0.21, + "learning_rate": 5.3441295546558715e-05, + "loss": 5.0148, + "step": 66 + }, + { + "epoch": 0.22, + "learning_rate": 5.587044534412956e-05, + "loss": 4.9376, + "step": 69 + }, + { + "epoch": 0.23, + "learning_rate": 5.8299595141700406e-05, + "loss": 4.9033, + "step": 72 + }, + { + "epoch": 0.24, + "learning_rate": 6.072874493927125e-05, + "loss": 4.8783, + "step": 75 + }, + { + "epoch": 0.25, + "learning_rate": 6.31578947368421e-05, + "loss": 4.8382, + "step": 78 + }, + { + "epoch": 0.26, + "learning_rate": 6.558704453441296e-05, + "loss": 4.7009, + "step": 81 + }, + { + "epoch": 0.27, + "learning_rate": 6.801619433198381e-05, + "loss": 4.6597, + "step": 84 + }, + { + "epoch": 0.28, + "learning_rate": 7.044534412955465e-05, + "loss": 4.5674, + "step": 87 + }, + { + "epoch": 0.29, + "learning_rate": 7.287449392712551e-05, + "loss": 4.5938, + "step": 90 + }, + { + "epoch": 0.3, + "learning_rate": 7.530364372469636e-05, + "loss": 4.6061, + "step": 93 + }, + { + "epoch": 0.31, + "learning_rate": 7.773279352226722e-05, + "loss": 4.582, + "step": 96 + }, + { + "epoch": 0.32, + "learning_rate": 8.016194331983806e-05, + "loss": 4.4975, + "step": 99 + }, + { + "epoch": 0.33, + "learning_rate": 8.259109311740891e-05, + "loss": 4.3876, + "step": 102 + }, + { + "epoch": 0.34, + "learning_rate": 8.502024291497977e-05, + "loss": 4.388, + "step": 105 + }, + { + "epoch": 0.35, + "learning_rate": 8.744939271255061e-05, + "loss": 4.2698, + "step": 108 + }, + { + "epoch": 0.36, + "learning_rate": 8.987854251012147e-05, + "loss": 4.3306, + "step": 111 + }, + { + "epoch": 0.37, + "learning_rate": 9.230769230769232e-05, + "loss": 4.3391, + "step": 114 + }, + { + "epoch": 0.38, + "learning_rate": 9.473684210526316e-05, + "loss": 4.1981, + "step": 117 + }, + { + "epoch": 0.39, + "learning_rate": 9.716599190283401e-05, + "loss": 4.3052, + "step": 120 + }, + { + "epoch": 0.4, + "learning_rate": 9.959514170040485e-05, + "loss": 4.1384, + "step": 123 + }, + { + "epoch": 0.41, + "learning_rate": 0.00010202429149797573, + "loss": 4.159, + "step": 126 + }, + { + "epoch": 0.42, + "learning_rate": 0.00010445344129554657, + "loss": 4.1178, + "step": 129 + }, + { + "epoch": 0.43, + "learning_rate": 0.00010688259109311743, + "loss": 4.0929, + "step": 132 + }, + { + "epoch": 0.44, + "learning_rate": 0.00010931174089068827, + "loss": 4.0761, + "step": 135 + }, + { + "epoch": 0.45, + "learning_rate": 0.00011174089068825912, + "loss": 3.9875, + "step": 138 + }, + { + "epoch": 0.46, + "learning_rate": 0.00011417004048582995, + "loss": 4.0039, + "step": 141 + }, + { + "epoch": 0.47, + "learning_rate": 0.00011659919028340081, + "loss": 3.9917, + "step": 144 + }, + { + "epoch": 0.48, + "learning_rate": 0.00011902834008097166, + "loss": 4.0101, + "step": 147 + }, + { + "epoch": 0.49, + "learning_rate": 0.0001214574898785425, + "loss": 3.9108, + "step": 150 + }, + { + "epoch": 0.5, + "learning_rate": 0.00012388663967611335, + "loss": 3.9445, + "step": 153 + }, + { + "epoch": 0.51, + "learning_rate": 0.0001263157894736842, + "loss": 3.96, + "step": 156 + }, + { + "epoch": 0.52, + "learning_rate": 0.00012874493927125507, + "loss": 3.9475, + "step": 159 + }, + { + "epoch": 0.53, + "learning_rate": 0.00013117408906882592, + "loss": 3.8582, + "step": 162 + }, + { + "epoch": 0.54, + "learning_rate": 0.00013360323886639676, + "loss": 3.8952, + "step": 165 + }, + { + "epoch": 0.55, + "learning_rate": 0.00013603238866396762, + "loss": 3.7632, + "step": 168 + }, + { + "epoch": 0.56, + "learning_rate": 0.00013846153846153847, + "loss": 3.7845, + "step": 171 + }, + { + "epoch": 0.56, + "learning_rate": 0.0001408906882591093, + "loss": 3.7638, + "step": 174 + }, + { + "epoch": 0.57, + "learning_rate": 0.00014331983805668017, + "loss": 3.8404, + "step": 177 + }, + { + "epoch": 0.58, + "learning_rate": 0.00014574898785425102, + "loss": 3.7742, + "step": 180 + }, + { + "epoch": 0.59, + "learning_rate": 0.00014817813765182186, + "loss": 3.7533, + "step": 183 + }, + { + "epoch": 0.6, + "learning_rate": 0.00015060728744939272, + "loss": 3.7303, + "step": 186 + }, + { + "epoch": 0.61, + "learning_rate": 0.00015303643724696357, + "loss": 3.7195, + "step": 189 + }, + { + "epoch": 0.62, + "learning_rate": 0.00015546558704453443, + "loss": 3.7544, + "step": 192 + }, + { + "epoch": 0.63, + "learning_rate": 0.00015789473684210527, + "loss": 3.6913, + "step": 195 + }, + { + "epoch": 0.64, + "learning_rate": 0.00016032388663967612, + "loss": 3.7917, + "step": 198 + }, + { + "epoch": 0.65, + "learning_rate": 0.00016275303643724698, + "loss": 3.6758, + "step": 201 + }, + { + "epoch": 0.66, + "learning_rate": 0.00016518218623481781, + "loss": 3.6774, + "step": 204 + }, + { + "epoch": 0.67, + "learning_rate": 0.00016761133603238867, + "loss": 3.6199, + "step": 207 + }, + { + "epoch": 0.68, + "learning_rate": 0.00017004048582995953, + "loss": 3.6028, + "step": 210 + }, + { + "epoch": 0.69, + "learning_rate": 0.0001724696356275304, + "loss": 3.6084, + "step": 213 + }, + { + "epoch": 0.7, + "learning_rate": 0.00017489878542510122, + "loss": 3.6165, + "step": 216 + }, + { + "epoch": 0.71, + "learning_rate": 0.00017732793522267208, + "loss": 3.5123, + "step": 219 + }, + { + "epoch": 0.72, + "learning_rate": 0.00017975708502024294, + "loss": 3.5594, + "step": 222 + }, + { + "epoch": 0.73, + "learning_rate": 0.00018218623481781377, + "loss": 3.6238, + "step": 225 + }, + { + "epoch": 0.74, + "learning_rate": 0.00018461538461538463, + "loss": 3.4991, + "step": 228 + }, + { + "epoch": 0.75, + "learning_rate": 0.0001870445344129555, + "loss": 3.5384, + "step": 231 + }, + { + "epoch": 0.76, + "learning_rate": 0.00018947368421052632, + "loss": 3.5282, + "step": 234 + }, + { + "epoch": 0.77, + "learning_rate": 0.00019190283400809716, + "loss": 3.574, + "step": 237 + }, + { + "epoch": 0.78, + "learning_rate": 0.00019433198380566801, + "loss": 3.5391, + "step": 240 + }, + { + "epoch": 0.79, + "learning_rate": 0.00019676113360323887, + "loss": 3.4529, + "step": 243 + }, + { + "epoch": 0.8, + "learning_rate": 0.0001991902834008097, + "loss": 3.4957, + "step": 246 + }, + { + "epoch": 0.81, + "learning_rate": 0.00019999990991501854, + "loss": 3.4346, + "step": 249 + }, + { + "epoch": 0.82, + "learning_rate": 0.00019999943696930958, + "loss": 3.4838, + "step": 252 + }, + { + "epoch": 0.83, + "learning_rate": 0.00019999855864354245, + "loss": 3.4721, + "step": 255 + }, + { + "epoch": 0.84, + "learning_rate": 0.0001999972749412778, + "loss": 3.5, + "step": 258 + }, + { + "epoch": 0.85, + "learning_rate": 0.00019999558586771948, + "loss": 3.3624, + "step": 261 + }, + { + "epoch": 0.86, + "learning_rate": 0.00019999349142971467, + "loss": 3.4138, + "step": 264 + }, + { + "epoch": 0.87, + "learning_rate": 0.00019999099163575389, + "loss": 3.4005, + "step": 267 + }, + { + "epoch": 0.88, + "learning_rate": 0.00019998808649597085, + "loss": 3.365, + "step": 270 + }, + { + "epoch": 0.89, + "learning_rate": 0.0001999847760221425, + "loss": 3.3424, + "step": 273 + }, + { + "epoch": 0.9, + "learning_rate": 0.00019998106022768887, + "loss": 3.3629, + "step": 276 + }, + { + "epoch": 0.91, + "learning_rate": 0.00019997693912767318, + "loss": 3.3722, + "step": 279 + }, + { + "epoch": 0.92, + "learning_rate": 0.00019997241273880158, + "loss": 3.3951, + "step": 282 + }, + { + "epoch": 0.93, + "learning_rate": 0.00019996748107942335, + "loss": 3.3817, + "step": 285 + }, + { + "epoch": 0.94, + "learning_rate": 0.00019996214416953046, + "loss": 3.3289, + "step": 288 + }, + { + "epoch": 0.94, + "learning_rate": 0.00019995640203075788, + "loss": 3.3074, + "step": 291 + }, + { + "epoch": 0.95, + "learning_rate": 0.00019995025468638318, + "loss": 3.3145, + "step": 294 + }, + { + "epoch": 0.96, + "learning_rate": 0.00019994370216132662, + "loss": 3.2853, + "step": 297 + }, + { + "epoch": 0.97, + "learning_rate": 0.000199936744482151, + "loss": 3.2416, + "step": 300 + }, + { + "epoch": 0.98, + "learning_rate": 0.0001999293816770615, + "loss": 3.2565, + "step": 303 + }, + { + "epoch": 0.99, + "learning_rate": 0.00019992161377590563, + "loss": 3.2947, + "step": 306 + }, + { + "epoch": 1.0, + "eval_accuracy": 0.5121698756686252, + "eval_loss": 3.0832247734069824, + "eval_runtime": 16.2528, + "eval_samples_per_second": 135.177, + "eval_steps_per_second": 67.619, + "step": 308 + }, + { + "epoch": 1.0, + "learning_rate": 0.0001999134408101731, + "loss": 3.2464, + "step": 309 + }, + { + "epoch": 1.01, + "learning_rate": 0.00019990486281299568, + "loss": 3.2509, + "step": 312 + }, + { + "epoch": 1.02, + "learning_rate": 0.00019989587981914704, + "loss": 3.284, + "step": 315 + }, + { + "epoch": 1.03, + "learning_rate": 0.00019988649186504262, + "loss": 3.1894, + "step": 318 + }, + { + "epoch": 1.04, + "learning_rate": 0.0001998766989887396, + "loss": 3.3045, + "step": 321 + }, + { + "epoch": 1.05, + "learning_rate": 0.0001998665012299365, + "loss": 3.2935, + "step": 324 + }, + { + "epoch": 1.06, + "learning_rate": 0.0001998558986299733, + "loss": 3.1808, + "step": 327 + }, + { + "epoch": 1.07, + "learning_rate": 0.000199844891231831, + "loss": 3.1295, + "step": 330 + }, + { + "epoch": 1.08, + "learning_rate": 0.00019983347908013172, + "loss": 3.262, + "step": 333 + }, + { + "epoch": 1.09, + "learning_rate": 0.00019982166222113826, + "loss": 3.1685, + "step": 336 + }, + { + "epoch": 1.1, + "learning_rate": 0.00019980944070275406, + "loss": 3.1682, + "step": 339 + }, + { + "epoch": 1.11, + "learning_rate": 0.00019979681457452304, + "loss": 3.2196, + "step": 342 + }, + { + "epoch": 1.12, + "learning_rate": 0.0001997837838876293, + "loss": 3.1117, + "step": 345 + }, + { + "epoch": 1.13, + "learning_rate": 0.0001997703486948969, + "loss": 3.2077, + "step": 348 + }, + { + "epoch": 1.14, + "learning_rate": 0.00019975650905078976, + "loss": 3.1355, + "step": 351 + }, + { + "epoch": 1.15, + "learning_rate": 0.00019974226501141137, + "loss": 3.2277, + "step": 354 + }, + { + "epoch": 1.16, + "learning_rate": 0.00019972761663450452, + "loss": 3.14, + "step": 357 + }, + { + "epoch": 1.17, + "learning_rate": 0.0001997125639794512, + "loss": 3.1937, + "step": 360 + }, + { + "epoch": 1.18, + "learning_rate": 0.00019969710710727214, + "loss": 3.1441, + "step": 363 + }, + { + "epoch": 1.19, + "learning_rate": 0.00019968124608062682, + "loss": 3.1193, + "step": 366 + }, + { + "epoch": 1.2, + "learning_rate": 0.000199664980963813, + "loss": 3.1849, + "step": 369 + }, + { + "epoch": 1.21, + "learning_rate": 0.00019964831182276663, + "loss": 3.1189, + "step": 372 + }, + { + "epoch": 1.22, + "learning_rate": 0.00019963123872506147, + "loss": 3.0733, + "step": 375 + }, + { + "epoch": 1.23, + "learning_rate": 0.0001996137617399088, + "loss": 3.101, + "step": 378 + }, + { + "epoch": 1.24, + "learning_rate": 0.00019959588093815728, + "loss": 3.074, + "step": 381 + }, + { + "epoch": 1.25, + "learning_rate": 0.00019957759639229247, + "loss": 3.1142, + "step": 384 + }, + { + "epoch": 1.26, + "learning_rate": 0.00019955890817643674, + "loss": 3.1246, + "step": 387 + }, + { + "epoch": 1.27, + "learning_rate": 0.0001995398163663488, + "loss": 3.104, + "step": 390 + }, + { + "epoch": 1.28, + "learning_rate": 0.00019952032103942347, + "loss": 3.1105, + "step": 393 + }, + { + "epoch": 1.29, + "learning_rate": 0.0001995004222746913, + "loss": 3.0985, + "step": 396 + }, + { + "epoch": 1.3, + "learning_rate": 0.00019948012015281853, + "loss": 3.1341, + "step": 399 + }, + { + "epoch": 1.31, + "learning_rate": 0.00019945941475610623, + "loss": 3.0335, + "step": 402 + }, + { + "epoch": 1.31, + "learning_rate": 0.0001994383061684905, + "loss": 3.0978, + "step": 405 + }, + { + "epoch": 1.32, + "learning_rate": 0.00019941679447554175, + "loss": 3.0737, + "step": 408 + }, + { + "epoch": 1.33, + "learning_rate": 0.00019939487976446468, + "loss": 3.0942, + "step": 411 + }, + { + "epoch": 1.34, + "learning_rate": 0.00019937256212409756, + "loss": 3.0471, + "step": 414 + }, + { + "epoch": 1.35, + "learning_rate": 0.00019934984164491227, + "loss": 3.0165, + "step": 417 + }, + { + "epoch": 1.36, + "learning_rate": 0.00019932671841901354, + "loss": 2.9886, + "step": 420 + }, + { + "epoch": 1.37, + "learning_rate": 0.00019930319254013887, + "loss": 3.0629, + "step": 423 + }, + { + "epoch": 1.38, + "learning_rate": 0.000199279264103658, + "loss": 3.0518, + "step": 426 + }, + { + "epoch": 1.39, + "learning_rate": 0.00019925493320657262, + "loss": 2.9858, + "step": 429 + }, + { + "epoch": 1.4, + "learning_rate": 0.00019923019994751585, + "loss": 3.0696, + "step": 432 + }, + { + "epoch": 1.41, + "learning_rate": 0.000199205064426752, + "loss": 3.1251, + "step": 435 + }, + { + "epoch": 1.42, + "learning_rate": 0.000199179526746176, + "loss": 3.0311, + "step": 438 + }, + { + "epoch": 1.43, + "learning_rate": 0.00019915358700931313, + "loss": 2.9571, + "step": 441 + }, + { + "epoch": 1.44, + "learning_rate": 0.00019912724532131847, + "loss": 2.9914, + "step": 444 + }, + { + "epoch": 1.45, + "learning_rate": 0.00019910050178897657, + "loss": 2.9803, + "step": 447 + }, + { + "epoch": 1.46, + "learning_rate": 0.00019907335652070103, + "loss": 3.0183, + "step": 450 + }, + { + "epoch": 1.47, + "learning_rate": 0.0001990458096265339, + "loss": 3.0207, + "step": 453 + }, + { + "epoch": 1.48, + "learning_rate": 0.00019901786121814547, + "loss": 2.9883, + "step": 456 + }, + { + "epoch": 1.49, + "learning_rate": 0.00019898951140883369, + "loss": 2.924, + "step": 459 + }, + { + "epoch": 1.5, + "learning_rate": 0.0001989607603135236, + "loss": 3.0618, + "step": 462 + }, + { + "epoch": 1.51, + "learning_rate": 0.00019893160804876708, + "loss": 3.0179, + "step": 465 + }, + { + "epoch": 1.52, + "learning_rate": 0.00019890205473274236, + "loss": 2.9295, + "step": 468 + }, + { + "epoch": 1.53, + "learning_rate": 0.00019887210048525323, + "loss": 2.9724, + "step": 471 + }, + { + "epoch": 1.54, + "learning_rate": 0.00019884174542772899, + "loss": 2.9413, + "step": 474 + }, + { + "epoch": 1.55, + "learning_rate": 0.00019881098968322367, + "loss": 3.0484, + "step": 477 + }, + { + "epoch": 1.56, + "learning_rate": 0.00019877983337641565, + "loss": 2.9098, + "step": 480 + }, + { + "epoch": 1.57, + "learning_rate": 0.00019874827663360706, + "loss": 2.9568, + "step": 483 + }, + { + "epoch": 1.58, + "learning_rate": 0.00019871631958272336, + "loss": 2.9348, + "step": 486 + }, + { + "epoch": 1.59, + "learning_rate": 0.00019868396235331282, + "loss": 2.9615, + "step": 489 + }, + { + "epoch": 1.6, + "learning_rate": 0.00019865120507654593, + "loss": 2.9036, + "step": 492 + }, + { + "epoch": 1.61, + "learning_rate": 0.00019861804788521493, + "loss": 2.8977, + "step": 495 + }, + { + "epoch": 1.62, + "learning_rate": 0.00019858449091373313, + "loss": 3.0531, + "step": 498 + }, + { + "epoch": 1.63, + "learning_rate": 0.00019855053429813463, + "loss": 2.9548, + "step": 501 + }, + { + "epoch": 1.64, + "learning_rate": 0.00019851617817607354, + "loss": 2.9541, + "step": 504 + }, + { + "epoch": 1.65, + "learning_rate": 0.00019848142268682356, + "loss": 2.8871, + "step": 507 + }, + { + "epoch": 1.66, + "learning_rate": 0.00019844626797127724, + "loss": 2.8821, + "step": 510 + }, + { + "epoch": 1.67, + "learning_rate": 0.00019841071417194561, + "loss": 2.9179, + "step": 513 + }, + { + "epoch": 1.68, + "learning_rate": 0.00019837476143295748, + "loss": 2.9251, + "step": 516 + }, + { + "epoch": 1.69, + "learning_rate": 0.00019833840990005893, + "loss": 2.8764, + "step": 519 + }, + { + "epoch": 1.69, + "learning_rate": 0.00019830165972061265, + "loss": 2.8817, + "step": 522 + }, + { + "epoch": 1.7, + "learning_rate": 0.00019826451104359738, + "loss": 2.8707, + "step": 525 + }, + { + "epoch": 1.71, + "learning_rate": 0.00019822696401960727, + "loss": 2.8489, + "step": 528 + }, + { + "epoch": 1.72, + "learning_rate": 0.00019818901880085137, + "loss": 2.9285, + "step": 531 + }, + { + "epoch": 1.73, + "learning_rate": 0.00019815067554115282, + "loss": 2.8707, + "step": 534 + }, + { + "epoch": 1.74, + "learning_rate": 0.0001981119343959485, + "loss": 2.8615, + "step": 537 + }, + { + "epoch": 1.75, + "learning_rate": 0.00019807279552228816, + "loss": 2.8273, + "step": 540 + }, + { + "epoch": 1.76, + "learning_rate": 0.00019803325907883385, + "loss": 2.9113, + "step": 543 + }, + { + "epoch": 1.77, + "learning_rate": 0.00019799332522585936, + "loss": 2.8561, + "step": 546 + }, + { + "epoch": 1.78, + "learning_rate": 0.00019795299412524945, + "loss": 2.8562, + "step": 549 + }, + { + "epoch": 1.79, + "learning_rate": 0.00019791226594049932, + "loss": 2.8861, + "step": 552 + }, + { + "epoch": 1.8, + "learning_rate": 0.00019787114083671375, + "loss": 2.8196, + "step": 555 + }, + { + "epoch": 1.81, + "learning_rate": 0.00019782961898060677, + "loss": 2.8737, + "step": 558 + }, + { + "epoch": 1.82, + "learning_rate": 0.00019778770054050058, + "loss": 2.9101, + "step": 561 + }, + { + "epoch": 1.83, + "learning_rate": 0.00019774538568632515, + "loss": 2.9066, + "step": 564 + }, + { + "epoch": 1.84, + "learning_rate": 0.00019770267458961741, + "loss": 2.8889, + "step": 567 + }, + { + "epoch": 1.85, + "learning_rate": 0.00019765956742352062, + "loss": 2.8761, + "step": 570 + }, + { + "epoch": 1.86, + "learning_rate": 0.00019761606436278362, + "loss": 2.8484, + "step": 573 + }, + { + "epoch": 1.87, + "learning_rate": 0.00019757216558376013, + "loss": 2.8575, + "step": 576 + }, + { + "epoch": 1.88, + "learning_rate": 0.00019752787126440803, + "loss": 2.785, + "step": 579 + }, + { + "epoch": 1.89, + "learning_rate": 0.0001974831815842887, + "loss": 2.8186, + "step": 582 + }, + { + "epoch": 1.9, + "learning_rate": 0.00019743809672456618, + "loss": 2.8184, + "step": 585 + }, + { + "epoch": 1.91, + "learning_rate": 0.0001973926168680066, + "loss": 2.8115, + "step": 588 + }, + { + "epoch": 1.92, + "learning_rate": 0.00019734674219897718, + "loss": 2.8627, + "step": 591 + }, + { + "epoch": 1.93, + "learning_rate": 0.00019730047290344578, + "loss": 2.8416, + "step": 594 + }, + { + "epoch": 1.94, + "learning_rate": 0.0001972538091689799, + "loss": 2.8565, + "step": 597 + }, + { + "epoch": 1.95, + "learning_rate": 0.00019720675118474614, + "loss": 2.8373, + "step": 600 + }, + { + "epoch": 1.96, + "learning_rate": 0.00019715929914150923, + "loss": 2.8271, + "step": 603 + }, + { + "epoch": 1.97, + "learning_rate": 0.00019711145323163137, + "loss": 2.8548, + "step": 606 + }, + { + "epoch": 1.98, + "learning_rate": 0.00019706321364907142, + "loss": 2.8292, + "step": 609 + }, + { + "epoch": 1.99, + "learning_rate": 0.00019701458058938418, + "loss": 2.9203, + "step": 612 + }, + { + "epoch": 2.0, + "learning_rate": 0.00019696555424971943, + "loss": 2.8727, + "step": 615 + }, + { + "epoch": 2.0, + "eval_accuracy": 0.5661561892883697, + "eval_loss": 2.672184705734253, + "eval_runtime": 16.2659, + "eval_samples_per_second": 135.068, + "eval_steps_per_second": 67.565, + "step": 616 + }, + { + "epoch": 2.01, + "learning_rate": 0.0001969161348288213, + "loss": 2.7287, + "step": 618 + }, + { + "epoch": 2.02, + "learning_rate": 0.00019686632252702743, + "loss": 2.7983, + "step": 621 + }, + { + "epoch": 2.03, + "learning_rate": 0.00019681611754626807, + "loss": 2.7829, + "step": 624 + }, + { + "epoch": 2.04, + "learning_rate": 0.00019676552009006534, + "loss": 2.8671, + "step": 627 + }, + { + "epoch": 2.05, + "learning_rate": 0.0001967145303635324, + "loss": 2.7472, + "step": 630 + }, + { + "epoch": 2.06, + "learning_rate": 0.00019666314857337262, + "loss": 2.7506, + "step": 633 + }, + { + "epoch": 2.06, + "learning_rate": 0.00019661137492787867, + "loss": 2.7307, + "step": 636 + }, + { + "epoch": 2.07, + "learning_rate": 0.00019655920963693174, + "loss": 2.7653, + "step": 639 + }, + { + "epoch": 2.08, + "learning_rate": 0.00019650665291200082, + "loss": 2.8072, + "step": 642 + }, + { + "epoch": 2.09, + "learning_rate": 0.00019645370496614145, + "loss": 2.781, + "step": 645 + }, + { + "epoch": 2.1, + "learning_rate": 0.00019640036601399535, + "loss": 2.6695, + "step": 648 + }, + { + "epoch": 2.11, + "learning_rate": 0.00019634663627178918, + "loss": 2.7504, + "step": 651 + }, + { + "epoch": 2.12, + "learning_rate": 0.00019629251595733383, + "loss": 2.7793, + "step": 654 + }, + { + "epoch": 2.13, + "learning_rate": 0.00019623800529002347, + "loss": 2.7255, + "step": 657 + }, + { + "epoch": 2.14, + "learning_rate": 0.00019618310449083477, + "loss": 2.7955, + "step": 660 + }, + { + "epoch": 2.15, + "learning_rate": 0.00019612781378232583, + "loss": 2.6888, + "step": 663 + }, + { + "epoch": 2.16, + "learning_rate": 0.00019607213338863547, + "loss": 2.8287, + "step": 666 + }, + { + "epoch": 2.17, + "learning_rate": 0.0001960160635354821, + "loss": 2.7925, + "step": 669 + }, + { + "epoch": 2.18, + "learning_rate": 0.00019595960445016307, + "loss": 2.8107, + "step": 672 + }, + { + "epoch": 2.19, + "learning_rate": 0.00019590275636155352, + "loss": 2.7144, + "step": 675 + }, + { + "epoch": 2.2, + "learning_rate": 0.00019584551950010555, + "loss": 2.8271, + "step": 678 + }, + { + "epoch": 2.21, + "learning_rate": 0.00019578789409784727, + "loss": 2.685, + "step": 681 + }, + { + "epoch": 2.22, + "learning_rate": 0.00019572988038838194, + "loss": 2.7504, + "step": 684 + }, + { + "epoch": 2.23, + "learning_rate": 0.00019567147860688686, + "loss": 2.7186, + "step": 687 + }, + { + "epoch": 2.24, + "learning_rate": 0.00019561268899011256, + "loss": 2.7287, + "step": 690 + }, + { + "epoch": 2.25, + "learning_rate": 0.00019555351177638172, + "loss": 2.7973, + "step": 693 + }, + { + "epoch": 2.26, + "learning_rate": 0.00019549394720558833, + "loss": 2.7732, + "step": 696 + }, + { + "epoch": 2.27, + "learning_rate": 0.00019543399551919668, + "loss": 2.7949, + "step": 699 + }, + { + "epoch": 2.28, + "learning_rate": 0.0001953736569602403, + "loss": 2.893, + "step": 702 + }, + { + "epoch": 2.29, + "learning_rate": 0.00019531293177332102, + "loss": 2.7169, + "step": 705 + }, + { + "epoch": 2.3, + "learning_rate": 0.00019525182020460803, + "loss": 2.7442, + "step": 708 + }, + { + "epoch": 2.31, + "learning_rate": 0.0001951903225018369, + "loss": 2.7373, + "step": 711 + }, + { + "epoch": 2.32, + "learning_rate": 0.0001951284389143084, + "loss": 2.6366, + "step": 714 + }, + { + "epoch": 2.33, + "learning_rate": 0.00019506616969288768, + "loss": 2.7411, + "step": 717 + }, + { + "epoch": 2.34, + "learning_rate": 0.00019500351509000314, + "loss": 2.7378, + "step": 720 + }, + { + "epoch": 2.35, + "learning_rate": 0.00019494047535964553, + "loss": 2.6151, + "step": 723 + }, + { + "epoch": 2.36, + "learning_rate": 0.00019487705075736672, + "loss": 2.8224, + "step": 726 + }, + { + "epoch": 2.37, + "learning_rate": 0.00019481324154027894, + "loss": 2.8167, + "step": 729 + }, + { + "epoch": 2.38, + "learning_rate": 0.00019474904796705337, + "loss": 2.6986, + "step": 732 + }, + { + "epoch": 2.39, + "learning_rate": 0.0001946844702979195, + "loss": 2.7249, + "step": 735 + }, + { + "epoch": 2.4, + "learning_rate": 0.00019461950879466383, + "loss": 2.6904, + "step": 738 + }, + { + "epoch": 2.41, + "learning_rate": 0.0001945541637206287, + "loss": 2.7191, + "step": 741 + }, + { + "epoch": 2.42, + "learning_rate": 0.00019448843534071163, + "loss": 2.804, + "step": 744 + }, + { + "epoch": 2.43, + "learning_rate": 0.00019442232392136375, + "loss": 2.7587, + "step": 747 + }, + { + "epoch": 2.44, + "learning_rate": 0.00019435582973058915, + "loss": 2.6742, + "step": 750 + }, + { + "epoch": 2.44, + "learning_rate": 0.00019428895303794352, + "loss": 2.7017, + "step": 753 + }, + { + "epoch": 2.45, + "learning_rate": 0.00019422169411453317, + "loss": 2.7544, + "step": 756 + }, + { + "epoch": 2.46, + "learning_rate": 0.0001941540532330139, + "loss": 2.7186, + "step": 759 + }, + { + "epoch": 2.47, + "learning_rate": 0.00019408603066758988, + "loss": 2.7649, + "step": 762 + }, + { + "epoch": 2.48, + "learning_rate": 0.00019401762669401257, + "loss": 2.8109, + "step": 765 + }, + { + "epoch": 2.49, + "learning_rate": 0.00019394884158957965, + "loss": 2.7248, + "step": 768 + }, + { + "epoch": 2.5, + "learning_rate": 0.00019387967563313377, + "loss": 2.6719, + "step": 771 + }, + { + "epoch": 2.51, + "learning_rate": 0.00019381012910506146, + "loss": 2.7268, + "step": 774 + }, + { + "epoch": 2.52, + "learning_rate": 0.00019374020228729206, + "loss": 2.7121, + "step": 777 + }, + { + "epoch": 2.53, + "learning_rate": 0.0001936698954632966, + "loss": 2.6516, + "step": 780 + }, + { + "epoch": 2.54, + "learning_rate": 0.00019359920891808647, + "loss": 2.7795, + "step": 783 + }, + { + "epoch": 2.55, + "learning_rate": 0.00019352814293821248, + "loss": 2.7295, + "step": 786 + }, + { + "epoch": 2.56, + "learning_rate": 0.00019345669781176356, + "loss": 2.6901, + "step": 789 + }, + { + "epoch": 2.57, + "learning_rate": 0.00019338487382836565, + "loss": 2.7171, + "step": 792 + }, + { + "epoch": 2.58, + "learning_rate": 0.00019331267127918044, + "loss": 2.5934, + "step": 795 + }, + { + "epoch": 2.59, + "learning_rate": 0.00019324009045690438, + "loss": 2.677, + "step": 798 + }, + { + "epoch": 2.6, + "learning_rate": 0.00019316713165576726, + "loss": 2.7009, + "step": 801 + }, + { + "epoch": 2.61, + "learning_rate": 0.0001930937951715312, + "loss": 2.7384, + "step": 804 + }, + { + "epoch": 2.62, + "learning_rate": 0.00019302008130148932, + "loss": 2.6525, + "step": 807 + }, + { + "epoch": 2.63, + "learning_rate": 0.00019294599034446467, + "loss": 2.626, + "step": 810 + }, + { + "epoch": 2.64, + "learning_rate": 0.00019287152260080888, + "loss": 2.6826, + "step": 813 + }, + { + "epoch": 2.65, + "learning_rate": 0.00019279667837240105, + "loss": 2.7388, + "step": 816 + }, + { + "epoch": 2.66, + "learning_rate": 0.00019272145796264648, + "loss": 2.7281, + "step": 819 + }, + { + "epoch": 2.67, + "learning_rate": 0.0001926458616764754, + "loss": 2.5821, + "step": 822 + }, + { + "epoch": 2.68, + "learning_rate": 0.00019256988982034178, + "loss": 2.6877, + "step": 825 + }, + { + "epoch": 2.69, + "learning_rate": 0.00019249354270222218, + "loss": 2.6533, + "step": 828 + }, + { + "epoch": 2.7, + "learning_rate": 0.00019241682063161428, + "loss": 2.7017, + "step": 831 + }, + { + "epoch": 2.71, + "learning_rate": 0.00019233972391953584, + "loss": 2.5812, + "step": 834 + }, + { + "epoch": 2.72, + "learning_rate": 0.00019226225287852325, + "loss": 2.6331, + "step": 837 + }, + { + "epoch": 2.73, + "learning_rate": 0.0001921844078226305, + "loss": 2.6415, + "step": 840 + }, + { + "epoch": 2.74, + "learning_rate": 0.0001921061890674277, + "loss": 2.6382, + "step": 843 + }, + { + "epoch": 2.75, + "learning_rate": 0.0001920275969299998, + "loss": 2.6692, + "step": 846 + }, + { + "epoch": 2.76, + "learning_rate": 0.00019194863172894552, + "loss": 2.6522, + "step": 849 + }, + { + "epoch": 2.77, + "learning_rate": 0.00019186929378437582, + "loss": 2.6311, + "step": 852 + }, + { + "epoch": 2.78, + "learning_rate": 0.00019178958341791268, + "loss": 2.6869, + "step": 855 + }, + { + "epoch": 2.79, + "learning_rate": 0.00019170950095268792, + "loss": 2.6057, + "step": 858 + }, + { + "epoch": 2.8, + "learning_rate": 0.00019162904671334163, + "loss": 2.6813, + "step": 861 + }, + { + "epoch": 2.81, + "learning_rate": 0.00019154822102602115, + "loss": 2.6263, + "step": 864 + }, + { + "epoch": 2.81, + "learning_rate": 0.0001914670242183795, + "loss": 2.6506, + "step": 867 + }, + { + "epoch": 2.82, + "learning_rate": 0.00019138545661957426, + "loss": 2.6605, + "step": 870 + }, + { + "epoch": 2.83, + "learning_rate": 0.00019130351856026597, + "loss": 2.6763, + "step": 873 + }, + { + "epoch": 2.84, + "learning_rate": 0.00019122121037261719, + "loss": 2.6219, + "step": 876 + }, + { + "epoch": 2.85, + "learning_rate": 0.00019113853239029064, + "loss": 2.6208, + "step": 879 + }, + { + "epoch": 2.86, + "learning_rate": 0.00019105548494844835, + "loss": 2.7369, + "step": 882 + }, + { + "epoch": 2.87, + "learning_rate": 0.00019097206838374997, + "loss": 2.5989, + "step": 885 + }, + { + "epoch": 2.88, + "learning_rate": 0.0001908882830343515, + "loss": 2.5859, + "step": 888 + }, + { + "epoch": 2.89, + "learning_rate": 0.00019080412923990395, + "loss": 2.6183, + "step": 891 + }, + { + "epoch": 2.9, + "learning_rate": 0.00019071960734155194, + "loss": 2.6804, + "step": 894 + }, + { + "epoch": 2.91, + "learning_rate": 0.00019063471768193235, + "loss": 2.6772, + "step": 897 + }, + { + "epoch": 2.92, + "learning_rate": 0.00019054946060517283, + "loss": 2.6334, + "step": 900 + }, + { + "epoch": 2.93, + "learning_rate": 0.00019046383645689055, + "loss": 2.7288, + "step": 903 + }, + { + "epoch": 2.94, + "learning_rate": 0.00019037784558419065, + "loss": 2.6486, + "step": 906 + }, + { + "epoch": 2.95, + "learning_rate": 0.00019029148833566497, + "loss": 2.6382, + "step": 909 + }, + { + "epoch": 2.96, + "learning_rate": 0.00019020476506139057, + "loss": 2.5683, + "step": 912 + }, + { + "epoch": 2.97, + "learning_rate": 0.00019011767611292819, + "loss": 2.6047, + "step": 915 + }, + { + "epoch": 2.98, + "learning_rate": 0.00019003022184332116, + "loss": 2.6616, + "step": 918 + }, + { + "epoch": 2.99, + "learning_rate": 0.0001899424026070936, + "loss": 2.6049, + "step": 921 + }, + { + "epoch": 3.0, + "learning_rate": 0.00018985421876024916, + "loss": 2.6339, + "step": 924 + }, + { + "epoch": 3.0, + "eval_accuracy": 0.5878054172915932, + "eval_loss": 2.479712724685669, + "eval_runtime": 16.3394, + "eval_samples_per_second": 134.46, + "eval_steps_per_second": 67.261, + "step": 924 + }, + { + "epoch": 3.01, + "learning_rate": 0.0001897656706602696, + "loss": 2.5743, + "step": 927 + }, + { + "epoch": 3.02, + "learning_rate": 0.0001896767586661133, + "loss": 2.5385, + "step": 930 + }, + { + "epoch": 3.03, + "learning_rate": 0.0001895874831382138, + "loss": 2.6556, + "step": 933 + }, + { + "epoch": 3.04, + "learning_rate": 0.00018949784443847824, + "loss": 2.5895, + "step": 936 + }, + { + "epoch": 3.05, + "learning_rate": 0.00018940784293028617, + "loss": 2.5747, + "step": 939 + }, + { + "epoch": 3.06, + "learning_rate": 0.00018931747897848778, + "loss": 2.6032, + "step": 942 + }, + { + "epoch": 3.07, + "learning_rate": 0.00018922675294940256, + "loss": 2.5687, + "step": 945 + }, + { + "epoch": 3.08, + "learning_rate": 0.00018913566521081777, + "loss": 2.5473, + "step": 948 + }, + { + "epoch": 3.09, + "learning_rate": 0.00018904421613198712, + "loss": 2.6586, + "step": 951 + }, + { + "epoch": 3.1, + "learning_rate": 0.00018895240608362895, + "loss": 2.6245, + "step": 954 + }, + { + "epoch": 3.11, + "learning_rate": 0.000188860235437925, + "loss": 2.6062, + "step": 957 + }, + { + "epoch": 3.12, + "learning_rate": 0.00018876770456851877, + "loss": 2.4521, + "step": 960 + }, + { + "epoch": 3.13, + "learning_rate": 0.0001886748138505141, + "loss": 2.698, + "step": 963 + }, + { + "epoch": 3.14, + "learning_rate": 0.00018858156366047358, + "loss": 2.5416, + "step": 966 + }, + { + "epoch": 3.15, + "learning_rate": 0.00018848795437641697, + "loss": 2.6364, + "step": 969 + }, + { + "epoch": 3.16, + "learning_rate": 0.00018839398637781972, + "loss": 2.5949, + "step": 972 + }, + { + "epoch": 3.17, + "learning_rate": 0.00018829966004561163, + "loss": 2.5397, + "step": 975 + }, + { + "epoch": 3.18, + "learning_rate": 0.00018820497576217492, + "loss": 2.5792, + "step": 978 + }, + { + "epoch": 3.19, + "learning_rate": 0.00018810993391134295, + "loss": 2.5549, + "step": 981 + }, + { + "epoch": 3.19, + "learning_rate": 0.00018801453487839862, + "loss": 2.6141, + "step": 984 + }, + { + "epoch": 3.2, + "learning_rate": 0.00018791877905007277, + "loss": 2.6055, + "step": 987 + }, + { + "epoch": 3.21, + "learning_rate": 0.00018782266681454255, + "loss": 2.5834, + "step": 990 + }, + { + "epoch": 3.22, + "learning_rate": 0.00018772619856143009, + "loss": 2.6272, + "step": 993 + }, + { + "epoch": 3.23, + "learning_rate": 0.0001876293746818006, + "loss": 2.5862, + "step": 996 + }, + { + "epoch": 3.24, + "learning_rate": 0.000187532195568161, + "loss": 2.6453, + "step": 999 + }, + { + "epoch": 3.25, + "learning_rate": 0.00018743466161445823, + "loss": 2.5199, + "step": 1002 + }, + { + "epoch": 3.26, + "learning_rate": 0.00018733677321607775, + "loss": 2.5887, + "step": 1005 + }, + { + "epoch": 3.27, + "learning_rate": 0.0001872385307698418, + "loss": 2.5769, + "step": 1008 + }, + { + "epoch": 3.28, + "learning_rate": 0.00018713993467400796, + "loss": 2.6303, + "step": 1011 + }, + { + "epoch": 3.29, + "learning_rate": 0.00018704098532826735, + "loss": 2.5144, + "step": 1014 + }, + { + "epoch": 3.3, + "learning_rate": 0.0001869416831337432, + "loss": 2.5568, + "step": 1017 + }, + { + "epoch": 3.31, + "learning_rate": 0.00018684202849298897, + "loss": 2.6413, + "step": 1020 + }, + { + "epoch": 3.32, + "learning_rate": 0.00018674202180998708, + "loss": 2.5877, + "step": 1023 + }, + { + "epoch": 3.33, + "learning_rate": 0.0001866416634901469, + "loss": 2.5414, + "step": 1026 + }, + { + "epoch": 3.34, + "learning_rate": 0.00018654095394030334, + "loss": 2.5394, + "step": 1029 + }, + { + "epoch": 3.35, + "learning_rate": 0.00018643989356871514, + "loss": 2.5929, + "step": 1032 + }, + { + "epoch": 3.36, + "learning_rate": 0.00018633848278506323, + "loss": 2.6068, + "step": 1035 + }, + { + "epoch": 3.37, + "learning_rate": 0.00018623672200044898, + "loss": 2.6195, + "step": 1038 + }, + { + "epoch": 3.38, + "learning_rate": 0.00018613461162739263, + "loss": 2.5121, + "step": 1041 + }, + { + "epoch": 3.39, + "learning_rate": 0.00018603215207983165, + "loss": 2.5959, + "step": 1044 + }, + { + "epoch": 3.4, + "learning_rate": 0.0001859293437731189, + "loss": 2.5925, + "step": 1047 + }, + { + "epoch": 3.41, + "learning_rate": 0.00018582618712402113, + "loss": 2.608, + "step": 1050 + }, + { + "epoch": 3.42, + "learning_rate": 0.00018572268255071718, + "loss": 2.6188, + "step": 1053 + }, + { + "epoch": 3.43, + "learning_rate": 0.0001856188304727963, + "loss": 2.5683, + "step": 1056 + }, + { + "epoch": 3.44, + "learning_rate": 0.00018551463131125649, + "loss": 2.5835, + "step": 1059 + }, + { + "epoch": 3.45, + "learning_rate": 0.00018541008548850273, + "loss": 2.5374, + "step": 1062 + }, + { + "epoch": 3.46, + "learning_rate": 0.0001853051934283453, + "loss": 2.5489, + "step": 1065 + }, + { + "epoch": 3.47, + "learning_rate": 0.00018519995555599817, + "loss": 2.4947, + "step": 1068 + }, + { + "epoch": 3.48, + "learning_rate": 0.000185094372298077, + "loss": 2.4604, + "step": 1071 + }, + { + "epoch": 3.49, + "learning_rate": 0.00018498844408259773, + "loss": 2.5453, + "step": 1074 + }, + { + "epoch": 3.5, + "learning_rate": 0.00018488217133897462, + "loss": 2.5738, + "step": 1077 + }, + { + "epoch": 3.51, + "learning_rate": 0.00018477555449801863, + "loss": 2.5437, + "step": 1080 + }, + { + "epoch": 3.52, + "learning_rate": 0.00018466859399193555, + "loss": 2.46, + "step": 1083 + }, + { + "epoch": 3.53, + "learning_rate": 0.00018456129025432442, + "loss": 2.5457, + "step": 1086 + }, + { + "epoch": 3.54, + "learning_rate": 0.00018445364372017564, + "loss": 2.5188, + "step": 1089 + }, + { + "epoch": 3.55, + "learning_rate": 0.00018434565482586924, + "loss": 2.5652, + "step": 1092 + }, + { + "epoch": 3.56, + "learning_rate": 0.00018423732400917316, + "loss": 2.544, + "step": 1095 + }, + { + "epoch": 3.56, + "learning_rate": 0.00018412865170924135, + "loss": 2.6398, + "step": 1098 + }, + { + "epoch": 3.57, + "learning_rate": 0.00018401963836661218, + "loss": 2.6341, + "step": 1101 + }, + { + "epoch": 3.58, + "learning_rate": 0.00018391028442320644, + "loss": 2.5351, + "step": 1104 + }, + { + "epoch": 3.59, + "learning_rate": 0.0001838005903223257, + "loss": 2.5473, + "step": 1107 + }, + { + "epoch": 3.6, + "learning_rate": 0.00018369055650865052, + "loss": 2.5146, + "step": 1110 + }, + { + "epoch": 3.61, + "learning_rate": 0.00018358018342823855, + "loss": 2.5715, + "step": 1113 + }, + { + "epoch": 3.62, + "learning_rate": 0.0001834694715285227, + "loss": 2.5376, + "step": 1116 + }, + { + "epoch": 3.63, + "learning_rate": 0.00018335842125830954, + "loss": 2.6296, + "step": 1119 + }, + { + "epoch": 3.64, + "learning_rate": 0.00018324703306777718, + "loss": 2.4321, + "step": 1122 + }, + { + "epoch": 3.65, + "learning_rate": 0.00018313530740847375, + "loss": 2.5319, + "step": 1125 + }, + { + "epoch": 3.66, + "learning_rate": 0.0001830232447333153, + "loss": 2.5231, + "step": 1128 + }, + { + "epoch": 3.67, + "learning_rate": 0.00018291084549658412, + "loss": 2.5694, + "step": 1131 + }, + { + "epoch": 3.68, + "learning_rate": 0.00018279811015392685, + "loss": 2.499, + "step": 1134 + }, + { + "epoch": 3.69, + "learning_rate": 0.00018268503916235273, + "loss": 2.5255, + "step": 1137 + }, + { + "epoch": 3.7, + "learning_rate": 0.00018257163298023151, + "loss": 2.5671, + "step": 1140 + }, + { + "epoch": 3.71, + "learning_rate": 0.0001824578920672919, + "loss": 2.4801, + "step": 1143 + }, + { + "epoch": 3.72, + "learning_rate": 0.00018234381688461942, + "loss": 2.5006, + "step": 1146 + }, + { + "epoch": 3.73, + "learning_rate": 0.00018222940789465475, + "loss": 2.5033, + "step": 1149 + }, + { + "epoch": 3.74, + "learning_rate": 0.00018211466556119173, + "loss": 2.565, + "step": 1152 + }, + { + "epoch": 3.75, + "learning_rate": 0.0001819995903493755, + "loss": 2.5869, + "step": 1155 + }, + { + "epoch": 3.76, + "learning_rate": 0.00018188418272570061, + "loss": 2.422, + "step": 1158 + }, + { + "epoch": 3.77, + "learning_rate": 0.00018176844315800924, + "loss": 2.4295, + "step": 1161 + }, + { + "epoch": 3.78, + "learning_rate": 0.0001816523721154892, + "loss": 2.4252, + "step": 1164 + }, + { + "epoch": 3.79, + "learning_rate": 0.00018153597006867188, + "loss": 2.5694, + "step": 1167 + }, + { + "epoch": 3.8, + "learning_rate": 0.00018141923748943073, + "loss": 2.4952, + "step": 1170 + }, + { + "epoch": 3.81, + "learning_rate": 0.00018130217485097893, + "loss": 2.4748, + "step": 1173 + }, + { + "epoch": 3.82, + "learning_rate": 0.00018118478262786782, + "loss": 2.5343, + "step": 1176 + }, + { + "epoch": 3.83, + "learning_rate": 0.0001810670612959847, + "loss": 2.4971, + "step": 1179 + }, + { + "epoch": 3.84, + "learning_rate": 0.00018094901133255105, + "loss": 2.4903, + "step": 1182 + }, + { + "epoch": 3.85, + "learning_rate": 0.00018083063321612056, + "loss": 2.5106, + "step": 1185 + }, + { + "epoch": 3.86, + "learning_rate": 0.0001807119274265773, + "loss": 2.4929, + "step": 1188 + }, + { + "epoch": 3.87, + "learning_rate": 0.00018059289444513347, + "loss": 2.5104, + "step": 1191 + }, + { + "epoch": 3.88, + "learning_rate": 0.00018047353475432782, + "loss": 2.4528, + "step": 1194 + }, + { + "epoch": 3.89, + "learning_rate": 0.00018035384883802346, + "loss": 2.4571, + "step": 1197 + }, + { + "epoch": 3.9, + "learning_rate": 0.00018023383718140593, + "loss": 2.5601, + "step": 1200 + }, + { + "epoch": 3.91, + "learning_rate": 0.00018011350027098127, + "loss": 2.4553, + "step": 1203 + }, + { + "epoch": 3.92, + "learning_rate": 0.00017999283859457412, + "loss": 2.5075, + "step": 1206 + }, + { + "epoch": 3.93, + "learning_rate": 0.0001798718526413256, + "loss": 2.5526, + "step": 1209 + }, + { + "epoch": 3.94, + "learning_rate": 0.00017975054290169138, + "loss": 2.4881, + "step": 1212 + }, + { + "epoch": 3.94, + "learning_rate": 0.0001796289098674397, + "loss": 2.4973, + "step": 1215 + }, + { + "epoch": 3.95, + "learning_rate": 0.00017950695403164943, + "loss": 2.4831, + "step": 1218 + }, + { + "epoch": 3.96, + "learning_rate": 0.000179384675888708, + "loss": 2.5187, + "step": 1221 + }, + { + "epoch": 3.97, + "learning_rate": 0.0001792620759343094, + "loss": 2.4949, + "step": 1224 + }, + { + "epoch": 3.98, + "learning_rate": 0.00017913915466545217, + "loss": 2.533, + "step": 1227 + }, + { + "epoch": 3.99, + "learning_rate": 0.00017901591258043747, + "loss": 2.5053, + "step": 1230 + }, + { + "epoch": 4.0, + "eval_accuracy": 0.6025211491749728, + "eval_loss": 2.383329153060913, + "eval_runtime": 16.3176, + "eval_samples_per_second": 134.64, + "eval_steps_per_second": 67.351, + "step": 1232 + }, + { + "epoch": 4.0, + "learning_rate": 0.0001788923501788669, + "loss": 2.5555, + "step": 1233 + }, + { + "epoch": 4.01, + "learning_rate": 0.00017876846796164068, + "loss": 2.4955, + "step": 1236 + }, + { + "epoch": 4.02, + "learning_rate": 0.0001786442664309554, + "loss": 2.5338, + "step": 1239 + }, + { + "epoch": 4.03, + "learning_rate": 0.0001785197460903021, + "loss": 2.4958, + "step": 1242 + }, + { + "epoch": 4.04, + "learning_rate": 0.0001783949074444643, + "loss": 2.4291, + "step": 1245 + }, + { + "epoch": 4.05, + "learning_rate": 0.00017826975099951583, + "loss": 2.4112, + "step": 1248 + }, + { + "epoch": 4.06, + "learning_rate": 0.0001781442772628188, + "loss": 2.4166, + "step": 1251 + }, + { + "epoch": 4.07, + "learning_rate": 0.00017801848674302154, + "loss": 2.5571, + "step": 1254 + }, + { + "epoch": 4.08, + "learning_rate": 0.00017789237995005668, + "loss": 2.4778, + "step": 1257 + }, + { + "epoch": 4.09, + "learning_rate": 0.0001777659573951388, + "loss": 2.5491, + "step": 1260 + }, + { + "epoch": 4.1, + "learning_rate": 0.00017763921959076273, + "loss": 2.4311, + "step": 1263 + }, + { + "epoch": 4.11, + "learning_rate": 0.00017751216705070105, + "loss": 2.4439, + "step": 1266 + }, + { + "epoch": 4.12, + "learning_rate": 0.00017738480029000234, + "loss": 2.4623, + "step": 1269 + }, + { + "epoch": 4.13, + "learning_rate": 0.000177257119824989, + "loss": 2.5291, + "step": 1272 + }, + { + "epoch": 4.14, + "learning_rate": 0.00017712912617325502, + "loss": 2.4934, + "step": 1275 + }, + { + "epoch": 4.15, + "learning_rate": 0.0001770008198536641, + "loss": 2.5712, + "step": 1278 + }, + { + "epoch": 4.16, + "learning_rate": 0.0001768722013863474, + "loss": 2.5426, + "step": 1281 + }, + { + "epoch": 4.17, + "learning_rate": 0.00017674327129270148, + "loss": 2.4619, + "step": 1284 + }, + { + "epoch": 4.18, + "learning_rate": 0.00017661403009538616, + "loss": 2.484, + "step": 1287 + }, + { + "epoch": 4.19, + "learning_rate": 0.00017648447831832242, + "loss": 2.4566, + "step": 1290 + }, + { + "epoch": 4.2, + "learning_rate": 0.0001763546164866903, + "loss": 2.5267, + "step": 1293 + }, + { + "epoch": 4.21, + "learning_rate": 0.00017622444512692672, + "loss": 2.4614, + "step": 1296 + }, + { + "epoch": 4.22, + "learning_rate": 0.00017609396476672343, + "loss": 2.4796, + "step": 1299 + }, + { + "epoch": 4.23, + "learning_rate": 0.0001759631759350247, + "loss": 2.3971, + "step": 1302 + }, + { + "epoch": 4.24, + "learning_rate": 0.0001758320791620254, + "loss": 2.4879, + "step": 1305 + }, + { + "epoch": 4.25, + "learning_rate": 0.0001757006749791687, + "loss": 2.5324, + "step": 1308 + }, + { + "epoch": 4.26, + "learning_rate": 0.00017556896391914394, + "loss": 2.3853, + "step": 1311 + }, + { + "epoch": 4.27, + "learning_rate": 0.0001754369465158845, + "loss": 2.455, + "step": 1314 + }, + { + "epoch": 4.28, + "learning_rate": 0.0001753046233045656, + "loss": 2.4964, + "step": 1317 + }, + { + "epoch": 4.29, + "learning_rate": 0.0001751719948216022, + "loss": 2.4615, + "step": 1320 + }, + { + "epoch": 4.3, + "learning_rate": 0.00017503906160464672, + "loss": 2.4745, + "step": 1323 + }, + { + "epoch": 4.31, + "learning_rate": 0.00017490582419258697, + "loss": 2.4068, + "step": 1326 + }, + { + "epoch": 4.31, + "learning_rate": 0.00017477228312554388, + "loss": 2.4827, + "step": 1329 + }, + { + "epoch": 4.32, + "learning_rate": 0.00017463843894486937, + "loss": 2.4002, + "step": 1332 + }, + { + "epoch": 4.33, + "learning_rate": 0.00017450429219314408, + "loss": 2.3769, + "step": 1335 + }, + { + "epoch": 4.34, + "learning_rate": 0.00017436984341417532, + "loss": 2.4319, + "step": 1338 + }, + { + "epoch": 4.35, + "learning_rate": 0.00017423509315299458, + "loss": 2.4803, + "step": 1341 + }, + { + "epoch": 4.36, + "learning_rate": 0.00017410004195585573, + "loss": 2.4286, + "step": 1344 + }, + { + "epoch": 4.37, + "learning_rate": 0.00017396469037023242, + "loss": 2.4884, + "step": 1347 + }, + { + "epoch": 4.38, + "learning_rate": 0.00017382903894481611, + "loss": 2.4376, + "step": 1350 + }, + { + "epoch": 4.39, + "learning_rate": 0.00017369308822951367, + "loss": 2.4966, + "step": 1353 + }, + { + "epoch": 4.4, + "learning_rate": 0.00017355683877544532, + "loss": 2.5422, + "step": 1356 + }, + { + "epoch": 4.41, + "learning_rate": 0.00017342029113494233, + "loss": 2.4287, + "step": 1359 + }, + { + "epoch": 4.42, + "learning_rate": 0.00017328344586154467, + "loss": 2.445, + "step": 1362 + }, + { + "epoch": 4.43, + "learning_rate": 0.0001731463035099989, + "loss": 2.4555, + "step": 1365 + }, + { + "epoch": 4.44, + "learning_rate": 0.00017300886463625595, + "loss": 2.4885, + "step": 1368 + }, + { + "epoch": 4.45, + "learning_rate": 0.00017287112979746868, + "loss": 2.4137, + "step": 1371 + }, + { + "epoch": 4.46, + "learning_rate": 0.00017273309955198984, + "loss": 2.494, + "step": 1374 + }, + { + "epoch": 4.47, + "learning_rate": 0.0001725947744593697, + "loss": 2.4098, + "step": 1377 + }, + { + "epoch": 4.48, + "learning_rate": 0.0001724561550803537, + "loss": 2.4887, + "step": 1380 + }, + { + "epoch": 4.49, + "learning_rate": 0.00017231724197688033, + "loss": 2.4888, + "step": 1383 + }, + { + "epoch": 4.5, + "learning_rate": 0.0001721780357120788, + "loss": 2.5013, + "step": 1386 + }, + { + "epoch": 4.51, + "learning_rate": 0.00017203853685026675, + "loss": 2.422, + "step": 1389 + }, + { + "epoch": 4.52, + "learning_rate": 0.00017189874595694788, + "loss": 2.4682, + "step": 1392 + }, + { + "epoch": 4.53, + "learning_rate": 0.00017175866359880982, + "loss": 2.3751, + "step": 1395 + }, + { + "epoch": 4.54, + "learning_rate": 0.00017161829034372168, + "loss": 2.4773, + "step": 1398 + }, + { + "epoch": 4.55, + "learning_rate": 0.00017147762676073187, + "loss": 2.4837, + "step": 1401 + }, + { + "epoch": 4.56, + "learning_rate": 0.0001713366734200657, + "loss": 2.4487, + "step": 1404 + }, + { + "epoch": 4.57, + "learning_rate": 0.00017119543089312317, + "loss": 2.4272, + "step": 1407 + }, + { + "epoch": 4.58, + "learning_rate": 0.00017105389975247647, + "loss": 2.4699, + "step": 1410 + }, + { + "epoch": 4.59, + "learning_rate": 0.00017091208057186792, + "loss": 2.4837, + "step": 1413 + }, + { + "epoch": 4.6, + "learning_rate": 0.00017076997392620737, + "loss": 2.4351, + "step": 1416 + }, + { + "epoch": 4.61, + "learning_rate": 0.0001706275803915701, + "loss": 2.3902, + "step": 1419 + }, + { + "epoch": 4.62, + "learning_rate": 0.00017048490054519434, + "loss": 2.3355, + "step": 1422 + }, + { + "epoch": 4.63, + "learning_rate": 0.00017034193496547902, + "loss": 2.3968, + "step": 1425 + }, + { + "epoch": 4.64, + "learning_rate": 0.00017019868423198134, + "loss": 2.4197, + "step": 1428 + }, + { + "epoch": 4.65, + "learning_rate": 0.00017005514892541444, + "loss": 2.4192, + "step": 1431 + }, + { + "epoch": 4.66, + "learning_rate": 0.00016991132962764516, + "loss": 2.359, + "step": 1434 + }, + { + "epoch": 4.67, + "learning_rate": 0.00016976722692169148, + "loss": 2.4068, + "step": 1437 + }, + { + "epoch": 4.68, + "learning_rate": 0.00016962284139172037, + "loss": 2.4527, + "step": 1440 + }, + { + "epoch": 4.69, + "learning_rate": 0.00016947817362304525, + "loss": 2.4723, + "step": 1443 + }, + { + "epoch": 4.69, + "learning_rate": 0.00016933322420212372, + "loss": 2.4029, + "step": 1446 + }, + { + "epoch": 4.7, + "learning_rate": 0.00016918799371655512, + "loss": 2.4574, + "step": 1449 + }, + { + "epoch": 4.71, + "learning_rate": 0.00016904248275507818, + "loss": 2.4426, + "step": 1452 + }, + { + "epoch": 4.72, + "learning_rate": 0.00016889669190756868, + "loss": 2.4689, + "step": 1455 + }, + { + "epoch": 4.73, + "learning_rate": 0.00016875062176503693, + "loss": 2.407, + "step": 1458 + }, + { + "epoch": 4.74, + "learning_rate": 0.0001686042729196255, + "loss": 2.3877, + "step": 1461 + }, + { + "epoch": 4.75, + "learning_rate": 0.0001684576459646068, + "loss": 2.394, + "step": 1464 + }, + { + "epoch": 4.76, + "learning_rate": 0.00016831074149438056, + "loss": 2.4613, + "step": 1467 + }, + { + "epoch": 4.77, + "learning_rate": 0.00016816356010447163, + "loss": 2.3824, + "step": 1470 + }, + { + "epoch": 4.78, + "learning_rate": 0.0001680161023915273, + "loss": 2.5161, + "step": 1473 + }, + { + "epoch": 4.79, + "learning_rate": 0.00016786836895331514, + "loss": 2.3828, + "step": 1476 + }, + { + "epoch": 4.8, + "learning_rate": 0.00016772036038872039, + "loss": 2.4413, + "step": 1479 + }, + { + "epoch": 4.81, + "learning_rate": 0.0001675720772977437, + "loss": 2.3981, + "step": 1482 + }, + { + "epoch": 4.82, + "learning_rate": 0.00016742352028149843, + "loss": 2.4631, + "step": 1485 + }, + { + "epoch": 4.83, + "learning_rate": 0.0001672746899422086, + "loss": 2.4012, + "step": 1488 + }, + { + "epoch": 4.84, + "learning_rate": 0.000167125586883206, + "loss": 2.4043, + "step": 1491 + }, + { + "epoch": 4.85, + "learning_rate": 0.00016697621170892824, + "loss": 2.3889, + "step": 1494 + }, + { + "epoch": 4.86, + "learning_rate": 0.00016682656502491576, + "loss": 2.4473, + "step": 1497 + }, + { + "epoch": 4.87, + "learning_rate": 0.0001666766474378099, + "loss": 2.5165, + "step": 1500 + }, + { + "epoch": 4.88, + "learning_rate": 0.00016652645955535006, + "loss": 2.5155, + "step": 1503 + }, + { + "epoch": 4.89, + "learning_rate": 0.00016637600198637133, + "loss": 2.3923, + "step": 1506 + }, + { + "epoch": 4.9, + "learning_rate": 0.00016622527534080218, + "loss": 2.4353, + "step": 1509 + }, + { + "epoch": 4.91, + "learning_rate": 0.00016607428022966182, + "loss": 2.3346, + "step": 1512 + }, + { + "epoch": 4.92, + "learning_rate": 0.00016592301726505771, + "loss": 2.3873, + "step": 1515 + }, + { + "epoch": 4.93, + "learning_rate": 0.00016577148706018328, + "loss": 2.4799, + "step": 1518 + }, + { + "epoch": 4.94, + "learning_rate": 0.00016561969022931515, + "loss": 2.4228, + "step": 1521 + }, + { + "epoch": 4.95, + "learning_rate": 0.0001654676273878109, + "loss": 2.4024, + "step": 1524 + }, + { + "epoch": 4.96, + "learning_rate": 0.00016531529915210643, + "loss": 2.386, + "step": 1527 + }, + { + "epoch": 4.97, + "learning_rate": 0.0001651627061397135, + "loss": 2.4365, + "step": 1530 + }, + { + "epoch": 4.98, + "learning_rate": 0.00016500984896921725, + "loss": 2.3997, + "step": 1533 + }, + { + "epoch": 4.99, + "learning_rate": 0.00016485672826027363, + "loss": 2.3659, + "step": 1536 + }, + { + "epoch": 5.0, + "learning_rate": 0.00016470334463360698, + "loss": 2.4531, + "step": 1539 + }, + { + "epoch": 5.0, + "eval_accuracy": 0.6106430794745761, + "eval_loss": 2.3084843158721924, + "eval_runtime": 16.3383, + "eval_samples_per_second": 134.469, + "eval_steps_per_second": 67.265, + "step": 1540 + }, + { + "epoch": 5.01, + "learning_rate": 0.00016454969871100743, + "loss": 2.376, + "step": 1542 + }, + { + "epoch": 5.02, + "learning_rate": 0.0001643957911153284, + "loss": 2.2957, + "step": 1545 + }, + { + "epoch": 5.03, + "learning_rate": 0.00016424162247048412, + "loss": 2.3557, + "step": 1548 + }, + { + "epoch": 5.04, + "learning_rate": 0.00016408719340144705, + "loss": 2.3722, + "step": 1551 + }, + { + "epoch": 5.05, + "learning_rate": 0.00016393250453424534, + "loss": 2.3903, + "step": 1554 + }, + { + "epoch": 5.06, + "learning_rate": 0.0001637775564959604, + "loss": 2.3326, + "step": 1557 + }, + { + "epoch": 5.06, + "learning_rate": 0.00016362234991472416, + "loss": 2.3791, + "step": 1560 + }, + { + "epoch": 5.07, + "learning_rate": 0.00016346688541971668, + "loss": 2.4608, + "step": 1563 + }, + { + "epoch": 5.08, + "learning_rate": 0.00016331116364116363, + "loss": 2.3049, + "step": 1566 + }, + { + "epoch": 5.09, + "learning_rate": 0.00016315518521033354, + "loss": 2.437, + "step": 1569 + }, + { + "epoch": 5.1, + "learning_rate": 0.00016299895075953547, + "loss": 2.4088, + "step": 1572 + }, + { + "epoch": 5.11, + "learning_rate": 0.0001628424609221163, + "loss": 2.4097, + "step": 1575 + }, + { + "epoch": 5.12, + "learning_rate": 0.00016268571633245812, + "loss": 2.3635, + "step": 1578 + }, + { + "epoch": 5.13, + "learning_rate": 0.00016252871762597592, + "loss": 2.4373, + "step": 1581 + }, + { + "epoch": 5.14, + "learning_rate": 0.00016237146543911463, + "loss": 2.2713, + "step": 1584 + }, + { + "epoch": 5.15, + "learning_rate": 0.00016221396040934694, + "loss": 2.4049, + "step": 1587 + }, + { + "epoch": 5.16, + "learning_rate": 0.00016205620317517034, + "loss": 2.3796, + "step": 1590 + }, + { + "epoch": 5.17, + "learning_rate": 0.00016189819437610484, + "loss": 2.3642, + "step": 1593 + }, + { + "epoch": 5.18, + "learning_rate": 0.00016173993465269022, + "loss": 2.3668, + "step": 1596 + }, + { + "epoch": 5.19, + "learning_rate": 0.00016158142464648342, + "loss": 2.4196, + "step": 1599 + }, + { + "epoch": 5.2, + "learning_rate": 0.00016142266500005604, + "loss": 2.488, + "step": 1602 + }, + { + "epoch": 5.21, + "learning_rate": 0.00016126365635699166, + "loss": 2.3974, + "step": 1605 + }, + { + "epoch": 5.22, + "learning_rate": 0.00016110439936188318, + "loss": 2.4516, + "step": 1608 + }, + { + "epoch": 5.23, + "learning_rate": 0.00016094489466033043, + "loss": 2.3589, + "step": 1611 + }, + { + "epoch": 5.24, + "learning_rate": 0.0001607851428989372, + "loss": 2.4077, + "step": 1614 + }, + { + "epoch": 5.25, + "learning_rate": 0.00016062514472530898, + "loss": 2.3902, + "step": 1617 + }, + { + "epoch": 5.26, + "learning_rate": 0.0001604649007880501, + "loss": 2.3319, + "step": 1620 + }, + { + "epoch": 5.27, + "learning_rate": 0.00016030441173676117, + "loss": 2.3729, + "step": 1623 + }, + { + "epoch": 5.28, + "learning_rate": 0.00016014367822203646, + "loss": 2.3052, + "step": 1626 + }, + { + "epoch": 5.29, + "learning_rate": 0.0001599827008954613, + "loss": 2.3613, + "step": 1629 + }, + { + "epoch": 5.3, + "learning_rate": 0.0001598214804096093, + "loss": 2.4415, + "step": 1632 + }, + { + "epoch": 5.31, + "learning_rate": 0.00015966001741803983, + "loss": 2.3959, + "step": 1635 + }, + { + "epoch": 5.32, + "learning_rate": 0.0001594983125752954, + "loss": 2.3294, + "step": 1638 + }, + { + "epoch": 5.33, + "learning_rate": 0.0001593363665368988, + "loss": 2.3211, + "step": 1641 + }, + { + "epoch": 5.34, + "learning_rate": 0.00015917417995935077, + "loss": 2.3113, + "step": 1644 + }, + { + "epoch": 5.35, + "learning_rate": 0.00015901175350012698, + "loss": 2.3507, + "step": 1647 + }, + { + "epoch": 5.36, + "learning_rate": 0.00015884908781767565, + "loss": 2.3533, + "step": 1650 + }, + { + "epoch": 5.37, + "learning_rate": 0.00015868618357141472, + "loss": 2.4636, + "step": 1653 + }, + { + "epoch": 5.38, + "learning_rate": 0.00015852304142172923, + "loss": 2.4222, + "step": 1656 + }, + { + "epoch": 5.39, + "learning_rate": 0.00015835966202996867, + "loss": 2.4257, + "step": 1659 + }, + { + "epoch": 5.4, + "learning_rate": 0.00015819604605844418, + "loss": 2.3802, + "step": 1662 + }, + { + "epoch": 5.41, + "learning_rate": 0.00015803219417042608, + "loss": 2.2824, + "step": 1665 + }, + { + "epoch": 5.42, + "learning_rate": 0.00015786810703014096, + "loss": 2.4089, + "step": 1668 + }, + { + "epoch": 5.43, + "learning_rate": 0.0001577037853027691, + "loss": 2.3537, + "step": 1671 + }, + { + "epoch": 5.44, + "learning_rate": 0.00015753922965444184, + "loss": 2.3758, + "step": 1674 + }, + { + "epoch": 5.44, + "learning_rate": 0.0001573744407522386, + "loss": 2.3749, + "step": 1677 + }, + { + "epoch": 5.45, + "learning_rate": 0.00015720941926418455, + "loss": 2.3841, + "step": 1680 + }, + { + "epoch": 5.46, + "learning_rate": 0.0001570441658592477, + "loss": 2.3546, + "step": 1683 + }, + { + "epoch": 5.47, + "learning_rate": 0.00015687868120733614, + "loss": 2.3845, + "step": 1686 + }, + { + "epoch": 5.48, + "learning_rate": 0.00015671296597929535, + "loss": 2.2959, + "step": 1689 + }, + { + "epoch": 5.49, + "learning_rate": 0.00015654702084690568, + "loss": 2.4619, + "step": 1692 + }, + { + "epoch": 5.5, + "learning_rate": 0.0001563808464828794, + "loss": 2.2943, + "step": 1695 + }, + { + "epoch": 5.51, + "learning_rate": 0.00015621444356085803, + "loss": 2.3027, + "step": 1698 + }, + { + "epoch": 5.52, + "learning_rate": 0.00015604781275540956, + "loss": 2.4349, + "step": 1701 + }, + { + "epoch": 5.53, + "learning_rate": 0.00015588095474202595, + "loss": 2.3143, + "step": 1704 + }, + { + "epoch": 5.54, + "learning_rate": 0.00015571387019712004, + "loss": 2.3555, + "step": 1707 + }, + { + "epoch": 5.55, + "learning_rate": 0.0001555465597980231, + "loss": 2.4337, + "step": 1710 + }, + { + "epoch": 5.56, + "learning_rate": 0.00015537902422298197, + "loss": 2.393, + "step": 1713 + }, + { + "epoch": 5.57, + "learning_rate": 0.00015521126415115623, + "loss": 2.3029, + "step": 1716 + }, + { + "epoch": 5.58, + "learning_rate": 0.00015504328026261566, + "loss": 2.3065, + "step": 1719 + }, + { + "epoch": 5.59, + "learning_rate": 0.0001548750732383372, + "loss": 2.3214, + "step": 1722 + }, + { + "epoch": 5.6, + "learning_rate": 0.00015470664376020246, + "loss": 2.3422, + "step": 1725 + }, + { + "epoch": 5.61, + "learning_rate": 0.00015453799251099478, + "loss": 2.3227, + "step": 1728 + }, + { + "epoch": 5.62, + "learning_rate": 0.00015436912017439657, + "loss": 2.3816, + "step": 1731 + }, + { + "epoch": 5.63, + "learning_rate": 0.00015420002743498645, + "loss": 2.3966, + "step": 1734 + }, + { + "epoch": 5.64, + "learning_rate": 0.00015403071497823652, + "loss": 2.2734, + "step": 1737 + }, + { + "epoch": 5.65, + "learning_rate": 0.0001538611834905096, + "loss": 2.384, + "step": 1740 + }, + { + "epoch": 5.66, + "learning_rate": 0.00015369143365905635, + "loss": 2.3495, + "step": 1743 + }, + { + "epoch": 5.67, + "learning_rate": 0.00015352146617201266, + "loss": 2.3252, + "step": 1746 + }, + { + "epoch": 5.68, + "learning_rate": 0.00015335128171839671, + "loss": 2.349, + "step": 1749 + }, + { + "epoch": 5.69, + "learning_rate": 0.00015318088098810622, + "loss": 2.3845, + "step": 1752 + }, + { + "epoch": 5.7, + "learning_rate": 0.0001530102646719156, + "loss": 2.3248, + "step": 1755 + }, + { + "epoch": 5.71, + "learning_rate": 0.0001528394334614733, + "loss": 2.3738, + "step": 1758 + }, + { + "epoch": 5.72, + "learning_rate": 0.00015266838804929892, + "loss": 2.3512, + "step": 1761 + }, + { + "epoch": 5.73, + "learning_rate": 0.00015249712912878031, + "loss": 2.4223, + "step": 1764 + }, + { + "epoch": 5.74, + "learning_rate": 0.00015232565739417092, + "loss": 2.3593, + "step": 1767 + }, + { + "epoch": 5.75, + "learning_rate": 0.00015215397354058686, + "loss": 2.3934, + "step": 1770 + }, + { + "epoch": 5.76, + "learning_rate": 0.00015198207826400413, + "loss": 2.4059, + "step": 1773 + }, + { + "epoch": 5.77, + "learning_rate": 0.00015180997226125592, + "loss": 2.3081, + "step": 1776 + }, + { + "epoch": 5.78, + "learning_rate": 0.00015163765623002945, + "loss": 2.2689, + "step": 1779 + }, + { + "epoch": 5.79, + "learning_rate": 0.00015146513086886356, + "loss": 2.3314, + "step": 1782 + }, + { + "epoch": 5.8, + "learning_rate": 0.00015129239687714557, + "loss": 2.2807, + "step": 1785 + }, + { + "epoch": 5.81, + "learning_rate": 0.00015111945495510857, + "loss": 2.3481, + "step": 1788 + }, + { + "epoch": 5.81, + "learning_rate": 0.0001509463058038286, + "loss": 2.3724, + "step": 1791 + }, + { + "epoch": 5.82, + "learning_rate": 0.00015077295012522174, + "loss": 2.3322, + "step": 1794 + }, + { + "epoch": 5.83, + "learning_rate": 0.00015059938862204127, + "loss": 2.2553, + "step": 1797 + }, + { + "epoch": 5.84, + "learning_rate": 0.0001504256219978749, + "loss": 2.2632, + "step": 1800 + }, + { + "epoch": 5.85, + "learning_rate": 0.0001502516509571418, + "loss": 2.27, + "step": 1803 + }, + { + "epoch": 5.86, + "learning_rate": 0.00015007747620508988, + "loss": 2.3615, + "step": 1806 + }, + { + "epoch": 5.87, + "learning_rate": 0.00014990309844779284, + "loss": 2.2815, + "step": 1809 + }, + { + "epoch": 5.88, + "learning_rate": 0.0001497285183921473, + "loss": 2.3518, + "step": 1812 + }, + { + "epoch": 5.89, + "learning_rate": 0.00014955373674586996, + "loss": 2.3358, + "step": 1815 + }, + { + "epoch": 5.9, + "learning_rate": 0.00014937875421749472, + "loss": 2.3217, + "step": 1818 + }, + { + "epoch": 5.91, + "learning_rate": 0.00014920357151636992, + "loss": 2.3558, + "step": 1821 + }, + { + "epoch": 5.92, + "learning_rate": 0.00014902818935265527, + "loss": 2.3474, + "step": 1824 + }, + { + "epoch": 5.93, + "learning_rate": 0.00014885260843731905, + "loss": 2.3579, + "step": 1827 + }, + { + "epoch": 5.94, + "learning_rate": 0.00014867682948213536, + "loss": 2.3964, + "step": 1830 + }, + { + "epoch": 5.95, + "learning_rate": 0.000148500853199681, + "loss": 2.3697, + "step": 1833 + }, + { + "epoch": 5.96, + "learning_rate": 0.00014832468030333265, + "loss": 2.3099, + "step": 1836 + }, + { + "epoch": 5.97, + "learning_rate": 0.00014814831150726428, + "loss": 2.3651, + "step": 1839 + }, + { + "epoch": 5.98, + "learning_rate": 0.00014797174752644382, + "loss": 2.304, + "step": 1842 + }, + { + "epoch": 5.99, + "learning_rate": 0.00014779498907663033, + "loss": 2.3598, + "step": 1845 + }, + { + "epoch": 6.0, + "learning_rate": 0.0001476180368743715, + "loss": 2.2852, + "step": 1848 + }, + { + "epoch": 6.0, + "eval_accuracy": 0.6175154625608633, + "eval_loss": 2.245072364807129, + "eval_runtime": 16.3034, + "eval_samples_per_second": 134.757, + "eval_steps_per_second": 67.409, + "step": 1848 + }, + { + "epoch": 6.01, + "learning_rate": 0.00014744089163700025, + "loss": 2.395, + "step": 1851 + }, + { + "epoch": 6.02, + "learning_rate": 0.0001472635540826321, + "loss": 2.3843, + "step": 1854 + }, + { + "epoch": 6.03, + "learning_rate": 0.00014708602493016218, + "loss": 2.3417, + "step": 1857 + }, + { + "epoch": 6.04, + "learning_rate": 0.0001469083048992623, + "loss": 2.2776, + "step": 1860 + }, + { + "epoch": 6.05, + "learning_rate": 0.00014673039471037807, + "loss": 2.2988, + "step": 1863 + }, + { + "epoch": 6.06, + "learning_rate": 0.000146552295084726, + "loss": 2.2561, + "step": 1866 + }, + { + "epoch": 6.07, + "learning_rate": 0.00014637400674429057, + "loss": 2.3886, + "step": 1869 + }, + { + "epoch": 6.08, + "learning_rate": 0.00014619553041182116, + "loss": 2.2859, + "step": 1872 + }, + { + "epoch": 6.09, + "learning_rate": 0.00014601686681082934, + "loss": 2.3711, + "step": 1875 + }, + { + "epoch": 6.1, + "learning_rate": 0.00014583801666558576, + "loss": 2.315, + "step": 1878 + }, + { + "epoch": 6.11, + "learning_rate": 0.00014565898070111735, + "loss": 2.2431, + "step": 1881 + }, + { + "epoch": 6.12, + "learning_rate": 0.0001454797596432043, + "loss": 2.3051, + "step": 1884 + }, + { + "epoch": 6.13, + "learning_rate": 0.00014530035421837716, + "loss": 2.3228, + "step": 1887 + }, + { + "epoch": 6.14, + "learning_rate": 0.00014512076515391375, + "loss": 2.3054, + "step": 1890 + }, + { + "epoch": 6.15, + "learning_rate": 0.0001449409931778365, + "loss": 2.3504, + "step": 1893 + }, + { + "epoch": 6.16, + "learning_rate": 0.0001447610390189092, + "loss": 2.328, + "step": 1896 + }, + { + "epoch": 6.17, + "learning_rate": 0.00014458090340663428, + "loss": 2.268, + "step": 1899 + }, + { + "epoch": 6.18, + "learning_rate": 0.00014440058707124967, + "loss": 2.3306, + "step": 1902 + }, + { + "epoch": 6.19, + "learning_rate": 0.00014422009074372604, + "loss": 2.2928, + "step": 1905 + }, + { + "epoch": 6.19, + "learning_rate": 0.00014403941515576344, + "loss": 2.3729, + "step": 1908 + }, + { + "epoch": 6.2, + "learning_rate": 0.00014385856103978894, + "loss": 2.2654, + "step": 1911 + }, + { + "epoch": 6.21, + "learning_rate": 0.0001436775291289532, + "loss": 2.3404, + "step": 1914 + }, + { + "epoch": 6.22, + "learning_rate": 0.00014349632015712752, + "loss": 2.2935, + "step": 1917 + }, + { + "epoch": 6.23, + "learning_rate": 0.00014331493485890114, + "loss": 2.2743, + "step": 1920 + }, + { + "epoch": 6.24, + "learning_rate": 0.00014313337396957803, + "loss": 2.4234, + "step": 1923 + }, + { + "epoch": 6.25, + "learning_rate": 0.00014295163822517393, + "loss": 2.3393, + "step": 1926 + }, + { + "epoch": 6.26, + "learning_rate": 0.0001427697283624135, + "loss": 2.3336, + "step": 1929 + }, + { + "epoch": 6.27, + "learning_rate": 0.00014258764511872716, + "loss": 2.3729, + "step": 1932 + }, + { + "epoch": 6.28, + "learning_rate": 0.00014240538923224823, + "loss": 2.3284, + "step": 1935 + }, + { + "epoch": 6.29, + "learning_rate": 0.00014222296144180994, + "loss": 2.3265, + "step": 1938 + }, + { + "epoch": 6.3, + "learning_rate": 0.00014204036248694225, + "loss": 2.3156, + "step": 1941 + }, + { + "epoch": 6.31, + "learning_rate": 0.00014185759310786917, + "loss": 2.2792, + "step": 1944 + }, + { + "epoch": 6.32, + "learning_rate": 0.00014167465404550542, + "loss": 2.2745, + "step": 1947 + }, + { + "epoch": 6.33, + "learning_rate": 0.00014149154604145366, + "loss": 2.2669, + "step": 1950 + }, + { + "epoch": 6.34, + "learning_rate": 0.00014130826983800145, + "loss": 2.2678, + "step": 1953 + }, + { + "epoch": 6.35, + "learning_rate": 0.0001411248261781181, + "loss": 2.3385, + "step": 1956 + }, + { + "epoch": 6.36, + "learning_rate": 0.00014094121580545183, + "loss": 2.3564, + "step": 1959 + }, + { + "epoch": 6.37, + "learning_rate": 0.0001407574394643267, + "loss": 2.2749, + "step": 1962 + }, + { + "epoch": 6.38, + "learning_rate": 0.00014057349789973946, + "loss": 2.2941, + "step": 1965 + }, + { + "epoch": 6.39, + "learning_rate": 0.00014038939185735683, + "loss": 2.2932, + "step": 1968 + }, + { + "epoch": 6.4, + "learning_rate": 0.0001402051220835121, + "loss": 2.2875, + "step": 1971 + }, + { + "epoch": 6.41, + "learning_rate": 0.00014002068932520247, + "loss": 2.3496, + "step": 1974 + }, + { + "epoch": 6.42, + "learning_rate": 0.00013983609433008574, + "loss": 2.273, + "step": 1977 + }, + { + "epoch": 6.43, + "learning_rate": 0.0001396513378464774, + "loss": 2.2399, + "step": 1980 + }, + { + "epoch": 6.44, + "learning_rate": 0.00013946642062334766, + "loss": 2.3506, + "step": 1983 + }, + { + "epoch": 6.45, + "learning_rate": 0.00013928134341031825, + "loss": 2.2949, + "step": 1986 + }, + { + "epoch": 6.46, + "learning_rate": 0.00013909610695765948, + "loss": 2.3473, + "step": 1989 + }, + { + "epoch": 6.47, + "learning_rate": 0.00013891071201628728, + "loss": 2.2964, + "step": 1992 + }, + { + "epoch": 6.48, + "learning_rate": 0.00013872515933776, + "loss": 2.3721, + "step": 1995 + }, + { + "epoch": 6.49, + "learning_rate": 0.00013853944967427535, + "loss": 2.3076, + "step": 1998 + }, + { + "epoch": 6.5, + "learning_rate": 0.00013835358377866763, + "loss": 2.3327, + "step": 2001 + }, + { + "epoch": 6.51, + "learning_rate": 0.00013816756240440424, + "loss": 2.2885, + "step": 2004 + }, + { + "epoch": 6.52, + "learning_rate": 0.00013798138630558303, + "loss": 2.3773, + "step": 2007 + }, + { + "epoch": 6.53, + "learning_rate": 0.00013779505623692909, + "loss": 2.3261, + "step": 2010 + }, + { + "epoch": 6.54, + "learning_rate": 0.00013760857295379154, + "loss": 2.3456, + "step": 2013 + }, + { + "epoch": 6.55, + "learning_rate": 0.00013742193721214064, + "loss": 2.422, + "step": 2016 + }, + { + "epoch": 6.56, + "learning_rate": 0.00013723514976856483, + "loss": 2.2936, + "step": 2019 + }, + { + "epoch": 6.56, + "learning_rate": 0.00013704821138026737, + "loss": 2.3485, + "step": 2022 + }, + { + "epoch": 6.57, + "learning_rate": 0.00013686112280506346, + "loss": 2.2716, + "step": 2025 + }, + { + "epoch": 6.58, + "learning_rate": 0.00013667388480137716, + "loss": 2.291, + "step": 2028 + }, + { + "epoch": 6.59, + "learning_rate": 0.0001364864981282383, + "loss": 2.3078, + "step": 2031 + }, + { + "epoch": 6.6, + "learning_rate": 0.00013629896354527932, + "loss": 2.3167, + "step": 2034 + }, + { + "epoch": 6.61, + "learning_rate": 0.0001361112818127323, + "loss": 2.3923, + "step": 2037 + }, + { + "epoch": 6.62, + "learning_rate": 0.00013592345369142585, + "loss": 2.3003, + "step": 2040 + }, + { + "epoch": 6.63, + "learning_rate": 0.00013573547994278205, + "loss": 2.2712, + "step": 2043 + }, + { + "epoch": 6.64, + "learning_rate": 0.0001355473613288132, + "loss": 2.3246, + "step": 2046 + }, + { + "epoch": 6.65, + "learning_rate": 0.00013535909861211903, + "loss": 2.2397, + "step": 2049 + }, + { + "epoch": 6.66, + "learning_rate": 0.00013517069255588327, + "loss": 2.2596, + "step": 2052 + }, + { + "epoch": 6.67, + "learning_rate": 0.00013498214392387083, + "loss": 2.1575, + "step": 2055 + }, + { + "epoch": 6.68, + "learning_rate": 0.0001347934534804246, + "loss": 2.2904, + "step": 2058 + }, + { + "epoch": 6.69, + "learning_rate": 0.00013460462199046226, + "loss": 2.1767, + "step": 2061 + }, + { + "epoch": 6.7, + "learning_rate": 0.00013441565021947332, + "loss": 2.2593, + "step": 2064 + }, + { + "epoch": 6.71, + "learning_rate": 0.00013422653893351604, + "loss": 2.3004, + "step": 2067 + }, + { + "epoch": 6.72, + "learning_rate": 0.0001340372888992141, + "loss": 2.2932, + "step": 2070 + }, + { + "epoch": 6.73, + "learning_rate": 0.0001338479008837538, + "loss": 2.3329, + "step": 2073 + }, + { + "epoch": 6.74, + "learning_rate": 0.00013365837565488064, + "loss": 2.2765, + "step": 2076 + }, + { + "epoch": 6.75, + "learning_rate": 0.00013346871398089644, + "loss": 2.2924, + "step": 2079 + }, + { + "epoch": 6.76, + "learning_rate": 0.00013327891663065614, + "loss": 2.2459, + "step": 2082 + }, + { + "epoch": 6.77, + "learning_rate": 0.0001330889843735647, + "loss": 2.1898, + "step": 2085 + }, + { + "epoch": 6.78, + "learning_rate": 0.00013289891797957395, + "loss": 2.2234, + "step": 2088 + }, + { + "epoch": 6.79, + "learning_rate": 0.00013270871821917946, + "loss": 2.3638, + "step": 2091 + }, + { + "epoch": 6.8, + "learning_rate": 0.00013251838586341745, + "loss": 2.3405, + "step": 2094 + }, + { + "epoch": 6.81, + "learning_rate": 0.00013232792168386176, + "loss": 2.2713, + "step": 2097 + }, + { + "epoch": 6.82, + "learning_rate": 0.00013213732645262044, + "loss": 2.2664, + "step": 2100 + }, + { + "epoch": 6.83, + "learning_rate": 0.00013194660094233298, + "loss": 2.2861, + "step": 2103 + }, + { + "epoch": 6.84, + "learning_rate": 0.00013175574592616692, + "loss": 2.2772, + "step": 2106 + }, + { + "epoch": 6.85, + "learning_rate": 0.00013156476217781468, + "loss": 2.2263, + "step": 2109 + }, + { + "epoch": 6.86, + "learning_rate": 0.00013137365047149078, + "loss": 2.3177, + "step": 2112 + }, + { + "epoch": 6.87, + "learning_rate": 0.00013118241158192827, + "loss": 2.2316, + "step": 2115 + }, + { + "epoch": 6.88, + "learning_rate": 0.0001309910462843758, + "loss": 2.3542, + "step": 2118 + }, + { + "epoch": 6.89, + "learning_rate": 0.00013079955535459455, + "loss": 2.2913, + "step": 2121 + }, + { + "epoch": 6.9, + "learning_rate": 0.000130607939568855, + "loss": 2.2785, + "step": 2124 + }, + { + "epoch": 6.91, + "learning_rate": 0.00013041619970393352, + "loss": 2.2508, + "step": 2127 + }, + { + "epoch": 6.92, + "learning_rate": 0.0001302243365371098, + "loss": 2.2514, + "step": 2130 + }, + { + "epoch": 6.93, + "learning_rate": 0.00013003235084616324, + "loss": 2.2517, + "step": 2133 + }, + { + "epoch": 6.94, + "learning_rate": 0.00012984024340936983, + "loss": 2.2517, + "step": 2136 + }, + { + "epoch": 6.94, + "learning_rate": 0.00012964801500549931, + "loss": 2.2079, + "step": 2139 + }, + { + "epoch": 6.95, + "learning_rate": 0.00012945566641381159, + "loss": 2.3606, + "step": 2142 + }, + { + "epoch": 6.96, + "learning_rate": 0.00012926319841405394, + "loss": 2.3764, + "step": 2145 + }, + { + "epoch": 6.97, + "learning_rate": 0.00012907061178645763, + "loss": 2.2781, + "step": 2148 + }, + { + "epoch": 6.98, + "learning_rate": 0.00012887790731173486, + "loss": 2.2436, + "step": 2151 + }, + { + "epoch": 6.99, + "learning_rate": 0.0001286850857710755, + "loss": 2.228, + "step": 2154 + }, + { + "epoch": 7.0, + "eval_accuracy": 0.6243810506983248, + "eval_loss": 2.193675994873047, + "eval_runtime": 16.3306, + "eval_samples_per_second": 134.533, + "eval_steps_per_second": 67.297, + "step": 2156 + }, + { + "epoch": 7.0, + "learning_rate": 0.00012849214794614407, + "loss": 2.2569, + "step": 2157 + }, + { + "epoch": 7.01, + "learning_rate": 0.0001282990946190764, + "loss": 2.3658, + "step": 2160 + }, + { + "epoch": 7.02, + "learning_rate": 0.00012810592657247656, + "loss": 2.2651, + "step": 2163 + }, + { + "epoch": 7.03, + "learning_rate": 0.0001279126445894138, + "loss": 2.215, + "step": 2166 + }, + { + "epoch": 7.04, + "learning_rate": 0.00012771924945341906, + "loss": 2.2391, + "step": 2169 + }, + { + "epoch": 7.05, + "learning_rate": 0.00012752574194848211, + "loss": 2.2661, + "step": 2172 + }, + { + "epoch": 7.06, + "learning_rate": 0.00012733212285904818, + "loss": 2.1834, + "step": 2175 + }, + { + "epoch": 7.07, + "learning_rate": 0.0001271383929700149, + "loss": 2.2816, + "step": 2178 + }, + { + "epoch": 7.08, + "learning_rate": 0.00012694455306672895, + "loss": 2.2706, + "step": 2181 + }, + { + "epoch": 7.09, + "learning_rate": 0.00012675060393498318, + "loss": 2.2744, + "step": 2184 + }, + { + "epoch": 7.1, + "learning_rate": 0.00012655654636101304, + "loss": 2.2792, + "step": 2187 + }, + { + "epoch": 7.11, + "learning_rate": 0.00012636238113149367, + "loss": 2.3216, + "step": 2190 + }, + { + "epoch": 7.12, + "learning_rate": 0.00012616810903353666, + "loss": 2.2724, + "step": 2193 + }, + { + "epoch": 7.13, + "learning_rate": 0.00012597373085468678, + "loss": 2.2908, + "step": 2196 + }, + { + "epoch": 7.14, + "learning_rate": 0.00012577924738291877, + "loss": 2.3213, + "step": 2199 + }, + { + "epoch": 7.15, + "learning_rate": 0.0001255846594066344, + "loss": 2.2962, + "step": 2202 + }, + { + "epoch": 7.16, + "learning_rate": 0.00012538996771465887, + "loss": 2.2341, + "step": 2205 + }, + { + "epoch": 7.17, + "learning_rate": 0.00012519517309623793, + "loss": 2.3292, + "step": 2208 + }, + { + "epoch": 7.18, + "learning_rate": 0.0001250002763410346, + "loss": 2.2678, + "step": 2211 + }, + { + "epoch": 7.19, + "learning_rate": 0.0001248052782391259, + "loss": 2.2799, + "step": 2214 + }, + { + "epoch": 7.2, + "learning_rate": 0.00012461017958099966, + "loss": 2.2489, + "step": 2217 + }, + { + "epoch": 7.21, + "learning_rate": 0.00012441498115755146, + "loss": 2.3042, + "step": 2220 + }, + { + "epoch": 7.22, + "learning_rate": 0.00012421968376008115, + "loss": 2.1692, + "step": 2223 + }, + { + "epoch": 7.23, + "learning_rate": 0.00012402428818028994, + "loss": 2.3398, + "step": 2226 + }, + { + "epoch": 7.24, + "learning_rate": 0.000123828795210277, + "loss": 2.2435, + "step": 2229 + }, + { + "epoch": 7.25, + "learning_rate": 0.00012363320564253637, + "loss": 2.2838, + "step": 2232 + }, + { + "epoch": 7.26, + "learning_rate": 0.0001234375202699535, + "loss": 2.1099, + "step": 2235 + }, + { + "epoch": 7.27, + "learning_rate": 0.00012324173988580235, + "loss": 2.239, + "step": 2238 + }, + { + "epoch": 7.28, + "learning_rate": 0.0001230458652837421, + "loss": 2.2111, + "step": 2241 + }, + { + "epoch": 7.29, + "learning_rate": 0.00012284989725781377, + "loss": 2.2932, + "step": 2244 + }, + { + "epoch": 7.3, + "learning_rate": 0.0001226538366024371, + "loss": 2.2204, + "step": 2247 + }, + { + "epoch": 7.31, + "learning_rate": 0.00012245768411240737, + "loss": 2.2478, + "step": 2250 + }, + { + "epoch": 7.31, + "learning_rate": 0.00012226144058289216, + "loss": 2.2759, + "step": 2253 + }, + { + "epoch": 7.32, + "learning_rate": 0.00012206510680942806, + "loss": 2.297, + "step": 2256 + }, + { + "epoch": 7.33, + "learning_rate": 0.00012186868358791756, + "loss": 2.2437, + "step": 2259 + }, + { + "epoch": 7.34, + "learning_rate": 0.00012167217171462566, + "loss": 2.2756, + "step": 2262 + }, + { + "epoch": 7.35, + "learning_rate": 0.00012147557198617678, + "loss": 2.2352, + "step": 2265 + }, + { + "epoch": 7.36, + "learning_rate": 0.00012127888519955157, + "loss": 2.2638, + "step": 2268 + }, + { + "epoch": 7.37, + "learning_rate": 0.0001210821121520835, + "loss": 2.1679, + "step": 2271 + }, + { + "epoch": 7.38, + "learning_rate": 0.0001208852536414557, + "loss": 2.2499, + "step": 2274 + }, + { + "epoch": 7.39, + "learning_rate": 0.00012068831046569789, + "loss": 2.2459, + "step": 2277 + }, + { + "epoch": 7.4, + "learning_rate": 0.00012049128342318288, + "loss": 2.1518, + "step": 2280 + }, + { + "epoch": 7.41, + "learning_rate": 0.00012029417331262349, + "loss": 2.246, + "step": 2283 + }, + { + "epoch": 7.42, + "learning_rate": 0.00012009698093306936, + "loss": 2.2471, + "step": 2286 + }, + { + "epoch": 7.43, + "learning_rate": 0.00011989970708390353, + "loss": 2.2332, + "step": 2289 + }, + { + "epoch": 7.44, + "learning_rate": 0.00011970235256483934, + "loss": 2.2554, + "step": 2292 + }, + { + "epoch": 7.45, + "learning_rate": 0.00011950491817591717, + "loss": 2.2454, + "step": 2295 + }, + { + "epoch": 7.46, + "learning_rate": 0.00011930740471750121, + "loss": 2.2378, + "step": 2298 + }, + { + "epoch": 7.47, + "learning_rate": 0.00011910981299027608, + "loss": 2.2978, + "step": 2301 + }, + { + "epoch": 7.48, + "learning_rate": 0.00011891214379524375, + "loss": 2.2042, + "step": 2304 + }, + { + "epoch": 7.49, + "learning_rate": 0.0001187143979337203, + "loss": 2.2105, + "step": 2307 + }, + { + "epoch": 7.5, + "learning_rate": 0.00011851657620733243, + "loss": 2.2343, + "step": 2310 + }, + { + "epoch": 7.51, + "learning_rate": 0.00011831867941801455, + "loss": 2.2197, + "step": 2313 + }, + { + "epoch": 7.52, + "learning_rate": 0.00011812070836800533, + "loss": 2.3094, + "step": 2316 + }, + { + "epoch": 7.53, + "learning_rate": 0.00011792266385984433, + "loss": 2.2106, + "step": 2319 + }, + { + "epoch": 7.54, + "learning_rate": 0.00011772454669636912, + "loss": 2.3657, + "step": 2322 + }, + { + "epoch": 7.55, + "learning_rate": 0.00011752635768071167, + "loss": 2.3096, + "step": 2325 + }, + { + "epoch": 7.56, + "learning_rate": 0.0001173280976162952, + "loss": 2.1856, + "step": 2328 + }, + { + "epoch": 7.57, + "learning_rate": 0.00011712976730683108, + "loss": 2.236, + "step": 2331 + }, + { + "epoch": 7.58, + "learning_rate": 0.00011693136755631528, + "loss": 2.2304, + "step": 2334 + }, + { + "epoch": 7.59, + "learning_rate": 0.00011673289916902539, + "loss": 2.2346, + "step": 2337 + }, + { + "epoch": 7.6, + "learning_rate": 0.00011653436294951724, + "loss": 2.2353, + "step": 2340 + }, + { + "epoch": 7.61, + "learning_rate": 0.00011633575970262152, + "loss": 2.2511, + "step": 2343 + }, + { + "epoch": 7.62, + "learning_rate": 0.0001161370902334408, + "loss": 2.2557, + "step": 2346 + }, + { + "epoch": 7.63, + "learning_rate": 0.00011593835534734596, + "loss": 2.2501, + "step": 2349 + }, + { + "epoch": 7.64, + "learning_rate": 0.00011573955584997318, + "loss": 2.2231, + "step": 2352 + }, + { + "epoch": 7.65, + "learning_rate": 0.00011554069254722051, + "loss": 2.1851, + "step": 2355 + }, + { + "epoch": 7.66, + "learning_rate": 0.00011534176624524464, + "loss": 2.2573, + "step": 2358 + }, + { + "epoch": 7.67, + "learning_rate": 0.00011514277775045768, + "loss": 2.2314, + "step": 2361 + }, + { + "epoch": 7.68, + "learning_rate": 0.00011494372786952384, + "loss": 2.1805, + "step": 2364 + }, + { + "epoch": 7.69, + "learning_rate": 0.00011474461740935621, + "loss": 2.2441, + "step": 2367 + }, + { + "epoch": 7.69, + "learning_rate": 0.0001145454471771134, + "loss": 2.2018, + "step": 2370 + }, + { + "epoch": 7.7, + "learning_rate": 0.0001143462179801964, + "loss": 2.1672, + "step": 2373 + }, + { + "epoch": 7.71, + "learning_rate": 0.00011414693062624515, + "loss": 2.2151, + "step": 2376 + }, + { + "epoch": 7.72, + "learning_rate": 0.00011394758592313543, + "loss": 2.2059, + "step": 2379 + }, + { + "epoch": 7.73, + "learning_rate": 0.00011374818467897541, + "loss": 2.2516, + "step": 2382 + }, + { + "epoch": 7.74, + "learning_rate": 0.00011354872770210256, + "loss": 2.1991, + "step": 2385 + }, + { + "epoch": 7.75, + "learning_rate": 0.00011334921580108027, + "loss": 2.2307, + "step": 2388 + }, + { + "epoch": 7.76, + "learning_rate": 0.00011314964978469445, + "loss": 2.1478, + "step": 2391 + }, + { + "epoch": 7.77, + "learning_rate": 0.00011295003046195058, + "loss": 2.1494, + "step": 2394 + }, + { + "epoch": 7.78, + "learning_rate": 0.00011275035864207017, + "loss": 2.289, + "step": 2397 + }, + { + "epoch": 7.79, + "learning_rate": 0.00011255063513448743, + "loss": 2.1709, + "step": 2400 + }, + { + "epoch": 7.8, + "learning_rate": 0.00011235086074884622, + "loss": 2.2586, + "step": 2403 + }, + { + "epoch": 7.81, + "learning_rate": 0.00011215103629499661, + "loss": 2.2279, + "step": 2406 + }, + { + "epoch": 7.82, + "learning_rate": 0.00011195116258299169, + "loss": 2.2403, + "step": 2409 + }, + { + "epoch": 7.83, + "learning_rate": 0.00011175124042308416, + "loss": 2.2453, + "step": 2412 + }, + { + "epoch": 7.84, + "learning_rate": 0.00011155127062572314, + "loss": 2.2293, + "step": 2415 + }, + { + "epoch": 7.85, + "learning_rate": 0.00011135125400155091, + "loss": 2.2598, + "step": 2418 + }, + { + "epoch": 7.86, + "learning_rate": 0.00011115119136139951, + "loss": 2.2331, + "step": 2421 + }, + { + "epoch": 7.87, + "learning_rate": 0.00011095108351628758, + "loss": 2.2416, + "step": 2424 + }, + { + "epoch": 7.88, + "learning_rate": 0.00011075093127741695, + "loss": 2.1992, + "step": 2427 + }, + { + "epoch": 7.89, + "learning_rate": 0.00011055073545616952, + "loss": 2.1727, + "step": 2430 + }, + { + "epoch": 7.9, + "learning_rate": 0.0001103504968641037, + "loss": 2.2371, + "step": 2433 + }, + { + "epoch": 7.91, + "learning_rate": 0.00011015021631295149, + "loss": 2.23, + "step": 2436 + }, + { + "epoch": 7.92, + "learning_rate": 0.00010994989461461476, + "loss": 2.1677, + "step": 2439 + }, + { + "epoch": 7.93, + "learning_rate": 0.00010974953258116238, + "loss": 2.2252, + "step": 2442 + }, + { + "epoch": 7.94, + "learning_rate": 0.00010954913102482664, + "loss": 2.2119, + "step": 2445 + }, + { + "epoch": 7.95, + "learning_rate": 0.000109348690758, + "loss": 2.2567, + "step": 2448 + }, + { + "epoch": 7.96, + "learning_rate": 0.00010914821259323202, + "loss": 2.2209, + "step": 2451 + }, + { + "epoch": 7.97, + "learning_rate": 0.00010894769734322567, + "loss": 2.2701, + "step": 2454 + }, + { + "epoch": 7.98, + "learning_rate": 0.00010874714582083438, + "loss": 2.1552, + "step": 2457 + }, + { + "epoch": 7.99, + "learning_rate": 0.00010854655883905869, + "loss": 2.1527, + "step": 2460 + }, + { + "epoch": 8.0, + "learning_rate": 0.0001083459372110427, + "loss": 2.2013, + "step": 2463 + }, + { + "epoch": 8.0, + "eval_accuracy": 0.6309924029431118, + "eval_loss": 2.1445603370666504, + "eval_runtime": 16.3353, + "eval_samples_per_second": 134.494, + "eval_steps_per_second": 67.278, + "step": 2464 + }, + { + "epoch": 8.01, + "learning_rate": 0.00010814528175007108, + "loss": 2.22, + "step": 2466 + }, + { + "epoch": 8.02, + "learning_rate": 0.0001079445932695657, + "loss": 2.2225, + "step": 2469 + }, + { + "epoch": 8.03, + "learning_rate": 0.00010774387258308217, + "loss": 2.2667, + "step": 2472 + }, + { + "epoch": 8.04, + "learning_rate": 0.00010754312050430668, + "loss": 2.2468, + "step": 2475 + }, + { + "epoch": 8.05, + "learning_rate": 0.00010734233784705276, + "loss": 2.2416, + "step": 2478 + }, + { + "epoch": 8.06, + "learning_rate": 0.00010714152542525781, + "loss": 2.1588, + "step": 2481 + }, + { + "epoch": 8.06, + "learning_rate": 0.00010694068405297997, + "loss": 2.2093, + "step": 2484 + }, + { + "epoch": 8.07, + "learning_rate": 0.00010673981454439463, + "loss": 2.2511, + "step": 2487 + }, + { + "epoch": 8.08, + "learning_rate": 0.00010653891771379134, + "loss": 2.2265, + "step": 2490 + }, + { + "epoch": 8.09, + "learning_rate": 0.00010633799437557039, + "loss": 2.2257, + "step": 2493 + }, + { + "epoch": 8.1, + "learning_rate": 0.00010613704534423949, + "loss": 2.2588, + "step": 2496 + }, + { + "epoch": 8.11, + "learning_rate": 0.00010593607143441053, + "loss": 2.1435, + "step": 2499 + }, + { + "epoch": 8.12, + "learning_rate": 0.00010573507346079625, + "loss": 2.1494, + "step": 2502 + }, + { + "epoch": 8.13, + "learning_rate": 0.00010553405223820693, + "loss": 2.2067, + "step": 2505 + }, + { + "epoch": 8.14, + "learning_rate": 0.00010533300858154715, + "loss": 2.2174, + "step": 2508 + }, + { + "epoch": 8.15, + "learning_rate": 0.00010513194330581233, + "loss": 2.168, + "step": 2511 + }, + { + "epoch": 8.16, + "learning_rate": 0.00010493085722608562, + "loss": 2.2395, + "step": 2514 + }, + { + "epoch": 8.17, + "learning_rate": 0.00010472975115753452, + "loss": 2.1584, + "step": 2517 + }, + { + "epoch": 8.18, + "learning_rate": 0.00010452862591540742, + "loss": 2.1738, + "step": 2520 + }, + { + "epoch": 8.19, + "learning_rate": 0.0001043274823150306, + "loss": 2.2533, + "step": 2523 + }, + { + "epoch": 8.2, + "learning_rate": 0.00010412632117180471, + "loss": 2.308, + "step": 2526 + }, + { + "epoch": 8.21, + "learning_rate": 0.00010392514330120145, + "loss": 2.1664, + "step": 2529 + }, + { + "epoch": 8.22, + "learning_rate": 0.00010372394951876043, + "loss": 2.2669, + "step": 2532 + }, + { + "epoch": 8.23, + "learning_rate": 0.00010352274064008567, + "loss": 2.1636, + "step": 2535 + }, + { + "epoch": 8.24, + "learning_rate": 0.00010332151748084242, + "loss": 2.2729, + "step": 2538 + }, + { + "epoch": 8.25, + "learning_rate": 0.00010312028085675391, + "loss": 2.2097, + "step": 2541 + }, + { + "epoch": 8.26, + "learning_rate": 0.00010291903158359783, + "loss": 2.2306, + "step": 2544 + }, + { + "epoch": 8.27, + "learning_rate": 0.0001027177704772032, + "loss": 2.1675, + "step": 2547 + }, + { + "epoch": 8.28, + "learning_rate": 0.00010251649835344696, + "loss": 2.266, + "step": 2550 + }, + { + "epoch": 8.29, + "learning_rate": 0.0001023152160282508, + "loss": 2.2716, + "step": 2553 + }, + { + "epoch": 8.3, + "learning_rate": 0.00010211392431757773, + "loss": 2.2013, + "step": 2556 + }, + { + "epoch": 8.31, + "learning_rate": 0.00010191262403742878, + "loss": 2.1526, + "step": 2559 + }, + { + "epoch": 8.32, + "learning_rate": 0.00010171131600383974, + "loss": 2.1521, + "step": 2562 + }, + { + "epoch": 8.33, + "learning_rate": 0.00010151000103287784, + "loss": 2.2781, + "step": 2565 + }, + { + "epoch": 8.34, + "learning_rate": 0.00010130867994063839, + "loss": 2.1813, + "step": 2568 + }, + { + "epoch": 8.35, + "learning_rate": 0.00010110735354324159, + "loss": 2.2153, + "step": 2571 + }, + { + "epoch": 8.36, + "learning_rate": 0.00010090602265682906, + "loss": 2.2226, + "step": 2574 + }, + { + "epoch": 8.37, + "learning_rate": 0.00010070468809756068, + "loss": 2.2177, + "step": 2577 + }, + { + "epoch": 8.38, + "learning_rate": 0.00010050335068161123, + "loss": 2.186, + "step": 2580 + }, + { + "epoch": 8.39, + "learning_rate": 0.00010030201122516696, + "loss": 2.2026, + "step": 2583 + }, + { + "epoch": 8.4, + "learning_rate": 0.00010010067054442251, + "loss": 2.2229, + "step": 2586 + }, + { + "epoch": 8.41, + "learning_rate": 9.989932945557751e-05, + "loss": 2.2416, + "step": 2589 + }, + { + "epoch": 8.42, + "learning_rate": 9.969798877483308e-05, + "loss": 2.2854, + "step": 2592 + }, + { + "epoch": 8.43, + "learning_rate": 9.949664931838882e-05, + "loss": 2.158, + "step": 2595 + }, + { + "epoch": 8.44, + "learning_rate": 9.929531190243932e-05, + "loss": 2.2394, + "step": 2598 + }, + { + "epoch": 8.44, + "learning_rate": 9.909397734317095e-05, + "loss": 2.1703, + "step": 2601 + }, + { + "epoch": 8.45, + "learning_rate": 9.889264645675843e-05, + "loss": 2.2031, + "step": 2604 + }, + { + "epoch": 8.46, + "learning_rate": 9.869132005936163e-05, + "loss": 2.2224, + "step": 2607 + }, + { + "epoch": 8.47, + "learning_rate": 9.848999896712217e-05, + "loss": 2.2693, + "step": 2610 + }, + { + "epoch": 8.48, + "learning_rate": 9.82886839961603e-05, + "loss": 2.2381, + "step": 2613 + }, + { + "epoch": 8.49, + "learning_rate": 9.808737596257121e-05, + "loss": 2.1839, + "step": 2616 + }, + { + "epoch": 8.5, + "learning_rate": 9.788607568242229e-05, + "loss": 2.129, + "step": 2619 + }, + { + "epoch": 8.51, + "learning_rate": 9.768478397174922e-05, + "loss": 2.121, + "step": 2622 + }, + { + "epoch": 8.52, + "learning_rate": 9.748350164655306e-05, + "loss": 2.2323, + "step": 2625 + }, + { + "epoch": 8.53, + "learning_rate": 9.728222952279684e-05, + "loss": 2.2613, + "step": 2628 + }, + { + "epoch": 8.54, + "learning_rate": 9.708096841640222e-05, + "loss": 2.1113, + "step": 2631 + }, + { + "epoch": 8.55, + "learning_rate": 9.687971914324607e-05, + "loss": 2.1729, + "step": 2634 + }, + { + "epoch": 8.56, + "learning_rate": 9.667848251915758e-05, + "loss": 2.2271, + "step": 2637 + }, + { + "epoch": 8.57, + "learning_rate": 9.647725935991436e-05, + "loss": 2.2319, + "step": 2640 + }, + { + "epoch": 8.58, + "learning_rate": 9.627605048123959e-05, + "loss": 2.2244, + "step": 2643 + }, + { + "epoch": 8.59, + "learning_rate": 9.607485669879857e-05, + "loss": 2.1239, + "step": 2646 + }, + { + "epoch": 8.6, + "learning_rate": 9.587367882819532e-05, + "loss": 2.2429, + "step": 2649 + }, + { + "epoch": 8.61, + "learning_rate": 9.567251768496938e-05, + "loss": 2.1936, + "step": 2652 + }, + { + "epoch": 8.62, + "learning_rate": 9.547137408459257e-05, + "loss": 2.2038, + "step": 2655 + }, + { + "epoch": 8.63, + "learning_rate": 9.52702488424655e-05, + "loss": 2.1978, + "step": 2658 + }, + { + "epoch": 8.64, + "learning_rate": 9.506914277391439e-05, + "loss": 2.1977, + "step": 2661 + }, + { + "epoch": 8.65, + "learning_rate": 9.486805669418769e-05, + "loss": 2.1772, + "step": 2664 + }, + { + "epoch": 8.66, + "learning_rate": 9.466699141845287e-05, + "loss": 2.1929, + "step": 2667 + }, + { + "epoch": 8.67, + "learning_rate": 9.446594776179306e-05, + "loss": 2.2712, + "step": 2670 + }, + { + "epoch": 8.68, + "learning_rate": 9.426492653920375e-05, + "loss": 2.2606, + "step": 2673 + }, + { + "epoch": 8.69, + "learning_rate": 9.406392856558949e-05, + "loss": 2.1655, + "step": 2676 + }, + { + "epoch": 8.7, + "learning_rate": 9.386295465576053e-05, + "loss": 2.1544, + "step": 2679 + }, + { + "epoch": 8.71, + "learning_rate": 9.366200562442963e-05, + "loss": 2.2016, + "step": 2682 + }, + { + "epoch": 8.72, + "learning_rate": 9.346108228620868e-05, + "loss": 2.3044, + "step": 2685 + }, + { + "epoch": 8.73, + "learning_rate": 9.326018545560542e-05, + "loss": 2.2102, + "step": 2688 + }, + { + "epoch": 8.74, + "learning_rate": 9.305931594702007e-05, + "loss": 2.1511, + "step": 2691 + }, + { + "epoch": 8.75, + "learning_rate": 9.28584745747422e-05, + "loss": 2.212, + "step": 2694 + }, + { + "epoch": 8.76, + "learning_rate": 9.265766215294725e-05, + "loss": 2.1871, + "step": 2697 + }, + { + "epoch": 8.77, + "learning_rate": 9.245687949569332e-05, + "loss": 2.226, + "step": 2700 + }, + { + "epoch": 8.78, + "learning_rate": 9.225612741691788e-05, + "loss": 2.1323, + "step": 2703 + }, + { + "epoch": 8.79, + "learning_rate": 9.205540673043434e-05, + "loss": 2.1258, + "step": 2706 + }, + { + "epoch": 8.8, + "learning_rate": 9.185471824992891e-05, + "loss": 2.1963, + "step": 2709 + }, + { + "epoch": 8.81, + "learning_rate": 9.165406278895732e-05, + "loss": 2.2423, + "step": 2712 + }, + { + "epoch": 8.81, + "learning_rate": 9.145344116094134e-05, + "loss": 2.1678, + "step": 2715 + }, + { + "epoch": 8.82, + "learning_rate": 9.125285417916563e-05, + "loss": 2.196, + "step": 2718 + }, + { + "epoch": 8.83, + "learning_rate": 9.105230265677437e-05, + "loss": 2.1637, + "step": 2721 + }, + { + "epoch": 8.84, + "learning_rate": 9.085178740676803e-05, + "loss": 2.1019, + "step": 2724 + }, + { + "epoch": 8.85, + "learning_rate": 9.065130924199998e-05, + "loss": 2.1651, + "step": 2727 + }, + { + "epoch": 8.86, + "learning_rate": 9.045086897517337e-05, + "loss": 2.2656, + "step": 2730 + }, + { + "epoch": 8.87, + "learning_rate": 9.025046741883764e-05, + "loss": 2.2224, + "step": 2733 + }, + { + "epoch": 8.88, + "learning_rate": 9.005010538538527e-05, + "loss": 2.1764, + "step": 2736 + }, + { + "epoch": 8.89, + "learning_rate": 8.984978368704855e-05, + "loss": 2.1928, + "step": 2739 + }, + { + "epoch": 8.9, + "learning_rate": 8.964950313589633e-05, + "loss": 2.111, + "step": 2742 + }, + { + "epoch": 8.91, + "learning_rate": 8.944926454383049e-05, + "loss": 2.2286, + "step": 2745 + }, + { + "epoch": 8.92, + "learning_rate": 8.924906872258306e-05, + "loss": 2.18, + "step": 2748 + }, + { + "epoch": 8.93, + "learning_rate": 8.904891648371244e-05, + "loss": 2.1869, + "step": 2751 + }, + { + "epoch": 8.94, + "learning_rate": 8.884880863860051e-05, + "loss": 2.2054, + "step": 2754 + }, + { + "epoch": 8.95, + "learning_rate": 8.864874599844911e-05, + "loss": 2.1351, + "step": 2757 + }, + { + "epoch": 8.96, + "learning_rate": 8.84487293742769e-05, + "loss": 2.2412, + "step": 2760 + }, + { + "epoch": 8.97, + "learning_rate": 8.824875957691588e-05, + "loss": 2.2394, + "step": 2763 + }, + { + "epoch": 8.98, + "learning_rate": 8.804883741700833e-05, + "loss": 2.1228, + "step": 2766 + }, + { + "epoch": 8.99, + "learning_rate": 8.78489637050034e-05, + "loss": 2.1862, + "step": 2769 + }, + { + "epoch": 9.0, + "learning_rate": 8.764913925115381e-05, + "loss": 2.1463, + "step": 2772 + }, + { + "epoch": 9.0, + "eval_accuracy": 0.6357306777396295, + "eval_loss": 2.106170654296875, + "eval_runtime": 16.2999, + "eval_samples_per_second": 134.786, + "eval_steps_per_second": 67.424, + "step": 2772 + }, + { + "epoch": 9.01, + "learning_rate": 8.744936486551262e-05, + "loss": 2.2427, + "step": 2775 + }, + { + "epoch": 9.02, + "learning_rate": 8.724964135792988e-05, + "loss": 2.105, + "step": 2778 + }, + { + "epoch": 9.03, + "learning_rate": 8.70499695380494e-05, + "loss": 2.2257, + "step": 2781 + }, + { + "epoch": 9.04, + "learning_rate": 8.685035021530554e-05, + "loss": 2.2433, + "step": 2784 + }, + { + "epoch": 9.05, + "learning_rate": 8.665078419891977e-05, + "loss": 2.1763, + "step": 2787 + }, + { + "epoch": 9.06, + "learning_rate": 8.645127229789746e-05, + "loss": 2.2108, + "step": 2790 + }, + { + "epoch": 9.07, + "learning_rate": 8.625181532102463e-05, + "loss": 2.115, + "step": 2793 + }, + { + "epoch": 9.08, + "learning_rate": 8.605241407686462e-05, + "loss": 2.2288, + "step": 2796 + }, + { + "epoch": 9.09, + "learning_rate": 8.585306937375486e-05, + "loss": 2.1351, + "step": 2799 + }, + { + "epoch": 9.1, + "learning_rate": 8.565378201980361e-05, + "loss": 2.1875, + "step": 2802 + }, + { + "epoch": 9.11, + "learning_rate": 8.545455282288661e-05, + "loss": 2.1558, + "step": 2805 + }, + { + "epoch": 9.12, + "learning_rate": 8.525538259064381e-05, + "loss": 2.2156, + "step": 2808 + }, + { + "epoch": 9.13, + "learning_rate": 8.505627213047617e-05, + "loss": 2.1982, + "step": 2811 + }, + { + "epoch": 9.14, + "learning_rate": 8.485722224954237e-05, + "loss": 2.1447, + "step": 2814 + }, + { + "epoch": 9.15, + "learning_rate": 8.465823375475537e-05, + "loss": 2.2311, + "step": 2817 + }, + { + "epoch": 9.16, + "learning_rate": 8.445930745277953e-05, + "loss": 2.1685, + "step": 2820 + }, + { + "epoch": 9.17, + "learning_rate": 8.426044415002684e-05, + "loss": 2.1237, + "step": 2823 + }, + { + "epoch": 9.18, + "learning_rate": 8.406164465265406e-05, + "loss": 2.1082, + "step": 2826 + }, + { + "epoch": 9.19, + "learning_rate": 8.386290976655924e-05, + "loss": 2.1352, + "step": 2829 + }, + { + "epoch": 9.19, + "learning_rate": 8.366424029737853e-05, + "loss": 2.1588, + "step": 2832 + }, + { + "epoch": 9.2, + "learning_rate": 8.346563705048277e-05, + "loss": 2.2979, + "step": 2835 + }, + { + "epoch": 9.21, + "learning_rate": 8.326710083097462e-05, + "loss": 2.1507, + "step": 2838 + }, + { + "epoch": 9.22, + "learning_rate": 8.306863244368474e-05, + "loss": 2.127, + "step": 2841 + }, + { + "epoch": 9.23, + "learning_rate": 8.287023269316894e-05, + "loss": 2.1869, + "step": 2844 + }, + { + "epoch": 9.24, + "learning_rate": 8.267190238370482e-05, + "loss": 2.1259, + "step": 2847 + }, + { + "epoch": 9.25, + "learning_rate": 8.247364231928837e-05, + "loss": 2.2649, + "step": 2850 + }, + { + "epoch": 9.26, + "learning_rate": 8.227545330363087e-05, + "loss": 2.1888, + "step": 2853 + }, + { + "epoch": 9.27, + "learning_rate": 8.207733614015566e-05, + "loss": 2.1949, + "step": 2856 + }, + { + "epoch": 9.28, + "learning_rate": 8.18792916319947e-05, + "loss": 2.1379, + "step": 2859 + }, + { + "epoch": 9.29, + "learning_rate": 8.168132058198546e-05, + "loss": 2.1585, + "step": 2862 + }, + { + "epoch": 9.3, + "learning_rate": 8.148342379266759e-05, + "loss": 2.1941, + "step": 2865 + }, + { + "epoch": 9.31, + "learning_rate": 8.128560206627974e-05, + "loss": 2.1447, + "step": 2868 + }, + { + "epoch": 9.32, + "learning_rate": 8.108785620475624e-05, + "loss": 2.1413, + "step": 2871 + }, + { + "epoch": 9.33, + "learning_rate": 8.089018700972393e-05, + "loss": 2.25, + "step": 2874 + }, + { + "epoch": 9.34, + "learning_rate": 8.069259528249882e-05, + "loss": 2.2048, + "step": 2877 + }, + { + "epoch": 9.35, + "learning_rate": 8.049508182408284e-05, + "loss": 2.1432, + "step": 2880 + }, + { + "epoch": 9.36, + "learning_rate": 8.029764743516068e-05, + "loss": 2.2312, + "step": 2883 + }, + { + "epoch": 9.37, + "learning_rate": 8.01002929160965e-05, + "loss": 2.1929, + "step": 2886 + }, + { + "epoch": 9.38, + "learning_rate": 7.990301906693069e-05, + "loss": 2.1687, + "step": 2889 + }, + { + "epoch": 9.39, + "learning_rate": 7.970582668737652e-05, + "loss": 2.1176, + "step": 2892 + }, + { + "epoch": 9.4, + "learning_rate": 7.950871657681716e-05, + "loss": 2.1793, + "step": 2895 + }, + { + "epoch": 9.41, + "learning_rate": 7.931168953430213e-05, + "loss": 2.1538, + "step": 2898 + }, + { + "epoch": 9.42, + "learning_rate": 7.91147463585443e-05, + "loss": 2.1519, + "step": 2901 + }, + { + "epoch": 9.43, + "learning_rate": 7.891788784791655e-05, + "loss": 2.1821, + "step": 2904 + }, + { + "epoch": 9.44, + "learning_rate": 7.872111480044847e-05, + "loss": 2.145, + "step": 2907 + }, + { + "epoch": 9.45, + "learning_rate": 7.852442801382322e-05, + "loss": 2.1574, + "step": 2910 + }, + { + "epoch": 9.46, + "learning_rate": 7.832782828537437e-05, + "loss": 2.2808, + "step": 2913 + }, + { + "epoch": 9.47, + "learning_rate": 7.813131641208245e-05, + "loss": 2.1604, + "step": 2916 + }, + { + "epoch": 9.48, + "learning_rate": 7.793489319057195e-05, + "loss": 2.1786, + "step": 2919 + }, + { + "epoch": 9.49, + "learning_rate": 7.773855941710786e-05, + "loss": 2.2453, + "step": 2922 + }, + { + "epoch": 9.5, + "learning_rate": 7.754231588759265e-05, + "loss": 2.2529, + "step": 2925 + }, + { + "epoch": 9.51, + "learning_rate": 7.734616339756291e-05, + "loss": 2.1199, + "step": 2928 + }, + { + "epoch": 9.52, + "learning_rate": 7.715010274218625e-05, + "loss": 2.2108, + "step": 2931 + }, + { + "epoch": 9.53, + "learning_rate": 7.695413471625792e-05, + "loss": 2.2136, + "step": 2934 + }, + { + "epoch": 9.54, + "learning_rate": 7.675826011419766e-05, + "loss": 2.1859, + "step": 2937 + }, + { + "epoch": 9.55, + "learning_rate": 7.656247973004656e-05, + "loss": 2.1492, + "step": 2940 + }, + { + "epoch": 9.56, + "learning_rate": 7.63667943574637e-05, + "loss": 2.2054, + "step": 2943 + }, + { + "epoch": 9.56, + "learning_rate": 7.617120478972297e-05, + "loss": 2.1295, + "step": 2946 + }, + { + "epoch": 9.57, + "learning_rate": 7.597571181971006e-05, + "loss": 2.0959, + "step": 2949 + }, + { + "epoch": 9.58, + "learning_rate": 7.578031623991886e-05, + "loss": 2.2796, + "step": 2952 + }, + { + "epoch": 9.59, + "learning_rate": 7.558501884244857e-05, + "loss": 2.0984, + "step": 2955 + }, + { + "epoch": 9.6, + "learning_rate": 7.538982041900033e-05, + "loss": 2.2107, + "step": 2958 + }, + { + "epoch": 9.61, + "learning_rate": 7.519472176087414e-05, + "loss": 2.1768, + "step": 2961 + }, + { + "epoch": 9.62, + "learning_rate": 7.49997236589654e-05, + "loss": 2.1817, + "step": 2964 + }, + { + "epoch": 9.63, + "learning_rate": 7.480482690376207e-05, + "loss": 2.1886, + "step": 2967 + }, + { + "epoch": 9.64, + "learning_rate": 7.461003228534115e-05, + "loss": 2.2048, + "step": 2970 + }, + { + "epoch": 9.65, + "learning_rate": 7.441534059336563e-05, + "loss": 2.1702, + "step": 2973 + }, + { + "epoch": 9.66, + "learning_rate": 7.422075261708125e-05, + "loss": 2.1699, + "step": 2976 + }, + { + "epoch": 9.67, + "learning_rate": 7.402626914531328e-05, + "loss": 2.1161, + "step": 2979 + }, + { + "epoch": 9.68, + "learning_rate": 7.383189096646335e-05, + "loss": 2.1665, + "step": 2982 + }, + { + "epoch": 9.69, + "learning_rate": 7.363761886850633e-05, + "loss": 2.196, + "step": 2985 + }, + { + "epoch": 9.7, + "learning_rate": 7.344345363898697e-05, + "loss": 2.1785, + "step": 2988 + }, + { + "epoch": 9.71, + "learning_rate": 7.324939606501685e-05, + "loss": 2.1046, + "step": 2991 + }, + { + "epoch": 9.72, + "learning_rate": 7.305544693327106e-05, + "loss": 2.1544, + "step": 2994 + }, + { + "epoch": 9.73, + "learning_rate": 7.286160702998515e-05, + "loss": 2.1705, + "step": 2997 + }, + { + "epoch": 9.74, + "learning_rate": 7.266787714095182e-05, + "loss": 2.2029, + "step": 3000 + }, + { + "epoch": 9.75, + "learning_rate": 7.247425805151788e-05, + "loss": 2.1483, + "step": 3003 + }, + { + "epoch": 9.76, + "learning_rate": 7.228075054658096e-05, + "loss": 2.1243, + "step": 3006 + }, + { + "epoch": 9.77, + "learning_rate": 7.208735541058622e-05, + "loss": 2.0956, + "step": 3009 + }, + { + "epoch": 9.78, + "learning_rate": 7.189407342752345e-05, + "loss": 2.1788, + "step": 3012 + }, + { + "epoch": 9.79, + "learning_rate": 7.170090538092367e-05, + "loss": 2.2902, + "step": 3015 + }, + { + "epoch": 9.8, + "learning_rate": 7.150785205385596e-05, + "loss": 2.0941, + "step": 3018 + }, + { + "epoch": 9.81, + "learning_rate": 7.131491422892454e-05, + "loss": 2.1134, + "step": 3021 + }, + { + "epoch": 9.82, + "learning_rate": 7.112209268826517e-05, + "loss": 2.1195, + "step": 3024 + }, + { + "epoch": 9.83, + "learning_rate": 7.092938821354238e-05, + "loss": 2.2264, + "step": 3027 + }, + { + "epoch": 9.84, + "learning_rate": 7.073680158594609e-05, + "loss": 2.205, + "step": 3030 + }, + { + "epoch": 9.85, + "learning_rate": 7.054433358618847e-05, + "loss": 2.1962, + "step": 3033 + }, + { + "epoch": 9.86, + "learning_rate": 7.035198499450071e-05, + "loss": 2.1881, + "step": 3036 + }, + { + "epoch": 9.87, + "learning_rate": 7.015975659063017e-05, + "loss": 2.1807, + "step": 3039 + }, + { + "epoch": 9.88, + "learning_rate": 6.996764915383681e-05, + "loss": 2.1038, + "step": 3042 + }, + { + "epoch": 9.89, + "learning_rate": 6.97756634628902e-05, + "loss": 2.1783, + "step": 3045 + }, + { + "epoch": 9.9, + "learning_rate": 6.95838002960665e-05, + "loss": 2.1281, + "step": 3048 + }, + { + "epoch": 9.91, + "learning_rate": 6.939206043114506e-05, + "loss": 2.1633, + "step": 3051 + }, + { + "epoch": 9.92, + "learning_rate": 6.920044464540543e-05, + "loss": 2.1237, + "step": 3054 + }, + { + "epoch": 9.93, + "learning_rate": 6.900895371562419e-05, + "loss": 2.1732, + "step": 3057 + }, + { + "epoch": 9.94, + "learning_rate": 6.881758841807176e-05, + "loss": 2.139, + "step": 3060 + }, + { + "epoch": 9.94, + "learning_rate": 6.862634952850926e-05, + "loss": 2.1296, + "step": 3063 + }, + { + "epoch": 9.95, + "learning_rate": 6.843523782218534e-05, + "loss": 2.1914, + "step": 3066 + }, + { + "epoch": 9.96, + "learning_rate": 6.824425407383315e-05, + "loss": 2.1584, + "step": 3069 + }, + { + "epoch": 9.97, + "learning_rate": 6.805339905766706e-05, + "loss": 2.128, + "step": 3072 + }, + { + "epoch": 9.98, + "learning_rate": 6.786267354737955e-05, + "loss": 2.082, + "step": 3075 + }, + { + "epoch": 9.99, + "learning_rate": 6.767207831613828e-05, + "loss": 2.0882, + "step": 3078 + }, + { + "epoch": 10.0, + "eval_accuracy": 0.636978362829467, + "eval_loss": 2.084742307662964, + "eval_runtime": 16.3444, + "eval_samples_per_second": 134.419, + "eval_steps_per_second": 67.24, + "step": 3080 + }, + { + "epoch": 10.0, + "learning_rate": 6.748161413658256e-05, + "loss": 2.0895, + "step": 3081 + }, + { + "epoch": 10.01, + "learning_rate": 6.729128178082058e-05, + "loss": 2.143, + "step": 3084 + }, + { + "epoch": 10.02, + "learning_rate": 6.71010820204261e-05, + "loss": 2.1523, + "step": 3087 + }, + { + "epoch": 10.03, + "learning_rate": 6.691101562643534e-05, + "loss": 2.1812, + "step": 3090 + }, + { + "epoch": 10.04, + "learning_rate": 6.672108336934386e-05, + "loss": 2.1807, + "step": 3093 + }, + { + "epoch": 10.05, + "learning_rate": 6.653128601910357e-05, + "loss": 2.1916, + "step": 3096 + }, + { + "epoch": 10.06, + "learning_rate": 6.63416243451194e-05, + "loss": 2.2036, + "step": 3099 + }, + { + "epoch": 10.07, + "learning_rate": 6.615209911624623e-05, + "loss": 2.1112, + "step": 3102 + }, + { + "epoch": 10.08, + "learning_rate": 6.596271110078591e-05, + "loss": 2.0984, + "step": 3105 + }, + { + "epoch": 10.09, + "learning_rate": 6.577346106648399e-05, + "loss": 2.2862, + "step": 3108 + }, + { + "epoch": 10.1, + "learning_rate": 6.558434978052667e-05, + "loss": 2.1379, + "step": 3111 + }, + { + "epoch": 10.11, + "learning_rate": 6.539537800953777e-05, + "loss": 2.1475, + "step": 3114 + }, + { + "epoch": 10.12, + "learning_rate": 6.520654651957543e-05, + "loss": 2.144, + "step": 3117 + }, + { + "epoch": 10.13, + "learning_rate": 6.50178560761292e-05, + "loss": 2.1383, + "step": 3120 + }, + { + "epoch": 10.14, + "learning_rate": 6.482930744411677e-05, + "loss": 2.0835, + "step": 3123 + }, + { + "epoch": 10.15, + "learning_rate": 6.464090138788102e-05, + "loss": 2.0889, + "step": 3126 + }, + { + "epoch": 10.16, + "learning_rate": 6.445263867118679e-05, + "loss": 2.2135, + "step": 3129 + }, + { + "epoch": 10.17, + "learning_rate": 6.426452005721797e-05, + "loss": 2.1301, + "step": 3132 + }, + { + "epoch": 10.18, + "learning_rate": 6.407654630857416e-05, + "loss": 2.0498, + "step": 3135 + }, + { + "epoch": 10.19, + "learning_rate": 6.388871818726774e-05, + "loss": 2.0766, + "step": 3138 + }, + { + "epoch": 10.2, + "learning_rate": 6.370103645472072e-05, + "loss": 2.195, + "step": 3141 + }, + { + "epoch": 10.21, + "learning_rate": 6.351350187176176e-05, + "loss": 2.2205, + "step": 3144 + }, + { + "epoch": 10.22, + "learning_rate": 6.332611519862284e-05, + "loss": 2.1172, + "step": 3147 + }, + { + "epoch": 10.23, + "learning_rate": 6.313887719493657e-05, + "loss": 2.1688, + "step": 3150 + }, + { + "epoch": 10.24, + "learning_rate": 6.295178861973267e-05, + "loss": 2.0986, + "step": 3153 + }, + { + "epoch": 10.25, + "learning_rate": 6.27648502314352e-05, + "loss": 2.0658, + "step": 3156 + }, + { + "epoch": 10.26, + "learning_rate": 6.257806278785937e-05, + "loss": 2.1681, + "step": 3159 + }, + { + "epoch": 10.27, + "learning_rate": 6.239142704620853e-05, + "loss": 2.2028, + "step": 3162 + }, + { + "epoch": 10.28, + "learning_rate": 6.220494376307094e-05, + "loss": 2.1707, + "step": 3165 + }, + { + "epoch": 10.29, + "learning_rate": 6.201861369441697e-05, + "loss": 2.2144, + "step": 3168 + }, + { + "epoch": 10.3, + "learning_rate": 6.183243759559579e-05, + "loss": 2.2155, + "step": 3171 + }, + { + "epoch": 10.31, + "learning_rate": 6.164641622133241e-05, + "loss": 2.1628, + "step": 3174 + }, + { + "epoch": 10.31, + "learning_rate": 6.146055032572466e-05, + "loss": 2.1457, + "step": 3177 + }, + { + "epoch": 10.32, + "learning_rate": 6.127484066224005e-05, + "loss": 2.079, + "step": 3180 + }, + { + "epoch": 10.33, + "learning_rate": 6.108928798371272e-05, + "loss": 2.1211, + "step": 3183 + }, + { + "epoch": 10.34, + "learning_rate": 6.090389304234052e-05, + "loss": 2.1723, + "step": 3186 + }, + { + "epoch": 10.35, + "learning_rate": 6.0718656589681764e-05, + "loss": 2.1723, + "step": 3189 + }, + { + "epoch": 10.36, + "learning_rate": 6.053357937665237e-05, + "loss": 2.1179, + "step": 3192 + }, + { + "epoch": 10.37, + "learning_rate": 6.034866215352262e-05, + "loss": 2.2066, + "step": 3195 + }, + { + "epoch": 10.38, + "learning_rate": 6.016390566991429e-05, + "loss": 2.1562, + "step": 3198 + }, + { + "epoch": 10.39, + "learning_rate": 5.997931067479753e-05, + "loss": 2.1374, + "step": 3201 + }, + { + "epoch": 10.4, + "learning_rate": 5.979487791648789e-05, + "loss": 2.1595, + "step": 3204 + }, + { + "epoch": 10.41, + "learning_rate": 5.961060814264321e-05, + "loss": 2.194, + "step": 3207 + }, + { + "epoch": 10.42, + "learning_rate": 5.942650210026055e-05, + "loss": 2.1749, + "step": 3210 + }, + { + "epoch": 10.43, + "learning_rate": 5.9242560535673344e-05, + "loss": 2.207, + "step": 3213 + }, + { + "epoch": 10.44, + "learning_rate": 5.905878419454821e-05, + "loss": 2.1641, + "step": 3216 + }, + { + "epoch": 10.45, + "learning_rate": 5.8875173821881904e-05, + "loss": 2.2015, + "step": 3219 + }, + { + "epoch": 10.46, + "learning_rate": 5.869173016199858e-05, + "loss": 2.0588, + "step": 3222 + }, + { + "epoch": 10.47, + "learning_rate": 5.850845395854636e-05, + "loss": 2.1809, + "step": 3225 + }, + { + "epoch": 10.48, + "learning_rate": 5.8325345954494633e-05, + "loss": 2.0862, + "step": 3228 + }, + { + "epoch": 10.49, + "learning_rate": 5.814240689213086e-05, + "loss": 2.1122, + "step": 3231 + }, + { + "epoch": 10.5, + "learning_rate": 5.795963751305777e-05, + "loss": 2.1289, + "step": 3234 + }, + { + "epoch": 10.51, + "learning_rate": 5.77770385581901e-05, + "loss": 2.1519, + "step": 3237 + }, + { + "epoch": 10.52, + "learning_rate": 5.759461076775177e-05, + "loss": 2.1731, + "step": 3240 + }, + { + "epoch": 10.53, + "learning_rate": 5.7412354881272865e-05, + "loss": 2.1847, + "step": 3243 + }, + { + "epoch": 10.54, + "learning_rate": 5.7230271637586555e-05, + "loss": 2.2063, + "step": 3246 + }, + { + "epoch": 10.55, + "learning_rate": 5.7048361774826086e-05, + "loss": 2.1409, + "step": 3249 + }, + { + "epoch": 10.56, + "learning_rate": 5.686662603042201e-05, + "loss": 2.0635, + "step": 3252 + }, + { + "epoch": 10.57, + "learning_rate": 5.668506514109887e-05, + "loss": 2.0779, + "step": 3255 + }, + { + "epoch": 10.58, + "learning_rate": 5.6503679842872506e-05, + "loss": 2.0536, + "step": 3258 + }, + { + "epoch": 10.59, + "learning_rate": 5.6322470871046825e-05, + "loss": 2.1569, + "step": 3261 + }, + { + "epoch": 10.6, + "learning_rate": 5.6141438960211065e-05, + "loss": 2.1513, + "step": 3264 + }, + { + "epoch": 10.61, + "learning_rate": 5.596058484423656e-05, + "loss": 2.1937, + "step": 3267 + }, + { + "epoch": 10.62, + "learning_rate": 5.5779909256274035e-05, + "loss": 2.1962, + "step": 3270 + }, + { + "epoch": 10.63, + "learning_rate": 5.559941292875035e-05, + "loss": 2.1182, + "step": 3273 + }, + { + "epoch": 10.64, + "learning_rate": 5.5419096593365724e-05, + "loss": 2.1865, + "step": 3276 + }, + { + "epoch": 10.65, + "learning_rate": 5.523896098109079e-05, + "loss": 2.158, + "step": 3279 + }, + { + "epoch": 10.66, + "learning_rate": 5.505900682216354e-05, + "loss": 2.0896, + "step": 3282 + }, + { + "epoch": 10.67, + "learning_rate": 5.487923484608629e-05, + "loss": 2.1242, + "step": 3285 + }, + { + "epoch": 10.68, + "learning_rate": 5.469964578162288e-05, + "loss": 2.1423, + "step": 3288 + }, + { + "epoch": 10.69, + "learning_rate": 5.4520240356795725e-05, + "loss": 2.114, + "step": 3291 + }, + { + "epoch": 10.69, + "learning_rate": 5.4341019298882656e-05, + "loss": 2.1531, + "step": 3294 + }, + { + "epoch": 10.7, + "learning_rate": 5.416198333441423e-05, + "loss": 2.1431, + "step": 3297 + }, + { + "epoch": 10.71, + "learning_rate": 5.3983133189170686e-05, + "loss": 2.0837, + "step": 3300 + }, + { + "epoch": 10.72, + "learning_rate": 5.380446958817888e-05, + "loss": 2.1647, + "step": 3303 + }, + { + "epoch": 10.73, + "learning_rate": 5.362599325570945e-05, + "loss": 2.1104, + "step": 3306 + }, + { + "epoch": 10.74, + "learning_rate": 5.344770491527402e-05, + "loss": 2.11, + "step": 3309 + }, + { + "epoch": 10.75, + "learning_rate": 5.3269605289621947e-05, + "loss": 2.1962, + "step": 3312 + }, + { + "epoch": 10.76, + "learning_rate": 5.309169510073777e-05, + "loss": 2.213, + "step": 3315 + }, + { + "epoch": 10.77, + "learning_rate": 5.291397506983786e-05, + "loss": 2.0556, + "step": 3318 + }, + { + "epoch": 10.78, + "learning_rate": 5.273644591736793e-05, + "loss": 2.2094, + "step": 3321 + }, + { + "epoch": 10.79, + "learning_rate": 5.2559108362999796e-05, + "loss": 2.0953, + "step": 3324 + }, + { + "epoch": 10.8, + "learning_rate": 5.238196312562851e-05, + "loss": 2.2436, + "step": 3327 + }, + { + "epoch": 10.81, + "learning_rate": 5.220501092336966e-05, + "loss": 2.1752, + "step": 3330 + }, + { + "epoch": 10.82, + "learning_rate": 5.2028252473556226e-05, + "loss": 2.1757, + "step": 3333 + }, + { + "epoch": 10.83, + "learning_rate": 5.1851688492735705e-05, + "loss": 2.193, + "step": 3336 + }, + { + "epoch": 10.84, + "learning_rate": 5.167531969666735e-05, + "loss": 2.1306, + "step": 3339 + }, + { + "epoch": 10.85, + "learning_rate": 5.149914680031909e-05, + "loss": 2.1043, + "step": 3342 + }, + { + "epoch": 10.86, + "learning_rate": 5.132317051786468e-05, + "loss": 2.1032, + "step": 3345 + }, + { + "epoch": 10.87, + "learning_rate": 5.114739156268094e-05, + "loss": 2.1205, + "step": 3348 + }, + { + "epoch": 10.88, + "learning_rate": 5.097181064734475e-05, + "loss": 2.1494, + "step": 3351 + }, + { + "epoch": 10.89, + "learning_rate": 5.0796428483630074e-05, + "loss": 2.1382, + "step": 3354 + }, + { + "epoch": 10.9, + "learning_rate": 5.062124578250529e-05, + "loss": 2.0946, + "step": 3357 + }, + { + "epoch": 10.91, + "learning_rate": 5.04462632541301e-05, + "loss": 2.0986, + "step": 3360 + }, + { + "epoch": 10.92, + "learning_rate": 5.027148160785273e-05, + "loss": 2.1342, + "step": 3363 + }, + { + "epoch": 10.93, + "learning_rate": 5.009690155220715e-05, + "loss": 2.1049, + "step": 3366 + }, + { + "epoch": 10.94, + "learning_rate": 4.992252379491012e-05, + "loss": 2.2269, + "step": 3369 + }, + { + "epoch": 10.95, + "learning_rate": 4.974834904285822e-05, + "loss": 2.1275, + "step": 3372 + }, + { + "epoch": 10.96, + "learning_rate": 4.957437800212512e-05, + "loss": 2.0984, + "step": 3375 + }, + { + "epoch": 10.97, + "learning_rate": 4.940061137795876e-05, + "loss": 2.1488, + "step": 3378 + }, + { + "epoch": 10.98, + "learning_rate": 4.9227049874778306e-05, + "loss": 2.1061, + "step": 3381 + }, + { + "epoch": 10.99, + "learning_rate": 4.905369419617137e-05, + "loss": 2.2105, + "step": 3384 + }, + { + "epoch": 11.0, + "learning_rate": 4.888054504489142e-05, + "loss": 2.1669, + "step": 3387 + }, + { + "epoch": 11.0, + "eval_accuracy": 0.6398645281002108, + "eval_loss": 2.068744421005249, + "eval_runtime": 16.3144, + "eval_samples_per_second": 134.666, + "eval_steps_per_second": 67.364, + "step": 3388 + }, + { + "epoch": 11.01, + "learning_rate": 4.870760312285445e-05, + "loss": 2.1265, + "step": 3390 + }, + { + "epoch": 11.02, + "learning_rate": 4.853486913113644e-05, + "loss": 2.2385, + "step": 3393 + }, + { + "epoch": 11.03, + "learning_rate": 4.836234376997056e-05, + "loss": 2.0393, + "step": 3396 + }, + { + "epoch": 11.04, + "learning_rate": 4.8190027738744134e-05, + "loss": 2.0961, + "step": 3399 + }, + { + "epoch": 11.05, + "learning_rate": 4.801792173599586e-05, + "loss": 2.0916, + "step": 3402 + }, + { + "epoch": 11.06, + "learning_rate": 4.784602645941314e-05, + "loss": 2.1191, + "step": 3405 + }, + { + "epoch": 11.06, + "learning_rate": 4.7674342605829094e-05, + "loss": 2.0921, + "step": 3408 + }, + { + "epoch": 11.07, + "learning_rate": 4.7502870871219675e-05, + "loss": 2.0888, + "step": 3411 + }, + { + "epoch": 11.08, + "learning_rate": 4.7331611950701096e-05, + "loss": 2.1485, + "step": 3414 + }, + { + "epoch": 11.09, + "learning_rate": 4.716056653852672e-05, + "loss": 2.1266, + "step": 3417 + }, + { + "epoch": 11.1, + "learning_rate": 4.698973532808443e-05, + "loss": 2.1702, + "step": 3420 + }, + { + "epoch": 11.11, + "learning_rate": 4.6819119011893805e-05, + "loss": 2.1296, + "step": 3423 + }, + { + "epoch": 11.12, + "learning_rate": 4.664871828160331e-05, + "loss": 2.0754, + "step": 3426 + }, + { + "epoch": 11.13, + "learning_rate": 4.647853382798736e-05, + "loss": 2.1102, + "step": 3429 + }, + { + "epoch": 11.14, + "learning_rate": 4.630856634094366e-05, + "loss": 2.1222, + "step": 3432 + }, + { + "epoch": 11.15, + "learning_rate": 4.613881650949044e-05, + "loss": 2.1703, + "step": 3435 + }, + { + "epoch": 11.16, + "learning_rate": 4.596928502176349e-05, + "loss": 2.1288, + "step": 3438 + }, + { + "epoch": 11.17, + "learning_rate": 4.579997256501355e-05, + "loss": 2.0522, + "step": 3441 + }, + { + "epoch": 11.18, + "learning_rate": 4.563087982560345e-05, + "loss": 2.1225, + "step": 3444 + }, + { + "epoch": 11.19, + "learning_rate": 4.546200748900525e-05, + "loss": 2.068, + "step": 3447 + }, + { + "epoch": 11.2, + "learning_rate": 4.529335623979757e-05, + "loss": 2.0433, + "step": 3450 + }, + { + "epoch": 11.21, + "learning_rate": 4.512492676166283e-05, + "loss": 2.0679, + "step": 3453 + }, + { + "epoch": 11.22, + "learning_rate": 4.49567197373844e-05, + "loss": 2.1568, + "step": 3456 + }, + { + "epoch": 11.23, + "learning_rate": 4.478873584884378e-05, + "loss": 2.0446, + "step": 3459 + }, + { + "epoch": 11.24, + "learning_rate": 4.4620975777018034e-05, + "loss": 2.1532, + "step": 3462 + }, + { + "epoch": 11.25, + "learning_rate": 4.44534402019769e-05, + "loss": 2.1659, + "step": 3465 + }, + { + "epoch": 11.26, + "learning_rate": 4.428612980287996e-05, + "loss": 2.1492, + "step": 3468 + }, + { + "epoch": 11.27, + "learning_rate": 4.411904525797408e-05, + "loss": 2.1019, + "step": 3471 + }, + { + "epoch": 11.28, + "learning_rate": 4.395218724459047e-05, + "loss": 2.0283, + "step": 3474 + }, + { + "epoch": 11.29, + "learning_rate": 4.3785556439142005e-05, + "loss": 2.1538, + "step": 3477 + }, + { + "epoch": 11.3, + "learning_rate": 4.361915351712059e-05, + "loss": 2.1303, + "step": 3480 + }, + { + "epoch": 11.31, + "learning_rate": 4.345297915309432e-05, + "loss": 2.0312, + "step": 3483 + }, + { + "epoch": 11.32, + "learning_rate": 4.3287034020704684e-05, + "loss": 2.1296, + "step": 3486 + }, + { + "epoch": 11.33, + "learning_rate": 4.3121318792663914e-05, + "loss": 2.1551, + "step": 3489 + }, + { + "epoch": 11.34, + "learning_rate": 4.295583414075234e-05, + "loss": 2.1197, + "step": 3492 + }, + { + "epoch": 11.35, + "learning_rate": 4.279058073581544e-05, + "loss": 2.2175, + "step": 3495 + }, + { + "epoch": 11.36, + "learning_rate": 4.2625559247761394e-05, + "loss": 2.1445, + "step": 3498 + }, + { + "epoch": 11.37, + "learning_rate": 4.246077034555819e-05, + "loss": 2.1581, + "step": 3501 + }, + { + "epoch": 11.38, + "learning_rate": 4.229621469723091e-05, + "loss": 2.0796, + "step": 3504 + }, + { + "epoch": 11.39, + "learning_rate": 4.2131892969859054e-05, + "loss": 2.1374, + "step": 3507 + }, + { + "epoch": 11.4, + "learning_rate": 4.196780582957396e-05, + "loss": 2.1188, + "step": 3510 + }, + { + "epoch": 11.41, + "learning_rate": 4.180395394155584e-05, + "loss": 2.0809, + "step": 3513 + }, + { + "epoch": 11.42, + "learning_rate": 4.1640337970031384e-05, + "loss": 2.1211, + "step": 3516 + }, + { + "epoch": 11.43, + "learning_rate": 4.1476958578270783e-05, + "loss": 2.1566, + "step": 3519 + }, + { + "epoch": 11.44, + "learning_rate": 4.1313816428585316e-05, + "loss": 2.1824, + "step": 3522 + }, + { + "epoch": 11.44, + "learning_rate": 4.1150912182324396e-05, + "loss": 2.1873, + "step": 3525 + }, + { + "epoch": 11.45, + "learning_rate": 4.098824649987304e-05, + "loss": 2.1329, + "step": 3528 + }, + { + "epoch": 11.46, + "learning_rate": 4.0825820040649246e-05, + "loss": 2.1283, + "step": 3531 + }, + { + "epoch": 11.47, + "learning_rate": 4.06636334631012e-05, + "loss": 2.1378, + "step": 3534 + }, + { + "epoch": 11.48, + "learning_rate": 4.0501687424704613e-05, + "loss": 2.1189, + "step": 3537 + }, + { + "epoch": 11.49, + "learning_rate": 4.033998258196019e-05, + "loss": 2.1541, + "step": 3540 + }, + { + "epoch": 11.5, + "learning_rate": 4.017851959039075e-05, + "loss": 2.1264, + "step": 3543 + }, + { + "epoch": 11.51, + "learning_rate": 4.001729910453872e-05, + "loss": 2.0864, + "step": 3546 + }, + { + "epoch": 11.52, + "learning_rate": 3.985632177796353e-05, + "loss": 2.0457, + "step": 3549 + }, + { + "epoch": 11.53, + "learning_rate": 3.9695588263238847e-05, + "loss": 2.1076, + "step": 3552 + }, + { + "epoch": 11.54, + "learning_rate": 3.953509921194991e-05, + "loss": 2.1368, + "step": 3555 + }, + { + "epoch": 11.55, + "learning_rate": 3.9374855274691035e-05, + "loss": 2.1462, + "step": 3558 + }, + { + "epoch": 11.56, + "learning_rate": 3.921485710106283e-05, + "loss": 2.1003, + "step": 3561 + }, + { + "epoch": 11.57, + "learning_rate": 3.9055105339669595e-05, + "loss": 2.1287, + "step": 3564 + }, + { + "epoch": 11.58, + "learning_rate": 3.889560063811679e-05, + "loss": 2.144, + "step": 3567 + }, + { + "epoch": 11.59, + "learning_rate": 3.873634364300835e-05, + "loss": 2.1544, + "step": 3570 + }, + { + "epoch": 11.6, + "learning_rate": 3.857733499994397e-05, + "loss": 2.0732, + "step": 3573 + }, + { + "epoch": 11.61, + "learning_rate": 3.841857535351657e-05, + "loss": 2.1219, + "step": 3576 + }, + { + "epoch": 11.62, + "learning_rate": 3.82600653473098e-05, + "loss": 2.1629, + "step": 3579 + }, + { + "epoch": 11.63, + "learning_rate": 3.810180562389519e-05, + "loss": 2.1042, + "step": 3582 + }, + { + "epoch": 11.64, + "learning_rate": 3.794379682482965e-05, + "loss": 2.1244, + "step": 3585 + }, + { + "epoch": 11.65, + "learning_rate": 3.7786039590653076e-05, + "loss": 2.18, + "step": 3588 + }, + { + "epoch": 11.66, + "learning_rate": 3.762853456088538e-05, + "loss": 2.1475, + "step": 3591 + }, + { + "epoch": 11.67, + "learning_rate": 3.747128237402409e-05, + "loss": 2.0909, + "step": 3594 + }, + { + "epoch": 11.68, + "learning_rate": 3.7314283667541885e-05, + "loss": 2.123, + "step": 3597 + }, + { + "epoch": 11.69, + "learning_rate": 3.715753907788374e-05, + "loss": 2.121, + "step": 3600 + }, + { + "epoch": 11.7, + "learning_rate": 3.700104924046452e-05, + "loss": 2.0345, + "step": 3603 + }, + { + "epoch": 11.71, + "learning_rate": 3.6844814789666436e-05, + "loss": 2.0413, + "step": 3606 + }, + { + "epoch": 11.72, + "learning_rate": 3.6688836358836386e-05, + "loss": 2.1818, + "step": 3609 + }, + { + "epoch": 11.73, + "learning_rate": 3.6533114580283315e-05, + "loss": 2.1043, + "step": 3612 + }, + { + "epoch": 11.74, + "learning_rate": 3.6377650085275874e-05, + "loss": 2.0476, + "step": 3615 + }, + { + "epoch": 11.75, + "learning_rate": 3.622244350403965e-05, + "loss": 2.124, + "step": 3618 + }, + { + "epoch": 11.76, + "learning_rate": 3.6067495465754666e-05, + "loss": 2.0856, + "step": 3621 + }, + { + "epoch": 11.77, + "learning_rate": 3.591280659855296e-05, + "loss": 2.1257, + "step": 3624 + }, + { + "epoch": 11.78, + "learning_rate": 3.575837752951591e-05, + "loss": 2.1757, + "step": 3627 + }, + { + "epoch": 11.79, + "learning_rate": 3.5604208884671645e-05, + "loss": 2.0124, + "step": 3630 + }, + { + "epoch": 11.8, + "learning_rate": 3.5450301288992596e-05, + "loss": 2.0324, + "step": 3633 + }, + { + "epoch": 11.81, + "learning_rate": 3.529665536639305e-05, + "loss": 2.1634, + "step": 3636 + }, + { + "epoch": 11.81, + "learning_rate": 3.514327173972638e-05, + "loss": 2.1465, + "step": 3639 + }, + { + "epoch": 11.82, + "learning_rate": 3.4990151030782744e-05, + "loss": 2.0668, + "step": 3642 + }, + { + "epoch": 11.83, + "learning_rate": 3.483729386028651e-05, + "loss": 2.1991, + "step": 3645 + }, + { + "epoch": 11.84, + "learning_rate": 3.468470084789359e-05, + "loss": 2.0814, + "step": 3648 + }, + { + "epoch": 11.85, + "learning_rate": 3.4532372612189104e-05, + "loss": 2.1976, + "step": 3651 + }, + { + "epoch": 11.86, + "learning_rate": 3.438030977068487e-05, + "loss": 2.1935, + "step": 3654 + }, + { + "epoch": 11.87, + "learning_rate": 3.422851293981676e-05, + "loss": 2.1086, + "step": 3657 + }, + { + "epoch": 11.88, + "learning_rate": 3.4076982734942296e-05, + "loss": 2.1479, + "step": 3660 + }, + { + "epoch": 11.89, + "learning_rate": 3.392571977033819e-05, + "loss": 2.1281, + "step": 3663 + }, + { + "epoch": 11.9, + "learning_rate": 3.377472465919784e-05, + "loss": 2.1517, + "step": 3666 + }, + { + "epoch": 11.91, + "learning_rate": 3.3623998013628675e-05, + "loss": 2.1178, + "step": 3669 + }, + { + "epoch": 11.92, + "learning_rate": 3.347354044464997e-05, + "loss": 2.0988, + "step": 3672 + }, + { + "epoch": 11.93, + "learning_rate": 3.332335256219012e-05, + "loss": 2.2034, + "step": 3675 + }, + { + "epoch": 11.94, + "learning_rate": 3.317343497508424e-05, + "loss": 2.1123, + "step": 3678 + }, + { + "epoch": 11.95, + "learning_rate": 3.302378829107178e-05, + "loss": 2.1258, + "step": 3681 + }, + { + "epoch": 11.96, + "learning_rate": 3.2874413116794e-05, + "loss": 2.1263, + "step": 3684 + }, + { + "epoch": 11.97, + "learning_rate": 3.2725310057791456e-05, + "loss": 2.0565, + "step": 3687 + }, + { + "epoch": 11.98, + "learning_rate": 3.2576479718501584e-05, + "loss": 2.0825, + "step": 3690 + }, + { + "epoch": 11.99, + "learning_rate": 3.242792270225635e-05, + "loss": 2.1262, + "step": 3693 + }, + { + "epoch": 12.0, + "learning_rate": 3.227963961127961e-05, + "loss": 2.0983, + "step": 3696 + }, + { + "epoch": 12.0, + "eval_accuracy": 0.6422703241176929, + "eval_loss": 2.062872886657715, + "eval_runtime": 16.2609, + "eval_samples_per_second": 135.109, + "eval_steps_per_second": 67.585, + "step": 3696 + }, + { + "epoch": 12.01, + "learning_rate": 3.213163104668485e-05, + "loss": 2.0955, + "step": 3699 + }, + { + "epoch": 12.02, + "learning_rate": 3.19838976084727e-05, + "loss": 2.1043, + "step": 3702 + }, + { + "epoch": 12.03, + "learning_rate": 3.18364398955284e-05, + "loss": 2.1251, + "step": 3705 + }, + { + "epoch": 12.04, + "learning_rate": 3.168925850561943e-05, + "loss": 2.1206, + "step": 3708 + }, + { + "epoch": 12.05, + "learning_rate": 3.154235403539323e-05, + "loss": 2.0734, + "step": 3711 + }, + { + "epoch": 12.06, + "learning_rate": 3.1395727080374505e-05, + "loss": 2.178, + "step": 3714 + }, + { + "epoch": 12.07, + "learning_rate": 3.12493782349631e-05, + "loss": 2.0629, + "step": 3717 + }, + { + "epoch": 12.08, + "learning_rate": 3.110330809243134e-05, + "loss": 2.1535, + "step": 3720 + }, + { + "epoch": 12.09, + "learning_rate": 3.095751724492185e-05, + "loss": 2.1331, + "step": 3723 + }, + { + "epoch": 12.1, + "learning_rate": 3.081200628344494e-05, + "loss": 2.0902, + "step": 3726 + }, + { + "epoch": 12.11, + "learning_rate": 3.066677579787631e-05, + "loss": 2.0434, + "step": 3729 + }, + { + "epoch": 12.12, + "learning_rate": 3.0521826376954755e-05, + "loss": 2.1005, + "step": 3732 + }, + { + "epoch": 12.13, + "learning_rate": 3.0377158608279655e-05, + "loss": 2.1764, + "step": 3735 + }, + { + "epoch": 12.14, + "learning_rate": 3.0232773078308517e-05, + "loss": 2.1099, + "step": 3738 + }, + { + "epoch": 12.15, + "learning_rate": 3.0088670372354877e-05, + "loss": 2.1211, + "step": 3741 + }, + { + "epoch": 12.16, + "learning_rate": 2.99448510745856e-05, + "loss": 2.1546, + "step": 3744 + }, + { + "epoch": 12.17, + "learning_rate": 2.9801315768018688e-05, + "loss": 2.1664, + "step": 3747 + }, + { + "epoch": 12.18, + "learning_rate": 2.9658065034520978e-05, + "loss": 2.0983, + "step": 3750 + }, + { + "epoch": 12.19, + "learning_rate": 2.9515099454805663e-05, + "loss": 2.0519, + "step": 3753 + }, + { + "epoch": 12.19, + "learning_rate": 2.93724196084299e-05, + "loss": 2.1333, + "step": 3756 + }, + { + "epoch": 12.2, + "learning_rate": 2.923002607379265e-05, + "loss": 2.0304, + "step": 3759 + }, + { + "epoch": 12.21, + "learning_rate": 2.9087919428132114e-05, + "loss": 2.1549, + "step": 3762 + }, + { + "epoch": 12.22, + "learning_rate": 2.8946100247523533e-05, + "loss": 2.1191, + "step": 3765 + }, + { + "epoch": 12.23, + "learning_rate": 2.8804569106876832e-05, + "loss": 2.1154, + "step": 3768 + }, + { + "epoch": 12.24, + "learning_rate": 2.8663326579934292e-05, + "loss": 2.175, + "step": 3771 + }, + { + "epoch": 12.25, + "learning_rate": 2.8522373239268152e-05, + "loss": 2.1378, + "step": 3774 + }, + { + "epoch": 12.26, + "learning_rate": 2.8381709656278333e-05, + "loss": 2.0927, + "step": 3777 + }, + { + "epoch": 12.27, + "learning_rate": 2.8241336401190222e-05, + "loss": 2.1146, + "step": 3780 + }, + { + "epoch": 12.28, + "learning_rate": 2.810125404305216e-05, + "loss": 2.0147, + "step": 3783 + }, + { + "epoch": 12.29, + "learning_rate": 2.796146314973325e-05, + "loss": 2.1068, + "step": 3786 + }, + { + "epoch": 12.3, + "learning_rate": 2.7821964287921197e-05, + "loss": 2.1693, + "step": 3789 + }, + { + "epoch": 12.31, + "learning_rate": 2.7682758023119694e-05, + "loss": 2.1336, + "step": 3792 + }, + { + "epoch": 12.32, + "learning_rate": 2.7543844919646323e-05, + "loss": 2.0793, + "step": 3795 + }, + { + "epoch": 12.33, + "learning_rate": 2.740522554063033e-05, + "loss": 2.0712, + "step": 3798 + }, + { + "epoch": 12.34, + "learning_rate": 2.726690044801018e-05, + "loss": 2.0706, + "step": 3801 + }, + { + "epoch": 12.35, + "learning_rate": 2.7128870202531343e-05, + "loss": 2.0728, + "step": 3804 + }, + { + "epoch": 12.36, + "learning_rate": 2.6991135363744068e-05, + "loss": 2.1108, + "step": 3807 + }, + { + "epoch": 12.37, + "learning_rate": 2.6853696490001112e-05, + "loss": 2.104, + "step": 3810 + }, + { + "epoch": 12.38, + "learning_rate": 2.6716554138455353e-05, + "loss": 2.0752, + "step": 3813 + }, + { + "epoch": 12.39, + "learning_rate": 2.6579708865057694e-05, + "loss": 2.154, + "step": 3816 + }, + { + "epoch": 12.4, + "learning_rate": 2.6443161224554704e-05, + "loss": 2.0717, + "step": 3819 + }, + { + "epoch": 12.41, + "learning_rate": 2.6306911770486353e-05, + "loss": 2.1225, + "step": 3822 + }, + { + "epoch": 12.42, + "learning_rate": 2.6170961055183906e-05, + "loss": 2.1377, + "step": 3825 + }, + { + "epoch": 12.43, + "learning_rate": 2.6035309629767603e-05, + "loss": 2.1614, + "step": 3828 + }, + { + "epoch": 12.44, + "learning_rate": 2.5899958044144302e-05, + "loss": 2.1486, + "step": 3831 + }, + { + "epoch": 12.45, + "learning_rate": 2.576490684700542e-05, + "loss": 2.1206, + "step": 3834 + }, + { + "epoch": 12.46, + "learning_rate": 2.5630156585824727e-05, + "loss": 2.0882, + "step": 3837 + }, + { + "epoch": 12.47, + "learning_rate": 2.5495707806855938e-05, + "loss": 2.1787, + "step": 3840 + }, + { + "epoch": 12.48, + "learning_rate": 2.536156105513062e-05, + "loss": 2.0932, + "step": 3843 + }, + { + "epoch": 12.49, + "learning_rate": 2.522771687445612e-05, + "loss": 2.1471, + "step": 3846 + }, + { + "epoch": 12.5, + "learning_rate": 2.5094175807413055e-05, + "loss": 2.1226, + "step": 3849 + }, + { + "epoch": 12.51, + "learning_rate": 2.4960938395353296e-05, + "loss": 2.1666, + "step": 3852 + }, + { + "epoch": 12.52, + "learning_rate": 2.4828005178397838e-05, + "loss": 2.0437, + "step": 3855 + }, + { + "epoch": 12.53, + "learning_rate": 2.4695376695434448e-05, + "loss": 2.0396, + "step": 3858 + }, + { + "epoch": 12.54, + "learning_rate": 2.456305348411554e-05, + "loss": 2.09, + "step": 3861 + }, + { + "epoch": 12.55, + "learning_rate": 2.4431036080856073e-05, + "loss": 2.0419, + "step": 3864 + }, + { + "epoch": 12.56, + "learning_rate": 2.429932502083132e-05, + "loss": 2.0626, + "step": 3867 + }, + { + "epoch": 12.56, + "learning_rate": 2.41679208379746e-05, + "loss": 2.1798, + "step": 3870 + }, + { + "epoch": 12.57, + "learning_rate": 2.4036824064975317e-05, + "loss": 2.1082, + "step": 3873 + }, + { + "epoch": 12.58, + "learning_rate": 2.3906035233276614e-05, + "loss": 2.0504, + "step": 3876 + }, + { + "epoch": 12.59, + "learning_rate": 2.3775554873073292e-05, + "loss": 2.0439, + "step": 3879 + }, + { + "epoch": 12.6, + "learning_rate": 2.3645383513309704e-05, + "loss": 2.1104, + "step": 3882 + }, + { + "epoch": 12.61, + "learning_rate": 2.351552168167761e-05, + "loss": 2.088, + "step": 3885 + }, + { + "epoch": 12.62, + "learning_rate": 2.338596990461388e-05, + "loss": 2.0038, + "step": 3888 + }, + { + "epoch": 12.63, + "learning_rate": 2.3256728707298546e-05, + "loss": 2.043, + "step": 3891 + }, + { + "epoch": 12.64, + "learning_rate": 2.312779861365263e-05, + "loss": 2.0785, + "step": 3894 + }, + { + "epoch": 12.65, + "learning_rate": 2.299918014633592e-05, + "loss": 2.1406, + "step": 3897 + }, + { + "epoch": 12.66, + "learning_rate": 2.2870873826744988e-05, + "loss": 2.1155, + "step": 3900 + }, + { + "epoch": 12.67, + "learning_rate": 2.2742880175011028e-05, + "loss": 2.1258, + "step": 3903 + }, + { + "epoch": 12.68, + "learning_rate": 2.261519970999768e-05, + "loss": 2.1664, + "step": 3906 + }, + { + "epoch": 12.69, + "learning_rate": 2.248783294929897e-05, + "loss": 2.0733, + "step": 3909 + }, + { + "epoch": 12.7, + "learning_rate": 2.2360780409237294e-05, + "loss": 2.135, + "step": 3912 + }, + { + "epoch": 12.71, + "learning_rate": 2.2234042604861182e-05, + "loss": 2.1826, + "step": 3915 + }, + { + "epoch": 12.72, + "learning_rate": 2.2107620049943346e-05, + "loss": 2.0611, + "step": 3918 + }, + { + "epoch": 12.73, + "learning_rate": 2.1981513256978458e-05, + "loss": 2.0883, + "step": 3921 + }, + { + "epoch": 12.74, + "learning_rate": 2.185572273718124e-05, + "loss": 2.0715, + "step": 3924 + }, + { + "epoch": 12.75, + "learning_rate": 2.1730249000484203e-05, + "loss": 2.0608, + "step": 3927 + }, + { + "epoch": 12.76, + "learning_rate": 2.1605092555535712e-05, + "loss": 2.0845, + "step": 3930 + }, + { + "epoch": 12.77, + "learning_rate": 2.14802539096979e-05, + "loss": 2.0893, + "step": 3933 + }, + { + "epoch": 12.78, + "learning_rate": 2.1355733569044635e-05, + "loss": 2.0895, + "step": 3936 + }, + { + "epoch": 12.79, + "learning_rate": 2.1231532038359326e-05, + "loss": 2.1653, + "step": 3939 + }, + { + "epoch": 12.8, + "learning_rate": 2.11076498211331e-05, + "loss": 2.0523, + "step": 3942 + }, + { + "epoch": 12.81, + "learning_rate": 2.098408741956256e-05, + "loss": 2.056, + "step": 3945 + }, + { + "epoch": 12.82, + "learning_rate": 2.086084533454784e-05, + "loss": 2.1268, + "step": 3948 + }, + { + "epoch": 12.83, + "learning_rate": 2.0737924065690606e-05, + "loss": 2.0818, + "step": 3951 + }, + { + "epoch": 12.84, + "learning_rate": 2.0615324111292013e-05, + "loss": 2.038, + "step": 3954 + }, + { + "epoch": 12.85, + "learning_rate": 2.0493045968350567e-05, + "loss": 2.124, + "step": 3957 + }, + { + "epoch": 12.86, + "learning_rate": 2.0371090132560322e-05, + "loss": 2.0462, + "step": 3960 + }, + { + "epoch": 12.87, + "learning_rate": 2.0249457098308665e-05, + "loss": 2.1401, + "step": 3963 + }, + { + "epoch": 12.88, + "learning_rate": 2.012814735867442e-05, + "loss": 2.1156, + "step": 3966 + }, + { + "epoch": 12.89, + "learning_rate": 2.0007161405425866e-05, + "loss": 2.1056, + "step": 3969 + }, + { + "epoch": 12.9, + "learning_rate": 1.9886499729018737e-05, + "loss": 2.093, + "step": 3972 + }, + { + "epoch": 12.91, + "learning_rate": 1.9766162818594114e-05, + "loss": 2.1043, + "step": 3975 + }, + { + "epoch": 12.92, + "learning_rate": 1.9646151161976556e-05, + "loss": 2.0714, + "step": 3978 + }, + { + "epoch": 12.93, + "learning_rate": 1.9526465245672187e-05, + "loss": 2.1165, + "step": 3981 + }, + { + "epoch": 12.94, + "learning_rate": 1.9407105554866557e-05, + "loss": 2.0959, + "step": 3984 + }, + { + "epoch": 12.94, + "learning_rate": 1.92880725734227e-05, + "loss": 2.1551, + "step": 3987 + }, + { + "epoch": 12.95, + "learning_rate": 1.9169366783879428e-05, + "loss": 2.0905, + "step": 3990 + }, + { + "epoch": 12.96, + "learning_rate": 1.9050988667448977e-05, + "loss": 2.1042, + "step": 3993 + }, + { + "epoch": 12.97, + "learning_rate": 1.8932938704015314e-05, + "loss": 2.1607, + "step": 3996 + }, + { + "epoch": 12.98, + "learning_rate": 1.8815217372132198e-05, + "loss": 2.0262, + "step": 3999 + }, + { + "epoch": 12.99, + "learning_rate": 1.8697825149021086e-05, + "loss": 2.1215, + "step": 4002 + }, + { + "epoch": 13.0, + "eval_accuracy": 0.6475752433152033, + "eval_loss": 2.025885820388794, + "eval_runtime": 16.3422, + "eval_samples_per_second": 134.437, + "eval_steps_per_second": 67.249, + "step": 4004 + }, + { + "epoch": 13.0, + "learning_rate": 1.8580762510569295e-05, + "loss": 2.1667, + "step": 4005 + }, + { + "epoch": 13.01, + "learning_rate": 1.846402993132811e-05, + "loss": 2.1035, + "step": 4008 + }, + { + "epoch": 13.02, + "learning_rate": 1.8347627884510832e-05, + "loss": 2.1239, + "step": 4011 + }, + { + "epoch": 13.03, + "learning_rate": 1.823155684199074e-05, + "loss": 2.1166, + "step": 4014 + }, + { + "epoch": 13.04, + "learning_rate": 1.8115817274299396e-05, + "loss": 2.1101, + "step": 4017 + }, + { + "epoch": 13.05, + "learning_rate": 1.800040965062455e-05, + "loss": 2.0607, + "step": 4020 + }, + { + "epoch": 13.06, + "learning_rate": 1.7885334438808287e-05, + "loss": 2.0964, + "step": 4023 + }, + { + "epoch": 13.07, + "learning_rate": 1.777059210534524e-05, + "loss": 2.1132, + "step": 4026 + }, + { + "epoch": 13.08, + "learning_rate": 1.7656183115380577e-05, + "loss": 2.0793, + "step": 4029 + }, + { + "epoch": 13.09, + "learning_rate": 1.754210793270812e-05, + "loss": 2.1091, + "step": 4032 + }, + { + "epoch": 13.1, + "learning_rate": 1.742836701976849e-05, + "loss": 2.0793, + "step": 4035 + }, + { + "epoch": 13.11, + "learning_rate": 1.7314960837647297e-05, + "loss": 2.0164, + "step": 4038 + }, + { + "epoch": 13.12, + "learning_rate": 1.7201889846073183e-05, + "loss": 2.0697, + "step": 4041 + }, + { + "epoch": 13.13, + "learning_rate": 1.7089154503415895e-05, + "loss": 2.0631, + "step": 4044 + }, + { + "epoch": 13.14, + "learning_rate": 1.697675526668473e-05, + "loss": 2.0714, + "step": 4047 + }, + { + "epoch": 13.15, + "learning_rate": 1.6864692591526278e-05, + "loss": 2.1084, + "step": 4050 + }, + { + "epoch": 13.16, + "learning_rate": 1.6752966932222826e-05, + "loss": 2.1333, + "step": 4053 + }, + { + "epoch": 13.17, + "learning_rate": 1.664157874169049e-05, + "loss": 2.1206, + "step": 4056 + }, + { + "epoch": 13.18, + "learning_rate": 1.6530528471477326e-05, + "loss": 2.1727, + "step": 4059 + }, + { + "epoch": 13.19, + "learning_rate": 1.6419816571761482e-05, + "loss": 2.1396, + "step": 4062 + }, + { + "epoch": 13.2, + "learning_rate": 1.6309443491349475e-05, + "loss": 2.0632, + "step": 4065 + }, + { + "epoch": 13.21, + "learning_rate": 1.6199409677674314e-05, + "loss": 2.0968, + "step": 4068 + }, + { + "epoch": 13.22, + "learning_rate": 1.6089715576793584e-05, + "loss": 2.0538, + "step": 4071 + }, + { + "epoch": 13.23, + "learning_rate": 1.5980361633387853e-05, + "loss": 2.1114, + "step": 4074 + }, + { + "epoch": 13.24, + "learning_rate": 1.587134829075867e-05, + "loss": 2.1656, + "step": 4077 + }, + { + "epoch": 13.25, + "learning_rate": 1.576267599082686e-05, + "loss": 2.0781, + "step": 4080 + }, + { + "epoch": 13.26, + "learning_rate": 1.5654345174130756e-05, + "loss": 2.1749, + "step": 4083 + }, + { + "epoch": 13.27, + "learning_rate": 1.5546356279824382e-05, + "loss": 2.0654, + "step": 4086 + }, + { + "epoch": 13.28, + "learning_rate": 1.5438709745675606e-05, + "loss": 2.0904, + "step": 4089 + }, + { + "epoch": 13.29, + "learning_rate": 1.5331406008064475e-05, + "loss": 2.0368, + "step": 4092 + }, + { + "epoch": 13.3, + "learning_rate": 1.522444550198141e-05, + "loss": 2.0759, + "step": 4095 + }, + { + "epoch": 13.31, + "learning_rate": 1.511782866102539e-05, + "loss": 2.0462, + "step": 4098 + }, + { + "epoch": 13.31, + "learning_rate": 1.5011555917402265e-05, + "loss": 2.0873, + "step": 4101 + }, + { + "epoch": 13.32, + "learning_rate": 1.4905627701923009e-05, + "loss": 2.0913, + "step": 4104 + }, + { + "epoch": 13.33, + "learning_rate": 1.480004444400187e-05, + "loss": 2.0516, + "step": 4107 + }, + { + "epoch": 13.34, + "learning_rate": 1.4694806571654696e-05, + "loss": 2.1136, + "step": 4110 + }, + { + "epoch": 13.35, + "learning_rate": 1.4589914511497305e-05, + "loss": 2.1294, + "step": 4113 + }, + { + "epoch": 13.36, + "learning_rate": 1.4485368688743527e-05, + "loss": 2.068, + "step": 4116 + }, + { + "epoch": 13.37, + "learning_rate": 1.4381169527203719e-05, + "loss": 2.0402, + "step": 4119 + }, + { + "epoch": 13.38, + "learning_rate": 1.4277317449282834e-05, + "loss": 2.048, + "step": 4122 + }, + { + "epoch": 13.39, + "learning_rate": 1.4173812875978886e-05, + "loss": 2.0875, + "step": 4125 + }, + { + "epoch": 13.4, + "learning_rate": 1.407065622688113e-05, + "loss": 2.1008, + "step": 4128 + }, + { + "epoch": 13.41, + "learning_rate": 1.3967847920168386e-05, + "loss": 2.1113, + "step": 4131 + }, + { + "epoch": 13.42, + "learning_rate": 1.386538837260738e-05, + "loss": 2.0277, + "step": 4134 + }, + { + "epoch": 13.43, + "learning_rate": 1.376327799955105e-05, + "loss": 2.1696, + "step": 4137 + }, + { + "epoch": 13.44, + "learning_rate": 1.3661517214936782e-05, + "loss": 2.1531, + "step": 4140 + }, + { + "epoch": 13.45, + "learning_rate": 1.356010643128487e-05, + "loss": 2.1222, + "step": 4143 + }, + { + "epoch": 13.46, + "learning_rate": 1.345904605969669e-05, + "loss": 2.0299, + "step": 4146 + }, + { + "epoch": 13.47, + "learning_rate": 1.3358336509853131e-05, + "loss": 2.1065, + "step": 4149 + }, + { + "epoch": 13.48, + "learning_rate": 1.3257978190012931e-05, + "loss": 2.0989, + "step": 4152 + }, + { + "epoch": 13.49, + "learning_rate": 1.3157971507011036e-05, + "loss": 2.0679, + "step": 4155 + }, + { + "epoch": 13.5, + "learning_rate": 1.3058316866256826e-05, + "loss": 2.1828, + "step": 4158 + }, + { + "epoch": 13.51, + "learning_rate": 1.295901467173265e-05, + "loss": 2.057, + "step": 4161 + }, + { + "epoch": 13.52, + "learning_rate": 1.2860065325992066e-05, + "loss": 2.0964, + "step": 4164 + }, + { + "epoch": 13.53, + "learning_rate": 1.2761469230158208e-05, + "loss": 2.1366, + "step": 4167 + }, + { + "epoch": 13.54, + "learning_rate": 1.2663226783922266e-05, + "loss": 2.0889, + "step": 4170 + }, + { + "epoch": 13.55, + "learning_rate": 1.2565338385541792e-05, + "loss": 2.0918, + "step": 4173 + }, + { + "epoch": 13.56, + "learning_rate": 1.2467804431839037e-05, + "loss": 2.1852, + "step": 4176 + }, + { + "epoch": 13.57, + "learning_rate": 1.2370625318199414e-05, + "loss": 2.0561, + "step": 4179 + }, + { + "epoch": 13.58, + "learning_rate": 1.2273801438569932e-05, + "loss": 2.0864, + "step": 4182 + }, + { + "epoch": 13.59, + "learning_rate": 1.2177333185457474e-05, + "loss": 2.125, + "step": 4185 + }, + { + "epoch": 13.6, + "learning_rate": 1.2081220949927252e-05, + "loss": 2.0829, + "step": 4188 + }, + { + "epoch": 13.61, + "learning_rate": 1.1985465121601392e-05, + "loss": 2.0291, + "step": 4191 + }, + { + "epoch": 13.62, + "learning_rate": 1.189006608865707e-05, + "loss": 2.1237, + "step": 4194 + }, + { + "epoch": 13.63, + "learning_rate": 1.1795024237825092e-05, + "loss": 2.1423, + "step": 4197 + }, + { + "epoch": 13.64, + "learning_rate": 1.1700339954388384e-05, + "loss": 2.1, + "step": 4200 + }, + { + "epoch": 13.65, + "learning_rate": 1.1606013622180278e-05, + "loss": 2.0367, + "step": 4203 + }, + { + "epoch": 13.66, + "learning_rate": 1.1512045623583068e-05, + "loss": 2.0967, + "step": 4206 + }, + { + "epoch": 13.67, + "learning_rate": 1.1418436339526429e-05, + "loss": 2.0585, + "step": 4209 + }, + { + "epoch": 13.68, + "learning_rate": 1.1325186149485889e-05, + "loss": 2.1754, + "step": 4212 + }, + { + "epoch": 13.69, + "learning_rate": 1.1232295431481222e-05, + "loss": 2.0563, + "step": 4215 + }, + { + "epoch": 13.69, + "learning_rate": 1.1139764562075017e-05, + "loss": 2.1228, + "step": 4218 + }, + { + "epoch": 13.7, + "learning_rate": 1.104759391637108e-05, + "loss": 2.013, + "step": 4221 + }, + { + "epoch": 13.71, + "learning_rate": 1.0955783868012892e-05, + "loss": 2.1053, + "step": 4224 + }, + { + "epoch": 13.72, + "learning_rate": 1.0864334789182218e-05, + "loss": 2.0723, + "step": 4227 + }, + { + "epoch": 13.73, + "learning_rate": 1.0773247050597468e-05, + "loss": 2.142, + "step": 4230 + }, + { + "epoch": 13.74, + "learning_rate": 1.0682521021512249e-05, + "loss": 2.0928, + "step": 4233 + }, + { + "epoch": 13.75, + "learning_rate": 1.0592157069713826e-05, + "loss": 2.0371, + "step": 4236 + }, + { + "epoch": 13.76, + "learning_rate": 1.0502155561521766e-05, + "loss": 2.1179, + "step": 4239 + }, + { + "epoch": 13.77, + "learning_rate": 1.0412516861786236e-05, + "loss": 2.0816, + "step": 4242 + }, + { + "epoch": 13.78, + "learning_rate": 1.032324133388668e-05, + "loss": 2.0207, + "step": 4245 + }, + { + "epoch": 13.79, + "learning_rate": 1.0234329339730398e-05, + "loss": 2.0805, + "step": 4248 + }, + { + "epoch": 13.8, + "learning_rate": 1.0145781239750863e-05, + "loss": 2.1022, + "step": 4251 + }, + { + "epoch": 13.81, + "learning_rate": 1.0057597392906414e-05, + "loss": 2.1438, + "step": 4254 + }, + { + "epoch": 13.82, + "learning_rate": 9.969778156678854e-06, + "loss": 2.097, + "step": 4257 + }, + { + "epoch": 13.83, + "learning_rate": 9.88232388707182e-06, + "loss": 2.0942, + "step": 4260 + }, + { + "epoch": 13.84, + "learning_rate": 9.795234938609466e-06, + "loss": 2.0325, + "step": 4263 + }, + { + "epoch": 13.85, + "learning_rate": 9.708511664335029e-06, + "loss": 2.1505, + "step": 4266 + }, + { + "epoch": 13.86, + "learning_rate": 9.62215441580936e-06, + "loss": 2.05, + "step": 4269 + }, + { + "epoch": 13.87, + "learning_rate": 9.536163543109488e-06, + "loss": 2.0526, + "step": 4272 + }, + { + "epoch": 13.88, + "learning_rate": 9.450539394827185e-06, + "loss": 2.0956, + "step": 4275 + }, + { + "epoch": 13.89, + "learning_rate": 9.365282318067681e-06, + "loss": 2.0701, + "step": 4278 + }, + { + "epoch": 13.9, + "learning_rate": 9.280392658448078e-06, + "loss": 2.1114, + "step": 4281 + }, + { + "epoch": 13.91, + "learning_rate": 9.19587076009607e-06, + "loss": 2.0833, + "step": 4284 + }, + { + "epoch": 13.92, + "learning_rate": 9.11171696564853e-06, + "loss": 2.1273, + "step": 4287 + }, + { + "epoch": 13.93, + "learning_rate": 9.027931616250063e-06, + "loss": 2.0479, + "step": 4290 + }, + { + "epoch": 13.94, + "learning_rate": 8.94451505155165e-06, + "loss": 2.0574, + "step": 4293 + }, + { + "epoch": 13.95, + "learning_rate": 8.861467609709373e-06, + "loss": 2.0643, + "step": 4296 + }, + { + "epoch": 13.96, + "learning_rate": 8.778789627382833e-06, + "loss": 2.1623, + "step": 4299 + }, + { + "epoch": 13.97, + "learning_rate": 8.696481439734017e-06, + "loss": 2.0858, + "step": 4302 + }, + { + "epoch": 13.98, + "learning_rate": 8.614543380425766e-06, + "loss": 2.1034, + "step": 4305 + }, + { + "epoch": 13.99, + "learning_rate": 8.532975781620512e-06, + "loss": 2.1097, + "step": 4308 + }, + { + "epoch": 14.0, + "learning_rate": 8.451778973978874e-06, + "loss": 2.1255, + "step": 4311 + }, + { + "epoch": 14.0, + "eval_accuracy": 0.6460723583804651, + "eval_loss": 2.037827730178833, + "eval_runtime": 16.2998, + "eval_samples_per_second": 134.787, + "eval_steps_per_second": 67.424, + "step": 4312 + }, + { + "epoch": 14.01, + "learning_rate": 8.370953286658389e-06, + "loss": 2.0412, + "step": 4314 + }, + { + "epoch": 14.02, + "learning_rate": 8.290499047312106e-06, + "loss": 2.1136, + "step": 4317 + }, + { + "epoch": 14.03, + "learning_rate": 8.210416582087332e-06, + "loss": 2.1369, + "step": 4320 + }, + { + "epoch": 14.04, + "learning_rate": 8.130706215624195e-06, + "loss": 2.0917, + "step": 4323 + }, + { + "epoch": 14.05, + "learning_rate": 8.051368271054493e-06, + "loss": 2.1272, + "step": 4326 + }, + { + "epoch": 14.06, + "learning_rate": 7.972403070000222e-06, + "loss": 2.1439, + "step": 4329 + }, + { + "epoch": 14.06, + "learning_rate": 7.893810932572333e-06, + "loss": 2.0715, + "step": 4332 + }, + { + "epoch": 14.07, + "learning_rate": 7.815592177369502e-06, + "loss": 2.0255, + "step": 4335 + }, + { + "epoch": 14.08, + "learning_rate": 7.737747121476757e-06, + "loss": 2.0631, + "step": 4338 + }, + { + "epoch": 14.09, + "learning_rate": 7.66027608046419e-06, + "loss": 2.1339, + "step": 4341 + }, + { + "epoch": 14.1, + "learning_rate": 7.58317936838574e-06, + "loss": 2.0685, + "step": 4344 + }, + { + "epoch": 14.11, + "learning_rate": 7.506457297777847e-06, + "loss": 2.0141, + "step": 4347 + }, + { + "epoch": 14.12, + "learning_rate": 7.4301101796582225e-06, + "loss": 2.0722, + "step": 4350 + }, + { + "epoch": 14.13, + "learning_rate": 7.354138323524617e-06, + "loss": 2.0758, + "step": 4353 + }, + { + "epoch": 14.14, + "learning_rate": 7.278542037353542e-06, + "loss": 2.0993, + "step": 4356 + }, + { + "epoch": 14.15, + "learning_rate": 7.203321627598947e-06, + "loss": 2.114, + "step": 4359 + }, + { + "epoch": 14.16, + "learning_rate": 7.128477399191136e-06, + "loss": 2.114, + "step": 4362 + }, + { + "epoch": 14.17, + "learning_rate": 7.054009655535354e-06, + "loss": 2.0214, + "step": 4365 + }, + { + "epoch": 14.18, + "learning_rate": 6.979918698510701e-06, + "loss": 2.0729, + "step": 4368 + }, + { + "epoch": 14.19, + "learning_rate": 6.906204828468821e-06, + "loss": 1.9927, + "step": 4371 + }, + { + "epoch": 14.2, + "learning_rate": 6.832868344232757e-06, + "loss": 2.0514, + "step": 4374 + }, + { + "epoch": 14.21, + "learning_rate": 6.759909543095632e-06, + "loss": 2.1031, + "step": 4377 + }, + { + "epoch": 14.22, + "learning_rate": 6.687328720819552e-06, + "loss": 2.0984, + "step": 4380 + }, + { + "epoch": 14.23, + "learning_rate": 6.615126171634367e-06, + "loss": 2.0636, + "step": 4383 + }, + { + "epoch": 14.24, + "learning_rate": 6.543302188236445e-06, + "loss": 2.0285, + "step": 4386 + }, + { + "epoch": 14.25, + "learning_rate": 6.471857061787501e-06, + "loss": 2.0266, + "step": 4389 + }, + { + "epoch": 14.26, + "learning_rate": 6.400791081913538e-06, + "loss": 2.1057, + "step": 4392 + }, + { + "epoch": 14.27, + "learning_rate": 6.33010453670343e-06, + "loss": 2.0328, + "step": 4395 + }, + { + "epoch": 14.28, + "learning_rate": 6.25979771270796e-06, + "loss": 2.0908, + "step": 4398 + }, + { + "epoch": 14.29, + "learning_rate": 6.189870894938587e-06, + "loss": 2.067, + "step": 4401 + }, + { + "epoch": 14.3, + "learning_rate": 6.120324366866281e-06, + "loss": 2.0099, + "step": 4404 + }, + { + "epoch": 14.31, + "learning_rate": 6.051158410420355e-06, + "loss": 2.1048, + "step": 4407 + }, + { + "epoch": 14.32, + "learning_rate": 5.98237330598741e-06, + "loss": 2.0286, + "step": 4410 + }, + { + "epoch": 14.33, + "learning_rate": 5.91396933241013e-06, + "loss": 2.0796, + "step": 4413 + }, + { + "epoch": 14.34, + "learning_rate": 5.845946766986099e-06, + "loss": 2.0391, + "step": 4416 + }, + { + "epoch": 14.35, + "learning_rate": 5.778305885466828e-06, + "loss": 2.0937, + "step": 4419 + }, + { + "epoch": 14.36, + "learning_rate": 5.711046962056488e-06, + "loss": 2.1198, + "step": 4422 + }, + { + "epoch": 14.37, + "learning_rate": 5.644170269410853e-06, + "loss": 2.1277, + "step": 4425 + }, + { + "epoch": 14.38, + "learning_rate": 5.577676078636251e-06, + "loss": 2.1039, + "step": 4428 + }, + { + "epoch": 14.39, + "learning_rate": 5.511564659288404e-06, + "loss": 2.0803, + "step": 4431 + }, + { + "epoch": 14.4, + "learning_rate": 5.445836279371308e-06, + "loss": 2.0594, + "step": 4434 + }, + { + "epoch": 14.41, + "learning_rate": 5.380491205336202e-06, + "loss": 2.131, + "step": 4437 + }, + { + "epoch": 14.42, + "learning_rate": 5.315529702080491e-06, + "loss": 2.0799, + "step": 4440 + }, + { + "epoch": 14.43, + "learning_rate": 5.250952032946643e-06, + "loss": 2.0678, + "step": 4443 + }, + { + "epoch": 14.44, + "learning_rate": 5.186758459721075e-06, + "loss": 2.115, + "step": 4446 + }, + { + "epoch": 14.44, + "learning_rate": 5.122949242633279e-06, + "loss": 2.0485, + "step": 4449 + }, + { + "epoch": 14.45, + "learning_rate": 5.059524640354496e-06, + "loss": 2.1301, + "step": 4452 + }, + { + "epoch": 14.46, + "learning_rate": 4.996484909996868e-06, + "loss": 2.0767, + "step": 4455 + }, + { + "epoch": 14.47, + "learning_rate": 4.933830307112353e-06, + "loss": 2.1214, + "step": 4458 + }, + { + "epoch": 14.48, + "learning_rate": 4.871561085691634e-06, + "loss": 2.0206, + "step": 4461 + }, + { + "epoch": 14.49, + "learning_rate": 4.8096774981631235e-06, + "loss": 2.1629, + "step": 4464 + }, + { + "epoch": 14.5, + "learning_rate": 4.7481797953919605e-06, + "loss": 2.0928, + "step": 4467 + }, + { + "epoch": 14.51, + "learning_rate": 4.687068226679004e-06, + "loss": 2.1403, + "step": 4470 + }, + { + "epoch": 14.52, + "learning_rate": 4.6263430397597395e-06, + "loss": 2.0604, + "step": 4473 + }, + { + "epoch": 14.53, + "learning_rate": 4.566004480803332e-06, + "loss": 2.1267, + "step": 4476 + }, + { + "epoch": 14.54, + "learning_rate": 4.5060527944116856e-06, + "loss": 2.1578, + "step": 4479 + }, + { + "epoch": 14.55, + "learning_rate": 4.446488223618306e-06, + "loss": 2.1332, + "step": 4482 + }, + { + "epoch": 14.56, + "learning_rate": 4.387311009887463e-06, + "loss": 1.9903, + "step": 4485 + }, + { + "epoch": 14.57, + "learning_rate": 4.328521393113149e-06, + "loss": 2.2028, + "step": 4488 + }, + { + "epoch": 14.58, + "learning_rate": 4.270119611618073e-06, + "loss": 2.0242, + "step": 4491 + }, + { + "epoch": 14.59, + "learning_rate": 4.21210590215273e-06, + "loss": 2.0528, + "step": 4494 + }, + { + "epoch": 14.6, + "learning_rate": 4.1544804998944756e-06, + "loss": 2.1233, + "step": 4497 + }, + { + "epoch": 14.61, + "learning_rate": 4.097243638446502e-06, + "loss": 2.1108, + "step": 4500 + }, + { + "epoch": 14.62, + "learning_rate": 4.040395549836928e-06, + "loss": 2.0646, + "step": 4503 + }, + { + "epoch": 14.63, + "learning_rate": 3.983936464517901e-06, + "loss": 2.1225, + "step": 4506 + }, + { + "epoch": 14.64, + "learning_rate": 3.9278666113645615e-06, + "loss": 2.0553, + "step": 4509 + }, + { + "epoch": 14.65, + "learning_rate": 3.872186217674167e-06, + "loss": 2.1065, + "step": 4512 + }, + { + "epoch": 14.66, + "learning_rate": 3.816895509165252e-06, + "loss": 2.0563, + "step": 4515 + }, + { + "epoch": 14.67, + "learning_rate": 3.7619947099765353e-06, + "loss": 2.1165, + "step": 4518 + }, + { + "epoch": 14.68, + "learning_rate": 3.707484042666198e-06, + "loss": 2.0359, + "step": 4521 + }, + { + "epoch": 14.69, + "learning_rate": 3.6533637282108347e-06, + "loss": 2.0946, + "step": 4524 + }, + { + "epoch": 14.7, + "learning_rate": 3.599633986004669e-06, + "loss": 2.0738, + "step": 4527 + }, + { + "epoch": 14.71, + "learning_rate": 3.5462950338585597e-06, + "loss": 2.0815, + "step": 4530 + }, + { + "epoch": 14.72, + "learning_rate": 3.4933470879992104e-06, + "loss": 2.05, + "step": 4533 + }, + { + "epoch": 14.73, + "learning_rate": 3.440790363068247e-06, + "loss": 2.0606, + "step": 4536 + }, + { + "epoch": 14.74, + "learning_rate": 3.3886250721213544e-06, + "loss": 2.1675, + "step": 4539 + }, + { + "epoch": 14.75, + "learning_rate": 3.3368514266273964e-06, + "loss": 2.0499, + "step": 4542 + }, + { + "epoch": 14.76, + "learning_rate": 3.2854696364675974e-06, + "loss": 2.1578, + "step": 4545 + }, + { + "epoch": 14.77, + "learning_rate": 3.2344799099346733e-06, + "loss": 2.0859, + "step": 4548 + }, + { + "epoch": 14.78, + "learning_rate": 3.1838824537319456e-06, + "loss": 2.1324, + "step": 4551 + }, + { + "epoch": 14.79, + "learning_rate": 3.1336774729725736e-06, + "loss": 2.117, + "step": 4554 + }, + { + "epoch": 14.8, + "learning_rate": 3.0838651711787013e-06, + "loss": 2.0503, + "step": 4557 + }, + { + "epoch": 14.81, + "learning_rate": 3.034445750280579e-06, + "loss": 2.0449, + "step": 4560 + }, + { + "epoch": 14.81, + "learning_rate": 2.985419410615831e-06, + "loss": 2.1285, + "step": 4563 + }, + { + "epoch": 14.82, + "learning_rate": 2.9367863509285775e-06, + "loss": 2.0391, + "step": 4566 + }, + { + "epoch": 14.83, + "learning_rate": 2.8885467683686497e-06, + "loss": 2.0469, + "step": 4569 + }, + { + "epoch": 14.84, + "learning_rate": 2.840700858490786e-06, + "loss": 2.1386, + "step": 4572 + }, + { + "epoch": 14.85, + "learning_rate": 2.7932488152538794e-06, + "loss": 2.1428, + "step": 4575 + }, + { + "epoch": 14.86, + "learning_rate": 2.7461908310201123e-06, + "loss": 2.0901, + "step": 4578 + }, + { + "epoch": 14.87, + "learning_rate": 2.6995270965542554e-06, + "loss": 2.0583, + "step": 4581 + }, + { + "epoch": 14.88, + "learning_rate": 2.653257801022835e-06, + "loss": 2.0573, + "step": 4584 + }, + { + "epoch": 14.89, + "learning_rate": 2.607383131993424e-06, + "loss": 2.035, + "step": 4587 + }, + { + "epoch": 14.9, + "learning_rate": 2.561903275433797e-06, + "loss": 2.1873, + "step": 4590 + }, + { + "epoch": 14.91, + "learning_rate": 2.5168184157113084e-06, + "loss": 2.0505, + "step": 4593 + }, + { + "epoch": 14.92, + "learning_rate": 2.472128735591983e-06, + "loss": 2.0236, + "step": 4596 + }, + { + "epoch": 14.93, + "learning_rate": 2.4278344162398935e-06, + "loss": 2.022, + "step": 4599 + }, + { + "epoch": 14.94, + "learning_rate": 2.3839356372164056e-06, + "loss": 1.9994, + "step": 4602 + }, + { + "epoch": 14.95, + "learning_rate": 2.3404325764794012e-06, + "loss": 1.9757, + "step": 4605 + }, + { + "epoch": 14.96, + "learning_rate": 2.2973254103826e-06, + "loss": 2.0497, + "step": 4608 + }, + { + "epoch": 14.97, + "learning_rate": 2.254614313674863e-06, + "loss": 2.1178, + "step": 4611 + }, + { + "epoch": 14.98, + "learning_rate": 2.2122994594994227e-06, + "loss": 2.1794, + "step": 4614 + }, + { + "epoch": 14.99, + "learning_rate": 2.1703810193932307e-06, + "loss": 2.0673, + "step": 4617 + }, + { + "epoch": 15.0, + "learning_rate": 2.1288591632862343e-06, + "loss": 2.1751, + "step": 4620 + }, + { + "epoch": 15.0, + "eval_accuracy": 0.6458123953098828, + "eval_loss": 2.0256659984588623, + "eval_runtime": 16.3182, + "eval_samples_per_second": 134.635, + "eval_steps_per_second": 67.348, + "step": 4620 + }, + { + "epoch": 15.01, + "learning_rate": 2.087734059500712e-06, + "loss": 2.0922, + "step": 4623 + }, + { + "epoch": 15.02, + "learning_rate": 2.0470058747505516e-06, + "loss": 2.1413, + "step": 4626 + }, + { + "epoch": 15.03, + "learning_rate": 2.006674774140638e-06, + "loss": 2.0214, + "step": 4629 + }, + { + "epoch": 15.04, + "learning_rate": 1.9667409211661437e-06, + "loss": 2.1027, + "step": 4632 + }, + { + "epoch": 15.05, + "learning_rate": 1.9272044777118524e-06, + "loss": 2.0475, + "step": 4635 + }, + { + "epoch": 15.06, + "learning_rate": 1.8880656040514921e-06, + "loss": 2.0842, + "step": 4638 + }, + { + "epoch": 15.07, + "learning_rate": 1.8493244588471793e-06, + "loss": 2.0245, + "step": 4641 + }, + { + "epoch": 15.08, + "learning_rate": 1.8109811991486646e-06, + "loss": 2.0969, + "step": 4644 + }, + { + "epoch": 15.09, + "learning_rate": 1.7730359803927343e-06, + "loss": 2.1304, + "step": 4647 + }, + { + "epoch": 15.1, + "learning_rate": 1.735488956402631e-06, + "loss": 2.001, + "step": 4650 + }, + { + "epoch": 15.11, + "learning_rate": 1.698340279387356e-06, + "loss": 2.1577, + "step": 4653 + }, + { + "epoch": 15.12, + "learning_rate": 1.6615900999410683e-06, + "loss": 2.058, + "step": 4656 + }, + { + "epoch": 15.13, + "learning_rate": 1.6252385670425307e-06, + "loss": 2.0714, + "step": 4659 + }, + { + "epoch": 15.14, + "learning_rate": 1.589285828054421e-06, + "loss": 2.0709, + "step": 4662 + }, + { + "epoch": 15.15, + "learning_rate": 1.5537320287227764e-06, + "loss": 2.0754, + "step": 4665 + }, + { + "epoch": 15.16, + "learning_rate": 1.5185773131764502e-06, + "loss": 2.1037, + "step": 4668 + }, + { + "epoch": 15.17, + "learning_rate": 1.4838218239264456e-06, + "loss": 2.1344, + "step": 4671 + }, + { + "epoch": 15.18, + "learning_rate": 1.4494657018653823e-06, + "loss": 2.0933, + "step": 4674 + }, + { + "epoch": 15.19, + "learning_rate": 1.4155090862668863e-06, + "loss": 2.0771, + "step": 4677 + }, + { + "epoch": 15.19, + "learning_rate": 1.3819521147851123e-06, + "loss": 2.0867, + "step": 4680 + }, + { + "epoch": 15.2, + "learning_rate": 1.3487949234540664e-06, + "loss": 2.1519, + "step": 4683 + }, + { + "epoch": 15.21, + "learning_rate": 1.3160376466871739e-06, + "loss": 2.0861, + "step": 4686 + }, + { + "epoch": 15.22, + "learning_rate": 1.2836804172766449e-06, + "loss": 2.0644, + "step": 4689 + }, + { + "epoch": 15.23, + "learning_rate": 1.2517233663929651e-06, + "loss": 2.1133, + "step": 4692 + }, + { + "epoch": 15.24, + "learning_rate": 1.2201666235843735e-06, + "loss": 2.0873, + "step": 4695 + }, + { + "epoch": 15.25, + "learning_rate": 1.18901031677634e-06, + "loss": 2.1127, + "step": 4698 + }, + { + "epoch": 15.26, + "learning_rate": 1.1582545722710225e-06, + "loss": 2.0917, + "step": 4701 + }, + { + "epoch": 15.27, + "learning_rate": 1.1278995147467885e-06, + "loss": 2.0339, + "step": 4704 + }, + { + "epoch": 15.28, + "learning_rate": 1.0979452672576718e-06, + "loss": 2.0649, + "step": 4707 + }, + { + "epoch": 15.29, + "learning_rate": 1.0683919512329166e-06, + "loss": 2.0376, + "step": 4710 + }, + { + "epoch": 15.3, + "learning_rate": 1.0392396864764231e-06, + "loss": 2.0764, + "step": 4713 + }, + { + "epoch": 15.31, + "learning_rate": 1.0104885911663474e-06, + "loss": 2.1247, + "step": 4716 + }, + { + "epoch": 15.32, + "learning_rate": 9.821387818545358e-07, + "loss": 2.1067, + "step": 4719 + }, + { + "epoch": 15.33, + "learning_rate": 9.54190373466113e-07, + "loss": 2.0267, + "step": 4722 + }, + { + "epoch": 15.34, + "learning_rate": 9.266434792989942e-07, + "loss": 2.0377, + "step": 4725 + }, + { + "epoch": 15.35, + "learning_rate": 8.994982110234307e-07, + "loss": 2.0895, + "step": 4728 + }, + { + "epoch": 15.36, + "learning_rate": 8.727546786815421e-07, + "loss": 2.027, + "step": 4731 + }, + { + "epoch": 15.37, + "learning_rate": 8.464129906868734e-07, + "loss": 2.0593, + "step": 4734 + }, + { + "epoch": 15.38, + "learning_rate": 8.204732538239835e-07, + "loss": 1.9569, + "step": 4737 + }, + { + "epoch": 15.39, + "learning_rate": 7.949355732479902e-07, + "loss": 2.0478, + "step": 4740 + }, + { + "epoch": 15.4, + "learning_rate": 7.698000524841376e-07, + "loss": 2.0132, + "step": 4743 + }, + { + "epoch": 15.41, + "learning_rate": 7.450667934273958e-07, + "loss": 2.1406, + "step": 4746 + }, + { + "epoch": 15.42, + "learning_rate": 7.207358963420063e-07, + "loss": 2.0989, + "step": 4749 + }, + { + "epoch": 15.43, + "learning_rate": 6.968074598611484e-07, + "loss": 2.1065, + "step": 4752 + }, + { + "epoch": 15.44, + "learning_rate": 6.732815809864734e-07, + "loss": 2.0832, + "step": 4755 + }, + { + "epoch": 15.45, + "learning_rate": 6.501583550877488e-07, + "loss": 2.1096, + "step": 4758 + }, + { + "epoch": 15.46, + "learning_rate": 6.274378759024257e-07, + "loss": 2.0831, + "step": 4761 + }, + { + "epoch": 15.47, + "learning_rate": 6.051202355353392e-07, + "loss": 2.0592, + "step": 4764 + }, + { + "epoch": 15.48, + "learning_rate": 5.832055244582524e-07, + "loss": 2.1122, + "step": 4767 + }, + { + "epoch": 15.49, + "learning_rate": 5.616938315095243e-07, + "loss": 2.1232, + "step": 4770 + }, + { + "epoch": 15.5, + "learning_rate": 5.405852438937764e-07, + "loss": 2.0721, + "step": 4773 + }, + { + "epoch": 15.51, + "learning_rate": 5.198798471814814e-07, + "loss": 2.1295, + "step": 4776 + }, + { + "epoch": 15.52, + "learning_rate": 4.995777253086753e-07, + "loss": 2.0643, + "step": 4779 + }, + { + "epoch": 15.53, + "learning_rate": 4.796789605765573e-07, + "loss": 2.0741, + "step": 4782 + }, + { + "epoch": 15.54, + "learning_rate": 4.601836336512233e-07, + "loss": 2.0587, + "step": 4785 + }, + { + "epoch": 15.55, + "learning_rate": 4.4109182356327774e-07, + "loss": 2.1124, + "step": 4788 + }, + { + "epoch": 15.56, + "learning_rate": 4.2240360770753327e-07, + "loss": 2.1365, + "step": 4791 + }, + { + "epoch": 15.56, + "learning_rate": 4.0411906184273376e-07, + "loss": 2.1643, + "step": 4794 + }, + { + "epoch": 15.57, + "learning_rate": 3.8623826009120955e-07, + "loss": 2.066, + "step": 4797 + }, + { + "epoch": 15.58, + "learning_rate": 3.6876127493854495e-07, + "loss": 2.0562, + "step": 4800 + }, + { + "epoch": 15.59, + "learning_rate": 3.516881772333669e-07, + "loss": 2.1947, + "step": 4803 + }, + { + "epoch": 15.6, + "learning_rate": 3.35019036187012e-07, + "loss": 2.0512, + "step": 4806 + }, + { + "epoch": 15.61, + "learning_rate": 3.187539193732048e-07, + "loss": 2.0424, + "step": 4809 + }, + { + "epoch": 15.62, + "learning_rate": 3.028928927278685e-07, + "loss": 2.101, + "step": 4812 + }, + { + "epoch": 15.63, + "learning_rate": 2.874360205488258e-07, + "loss": 2.1028, + "step": 4815 + }, + { + "epoch": 15.64, + "learning_rate": 2.723833654954655e-07, + "loss": 2.0677, + "step": 4818 + }, + { + "epoch": 15.65, + "learning_rate": 2.577349885886315e-07, + "loss": 2.0715, + "step": 4821 + }, + { + "epoch": 15.66, + "learning_rate": 2.434909492102455e-07, + "loss": 2.106, + "step": 4824 + }, + { + "epoch": 15.67, + "learning_rate": 2.2965130510310685e-07, + "loss": 2.0889, + "step": 4827 + }, + { + "epoch": 15.68, + "learning_rate": 2.1621611237071516e-07, + "loss": 2.0497, + "step": 4830 + }, + { + "epoch": 15.69, + "learning_rate": 2.031854254769594e-07, + "loss": 2.0797, + "step": 4833 + }, + { + "epoch": 15.7, + "learning_rate": 1.9055929724595134e-07, + "loss": 2.0286, + "step": 4836 + }, + { + "epoch": 15.71, + "learning_rate": 1.7833777886175907e-07, + "loss": 1.9975, + "step": 4839 + }, + { + "epoch": 15.72, + "learning_rate": 1.66520919868296e-07, + "loss": 2.0781, + "step": 4842 + }, + { + "epoch": 15.73, + "learning_rate": 1.5510876816898778e-07, + "loss": 2.1324, + "step": 4845 + }, + { + "epoch": 15.74, + "learning_rate": 1.4410137002670575e-07, + "loss": 2.0367, + "step": 4848 + }, + { + "epoch": 15.75, + "learning_rate": 1.334987700634893e-07, + "loss": 2.1207, + "step": 4851 + }, + { + "epoch": 15.76, + "learning_rate": 1.233010112604016e-07, + "loss": 2.1331, + "step": 4854 + }, + { + "epoch": 15.77, + "learning_rate": 1.1350813495737411e-07, + "loss": 2.1653, + "step": 4857 + }, + { + "epoch": 15.78, + "learning_rate": 1.0412018085297348e-07, + "loss": 2.0604, + "step": 4860 + }, + { + "epoch": 15.79, + "learning_rate": 9.513718700432384e-08, + "loss": 2.0798, + "step": 4863 + }, + { + "epoch": 15.8, + "learning_rate": 8.655918982689581e-08, + "loss": 2.04, + "step": 4866 + }, + { + "epoch": 15.81, + "learning_rate": 7.838622409436225e-08, + "loss": 2.017, + "step": 4869 + }, + { + "epoch": 15.82, + "learning_rate": 7.061832293849823e-08, + "loss": 2.0456, + "step": 4872 + }, + { + "epoch": 15.83, + "learning_rate": 6.325551784900352e-08, + "loss": 2.1483, + "step": 4875 + }, + { + "epoch": 15.84, + "learning_rate": 5.629783867336924e-08, + "loss": 2.0091, + "step": 4878 + }, + { + "epoch": 15.85, + "learning_rate": 4.9745313616822445e-08, + "loss": 2.1577, + "step": 4881 + }, + { + "epoch": 15.86, + "learning_rate": 4.3597969242126225e-08, + "loss": 2.0558, + "step": 4884 + }, + { + "epoch": 15.87, + "learning_rate": 3.7855830469535334e-08, + "loss": 2.0223, + "step": 4887 + }, + { + "epoch": 15.88, + "learning_rate": 3.2518920576662945e-08, + "loss": 2.0815, + "step": 4890 + }, + { + "epoch": 15.89, + "learning_rate": 2.7587261198414038e-08, + "loss": 2.0999, + "step": 4893 + }, + { + "epoch": 15.9, + "learning_rate": 2.3060872326841066e-08, + "loss": 2.1187, + "step": 4896 + }, + { + "epoch": 15.91, + "learning_rate": 1.8939772311143967e-08, + "loss": 2.0901, + "step": 4899 + }, + { + "epoch": 15.92, + "learning_rate": 1.522397785752583e-08, + "loss": 2.1402, + "step": 4902 + }, + { + "epoch": 15.93, + "learning_rate": 1.1913504029159583e-08, + "loss": 2.1058, + "step": 4905 + }, + { + "epoch": 15.94, + "learning_rate": 9.008364246121393e-09, + "loss": 2.0656, + "step": 4908 + }, + { + "epoch": 15.94, + "learning_rate": 6.508570285346238e-09, + "loss": 2.1138, + "step": 4911 + }, + { + "epoch": 15.95, + "learning_rate": 4.414132280550209e-09, + "loss": 2.0542, + "step": 4914 + }, + { + "epoch": 15.96, + "learning_rate": 2.7250587222082957e-09, + "loss": 1.972, + "step": 4917 + }, + { + "epoch": 15.97, + "learning_rate": 1.4413564575432858e-09, + "loss": 2.069, + "step": 4920 + }, + { + "epoch": 15.98, + "learning_rate": 5.630306904369498e-10, + "loss": 2.1005, + "step": 4923 + }, + { + "epoch": 15.99, + "learning_rate": 9.008498147444755e-11, + "loss": 1.9516, + "step": 4926 + }, + { + "epoch": 16.0, + "eval_accuracy": 0.6450172750488208, + "eval_loss": 2.037081241607666, + "eval_runtime": 16.3187, + "eval_samples_per_second": 134.631, + "eval_steps_per_second": 67.346, + "step": 4928 + }, + { + "epoch": 16.0, + "step": 4928, + "total_flos": 1.0508396840353792e+16, + "train_loss": 2.4320973860366, + "train_runtime": 3195.9564, + "train_samples_per_second": 98.68, + "train_steps_per_second": 1.542 + } + ], + "max_steps": 4928, + "num_train_epochs": 16, + "total_flos": 1.0508396840353792e+16, + "trial_name": null, + "trial_params": null +}