{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.0011229567451098742, "eval_steps": 50, "global_step": 200, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 5.614783725549371e-06, "grad_norm": 0.6202379465103149, "learning_rate": 2e-05, "loss": 1.0531, "step": 1 }, { "epoch": 5.614783725549371e-06, "eval_loss": 1.0175226926803589, "eval_runtime": 21573.5423, "eval_samples_per_second": 1.738, "eval_steps_per_second": 1.738, "step": 1 }, { "epoch": 1.1229567451098743e-05, "grad_norm": 0.5498092770576477, "learning_rate": 4e-05, "loss": 1.2547, "step": 2 }, { "epoch": 1.6844351176648115e-05, "grad_norm": 0.2675076127052307, "learning_rate": 6e-05, "loss": 0.7059, "step": 3 }, { "epoch": 2.2459134902197485e-05, "grad_norm": 0.6060311198234558, "learning_rate": 8e-05, "loss": 1.031, "step": 4 }, { "epoch": 2.8073918627746856e-05, "grad_norm": 0.43954452872276306, "learning_rate": 0.0001, "loss": 0.9217, "step": 5 }, { "epoch": 3.368870235329623e-05, "grad_norm": 0.5848556160926819, "learning_rate": 0.00012, "loss": 1.3786, "step": 6 }, { "epoch": 3.9303486078845604e-05, "grad_norm": 0.6156655550003052, "learning_rate": 0.00014, "loss": 0.8794, "step": 7 }, { "epoch": 4.491826980439497e-05, "grad_norm": 0.48315155506134033, "learning_rate": 0.00016, "loss": 1.0659, "step": 8 }, { "epoch": 5.0533053529944345e-05, "grad_norm": 0.4259559214115143, "learning_rate": 0.00018, "loss": 0.702, "step": 9 }, { "epoch": 5.614783725549371e-05, "grad_norm": 0.8003010153770447, "learning_rate": 0.0002, "loss": 0.8346, "step": 10 }, { "epoch": 6.176262098104309e-05, "grad_norm": 0.4740721583366394, "learning_rate": 0.0001999863304992469, "loss": 0.7311, "step": 11 }, { "epoch": 6.737740470659246e-05, "grad_norm": 1.4049946069717407, "learning_rate": 0.00019994532573409262, "loss": 1.0161, "step": 12 }, { "epoch": 7.299218843214183e-05, "grad_norm": 1.7016016244888306, "learning_rate": 0.00019987699691483048, "loss": 0.565, "step": 13 }, { "epoch": 7.860697215769121e-05, "grad_norm": 0.5531960129737854, "learning_rate": 0.00019978136272187747, "loss": 0.5972, "step": 14 }, { "epoch": 8.422175588324057e-05, "grad_norm": 1.0788118839263916, "learning_rate": 0.000199658449300667, "loss": 1.0518, "step": 15 }, { "epoch": 8.983653960878994e-05, "grad_norm": 1.4083199501037598, "learning_rate": 0.00019950829025450114, "loss": 0.7576, "step": 16 }, { "epoch": 9.545132333433932e-05, "grad_norm": 0.8020849823951721, "learning_rate": 0.00019933092663536382, "loss": 0.7487, "step": 17 }, { "epoch": 0.00010106610705988869, "grad_norm": 0.5158109068870544, "learning_rate": 0.00019912640693269752, "loss": 0.5468, "step": 18 }, { "epoch": 0.00010668089078543806, "grad_norm": 0.7264314889907837, "learning_rate": 0.00019889478706014687, "loss": 0.679, "step": 19 }, { "epoch": 0.00011229567451098742, "grad_norm": 0.676001787185669, "learning_rate": 0.00019863613034027224, "loss": 0.8173, "step": 20 }, { "epoch": 0.0001179104582365368, "grad_norm": 0.5240503549575806, "learning_rate": 0.00019835050748723824, "loss": 0.6014, "step": 21 }, { "epoch": 0.00012352524196208617, "grad_norm": 0.7221630811691284, "learning_rate": 0.00019803799658748094, "loss": 0.7016, "step": 22 }, { "epoch": 0.00012914002568763553, "grad_norm": 0.8355087041854858, "learning_rate": 0.00019769868307835994, "loss": 0.7144, "step": 23 }, { "epoch": 0.00013475480941318492, "grad_norm": 0.9996209144592285, "learning_rate": 0.0001973326597248006, "loss": 0.8338, "step": 24 }, { "epoch": 0.00014036959313873428, "grad_norm": 0.6815698742866516, "learning_rate": 0.00019694002659393305, "loss": 0.8459, "step": 25 }, { "epoch": 0.00014598437686428367, "grad_norm": 0.9234247207641602, "learning_rate": 0.00019652089102773488, "loss": 0.6576, "step": 26 }, { "epoch": 0.00015159916058983303, "grad_norm": 1.3168270587921143, "learning_rate": 0.00019607536761368484, "loss": 0.9701, "step": 27 }, { "epoch": 0.00015721394431538241, "grad_norm": 0.7586074471473694, "learning_rate": 0.00019560357815343577, "loss": 0.6198, "step": 28 }, { "epoch": 0.00016282872804093177, "grad_norm": 0.6276509761810303, "learning_rate": 0.00019510565162951537, "loss": 0.912, "step": 29 }, { "epoch": 0.00016844351176648113, "grad_norm": 0.9944347739219666, "learning_rate": 0.00019458172417006347, "loss": 0.9922, "step": 30 }, { "epoch": 0.00017405829549203052, "grad_norm": 0.772435188293457, "learning_rate": 0.00019403193901161613, "loss": 0.7621, "step": 31 }, { "epoch": 0.00017967307921757988, "grad_norm": 0.5932286977767944, "learning_rate": 0.0001934564464599461, "loss": 0.6246, "step": 32 }, { "epoch": 0.00018528786294312927, "grad_norm": 0.44153475761413574, "learning_rate": 0.00019285540384897073, "loss": 0.659, "step": 33 }, { "epoch": 0.00019090264666867863, "grad_norm": 0.5189090967178345, "learning_rate": 0.00019222897549773848, "loss": 0.6122, "step": 34 }, { "epoch": 0.000196517430394228, "grad_norm": 0.6731717586517334, "learning_rate": 0.00019157733266550575, "loss": 0.6224, "step": 35 }, { "epoch": 0.00020213221411977738, "grad_norm": 0.5189567804336548, "learning_rate": 0.00019090065350491626, "loss": 0.7545, "step": 36 }, { "epoch": 0.00020774699784532674, "grad_norm": 0.8409593105316162, "learning_rate": 0.00019019912301329592, "loss": 0.7632, "step": 37 }, { "epoch": 0.00021336178157087613, "grad_norm": 0.5888797044754028, "learning_rate": 0.00018947293298207635, "loss": 0.9602, "step": 38 }, { "epoch": 0.00021897656529642549, "grad_norm": 0.4639832377433777, "learning_rate": 0.0001887222819443612, "loss": 0.5281, "step": 39 }, { "epoch": 0.00022459134902197485, "grad_norm": 0.7274787425994873, "learning_rate": 0.0001879473751206489, "loss": 0.5447, "step": 40 }, { "epoch": 0.00023020613274752423, "grad_norm": 0.5326807498931885, "learning_rate": 0.00018714842436272773, "loss": 0.7716, "step": 41 }, { "epoch": 0.0002358209164730736, "grad_norm": 0.5459308624267578, "learning_rate": 0.00018632564809575742, "loss": 1.0218, "step": 42 }, { "epoch": 0.00024143570019862298, "grad_norm": 0.4652407467365265, "learning_rate": 0.0001854792712585539, "loss": 0.6814, "step": 43 }, { "epoch": 0.00024705048392417234, "grad_norm": 0.4710409641265869, "learning_rate": 0.00018460952524209355, "loss": 0.624, "step": 44 }, { "epoch": 0.00025266526764972173, "grad_norm": 0.5593191385269165, "learning_rate": 0.00018371664782625287, "loss": 0.6167, "step": 45 }, { "epoch": 0.00025828005137527106, "grad_norm": 0.876891553401947, "learning_rate": 0.00018280088311480201, "loss": 0.7577, "step": 46 }, { "epoch": 0.00026389483510082045, "grad_norm": 0.6078484654426575, "learning_rate": 0.00018186248146866927, "loss": 0.4229, "step": 47 }, { "epoch": 0.00026950961882636984, "grad_norm": 0.6034438014030457, "learning_rate": 0.00018090169943749476, "loss": 0.5671, "step": 48 }, { "epoch": 0.0002751244025519192, "grad_norm": 0.48336154222488403, "learning_rate": 0.0001799187996894925, "loss": 0.8663, "step": 49 }, { "epoch": 0.00028073918627746856, "grad_norm": 0.917046844959259, "learning_rate": 0.00017891405093963938, "loss": 0.8188, "step": 50 }, { "epoch": 0.00028073918627746856, "eval_loss": 0.7379979491233826, "eval_runtime": 21585.5399, "eval_samples_per_second": 1.737, "eval_steps_per_second": 1.737, "step": 50 }, { "epoch": 0.00028635397000301795, "grad_norm": 0.6631708145141602, "learning_rate": 0.00017788772787621126, "loss": 0.7043, "step": 51 }, { "epoch": 0.00029196875372856733, "grad_norm": 0.6282797455787659, "learning_rate": 0.00017684011108568592, "loss": 0.8696, "step": 52 }, { "epoch": 0.00029758353745411667, "grad_norm": 0.6374572515487671, "learning_rate": 0.0001757714869760335, "loss": 0.6252, "step": 53 }, { "epoch": 0.00030319832117966605, "grad_norm": 0.6826508045196533, "learning_rate": 0.0001746821476984154, "loss": 0.7257, "step": 54 }, { "epoch": 0.00030881310490521544, "grad_norm": 0.6594266891479492, "learning_rate": 0.00017357239106731317, "loss": 0.9205, "step": 55 }, { "epoch": 0.00031442788863076483, "grad_norm": 0.6436939835548401, "learning_rate": 0.00017244252047910892, "loss": 0.6385, "step": 56 }, { "epoch": 0.00032004267235631416, "grad_norm": 0.5752350687980652, "learning_rate": 0.00017129284482913972, "loss": 0.7424, "step": 57 }, { "epoch": 0.00032565745608186355, "grad_norm": 0.7134163975715637, "learning_rate": 0.00017012367842724887, "loss": 0.7284, "step": 58 }, { "epoch": 0.00033127223980741294, "grad_norm": 2.738507032394409, "learning_rate": 0.0001689353409118566, "loss": 0.6917, "step": 59 }, { "epoch": 0.00033688702353296227, "grad_norm": 0.6593369245529175, "learning_rate": 0.00016772815716257412, "loss": 1.0237, "step": 60 }, { "epoch": 0.00034250180725851166, "grad_norm": 0.5328947305679321, "learning_rate": 0.0001665024572113848, "loss": 0.5792, "step": 61 }, { "epoch": 0.00034811659098406104, "grad_norm": 1.0252548456192017, "learning_rate": 0.00016525857615241687, "loss": 0.7674, "step": 62 }, { "epoch": 0.0003537313747096104, "grad_norm": 0.7161698937416077, "learning_rate": 0.00016399685405033167, "loss": 0.6849, "step": 63 }, { "epoch": 0.00035934615843515977, "grad_norm": 0.4108976721763611, "learning_rate": 0.0001627176358473537, "loss": 0.3426, "step": 64 }, { "epoch": 0.00036496094216070915, "grad_norm": 0.8395034670829773, "learning_rate": 0.0001614212712689668, "loss": 0.7893, "step": 65 }, { "epoch": 0.00037057572588625854, "grad_norm": 1.7357856035232544, "learning_rate": 0.00016010811472830252, "loss": 0.6872, "step": 66 }, { "epoch": 0.0003761905096118079, "grad_norm": 0.6169449687004089, "learning_rate": 0.00015877852522924732, "loss": 0.7493, "step": 67 }, { "epoch": 0.00038180529333735726, "grad_norm": 0.40986934304237366, "learning_rate": 0.00015743286626829437, "loss": 0.4622, "step": 68 }, { "epoch": 0.00038742007706290665, "grad_norm": 0.9793535470962524, "learning_rate": 0.0001560715057351673, "loss": 0.6613, "step": 69 }, { "epoch": 0.000393034860788456, "grad_norm": 0.8405007123947144, "learning_rate": 0.00015469481581224272, "loss": 0.7482, "step": 70 }, { "epoch": 0.00039864964451400537, "grad_norm": 0.9131999611854553, "learning_rate": 0.0001533031728727994, "loss": 1.1605, "step": 71 }, { "epoch": 0.00040426442823955476, "grad_norm": 0.47172001004219055, "learning_rate": 0.00015189695737812152, "loss": 0.659, "step": 72 }, { "epoch": 0.00040987921196510414, "grad_norm": 0.5213868021965027, "learning_rate": 0.0001504765537734844, "loss": 0.6634, "step": 73 }, { "epoch": 0.0004154939956906535, "grad_norm": 0.8890513181686401, "learning_rate": 0.00014904235038305083, "loss": 1.0718, "step": 74 }, { "epoch": 0.00042110877941620286, "grad_norm": 0.9680766463279724, "learning_rate": 0.00014759473930370736, "loss": 0.7103, "step": 75 }, { "epoch": 0.00042672356314175225, "grad_norm": 0.4047940969467163, "learning_rate": 0.0001461341162978688, "loss": 0.71, "step": 76 }, { "epoch": 0.0004323383468673016, "grad_norm": 0.5290956497192383, "learning_rate": 0.00014466088068528068, "loss": 0.6457, "step": 77 }, { "epoch": 0.00043795313059285097, "grad_norm": 0.6832664012908936, "learning_rate": 0.00014317543523384928, "loss": 0.6552, "step": 78 }, { "epoch": 0.00044356791431840036, "grad_norm": 0.4100581407546997, "learning_rate": 0.00014167818604952906, "loss": 0.5278, "step": 79 }, { "epoch": 0.0004491826980439497, "grad_norm": 0.7382338643074036, "learning_rate": 0.00014016954246529696, "loss": 0.8645, "step": 80 }, { "epoch": 0.0004547974817694991, "grad_norm": 0.2650225758552551, "learning_rate": 0.00013864991692924523, "loss": 0.4649, "step": 81 }, { "epoch": 0.00046041226549504847, "grad_norm": 0.367811918258667, "learning_rate": 0.00013711972489182208, "loss": 0.7294, "step": 82 }, { "epoch": 0.00046602704922059786, "grad_norm": 0.4647292494773865, "learning_rate": 0.00013557938469225167, "loss": 0.5452, "step": 83 }, { "epoch": 0.0004716418329461472, "grad_norm": 0.6576372385025024, "learning_rate": 0.00013402931744416433, "loss": 0.9787, "step": 84 }, { "epoch": 0.0004772566166716966, "grad_norm": 0.5366365313529968, "learning_rate": 0.00013246994692046836, "loss": 0.7488, "step": 85 }, { "epoch": 0.00048287140039724596, "grad_norm": 0.5728768706321716, "learning_rate": 0.00013090169943749476, "loss": 0.6648, "step": 86 }, { "epoch": 0.0004884861841227954, "grad_norm": 0.6398264169692993, "learning_rate": 0.0001293250037384465, "loss": 0.8214, "step": 87 }, { "epoch": 0.0004941009678483447, "grad_norm": 1.2357827425003052, "learning_rate": 0.00012774029087618446, "loss": 0.8501, "step": 88 }, { "epoch": 0.000499715751573894, "grad_norm": 0.394092321395874, "learning_rate": 0.00012614799409538198, "loss": 0.3837, "step": 89 }, { "epoch": 0.0005053305352994435, "grad_norm": 0.6195949912071228, "learning_rate": 0.00012454854871407994, "loss": 0.8326, "step": 90 }, { "epoch": 0.0005109453190249928, "grad_norm": 0.435367226600647, "learning_rate": 0.00012294239200467516, "loss": 0.7117, "step": 91 }, { "epoch": 0.0005165601027505421, "grad_norm": 1.041673183441162, "learning_rate": 0.0001213299630743747, "loss": 1.0454, "step": 92 }, { "epoch": 0.0005221748864760916, "grad_norm": 0.5448310375213623, "learning_rate": 0.00011971170274514802, "loss": 0.6795, "step": 93 }, { "epoch": 0.0005277896702016409, "grad_norm": 0.5703837275505066, "learning_rate": 0.000118088053433211, "loss": 0.6598, "step": 94 }, { "epoch": 0.0005334044539271903, "grad_norm": 0.4229240417480469, "learning_rate": 0.00011645945902807341, "loss": 0.4612, "step": 95 }, { "epoch": 0.0005390192376527397, "grad_norm": 0.6660332679748535, "learning_rate": 0.0001148263647711842, "loss": 0.8873, "step": 96 }, { "epoch": 0.000544634021378289, "grad_norm": 0.6834558248519897, "learning_rate": 0.00011318921713420691, "loss": 0.7653, "step": 97 }, { "epoch": 0.0005502488051038384, "grad_norm": 0.4525357186794281, "learning_rate": 0.00011154846369695863, "loss": 0.4677, "step": 98 }, { "epoch": 0.0005558635888293878, "grad_norm": 0.34924590587615967, "learning_rate": 0.0001099045530250463, "loss": 0.4567, "step": 99 }, { "epoch": 0.0005614783725549371, "grad_norm": 0.7102425694465637, "learning_rate": 0.00010825793454723325, "loss": 0.4776, "step": 100 }, { "epoch": 0.0005614783725549371, "eval_loss": 0.7291257977485657, "eval_runtime": 21564.2046, "eval_samples_per_second": 1.739, "eval_steps_per_second": 1.739, "step": 100 }, { "epoch": 0.0005670931562804866, "grad_norm": 0.8440341949462891, "learning_rate": 0.00010660905843256994, "loss": 0.5559, "step": 101 }, { "epoch": 0.0005727079400060359, "grad_norm": 0.8778635859489441, "learning_rate": 0.00010495837546732224, "loss": 0.8599, "step": 102 }, { "epoch": 0.0005783227237315852, "grad_norm": 0.5961638689041138, "learning_rate": 0.00010330633693173082, "loss": 0.6385, "step": 103 }, { "epoch": 0.0005839375074571347, "grad_norm": 0.460720956325531, "learning_rate": 0.00010165339447663587, "loss": 0.4011, "step": 104 }, { "epoch": 0.000589552291182684, "grad_norm": 0.8094878196716309, "learning_rate": 0.0001, "loss": 0.7205, "step": 105 }, { "epoch": 0.0005951670749082333, "grad_norm": 0.5714880228042603, "learning_rate": 9.834660552336415e-05, "loss": 0.7492, "step": 106 }, { "epoch": 0.0006007818586337828, "grad_norm": 0.8501264452934265, "learning_rate": 9.669366306826919e-05, "loss": 0.8841, "step": 107 }, { "epoch": 0.0006063966423593321, "grad_norm": 0.6514793634414673, "learning_rate": 9.504162453267777e-05, "loss": 0.6724, "step": 108 }, { "epoch": 0.0006120114260848814, "grad_norm": 0.7907955050468445, "learning_rate": 9.339094156743007e-05, "loss": 0.454, "step": 109 }, { "epoch": 0.0006176262098104309, "grad_norm": 0.6769828200340271, "learning_rate": 9.174206545276677e-05, "loss": 0.7664, "step": 110 }, { "epoch": 0.0006232409935359802, "grad_norm": 0.6691708564758301, "learning_rate": 9.009544697495374e-05, "loss": 1.1113, "step": 111 }, { "epoch": 0.0006288557772615297, "grad_norm": 0.603244960308075, "learning_rate": 8.845153630304139e-05, "loss": 0.7398, "step": 112 }, { "epoch": 0.000634470560987079, "grad_norm": 0.5727530121803284, "learning_rate": 8.681078286579311e-05, "loss": 0.5164, "step": 113 }, { "epoch": 0.0006400853447126283, "grad_norm": 0.45102572441101074, "learning_rate": 8.517363522881579e-05, "loss": 0.4242, "step": 114 }, { "epoch": 0.0006457001284381778, "grad_norm": 0.5324817895889282, "learning_rate": 8.35405409719266e-05, "loss": 0.7243, "step": 115 }, { "epoch": 0.0006513149121637271, "grad_norm": 0.6306778788566589, "learning_rate": 8.191194656678904e-05, "loss": 0.8012, "step": 116 }, { "epoch": 0.0006569296958892764, "grad_norm": 0.6321919560432434, "learning_rate": 8.028829725485199e-05, "loss": 0.7433, "step": 117 }, { "epoch": 0.0006625444796148259, "grad_norm": 0.7640076279640198, "learning_rate": 7.867003692562534e-05, "loss": 0.6027, "step": 118 }, { "epoch": 0.0006681592633403752, "grad_norm": 0.4536445438861847, "learning_rate": 7.705760799532485e-05, "loss": 0.3564, "step": 119 }, { "epoch": 0.0006737740470659245, "grad_norm": 0.7957231998443604, "learning_rate": 7.54514512859201e-05, "loss": 0.7709, "step": 120 }, { "epoch": 0.000679388830791474, "grad_norm": 0.354618102312088, "learning_rate": 7.385200590461803e-05, "loss": 0.458, "step": 121 }, { "epoch": 0.0006850036145170233, "grad_norm": 0.6814515590667725, "learning_rate": 7.225970912381556e-05, "loss": 0.5223, "step": 122 }, { "epoch": 0.0006906183982425726, "grad_norm": 0.695183515548706, "learning_rate": 7.067499626155354e-05, "loss": 0.5944, "step": 123 }, { "epoch": 0.0006962331819681221, "grad_norm": 0.5911761522293091, "learning_rate": 6.909830056250527e-05, "loss": 0.57, "step": 124 }, { "epoch": 0.0007018479656936714, "grad_norm": 0.8290671110153198, "learning_rate": 6.753005307953167e-05, "loss": 0.7753, "step": 125 }, { "epoch": 0.0007074627494192208, "grad_norm": 0.5861778259277344, "learning_rate": 6.59706825558357e-05, "loss": 0.7765, "step": 126 }, { "epoch": 0.0007130775331447702, "grad_norm": 0.4672585725784302, "learning_rate": 6.442061530774834e-05, "loss": 0.4829, "step": 127 }, { "epoch": 0.0007186923168703195, "grad_norm": 0.47954103350639343, "learning_rate": 6.28802751081779e-05, "loss": 0.7241, "step": 128 }, { "epoch": 0.000724307100595869, "grad_norm": 0.45866596698760986, "learning_rate": 6.135008307075481e-05, "loss": 0.4806, "step": 129 }, { "epoch": 0.0007299218843214183, "grad_norm": 0.8541582226753235, "learning_rate": 5.983045753470308e-05, "loss": 0.6784, "step": 130 }, { "epoch": 0.0007355366680469676, "grad_norm": 0.6664092540740967, "learning_rate": 5.832181395047098e-05, "loss": 0.8403, "step": 131 }, { "epoch": 0.0007411514517725171, "grad_norm": 1.3184312582015991, "learning_rate": 5.6824564766150726e-05, "loss": 0.6905, "step": 132 }, { "epoch": 0.0007467662354980664, "grad_norm": 0.5796993374824524, "learning_rate": 5.533911931471936e-05, "loss": 0.61, "step": 133 }, { "epoch": 0.0007523810192236157, "grad_norm": 0.3954677879810333, "learning_rate": 5.386588370213124e-05, "loss": 0.4762, "step": 134 }, { "epoch": 0.0007579958029491652, "grad_norm": 0.2643950581550598, "learning_rate": 5.240526069629265e-05, "loss": 0.3033, "step": 135 }, { "epoch": 0.0007636105866747145, "grad_norm": 0.749079167842865, "learning_rate": 5.095764961694922e-05, "loss": 0.8238, "step": 136 }, { "epoch": 0.0007692253704002639, "grad_norm": 0.5847424268722534, "learning_rate": 4.952344622651566e-05, "loss": 0.5255, "step": 137 }, { "epoch": 0.0007748401541258133, "grad_norm": 0.46530458331108093, "learning_rate": 4.810304262187852e-05, "loss": 0.5624, "step": 138 }, { "epoch": 0.0007804549378513626, "grad_norm": 0.49851590394973755, "learning_rate": 4.669682712720065e-05, "loss": 0.7465, "step": 139 }, { "epoch": 0.000786069721576912, "grad_norm": 0.7142413258552551, "learning_rate": 4.530518418775733e-05, "loss": 0.7593, "step": 140 }, { "epoch": 0.0007916845053024614, "grad_norm": 0.6527928113937378, "learning_rate": 4.392849426483274e-05, "loss": 0.7741, "step": 141 }, { "epoch": 0.0007972992890280107, "grad_norm": 0.377798855304718, "learning_rate": 4.256713373170564e-05, "loss": 0.3863, "step": 142 }, { "epoch": 0.0008029140727535601, "grad_norm": 0.6521917581558228, "learning_rate": 4.12214747707527e-05, "loss": 0.6036, "step": 143 }, { "epoch": 0.0008085288564791095, "grad_norm": 0.6146506071090698, "learning_rate": 3.9891885271697496e-05, "loss": 0.6178, "step": 144 }, { "epoch": 0.0008141436402046588, "grad_norm": 0.36121559143066406, "learning_rate": 3.857872873103322e-05, "loss": 0.4544, "step": 145 }, { "epoch": 0.0008197584239302083, "grad_norm": 0.3757206201553345, "learning_rate": 3.7282364152646297e-05, "loss": 0.6141, "step": 146 }, { "epoch": 0.0008253732076557576, "grad_norm": 0.604722261428833, "learning_rate": 3.600314594966834e-05, "loss": 0.6223, "step": 147 }, { "epoch": 0.000830987991381307, "grad_norm": 0.761726438999176, "learning_rate": 3.4741423847583134e-05, "loss": 0.725, "step": 148 }, { "epoch": 0.0008366027751068564, "grad_norm": 0.586891233921051, "learning_rate": 3.349754278861517e-05, "loss": 0.936, "step": 149 }, { "epoch": 0.0008422175588324057, "grad_norm": 0.6373620629310608, "learning_rate": 3.227184283742591e-05, "loss": 0.9005, "step": 150 }, { "epoch": 0.0008422175588324057, "eval_loss": 0.7196696400642395, "eval_runtime": 21575.1375, "eval_samples_per_second": 1.738, "eval_steps_per_second": 1.738, "step": 150 }, { "epoch": 0.0008478323425579551, "grad_norm": 0.5523321628570557, "learning_rate": 3.106465908814342e-05, "loss": 0.6273, "step": 151 }, { "epoch": 0.0008534471262835045, "grad_norm": 0.5256928205490112, "learning_rate": 2.9876321572751144e-05, "loss": 0.6441, "step": 152 }, { "epoch": 0.0008590619100090538, "grad_norm": 0.595013439655304, "learning_rate": 2.87071551708603e-05, "loss": 0.5625, "step": 153 }, { "epoch": 0.0008646766937346032, "grad_norm": 0.38558343052864075, "learning_rate": 2.7557479520891104e-05, "loss": 0.79, "step": 154 }, { "epoch": 0.0008702914774601526, "grad_norm": 0.416737824678421, "learning_rate": 2.6427608932686843e-05, "loss": 0.6048, "step": 155 }, { "epoch": 0.0008759062611857019, "grad_norm": 0.7657343745231628, "learning_rate": 2.5317852301584643e-05, "loss": 0.6573, "step": 156 }, { "epoch": 0.0008815210449112513, "grad_norm": 1.0111124515533447, "learning_rate": 2.422851302396655e-05, "loss": 0.8601, "step": 157 }, { "epoch": 0.0008871358286368007, "grad_norm": 0.5592943429946899, "learning_rate": 2.315988891431412e-05, "loss": 0.757, "step": 158 }, { "epoch": 0.00089275061236235, "grad_norm": 0.38879042863845825, "learning_rate": 2.2112272123788768e-05, "loss": 0.5481, "step": 159 }, { "epoch": 0.0008983653960878994, "grad_norm": 0.5495730638504028, "learning_rate": 2.1085949060360654e-05, "loss": 1.0124, "step": 160 }, { "epoch": 0.0009039801798134488, "grad_norm": 0.4455462396144867, "learning_rate": 2.008120031050753e-05, "loss": 0.6964, "step": 161 }, { "epoch": 0.0009095949635389982, "grad_norm": 0.2935314476490021, "learning_rate": 1.9098300562505266e-05, "loss": 0.4621, "step": 162 }, { "epoch": 0.0009152097472645476, "grad_norm": 0.48424121737480164, "learning_rate": 1.8137518531330767e-05, "loss": 0.6099, "step": 163 }, { "epoch": 0.0009208245309900969, "grad_norm": 0.784777820110321, "learning_rate": 1.7199116885197995e-05, "loss": 0.9775, "step": 164 }, { "epoch": 0.0009264393147156463, "grad_norm": 0.37943509221076965, "learning_rate": 1.6283352173747145e-05, "loss": 0.6561, "step": 165 }, { "epoch": 0.0009320540984411957, "grad_norm": 0.5564980506896973, "learning_rate": 1.5390474757906446e-05, "loss": 0.5258, "step": 166 }, { "epoch": 0.000937668882166745, "grad_norm": 0.5110490918159485, "learning_rate": 1.4520728741446089e-05, "loss": 0.7777, "step": 167 }, { "epoch": 0.0009432836658922944, "grad_norm": 0.7255272269248962, "learning_rate": 1.3674351904242611e-05, "loss": 0.8582, "step": 168 }, { "epoch": 0.0009488984496178438, "grad_norm": 0.5513525605201721, "learning_rate": 1.2851575637272262e-05, "loss": 0.3705, "step": 169 }, { "epoch": 0.0009545132333433932, "grad_norm": 0.4703540802001953, "learning_rate": 1.2052624879351104e-05, "loss": 0.746, "step": 170 }, { "epoch": 0.0009601280170689425, "grad_norm": 0.7460604906082153, "learning_rate": 1.1277718055638819e-05, "loss": 0.6982, "step": 171 }, { "epoch": 0.0009657428007944919, "grad_norm": 0.4515000879764557, "learning_rate": 1.0527067017923654e-05, "loss": 0.6835, "step": 172 }, { "epoch": 0.0009713575845200413, "grad_norm": 0.6428802013397217, "learning_rate": 9.80087698670411e-06, "loss": 0.8466, "step": 173 }, { "epoch": 0.0009769723682455907, "grad_norm": 0.5509766340255737, "learning_rate": 9.09934649508375e-06, "loss": 0.5473, "step": 174 }, { "epoch": 0.00098258715197114, "grad_norm": 0.5568180084228516, "learning_rate": 8.422667334494249e-06, "loss": 0.8137, "step": 175 }, { "epoch": 0.0009882019356966894, "grad_norm": 0.5117676258087158, "learning_rate": 7.771024502261526e-06, "loss": 0.7012, "step": 176 }, { "epoch": 0.0009938167194222388, "grad_norm": 0.42283153533935547, "learning_rate": 7.144596151029303e-06, "loss": 0.6836, "step": 177 }, { "epoch": 0.000999431503147788, "grad_norm": 0.8226689100265503, "learning_rate": 6.543553540053926e-06, "loss": 0.8315, "step": 178 }, { "epoch": 0.0010050462868733375, "grad_norm": 0.8063510656356812, "learning_rate": 5.968060988383883e-06, "loss": 0.7767, "step": 179 }, { "epoch": 0.001010661070598887, "grad_norm": 0.4655192494392395, "learning_rate": 5.418275829936537e-06, "loss": 0.4222, "step": 180 }, { "epoch": 0.0010162758543244361, "grad_norm": 0.8273990154266357, "learning_rate": 4.8943483704846475e-06, "loss": 0.8631, "step": 181 }, { "epoch": 0.0010218906380499856, "grad_norm": 0.4272424876689911, "learning_rate": 4.3964218465642355e-06, "loss": 0.5201, "step": 182 }, { "epoch": 0.001027505421775535, "grad_norm": 0.39918985962867737, "learning_rate": 3.924632386315186e-06, "loss": 0.4394, "step": 183 }, { "epoch": 0.0010331202055010843, "grad_norm": 0.4294412136077881, "learning_rate": 3.4791089722651436e-06, "loss": 0.5491, "step": 184 }, { "epoch": 0.0010387349892266337, "grad_norm": 0.398103266954422, "learning_rate": 3.059973406066963e-06, "loss": 0.6337, "step": 185 }, { "epoch": 0.0010443497729521831, "grad_norm": 1.2630798816680908, "learning_rate": 2.667340275199426e-06, "loss": 0.8147, "step": 186 }, { "epoch": 0.0010499645566777326, "grad_norm": 0.3572145998477936, "learning_rate": 2.3013169216400733e-06, "loss": 0.453, "step": 187 }, { "epoch": 0.0010555793404032818, "grad_norm": 0.6993924975395203, "learning_rate": 1.9620034125190644e-06, "loss": 0.8392, "step": 188 }, { "epoch": 0.0010611941241288312, "grad_norm": 0.4211289584636688, "learning_rate": 1.6494925127617634e-06, "loss": 0.637, "step": 189 }, { "epoch": 0.0010668089078543807, "grad_norm": 0.5733233690261841, "learning_rate": 1.3638696597277679e-06, "loss": 0.778, "step": 190 }, { "epoch": 0.00107242369157993, "grad_norm": 0.4856647849082947, "learning_rate": 1.1052129398531507e-06, "loss": 0.5214, "step": 191 }, { "epoch": 0.0010780384753054793, "grad_norm": 0.5088161826133728, "learning_rate": 8.735930673024806e-07, "loss": 0.7683, "step": 192 }, { "epoch": 0.0010836532590310288, "grad_norm": 0.4626195430755615, "learning_rate": 6.690733646361857e-07, "loss": 0.5463, "step": 193 }, { "epoch": 0.001089268042756578, "grad_norm": 0.32035353779792786, "learning_rate": 4.917097454988584e-07, "loss": 0.2829, "step": 194 }, { "epoch": 0.0010948828264821275, "grad_norm": 1.2485867738723755, "learning_rate": 3.415506993330153e-07, "loss": 0.6357, "step": 195 }, { "epoch": 0.001100497610207677, "grad_norm": 0.6193792819976807, "learning_rate": 2.1863727812254653e-07, "loss": 0.8182, "step": 196 }, { "epoch": 0.0011061123939332261, "grad_norm": 0.6518222093582153, "learning_rate": 1.230030851695263e-07, "loss": 0.7356, "step": 197 }, { "epoch": 0.0011117271776587756, "grad_norm": 0.6630931496620178, "learning_rate": 5.467426590739511e-08, "loss": 0.738, "step": 198 }, { "epoch": 0.001117341961384325, "grad_norm": 0.5655919909477234, "learning_rate": 1.3669500753099585e-08, "loss": 0.6549, "step": 199 }, { "epoch": 0.0011229567451098742, "grad_norm": 0.4997493624687195, "learning_rate": 0.0, "loss": 0.6157, "step": 200 }, { "epoch": 0.0011229567451098742, "eval_loss": 0.7180939316749573, "eval_runtime": 21573.751, "eval_samples_per_second": 1.738, "eval_steps_per_second": 1.738, "step": 200 } ], "logging_steps": 1, "max_steps": 200, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 5, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.477589814214656e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null }