{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.27279497818916904, "eval_steps": 2000, "global_step": 641, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00042557718906266625, "grad_norm": 0.3652644157409668, "learning_rate": 1e-05, "loss": 1.7802, "step": 1 }, { "epoch": 0.00042557718906266625, "eval_loss": 1.6649078130722046, "eval_runtime": 273.5839, "eval_samples_per_second": 7.234, "eval_steps_per_second": 1.809, "step": 1 }, { "epoch": 0.0008511543781253325, "grad_norm": 0.4177471697330475, "learning_rate": 2e-05, "loss": 1.579, "step": 2 }, { "epoch": 0.0012767315671879987, "grad_norm": 0.30818939208984375, "learning_rate": 3e-05, "loss": 1.6122, "step": 3 }, { "epoch": 0.001702308756250665, "grad_norm": 0.37045714259147644, "learning_rate": 4e-05, "loss": 1.72, "step": 4 }, { "epoch": 0.002127885945313331, "grad_norm": 0.34079983830451965, "learning_rate": 5e-05, "loss": 1.5969, "step": 5 }, { "epoch": 0.0025534631343759975, "grad_norm": 0.381185919046402, "learning_rate": 6e-05, "loss": 1.7075, "step": 6 }, { "epoch": 0.0029790403234386635, "grad_norm": 0.43642061948776245, "learning_rate": 7e-05, "loss": 1.5826, "step": 7 }, { "epoch": 0.00340461751250133, "grad_norm": 0.395150750875473, "learning_rate": 8e-05, "loss": 1.666, "step": 8 }, { "epoch": 0.003830194701563996, "grad_norm": 0.4904192090034485, "learning_rate": 9e-05, "loss": 1.5554, "step": 9 }, { "epoch": 0.004255771890626662, "grad_norm": 0.4711903929710388, "learning_rate": 0.0001, "loss": 1.6409, "step": 10 }, { "epoch": 0.0046813490796893285, "grad_norm": 0.443033367395401, "learning_rate": 9.99993803019373e-05, "loss": 1.5268, "step": 11 }, { "epoch": 0.005106926268751995, "grad_norm": 0.4341345727443695, "learning_rate": 9.999752122311022e-05, "loss": 1.6879, "step": 12 }, { "epoch": 0.005532503457814661, "grad_norm": 0.36973270773887634, "learning_rate": 9.999442280960142e-05, "loss": 1.5406, "step": 13 }, { "epoch": 0.005958080646877327, "grad_norm": 0.41089990735054016, "learning_rate": 9.99900851382142e-05, "loss": 1.3971, "step": 14 }, { "epoch": 0.0063836578359399935, "grad_norm": 0.37346774339675903, "learning_rate": 9.998450831647039e-05, "loss": 1.5966, "step": 15 }, { "epoch": 0.00680923502500266, "grad_norm": 0.37496069073677063, "learning_rate": 9.99776924826078e-05, "loss": 1.4682, "step": 16 }, { "epoch": 0.007234812214065326, "grad_norm": 0.496907114982605, "learning_rate": 9.996963780557683e-05, "loss": 1.4213, "step": 17 }, { "epoch": 0.007660389403127992, "grad_norm": 0.39572206139564514, "learning_rate": 9.996034448503616e-05, "loss": 1.3925, "step": 18 }, { "epoch": 0.00808596659219066, "grad_norm": 0.3716890215873718, "learning_rate": 9.994981275134791e-05, "loss": 1.3852, "step": 19 }, { "epoch": 0.008511543781253324, "grad_norm": 0.39378899335861206, "learning_rate": 9.993804286557188e-05, "loss": 1.4922, "step": 20 }, { "epoch": 0.00893712097031599, "grad_norm": 0.394651859998703, "learning_rate": 9.992503511945907e-05, "loss": 1.4906, "step": 21 }, { "epoch": 0.009362698159378657, "grad_norm": 0.43118777871131897, "learning_rate": 9.991078983544452e-05, "loss": 1.355, "step": 22 }, { "epoch": 0.009788275348441323, "grad_norm": 0.4793911278247833, "learning_rate": 9.989530736663918e-05, "loss": 1.5297, "step": 23 }, { "epoch": 0.01021385253750399, "grad_norm": 0.4128536283969879, "learning_rate": 9.987858809682132e-05, "loss": 1.4856, "step": 24 }, { "epoch": 0.010639429726566656, "grad_norm": 0.33675417304039, "learning_rate": 9.986063244042689e-05, "loss": 1.4352, "step": 25 }, { "epoch": 0.011065006915629323, "grad_norm": 0.35233640670776367, "learning_rate": 9.984144084253932e-05, "loss": 1.3988, "step": 26 }, { "epoch": 0.01149058410469199, "grad_norm": 0.37231576442718506, "learning_rate": 9.982101377887845e-05, "loss": 1.4834, "step": 27 }, { "epoch": 0.011916161293754654, "grad_norm": 0.35900095105171204, "learning_rate": 9.979935175578873e-05, "loss": 1.441, "step": 28 }, { "epoch": 0.01234173848281732, "grad_norm": 0.33582940697669983, "learning_rate": 9.977645531022672e-05, "loss": 1.3407, "step": 29 }, { "epoch": 0.012767315671879987, "grad_norm": 0.3340405225753784, "learning_rate": 9.975232500974776e-05, "loss": 1.3519, "step": 30 }, { "epoch": 0.013192892860942653, "grad_norm": 0.32448646426200867, "learning_rate": 9.972696145249185e-05, "loss": 1.3666, "step": 31 }, { "epoch": 0.01361847005000532, "grad_norm": 0.31764093041419983, "learning_rate": 9.970036526716889e-05, "loss": 1.3873, "step": 32 }, { "epoch": 0.014044047239067986, "grad_norm": 0.37046271562576294, "learning_rate": 9.967253711304304e-05, "loss": 1.4668, "step": 33 }, { "epoch": 0.014469624428130653, "grad_norm": 0.35011404752731323, "learning_rate": 9.964347767991644e-05, "loss": 1.4616, "step": 34 }, { "epoch": 0.01489520161719332, "grad_norm": 0.37187859416007996, "learning_rate": 9.961318768811209e-05, "loss": 1.5379, "step": 35 }, { "epoch": 0.015320778806255984, "grad_norm": 0.37234461307525635, "learning_rate": 9.958166788845592e-05, "loss": 1.5205, "step": 36 }, { "epoch": 0.01574635599531865, "grad_norm": 0.3146509528160095, "learning_rate": 9.954891906225832e-05, "loss": 1.3043, "step": 37 }, { "epoch": 0.01617193318438132, "grad_norm": 0.29514676332473755, "learning_rate": 9.951494202129461e-05, "loss": 1.2737, "step": 38 }, { "epoch": 0.016597510373443983, "grad_norm": 0.3454630672931671, "learning_rate": 9.94797376077851e-05, "loss": 1.4062, "step": 39 }, { "epoch": 0.017023087562506648, "grad_norm": 0.32935354113578796, "learning_rate": 9.944330669437402e-05, "loss": 1.3869, "step": 40 }, { "epoch": 0.017448664751569316, "grad_norm": 0.3183695077896118, "learning_rate": 9.940565018410805e-05, "loss": 1.4048, "step": 41 }, { "epoch": 0.01787424194063198, "grad_norm": 0.3441618084907532, "learning_rate": 9.936676901041386e-05, "loss": 1.4645, "step": 42 }, { "epoch": 0.01829981912969465, "grad_norm": 0.2902175784111023, "learning_rate": 9.932666413707494e-05, "loss": 1.4638, "step": 43 }, { "epoch": 0.018725396318757314, "grad_norm": 0.3078022003173828, "learning_rate": 9.92853365582078e-05, "loss": 1.4624, "step": 44 }, { "epoch": 0.019150973507819982, "grad_norm": 0.34494903683662415, "learning_rate": 9.924278729823727e-05, "loss": 1.458, "step": 45 }, { "epoch": 0.019576550696882647, "grad_norm": 0.3399023413658142, "learning_rate": 9.91990174118711e-05, "loss": 1.4537, "step": 46 }, { "epoch": 0.02000212788594531, "grad_norm": 0.3627009391784668, "learning_rate": 9.915402798407383e-05, "loss": 1.366, "step": 47 }, { "epoch": 0.02042770507500798, "grad_norm": 0.3227439224720001, "learning_rate": 9.910782013003994e-05, "loss": 1.4666, "step": 48 }, { "epoch": 0.020853282264070645, "grad_norm": 0.37096184492111206, "learning_rate": 9.906039499516611e-05, "loss": 1.3548, "step": 49 }, { "epoch": 0.021278859453133313, "grad_norm": 0.3019040822982788, "learning_rate": 9.90117537550229e-05, "loss": 1.4061, "step": 50 }, { "epoch": 0.021704436642195978, "grad_norm": 0.4224420487880707, "learning_rate": 9.896189761532563e-05, "loss": 1.4288, "step": 51 }, { "epoch": 0.022130013831258646, "grad_norm": 0.3436116874217987, "learning_rate": 9.89108278119044e-05, "loss": 1.4487, "step": 52 }, { "epoch": 0.02255559102032131, "grad_norm": 0.3421087861061096, "learning_rate": 9.885854561067357e-05, "loss": 1.4266, "step": 53 }, { "epoch": 0.02298116820938398, "grad_norm": 0.3183712661266327, "learning_rate": 9.880505230760027e-05, "loss": 1.4702, "step": 54 }, { "epoch": 0.023406745398446643, "grad_norm": 0.4090493321418762, "learning_rate": 9.875034922867236e-05, "loss": 1.5018, "step": 55 }, { "epoch": 0.023832322587509308, "grad_norm": 0.3179483115673065, "learning_rate": 9.869443772986551e-05, "loss": 1.4036, "step": 56 }, { "epoch": 0.024257899776571976, "grad_norm": 0.29585397243499756, "learning_rate": 9.863731919710963e-05, "loss": 1.3401, "step": 57 }, { "epoch": 0.02468347696563464, "grad_norm": 0.33189573884010315, "learning_rate": 9.85789950462545e-05, "loss": 1.4556, "step": 58 }, { "epoch": 0.02510905415469731, "grad_norm": 0.3170922100543976, "learning_rate": 9.85194667230346e-05, "loss": 1.3966, "step": 59 }, { "epoch": 0.025534631343759974, "grad_norm": 0.3412818908691406, "learning_rate": 9.845873570303346e-05, "loss": 1.4879, "step": 60 }, { "epoch": 0.025960208532822642, "grad_norm": 0.32975345849990845, "learning_rate": 9.839680349164684e-05, "loss": 1.4782, "step": 61 }, { "epoch": 0.026385785721885307, "grad_norm": 0.35211431980133057, "learning_rate": 9.833367162404563e-05, "loss": 1.4164, "step": 62 }, { "epoch": 0.02681136291094797, "grad_norm": 0.3229023516178131, "learning_rate": 9.826934166513767e-05, "loss": 1.4521, "step": 63 }, { "epoch": 0.02723694010001064, "grad_norm": 0.3026021718978882, "learning_rate": 9.8203815209529e-05, "loss": 1.4134, "step": 64 }, { "epoch": 0.027662517289073305, "grad_norm": 0.33294665813446045, "learning_rate": 9.813709388148429e-05, "loss": 1.4614, "step": 65 }, { "epoch": 0.028088094478135973, "grad_norm": 0.32732123136520386, "learning_rate": 9.806917933488668e-05, "loss": 1.4745, "step": 66 }, { "epoch": 0.028513671667198637, "grad_norm": 0.3315070569515228, "learning_rate": 9.800007325319666e-05, "loss": 1.4696, "step": 67 }, { "epoch": 0.028939248856261306, "grad_norm": 0.3497024476528168, "learning_rate": 9.792977734941049e-05, "loss": 1.4456, "step": 68 }, { "epoch": 0.02936482604532397, "grad_norm": 0.3212997615337372, "learning_rate": 9.785829336601751e-05, "loss": 1.4609, "step": 69 }, { "epoch": 0.02979040323438664, "grad_norm": 0.3074348270893097, "learning_rate": 9.778562307495722e-05, "loss": 1.4099, "step": 70 }, { "epoch": 0.030215980423449303, "grad_norm": 0.3627077043056488, "learning_rate": 9.771176827757512e-05, "loss": 1.4311, "step": 71 }, { "epoch": 0.030641557612511968, "grad_norm": 0.2996334135532379, "learning_rate": 9.763673080457823e-05, "loss": 1.3821, "step": 72 }, { "epoch": 0.031067134801574636, "grad_norm": 0.3352314233779907, "learning_rate": 9.756051251598962e-05, "loss": 1.4393, "step": 73 }, { "epoch": 0.0314927119906373, "grad_norm": 0.34438130259513855, "learning_rate": 9.748311530110229e-05, "loss": 1.4367, "step": 74 }, { "epoch": 0.031918289179699966, "grad_norm": 0.3071272373199463, "learning_rate": 9.740454107843243e-05, "loss": 1.3883, "step": 75 }, { "epoch": 0.03234386636876264, "grad_norm": 0.32912176847457886, "learning_rate": 9.732479179567177e-05, "loss": 1.4138, "step": 76 }, { "epoch": 0.0327694435578253, "grad_norm": 0.3019627332687378, "learning_rate": 9.724386942963938e-05, "loss": 1.3916, "step": 77 }, { "epoch": 0.03319502074688797, "grad_norm": 0.35912489891052246, "learning_rate": 9.716177598623257e-05, "loss": 1.3278, "step": 78 }, { "epoch": 0.03362059793595063, "grad_norm": 0.31681278347969055, "learning_rate": 9.707851350037726e-05, "loss": 1.2639, "step": 79 }, { "epoch": 0.034046175125013296, "grad_norm": 0.37369856238365173, "learning_rate": 9.699408403597749e-05, "loss": 1.4526, "step": 80 }, { "epoch": 0.03447175231407597, "grad_norm": 0.3292621076107025, "learning_rate": 9.69084896858643e-05, "loss": 1.3093, "step": 81 }, { "epoch": 0.03489732950313863, "grad_norm": 0.32593628764152527, "learning_rate": 9.68217325717438e-05, "loss": 1.4091, "step": 82 }, { "epoch": 0.0353229066922013, "grad_norm": 0.34049278497695923, "learning_rate": 9.67338148441446e-05, "loss": 1.5005, "step": 83 }, { "epoch": 0.03574848388126396, "grad_norm": 0.3360632061958313, "learning_rate": 9.664473868236452e-05, "loss": 1.4722, "step": 84 }, { "epoch": 0.036174061070326634, "grad_norm": 0.3554811477661133, "learning_rate": 9.655450629441659e-05, "loss": 1.4261, "step": 85 }, { "epoch": 0.0365996382593893, "grad_norm": 0.35133641958236694, "learning_rate": 9.646311991697419e-05, "loss": 1.3752, "step": 86 }, { "epoch": 0.03702521544845196, "grad_norm": 0.29005640745162964, "learning_rate": 9.637058181531582e-05, "loss": 1.2862, "step": 87 }, { "epoch": 0.03745079263751463, "grad_norm": 0.3389727771282196, "learning_rate": 9.627689428326873e-05, "loss": 1.3316, "step": 88 }, { "epoch": 0.03787636982657729, "grad_norm": 0.35122182965278625, "learning_rate": 9.618205964315223e-05, "loss": 1.4339, "step": 89 }, { "epoch": 0.038301947015639964, "grad_norm": 0.3106866776943207, "learning_rate": 9.608608024572004e-05, "loss": 1.4314, "step": 90 }, { "epoch": 0.03872752420470263, "grad_norm": 0.32559552788734436, "learning_rate": 9.598895847010198e-05, "loss": 1.4047, "step": 91 }, { "epoch": 0.039153101393765294, "grad_norm": 0.3118344843387604, "learning_rate": 9.589069672374515e-05, "loss": 1.4316, "step": 92 }, { "epoch": 0.03957867858282796, "grad_norm": 0.33870643377304077, "learning_rate": 9.579129744235408e-05, "loss": 1.4269, "step": 93 }, { "epoch": 0.04000425577189062, "grad_norm": 0.3164644241333008, "learning_rate": 9.569076308983045e-05, "loss": 1.4331, "step": 94 }, { "epoch": 0.040429832960953295, "grad_norm": 0.3030961751937866, "learning_rate": 9.5589096158212e-05, "loss": 1.3635, "step": 95 }, { "epoch": 0.04085541015001596, "grad_norm": 0.3118191063404083, "learning_rate": 9.548629916761076e-05, "loss": 1.3997, "step": 96 }, { "epoch": 0.041280987339078624, "grad_norm": 0.2993659973144531, "learning_rate": 9.538237466615058e-05, "loss": 1.3914, "step": 97 }, { "epoch": 0.04170656452814129, "grad_norm": 0.3075704872608185, "learning_rate": 9.527732522990391e-05, "loss": 1.413, "step": 98 }, { "epoch": 0.04213214171720396, "grad_norm": 0.3316209316253662, "learning_rate": 9.517115346282807e-05, "loss": 1.4021, "step": 99 }, { "epoch": 0.042557718906266626, "grad_norm": 0.2849251627922058, "learning_rate": 9.50638619967006e-05, "loss": 1.3836, "step": 100 }, { "epoch": 0.04298329609532929, "grad_norm": 0.3636033535003662, "learning_rate": 9.495545349105401e-05, "loss": 1.3604, "step": 101 }, { "epoch": 0.043408873284391955, "grad_norm": 0.3679890036582947, "learning_rate": 9.484593063310998e-05, "loss": 1.4271, "step": 102 }, { "epoch": 0.04383445047345462, "grad_norm": 0.36696523427963257, "learning_rate": 9.47352961377126e-05, "loss": 1.4735, "step": 103 }, { "epoch": 0.04426002766251729, "grad_norm": 0.33296412229537964, "learning_rate": 9.462355274726116e-05, "loss": 1.3739, "step": 104 }, { "epoch": 0.044685604851579956, "grad_norm": 0.3318886160850525, "learning_rate": 9.451070323164218e-05, "loss": 1.4887, "step": 105 }, { "epoch": 0.04511118204064262, "grad_norm": 0.3170441687107086, "learning_rate": 9.439675038816072e-05, "loss": 1.4224, "step": 106 }, { "epoch": 0.045536759229705286, "grad_norm": 0.29584380984306335, "learning_rate": 9.428169704147101e-05, "loss": 1.3517, "step": 107 }, { "epoch": 0.04596233641876796, "grad_norm": 0.36917728185653687, "learning_rate": 9.416554604350649e-05, "loss": 1.4098, "step": 108 }, { "epoch": 0.04638791360783062, "grad_norm": 0.3215009868144989, "learning_rate": 9.404830027340912e-05, "loss": 1.2738, "step": 109 }, { "epoch": 0.04681349079689329, "grad_norm": 0.3498933017253876, "learning_rate": 9.392996263745795e-05, "loss": 1.3365, "step": 110 }, { "epoch": 0.04723906798595595, "grad_norm": 0.30752959847450256, "learning_rate": 9.381053606899713e-05, "loss": 1.4227, "step": 111 }, { "epoch": 0.047664645175018616, "grad_norm": 0.304813951253891, "learning_rate": 9.36900235283632e-05, "loss": 1.3317, "step": 112 }, { "epoch": 0.04809022236408129, "grad_norm": 0.323702871799469, "learning_rate": 9.356842800281164e-05, "loss": 1.4016, "step": 113 }, { "epoch": 0.04851579955314395, "grad_norm": 0.31091248989105225, "learning_rate": 9.344575250644297e-05, "loss": 1.4067, "step": 114 }, { "epoch": 0.04894137674220662, "grad_norm": 0.37255966663360596, "learning_rate": 9.332200008012784e-05, "loss": 1.4713, "step": 115 }, { "epoch": 0.04936695393126928, "grad_norm": 0.3643665015697479, "learning_rate": 9.31971737914318e-05, "loss": 1.4232, "step": 116 }, { "epoch": 0.04979253112033195, "grad_norm": 0.3126155138015747, "learning_rate": 9.307127673453927e-05, "loss": 1.3248, "step": 117 }, { "epoch": 0.05021810830939462, "grad_norm": 0.3057401478290558, "learning_rate": 9.29443120301767e-05, "loss": 1.4206, "step": 118 }, { "epoch": 0.05064368549845728, "grad_norm": 0.3248518109321594, "learning_rate": 9.281628282553536e-05, "loss": 1.3366, "step": 119 }, { "epoch": 0.05106926268751995, "grad_norm": 0.3369111716747284, "learning_rate": 9.268719229419325e-05, "loss": 1.41, "step": 120 }, { "epoch": 0.05149483987658261, "grad_norm": 0.32806459069252014, "learning_rate": 9.255704363603645e-05, "loss": 1.3577, "step": 121 }, { "epoch": 0.051920417065645284, "grad_norm": 0.3423938751220703, "learning_rate": 9.242584007717983e-05, "loss": 1.4803, "step": 122 }, { "epoch": 0.05234599425470795, "grad_norm": 0.2834835350513458, "learning_rate": 9.229358486988702e-05, "loss": 1.3583, "step": 123 }, { "epoch": 0.052771571443770614, "grad_norm": 0.3676677644252777, "learning_rate": 9.216028129248986e-05, "loss": 1.5176, "step": 124 }, { "epoch": 0.05319714863283328, "grad_norm": 0.32076701521873474, "learning_rate": 9.202593264930708e-05, "loss": 1.3902, "step": 125 }, { "epoch": 0.05362272582189594, "grad_norm": 0.31246402859687805, "learning_rate": 9.189054227056247e-05, "loss": 1.3882, "step": 126 }, { "epoch": 0.054048303010958615, "grad_norm": 0.3379463851451874, "learning_rate": 9.175411351230222e-05, "loss": 1.268, "step": 127 }, { "epoch": 0.05447388020002128, "grad_norm": 0.30225270986557007, "learning_rate": 9.161664975631184e-05, "loss": 1.4595, "step": 128 }, { "epoch": 0.054899457389083944, "grad_norm": 0.3265242874622345, "learning_rate": 9.147815441003223e-05, "loss": 1.3656, "step": 129 }, { "epoch": 0.05532503457814661, "grad_norm": 0.3495875597000122, "learning_rate": 9.133863090647532e-05, "loss": 1.4378, "step": 130 }, { "epoch": 0.05575061176720928, "grad_norm": 0.3625856935977936, "learning_rate": 9.119808270413891e-05, "loss": 1.4344, "step": 131 }, { "epoch": 0.056176188956271945, "grad_norm": 0.304872065782547, "learning_rate": 9.105651328692093e-05, "loss": 1.3909, "step": 132 }, { "epoch": 0.05660176614533461, "grad_norm": 0.3018715977668762, "learning_rate": 9.091392616403314e-05, "loss": 1.3361, "step": 133 }, { "epoch": 0.057027343334397275, "grad_norm": 0.27433308959007263, "learning_rate": 9.077032486991408e-05, "loss": 1.226, "step": 134 }, { "epoch": 0.05745292052345994, "grad_norm": 0.3812541663646698, "learning_rate": 9.062571296414154e-05, "loss": 1.3638, "step": 135 }, { "epoch": 0.05787849771252261, "grad_norm": 0.33055293560028076, "learning_rate": 9.048009403134417e-05, "loss": 1.4088, "step": 136 }, { "epoch": 0.058304074901585276, "grad_norm": 0.325934499502182, "learning_rate": 9.033347168111282e-05, "loss": 1.3748, "step": 137 }, { "epoch": 0.05872965209064794, "grad_norm": 0.327552855014801, "learning_rate": 9.018584954791096e-05, "loss": 1.4807, "step": 138 }, { "epoch": 0.059155229279710606, "grad_norm": 0.3454499840736389, "learning_rate": 9.003723129098458e-05, "loss": 1.4543, "step": 139 }, { "epoch": 0.05958080646877328, "grad_norm": 0.312281996011734, "learning_rate": 8.98876205942715e-05, "loss": 1.4524, "step": 140 }, { "epoch": 0.06000638365783594, "grad_norm": 0.31058740615844727, "learning_rate": 8.97370211663101e-05, "loss": 1.4508, "step": 141 }, { "epoch": 0.06043196084689861, "grad_norm": 0.3037337362766266, "learning_rate": 8.958543674014732e-05, "loss": 1.2849, "step": 142 }, { "epoch": 0.06085753803596127, "grad_norm": 0.3011512756347656, "learning_rate": 8.943287107324617e-05, "loss": 1.3596, "step": 143 }, { "epoch": 0.061283115225023936, "grad_norm": 0.3848366141319275, "learning_rate": 8.927932794739257e-05, "loss": 1.3857, "step": 144 }, { "epoch": 0.06170869241408661, "grad_norm": 0.31173983216285706, "learning_rate": 8.912481116860166e-05, "loss": 1.4048, "step": 145 }, { "epoch": 0.06213426960314927, "grad_norm": 0.3007739782333374, "learning_rate": 8.896932456702332e-05, "loss": 1.2664, "step": 146 }, { "epoch": 0.06255984679221194, "grad_norm": 0.2995300889015198, "learning_rate": 8.881287199684743e-05, "loss": 1.3843, "step": 147 }, { "epoch": 0.0629854239812746, "grad_norm": 0.31542861461639404, "learning_rate": 8.865545733620815e-05, "loss": 1.472, "step": 148 }, { "epoch": 0.06341100117033727, "grad_norm": 0.32314345240592957, "learning_rate": 8.84970844870879e-05, "loss": 1.4709, "step": 149 }, { "epoch": 0.06383657835939993, "grad_norm": 0.30198317766189575, "learning_rate": 8.83377573752206e-05, "loss": 1.4225, "step": 150 }, { "epoch": 0.0642621555484626, "grad_norm": 0.29066887497901917, "learning_rate": 8.817747994999432e-05, "loss": 1.3723, "step": 151 }, { "epoch": 0.06468773273752527, "grad_norm": 0.3179444372653961, "learning_rate": 8.801625618435351e-05, "loss": 1.4489, "step": 152 }, { "epoch": 0.06511330992658794, "grad_norm": 0.3465145230293274, "learning_rate": 8.785409007470032e-05, "loss": 1.3848, "step": 153 }, { "epoch": 0.0655388871156506, "grad_norm": 0.27494004368782043, "learning_rate": 8.769098564079574e-05, "loss": 1.3309, "step": 154 }, { "epoch": 0.06596446430471327, "grad_norm": 0.32629844546318054, "learning_rate": 8.752694692565986e-05, "loss": 1.3737, "step": 155 }, { "epoch": 0.06639004149377593, "grad_norm": 0.3132442533969879, "learning_rate": 8.736197799547159e-05, "loss": 1.4035, "step": 156 }, { "epoch": 0.0668156186828386, "grad_norm": 0.31382986903190613, "learning_rate": 8.719608293946802e-05, "loss": 1.3258, "step": 157 }, { "epoch": 0.06724119587190126, "grad_norm": 0.32756349444389343, "learning_rate": 8.702926586984294e-05, "loss": 1.3506, "step": 158 }, { "epoch": 0.06766677306096393, "grad_norm": 0.33079269528388977, "learning_rate": 8.686153092164493e-05, "loss": 1.4653, "step": 159 }, { "epoch": 0.06809235025002659, "grad_norm": 0.30706673860549927, "learning_rate": 8.669288225267492e-05, "loss": 1.3536, "step": 160 }, { "epoch": 0.06851792743908927, "grad_norm": 0.31477653980255127, "learning_rate": 8.6523324043383e-05, "loss": 1.3129, "step": 161 }, { "epoch": 0.06894350462815194, "grad_norm": 0.32898736000061035, "learning_rate": 8.635286049676496e-05, "loss": 1.3889, "step": 162 }, { "epoch": 0.0693690818172146, "grad_norm": 0.35487836599349976, "learning_rate": 8.618149583825796e-05, "loss": 1.4337, "step": 163 }, { "epoch": 0.06979465900627727, "grad_norm": 0.3246158957481384, "learning_rate": 8.60092343156359e-05, "loss": 1.4514, "step": 164 }, { "epoch": 0.07022023619533993, "grad_norm": 0.34181657433509827, "learning_rate": 8.583608019890406e-05, "loss": 1.429, "step": 165 }, { "epoch": 0.0706458133844026, "grad_norm": 0.3031291961669922, "learning_rate": 8.566203778019322e-05, "loss": 1.3333, "step": 166 }, { "epoch": 0.07107139057346526, "grad_norm": 0.32903921604156494, "learning_rate": 8.54871113736534e-05, "loss": 1.5184, "step": 167 }, { "epoch": 0.07149696776252792, "grad_norm": 0.31410080194473267, "learning_rate": 8.531130531534683e-05, "loss": 1.581, "step": 168 }, { "epoch": 0.07192254495159059, "grad_norm": 0.344510555267334, "learning_rate": 8.513462396314041e-05, "loss": 1.4454, "step": 169 }, { "epoch": 0.07234812214065327, "grad_norm": 0.3599744141101837, "learning_rate": 8.495707169659786e-05, "loss": 1.4448, "step": 170 }, { "epoch": 0.07277369932971593, "grad_norm": 0.3286546468734741, "learning_rate": 8.477865291687095e-05, "loss": 1.3882, "step": 171 }, { "epoch": 0.0731992765187786, "grad_norm": 0.30131304264068604, "learning_rate": 8.459937204659063e-05, "loss": 1.3306, "step": 172 }, { "epoch": 0.07362485370784126, "grad_norm": 0.32244452834129333, "learning_rate": 8.441923352975716e-05, "loss": 1.4304, "step": 173 }, { "epoch": 0.07405043089690393, "grad_norm": 0.32980459928512573, "learning_rate": 8.423824183163016e-05, "loss": 1.4298, "step": 174 }, { "epoch": 0.07447600808596659, "grad_norm": 0.2912943661212921, "learning_rate": 8.405640143861782e-05, "loss": 1.4104, "step": 175 }, { "epoch": 0.07490158527502926, "grad_norm": 0.31498202681541443, "learning_rate": 8.387371685816572e-05, "loss": 1.4688, "step": 176 }, { "epoch": 0.07532716246409192, "grad_norm": 0.31826215982437134, "learning_rate": 8.369019261864505e-05, "loss": 1.3991, "step": 177 }, { "epoch": 0.07575273965315459, "grad_norm": 0.3330978453159332, "learning_rate": 8.350583326924048e-05, "loss": 1.4436, "step": 178 }, { "epoch": 0.07617831684221726, "grad_norm": 0.31430912017822266, "learning_rate": 8.332064337983725e-05, "loss": 1.4602, "step": 179 }, { "epoch": 0.07660389403127993, "grad_norm": 0.3026297986507416, "learning_rate": 8.3134627540908e-05, "loss": 1.3956, "step": 180 }, { "epoch": 0.0770294712203426, "grad_norm": 0.33081120252609253, "learning_rate": 8.294779036339893e-05, "loss": 1.516, "step": 181 }, { "epoch": 0.07745504840940526, "grad_norm": 0.299742728471756, "learning_rate": 8.27601364786155e-05, "loss": 1.3524, "step": 182 }, { "epoch": 0.07788062559846792, "grad_norm": 0.3342128396034241, "learning_rate": 8.257167053810768e-05, "loss": 1.4861, "step": 183 }, { "epoch": 0.07830620278753059, "grad_norm": 0.336272269487381, "learning_rate": 8.238239721355461e-05, "loss": 1.3751, "step": 184 }, { "epoch": 0.07873177997659325, "grad_norm": 0.3233320116996765, "learning_rate": 8.219232119664877e-05, "loss": 1.3365, "step": 185 }, { "epoch": 0.07915735716565592, "grad_norm": 0.31845808029174805, "learning_rate": 8.200144719897974e-05, "loss": 1.416, "step": 186 }, { "epoch": 0.07958293435471858, "grad_norm": 0.3381212055683136, "learning_rate": 8.180977995191738e-05, "loss": 1.3525, "step": 187 }, { "epoch": 0.08000851154378125, "grad_norm": 0.30561843514442444, "learning_rate": 8.161732420649459e-05, "loss": 1.3485, "step": 188 }, { "epoch": 0.08043408873284393, "grad_norm": 0.29561662673950195, "learning_rate": 8.142408473328945e-05, "loss": 1.2745, "step": 189 }, { "epoch": 0.08085966592190659, "grad_norm": 0.2992284893989563, "learning_rate": 8.123006632230703e-05, "loss": 1.3667, "step": 190 }, { "epoch": 0.08128524311096925, "grad_norm": 0.3103450834751129, "learning_rate": 8.103527378286071e-05, "loss": 1.3894, "step": 191 }, { "epoch": 0.08171082030003192, "grad_norm": 0.31997600197792053, "learning_rate": 8.083971194345282e-05, "loss": 1.4296, "step": 192 }, { "epoch": 0.08213639748909458, "grad_norm": 0.2979292571544647, "learning_rate": 8.06433856516551e-05, "loss": 1.4681, "step": 193 }, { "epoch": 0.08256197467815725, "grad_norm": 0.35576626658439636, "learning_rate": 8.044629977398845e-05, "loss": 1.4637, "step": 194 }, { "epoch": 0.08298755186721991, "grad_norm": 0.3102336823940277, "learning_rate": 8.024845919580235e-05, "loss": 1.3598, "step": 195 }, { "epoch": 0.08341312905628258, "grad_norm": 0.3177992105484009, "learning_rate": 8.004986882115371e-05, "loss": 1.3457, "step": 196 }, { "epoch": 0.08383870624534524, "grad_norm": 0.32051852345466614, "learning_rate": 7.985053357268533e-05, "loss": 1.3453, "step": 197 }, { "epoch": 0.08426428343440792, "grad_norm": 0.35972169041633606, "learning_rate": 7.965045839150394e-05, "loss": 1.4343, "step": 198 }, { "epoch": 0.08468986062347059, "grad_norm": 0.3172051012516022, "learning_rate": 7.94496482370576e-05, "loss": 1.2598, "step": 199 }, { "epoch": 0.08511543781253325, "grad_norm": 0.315286785364151, "learning_rate": 7.924810808701286e-05, "loss": 1.2469, "step": 200 }, { "epoch": 0.08554101500159592, "grad_norm": 0.34473949670791626, "learning_rate": 7.904584293713134e-05, "loss": 1.376, "step": 201 }, { "epoch": 0.08596659219065858, "grad_norm": 0.3357507884502411, "learning_rate": 7.884285780114593e-05, "loss": 1.3202, "step": 202 }, { "epoch": 0.08639216937972125, "grad_norm": 0.3239094018936157, "learning_rate": 7.86391577106364e-05, "loss": 1.3751, "step": 203 }, { "epoch": 0.08681774656878391, "grad_norm": 0.32212114334106445, "learning_rate": 7.843474771490486e-05, "loss": 1.3618, "step": 204 }, { "epoch": 0.08724332375784657, "grad_norm": 0.3353959321975708, "learning_rate": 7.82296328808504e-05, "loss": 1.3613, "step": 205 }, { "epoch": 0.08766890094690924, "grad_norm": 0.3211444318294525, "learning_rate": 7.802381829284366e-05, "loss": 1.3828, "step": 206 }, { "epoch": 0.08809447813597192, "grad_norm": 0.3303898572921753, "learning_rate": 7.78173090526007e-05, "loss": 1.4529, "step": 207 }, { "epoch": 0.08852005532503458, "grad_norm": 0.3457483947277069, "learning_rate": 7.761011027905654e-05, "loss": 1.3714, "step": 208 }, { "epoch": 0.08894563251409725, "grad_norm": 0.3243163228034973, "learning_rate": 7.740222710823837e-05, "loss": 1.3595, "step": 209 }, { "epoch": 0.08937120970315991, "grad_norm": 0.322988361120224, "learning_rate": 7.719366469313806e-05, "loss": 1.3353, "step": 210 }, { "epoch": 0.08979678689222258, "grad_norm": 0.31282839179039, "learning_rate": 7.698442820358463e-05, "loss": 1.4834, "step": 211 }, { "epoch": 0.09022236408128524, "grad_norm": 0.318215548992157, "learning_rate": 7.677452282611594e-05, "loss": 1.3521, "step": 212 }, { "epoch": 0.0906479412703479, "grad_norm": 0.3105640113353729, "learning_rate": 7.656395376385027e-05, "loss": 1.4099, "step": 213 }, { "epoch": 0.09107351845941057, "grad_norm": 0.3511587083339691, "learning_rate": 7.635272623635716e-05, "loss": 1.3177, "step": 214 }, { "epoch": 0.09149909564847324, "grad_norm": 0.3450727164745331, "learning_rate": 7.614084547952824e-05, "loss": 1.5545, "step": 215 }, { "epoch": 0.09192467283753591, "grad_norm": 0.33173757791519165, "learning_rate": 7.592831674544728e-05, "loss": 1.4891, "step": 216 }, { "epoch": 0.09235025002659858, "grad_norm": 0.29296064376831055, "learning_rate": 7.571514530226004e-05, "loss": 1.3501, "step": 217 }, { "epoch": 0.09277582721566124, "grad_norm": 0.3370741605758667, "learning_rate": 7.550133643404377e-05, "loss": 1.4028, "step": 218 }, { "epoch": 0.09320140440472391, "grad_norm": 0.31852227449417114, "learning_rate": 7.528689544067611e-05, "loss": 1.341, "step": 219 }, { "epoch": 0.09362698159378657, "grad_norm": 0.3008808493614197, "learning_rate": 7.507182763770381e-05, "loss": 1.3032, "step": 220 }, { "epoch": 0.09405255878284924, "grad_norm": 0.3290260434150696, "learning_rate": 7.485613835621088e-05, "loss": 1.3937, "step": 221 }, { "epoch": 0.0944781359719119, "grad_norm": 0.3093232810497284, "learning_rate": 7.46398329426865e-05, "loss": 1.4153, "step": 222 }, { "epoch": 0.09490371316097457, "grad_norm": 0.35814496874809265, "learning_rate": 7.442291675889254e-05, "loss": 1.4717, "step": 223 }, { "epoch": 0.09532929035003723, "grad_norm": 0.33721545338630676, "learning_rate": 7.420539518173053e-05, "loss": 1.4509, "step": 224 }, { "epoch": 0.09575486753909991, "grad_norm": 0.3308022916316986, "learning_rate": 7.398727360310848e-05, "loss": 1.4284, "step": 225 }, { "epoch": 0.09618044472816258, "grad_norm": 0.29898306727409363, "learning_rate": 7.376855742980718e-05, "loss": 1.3514, "step": 226 }, { "epoch": 0.09660602191722524, "grad_norm": 0.2964514493942261, "learning_rate": 7.354925208334614e-05, "loss": 1.3246, "step": 227 }, { "epoch": 0.0970315991062879, "grad_norm": 0.2953067421913147, "learning_rate": 7.332936299984937e-05, "loss": 1.4034, "step": 228 }, { "epoch": 0.09745717629535057, "grad_norm": 0.2890208661556244, "learning_rate": 7.310889562991037e-05, "loss": 1.2483, "step": 229 }, { "epoch": 0.09788275348441323, "grad_norm": 0.31622129678726196, "learning_rate": 7.288785543845725e-05, "loss": 1.4184, "step": 230 }, { "epoch": 0.0983083306734759, "grad_norm": 0.3258267641067505, "learning_rate": 7.266624790461713e-05, "loss": 1.4013, "step": 231 }, { "epoch": 0.09873390786253856, "grad_norm": 0.3144655227661133, "learning_rate": 7.244407852158042e-05, "loss": 1.4689, "step": 232 }, { "epoch": 0.09915948505160123, "grad_norm": 0.30230382084846497, "learning_rate": 7.222135279646453e-05, "loss": 1.4526, "step": 233 }, { "epoch": 0.0995850622406639, "grad_norm": 0.3089566230773926, "learning_rate": 7.199807625017749e-05, "loss": 1.315, "step": 234 }, { "epoch": 0.10001063942972657, "grad_norm": 0.30076929926872253, "learning_rate": 7.177425441728103e-05, "loss": 1.3653, "step": 235 }, { "epoch": 0.10043621661878924, "grad_norm": 0.3337990641593933, "learning_rate": 7.154989284585342e-05, "loss": 1.4545, "step": 236 }, { "epoch": 0.1008617938078519, "grad_norm": 0.33799222111701965, "learning_rate": 7.132499709735187e-05, "loss": 1.3691, "step": 237 }, { "epoch": 0.10128737099691457, "grad_norm": 0.33428463339805603, "learning_rate": 7.109957274647478e-05, "loss": 1.5371, "step": 238 }, { "epoch": 0.10171294818597723, "grad_norm": 0.32650595903396606, "learning_rate": 7.08736253810235e-05, "loss": 1.5122, "step": 239 }, { "epoch": 0.1021385253750399, "grad_norm": 0.3586381673812866, "learning_rate": 7.06471606017638e-05, "loss": 1.5361, "step": 240 }, { "epoch": 0.10256410256410256, "grad_norm": 0.3396207392215729, "learning_rate": 7.04201840222871e-05, "loss": 1.4595, "step": 241 }, { "epoch": 0.10298967975316523, "grad_norm": 0.3792440593242645, "learning_rate": 7.019270126887123e-05, "loss": 1.31, "step": 242 }, { "epoch": 0.10341525694222789, "grad_norm": 0.3366132378578186, "learning_rate": 6.996471798034108e-05, "loss": 1.4679, "step": 243 }, { "epoch": 0.10384083413129057, "grad_norm": 0.282144159078598, "learning_rate": 6.973623980792875e-05, "loss": 1.2485, "step": 244 }, { "epoch": 0.10426641132035323, "grad_norm": 0.3441198766231537, "learning_rate": 6.950727241513344e-05, "loss": 1.378, "step": 245 }, { "epoch": 0.1046919885094159, "grad_norm": 0.298406720161438, "learning_rate": 6.927782147758117e-05, "loss": 1.37, "step": 246 }, { "epoch": 0.10511756569847856, "grad_norm": 0.3125241696834564, "learning_rate": 6.904789268288398e-05, "loss": 1.3247, "step": 247 }, { "epoch": 0.10554314288754123, "grad_norm": 0.31313666701316833, "learning_rate": 6.881749173049901e-05, "loss": 1.3459, "step": 248 }, { "epoch": 0.10596872007660389, "grad_norm": 0.31229567527770996, "learning_rate": 6.858662433158724e-05, "loss": 1.4175, "step": 249 }, { "epoch": 0.10639429726566656, "grad_norm": 0.3220798671245575, "learning_rate": 6.835529620887185e-05, "loss": 1.3616, "step": 250 }, { "epoch": 0.10681987445472922, "grad_norm": 0.30676543712615967, "learning_rate": 6.81235130964964e-05, "loss": 1.4731, "step": 251 }, { "epoch": 0.10724545164379189, "grad_norm": 0.3131166994571686, "learning_rate": 6.789128073988276e-05, "loss": 1.3409, "step": 252 }, { "epoch": 0.10767102883285457, "grad_norm": 0.311882346868515, "learning_rate": 6.765860489558856e-05, "loss": 1.3375, "step": 253 }, { "epoch": 0.10809660602191723, "grad_norm": 0.3434353470802307, "learning_rate": 6.74254913311646e-05, "loss": 1.3905, "step": 254 }, { "epoch": 0.1085221832109799, "grad_norm": 0.2908113896846771, "learning_rate": 6.719194582501183e-05, "loss": 1.4806, "step": 255 }, { "epoch": 0.10894776040004256, "grad_norm": 0.3273056149482727, "learning_rate": 6.695797416623821e-05, "loss": 1.4091, "step": 256 }, { "epoch": 0.10937333758910522, "grad_norm": 0.32346075773239136, "learning_rate": 6.672358215451507e-05, "loss": 1.4084, "step": 257 }, { "epoch": 0.10979891477816789, "grad_norm": 0.3289119601249695, "learning_rate": 6.648877559993339e-05, "loss": 1.3466, "step": 258 }, { "epoch": 0.11022449196723055, "grad_norm": 0.2760949432849884, "learning_rate": 6.62535603228599e-05, "loss": 1.3571, "step": 259 }, { "epoch": 0.11065006915629322, "grad_norm": 0.29785817861557007, "learning_rate": 6.601794215379266e-05, "loss": 1.3862, "step": 260 }, { "epoch": 0.11107564634535588, "grad_norm": 0.31330984830856323, "learning_rate": 6.578192693321656e-05, "loss": 1.3972, "step": 261 }, { "epoch": 0.11150122353441856, "grad_norm": 0.2923828363418579, "learning_rate": 6.55455205114586e-05, "loss": 1.3496, "step": 262 }, { "epoch": 0.11192680072348123, "grad_norm": 0.32257649302482605, "learning_rate": 6.530872874854285e-05, "loss": 1.3539, "step": 263 }, { "epoch": 0.11235237791254389, "grad_norm": 0.32502755522727966, "learning_rate": 6.507155751404518e-05, "loss": 1.4341, "step": 264 }, { "epoch": 0.11277795510160656, "grad_norm": 0.3588520586490631, "learning_rate": 6.483401268694777e-05, "loss": 1.4789, "step": 265 }, { "epoch": 0.11320353229066922, "grad_norm": 0.3285142183303833, "learning_rate": 6.45961001554934e-05, "loss": 1.3466, "step": 266 }, { "epoch": 0.11362910947973189, "grad_norm": 0.3209497928619385, "learning_rate": 6.435782581703945e-05, "loss": 1.4529, "step": 267 }, { "epoch": 0.11405468666879455, "grad_norm": 0.32251620292663574, "learning_rate": 6.411919557791176e-05, "loss": 1.3964, "step": 268 }, { "epoch": 0.11448026385785721, "grad_norm": 0.27929726243019104, "learning_rate": 6.388021535325821e-05, "loss": 1.3372, "step": 269 }, { "epoch": 0.11490584104691988, "grad_norm": 0.35057204961776733, "learning_rate": 6.364089106690209e-05, "loss": 1.4241, "step": 270 }, { "epoch": 0.11533141823598256, "grad_norm": 0.32568198442459106, "learning_rate": 6.340122865119524e-05, "loss": 1.2808, "step": 271 }, { "epoch": 0.11575699542504522, "grad_norm": 0.3379870057106018, "learning_rate": 6.316123404687108e-05, "loss": 1.4953, "step": 272 }, { "epoch": 0.11618257261410789, "grad_norm": 0.297118604183197, "learning_rate": 6.292091320289725e-05, "loss": 1.327, "step": 273 }, { "epoch": 0.11660814980317055, "grad_norm": 0.30609849095344543, "learning_rate": 6.268027207632821e-05, "loss": 1.2376, "step": 274 }, { "epoch": 0.11703372699223322, "grad_norm": 0.3672487735748291, "learning_rate": 6.243931663215756e-05, "loss": 1.4375, "step": 275 }, { "epoch": 0.11745930418129588, "grad_norm": 0.3317699134349823, "learning_rate": 6.219805284317019e-05, "loss": 1.4371, "step": 276 }, { "epoch": 0.11788488137035855, "grad_norm": 0.285065233707428, "learning_rate": 6.195648668979417e-05, "loss": 1.3968, "step": 277 }, { "epoch": 0.11831045855942121, "grad_norm": 0.33966484665870667, "learning_rate": 6.171462415995263e-05, "loss": 1.4376, "step": 278 }, { "epoch": 0.11873603574848388, "grad_norm": 0.32605886459350586, "learning_rate": 6.147247124891519e-05, "loss": 1.4235, "step": 279 }, { "epoch": 0.11916161293754655, "grad_norm": 0.29964199662208557, "learning_rate": 6.123003395914945e-05, "loss": 1.3581, "step": 280 }, { "epoch": 0.11958719012660922, "grad_norm": 0.3049403429031372, "learning_rate": 6.098731830017217e-05, "loss": 1.398, "step": 281 }, { "epoch": 0.12001276731567188, "grad_norm": 0.30261459946632385, "learning_rate": 6.074433028840029e-05, "loss": 1.3876, "step": 282 }, { "epoch": 0.12043834450473455, "grad_norm": 0.3096999526023865, "learning_rate": 6.0501075947001816e-05, "loss": 1.4192, "step": 283 }, { "epoch": 0.12086392169379721, "grad_norm": 0.3477337956428528, "learning_rate": 6.025756130574652e-05, "loss": 1.2923, "step": 284 }, { "epoch": 0.12128949888285988, "grad_norm": 0.2876332998275757, "learning_rate": 6.001379240085645e-05, "loss": 1.3937, "step": 285 }, { "epoch": 0.12171507607192254, "grad_norm": 0.31134235858917236, "learning_rate": 5.976977527485633e-05, "loss": 1.4417, "step": 286 }, { "epoch": 0.12214065326098521, "grad_norm": 0.33744943141937256, "learning_rate": 5.9525515976423775e-05, "loss": 1.4114, "step": 287 }, { "epoch": 0.12256623045004787, "grad_norm": 0.32942283153533936, "learning_rate": 5.928102056023935e-05, "loss": 1.4425, "step": 288 }, { "epoch": 0.12299180763911054, "grad_norm": 0.3516654074192047, "learning_rate": 5.903629508683649e-05, "loss": 1.3629, "step": 289 }, { "epoch": 0.12341738482817322, "grad_norm": 0.31652727723121643, "learning_rate": 5.879134562245123e-05, "loss": 1.4935, "step": 290 }, { "epoch": 0.12384296201723588, "grad_norm": 0.30340665578842163, "learning_rate": 5.854617823887196e-05, "loss": 1.3113, "step": 291 }, { "epoch": 0.12426853920629854, "grad_norm": 0.3198196589946747, "learning_rate": 5.8300799013288754e-05, "loss": 1.2864, "step": 292 }, { "epoch": 0.12469411639536121, "grad_norm": 0.28304675221443176, "learning_rate": 5.8055214028142844e-05, "loss": 1.3676, "step": 293 }, { "epoch": 0.12511969358442387, "grad_norm": 0.2878504693508148, "learning_rate": 5.780942937097584e-05, "loss": 1.3713, "step": 294 }, { "epoch": 0.12554527077348654, "grad_norm": 0.3235960900783539, "learning_rate": 5.7563451134278735e-05, "loss": 1.4314, "step": 295 }, { "epoch": 0.1259708479625492, "grad_norm": 0.3305697739124298, "learning_rate": 5.731728541534101e-05, "loss": 1.4453, "step": 296 }, { "epoch": 0.12639642515161187, "grad_norm": 0.34397658705711365, "learning_rate": 5.7070938316099455e-05, "loss": 1.3833, "step": 297 }, { "epoch": 0.12682200234067453, "grad_norm": 0.31075742840766907, "learning_rate": 5.6824415942986844e-05, "loss": 1.3919, "step": 298 }, { "epoch": 0.1272475795297372, "grad_norm": 0.32299962639808655, "learning_rate": 5.65777244067807e-05, "loss": 1.3616, "step": 299 }, { "epoch": 0.12767315671879986, "grad_norm": 0.31307482719421387, "learning_rate": 5.633086982245166e-05, "loss": 1.315, "step": 300 }, { "epoch": 0.12809873390786253, "grad_norm": 0.31546834111213684, "learning_rate": 5.608385830901206e-05, "loss": 1.3429, "step": 301 }, { "epoch": 0.1285243110969252, "grad_norm": 0.32566624879837036, "learning_rate": 5.583669598936414e-05, "loss": 1.3693, "step": 302 }, { "epoch": 0.12894988828598788, "grad_norm": 0.3248218894004822, "learning_rate": 5.5589388990148326e-05, "loss": 1.337, "step": 303 }, { "epoch": 0.12937546547505055, "grad_norm": 0.29174041748046875, "learning_rate": 5.534194344159136e-05, "loss": 1.4242, "step": 304 }, { "epoch": 0.12980104266411321, "grad_norm": 0.3306146264076233, "learning_rate": 5.5094365477354325e-05, "loss": 1.3176, "step": 305 }, { "epoch": 0.13022661985317588, "grad_norm": 0.34080812335014343, "learning_rate": 5.48466612343806e-05, "loss": 1.3678, "step": 306 }, { "epoch": 0.13065219704223854, "grad_norm": 0.33598506450653076, "learning_rate": 5.4598836852743774e-05, "loss": 1.3354, "step": 307 }, { "epoch": 0.1310777742313012, "grad_norm": 0.3259298801422119, "learning_rate": 5.435089847549541e-05, "loss": 1.3574, "step": 308 }, { "epoch": 0.13150335142036387, "grad_norm": 0.32616904377937317, "learning_rate": 5.4102852248512814e-05, "loss": 1.4339, "step": 309 }, { "epoch": 0.13192892860942654, "grad_norm": 0.34405210614204407, "learning_rate": 5.38547043203466e-05, "loss": 1.4094, "step": 310 }, { "epoch": 0.1323545057984892, "grad_norm": 0.31888607144355774, "learning_rate": 5.3606460842068426e-05, "loss": 1.4321, "step": 311 }, { "epoch": 0.13278008298755187, "grad_norm": 0.3342142403125763, "learning_rate": 5.3358127967118354e-05, "loss": 1.4156, "step": 312 }, { "epoch": 0.13320566017661453, "grad_norm": 0.4066295921802521, "learning_rate": 5.3109711851152456e-05, "loss": 1.4043, "step": 313 }, { "epoch": 0.1336312373656772, "grad_norm": 0.3196505606174469, "learning_rate": 5.286121865189017e-05, "loss": 1.2663, "step": 314 }, { "epoch": 0.13405681455473986, "grad_norm": 0.30326151847839355, "learning_rate": 5.261265452896167e-05, "loss": 1.4344, "step": 315 }, { "epoch": 0.13448239174380253, "grad_norm": 0.3233386278152466, "learning_rate": 5.236402564375514e-05, "loss": 1.3237, "step": 316 }, { "epoch": 0.1349079689328652, "grad_norm": 0.3291011154651642, "learning_rate": 5.211533815926417e-05, "loss": 1.4302, "step": 317 }, { "epoch": 0.13533354612192786, "grad_norm": 0.338759183883667, "learning_rate": 5.186659823993482e-05, "loss": 1.4143, "step": 318 }, { "epoch": 0.13575912331099052, "grad_norm": 0.33174023032188416, "learning_rate": 5.161781205151294e-05, "loss": 1.4507, "step": 319 }, { "epoch": 0.13618470050005319, "grad_norm": 0.3163542151451111, "learning_rate": 5.136898576089131e-05, "loss": 1.3993, "step": 320 }, { "epoch": 0.13661027768911585, "grad_norm": 0.30455371737480164, "learning_rate": 5.112012553595671e-05, "loss": 1.4941, "step": 321 }, { "epoch": 0.13703585487817854, "grad_norm": 0.30642834305763245, "learning_rate": 5.0871237545437145e-05, "loss": 1.4564, "step": 322 }, { "epoch": 0.1374614320672412, "grad_norm": 0.3390425443649292, "learning_rate": 5.062232795874879e-05, "loss": 1.3943, "step": 323 }, { "epoch": 0.13788700925630387, "grad_norm": 0.31367331743240356, "learning_rate": 5.037340294584323e-05, "loss": 1.2689, "step": 324 }, { "epoch": 0.13831258644536654, "grad_norm": 0.3435627520084381, "learning_rate": 5.0124468677054384e-05, "loss": 1.4298, "step": 325 }, { "epoch": 0.1387381636344292, "grad_norm": 0.35670870542526245, "learning_rate": 4.987553132294563e-05, "loss": 1.321, "step": 326 }, { "epoch": 0.13916374082349187, "grad_norm": 0.32847118377685547, "learning_rate": 4.962659705415677e-05, "loss": 1.3769, "step": 327 }, { "epoch": 0.13958931801255453, "grad_norm": 0.33433982729911804, "learning_rate": 4.937767204125122e-05, "loss": 1.2956, "step": 328 }, { "epoch": 0.1400148952016172, "grad_norm": 0.2976665198802948, "learning_rate": 4.912876245456288e-05, "loss": 1.4297, "step": 329 }, { "epoch": 0.14044047239067986, "grad_norm": 0.33189690113067627, "learning_rate": 4.88798744640433e-05, "loss": 1.3718, "step": 330 }, { "epoch": 0.14086604957974252, "grad_norm": 0.33690720796585083, "learning_rate": 4.86310142391087e-05, "loss": 1.4438, "step": 331 }, { "epoch": 0.1412916267688052, "grad_norm": 0.3246925175189972, "learning_rate": 4.8382187948487054e-05, "loss": 1.427, "step": 332 }, { "epoch": 0.14171720395786785, "grad_norm": 0.3402460217475891, "learning_rate": 4.813340176006519e-05, "loss": 1.3958, "step": 333 }, { "epoch": 0.14214278114693052, "grad_norm": 0.263990193605423, "learning_rate": 4.788466184073585e-05, "loss": 1.2336, "step": 334 }, { "epoch": 0.14256835833599318, "grad_norm": 0.330545037984848, "learning_rate": 4.7635974356244864e-05, "loss": 1.3919, "step": 335 }, { "epoch": 0.14299393552505585, "grad_norm": 0.3169040083885193, "learning_rate": 4.738734547103836e-05, "loss": 1.3512, "step": 336 }, { "epoch": 0.1434195127141185, "grad_norm": 0.32559284567832947, "learning_rate": 4.713878134810984e-05, "loss": 1.4493, "step": 337 }, { "epoch": 0.14384508990318118, "grad_norm": 0.34363633394241333, "learning_rate": 4.689028814884756e-05, "loss": 1.2949, "step": 338 }, { "epoch": 0.14427066709224384, "grad_norm": 0.3349243998527527, "learning_rate": 4.664187203288167e-05, "loss": 1.4964, "step": 339 }, { "epoch": 0.14469624428130654, "grad_norm": 0.30426791310310364, "learning_rate": 4.6393539157931586e-05, "loss": 1.3914, "step": 340 }, { "epoch": 0.1451218214703692, "grad_norm": 0.30210012197494507, "learning_rate": 4.61452956796534e-05, "loss": 1.4269, "step": 341 }, { "epoch": 0.14554739865943186, "grad_norm": 0.35527628660202026, "learning_rate": 4.589714775148719e-05, "loss": 1.4037, "step": 342 }, { "epoch": 0.14597297584849453, "grad_norm": 0.30980184674263, "learning_rate": 4.564910152450459e-05, "loss": 1.3133, "step": 343 }, { "epoch": 0.1463985530375572, "grad_norm": 0.3374204635620117, "learning_rate": 4.540116314725622e-05, "loss": 1.3178, "step": 344 }, { "epoch": 0.14682413022661986, "grad_norm": 0.32149648666381836, "learning_rate": 4.515333876561941e-05, "loss": 1.3854, "step": 345 }, { "epoch": 0.14724970741568252, "grad_norm": 0.3345913589000702, "learning_rate": 4.4905634522645694e-05, "loss": 1.4373, "step": 346 }, { "epoch": 0.1476752846047452, "grad_norm": 0.3649419844150543, "learning_rate": 4.4658056558408644e-05, "loss": 1.418, "step": 347 }, { "epoch": 0.14810086179380785, "grad_norm": 0.3086884319782257, "learning_rate": 4.4410611009851686e-05, "loss": 1.3965, "step": 348 }, { "epoch": 0.14852643898287052, "grad_norm": 0.3454095125198364, "learning_rate": 4.4163304010635876e-05, "loss": 1.5076, "step": 349 }, { "epoch": 0.14895201617193318, "grad_norm": 0.34644293785095215, "learning_rate": 4.391614169098795e-05, "loss": 1.4771, "step": 350 }, { "epoch": 0.14937759336099585, "grad_norm": 0.3557009994983673, "learning_rate": 4.366913017754836e-05, "loss": 1.3823, "step": 351 }, { "epoch": 0.1498031705500585, "grad_norm": 0.31497180461883545, "learning_rate": 4.342227559321932e-05, "loss": 1.421, "step": 352 }, { "epoch": 0.15022874773912118, "grad_norm": 0.29452037811279297, "learning_rate": 4.317558405701316e-05, "loss": 1.3823, "step": 353 }, { "epoch": 0.15065432492818384, "grad_norm": 0.35082679986953735, "learning_rate": 4.292906168390055e-05, "loss": 1.4215, "step": 354 }, { "epoch": 0.1510799021172465, "grad_norm": 0.32057732343673706, "learning_rate": 4.2682714584659e-05, "loss": 1.2923, "step": 355 }, { "epoch": 0.15150547930630917, "grad_norm": 0.3253067135810852, "learning_rate": 4.243654886572129e-05, "loss": 1.3602, "step": 356 }, { "epoch": 0.15193105649537184, "grad_norm": 0.3210274875164032, "learning_rate": 4.219057062902417e-05, "loss": 1.4533, "step": 357 }, { "epoch": 0.15235663368443453, "grad_norm": 0.4304807782173157, "learning_rate": 4.194478597185716e-05, "loss": 1.4197, "step": 358 }, { "epoch": 0.1527822108734972, "grad_norm": 0.3173682391643524, "learning_rate": 4.169920098671124e-05, "loss": 1.4527, "step": 359 }, { "epoch": 0.15320778806255986, "grad_norm": 0.3237103819847107, "learning_rate": 4.145382176112804e-05, "loss": 1.3939, "step": 360 }, { "epoch": 0.15363336525162252, "grad_norm": 0.2935871183872223, "learning_rate": 4.120865437754877e-05, "loss": 1.312, "step": 361 }, { "epoch": 0.1540589424406852, "grad_norm": 0.3397037982940674, "learning_rate": 4.0963704913163526e-05, "loss": 1.5097, "step": 362 }, { "epoch": 0.15448451962974785, "grad_norm": 0.31791093945503235, "learning_rate": 4.0718979439760665e-05, "loss": 1.3923, "step": 363 }, { "epoch": 0.15491009681881052, "grad_norm": 0.32730478048324585, "learning_rate": 4.047448402357622e-05, "loss": 1.3108, "step": 364 }, { "epoch": 0.15533567400787318, "grad_norm": 0.3350706398487091, "learning_rate": 4.023022472514368e-05, "loss": 1.3763, "step": 365 }, { "epoch": 0.15576125119693585, "grad_norm": 0.31630778312683105, "learning_rate": 3.9986207599143564e-05, "loss": 1.4107, "step": 366 }, { "epoch": 0.1561868283859985, "grad_norm": 0.32477709650993347, "learning_rate": 3.9742438694253484e-05, "loss": 1.3997, "step": 367 }, { "epoch": 0.15661240557506118, "grad_norm": 0.32294079661369324, "learning_rate": 3.9498924052998195e-05, "loss": 1.4678, "step": 368 }, { "epoch": 0.15703798276412384, "grad_norm": 0.30389007925987244, "learning_rate": 3.925566971159971e-05, "loss": 1.317, "step": 369 }, { "epoch": 0.1574635599531865, "grad_norm": 0.29815390706062317, "learning_rate": 3.901268169982784e-05, "loss": 1.2426, "step": 370 }, { "epoch": 0.15788913714224917, "grad_norm": 0.34378722310066223, "learning_rate": 3.8769966040850566e-05, "loss": 1.5315, "step": 371 }, { "epoch": 0.15831471433131183, "grad_norm": 0.2889562249183655, "learning_rate": 3.8527528751084826e-05, "loss": 1.2998, "step": 372 }, { "epoch": 0.1587402915203745, "grad_norm": 0.31654536724090576, "learning_rate": 3.8285375840047395e-05, "loss": 1.3312, "step": 373 }, { "epoch": 0.15916586870943716, "grad_norm": 0.3028466999530792, "learning_rate": 3.804351331020584e-05, "loss": 1.3981, "step": 374 }, { "epoch": 0.15959144589849983, "grad_norm": 0.31716054677963257, "learning_rate": 3.7801947156829834e-05, "loss": 1.4174, "step": 375 }, { "epoch": 0.1600170230875625, "grad_norm": 0.3307088017463684, "learning_rate": 3.7560683367842456e-05, "loss": 1.3022, "step": 376 }, { "epoch": 0.16044260027662519, "grad_norm": 0.37073442339897156, "learning_rate": 3.7319727923671785e-05, "loss": 1.2788, "step": 377 }, { "epoch": 0.16086817746568785, "grad_norm": 0.3232400119304657, "learning_rate": 3.707908679710276e-05, "loss": 1.4228, "step": 378 }, { "epoch": 0.16129375465475052, "grad_norm": 0.2912347912788391, "learning_rate": 3.6838765953128914e-05, "loss": 1.3194, "step": 379 }, { "epoch": 0.16171933184381318, "grad_norm": 0.35164567828178406, "learning_rate": 3.6598771348804766e-05, "loss": 1.4528, "step": 380 }, { "epoch": 0.16214490903287584, "grad_norm": 0.3369240164756775, "learning_rate": 3.635910893309792e-05, "loss": 1.3688, "step": 381 }, { "epoch": 0.1625704862219385, "grad_norm": 0.3216218054294586, "learning_rate": 3.6119784646741804e-05, "loss": 1.3827, "step": 382 }, { "epoch": 0.16299606341100117, "grad_norm": 0.33442652225494385, "learning_rate": 3.5880804422088255e-05, "loss": 1.4323, "step": 383 }, { "epoch": 0.16342164060006384, "grad_norm": 0.3149116337299347, "learning_rate": 3.564217418296055e-05, "loss": 1.3756, "step": 384 }, { "epoch": 0.1638472177891265, "grad_norm": 0.32829785346984863, "learning_rate": 3.540389984450661e-05, "loss": 1.4512, "step": 385 }, { "epoch": 0.16427279497818917, "grad_norm": 0.37037840485572815, "learning_rate": 3.516598731305222e-05, "loss": 1.3539, "step": 386 }, { "epoch": 0.16469837216725183, "grad_norm": 0.304059773683548, "learning_rate": 3.492844248595483e-05, "loss": 1.3534, "step": 387 }, { "epoch": 0.1651239493563145, "grad_norm": 0.3242076337337494, "learning_rate": 3.469127125145717e-05, "loss": 1.3389, "step": 388 }, { "epoch": 0.16554952654537716, "grad_norm": 0.33115020394325256, "learning_rate": 3.4454479488541414e-05, "loss": 1.4357, "step": 389 }, { "epoch": 0.16597510373443983, "grad_norm": 0.32753780484199524, "learning_rate": 3.421807306678346e-05, "loss": 1.3659, "step": 390 }, { "epoch": 0.1664006809235025, "grad_norm": 0.30588462948799133, "learning_rate": 3.398205784620735e-05, "loss": 1.3735, "step": 391 }, { "epoch": 0.16682625811256516, "grad_norm": 0.30423295497894287, "learning_rate": 3.3746439677140106e-05, "loss": 1.2956, "step": 392 }, { "epoch": 0.16725183530162782, "grad_norm": 0.33183082938194275, "learning_rate": 3.351122440006661e-05, "loss": 1.338, "step": 393 }, { "epoch": 0.1676774124906905, "grad_norm": 0.32961684465408325, "learning_rate": 3.327641784548495e-05, "loss": 1.4384, "step": 394 }, { "epoch": 0.16810298967975318, "grad_norm": 0.2988288700580597, "learning_rate": 3.3042025833761806e-05, "loss": 1.3986, "step": 395 }, { "epoch": 0.16852856686881584, "grad_norm": 0.32923299074172974, "learning_rate": 3.280805417498816e-05, "loss": 1.4054, "step": 396 }, { "epoch": 0.1689541440578785, "grad_norm": 0.31945064663887024, "learning_rate": 3.257450866883542e-05, "loss": 1.3031, "step": 397 }, { "epoch": 0.16937972124694117, "grad_norm": 0.3342922627925873, "learning_rate": 3.234139510441146e-05, "loss": 1.4884, "step": 398 }, { "epoch": 0.16980529843600384, "grad_norm": 0.31479397416114807, "learning_rate": 3.210871926011725e-05, "loss": 1.3229, "step": 399 }, { "epoch": 0.1702308756250665, "grad_norm": 0.32442063093185425, "learning_rate": 3.18764869035036e-05, "loss": 1.358, "step": 400 }, { "epoch": 0.17065645281412917, "grad_norm": 0.3147645592689514, "learning_rate": 3.164470379112816e-05, "loss": 1.3472, "step": 401 }, { "epoch": 0.17108203000319183, "grad_norm": 0.3168991506099701, "learning_rate": 3.141337566841277e-05, "loss": 1.365, "step": 402 }, { "epoch": 0.1715076071922545, "grad_norm": 0.33438199758529663, "learning_rate": 3.1182508269500985e-05, "loss": 1.3538, "step": 403 }, { "epoch": 0.17193318438131716, "grad_norm": 0.31441277265548706, "learning_rate": 3.095210731711603e-05, "loss": 1.4489, "step": 404 }, { "epoch": 0.17235876157037983, "grad_norm": 0.3182431757450104, "learning_rate": 3.072217852241884e-05, "loss": 1.3436, "step": 405 }, { "epoch": 0.1727843387594425, "grad_norm": 0.3269684612751007, "learning_rate": 3.0492727584866554e-05, "loss": 1.393, "step": 406 }, { "epoch": 0.17320991594850516, "grad_norm": 0.3089240789413452, "learning_rate": 3.026376019207126e-05, "loss": 1.4003, "step": 407 }, { "epoch": 0.17363549313756782, "grad_norm": 0.32416653633117676, "learning_rate": 3.0035282019658928e-05, "loss": 1.4448, "step": 408 }, { "epoch": 0.17406107032663048, "grad_norm": 0.35581550002098083, "learning_rate": 2.9807298731128773e-05, "loss": 1.3682, "step": 409 }, { "epoch": 0.17448664751569315, "grad_norm": 0.34638750553131104, "learning_rate": 2.957981597771292e-05, "loss": 1.3646, "step": 410 }, { "epoch": 0.17491222470475581, "grad_norm": 0.3218763470649719, "learning_rate": 2.935283939823621e-05, "loss": 1.4833, "step": 411 }, { "epoch": 0.17533780189381848, "grad_norm": 0.29522332549095154, "learning_rate": 2.9126374618976528e-05, "loss": 1.2961, "step": 412 }, { "epoch": 0.17576337908288114, "grad_norm": 0.30747368931770325, "learning_rate": 2.8900427253525248e-05, "loss": 1.3874, "step": 413 }, { "epoch": 0.17618895627194384, "grad_norm": 0.3579199016094208, "learning_rate": 2.8675002902648146e-05, "loss": 1.4221, "step": 414 }, { "epoch": 0.1766145334610065, "grad_norm": 0.29601120948791504, "learning_rate": 2.8450107154146606e-05, "loss": 1.4503, "step": 415 }, { "epoch": 0.17704011065006917, "grad_norm": 0.32221487164497375, "learning_rate": 2.8225745582718965e-05, "loss": 1.4458, "step": 416 }, { "epoch": 0.17746568783913183, "grad_norm": 0.316237211227417, "learning_rate": 2.8001923749822523e-05, "loss": 1.3074, "step": 417 }, { "epoch": 0.1778912650281945, "grad_norm": 0.374932199716568, "learning_rate": 2.7778647203535475e-05, "loss": 1.337, "step": 418 }, { "epoch": 0.17831684221725716, "grad_norm": 0.3187963366508484, "learning_rate": 2.75559214784196e-05, "loss": 1.2731, "step": 419 }, { "epoch": 0.17874241940631982, "grad_norm": 0.3493146300315857, "learning_rate": 2.733375209538288e-05, "loss": 1.3924, "step": 420 }, { "epoch": 0.1791679965953825, "grad_norm": 0.3364221155643463, "learning_rate": 2.7112144561542757e-05, "loss": 1.3695, "step": 421 }, { "epoch": 0.17959357378444515, "grad_norm": 0.3134714663028717, "learning_rate": 2.6891104370089642e-05, "loss": 1.4045, "step": 422 }, { "epoch": 0.18001915097350782, "grad_norm": 0.32247835397720337, "learning_rate": 2.6670637000150633e-05, "loss": 1.3614, "step": 423 }, { "epoch": 0.18044472816257048, "grad_norm": 0.3357102572917938, "learning_rate": 2.6450747916653857e-05, "loss": 1.4259, "step": 424 }, { "epoch": 0.18087030535163315, "grad_norm": 0.3296355605125427, "learning_rate": 2.6231442570192845e-05, "loss": 1.4139, "step": 425 }, { "epoch": 0.1812958825406958, "grad_norm": 0.3181535005569458, "learning_rate": 2.6012726396891518e-05, "loss": 1.3069, "step": 426 }, { "epoch": 0.18172145972975848, "grad_norm": 0.33894723653793335, "learning_rate": 2.579460481826947e-05, "loss": 1.4014, "step": 427 }, { "epoch": 0.18214703691882114, "grad_norm": 0.33284929394721985, "learning_rate": 2.557708324110747e-05, "loss": 1.3041, "step": 428 }, { "epoch": 0.1825726141078838, "grad_norm": 0.35112953186035156, "learning_rate": 2.536016705731351e-05, "loss": 1.4356, "step": 429 }, { "epoch": 0.18299819129694647, "grad_norm": 0.3994680941104889, "learning_rate": 2.514386164378915e-05, "loss": 1.412, "step": 430 }, { "epoch": 0.18342376848600914, "grad_norm": 0.32337042689323425, "learning_rate": 2.4928172362296205e-05, "loss": 1.4461, "step": 431 }, { "epoch": 0.18384934567507183, "grad_norm": 0.3220783472061157, "learning_rate": 2.4713104559323895e-05, "loss": 1.4217, "step": 432 }, { "epoch": 0.1842749228641345, "grad_norm": 0.3344449996948242, "learning_rate": 2.4498663565956233e-05, "loss": 1.4241, "step": 433 }, { "epoch": 0.18470050005319716, "grad_norm": 0.3385544717311859, "learning_rate": 2.428485469773997e-05, "loss": 1.3389, "step": 434 }, { "epoch": 0.18512607724225982, "grad_norm": 0.36458736658096313, "learning_rate": 2.4071683254552752e-05, "loss": 1.4758, "step": 435 }, { "epoch": 0.1855516544313225, "grad_norm": 0.3004361093044281, "learning_rate": 2.3859154520471766e-05, "loss": 1.3474, "step": 436 }, { "epoch": 0.18597723162038515, "grad_norm": 0.3578473627567291, "learning_rate": 2.364727376364285e-05, "loss": 1.4717, "step": 437 }, { "epoch": 0.18640280880944782, "grad_norm": 0.3373146057128906, "learning_rate": 2.343604623614974e-05, "loss": 1.3185, "step": 438 }, { "epoch": 0.18682838599851048, "grad_norm": 0.33882611989974976, "learning_rate": 2.3225477173884063e-05, "loss": 1.4053, "step": 439 }, { "epoch": 0.18725396318757315, "grad_norm": 0.3465374708175659, "learning_rate": 2.3015571796415398e-05, "loss": 1.3454, "step": 440 }, { "epoch": 0.1876795403766358, "grad_norm": 0.37321579456329346, "learning_rate": 2.280633530686195e-05, "loss": 1.3604, "step": 441 }, { "epoch": 0.18810511756569848, "grad_norm": 0.3368617594242096, "learning_rate": 2.2597772891761653e-05, "loss": 1.4659, "step": 442 }, { "epoch": 0.18853069475476114, "grad_norm": 0.3479175269603729, "learning_rate": 2.2389889720943447e-05, "loss": 1.4366, "step": 443 }, { "epoch": 0.1889562719438238, "grad_norm": 0.31625932455062866, "learning_rate": 2.2182690947399304e-05, "loss": 1.3762, "step": 444 }, { "epoch": 0.18938184913288647, "grad_norm": 0.30867132544517517, "learning_rate": 2.1976181707156345e-05, "loss": 1.4616, "step": 445 }, { "epoch": 0.18980742632194914, "grad_norm": 0.31360673904418945, "learning_rate": 2.1770367119149598e-05, "loss": 1.4013, "step": 446 }, { "epoch": 0.1902330035110118, "grad_norm": 0.3171330690383911, "learning_rate": 2.1565252285095156e-05, "loss": 1.2904, "step": 447 }, { "epoch": 0.19065858070007446, "grad_norm": 0.33907023072242737, "learning_rate": 2.1360842289363614e-05, "loss": 1.3836, "step": 448 }, { "epoch": 0.19108415788913713, "grad_norm": 0.3281523585319519, "learning_rate": 2.1157142198854103e-05, "loss": 1.4903, "step": 449 }, { "epoch": 0.19150973507819982, "grad_norm": 0.32757705450057983, "learning_rate": 2.0954157062868668e-05, "loss": 1.5003, "step": 450 }, { "epoch": 0.1919353122672625, "grad_norm": 0.3444954752922058, "learning_rate": 2.075189191298716e-05, "loss": 1.3952, "step": 451 }, { "epoch": 0.19236088945632515, "grad_norm": 0.3098054826259613, "learning_rate": 2.0550351762942428e-05, "loss": 1.2589, "step": 452 }, { "epoch": 0.19278646664538782, "grad_norm": 0.3291134834289551, "learning_rate": 2.0349541608496077e-05, "loss": 1.4151, "step": 453 }, { "epoch": 0.19321204383445048, "grad_norm": 0.32852277159690857, "learning_rate": 2.0149466427314682e-05, "loss": 1.4448, "step": 454 }, { "epoch": 0.19363762102351315, "grad_norm": 0.3113350570201874, "learning_rate": 1.9950131178846303e-05, "loss": 1.3029, "step": 455 }, { "epoch": 0.1940631982125758, "grad_norm": 0.30784913897514343, "learning_rate": 1.9751540804197666e-05, "loss": 1.2535, "step": 456 }, { "epoch": 0.19448877540163848, "grad_norm": 0.32419663667678833, "learning_rate": 1.9553700226011567e-05, "loss": 1.2871, "step": 457 }, { "epoch": 0.19491435259070114, "grad_norm": 0.31145137548446655, "learning_rate": 1.935661434834491e-05, "loss": 1.4369, "step": 458 }, { "epoch": 0.1953399297797638, "grad_norm": 0.3428165316581726, "learning_rate": 1.91602880565472e-05, "loss": 1.395, "step": 459 }, { "epoch": 0.19576550696882647, "grad_norm": 0.3369043469429016, "learning_rate": 1.89647262171393e-05, "loss": 1.3816, "step": 460 }, { "epoch": 0.19619108415788913, "grad_norm": 0.33101511001586914, "learning_rate": 1.876993367769297e-05, "loss": 1.4208, "step": 461 }, { "epoch": 0.1966166613469518, "grad_norm": 0.34739160537719727, "learning_rate": 1.8575915266710564e-05, "loss": 1.3579, "step": 462 }, { "epoch": 0.19704223853601446, "grad_norm": 0.34199175238609314, "learning_rate": 1.8382675793505406e-05, "loss": 1.4565, "step": 463 }, { "epoch": 0.19746781572507713, "grad_norm": 0.3198068141937256, "learning_rate": 1.8190220048082613e-05, "loss": 1.4965, "step": 464 }, { "epoch": 0.1978933929141398, "grad_norm": 0.324301153421402, "learning_rate": 1.7998552801020257e-05, "loss": 1.4035, "step": 465 }, { "epoch": 0.19831897010320246, "grad_norm": 0.347256600856781, "learning_rate": 1.7807678803351237e-05, "loss": 1.4661, "step": 466 }, { "epoch": 0.19874454729226512, "grad_norm": 0.3176523447036743, "learning_rate": 1.7617602786445403e-05, "loss": 1.4176, "step": 467 }, { "epoch": 0.1991701244813278, "grad_norm": 0.32808732986450195, "learning_rate": 1.7428329461892328e-05, "loss": 1.4373, "step": 468 }, { "epoch": 0.19959570167039048, "grad_norm": 0.3186570405960083, "learning_rate": 1.723986352138452e-05, "loss": 1.478, "step": 469 }, { "epoch": 0.20002127885945314, "grad_norm": 0.31049400568008423, "learning_rate": 1.7052209636601087e-05, "loss": 1.3014, "step": 470 }, { "epoch": 0.2004468560485158, "grad_norm": 0.314045250415802, "learning_rate": 1.686537245909201e-05, "loss": 1.4446, "step": 471 }, { "epoch": 0.20087243323757847, "grad_norm": 0.3211539387702942, "learning_rate": 1.6679356620162766e-05, "loss": 1.3681, "step": 472 }, { "epoch": 0.20129801042664114, "grad_norm": 0.30641838908195496, "learning_rate": 1.6494166730759524e-05, "loss": 1.3608, "step": 473 }, { "epoch": 0.2017235876157038, "grad_norm": 0.33943578600883484, "learning_rate": 1.6309807381354958e-05, "loss": 1.3157, "step": 474 }, { "epoch": 0.20214916480476647, "grad_norm": 0.29183852672576904, "learning_rate": 1.6126283141834293e-05, "loss": 1.3042, "step": 475 }, { "epoch": 0.20257474199382913, "grad_norm": 0.294260710477829, "learning_rate": 1.594359856138219e-05, "loss": 1.393, "step": 476 }, { "epoch": 0.2030003191828918, "grad_norm": 0.3283140957355499, "learning_rate": 1.5761758168369862e-05, "loss": 1.431, "step": 477 }, { "epoch": 0.20342589637195446, "grad_norm": 0.32865697145462036, "learning_rate": 1.558076647024285e-05, "loss": 1.3548, "step": 478 }, { "epoch": 0.20385147356101713, "grad_norm": 0.3092752993106842, "learning_rate": 1.5400627953409396e-05, "loss": 1.2688, "step": 479 }, { "epoch": 0.2042770507500798, "grad_norm": 0.3280682861804962, "learning_rate": 1.5221347083129045e-05, "loss": 1.3923, "step": 480 }, { "epoch": 0.20470262793914246, "grad_norm": 0.3039858639240265, "learning_rate": 1.5042928303402155e-05, "loss": 1.3433, "step": 481 }, { "epoch": 0.20512820512820512, "grad_norm": 0.3173251748085022, "learning_rate": 1.4865376036859597e-05, "loss": 1.427, "step": 482 }, { "epoch": 0.20555378231726779, "grad_norm": 0.31789281964302063, "learning_rate": 1.4688694684653181e-05, "loss": 1.4348, "step": 483 }, { "epoch": 0.20597935950633045, "grad_norm": 0.31420034170150757, "learning_rate": 1.4512888626346599e-05, "loss": 1.4612, "step": 484 }, { "epoch": 0.20640493669539312, "grad_norm": 0.3418427109718323, "learning_rate": 1.4337962219806778e-05, "loss": 1.527, "step": 485 }, { "epoch": 0.20683051388445578, "grad_norm": 0.2978665828704834, "learning_rate": 1.4163919801095954e-05, "loss": 1.4091, "step": 486 }, { "epoch": 0.20725609107351847, "grad_norm": 0.3711923658847809, "learning_rate": 1.39907656843641e-05, "loss": 1.3739, "step": 487 }, { "epoch": 0.20768166826258114, "grad_norm": 0.3134306073188782, "learning_rate": 1.381850416174203e-05, "loss": 1.4328, "step": 488 }, { "epoch": 0.2081072454516438, "grad_norm": 0.32045456767082214, "learning_rate": 1.3647139503235046e-05, "loss": 1.4792, "step": 489 }, { "epoch": 0.20853282264070647, "grad_norm": 0.3508617579936981, "learning_rate": 1.3476675956617007e-05, "loss": 1.3979, "step": 490 }, { "epoch": 0.20895839982976913, "grad_norm": 0.3196643590927124, "learning_rate": 1.3307117747325104e-05, "loss": 1.311, "step": 491 }, { "epoch": 0.2093839770188318, "grad_norm": 0.3366539776325226, "learning_rate": 1.313846907835507e-05, "loss": 1.4454, "step": 492 }, { "epoch": 0.20980955420789446, "grad_norm": 0.31268173456192017, "learning_rate": 1.2970734130157069e-05, "loss": 1.373, "step": 493 }, { "epoch": 0.21023513139695713, "grad_norm": 0.3394613265991211, "learning_rate": 1.2803917060531994e-05, "loss": 1.4365, "step": 494 }, { "epoch": 0.2106607085860198, "grad_norm": 0.3489893674850464, "learning_rate": 1.2638022004528416e-05, "loss": 1.3662, "step": 495 }, { "epoch": 0.21108628577508246, "grad_norm": 0.3316808044910431, "learning_rate": 1.2473053074340157e-05, "loss": 1.4181, "step": 496 }, { "epoch": 0.21151186296414512, "grad_norm": 0.3342049717903137, "learning_rate": 1.2309014359204252e-05, "loss": 1.2779, "step": 497 }, { "epoch": 0.21193744015320778, "grad_norm": 0.33748358488082886, "learning_rate": 1.2145909925299687e-05, "loss": 1.3213, "step": 498 }, { "epoch": 0.21236301734227045, "grad_norm": 0.406751811504364, "learning_rate": 1.1983743815646508e-05, "loss": 1.3167, "step": 499 }, { "epoch": 0.21278859453133311, "grad_norm": 0.30885371565818787, "learning_rate": 1.1822520050005675e-05, "loss": 1.4627, "step": 500 }, { "epoch": 0.21321417172039578, "grad_norm": 0.30035650730133057, "learning_rate": 1.1662242624779413e-05, "loss": 1.3638, "step": 501 }, { "epoch": 0.21363974890945844, "grad_norm": 0.3490166962146759, "learning_rate": 1.1502915512912094e-05, "loss": 1.3379, "step": 502 }, { "epoch": 0.2140653260985211, "grad_norm": 0.31692811846733093, "learning_rate": 1.1344542663791851e-05, "loss": 1.3315, "step": 503 }, { "epoch": 0.21449090328758377, "grad_norm": 0.3350829780101776, "learning_rate": 1.118712800315258e-05, "loss": 1.4076, "step": 504 }, { "epoch": 0.21491648047664647, "grad_norm": 0.32507750391960144, "learning_rate": 1.1030675432976679e-05, "loss": 1.3802, "step": 505 }, { "epoch": 0.21534205766570913, "grad_norm": 0.3646472692489624, "learning_rate": 1.0875188831398353e-05, "loss": 1.5334, "step": 506 }, { "epoch": 0.2157676348547718, "grad_norm": 0.3090904951095581, "learning_rate": 1.0720672052607416e-05, "loss": 1.3333, "step": 507 }, { "epoch": 0.21619321204383446, "grad_norm": 0.3293876349925995, "learning_rate": 1.0567128926753827e-05, "loss": 1.2744, "step": 508 }, { "epoch": 0.21661878923289712, "grad_norm": 0.32312512397766113, "learning_rate": 1.0414563259852684e-05, "loss": 1.4732, "step": 509 }, { "epoch": 0.2170443664219598, "grad_norm": 0.3056824505329132, "learning_rate": 1.0262978833689906e-05, "loss": 1.3128, "step": 510 }, { "epoch": 0.21746994361102245, "grad_norm": 0.321416974067688, "learning_rate": 1.0112379405728512e-05, "loss": 1.2522, "step": 511 }, { "epoch": 0.21789552080008512, "grad_norm": 0.30441486835479736, "learning_rate": 9.962768709015435e-06, "loss": 1.4061, "step": 512 }, { "epoch": 0.21832109798914778, "grad_norm": 0.3073800504207611, "learning_rate": 9.814150452089055e-06, "loss": 1.312, "step": 513 }, { "epoch": 0.21874667517821045, "grad_norm": 0.3266609311103821, "learning_rate": 9.666528318887198e-06, "loss": 1.2735, "step": 514 }, { "epoch": 0.2191722523672731, "grad_norm": 0.30625462532043457, "learning_rate": 9.51990596865585e-06, "loss": 1.3488, "step": 515 }, { "epoch": 0.21959782955633578, "grad_norm": 0.326201856136322, "learning_rate": 9.374287035858491e-06, "loss": 1.3727, "step": 516 }, { "epoch": 0.22002340674539844, "grad_norm": 0.3594081401824951, "learning_rate": 9.229675130085918e-06, "loss": 1.3612, "step": 517 }, { "epoch": 0.2204489839344611, "grad_norm": 0.31369614601135254, "learning_rate": 9.086073835966869e-06, "loss": 1.4186, "step": 518 }, { "epoch": 0.22087456112352377, "grad_norm": 0.33593955636024475, "learning_rate": 8.94348671307907e-06, "loss": 1.4858, "step": 519 }, { "epoch": 0.22130013831258644, "grad_norm": 0.33301830291748047, "learning_rate": 8.801917295861101e-06, "loss": 1.2687, "step": 520 }, { "epoch": 0.2217257155016491, "grad_norm": 0.3187962770462036, "learning_rate": 8.66136909352469e-06, "loss": 1.3218, "step": 521 }, { "epoch": 0.22215129269071177, "grad_norm": 0.31819725036621094, "learning_rate": 8.521845589967775e-06, "loss": 1.4195, "step": 522 }, { "epoch": 0.22257686987977443, "grad_norm": 0.3147396743297577, "learning_rate": 8.383350243688175e-06, "loss": 1.3723, "step": 523 }, { "epoch": 0.22300244706883712, "grad_norm": 0.3238096237182617, "learning_rate": 8.24588648769778e-06, "loss": 1.3681, "step": 524 }, { "epoch": 0.2234280242578998, "grad_norm": 0.34045907855033875, "learning_rate": 8.109457729437537e-06, "loss": 1.4074, "step": 525 }, { "epoch": 0.22385360144696245, "grad_norm": 0.3139466643333435, "learning_rate": 7.974067350692921e-06, "loss": 1.3848, "step": 526 }, { "epoch": 0.22427917863602512, "grad_norm": 0.33936434984207153, "learning_rate": 7.839718707510146e-06, "loss": 1.471, "step": 527 }, { "epoch": 0.22470475582508778, "grad_norm": 0.300231009721756, "learning_rate": 7.706415130112993e-06, "loss": 1.4524, "step": 528 }, { "epoch": 0.22513033301415045, "grad_norm": 0.3271010220050812, "learning_rate": 7.574159922820184e-06, "loss": 1.3812, "step": 529 }, { "epoch": 0.2255559102032131, "grad_norm": 0.3297939896583557, "learning_rate": 7.44295636396356e-06, "loss": 1.3319, "step": 530 }, { "epoch": 0.22598148739227578, "grad_norm": 0.3267982304096222, "learning_rate": 7.3128077058067675e-06, "loss": 1.3745, "step": 531 }, { "epoch": 0.22640706458133844, "grad_norm": 0.3305337727069855, "learning_rate": 7.1837171744646494e-06, "loss": 1.4081, "step": 532 }, { "epoch": 0.2268326417704011, "grad_norm": 0.3130592405796051, "learning_rate": 7.05568796982331e-06, "loss": 1.3635, "step": 533 }, { "epoch": 0.22725821895946377, "grad_norm": 0.31961482763290405, "learning_rate": 6.9287232654607345e-06, "loss": 1.2938, "step": 534 }, { "epoch": 0.22768379614852643, "grad_norm": 0.3059883117675781, "learning_rate": 6.802826208568203e-06, "loss": 1.2205, "step": 535 }, { "epoch": 0.2281093733375891, "grad_norm": 0.3591614067554474, "learning_rate": 6.677999919872185e-06, "loss": 1.4298, "step": 536 }, { "epoch": 0.22853495052665176, "grad_norm": 0.3192720115184784, "learning_rate": 6.554247493557047e-06, "loss": 1.486, "step": 537 }, { "epoch": 0.22896052771571443, "grad_norm": 0.3339605927467346, "learning_rate": 6.431571997188363e-06, "loss": 1.3664, "step": 538 }, { "epoch": 0.2293861049047771, "grad_norm": 0.33562207221984863, "learning_rate": 6.309976471636808e-06, "loss": 1.3784, "step": 539 }, { "epoch": 0.22981168209383976, "grad_norm": 0.3569333553314209, "learning_rate": 6.18946393100287e-06, "loss": 1.4994, "step": 540 }, { "epoch": 0.23023725928290242, "grad_norm": 0.3155967593193054, "learning_rate": 6.070037362542058e-06, "loss": 1.3339, "step": 541 }, { "epoch": 0.23066283647196512, "grad_norm": 0.3392326533794403, "learning_rate": 5.951699726590881e-06, "loss": 1.2821, "step": 542 }, { "epoch": 0.23108841366102778, "grad_norm": 0.31855908036231995, "learning_rate": 5.834453956493519e-06, "loss": 1.2014, "step": 543 }, { "epoch": 0.23151399085009045, "grad_norm": 0.3185470700263977, "learning_rate": 5.718302958528998e-06, "loss": 1.465, "step": 544 }, { "epoch": 0.2319395680391531, "grad_norm": 0.3613238036632538, "learning_rate": 5.603249611839295e-06, "loss": 1.4258, "step": 545 }, { "epoch": 0.23236514522821577, "grad_norm": 0.3471008837223053, "learning_rate": 5.489296768357827e-06, "loss": 1.2939, "step": 546 }, { "epoch": 0.23279072241727844, "grad_norm": 0.33712702989578247, "learning_rate": 5.376447252738848e-06, "loss": 1.3278, "step": 547 }, { "epoch": 0.2332162996063411, "grad_norm": 0.33189454674720764, "learning_rate": 5.264703862287418e-06, "loss": 1.3793, "step": 548 }, { "epoch": 0.23364187679540377, "grad_norm": 0.32526296377182007, "learning_rate": 5.1540693668900355e-06, "loss": 1.4734, "step": 549 }, { "epoch": 0.23406745398446643, "grad_norm": 0.32723116874694824, "learning_rate": 5.044546508945996e-06, "loss": 1.3883, "step": 550 }, { "epoch": 0.2344930311735291, "grad_norm": 0.3163575232028961, "learning_rate": 4.936138003299412e-06, "loss": 1.3487, "step": 551 }, { "epoch": 0.23491860836259176, "grad_norm": 0.3888072371482849, "learning_rate": 4.828846537171932e-06, "loss": 1.3733, "step": 552 }, { "epoch": 0.23534418555165443, "grad_norm": 0.31089290976524353, "learning_rate": 4.722674770096097e-06, "loss": 1.3349, "step": 553 }, { "epoch": 0.2357697627407171, "grad_norm": 0.3057332932949066, "learning_rate": 4.617625333849434e-06, "loss": 1.4249, "step": 554 }, { "epoch": 0.23619533992977976, "grad_norm": 0.34270864725112915, "learning_rate": 4.513700832389245e-06, "loss": 1.3863, "step": 555 }, { "epoch": 0.23662091711884242, "grad_norm": 0.29578831791877747, "learning_rate": 4.410903841788e-06, "loss": 1.3942, "step": 556 }, { "epoch": 0.2370464943079051, "grad_norm": 0.3092251121997833, "learning_rate": 4.309236910169562e-06, "loss": 1.3332, "step": 557 }, { "epoch": 0.23747207149696775, "grad_norm": 0.35976263880729675, "learning_rate": 4.208702557645933e-06, "loss": 1.3551, "step": 558 }, { "epoch": 0.23789764868603042, "grad_norm": 0.332644522190094, "learning_rate": 4.10930327625485e-06, "loss": 1.3691, "step": 559 }, { "epoch": 0.2383232258750931, "grad_norm": 0.384097158908844, "learning_rate": 4.011041529898019e-06, "loss": 1.367, "step": 560 }, { "epoch": 0.23874880306415577, "grad_norm": 0.3527825176715851, "learning_rate": 3.913919754279966e-06, "loss": 1.546, "step": 561 }, { "epoch": 0.23917438025321844, "grad_norm": 0.3396340012550354, "learning_rate": 3.817940356847765e-06, "loss": 1.3436, "step": 562 }, { "epoch": 0.2395999574422811, "grad_norm": 0.3177872896194458, "learning_rate": 3.7231057167312676e-06, "loss": 1.3255, "step": 563 }, { "epoch": 0.24002553463134377, "grad_norm": 0.33055418729782104, "learning_rate": 3.6294181846841856e-06, "loss": 1.3571, "step": 564 }, { "epoch": 0.24045111182040643, "grad_norm": 0.29036635160446167, "learning_rate": 3.5368800830258064e-06, "loss": 1.3218, "step": 565 }, { "epoch": 0.2408766890094691, "grad_norm": 0.2930334806442261, "learning_rate": 3.445493705583419e-06, "loss": 1.434, "step": 566 }, { "epoch": 0.24130226619853176, "grad_norm": 0.3378046751022339, "learning_rate": 3.3552613176354717e-06, "loss": 1.3088, "step": 567 }, { "epoch": 0.24172784338759443, "grad_norm": 0.3452695906162262, "learning_rate": 3.2661851558554057e-06, "loss": 1.3953, "step": 568 }, { "epoch": 0.2421534205766571, "grad_norm": 0.32704198360443115, "learning_rate": 3.17826742825621e-06, "loss": 1.4465, "step": 569 }, { "epoch": 0.24257899776571976, "grad_norm": 0.3004928529262543, "learning_rate": 3.0915103141356984e-06, "loss": 1.4172, "step": 570 }, { "epoch": 0.24300457495478242, "grad_norm": 0.3545885980129242, "learning_rate": 3.0059159640225097e-06, "loss": 1.3583, "step": 571 }, { "epoch": 0.24343015214384509, "grad_norm": 0.34187474846839905, "learning_rate": 2.92148649962275e-06, "loss": 1.4452, "step": 572 }, { "epoch": 0.24385572933290775, "grad_norm": 0.35106217861175537, "learning_rate": 2.8382240137674376e-06, "loss": 1.3827, "step": 573 }, { "epoch": 0.24428130652197041, "grad_norm": 0.29575130343437195, "learning_rate": 2.756130570360621e-06, "loss": 1.3322, "step": 574 }, { "epoch": 0.24470688371103308, "grad_norm": 0.3385341465473175, "learning_rate": 2.6752082043282277e-06, "loss": 1.3895, "step": 575 }, { "epoch": 0.24513246090009574, "grad_norm": 0.32416394352912903, "learning_rate": 2.595458921567573e-06, "loss": 1.3432, "step": 576 }, { "epoch": 0.2455580380891584, "grad_norm": 0.3210682272911072, "learning_rate": 2.51688469889772e-06, "loss": 1.4158, "step": 577 }, { "epoch": 0.24598361527822107, "grad_norm": 0.3364936411380768, "learning_rate": 2.4394874840104e-06, "loss": 1.4189, "step": 578 }, { "epoch": 0.24640919246728377, "grad_norm": 0.2938537001609802, "learning_rate": 2.3632691954217746e-06, "loss": 1.2996, "step": 579 }, { "epoch": 0.24683476965634643, "grad_norm": 0.31600701808929443, "learning_rate": 2.2882317224248883e-06, "loss": 1.3783, "step": 580 }, { "epoch": 0.2472603468454091, "grad_norm": 0.3460082411766052, "learning_rate": 2.2143769250427883e-06, "loss": 1.3835, "step": 581 }, { "epoch": 0.24768592403447176, "grad_norm": 0.3603164255619049, "learning_rate": 2.141706633982493e-06, "loss": 1.3649, "step": 582 }, { "epoch": 0.24811150122353443, "grad_norm": 0.3762056529521942, "learning_rate": 2.070222650589526e-06, "loss": 1.4519, "step": 583 }, { "epoch": 0.2485370784125971, "grad_norm": 0.3702375590801239, "learning_rate": 1.9999267468033323e-06, "loss": 1.337, "step": 584 }, { "epoch": 0.24896265560165975, "grad_norm": 0.3490135371685028, "learning_rate": 1.9308206651133365e-06, "loss": 1.3851, "step": 585 }, { "epoch": 0.24938823279072242, "grad_norm": 0.32408085465431213, "learning_rate": 1.8629061185157225e-06, "loss": 1.3629, "step": 586 }, { "epoch": 0.24981380997978508, "grad_norm": 0.318099707365036, "learning_rate": 1.7961847904710227e-06, "loss": 1.3742, "step": 587 }, { "epoch": 0.25023938716884775, "grad_norm": 0.3147919178009033, "learning_rate": 1.730658334862334e-06, "loss": 1.4073, "step": 588 }, { "epoch": 0.2506649643579104, "grad_norm": 0.37398630380630493, "learning_rate": 1.666328375954368e-06, "loss": 1.2967, "step": 589 }, { "epoch": 0.2510905415469731, "grad_norm": 0.36172211170196533, "learning_rate": 1.6031965083531609e-06, "loss": 1.4686, "step": 590 }, { "epoch": 0.25151611873603574, "grad_norm": 0.39417141675949097, "learning_rate": 1.5412642969665546e-06, "loss": 1.4736, "step": 591 }, { "epoch": 0.2519416959250984, "grad_norm": 0.3256359398365021, "learning_rate": 1.4805332769654012e-06, "loss": 1.4766, "step": 592 }, { "epoch": 0.2523672731141611, "grad_norm": 0.35618171095848083, "learning_rate": 1.4210049537455195e-06, "loss": 1.2675, "step": 593 }, { "epoch": 0.25279285030322374, "grad_norm": 0.31525132060050964, "learning_rate": 1.362680802890376e-06, "loss": 1.3951, "step": 594 }, { "epoch": 0.2532184274922864, "grad_norm": 0.3315390348434448, "learning_rate": 1.3055622701344972e-06, "loss": 1.4476, "step": 595 }, { "epoch": 0.25364400468134907, "grad_norm": 0.3497242033481598, "learning_rate": 1.2496507713276483e-06, "loss": 1.507, "step": 596 }, { "epoch": 0.25406958187041173, "grad_norm": 0.32933562994003296, "learning_rate": 1.1949476923997394e-06, "loss": 1.3046, "step": 597 }, { "epoch": 0.2544951590594744, "grad_norm": 0.340378999710083, "learning_rate": 1.141454389326435e-06, "loss": 1.264, "step": 598 }, { "epoch": 0.25492073624853706, "grad_norm": 0.30960968136787415, "learning_rate": 1.0891721880955995e-06, "loss": 1.3234, "step": 599 }, { "epoch": 0.2553463134375997, "grad_norm": 0.3078996241092682, "learning_rate": 1.038102384674383e-06, "loss": 1.4485, "step": 600 }, { "epoch": 0.2557718906266624, "grad_norm": 0.3417295217514038, "learning_rate": 9.882462449771035e-07, "loss": 1.3488, "step": 601 }, { "epoch": 0.25619746781572506, "grad_norm": 0.34004366397857666, "learning_rate": 9.3960500483391e-07, "loss": 1.3679, "step": 602 }, { "epoch": 0.2566230450047877, "grad_norm": 0.30232709646224976, "learning_rate": 8.921798699600692e-07, "loss": 1.2706, "step": 603 }, { "epoch": 0.2570486221938504, "grad_norm": 0.26610705256462097, "learning_rate": 8.459720159261719e-07, "loss": 1.4251, "step": 604 }, { "epoch": 0.25747419938291305, "grad_norm": 0.3400372564792633, "learning_rate": 8.009825881289124e-07, "loss": 1.3359, "step": 605 }, { "epoch": 0.25789977657197577, "grad_norm": 0.3164251744747162, "learning_rate": 7.572127017627329e-07, "loss": 1.3374, "step": 606 }, { "epoch": 0.25832535376103843, "grad_norm": 0.31121399998664856, "learning_rate": 7.146634417922016e-07, "loss": 1.4161, "step": 607 }, { "epoch": 0.2587509309501011, "grad_norm": 0.3260907828807831, "learning_rate": 6.733358629250619e-07, "loss": 1.3978, "step": 608 }, { "epoch": 0.25917650813916376, "grad_norm": 0.32060369849205017, "learning_rate": 6.332309895861533e-07, "loss": 1.3607, "step": 609 }, { "epoch": 0.25960208532822643, "grad_norm": 0.30628058314323425, "learning_rate": 5.943498158919536e-07, "loss": 1.3121, "step": 610 }, { "epoch": 0.2600276625172891, "grad_norm": 0.33873802423477173, "learning_rate": 5.566933056259882e-07, "loss": 1.4258, "step": 611 }, { "epoch": 0.26045323970635176, "grad_norm": 0.33910998702049255, "learning_rate": 5.202623922149152e-07, "loss": 1.4206, "step": 612 }, { "epoch": 0.2608788168954144, "grad_norm": 0.32269635796546936, "learning_rate": 4.850579787053944e-07, "loss": 1.4678, "step": 613 }, { "epoch": 0.2613043940844771, "grad_norm": 0.3426719903945923, "learning_rate": 4.510809377416936e-07, "loss": 1.403, "step": 614 }, { "epoch": 0.26172997127353975, "grad_norm": 0.3457522690296173, "learning_rate": 4.1833211154408414e-07, "loss": 1.4181, "step": 615 }, { "epoch": 0.2621555484626024, "grad_norm": 0.3166434168815613, "learning_rate": 3.8681231188791857e-07, "loss": 1.423, "step": 616 }, { "epoch": 0.2625811256516651, "grad_norm": 0.357695609331131, "learning_rate": 3.565223200835577e-07, "loss": 1.506, "step": 617 }, { "epoch": 0.26300670284072775, "grad_norm": 0.3549404740333557, "learning_rate": 3.2746288695696404e-07, "loss": 1.4517, "step": 618 }, { "epoch": 0.2634322800297904, "grad_norm": 0.3207859694957733, "learning_rate": 2.996347328311222e-07, "loss": 1.4455, "step": 619 }, { "epoch": 0.2638578572188531, "grad_norm": 0.3387575149536133, "learning_rate": 2.7303854750815316e-07, "loss": 1.4779, "step": 620 }, { "epoch": 0.26428343440791574, "grad_norm": 0.34494006633758545, "learning_rate": 2.4767499025223904e-07, "loss": 1.3463, "step": 621 }, { "epoch": 0.2647090115969784, "grad_norm": 0.33494681119918823, "learning_rate": 2.2354468977327514e-07, "loss": 1.23, "step": 622 }, { "epoch": 0.26513458878604107, "grad_norm": 0.3021242916584015, "learning_rate": 2.006482442112767e-07, "loss": 1.2947, "step": 623 }, { "epoch": 0.26556016597510373, "grad_norm": 0.31967541575431824, "learning_rate": 1.7898622112156317e-07, "loss": 1.4255, "step": 624 }, { "epoch": 0.2659857431641664, "grad_norm": 0.3589042127132416, "learning_rate": 1.5855915746068594e-07, "loss": 1.3617, "step": 625 }, { "epoch": 0.26641132035322906, "grad_norm": 0.3286203444004059, "learning_rate": 1.3936755957311143e-07, "loss": 1.4144, "step": 626 }, { "epoch": 0.26683689754229173, "grad_norm": 0.3432556986808777, "learning_rate": 1.214119031786809e-07, "loss": 1.4002, "step": 627 }, { "epoch": 0.2672624747313544, "grad_norm": 0.32939445972442627, "learning_rate": 1.0469263336082002e-07, "loss": 1.4409, "step": 628 }, { "epoch": 0.26768805192041706, "grad_norm": 0.3337137997150421, "learning_rate": 8.921016455548659e-08, "loss": 1.346, "step": 629 }, { "epoch": 0.2681136291094797, "grad_norm": 0.3303409516811371, "learning_rate": 7.496488054092865e-08, "loss": 1.4784, "step": 630 }, { "epoch": 0.2685392062985424, "grad_norm": 0.32682710886001587, "learning_rate": 6.195713442812556e-08, "loss": 1.4013, "step": 631 }, { "epoch": 0.26896478348760505, "grad_norm": 0.32266002893447876, "learning_rate": 5.0187248652094896e-08, "loss": 1.3807, "step": 632 }, { "epoch": 0.2693903606766677, "grad_norm": 0.37381482124328613, "learning_rate": 3.965551496384334e-08, "loss": 1.3242, "step": 633 }, { "epoch": 0.2698159378657304, "grad_norm": 0.3564024567604065, "learning_rate": 3.036219442317245e-08, "loss": 1.4303, "step": 634 }, { "epoch": 0.27024151505479305, "grad_norm": 0.3509672284126282, "learning_rate": 2.2307517392194944e-08, "loss": 1.3847, "step": 635 }, { "epoch": 0.2706670922438557, "grad_norm": 0.344876229763031, "learning_rate": 1.549168352961705e-08, "loss": 1.445, "step": 636 }, { "epoch": 0.2710926694329184, "grad_norm": 0.3283238708972931, "learning_rate": 9.914861785803586e-09, "loss": 1.4581, "step": 637 }, { "epoch": 0.27151824662198104, "grad_norm": 0.33270278573036194, "learning_rate": 5.577190398575738e-09, "loss": 1.4358, "step": 638 }, { "epoch": 0.2719438238110437, "grad_norm": 0.3181372880935669, "learning_rate": 2.478776889797141e-09, "loss": 1.3994, "step": 639 }, { "epoch": 0.27236940100010637, "grad_norm": 0.33495181798934937, "learning_rate": 6.196980627093396e-10, "loss": 1.3727, "step": 640 }, { "epoch": 0.27279497818916904, "grad_norm": 0.37345388531684875, "learning_rate": 0.0, "loss": 1.4929, "step": 641 } ], "logging_steps": 1, "max_steps": 641, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 8.377339310411612e+17, "train_batch_size": 4, "trial_name": null, "trial_params": null }