{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 770, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 10.01884195009205, "learning_rate": 4.1666666666666667e-07, "loss": 1.6675, "step": 1 }, { "epoch": 0.0, "grad_norm": 10.180633447078613, "learning_rate": 8.333333333333333e-07, "loss": 1.7678, "step": 2 }, { "epoch": 0.0, "grad_norm": 10.303211452992821, "learning_rate": 1.25e-06, "loss": 1.7284, "step": 3 }, { "epoch": 0.01, "grad_norm": 9.910461388084395, "learning_rate": 1.6666666666666667e-06, "loss": 1.7352, "step": 4 }, { "epoch": 0.01, "grad_norm": 9.326250707724988, "learning_rate": 2.0833333333333334e-06, "loss": 1.7242, "step": 5 }, { "epoch": 0.01, "grad_norm": 7.913664539477789, "learning_rate": 2.5e-06, "loss": 1.6469, "step": 6 }, { "epoch": 0.01, "grad_norm": 7.078423538808147, "learning_rate": 2.916666666666667e-06, "loss": 1.6132, "step": 7 }, { "epoch": 0.01, "grad_norm": 5.384402059977251, "learning_rate": 3.3333333333333333e-06, "loss": 1.4678, "step": 8 }, { "epoch": 0.01, "grad_norm": 4.3577482532491025, "learning_rate": 3.7500000000000005e-06, "loss": 1.3556, "step": 9 }, { "epoch": 0.01, "grad_norm": 4.174369624886316, "learning_rate": 4.166666666666667e-06, "loss": 1.4044, "step": 10 }, { "epoch": 0.01, "grad_norm": 3.8639134882240826, "learning_rate": 4.583333333333333e-06, "loss": 1.3128, "step": 11 }, { "epoch": 0.02, "grad_norm": 3.9821742334186547, "learning_rate": 5e-06, "loss": 1.3084, "step": 12 }, { "epoch": 0.02, "grad_norm": 3.8428940928189363, "learning_rate": 5.416666666666667e-06, "loss": 1.287, "step": 13 }, { "epoch": 0.02, "grad_norm": 2.770103811111949, "learning_rate": 5.833333333333334e-06, "loss": 1.1749, "step": 14 }, { "epoch": 0.02, "grad_norm": 3.2520397890717034, "learning_rate": 6.25e-06, "loss": 1.1662, "step": 15 }, { "epoch": 0.02, "grad_norm": 2.3752009512247585, "learning_rate": 6.666666666666667e-06, "loss": 1.1161, "step": 16 }, { "epoch": 0.02, "grad_norm": 2.2116579911789667, "learning_rate": 7.083333333333335e-06, "loss": 1.1, "step": 17 }, { "epoch": 0.02, "grad_norm": 2.0963580548505436, "learning_rate": 7.500000000000001e-06, "loss": 1.0793, "step": 18 }, { "epoch": 0.02, "grad_norm": 1.651378799679108, "learning_rate": 7.916666666666667e-06, "loss": 1.0386, "step": 19 }, { "epoch": 0.03, "grad_norm": 1.5740210080816908, "learning_rate": 8.333333333333334e-06, "loss": 1.0234, "step": 20 }, { "epoch": 0.03, "grad_norm": 1.777625330317658, "learning_rate": 8.750000000000001e-06, "loss": 1.0418, "step": 21 }, { "epoch": 0.03, "grad_norm": 1.5402956873406757, "learning_rate": 9.166666666666666e-06, "loss": 0.9971, "step": 22 }, { "epoch": 0.03, "grad_norm": 1.4503947750874269, "learning_rate": 9.583333333333335e-06, "loss": 1.0418, "step": 23 }, { "epoch": 0.03, "grad_norm": 1.4809567732326383, "learning_rate": 1e-05, "loss": 0.9896, "step": 24 }, { "epoch": 0.03, "grad_norm": 1.347063159372048, "learning_rate": 9.999955663494783e-06, "loss": 0.9821, "step": 25 }, { "epoch": 0.03, "grad_norm": 1.337982719973813, "learning_rate": 9.999822654765424e-06, "loss": 0.984, "step": 26 }, { "epoch": 0.04, "grad_norm": 1.2598040340641647, "learning_rate": 9.999600976170775e-06, "loss": 0.9564, "step": 27 }, { "epoch": 0.04, "grad_norm": 1.5283239907844695, "learning_rate": 9.999290631642222e-06, "loss": 0.9315, "step": 28 }, { "epoch": 0.04, "grad_norm": 1.3107099251584715, "learning_rate": 9.9988916266836e-06, "loss": 0.9524, "step": 29 }, { "epoch": 0.04, "grad_norm": 1.2841120681980969, "learning_rate": 9.998403968371104e-06, "loss": 0.9801, "step": 30 }, { "epoch": 0.04, "grad_norm": 1.2833311749613852, "learning_rate": 9.997827665353159e-06, "loss": 0.9564, "step": 31 }, { "epoch": 0.04, "grad_norm": 1.326424018708689, "learning_rate": 9.997162727850271e-06, "loss": 0.9359, "step": 32 }, { "epoch": 0.04, "grad_norm": 1.4100755530546323, "learning_rate": 9.996409167654843e-06, "loss": 0.9462, "step": 33 }, { "epoch": 0.04, "grad_norm": 1.3058423401625958, "learning_rate": 9.995566998130962e-06, "loss": 0.9495, "step": 34 }, { "epoch": 0.05, "grad_norm": 1.3957855779834178, "learning_rate": 9.99463623421417e-06, "loss": 0.9394, "step": 35 }, { "epoch": 0.05, "grad_norm": 1.2590639321281085, "learning_rate": 9.993616892411198e-06, "loss": 0.9165, "step": 36 }, { "epoch": 0.05, "grad_norm": 1.2489518258284393, "learning_rate": 9.992508990799665e-06, "loss": 0.9682, "step": 37 }, { "epoch": 0.05, "grad_norm": 1.3114899180647628, "learning_rate": 9.991312549027762e-06, "loss": 0.9939, "step": 38 }, { "epoch": 0.05, "grad_norm": 1.2902990615583814, "learning_rate": 9.990027588313916e-06, "loss": 0.935, "step": 39 }, { "epoch": 0.05, "grad_norm": 1.334306124290386, "learning_rate": 9.988654131446385e-06, "loss": 0.9489, "step": 40 }, { "epoch": 0.05, "grad_norm": 1.214618974357902, "learning_rate": 9.987192202782886e-06, "loss": 0.9122, "step": 41 }, { "epoch": 0.05, "grad_norm": 1.2918632529579752, "learning_rate": 9.98564182825014e-06, "loss": 0.9633, "step": 42 }, { "epoch": 0.06, "grad_norm": 1.3534377529573218, "learning_rate": 9.984003035343422e-06, "loss": 0.9306, "step": 43 }, { "epoch": 0.06, "grad_norm": 1.3761114385394022, "learning_rate": 9.982275853126073e-06, "loss": 0.9354, "step": 44 }, { "epoch": 0.06, "grad_norm": 1.7160524235309491, "learning_rate": 9.980460312228981e-06, "loss": 0.9524, "step": 45 }, { "epoch": 0.06, "grad_norm": 1.4535654188609335, "learning_rate": 9.978556444850043e-06, "loss": 0.9126, "step": 46 }, { "epoch": 0.06, "grad_norm": 1.329692670990971, "learning_rate": 9.97656428475359e-06, "loss": 0.8982, "step": 47 }, { "epoch": 0.06, "grad_norm": 1.2204169092183164, "learning_rate": 9.974483867269787e-06, "loss": 0.8878, "step": 48 }, { "epoch": 0.06, "grad_norm": 1.3914134893180399, "learning_rate": 9.97231522929401e-06, "loss": 0.8933, "step": 49 }, { "epoch": 0.06, "grad_norm": 1.314449142464296, "learning_rate": 9.97005840928619e-06, "loss": 0.9163, "step": 50 }, { "epoch": 0.07, "grad_norm": 1.2684644507071798, "learning_rate": 9.967713447270134e-06, "loss": 0.9036, "step": 51 }, { "epoch": 0.07, "grad_norm": 1.1289718531785145, "learning_rate": 9.965280384832809e-06, "loss": 0.8844, "step": 52 }, { "epoch": 0.07, "grad_norm": 1.2737395334908646, "learning_rate": 9.962759265123611e-06, "loss": 0.8624, "step": 53 }, { "epoch": 0.07, "grad_norm": 1.3627730655756511, "learning_rate": 9.960150132853592e-06, "loss": 0.8977, "step": 54 }, { "epoch": 0.07, "grad_norm": 1.2442014917910598, "learning_rate": 9.957453034294677e-06, "loss": 0.9067, "step": 55 }, { "epoch": 0.07, "grad_norm": 1.4810477924919117, "learning_rate": 9.954668017278834e-06, "loss": 0.9119, "step": 56 }, { "epoch": 0.07, "grad_norm": 1.3799563005304054, "learning_rate": 9.951795131197233e-06, "loss": 0.9261, "step": 57 }, { "epoch": 0.08, "grad_norm": 1.414006096549083, "learning_rate": 9.948834426999363e-06, "loss": 0.9121, "step": 58 }, { "epoch": 0.08, "grad_norm": 1.6428491735539237, "learning_rate": 9.945785957192138e-06, "loss": 0.9428, "step": 59 }, { "epoch": 0.08, "grad_norm": 1.2653571709570268, "learning_rate": 9.942649775838955e-06, "loss": 0.8767, "step": 60 }, { "epoch": 0.08, "grad_norm": 1.15765431948816, "learning_rate": 9.939425938558744e-06, "loss": 0.9034, "step": 61 }, { "epoch": 0.08, "grad_norm": 1.1798179797077757, "learning_rate": 9.936114502524974e-06, "loss": 0.9168, "step": 62 }, { "epoch": 0.08, "grad_norm": 1.17972963935367, "learning_rate": 9.932715526464646e-06, "loss": 0.8591, "step": 63 }, { "epoch": 0.08, "grad_norm": 1.1938866521236933, "learning_rate": 9.929229070657251e-06, "loss": 0.9049, "step": 64 }, { "epoch": 0.08, "grad_norm": 1.2396855840030339, "learning_rate": 9.925655196933692e-06, "loss": 0.9578, "step": 65 }, { "epoch": 0.09, "grad_norm": 1.266928905031081, "learning_rate": 9.921993968675198e-06, "loss": 0.9097, "step": 66 }, { "epoch": 0.09, "grad_norm": 1.164862535741009, "learning_rate": 9.918245450812196e-06, "loss": 0.9182, "step": 67 }, { "epoch": 0.09, "grad_norm": 1.3760807748353976, "learning_rate": 9.914409709823158e-06, "loss": 0.9183, "step": 68 }, { "epoch": 0.09, "grad_norm": 1.204551307828834, "learning_rate": 9.910486813733427e-06, "loss": 0.909, "step": 69 }, { "epoch": 0.09, "grad_norm": 1.3492227169803832, "learning_rate": 9.906476832114e-06, "loss": 0.8767, "step": 70 }, { "epoch": 0.09, "grad_norm": 1.2599025510474737, "learning_rate": 9.902379836080308e-06, "loss": 0.9017, "step": 71 }, { "epoch": 0.09, "grad_norm": 1.2291538586506283, "learning_rate": 9.898195898290944e-06, "loss": 0.879, "step": 72 }, { "epoch": 0.09, "grad_norm": 1.3486283669610692, "learning_rate": 9.893925092946379e-06, "loss": 0.904, "step": 73 }, { "epoch": 0.1, "grad_norm": 1.2322670763852388, "learning_rate": 9.889567495787651e-06, "loss": 0.9129, "step": 74 }, { "epoch": 0.1, "grad_norm": 1.6502741500539375, "learning_rate": 9.885123184095007e-06, "loss": 0.893, "step": 75 }, { "epoch": 0.1, "grad_norm": 1.2779919969142226, "learning_rate": 9.880592236686548e-06, "loss": 0.9129, "step": 76 }, { "epoch": 0.1, "grad_norm": 1.221028204978688, "learning_rate": 9.875974733916822e-06, "loss": 0.8834, "step": 77 }, { "epoch": 0.1, "grad_norm": 1.185083112243534, "learning_rate": 9.871270757675406e-06, "loss": 0.9237, "step": 78 }, { "epoch": 0.1, "grad_norm": 1.2932265161475407, "learning_rate": 9.866480391385446e-06, "loss": 0.8421, "step": 79 }, { "epoch": 0.1, "grad_norm": 1.4143826220607103, "learning_rate": 9.861603720002182e-06, "loss": 0.8825, "step": 80 }, { "epoch": 0.11, "grad_norm": 1.3149520714145249, "learning_rate": 9.856640830011437e-06, "loss": 0.8686, "step": 81 }, { "epoch": 0.11, "grad_norm": 1.2757323469959327, "learning_rate": 9.851591809428096e-06, "loss": 0.9248, "step": 82 }, { "epoch": 0.11, "grad_norm": 1.2619145491525985, "learning_rate": 9.846456747794526e-06, "loss": 0.9045, "step": 83 }, { "epoch": 0.11, "grad_norm": 1.2192461206205478, "learning_rate": 9.841235736179002e-06, "loss": 0.9009, "step": 84 }, { "epoch": 0.11, "grad_norm": 1.2543181200558176, "learning_rate": 9.83592886717409e-06, "loss": 0.8777, "step": 85 }, { "epoch": 0.11, "grad_norm": 1.2385868215899136, "learning_rate": 9.830536234894996e-06, "loss": 0.9023, "step": 86 }, { "epoch": 0.11, "grad_norm": 1.2899363487484155, "learning_rate": 9.825057934977912e-06, "loss": 0.9033, "step": 87 }, { "epoch": 0.11, "grad_norm": 1.4391428437617175, "learning_rate": 9.819494064578305e-06, "loss": 0.8457, "step": 88 }, { "epoch": 0.12, "grad_norm": 1.3299900370660194, "learning_rate": 9.813844722369204e-06, "loss": 0.8632, "step": 89 }, { "epoch": 0.12, "grad_norm": 1.174244071731833, "learning_rate": 9.808110008539441e-06, "loss": 0.8913, "step": 90 }, { "epoch": 0.12, "grad_norm": 1.266275549208671, "learning_rate": 9.80229002479189e-06, "loss": 0.8955, "step": 91 }, { "epoch": 0.12, "grad_norm": 1.296325653451658, "learning_rate": 9.796384874341643e-06, "loss": 0.8731, "step": 92 }, { "epoch": 0.12, "grad_norm": 1.264882203381404, "learning_rate": 9.790394661914194e-06, "loss": 0.8788, "step": 93 }, { "epoch": 0.12, "grad_norm": 1.0978252424854067, "learning_rate": 9.784319493743576e-06, "loss": 0.8415, "step": 94 }, { "epoch": 0.12, "grad_norm": 1.2829083314019198, "learning_rate": 9.778159477570483e-06, "loss": 0.9018, "step": 95 }, { "epoch": 0.12, "grad_norm": 1.2586291926807105, "learning_rate": 9.771914722640345e-06, "loss": 0.9072, "step": 96 }, { "epoch": 0.13, "grad_norm": 1.1796963405145942, "learning_rate": 9.76558533970141e-06, "loss": 0.8726, "step": 97 }, { "epoch": 0.13, "grad_norm": 1.306469482808402, "learning_rate": 9.759171441002766e-06, "loss": 0.9025, "step": 98 }, { "epoch": 0.13, "grad_norm": 1.2313395789612747, "learning_rate": 9.75267314029235e-06, "loss": 0.8555, "step": 99 }, { "epoch": 0.13, "grad_norm": 1.3784125721886025, "learning_rate": 9.746090552814944e-06, "loss": 0.8959, "step": 100 }, { "epoch": 0.13, "grad_norm": 1.3345787723484401, "learning_rate": 9.739423795310115e-06, "loss": 0.8818, "step": 101 }, { "epoch": 0.13, "grad_norm": 1.2453323910999627, "learning_rate": 9.732672986010157e-06, "loss": 0.9028, "step": 102 }, { "epoch": 0.13, "grad_norm": 1.2850929631258812, "learning_rate": 9.725838244637982e-06, "loss": 0.8962, "step": 103 }, { "epoch": 0.14, "grad_norm": 1.257323570554658, "learning_rate": 9.718919692405014e-06, "loss": 0.8679, "step": 104 }, { "epoch": 0.14, "grad_norm": 1.1908601963221823, "learning_rate": 9.711917452009021e-06, "loss": 0.9098, "step": 105 }, { "epoch": 0.14, "grad_norm": 1.4435199482611327, "learning_rate": 9.704831647631951e-06, "loss": 0.8695, "step": 106 }, { "epoch": 0.14, "grad_norm": 1.2562296479791273, "learning_rate": 9.697662404937724e-06, "loss": 0.9202, "step": 107 }, { "epoch": 0.14, "grad_norm": 1.2710317483643663, "learning_rate": 9.690409851070009e-06, "loss": 0.9095, "step": 108 }, { "epoch": 0.14, "grad_norm": 1.0364322803097805, "learning_rate": 9.68307411464996e-06, "loss": 0.8897, "step": 109 }, { "epoch": 0.14, "grad_norm": 1.7483739299613759, "learning_rate": 9.675655325773943e-06, "loss": 0.872, "step": 110 }, { "epoch": 0.14, "grad_norm": 1.2668959871346073, "learning_rate": 9.66815361601123e-06, "loss": 0.905, "step": 111 }, { "epoch": 0.15, "grad_norm": 1.1308633342373222, "learning_rate": 9.660569118401656e-06, "loss": 0.9043, "step": 112 }, { "epoch": 0.15, "grad_norm": 1.272143699055615, "learning_rate": 9.65290196745327e-06, "loss": 0.8669, "step": 113 }, { "epoch": 0.15, "grad_norm": 1.5486854453771295, "learning_rate": 9.64515229913994e-06, "loss": 0.8908, "step": 114 }, { "epoch": 0.15, "grad_norm": 1.3773680680248115, "learning_rate": 9.637320250898953e-06, "loss": 0.8752, "step": 115 }, { "epoch": 0.15, "grad_norm": 1.4237537841409098, "learning_rate": 9.629405961628568e-06, "loss": 0.9257, "step": 116 }, { "epoch": 0.15, "grad_norm": 1.162885378975861, "learning_rate": 9.621409571685555e-06, "loss": 0.8581, "step": 117 }, { "epoch": 0.15, "grad_norm": 1.4262660718320415, "learning_rate": 9.61333122288271e-06, "loss": 0.8929, "step": 118 }, { "epoch": 0.15, "grad_norm": 1.3321707519752775, "learning_rate": 9.605171058486329e-06, "loss": 0.8715, "step": 119 }, { "epoch": 0.16, "grad_norm": 1.220589676110642, "learning_rate": 9.596929223213685e-06, "loss": 0.9275, "step": 120 }, { "epoch": 0.16, "grad_norm": 1.8226230069534708, "learning_rate": 9.588605863230447e-06, "loss": 0.8913, "step": 121 }, { "epoch": 0.16, "grad_norm": 1.1686577144464307, "learning_rate": 9.58020112614809e-06, "loss": 0.8661, "step": 122 }, { "epoch": 0.16, "grad_norm": 1.4726816440981407, "learning_rate": 9.571715161021285e-06, "loss": 0.8741, "step": 123 }, { "epoch": 0.16, "grad_norm": 1.1893010742558143, "learning_rate": 9.563148118345242e-06, "loss": 0.8963, "step": 124 }, { "epoch": 0.16, "grad_norm": 1.1341800604331242, "learning_rate": 9.55450015005306e-06, "loss": 0.8872, "step": 125 }, { "epoch": 0.16, "grad_norm": 1.0555002605396062, "learning_rate": 9.545771409513012e-06, "loss": 0.8417, "step": 126 }, { "epoch": 0.16, "grad_norm": 1.2311976830107538, "learning_rate": 9.536962051525837e-06, "loss": 0.8598, "step": 127 }, { "epoch": 0.17, "grad_norm": 1.226898171357091, "learning_rate": 9.528072232321996e-06, "loss": 0.8893, "step": 128 }, { "epoch": 0.17, "grad_norm": 1.2603976581878822, "learning_rate": 9.519102109558893e-06, "loss": 0.8824, "step": 129 }, { "epoch": 0.17, "grad_norm": 1.188060037369501, "learning_rate": 9.510051842318089e-06, "loss": 0.8809, "step": 130 }, { "epoch": 0.17, "grad_norm": 1.2452630639374425, "learning_rate": 9.50092159110247e-06, "loss": 0.8778, "step": 131 }, { "epoch": 0.17, "grad_norm": 1.344515264849714, "learning_rate": 9.49171151783341e-06, "loss": 0.8657, "step": 132 }, { "epoch": 0.17, "grad_norm": 1.2874467819022641, "learning_rate": 9.48242178584789e-06, "loss": 0.8662, "step": 133 }, { "epoch": 0.17, "grad_norm": 1.1720764172369453, "learning_rate": 9.473052559895615e-06, "loss": 0.8398, "step": 134 }, { "epoch": 0.18, "grad_norm": 1.2724997102438147, "learning_rate": 9.463604006136076e-06, "loss": 0.8691, "step": 135 }, { "epoch": 0.18, "grad_norm": 1.2157718055799869, "learning_rate": 9.454076292135615e-06, "loss": 0.8966, "step": 136 }, { "epoch": 0.18, "grad_norm": 1.2041736734636745, "learning_rate": 9.44446958686445e-06, "loss": 0.8315, "step": 137 }, { "epoch": 0.18, "grad_norm": 1.2873000337357716, "learning_rate": 9.434784060693671e-06, "loss": 0.8387, "step": 138 }, { "epoch": 0.18, "grad_norm": 1.1238231039137265, "learning_rate": 9.425019885392238e-06, "loss": 0.9066, "step": 139 }, { "epoch": 0.18, "grad_norm": 1.3605721992080366, "learning_rate": 9.41517723412391e-06, "loss": 0.9199, "step": 140 }, { "epoch": 0.18, "grad_norm": 1.2105376013486355, "learning_rate": 9.405256281444192e-06, "loss": 0.8621, "step": 141 }, { "epoch": 0.18, "grad_norm": 1.1508108761585434, "learning_rate": 9.395257203297232e-06, "loss": 0.8725, "step": 142 }, { "epoch": 0.19, "grad_norm": 1.3578283923073486, "learning_rate": 9.385180177012703e-06, "loss": 0.9158, "step": 143 }, { "epoch": 0.19, "grad_norm": 1.1901249895249055, "learning_rate": 9.375025381302656e-06, "loss": 0.8794, "step": 144 }, { "epoch": 0.19, "grad_norm": 1.2932679790426476, "learning_rate": 9.36479299625835e-06, "loss": 0.8719, "step": 145 }, { "epoch": 0.19, "grad_norm": 1.1658450769719235, "learning_rate": 9.354483203347066e-06, "loss": 0.9041, "step": 146 }, { "epoch": 0.19, "grad_norm": 1.1074333464226818, "learning_rate": 9.344096185408875e-06, "loss": 0.9061, "step": 147 }, { "epoch": 0.19, "grad_norm": 1.3904132378009597, "learning_rate": 9.333632126653412e-06, "loss": 0.8168, "step": 148 }, { "epoch": 0.19, "grad_norm": 1.3016771798542626, "learning_rate": 9.323091212656589e-06, "loss": 0.9129, "step": 149 }, { "epoch": 0.19, "grad_norm": 1.3787628224004695, "learning_rate": 9.312473630357326e-06, "loss": 0.8934, "step": 150 }, { "epoch": 0.2, "grad_norm": 1.225566612536322, "learning_rate": 9.301779568054219e-06, "loss": 0.8483, "step": 151 }, { "epoch": 0.2, "grad_norm": 1.2125198119803513, "learning_rate": 9.291009215402204e-06, "loss": 0.8858, "step": 152 }, { "epoch": 0.2, "grad_norm": 1.1519000629696743, "learning_rate": 9.280162763409207e-06, "loss": 0.8435, "step": 153 }, { "epoch": 0.2, "grad_norm": 1.1552307028646323, "learning_rate": 9.269240404432732e-06, "loss": 0.852, "step": 154 }, { "epoch": 0.2, "grad_norm": 1.2118496637645362, "learning_rate": 9.258242332176473e-06, "loss": 0.8951, "step": 155 }, { "epoch": 0.2, "grad_norm": 1.3963015983530243, "learning_rate": 9.247168741686863e-06, "loss": 0.8546, "step": 156 }, { "epoch": 0.2, "grad_norm": 1.309312245402273, "learning_rate": 9.236019829349623e-06, "loss": 0.8902, "step": 157 }, { "epoch": 0.21, "grad_norm": 1.3032791306057538, "learning_rate": 9.224795792886276e-06, "loss": 0.8645, "step": 158 }, { "epoch": 0.21, "grad_norm": 1.2500412045624514, "learning_rate": 9.213496831350647e-06, "loss": 0.8514, "step": 159 }, { "epoch": 0.21, "grad_norm": 1.502113914941289, "learning_rate": 9.202123145125318e-06, "loss": 0.8812, "step": 160 }, { "epoch": 0.21, "grad_norm": 1.179178003711897, "learning_rate": 9.190674935918092e-06, "loss": 0.8585, "step": 161 }, { "epoch": 0.21, "grad_norm": 1.273869159733753, "learning_rate": 9.1791524067584e-06, "loss": 0.8649, "step": 162 }, { "epoch": 0.21, "grad_norm": 1.2314877134912634, "learning_rate": 9.167555761993716e-06, "loss": 0.8649, "step": 163 }, { "epoch": 0.21, "grad_norm": 1.3622037711073158, "learning_rate": 9.155885207285919e-06, "loss": 0.8668, "step": 164 }, { "epoch": 0.21, "grad_norm": 1.2546049124533816, "learning_rate": 9.14414094960765e-06, "loss": 0.8182, "step": 165 }, { "epoch": 0.22, "grad_norm": 2.113434892192804, "learning_rate": 9.132323197238649e-06, "loss": 0.859, "step": 166 }, { "epoch": 0.22, "grad_norm": 1.7423410683870517, "learning_rate": 9.120432159762051e-06, "loss": 0.9227, "step": 167 }, { "epoch": 0.22, "grad_norm": 1.1368796798921579, "learning_rate": 9.108468048060675e-06, "loss": 0.8546, "step": 168 }, { "epoch": 0.22, "grad_norm": 1.224122669035051, "learning_rate": 9.096431074313278e-06, "loss": 0.8319, "step": 169 }, { "epoch": 0.22, "grad_norm": 1.3637642569977657, "learning_rate": 9.084321451990804e-06, "loss": 0.884, "step": 170 }, { "epoch": 0.22, "grad_norm": 1.3239374587315518, "learning_rate": 9.072139395852582e-06, "loss": 0.8418, "step": 171 }, { "epoch": 0.22, "grad_norm": 1.18034938438751, "learning_rate": 9.059885121942533e-06, "loss": 0.8471, "step": 172 }, { "epoch": 0.22, "grad_norm": 1.2432620846129294, "learning_rate": 9.04755884758533e-06, "loss": 0.895, "step": 173 }, { "epoch": 0.23, "grad_norm": 1.2124450178376394, "learning_rate": 9.03516079138254e-06, "loss": 0.8576, "step": 174 }, { "epoch": 0.23, "grad_norm": 1.2905752914519677, "learning_rate": 9.022691173208759e-06, "loss": 0.836, "step": 175 }, { "epoch": 0.23, "grad_norm": 1.1768633931424846, "learning_rate": 9.010150214207704e-06, "loss": 0.8324, "step": 176 }, { "epoch": 0.23, "grad_norm": 1.3781750954365992, "learning_rate": 8.997538136788291e-06, "loss": 0.8426, "step": 177 }, { "epoch": 0.23, "grad_norm": 1.230640779663414, "learning_rate": 8.984855164620694e-06, "loss": 0.8679, "step": 178 }, { "epoch": 0.23, "grad_norm": 1.2255727503119238, "learning_rate": 8.97210152263238e-06, "loss": 0.85, "step": 179 }, { "epoch": 0.23, "grad_norm": 1.3100217977587998, "learning_rate": 8.959277437004114e-06, "loss": 0.89, "step": 180 }, { "epoch": 0.24, "grad_norm": 1.3085937284787819, "learning_rate": 8.94638313516595e-06, "loss": 0.8748, "step": 181 }, { "epoch": 0.24, "grad_norm": 1.10287922354063, "learning_rate": 8.933418845793202e-06, "loss": 0.8553, "step": 182 }, { "epoch": 0.24, "grad_norm": 1.27133597219518, "learning_rate": 8.920384798802384e-06, "loss": 0.8757, "step": 183 }, { "epoch": 0.24, "grad_norm": 1.6520106528813114, "learning_rate": 8.907281225347134e-06, "loss": 0.8242, "step": 184 }, { "epoch": 0.24, "grad_norm": 1.2525684796940382, "learning_rate": 8.894108357814107e-06, "loss": 0.8834, "step": 185 }, { "epoch": 0.24, "grad_norm": 1.2578480714177394, "learning_rate": 8.880866429818873e-06, "loss": 0.8633, "step": 186 }, { "epoch": 0.24, "grad_norm": 1.723729927706631, "learning_rate": 8.867555676201753e-06, "loss": 0.8565, "step": 187 }, { "epoch": 0.24, "grad_norm": 1.5654752498240772, "learning_rate": 8.85417633302367e-06, "loss": 0.875, "step": 188 }, { "epoch": 0.25, "grad_norm": 1.1660030377875041, "learning_rate": 8.840728637561947e-06, "loss": 0.8172, "step": 189 }, { "epoch": 0.25, "grad_norm": 1.304332990745047, "learning_rate": 8.827212828306111e-06, "loss": 0.8593, "step": 190 }, { "epoch": 0.25, "grad_norm": 1.5873586326273417, "learning_rate": 8.813629144953666e-06, "loss": 0.8656, "step": 191 }, { "epoch": 0.25, "grad_norm": 1.173378431318532, "learning_rate": 8.799977828405826e-06, "loss": 0.8444, "step": 192 }, { "epoch": 0.25, "grad_norm": 1.3871104053810464, "learning_rate": 8.786259120763263e-06, "loss": 0.8551, "step": 193 }, { "epoch": 0.25, "grad_norm": 1.2616067480138016, "learning_rate": 8.772473265321794e-06, "loss": 0.8798, "step": 194 }, { "epoch": 0.25, "grad_norm": 1.180462258223769, "learning_rate": 8.758620506568084e-06, "loss": 0.8514, "step": 195 }, { "epoch": 0.25, "grad_norm": 1.1530272269450472, "learning_rate": 8.74470109017529e-06, "loss": 0.8726, "step": 196 }, { "epoch": 0.26, "grad_norm": 1.1563346325065118, "learning_rate": 8.730715262998733e-06, "loss": 0.8617, "step": 197 }, { "epoch": 0.26, "grad_norm": 1.2336725498438685, "learning_rate": 8.716663273071484e-06, "loss": 0.814, "step": 198 }, { "epoch": 0.26, "grad_norm": 1.2605889115541364, "learning_rate": 8.702545369599997e-06, "loss": 0.8588, "step": 199 }, { "epoch": 0.26, "grad_norm": 1.19906305613824, "learning_rate": 8.688361802959673e-06, "loss": 0.8849, "step": 200 }, { "epoch": 0.26, "grad_norm": 1.1538706074366336, "learning_rate": 8.674112824690419e-06, "loss": 0.8267, "step": 201 }, { "epoch": 0.26, "grad_norm": 1.169788765403587, "learning_rate": 8.659798687492199e-06, "loss": 0.8593, "step": 202 }, { "epoch": 0.26, "grad_norm": 1.3244594230863784, "learning_rate": 8.645419645220538e-06, "loss": 0.8348, "step": 203 }, { "epoch": 0.26, "grad_norm": 1.1732992626263374, "learning_rate": 8.630975952882027e-06, "loss": 0.8246, "step": 204 }, { "epoch": 0.27, "grad_norm": 1.297363781740773, "learning_rate": 8.616467866629808e-06, "loss": 0.835, "step": 205 }, { "epoch": 0.27, "grad_norm": 1.7609709518413195, "learning_rate": 8.601895643759014e-06, "loss": 0.8755, "step": 206 }, { "epoch": 0.27, "grad_norm": 1.5862887040904983, "learning_rate": 8.58725954270222e-06, "loss": 0.8726, "step": 207 }, { "epoch": 0.27, "grad_norm": 1.1935398277338376, "learning_rate": 8.572559823024853e-06, "loss": 0.866, "step": 208 }, { "epoch": 0.27, "grad_norm": 1.1834450572696433, "learning_rate": 8.557796745420592e-06, "loss": 0.8614, "step": 209 }, { "epoch": 0.27, "grad_norm": 1.1878168847959716, "learning_rate": 8.542970571706748e-06, "loss": 0.8799, "step": 210 }, { "epoch": 0.27, "grad_norm": 1.354522490073717, "learning_rate": 8.528081564819608e-06, "loss": 0.8531, "step": 211 }, { "epoch": 0.28, "grad_norm": 1.317765104330031, "learning_rate": 8.513129988809787e-06, "loss": 0.8459, "step": 212 }, { "epoch": 0.28, "grad_norm": 1.3118174417979898, "learning_rate": 8.498116108837533e-06, "loss": 0.8922, "step": 213 }, { "epoch": 0.28, "grad_norm": 1.1131070658330877, "learning_rate": 8.483040191168037e-06, "loss": 0.8812, "step": 214 }, { "epoch": 0.28, "grad_norm": 1.2336386228496021, "learning_rate": 8.467902503166698e-06, "loss": 0.8282, "step": 215 }, { "epoch": 0.28, "grad_norm": 1.4586312090220346, "learning_rate": 8.45270331329439e-06, "loss": 0.8635, "step": 216 }, { "epoch": 0.28, "grad_norm": 1.3656966584287829, "learning_rate": 8.437442891102696e-06, "loss": 0.8877, "step": 217 }, { "epoch": 0.28, "grad_norm": 1.3937380322780935, "learning_rate": 8.42212150722913e-06, "loss": 0.8282, "step": 218 }, { "epoch": 0.28, "grad_norm": 1.22224963815494, "learning_rate": 8.406739433392343e-06, "loss": 0.8424, "step": 219 }, { "epoch": 0.29, "grad_norm": 1.6065974227695905, "learning_rate": 8.391296942387293e-06, "loss": 0.8572, "step": 220 }, { "epoch": 0.29, "grad_norm": 1.3023080745688278, "learning_rate": 8.37579430808041e-06, "loss": 0.8362, "step": 221 }, { "epoch": 0.29, "grad_norm": 1.2324977420758008, "learning_rate": 8.360231805404745e-06, "loss": 0.8589, "step": 222 }, { "epoch": 0.29, "grad_norm": 1.4254937288107534, "learning_rate": 8.344609710355092e-06, "loss": 0.8644, "step": 223 }, { "epoch": 0.29, "grad_norm": 1.2762189341727412, "learning_rate": 8.32892829998309e-06, "loss": 0.8759, "step": 224 }, { "epoch": 0.29, "grad_norm": 1.212162649007418, "learning_rate": 8.313187852392314e-06, "loss": 0.8318, "step": 225 }, { "epoch": 0.29, "grad_norm": 1.2879599902194216, "learning_rate": 8.297388646733335e-06, "loss": 0.8668, "step": 226 }, { "epoch": 0.29, "grad_norm": 1.2284347722023181, "learning_rate": 8.281530963198782e-06, "loss": 0.8455, "step": 227 }, { "epoch": 0.3, "grad_norm": 1.3556686197816876, "learning_rate": 8.26561508301836e-06, "loss": 0.8212, "step": 228 }, { "epoch": 0.3, "grad_norm": 1.391929292166319, "learning_rate": 8.249641288453872e-06, "loss": 0.8788, "step": 229 }, { "epoch": 0.3, "grad_norm": 3.4586892421492013, "learning_rate": 8.23360986279421e-06, "loss": 0.8261, "step": 230 }, { "epoch": 0.3, "grad_norm": 1.2170746219562474, "learning_rate": 8.217521090350326e-06, "loss": 0.8421, "step": 231 }, { "epoch": 0.3, "grad_norm": 1.218271614680763, "learning_rate": 8.201375256450198e-06, "loss": 0.883, "step": 232 }, { "epoch": 0.3, "grad_norm": 1.4799501574669076, "learning_rate": 8.185172647433766e-06, "loss": 0.87, "step": 233 }, { "epoch": 0.3, "grad_norm": 1.400079191714797, "learning_rate": 8.168913550647855e-06, "loss": 0.8373, "step": 234 }, { "epoch": 0.31, "grad_norm": 1.1669396201944626, "learning_rate": 8.152598254441076e-06, "loss": 0.847, "step": 235 }, { "epoch": 0.31, "grad_norm": 1.195621110624864, "learning_rate": 8.136227048158716e-06, "loss": 0.8601, "step": 236 }, { "epoch": 0.31, "grad_norm": 1.2953519903155755, "learning_rate": 8.1198002221376e-06, "loss": 0.8441, "step": 237 }, { "epoch": 0.31, "grad_norm": 1.6326150827119306, "learning_rate": 8.103318067700957e-06, "loss": 0.8448, "step": 238 }, { "epoch": 0.31, "grad_norm": 1.0961196527359565, "learning_rate": 8.086780877153233e-06, "loss": 0.8268, "step": 239 }, { "epoch": 0.31, "grad_norm": 1.4247422383868384, "learning_rate": 8.070188943774921e-06, "loss": 0.8115, "step": 240 }, { "epoch": 0.31, "grad_norm": 1.2240799976807206, "learning_rate": 8.053542561817364e-06, "loss": 0.8047, "step": 241 }, { "epoch": 0.31, "grad_norm": 1.1148459295674251, "learning_rate": 8.036842026497515e-06, "loss": 0.7947, "step": 242 }, { "epoch": 0.32, "grad_norm": 1.3046439821028708, "learning_rate": 8.020087633992729e-06, "loss": 0.8596, "step": 243 }, { "epoch": 0.32, "grad_norm": 1.3923522847308203, "learning_rate": 8.003279681435483e-06, "loss": 0.8815, "step": 244 }, { "epoch": 0.32, "grad_norm": 1.279395243287966, "learning_rate": 7.986418466908133e-06, "loss": 0.8218, "step": 245 }, { "epoch": 0.32, "grad_norm": 1.305938895131756, "learning_rate": 7.969504289437607e-06, "loss": 0.8653, "step": 246 }, { "epoch": 0.32, "grad_norm": 1.2194222921731876, "learning_rate": 7.952537448990114e-06, "loss": 0.8413, "step": 247 }, { "epoch": 0.32, "grad_norm": 1.3454506997775046, "learning_rate": 7.935518246465815e-06, "loss": 0.8556, "step": 248 }, { "epoch": 0.32, "grad_norm": 1.2952291235408084, "learning_rate": 7.918446983693498e-06, "loss": 0.869, "step": 249 }, { "epoch": 0.32, "grad_norm": 1.2459907150930951, "learning_rate": 7.901323963425213e-06, "loss": 0.8427, "step": 250 }, { "epoch": 0.33, "grad_norm": 1.2147661517452935, "learning_rate": 7.884149489330912e-06, "loss": 0.832, "step": 251 }, { "epoch": 0.33, "grad_norm": 1.1668831211471047, "learning_rate": 7.866923865993057e-06, "loss": 0.8734, "step": 252 }, { "epoch": 0.33, "grad_norm": 1.4995150707097251, "learning_rate": 7.849647398901227e-06, "loss": 0.8809, "step": 253 }, { "epoch": 0.33, "grad_norm": 1.1424611915270306, "learning_rate": 7.832320394446688e-06, "loss": 0.8384, "step": 254 }, { "epoch": 0.33, "grad_norm": 1.2621218740072504, "learning_rate": 7.814943159916974e-06, "loss": 0.8465, "step": 255 }, { "epoch": 0.33, "grad_norm": 1.273110955180023, "learning_rate": 7.797516003490421e-06, "loss": 0.8253, "step": 256 }, { "epoch": 0.33, "grad_norm": 1.3313009954548312, "learning_rate": 7.780039234230714e-06, "loss": 0.8794, "step": 257 }, { "epoch": 0.34, "grad_norm": 1.5759780161169947, "learning_rate": 7.762513162081402e-06, "loss": 0.8649, "step": 258 }, { "epoch": 0.34, "grad_norm": 1.812796559030521, "learning_rate": 7.7449380978604e-06, "loss": 0.8065, "step": 259 }, { "epoch": 0.34, "grad_norm": 1.3596453509876505, "learning_rate": 7.727314353254482e-06, "loss": 0.8655, "step": 260 }, { "epoch": 0.34, "grad_norm": 1.1898419559244204, "learning_rate": 7.709642240813742e-06, "loss": 0.8415, "step": 261 }, { "epoch": 0.34, "grad_norm": 1.3500754898160217, "learning_rate": 7.691922073946063e-06, "loss": 0.853, "step": 262 }, { "epoch": 0.34, "grad_norm": 1.1504817003231094, "learning_rate": 7.674154166911553e-06, "loss": 0.8793, "step": 263 }, { "epoch": 0.34, "grad_norm": 1.2590688573293491, "learning_rate": 7.656338834816976e-06, "loss": 0.8715, "step": 264 }, { "epoch": 0.34, "grad_norm": 1.2651292489923993, "learning_rate": 7.638476393610155e-06, "loss": 0.8388, "step": 265 }, { "epoch": 0.35, "grad_norm": 1.3571956680408448, "learning_rate": 7.620567160074377e-06, "loss": 0.8849, "step": 266 }, { "epoch": 0.35, "grad_norm": 1.552153053502718, "learning_rate": 7.602611451822775e-06, "loss": 0.8586, "step": 267 }, { "epoch": 0.35, "grad_norm": 1.5020758017980491, "learning_rate": 7.584609587292686e-06, "loss": 0.8817, "step": 268 }, { "epoch": 0.35, "grad_norm": 1.329746891781287, "learning_rate": 7.566561885740019e-06, "loss": 0.8723, "step": 269 }, { "epoch": 0.35, "grad_norm": 1.1578631093841143, "learning_rate": 7.548468667233576e-06, "loss": 0.8455, "step": 270 }, { "epoch": 0.35, "grad_norm": 1.1032924408612441, "learning_rate": 7.5303302526493894e-06, "loss": 0.8342, "step": 271 }, { "epoch": 0.35, "grad_norm": 1.564083216357106, "learning_rate": 7.512146963665023e-06, "loss": 0.8263, "step": 272 }, { "epoch": 0.35, "grad_norm": 1.2052297883957035, "learning_rate": 7.493919122753873e-06, "loss": 0.8385, "step": 273 }, { "epoch": 0.36, "grad_norm": 1.1808734641861955, "learning_rate": 7.475647053179444e-06, "loss": 0.8514, "step": 274 }, { "epoch": 0.36, "grad_norm": 1.401160272277566, "learning_rate": 7.457331078989619e-06, "loss": 0.8467, "step": 275 }, { "epoch": 0.36, "grad_norm": 1.1227772209522688, "learning_rate": 7.438971525010914e-06, "loss": 0.8692, "step": 276 }, { "epoch": 0.36, "grad_norm": 1.370076448447391, "learning_rate": 7.420568716842711e-06, "loss": 0.8432, "step": 277 }, { "epoch": 0.36, "grad_norm": 1.244630228546123, "learning_rate": 7.402122980851491e-06, "loss": 0.8583, "step": 278 }, { "epoch": 0.36, "grad_norm": 1.27669216892998, "learning_rate": 7.383634644165041e-06, "loss": 0.8712, "step": 279 }, { "epoch": 0.36, "grad_norm": 1.553980990671672, "learning_rate": 7.365104034666657e-06, "loss": 0.8197, "step": 280 }, { "epoch": 0.36, "grad_norm": 1.187131952435461, "learning_rate": 7.346531480989325e-06, "loss": 0.8434, "step": 281 }, { "epoch": 0.37, "grad_norm": 1.8707182730243352, "learning_rate": 7.327917312509893e-06, "loss": 0.847, "step": 282 }, { "epoch": 0.37, "grad_norm": 1.4802799795263466, "learning_rate": 7.309261859343233e-06, "loss": 0.8184, "step": 283 }, { "epoch": 0.37, "grad_norm": 1.2194638672966402, "learning_rate": 7.290565452336382e-06, "loss": 0.8264, "step": 284 }, { "epoch": 0.37, "grad_norm": 1.228307425661477, "learning_rate": 7.27182842306268e-06, "loss": 0.8445, "step": 285 }, { "epoch": 0.37, "grad_norm": 1.330130601732375, "learning_rate": 7.253051103815887e-06, "loss": 0.8487, "step": 286 }, { "epoch": 0.37, "grad_norm": 1.2351793793938697, "learning_rate": 7.234233827604285e-06, "loss": 0.8315, "step": 287 }, { "epoch": 0.37, "grad_norm": 1.4996469832250112, "learning_rate": 7.215376928144783e-06, "loss": 0.8522, "step": 288 }, { "epoch": 0.38, "grad_norm": 1.2552008111918165, "learning_rate": 7.196480739856988e-06, "loss": 0.8163, "step": 289 }, { "epoch": 0.38, "grad_norm": 1.502543788757623, "learning_rate": 7.177545597857279e-06, "loss": 0.8441, "step": 290 }, { "epoch": 0.38, "grad_norm": 1.494129590625939, "learning_rate": 7.158571837952867e-06, "loss": 0.8256, "step": 291 }, { "epoch": 0.38, "grad_norm": 1.1732463644816806, "learning_rate": 7.139559796635833e-06, "loss": 0.8545, "step": 292 }, { "epoch": 0.38, "grad_norm": 1.2940044086310112, "learning_rate": 7.120509811077164e-06, "loss": 0.8436, "step": 293 }, { "epoch": 0.38, "grad_norm": 2.587490092215707, "learning_rate": 7.101422219120774e-06, "loss": 0.8492, "step": 294 }, { "epoch": 0.38, "grad_norm": 1.2525502879285224, "learning_rate": 7.082297359277513e-06, "loss": 0.8355, "step": 295 }, { "epoch": 0.38, "grad_norm": 1.2154301868677129, "learning_rate": 7.0631355707191575e-06, "loss": 0.864, "step": 296 }, { "epoch": 0.39, "grad_norm": 1.509297691003082, "learning_rate": 7.043937193272405e-06, "loss": 0.8535, "step": 297 }, { "epoch": 0.39, "grad_norm": 1.560282341886913, "learning_rate": 7.024702567412839e-06, "loss": 0.8415, "step": 298 }, { "epoch": 0.39, "grad_norm": 1.215819914432597, "learning_rate": 7.0054320342588954e-06, "loss": 0.8307, "step": 299 }, { "epoch": 0.39, "grad_norm": 1.4363490411881552, "learning_rate": 6.986125935565813e-06, "loss": 0.8635, "step": 300 }, { "epoch": 0.39, "grad_norm": 1.252680452931007, "learning_rate": 6.966784613719568e-06, "loss": 0.8187, "step": 301 }, { "epoch": 0.39, "grad_norm": 1.147759930914122, "learning_rate": 6.94740841173081e-06, "loss": 0.855, "step": 302 }, { "epoch": 0.39, "grad_norm": 1.2481611069144203, "learning_rate": 6.927997673228766e-06, "loss": 0.88, "step": 303 }, { "epoch": 0.39, "grad_norm": 1.1605598358791287, "learning_rate": 6.908552742455167e-06, "loss": 0.8238, "step": 304 }, { "epoch": 0.4, "grad_norm": 1.150740940595073, "learning_rate": 6.889073964258116e-06, "loss": 0.8416, "step": 305 }, { "epoch": 0.4, "grad_norm": 1.1875419366249447, "learning_rate": 6.869561684085998e-06, "loss": 0.861, "step": 306 }, { "epoch": 0.4, "grad_norm": 1.117161313240673, "learning_rate": 6.850016247981335e-06, "loss": 0.8187, "step": 307 }, { "epoch": 0.4, "grad_norm": 1.178563333637316, "learning_rate": 6.83043800257466e-06, "loss": 0.8637, "step": 308 }, { "epoch": 0.4, "grad_norm": 1.4846187498958823, "learning_rate": 6.810827295078365e-06, "loss": 0.8084, "step": 309 }, { "epoch": 0.4, "grad_norm": 1.2242229357089285, "learning_rate": 6.791184473280542e-06, "loss": 0.8452, "step": 310 }, { "epoch": 0.4, "grad_norm": 1.3028754268878384, "learning_rate": 6.771509885538823e-06, "loss": 0.8158, "step": 311 }, { "epoch": 0.41, "grad_norm": 1.1123018120100558, "learning_rate": 6.7518038807741915e-06, "loss": 0.8729, "step": 312 }, { "epoch": 0.41, "grad_norm": 1.1932793058105855, "learning_rate": 6.7320668084648e-06, "loss": 0.8522, "step": 313 }, { "epoch": 0.41, "grad_norm": 1.1640197426308538, "learning_rate": 6.712299018639772e-06, "loss": 0.8811, "step": 314 }, { "epoch": 0.41, "grad_norm": 1.2756920346871423, "learning_rate": 6.692500861872996e-06, "loss": 0.8499, "step": 315 }, { "epoch": 0.41, "grad_norm": 1.2063462879314655, "learning_rate": 6.672672689276902e-06, "loss": 0.8401, "step": 316 }, { "epoch": 0.41, "grad_norm": 1.4598555490832712, "learning_rate": 6.652814852496242e-06, "loss": 0.8271, "step": 317 }, { "epoch": 0.41, "grad_norm": 1.0938343281591207, "learning_rate": 6.6329277037018505e-06, "loss": 0.8206, "step": 318 }, { "epoch": 0.41, "grad_norm": 1.225705583990496, "learning_rate": 6.6130115955843975e-06, "loss": 0.862, "step": 319 }, { "epoch": 0.42, "grad_norm": 1.2180868955960955, "learning_rate": 6.593066881348133e-06, "loss": 0.8253, "step": 320 }, { "epoch": 0.42, "grad_norm": 1.213674446057375, "learning_rate": 6.573093914704633e-06, "loss": 0.833, "step": 321 }, { "epoch": 0.42, "grad_norm": 1.2175598412319608, "learning_rate": 6.553093049866509e-06, "loss": 0.863, "step": 322 }, { "epoch": 0.42, "grad_norm": 1.8426677722391969, "learning_rate": 6.533064641541142e-06, "loss": 0.8585, "step": 323 }, { "epoch": 0.42, "grad_norm": 1.1805886991608463, "learning_rate": 6.513009044924384e-06, "loss": 0.8604, "step": 324 }, { "epoch": 0.42, "grad_norm": 1.1638818327339862, "learning_rate": 6.492926615694262e-06, "loss": 0.8624, "step": 325 }, { "epoch": 0.42, "grad_norm": 1.1190056074940464, "learning_rate": 6.472817710004664e-06, "loss": 0.8318, "step": 326 }, { "epoch": 0.42, "grad_norm": 1.4703765166977123, "learning_rate": 6.452682684479032e-06, "loss": 0.8659, "step": 327 }, { "epoch": 0.43, "grad_norm": 1.1488809794920523, "learning_rate": 6.432521896204035e-06, "loss": 0.8133, "step": 328 }, { "epoch": 0.43, "grad_norm": 1.2077971564958, "learning_rate": 6.412335702723224e-06, "loss": 0.8488, "step": 329 }, { "epoch": 0.43, "grad_norm": 1.335953923852408, "learning_rate": 6.392124462030715e-06, "loss": 0.8209, "step": 330 }, { "epoch": 0.43, "grad_norm": 1.239560657787868, "learning_rate": 6.371888532564817e-06, "loss": 0.8582, "step": 331 }, { "epoch": 0.43, "grad_norm": 1.123443909247595, "learning_rate": 6.351628273201687e-06, "loss": 0.8522, "step": 332 }, { "epoch": 0.43, "grad_norm": 1.1930390364093206, "learning_rate": 6.331344043248961e-06, "loss": 0.8612, "step": 333 }, { "epoch": 0.43, "grad_norm": 1.1651674600359125, "learning_rate": 6.311036202439388e-06, "loss": 0.8141, "step": 334 }, { "epoch": 0.44, "grad_norm": 1.827712401238591, "learning_rate": 6.290705110924442e-06, "loss": 0.8257, "step": 335 }, { "epoch": 0.44, "grad_norm": 1.1730736711077356, "learning_rate": 6.270351129267944e-06, "loss": 0.809, "step": 336 }, { "epoch": 0.44, "grad_norm": 1.2321894607586943, "learning_rate": 6.249974618439657e-06, "loss": 0.865, "step": 337 }, { "epoch": 0.44, "grad_norm": 1.2508419001037108, "learning_rate": 6.229575939808893e-06, "loss": 0.858, "step": 338 }, { "epoch": 0.44, "grad_norm": 1.1664795356123143, "learning_rate": 6.209155455138102e-06, "loss": 0.8473, "step": 339 }, { "epoch": 0.44, "grad_norm": 1.1058463932196927, "learning_rate": 6.188713526576452e-06, "loss": 0.827, "step": 340 }, { "epoch": 0.44, "grad_norm": 1.5948496072571947, "learning_rate": 6.1682505166534134e-06, "loss": 0.8441, "step": 341 }, { "epoch": 0.44, "grad_norm": 1.123050976281229, "learning_rate": 6.1477667882723245e-06, "loss": 0.824, "step": 342 }, { "epoch": 0.45, "grad_norm": 1.1179511468396548, "learning_rate": 6.127262704703956e-06, "loss": 0.8116, "step": 343 }, { "epoch": 0.45, "grad_norm": 1.448611313915091, "learning_rate": 6.106738629580073e-06, "loss": 0.8133, "step": 344 }, { "epoch": 0.45, "grad_norm": 1.4292078314595598, "learning_rate": 6.0861949268869814e-06, "loss": 0.8445, "step": 345 }, { "epoch": 0.45, "grad_norm": 1.1799038394669346, "learning_rate": 6.065631960959072e-06, "loss": 0.8298, "step": 346 }, { "epoch": 0.45, "grad_norm": 1.2343005419584467, "learning_rate": 6.045050096472363e-06, "loss": 0.874, "step": 347 }, { "epoch": 0.45, "grad_norm": 1.133793942853395, "learning_rate": 6.024449698438033e-06, "loss": 0.8373, "step": 348 }, { "epoch": 0.45, "grad_norm": 1.1482697304859235, "learning_rate": 6.003831132195943e-06, "loss": 0.8291, "step": 349 }, { "epoch": 0.45, "grad_norm": 1.0714069634839316, "learning_rate": 5.983194763408161e-06, "loss": 0.8038, "step": 350 }, { "epoch": 0.46, "grad_norm": 1.189866180029149, "learning_rate": 5.962540958052478e-06, "loss": 0.8369, "step": 351 }, { "epoch": 0.46, "grad_norm": 1.151937951000298, "learning_rate": 5.94187008241591e-06, "loss": 0.8724, "step": 352 }, { "epoch": 0.46, "grad_norm": 1.1885630504841458, "learning_rate": 5.921182503088212e-06, "loss": 0.8363, "step": 353 }, { "epoch": 0.46, "grad_norm": 1.2563198905659214, "learning_rate": 5.900478586955374e-06, "loss": 0.8414, "step": 354 }, { "epoch": 0.46, "grad_norm": 1.0903183738957514, "learning_rate": 5.879758701193108e-06, "loss": 0.8104, "step": 355 }, { "epoch": 0.46, "grad_norm": 1.3514303801827983, "learning_rate": 5.8590232132603444e-06, "loss": 0.8723, "step": 356 }, { "epoch": 0.46, "grad_norm": 1.385355867796163, "learning_rate": 5.838272490892708e-06, "loss": 0.8155, "step": 357 }, { "epoch": 0.46, "grad_norm": 1.4230336181646532, "learning_rate": 5.817506902096007e-06, "loss": 0.8227, "step": 358 }, { "epoch": 0.47, "grad_norm": 1.292768981531148, "learning_rate": 5.796726815139695e-06, "loss": 0.8571, "step": 359 }, { "epoch": 0.47, "grad_norm": 1.2735642058681054, "learning_rate": 5.7759325985503435e-06, "loss": 0.8342, "step": 360 }, { "epoch": 0.47, "grad_norm": 1.2086676089354491, "learning_rate": 5.755124621105111e-06, "loss": 0.8496, "step": 361 }, { "epoch": 0.47, "grad_norm": 1.244245262090597, "learning_rate": 5.734303251825198e-06, "loss": 0.8257, "step": 362 }, { "epoch": 0.47, "grad_norm": 1.0803631521753734, "learning_rate": 5.713468859969301e-06, "loss": 0.813, "step": 363 }, { "epoch": 0.47, "grad_norm": 1.1478802532788033, "learning_rate": 5.6926218150270716e-06, "loss": 0.8022, "step": 364 }, { "epoch": 0.47, "grad_norm": 0.9961671906693075, "learning_rate": 5.671762486712557e-06, "loss": 0.8405, "step": 365 }, { "epoch": 0.48, "grad_norm": 1.1541301819630243, "learning_rate": 5.650891244957644e-06, "loss": 0.8289, "step": 366 }, { "epoch": 0.48, "grad_norm": 1.1824673976498992, "learning_rate": 5.630008459905498e-06, "loss": 0.8413, "step": 367 }, { "epoch": 0.48, "grad_norm": 1.2250269994788847, "learning_rate": 5.609114501904006e-06, "loss": 0.8447, "step": 368 }, { "epoch": 0.48, "grad_norm": 1.16055884464047, "learning_rate": 5.588209741499196e-06, "loss": 0.8173, "step": 369 }, { "epoch": 0.48, "grad_norm": 1.1285506194740014, "learning_rate": 5.567294549428678e-06, "loss": 0.8435, "step": 370 }, { "epoch": 0.48, "grad_norm": 1.3319483590214511, "learning_rate": 5.54636929661506e-06, "loss": 0.8393, "step": 371 }, { "epoch": 0.48, "grad_norm": 1.1399581144803144, "learning_rate": 5.525434354159374e-06, "loss": 0.8383, "step": 372 }, { "epoch": 0.48, "grad_norm": 1.2097290183876572, "learning_rate": 5.504490093334493e-06, "loss": 0.8489, "step": 373 }, { "epoch": 0.49, "grad_norm": 4.352780644899712, "learning_rate": 5.48353688557855e-06, "loss": 0.8643, "step": 374 }, { "epoch": 0.49, "grad_norm": 1.2582400293178824, "learning_rate": 5.462575102488348e-06, "loss": 0.805, "step": 375 }, { "epoch": 0.49, "grad_norm": 1.574728499559222, "learning_rate": 5.441605115812767e-06, "loss": 0.8594, "step": 376 }, { "epoch": 0.49, "grad_norm": 1.829195542286078, "learning_rate": 5.420627297446179e-06, "loss": 0.8765, "step": 377 }, { "epoch": 0.49, "grad_norm": 1.0805397201337004, "learning_rate": 5.399642019421844e-06, "loss": 0.8453, "step": 378 }, { "epoch": 0.49, "grad_norm": 1.2382172071093036, "learning_rate": 5.378649653905316e-06, "loss": 0.8332, "step": 379 }, { "epoch": 0.49, "grad_norm": 1.1809812345716155, "learning_rate": 5.357650573187847e-06, "loss": 0.8254, "step": 380 }, { "epoch": 0.49, "grad_norm": 1.1932829048262574, "learning_rate": 5.336645149679775e-06, "loss": 0.8231, "step": 381 }, { "epoch": 0.5, "grad_norm": 1.3006622982723932, "learning_rate": 5.315633755903931e-06, "loss": 0.8341, "step": 382 }, { "epoch": 0.5, "grad_norm": 1.1773771127758201, "learning_rate": 5.294616764489018e-06, "loss": 0.82, "step": 383 }, { "epoch": 0.5, "grad_norm": 1.120358554988103, "learning_rate": 5.27359454816302e-06, "loss": 0.8183, "step": 384 }, { "epoch": 0.5, "grad_norm": 1.1479966317416317, "learning_rate": 5.252567479746577e-06, "loss": 0.8504, "step": 385 }, { "epoch": 0.5, "grad_norm": 1.7373144722386622, "learning_rate": 5.231535932146382e-06, "loss": 0.8293, "step": 386 }, { "epoch": 0.5, "grad_norm": 1.2159912654625296, "learning_rate": 5.210500278348561e-06, "loss": 0.828, "step": 387 }, { "epoch": 0.5, "grad_norm": 1.2134302086400865, "learning_rate": 5.1894608914120635e-06, "loss": 0.8645, "step": 388 }, { "epoch": 0.51, "grad_norm": 1.0591258858274246, "learning_rate": 5.168418144462046e-06, "loss": 0.8164, "step": 389 }, { "epoch": 0.51, "grad_norm": 1.2186717818024067, "learning_rate": 5.147372410683252e-06, "loss": 0.8476, "step": 390 }, { "epoch": 0.51, "grad_norm": 1.1213501657531966, "learning_rate": 5.126324063313397e-06, "loss": 0.8663, "step": 391 }, { "epoch": 0.51, "grad_norm": 1.4491273350649847, "learning_rate": 5.105273475636545e-06, "loss": 0.8525, "step": 392 }, { "epoch": 0.51, "grad_norm": 1.159514917414318, "learning_rate": 5.084221020976491e-06, "loss": 0.8317, "step": 393 }, { "epoch": 0.51, "grad_norm": 1.1877065524083912, "learning_rate": 5.063167072690144e-06, "loss": 0.8363, "step": 394 }, { "epoch": 0.51, "grad_norm": 1.0824757433851597, "learning_rate": 5.042112004160898e-06, "loss": 0.8384, "step": 395 }, { "epoch": 0.51, "grad_norm": 1.1452248714301483, "learning_rate": 5.021056188792014e-06, "loss": 0.8789, "step": 396 }, { "epoch": 0.52, "grad_norm": 1.1364607148991899, "learning_rate": 5e-06, "loss": 0.8524, "step": 397 }, { "epoch": 0.52, "grad_norm": 1.1839720849840152, "learning_rate": 4.978943811207988e-06, "loss": 0.8741, "step": 398 }, { "epoch": 0.52, "grad_norm": 1.5936600484839722, "learning_rate": 4.957887995839104e-06, "loss": 0.8254, "step": 399 }, { "epoch": 0.52, "grad_norm": 1.0926397681862798, "learning_rate": 4.936832927309858e-06, "loss": 0.8252, "step": 400 }, { "epoch": 0.52, "grad_norm": 1.0770992635214238, "learning_rate": 4.915778979023511e-06, "loss": 0.8048, "step": 401 }, { "epoch": 0.52, "grad_norm": 1.3714502182024384, "learning_rate": 4.894726524363456e-06, "loss": 0.8148, "step": 402 }, { "epoch": 0.52, "grad_norm": 1.1328097681910083, "learning_rate": 4.873675936686604e-06, "loss": 0.8155, "step": 403 }, { "epoch": 0.52, "grad_norm": 1.1731809825959303, "learning_rate": 4.852627589316749e-06, "loss": 0.8593, "step": 404 }, { "epoch": 0.53, "grad_norm": 1.049090459083091, "learning_rate": 4.831581855537955e-06, "loss": 0.8239, "step": 405 }, { "epoch": 0.53, "grad_norm": 1.0993193737486686, "learning_rate": 4.810539108587938e-06, "loss": 0.8425, "step": 406 }, { "epoch": 0.53, "grad_norm": 1.1941858463970723, "learning_rate": 4.789499721651441e-06, "loss": 0.8411, "step": 407 }, { "epoch": 0.53, "grad_norm": 1.1456952108338223, "learning_rate": 4.76846406785362e-06, "loss": 0.8074, "step": 408 }, { "epoch": 0.53, "grad_norm": 1.1787915666433677, "learning_rate": 4.747432520253424e-06, "loss": 0.8203, "step": 409 }, { "epoch": 0.53, "grad_norm": 1.3051379948424053, "learning_rate": 4.726405451836982e-06, "loss": 0.8447, "step": 410 }, { "epoch": 0.53, "grad_norm": 1.1233484298047998, "learning_rate": 4.705383235510984e-06, "loss": 0.8301, "step": 411 }, { "epoch": 0.54, "grad_norm": 1.0834579202868906, "learning_rate": 4.684366244096072e-06, "loss": 0.8429, "step": 412 }, { "epoch": 0.54, "grad_norm": 1.1507289567364096, "learning_rate": 4.663354850320226e-06, "loss": 0.8594, "step": 413 }, { "epoch": 0.54, "grad_norm": 1.1908348320197186, "learning_rate": 4.642349426812155e-06, "loss": 0.8214, "step": 414 }, { "epoch": 0.54, "grad_norm": 1.2504470609063638, "learning_rate": 4.621350346094685e-06, "loss": 0.8131, "step": 415 }, { "epoch": 0.54, "grad_norm": 2.0624917538169445, "learning_rate": 4.600357980578158e-06, "loss": 0.8468, "step": 416 }, { "epoch": 0.54, "grad_norm": 1.3297179381863848, "learning_rate": 4.579372702553822e-06, "loss": 0.7982, "step": 417 }, { "epoch": 0.54, "grad_norm": 1.246901494601956, "learning_rate": 4.558394884187234e-06, "loss": 0.8227, "step": 418 }, { "epoch": 0.54, "grad_norm": 1.151150781962948, "learning_rate": 4.537424897511654e-06, "loss": 0.8338, "step": 419 }, { "epoch": 0.55, "grad_norm": 1.1660907114296764, "learning_rate": 4.516463114421452e-06, "loss": 0.8159, "step": 420 }, { "epoch": 0.55, "grad_norm": 1.7766157190258682, "learning_rate": 4.495509906665508e-06, "loss": 0.8345, "step": 421 }, { "epoch": 0.55, "grad_norm": 1.1857385105788216, "learning_rate": 4.474565645840629e-06, "loss": 0.8233, "step": 422 }, { "epoch": 0.55, "grad_norm": 1.2264446822967827, "learning_rate": 4.453630703384942e-06, "loss": 0.8468, "step": 423 }, { "epoch": 0.55, "grad_norm": 1.264976558078766, "learning_rate": 4.432705450571323e-06, "loss": 0.8165, "step": 424 }, { "epoch": 0.55, "grad_norm": 1.1222621762765579, "learning_rate": 4.411790258500805e-06, "loss": 0.8184, "step": 425 }, { "epoch": 0.55, "grad_norm": 1.2233198012545898, "learning_rate": 4.390885498095996e-06, "loss": 0.8601, "step": 426 }, { "epoch": 0.55, "grad_norm": 1.1030451313547371, "learning_rate": 4.369991540094503e-06, "loss": 0.8259, "step": 427 }, { "epoch": 0.56, "grad_norm": 1.2243881638199383, "learning_rate": 4.3491087550423585e-06, "loss": 0.8308, "step": 428 }, { "epoch": 0.56, "grad_norm": 1.2802454455900687, "learning_rate": 4.328237513287444e-06, "loss": 0.8273, "step": 429 }, { "epoch": 0.56, "grad_norm": 1.5883389737605764, "learning_rate": 4.3073781849729276e-06, "loss": 0.793, "step": 430 }, { "epoch": 0.56, "grad_norm": 1.151105984490431, "learning_rate": 4.286531140030699e-06, "loss": 0.7827, "step": 431 }, { "epoch": 0.56, "grad_norm": 1.2218234503282421, "learning_rate": 4.265696748174803e-06, "loss": 0.819, "step": 432 }, { "epoch": 0.56, "grad_norm": 1.140797795358718, "learning_rate": 4.2448753788948895e-06, "loss": 0.8087, "step": 433 }, { "epoch": 0.56, "grad_norm": 1.0760664395492803, "learning_rate": 4.2240674014496565e-06, "loss": 0.8267, "step": 434 }, { "epoch": 0.56, "grad_norm": 1.1139625369896868, "learning_rate": 4.203273184860306e-06, "loss": 0.8008, "step": 435 }, { "epoch": 0.57, "grad_norm": 1.4018290508347282, "learning_rate": 4.1824930979039926e-06, "loss": 0.8546, "step": 436 }, { "epoch": 0.57, "grad_norm": 1.4091864309994824, "learning_rate": 4.161727509107292e-06, "loss": 0.7943, "step": 437 }, { "epoch": 0.57, "grad_norm": 1.1324871046006824, "learning_rate": 4.140976786739658e-06, "loss": 0.7966, "step": 438 }, { "epoch": 0.57, "grad_norm": 1.6224874962550682, "learning_rate": 4.120241298806893e-06, "loss": 0.8261, "step": 439 }, { "epoch": 0.57, "grad_norm": 1.272631877145078, "learning_rate": 4.099521413044627e-06, "loss": 0.7966, "step": 440 }, { "epoch": 0.57, "grad_norm": 1.1425226366031473, "learning_rate": 4.078817496911788e-06, "loss": 0.8261, "step": 441 }, { "epoch": 0.57, "grad_norm": 1.4359985462900144, "learning_rate": 4.058129917584091e-06, "loss": 0.8568, "step": 442 }, { "epoch": 0.58, "grad_norm": 1.0753954087608588, "learning_rate": 4.037459041947523e-06, "loss": 0.8217, "step": 443 }, { "epoch": 0.58, "grad_norm": 1.2692450418319305, "learning_rate": 4.016805236591839e-06, "loss": 0.8673, "step": 444 }, { "epoch": 0.58, "grad_norm": 1.1195139212914398, "learning_rate": 3.996168867804058e-06, "loss": 0.7953, "step": 445 }, { "epoch": 0.58, "grad_norm": 1.3678518854634432, "learning_rate": 3.975550301561968e-06, "loss": 0.8095, "step": 446 }, { "epoch": 0.58, "grad_norm": 1.1569918654905087, "learning_rate": 3.9549499035276375e-06, "loss": 0.8733, "step": 447 }, { "epoch": 0.58, "grad_norm": 1.1854799970605574, "learning_rate": 3.934368039040929e-06, "loss": 0.8126, "step": 448 }, { "epoch": 0.58, "grad_norm": 1.3730103333668784, "learning_rate": 3.9138050731130185e-06, "loss": 0.8309, "step": 449 }, { "epoch": 0.58, "grad_norm": 1.1140616423192409, "learning_rate": 3.893261370419927e-06, "loss": 0.8065, "step": 450 }, { "epoch": 0.59, "grad_norm": 1.4052106203909946, "learning_rate": 3.872737295296044e-06, "loss": 0.8248, "step": 451 }, { "epoch": 0.59, "grad_norm": 1.1758315380501903, "learning_rate": 3.852233211727676e-06, "loss": 0.8342, "step": 452 }, { "epoch": 0.59, "grad_norm": 1.4482783731512796, "learning_rate": 3.8317494833465865e-06, "loss": 0.8264, "step": 453 }, { "epoch": 0.59, "grad_norm": 1.0844929617557844, "learning_rate": 3.811286473423549e-06, "loss": 0.8268, "step": 454 }, { "epoch": 0.59, "grad_norm": 1.152076315782049, "learning_rate": 3.7908445448618992e-06, "loss": 0.8079, "step": 455 }, { "epoch": 0.59, "grad_norm": 1.1935228824138842, "learning_rate": 3.7704240601911075e-06, "loss": 0.8202, "step": 456 }, { "epoch": 0.59, "grad_norm": 1.1813858261394568, "learning_rate": 3.7500253815603442e-06, "loss": 0.8646, "step": 457 }, { "epoch": 0.59, "grad_norm": 1.2716301549560993, "learning_rate": 3.729648870732058e-06, "loss": 0.8167, "step": 458 }, { "epoch": 0.6, "grad_norm": 1.177945879650482, "learning_rate": 3.7092948890755577e-06, "loss": 0.8678, "step": 459 }, { "epoch": 0.6, "grad_norm": 1.214019403562676, "learning_rate": 3.688963797560615e-06, "loss": 0.8327, "step": 460 }, { "epoch": 0.6, "grad_norm": 1.636773329857946, "learning_rate": 3.6686559567510417e-06, "loss": 0.824, "step": 461 }, { "epoch": 0.6, "grad_norm": 1.0666034783382468, "learning_rate": 3.648371726798316e-06, "loss": 0.7909, "step": 462 }, { "epoch": 0.6, "grad_norm": 1.361459612074104, "learning_rate": 3.6281114674351846e-06, "loss": 0.8477, "step": 463 }, { "epoch": 0.6, "grad_norm": 1.6122680059960277, "learning_rate": 3.6078755379692855e-06, "loss": 0.8425, "step": 464 }, { "epoch": 0.6, "grad_norm": 1.1605817366410531, "learning_rate": 3.587664297276776e-06, "loss": 0.8335, "step": 465 }, { "epoch": 0.61, "grad_norm": 1.5046134018346586, "learning_rate": 3.5674781037959683e-06, "loss": 0.7833, "step": 466 }, { "epoch": 0.61, "grad_norm": 1.0563278373051415, "learning_rate": 3.5473173155209694e-06, "loss": 0.799, "step": 467 }, { "epoch": 0.61, "grad_norm": 1.0755240081794408, "learning_rate": 3.527182289995339e-06, "loss": 0.8536, "step": 468 }, { "epoch": 0.61, "grad_norm": 1.1146568468192999, "learning_rate": 3.5070733843057415e-06, "loss": 0.8271, "step": 469 }, { "epoch": 0.61, "grad_norm": 1.2145240314146524, "learning_rate": 3.4869909550756177e-06, "loss": 0.8215, "step": 470 }, { "epoch": 0.61, "grad_norm": 1.1149256639601721, "learning_rate": 3.4669353584588606e-06, "loss": 0.8287, "step": 471 }, { "epoch": 0.61, "grad_norm": 1.2796860456730539, "learning_rate": 3.4469069501334932e-06, "loss": 0.8484, "step": 472 }, { "epoch": 0.61, "grad_norm": 1.073005938552458, "learning_rate": 3.426906085295369e-06, "loss": 0.8355, "step": 473 }, { "epoch": 0.62, "grad_norm": 1.1930321678421913, "learning_rate": 3.4069331186518677e-06, "loss": 0.8197, "step": 474 }, { "epoch": 0.62, "grad_norm": 1.1883434410680984, "learning_rate": 3.3869884044156054e-06, "loss": 0.7895, "step": 475 }, { "epoch": 0.62, "grad_norm": 1.3604734593340317, "learning_rate": 3.3670722962981516e-06, "loss": 0.8288, "step": 476 }, { "epoch": 0.62, "grad_norm": 1.0748441692901816, "learning_rate": 3.3471851475037596e-06, "loss": 0.8449, "step": 477 }, { "epoch": 0.62, "grad_norm": 1.0860864001092179, "learning_rate": 3.3273273107231007e-06, "loss": 0.8468, "step": 478 }, { "epoch": 0.62, "grad_norm": 1.1203049509506295, "learning_rate": 3.3074991381270072e-06, "loss": 0.7999, "step": 479 }, { "epoch": 0.62, "grad_norm": 1.0833871352844642, "learning_rate": 3.28770098136023e-06, "loss": 0.7806, "step": 480 }, { "epoch": 0.62, "grad_norm": 1.143657532263609, "learning_rate": 3.2679331915352023e-06, "loss": 0.8364, "step": 481 }, { "epoch": 0.63, "grad_norm": 1.0808130722425977, "learning_rate": 3.248196119225811e-06, "loss": 0.8162, "step": 482 }, { "epoch": 0.63, "grad_norm": 1.5790710971517254, "learning_rate": 3.228490114461178e-06, "loss": 0.7935, "step": 483 }, { "epoch": 0.63, "grad_norm": 1.2311619644001286, "learning_rate": 3.2088155267194586e-06, "loss": 0.7944, "step": 484 }, { "epoch": 0.63, "grad_norm": 1.282202384930966, "learning_rate": 3.1891727049216375e-06, "loss": 0.8352, "step": 485 }, { "epoch": 0.63, "grad_norm": 1.4793811130434844, "learning_rate": 3.169561997425342e-06, "loss": 0.822, "step": 486 }, { "epoch": 0.63, "grad_norm": 1.1796102209432577, "learning_rate": 3.1499837520186676e-06, "loss": 0.8111, "step": 487 }, { "epoch": 0.63, "grad_norm": 1.1580009886459264, "learning_rate": 3.130438315914005e-06, "loss": 0.8148, "step": 488 }, { "epoch": 0.64, "grad_norm": 1.0446124399556485, "learning_rate": 3.110926035741886e-06, "loss": 0.8328, "step": 489 }, { "epoch": 0.64, "grad_norm": 1.651469788442752, "learning_rate": 3.091447257544836e-06, "loss": 0.8243, "step": 490 }, { "epoch": 0.64, "grad_norm": 1.5532921877403698, "learning_rate": 3.072002326771235e-06, "loss": 0.8522, "step": 491 }, { "epoch": 0.64, "grad_norm": 1.1116055858154035, "learning_rate": 3.0525915882691923e-06, "loss": 0.8214, "step": 492 }, { "epoch": 0.64, "grad_norm": 1.1956196368057803, "learning_rate": 3.0332153862804324e-06, "loss": 0.8314, "step": 493 }, { "epoch": 0.64, "grad_norm": 1.1689114541431895, "learning_rate": 3.0138740644341887e-06, "loss": 0.8838, "step": 494 }, { "epoch": 0.64, "grad_norm": 1.248229372898906, "learning_rate": 2.9945679657411054e-06, "loss": 0.8347, "step": 495 }, { "epoch": 0.64, "grad_norm": 1.1078504742591242, "learning_rate": 2.9752974325871625e-06, "loss": 0.8227, "step": 496 }, { "epoch": 0.65, "grad_norm": 1.1900434139705938, "learning_rate": 2.9560628067275966e-06, "loss": 0.8188, "step": 497 }, { "epoch": 0.65, "grad_norm": 1.3818403864096889, "learning_rate": 2.9368644292808433e-06, "loss": 0.8107, "step": 498 }, { "epoch": 0.65, "grad_norm": 1.149364405276468, "learning_rate": 2.917702640722488e-06, "loss": 0.8319, "step": 499 }, { "epoch": 0.65, "grad_norm": 1.1033046148197456, "learning_rate": 2.898577780879227e-06, "loss": 0.8056, "step": 500 }, { "epoch": 0.65, "grad_norm": 1.4667019536685615, "learning_rate": 2.879490188922837e-06, "loss": 0.8301, "step": 501 }, { "epoch": 0.65, "grad_norm": 1.0975707519773683, "learning_rate": 2.86044020336417e-06, "loss": 0.8436, "step": 502 }, { "epoch": 0.65, "grad_norm": 1.124374118696095, "learning_rate": 2.8414281620471347e-06, "loss": 0.8468, "step": 503 }, { "epoch": 0.65, "grad_norm": 1.858826654639766, "learning_rate": 2.8224544021427234e-06, "loss": 0.8187, "step": 504 }, { "epoch": 0.66, "grad_norm": 1.0707690364920266, "learning_rate": 2.803519260143014e-06, "loss": 0.7986, "step": 505 }, { "epoch": 0.66, "grad_norm": 1.4117752904872918, "learning_rate": 2.784623071855217e-06, "loss": 0.8525, "step": 506 }, { "epoch": 0.66, "grad_norm": 1.127786410455673, "learning_rate": 2.765766172395716e-06, "loss": 0.8042, "step": 507 }, { "epoch": 0.66, "grad_norm": 1.7330464613002825, "learning_rate": 2.746948896184114e-06, "loss": 0.8447, "step": 508 }, { "epoch": 0.66, "grad_norm": 1.2899602920949957, "learning_rate": 2.7281715769373205e-06, "loss": 0.854, "step": 509 }, { "epoch": 0.66, "grad_norm": 1.1424757403756332, "learning_rate": 2.7094345476636185e-06, "loss": 0.8148, "step": 510 }, { "epoch": 0.66, "grad_norm": 1.2199975615104413, "learning_rate": 2.6907381406567696e-06, "loss": 0.8014, "step": 511 }, { "epoch": 0.66, "grad_norm": 1.1194583712399984, "learning_rate": 2.6720826874901083e-06, "loss": 0.8419, "step": 512 }, { "epoch": 0.67, "grad_norm": 1.16983949626066, "learning_rate": 2.653468519010677e-06, "loss": 0.8181, "step": 513 }, { "epoch": 0.67, "grad_norm": 1.1476527120151712, "learning_rate": 2.634895965333344e-06, "loss": 0.8038, "step": 514 }, { "epoch": 0.67, "grad_norm": 1.2383972572556945, "learning_rate": 2.6163653558349613e-06, "loss": 0.7947, "step": 515 }, { "epoch": 0.67, "grad_norm": 1.294782640008379, "learning_rate": 2.5978770191485115e-06, "loss": 0.8118, "step": 516 }, { "epoch": 0.67, "grad_norm": 1.1307407949263424, "learning_rate": 2.5794312831572897e-06, "loss": 0.8161, "step": 517 }, { "epoch": 0.67, "grad_norm": 1.1156038483537878, "learning_rate": 2.561028474989088e-06, "loss": 0.8175, "step": 518 }, { "epoch": 0.67, "grad_norm": 1.1441747497674815, "learning_rate": 2.5426689210103813e-06, "loss": 0.8345, "step": 519 }, { "epoch": 0.68, "grad_norm": 1.1286048632129229, "learning_rate": 2.5243529468205574e-06, "loss": 0.8512, "step": 520 }, { "epoch": 0.68, "grad_norm": 1.2143329409471455, "learning_rate": 2.5060808772461275e-06, "loss": 0.84, "step": 521 }, { "epoch": 0.68, "grad_norm": 1.2053778551775718, "learning_rate": 2.487853036334979e-06, "loss": 0.8246, "step": 522 }, { "epoch": 0.68, "grad_norm": 1.1960048327957544, "learning_rate": 2.4696697473506122e-06, "loss": 0.8231, "step": 523 }, { "epoch": 0.68, "grad_norm": 1.295745581171811, "learning_rate": 2.451531332766426e-06, "loss": 0.8853, "step": 524 }, { "epoch": 0.68, "grad_norm": 1.3067594332973278, "learning_rate": 2.433438114259982e-06, "loss": 0.8309, "step": 525 }, { "epoch": 0.68, "grad_norm": 1.1373281583361006, "learning_rate": 2.4153904127073137e-06, "loss": 0.8146, "step": 526 }, { "epoch": 0.68, "grad_norm": 1.1417580445878792, "learning_rate": 2.397388548177227e-06, "loss": 0.839, "step": 527 }, { "epoch": 0.69, "grad_norm": 1.5599739904042915, "learning_rate": 2.3794328399256235e-06, "loss": 0.8294, "step": 528 }, { "epoch": 0.69, "grad_norm": 1.625491080719815, "learning_rate": 2.3615236063898474e-06, "loss": 0.8558, "step": 529 }, { "epoch": 0.69, "grad_norm": 1.1287172439081854, "learning_rate": 2.343661165183025e-06, "loss": 0.8196, "step": 530 }, { "epoch": 0.69, "grad_norm": 1.2174944956603801, "learning_rate": 2.325845833088448e-06, "loss": 0.8036, "step": 531 }, { "epoch": 0.69, "grad_norm": 1.251400066331298, "learning_rate": 2.308077926053939e-06, "loss": 0.8371, "step": 532 }, { "epoch": 0.69, "grad_norm": 1.2121696312359778, "learning_rate": 2.290357759186261e-06, "loss": 0.8426, "step": 533 }, { "epoch": 0.69, "grad_norm": 1.0604225747034348, "learning_rate": 2.27268564674552e-06, "loss": 0.8188, "step": 534 }, { "epoch": 0.69, "grad_norm": 1.1011428657548785, "learning_rate": 2.2550619021396e-06, "loss": 0.8079, "step": 535 }, { "epoch": 0.7, "grad_norm": 1.1723339573000198, "learning_rate": 2.2374868379185998e-06, "loss": 0.8178, "step": 536 }, { "epoch": 0.7, "grad_norm": 1.135210308251682, "learning_rate": 2.2199607657692874e-06, "loss": 0.8045, "step": 537 }, { "epoch": 0.7, "grad_norm": 1.3722545706665699, "learning_rate": 2.2024839965095814e-06, "loss": 0.8314, "step": 538 }, { "epoch": 0.7, "grad_norm": 1.1631275771309266, "learning_rate": 2.1850568400830268e-06, "loss": 0.8411, "step": 539 }, { "epoch": 0.7, "grad_norm": 1.0760153562190804, "learning_rate": 2.1676796055533125e-06, "loss": 0.8176, "step": 540 }, { "epoch": 0.7, "grad_norm": 1.1177832971628443, "learning_rate": 2.150352601098774e-06, "loss": 0.8719, "step": 541 }, { "epoch": 0.7, "grad_norm": 1.3419502743335265, "learning_rate": 2.133076134006945e-06, "loss": 0.8166, "step": 542 }, { "epoch": 0.71, "grad_norm": 1.0758424378799882, "learning_rate": 2.11585051066909e-06, "loss": 0.7853, "step": 543 }, { "epoch": 0.71, "grad_norm": 1.291711507267418, "learning_rate": 2.0986760365747883e-06, "loss": 0.829, "step": 544 }, { "epoch": 0.71, "grad_norm": 1.0798176397290844, "learning_rate": 2.081553016306504e-06, "loss": 0.8003, "step": 545 }, { "epoch": 0.71, "grad_norm": 1.1801650428025168, "learning_rate": 2.0644817535341856e-06, "loss": 0.8362, "step": 546 }, { "epoch": 0.71, "grad_norm": 1.268664958156847, "learning_rate": 2.0474625510098883e-06, "loss": 0.837, "step": 547 }, { "epoch": 0.71, "grad_norm": 1.1501634035936659, "learning_rate": 2.0304957105623936e-06, "loss": 0.8105, "step": 548 }, { "epoch": 0.71, "grad_norm": 1.0585458538794812, "learning_rate": 2.013581533091869e-06, "loss": 0.8033, "step": 549 }, { "epoch": 0.71, "grad_norm": 1.3468267171455577, "learning_rate": 1.996720318564518e-06, "loss": 0.8565, "step": 550 }, { "epoch": 0.72, "grad_norm": 1.1428836719091247, "learning_rate": 1.9799123660072744e-06, "loss": 0.8195, "step": 551 }, { "epoch": 0.72, "grad_norm": 1.206897896948396, "learning_rate": 1.9631579735024854e-06, "loss": 0.84, "step": 552 }, { "epoch": 0.72, "grad_norm": 1.1891641075077786, "learning_rate": 1.9464574381826367e-06, "loss": 0.8356, "step": 553 }, { "epoch": 0.72, "grad_norm": 1.222933588941957, "learning_rate": 1.9298110562250787e-06, "loss": 0.8156, "step": 554 }, { "epoch": 0.72, "grad_norm": 1.155842038000571, "learning_rate": 1.9132191228467685e-06, "loss": 0.8097, "step": 555 }, { "epoch": 0.72, "grad_norm": 1.140681245819448, "learning_rate": 1.8966819322990455e-06, "loss": 0.8128, "step": 556 }, { "epoch": 0.72, "grad_norm": 1.1221328511746198, "learning_rate": 1.8801997778623998e-06, "loss": 0.8572, "step": 557 }, { "epoch": 0.72, "grad_norm": 1.2526117843090938, "learning_rate": 1.8637729518412861e-06, "loss": 0.7972, "step": 558 }, { "epoch": 0.73, "grad_norm": 1.1590154010447482, "learning_rate": 1.8474017455589238e-06, "loss": 0.8268, "step": 559 }, { "epoch": 0.73, "grad_norm": 1.1525228183168728, "learning_rate": 1.8310864493521453e-06, "loss": 0.823, "step": 560 }, { "epoch": 0.73, "grad_norm": 1.5610374437152565, "learning_rate": 1.8148273525662336e-06, "loss": 0.8313, "step": 561 }, { "epoch": 0.73, "grad_norm": 1.518763002771371, "learning_rate": 1.7986247435498033e-06, "loss": 0.8418, "step": 562 }, { "epoch": 0.73, "grad_norm": 1.2059483655034768, "learning_rate": 1.7824789096496752e-06, "loss": 0.8304, "step": 563 }, { "epoch": 0.73, "grad_norm": 1.2028996101251008, "learning_rate": 1.7663901372057907e-06, "loss": 0.805, "step": 564 }, { "epoch": 0.73, "grad_norm": 1.1237925328355798, "learning_rate": 1.7503587115461286e-06, "loss": 0.8279, "step": 565 }, { "epoch": 0.74, "grad_norm": 1.1888955513675719, "learning_rate": 1.7343849169816396e-06, "loss": 0.8456, "step": 566 }, { "epoch": 0.74, "grad_norm": 1.196964494587607, "learning_rate": 1.7184690368012191e-06, "loss": 0.8181, "step": 567 }, { "epoch": 0.74, "grad_norm": 1.4923991366523424, "learning_rate": 1.702611353266665e-06, "loss": 0.8275, "step": 568 }, { "epoch": 0.74, "grad_norm": 1.0896582192802815, "learning_rate": 1.6868121476076877e-06, "loss": 0.7931, "step": 569 }, { "epoch": 0.74, "grad_norm": 1.1459479171554634, "learning_rate": 1.6710717000169098e-06, "loss": 0.8249, "step": 570 }, { "epoch": 0.74, "grad_norm": 1.2161166441890499, "learning_rate": 1.6553902896449092e-06, "loss": 0.8541, "step": 571 }, { "epoch": 0.74, "grad_norm": 1.181061357249287, "learning_rate": 1.639768194595256e-06, "loss": 0.7886, "step": 572 }, { "epoch": 0.74, "grad_norm": 1.0881658213398064, "learning_rate": 1.624205691919591e-06, "loss": 0.8381, "step": 573 }, { "epoch": 0.75, "grad_norm": 1.1232986701514402, "learning_rate": 1.6087030576127082e-06, "loss": 0.8017, "step": 574 }, { "epoch": 0.75, "grad_norm": 1.1128820987957522, "learning_rate": 1.5932605666076557e-06, "loss": 0.8363, "step": 575 }, { "epoch": 0.75, "grad_norm": 1.3744563473037221, "learning_rate": 1.5778784927708695e-06, "loss": 0.8154, "step": 576 }, { "epoch": 0.75, "grad_norm": 1.120775385580711, "learning_rate": 1.5625571088973051e-06, "loss": 0.8199, "step": 577 }, { "epoch": 0.75, "grad_norm": 1.1360877101243405, "learning_rate": 1.5472966867056122e-06, "loss": 0.8327, "step": 578 }, { "epoch": 0.75, "grad_norm": 1.1280760221777546, "learning_rate": 1.5320974968333025e-06, "loss": 0.832, "step": 579 }, { "epoch": 0.75, "grad_norm": 1.3216185504239597, "learning_rate": 1.5169598088319642e-06, "loss": 0.8328, "step": 580 }, { "epoch": 0.75, "grad_norm": 1.1555998427076246, "learning_rate": 1.5018838911624671e-06, "loss": 0.7986, "step": 581 }, { "epoch": 0.76, "grad_norm": 1.097188726494774, "learning_rate": 1.486870011190214e-06, "loss": 0.8139, "step": 582 }, { "epoch": 0.76, "grad_norm": 1.313359424122802, "learning_rate": 1.4719184351803927e-06, "loss": 0.8247, "step": 583 }, { "epoch": 0.76, "grad_norm": 1.2841336725150148, "learning_rate": 1.457029428293254e-06, "loss": 0.8214, "step": 584 }, { "epoch": 0.76, "grad_norm": 1.4019331627637832, "learning_rate": 1.4422032545794096e-06, "loss": 0.8476, "step": 585 }, { "epoch": 0.76, "grad_norm": 1.2479270352689151, "learning_rate": 1.4274401769751496e-06, "loss": 0.8596, "step": 586 }, { "epoch": 0.76, "grad_norm": 1.190584202906868, "learning_rate": 1.412740457297782e-06, "loss": 0.7976, "step": 587 }, { "epoch": 0.76, "grad_norm": 1.3358512600414083, "learning_rate": 1.398104356240988e-06, "loss": 0.8467, "step": 588 }, { "epoch": 0.76, "grad_norm": 1.1134129745594221, "learning_rate": 1.383532133370193e-06, "loss": 0.7994, "step": 589 }, { "epoch": 0.77, "grad_norm": 1.7236513155468485, "learning_rate": 1.369024047117974e-06, "loss": 0.7879, "step": 590 }, { "epoch": 0.77, "grad_norm": 1.2979089557824244, "learning_rate": 1.3545803547794639e-06, "loss": 0.8403, "step": 591 }, { "epoch": 0.77, "grad_norm": 1.235539117645581, "learning_rate": 1.3402013125078039e-06, "loss": 0.8364, "step": 592 }, { "epoch": 0.77, "grad_norm": 1.5366346531355402, "learning_rate": 1.325887175309582e-06, "loss": 0.8379, "step": 593 }, { "epoch": 0.77, "grad_norm": 1.121574181414578, "learning_rate": 1.3116381970403302e-06, "loss": 0.836, "step": 594 }, { "epoch": 0.77, "grad_norm": 1.340696954512561, "learning_rate": 1.2974546304000046e-06, "loss": 0.8027, "step": 595 }, { "epoch": 0.77, "grad_norm": 1.2132502416588156, "learning_rate": 1.2833367269285168e-06, "loss": 0.8077, "step": 596 }, { "epoch": 0.78, "grad_norm": 1.1192375791395337, "learning_rate": 1.2692847370012696e-06, "loss": 0.8522, "step": 597 }, { "epoch": 0.78, "grad_norm": 1.741500020742988, "learning_rate": 1.2552989098247092e-06, "loss": 0.8067, "step": 598 }, { "epoch": 0.78, "grad_norm": 1.1150261698728017, "learning_rate": 1.241379493431919e-06, "loss": 0.8529, "step": 599 }, { "epoch": 0.78, "grad_norm": 1.3385533216939478, "learning_rate": 1.2275267346782067e-06, "loss": 0.843, "step": 600 }, { "epoch": 0.78, "grad_norm": 1.1806042588679646, "learning_rate": 1.2137408792367388e-06, "loss": 0.7897, "step": 601 }, { "epoch": 0.78, "grad_norm": 1.2608515351865308, "learning_rate": 1.2000221715941746e-06, "loss": 0.8248, "step": 602 }, { "epoch": 0.78, "grad_norm": 1.1452255358786536, "learning_rate": 1.1863708550463372e-06, "loss": 0.8283, "step": 603 }, { "epoch": 0.78, "grad_norm": 1.584053048603632, "learning_rate": 1.1727871716938904e-06, "loss": 0.8472, "step": 604 }, { "epoch": 0.79, "grad_norm": 1.16292088995077, "learning_rate": 1.1592713624380553e-06, "loss": 0.814, "step": 605 }, { "epoch": 0.79, "grad_norm": 1.0574349634190905, "learning_rate": 1.1458236669763323e-06, "loss": 0.8029, "step": 606 }, { "epoch": 0.79, "grad_norm": 1.1504507119536145, "learning_rate": 1.132444323798247e-06, "loss": 0.8376, "step": 607 }, { "epoch": 0.79, "grad_norm": 1.1683086518860988, "learning_rate": 1.1191335701811285e-06, "loss": 0.8231, "step": 608 }, { "epoch": 0.79, "grad_norm": 1.0085098659895237, "learning_rate": 1.105891642185894e-06, "loss": 0.8007, "step": 609 }, { "epoch": 0.79, "grad_norm": 1.2738890651414907, "learning_rate": 1.0927187746528695e-06, "loss": 0.7735, "step": 610 }, { "epoch": 0.79, "grad_norm": 1.0889145540805063, "learning_rate": 1.0796152011976164e-06, "loss": 0.8529, "step": 611 }, { "epoch": 0.79, "grad_norm": 1.0941937359259284, "learning_rate": 1.0665811542067988e-06, "loss": 0.8374, "step": 612 }, { "epoch": 0.8, "grad_norm": 1.1244112767282193, "learning_rate": 1.0536168648340506e-06, "loss": 0.8098, "step": 613 }, { "epoch": 0.8, "grad_norm": 1.1652423680488342, "learning_rate": 1.0407225629958883e-06, "loss": 0.8586, "step": 614 }, { "epoch": 0.8, "grad_norm": 1.371358727142576, "learning_rate": 1.0278984773676214e-06, "loss": 0.8302, "step": 615 }, { "epoch": 0.8, "grad_norm": 1.282130434347346, "learning_rate": 1.0151448353793064e-06, "loss": 0.7846, "step": 616 }, { "epoch": 0.8, "grad_norm": 1.0308140323506172, "learning_rate": 1.0024618632117112e-06, "loss": 0.8424, "step": 617 }, { "epoch": 0.8, "grad_norm": 1.0307135501463645, "learning_rate": 9.898497857922978e-07, "loss": 0.804, "step": 618 }, { "epoch": 0.8, "grad_norm": 1.1411850630578646, "learning_rate": 9.773088267912423e-07, "loss": 0.8061, "step": 619 }, { "epoch": 0.81, "grad_norm": 1.0646486743244608, "learning_rate": 9.648392086174612e-07, "loss": 0.834, "step": 620 }, { "epoch": 0.81, "grad_norm": 1.3421257146608427, "learning_rate": 9.524411524146726e-07, "loss": 0.849, "step": 621 }, { "epoch": 0.81, "grad_norm": 1.094638647544279, "learning_rate": 9.401148780574682e-07, "loss": 0.8179, "step": 622 }, { "epoch": 0.81, "grad_norm": 1.1619475819740164, "learning_rate": 9.278606041474203e-07, "loss": 0.8457, "step": 623 }, { "epoch": 0.81, "grad_norm": 1.2816836446796047, "learning_rate": 9.15678548009199e-07, "loss": 0.8202, "step": 624 }, { "epoch": 0.81, "grad_norm": 1.1426328170657212, "learning_rate": 9.03568925686723e-07, "loss": 0.8166, "step": 625 }, { "epoch": 0.81, "grad_norm": 1.1247140360252137, "learning_rate": 8.915319519393278e-07, "loss": 0.8326, "step": 626 }, { "epoch": 0.81, "grad_norm": 1.105494579178503, "learning_rate": 8.795678402379498e-07, "loss": 0.8282, "step": 627 }, { "epoch": 0.82, "grad_norm": 1.0572570612604646, "learning_rate": 8.676768027613525e-07, "loss": 0.812, "step": 628 }, { "epoch": 0.82, "grad_norm": 1.1209163806272, "learning_rate": 8.558590503923509e-07, "loss": 0.8326, "step": 629 }, { "epoch": 0.82, "grad_norm": 1.415314790531641, "learning_rate": 8.441147927140836e-07, "loss": 0.8396, "step": 630 }, { "epoch": 0.82, "grad_norm": 1.1376087262273429, "learning_rate": 8.324442380062847e-07, "loss": 0.8003, "step": 631 }, { "epoch": 0.82, "grad_norm": 1.1053995721453131, "learning_rate": 8.208475932416005e-07, "loss": 0.8151, "step": 632 }, { "epoch": 0.82, "grad_norm": 1.24029540386004, "learning_rate": 8.093250640819095e-07, "loss": 0.8624, "step": 633 }, { "epoch": 0.82, "grad_norm": 1.2129126237029006, "learning_rate": 7.978768548746818e-07, "loss": 0.8647, "step": 634 }, { "epoch": 0.82, "grad_norm": 1.2236973968631748, "learning_rate": 7.865031686493546e-07, "loss": 0.8326, "step": 635 }, { "epoch": 0.83, "grad_norm": 1.4420200195634538, "learning_rate": 7.752042071137239e-07, "loss": 0.8318, "step": 636 }, { "epoch": 0.83, "grad_norm": 1.152704237069485, "learning_rate": 7.639801706503791e-07, "loss": 0.7957, "step": 637 }, { "epoch": 0.83, "grad_norm": 1.1410843484070319, "learning_rate": 7.528312583131387e-07, "loss": 0.8563, "step": 638 }, { "epoch": 0.83, "grad_norm": 1.4441417221706796, "learning_rate": 7.417576678235288e-07, "loss": 0.8485, "step": 639 }, { "epoch": 0.83, "grad_norm": 1.2782496928533214, "learning_rate": 7.307595955672686e-07, "loss": 0.8131, "step": 640 }, { "epoch": 0.83, "grad_norm": 1.3239848225764461, "learning_rate": 7.198372365907946e-07, "loss": 0.8278, "step": 641 }, { "epoch": 0.83, "grad_norm": 1.0124273493787945, "learning_rate": 7.089907845977962e-07, "loss": 0.8285, "step": 642 }, { "epoch": 0.84, "grad_norm": 1.1984388728115332, "learning_rate": 6.982204319457831e-07, "loss": 0.8147, "step": 643 }, { "epoch": 0.84, "grad_norm": 1.7431070656458465, "learning_rate": 6.875263696426759e-07, "loss": 0.8392, "step": 644 }, { "epoch": 0.84, "grad_norm": 1.0841381618287178, "learning_rate": 6.769087873434122e-07, "loss": 0.7805, "step": 645 }, { "epoch": 0.84, "grad_norm": 1.1363894535495727, "learning_rate": 6.663678733465905e-07, "loss": 0.8522, "step": 646 }, { "epoch": 0.84, "grad_norm": 1.149142197303568, "learning_rate": 6.55903814591125e-07, "loss": 0.8143, "step": 647 }, { "epoch": 0.84, "grad_norm": 1.2176090443322134, "learning_rate": 6.455167966529357e-07, "loss": 0.8031, "step": 648 }, { "epoch": 0.84, "grad_norm": 1.210210988431294, "learning_rate": 6.352070037416503e-07, "loss": 0.8527, "step": 649 }, { "epoch": 0.84, "grad_norm": 1.1647607912725104, "learning_rate": 6.24974618697346e-07, "loss": 0.8086, "step": 650 }, { "epoch": 0.85, "grad_norm": 1.210283794503598, "learning_rate": 6.148198229872981e-07, "loss": 0.7931, "step": 651 }, { "epoch": 0.85, "grad_norm": 1.1449560108431547, "learning_rate": 6.04742796702768e-07, "loss": 0.7754, "step": 652 }, { "epoch": 0.85, "grad_norm": 1.229482257223063, "learning_rate": 5.947437185558091e-07, "loss": 0.7979, "step": 653 }, { "epoch": 0.85, "grad_norm": 1.1064351563786532, "learning_rate": 5.848227658760914e-07, "loss": 0.8302, "step": 654 }, { "epoch": 0.85, "grad_norm": 1.1639624432408997, "learning_rate": 5.749801146077638e-07, "loss": 0.783, "step": 655 }, { "epoch": 0.85, "grad_norm": 1.3066438219306855, "learning_rate": 5.652159393063295e-07, "loss": 0.8009, "step": 656 }, { "epoch": 0.85, "grad_norm": 1.443570241927174, "learning_rate": 5.555304131355532e-07, "loss": 0.8553, "step": 657 }, { "epoch": 0.85, "grad_norm": 1.1154090455850014, "learning_rate": 5.459237078643864e-07, "loss": 0.8639, "step": 658 }, { "epoch": 0.86, "grad_norm": 1.3023330347714015, "learning_rate": 5.363959938639257e-07, "loss": 0.7804, "step": 659 }, { "epoch": 0.86, "grad_norm": 1.249767283287092, "learning_rate": 5.269474401043861e-07, "loss": 0.7951, "step": 660 }, { "epoch": 0.86, "grad_norm": 1.1063372151581325, "learning_rate": 5.175782141521107e-07, "loss": 0.8224, "step": 661 }, { "epoch": 0.86, "grad_norm": 1.1100286563923407, "learning_rate": 5.082884821665918e-07, "loss": 0.7727, "step": 662 }, { "epoch": 0.86, "grad_norm": 2.3724062097558187, "learning_rate": 4.990784088975298e-07, "loss": 0.8162, "step": 663 }, { "epoch": 0.86, "grad_norm": 1.0999505714218432, "learning_rate": 4.899481576819116e-07, "loss": 0.7921, "step": 664 }, { "epoch": 0.86, "grad_norm": 1.1869204585037552, "learning_rate": 4.808978904411066e-07, "loss": 0.7788, "step": 665 }, { "epoch": 0.86, "grad_norm": 1.203463681829628, "learning_rate": 4.719277676780054e-07, "loss": 0.8159, "step": 666 }, { "epoch": 0.87, "grad_norm": 1.0816234617283351, "learning_rate": 4.630379484741643e-07, "loss": 0.8381, "step": 667 }, { "epoch": 0.87, "grad_norm": 1.0235382933487907, "learning_rate": 4.542285904869903e-07, "loss": 0.82, "step": 668 }, { "epoch": 0.87, "grad_norm": 1.1180943607355467, "learning_rate": 4.4549984994694095e-07, "loss": 0.8201, "step": 669 }, { "epoch": 0.87, "grad_norm": 1.0643402324837885, "learning_rate": 4.3685188165475847e-07, "loss": 0.8194, "step": 670 }, { "epoch": 0.87, "grad_norm": 1.2761426870344974, "learning_rate": 4.2828483897871644e-07, "loss": 0.8491, "step": 671 }, { "epoch": 0.87, "grad_norm": 1.131085874662742, "learning_rate": 4.197988738519099e-07, "loss": 0.8208, "step": 672 }, { "epoch": 0.87, "grad_norm": 1.4239166104583856, "learning_rate": 4.11394136769554e-07, "loss": 0.8546, "step": 673 }, { "epoch": 0.88, "grad_norm": 1.1422745721220944, "learning_rate": 4.030707767863151e-07, "loss": 0.8394, "step": 674 }, { "epoch": 0.88, "grad_norm": 1.163144758918877, "learning_rate": 3.9482894151367193e-07, "loss": 0.7996, "step": 675 }, { "epoch": 0.88, "grad_norm": 1.1466246998351644, "learning_rate": 3.866687771172917e-07, "loss": 0.842, "step": 676 }, { "epoch": 0.88, "grad_norm": 3.5669370889766294, "learning_rate": 3.785904283144454e-07, "loss": 0.8256, "step": 677 }, { "epoch": 0.88, "grad_norm": 1.118821104205034, "learning_rate": 3.705940383714318e-07, "loss": 0.8273, "step": 678 }, { "epoch": 0.88, "grad_norm": 1.323490909694076, "learning_rate": 3.6267974910104696e-07, "loss": 0.7964, "step": 679 }, { "epoch": 0.88, "grad_norm": 1.115932337714536, "learning_rate": 3.5484770086006037e-07, "loss": 0.8155, "step": 680 }, { "epoch": 0.88, "grad_norm": 1.0438668202049772, "learning_rate": 3.470980325467316e-07, "loss": 0.7981, "step": 681 }, { "epoch": 0.89, "grad_norm": 1.1725553396037112, "learning_rate": 3.394308815983455e-07, "loss": 0.8166, "step": 682 }, { "epoch": 0.89, "grad_norm": 1.0432079376075305, "learning_rate": 3.318463839887714e-07, "loss": 0.8048, "step": 683 }, { "epoch": 0.89, "grad_norm": 1.1190181566238873, "learning_rate": 3.243446742260581e-07, "loss": 0.8143, "step": 684 }, { "epoch": 0.89, "grad_norm": 1.2027311340026403, "learning_rate": 3.169258853500423e-07, "loss": 0.8072, "step": 685 }, { "epoch": 0.89, "grad_norm": 1.0448826480431468, "learning_rate": 3.095901489299935e-07, "loss": 0.8156, "step": 686 }, { "epoch": 0.89, "grad_norm": 1.192618042559272, "learning_rate": 3.0233759506227646e-07, "loss": 0.8091, "step": 687 }, { "epoch": 0.89, "grad_norm": 1.5187121355543114, "learning_rate": 2.951683523680504e-07, "loss": 0.8679, "step": 688 }, { "epoch": 0.89, "grad_norm": 1.0879754257424312, "learning_rate": 2.8808254799097936e-07, "loss": 0.7972, "step": 689 }, { "epoch": 0.9, "grad_norm": 1.1613265458170998, "learning_rate": 2.8108030759498583e-07, "loss": 0.829, "step": 690 }, { "epoch": 0.9, "grad_norm": 1.1230654665039808, "learning_rate": 2.7416175536201794e-07, "loss": 0.8032, "step": 691 }, { "epoch": 0.9, "grad_norm": 1.1769239213597456, "learning_rate": 2.673270139898443e-07, "loss": 0.8239, "step": 692 }, { "epoch": 0.9, "grad_norm": 1.0197306575365235, "learning_rate": 2.605762046898852e-07, "loss": 0.8078, "step": 693 }, { "epoch": 0.9, "grad_norm": 1.1409475466437597, "learning_rate": 2.539094471850562e-07, "loss": 0.8414, "step": 694 }, { "epoch": 0.9, "grad_norm": 1.4106234568201388, "learning_rate": 2.4732685970765004e-07, "loss": 0.8255, "step": 695 }, { "epoch": 0.9, "grad_norm": 1.392418029087564, "learning_rate": 2.408285589972353e-07, "loss": 0.8616, "step": 696 }, { "epoch": 0.91, "grad_norm": 1.1287766159634611, "learning_rate": 2.3441466029859027e-07, "loss": 0.8257, "step": 697 }, { "epoch": 0.91, "grad_norm": 1.176991079235627, "learning_rate": 2.280852773596548e-07, "loss": 0.8592, "step": 698 }, { "epoch": 0.91, "grad_norm": 1.159662292058494, "learning_rate": 2.218405224295178e-07, "loss": 0.8608, "step": 699 }, { "epoch": 0.91, "grad_norm": 1.1122413124432546, "learning_rate": 2.1568050625642323e-07, "loss": 0.8338, "step": 700 }, { "epoch": 0.91, "grad_norm": 1.3908225007282236, "learning_rate": 2.0960533808580596e-07, "loss": 0.824, "step": 701 }, { "epoch": 0.91, "grad_norm": 1.0773093703955594, "learning_rate": 2.0361512565835738e-07, "loss": 0.8022, "step": 702 }, { "epoch": 0.91, "grad_norm": 1.0665415052562757, "learning_rate": 1.9770997520810965e-07, "loss": 0.833, "step": 703 }, { "epoch": 0.91, "grad_norm": 1.1527526893626212, "learning_rate": 1.918899914605582e-07, "loss": 0.8078, "step": 704 }, { "epoch": 0.92, "grad_norm": 1.1609049518698369, "learning_rate": 1.8615527763079678e-07, "loss": 0.8044, "step": 705 }, { "epoch": 0.92, "grad_norm": 1.4022083725557934, "learning_rate": 1.8050593542169537e-07, "loss": 0.8395, "step": 706 }, { "epoch": 0.92, "grad_norm": 1.152545683730929, "learning_rate": 1.7494206502208787e-07, "loss": 0.8257, "step": 707 }, { "epoch": 0.92, "grad_norm": 1.1491485626025015, "learning_rate": 1.6946376510500406e-07, "loss": 0.8384, "step": 708 }, { "epoch": 0.92, "grad_norm": 1.052754057159298, "learning_rate": 1.6407113282591204e-07, "loss": 0.8274, "step": 709 }, { "epoch": 0.92, "grad_norm": 1.0398923442519588, "learning_rate": 1.5876426382099908e-07, "loss": 0.8142, "step": 710 }, { "epoch": 0.92, "grad_norm": 1.4521184232963287, "learning_rate": 1.5354325220547638e-07, "loss": 0.8346, "step": 711 }, { "epoch": 0.92, "grad_norm": 1.1173628865842729, "learning_rate": 1.4840819057190591e-07, "loss": 0.8292, "step": 712 }, { "epoch": 0.93, "grad_norm": 1.0382298520155955, "learning_rate": 1.433591699885639e-07, "loss": 0.7851, "step": 713 }, { "epoch": 0.93, "grad_norm": 1.0096877302771539, "learning_rate": 1.3839627999782056e-07, "loss": 0.7929, "step": 714 }, { "epoch": 0.93, "grad_norm": 1.0884547484191, "learning_rate": 1.3351960861455515e-07, "loss": 0.8375, "step": 715 }, { "epoch": 0.93, "grad_norm": 1.1252011150006394, "learning_rate": 1.287292423245945e-07, "loss": 0.7717, "step": 716 }, { "epoch": 0.93, "grad_norm": 1.040686560253859, "learning_rate": 1.2402526608317812e-07, "loss": 0.7949, "step": 717 }, { "epoch": 0.93, "grad_norm": 3.0364147961167727, "learning_rate": 1.1940776331345383e-07, "loss": 0.7683, "step": 718 }, { "epoch": 0.93, "grad_norm": 1.1201818935745307, "learning_rate": 1.1487681590499456e-07, "loss": 0.8266, "step": 719 }, { "epoch": 0.94, "grad_norm": 1.3409144389292686, "learning_rate": 1.1043250421235107e-07, "loss": 0.874, "step": 720 }, { "epoch": 0.94, "grad_norm": 1.182209135051885, "learning_rate": 1.06074907053621e-07, "loss": 0.8179, "step": 721 }, { "epoch": 0.94, "grad_norm": 1.1131457112516843, "learning_rate": 1.0180410170905819e-07, "loss": 0.8328, "step": 722 }, { "epoch": 0.94, "grad_norm": 1.190739874954435, "learning_rate": 9.762016391969386e-08, "loss": 0.7798, "step": 723 }, { "epoch": 0.94, "grad_norm": 1.1368913561260203, "learning_rate": 9.352316788600102e-08, "loss": 0.8303, "step": 724 }, { "epoch": 0.94, "grad_norm": 1.1850727243568353, "learning_rate": 8.95131862665749e-08, "loss": 0.7791, "step": 725 }, { "epoch": 0.94, "grad_norm": 1.1643319143252835, "learning_rate": 8.559029017684184e-08, "loss": 0.8276, "step": 726 }, { "epoch": 0.94, "grad_norm": 1.1274946360164755, "learning_rate": 8.175454918780467e-08, "loss": 0.8095, "step": 727 }, { "epoch": 0.95, "grad_norm": 1.1648118926590638, "learning_rate": 7.800603132480322e-08, "loss": 0.8263, "step": 728 }, { "epoch": 0.95, "grad_norm": 1.1391257495952203, "learning_rate": 7.434480306630965e-08, "loss": 0.7894, "step": 729 }, { "epoch": 0.95, "grad_norm": 1.5657383282724364, "learning_rate": 7.077092934275054e-08, "loss": 0.8102, "step": 730 }, { "epoch": 0.95, "grad_norm": 1.1438725716115983, "learning_rate": 6.72844735353545e-08, "loss": 0.8541, "step": 731 }, { "epoch": 0.95, "grad_norm": 1.3162108137240458, "learning_rate": 6.388549747502748e-08, "loss": 0.814, "step": 732 }, { "epoch": 0.95, "grad_norm": 1.2615633941509532, "learning_rate": 6.057406144125755e-08, "loss": 0.7947, "step": 733 }, { "epoch": 0.95, "grad_norm": 1.0545752870068084, "learning_rate": 5.7350224161046294e-08, "loss": 0.8528, "step": 734 }, { "epoch": 0.95, "grad_norm": 1.1412889461353484, "learning_rate": 5.421404280786302e-08, "loss": 0.812, "step": 735 }, { "epoch": 0.96, "grad_norm": 1.2921519350201203, "learning_rate": 5.116557300063774e-08, "loss": 0.8166, "step": 736 }, { "epoch": 0.96, "grad_norm": 1.1139933495136198, "learning_rate": 4.8204868802768645e-08, "loss": 0.8318, "step": 737 }, { "epoch": 0.96, "grad_norm": 1.0513282814995402, "learning_rate": 4.5331982721167345e-08, "loss": 0.836, "step": 738 }, { "epoch": 0.96, "grad_norm": 1.232938382319419, "learning_rate": 4.254696570532402e-08, "loss": 0.8203, "step": 739 }, { "epoch": 0.96, "grad_norm": 1.2688797329670713, "learning_rate": 3.98498671464087e-08, "loss": 0.838, "step": 740 }, { "epoch": 0.96, "grad_norm": 1.2712704012974276, "learning_rate": 3.7240734876389796e-08, "loss": 0.8357, "step": 741 }, { "epoch": 0.96, "grad_norm": 1.2446289864712674, "learning_rate": 3.47196151671908e-08, "loss": 0.8294, "step": 742 }, { "epoch": 0.96, "grad_norm": 1.1249447697847985, "learning_rate": 3.2286552729866585e-08, "loss": 0.8005, "step": 743 }, { "epoch": 0.97, "grad_norm": 1.1970341383463035, "learning_rate": 2.9941590713810645e-08, "loss": 0.8006, "step": 744 }, { "epoch": 0.97, "grad_norm": 1.052001308737665, "learning_rate": 2.768477070599185e-08, "loss": 0.8469, "step": 745 }, { "epoch": 0.97, "grad_norm": 1.4637979916072497, "learning_rate": 2.5516132730215028e-08, "loss": 0.833, "step": 746 }, { "epoch": 0.97, "grad_norm": 1.1844659936098128, "learning_rate": 2.3435715246411527e-08, "loss": 0.8483, "step": 747 }, { "epoch": 0.97, "grad_norm": 1.0850626596964983, "learning_rate": 2.1443555149957552e-08, "loss": 0.8224, "step": 748 }, { "epoch": 0.97, "grad_norm": 1.2824700674922327, "learning_rate": 1.9539687771019666e-08, "loss": 0.8203, "step": 749 }, { "epoch": 0.97, "grad_norm": 1.1706439545733154, "learning_rate": 1.772414687392865e-08, "loss": 0.8267, "step": 750 }, { "epoch": 0.98, "grad_norm": 1.3274049419404885, "learning_rate": 1.5996964656579405e-08, "loss": 0.8095, "step": 751 }, { "epoch": 0.98, "grad_norm": 1.1111642410420237, "learning_rate": 1.4358171749861427e-08, "loss": 0.7953, "step": 752 }, { "epoch": 0.98, "grad_norm": 1.0865690516156918, "learning_rate": 1.2807797217114782e-08, "loss": 0.8062, "step": 753 }, { "epoch": 0.98, "grad_norm": 1.1521753176211016, "learning_rate": 1.1345868553615525e-08, "loss": 0.8374, "step": 754 }, { "epoch": 0.98, "grad_norm": 1.463454458386019, "learning_rate": 9.972411686085537e-09, "loss": 0.829, "step": 755 }, { "epoch": 0.98, "grad_norm": 1.3936488651540875, "learning_rate": 8.687450972237332e-09, "loss": 0.803, "step": 756 }, { "epoch": 0.98, "grad_norm": 1.136244795446509, "learning_rate": 7.49100920033663e-09, "loss": 0.8198, "step": 757 }, { "epoch": 0.98, "grad_norm": 1.1217747805128075, "learning_rate": 6.383107588802673e-09, "loss": 0.8234, "step": 758 }, { "epoch": 0.99, "grad_norm": 1.0561447394170373, "learning_rate": 5.363765785829644e-09, "loss": 0.8227, "step": 759 }, { "epoch": 0.99, "grad_norm": 1.1412749499521242, "learning_rate": 4.433001869039166e-09, "loss": 0.8051, "step": 760 }, { "epoch": 0.99, "grad_norm": 1.2442817598179234, "learning_rate": 3.590832345158335e-09, "loss": 0.8286, "step": 761 }, { "epoch": 0.99, "grad_norm": 1.9819842833626258, "learning_rate": 2.8372721497288423e-09, "loss": 0.8063, "step": 762 }, { "epoch": 0.99, "grad_norm": 1.1106115527855582, "learning_rate": 2.172334646841079e-09, "loss": 0.7925, "step": 763 }, { "epoch": 0.99, "grad_norm": 1.060032513959221, "learning_rate": 1.596031628896544e-09, "loss": 0.8132, "step": 764 }, { "epoch": 0.99, "grad_norm": 1.1721837091645986, "learning_rate": 1.1083733164007904e-09, "loss": 0.8032, "step": 765 }, { "epoch": 0.99, "grad_norm": 1.1718732472907902, "learning_rate": 7.093683577791277e-10, "loss": 0.8607, "step": 766 }, { "epoch": 1.0, "grad_norm": 1.1656756317925252, "learning_rate": 3.99023829225631e-10, "loss": 0.8347, "step": 767 }, { "epoch": 1.0, "grad_norm": 1.200374279193499, "learning_rate": 1.7734523457824116e-10, "loss": 0.8098, "step": 768 }, { "epoch": 1.0, "grad_norm": 1.2441092942236753, "learning_rate": 4.433650521717958e-11, "loss": 0.7924, "step": 769 }, { "epoch": 1.0, "grad_norm": 1.0824485212601966, "learning_rate": 0.0, "loss": 0.8336, "step": 770 }, { "epoch": 1.0, "step": 770, "total_flos": 423037538631680.0, "train_loss": 0.8593585531432907, "train_runtime": 3300.3249, "train_samples_per_second": 29.87, "train_steps_per_second": 0.233 } ], "logging_steps": 1.0, "max_steps": 770, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 1000, "total_flos": 423037538631680.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }