diff --git "a/checkpoint-6139/trainer_state.json" "b/checkpoint-6139/trainer_state.json" deleted file mode 100644--- "a/checkpoint-6139/trainer_state.json" +++ /dev/null @@ -1,4428 +0,0 @@ -{ - "best_metric": null, - "best_model_checkpoint": null, - "epoch": 0.999959278413487, - "eval_steps": 500, - "global_step": 6139, - "is_hyper_param_search": false, - "is_local_process_zero": true, - "is_world_process_zero": true, - "log_history": [ - { - "epoch": 0.0001628863460520422, - "eval_loss": 2.4029905796051025, - "eval_runtime": 101.0475, - "eval_samples_per_second": 25.582, - "eval_steps_per_second": 25.582, - "step": 1 - }, - { - "epoch": 0.0016288634605204219, - "grad_norm": 24.467580795288086, - "learning_rate": 2.0000000000000003e-06, - "loss": 1.7479, - "step": 10 - }, - { - "epoch": 0.0032577269210408437, - "grad_norm": 25.53505516052246, - "learning_rate": 4.000000000000001e-06, - "loss": 1.7216, - "step": 20 - }, - { - "epoch": 0.004886590381561266, - "grad_norm": 18.23355484008789, - "learning_rate": 6e-06, - "loss": 1.5159, - "step": 30 - }, - { - "epoch": 0.006515453842081687, - "grad_norm": 12.464448928833008, - "learning_rate": 8.000000000000001e-06, - "loss": 1.4322, - "step": 40 - }, - { - "epoch": 0.00814431730260211, - "grad_norm": 10.985918998718262, - "learning_rate": 1e-05, - "loss": 1.3426, - "step": 50 - }, - { - "epoch": 0.009773180763122532, - "grad_norm": 9.040105819702148, - "learning_rate": 1.2e-05, - "loss": 1.314, - "step": 60 - }, - { - "epoch": 0.011402044223642953, - "grad_norm": 10.273420333862305, - "learning_rate": 1.4e-05, - "loss": 1.2463, - "step": 70 - }, - { - "epoch": 0.013030907684163375, - "grad_norm": 11.754436492919922, - "learning_rate": 1.6000000000000003e-05, - "loss": 1.2561, - "step": 80 - }, - { - "epoch": 0.014659771144683796, - "grad_norm": 12.387304306030273, - "learning_rate": 1.8e-05, - "loss": 1.275, - "step": 90 - }, - { - "epoch": 0.01628863460520422, - "grad_norm": 9.28602409362793, - "learning_rate": 2e-05, - "loss": 1.3315, - "step": 100 - }, - { - "epoch": 0.01791749806572464, - "grad_norm": 9.231125831604004, - "learning_rate": 1.9999864687252914e-05, - "loss": 1.3106, - "step": 110 - }, - { - "epoch": 0.019546361526245063, - "grad_norm": 19.837526321411133, - "learning_rate": 1.9999458752673554e-05, - "loss": 1.342, - "step": 120 - }, - { - "epoch": 0.021175224986765485, - "grad_norm": 11.052694320678711, - "learning_rate": 1.9998782207247556e-05, - "loss": 1.2678, - "step": 130 - }, - { - "epoch": 0.022804088447285906, - "grad_norm": 14.448543548583984, - "learning_rate": 1.9997835069283954e-05, - "loss": 1.262, - "step": 140 - }, - { - "epoch": 0.024432951907806328, - "grad_norm": 8.338994979858398, - "learning_rate": 1.999661736441472e-05, - "loss": 1.2932, - "step": 150 - }, - { - "epoch": 0.02606181536832675, - "grad_norm": 10.267782211303711, - "learning_rate": 1.9995129125594058e-05, - "loss": 1.2789, - "step": 160 - }, - { - "epoch": 0.02769067882884717, - "grad_norm": 17.692113876342773, - "learning_rate": 1.999337039309749e-05, - "loss": 1.2546, - "step": 170 - }, - { - "epoch": 0.029319542289367593, - "grad_norm": 11.050065040588379, - "learning_rate": 1.9991341214520816e-05, - "loss": 1.2649, - "step": 180 - }, - { - "epoch": 0.030948405749888015, - "grad_norm": 11.198172569274902, - "learning_rate": 1.9989041644778773e-05, - "loss": 1.2018, - "step": 190 - }, - { - "epoch": 0.03257726921040844, - "grad_norm": 8.730816841125488, - "learning_rate": 1.9986471746103584e-05, - "loss": 1.3364, - "step": 200 - }, - { - 
"epoch": 0.03420613267092886, - "grad_norm": 12.193893432617188, - "learning_rate": 1.998363158804326e-05, - "loss": 1.3358, - "step": 210 - }, - { - "epoch": 0.03583499613144928, - "grad_norm": 12.82657241821289, - "learning_rate": 1.9980521247459714e-05, - "loss": 1.3936, - "step": 220 - }, - { - "epoch": 0.037463859591969705, - "grad_norm": 10.334324836730957, - "learning_rate": 1.9977140808526696e-05, - "loss": 1.2811, - "step": 230 - }, - { - "epoch": 0.039092723052490126, - "grad_norm": 10.822787284851074, - "learning_rate": 1.99734903627275e-05, - "loss": 1.3964, - "step": 240 - }, - { - "epoch": 0.04072158651301055, - "grad_norm": 10.038861274719238, - "learning_rate": 1.9969570008852498e-05, - "loss": 1.3229, - "step": 250 - }, - { - "epoch": 0.04235044997353097, - "grad_norm": 10.05347728729248, - "learning_rate": 1.9965379852996455e-05, - "loss": 1.2529, - "step": 260 - }, - { - "epoch": 0.04397931343405139, - "grad_norm": 15.111456871032715, - "learning_rate": 1.996092000855568e-05, - "loss": 1.3372, - "step": 270 - }, - { - "epoch": 0.04560817689457181, - "grad_norm": 8.95826244354248, - "learning_rate": 1.9956190596224923e-05, - "loss": 1.3154, - "step": 280 - }, - { - "epoch": 0.047237040355092234, - "grad_norm": 8.862021446228027, - "learning_rate": 1.9951191743994146e-05, - "loss": 1.2578, - "step": 290 - }, - { - "epoch": 0.048865903815612656, - "grad_norm": 12.974872589111328, - "learning_rate": 1.9945923587145032e-05, - "loss": 1.2848, - "step": 300 - }, - { - "epoch": 0.05049476727613308, - "grad_norm": 8.246659278869629, - "learning_rate": 1.994038626824734e-05, - "loss": 1.2349, - "step": 310 - }, - { - "epoch": 0.0521236307366535, - "grad_norm": 10.3037748336792, - "learning_rate": 1.993457993715503e-05, - "loss": 1.2731, - "step": 320 - }, - { - "epoch": 0.05375249419717392, - "grad_norm": 9.140987396240234, - "learning_rate": 1.992850475100223e-05, - "loss": 1.2673, - "step": 330 - }, - { - "epoch": 0.05538135765769434, - "grad_norm": 9.048047065734863, - "learning_rate": 1.992216087419896e-05, - "loss": 1.2842, - "step": 340 - }, - { - "epoch": 0.057010221118214764, - "grad_norm": 8.832723617553711, - "learning_rate": 1.99155484784267e-05, - "loss": 1.227, - "step": 350 - }, - { - "epoch": 0.058639084578735186, - "grad_norm": 8.928304672241211, - "learning_rate": 1.9908667742633742e-05, - "loss": 1.3681, - "step": 360 - }, - { - "epoch": 0.06026794803925561, - "grad_norm": 9.978281021118164, - "learning_rate": 1.990151885303034e-05, - "loss": 1.1123, - "step": 370 - }, - { - "epoch": 0.06189681149977603, - "grad_norm": 8.430088996887207, - "learning_rate": 1.989410200308366e-05, - "loss": 1.263, - "step": 380 - }, - { - "epoch": 0.06352567496029646, - "grad_norm": 11.750625610351562, - "learning_rate": 1.9886417393512584e-05, - "loss": 1.255, - "step": 390 - }, - { - "epoch": 0.06515453842081688, - "grad_norm": 10.856910705566406, - "learning_rate": 1.987846523228223e-05, - "loss": 1.2889, - "step": 400 - }, - { - "epoch": 0.0667834018813373, - "grad_norm": 10.42456340789795, - "learning_rate": 1.9870245734598358e-05, - "loss": 1.3936, - "step": 410 - }, - { - "epoch": 0.06841226534185772, - "grad_norm": 9.490607261657715, - "learning_rate": 1.986175912290153e-05, - "loss": 1.3335, - "step": 420 - }, - { - "epoch": 0.07004112880237814, - "grad_norm": 8.353849411010742, - "learning_rate": 1.985300562686109e-05, - "loss": 1.2302, - "step": 430 - }, - { - "epoch": 0.07166999226289857, - "grad_norm": 13.212517738342285, - "learning_rate": 1.9843985483368968e-05, - 
"loss": 1.38, - "step": 440 - }, - { - "epoch": 0.07329885572341899, - "grad_norm": 6.969715118408203, - "learning_rate": 1.9834698936533226e-05, - "loss": 1.2775, - "step": 450 - }, - { - "epoch": 0.07492771918393941, - "grad_norm": 7.460076332092285, - "learning_rate": 1.9825146237671513e-05, - "loss": 1.4045, - "step": 460 - }, - { - "epoch": 0.07655658264445983, - "grad_norm": 8.584343910217285, - "learning_rate": 1.9815327645304204e-05, - "loss": 1.2884, - "step": 470 - }, - { - "epoch": 0.07818544610498025, - "grad_norm": 8.771677017211914, - "learning_rate": 1.9805243425147448e-05, - "loss": 1.3839, - "step": 480 - }, - { - "epoch": 0.07981430956550067, - "grad_norm": 10.828780174255371, - "learning_rate": 1.9794893850105942e-05, - "loss": 1.3274, - "step": 490 - }, - { - "epoch": 0.0814431730260211, - "grad_norm": 8.709300994873047, - "learning_rate": 1.978427920026558e-05, - "loss": 1.2572, - "step": 500 - }, - { - "epoch": 0.0814431730260211, - "eval_loss": 1.1935244798660278, - "eval_runtime": 101.9438, - "eval_samples_per_second": 25.357, - "eval_steps_per_second": 25.357, - "step": 500 - }, - { - "epoch": 0.08307203648654152, - "grad_norm": 9.961617469787598, - "learning_rate": 1.977339976288584e-05, - "loss": 1.382, - "step": 510 - }, - { - "epoch": 0.08470089994706194, - "grad_norm": 9.85478401184082, - "learning_rate": 1.9762255832392046e-05, - "loss": 1.3768, - "step": 520 - }, - { - "epoch": 0.08632976340758236, - "grad_norm": 9.125312805175781, - "learning_rate": 1.975084771036736e-05, - "loss": 1.2088, - "step": 530 - }, - { - "epoch": 0.08795862686810278, - "grad_norm": 7.915559768676758, - "learning_rate": 1.973917570554464e-05, - "loss": 1.2951, - "step": 540 - }, - { - "epoch": 0.0895874903286232, - "grad_norm": 9.664942741394043, - "learning_rate": 1.9727240133798106e-05, - "loss": 1.3305, - "step": 550 - }, - { - "epoch": 0.09121635378914363, - "grad_norm": 10.48803997039795, - "learning_rate": 1.9715041318134756e-05, - "loss": 1.4256, - "step": 560 - }, - { - "epoch": 0.09284521724966405, - "grad_norm": 10.634197235107422, - "learning_rate": 1.9702579588685634e-05, - "loss": 1.2912, - "step": 570 - }, - { - "epoch": 0.09447408071018447, - "grad_norm": 9.283407211303711, - "learning_rate": 1.968985528269692e-05, - "loss": 1.3337, - "step": 580 - }, - { - "epoch": 0.09610294417070489, - "grad_norm": 9.688226699829102, - "learning_rate": 1.9676868744520768e-05, - "loss": 1.3231, - "step": 590 - }, - { - "epoch": 0.09773180763122531, - "grad_norm": 10.292962074279785, - "learning_rate": 1.9663620325606005e-05, - "loss": 1.2227, - "step": 600 - }, - { - "epoch": 0.09936067109174573, - "grad_norm": 5.677855014801025, - "learning_rate": 1.965011038448863e-05, - "loss": 1.4159, - "step": 610 - }, - { - "epoch": 0.10098953455226616, - "grad_norm": 7.153832912445068, - "learning_rate": 1.963633928678209e-05, - "loss": 1.3664, - "step": 620 - }, - { - "epoch": 0.10261839801278658, - "grad_norm": 12.980141639709473, - "learning_rate": 1.9622307405167395e-05, - "loss": 1.2955, - "step": 630 - }, - { - "epoch": 0.104247261473307, - "grad_norm": 10.464163780212402, - "learning_rate": 1.9608015119383036e-05, - "loss": 1.2868, - "step": 640 - }, - { - "epoch": 0.10587612493382742, - "grad_norm": 11.611323356628418, - "learning_rate": 1.9593462816214698e-05, - "loss": 1.2963, - "step": 650 - }, - { - "epoch": 0.10750498839434784, - "grad_norm": 6.969916820526123, - "learning_rate": 1.9578650889484815e-05, - "loss": 1.2765, - "step": 660 - }, - { - "epoch": 0.10913385185486826, - 
"grad_norm": 10.346363067626953, - "learning_rate": 1.9563579740041884e-05, - "loss": 1.3072, - "step": 670 - }, - { - "epoch": 0.11076271531538869, - "grad_norm": 10.077250480651855, - "learning_rate": 1.9548249775749623e-05, - "loss": 1.3099, - "step": 680 - }, - { - "epoch": 0.1123915787759091, - "grad_norm": 10.486137390136719, - "learning_rate": 1.9532661411475955e-05, - "loss": 1.3504, - "step": 690 - }, - { - "epoch": 0.11402044223642953, - "grad_norm": 6.885483264923096, - "learning_rate": 1.9516815069081758e-05, - "loss": 1.2268, - "step": 700 - }, - { - "epoch": 0.11564930569694995, - "grad_norm": 7.814432144165039, - "learning_rate": 1.9500711177409456e-05, - "loss": 1.394, - "step": 710 - }, - { - "epoch": 0.11727816915747037, - "grad_norm": 13.40298080444336, - "learning_rate": 1.948435017227141e-05, - "loss": 1.2634, - "step": 720 - }, - { - "epoch": 0.1189070326179908, - "grad_norm": 8.984935760498047, - "learning_rate": 1.9467732496438137e-05, - "loss": 1.2645, - "step": 730 - }, - { - "epoch": 0.12053589607851121, - "grad_norm": 8.895575523376465, - "learning_rate": 1.9450858599626304e-05, - "loss": 1.2724, - "step": 740 - }, - { - "epoch": 0.12216475953903164, - "grad_norm": 9.734760284423828, - "learning_rate": 1.9433728938486576e-05, - "loss": 1.3362, - "step": 750 - }, - { - "epoch": 0.12379362299955206, - "grad_norm": 11.392585754394531, - "learning_rate": 1.941634397659126e-05, - "loss": 1.2574, - "step": 760 - }, - { - "epoch": 0.12542248646007248, - "grad_norm": 7.931302070617676, - "learning_rate": 1.9398704184421745e-05, - "loss": 1.3003, - "step": 770 - }, - { - "epoch": 0.12705134992059292, - "grad_norm": 7.866138458251953, - "learning_rate": 1.9380810039355776e-05, - "loss": 1.2672, - "step": 780 - }, - { - "epoch": 0.12868021338111332, - "grad_norm": 12.06697940826416, - "learning_rate": 1.936266202565454e-05, - "loss": 1.4121, - "step": 790 - }, - { - "epoch": 0.13030907684163376, - "grad_norm": 10.089153289794922, - "learning_rate": 1.9344260634449556e-05, - "loss": 1.3314, - "step": 800 - }, - { - "epoch": 0.13193794030215417, - "grad_norm": 7.83293342590332, - "learning_rate": 1.9325606363729378e-05, - "loss": 1.332, - "step": 810 - }, - { - "epoch": 0.1335668037626746, - "grad_norm": 18.291975021362305, - "learning_rate": 1.930669971832613e-05, - "loss": 1.2154, - "step": 820 - }, - { - "epoch": 0.135195667223195, - "grad_norm": 9.173296928405762, - "learning_rate": 1.9287541209901842e-05, - "loss": 1.3371, - "step": 830 - }, - { - "epoch": 0.13682453068371545, - "grad_norm": 7.5841875076293945, - "learning_rate": 1.9268131356934592e-05, - "loss": 1.3549, - "step": 840 - }, - { - "epoch": 0.13845339414423585, - "grad_norm": 9.693364143371582, - "learning_rate": 1.924847068470449e-05, - "loss": 1.2978, - "step": 850 - }, - { - "epoch": 0.1400822576047563, - "grad_norm": 9.127781867980957, - "learning_rate": 1.9228559725279444e-05, - "loss": 1.261, - "step": 860 - }, - { - "epoch": 0.1417111210652767, - "grad_norm": 10.408056259155273, - "learning_rate": 1.9208399017500773e-05, - "loss": 1.2478, - "step": 870 - }, - { - "epoch": 0.14333998452579713, - "grad_norm": 9.875020980834961, - "learning_rate": 1.918798910696864e-05, - "loss": 1.3205, - "step": 880 - }, - { - "epoch": 0.14496884798631754, - "grad_norm": 8.818717956542969, - "learning_rate": 1.916733054602725e-05, - "loss": 1.389, - "step": 890 - }, - { - "epoch": 0.14659771144683797, - "grad_norm": 13.227215766906738, - "learning_rate": 1.9146423893749924e-05, - "loss": 1.3243, - "step": 900 - }, - 
{ - "epoch": 0.14822657490735838, - "grad_norm": 7.599000930786133, - "learning_rate": 1.9125269715923983e-05, - "loss": 1.1711, - "step": 910 - }, - { - "epoch": 0.14985543836787882, - "grad_norm": 9.84504508972168, - "learning_rate": 1.910386858503541e-05, - "loss": 1.2145, - "step": 920 - }, - { - "epoch": 0.15148430182839923, - "grad_norm": 7.403562068939209, - "learning_rate": 1.908222108025336e-05, - "loss": 1.2809, - "step": 930 - }, - { - "epoch": 0.15311316528891966, - "grad_norm": 12.983716011047363, - "learning_rate": 1.9060327787414498e-05, - "loss": 1.2949, - "step": 940 - }, - { - "epoch": 0.15474202874944007, - "grad_norm": 8.185487747192383, - "learning_rate": 1.9038189299007154e-05, - "loss": 1.3662, - "step": 950 - }, - { - "epoch": 0.1563708922099605, - "grad_norm": 9.041784286499023, - "learning_rate": 1.901580621415526e-05, - "loss": 1.2835, - "step": 960 - }, - { - "epoch": 0.1579997556704809, - "grad_norm": 7.92235803604126, - "learning_rate": 1.899317913860215e-05, - "loss": 1.3625, - "step": 970 - }, - { - "epoch": 0.15962861913100135, - "grad_norm": 15.111337661743164, - "learning_rate": 1.8970308684694186e-05, - "loss": 1.2816, - "step": 980 - }, - { - "epoch": 0.16125748259152176, - "grad_norm": 10.149383544921875, - "learning_rate": 1.894719547136415e-05, - "loss": 1.2481, - "step": 990 - }, - { - "epoch": 0.1628863460520422, - "grad_norm": 10.364059448242188, - "learning_rate": 1.8923840124114517e-05, - "loss": 1.3061, - "step": 1000 - }, - { - "epoch": 0.1628863460520422, - "eval_loss": 1.1865288019180298, - "eval_runtime": 101.3159, - "eval_samples_per_second": 25.514, - "eval_steps_per_second": 25.514, - "step": 1000 - }, - { - "epoch": 0.1645152095125626, - "grad_norm": 8.624238014221191, - "learning_rate": 1.8900243275000532e-05, - "loss": 1.2933, - "step": 1010 - }, - { - "epoch": 0.16614407297308303, - "grad_norm": 9.401992797851562, - "learning_rate": 1.8876405562613088e-05, - "loss": 1.244, - "step": 1020 - }, - { - "epoch": 0.16777293643360344, - "grad_norm": 10.441336631774902, - "learning_rate": 1.8852327632061457e-05, - "loss": 1.277, - "step": 1030 - }, - { - "epoch": 0.16940179989412388, - "grad_norm": 11.799003601074219, - "learning_rate": 1.8828010134955822e-05, - "loss": 1.33, - "step": 1040 - }, - { - "epoch": 0.17103066335464429, - "grad_norm": 9.582467079162598, - "learning_rate": 1.8803453729389648e-05, - "loss": 1.4002, - "step": 1050 - }, - { - "epoch": 0.17265952681516472, - "grad_norm": 8.40396499633789, - "learning_rate": 1.8778659079921877e-05, - "loss": 1.3458, - "step": 1060 - }, - { - "epoch": 0.17428839027568513, - "grad_norm": 10.204269409179688, - "learning_rate": 1.8753626857558935e-05, - "loss": 1.3259, - "step": 1070 - }, - { - "epoch": 0.17591725373620556, - "grad_norm": 6.9873948097229, - "learning_rate": 1.8728357739736578e-05, - "loss": 1.3199, - "step": 1080 - }, - { - "epoch": 0.17754611719672597, - "grad_norm": 9.196039199829102, - "learning_rate": 1.8702852410301556e-05, - "loss": 1.337, - "step": 1090 - }, - { - "epoch": 0.1791749806572464, - "grad_norm": 8.873251914978027, - "learning_rate": 1.86771115594931e-05, - "loss": 1.2909, - "step": 1100 - }, - { - "epoch": 0.18080384411776682, - "grad_norm": 9.222848892211914, - "learning_rate": 1.865113588392427e-05, - "loss": 1.2325, - "step": 1110 - }, - { - "epoch": 0.18243270757828725, - "grad_norm": 7.872706413269043, - "learning_rate": 1.8624926086563057e-05, - "loss": 1.2469, - "step": 1120 - }, - { - "epoch": 0.18406157103880766, - "grad_norm": 9.13894271850586, 
- "learning_rate": 1.85984828767134e-05, - "loss": 1.2889, - "step": 1130 - }, - { - "epoch": 0.1856904344993281, - "grad_norm": 7.499070167541504, - "learning_rate": 1.8571806969995982e-05, - "loss": 1.3696, - "step": 1140 - }, - { - "epoch": 0.1873192979598485, - "grad_norm": 10.489768028259277, - "learning_rate": 1.854489908832884e-05, - "loss": 1.2415, - "step": 1150 - }, - { - "epoch": 0.18894816142036894, - "grad_norm": 9.07557201385498, - "learning_rate": 1.8517759959907845e-05, - "loss": 1.307, - "step": 1160 - }, - { - "epoch": 0.19057702488088937, - "grad_norm": 8.078954696655273, - "learning_rate": 1.849039031918701e-05, - "loss": 1.2854, - "step": 1170 - }, - { - "epoch": 0.19220588834140978, - "grad_norm": 8.329951286315918, - "learning_rate": 1.846279090685859e-05, - "loss": 1.2446, - "step": 1180 - }, - { - "epoch": 0.19383475180193022, - "grad_norm": 9.939054489135742, - "learning_rate": 1.8434962469833036e-05, - "loss": 1.4163, - "step": 1190 - }, - { - "epoch": 0.19546361526245062, - "grad_norm": 9.757793426513672, - "learning_rate": 1.8406905761218815e-05, - "loss": 1.1905, - "step": 1200 - }, - { - "epoch": 0.19709247872297106, - "grad_norm": 7.9107489585876465, - "learning_rate": 1.8378621540301976e-05, - "loss": 1.2955, - "step": 1210 - }, - { - "epoch": 0.19872134218349147, - "grad_norm": 9.854551315307617, - "learning_rate": 1.835011057252565e-05, - "loss": 1.2293, - "step": 1220 - }, - { - "epoch": 0.2003502056440119, - "grad_norm": 10.388916969299316, - "learning_rate": 1.8321373629469313e-05, - "loss": 1.337, - "step": 1230 - }, - { - "epoch": 0.2019790691045323, - "grad_norm": 9.081336975097656, - "learning_rate": 1.8292411488827906e-05, - "loss": 1.2708, - "step": 1240 - }, - { - "epoch": 0.20360793256505275, - "grad_norm": 10.85607624053955, - "learning_rate": 1.826322493439079e-05, - "loss": 1.2249, - "step": 1250 - }, - { - "epoch": 0.20523679602557315, - "grad_norm": 10.35145378112793, - "learning_rate": 1.823381475602054e-05, - "loss": 1.3433, - "step": 1260 - }, - { - "epoch": 0.2068656594860936, - "grad_norm": 12.942462921142578, - "learning_rate": 1.8204181749631557e-05, - "loss": 1.3062, - "step": 1270 - }, - { - "epoch": 0.208494522946614, - "grad_norm": 8.48989200592041, - "learning_rate": 1.8174326717168547e-05, - "loss": 1.2474, - "step": 1280 - }, - { - "epoch": 0.21012338640713443, - "grad_norm": 7.585806846618652, - "learning_rate": 1.8144250466584794e-05, - "loss": 1.3294, - "step": 1290 - }, - { - "epoch": 0.21175224986765484, - "grad_norm": 9.783578872680664, - "learning_rate": 1.8113953811820322e-05, - "loss": 1.308, - "step": 1300 - }, - { - "epoch": 0.21338111332817528, - "grad_norm": 8.406821250915527, - "learning_rate": 1.8083437572779842e-05, - "loss": 1.2757, - "step": 1310 - }, - { - "epoch": 0.21500997678869568, - "grad_norm": 11.778732299804688, - "learning_rate": 1.8052702575310588e-05, - "loss": 1.2255, - "step": 1320 - }, - { - "epoch": 0.21663884024921612, - "grad_norm": 8.81125259399414, - "learning_rate": 1.802174965117994e-05, - "loss": 1.2558, - "step": 1330 - }, - { - "epoch": 0.21826770370973653, - "grad_norm": 8.534640312194824, - "learning_rate": 1.7990579638052944e-05, - "loss": 1.3359, - "step": 1340 - }, - { - "epoch": 0.21989656717025696, - "grad_norm": 7.685283184051514, - "learning_rate": 1.795919337946962e-05, - "loss": 1.2347, - "step": 1350 - }, - { - "epoch": 0.22152543063077737, - "grad_norm": 9.883221626281738, - "learning_rate": 1.7927591724822132e-05, - "loss": 1.2605, - "step": 1360 - }, - { - "epoch": 
0.2231542940912978, - "grad_norm": 6.772229194641113, - "learning_rate": 1.7895775529331835e-05, - "loss": 1.2475, - "step": 1370 - }, - { - "epoch": 0.2247831575518182, - "grad_norm": 6.82761287689209, - "learning_rate": 1.7863745654026078e-05, - "loss": 1.2686, - "step": 1380 - }, - { - "epoch": 0.22641202101233865, - "grad_norm": 13.5228853225708, - "learning_rate": 1.7831502965714958e-05, - "loss": 1.2544, - "step": 1390 - }, - { - "epoch": 0.22804088447285906, - "grad_norm": 7.083252429962158, - "learning_rate": 1.779904833696781e-05, - "loss": 1.2435, - "step": 1400 - }, - { - "epoch": 0.2296697479333795, - "grad_norm": 9.345582962036133, - "learning_rate": 1.7766382646089635e-05, - "loss": 1.3229, - "step": 1410 - }, - { - "epoch": 0.2312986113938999, - "grad_norm": 10.011900901794434, - "learning_rate": 1.77335067770973e-05, - "loss": 1.3074, - "step": 1420 - }, - { - "epoch": 0.23292747485442034, - "grad_norm": 8.366225242614746, - "learning_rate": 1.770042161969564e-05, - "loss": 1.1657, - "step": 1430 - }, - { - "epoch": 0.23455633831494074, - "grad_norm": 13.709329605102539, - "learning_rate": 1.7667128069253362e-05, - "loss": 1.3257, - "step": 1440 - }, - { - "epoch": 0.23618520177546118, - "grad_norm": 9.527036666870117, - "learning_rate": 1.763362702677882e-05, - "loss": 1.1838, - "step": 1450 - }, - { - "epoch": 0.2378140652359816, - "grad_norm": 10.506624221801758, - "learning_rate": 1.759991939889562e-05, - "loss": 1.3324, - "step": 1460 - }, - { - "epoch": 0.23944292869650202, - "grad_norm": 9.029693603515625, - "learning_rate": 1.7566006097818123e-05, - "loss": 1.2102, - "step": 1470 - }, - { - "epoch": 0.24107179215702243, - "grad_norm": 9.59033203125, - "learning_rate": 1.7531888041326715e-05, - "loss": 1.3553, - "step": 1480 - }, - { - "epoch": 0.24270065561754287, - "grad_norm": 9.097341537475586, - "learning_rate": 1.7497566152742975e-05, - "loss": 1.3165, - "step": 1490 - }, - { - "epoch": 0.24432951907806327, - "grad_norm": 19.68447494506836, - "learning_rate": 1.7463041360904714e-05, - "loss": 1.2733, - "step": 1500 - }, - { - "epoch": 0.24432951907806327, - "eval_loss": 1.1864172220230103, - "eval_runtime": 101.8915, - "eval_samples_per_second": 25.37, - "eval_steps_per_second": 25.37, - "step": 1500 - }, - { - "epoch": 0.2459583825385837, - "grad_norm": 9.446989059448242, - "learning_rate": 1.742831460014082e-05, - "loss": 1.1618, - "step": 1510 - }, - { - "epoch": 0.24758724599910412, - "grad_norm": 7.566542148590088, - "learning_rate": 1.7393386810245968e-05, - "loss": 1.271, - "step": 1520 - }, - { - "epoch": 0.24921610945962455, - "grad_norm": 12.757163047790527, - "learning_rate": 1.7358258936455203e-05, - "loss": 1.2472, - "step": 1530 - }, - { - "epoch": 0.25084497292014496, - "grad_norm": 6.86335563659668, - "learning_rate": 1.7322931929418338e-05, - "loss": 1.3589, - "step": 1540 - }, - { - "epoch": 0.2524738363806654, - "grad_norm": 8.989468574523926, - "learning_rate": 1.7287406745174253e-05, - "loss": 1.246, - "step": 1550 - }, - { - "epoch": 0.25410269984118583, - "grad_norm": 9.14529037475586, - "learning_rate": 1.7251684345125e-05, - "loss": 1.1678, - "step": 1560 - }, - { - "epoch": 0.2557315633017062, - "grad_norm": 9.417140007019043, - "learning_rate": 1.7215765696009795e-05, - "loss": 1.2109, - "step": 1570 - }, - { - "epoch": 0.25736042676222665, - "grad_norm": 8.230705261230469, - "learning_rate": 1.7179651769878854e-05, - "loss": 1.2815, - "step": 1580 - }, - { - "epoch": 0.2589892902227471, - "grad_norm": 7.2801289558410645, - 
"learning_rate": 1.7143343544067094e-05, - "loss": 1.3492, - "step": 1590 - }, - { - "epoch": 0.2606181536832675, - "grad_norm": 6.567676544189453, - "learning_rate": 1.7106842001167664e-05, - "loss": 1.2145, - "step": 1600 - }, - { - "epoch": 0.2622470171437879, - "grad_norm": 7.0790910720825195, - "learning_rate": 1.7070148129005373e-05, - "loss": 1.2142, - "step": 1610 - }, - { - "epoch": 0.26387588060430833, - "grad_norm": 11.59786319732666, - "learning_rate": 1.7033262920609947e-05, - "loss": 1.2607, - "step": 1620 - }, - { - "epoch": 0.26550474406482877, - "grad_norm": 7.409655570983887, - "learning_rate": 1.699618737418917e-05, - "loss": 1.246, - "step": 1630 - }, - { - "epoch": 0.2671336075253492, - "grad_norm": 7.234554767608643, - "learning_rate": 1.6958922493101844e-05, - "loss": 1.2376, - "step": 1640 - }, - { - "epoch": 0.2687624709858696, - "grad_norm": 8.587789535522461, - "learning_rate": 1.6921469285830654e-05, - "loss": 1.312, - "step": 1650 - }, - { - "epoch": 0.27039133444639, - "grad_norm": 8.513998031616211, - "learning_rate": 1.688382876595487e-05, - "loss": 1.3469, - "step": 1660 - }, - { - "epoch": 0.27202019790691045, - "grad_norm": 7.039551258087158, - "learning_rate": 1.684600195212293e-05, - "loss": 1.3468, - "step": 1670 - }, - { - "epoch": 0.2736490613674309, - "grad_norm": 11.176895141601562, - "learning_rate": 1.6807989868024845e-05, - "loss": 1.419, - "step": 1680 - }, - { - "epoch": 0.27527792482795127, - "grad_norm": 8.219011306762695, - "learning_rate": 1.676979354236452e-05, - "loss": 1.3015, - "step": 1690 - }, - { - "epoch": 0.2769067882884717, - "grad_norm": 10.750848770141602, - "learning_rate": 1.673141400883191e-05, - "loss": 1.2978, - "step": 1700 - }, - { - "epoch": 0.27853565174899214, - "grad_norm": 11.351160049438477, - "learning_rate": 1.6692852306075033e-05, - "loss": 1.2648, - "step": 1710 - }, - { - "epoch": 0.2801645152095126, - "grad_norm": 7.423020362854004, - "learning_rate": 1.665410947767188e-05, - "loss": 1.2352, - "step": 1720 - }, - { - "epoch": 0.28179337867003296, - "grad_norm": 7.212891101837158, - "learning_rate": 1.6615186572102154e-05, - "loss": 1.2899, - "step": 1730 - }, - { - "epoch": 0.2834222421305534, - "grad_norm": 8.066819190979004, - "learning_rate": 1.6576084642718915e-05, - "loss": 1.2763, - "step": 1740 - }, - { - "epoch": 0.28505110559107383, - "grad_norm": 8.984209060668945, - "learning_rate": 1.653680474772006e-05, - "loss": 1.3167, - "step": 1750 - }, - { - "epoch": 0.28667996905159426, - "grad_norm": 17.02054786682129, - "learning_rate": 1.6497347950119687e-05, - "loss": 1.2149, - "step": 1760 - }, - { - "epoch": 0.2883088325121147, - "grad_norm": 10.05130386352539, - "learning_rate": 1.645771531771933e-05, - "loss": 1.1194, - "step": 1770 - }, - { - "epoch": 0.2899376959726351, - "grad_norm": 7.705258369445801, - "learning_rate": 1.6417907923079057e-05, - "loss": 1.2395, - "step": 1780 - }, - { - "epoch": 0.2915665594331555, - "grad_norm": 10.932085037231445, - "learning_rate": 1.6377926843488462e-05, - "loss": 1.3887, - "step": 1790 - }, - { - "epoch": 0.29319542289367595, - "grad_norm": 12.111505508422852, - "learning_rate": 1.633777316093748e-05, - "loss": 1.2902, - "step": 1800 - }, - { - "epoch": 0.2948242863541964, - "grad_norm": 7.594025135040283, - "learning_rate": 1.6297447962087133e-05, - "loss": 1.3156, - "step": 1810 - }, - { - "epoch": 0.29645314981471677, - "grad_norm": 6.600276470184326, - "learning_rate": 1.625695233824011e-05, - "loss": 1.2334, - "step": 1820 - }, - { - "epoch": 
0.2980820132752372, - "grad_norm": 8.619099617004395, - "learning_rate": 1.621628738531123e-05, - "loss": 1.2654, - "step": 1830 - }, - { - "epoch": 0.29971087673575764, - "grad_norm": 8.581014633178711, - "learning_rate": 1.6175454203797786e-05, - "loss": 1.3179, - "step": 1840 - }, - { - "epoch": 0.30133974019627807, - "grad_norm": 11.676705360412598, - "learning_rate": 1.6134453898749778e-05, - "loss": 1.311, - "step": 1850 - }, - { - "epoch": 0.30296860365679845, - "grad_norm": 8.072620391845703, - "learning_rate": 1.6093287579739983e-05, - "loss": 1.1559, - "step": 1860 - }, - { - "epoch": 0.3045974671173189, - "grad_norm": 8.07971477508545, - "learning_rate": 1.605195636083395e-05, - "loss": 1.3077, - "step": 1870 - }, - { - "epoch": 0.3062263305778393, - "grad_norm": 8.255348205566406, - "learning_rate": 1.6010461360559823e-05, - "loss": 1.2889, - "step": 1880 - }, - { - "epoch": 0.30785519403835976, - "grad_norm": 9.381460189819336, - "learning_rate": 1.5968803701878107e-05, - "loss": 1.2049, - "step": 1890 - }, - { - "epoch": 0.30948405749888014, - "grad_norm": 10.263215065002441, - "learning_rate": 1.5926984512151243e-05, - "loss": 1.2165, - "step": 1900 - }, - { - "epoch": 0.3111129209594006, - "grad_norm": 7.69245719909668, - "learning_rate": 1.588500492311312e-05, - "loss": 1.2796, - "step": 1910 - }, - { - "epoch": 0.312741784419921, - "grad_norm": 6.975790023803711, - "learning_rate": 1.5842866070838444e-05, - "loss": 1.306, - "step": 1920 - }, - { - "epoch": 0.31437064788044145, - "grad_norm": 5.779184818267822, - "learning_rate": 1.5800569095711983e-05, - "loss": 1.233, - "step": 1930 - }, - { - "epoch": 0.3159995113409618, - "grad_norm": 8.1918306350708, - "learning_rate": 1.575811514239772e-05, - "loss": 1.2872, - "step": 1940 - }, - { - "epoch": 0.31762837480148226, - "grad_norm": 10.501468658447266, - "learning_rate": 1.5715505359807862e-05, - "loss": 1.1615, - "step": 1950 - }, - { - "epoch": 0.3192572382620027, - "grad_norm": 5.401251792907715, - "learning_rate": 1.567274090107176e-05, - "loss": 1.2808, - "step": 1960 - }, - { - "epoch": 0.32088610172252313, - "grad_norm": 7.17600154876709, - "learning_rate": 1.5629822923504692e-05, - "loss": 1.3381, - "step": 1970 - }, - { - "epoch": 0.3225149651830435, - "grad_norm": 7.486255645751953, - "learning_rate": 1.558675258857654e-05, - "loss": 1.2904, - "step": 1980 - }, - { - "epoch": 0.32414382864356395, - "grad_norm": 11.746787071228027, - "learning_rate": 1.5543531061880374e-05, - "loss": 1.2446, - "step": 1990 - }, - { - "epoch": 0.3257726921040844, - "grad_norm": 8.739921569824219, - "learning_rate": 1.55001595131009e-05, - "loss": 1.265, - "step": 2000 - }, - { - "epoch": 0.3257726921040844, - "eval_loss": 1.175255537033081, - "eval_runtime": 102.6201, - "eval_samples_per_second": 25.19, - "eval_steps_per_second": 25.19, - "step": 2000 - }, - { - "epoch": 0.3274015555646048, - "grad_norm": 7.848710536956787, - "learning_rate": 1.5456639115982795e-05, - "loss": 1.3146, - "step": 2010 - }, - { - "epoch": 0.3290304190251252, - "grad_norm": 11.283839225769043, - "learning_rate": 1.5412971048298964e-05, - "loss": 1.0946, - "step": 2020 - }, - { - "epoch": 0.33065928248564563, - "grad_norm": 11.305326461791992, - "learning_rate": 1.536915649181864e-05, - "loss": 1.2178, - "step": 2030 - }, - { - "epoch": 0.33228814594616607, - "grad_norm": 7.214547157287598, - "learning_rate": 1.5325196632275424e-05, - "loss": 1.2246, - "step": 2040 - }, - { - "epoch": 0.3339170094066865, - "grad_norm": 8.374602317810059, - 
"learning_rate": 1.528109265933519e-05, - "loss": 1.2537, - "step": 2050 - }, - { - "epoch": 0.3355458728672069, - "grad_norm": 7.756241321563721, - "learning_rate": 1.5236845766563881e-05, - "loss": 1.2351, - "step": 2060 - }, - { - "epoch": 0.3371747363277273, - "grad_norm": 13.376108169555664, - "learning_rate": 1.5192457151395226e-05, - "loss": 1.2366, - "step": 2070 - }, - { - "epoch": 0.33880359978824776, - "grad_norm": 8.468932151794434, - "learning_rate": 1.5147928015098309e-05, - "loss": 1.2931, - "step": 2080 - }, - { - "epoch": 0.3404324632487682, - "grad_norm": 7.511579513549805, - "learning_rate": 1.5103259562745084e-05, - "loss": 1.3374, - "step": 2090 - }, - { - "epoch": 0.34206132670928857, - "grad_norm": 8.611263275146484, - "learning_rate": 1.5058453003177756e-05, - "loss": 1.3561, - "step": 2100 - }, - { - "epoch": 0.343690190169809, - "grad_norm": 8.382079124450684, - "learning_rate": 1.5013509548976049e-05, - "loss": 1.34, - "step": 2110 - }, - { - "epoch": 0.34531905363032944, - "grad_norm": 6.7042365074157715, - "learning_rate": 1.4968430416424417e-05, - "loss": 1.2588, - "step": 2120 - }, - { - "epoch": 0.3469479170908499, - "grad_norm": 8.887292861938477, - "learning_rate": 1.4923216825479115e-05, - "loss": 1.3223, - "step": 2130 - }, - { - "epoch": 0.34857678055137026, - "grad_norm": 6.505458831787109, - "learning_rate": 1.4877869999735175e-05, - "loss": 1.3147, - "step": 2140 - }, - { - "epoch": 0.3502056440118907, - "grad_norm": 12.044365882873535, - "learning_rate": 1.4832391166393316e-05, - "loss": 1.2917, - "step": 2150 - }, - { - "epoch": 0.35183450747241113, - "grad_norm": 7.598618507385254, - "learning_rate": 1.4786781556226713e-05, - "loss": 1.1821, - "step": 2160 - }, - { - "epoch": 0.35346337093293156, - "grad_norm": 8.489738464355469, - "learning_rate": 1.4741042403547692e-05, - "loss": 1.2837, - "step": 2170 - }, - { - "epoch": 0.35509223439345194, - "grad_norm": 12.408584594726562, - "learning_rate": 1.4695174946174334e-05, - "loss": 1.2392, - "step": 2180 - }, - { - "epoch": 0.3567210978539724, - "grad_norm": 9.772265434265137, - "learning_rate": 1.4649180425396972e-05, - "loss": 1.3963, - "step": 2190 - }, - { - "epoch": 0.3583499613144928, - "grad_norm": 8.034818649291992, - "learning_rate": 1.4603060085944594e-05, - "loss": 1.2304, - "step": 2200 - }, - { - "epoch": 0.35997882477501325, - "grad_norm": 8.509257316589355, - "learning_rate": 1.455681517595117e-05, - "loss": 1.1302, - "step": 2210 - }, - { - "epoch": 0.36160768823553363, - "grad_norm": 7.198489189147949, - "learning_rate": 1.4510446946921857e-05, - "loss": 1.3122, - "step": 2220 - }, - { - "epoch": 0.36323655169605407, - "grad_norm": 13.541308403015137, - "learning_rate": 1.4463956653699148e-05, - "loss": 1.2654, - "step": 2230 - }, - { - "epoch": 0.3648654151565745, - "grad_norm": 9.754889488220215, - "learning_rate": 1.4417345554428898e-05, - "loss": 1.2936, - "step": 2240 - }, - { - "epoch": 0.36649427861709494, - "grad_norm": 11.386918067932129, - "learning_rate": 1.437061491052629e-05, - "loss": 1.2025, - "step": 2250 - }, - { - "epoch": 0.3681231420776153, - "grad_norm": 9.887473106384277, - "learning_rate": 1.4323765986641681e-05, - "loss": 1.3471, - "step": 2260 - }, - { - "epoch": 0.36975200553813575, - "grad_norm": 11.245658874511719, - "learning_rate": 1.4276800050626385e-05, - "loss": 1.3279, - "step": 2270 - }, - { - "epoch": 0.3713808689986562, - "grad_norm": 5.411240100860596, - "learning_rate": 1.4229718373498371e-05, - "loss": 1.2211, - "step": 2280 - }, - { - 
"epoch": 0.3730097324591766, - "grad_norm": 9.82564926147461, - "learning_rate": 1.4182522229407854e-05, - "loss": 1.0828, - "step": 2290 - }, - { - "epoch": 0.374638595919697, - "grad_norm": 10.895565032958984, - "learning_rate": 1.413521289560281e-05, - "loss": 1.2662, - "step": 2300 - }, - { - "epoch": 0.37626745938021744, - "grad_norm": 8.565170288085938, - "learning_rate": 1.4087791652394427e-05, - "loss": 1.1959, - "step": 2310 - }, - { - "epoch": 0.3778963228407379, - "grad_norm": 8.679253578186035, - "learning_rate": 1.404025978312244e-05, - "loss": 1.3137, - "step": 2320 - }, - { - "epoch": 0.3795251863012583, - "grad_norm": 7.74443244934082, - "learning_rate": 1.3992618574120415e-05, - "loss": 1.2227, - "step": 2330 - }, - { - "epoch": 0.38115404976177875, - "grad_norm": 6.578245639801025, - "learning_rate": 1.3944869314680922e-05, - "loss": 1.2857, - "step": 2340 - }, - { - "epoch": 0.3827829132222991, - "grad_norm": 7.3876237869262695, - "learning_rate": 1.3897013297020651e-05, - "loss": 1.3174, - "step": 2350 - }, - { - "epoch": 0.38441177668281956, - "grad_norm": 7.339137077331543, - "learning_rate": 1.3849051816245451e-05, - "loss": 1.281, - "step": 2360 - }, - { - "epoch": 0.38604064014334, - "grad_norm": 9.710099220275879, - "learning_rate": 1.3800986170315263e-05, - "loss": 1.3175, - "step": 2370 - }, - { - "epoch": 0.38766950360386043, - "grad_norm": 8.572495460510254, - "learning_rate": 1.3752817660009004e-05, - "loss": 1.3693, - "step": 2380 - }, - { - "epoch": 0.3892983670643808, - "grad_norm": 8.108867645263672, - "learning_rate": 1.3704547588889368e-05, - "loss": 1.3073, - "step": 2390 - }, - { - "epoch": 0.39092723052490125, - "grad_norm": 6.642714500427246, - "learning_rate": 1.3656177263267534e-05, - "loss": 1.2876, - "step": 2400 - }, - { - "epoch": 0.3925560939854217, - "grad_norm": 8.35530948638916, - "learning_rate": 1.3607707992167836e-05, - "loss": 1.3244, - "step": 2410 - }, - { - "epoch": 0.3941849574459421, - "grad_norm": 13.95084285736084, - "learning_rate": 1.3559141087292313e-05, - "loss": 1.2599, - "step": 2420 - }, - { - "epoch": 0.3958138209064625, - "grad_norm": 9.53144645690918, - "learning_rate": 1.3510477862985233e-05, - "loss": 1.1835, - "step": 2430 - }, - { - "epoch": 0.39744268436698293, - "grad_norm": 7.759304046630859, - "learning_rate": 1.3461719636197503e-05, - "loss": 1.2536, - "step": 2440 - }, - { - "epoch": 0.39907154782750337, - "grad_norm": 12.239033699035645, - "learning_rate": 1.3412867726451051e-05, - "loss": 1.2358, - "step": 2450 - }, - { - "epoch": 0.4007004112880238, - "grad_norm": 8.611287117004395, - "learning_rate": 1.3363923455803098e-05, - "loss": 1.273, - "step": 2460 - }, - { - "epoch": 0.4023292747485442, - "grad_norm": 9.023387908935547, - "learning_rate": 1.3314888148810381e-05, - "loss": 1.2195, - "step": 2470 - }, - { - "epoch": 0.4039581382090646, - "grad_norm": 10.45922565460205, - "learning_rate": 1.3265763132493325e-05, - "loss": 1.2142, - "step": 2480 - }, - { - "epoch": 0.40558700166958506, - "grad_norm": 9.23582649230957, - "learning_rate": 1.3216549736300108e-05, - "loss": 1.3208, - "step": 2490 - }, - { - "epoch": 0.4072158651301055, - "grad_norm": 12.40174674987793, - "learning_rate": 1.3167249292070701e-05, - "loss": 1.2436, - "step": 2500 - }, - { - "epoch": 0.4072158651301055, - "eval_loss": 1.1542352437973022, - "eval_runtime": 102.4621, - "eval_samples_per_second": 25.229, - "eval_steps_per_second": 25.229, - "step": 2500 - }, - { - "epoch": 0.40884472859062587, - "grad_norm": 11.083693504333496, 
- "learning_rate": 1.311786313400081e-05, - "loss": 1.1723, - "step": 2510 - }, - { - "epoch": 0.4104735920511463, - "grad_norm": 6.602702617645264, - "learning_rate": 1.3068392598605775e-05, - "loss": 1.3112, - "step": 2520 - }, - { - "epoch": 0.41210245551166674, - "grad_norm": 5.926374912261963, - "learning_rate": 1.3018839024684407e-05, - "loss": 1.1096, - "step": 2530 - }, - { - "epoch": 0.4137313189721872, - "grad_norm": 8.42911148071289, - "learning_rate": 1.296920375328275e-05, - "loss": 1.3634, - "step": 2540 - }, - { - "epoch": 0.41536018243270756, - "grad_norm": 7.443211555480957, - "learning_rate": 1.2919488127657788e-05, - "loss": 1.1775, - "step": 2550 - }, - { - "epoch": 0.416989045893228, - "grad_norm": 7.255467414855957, - "learning_rate": 1.28696934932411e-05, - "loss": 1.2215, - "step": 2560 - }, - { - "epoch": 0.41861790935374843, - "grad_norm": 7.436680316925049, - "learning_rate": 1.2819821197602434e-05, - "loss": 1.3608, - "step": 2570 - }, - { - "epoch": 0.42024677281426887, - "grad_norm": 9.471477508544922, - "learning_rate": 1.2769872590413262e-05, - "loss": 1.2015, - "step": 2580 - }, - { - "epoch": 0.42187563627478925, - "grad_norm": 10.229029655456543, - "learning_rate": 1.271984902341023e-05, - "loss": 1.1578, - "step": 2590 - }, - { - "epoch": 0.4235044997353097, - "grad_norm": 9.242801666259766, - "learning_rate": 1.2669751850358593e-05, - "loss": 1.2569, - "step": 2600 - }, - { - "epoch": 0.4251333631958301, - "grad_norm": 7.492494106292725, - "learning_rate": 1.2619582427015575e-05, - "loss": 1.1613, - "step": 2610 - }, - { - "epoch": 0.42676222665635055, - "grad_norm": 9.39388370513916, - "learning_rate": 1.256934211109367e-05, - "loss": 1.2344, - "step": 2620 - }, - { - "epoch": 0.42839109011687093, - "grad_norm": 17.988927841186523, - "learning_rate": 1.2519032262223913e-05, - "loss": 1.2001, - "step": 2630 - }, - { - "epoch": 0.43001995357739137, - "grad_norm": 8.884842872619629, - "learning_rate": 1.2468654241919077e-05, - "loss": 1.3394, - "step": 2640 - }, - { - "epoch": 0.4316488170379118, - "grad_norm": 5.588969707489014, - "learning_rate": 1.2418209413536822e-05, - "loss": 1.2306, - "step": 2650 - }, - { - "epoch": 0.43327768049843224, - "grad_norm": 9.964385032653809, - "learning_rate": 1.2367699142242808e-05, - "loss": 1.3004, - "step": 2660 - }, - { - "epoch": 0.4349065439589526, - "grad_norm": 9.903732299804688, - "learning_rate": 1.2317124794973757e-05, - "loss": 1.2649, - "step": 2670 - }, - { - "epoch": 0.43653540741947305, - "grad_norm": 8.422842979431152, - "learning_rate": 1.2266487740400432e-05, - "loss": 1.3842, - "step": 2680 - }, - { - "epoch": 0.4381642708799935, - "grad_norm": 6.042238235473633, - "learning_rate": 1.2215789348890627e-05, - "loss": 1.1659, - "step": 2690 - }, - { - "epoch": 0.4397931343405139, - "grad_norm": 10.695905685424805, - "learning_rate": 1.216503099247207e-05, - "loss": 1.1943, - "step": 2700 - }, - { - "epoch": 0.4414219978010343, - "grad_norm": 7.222139835357666, - "learning_rate": 1.2114214044795287e-05, - "loss": 1.1867, - "step": 2710 - }, - { - "epoch": 0.44305086126155474, - "grad_norm": 8.005696296691895, - "learning_rate": 1.206333988109644e-05, - "loss": 1.2714, - "step": 2720 - }, - { - "epoch": 0.4446797247220752, - "grad_norm": 10.262436866760254, - "learning_rate": 1.2012409878160093e-05, - "loss": 1.2536, - "step": 2730 - }, - { - "epoch": 0.4463085881825956, - "grad_norm": 5.630941867828369, - "learning_rate": 1.196142541428197e-05, - "loss": 1.2511, - "step": 2740 - }, - { - "epoch": 
0.447937451643116, - "grad_norm": 5.890045642852783, - "learning_rate": 1.1910387869231646e-05, - "loss": 1.204, - "step": 2750 - }, - { - "epoch": 0.4495663151036364, - "grad_norm": 7.118770122528076, - "learning_rate": 1.1859298624215202e-05, - "loss": 1.2963, - "step": 2760 - }, - { - "epoch": 0.45119517856415686, - "grad_norm": 6.118231773376465, - "learning_rate": 1.180815906183786e-05, - "loss": 1.1031, - "step": 2770 - }, - { - "epoch": 0.4528240420246773, - "grad_norm": 9.652909278869629, - "learning_rate": 1.175697056606655e-05, - "loss": 1.2406, - "step": 2780 - }, - { - "epoch": 0.4544529054851977, - "grad_norm": 8.817192077636719, - "learning_rate": 1.170573452219247e-05, - "loss": 1.1656, - "step": 2790 - }, - { - "epoch": 0.4560817689457181, - "grad_norm": 7.891348838806152, - "learning_rate": 1.1654452316793592e-05, - "loss": 1.1508, - "step": 2800 - }, - { - "epoch": 0.45771063240623855, - "grad_norm": 7.836014270782471, - "learning_rate": 1.1603125337697129e-05, - "loss": 1.2084, - "step": 2810 - }, - { - "epoch": 0.459339495866759, - "grad_norm": 7.247401714324951, - "learning_rate": 1.1551754973941996e-05, - "loss": 1.2001, - "step": 2820 - }, - { - "epoch": 0.46096835932727936, - "grad_norm": 7.9783830642700195, - "learning_rate": 1.1500342615741193e-05, - "loss": 1.2263, - "step": 2830 - }, - { - "epoch": 0.4625972227877998, - "grad_norm": 8.071614265441895, - "learning_rate": 1.144888965444421e-05, - "loss": 1.1689, - "step": 2840 - }, - { - "epoch": 0.46422608624832024, - "grad_norm": 13.862833976745605, - "learning_rate": 1.1397397482499352e-05, - "loss": 1.2704, - "step": 2850 - }, - { - "epoch": 0.46585494970884067, - "grad_norm": 10.272599220275879, - "learning_rate": 1.1345867493416067e-05, - "loss": 1.3094, - "step": 2860 - }, - { - "epoch": 0.46748381316936105, - "grad_norm": 8.56863784790039, - "learning_rate": 1.1294301081727235e-05, - "loss": 1.2395, - "step": 2870 - }, - { - "epoch": 0.4691126766298815, - "grad_norm": 9.376766204833984, - "learning_rate": 1.1242699642951411e-05, - "loss": 1.2032, - "step": 2880 - }, - { - "epoch": 0.4707415400904019, - "grad_norm": 10.573633193969727, - "learning_rate": 1.1191064573555094e-05, - "loss": 1.2421, - "step": 2890 - }, - { - "epoch": 0.47237040355092236, - "grad_norm": 10.285879135131836, - "learning_rate": 1.1139397270914893e-05, - "loss": 1.3089, - "step": 2900 - }, - { - "epoch": 0.4739992670114428, - "grad_norm": 6.976883888244629, - "learning_rate": 1.1087699133279743e-05, - "loss": 1.1944, - "step": 2910 - }, - { - "epoch": 0.4756281304719632, - "grad_norm": 8.282888412475586, - "learning_rate": 1.1035971559733047e-05, - "loss": 1.2164, - "step": 2920 - }, - { - "epoch": 0.4772569939324836, - "grad_norm": 12.169346809387207, - "learning_rate": 1.0984215950154821e-05, - "loss": 1.2526, - "step": 2930 - }, - { - "epoch": 0.47888585739300404, - "grad_norm": 10.577465057373047, - "learning_rate": 1.0932433705183806e-05, - "loss": 1.252, - "step": 2940 - }, - { - "epoch": 0.4805147208535245, - "grad_norm": 10.231258392333984, - "learning_rate": 1.0880626226179566e-05, - "loss": 1.2123, - "step": 2950 - }, - { - "epoch": 0.48214358431404486, - "grad_norm": 9.532476425170898, - "learning_rate": 1.0828794915184556e-05, - "loss": 1.0514, - "step": 2960 - }, - { - "epoch": 0.4837724477745653, - "grad_norm": 7.888957500457764, - "learning_rate": 1.0776941174886204e-05, - "loss": 1.1282, - "step": 2970 - }, - { - "epoch": 0.48540131123508573, - "grad_norm": 13.633064270019531, - "learning_rate": 
1.072506640857891e-05, - "loss": 1.2246, - "step": 2980 - }, - { - "epoch": 0.48703017469560617, - "grad_norm": 10.267495155334473, - "learning_rate": 1.06731720201261e-05, - "loss": 1.2691, - "step": 2990 - }, - { - "epoch": 0.48865903815612655, - "grad_norm": 11.644023895263672, - "learning_rate": 1.0621259413922234e-05, - "loss": 1.2935, - "step": 3000 - }, - { - "epoch": 0.48865903815612655, - "eval_loss": 1.1448081731796265, - "eval_runtime": 102.8493, - "eval_samples_per_second": 25.134, - "eval_steps_per_second": 25.134, - "step": 3000 - }, - { - "epoch": 0.490287901616647, - "grad_norm": 9.024466514587402, - "learning_rate": 1.056932999485477e-05, - "loss": 1.1484, - "step": 3010 - }, - { - "epoch": 0.4919167650771674, - "grad_norm": 5.3878655433654785, - "learning_rate": 1.0517385168266193e-05, - "loss": 1.1989, - "step": 3020 - }, - { - "epoch": 0.49354562853768785, - "grad_norm": 7.960078716278076, - "learning_rate": 1.0465426339915927e-05, - "loss": 1.28, - "step": 3030 - }, - { - "epoch": 0.49517449199820823, - "grad_norm": 9.044086456298828, - "learning_rate": 1.041345491594234e-05, - "loss": 1.1462, - "step": 3040 - }, - { - "epoch": 0.49680335545872867, - "grad_norm": 9.06221866607666, - "learning_rate": 1.0361472302824656e-05, - "loss": 1.2558, - "step": 3050 - }, - { - "epoch": 0.4984322189192491, - "grad_norm": 7.99545955657959, - "learning_rate": 1.0309479907344915e-05, - "loss": 1.3108, - "step": 3060 - }, - { - "epoch": 0.5000610823797695, - "grad_norm": 11.91790771484375, - "learning_rate": 1.0257479136549889e-05, - "loss": 1.3106, - "step": 3070 - }, - { - "epoch": 0.5016899458402899, - "grad_norm": 9.534984588623047, - "learning_rate": 1.0205471397713002e-05, - "loss": 1.2752, - "step": 3080 - }, - { - "epoch": 0.5033188093008104, - "grad_norm": 8.188611030578613, - "learning_rate": 1.0153458098296265e-05, - "loss": 1.2704, - "step": 3090 - }, - { - "epoch": 0.5049476727613308, - "grad_norm": 10.754870414733887, - "learning_rate": 1.0101440645912156e-05, - "loss": 1.2822, - "step": 3100 - }, - { - "epoch": 0.5065765362218512, - "grad_norm": 9.690641403198242, - "learning_rate": 1.0049420448285554e-05, - "loss": 1.1982, - "step": 3110 - }, - { - "epoch": 0.5082053996823717, - "grad_norm": 7.106996059417725, - "learning_rate": 9.997398913215629e-06, - "loss": 1.1991, - "step": 3120 - }, - { - "epoch": 0.509834263142892, - "grad_norm": 9.254938125610352, - "learning_rate": 9.945377448537744e-06, - "loss": 1.356, - "step": 3130 - }, - { - "epoch": 0.5114631266034124, - "grad_norm": 8.23971939086914, - "learning_rate": 9.893357462085355e-06, - "loss": 1.1756, - "step": 3140 - }, - { - "epoch": 0.5130919900639329, - "grad_norm": 7.241464614868164, - "learning_rate": 9.841340361651921e-06, - "loss": 1.1899, - "step": 3150 - }, - { - "epoch": 0.5147208535244533, - "grad_norm": 10.916207313537598, - "learning_rate": 9.78932755495279e-06, - "loss": 1.261, - "step": 3160 - }, - { - "epoch": 0.5163497169849738, - "grad_norm": 10.14999771118164, - "learning_rate": 9.737320449587113e-06, - "loss": 1.244, - "step": 3170 - }, - { - "epoch": 0.5179785804454942, - "grad_norm": 10.535028457641602, - "learning_rate": 9.68532045299975e-06, - "loss": 1.2962, - "step": 3180 - }, - { - "epoch": 0.5196074439060145, - "grad_norm": 12.43609619140625, - "learning_rate": 9.63332897244318e-06, - "loss": 1.2515, - "step": 3190 - }, - { - "epoch": 0.521236307366535, - "grad_norm": 8.834859848022461, - "learning_rate": 9.581347414939416e-06, - "loss": 1.2559, - "step": 3200 - }, - { - "epoch": 
0.5228651708270554, - "grad_norm": 9.219295501708984, - "learning_rate": 9.529377187241921e-06, - "loss": 1.0892, - "step": 3210 - }, - { - "epoch": 0.5244940342875758, - "grad_norm": 6.687707424163818, - "learning_rate": 9.477419695797551e-06, - "loss": 1.2412, - "step": 3220 - }, - { - "epoch": 0.5261228977480963, - "grad_norm": 7.913536548614502, - "learning_rate": 9.425476346708489e-06, - "loss": 1.3012, - "step": 3230 - }, - { - "epoch": 0.5277517612086167, - "grad_norm": 10.74738597869873, - "learning_rate": 9.373548545694189e-06, - "loss": 1.2319, - "step": 3240 - }, - { - "epoch": 0.5293806246691372, - "grad_norm": 8.231693267822266, - "learning_rate": 9.321637698053327e-06, - "loss": 1.1856, - "step": 3250 - }, - { - "epoch": 0.5310094881296575, - "grad_norm": 10.926920890808105, - "learning_rate": 9.269745208625784e-06, - "loss": 1.3557, - "step": 3260 - }, - { - "epoch": 0.5326383515901779, - "grad_norm": 7.84801721572876, - "learning_rate": 9.217872481754619e-06, - "loss": 1.2767, - "step": 3270 - }, - { - "epoch": 0.5342672150506984, - "grad_norm": 9.671969413757324, - "learning_rate": 9.16602092124807e-06, - "loss": 1.2934, - "step": 3280 - }, - { - "epoch": 0.5358960785112188, - "grad_norm": 11.278433799743652, - "learning_rate": 9.11419193034155e-06, - "loss": 1.2475, - "step": 3290 - }, - { - "epoch": 0.5375249419717392, - "grad_norm": 8.938894271850586, - "learning_rate": 9.062386911659692e-06, - "loss": 1.2476, - "step": 3300 - }, - { - "epoch": 0.5391538054322597, - "grad_norm": 10.644549369812012, - "learning_rate": 9.010607267178372e-06, - "loss": 1.2407, - "step": 3310 - }, - { - "epoch": 0.54078266889278, - "grad_norm": 10.27429485321045, - "learning_rate": 8.958854398186774e-06, - "loss": 1.3306, - "step": 3320 - }, - { - "epoch": 0.5424115323533005, - "grad_norm": 5.996701717376709, - "learning_rate": 8.90712970524948e-06, - "loss": 1.162, - "step": 3330 - }, - { - "epoch": 0.5440403958138209, - "grad_norm": 11.066450119018555, - "learning_rate": 8.855434588168543e-06, - "loss": 1.16, - "step": 3340 - }, - { - "epoch": 0.5456692592743413, - "grad_norm": 8.821560859680176, - "learning_rate": 8.803770445945626e-06, - "loss": 1.2471, - "step": 3350 - }, - { - "epoch": 0.5472981227348618, - "grad_norm": 8.159625053405762, - "learning_rate": 8.752138676744128e-06, - "loss": 1.1295, - "step": 3360 - }, - { - "epoch": 0.5489269861953822, - "grad_norm": 10.010625839233398, - "learning_rate": 8.70054067785136e-06, - "loss": 1.1393, - "step": 3370 - }, - { - "epoch": 0.5505558496559025, - "grad_norm": 8.413679122924805, - "learning_rate": 8.648977845640713e-06, - "loss": 1.3249, - "step": 3380 - }, - { - "epoch": 0.552184713116423, - "grad_norm": 7.034830570220947, - "learning_rate": 8.597451575533884e-06, - "loss": 1.1537, - "step": 3390 - }, - { - "epoch": 0.5538135765769434, - "grad_norm": 7.142354488372803, - "learning_rate": 8.545963261963102e-06, - "loss": 1.3551, - "step": 3400 - }, - { - "epoch": 0.5554424400374639, - "grad_norm": 11.520668029785156, - "learning_rate": 8.494514298333401e-06, - "loss": 1.2437, - "step": 3410 - }, - { - "epoch": 0.5570713034979843, - "grad_norm": 9.558789253234863, - "learning_rate": 8.443106076984895e-06, - "loss": 1.3416, - "step": 3420 - }, - { - "epoch": 0.5587001669585047, - "grad_norm": 7.987767696380615, - "learning_rate": 8.39173998915512e-06, - "loss": 1.1504, - "step": 3430 - }, - { - "epoch": 0.5603290304190252, - "grad_norm": 8.003718376159668, - "learning_rate": 8.340417424941363e-06, - "loss": 1.1578, - "step": 3440 - }, 
- { - "epoch": 0.5619578938795455, - "grad_norm": 9.900299072265625, - "learning_rate": 8.289139773263057e-06, - "loss": 1.2571, - "step": 3450 - }, - { - "epoch": 0.5635867573400659, - "grad_norm": 7.391176700592041, - "learning_rate": 8.237908421824186e-06, - "loss": 1.3128, - "step": 3460 - }, - { - "epoch": 0.5652156208005864, - "grad_norm": 7.9699387550354, - "learning_rate": 8.186724757075725e-06, - "loss": 1.2942, - "step": 3470 - }, - { - "epoch": 0.5668444842611068, - "grad_norm": 13.502717018127441, - "learning_rate": 8.135590164178136e-06, - "loss": 1.157, - "step": 3480 - }, - { - "epoch": 0.5684733477216273, - "grad_norm": 10.46247673034668, - "learning_rate": 8.084506026963859e-06, - "loss": 1.1876, - "step": 3490 - }, - { - "epoch": 0.5701022111821477, - "grad_norm": 6.954629898071289, - "learning_rate": 8.033473727899889e-06, - "loss": 1.2595, - "step": 3500 - }, - { - "epoch": 0.5701022111821477, - "eval_loss": 1.13480544090271, - "eval_runtime": 102.3333, - "eval_samples_per_second": 25.261, - "eval_steps_per_second": 25.261, - "step": 3500 - }, - { - "epoch": 0.571731074642668, - "grad_norm": 9.53747272491455, - "learning_rate": 7.982494648050341e-06, - "loss": 1.3107, - "step": 3510 - }, - { - "epoch": 0.5733599381031885, - "grad_norm": 9.204121589660645, - "learning_rate": 7.93157016703908e-06, - "loss": 1.3048, - "step": 3520 - }, - { - "epoch": 0.5749888015637089, - "grad_norm": 8.356864929199219, - "learning_rate": 7.880701663012387e-06, - "loss": 1.1239, - "step": 3530 - }, - { - "epoch": 0.5766176650242294, - "grad_norm": 6.577971935272217, - "learning_rate": 7.829890512601672e-06, - "loss": 1.2206, - "step": 3540 - }, - { - "epoch": 0.5782465284847498, - "grad_norm": 7.473880767822266, - "learning_rate": 7.779138090886202e-06, - "loss": 1.1229, - "step": 3550 - }, - { - "epoch": 0.5798753919452702, - "grad_norm": 10.256245613098145, - "learning_rate": 7.728445771355897e-06, - "loss": 1.2466, - "step": 3560 - }, - { - "epoch": 0.5815042554057906, - "grad_norm": 3.6034979820251465, - "learning_rate": 7.677814925874159e-06, - "loss": 1.1838, - "step": 3570 - }, - { - "epoch": 0.583133118866311, - "grad_norm": 13.721611976623535, - "learning_rate": 7.627246924640744e-06, - "loss": 1.2497, - "step": 3580 - }, - { - "epoch": 0.5847619823268314, - "grad_norm": 12.126471519470215, - "learning_rate": 7.57674313615469e-06, - "loss": 1.2542, - "step": 3590 - }, - { - "epoch": 0.5863908457873519, - "grad_norm": 9.544647216796875, - "learning_rate": 7.5263049271772645e-06, - "loss": 1.2369, - "step": 3600 - }, - { - "epoch": 0.5880197092478723, - "grad_norm": 9.008516311645508, - "learning_rate": 7.475933662694993e-06, - "loss": 1.1323, - "step": 3610 - }, - { - "epoch": 0.5896485727083928, - "grad_norm": 14.236581802368164, - "learning_rate": 7.425630705882707e-06, - "loss": 1.1859, - "step": 3620 - }, - { - "epoch": 0.5912774361689132, - "grad_norm": 7.743088722229004, - "learning_rate": 7.375397418066665e-06, - "loss": 1.2496, - "step": 3630 - }, - { - "epoch": 0.5929062996294335, - "grad_norm": 10.839030265808105, - "learning_rate": 7.3252351586876955e-06, - "loss": 1.2483, - "step": 3640 - }, - { - "epoch": 0.594535163089954, - "grad_norm": 10.632163047790527, - "learning_rate": 7.275145285264424e-06, - "loss": 1.2619, - "step": 3650 - }, - { - "epoch": 0.5961640265504744, - "grad_norm": 8.51111125946045, - "learning_rate": 7.2251291533565245e-06, - "loss": 1.2082, - "step": 3660 - }, - { - "epoch": 0.5977928900109948, - "grad_norm": 8.234833717346191, - "learning_rate": 
7.175188116528044e-06, - "loss": 1.2497, - "step": 3670 - }, - { - "epoch": 0.5994217534715153, - "grad_norm": 12.51526927947998, - "learning_rate": 7.125323526310752e-06, - "loss": 1.2674, - "step": 3680 - }, - { - "epoch": 0.6010506169320357, - "grad_norm": 8.174943923950195, - "learning_rate": 7.0755367321675915e-06, - "loss": 1.1269, - "step": 3690 - }, - { - "epoch": 0.6026794803925561, - "grad_norm": 8.4658842086792, - "learning_rate": 7.025829081456137e-06, - "loss": 1.2499, - "step": 3700 - }, - { - "epoch": 0.6043083438530765, - "grad_norm": 7.7957987785339355, - "learning_rate": 6.976201919392138e-06, - "loss": 1.2735, - "step": 3710 - }, - { - "epoch": 0.6059372073135969, - "grad_norm": 7.680945873260498, - "learning_rate": 6.926656589013127e-06, - "loss": 1.2044, - "step": 3720 - }, - { - "epoch": 0.6075660707741174, - "grad_norm": 8.13155460357666, - "learning_rate": 6.877194431142055e-06, - "loss": 1.2299, - "step": 3730 - }, - { - "epoch": 0.6091949342346378, - "grad_norm": 7.943605899810791, - "learning_rate": 6.827816784351011e-06, - "loss": 1.1769, - "step": 3740 - }, - { - "epoch": 0.6108237976951582, - "grad_norm": 8.881424903869629, - "learning_rate": 6.778524984924999e-06, - "loss": 1.1524, - "step": 3750 - }, - { - "epoch": 0.6124526611556786, - "grad_norm": 9.679656028747559, - "learning_rate": 6.729320366825785e-06, - "loss": 1.2131, - "step": 3760 - }, - { - "epoch": 0.614081524616199, - "grad_norm": 8.224056243896484, - "learning_rate": 6.68020426165577e-06, - "loss": 1.254, - "step": 3770 - }, - { - "epoch": 0.6157103880767195, - "grad_norm": 7.536296844482422, - "learning_rate": 6.631177998621982e-06, - "loss": 1.2487, - "step": 3780 - }, - { - "epoch": 0.6173392515372399, - "grad_norm": 8.117711067199707, - "learning_rate": 6.582242904500085e-06, - "loss": 1.2872, - "step": 3790 - }, - { - "epoch": 0.6189681149977603, - "grad_norm": 9.820115089416504, - "learning_rate": 6.53340030359848e-06, - "loss": 1.3597, - "step": 3800 - }, - { - "epoch": 0.6205969784582808, - "grad_norm": 8.39965534210205, - "learning_rate": 6.4846515177224735e-06, - "loss": 1.2167, - "step": 3810 - }, - { - "epoch": 0.6222258419188011, - "grad_norm": 14.166945457458496, - "learning_rate": 6.435997866138488e-06, - "loss": 1.347, - "step": 3820 - }, - { - "epoch": 0.6238547053793215, - "grad_norm": 7.368659496307373, - "learning_rate": 6.3874406655383755e-06, - "loss": 1.1975, - "step": 3830 - }, - { - "epoch": 0.625483568839842, - "grad_norm": 6.8791913986206055, - "learning_rate": 6.3389812300037774e-06, - "loss": 1.3515, - "step": 3840 - }, - { - "epoch": 0.6271124323003624, - "grad_norm": 8.488605499267578, - "learning_rate": 6.290620870970561e-06, - "loss": 1.2868, - "step": 3850 - }, - { - "epoch": 0.6287412957608829, - "grad_norm": 11.543606758117676, - "learning_rate": 6.242360897193331e-06, - "loss": 1.1796, - "step": 3860 - }, - { - "epoch": 0.6303701592214033, - "grad_norm": 7.279272556304932, - "learning_rate": 6.194202614710015e-06, - "loss": 1.1563, - "step": 3870 - }, - { - "epoch": 0.6319990226819237, - "grad_norm": 6.415475368499756, - "learning_rate": 6.146147326806509e-06, - "loss": 1.246, - "step": 3880 - }, - { - "epoch": 0.6336278861424441, - "grad_norm": 8.384225845336914, - "learning_rate": 6.098196333981421e-06, - "loss": 1.2252, - "step": 3890 - }, - { - "epoch": 0.6352567496029645, - "grad_norm": 7.170252323150635, - "learning_rate": 6.050350933910865e-06, - "loss": 1.0904, - "step": 3900 - }, - { - "epoch": 0.6368856130634849, - "grad_norm": 11.731266021728516, 
- "learning_rate": 6.002612421413341e-06, - "loss": 1.196, - "step": 3910 - }, - { - "epoch": 0.6385144765240054, - "grad_norm": 8.852503776550293, - "learning_rate": 5.954982088414701e-06, - "loss": 1.3241, - "step": 3920 - }, - { - "epoch": 0.6401433399845258, - "grad_norm": 8.035179138183594, - "learning_rate": 5.9074612239131915e-06, - "loss": 1.2826, - "step": 3930 - }, - { - "epoch": 0.6417722034450463, - "grad_norm": 8.706337928771973, - "learning_rate": 5.8600511139445536e-06, - "loss": 1.1557, - "step": 3940 - }, - { - "epoch": 0.6434010669055666, - "grad_norm": 14.4612398147583, - "learning_rate": 5.81275304154723e-06, - "loss": 1.1653, - "step": 3950 - }, - { - "epoch": 0.645029930366087, - "grad_norm": 7.372416973114014, - "learning_rate": 5.765568286727646e-06, - "loss": 1.1933, - "step": 3960 - }, - { - "epoch": 0.6466587938266075, - "grad_norm": 8.995341300964355, - "learning_rate": 5.718498126425556e-06, - "loss": 1.1356, - "step": 3970 - }, - { - "epoch": 0.6482876572871279, - "grad_norm": 12.190003395080566, - "learning_rate": 5.671543834479503e-06, - "loss": 1.3165, - "step": 3980 - }, - { - "epoch": 0.6499165207476483, - "grad_norm": 6.376560211181641, - "learning_rate": 5.624706681592329e-06, - "loss": 1.2403, - "step": 3990 - }, - { - "epoch": 0.6515453842081688, - "grad_norm": 8.215283393859863, - "learning_rate": 5.5779879352968e-06, - "loss": 1.2896, - "step": 4000 - }, - { - "epoch": 0.6515453842081688, - "eval_loss": 1.1294902563095093, - "eval_runtime": 102.4055, - "eval_samples_per_second": 25.243, - "eval_steps_per_second": 25.243, - "step": 4000 - }, - { - "epoch": 0.6531742476686891, - "grad_norm": 7.186392307281494, - "learning_rate": 5.531388859921303e-06, - "loss": 1.2025, - "step": 4010 - }, - { - "epoch": 0.6548031111292096, - "grad_norm": 11.444113731384277, - "learning_rate": 5.484910716555607e-06, - "loss": 1.3191, - "step": 4020 - }, - { - "epoch": 0.65643197458973, - "grad_norm": 4.889548301696777, - "learning_rate": 5.438554763016775e-06, - "loss": 1.1232, - "step": 4030 - }, - { - "epoch": 0.6580608380502504, - "grad_norm": 12.841841697692871, - "learning_rate": 5.392322253815079e-06, - "loss": 1.2834, - "step": 4040 - }, - { - "epoch": 0.6596897015107709, - "grad_norm": 7.367560386657715, - "learning_rate": 5.3462144401200945e-06, - "loss": 1.3042, - "step": 4050 - }, - { - "epoch": 0.6613185649712913, - "grad_norm": 7.350122451782227, - "learning_rate": 5.300232569726805e-06, - "loss": 1.2925, - "step": 4060 - }, - { - "epoch": 0.6629474284318118, - "grad_norm": 6.530559539794922, - "learning_rate": 5.254377887021842e-06, - "loss": 1.2089, - "step": 4070 - }, - { - "epoch": 0.6645762918923321, - "grad_norm": 8.514747619628906, - "learning_rate": 5.20865163294983e-06, - "loss": 1.1635, - "step": 4080 - }, - { - "epoch": 0.6662051553528525, - "grad_norm": 10.842570304870605, - "learning_rate": 5.163055044979783e-06, - "loss": 1.1896, - "step": 4090 - }, - { - "epoch": 0.667834018813373, - "grad_norm": 10.257299423217773, - "learning_rate": 5.1175893570716075e-06, - "loss": 1.2857, - "step": 4100 - }, - { - "epoch": 0.6694628822738934, - "grad_norm": 7.264848709106445, - "learning_rate": 5.072255799642737e-06, - "loss": 1.2011, - "step": 4110 - }, - { - "epoch": 0.6710917457344138, - "grad_norm": 7.874101161956787, - "learning_rate": 5.027055599534802e-06, - "loss": 1.2601, - "step": 4120 - }, - { - "epoch": 0.6727206091949343, - "grad_norm": 9.67297077178955, - "learning_rate": 4.981989979980457e-06, - "loss": 1.2038, - "step": 4130 - }, - { - 
"epoch": 0.6743494726554546, - "grad_norm": 8.36242389678955, - "learning_rate": 4.93706016057026e-06, - "loss": 1.2007, - "step": 4140 - }, - { - "epoch": 0.6759783361159751, - "grad_norm": 8.81531810760498, - "learning_rate": 4.8922673572196625e-06, - "loss": 1.2002, - "step": 4150 - }, - { - "epoch": 0.6776071995764955, - "grad_norm": 13.88326358795166, - "learning_rate": 4.847612782136127e-06, - "loss": 1.2879, - "step": 4160 - }, - { - "epoch": 0.6792360630370159, - "grad_norm": 8.99789810180664, - "learning_rate": 4.803097643786289e-06, - "loss": 1.0767, - "step": 4170 - }, - { - "epoch": 0.6808649264975364, - "grad_norm": 9.136383056640625, - "learning_rate": 4.758723146863285e-06, - "loss": 1.2943, - "step": 4180 - }, - { - "epoch": 0.6824937899580568, - "grad_norm": 11.534010887145996, - "learning_rate": 4.714490492254134e-06, - "loss": 1.2505, - "step": 4190 - }, - { - "epoch": 0.6841226534185771, - "grad_norm": 7.77902889251709, - "learning_rate": 4.670400877007229e-06, - "loss": 1.3522, - "step": 4200 - }, - { - "epoch": 0.6857515168790976, - "grad_norm": 9.129291534423828, - "learning_rate": 4.6264554942999685e-06, - "loss": 1.2585, - "step": 4210 - }, - { - "epoch": 0.687380380339618, - "grad_norm": 9.164624214172363, - "learning_rate": 4.582655533406445e-06, - "loss": 1.2429, - "step": 4220 - }, - { - "epoch": 0.6890092438001385, - "grad_norm": 10.26278018951416, - "learning_rate": 4.539002179665256e-06, - "loss": 1.3034, - "step": 4230 - }, - { - "epoch": 0.6906381072606589, - "grad_norm": 6.888890743255615, - "learning_rate": 4.495496614447455e-06, - "loss": 1.2, - "step": 4240 - }, - { - "epoch": 0.6922669707211793, - "grad_norm": 6.9961090087890625, - "learning_rate": 4.452140015124539e-06, - "loss": 1.2924, - "step": 4250 - }, - { - "epoch": 0.6938958341816998, - "grad_norm": 9.418280601501465, - "learning_rate": 4.4089335550366275e-06, - "loss": 1.2678, - "step": 4260 - }, - { - "epoch": 0.6955246976422201, - "grad_norm": 9.817927360534668, - "learning_rate": 4.365878403460687e-06, - "loss": 1.1866, - "step": 4270 - }, - { - "epoch": 0.6971535611027405, - "grad_norm": 7.495052337646484, - "learning_rate": 4.322975725578871e-06, - "loss": 1.2452, - "step": 4280 - }, - { - "epoch": 0.698782424563261, - "grad_norm": 7.096460342407227, - "learning_rate": 4.280226682447026e-06, - "loss": 1.2315, - "step": 4290 - }, - { - "epoch": 0.7004112880237814, - "grad_norm": 8.440934181213379, - "learning_rate": 4.23763243096325e-06, - "loss": 1.1404, - "step": 4300 - }, - { - "epoch": 0.7020401514843019, - "grad_norm": 10.936495780944824, - "learning_rate": 4.195194123836569e-06, - "loss": 1.303, - "step": 4310 - }, - { - "epoch": 0.7036690149448223, - "grad_norm": 10.077718734741211, - "learning_rate": 4.152912909555775e-06, - "loss": 1.2253, - "step": 4320 - }, - { - "epoch": 0.7052978784053426, - "grad_norm": 10.520235061645508, - "learning_rate": 4.110789932358312e-06, - "loss": 1.1638, - "step": 4330 - }, - { - "epoch": 0.7069267418658631, - "grad_norm": 8.664091110229492, - "learning_rate": 4.068826332199336e-06, - "loss": 1.2585, - "step": 4340 - }, - { - "epoch": 0.7085556053263835, - "grad_norm": 9.869935989379883, - "learning_rate": 4.027023244720853e-06, - "loss": 1.2998, - "step": 4350 - }, - { - "epoch": 0.7101844687869039, - "grad_norm": 7.815759658813477, - "learning_rate": 3.985381801220975e-06, - "loss": 1.1673, - "step": 4360 - }, - { - "epoch": 0.7118133322474244, - "grad_norm": 9.876256942749023, - "learning_rate": 3.943903128623336e-06, - "loss": 1.1876, - 
"step": 4370 - }, - { - "epoch": 0.7134421957079448, - "grad_norm": 10.636608123779297, - "learning_rate": 3.902588349446551e-06, - "loss": 1.2142, - "step": 4380 - }, - { - "epoch": 0.7150710591684653, - "grad_norm": 8.32884693145752, - "learning_rate": 3.86143858177388e-06, - "loss": 1.3103, - "step": 4390 - }, - { - "epoch": 0.7166999226289856, - "grad_norm": 13.757806777954102, - "learning_rate": 3.820454939222946e-06, - "loss": 1.1283, - "step": 4400 - }, - { - "epoch": 0.718328786089506, - "grad_norm": 19.816259384155273, - "learning_rate": 3.7796385309155948e-06, - "loss": 1.2626, - "step": 4410 - }, - { - "epoch": 0.7199576495500265, - "grad_norm": 8.810699462890625, - "learning_rate": 3.7389904614479e-06, - "loss": 1.3254, - "step": 4420 - }, - { - "epoch": 0.7215865130105469, - "grad_norm": 7.040134429931641, - "learning_rate": 3.698511830860243e-06, - "loss": 1.3012, - "step": 4430 - }, - { - "epoch": 0.7232153764710673, - "grad_norm": 8.462788581848145, - "learning_rate": 3.658203734607567e-06, - "loss": 1.1578, - "step": 4440 - }, - { - "epoch": 0.7248442399315878, - "grad_norm": 8.182144165039062, - "learning_rate": 3.6180672635297243e-06, - "loss": 1.3664, - "step": 4450 - }, - { - "epoch": 0.7264731033921081, - "grad_norm": 11.486858367919922, - "learning_rate": 3.578103503821939e-06, - "loss": 1.192, - "step": 4460 - }, - { - "epoch": 0.7281019668526286, - "grad_norm": 8.237690925598145, - "learning_rate": 3.53831353700544e-06, - "loss": 1.1809, - "step": 4470 - }, - { - "epoch": 0.729730830313149, - "grad_norm": 6.926926612854004, - "learning_rate": 3.4986984398981662e-06, - "loss": 1.1828, - "step": 4480 - }, - { - "epoch": 0.7313596937736694, - "grad_norm": 8.473318099975586, - "learning_rate": 3.4592592845856388e-06, - "loss": 1.3035, - "step": 4490 - }, - { - "epoch": 0.7329885572341899, - "grad_norm": 9.865799903869629, - "learning_rate": 3.4199971383919538e-06, - "loss": 1.2081, - "step": 4500 - }, - { - "epoch": 0.7329885572341899, - "eval_loss": 1.1236326694488525, - "eval_runtime": 102.8937, - "eval_samples_per_second": 25.123, - "eval_steps_per_second": 25.123, - "step": 4500 - }, - { - "epoch": 0.7346174206947103, - "grad_norm": 9.886106491088867, - "learning_rate": 3.380913063850877e-06, - "loss": 1.2866, - "step": 4510 - }, - { - "epoch": 0.7362462841552306, - "grad_norm": 7.137485504150391, - "learning_rate": 3.342008118677108e-06, - "loss": 1.0974, - "step": 4520 - }, - { - "epoch": 0.7378751476157511, - "grad_norm": 9.091876029968262, - "learning_rate": 3.303283355737653e-06, - "loss": 1.2417, - "step": 4530 - }, - { - "epoch": 0.7395040110762715, - "grad_norm": 7.459966659545898, - "learning_rate": 3.2647398230233175e-06, - "loss": 1.2105, - "step": 4540 - }, - { - "epoch": 0.741132874536792, - "grad_norm": 7.54026460647583, - "learning_rate": 3.2263785636203635e-06, - "loss": 1.1231, - "step": 4550 - }, - { - "epoch": 0.7427617379973124, - "grad_norm": 9.739063262939453, - "learning_rate": 3.188200615682265e-06, - "loss": 1.1882, - "step": 4560 - }, - { - "epoch": 0.7443906014578328, - "grad_norm": 11.615285873413086, - "learning_rate": 3.150207012401629e-06, - "loss": 1.1598, - "step": 4570 - }, - { - "epoch": 0.7460194649183532, - "grad_norm": 7.322878837585449, - "learning_rate": 3.1123987819822234e-06, - "loss": 1.198, - "step": 4580 - }, - { - "epoch": 0.7476483283788736, - "grad_norm": 7.064319133758545, - "learning_rate": 3.0747769476111454e-06, - "loss": 1.1921, - "step": 4590 - }, - { - "epoch": 0.749277191839394, - "grad_norm": 
9.456534385681152, - "learning_rate": 3.037342527431152e-06, - "loss": 1.263, - "step": 4600 - }, - { - "epoch": 0.7509060552999145, - "grad_norm": 7.1396098136901855, - "learning_rate": 3.0000965345130904e-06, - "loss": 1.2136, - "step": 4610 - }, - { - "epoch": 0.7525349187604349, - "grad_norm": 7.965095520019531, - "learning_rate": 2.96303997682848e-06, - "loss": 1.2013, - "step": 4620 - }, - { - "epoch": 0.7541637822209554, - "grad_norm": 6.984764575958252, - "learning_rate": 2.9261738572222487e-06, - "loss": 1.2054, - "step": 4630 - }, - { - "epoch": 0.7557926456814757, - "grad_norm": 9.820785522460938, - "learning_rate": 2.889499173385576e-06, - "loss": 1.2689, - "step": 4640 - }, - { - "epoch": 0.7574215091419961, - "grad_norm": 12.76016616821289, - "learning_rate": 2.8530169178289068e-06, - "loss": 1.2673, - "step": 4650 - }, - { - "epoch": 0.7590503726025166, - "grad_norm": 6.978170871734619, - "learning_rate": 2.8167280778550897e-06, - "loss": 1.3873, - "step": 4660 - }, - { - "epoch": 0.760679236063037, - "grad_norm": 5.620461463928223, - "learning_rate": 2.7806336355326434e-06, - "loss": 1.1573, - "step": 4670 - }, - { - "epoch": 0.7623080995235575, - "grad_norm": 6.476634979248047, - "learning_rate": 2.744734567669203e-06, - "loss": 1.1334, - "step": 4680 - }, - { - "epoch": 0.7639369629840779, - "grad_norm": 6.739288330078125, - "learning_rate": 2.709031845785062e-06, - "loss": 1.2286, - "step": 4690 - }, - { - "epoch": 0.7655658264445983, - "grad_norm": 8.93143367767334, - "learning_rate": 2.673526436086894e-06, - "loss": 1.2131, - "step": 4700 - }, - { - "epoch": 0.7671946899051187, - "grad_norm": 17.24730110168457, - "learning_rate": 2.63821929944161e-06, - "loss": 1.2658, - "step": 4710 - }, - { - "epoch": 0.7688235533656391, - "grad_norm": 11.340949058532715, - "learning_rate": 2.6031113913503337e-06, - "loss": 1.2794, - "step": 4720 - }, - { - "epoch": 0.7704524168261595, - "grad_norm": 10.817314147949219, - "learning_rate": 2.5682036619225657e-06, - "loss": 1.3443, - "step": 4730 - }, - { - "epoch": 0.77208128028668, - "grad_norm": 17.411333084106445, - "learning_rate": 2.5334970558504613e-06, - "loss": 1.1532, - "step": 4740 - }, - { - "epoch": 0.7737101437472004, - "grad_norm": 9.042006492614746, - "learning_rate": 2.4989925123832583e-06, - "loss": 1.2526, - "step": 4750 - }, - { - "epoch": 0.7753390072077209, - "grad_norm": 9.995912551879883, - "learning_rate": 2.4646909653018724e-06, - "loss": 1.0986, - "step": 4760 - }, - { - "epoch": 0.7769678706682412, - "grad_norm": 11.739777565002441, - "learning_rate": 2.4305933428936137e-06, - "loss": 1.2693, - "step": 4770 - }, - { - "epoch": 0.7785967341287616, - "grad_norm": 11.440888404846191, - "learning_rate": 2.3967005679270736e-06, - "loss": 1.2691, - "step": 4780 - }, - { - "epoch": 0.7802255975892821, - "grad_norm": 9.173127174377441, - "learning_rate": 2.3630135576271563e-06, - "loss": 1.137, - "step": 4790 - }, - { - "epoch": 0.7818544610498025, - "grad_norm": 11.49284839630127, - "learning_rate": 2.329533223650233e-06, - "loss": 1.2192, - "step": 4800 - }, - { - "epoch": 0.7834833245103229, - "grad_norm": 6.268970012664795, - "learning_rate": 2.296260472059505e-06, - "loss": 1.1762, - "step": 4810 - }, - { - "epoch": 0.7851121879708434, - "grad_norm": 7.918376445770264, - "learning_rate": 2.2631962033004486e-06, - "loss": 1.2459, - "step": 4820 - }, - { - "epoch": 0.7867410514313637, - "grad_norm": 9.789480209350586, - "learning_rate": 2.230341312176476e-06, - "loss": 1.2437, - "step": 4830 - }, - { - "epoch": 
0.7883699148918842, - "grad_norm": 6.269535064697266, - "learning_rate": 2.197696687824703e-06, - "loss": 1.2836, - "step": 4840 - }, - { - "epoch": 0.7899987783524046, - "grad_norm": 8.587770462036133, - "learning_rate": 2.165263213691885e-06, - "loss": 1.3312, - "step": 4850 - }, - { - "epoch": 0.791627641812925, - "grad_norm": 7.60506010055542, - "learning_rate": 2.133041767510523e-06, - "loss": 1.285, - "step": 4860 - }, - { - "epoch": 0.7932565052734455, - "grad_norm": 8.978716850280762, - "learning_rate": 2.1010332212750926e-06, - "loss": 1.2393, - "step": 4870 - }, - { - "epoch": 0.7948853687339659, - "grad_norm": 8.022439956665039, - "learning_rate": 2.0692384412184587e-06, - "loss": 1.1718, - "step": 4880 - }, - { - "epoch": 0.7965142321944862, - "grad_norm": 11.286605834960938, - "learning_rate": 2.0376582877884322e-06, - "loss": 1.1298, - "step": 4890 - }, - { - "epoch": 0.7981430956550067, - "grad_norm": 8.459285736083984, - "learning_rate": 2.0062936156244695e-06, - "loss": 1.3101, - "step": 4900 - }, - { - "epoch": 0.7997719591155271, - "grad_norm": 8.629647254943848, - "learning_rate": 1.9751452735345677e-06, - "loss": 1.3471, - "step": 4910 - }, - { - "epoch": 0.8014008225760476, - "grad_norm": 9.93689250946045, - "learning_rate": 1.9442141044722694e-06, - "loss": 1.2816, - "step": 4920 - }, - { - "epoch": 0.803029686036568, - "grad_norm": 13.70149040222168, - "learning_rate": 1.9135009455138643e-06, - "loss": 1.1864, - "step": 4930 - }, - { - "epoch": 0.8046585494970884, - "grad_norm": 11.720361709594727, - "learning_rate": 1.8830066278357395e-06, - "loss": 1.2885, - "step": 4940 - }, - { - "epoch": 0.8062874129576089, - "grad_norm": 9.665403366088867, - "learning_rate": 1.8527319766918694e-06, - "loss": 1.2245, - "step": 4950 - }, - { - "epoch": 0.8079162764181292, - "grad_norm": 8.559013366699219, - "learning_rate": 1.8226778113914989e-06, - "loss": 1.1385, - "step": 4960 - }, - { - "epoch": 0.8095451398786496, - "grad_norm": 9.008288383483887, - "learning_rate": 1.7928449452769636e-06, - "loss": 1.1462, - "step": 4970 - }, - { - "epoch": 0.8111740033391701, - "grad_norm": 7.702124118804932, - "learning_rate": 1.7632341857016733e-06, - "loss": 1.2371, - "step": 4980 - }, - { - "epoch": 0.8128028667996905, - "grad_norm": 7.910833835601807, - "learning_rate": 1.7338463340082734e-06, - "loss": 1.2431, - "step": 4990 - }, - { - "epoch": 0.814431730260211, - "grad_norm": 10.374570846557617, - "learning_rate": 1.7046821855069562e-06, - "loss": 1.2451, - "step": 5000 - }, - { - "epoch": 0.814431730260211, - "eval_loss": 1.1212321519851685, - "eval_runtime": 102.729, - "eval_samples_per_second": 25.163, - "eval_steps_per_second": 25.163, - "step": 5000 - }, - { - "epoch": 0.8160605937207314, - "grad_norm": 12.469367027282715, - "learning_rate": 1.6757425294539266e-06, - "loss": 1.3257, - "step": 5010 - }, - { - "epoch": 0.8176894571812517, - "grad_norm": 8.322937965393066, - "learning_rate": 1.647028149030061e-06, - "loss": 1.2494, - "step": 5020 - }, - { - "epoch": 0.8193183206417722, - "grad_norm": 10.362074851989746, - "learning_rate": 1.6185398213196935e-06, - "loss": 1.1441, - "step": 5030 - }, - { - "epoch": 0.8209471841022926, - "grad_norm": 10.657358169555664, - "learning_rate": 1.5902783172896042e-06, - "loss": 1.2038, - "step": 5040 - }, - { - "epoch": 0.822576047562813, - "grad_norm": 6.7474517822265625, - "learning_rate": 1.5622444017681438e-06, - "loss": 1.1308, - "step": 5050 - }, - { - "epoch": 0.8242049110233335, - "grad_norm": 10.932724952697754, - 
"learning_rate": 1.534438833424533e-06, - "loss": 1.1732, - "step": 5060 - }, - { - "epoch": 0.8258337744838539, - "grad_norm": 7.627405643463135, - "learning_rate": 1.5068623647483428e-06, - "loss": 1.1552, - "step": 5070 - }, - { - "epoch": 0.8274626379443744, - "grad_norm": 13.693827629089355, - "learning_rate": 1.479515742029115e-06, - "loss": 1.1808, - "step": 5080 - }, - { - "epoch": 0.8290915014048947, - "grad_norm": 7.652097702026367, - "learning_rate": 1.4523997053361805e-06, - "loss": 1.2478, - "step": 5090 - }, - { - "epoch": 0.8307203648654151, - "grad_norm": 6.031581401824951, - "learning_rate": 1.4255149884986253e-06, - "loss": 1.1577, - "step": 5100 - }, - { - "epoch": 0.8323492283259356, - "grad_norm": 7.7093729972839355, - "learning_rate": 1.3988623190854233e-06, - "loss": 1.1844, - "step": 5110 - }, - { - "epoch": 0.833978091786456, - "grad_norm": 9.999074935913086, - "learning_rate": 1.3724424183857599e-06, - "loss": 1.1921, - "step": 5120 - }, - { - "epoch": 0.8356069552469764, - "grad_norm": 6.947312355041504, - "learning_rate": 1.3462560013895031e-06, - "loss": 1.273, - "step": 5130 - }, - { - "epoch": 0.8372358187074969, - "grad_norm": 9.171476364135742, - "learning_rate": 1.320303776767855e-06, - "loss": 1.1454, - "step": 5140 - }, - { - "epoch": 0.8388646821680172, - "grad_norm": 7.364622592926025, - "learning_rate": 1.2945864468541792e-06, - "loss": 1.2112, - "step": 5150 - }, - { - "epoch": 0.8404935456285377, - "grad_norm": 6.328592777252197, - "learning_rate": 1.2691047076249852e-06, - "loss": 1.2982, - "step": 5160 - }, - { - "epoch": 0.8421224090890581, - "grad_norm": 11.03521728515625, - "learning_rate": 1.2438592486811007e-06, - "loss": 1.2062, - "step": 5170 - }, - { - "epoch": 0.8437512725495785, - "grad_norm": 7.168650150299072, - "learning_rate": 1.2188507532290094e-06, - "loss": 1.2764, - "step": 5180 - }, - { - "epoch": 0.845380136010099, - "grad_norm": 7.8710784912109375, - "learning_rate": 1.194079898062349e-06, - "loss": 1.1388, - "step": 5190 - }, - { - "epoch": 0.8470089994706194, - "grad_norm": 9.251112937927246, - "learning_rate": 1.1695473535436187e-06, - "loss": 1.1258, - "step": 5200 - }, - { - "epoch": 0.8486378629311399, - "grad_norm": 10.411331176757812, - "learning_rate": 1.145253783586011e-06, - "loss": 1.2281, - "step": 5210 - }, - { - "epoch": 0.8502667263916602, - "grad_norm": 13.52040958404541, - "learning_rate": 1.1211998456354656e-06, - "loss": 1.2075, - "step": 5220 - }, - { - "epoch": 0.8518955898521806, - "grad_norm": 8.210290908813477, - "learning_rate": 1.0973861906528692e-06, - "loss": 1.3148, - "step": 5230 - }, - { - "epoch": 0.8535244533127011, - "grad_norm": 7.864938259124756, - "learning_rate": 1.0738134630964326e-06, - "loss": 1.3147, - "step": 5240 - }, - { - "epoch": 0.8551533167732215, - "grad_norm": 8.113484382629395, - "learning_rate": 1.050482300904264e-06, - "loss": 1.2167, - "step": 5250 - }, - { - "epoch": 0.8567821802337419, - "grad_norm": 10.562095642089844, - "learning_rate": 1.0273933354770894e-06, - "loss": 1.3086, - "step": 5260 - }, - { - "epoch": 0.8584110436942624, - "grad_norm": 10.281449317932129, - "learning_rate": 1.004547191661178e-06, - "loss": 1.21, - "step": 5270 - }, - { - "epoch": 0.8600399071547827, - "grad_norm": 9.31655216217041, - "learning_rate": 9.819444877314299e-07, - "loss": 1.3514, - "step": 5280 - }, - { - "epoch": 0.8616687706153032, - "grad_norm": 8.388626098632812, - "learning_rate": 9.5958583537463e-07, - "loss": 1.2307, - "step": 5290 - }, - { - "epoch": 0.8632976340758236, - 
"grad_norm": 8.135517120361328, - "learning_rate": 9.374718396729188e-07, - "loss": 1.2003, - "step": 5300 - }, - { - "epoch": 0.864926497536344, - "grad_norm": 7.857133388519287, - "learning_rate": 9.156030990873932e-07, - "loss": 1.1086, - "step": 5310 - }, - { - "epoch": 0.8665553609968645, - "grad_norm": 10.863335609436035, - "learning_rate": 8.939802054419289e-07, - "loss": 1.2312, - "step": 5320 - }, - { - "epoch": 0.8681842244573849, - "grad_norm": 8.662337303161621, - "learning_rate": 8.726037439071555e-07, - "loss": 1.2288, - "step": 5330 - }, - { - "epoch": 0.8698130879179052, - "grad_norm": 8.542695045471191, - "learning_rate": 8.514742929846142e-07, - "loss": 1.1772, - "step": 5340 - }, - { - "epoch": 0.8714419513784257, - "grad_norm": 8.269294738769531, - "learning_rate": 8.305924244911178e-07, - "loss": 1.1143, - "step": 5350 - }, - { - "epoch": 0.8730708148389461, - "grad_norm": 7.3109354972839355, - "learning_rate": 8.099587035432654e-07, - "loss": 1.128, - "step": 5360 - }, - { - "epoch": 0.8746996782994666, - "grad_norm": 7.124019622802734, - "learning_rate": 7.895736885421468e-07, - "loss": 1.3185, - "step": 5370 - }, - { - "epoch": 0.876328541759987, - "grad_norm": 10.167950630187988, - "learning_rate": 7.694379311582401e-07, - "loss": 1.2222, - "step": 5380 - }, - { - "epoch": 0.8779574052205074, - "grad_norm": 9.3758544921875, - "learning_rate": 7.49551976316475e-07, - "loss": 1.23, - "step": 5390 - }, - { - "epoch": 0.8795862686810278, - "grad_norm": 10.248433113098145, - "learning_rate": 7.299163621814853e-07, - "loss": 1.2133, - "step": 5400 - }, - { - "epoch": 0.8812151321415482, - "grad_norm": 13.722402572631836, - "learning_rate": 7.105316201430512e-07, - "loss": 1.2697, - "step": 5410 - }, - { - "epoch": 0.8828439956020686, - "grad_norm": 8.881364822387695, - "learning_rate": 6.91398274801709e-07, - "loss": 1.18, - "step": 5420 - }, - { - "epoch": 0.8844728590625891, - "grad_norm": 6.859842300415039, - "learning_rate": 6.725168439545637e-07, - "loss": 1.1595, - "step": 5430 - }, - { - "epoch": 0.8861017225231095, - "grad_norm": 8.630757331848145, - "learning_rate": 6.53887838581273e-07, - "loss": 1.2345, - "step": 5440 - }, - { - "epoch": 0.88773058598363, - "grad_norm": 11.166703224182129, - "learning_rate": 6.355117628302121e-07, - "loss": 1.2953, - "step": 5450 - }, - { - "epoch": 0.8893594494441504, - "grad_norm": 7.578332901000977, - "learning_rate": 6.173891140048427e-07, - "loss": 1.1959, - "step": 5460 - }, - { - "epoch": 0.8909883129046707, - "grad_norm": 18.070478439331055, - "learning_rate": 5.995203825502393e-07, - "loss": 1.2072, - "step": 5470 - }, - { - "epoch": 0.8926171763651912, - "grad_norm": 9.904817581176758, - "learning_rate": 5.819060520398345e-07, - "loss": 1.2266, - "step": 5480 - }, - { - "epoch": 0.8942460398257116, - "grad_norm": 8.371089935302734, - "learning_rate": 5.645465991623167e-07, - "loss": 1.2003, - "step": 5490 - }, - { - "epoch": 0.895874903286232, - "grad_norm": 9.28614616394043, - "learning_rate": 5.474424937087353e-07, - "loss": 1.2134, - "step": 5500 - }, - { - "epoch": 0.895874903286232, - "eval_loss": 1.1204986572265625, - "eval_runtime": 102.9981, - "eval_samples_per_second": 25.098, - "eval_steps_per_second": 25.098, - "step": 5500 - }, - { - "epoch": 0.8975037667467525, - "grad_norm": 8.397842407226562, - "learning_rate": 5.305941985597929e-07, - "loss": 1.1799, - "step": 5510 - }, - { - "epoch": 0.8991326302072729, - "grad_norm": 5.709924697875977, - "learning_rate": 5.140021696733066e-07, - "loss": 1.233, - 
"step": 5520 - }, - { - "epoch": 0.9007614936677933, - "grad_norm": 8.849600791931152, - "learning_rate": 4.97666856071879e-07, - "loss": 1.3359, - "step": 5530 - }, - { - "epoch": 0.9023903571283137, - "grad_norm": 8.294559478759766, - "learning_rate": 4.815886998307439e-07, - "loss": 1.2746, - "step": 5540 - }, - { - "epoch": 0.9040192205888341, - "grad_norm": 8.917488098144531, - "learning_rate": 4.657681360657962e-07, - "loss": 1.1755, - "step": 5550 - }, - { - "epoch": 0.9056480840493546, - "grad_norm": 6.684938907623291, - "learning_rate": 4.502055929218241e-07, - "loss": 1.2818, - "step": 5560 - }, - { - "epoch": 0.907276947509875, - "grad_norm": 6.304566383361816, - "learning_rate": 4.34901491560924e-07, - "loss": 1.3162, - "step": 5570 - }, - { - "epoch": 0.9089058109703954, - "grad_norm": 16.449485778808594, - "learning_rate": 4.1985624615109134e-07, - "loss": 1.0536, - "step": 5580 - }, - { - "epoch": 0.9105346744309158, - "grad_norm": 9.711615562438965, - "learning_rate": 4.0507026385502747e-07, - "loss": 1.1989, - "step": 5590 - }, - { - "epoch": 0.9121635378914362, - "grad_norm": 9.28388786315918, - "learning_rate": 3.9054394481910507e-07, - "loss": 1.3322, - "step": 5600 - }, - { - "epoch": 0.9137924013519567, - "grad_norm": 7.323754787445068, - "learning_rate": 3.7627768216255244e-07, - "loss": 1.1938, - "step": 5610 - }, - { - "epoch": 0.9154212648124771, - "grad_norm": 7.1576385498046875, - "learning_rate": 3.6227186196680976e-07, - "loss": 1.2738, - "step": 5620 - }, - { - "epoch": 0.9170501282729975, - "grad_norm": 8.460196495056152, - "learning_rate": 3.485268632650751e-07, - "loss": 1.2285, - "step": 5630 - }, - { - "epoch": 0.918678991733518, - "grad_norm": 8.773188591003418, - "learning_rate": 3.350430580320574e-07, - "loss": 1.2892, - "step": 5640 - }, - { - "epoch": 0.9203078551940383, - "grad_norm": 7.650882720947266, - "learning_rate": 3.218208111738996e-07, - "loss": 1.2367, - "step": 5650 - }, - { - "epoch": 0.9219367186545587, - "grad_norm": 12.023070335388184, - "learning_rate": 3.088604805183126e-07, - "loss": 1.2383, - "step": 5660 - }, - { - "epoch": 0.9235655821150792, - "grad_norm": 10.934995651245117, - "learning_rate": 2.9616241680488713e-07, - "loss": 1.2227, - "step": 5670 - }, - { - "epoch": 0.9251944455755996, - "grad_norm": 8.503402709960938, - "learning_rate": 2.837269636755946e-07, - "loss": 1.3065, - "step": 5680 - }, - { - "epoch": 0.9268233090361201, - "grad_norm": 8.271093368530273, - "learning_rate": 2.7155445766550605e-07, - "loss": 1.0851, - "step": 5690 - }, - { - "epoch": 0.9284521724966405, - "grad_norm": 8.876206398010254, - "learning_rate": 2.5964522819366125e-07, - "loss": 1.2316, - "step": 5700 - }, - { - "epoch": 0.9300810359571609, - "grad_norm": 11.97021484375, - "learning_rate": 2.479995975541749e-07, - "loss": 1.2201, - "step": 5710 - }, - { - "epoch": 0.9317098994176813, - "grad_norm": 9.127492904663086, - "learning_rate": 2.3661788090750038e-07, - "loss": 1.2073, - "step": 5720 - }, - { - "epoch": 0.9333387628782017, - "grad_norm": 7.851465702056885, - "learning_rate": 2.255003862719074e-07, - "loss": 1.2395, - "step": 5730 - }, - { - "epoch": 0.9349676263387221, - "grad_norm": 11.95673942565918, - "learning_rate": 2.1464741451514447e-07, - "loss": 1.2248, - "step": 5740 - }, - { - "epoch": 0.9365964897992426, - "grad_norm": 7.73248291015625, - "learning_rate": 2.0405925934629423e-07, - "loss": 1.3249, - "step": 5750 - }, - { - "epoch": 0.938225353259763, - "grad_norm": 10.03739070892334, - "learning_rate": 
1.9373620730783082e-07, - "loss": 1.1815, - "step": 5760 - }, - { - "epoch": 0.9398542167202835, - "grad_norm": 8.619077682495117, - "learning_rate": 1.836785377678596e-07, - "loss": 1.244, - "step": 5770 - }, - { - "epoch": 0.9414830801808038, - "grad_norm": 8.094244956970215, - "learning_rate": 1.738865229125597e-07, - "loss": 1.2881, - "step": 5780 - }, - { - "epoch": 0.9431119436413242, - "grad_norm": 9.411540031433105, - "learning_rate": 1.6436042773881666e-07, - "loss": 1.2496, - "step": 5790 - }, - { - "epoch": 0.9447408071018447, - "grad_norm": 9.164313316345215, - "learning_rate": 1.5510051004705263e-07, - "loss": 1.2901, - "step": 5800 - }, - { - "epoch": 0.9463696705623651, - "grad_norm": 10.367000579833984, - "learning_rate": 1.4610702043424628e-07, - "loss": 1.1452, - "step": 5810 - }, - { - "epoch": 0.9479985340228856, - "grad_norm": 8.201516151428223, - "learning_rate": 1.373802022871551e-07, - "loss": 1.2266, - "step": 5820 - }, - { - "epoch": 0.949627397483406, - "grad_norm": 8.908825874328613, - "learning_rate": 1.2892029177572817e-07, - "loss": 1.2799, - "step": 5830 - }, - { - "epoch": 0.9512562609439263, - "grad_norm": 7.841489315032959, - "learning_rate": 1.2072751784671043e-07, - "loss": 1.2983, - "step": 5840 - }, - { - "epoch": 0.9528851244044468, - "grad_norm": 9.294130325317383, - "learning_rate": 1.1280210221745192e-07, - "loss": 1.2792, - "step": 5850 - }, - { - "epoch": 0.9545139878649672, - "grad_norm": 7.930861473083496, - "learning_rate": 1.0514425936990369e-07, - "loss": 1.1413, - "step": 5860 - }, - { - "epoch": 0.9561428513254876, - "grad_norm": 9.744686126708984, - "learning_rate": 9.775419654481588e-08, - "loss": 1.1266, - "step": 5870 - }, - { - "epoch": 0.9577717147860081, - "grad_norm": 7.7475905418396, - "learning_rate": 9.063211373613102e-08, - "loss": 1.4206, - "step": 5880 - }, - { - "epoch": 0.9594005782465285, - "grad_norm": 5.828322887420654, - "learning_rate": 8.3778203685565e-08, - "loss": 1.2971, - "step": 5890 - }, - { - "epoch": 0.961029441707049, - "grad_norm": 10.001614570617676, - "learning_rate": 7.71926518773991e-08, - "loss": 1.2998, - "step": 5900 - }, - { - "epoch": 0.9626583051675693, - "grad_norm": 9.513165473937988, - "learning_rate": 7.087563653345286e-08, - "loss": 1.1007, - "step": 5910 - }, - { - "epoch": 0.9642871686280897, - "grad_norm": 9.425087928771973, - "learning_rate": 6.482732860826679e-08, - "loss": 1.245, - "step": 5920 - }, - { - "epoch": 0.9659160320886102, - "grad_norm": 8.666535377502441, - "learning_rate": 5.90478917844739e-08, - "loss": 1.2323, - "step": 5930 - }, - { - "epoch": 0.9675448955491306, - "grad_norm": 8.544173240661621, - "learning_rate": 5.3537482468366544e-08, - "loss": 1.2152, - "step": 5940 - }, - { - "epoch": 0.969173759009651, - "grad_norm": 9.873296737670898, - "learning_rate": 4.829624978567204e-08, - "loss": 1.216, - "step": 5950 - }, - { - "epoch": 0.9708026224701715, - "grad_norm": 6.528816223144531, - "learning_rate": 4.332433557750704e-08, - "loss": 1.36, - "step": 5960 - }, - { - "epoch": 0.9724314859306918, - "grad_norm": 13.494250297546387, - "learning_rate": 3.862187439654608e-08, - "loss": 1.1844, - "step": 5970 - }, - { - "epoch": 0.9740603493912123, - "grad_norm": 10.320327758789062, - "learning_rate": 3.41889935033779e-08, - "loss": 1.3135, - "step": 5980 - }, - { - "epoch": 0.9756892128517327, - "grad_norm": 8.843477249145508, - "learning_rate": 3.002581286305817e-08, - "loss": 1.2222, - "step": 5990 - }, - { - "epoch": 0.9773180763122531, - "grad_norm": 7.343149662017822, 
- "learning_rate": 2.6132445141872074e-08, - "loss": 1.2437, - "step": 6000 - }, - { - "epoch": 0.9773180763122531, - "eval_loss": 1.1204884052276611, - "eval_runtime": 102.4877, - "eval_samples_per_second": 25.223, - "eval_steps_per_second": 25.223, - "step": 6000 - }, - { - "epoch": 0.9789469397727736, - "grad_norm": 10.39957332611084, - "learning_rate": 2.250899570427345e-08, - "loss": 1.174, - "step": 6010 - }, - { - "epoch": 0.980575803233294, - "grad_norm": 7.477624416351318, - "learning_rate": 1.91555626100437e-08, - "loss": 1.3035, - "step": 6020 - }, - { - "epoch": 0.9822046666938143, - "grad_norm": 7.673905849456787, - "learning_rate": 1.6072236611629487e-08, - "loss": 1.2888, - "step": 6030 - }, - { - "epoch": 0.9838335301543348, - "grad_norm": 8.837902069091797, - "learning_rate": 1.325910115169471e-08, - "loss": 1.3579, - "step": 6040 - }, - { - "epoch": 0.9854623936148552, - "grad_norm": 9.265447616577148, - "learning_rate": 1.0716232360856726e-08, - "loss": 1.2222, - "step": 6050 - }, - { - "epoch": 0.9870912570753757, - "grad_norm": 11.45450210571289, - "learning_rate": 8.44369905562692e-09, - "loss": 1.2786, - "step": 6060 - }, - { - "epoch": 0.9887201205358961, - "grad_norm": 9.006537437438965, - "learning_rate": 6.441562736551054e-09, - "loss": 1.2923, - "step": 6070 - }, - { - "epoch": 0.9903489839964165, - "grad_norm": 7.819212913513184, - "learning_rate": 4.709877586540623e-09, - "loss": 1.1388, - "step": 6080 - }, - { - "epoch": 0.991977847456937, - "grad_norm": 10.596216201782227, - "learning_rate": 3.2486904694128963e-09, - "loss": 1.1399, - "step": 6090 - }, - { - "epoch": 0.9936067109174573, - "grad_norm": 9.980060577392578, - "learning_rate": 2.0580409286152792e-09, - "loss": 1.1898, - "step": 6100 - }, - { - "epoch": 0.9952355743779777, - "grad_norm": 7.423088073730469, - "learning_rate": 1.1379611861594974e-09, - "loss": 1.234, - "step": 6110 - }, - { - "epoch": 0.9968644378384982, - "grad_norm": 8.370770454406738, - "learning_rate": 4.884761417489614e-10, - "loss": 1.2935, - "step": 6120 - }, - { - "epoch": 0.9984933012990186, - "grad_norm": 8.747970581054688, - "learning_rate": 1.0960337210597083e-10, - "loss": 1.2299, - "step": 6130 - } - ], - "logging_steps": 10, - "max_steps": 6139, - "num_input_tokens_seen": 0, - "num_train_epochs": 1, - "save_steps": 500, - "stateful_callbacks": { - "TrainerControl": { - "args": { - "should_epoch_stop": false, - "should_evaluate": false, - "should_log": false, - "should_save": true, - "should_training_stop": true - }, - "attributes": {} - } - }, - "total_flos": 1.1514993505940275e+17, - "train_batch_size": 1, - "trial_name": null, - "trial_params": null -}