{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 169, "global_step": 511, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0019569471624266144, "grad_norm": 4.606130123138428, "learning_rate": 3.125e-07, "loss": 0.6812, "step": 1 }, { "epoch": 0.003913894324853229, "grad_norm": 5.627719402313232, "learning_rate": 6.25e-07, "loss": 0.7188, "step": 2 }, { "epoch": 0.005870841487279843, "grad_norm": 5.225893974304199, "learning_rate": 9.375000000000001e-07, "loss": 0.6832, "step": 3 }, { "epoch": 0.007827788649706457, "grad_norm": 4.055615425109863, "learning_rate": 1.25e-06, "loss": 0.7478, "step": 4 }, { "epoch": 0.009784735812133072, "grad_norm": 3.32236385345459, "learning_rate": 1.5625e-06, "loss": 0.6512, "step": 5 }, { "epoch": 0.011741682974559686, "grad_norm": 2.5439915657043457, "learning_rate": 1.8750000000000003e-06, "loss": 0.72, "step": 6 }, { "epoch": 0.0136986301369863, "grad_norm": 1.9466145038604736, "learning_rate": 2.1875000000000002e-06, "loss": 0.6302, "step": 7 }, { "epoch": 0.015655577299412915, "grad_norm": 1.833212971687317, "learning_rate": 2.5e-06, "loss": 0.6859, "step": 8 }, { "epoch": 0.01761252446183953, "grad_norm": 1.4203251600265503, "learning_rate": 2.8125e-06, "loss": 0.5943, "step": 9 }, { "epoch": 0.019569471624266144, "grad_norm": 1.7164653539657593, "learning_rate": 3.125e-06, "loss": 0.5744, "step": 10 }, { "epoch": 0.021526418786692758, "grad_norm": 1.4249149560928345, "learning_rate": 3.4375e-06, "loss": 0.5896, "step": 11 }, { "epoch": 0.023483365949119372, "grad_norm": 1.2433736324310303, "learning_rate": 3.7500000000000005e-06, "loss": 0.5873, "step": 12 }, { "epoch": 0.025440313111545987, "grad_norm": 1.0826597213745117, "learning_rate": 4.0625000000000005e-06, "loss": 0.5792, "step": 13 }, { "epoch": 0.0273972602739726, "grad_norm": 1.0738195180892944, "learning_rate": 4.3750000000000005e-06, "loss": 0.6032, "step": 14 }, { "epoch": 0.029354207436399216, "grad_norm": 1.1434872150421143, "learning_rate": 4.6875000000000004e-06, "loss": 0.5698, "step": 15 }, { "epoch": 0.03131115459882583, "grad_norm": 1.4672112464904785, "learning_rate": 5e-06, "loss": 0.5471, "step": 16 }, { "epoch": 0.033268101761252444, "grad_norm": 0.9875673651695251, "learning_rate": 4.999949650182267e-06, "loss": 0.5393, "step": 17 }, { "epoch": 0.03522504892367906, "grad_norm": 1.0974621772766113, "learning_rate": 4.999798602757149e-06, "loss": 0.5349, "step": 18 }, { "epoch": 0.03718199608610567, "grad_norm": 1.2209999561309814, "learning_rate": 4.999546863808815e-06, "loss": 0.6743, "step": 19 }, { "epoch": 0.03913894324853229, "grad_norm": 0.8842924237251282, "learning_rate": 4.999194443477273e-06, "loss": 0.5919, "step": 20 }, { "epoch": 0.0410958904109589, "grad_norm": 1.0825450420379639, "learning_rate": 4.998741355957963e-06, "loss": 0.6438, "step": 21 }, { "epoch": 0.043052837573385516, "grad_norm": 1.0688315629959106, "learning_rate": 4.998187619501185e-06, "loss": 0.5637, "step": 22 }, { "epoch": 0.04500978473581213, "grad_norm": 0.8487011790275574, "learning_rate": 4.99753325641136e-06, "loss": 0.5082, "step": 23 }, { "epoch": 0.046966731898238745, "grad_norm": 0.9255719780921936, "learning_rate": 4.9967782930461405e-06, "loss": 0.5081, "step": 24 }, { "epoch": 0.04892367906066536, "grad_norm": 0.9492978453636169, "learning_rate": 4.9959227598153395e-06, "loss": 0.6473, "step": 25 }, { "epoch": 0.050880626223091974, "grad_norm": 1.5718590021133423, "learning_rate": 4.994966691179712e-06, "loss": 0.5219, "step": 26 }, { "epoch": 0.05283757338551859, "grad_norm": 0.9533342123031616, "learning_rate": 4.993910125649561e-06, "loss": 0.5279, "step": 27 }, { "epoch": 0.0547945205479452, "grad_norm": 1.8043086528778076, "learning_rate": 4.992753105783194e-06, "loss": 0.5277, "step": 28 }, { "epoch": 0.05675146771037182, "grad_norm": 1.317238450050354, "learning_rate": 4.991495678185202e-06, "loss": 0.4567, "step": 29 }, { "epoch": 0.05870841487279843, "grad_norm": 1.1877973079681396, "learning_rate": 4.990137893504585e-06, "loss": 0.5536, "step": 30 }, { "epoch": 0.060665362035225046, "grad_norm": 1.052051305770874, "learning_rate": 4.988679806432712e-06, "loss": 0.4946, "step": 31 }, { "epoch": 0.06262230919765166, "grad_norm": 7.080264091491699, "learning_rate": 4.987121475701118e-06, "loss": 0.5056, "step": 32 }, { "epoch": 0.06457925636007827, "grad_norm": 3.640033483505249, "learning_rate": 4.985462964079137e-06, "loss": 0.5162, "step": 33 }, { "epoch": 0.06653620352250489, "grad_norm": 2.17399263381958, "learning_rate": 4.983704338371375e-06, "loss": 0.5314, "step": 34 }, { "epoch": 0.0684931506849315, "grad_norm": 0.9113507270812988, "learning_rate": 4.981845669415022e-06, "loss": 0.5416, "step": 35 }, { "epoch": 0.07045009784735812, "grad_norm": 0.865261971950531, "learning_rate": 4.9798870320769884e-06, "loss": 0.5266, "step": 36 }, { "epoch": 0.07240704500978473, "grad_norm": 1.3988151550292969, "learning_rate": 4.977828505250903e-06, "loss": 0.4983, "step": 37 }, { "epoch": 0.07436399217221135, "grad_norm": 1.0698161125183105, "learning_rate": 4.975670171853926e-06, "loss": 0.4723, "step": 38 }, { "epoch": 0.07632093933463796, "grad_norm": 1.2741320133209229, "learning_rate": 4.9734121188234115e-06, "loss": 0.4996, "step": 39 }, { "epoch": 0.07827788649706457, "grad_norm": 2.0048317909240723, "learning_rate": 4.971054437113406e-06, "loss": 0.6535, "step": 40 }, { "epoch": 0.08023483365949119, "grad_norm": 1.2805678844451904, "learning_rate": 4.968597221690986e-06, "loss": 0.5198, "step": 41 }, { "epoch": 0.0821917808219178, "grad_norm": 0.9233219027519226, "learning_rate": 4.96604057153243e-06, "loss": 0.5724, "step": 42 }, { "epoch": 0.08414872798434442, "grad_norm": 0.9261006712913513, "learning_rate": 4.963384589619233e-06, "loss": 0.4601, "step": 43 }, { "epoch": 0.08610567514677103, "grad_norm": 1.3594372272491455, "learning_rate": 4.960629382933959e-06, "loss": 0.5616, "step": 44 }, { "epoch": 0.08806262230919765, "grad_norm": 2.4310686588287354, "learning_rate": 4.957775062455933e-06, "loss": 0.5442, "step": 45 }, { "epoch": 0.09001956947162426, "grad_norm": 1.030832290649414, "learning_rate": 4.9548217431567665e-06, "loss": 0.5964, "step": 46 }, { "epoch": 0.09197651663405088, "grad_norm": 0.831721305847168, "learning_rate": 4.951769543995731e-06, "loss": 0.44, "step": 47 }, { "epoch": 0.09393346379647749, "grad_norm": 0.9876791834831238, "learning_rate": 4.948618587914963e-06, "loss": 0.5404, "step": 48 }, { "epoch": 0.0958904109589041, "grad_norm": 0.9953415393829346, "learning_rate": 4.9453690018345144e-06, "loss": 0.5668, "step": 49 }, { "epoch": 0.09784735812133072, "grad_norm": 0.8553183078765869, "learning_rate": 4.9420209166472386e-06, "loss": 0.5414, "step": 50 }, { "epoch": 0.09980430528375733, "grad_norm": 0.7962396144866943, "learning_rate": 4.938574467213519e-06, "loss": 0.495, "step": 51 }, { "epoch": 0.10176125244618395, "grad_norm": 0.7835857272148132, "learning_rate": 4.935029792355834e-06, "loss": 0.5037, "step": 52 }, { "epoch": 0.10371819960861056, "grad_norm": 0.8453947901725769, "learning_rate": 4.931387034853173e-06, "loss": 0.5011, "step": 53 }, { "epoch": 0.10567514677103718, "grad_norm": 1.8459208011627197, "learning_rate": 4.927646341435276e-06, "loss": 0.5554, "step": 54 }, { "epoch": 0.10763209393346379, "grad_norm": 0.9212117195129395, "learning_rate": 4.9238078627767285e-06, "loss": 0.5886, "step": 55 }, { "epoch": 0.1095890410958904, "grad_norm": 0.7834203243255615, "learning_rate": 4.919871753490892e-06, "loss": 0.4602, "step": 56 }, { "epoch": 0.11154598825831702, "grad_norm": 0.9025184512138367, "learning_rate": 4.9158381721236715e-06, "loss": 0.4544, "step": 57 }, { "epoch": 0.11350293542074363, "grad_norm": 1.1300384998321533, "learning_rate": 4.91170728114714e-06, "loss": 0.5704, "step": 58 }, { "epoch": 0.11545988258317025, "grad_norm": 0.7926605343818665, "learning_rate": 4.907479246952981e-06, "loss": 0.5112, "step": 59 }, { "epoch": 0.11741682974559686, "grad_norm": 0.7744232416152954, "learning_rate": 4.903154239845798e-06, "loss": 0.4894, "step": 60 }, { "epoch": 0.11937377690802348, "grad_norm": 1.6636885404586792, "learning_rate": 4.8987324340362445e-06, "loss": 0.5311, "step": 61 }, { "epoch": 0.12133072407045009, "grad_norm": 1.0098280906677246, "learning_rate": 4.894214007634014e-06, "loss": 0.4907, "step": 62 }, { "epoch": 0.1232876712328767, "grad_norm": 1.0168606042861938, "learning_rate": 4.889599142640663e-06, "loss": 0.5128, "step": 63 }, { "epoch": 0.12524461839530332, "grad_norm": 0.8393405079841614, "learning_rate": 4.884888024942282e-06, "loss": 0.4989, "step": 64 }, { "epoch": 0.12720156555772993, "grad_norm": 1.2758891582489014, "learning_rate": 4.880080844302004e-06, "loss": 0.5329, "step": 65 }, { "epoch": 0.12915851272015655, "grad_norm": 0.8657482862472534, "learning_rate": 4.875177794352364e-06, "loss": 0.5058, "step": 66 }, { "epoch": 0.13111545988258316, "grad_norm": 0.9110330939292908, "learning_rate": 4.870179072587499e-06, "loss": 0.5137, "step": 67 }, { "epoch": 0.13307240704500978, "grad_norm": 0.8738705515861511, "learning_rate": 4.865084880355193e-06, "loss": 0.5423, "step": 68 }, { "epoch": 0.1350293542074364, "grad_norm": 0.8127829432487488, "learning_rate": 4.859895422848767e-06, "loss": 0.5402, "step": 69 }, { "epoch": 0.136986301369863, "grad_norm": 0.768864631652832, "learning_rate": 4.854610909098813e-06, "loss": 0.5301, "step": 70 }, { "epoch": 0.13894324853228962, "grad_norm": 1.2464350461959839, "learning_rate": 4.849231551964771e-06, "loss": 0.5124, "step": 71 }, { "epoch": 0.14090019569471623, "grad_norm": 0.9351313710212708, "learning_rate": 4.843757568126366e-06, "loss": 0.5152, "step": 72 }, { "epoch": 0.14285714285714285, "grad_norm": 0.842991054058075, "learning_rate": 4.838189178074867e-06, "loss": 0.5254, "step": 73 }, { "epoch": 0.14481409001956946, "grad_norm": 0.7789003252983093, "learning_rate": 4.832526606104213e-06, "loss": 0.5528, "step": 74 }, { "epoch": 0.14677103718199608, "grad_norm": 0.8701135516166687, "learning_rate": 4.826770080301978e-06, "loss": 0.5243, "step": 75 }, { "epoch": 0.1487279843444227, "grad_norm": 0.8384250998497009, "learning_rate": 4.8209198325401815e-06, "loss": 0.4648, "step": 76 }, { "epoch": 0.1506849315068493, "grad_norm": 1.0472533702850342, "learning_rate": 4.814976098465951e-06, "loss": 0.5342, "step": 77 }, { "epoch": 0.15264187866927592, "grad_norm": 0.9264402389526367, "learning_rate": 4.808939117492028e-06, "loss": 0.5267, "step": 78 }, { "epoch": 0.15459882583170254, "grad_norm": 0.8155198097229004, "learning_rate": 4.802809132787125e-06, "loss": 0.5363, "step": 79 }, { "epoch": 0.15655577299412915, "grad_norm": 0.8857468366622925, "learning_rate": 4.796586391266135e-06, "loss": 0.5021, "step": 80 }, { "epoch": 0.15851272015655576, "grad_norm": 1.0320619344711304, "learning_rate": 4.790271143580174e-06, "loss": 0.4892, "step": 81 }, { "epoch": 0.16046966731898238, "grad_norm": 0.9655166268348694, "learning_rate": 4.783863644106502e-06, "loss": 0.5493, "step": 82 }, { "epoch": 0.162426614481409, "grad_norm": 1.3644921779632568, "learning_rate": 4.777364150938263e-06, "loss": 0.4835, "step": 83 }, { "epoch": 0.1643835616438356, "grad_norm": 1.291692852973938, "learning_rate": 4.770772925874093e-06, "loss": 0.5755, "step": 84 }, { "epoch": 0.16634050880626222, "grad_norm": 1.0446902513504028, "learning_rate": 4.764090234407578e-06, "loss": 0.5659, "step": 85 }, { "epoch": 0.16829745596868884, "grad_norm": 0.9225801825523376, "learning_rate": 4.757316345716554e-06, "loss": 0.4067, "step": 86 }, { "epoch": 0.17025440313111545, "grad_norm": 0.8291013240814209, "learning_rate": 4.75045153265227e-06, "loss": 0.4946, "step": 87 }, { "epoch": 0.17221135029354206, "grad_norm": 1.1656488180160522, "learning_rate": 4.743496071728396e-06, "loss": 0.4933, "step": 88 }, { "epoch": 0.17416829745596868, "grad_norm": 0.9090279936790466, "learning_rate": 4.736450243109885e-06, "loss": 0.5085, "step": 89 }, { "epoch": 0.1761252446183953, "grad_norm": 1.2236806154251099, "learning_rate": 4.729314330601684e-06, "loss": 0.5147, "step": 90 }, { "epoch": 0.1780821917808219, "grad_norm": 0.9335976839065552, "learning_rate": 4.7220886216373095e-06, "loss": 0.4589, "step": 91 }, { "epoch": 0.18003913894324852, "grad_norm": 0.759772002696991, "learning_rate": 4.714773407267264e-06, "loss": 0.5398, "step": 92 }, { "epoch": 0.18199608610567514, "grad_norm": 0.9582347869873047, "learning_rate": 4.707368982147318e-06, "loss": 0.5696, "step": 93 }, { "epoch": 0.18395303326810175, "grad_norm": 0.9130314588546753, "learning_rate": 4.699875644526633e-06, "loss": 0.4803, "step": 94 }, { "epoch": 0.18590998043052837, "grad_norm": 0.9103049635887146, "learning_rate": 4.692293696235758e-06, "loss": 0.4833, "step": 95 }, { "epoch": 0.18786692759295498, "grad_norm": 0.7975893616676331, "learning_rate": 4.684623442674463e-06, "loss": 0.5263, "step": 96 }, { "epoch": 0.1898238747553816, "grad_norm": 0.761643648147583, "learning_rate": 4.676865192799443e-06, "loss": 0.4519, "step": 97 }, { "epoch": 0.1917808219178082, "grad_norm": 0.7510681748390198, "learning_rate": 4.669019259111873e-06, "loss": 0.4871, "step": 98 }, { "epoch": 0.19373776908023482, "grad_norm": 1.1785235404968262, "learning_rate": 4.661085957644817e-06, "loss": 0.4644, "step": 99 }, { "epoch": 0.19569471624266144, "grad_norm": 1.2464004755020142, "learning_rate": 4.653065607950502e-06, "loss": 0.4791, "step": 100 }, { "epoch": 0.19765166340508805, "grad_norm": 2.580218553543091, "learning_rate": 4.644958533087443e-06, "loss": 0.4146, "step": 101 }, { "epoch": 0.19960861056751467, "grad_norm": 0.9442769289016724, "learning_rate": 4.636765059607434e-06, "loss": 0.494, "step": 102 }, { "epoch": 0.20156555772994128, "grad_norm": 0.7965562343597412, "learning_rate": 4.628485517542393e-06, "loss": 0.4496, "step": 103 }, { "epoch": 0.2035225048923679, "grad_norm": 1.2338522672653198, "learning_rate": 4.620120240391065e-06, "loss": 0.4592, "step": 104 }, { "epoch": 0.2054794520547945, "grad_norm": 0.8661827445030212, "learning_rate": 4.611669565105597e-06, "loss": 0.4883, "step": 105 }, { "epoch": 0.20743639921722112, "grad_norm": 1.014655351638794, "learning_rate": 4.603133832077953e-06, "loss": 0.5101, "step": 106 }, { "epoch": 0.20939334637964774, "grad_norm": 0.9033066630363464, "learning_rate": 4.5945133851262185e-06, "loss": 0.4515, "step": 107 }, { "epoch": 0.21135029354207435, "grad_norm": 0.91737961769104, "learning_rate": 4.585808571480739e-06, "loss": 0.4886, "step": 108 }, { "epoch": 0.21330724070450097, "grad_norm": 0.9076818823814392, "learning_rate": 4.577019741770137e-06, "loss": 0.5572, "step": 109 }, { "epoch": 0.21526418786692758, "grad_norm": 0.9256044626235962, "learning_rate": 4.5681472500071935e-06, "loss": 0.5089, "step": 110 }, { "epoch": 0.2172211350293542, "grad_norm": 0.8705273270606995, "learning_rate": 4.559191453574582e-06, "loss": 0.5199, "step": 111 }, { "epoch": 0.2191780821917808, "grad_norm": 0.8358094096183777, "learning_rate": 4.550152713210478e-06, "loss": 0.5091, "step": 112 }, { "epoch": 0.22113502935420742, "grad_norm": 1.0409964323043823, "learning_rate": 4.541031392994025e-06, "loss": 0.4997, "step": 113 }, { "epoch": 0.22309197651663404, "grad_norm": 0.8039932250976562, "learning_rate": 4.53182786033067e-06, "loss": 0.537, "step": 114 }, { "epoch": 0.22504892367906065, "grad_norm": 0.9191640615463257, "learning_rate": 4.522542485937369e-06, "loss": 0.5799, "step": 115 }, { "epoch": 0.22700587084148727, "grad_norm": 0.8132153153419495, "learning_rate": 4.513175643827647e-06, "loss": 0.5217, "step": 116 }, { "epoch": 0.22896281800391388, "grad_norm": 0.7776696085929871, "learning_rate": 4.503727711296539e-06, "loss": 0.4729, "step": 117 }, { "epoch": 0.2309197651663405, "grad_norm": 0.8824874758720398, "learning_rate": 4.494199068905389e-06, "loss": 0.4977, "step": 118 }, { "epoch": 0.2328767123287671, "grad_norm": 0.9938674569129944, "learning_rate": 4.484590100466524e-06, "loss": 0.5067, "step": 119 }, { "epoch": 0.23483365949119372, "grad_norm": 1.018510103225708, "learning_rate": 4.474901193027791e-06, "loss": 0.5855, "step": 120 }, { "epoch": 0.23679060665362034, "grad_norm": 1.0530946254730225, "learning_rate": 4.4651327368569695e-06, "loss": 0.4835, "step": 121 }, { "epoch": 0.23874755381604695, "grad_norm": 0.7325494289398193, "learning_rate": 4.455285125426049e-06, "loss": 0.5043, "step": 122 }, { "epoch": 0.24070450097847357, "grad_norm": 1.2264351844787598, "learning_rate": 4.445358755395382e-06, "loss": 0.4991, "step": 123 }, { "epoch": 0.24266144814090018, "grad_norm": 0.7878324389457703, "learning_rate": 4.435354026597707e-06, "loss": 0.4943, "step": 124 }, { "epoch": 0.2446183953033268, "grad_norm": 1.0379810333251953, "learning_rate": 4.425271342022039e-06, "loss": 0.5664, "step": 125 }, { "epoch": 0.2465753424657534, "grad_norm": 1.2007404565811157, "learning_rate": 4.415111107797445e-06, "loss": 0.4495, "step": 126 }, { "epoch": 0.24853228962818003, "grad_norm": 1.4260215759277344, "learning_rate": 4.404873733176678e-06, "loss": 0.4848, "step": 127 }, { "epoch": 0.25048923679060664, "grad_norm": 0.7717714309692383, "learning_rate": 4.3945596305196925e-06, "loss": 0.4975, "step": 128 }, { "epoch": 0.25244618395303325, "grad_norm": 1.0631009340286255, "learning_rate": 4.384169215277042e-06, "loss": 0.538, "step": 129 }, { "epoch": 0.25440313111545987, "grad_norm": 0.9604893326759338, "learning_rate": 4.373702905973136e-06, "loss": 0.554, "step": 130 }, { "epoch": 0.2563600782778865, "grad_norm": 0.8638473749160767, "learning_rate": 4.363161124189387e-06, "loss": 0.4839, "step": 131 }, { "epoch": 0.2583170254403131, "grad_norm": 0.8187501430511475, "learning_rate": 4.352544294547229e-06, "loss": 0.5105, "step": 132 }, { "epoch": 0.2602739726027397, "grad_norm": 1.4357470273971558, "learning_rate": 4.341852844691012e-06, "loss": 0.4532, "step": 133 }, { "epoch": 0.2622309197651663, "grad_norm": 0.8292232155799866, "learning_rate": 4.331087205270778e-06, "loss": 0.451, "step": 134 }, { "epoch": 0.26418786692759294, "grad_norm": 0.8243665099143982, "learning_rate": 4.320247809924911e-06, "loss": 0.4857, "step": 135 }, { "epoch": 0.26614481409001955, "grad_norm": 0.9147266745567322, "learning_rate": 4.309335095262675e-06, "loss": 0.4778, "step": 136 }, { "epoch": 0.26810176125244617, "grad_norm": 0.8612287044525146, "learning_rate": 4.2983495008466285e-06, "loss": 0.4627, "step": 137 }, { "epoch": 0.2700587084148728, "grad_norm": 0.8230846524238586, "learning_rate": 4.287291469174909e-06, "loss": 0.4627, "step": 138 }, { "epoch": 0.2720156555772994, "grad_norm": 0.8767359852790833, "learning_rate": 4.276161445663423e-06, "loss": 0.5119, "step": 139 }, { "epoch": 0.273972602739726, "grad_norm": 0.8119643926620483, "learning_rate": 4.264959878627891e-06, "loss": 0.4495, "step": 140 }, { "epoch": 0.2759295499021526, "grad_norm": 0.7973845601081848, "learning_rate": 4.253687219265803e-06, "loss": 0.5228, "step": 141 }, { "epoch": 0.27788649706457924, "grad_norm": 0.892238199710846, "learning_rate": 4.242343921638235e-06, "loss": 0.5154, "step": 142 }, { "epoch": 0.27984344422700586, "grad_norm": 1.3092166185379028, "learning_rate": 4.230930442651558e-06, "loss": 0.5085, "step": 143 }, { "epoch": 0.28180039138943247, "grad_norm": 1.2284399271011353, "learning_rate": 4.219447242039043e-06, "loss": 0.4366, "step": 144 }, { "epoch": 0.2837573385518591, "grad_norm": 1.0883151292800903, "learning_rate": 4.207894782342337e-06, "loss": 0.5958, "step": 145 }, { "epoch": 0.2857142857142857, "grad_norm": 1.1132919788360596, "learning_rate": 4.196273528892831e-06, "loss": 0.4348, "step": 146 }, { "epoch": 0.2876712328767123, "grad_norm": 1.2576059103012085, "learning_rate": 4.18458394979292e-06, "loss": 0.5247, "step": 147 }, { "epoch": 0.2896281800391389, "grad_norm": 0.8995031714439392, "learning_rate": 4.172826515897146e-06, "loss": 0.5082, "step": 148 }, { "epoch": 0.29158512720156554, "grad_norm": 0.7533922791481018, "learning_rate": 4.161001700793231e-06, "loss": 0.4644, "step": 149 }, { "epoch": 0.29354207436399216, "grad_norm": 0.9206835031509399, "learning_rate": 4.149109980783004e-06, "loss": 0.494, "step": 150 }, { "epoch": 0.29549902152641877, "grad_norm": 1.208590030670166, "learning_rate": 4.137151834863213e-06, "loss": 0.5545, "step": 151 }, { "epoch": 0.2974559686888454, "grad_norm": 0.7689659595489502, "learning_rate": 4.125127744706232e-06, "loss": 0.4845, "step": 152 }, { "epoch": 0.299412915851272, "grad_norm": 1.0235570669174194, "learning_rate": 4.113038194640658e-06, "loss": 0.4778, "step": 153 }, { "epoch": 0.3013698630136986, "grad_norm": 1.1112617254257202, "learning_rate": 4.100883671631806e-06, "loss": 0.5206, "step": 154 }, { "epoch": 0.30332681017612523, "grad_norm": 1.073519229888916, "learning_rate": 4.088664665262091e-06, "loss": 0.4944, "step": 155 }, { "epoch": 0.30528375733855184, "grad_norm": 0.8319236040115356, "learning_rate": 4.076381667711306e-06, "loss": 0.4741, "step": 156 }, { "epoch": 0.30724070450097846, "grad_norm": 1.2600641250610352, "learning_rate": 4.064035173736804e-06, "loss": 0.5311, "step": 157 }, { "epoch": 0.30919765166340507, "grad_norm": 0.8686632513999939, "learning_rate": 4.05162568065356e-06, "loss": 0.5436, "step": 158 }, { "epoch": 0.3111545988258317, "grad_norm": 0.7053869366645813, "learning_rate": 4.039153688314146e-06, "loss": 0.4846, "step": 159 }, { "epoch": 0.3131115459882583, "grad_norm": 0.8360055685043335, "learning_rate": 4.0266196990885955e-06, "loss": 0.5041, "step": 160 }, { "epoch": 0.3150684931506849, "grad_norm": 0.8842881321907043, "learning_rate": 4.014024217844167e-06, "loss": 0.4708, "step": 161 }, { "epoch": 0.31702544031311153, "grad_norm": 1.0392301082611084, "learning_rate": 4.001367751925008e-06, "loss": 0.5315, "step": 162 }, { "epoch": 0.31898238747553814, "grad_norm": 0.8801809549331665, "learning_rate": 3.98865081113172e-06, "loss": 0.4438, "step": 163 }, { "epoch": 0.32093933463796476, "grad_norm": 1.395719289779663, "learning_rate": 3.9758739077008256e-06, "loss": 0.4929, "step": 164 }, { "epoch": 0.32289628180039137, "grad_norm": 0.8075605034828186, "learning_rate": 3.96303755628413e-06, "loss": 0.4364, "step": 165 }, { "epoch": 0.324853228962818, "grad_norm": 0.9566773772239685, "learning_rate": 3.950142273927996e-06, "loss": 0.4001, "step": 166 }, { "epoch": 0.3268101761252446, "grad_norm": 2.270550012588501, "learning_rate": 3.937188580052518e-06, "loss": 0.4683, "step": 167 }, { "epoch": 0.3287671232876712, "grad_norm": 0.8937717080116272, "learning_rate": 3.924176996430597e-06, "loss": 0.479, "step": 168 }, { "epoch": 0.33072407045009783, "grad_norm": 0.9620490074157715, "learning_rate": 3.911108047166924e-06, "loss": 0.4669, "step": 169 }, { "epoch": 0.33072407045009783, "eval_accuracy": 0.825246566025413, "eval_accuracy_first_token": 0.9521367521367521, "eval_accuracy_first_token_all": 0.9670737362807235, "eval_accuracy_first_token_all_total": 6469, "eval_accuracy_first_token_calculate": 0.7954545454545454, "eval_accuracy_first_token_calculate_total": 44, "eval_accuracy_first_token_execute": 1.0, "eval_accuracy_first_token_execute_total": 202, "eval_accuracy_first_token_get": 0.9649122807017544, "eval_accuracy_first_token_get_total": 456, "eval_accuracy_first_token_python": 0.8777777777777778, "eval_accuracy_first_token_python_total": 990, "eval_loss": 0.5176534056663513, "eval_perplexity": 1.206566771452624, "eval_runtime": 524.2306, "eval_samples_per_second": 1.269, "eval_steps_per_second": 0.16, "eval_total_number_first_token": 9360, "step": 169 }, { "epoch": 0.33268101761252444, "grad_norm": 0.8628460168838501, "learning_rate": 3.897982258676867e-06, "loss": 0.4727, "step": 170 }, { "epoch": 0.33463796477495106, "grad_norm": 0.8537535071372986, "learning_rate": 3.8848001596652765e-06, "loss": 0.4746, "step": 171 }, { "epoch": 0.33659491193737767, "grad_norm": 0.9613227248191833, "learning_rate": 3.8715622811051754e-06, "loss": 0.5148, "step": 172 }, { "epoch": 0.3385518590998043, "grad_norm": 0.8833454251289368, "learning_rate": 3.858269156216383e-06, "loss": 0.5125, "step": 173 }, { "epoch": 0.3405088062622309, "grad_norm": 0.9823891520500183, "learning_rate": 3.844921320444031e-06, "loss": 0.5127, "step": 174 }, { "epoch": 0.3424657534246575, "grad_norm": 1.0789107084274292, "learning_rate": 3.8315193114369995e-06, "loss": 0.4935, "step": 175 }, { "epoch": 0.34442270058708413, "grad_norm": 0.8753149509429932, "learning_rate": 3.8180636690262565e-06, "loss": 0.4543, "step": 176 }, { "epoch": 0.34637964774951074, "grad_norm": 1.7468674182891846, "learning_rate": 3.804554935203115e-06, "loss": 0.4955, "step": 177 }, { "epoch": 0.34833659491193736, "grad_norm": 0.9011304974555969, "learning_rate": 3.7909936540974052e-06, "loss": 0.5992, "step": 178 }, { "epoch": 0.350293542074364, "grad_norm": 0.9541127681732178, "learning_rate": 3.777380371955552e-06, "loss": 0.5322, "step": 179 }, { "epoch": 0.3522504892367906, "grad_norm": 1.3841750621795654, "learning_rate": 3.7637156371185744e-06, "loss": 0.4661, "step": 180 }, { "epoch": 0.3542074363992172, "grad_norm": 1.0240124464035034, "learning_rate": 3.7500000000000005e-06, "loss": 0.5231, "step": 181 }, { "epoch": 0.3561643835616438, "grad_norm": 1.444016933441162, "learning_rate": 3.7362340130636926e-06, "loss": 0.5203, "step": 182 }, { "epoch": 0.35812133072407043, "grad_norm": 0.7845962047576904, "learning_rate": 3.7224182308015977e-06, "loss": 0.4929, "step": 183 }, { "epoch": 0.36007827788649704, "grad_norm": 1.0257796049118042, "learning_rate": 3.7085532097114098e-06, "loss": 0.4597, "step": 184 }, { "epoch": 0.36203522504892366, "grad_norm": 0.9083458185195923, "learning_rate": 3.6946395082741582e-06, "loss": 0.5254, "step": 185 }, { "epoch": 0.3639921722113503, "grad_norm": 0.9128417372703552, "learning_rate": 3.6806776869317074e-06, "loss": 0.4428, "step": 186 }, { "epoch": 0.3659491193737769, "grad_norm": 1.1980143785476685, "learning_rate": 3.6666683080641846e-06, "loss": 0.5374, "step": 187 }, { "epoch": 0.3679060665362035, "grad_norm": 0.8467942476272583, "learning_rate": 3.6526119359673283e-06, "loss": 0.4963, "step": 188 }, { "epoch": 0.3698630136986301, "grad_norm": 0.8798732757568359, "learning_rate": 3.6385091368297582e-06, "loss": 0.5208, "step": 189 }, { "epoch": 0.37181996086105673, "grad_norm": 0.8612852692604065, "learning_rate": 3.624360478710165e-06, "loss": 0.3989, "step": 190 }, { "epoch": 0.37377690802348335, "grad_norm": 0.7529587149620056, "learning_rate": 3.6101665315144357e-06, "loss": 0.5015, "step": 191 }, { "epoch": 0.37573385518590996, "grad_norm": 0.8704853653907776, "learning_rate": 3.595927866972694e-06, "loss": 0.4318, "step": 192 }, { "epoch": 0.3776908023483366, "grad_norm": 1.1298363208770752, "learning_rate": 3.581645058616271e-06, "loss": 0.5047, "step": 193 }, { "epoch": 0.3796477495107632, "grad_norm": 1.2964321374893188, "learning_rate": 3.5673186817546047e-06, "loss": 0.4764, "step": 194 }, { "epoch": 0.3816046966731898, "grad_norm": 2.080096960067749, "learning_rate": 3.552949313452067e-06, "loss": 0.4808, "step": 195 }, { "epoch": 0.3835616438356164, "grad_norm": 0.8993785977363586, "learning_rate": 3.5385375325047167e-06, "loss": 0.5577, "step": 196 }, { "epoch": 0.38551859099804303, "grad_norm": 0.8617794513702393, "learning_rate": 3.5240839194169885e-06, "loss": 0.5042, "step": 197 }, { "epoch": 0.38747553816046965, "grad_norm": 0.9634183645248413, "learning_rate": 3.5095890563783124e-06, "loss": 0.466, "step": 198 }, { "epoch": 0.38943248532289626, "grad_norm": 0.9015300273895264, "learning_rate": 3.4950535272396564e-06, "loss": 0.3887, "step": 199 }, { "epoch": 0.3913894324853229, "grad_norm": 0.8658633828163147, "learning_rate": 3.480477917490014e-06, "loss": 0.4665, "step": 200 }, { "epoch": 0.3933463796477495, "grad_norm": 0.7967968583106995, "learning_rate": 3.4658628142328215e-06, "loss": 0.515, "step": 201 }, { "epoch": 0.3953033268101761, "grad_norm": 0.7495056986808777, "learning_rate": 3.4512088061623077e-06, "loss": 0.4345, "step": 202 }, { "epoch": 0.3972602739726027, "grad_norm": 0.9585980772972107, "learning_rate": 3.436516483539781e-06, "loss": 0.4084, "step": 203 }, { "epoch": 0.39921722113502933, "grad_norm": 0.9240750670433044, "learning_rate": 3.4217864381698523e-06, "loss": 0.4451, "step": 204 }, { "epoch": 0.40117416829745595, "grad_norm": 1.2117798328399658, "learning_rate": 3.4070192633766025e-06, "loss": 0.5152, "step": 205 }, { "epoch": 0.40313111545988256, "grad_norm": 0.868486225605011, "learning_rate": 3.39221555397968e-06, "loss": 0.5456, "step": 206 }, { "epoch": 0.4050880626223092, "grad_norm": 0.7969531416893005, "learning_rate": 3.37737590627034e-06, "loss": 0.4295, "step": 207 }, { "epoch": 0.4070450097847358, "grad_norm": 0.9103299975395203, "learning_rate": 3.362500917987427e-06, "loss": 0.4485, "step": 208 }, { "epoch": 0.4090019569471624, "grad_norm": 1.0487585067749023, "learning_rate": 3.3475911882933014e-06, "loss": 0.4807, "step": 209 }, { "epoch": 0.410958904109589, "grad_norm": 0.9155584573745728, "learning_rate": 3.332647317749702e-06, "loss": 0.4617, "step": 210 }, { "epoch": 0.41291585127201563, "grad_norm": 0.9164103865623474, "learning_rate": 3.3176699082935546e-06, "loss": 0.5041, "step": 211 }, { "epoch": 0.41487279843444225, "grad_norm": 0.7580545544624329, "learning_rate": 3.3026595632127274e-06, "loss": 0.465, "step": 212 }, { "epoch": 0.41682974559686886, "grad_norm": 1.0577958822250366, "learning_rate": 3.2876168871217322e-06, "loss": 0.4055, "step": 213 }, { "epoch": 0.4187866927592955, "grad_norm": 1.2304415702819824, "learning_rate": 3.272542485937369e-06, "loss": 0.3852, "step": 214 }, { "epoch": 0.4207436399217221, "grad_norm": 0.905158281326294, "learning_rate": 3.2574369668543187e-06, "loss": 0.4861, "step": 215 }, { "epoch": 0.4227005870841487, "grad_norm": 0.9109801054000854, "learning_rate": 3.2423009383206876e-06, "loss": 0.4247, "step": 216 }, { "epoch": 0.4246575342465753, "grad_norm": 0.8025485277175903, "learning_rate": 3.227135010013498e-06, "loss": 0.5319, "step": 217 }, { "epoch": 0.42661448140900193, "grad_norm": 0.883714497089386, "learning_rate": 3.211939792814131e-06, "loss": 0.5287, "step": 218 }, { "epoch": 0.42857142857142855, "grad_norm": 0.9827890396118164, "learning_rate": 3.19671589878372e-06, "loss": 0.4799, "step": 219 }, { "epoch": 0.43052837573385516, "grad_norm": 0.8296178579330444, "learning_rate": 3.1814639411384953e-06, "loss": 0.4725, "step": 220 }, { "epoch": 0.4324853228962818, "grad_norm": 0.8092741370201111, "learning_rate": 3.1661845342250874e-06, "loss": 0.5054, "step": 221 }, { "epoch": 0.4344422700587084, "grad_norm": 1.160125732421875, "learning_rate": 3.1508782934957804e-06, "loss": 0.6022, "step": 222 }, { "epoch": 0.436399217221135, "grad_norm": 0.871837854385376, "learning_rate": 3.1355458354837183e-06, "loss": 0.4545, "step": 223 }, { "epoch": 0.4383561643835616, "grad_norm": 0.8639246225357056, "learning_rate": 3.1201877777780724e-06, "loss": 0.449, "step": 224 }, { "epoch": 0.44031311154598823, "grad_norm": 0.9144279956817627, "learning_rate": 3.1048047389991693e-06, "loss": 0.4308, "step": 225 }, { "epoch": 0.44227005870841485, "grad_norm": 1.0165725946426392, "learning_rate": 3.089397338773569e-06, "loss": 0.4997, "step": 226 }, { "epoch": 0.44422700587084146, "grad_norm": 0.7787861824035645, "learning_rate": 3.0739661977091027e-06, "loss": 0.4408, "step": 227 }, { "epoch": 0.4461839530332681, "grad_norm": 0.8962077498435974, "learning_rate": 3.0585119373698858e-06, "loss": 0.4879, "step": 228 }, { "epoch": 0.4481409001956947, "grad_norm": 0.8481760621070862, "learning_rate": 3.04303518025127e-06, "loss": 0.4525, "step": 229 }, { "epoch": 0.4500978473581213, "grad_norm": 0.9689728021621704, "learning_rate": 3.0275365497547747e-06, "loss": 0.5199, "step": 230 }, { "epoch": 0.4520547945205479, "grad_norm": 1.0657813549041748, "learning_rate": 3.012016670162977e-06, "loss": 0.4834, "step": 231 }, { "epoch": 0.45401174168297453, "grad_norm": 1.0324097871780396, "learning_rate": 2.9964761666143638e-06, "loss": 0.5407, "step": 232 }, { "epoch": 0.45596868884540115, "grad_norm": 0.8452147245407104, "learning_rate": 2.980915665078153e-06, "loss": 0.5108, "step": 233 }, { "epoch": 0.45792563600782776, "grad_norm": 1.1484103202819824, "learning_rate": 2.9653357923290753e-06, "loss": 0.4082, "step": 234 }, { "epoch": 0.4598825831702544, "grad_norm": 0.859313428401947, "learning_rate": 2.949737175922135e-06, "loss": 0.4752, "step": 235 }, { "epoch": 0.461839530332681, "grad_norm": 0.87496417760849, "learning_rate": 2.9341204441673267e-06, "loss": 0.4624, "step": 236 }, { "epoch": 0.4637964774951076, "grad_norm": 0.9420116543769836, "learning_rate": 2.9184862261043272e-06, "loss": 0.4557, "step": 237 }, { "epoch": 0.4657534246575342, "grad_norm": 1.4860702753067017, "learning_rate": 2.902835151477161e-06, "loss": 0.4617, "step": 238 }, { "epoch": 0.46771037181996084, "grad_norm": 0.8771023750305176, "learning_rate": 2.887167850708831e-06, "loss": 0.5299, "step": 239 }, { "epoch": 0.46966731898238745, "grad_norm": 0.8673617839813232, "learning_rate": 2.8714849548759293e-06, "loss": 0.5504, "step": 240 }, { "epoch": 0.47162426614481406, "grad_norm": 0.8307452201843262, "learning_rate": 2.8557870956832135e-06, "loss": 0.4735, "step": 241 }, { "epoch": 0.4735812133072407, "grad_norm": 0.9233512282371521, "learning_rate": 2.840074905438161e-06, "loss": 0.3701, "step": 242 }, { "epoch": 0.4755381604696673, "grad_norm": 1.0768812894821167, "learning_rate": 2.8243490170255046e-06, "loss": 0.4983, "step": 243 }, { "epoch": 0.4774951076320939, "grad_norm": 0.9305315017700195, "learning_rate": 2.808610063881737e-06, "loss": 0.4137, "step": 244 }, { "epoch": 0.4794520547945205, "grad_norm": 1.1971187591552734, "learning_rate": 2.792858679969596e-06, "loss": 0.452, "step": 245 }, { "epoch": 0.48140900195694714, "grad_norm": 1.314292073249817, "learning_rate": 2.7770954997525277e-06, "loss": 0.526, "step": 246 }, { "epoch": 0.48336594911937375, "grad_norm": 1.2386282682418823, "learning_rate": 2.761321158169134e-06, "loss": 0.5002, "step": 247 }, { "epoch": 0.48532289628180036, "grad_norm": 0.9772767424583435, "learning_rate": 2.745536290607593e-06, "loss": 0.5091, "step": 248 }, { "epoch": 0.487279843444227, "grad_norm": 1.0364662408828735, "learning_rate": 2.729741532880069e-06, "loss": 0.4752, "step": 249 }, { "epoch": 0.4892367906066536, "grad_norm": 0.8030025362968445, "learning_rate": 2.7139375211971e-06, "loss": 0.462, "step": 250 }, { "epoch": 0.4911937377690802, "grad_norm": 1.3889553546905518, "learning_rate": 2.6981248921419713e-06, "loss": 0.4102, "step": 251 }, { "epoch": 0.4931506849315068, "grad_norm": 0.9577500224113464, "learning_rate": 2.682304282645077e-06, "loss": 0.5008, "step": 252 }, { "epoch": 0.49510763209393344, "grad_norm": 1.3206193447113037, "learning_rate": 2.66647632995826e-06, "loss": 0.4624, "step": 253 }, { "epoch": 0.49706457925636005, "grad_norm": 0.8159929513931274, "learning_rate": 2.6506416716291466e-06, "loss": 0.4561, "step": 254 }, { "epoch": 0.49902152641878667, "grad_norm": 0.854573130607605, "learning_rate": 2.634800945475465e-06, "loss": 0.5503, "step": 255 }, { "epoch": 0.5009784735812133, "grad_norm": 9.345633506774902, "learning_rate": 2.6189547895593565e-06, "loss": 0.5216, "step": 256 }, { "epoch": 0.50293542074364, "grad_norm": 0.8881295323371887, "learning_rate": 2.6031038421616684e-06, "loss": 0.4713, "step": 257 }, { "epoch": 0.5048923679060665, "grad_norm": 1.7568496465682983, "learning_rate": 2.587248741756253e-06, "loss": 0.5096, "step": 258 }, { "epoch": 0.5068493150684932, "grad_norm": 0.8306764960289001, "learning_rate": 2.5713901269842405e-06, "loss": 0.4504, "step": 259 }, { "epoch": 0.5088062622309197, "grad_norm": 0.9716941118240356, "learning_rate": 2.555528636628324e-06, "loss": 0.4668, "step": 260 }, { "epoch": 0.5107632093933464, "grad_norm": 0.8290694355964661, "learning_rate": 2.53966490958702e-06, "loss": 0.4288, "step": 261 }, { "epoch": 0.512720156555773, "grad_norm": 0.9514800310134888, "learning_rate": 2.5237995848489422e-06, "loss": 0.5157, "step": 262 }, { "epoch": 0.5146771037181996, "grad_norm": 1.515278935432434, "learning_rate": 2.507933301467056e-06, "loss": 0.4863, "step": 263 }, { "epoch": 0.5166340508806262, "grad_norm": 0.9582359790802002, "learning_rate": 2.4920666985329446e-06, "loss": 0.4694, "step": 264 }, { "epoch": 0.5185909980430529, "grad_norm": 0.8128112554550171, "learning_rate": 2.4762004151510586e-06, "loss": 0.4244, "step": 265 }, { "epoch": 0.5205479452054794, "grad_norm": 1.151044487953186, "learning_rate": 2.4603350904129802e-06, "loss": 0.4555, "step": 266 }, { "epoch": 0.5225048923679061, "grad_norm": 0.8072860240936279, "learning_rate": 2.4444713633716764e-06, "loss": 0.4173, "step": 267 }, { "epoch": 0.5244618395303327, "grad_norm": 1.8496747016906738, "learning_rate": 2.42860987301576e-06, "loss": 0.4206, "step": 268 }, { "epoch": 0.5264187866927593, "grad_norm": 1.096216082572937, "learning_rate": 2.4127512582437486e-06, "loss": 0.4501, "step": 269 }, { "epoch": 0.5283757338551859, "grad_norm": 0.9519087076187134, "learning_rate": 2.3968961578383324e-06, "loss": 0.4848, "step": 270 }, { "epoch": 0.5303326810176126, "grad_norm": 0.9204405546188354, "learning_rate": 2.3810452104406444e-06, "loss": 0.4526, "step": 271 }, { "epoch": 0.5322896281800391, "grad_norm": 0.8748743534088135, "learning_rate": 2.3651990545245357e-06, "loss": 0.4547, "step": 272 }, { "epoch": 0.5342465753424658, "grad_norm": 1.6212592124938965, "learning_rate": 2.3493583283708542e-06, "loss": 0.4937, "step": 273 }, { "epoch": 0.5362035225048923, "grad_norm": 0.9793727993965149, "learning_rate": 2.3335236700417404e-06, "loss": 0.4456, "step": 274 }, { "epoch": 0.538160469667319, "grad_norm": 1.7149184942245483, "learning_rate": 2.3176957173549236e-06, "loss": 0.4737, "step": 275 }, { "epoch": 0.5401174168297456, "grad_norm": 0.9447053074836731, "learning_rate": 2.3018751078580287e-06, "loss": 0.4496, "step": 276 }, { "epoch": 0.5420743639921722, "grad_norm": 0.9250771999359131, "learning_rate": 2.2860624788029013e-06, "loss": 0.4674, "step": 277 }, { "epoch": 0.5440313111545988, "grad_norm": 0.8631194233894348, "learning_rate": 2.2702584671199317e-06, "loss": 0.48, "step": 278 }, { "epoch": 0.5459882583170255, "grad_norm": 0.8921899199485779, "learning_rate": 2.2544637093924072e-06, "loss": 0.4009, "step": 279 }, { "epoch": 0.547945205479452, "grad_norm": 0.811696469783783, "learning_rate": 2.238678841830867e-06, "loss": 0.4714, "step": 280 }, { "epoch": 0.5499021526418787, "grad_norm": 0.7900722026824951, "learning_rate": 2.2229045002474727e-06, "loss": 0.3956, "step": 281 }, { "epoch": 0.5518590998043053, "grad_norm": 0.8538399934768677, "learning_rate": 2.2071413200304046e-06, "loss": 0.3488, "step": 282 }, { "epoch": 0.5538160469667319, "grad_norm": 0.9310709238052368, "learning_rate": 2.1913899361182634e-06, "loss": 0.3915, "step": 283 }, { "epoch": 0.5557729941291585, "grad_norm": 0.8419170379638672, "learning_rate": 2.1756509829744958e-06, "loss": 0.4716, "step": 284 }, { "epoch": 0.5577299412915852, "grad_norm": 1.2008228302001953, "learning_rate": 2.1599250945618404e-06, "loss": 0.4493, "step": 285 }, { "epoch": 0.5596868884540117, "grad_norm": 0.8127449750900269, "learning_rate": 2.1442129043167877e-06, "loss": 0.4223, "step": 286 }, { "epoch": 0.5616438356164384, "grad_norm": 1.0872187614440918, "learning_rate": 2.128515045124071e-06, "loss": 0.4814, "step": 287 }, { "epoch": 0.5636007827788649, "grad_norm": 0.9573928713798523, "learning_rate": 2.1128321492911697e-06, "loss": 0.4606, "step": 288 }, { "epoch": 0.5655577299412916, "grad_norm": 0.7555910348892212, "learning_rate": 2.0971648485228404e-06, "loss": 0.446, "step": 289 }, { "epoch": 0.5675146771037182, "grad_norm": 0.9281080961227417, "learning_rate": 2.0815137738956736e-06, "loss": 0.4224, "step": 290 }, { "epoch": 0.5694716242661448, "grad_norm": 1.0036050081253052, "learning_rate": 2.0658795558326745e-06, "loss": 0.4476, "step": 291 }, { "epoch": 0.5714285714285714, "grad_norm": 0.8064005970954895, "learning_rate": 2.0502628240778655e-06, "loss": 0.4518, "step": 292 }, { "epoch": 0.5733855185909981, "grad_norm": 0.8491390347480774, "learning_rate": 2.034664207670925e-06, "loss": 0.4947, "step": 293 }, { "epoch": 0.5753424657534246, "grad_norm": 1.433266043663025, "learning_rate": 2.019084334921849e-06, "loss": 0.4929, "step": 294 }, { "epoch": 0.5772994129158513, "grad_norm": 0.8420299291610718, "learning_rate": 2.003523833385637e-06, "loss": 0.4533, "step": 295 }, { "epoch": 0.5792563600782779, "grad_norm": 2.3586318492889404, "learning_rate": 1.987983329837024e-06, "loss": 0.4257, "step": 296 }, { "epoch": 0.5812133072407045, "grad_norm": 0.85833340883255, "learning_rate": 1.972463450245226e-06, "loss": 0.4875, "step": 297 }, { "epoch": 0.5831702544031311, "grad_norm": 0.7927659749984741, "learning_rate": 1.956964819748731e-06, "loss": 0.415, "step": 298 }, { "epoch": 0.5851272015655578, "grad_norm": 0.8850895762443542, "learning_rate": 1.9414880626301147e-06, "loss": 0.409, "step": 299 }, { "epoch": 0.5870841487279843, "grad_norm": 1.509407877922058, "learning_rate": 1.9260338022908972e-06, "loss": 0.5041, "step": 300 }, { "epoch": 0.589041095890411, "grad_norm": 1.5269814729690552, "learning_rate": 1.9106026612264316e-06, "loss": 0.4222, "step": 301 }, { "epoch": 0.5909980430528375, "grad_norm": 1.386004090309143, "learning_rate": 1.895195261000831e-06, "loss": 0.5278, "step": 302 }, { "epoch": 0.5929549902152642, "grad_norm": 1.278283953666687, "learning_rate": 1.8798122222219288e-06, "loss": 0.4823, "step": 303 }, { "epoch": 0.5949119373776908, "grad_norm": 0.8502036333084106, "learning_rate": 1.8644541645162834e-06, "loss": 0.4682, "step": 304 }, { "epoch": 0.5968688845401174, "grad_norm": 0.8835340142250061, "learning_rate": 1.84912170650422e-06, "loss": 0.3474, "step": 305 }, { "epoch": 0.598825831702544, "grad_norm": 0.8175051212310791, "learning_rate": 1.833815465774913e-06, "loss": 0.4262, "step": 306 }, { "epoch": 0.6007827788649707, "grad_norm": 1.031742811203003, "learning_rate": 1.818536058861506e-06, "loss": 0.4432, "step": 307 }, { "epoch": 0.6027397260273972, "grad_norm": 0.9526416659355164, "learning_rate": 1.803284101216281e-06, "loss": 0.3981, "step": 308 }, { "epoch": 0.6046966731898239, "grad_norm": 0.9259310364723206, "learning_rate": 1.7880602071858694e-06, "loss": 0.4249, "step": 309 }, { "epoch": 0.6066536203522505, "grad_norm": 1.0978782176971436, "learning_rate": 1.7728649899865024e-06, "loss": 0.4955, "step": 310 }, { "epoch": 0.6086105675146771, "grad_norm": 0.8304716944694519, "learning_rate": 1.7576990616793139e-06, "loss": 0.4666, "step": 311 }, { "epoch": 0.6105675146771037, "grad_norm": 2.960554838180542, "learning_rate": 1.7425630331456821e-06, "loss": 0.412, "step": 312 }, { "epoch": 0.6125244618395304, "grad_norm": 0.8440503478050232, "learning_rate": 1.7274575140626318e-06, "loss": 0.4814, "step": 313 }, { "epoch": 0.6144814090019569, "grad_norm": 0.8478916883468628, "learning_rate": 1.7123831128782686e-06, "loss": 0.4708, "step": 314 }, { "epoch": 0.6164383561643836, "grad_norm": 0.8599239587783813, "learning_rate": 1.697340436787273e-06, "loss": 0.428, "step": 315 }, { "epoch": 0.6183953033268101, "grad_norm": 1.151842474937439, "learning_rate": 1.6823300917064462e-06, "loss": 0.3433, "step": 316 }, { "epoch": 0.6203522504892368, "grad_norm": 0.8544068336486816, "learning_rate": 1.6673526822502982e-06, "loss": 0.4431, "step": 317 }, { "epoch": 0.6223091976516634, "grad_norm": 1.1891425848007202, "learning_rate": 1.6524088117066984e-06, "loss": 0.4334, "step": 318 }, { "epoch": 0.62426614481409, "grad_norm": 1.1379172801971436, "learning_rate": 1.637499082012574e-06, "loss": 0.5514, "step": 319 }, { "epoch": 0.6262230919765166, "grad_norm": 1.0814030170440674, "learning_rate": 1.6226240937296617e-06, "loss": 0.4772, "step": 320 }, { "epoch": 0.6281800391389433, "grad_norm": 0.9527184963226318, "learning_rate": 1.6077844460203207e-06, "loss": 0.4292, "step": 321 }, { "epoch": 0.6301369863013698, "grad_norm": 0.9083294868469238, "learning_rate": 1.5929807366233979e-06, "loss": 0.501, "step": 322 }, { "epoch": 0.6320939334637965, "grad_norm": 1.4445644617080688, "learning_rate": 1.5782135618301486e-06, "loss": 0.4924, "step": 323 }, { "epoch": 0.6340508806262231, "grad_norm": 1.361970067024231, "learning_rate": 1.56348351646022e-06, "loss": 0.4487, "step": 324 }, { "epoch": 0.6360078277886497, "grad_norm": 0.8321120142936707, "learning_rate": 1.5487911938376925e-06, "loss": 0.4566, "step": 325 }, { "epoch": 0.6379647749510763, "grad_norm": 1.1182819604873657, "learning_rate": 1.5341371857671782e-06, "loss": 0.4253, "step": 326 }, { "epoch": 0.639921722113503, "grad_norm": 1.133865475654602, "learning_rate": 1.5195220825099863e-06, "loss": 0.4212, "step": 327 }, { "epoch": 0.6418786692759295, "grad_norm": 0.9962513446807861, "learning_rate": 1.5049464727603453e-06, "loss": 0.4702, "step": 328 }, { "epoch": 0.6438356164383562, "grad_norm": 1.1037347316741943, "learning_rate": 1.4904109436216885e-06, "loss": 0.5035, "step": 329 }, { "epoch": 0.6457925636007827, "grad_norm": 1.0168620347976685, "learning_rate": 1.475916080583012e-06, "loss": 0.4762, "step": 330 }, { "epoch": 0.6477495107632094, "grad_norm": 1.1695796251296997, "learning_rate": 1.4614624674952843e-06, "loss": 0.4313, "step": 331 }, { "epoch": 0.649706457925636, "grad_norm": 1.4158042669296265, "learning_rate": 1.4470506865479337e-06, "loss": 0.4798, "step": 332 }, { "epoch": 0.6516634050880626, "grad_norm": 0.9545938968658447, "learning_rate": 1.4326813182453959e-06, "loss": 0.4126, "step": 333 }, { "epoch": 0.6536203522504892, "grad_norm": 1.0253559350967407, "learning_rate": 1.4183549413837288e-06, "loss": 0.4633, "step": 334 }, { "epoch": 0.6555772994129159, "grad_norm": 0.9522657990455627, "learning_rate": 1.4040721330273063e-06, "loss": 0.4716, "step": 335 }, { "epoch": 0.6575342465753424, "grad_norm": 0.9612335562705994, "learning_rate": 1.3898334684855647e-06, "loss": 0.4699, "step": 336 }, { "epoch": 0.6594911937377691, "grad_norm": 0.9365352392196655, "learning_rate": 1.375639521289836e-06, "loss": 0.4775, "step": 337 }, { "epoch": 0.6614481409001957, "grad_norm": 0.9329326152801514, "learning_rate": 1.3614908631702435e-06, "loss": 0.4236, "step": 338 }, { "epoch": 0.6614481409001957, "eval_accuracy": 0.8283926414598568, "eval_accuracy_first_token": 0.9538461538461539, "eval_accuracy_first_token_all": 0.9723295718039883, "eval_accuracy_first_token_all_total": 6469, "eval_accuracy_first_token_calculate": 0.9090909090909091, "eval_accuracy_first_token_calculate_total": 44, "eval_accuracy_first_token_execute": 1.0, "eval_accuracy_first_token_execute_total": 202, "eval_accuracy_first_token_get": 0.9517543859649122, "eval_accuracy_first_token_get_total": 456, "eval_accuracy_first_token_python": 0.8838383838383839, "eval_accuracy_first_token_python_total": 990, "eval_loss": 0.5066910982131958, "eval_perplexity": 1.2021342846813718, "eval_runtime": 525.6643, "eval_samples_per_second": 1.265, "eval_steps_per_second": 0.16, "eval_total_number_first_token": 9360, "step": 338 }, { "epoch": 0.6634050880626223, "grad_norm": 0.9786916375160217, "learning_rate": 1.3473880640326725e-06, "loss": 0.4361, "step": 339 }, { "epoch": 0.6653620352250489, "grad_norm": 0.9186453819274902, "learning_rate": 1.3333316919358159e-06, "loss": 0.4658, "step": 340 }, { "epoch": 0.6673189823874756, "grad_norm": 1.02847421169281, "learning_rate": 1.3193223130682937e-06, "loss": 0.4517, "step": 341 }, { "epoch": 0.6692759295499021, "grad_norm": 0.970281720161438, "learning_rate": 1.3053604917258428e-06, "loss": 0.4617, "step": 342 }, { "epoch": 0.6712328767123288, "grad_norm": 0.7810537219047546, "learning_rate": 1.2914467902885902e-06, "loss": 0.4246, "step": 343 }, { "epoch": 0.6731898238747553, "grad_norm": 0.9498399496078491, "learning_rate": 1.2775817691984032e-06, "loss": 0.4706, "step": 344 }, { "epoch": 0.675146771037182, "grad_norm": 0.9036231637001038, "learning_rate": 1.2637659869363085e-06, "loss": 0.4826, "step": 345 }, { "epoch": 0.6771037181996086, "grad_norm": 0.8129305243492126, "learning_rate": 1.2500000000000007e-06, "loss": 0.4163, "step": 346 }, { "epoch": 0.6790606653620352, "grad_norm": 1.4746164083480835, "learning_rate": 1.2362843628814267e-06, "loss": 0.3961, "step": 347 }, { "epoch": 0.6810176125244618, "grad_norm": 11.255887031555176, "learning_rate": 1.222619628044449e-06, "loss": 0.4761, "step": 348 }, { "epoch": 0.6829745596868885, "grad_norm": 0.9121260643005371, "learning_rate": 1.2090063459025956e-06, "loss": 0.4277, "step": 349 }, { "epoch": 0.684931506849315, "grad_norm": 0.9116764068603516, "learning_rate": 1.1954450647968856e-06, "loss": 0.4696, "step": 350 }, { "epoch": 0.6868884540117417, "grad_norm": 1.206604242324829, "learning_rate": 1.181936330973744e-06, "loss": 0.4205, "step": 351 }, { "epoch": 0.6888454011741683, "grad_norm": 0.8744117617607117, "learning_rate": 1.1684806885630003e-06, "loss": 0.5077, "step": 352 }, { "epoch": 0.6908023483365949, "grad_norm": 2.155042886734009, "learning_rate": 1.155078679555969e-06, "loss": 0.4193, "step": 353 }, { "epoch": 0.6927592954990215, "grad_norm": 0.9258475303649902, "learning_rate": 1.1417308437836181e-06, "loss": 0.3645, "step": 354 }, { "epoch": 0.6947162426614482, "grad_norm": 0.7997338771820068, "learning_rate": 1.1284377188948258e-06, "loss": 0.4044, "step": 355 }, { "epoch": 0.6966731898238747, "grad_norm": 0.8342923521995544, "learning_rate": 1.1151998403347245e-06, "loss": 0.4132, "step": 356 }, { "epoch": 0.6986301369863014, "grad_norm": 1.0009496212005615, "learning_rate": 1.1020177413231334e-06, "loss": 0.4046, "step": 357 }, { "epoch": 0.700587084148728, "grad_norm": 1.0892616510391235, "learning_rate": 1.0888919528330778e-06, "loss": 0.4878, "step": 358 }, { "epoch": 0.7025440313111546, "grad_norm": 0.829866886138916, "learning_rate": 1.0758230035694031e-06, "loss": 0.4876, "step": 359 }, { "epoch": 0.7045009784735812, "grad_norm": 0.9134871363639832, "learning_rate": 1.062811419947482e-06, "loss": 0.5027, "step": 360 }, { "epoch": 0.7064579256360078, "grad_norm": 1.1233887672424316, "learning_rate": 1.049857726072005e-06, "loss": 0.3487, "step": 361 }, { "epoch": 0.7084148727984344, "grad_norm": 0.8092291355133057, "learning_rate": 1.036962443715872e-06, "loss": 0.5009, "step": 362 }, { "epoch": 0.7103718199608611, "grad_norm": 1.730331301689148, "learning_rate": 1.0241260922991761e-06, "loss": 0.386, "step": 363 }, { "epoch": 0.7123287671232876, "grad_norm": 0.9802207946777344, "learning_rate": 1.0113491888682802e-06, "loss": 0.4209, "step": 364 }, { "epoch": 0.7142857142857143, "grad_norm": 1.0146572589874268, "learning_rate": 9.986322480749926e-07, "loss": 0.6119, "step": 365 }, { "epoch": 0.7162426614481409, "grad_norm": 0.930644154548645, "learning_rate": 9.85975782155834e-07, "loss": 0.4453, "step": 366 }, { "epoch": 0.7181996086105675, "grad_norm": 1.2394402027130127, "learning_rate": 9.733803009114045e-07, "loss": 0.4364, "step": 367 }, { "epoch": 0.7201565557729941, "grad_norm": 0.8096799850463867, "learning_rate": 9.608463116858544e-07, "loss": 0.3672, "step": 368 }, { "epoch": 0.7221135029354208, "grad_norm": 0.9330917596817017, "learning_rate": 9.483743193464409e-07, "loss": 0.4665, "step": 369 }, { "epoch": 0.7240704500978473, "grad_norm": 1.0829280614852905, "learning_rate": 9.359648262631962e-07, "loss": 0.4924, "step": 370 }, { "epoch": 0.726027397260274, "grad_norm": 1.0950247049331665, "learning_rate": 9.236183322886946e-07, "loss": 0.4907, "step": 371 }, { "epoch": 0.7279843444227005, "grad_norm": 0.8494971394538879, "learning_rate": 9.113353347379097e-07, "loss": 0.4286, "step": 372 }, { "epoch": 0.7299412915851272, "grad_norm": 0.9138147830963135, "learning_rate": 8.991163283681945e-07, "loss": 0.4396, "step": 373 }, { "epoch": 0.7318982387475538, "grad_norm": 1.6995892524719238, "learning_rate": 8.869618053593429e-07, "loss": 0.3989, "step": 374 }, { "epoch": 0.7338551859099804, "grad_norm": 0.9424477815628052, "learning_rate": 8.748722552937688e-07, "loss": 0.4371, "step": 375 }, { "epoch": 0.735812133072407, "grad_norm": 1.2042125463485718, "learning_rate": 8.628481651367876e-07, "loss": 0.4337, "step": 376 }, { "epoch": 0.7377690802348337, "grad_norm": 0.9822342395782471, "learning_rate": 8.508900192169964e-07, "loss": 0.4329, "step": 377 }, { "epoch": 0.7397260273972602, "grad_norm": 1.0332896709442139, "learning_rate": 8.389982992067688e-07, "loss": 0.4286, "step": 378 }, { "epoch": 0.7416829745596869, "grad_norm": 0.8743665218353271, "learning_rate": 8.271734841028553e-07, "loss": 0.487, "step": 379 }, { "epoch": 0.7436399217221135, "grad_norm": 0.9147298336029053, "learning_rate": 8.154160502070804e-07, "loss": 0.453, "step": 380 }, { "epoch": 0.7455968688845401, "grad_norm": 1.113299012184143, "learning_rate": 8.037264711071699e-07, "loss": 0.4432, "step": 381 }, { "epoch": 0.7475538160469667, "grad_norm": 0.934984564781189, "learning_rate": 7.921052176576643e-07, "loss": 0.5102, "step": 382 }, { "epoch": 0.7495107632093934, "grad_norm": 0.8149503469467163, "learning_rate": 7.805527579609575e-07, "loss": 0.4834, "step": 383 }, { "epoch": 0.7514677103718199, "grad_norm": 1.2893983125686646, "learning_rate": 7.690695573484433e-07, "loss": 0.3211, "step": 384 }, { "epoch": 0.7534246575342466, "grad_norm": 1.0519015789031982, "learning_rate": 7.576560783617667e-07, "loss": 0.4613, "step": 385 }, { "epoch": 0.7553816046966731, "grad_norm": 0.8619464039802551, "learning_rate": 7.463127807341966e-07, "loss": 0.4728, "step": 386 }, { "epoch": 0.7573385518590998, "grad_norm": 0.890130341053009, "learning_rate": 7.35040121372109e-07, "loss": 0.4721, "step": 387 }, { "epoch": 0.7592954990215264, "grad_norm": 1.1289362907409668, "learning_rate": 7.238385543365783e-07, "loss": 0.4206, "step": 388 }, { "epoch": 0.761252446183953, "grad_norm": 0.8591368198394775, "learning_rate": 7.127085308250914e-07, "loss": 0.415, "step": 389 }, { "epoch": 0.7632093933463796, "grad_norm": 0.9674418568611145, "learning_rate": 7.016504991533727e-07, "loss": 0.5114, "step": 390 }, { "epoch": 0.7651663405088063, "grad_norm": 1.0890218019485474, "learning_rate": 6.906649047373246e-07, "loss": 0.3641, "step": 391 }, { "epoch": 0.7671232876712328, "grad_norm": 0.9494483470916748, "learning_rate": 6.797521900750897e-07, "loss": 0.4682, "step": 392 }, { "epoch": 0.7690802348336595, "grad_norm": 0.9544976949691772, "learning_rate": 6.689127947292232e-07, "loss": 0.4227, "step": 393 }, { "epoch": 0.7710371819960861, "grad_norm": 2.679705858230591, "learning_rate": 6.581471553089874e-07, "loss": 0.4482, "step": 394 }, { "epoch": 0.7729941291585127, "grad_norm": 0.8427915573120117, "learning_rate": 6.474557054527709e-07, "loss": 0.4048, "step": 395 }, { "epoch": 0.7749510763209393, "grad_norm": 0.8168734312057495, "learning_rate": 6.368388758106134e-07, "loss": 0.377, "step": 396 }, { "epoch": 0.776908023483366, "grad_norm": 1.0561057329177856, "learning_rate": 6.262970940268653e-07, "loss": 0.4315, "step": 397 }, { "epoch": 0.7788649706457925, "grad_norm": 0.8930473923683167, "learning_rate": 6.158307847229594e-07, "loss": 0.5171, "step": 398 }, { "epoch": 0.7808219178082192, "grad_norm": 1.0137521028518677, "learning_rate": 6.05440369480308e-07, "loss": 0.4549, "step": 399 }, { "epoch": 0.7827788649706457, "grad_norm": 0.9667198061943054, "learning_rate": 5.951262668233232e-07, "loss": 0.4213, "step": 400 }, { "epoch": 0.7847358121330724, "grad_norm": 0.7895818948745728, "learning_rate": 5.848888922025553e-07, "loss": 0.427, "step": 401 }, { "epoch": 0.786692759295499, "grad_norm": 1.007455825805664, "learning_rate": 5.747286579779607e-07, "loss": 0.4125, "step": 402 }, { "epoch": 0.7886497064579256, "grad_norm": 1.8778549432754517, "learning_rate": 5.646459734022938e-07, "loss": 0.4568, "step": 403 }, { "epoch": 0.7906066536203522, "grad_norm": 0.976000964641571, "learning_rate": 5.546412446046187e-07, "loss": 0.5, "step": 404 }, { "epoch": 0.7925636007827789, "grad_norm": 0.9260036945343018, "learning_rate": 5.447148745739522e-07, "loss": 0.4729, "step": 405 }, { "epoch": 0.7945205479452054, "grad_norm": 0.851002037525177, "learning_rate": 5.348672631430319e-07, "loss": 0.4294, "step": 406 }, { "epoch": 0.7964774951076321, "grad_norm": 0.976465106010437, "learning_rate": 5.250988069722096e-07, "loss": 0.4655, "step": 407 }, { "epoch": 0.7984344422700587, "grad_norm": 0.9321781396865845, "learning_rate": 5.154098995334769e-07, "loss": 0.3931, "step": 408 }, { "epoch": 0.8003913894324853, "grad_norm": 0.8924025297164917, "learning_rate": 5.058009310946119e-07, "loss": 0.4222, "step": 409 }, { "epoch": 0.8023483365949119, "grad_norm": 0.8116724491119385, "learning_rate": 4.962722887034616e-07, "loss": 0.325, "step": 410 }, { "epoch": 0.8043052837573386, "grad_norm": 0.9633209705352783, "learning_rate": 4.868243561723535e-07, "loss": 0.3769, "step": 411 }, { "epoch": 0.8062622309197651, "grad_norm": 0.902252733707428, "learning_rate": 4.774575140626317e-07, "loss": 0.3959, "step": 412 }, { "epoch": 0.8082191780821918, "grad_norm": 0.8941038250923157, "learning_rate": 4.681721396693303e-07, "loss": 0.4998, "step": 413 }, { "epoch": 0.8101761252446184, "grad_norm": 1.213836669921875, "learning_rate": 4.589686070059762e-07, "loss": 0.5012, "step": 414 }, { "epoch": 0.812133072407045, "grad_norm": 1.0174344778060913, "learning_rate": 4.4984728678952234e-07, "loss": 0.468, "step": 415 }, { "epoch": 0.8140900195694716, "grad_norm": 1.8333814144134521, "learning_rate": 4.4080854642541833e-07, "loss": 0.4941, "step": 416 }, { "epoch": 0.8160469667318982, "grad_norm": 1.6971678733825684, "learning_rate": 4.318527499928074e-07, "loss": 0.3649, "step": 417 }, { "epoch": 0.8180039138943248, "grad_norm": 0.8866695165634155, "learning_rate": 4.229802582298634e-07, "loss": 0.4657, "step": 418 }, { "epoch": 0.8199608610567515, "grad_norm": 1.3764787912368774, "learning_rate": 4.141914285192619e-07, "loss": 0.3836, "step": 419 }, { "epoch": 0.821917808219178, "grad_norm": 0.9406548142433167, "learning_rate": 4.0548661487378184e-07, "loss": 0.497, "step": 420 }, { "epoch": 0.8238747553816047, "grad_norm": 0.8251882195472717, "learning_rate": 3.9686616792204677e-07, "loss": 0.4032, "step": 421 }, { "epoch": 0.8258317025440313, "grad_norm": 0.8226965069770813, "learning_rate": 3.8833043489440477e-07, "loss": 0.4526, "step": 422 }, { "epoch": 0.8277886497064579, "grad_norm": 0.9033458232879639, "learning_rate": 3.798797596089351e-07, "loss": 0.4149, "step": 423 }, { "epoch": 0.8297455968688845, "grad_norm": 0.9945986866950989, "learning_rate": 3.715144824576078e-07, "loss": 0.5138, "step": 424 }, { "epoch": 0.8317025440313112, "grad_norm": 1.1671781539916992, "learning_rate": 3.632349403925664e-07, "loss": 0.4718, "step": 425 }, { "epoch": 0.8336594911937377, "grad_norm": 1.2945449352264404, "learning_rate": 3.5504146691255736e-07, "loss": 0.4514, "step": 426 }, { "epoch": 0.8356164383561644, "grad_norm": 1.3590197563171387, "learning_rate": 3.469343920494986e-07, "loss": 0.4147, "step": 427 }, { "epoch": 0.837573385518591, "grad_norm": 0.8810437917709351, "learning_rate": 3.389140423551834e-07, "loss": 0.4462, "step": 428 }, { "epoch": 0.8395303326810176, "grad_norm": 0.9122494459152222, "learning_rate": 3.3098074088812686e-07, "loss": 0.4766, "step": 429 }, { "epoch": 0.8414872798434442, "grad_norm": 0.8525986075401306, "learning_rate": 3.2313480720055747e-07, "loss": 0.3684, "step": 430 }, { "epoch": 0.8434442270058709, "grad_norm": 1.0988531112670898, "learning_rate": 3.153765573255377e-07, "loss": 0.4956, "step": 431 }, { "epoch": 0.8454011741682974, "grad_norm": 0.7911211848258972, "learning_rate": 3.0770630376424276e-07, "loss": 0.4842, "step": 432 }, { "epoch": 0.8473581213307241, "grad_norm": 1.0055835247039795, "learning_rate": 3.0012435547336737e-07, "loss": 0.3518, "step": 433 }, { "epoch": 0.8493150684931506, "grad_norm": 1.304575800895691, "learning_rate": 2.9263101785268253e-07, "loss": 0.3509, "step": 434 }, { "epoch": 0.8512720156555773, "grad_norm": 0.9222425818443298, "learning_rate": 2.8522659273273606e-07, "loss": 0.3888, "step": 435 }, { "epoch": 0.8532289628180039, "grad_norm": 0.9765827059745789, "learning_rate": 2.779113783626916e-07, "loss": 0.4616, "step": 436 }, { "epoch": 0.8551859099804305, "grad_norm": 0.972284734249115, "learning_rate": 2.7068566939831646e-07, "loss": 0.3573, "step": 437 }, { "epoch": 0.8571428571428571, "grad_norm": 0.9025648832321167, "learning_rate": 2.6354975689011576e-07, "loss": 0.4246, "step": 438 }, { "epoch": 0.8590998043052838, "grad_norm": 0.8234553933143616, "learning_rate": 2.5650392827160446e-07, "loss": 0.3739, "step": 439 }, { "epoch": 0.8610567514677103, "grad_norm": 1.1872916221618652, "learning_rate": 2.4954846734773054e-07, "loss": 0.377, "step": 440 }, { "epoch": 0.863013698630137, "grad_norm": 0.9565138816833496, "learning_rate": 2.4268365428344737e-07, "loss": 0.5044, "step": 441 }, { "epoch": 0.8649706457925636, "grad_norm": 1.1466796398162842, "learning_rate": 2.3590976559242278e-07, "loss": 0.3848, "step": 442 }, { "epoch": 0.8669275929549902, "grad_norm": 0.9302741289138794, "learning_rate": 2.29227074125907e-07, "loss": 0.5157, "step": 443 }, { "epoch": 0.8688845401174168, "grad_norm": 0.9383424520492554, "learning_rate": 2.2263584906173723e-07, "loss": 0.4421, "step": 444 }, { "epoch": 0.8708414872798435, "grad_norm": 1.1834505796432495, "learning_rate": 2.1613635589349756e-07, "loss": 0.4172, "step": 445 }, { "epoch": 0.87279843444227, "grad_norm": 0.9577175378799438, "learning_rate": 2.0972885641982605e-07, "loss": 0.4004, "step": 446 }, { "epoch": 0.8747553816046967, "grad_norm": 0.8691757321357727, "learning_rate": 2.0341360873386673e-07, "loss": 0.4321, "step": 447 }, { "epoch": 0.8767123287671232, "grad_norm": 1.0094484090805054, "learning_rate": 1.97190867212875e-07, "loss": 0.428, "step": 448 }, { "epoch": 0.8786692759295499, "grad_norm": 0.8963342308998108, "learning_rate": 1.9106088250797266e-07, "loss": 0.4358, "step": 449 }, { "epoch": 0.8806262230919765, "grad_norm": 1.7301355600357056, "learning_rate": 1.8502390153404936e-07, "loss": 0.4104, "step": 450 }, { "epoch": 0.8825831702544031, "grad_norm": 0.8558318614959717, "learning_rate": 1.790801674598186e-07, "loss": 0.4592, "step": 451 }, { "epoch": 0.8845401174168297, "grad_norm": 0.8883755207061768, "learning_rate": 1.732299196980225e-07, "loss": 0.416, "step": 452 }, { "epoch": 0.8864970645792564, "grad_norm": 1.679168701171875, "learning_rate": 1.6747339389578732e-07, "loss": 0.4899, "step": 453 }, { "epoch": 0.8884540117416829, "grad_norm": 0.8892528414726257, "learning_rate": 1.6181082192513352e-07, "loss": 0.4228, "step": 454 }, { "epoch": 0.8904109589041096, "grad_norm": 1.5113455057144165, "learning_rate": 1.5624243187363442e-07, "loss": 0.4832, "step": 455 }, { "epoch": 0.8923679060665362, "grad_norm": 1.2870134115219116, "learning_rate": 1.507684480352292e-07, "loss": 0.4141, "step": 456 }, { "epoch": 0.8943248532289628, "grad_norm": 1.6229395866394043, "learning_rate": 1.4538909090118846e-07, "loss": 0.4619, "step": 457 }, { "epoch": 0.8962818003913894, "grad_norm": 0.8794851899147034, "learning_rate": 1.4010457715123355e-07, "loss": 0.3665, "step": 458 }, { "epoch": 0.898238747553816, "grad_norm": 0.8392042517662048, "learning_rate": 1.3491511964480703e-07, "loss": 0.4389, "step": 459 }, { "epoch": 0.9001956947162426, "grad_norm": 1.3040436506271362, "learning_rate": 1.2982092741250145e-07, "loss": 0.3347, "step": 460 }, { "epoch": 0.9021526418786693, "grad_norm": 2.594942331314087, "learning_rate": 1.2482220564763669e-07, "loss": 0.3493, "step": 461 }, { "epoch": 0.9041095890410958, "grad_norm": 1.2146382331848145, "learning_rate": 1.1991915569799645e-07, "loss": 0.487, "step": 462 }, { "epoch": 0.9060665362035225, "grad_norm": 0.9857767224311829, "learning_rate": 1.1511197505771843e-07, "loss": 0.3678, "step": 463 }, { "epoch": 0.9080234833659491, "grad_norm": 0.9433605670928955, "learning_rate": 1.1040085735933681e-07, "loss": 0.4477, "step": 464 }, { "epoch": 0.9099804305283757, "grad_norm": 1.0072382688522339, "learning_rate": 1.0578599236598708e-07, "loss": 0.4258, "step": 465 }, { "epoch": 0.9119373776908023, "grad_norm": 0.977323591709137, "learning_rate": 1.0126756596375687e-07, "loss": 0.4071, "step": 466 }, { "epoch": 0.913894324853229, "grad_norm": 0.924149751663208, "learning_rate": 9.684576015420277e-08, "loss": 0.477, "step": 467 }, { "epoch": 0.9158512720156555, "grad_norm": 0.8529196381568909, "learning_rate": 9.252075304701929e-08, "loss": 0.4513, "step": 468 }, { "epoch": 0.9178082191780822, "grad_norm": 1.0572128295898438, "learning_rate": 8.829271885286095e-08, "loss": 0.4472, "step": 469 }, { "epoch": 0.9197651663405088, "grad_norm": 3.0187559127807617, "learning_rate": 8.416182787632871e-08, "loss": 0.3696, "step": 470 }, { "epoch": 0.9217221135029354, "grad_norm": 1.2419676780700684, "learning_rate": 8.012824650910938e-08, "loss": 0.3411, "step": 471 }, { "epoch": 0.923679060665362, "grad_norm": 0.8936371803283691, "learning_rate": 7.619213722327184e-08, "loss": 0.4494, "step": 472 }, { "epoch": 0.9256360078277887, "grad_norm": 1.0433343648910522, "learning_rate": 7.235365856472443e-08, "loss": 0.4545, "step": 473 }, { "epoch": 0.9275929549902152, "grad_norm": 0.9922037720680237, "learning_rate": 6.86129651468273e-08, "loss": 0.4118, "step": 474 }, { "epoch": 0.9295499021526419, "grad_norm": 0.8298634886741638, "learning_rate": 6.497020764416633e-08, "loss": 0.4768, "step": 475 }, { "epoch": 0.9315068493150684, "grad_norm": 0.8023221492767334, "learning_rate": 6.142553278648239e-08, "loss": 0.4451, "step": 476 }, { "epoch": 0.9334637964774951, "grad_norm": 0.828525960445404, "learning_rate": 5.7979083352762146e-08, "loss": 0.3043, "step": 477 }, { "epoch": 0.9354207436399217, "grad_norm": 1.59126615524292, "learning_rate": 5.463099816548578e-08, "loss": 0.3771, "step": 478 }, { "epoch": 0.9373776908023483, "grad_norm": 1.2710837125778198, "learning_rate": 5.1381412085036994e-08, "loss": 0.4743, "step": 479 }, { "epoch": 0.9393346379647749, "grad_norm": 0.953567624092102, "learning_rate": 4.823045600426901e-08, "loss": 0.4077, "step": 480 }, { "epoch": 0.9412915851272016, "grad_norm": 0.9778720736503601, "learning_rate": 4.5178256843233235e-08, "loss": 0.4112, "step": 481 }, { "epoch": 0.9432485322896281, "grad_norm": 0.8094834685325623, "learning_rate": 4.2224937544067254e-08, "loss": 0.4878, "step": 482 }, { "epoch": 0.9452054794520548, "grad_norm": 0.8327929377555847, "learning_rate": 3.9370617066040726e-08, "loss": 0.3676, "step": 483 }, { "epoch": 0.9471624266144814, "grad_norm": 0.8924036622047424, "learning_rate": 3.661541038076755e-08, "loss": 0.3628, "step": 484 }, { "epoch": 0.949119373776908, "grad_norm": 1.062476634979248, "learning_rate": 3.395942846757067e-08, "loss": 0.3709, "step": 485 }, { "epoch": 0.9510763209393346, "grad_norm": 0.9672690033912659, "learning_rate": 3.1402778309014284e-08, "loss": 0.4846, "step": 486 }, { "epoch": 0.9530332681017613, "grad_norm": 0.9301928281784058, "learning_rate": 2.8945562886593948e-08, "loss": 0.4465, "step": 487 }, { "epoch": 0.9549902152641878, "grad_norm": 1.6346007585525513, "learning_rate": 2.6587881176588782e-08, "loss": 0.3958, "step": 488 }, { "epoch": 0.9569471624266145, "grad_norm": 0.9479952454566956, "learning_rate": 2.4329828146074096e-08, "loss": 0.3922, "step": 489 }, { "epoch": 0.958904109589041, "grad_norm": 1.1471753120422363, "learning_rate": 2.2171494749097243e-08, "loss": 0.462, "step": 490 }, { "epoch": 0.9608610567514677, "grad_norm": 0.9728820323944092, "learning_rate": 2.011296792301165e-08, "loss": 0.4206, "step": 491 }, { "epoch": 0.9628180039138943, "grad_norm": 0.8930822014808655, "learning_rate": 1.8154330584978785e-08, "loss": 0.4664, "step": 492 }, { "epoch": 0.9647749510763209, "grad_norm": 1.0260281562805176, "learning_rate": 1.629566162862445e-08, "loss": 0.4395, "step": 493 }, { "epoch": 0.9667318982387475, "grad_norm": 1.2178572416305542, "learning_rate": 1.453703592086353e-08, "loss": 0.4311, "step": 494 }, { "epoch": 0.9686888454011742, "grad_norm": 0.8803574442863464, "learning_rate": 1.28785242988827e-08, "loss": 0.4175, "step": 495 }, { "epoch": 0.9706457925636007, "grad_norm": 0.9738378524780273, "learning_rate": 1.132019356728853e-08, "loss": 0.4419, "step": 496 }, { "epoch": 0.9726027397260274, "grad_norm": 0.8282538056373596, "learning_rate": 9.862106495415469e-09, "loss": 0.4128, "step": 497 }, { "epoch": 0.974559686888454, "grad_norm": 1.130934715270996, "learning_rate": 8.504321814798433e-09, "loss": 0.3772, "step": 498 }, { "epoch": 0.9765166340508806, "grad_norm": 2.3474204540252686, "learning_rate": 7.246894216806355e-09, "loss": 0.4271, "step": 499 }, { "epoch": 0.9784735812133072, "grad_norm": 0.9170702695846558, "learning_rate": 6.089874350439507e-09, "loss": 0.4163, "step": 500 }, { "epoch": 0.9804305283757339, "grad_norm": 1.3329914808273315, "learning_rate": 5.033308820289185e-09, "loss": 0.4318, "step": 501 }, { "epoch": 0.9823874755381604, "grad_norm": 0.9551968574523926, "learning_rate": 4.07724018466088e-09, "loss": 0.3538, "step": 502 }, { "epoch": 0.9843444227005871, "grad_norm": 0.9140384197235107, "learning_rate": 3.2217069538600932e-09, "loss": 0.4503, "step": 503 }, { "epoch": 0.9863013698630136, "grad_norm": 1.072695016860962, "learning_rate": 2.4667435886402414e-09, "loss": 0.4374, "step": 504 }, { "epoch": 0.9882583170254403, "grad_norm": 0.8060042262077332, "learning_rate": 1.8123804988159909e-09, "loss": 0.4142, "step": 505 }, { "epoch": 0.9902152641878669, "grad_norm": 1.2433676719665527, "learning_rate": 1.2586440420372936e-09, "loss": 0.4401, "step": 506 }, { "epoch": 0.9921722113502935, "grad_norm": 1.1050037145614624, "learning_rate": 8.0555652272718e-10, "loss": 0.4379, "step": 507 }, { "epoch": 0.9921722113502935, "eval_accuracy": 0.8291501000753674, "eval_accuracy_first_token": 0.9575854700854701, "eval_accuracy_first_token_all": 0.9726387385994744, "eval_accuracy_first_token_all_total": 6469, "eval_accuracy_first_token_calculate": 0.9090909090909091, "eval_accuracy_first_token_calculate_total": 44, "eval_accuracy_first_token_execute": 1.0, "eval_accuracy_first_token_execute_total": 202, "eval_accuracy_first_token_get": 0.956140350877193, "eval_accuracy_first_token_get_total": 456, "eval_accuracy_first_token_python": 0.8909090909090909, "eval_accuracy_first_token_python_total": 990, "eval_loss": 0.5047600269317627, "eval_perplexity": 1.201347285698878, "eval_runtime": 525.3078, "eval_samples_per_second": 1.266, "eval_steps_per_second": 0.16, "eval_total_number_first_token": 9360, "step": 507 }, { "epoch": 0.9941291585127201, "grad_norm": 0.9681710004806519, "learning_rate": 4.5313619118553256e-10, "loss": 0.4287, "step": 508 }, { "epoch": 0.9960861056751468, "grad_norm": 0.8318100571632385, "learning_rate": 2.0139724285161976e-10, "loss": 0.4405, "step": 509 }, { "epoch": 0.9980430528375733, "grad_norm": 0.8928787708282471, "learning_rate": 5.0349817733719165e-11, "loss": 0.3779, "step": 510 }, { "epoch": 1.0, "grad_norm": 2.4657742977142334, "learning_rate": 0.0, "loss": 0.4483, "step": 511 }, { "epoch": 1.0, "step": 511, "total_flos": 529508264312832.0, "train_loss": 0.47377988486140676, "train_runtime": 61575.0259, "train_samples_per_second": 0.133, "train_steps_per_second": 0.008 } ], "logging_steps": 1.0, "max_steps": 511, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 5.0, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": false, "should_training_stop": false }, "attributes": {} } }, "total_flos": 529508264312832.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }