|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 1.0, |
|
"eval_steps": 169, |
|
"global_step": 511, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.0019569471624266144, |
|
"grad_norm": 4.606130123138428, |
|
"learning_rate": 3.125e-07, |
|
"loss": 0.6812, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.003913894324853229, |
|
"grad_norm": 5.627719402313232, |
|
"learning_rate": 6.25e-07, |
|
"loss": 0.7188, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.005870841487279843, |
|
"grad_norm": 5.225893974304199, |
|
"learning_rate": 9.375000000000001e-07, |
|
"loss": 0.6832, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.007827788649706457, |
|
"grad_norm": 4.055615425109863, |
|
"learning_rate": 1.25e-06, |
|
"loss": 0.7478, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.009784735812133072, |
|
"grad_norm": 3.32236385345459, |
|
"learning_rate": 1.5625e-06, |
|
"loss": 0.6512, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.011741682974559686, |
|
"grad_norm": 2.5439915657043457, |
|
"learning_rate": 1.8750000000000003e-06, |
|
"loss": 0.72, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.0136986301369863, |
|
"grad_norm": 1.9466145038604736, |
|
"learning_rate": 2.1875000000000002e-06, |
|
"loss": 0.6302, |
|
"step": 7 |
|
}, |
|
{ |
|
"epoch": 0.015655577299412915, |
|
"grad_norm": 1.833212971687317, |
|
"learning_rate": 2.5e-06, |
|
"loss": 0.6859, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.01761252446183953, |
|
"grad_norm": 1.4203251600265503, |
|
"learning_rate": 2.8125e-06, |
|
"loss": 0.5943, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.019569471624266144, |
|
"grad_norm": 1.7164653539657593, |
|
"learning_rate": 3.125e-06, |
|
"loss": 0.5744, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.021526418786692758, |
|
"grad_norm": 1.4249149560928345, |
|
"learning_rate": 3.4375e-06, |
|
"loss": 0.5896, |
|
"step": 11 |
|
}, |
|
{ |
|
"epoch": 0.023483365949119372, |
|
"grad_norm": 1.2433736324310303, |
|
"learning_rate": 3.7500000000000005e-06, |
|
"loss": 0.5873, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.025440313111545987, |
|
"grad_norm": 1.0826597213745117, |
|
"learning_rate": 4.0625000000000005e-06, |
|
"loss": 0.5792, |
|
"step": 13 |
|
}, |
|
{ |
|
"epoch": 0.0273972602739726, |
|
"grad_norm": 1.0738195180892944, |
|
"learning_rate": 4.3750000000000005e-06, |
|
"loss": 0.6032, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.029354207436399216, |
|
"grad_norm": 1.1434872150421143, |
|
"learning_rate": 4.6875000000000004e-06, |
|
"loss": 0.5698, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.03131115459882583, |
|
"grad_norm": 1.4672112464904785, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5471, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.033268101761252444, |
|
"grad_norm": 0.9875673651695251, |
|
"learning_rate": 4.999949650182267e-06, |
|
"loss": 0.5393, |
|
"step": 17 |
|
}, |
|
{ |
|
"epoch": 0.03522504892367906, |
|
"grad_norm": 1.0974621772766113, |
|
"learning_rate": 4.999798602757149e-06, |
|
"loss": 0.5349, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.03718199608610567, |
|
"grad_norm": 1.2209999561309814, |
|
"learning_rate": 4.999546863808815e-06, |
|
"loss": 0.6743, |
|
"step": 19 |
|
}, |
|
{ |
|
"epoch": 0.03913894324853229, |
|
"grad_norm": 0.8842924237251282, |
|
"learning_rate": 4.999194443477273e-06, |
|
"loss": 0.5919, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.0410958904109589, |
|
"grad_norm": 1.0825450420379639, |
|
"learning_rate": 4.998741355957963e-06, |
|
"loss": 0.6438, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.043052837573385516, |
|
"grad_norm": 1.0688315629959106, |
|
"learning_rate": 4.998187619501185e-06, |
|
"loss": 0.5637, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 0.04500978473581213, |
|
"grad_norm": 0.8487011790275574, |
|
"learning_rate": 4.99753325641136e-06, |
|
"loss": 0.5082, |
|
"step": 23 |
|
}, |
|
{ |
|
"epoch": 0.046966731898238745, |
|
"grad_norm": 0.9255719780921936, |
|
"learning_rate": 4.9967782930461405e-06, |
|
"loss": 0.5081, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.04892367906066536, |
|
"grad_norm": 0.9492978453636169, |
|
"learning_rate": 4.9959227598153395e-06, |
|
"loss": 0.6473, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.050880626223091974, |
|
"grad_norm": 1.5718590021133423, |
|
"learning_rate": 4.994966691179712e-06, |
|
"loss": 0.5219, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 0.05283757338551859, |
|
"grad_norm": 0.9533342123031616, |
|
"learning_rate": 4.993910125649561e-06, |
|
"loss": 0.5279, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.0547945205479452, |
|
"grad_norm": 1.8043086528778076, |
|
"learning_rate": 4.992753105783194e-06, |
|
"loss": 0.5277, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.05675146771037182, |
|
"grad_norm": 1.317238450050354, |
|
"learning_rate": 4.991495678185202e-06, |
|
"loss": 0.4567, |
|
"step": 29 |
|
}, |
|
{ |
|
"epoch": 0.05870841487279843, |
|
"grad_norm": 1.1877973079681396, |
|
"learning_rate": 4.990137893504585e-06, |
|
"loss": 0.5536, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.060665362035225046, |
|
"grad_norm": 1.052051305770874, |
|
"learning_rate": 4.988679806432712e-06, |
|
"loss": 0.4946, |
|
"step": 31 |
|
}, |
|
{ |
|
"epoch": 0.06262230919765166, |
|
"grad_norm": 7.080264091491699, |
|
"learning_rate": 4.987121475701118e-06, |
|
"loss": 0.5056, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 0.06457925636007827, |
|
"grad_norm": 3.640033483505249, |
|
"learning_rate": 4.985462964079137e-06, |
|
"loss": 0.5162, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 0.06653620352250489, |
|
"grad_norm": 2.17399263381958, |
|
"learning_rate": 4.983704338371375e-06, |
|
"loss": 0.5314, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 0.0684931506849315, |
|
"grad_norm": 0.9113507270812988, |
|
"learning_rate": 4.981845669415022e-06, |
|
"loss": 0.5416, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.07045009784735812, |
|
"grad_norm": 0.865261971950531, |
|
"learning_rate": 4.9798870320769884e-06, |
|
"loss": 0.5266, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.07240704500978473, |
|
"grad_norm": 1.3988151550292969, |
|
"learning_rate": 4.977828505250903e-06, |
|
"loss": 0.4983, |
|
"step": 37 |
|
}, |
|
{ |
|
"epoch": 0.07436399217221135, |
|
"grad_norm": 1.0698161125183105, |
|
"learning_rate": 4.975670171853926e-06, |
|
"loss": 0.4723, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 0.07632093933463796, |
|
"grad_norm": 1.2741320133209229, |
|
"learning_rate": 4.9734121188234115e-06, |
|
"loss": 0.4996, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 0.07827788649706457, |
|
"grad_norm": 2.0048317909240723, |
|
"learning_rate": 4.971054437113406e-06, |
|
"loss": 0.6535, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.08023483365949119, |
|
"grad_norm": 1.2805678844451904, |
|
"learning_rate": 4.968597221690986e-06, |
|
"loss": 0.5198, |
|
"step": 41 |
|
}, |
|
{ |
|
"epoch": 0.0821917808219178, |
|
"grad_norm": 0.9233219027519226, |
|
"learning_rate": 4.96604057153243e-06, |
|
"loss": 0.5724, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 0.08414872798434442, |
|
"grad_norm": 0.9261006712913513, |
|
"learning_rate": 4.963384589619233e-06, |
|
"loss": 0.4601, |
|
"step": 43 |
|
}, |
|
{ |
|
"epoch": 0.08610567514677103, |
|
"grad_norm": 1.3594372272491455, |
|
"learning_rate": 4.960629382933959e-06, |
|
"loss": 0.5616, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 0.08806262230919765, |
|
"grad_norm": 2.4310686588287354, |
|
"learning_rate": 4.957775062455933e-06, |
|
"loss": 0.5442, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.09001956947162426, |
|
"grad_norm": 1.030832290649414, |
|
"learning_rate": 4.9548217431567665e-06, |
|
"loss": 0.5964, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 0.09197651663405088, |
|
"grad_norm": 0.831721305847168, |
|
"learning_rate": 4.951769543995731e-06, |
|
"loss": 0.44, |
|
"step": 47 |
|
}, |
|
{ |
|
"epoch": 0.09393346379647749, |
|
"grad_norm": 0.9876791834831238, |
|
"learning_rate": 4.948618587914963e-06, |
|
"loss": 0.5404, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 0.0958904109589041, |
|
"grad_norm": 0.9953415393829346, |
|
"learning_rate": 4.9453690018345144e-06, |
|
"loss": 0.5668, |
|
"step": 49 |
|
}, |
|
{ |
|
"epoch": 0.09784735812133072, |
|
"grad_norm": 0.8553183078765869, |
|
"learning_rate": 4.9420209166472386e-06, |
|
"loss": 0.5414, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.09980430528375733, |
|
"grad_norm": 0.7962396144866943, |
|
"learning_rate": 4.938574467213519e-06, |
|
"loss": 0.495, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 0.10176125244618395, |
|
"grad_norm": 0.7835857272148132, |
|
"learning_rate": 4.935029792355834e-06, |
|
"loss": 0.5037, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 0.10371819960861056, |
|
"grad_norm": 0.8453947901725769, |
|
"learning_rate": 4.931387034853173e-06, |
|
"loss": 0.5011, |
|
"step": 53 |
|
}, |
|
{ |
|
"epoch": 0.10567514677103718, |
|
"grad_norm": 1.8459208011627197, |
|
"learning_rate": 4.927646341435276e-06, |
|
"loss": 0.5554, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 0.10763209393346379, |
|
"grad_norm": 0.9212117195129395, |
|
"learning_rate": 4.9238078627767285e-06, |
|
"loss": 0.5886, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.1095890410958904, |
|
"grad_norm": 0.7834203243255615, |
|
"learning_rate": 4.919871753490892e-06, |
|
"loss": 0.4602, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 0.11154598825831702, |
|
"grad_norm": 0.9025184512138367, |
|
"learning_rate": 4.9158381721236715e-06, |
|
"loss": 0.4544, |
|
"step": 57 |
|
}, |
|
{ |
|
"epoch": 0.11350293542074363, |
|
"grad_norm": 1.1300384998321533, |
|
"learning_rate": 4.91170728114714e-06, |
|
"loss": 0.5704, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 0.11545988258317025, |
|
"grad_norm": 0.7926605343818665, |
|
"learning_rate": 4.907479246952981e-06, |
|
"loss": 0.5112, |
|
"step": 59 |
|
}, |
|
{ |
|
"epoch": 0.11741682974559686, |
|
"grad_norm": 0.7744232416152954, |
|
"learning_rate": 4.903154239845798e-06, |
|
"loss": 0.4894, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.11937377690802348, |
|
"grad_norm": 1.6636885404586792, |
|
"learning_rate": 4.8987324340362445e-06, |
|
"loss": 0.5311, |
|
"step": 61 |
|
}, |
|
{ |
|
"epoch": 0.12133072407045009, |
|
"grad_norm": 1.0098280906677246, |
|
"learning_rate": 4.894214007634014e-06, |
|
"loss": 0.4907, |
|
"step": 62 |
|
}, |
|
{ |
|
"epoch": 0.1232876712328767, |
|
"grad_norm": 1.0168606042861938, |
|
"learning_rate": 4.889599142640663e-06, |
|
"loss": 0.5128, |
|
"step": 63 |
|
}, |
|
{ |
|
"epoch": 0.12524461839530332, |
|
"grad_norm": 0.8393405079841614, |
|
"learning_rate": 4.884888024942282e-06, |
|
"loss": 0.4989, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 0.12720156555772993, |
|
"grad_norm": 1.2758891582489014, |
|
"learning_rate": 4.880080844302004e-06, |
|
"loss": 0.5329, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.12915851272015655, |
|
"grad_norm": 0.8657482862472534, |
|
"learning_rate": 4.875177794352364e-06, |
|
"loss": 0.5058, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 0.13111545988258316, |
|
"grad_norm": 0.9110330939292908, |
|
"learning_rate": 4.870179072587499e-06, |
|
"loss": 0.5137, |
|
"step": 67 |
|
}, |
|
{ |
|
"epoch": 0.13307240704500978, |
|
"grad_norm": 0.8738705515861511, |
|
"learning_rate": 4.865084880355193e-06, |
|
"loss": 0.5423, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 0.1350293542074364, |
|
"grad_norm": 0.8127829432487488, |
|
"learning_rate": 4.859895422848767e-06, |
|
"loss": 0.5402, |
|
"step": 69 |
|
}, |
|
{ |
|
"epoch": 0.136986301369863, |
|
"grad_norm": 0.768864631652832, |
|
"learning_rate": 4.854610909098813e-06, |
|
"loss": 0.5301, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.13894324853228962, |
|
"grad_norm": 1.2464350461959839, |
|
"learning_rate": 4.849231551964771e-06, |
|
"loss": 0.5124, |
|
"step": 71 |
|
}, |
|
{ |
|
"epoch": 0.14090019569471623, |
|
"grad_norm": 0.9351313710212708, |
|
"learning_rate": 4.843757568126366e-06, |
|
"loss": 0.5152, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 0.14285714285714285, |
|
"grad_norm": 0.842991054058075, |
|
"learning_rate": 4.838189178074867e-06, |
|
"loss": 0.5254, |
|
"step": 73 |
|
}, |
|
{ |
|
"epoch": 0.14481409001956946, |
|
"grad_norm": 0.7789003252983093, |
|
"learning_rate": 4.832526606104213e-06, |
|
"loss": 0.5528, |
|
"step": 74 |
|
}, |
|
{ |
|
"epoch": 0.14677103718199608, |
|
"grad_norm": 0.8701135516166687, |
|
"learning_rate": 4.826770080301978e-06, |
|
"loss": 0.5243, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.1487279843444227, |
|
"grad_norm": 0.8384250998497009, |
|
"learning_rate": 4.8209198325401815e-06, |
|
"loss": 0.4648, |
|
"step": 76 |
|
}, |
|
{ |
|
"epoch": 0.1506849315068493, |
|
"grad_norm": 1.0472533702850342, |
|
"learning_rate": 4.814976098465951e-06, |
|
"loss": 0.5342, |
|
"step": 77 |
|
}, |
|
{ |
|
"epoch": 0.15264187866927592, |
|
"grad_norm": 0.9264402389526367, |
|
"learning_rate": 4.808939117492028e-06, |
|
"loss": 0.5267, |
|
"step": 78 |
|
}, |
|
{ |
|
"epoch": 0.15459882583170254, |
|
"grad_norm": 0.8155198097229004, |
|
"learning_rate": 4.802809132787125e-06, |
|
"loss": 0.5363, |
|
"step": 79 |
|
}, |
|
{ |
|
"epoch": 0.15655577299412915, |
|
"grad_norm": 0.8857468366622925, |
|
"learning_rate": 4.796586391266135e-06, |
|
"loss": 0.5021, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.15851272015655576, |
|
"grad_norm": 1.0320619344711304, |
|
"learning_rate": 4.790271143580174e-06, |
|
"loss": 0.4892, |
|
"step": 81 |
|
}, |
|
{ |
|
"epoch": 0.16046966731898238, |
|
"grad_norm": 0.9655166268348694, |
|
"learning_rate": 4.783863644106502e-06, |
|
"loss": 0.5493, |
|
"step": 82 |
|
}, |
|
{ |
|
"epoch": 0.162426614481409, |
|
"grad_norm": 1.3644921779632568, |
|
"learning_rate": 4.777364150938263e-06, |
|
"loss": 0.4835, |
|
"step": 83 |
|
}, |
|
{ |
|
"epoch": 0.1643835616438356, |
|
"grad_norm": 1.291692852973938, |
|
"learning_rate": 4.770772925874093e-06, |
|
"loss": 0.5755, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 0.16634050880626222, |
|
"grad_norm": 1.0446902513504028, |
|
"learning_rate": 4.764090234407578e-06, |
|
"loss": 0.5659, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.16829745596868884, |
|
"grad_norm": 0.9225801825523376, |
|
"learning_rate": 4.757316345716554e-06, |
|
"loss": 0.4067, |
|
"step": 86 |
|
}, |
|
{ |
|
"epoch": 0.17025440313111545, |
|
"grad_norm": 0.8291013240814209, |
|
"learning_rate": 4.75045153265227e-06, |
|
"loss": 0.4946, |
|
"step": 87 |
|
}, |
|
{ |
|
"epoch": 0.17221135029354206, |
|
"grad_norm": 1.1656488180160522, |
|
"learning_rate": 4.743496071728396e-06, |
|
"loss": 0.4933, |
|
"step": 88 |
|
}, |
|
{ |
|
"epoch": 0.17416829745596868, |
|
"grad_norm": 0.9090279936790466, |
|
"learning_rate": 4.736450243109885e-06, |
|
"loss": 0.5085, |
|
"step": 89 |
|
}, |
|
{ |
|
"epoch": 0.1761252446183953, |
|
"grad_norm": 1.2236806154251099, |
|
"learning_rate": 4.729314330601684e-06, |
|
"loss": 0.5147, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.1780821917808219, |
|
"grad_norm": 0.9335976839065552, |
|
"learning_rate": 4.7220886216373095e-06, |
|
"loss": 0.4589, |
|
"step": 91 |
|
}, |
|
{ |
|
"epoch": 0.18003913894324852, |
|
"grad_norm": 0.759772002696991, |
|
"learning_rate": 4.714773407267264e-06, |
|
"loss": 0.5398, |
|
"step": 92 |
|
}, |
|
{ |
|
"epoch": 0.18199608610567514, |
|
"grad_norm": 0.9582347869873047, |
|
"learning_rate": 4.707368982147318e-06, |
|
"loss": 0.5696, |
|
"step": 93 |
|
}, |
|
{ |
|
"epoch": 0.18395303326810175, |
|
"grad_norm": 0.9130314588546753, |
|
"learning_rate": 4.699875644526633e-06, |
|
"loss": 0.4803, |
|
"step": 94 |
|
}, |
|
{ |
|
"epoch": 0.18590998043052837, |
|
"grad_norm": 0.9103049635887146, |
|
"learning_rate": 4.692293696235758e-06, |
|
"loss": 0.4833, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.18786692759295498, |
|
"grad_norm": 0.7975893616676331, |
|
"learning_rate": 4.684623442674463e-06, |
|
"loss": 0.5263, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 0.1898238747553816, |
|
"grad_norm": 0.761643648147583, |
|
"learning_rate": 4.676865192799443e-06, |
|
"loss": 0.4519, |
|
"step": 97 |
|
}, |
|
{ |
|
"epoch": 0.1917808219178082, |
|
"grad_norm": 0.7510681748390198, |
|
"learning_rate": 4.669019259111873e-06, |
|
"loss": 0.4871, |
|
"step": 98 |
|
}, |
|
{ |
|
"epoch": 0.19373776908023482, |
|
"grad_norm": 1.1785235404968262, |
|
"learning_rate": 4.661085957644817e-06, |
|
"loss": 0.4644, |
|
"step": 99 |
|
}, |
|
{ |
|
"epoch": 0.19569471624266144, |
|
"grad_norm": 1.2464004755020142, |
|
"learning_rate": 4.653065607950502e-06, |
|
"loss": 0.4791, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.19765166340508805, |
|
"grad_norm": 2.580218553543091, |
|
"learning_rate": 4.644958533087443e-06, |
|
"loss": 0.4146, |
|
"step": 101 |
|
}, |
|
{ |
|
"epoch": 0.19960861056751467, |
|
"grad_norm": 0.9442769289016724, |
|
"learning_rate": 4.636765059607434e-06, |
|
"loss": 0.494, |
|
"step": 102 |
|
}, |
|
{ |
|
"epoch": 0.20156555772994128, |
|
"grad_norm": 0.7965562343597412, |
|
"learning_rate": 4.628485517542393e-06, |
|
"loss": 0.4496, |
|
"step": 103 |
|
}, |
|
{ |
|
"epoch": 0.2035225048923679, |
|
"grad_norm": 1.2338522672653198, |
|
"learning_rate": 4.620120240391065e-06, |
|
"loss": 0.4592, |
|
"step": 104 |
|
}, |
|
{ |
|
"epoch": 0.2054794520547945, |
|
"grad_norm": 0.8661827445030212, |
|
"learning_rate": 4.611669565105597e-06, |
|
"loss": 0.4883, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.20743639921722112, |
|
"grad_norm": 1.014655351638794, |
|
"learning_rate": 4.603133832077953e-06, |
|
"loss": 0.5101, |
|
"step": 106 |
|
}, |
|
{ |
|
"epoch": 0.20939334637964774, |
|
"grad_norm": 0.9033066630363464, |
|
"learning_rate": 4.5945133851262185e-06, |
|
"loss": 0.4515, |
|
"step": 107 |
|
}, |
|
{ |
|
"epoch": 0.21135029354207435, |
|
"grad_norm": 0.91737961769104, |
|
"learning_rate": 4.585808571480739e-06, |
|
"loss": 0.4886, |
|
"step": 108 |
|
}, |
|
{ |
|
"epoch": 0.21330724070450097, |
|
"grad_norm": 0.9076818823814392, |
|
"learning_rate": 4.577019741770137e-06, |
|
"loss": 0.5572, |
|
"step": 109 |
|
}, |
|
{ |
|
"epoch": 0.21526418786692758, |
|
"grad_norm": 0.9256044626235962, |
|
"learning_rate": 4.5681472500071935e-06, |
|
"loss": 0.5089, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.2172211350293542, |
|
"grad_norm": 0.8705273270606995, |
|
"learning_rate": 4.559191453574582e-06, |
|
"loss": 0.5199, |
|
"step": 111 |
|
}, |
|
{ |
|
"epoch": 0.2191780821917808, |
|
"grad_norm": 0.8358094096183777, |
|
"learning_rate": 4.550152713210478e-06, |
|
"loss": 0.5091, |
|
"step": 112 |
|
}, |
|
{ |
|
"epoch": 0.22113502935420742, |
|
"grad_norm": 1.0409964323043823, |
|
"learning_rate": 4.541031392994025e-06, |
|
"loss": 0.4997, |
|
"step": 113 |
|
}, |
|
{ |
|
"epoch": 0.22309197651663404, |
|
"grad_norm": 0.8039932250976562, |
|
"learning_rate": 4.53182786033067e-06, |
|
"loss": 0.537, |
|
"step": 114 |
|
}, |
|
{ |
|
"epoch": 0.22504892367906065, |
|
"grad_norm": 0.9191640615463257, |
|
"learning_rate": 4.522542485937369e-06, |
|
"loss": 0.5799, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.22700587084148727, |
|
"grad_norm": 0.8132153153419495, |
|
"learning_rate": 4.513175643827647e-06, |
|
"loss": 0.5217, |
|
"step": 116 |
|
}, |
|
{ |
|
"epoch": 0.22896281800391388, |
|
"grad_norm": 0.7776696085929871, |
|
"learning_rate": 4.503727711296539e-06, |
|
"loss": 0.4729, |
|
"step": 117 |
|
}, |
|
{ |
|
"epoch": 0.2309197651663405, |
|
"grad_norm": 0.8824874758720398, |
|
"learning_rate": 4.494199068905389e-06, |
|
"loss": 0.4977, |
|
"step": 118 |
|
}, |
|
{ |
|
"epoch": 0.2328767123287671, |
|
"grad_norm": 0.9938674569129944, |
|
"learning_rate": 4.484590100466524e-06, |
|
"loss": 0.5067, |
|
"step": 119 |
|
}, |
|
{ |
|
"epoch": 0.23483365949119372, |
|
"grad_norm": 1.018510103225708, |
|
"learning_rate": 4.474901193027791e-06, |
|
"loss": 0.5855, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.23679060665362034, |
|
"grad_norm": 1.0530946254730225, |
|
"learning_rate": 4.4651327368569695e-06, |
|
"loss": 0.4835, |
|
"step": 121 |
|
}, |
|
{ |
|
"epoch": 0.23874755381604695, |
|
"grad_norm": 0.7325494289398193, |
|
"learning_rate": 4.455285125426049e-06, |
|
"loss": 0.5043, |
|
"step": 122 |
|
}, |
|
{ |
|
"epoch": 0.24070450097847357, |
|
"grad_norm": 1.2264351844787598, |
|
"learning_rate": 4.445358755395382e-06, |
|
"loss": 0.4991, |
|
"step": 123 |
|
}, |
|
{ |
|
"epoch": 0.24266144814090018, |
|
"grad_norm": 0.7878324389457703, |
|
"learning_rate": 4.435354026597707e-06, |
|
"loss": 0.4943, |
|
"step": 124 |
|
}, |
|
{ |
|
"epoch": 0.2446183953033268, |
|
"grad_norm": 1.0379810333251953, |
|
"learning_rate": 4.425271342022039e-06, |
|
"loss": 0.5664, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.2465753424657534, |
|
"grad_norm": 1.2007404565811157, |
|
"learning_rate": 4.415111107797445e-06, |
|
"loss": 0.4495, |
|
"step": 126 |
|
}, |
|
{ |
|
"epoch": 0.24853228962818003, |
|
"grad_norm": 1.4260215759277344, |
|
"learning_rate": 4.404873733176678e-06, |
|
"loss": 0.4848, |
|
"step": 127 |
|
}, |
|
{ |
|
"epoch": 0.25048923679060664, |
|
"grad_norm": 0.7717714309692383, |
|
"learning_rate": 4.3945596305196925e-06, |
|
"loss": 0.4975, |
|
"step": 128 |
|
}, |
|
{ |
|
"epoch": 0.25244618395303325, |
|
"grad_norm": 1.0631009340286255, |
|
"learning_rate": 4.384169215277042e-06, |
|
"loss": 0.538, |
|
"step": 129 |
|
}, |
|
{ |
|
"epoch": 0.25440313111545987, |
|
"grad_norm": 0.9604893326759338, |
|
"learning_rate": 4.373702905973136e-06, |
|
"loss": 0.554, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.2563600782778865, |
|
"grad_norm": 0.8638473749160767, |
|
"learning_rate": 4.363161124189387e-06, |
|
"loss": 0.4839, |
|
"step": 131 |
|
}, |
|
{ |
|
"epoch": 0.2583170254403131, |
|
"grad_norm": 0.8187501430511475, |
|
"learning_rate": 4.352544294547229e-06, |
|
"loss": 0.5105, |
|
"step": 132 |
|
}, |
|
{ |
|
"epoch": 0.2602739726027397, |
|
"grad_norm": 1.4357470273971558, |
|
"learning_rate": 4.341852844691012e-06, |
|
"loss": 0.4532, |
|
"step": 133 |
|
}, |
|
{ |
|
"epoch": 0.2622309197651663, |
|
"grad_norm": 0.8292232155799866, |
|
"learning_rate": 4.331087205270778e-06, |
|
"loss": 0.451, |
|
"step": 134 |
|
}, |
|
{ |
|
"epoch": 0.26418786692759294, |
|
"grad_norm": 0.8243665099143982, |
|
"learning_rate": 4.320247809924911e-06, |
|
"loss": 0.4857, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 0.26614481409001955, |
|
"grad_norm": 0.9147266745567322, |
|
"learning_rate": 4.309335095262675e-06, |
|
"loss": 0.4778, |
|
"step": 136 |
|
}, |
|
{ |
|
"epoch": 0.26810176125244617, |
|
"grad_norm": 0.8612287044525146, |
|
"learning_rate": 4.2983495008466285e-06, |
|
"loss": 0.4627, |
|
"step": 137 |
|
}, |
|
{ |
|
"epoch": 0.2700587084148728, |
|
"grad_norm": 0.8230846524238586, |
|
"learning_rate": 4.287291469174909e-06, |
|
"loss": 0.4627, |
|
"step": 138 |
|
}, |
|
{ |
|
"epoch": 0.2720156555772994, |
|
"grad_norm": 0.8767359852790833, |
|
"learning_rate": 4.276161445663423e-06, |
|
"loss": 0.5119, |
|
"step": 139 |
|
}, |
|
{ |
|
"epoch": 0.273972602739726, |
|
"grad_norm": 0.8119643926620483, |
|
"learning_rate": 4.264959878627891e-06, |
|
"loss": 0.4495, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.2759295499021526, |
|
"grad_norm": 0.7973845601081848, |
|
"learning_rate": 4.253687219265803e-06, |
|
"loss": 0.5228, |
|
"step": 141 |
|
}, |
|
{ |
|
"epoch": 0.27788649706457924, |
|
"grad_norm": 0.892238199710846, |
|
"learning_rate": 4.242343921638235e-06, |
|
"loss": 0.5154, |
|
"step": 142 |
|
}, |
|
{ |
|
"epoch": 0.27984344422700586, |
|
"grad_norm": 1.3092166185379028, |
|
"learning_rate": 4.230930442651558e-06, |
|
"loss": 0.5085, |
|
"step": 143 |
|
}, |
|
{ |
|
"epoch": 0.28180039138943247, |
|
"grad_norm": 1.2284399271011353, |
|
"learning_rate": 4.219447242039043e-06, |
|
"loss": 0.4366, |
|
"step": 144 |
|
}, |
|
{ |
|
"epoch": 0.2837573385518591, |
|
"grad_norm": 1.0883151292800903, |
|
"learning_rate": 4.207894782342337e-06, |
|
"loss": 0.5958, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 0.2857142857142857, |
|
"grad_norm": 1.1132919788360596, |
|
"learning_rate": 4.196273528892831e-06, |
|
"loss": 0.4348, |
|
"step": 146 |
|
}, |
|
{ |
|
"epoch": 0.2876712328767123, |
|
"grad_norm": 1.2576059103012085, |
|
"learning_rate": 4.18458394979292e-06, |
|
"loss": 0.5247, |
|
"step": 147 |
|
}, |
|
{ |
|
"epoch": 0.2896281800391389, |
|
"grad_norm": 0.8995031714439392, |
|
"learning_rate": 4.172826515897146e-06, |
|
"loss": 0.5082, |
|
"step": 148 |
|
}, |
|
{ |
|
"epoch": 0.29158512720156554, |
|
"grad_norm": 0.7533922791481018, |
|
"learning_rate": 4.161001700793231e-06, |
|
"loss": 0.4644, |
|
"step": 149 |
|
}, |
|
{ |
|
"epoch": 0.29354207436399216, |
|
"grad_norm": 0.9206835031509399, |
|
"learning_rate": 4.149109980783004e-06, |
|
"loss": 0.494, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.29549902152641877, |
|
"grad_norm": 1.208590030670166, |
|
"learning_rate": 4.137151834863213e-06, |
|
"loss": 0.5545, |
|
"step": 151 |
|
}, |
|
{ |
|
"epoch": 0.2974559686888454, |
|
"grad_norm": 0.7689659595489502, |
|
"learning_rate": 4.125127744706232e-06, |
|
"loss": 0.4845, |
|
"step": 152 |
|
}, |
|
{ |
|
"epoch": 0.299412915851272, |
|
"grad_norm": 1.0235570669174194, |
|
"learning_rate": 4.113038194640658e-06, |
|
"loss": 0.4778, |
|
"step": 153 |
|
}, |
|
{ |
|
"epoch": 0.3013698630136986, |
|
"grad_norm": 1.1112617254257202, |
|
"learning_rate": 4.100883671631806e-06, |
|
"loss": 0.5206, |
|
"step": 154 |
|
}, |
|
{ |
|
"epoch": 0.30332681017612523, |
|
"grad_norm": 1.073519229888916, |
|
"learning_rate": 4.088664665262091e-06, |
|
"loss": 0.4944, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 0.30528375733855184, |
|
"grad_norm": 0.8319236040115356, |
|
"learning_rate": 4.076381667711306e-06, |
|
"loss": 0.4741, |
|
"step": 156 |
|
}, |
|
{ |
|
"epoch": 0.30724070450097846, |
|
"grad_norm": 1.2600641250610352, |
|
"learning_rate": 4.064035173736804e-06, |
|
"loss": 0.5311, |
|
"step": 157 |
|
}, |
|
{ |
|
"epoch": 0.30919765166340507, |
|
"grad_norm": 0.8686632513999939, |
|
"learning_rate": 4.05162568065356e-06, |
|
"loss": 0.5436, |
|
"step": 158 |
|
}, |
|
{ |
|
"epoch": 0.3111545988258317, |
|
"grad_norm": 0.7053869366645813, |
|
"learning_rate": 4.039153688314146e-06, |
|
"loss": 0.4846, |
|
"step": 159 |
|
}, |
|
{ |
|
"epoch": 0.3131115459882583, |
|
"grad_norm": 0.8360055685043335, |
|
"learning_rate": 4.0266196990885955e-06, |
|
"loss": 0.5041, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.3150684931506849, |
|
"grad_norm": 0.8842881321907043, |
|
"learning_rate": 4.014024217844167e-06, |
|
"loss": 0.4708, |
|
"step": 161 |
|
}, |
|
{ |
|
"epoch": 0.31702544031311153, |
|
"grad_norm": 1.0392301082611084, |
|
"learning_rate": 4.001367751925008e-06, |
|
"loss": 0.5315, |
|
"step": 162 |
|
}, |
|
{ |
|
"epoch": 0.31898238747553814, |
|
"grad_norm": 0.8801809549331665, |
|
"learning_rate": 3.98865081113172e-06, |
|
"loss": 0.4438, |
|
"step": 163 |
|
}, |
|
{ |
|
"epoch": 0.32093933463796476, |
|
"grad_norm": 1.395719289779663, |
|
"learning_rate": 3.9758739077008256e-06, |
|
"loss": 0.4929, |
|
"step": 164 |
|
}, |
|
{ |
|
"epoch": 0.32289628180039137, |
|
"grad_norm": 0.8075605034828186, |
|
"learning_rate": 3.96303755628413e-06, |
|
"loss": 0.4364, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 0.324853228962818, |
|
"grad_norm": 0.9566773772239685, |
|
"learning_rate": 3.950142273927996e-06, |
|
"loss": 0.4001, |
|
"step": 166 |
|
}, |
|
{ |
|
"epoch": 0.3268101761252446, |
|
"grad_norm": 2.270550012588501, |
|
"learning_rate": 3.937188580052518e-06, |
|
"loss": 0.4683, |
|
"step": 167 |
|
}, |
|
{ |
|
"epoch": 0.3287671232876712, |
|
"grad_norm": 0.8937717080116272, |
|
"learning_rate": 3.924176996430597e-06, |
|
"loss": 0.479, |
|
"step": 168 |
|
}, |
|
{ |
|
"epoch": 0.33072407045009783, |
|
"grad_norm": 0.9620490074157715, |
|
"learning_rate": 3.911108047166924e-06, |
|
"loss": 0.4669, |
|
"step": 169 |
|
}, |
|
{ |
|
"epoch": 0.33072407045009783, |
|
"eval_accuracy": 0.825246566025413, |
|
"eval_accuracy_first_token": 0.9521367521367521, |
|
"eval_accuracy_first_token_all": 0.9670737362807235, |
|
"eval_accuracy_first_token_all_total": 6469, |
|
"eval_accuracy_first_token_calculate": 0.7954545454545454, |
|
"eval_accuracy_first_token_calculate_total": 44, |
|
"eval_accuracy_first_token_execute": 1.0, |
|
"eval_accuracy_first_token_execute_total": 202, |
|
"eval_accuracy_first_token_get": 0.9649122807017544, |
|
"eval_accuracy_first_token_get_total": 456, |
|
"eval_accuracy_first_token_python": 0.8777777777777778, |
|
"eval_accuracy_first_token_python_total": 990, |
|
"eval_loss": 0.5176534056663513, |
|
"eval_perplexity": 1.206566771452624, |
|
"eval_runtime": 524.2306, |
|
"eval_samples_per_second": 1.269, |
|
"eval_steps_per_second": 0.16, |
|
"eval_total_number_first_token": 9360, |
|
"step": 169 |
|
}, |
|
{ |
|
"epoch": 0.33268101761252444, |
|
"grad_norm": 0.8628460168838501, |
|
"learning_rate": 3.897982258676867e-06, |
|
"loss": 0.4727, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.33463796477495106, |
|
"grad_norm": 0.8537535071372986, |
|
"learning_rate": 3.8848001596652765e-06, |
|
"loss": 0.4746, |
|
"step": 171 |
|
}, |
|
{ |
|
"epoch": 0.33659491193737767, |
|
"grad_norm": 0.9613227248191833, |
|
"learning_rate": 3.8715622811051754e-06, |
|
"loss": 0.5148, |
|
"step": 172 |
|
}, |
|
{ |
|
"epoch": 0.3385518590998043, |
|
"grad_norm": 0.8833454251289368, |
|
"learning_rate": 3.858269156216383e-06, |
|
"loss": 0.5125, |
|
"step": 173 |
|
}, |
|
{ |
|
"epoch": 0.3405088062622309, |
|
"grad_norm": 0.9823891520500183, |
|
"learning_rate": 3.844921320444031e-06, |
|
"loss": 0.5127, |
|
"step": 174 |
|
}, |
|
{ |
|
"epoch": 0.3424657534246575, |
|
"grad_norm": 1.0789107084274292, |
|
"learning_rate": 3.8315193114369995e-06, |
|
"loss": 0.4935, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 0.34442270058708413, |
|
"grad_norm": 0.8753149509429932, |
|
"learning_rate": 3.8180636690262565e-06, |
|
"loss": 0.4543, |
|
"step": 176 |
|
}, |
|
{ |
|
"epoch": 0.34637964774951074, |
|
"grad_norm": 1.7468674182891846, |
|
"learning_rate": 3.804554935203115e-06, |
|
"loss": 0.4955, |
|
"step": 177 |
|
}, |
|
{ |
|
"epoch": 0.34833659491193736, |
|
"grad_norm": 0.9011304974555969, |
|
"learning_rate": 3.7909936540974052e-06, |
|
"loss": 0.5992, |
|
"step": 178 |
|
}, |
|
{ |
|
"epoch": 0.350293542074364, |
|
"grad_norm": 0.9541127681732178, |
|
"learning_rate": 3.777380371955552e-06, |
|
"loss": 0.5322, |
|
"step": 179 |
|
}, |
|
{ |
|
"epoch": 0.3522504892367906, |
|
"grad_norm": 1.3841750621795654, |
|
"learning_rate": 3.7637156371185744e-06, |
|
"loss": 0.4661, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.3542074363992172, |
|
"grad_norm": 1.0240124464035034, |
|
"learning_rate": 3.7500000000000005e-06, |
|
"loss": 0.5231, |
|
"step": 181 |
|
}, |
|
{ |
|
"epoch": 0.3561643835616438, |
|
"grad_norm": 1.444016933441162, |
|
"learning_rate": 3.7362340130636926e-06, |
|
"loss": 0.5203, |
|
"step": 182 |
|
}, |
|
{ |
|
"epoch": 0.35812133072407043, |
|
"grad_norm": 0.7845962047576904, |
|
"learning_rate": 3.7224182308015977e-06, |
|
"loss": 0.4929, |
|
"step": 183 |
|
}, |
|
{ |
|
"epoch": 0.36007827788649704, |
|
"grad_norm": 1.0257796049118042, |
|
"learning_rate": 3.7085532097114098e-06, |
|
"loss": 0.4597, |
|
"step": 184 |
|
}, |
|
{ |
|
"epoch": 0.36203522504892366, |
|
"grad_norm": 0.9083458185195923, |
|
"learning_rate": 3.6946395082741582e-06, |
|
"loss": 0.5254, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 0.3639921722113503, |
|
"grad_norm": 0.9128417372703552, |
|
"learning_rate": 3.6806776869317074e-06, |
|
"loss": 0.4428, |
|
"step": 186 |
|
}, |
|
{ |
|
"epoch": 0.3659491193737769, |
|
"grad_norm": 1.1980143785476685, |
|
"learning_rate": 3.6666683080641846e-06, |
|
"loss": 0.5374, |
|
"step": 187 |
|
}, |
|
{ |
|
"epoch": 0.3679060665362035, |
|
"grad_norm": 0.8467942476272583, |
|
"learning_rate": 3.6526119359673283e-06, |
|
"loss": 0.4963, |
|
"step": 188 |
|
}, |
|
{ |
|
"epoch": 0.3698630136986301, |
|
"grad_norm": 0.8798732757568359, |
|
"learning_rate": 3.6385091368297582e-06, |
|
"loss": 0.5208, |
|
"step": 189 |
|
}, |
|
{ |
|
"epoch": 0.37181996086105673, |
|
"grad_norm": 0.8612852692604065, |
|
"learning_rate": 3.624360478710165e-06, |
|
"loss": 0.3989, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.37377690802348335, |
|
"grad_norm": 0.7529587149620056, |
|
"learning_rate": 3.6101665315144357e-06, |
|
"loss": 0.5015, |
|
"step": 191 |
|
}, |
|
{ |
|
"epoch": 0.37573385518590996, |
|
"grad_norm": 0.8704853653907776, |
|
"learning_rate": 3.595927866972694e-06, |
|
"loss": 0.4318, |
|
"step": 192 |
|
}, |
|
{ |
|
"epoch": 0.3776908023483366, |
|
"grad_norm": 1.1298363208770752, |
|
"learning_rate": 3.581645058616271e-06, |
|
"loss": 0.5047, |
|
"step": 193 |
|
}, |
|
{ |
|
"epoch": 0.3796477495107632, |
|
"grad_norm": 1.2964321374893188, |
|
"learning_rate": 3.5673186817546047e-06, |
|
"loss": 0.4764, |
|
"step": 194 |
|
}, |
|
{ |
|
"epoch": 0.3816046966731898, |
|
"grad_norm": 2.080096960067749, |
|
"learning_rate": 3.552949313452067e-06, |
|
"loss": 0.4808, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 0.3835616438356164, |
|
"grad_norm": 0.8993785977363586, |
|
"learning_rate": 3.5385375325047167e-06, |
|
"loss": 0.5577, |
|
"step": 196 |
|
}, |
|
{ |
|
"epoch": 0.38551859099804303, |
|
"grad_norm": 0.8617794513702393, |
|
"learning_rate": 3.5240839194169885e-06, |
|
"loss": 0.5042, |
|
"step": 197 |
|
}, |
|
{ |
|
"epoch": 0.38747553816046965, |
|
"grad_norm": 0.9634183645248413, |
|
"learning_rate": 3.5095890563783124e-06, |
|
"loss": 0.466, |
|
"step": 198 |
|
}, |
|
{ |
|
"epoch": 0.38943248532289626, |
|
"grad_norm": 0.9015300273895264, |
|
"learning_rate": 3.4950535272396564e-06, |
|
"loss": 0.3887, |
|
"step": 199 |
|
}, |
|
{ |
|
"epoch": 0.3913894324853229, |
|
"grad_norm": 0.8658633828163147, |
|
"learning_rate": 3.480477917490014e-06, |
|
"loss": 0.4665, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.3933463796477495, |
|
"grad_norm": 0.7967968583106995, |
|
"learning_rate": 3.4658628142328215e-06, |
|
"loss": 0.515, |
|
"step": 201 |
|
}, |
|
{ |
|
"epoch": 0.3953033268101761, |
|
"grad_norm": 0.7495056986808777, |
|
"learning_rate": 3.4512088061623077e-06, |
|
"loss": 0.4345, |
|
"step": 202 |
|
}, |
|
{ |
|
"epoch": 0.3972602739726027, |
|
"grad_norm": 0.9585980772972107, |
|
"learning_rate": 3.436516483539781e-06, |
|
"loss": 0.4084, |
|
"step": 203 |
|
}, |
|
{ |
|
"epoch": 0.39921722113502933, |
|
"grad_norm": 0.9240750670433044, |
|
"learning_rate": 3.4217864381698523e-06, |
|
"loss": 0.4451, |
|
"step": 204 |
|
}, |
|
{ |
|
"epoch": 0.40117416829745595, |
|
"grad_norm": 1.2117798328399658, |
|
"learning_rate": 3.4070192633766025e-06, |
|
"loss": 0.5152, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 0.40313111545988256, |
|
"grad_norm": 0.868486225605011, |
|
"learning_rate": 3.39221555397968e-06, |
|
"loss": 0.5456, |
|
"step": 206 |
|
}, |
|
{ |
|
"epoch": 0.4050880626223092, |
|
"grad_norm": 0.7969531416893005, |
|
"learning_rate": 3.37737590627034e-06, |
|
"loss": 0.4295, |
|
"step": 207 |
|
}, |
|
{ |
|
"epoch": 0.4070450097847358, |
|
"grad_norm": 0.9103299975395203, |
|
"learning_rate": 3.362500917987427e-06, |
|
"loss": 0.4485, |
|
"step": 208 |
|
}, |
|
{ |
|
"epoch": 0.4090019569471624, |
|
"grad_norm": 1.0487585067749023, |
|
"learning_rate": 3.3475911882933014e-06, |
|
"loss": 0.4807, |
|
"step": 209 |
|
}, |
|
{ |
|
"epoch": 0.410958904109589, |
|
"grad_norm": 0.9155584573745728, |
|
"learning_rate": 3.332647317749702e-06, |
|
"loss": 0.4617, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.41291585127201563, |
|
"grad_norm": 0.9164103865623474, |
|
"learning_rate": 3.3176699082935546e-06, |
|
"loss": 0.5041, |
|
"step": 211 |
|
}, |
|
{ |
|
"epoch": 0.41487279843444225, |
|
"grad_norm": 0.7580545544624329, |
|
"learning_rate": 3.3026595632127274e-06, |
|
"loss": 0.465, |
|
"step": 212 |
|
}, |
|
{ |
|
"epoch": 0.41682974559686886, |
|
"grad_norm": 1.0577958822250366, |
|
"learning_rate": 3.2876168871217322e-06, |
|
"loss": 0.4055, |
|
"step": 213 |
|
}, |
|
{ |
|
"epoch": 0.4187866927592955, |
|
"grad_norm": 1.2304415702819824, |
|
"learning_rate": 3.272542485937369e-06, |
|
"loss": 0.3852, |
|
"step": 214 |
|
}, |
|
{ |
|
"epoch": 0.4207436399217221, |
|
"grad_norm": 0.905158281326294, |
|
"learning_rate": 3.2574369668543187e-06, |
|
"loss": 0.4861, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 0.4227005870841487, |
|
"grad_norm": 0.9109801054000854, |
|
"learning_rate": 3.2423009383206876e-06, |
|
"loss": 0.4247, |
|
"step": 216 |
|
}, |
|
{ |
|
"epoch": 0.4246575342465753, |
|
"grad_norm": 0.8025485277175903, |
|
"learning_rate": 3.227135010013498e-06, |
|
"loss": 0.5319, |
|
"step": 217 |
|
}, |
|
{ |
|
"epoch": 0.42661448140900193, |
|
"grad_norm": 0.883714497089386, |
|
"learning_rate": 3.211939792814131e-06, |
|
"loss": 0.5287, |
|
"step": 218 |
|
}, |
|
{ |
|
"epoch": 0.42857142857142855, |
|
"grad_norm": 0.9827890396118164, |
|
"learning_rate": 3.19671589878372e-06, |
|
"loss": 0.4799, |
|
"step": 219 |
|
}, |
|
{ |
|
"epoch": 0.43052837573385516, |
|
"grad_norm": 0.8296178579330444, |
|
"learning_rate": 3.1814639411384953e-06, |
|
"loss": 0.4725, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.4324853228962818, |
|
"grad_norm": 0.8092741370201111, |
|
"learning_rate": 3.1661845342250874e-06, |
|
"loss": 0.5054, |
|
"step": 221 |
|
}, |
|
{ |
|
"epoch": 0.4344422700587084, |
|
"grad_norm": 1.160125732421875, |
|
"learning_rate": 3.1508782934957804e-06, |
|
"loss": 0.6022, |
|
"step": 222 |
|
}, |
|
{ |
|
"epoch": 0.436399217221135, |
|
"grad_norm": 0.871837854385376, |
|
"learning_rate": 3.1355458354837183e-06, |
|
"loss": 0.4545, |
|
"step": 223 |
|
}, |
|
{ |
|
"epoch": 0.4383561643835616, |
|
"grad_norm": 0.8639246225357056, |
|
"learning_rate": 3.1201877777780724e-06, |
|
"loss": 0.449, |
|
"step": 224 |
|
}, |
|
{ |
|
"epoch": 0.44031311154598823, |
|
"grad_norm": 0.9144279956817627, |
|
"learning_rate": 3.1048047389991693e-06, |
|
"loss": 0.4308, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 0.44227005870841485, |
|
"grad_norm": 1.0165725946426392, |
|
"learning_rate": 3.089397338773569e-06, |
|
"loss": 0.4997, |
|
"step": 226 |
|
}, |
|
{ |
|
"epoch": 0.44422700587084146, |
|
"grad_norm": 0.7787861824035645, |
|
"learning_rate": 3.0739661977091027e-06, |
|
"loss": 0.4408, |
|
"step": 227 |
|
}, |
|
{ |
|
"epoch": 0.4461839530332681, |
|
"grad_norm": 0.8962077498435974, |
|
"learning_rate": 3.0585119373698858e-06, |
|
"loss": 0.4879, |
|
"step": 228 |
|
}, |
|
{ |
|
"epoch": 0.4481409001956947, |
|
"grad_norm": 0.8481760621070862, |
|
"learning_rate": 3.04303518025127e-06, |
|
"loss": 0.4525, |
|
"step": 229 |
|
}, |
|
{ |
|
"epoch": 0.4500978473581213, |
|
"grad_norm": 0.9689728021621704, |
|
"learning_rate": 3.0275365497547747e-06, |
|
"loss": 0.5199, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.4520547945205479, |
|
"grad_norm": 1.0657813549041748, |
|
"learning_rate": 3.012016670162977e-06, |
|
"loss": 0.4834, |
|
"step": 231 |
|
}, |
|
{ |
|
"epoch": 0.45401174168297453, |
|
"grad_norm": 1.0324097871780396, |
|
"learning_rate": 2.9964761666143638e-06, |
|
"loss": 0.5407, |
|
"step": 232 |
|
}, |
|
{ |
|
"epoch": 0.45596868884540115, |
|
"grad_norm": 0.8452147245407104, |
|
"learning_rate": 2.980915665078153e-06, |
|
"loss": 0.5108, |
|
"step": 233 |
|
}, |
|
{ |
|
"epoch": 0.45792563600782776, |
|
"grad_norm": 1.1484103202819824, |
|
"learning_rate": 2.9653357923290753e-06, |
|
"loss": 0.4082, |
|
"step": 234 |
|
}, |
|
{ |
|
"epoch": 0.4598825831702544, |
|
"grad_norm": 0.859313428401947, |
|
"learning_rate": 2.949737175922135e-06, |
|
"loss": 0.4752, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 0.461839530332681, |
|
"grad_norm": 0.87496417760849, |
|
"learning_rate": 2.9341204441673267e-06, |
|
"loss": 0.4624, |
|
"step": 236 |
|
}, |
|
{ |
|
"epoch": 0.4637964774951076, |
|
"grad_norm": 0.9420116543769836, |
|
"learning_rate": 2.9184862261043272e-06, |
|
"loss": 0.4557, |
|
"step": 237 |
|
}, |
|
{ |
|
"epoch": 0.4657534246575342, |
|
"grad_norm": 1.4860702753067017, |
|
"learning_rate": 2.902835151477161e-06, |
|
"loss": 0.4617, |
|
"step": 238 |
|
}, |
|
{ |
|
"epoch": 0.46771037181996084, |
|
"grad_norm": 0.8771023750305176, |
|
"learning_rate": 2.887167850708831e-06, |
|
"loss": 0.5299, |
|
"step": 239 |
|
}, |
|
{ |
|
"epoch": 0.46966731898238745, |
|
"grad_norm": 0.8673617839813232, |
|
"learning_rate": 2.8714849548759293e-06, |
|
"loss": 0.5504, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.47162426614481406, |
|
"grad_norm": 0.8307452201843262, |
|
"learning_rate": 2.8557870956832135e-06, |
|
"loss": 0.4735, |
|
"step": 241 |
|
}, |
|
{ |
|
"epoch": 0.4735812133072407, |
|
"grad_norm": 0.9233512282371521, |
|
"learning_rate": 2.840074905438161e-06, |
|
"loss": 0.3701, |
|
"step": 242 |
|
}, |
|
{ |
|
"epoch": 0.4755381604696673, |
|
"grad_norm": 1.0768812894821167, |
|
"learning_rate": 2.8243490170255046e-06, |
|
"loss": 0.4983, |
|
"step": 243 |
|
}, |
|
{ |
|
"epoch": 0.4774951076320939, |
|
"grad_norm": 0.9305315017700195, |
|
"learning_rate": 2.808610063881737e-06, |
|
"loss": 0.4137, |
|
"step": 244 |
|
}, |
|
{ |
|
"epoch": 0.4794520547945205, |
|
"grad_norm": 1.1971187591552734, |
|
"learning_rate": 2.792858679969596e-06, |
|
"loss": 0.452, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 0.48140900195694714, |
|
"grad_norm": 1.314292073249817, |
|
"learning_rate": 2.7770954997525277e-06, |
|
"loss": 0.526, |
|
"step": 246 |
|
}, |
|
{ |
|
"epoch": 0.48336594911937375, |
|
"grad_norm": 1.2386282682418823, |
|
"learning_rate": 2.761321158169134e-06, |
|
"loss": 0.5002, |
|
"step": 247 |
|
}, |
|
{ |
|
"epoch": 0.48532289628180036, |
|
"grad_norm": 0.9772767424583435, |
|
"learning_rate": 2.745536290607593e-06, |
|
"loss": 0.5091, |
|
"step": 248 |
|
}, |
|
{ |
|
"epoch": 0.487279843444227, |
|
"grad_norm": 1.0364662408828735, |
|
"learning_rate": 2.729741532880069e-06, |
|
"loss": 0.4752, |
|
"step": 249 |
|
}, |
|
{ |
|
"epoch": 0.4892367906066536, |
|
"grad_norm": 0.8030025362968445, |
|
"learning_rate": 2.7139375211971e-06, |
|
"loss": 0.462, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.4911937377690802, |
|
"grad_norm": 1.3889553546905518, |
|
"learning_rate": 2.6981248921419713e-06, |
|
"loss": 0.4102, |
|
"step": 251 |
|
}, |
|
{ |
|
"epoch": 0.4931506849315068, |
|
"grad_norm": 0.9577500224113464, |
|
"learning_rate": 2.682304282645077e-06, |
|
"loss": 0.5008, |
|
"step": 252 |
|
}, |
|
{ |
|
"epoch": 0.49510763209393344, |
|
"grad_norm": 1.3206193447113037, |
|
"learning_rate": 2.66647632995826e-06, |
|
"loss": 0.4624, |
|
"step": 253 |
|
}, |
|
{ |
|
"epoch": 0.49706457925636005, |
|
"grad_norm": 0.8159929513931274, |
|
"learning_rate": 2.6506416716291466e-06, |
|
"loss": 0.4561, |
|
"step": 254 |
|
}, |
|
{ |
|
"epoch": 0.49902152641878667, |
|
"grad_norm": 0.854573130607605, |
|
"learning_rate": 2.634800945475465e-06, |
|
"loss": 0.5503, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 0.5009784735812133, |
|
"grad_norm": 9.345633506774902, |
|
"learning_rate": 2.6189547895593565e-06, |
|
"loss": 0.5216, |
|
"step": 256 |
|
}, |
|
{ |
|
"epoch": 0.50293542074364, |
|
"grad_norm": 0.8881295323371887, |
|
"learning_rate": 2.6031038421616684e-06, |
|
"loss": 0.4713, |
|
"step": 257 |
|
}, |
|
{ |
|
"epoch": 0.5048923679060665, |
|
"grad_norm": 1.7568496465682983, |
|
"learning_rate": 2.587248741756253e-06, |
|
"loss": 0.5096, |
|
"step": 258 |
|
}, |
|
{ |
|
"epoch": 0.5068493150684932, |
|
"grad_norm": 0.8306764960289001, |
|
"learning_rate": 2.5713901269842405e-06, |
|
"loss": 0.4504, |
|
"step": 259 |
|
}, |
|
{ |
|
"epoch": 0.5088062622309197, |
|
"grad_norm": 0.9716941118240356, |
|
"learning_rate": 2.555528636628324e-06, |
|
"loss": 0.4668, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.5107632093933464, |
|
"grad_norm": 0.8290694355964661, |
|
"learning_rate": 2.53966490958702e-06, |
|
"loss": 0.4288, |
|
"step": 261 |
|
}, |
|
{ |
|
"epoch": 0.512720156555773, |
|
"grad_norm": 0.9514800310134888, |
|
"learning_rate": 2.5237995848489422e-06, |
|
"loss": 0.5157, |
|
"step": 262 |
|
}, |
|
{ |
|
"epoch": 0.5146771037181996, |
|
"grad_norm": 1.515278935432434, |
|
"learning_rate": 2.507933301467056e-06, |
|
"loss": 0.4863, |
|
"step": 263 |
|
}, |
|
{ |
|
"epoch": 0.5166340508806262, |
|
"grad_norm": 0.9582359790802002, |
|
"learning_rate": 2.4920666985329446e-06, |
|
"loss": 0.4694, |
|
"step": 264 |
|
}, |
|
{ |
|
"epoch": 0.5185909980430529, |
|
"grad_norm": 0.8128112554550171, |
|
"learning_rate": 2.4762004151510586e-06, |
|
"loss": 0.4244, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 0.5205479452054794, |
|
"grad_norm": 1.151044487953186, |
|
"learning_rate": 2.4603350904129802e-06, |
|
"loss": 0.4555, |
|
"step": 266 |
|
}, |
|
{ |
|
"epoch": 0.5225048923679061, |
|
"grad_norm": 0.8072860240936279, |
|
"learning_rate": 2.4444713633716764e-06, |
|
"loss": 0.4173, |
|
"step": 267 |
|
}, |
|
{ |
|
"epoch": 0.5244618395303327, |
|
"grad_norm": 1.8496747016906738, |
|
"learning_rate": 2.42860987301576e-06, |
|
"loss": 0.4206, |
|
"step": 268 |
|
}, |
|
{ |
|
"epoch": 0.5264187866927593, |
|
"grad_norm": 1.096216082572937, |
|
"learning_rate": 2.4127512582437486e-06, |
|
"loss": 0.4501, |
|
"step": 269 |
|
}, |
|
{ |
|
"epoch": 0.5283757338551859, |
|
"grad_norm": 0.9519087076187134, |
|
"learning_rate": 2.3968961578383324e-06, |
|
"loss": 0.4848, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.5303326810176126, |
|
"grad_norm": 0.9204405546188354, |
|
"learning_rate": 2.3810452104406444e-06, |
|
"loss": 0.4526, |
|
"step": 271 |
|
}, |
|
{ |
|
"epoch": 0.5322896281800391, |
|
"grad_norm": 0.8748743534088135, |
|
"learning_rate": 2.3651990545245357e-06, |
|
"loss": 0.4547, |
|
"step": 272 |
|
}, |
|
{ |
|
"epoch": 0.5342465753424658, |
|
"grad_norm": 1.6212592124938965, |
|
"learning_rate": 2.3493583283708542e-06, |
|
"loss": 0.4937, |
|
"step": 273 |
|
}, |
|
{ |
|
"epoch": 0.5362035225048923, |
|
"grad_norm": 0.9793727993965149, |
|
"learning_rate": 2.3335236700417404e-06, |
|
"loss": 0.4456, |
|
"step": 274 |
|
}, |
|
{ |
|
"epoch": 0.538160469667319, |
|
"grad_norm": 1.7149184942245483, |
|
"learning_rate": 2.3176957173549236e-06, |
|
"loss": 0.4737, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 0.5401174168297456, |
|
"grad_norm": 0.9447053074836731, |
|
"learning_rate": 2.3018751078580287e-06, |
|
"loss": 0.4496, |
|
"step": 276 |
|
}, |
|
{ |
|
"epoch": 0.5420743639921722, |
|
"grad_norm": 0.9250771999359131, |
|
"learning_rate": 2.2860624788029013e-06, |
|
"loss": 0.4674, |
|
"step": 277 |
|
}, |
|
{ |
|
"epoch": 0.5440313111545988, |
|
"grad_norm": 0.8631194233894348, |
|
"learning_rate": 2.2702584671199317e-06, |
|
"loss": 0.48, |
|
"step": 278 |
|
}, |
|
{ |
|
"epoch": 0.5459882583170255, |
|
"grad_norm": 0.8921899199485779, |
|
"learning_rate": 2.2544637093924072e-06, |
|
"loss": 0.4009, |
|
"step": 279 |
|
}, |
|
{ |
|
"epoch": 0.547945205479452, |
|
"grad_norm": 0.811696469783783, |
|
"learning_rate": 2.238678841830867e-06, |
|
"loss": 0.4714, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.5499021526418787, |
|
"grad_norm": 0.7900722026824951, |
|
"learning_rate": 2.2229045002474727e-06, |
|
"loss": 0.3956, |
|
"step": 281 |
|
}, |
|
{ |
|
"epoch": 0.5518590998043053, |
|
"grad_norm": 0.8538399934768677, |
|
"learning_rate": 2.2071413200304046e-06, |
|
"loss": 0.3488, |
|
"step": 282 |
|
}, |
|
{ |
|
"epoch": 0.5538160469667319, |
|
"grad_norm": 0.9310709238052368, |
|
"learning_rate": 2.1913899361182634e-06, |
|
"loss": 0.3915, |
|
"step": 283 |
|
}, |
|
{ |
|
"epoch": 0.5557729941291585, |
|
"grad_norm": 0.8419170379638672, |
|
"learning_rate": 2.1756509829744958e-06, |
|
"loss": 0.4716, |
|
"step": 284 |
|
}, |
|
{ |
|
"epoch": 0.5577299412915852, |
|
"grad_norm": 1.2008228302001953, |
|
"learning_rate": 2.1599250945618404e-06, |
|
"loss": 0.4493, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 0.5596868884540117, |
|
"grad_norm": 0.8127449750900269, |
|
"learning_rate": 2.1442129043167877e-06, |
|
"loss": 0.4223, |
|
"step": 286 |
|
}, |
|
{ |
|
"epoch": 0.5616438356164384, |
|
"grad_norm": 1.0872187614440918, |
|
"learning_rate": 2.128515045124071e-06, |
|
"loss": 0.4814, |
|
"step": 287 |
|
}, |
|
{ |
|
"epoch": 0.5636007827788649, |
|
"grad_norm": 0.9573928713798523, |
|
"learning_rate": 2.1128321492911697e-06, |
|
"loss": 0.4606, |
|
"step": 288 |
|
}, |
|
{ |
|
"epoch": 0.5655577299412916, |
|
"grad_norm": 0.7555910348892212, |
|
"learning_rate": 2.0971648485228404e-06, |
|
"loss": 0.446, |
|
"step": 289 |
|
}, |
|
{ |
|
"epoch": 0.5675146771037182, |
|
"grad_norm": 0.9281080961227417, |
|
"learning_rate": 2.0815137738956736e-06, |
|
"loss": 0.4224, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.5694716242661448, |
|
"grad_norm": 1.0036050081253052, |
|
"learning_rate": 2.0658795558326745e-06, |
|
"loss": 0.4476, |
|
"step": 291 |
|
}, |
|
{ |
|
"epoch": 0.5714285714285714, |
|
"grad_norm": 0.8064005970954895, |
|
"learning_rate": 2.0502628240778655e-06, |
|
"loss": 0.4518, |
|
"step": 292 |
|
}, |
|
{ |
|
"epoch": 0.5733855185909981, |
|
"grad_norm": 0.8491390347480774, |
|
"learning_rate": 2.034664207670925e-06, |
|
"loss": 0.4947, |
|
"step": 293 |
|
}, |
|
{ |
|
"epoch": 0.5753424657534246, |
|
"grad_norm": 1.433266043663025, |
|
"learning_rate": 2.019084334921849e-06, |
|
"loss": 0.4929, |
|
"step": 294 |
|
}, |
|
{ |
|
"epoch": 0.5772994129158513, |
|
"grad_norm": 0.8420299291610718, |
|
"learning_rate": 2.003523833385637e-06, |
|
"loss": 0.4533, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 0.5792563600782779, |
|
"grad_norm": 2.3586318492889404, |
|
"learning_rate": 1.987983329837024e-06, |
|
"loss": 0.4257, |
|
"step": 296 |
|
}, |
|
{ |
|
"epoch": 0.5812133072407045, |
|
"grad_norm": 0.85833340883255, |
|
"learning_rate": 1.972463450245226e-06, |
|
"loss": 0.4875, |
|
"step": 297 |
|
}, |
|
{ |
|
"epoch": 0.5831702544031311, |
|
"grad_norm": 0.7927659749984741, |
|
"learning_rate": 1.956964819748731e-06, |
|
"loss": 0.415, |
|
"step": 298 |
|
}, |
|
{ |
|
"epoch": 0.5851272015655578, |
|
"grad_norm": 0.8850895762443542, |
|
"learning_rate": 1.9414880626301147e-06, |
|
"loss": 0.409, |
|
"step": 299 |
|
}, |
|
{ |
|
"epoch": 0.5870841487279843, |
|
"grad_norm": 1.509407877922058, |
|
"learning_rate": 1.9260338022908972e-06, |
|
"loss": 0.5041, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.589041095890411, |
|
"grad_norm": 1.5269814729690552, |
|
"learning_rate": 1.9106026612264316e-06, |
|
"loss": 0.4222, |
|
"step": 301 |
|
}, |
|
{ |
|
"epoch": 0.5909980430528375, |
|
"grad_norm": 1.386004090309143, |
|
"learning_rate": 1.895195261000831e-06, |
|
"loss": 0.5278, |
|
"step": 302 |
|
}, |
|
{ |
|
"epoch": 0.5929549902152642, |
|
"grad_norm": 1.278283953666687, |
|
"learning_rate": 1.8798122222219288e-06, |
|
"loss": 0.4823, |
|
"step": 303 |
|
}, |
|
{ |
|
"epoch": 0.5949119373776908, |
|
"grad_norm": 0.8502036333084106, |
|
"learning_rate": 1.8644541645162834e-06, |
|
"loss": 0.4682, |
|
"step": 304 |
|
}, |
|
{ |
|
"epoch": 0.5968688845401174, |
|
"grad_norm": 0.8835340142250061, |
|
"learning_rate": 1.84912170650422e-06, |
|
"loss": 0.3474, |
|
"step": 305 |
|
}, |
|
{ |
|
"epoch": 0.598825831702544, |
|
"grad_norm": 0.8175051212310791, |
|
"learning_rate": 1.833815465774913e-06, |
|
"loss": 0.4262, |
|
"step": 306 |
|
}, |
|
{ |
|
"epoch": 0.6007827788649707, |
|
"grad_norm": 1.031742811203003, |
|
"learning_rate": 1.818536058861506e-06, |
|
"loss": 0.4432, |
|
"step": 307 |
|
}, |
|
{ |
|
"epoch": 0.6027397260273972, |
|
"grad_norm": 0.9526416659355164, |
|
"learning_rate": 1.803284101216281e-06, |
|
"loss": 0.3981, |
|
"step": 308 |
|
}, |
|
{ |
|
"epoch": 0.6046966731898239, |
|
"grad_norm": 0.9259310364723206, |
|
"learning_rate": 1.7880602071858694e-06, |
|
"loss": 0.4249, |
|
"step": 309 |
|
}, |
|
{ |
|
"epoch": 0.6066536203522505, |
|
"grad_norm": 1.0978782176971436, |
|
"learning_rate": 1.7728649899865024e-06, |
|
"loss": 0.4955, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.6086105675146771, |
|
"grad_norm": 0.8304716944694519, |
|
"learning_rate": 1.7576990616793139e-06, |
|
"loss": 0.4666, |
|
"step": 311 |
|
}, |
|
{ |
|
"epoch": 0.6105675146771037, |
|
"grad_norm": 2.960554838180542, |
|
"learning_rate": 1.7425630331456821e-06, |
|
"loss": 0.412, |
|
"step": 312 |
|
}, |
|
{ |
|
"epoch": 0.6125244618395304, |
|
"grad_norm": 0.8440503478050232, |
|
"learning_rate": 1.7274575140626318e-06, |
|
"loss": 0.4814, |
|
"step": 313 |
|
}, |
|
{ |
|
"epoch": 0.6144814090019569, |
|
"grad_norm": 0.8478916883468628, |
|
"learning_rate": 1.7123831128782686e-06, |
|
"loss": 0.4708, |
|
"step": 314 |
|
}, |
|
{ |
|
"epoch": 0.6164383561643836, |
|
"grad_norm": 0.8599239587783813, |
|
"learning_rate": 1.697340436787273e-06, |
|
"loss": 0.428, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 0.6183953033268101, |
|
"grad_norm": 1.151842474937439, |
|
"learning_rate": 1.6823300917064462e-06, |
|
"loss": 0.3433, |
|
"step": 316 |
|
}, |
|
{ |
|
"epoch": 0.6203522504892368, |
|
"grad_norm": 0.8544068336486816, |
|
"learning_rate": 1.6673526822502982e-06, |
|
"loss": 0.4431, |
|
"step": 317 |
|
}, |
|
{ |
|
"epoch": 0.6223091976516634, |
|
"grad_norm": 1.1891425848007202, |
|
"learning_rate": 1.6524088117066984e-06, |
|
"loss": 0.4334, |
|
"step": 318 |
|
}, |
|
{ |
|
"epoch": 0.62426614481409, |
|
"grad_norm": 1.1379172801971436, |
|
"learning_rate": 1.637499082012574e-06, |
|
"loss": 0.5514, |
|
"step": 319 |
|
}, |
|
{ |
|
"epoch": 0.6262230919765166, |
|
"grad_norm": 1.0814030170440674, |
|
"learning_rate": 1.6226240937296617e-06, |
|
"loss": 0.4772, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.6281800391389433, |
|
"grad_norm": 0.9527184963226318, |
|
"learning_rate": 1.6077844460203207e-06, |
|
"loss": 0.4292, |
|
"step": 321 |
|
}, |
|
{ |
|
"epoch": 0.6301369863013698, |
|
"grad_norm": 0.9083294868469238, |
|
"learning_rate": 1.5929807366233979e-06, |
|
"loss": 0.501, |
|
"step": 322 |
|
}, |
|
{ |
|
"epoch": 0.6320939334637965, |
|
"grad_norm": 1.4445644617080688, |
|
"learning_rate": 1.5782135618301486e-06, |
|
"loss": 0.4924, |
|
"step": 323 |
|
}, |
|
{ |
|
"epoch": 0.6340508806262231, |
|
"grad_norm": 1.361970067024231, |
|
"learning_rate": 1.56348351646022e-06, |
|
"loss": 0.4487, |
|
"step": 324 |
|
}, |
|
{ |
|
"epoch": 0.6360078277886497, |
|
"grad_norm": 0.8321120142936707, |
|
"learning_rate": 1.5487911938376925e-06, |
|
"loss": 0.4566, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 0.6379647749510763, |
|
"grad_norm": 1.1182819604873657, |
|
"learning_rate": 1.5341371857671782e-06, |
|
"loss": 0.4253, |
|
"step": 326 |
|
}, |
|
{ |
|
"epoch": 0.639921722113503, |
|
"grad_norm": 1.133865475654602, |
|
"learning_rate": 1.5195220825099863e-06, |
|
"loss": 0.4212, |
|
"step": 327 |
|
}, |
|
{ |
|
"epoch": 0.6418786692759295, |
|
"grad_norm": 0.9962513446807861, |
|
"learning_rate": 1.5049464727603453e-06, |
|
"loss": 0.4702, |
|
"step": 328 |
|
}, |
|
{ |
|
"epoch": 0.6438356164383562, |
|
"grad_norm": 1.1037347316741943, |
|
"learning_rate": 1.4904109436216885e-06, |
|
"loss": 0.5035, |
|
"step": 329 |
|
}, |
|
{ |
|
"epoch": 0.6457925636007827, |
|
"grad_norm": 1.0168620347976685, |
|
"learning_rate": 1.475916080583012e-06, |
|
"loss": 0.4762, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.6477495107632094, |
|
"grad_norm": 1.1695796251296997, |
|
"learning_rate": 1.4614624674952843e-06, |
|
"loss": 0.4313, |
|
"step": 331 |
|
}, |
|
{ |
|
"epoch": 0.649706457925636, |
|
"grad_norm": 1.4158042669296265, |
|
"learning_rate": 1.4470506865479337e-06, |
|
"loss": 0.4798, |
|
"step": 332 |
|
}, |
|
{ |
|
"epoch": 0.6516634050880626, |
|
"grad_norm": 0.9545938968658447, |
|
"learning_rate": 1.4326813182453959e-06, |
|
"loss": 0.4126, |
|
"step": 333 |
|
}, |
|
{ |
|
"epoch": 0.6536203522504892, |
|
"grad_norm": 1.0253559350967407, |
|
"learning_rate": 1.4183549413837288e-06, |
|
"loss": 0.4633, |
|
"step": 334 |
|
}, |
|
{ |
|
"epoch": 0.6555772994129159, |
|
"grad_norm": 0.9522657990455627, |
|
"learning_rate": 1.4040721330273063e-06, |
|
"loss": 0.4716, |
|
"step": 335 |
|
}, |
|
{ |
|
"epoch": 0.6575342465753424, |
|
"grad_norm": 0.9612335562705994, |
|
"learning_rate": 1.3898334684855647e-06, |
|
"loss": 0.4699, |
|
"step": 336 |
|
}, |
|
{ |
|
"epoch": 0.6594911937377691, |
|
"grad_norm": 0.9365352392196655, |
|
"learning_rate": 1.375639521289836e-06, |
|
"loss": 0.4775, |
|
"step": 337 |
|
}, |
|
{ |
|
"epoch": 0.6614481409001957, |
|
"grad_norm": 0.9329326152801514, |
|
"learning_rate": 1.3614908631702435e-06, |
|
"loss": 0.4236, |
|
"step": 338 |
|
}, |
|
{ |
|
"epoch": 0.6614481409001957, |
|
"eval_accuracy": 0.8283926414598568, |
|
"eval_accuracy_first_token": 0.9538461538461539, |
|
"eval_accuracy_first_token_all": 0.9723295718039883, |
|
"eval_accuracy_first_token_all_total": 6469, |
|
"eval_accuracy_first_token_calculate": 0.9090909090909091, |
|
"eval_accuracy_first_token_calculate_total": 44, |
|
"eval_accuracy_first_token_execute": 1.0, |
|
"eval_accuracy_first_token_execute_total": 202, |
|
"eval_accuracy_first_token_get": 0.9517543859649122, |
|
"eval_accuracy_first_token_get_total": 456, |
|
"eval_accuracy_first_token_python": 0.8838383838383839, |
|
"eval_accuracy_first_token_python_total": 990, |
|
"eval_loss": 0.5066910982131958, |
|
"eval_perplexity": 1.2021342846813718, |
|
"eval_runtime": 525.6643, |
|
"eval_samples_per_second": 1.265, |
|
"eval_steps_per_second": 0.16, |
|
"eval_total_number_first_token": 9360, |
|
"step": 338 |
|
}, |
|
{ |
|
"epoch": 0.6634050880626223, |
|
"grad_norm": 0.9786916375160217, |
|
"learning_rate": 1.3473880640326725e-06, |
|
"loss": 0.4361, |
|
"step": 339 |
|
}, |
|
{ |
|
"epoch": 0.6653620352250489, |
|
"grad_norm": 0.9186453819274902, |
|
"learning_rate": 1.3333316919358159e-06, |
|
"loss": 0.4658, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.6673189823874756, |
|
"grad_norm": 1.02847421169281, |
|
"learning_rate": 1.3193223130682937e-06, |
|
"loss": 0.4517, |
|
"step": 341 |
|
}, |
|
{ |
|
"epoch": 0.6692759295499021, |
|
"grad_norm": 0.970281720161438, |
|
"learning_rate": 1.3053604917258428e-06, |
|
"loss": 0.4617, |
|
"step": 342 |
|
}, |
|
{ |
|
"epoch": 0.6712328767123288, |
|
"grad_norm": 0.7810537219047546, |
|
"learning_rate": 1.2914467902885902e-06, |
|
"loss": 0.4246, |
|
"step": 343 |
|
}, |
|
{ |
|
"epoch": 0.6731898238747553, |
|
"grad_norm": 0.9498399496078491, |
|
"learning_rate": 1.2775817691984032e-06, |
|
"loss": 0.4706, |
|
"step": 344 |
|
}, |
|
{ |
|
"epoch": 0.675146771037182, |
|
"grad_norm": 0.9036231637001038, |
|
"learning_rate": 1.2637659869363085e-06, |
|
"loss": 0.4826, |
|
"step": 345 |
|
}, |
|
{ |
|
"epoch": 0.6771037181996086, |
|
"grad_norm": 0.8129305243492126, |
|
"learning_rate": 1.2500000000000007e-06, |
|
"loss": 0.4163, |
|
"step": 346 |
|
}, |
|
{ |
|
"epoch": 0.6790606653620352, |
|
"grad_norm": 1.4746164083480835, |
|
"learning_rate": 1.2362843628814267e-06, |
|
"loss": 0.3961, |
|
"step": 347 |
|
}, |
|
{ |
|
"epoch": 0.6810176125244618, |
|
"grad_norm": 11.255887031555176, |
|
"learning_rate": 1.222619628044449e-06, |
|
"loss": 0.4761, |
|
"step": 348 |
|
}, |
|
{ |
|
"epoch": 0.6829745596868885, |
|
"grad_norm": 0.9121260643005371, |
|
"learning_rate": 1.2090063459025956e-06, |
|
"loss": 0.4277, |
|
"step": 349 |
|
}, |
|
{ |
|
"epoch": 0.684931506849315, |
|
"grad_norm": 0.9116764068603516, |
|
"learning_rate": 1.1954450647968856e-06, |
|
"loss": 0.4696, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.6868884540117417, |
|
"grad_norm": 1.206604242324829, |
|
"learning_rate": 1.181936330973744e-06, |
|
"loss": 0.4205, |
|
"step": 351 |
|
}, |
|
{ |
|
"epoch": 0.6888454011741683, |
|
"grad_norm": 0.8744117617607117, |
|
"learning_rate": 1.1684806885630003e-06, |
|
"loss": 0.5077, |
|
"step": 352 |
|
}, |
|
{ |
|
"epoch": 0.6908023483365949, |
|
"grad_norm": 2.155042886734009, |
|
"learning_rate": 1.155078679555969e-06, |
|
"loss": 0.4193, |
|
"step": 353 |
|
}, |
|
{ |
|
"epoch": 0.6927592954990215, |
|
"grad_norm": 0.9258475303649902, |
|
"learning_rate": 1.1417308437836181e-06, |
|
"loss": 0.3645, |
|
"step": 354 |
|
}, |
|
{ |
|
"epoch": 0.6947162426614482, |
|
"grad_norm": 0.7997338771820068, |
|
"learning_rate": 1.1284377188948258e-06, |
|
"loss": 0.4044, |
|
"step": 355 |
|
}, |
|
{ |
|
"epoch": 0.6966731898238747, |
|
"grad_norm": 0.8342923521995544, |
|
"learning_rate": 1.1151998403347245e-06, |
|
"loss": 0.4132, |
|
"step": 356 |
|
}, |
|
{ |
|
"epoch": 0.6986301369863014, |
|
"grad_norm": 1.0009496212005615, |
|
"learning_rate": 1.1020177413231334e-06, |
|
"loss": 0.4046, |
|
"step": 357 |
|
}, |
|
{ |
|
"epoch": 0.700587084148728, |
|
"grad_norm": 1.0892616510391235, |
|
"learning_rate": 1.0888919528330778e-06, |
|
"loss": 0.4878, |
|
"step": 358 |
|
}, |
|
{ |
|
"epoch": 0.7025440313111546, |
|
"grad_norm": 0.829866886138916, |
|
"learning_rate": 1.0758230035694031e-06, |
|
"loss": 0.4876, |
|
"step": 359 |
|
}, |
|
{ |
|
"epoch": 0.7045009784735812, |
|
"grad_norm": 0.9134871363639832, |
|
"learning_rate": 1.062811419947482e-06, |
|
"loss": 0.5027, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.7064579256360078, |
|
"grad_norm": 1.1233887672424316, |
|
"learning_rate": 1.049857726072005e-06, |
|
"loss": 0.3487, |
|
"step": 361 |
|
}, |
|
{ |
|
"epoch": 0.7084148727984344, |
|
"grad_norm": 0.8092291355133057, |
|
"learning_rate": 1.036962443715872e-06, |
|
"loss": 0.5009, |
|
"step": 362 |
|
}, |
|
{ |
|
"epoch": 0.7103718199608611, |
|
"grad_norm": 1.730331301689148, |
|
"learning_rate": 1.0241260922991761e-06, |
|
"loss": 0.386, |
|
"step": 363 |
|
}, |
|
{ |
|
"epoch": 0.7123287671232876, |
|
"grad_norm": 0.9802207946777344, |
|
"learning_rate": 1.0113491888682802e-06, |
|
"loss": 0.4209, |
|
"step": 364 |
|
}, |
|
{ |
|
"epoch": 0.7142857142857143, |
|
"grad_norm": 1.0146572589874268, |
|
"learning_rate": 9.986322480749926e-07, |
|
"loss": 0.6119, |
|
"step": 365 |
|
}, |
|
{ |
|
"epoch": 0.7162426614481409, |
|
"grad_norm": 0.930644154548645, |
|
"learning_rate": 9.85975782155834e-07, |
|
"loss": 0.4453, |
|
"step": 366 |
|
}, |
|
{ |
|
"epoch": 0.7181996086105675, |
|
"grad_norm": 1.2394402027130127, |
|
"learning_rate": 9.733803009114045e-07, |
|
"loss": 0.4364, |
|
"step": 367 |
|
}, |
|
{ |
|
"epoch": 0.7201565557729941, |
|
"grad_norm": 0.8096799850463867, |
|
"learning_rate": 9.608463116858544e-07, |
|
"loss": 0.3672, |
|
"step": 368 |
|
}, |
|
{ |
|
"epoch": 0.7221135029354208, |
|
"grad_norm": 0.9330917596817017, |
|
"learning_rate": 9.483743193464409e-07, |
|
"loss": 0.4665, |
|
"step": 369 |
|
}, |
|
{ |
|
"epoch": 0.7240704500978473, |
|
"grad_norm": 1.0829280614852905, |
|
"learning_rate": 9.359648262631962e-07, |
|
"loss": 0.4924, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.726027397260274, |
|
"grad_norm": 1.0950247049331665, |
|
"learning_rate": 9.236183322886946e-07, |
|
"loss": 0.4907, |
|
"step": 371 |
|
}, |
|
{ |
|
"epoch": 0.7279843444227005, |
|
"grad_norm": 0.8494971394538879, |
|
"learning_rate": 9.113353347379097e-07, |
|
"loss": 0.4286, |
|
"step": 372 |
|
}, |
|
{ |
|
"epoch": 0.7299412915851272, |
|
"grad_norm": 0.9138147830963135, |
|
"learning_rate": 8.991163283681945e-07, |
|
"loss": 0.4396, |
|
"step": 373 |
|
}, |
|
{ |
|
"epoch": 0.7318982387475538, |
|
"grad_norm": 1.6995892524719238, |
|
"learning_rate": 8.869618053593429e-07, |
|
"loss": 0.3989, |
|
"step": 374 |
|
}, |
|
{ |
|
"epoch": 0.7338551859099804, |
|
"grad_norm": 0.9424477815628052, |
|
"learning_rate": 8.748722552937688e-07, |
|
"loss": 0.4371, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 0.735812133072407, |
|
"grad_norm": 1.2042125463485718, |
|
"learning_rate": 8.628481651367876e-07, |
|
"loss": 0.4337, |
|
"step": 376 |
|
}, |
|
{ |
|
"epoch": 0.7377690802348337, |
|
"grad_norm": 0.9822342395782471, |
|
"learning_rate": 8.508900192169964e-07, |
|
"loss": 0.4329, |
|
"step": 377 |
|
}, |
|
{ |
|
"epoch": 0.7397260273972602, |
|
"grad_norm": 1.0332896709442139, |
|
"learning_rate": 8.389982992067688e-07, |
|
"loss": 0.4286, |
|
"step": 378 |
|
}, |
|
{ |
|
"epoch": 0.7416829745596869, |
|
"grad_norm": 0.8743665218353271, |
|
"learning_rate": 8.271734841028553e-07, |
|
"loss": 0.487, |
|
"step": 379 |
|
}, |
|
{ |
|
"epoch": 0.7436399217221135, |
|
"grad_norm": 0.9147298336029053, |
|
"learning_rate": 8.154160502070804e-07, |
|
"loss": 0.453, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.7455968688845401, |
|
"grad_norm": 1.113299012184143, |
|
"learning_rate": 8.037264711071699e-07, |
|
"loss": 0.4432, |
|
"step": 381 |
|
}, |
|
{ |
|
"epoch": 0.7475538160469667, |
|
"grad_norm": 0.934984564781189, |
|
"learning_rate": 7.921052176576643e-07, |
|
"loss": 0.5102, |
|
"step": 382 |
|
}, |
|
{ |
|
"epoch": 0.7495107632093934, |
|
"grad_norm": 0.8149503469467163, |
|
"learning_rate": 7.805527579609575e-07, |
|
"loss": 0.4834, |
|
"step": 383 |
|
}, |
|
{ |
|
"epoch": 0.7514677103718199, |
|
"grad_norm": 1.2893983125686646, |
|
"learning_rate": 7.690695573484433e-07, |
|
"loss": 0.3211, |
|
"step": 384 |
|
}, |
|
{ |
|
"epoch": 0.7534246575342466, |
|
"grad_norm": 1.0519015789031982, |
|
"learning_rate": 7.576560783617667e-07, |
|
"loss": 0.4613, |
|
"step": 385 |
|
}, |
|
{ |
|
"epoch": 0.7553816046966731, |
|
"grad_norm": 0.8619464039802551, |
|
"learning_rate": 7.463127807341966e-07, |
|
"loss": 0.4728, |
|
"step": 386 |
|
}, |
|
{ |
|
"epoch": 0.7573385518590998, |
|
"grad_norm": 0.890130341053009, |
|
"learning_rate": 7.35040121372109e-07, |
|
"loss": 0.4721, |
|
"step": 387 |
|
}, |
|
{ |
|
"epoch": 0.7592954990215264, |
|
"grad_norm": 1.1289362907409668, |
|
"learning_rate": 7.238385543365783e-07, |
|
"loss": 0.4206, |
|
"step": 388 |
|
}, |
|
{ |
|
"epoch": 0.761252446183953, |
|
"grad_norm": 0.8591368198394775, |
|
"learning_rate": 7.127085308250914e-07, |
|
"loss": 0.415, |
|
"step": 389 |
|
}, |
|
{ |
|
"epoch": 0.7632093933463796, |
|
"grad_norm": 0.9674418568611145, |
|
"learning_rate": 7.016504991533727e-07, |
|
"loss": 0.5114, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.7651663405088063, |
|
"grad_norm": 1.0890218019485474, |
|
"learning_rate": 6.906649047373246e-07, |
|
"loss": 0.3641, |
|
"step": 391 |
|
}, |
|
{ |
|
"epoch": 0.7671232876712328, |
|
"grad_norm": 0.9494483470916748, |
|
"learning_rate": 6.797521900750897e-07, |
|
"loss": 0.4682, |
|
"step": 392 |
|
}, |
|
{ |
|
"epoch": 0.7690802348336595, |
|
"grad_norm": 0.9544976949691772, |
|
"learning_rate": 6.689127947292232e-07, |
|
"loss": 0.4227, |
|
"step": 393 |
|
}, |
|
{ |
|
"epoch": 0.7710371819960861, |
|
"grad_norm": 2.679705858230591, |
|
"learning_rate": 6.581471553089874e-07, |
|
"loss": 0.4482, |
|
"step": 394 |
|
}, |
|
{ |
|
"epoch": 0.7729941291585127, |
|
"grad_norm": 0.8427915573120117, |
|
"learning_rate": 6.474557054527709e-07, |
|
"loss": 0.4048, |
|
"step": 395 |
|
}, |
|
{ |
|
"epoch": 0.7749510763209393, |
|
"grad_norm": 0.8168734312057495, |
|
"learning_rate": 6.368388758106134e-07, |
|
"loss": 0.377, |
|
"step": 396 |
|
}, |
|
{ |
|
"epoch": 0.776908023483366, |
|
"grad_norm": 1.0561057329177856, |
|
"learning_rate": 6.262970940268653e-07, |
|
"loss": 0.4315, |
|
"step": 397 |
|
}, |
|
{ |
|
"epoch": 0.7788649706457925, |
|
"grad_norm": 0.8930473923683167, |
|
"learning_rate": 6.158307847229594e-07, |
|
"loss": 0.5171, |
|
"step": 398 |
|
}, |
|
{ |
|
"epoch": 0.7808219178082192, |
|
"grad_norm": 1.0137521028518677, |
|
"learning_rate": 6.05440369480308e-07, |
|
"loss": 0.4549, |
|
"step": 399 |
|
}, |
|
{ |
|
"epoch": 0.7827788649706457, |
|
"grad_norm": 0.9667198061943054, |
|
"learning_rate": 5.951262668233232e-07, |
|
"loss": 0.4213, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.7847358121330724, |
|
"grad_norm": 0.7895818948745728, |
|
"learning_rate": 5.848888922025553e-07, |
|
"loss": 0.427, |
|
"step": 401 |
|
}, |
|
{ |
|
"epoch": 0.786692759295499, |
|
"grad_norm": 1.007455825805664, |
|
"learning_rate": 5.747286579779607e-07, |
|
"loss": 0.4125, |
|
"step": 402 |
|
}, |
|
{ |
|
"epoch": 0.7886497064579256, |
|
"grad_norm": 1.8778549432754517, |
|
"learning_rate": 5.646459734022938e-07, |
|
"loss": 0.4568, |
|
"step": 403 |
|
}, |
|
{ |
|
"epoch": 0.7906066536203522, |
|
"grad_norm": 0.976000964641571, |
|
"learning_rate": 5.546412446046187e-07, |
|
"loss": 0.5, |
|
"step": 404 |
|
}, |
|
{ |
|
"epoch": 0.7925636007827789, |
|
"grad_norm": 0.9260036945343018, |
|
"learning_rate": 5.447148745739522e-07, |
|
"loss": 0.4729, |
|
"step": 405 |
|
}, |
|
{ |
|
"epoch": 0.7945205479452054, |
|
"grad_norm": 0.851002037525177, |
|
"learning_rate": 5.348672631430319e-07, |
|
"loss": 0.4294, |
|
"step": 406 |
|
}, |
|
{ |
|
"epoch": 0.7964774951076321, |
|
"grad_norm": 0.976465106010437, |
|
"learning_rate": 5.250988069722096e-07, |
|
"loss": 0.4655, |
|
"step": 407 |
|
}, |
|
{ |
|
"epoch": 0.7984344422700587, |
|
"grad_norm": 0.9321781396865845, |
|
"learning_rate": 5.154098995334769e-07, |
|
"loss": 0.3931, |
|
"step": 408 |
|
}, |
|
{ |
|
"epoch": 0.8003913894324853, |
|
"grad_norm": 0.8924025297164917, |
|
"learning_rate": 5.058009310946119e-07, |
|
"loss": 0.4222, |
|
"step": 409 |
|
}, |
|
{ |
|
"epoch": 0.8023483365949119, |
|
"grad_norm": 0.8116724491119385, |
|
"learning_rate": 4.962722887034616e-07, |
|
"loss": 0.325, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.8043052837573386, |
|
"grad_norm": 0.9633209705352783, |
|
"learning_rate": 4.868243561723535e-07, |
|
"loss": 0.3769, |
|
"step": 411 |
|
}, |
|
{ |
|
"epoch": 0.8062622309197651, |
|
"grad_norm": 0.902252733707428, |
|
"learning_rate": 4.774575140626317e-07, |
|
"loss": 0.3959, |
|
"step": 412 |
|
}, |
|
{ |
|
"epoch": 0.8082191780821918, |
|
"grad_norm": 0.8941038250923157, |
|
"learning_rate": 4.681721396693303e-07, |
|
"loss": 0.4998, |
|
"step": 413 |
|
}, |
|
{ |
|
"epoch": 0.8101761252446184, |
|
"grad_norm": 1.213836669921875, |
|
"learning_rate": 4.589686070059762e-07, |
|
"loss": 0.5012, |
|
"step": 414 |
|
}, |
|
{ |
|
"epoch": 0.812133072407045, |
|
"grad_norm": 1.0174344778060913, |
|
"learning_rate": 4.4984728678952234e-07, |
|
"loss": 0.468, |
|
"step": 415 |
|
}, |
|
{ |
|
"epoch": 0.8140900195694716, |
|
"grad_norm": 1.8333814144134521, |
|
"learning_rate": 4.4080854642541833e-07, |
|
"loss": 0.4941, |
|
"step": 416 |
|
}, |
|
{ |
|
"epoch": 0.8160469667318982, |
|
"grad_norm": 1.6971678733825684, |
|
"learning_rate": 4.318527499928074e-07, |
|
"loss": 0.3649, |
|
"step": 417 |
|
}, |
|
{ |
|
"epoch": 0.8180039138943248, |
|
"grad_norm": 0.8866695165634155, |
|
"learning_rate": 4.229802582298634e-07, |
|
"loss": 0.4657, |
|
"step": 418 |
|
}, |
|
{ |
|
"epoch": 0.8199608610567515, |
|
"grad_norm": 1.3764787912368774, |
|
"learning_rate": 4.141914285192619e-07, |
|
"loss": 0.3836, |
|
"step": 419 |
|
}, |
|
{ |
|
"epoch": 0.821917808219178, |
|
"grad_norm": 0.9406548142433167, |
|
"learning_rate": 4.0548661487378184e-07, |
|
"loss": 0.497, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.8238747553816047, |
|
"grad_norm": 0.8251882195472717, |
|
"learning_rate": 3.9686616792204677e-07, |
|
"loss": 0.4032, |
|
"step": 421 |
|
}, |
|
{ |
|
"epoch": 0.8258317025440313, |
|
"grad_norm": 0.8226965069770813, |
|
"learning_rate": 3.8833043489440477e-07, |
|
"loss": 0.4526, |
|
"step": 422 |
|
}, |
|
{ |
|
"epoch": 0.8277886497064579, |
|
"grad_norm": 0.9033458232879639, |
|
"learning_rate": 3.798797596089351e-07, |
|
"loss": 0.4149, |
|
"step": 423 |
|
}, |
|
{ |
|
"epoch": 0.8297455968688845, |
|
"grad_norm": 0.9945986866950989, |
|
"learning_rate": 3.715144824576078e-07, |
|
"loss": 0.5138, |
|
"step": 424 |
|
}, |
|
{ |
|
"epoch": 0.8317025440313112, |
|
"grad_norm": 1.1671781539916992, |
|
"learning_rate": 3.632349403925664e-07, |
|
"loss": 0.4718, |
|
"step": 425 |
|
}, |
|
{ |
|
"epoch": 0.8336594911937377, |
|
"grad_norm": 1.2945449352264404, |
|
"learning_rate": 3.5504146691255736e-07, |
|
"loss": 0.4514, |
|
"step": 426 |
|
}, |
|
{ |
|
"epoch": 0.8356164383561644, |
|
"grad_norm": 1.3590197563171387, |
|
"learning_rate": 3.469343920494986e-07, |
|
"loss": 0.4147, |
|
"step": 427 |
|
}, |
|
{ |
|
"epoch": 0.837573385518591, |
|
"grad_norm": 0.8810437917709351, |
|
"learning_rate": 3.389140423551834e-07, |
|
"loss": 0.4462, |
|
"step": 428 |
|
}, |
|
{ |
|
"epoch": 0.8395303326810176, |
|
"grad_norm": 0.9122494459152222, |
|
"learning_rate": 3.3098074088812686e-07, |
|
"loss": 0.4766, |
|
"step": 429 |
|
}, |
|
{ |
|
"epoch": 0.8414872798434442, |
|
"grad_norm": 0.8525986075401306, |
|
"learning_rate": 3.2313480720055747e-07, |
|
"loss": 0.3684, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.8434442270058709, |
|
"grad_norm": 1.0988531112670898, |
|
"learning_rate": 3.153765573255377e-07, |
|
"loss": 0.4956, |
|
"step": 431 |
|
}, |
|
{ |
|
"epoch": 0.8454011741682974, |
|
"grad_norm": 0.7911211848258972, |
|
"learning_rate": 3.0770630376424276e-07, |
|
"loss": 0.4842, |
|
"step": 432 |
|
}, |
|
{ |
|
"epoch": 0.8473581213307241, |
|
"grad_norm": 1.0055835247039795, |
|
"learning_rate": 3.0012435547336737e-07, |
|
"loss": 0.3518, |
|
"step": 433 |
|
}, |
|
{ |
|
"epoch": 0.8493150684931506, |
|
"grad_norm": 1.304575800895691, |
|
"learning_rate": 2.9263101785268253e-07, |
|
"loss": 0.3509, |
|
"step": 434 |
|
}, |
|
{ |
|
"epoch": 0.8512720156555773, |
|
"grad_norm": 0.9222425818443298, |
|
"learning_rate": 2.8522659273273606e-07, |
|
"loss": 0.3888, |
|
"step": 435 |
|
}, |
|
{ |
|
"epoch": 0.8532289628180039, |
|
"grad_norm": 0.9765827059745789, |
|
"learning_rate": 2.779113783626916e-07, |
|
"loss": 0.4616, |
|
"step": 436 |
|
}, |
|
{ |
|
"epoch": 0.8551859099804305, |
|
"grad_norm": 0.972284734249115, |
|
"learning_rate": 2.7068566939831646e-07, |
|
"loss": 0.3573, |
|
"step": 437 |
|
}, |
|
{ |
|
"epoch": 0.8571428571428571, |
|
"grad_norm": 0.9025648832321167, |
|
"learning_rate": 2.6354975689011576e-07, |
|
"loss": 0.4246, |
|
"step": 438 |
|
}, |
|
{ |
|
"epoch": 0.8590998043052838, |
|
"grad_norm": 0.8234553933143616, |
|
"learning_rate": 2.5650392827160446e-07, |
|
"loss": 0.3739, |
|
"step": 439 |
|
}, |
|
{ |
|
"epoch": 0.8610567514677103, |
|
"grad_norm": 1.1872916221618652, |
|
"learning_rate": 2.4954846734773054e-07, |
|
"loss": 0.377, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.863013698630137, |
|
"grad_norm": 0.9565138816833496, |
|
"learning_rate": 2.4268365428344737e-07, |
|
"loss": 0.5044, |
|
"step": 441 |
|
}, |
|
{ |
|
"epoch": 0.8649706457925636, |
|
"grad_norm": 1.1466796398162842, |
|
"learning_rate": 2.3590976559242278e-07, |
|
"loss": 0.3848, |
|
"step": 442 |
|
}, |
|
{ |
|
"epoch": 0.8669275929549902, |
|
"grad_norm": 0.9302741289138794, |
|
"learning_rate": 2.29227074125907e-07, |
|
"loss": 0.5157, |
|
"step": 443 |
|
}, |
|
{ |
|
"epoch": 0.8688845401174168, |
|
"grad_norm": 0.9383424520492554, |
|
"learning_rate": 2.2263584906173723e-07, |
|
"loss": 0.4421, |
|
"step": 444 |
|
}, |
|
{ |
|
"epoch": 0.8708414872798435, |
|
"grad_norm": 1.1834505796432495, |
|
"learning_rate": 2.1613635589349756e-07, |
|
"loss": 0.4172, |
|
"step": 445 |
|
}, |
|
{ |
|
"epoch": 0.87279843444227, |
|
"grad_norm": 0.9577175378799438, |
|
"learning_rate": 2.0972885641982605e-07, |
|
"loss": 0.4004, |
|
"step": 446 |
|
}, |
|
{ |
|
"epoch": 0.8747553816046967, |
|
"grad_norm": 0.8691757321357727, |
|
"learning_rate": 2.0341360873386673e-07, |
|
"loss": 0.4321, |
|
"step": 447 |
|
}, |
|
{ |
|
"epoch": 0.8767123287671232, |
|
"grad_norm": 1.0094484090805054, |
|
"learning_rate": 1.97190867212875e-07, |
|
"loss": 0.428, |
|
"step": 448 |
|
}, |
|
{ |
|
"epoch": 0.8786692759295499, |
|
"grad_norm": 0.8963342308998108, |
|
"learning_rate": 1.9106088250797266e-07, |
|
"loss": 0.4358, |
|
"step": 449 |
|
}, |
|
{ |
|
"epoch": 0.8806262230919765, |
|
"grad_norm": 1.7301355600357056, |
|
"learning_rate": 1.8502390153404936e-07, |
|
"loss": 0.4104, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.8825831702544031, |
|
"grad_norm": 0.8558318614959717, |
|
"learning_rate": 1.790801674598186e-07, |
|
"loss": 0.4592, |
|
"step": 451 |
|
}, |
|
{ |
|
"epoch": 0.8845401174168297, |
|
"grad_norm": 0.8883755207061768, |
|
"learning_rate": 1.732299196980225e-07, |
|
"loss": 0.416, |
|
"step": 452 |
|
}, |
|
{ |
|
"epoch": 0.8864970645792564, |
|
"grad_norm": 1.679168701171875, |
|
"learning_rate": 1.6747339389578732e-07, |
|
"loss": 0.4899, |
|
"step": 453 |
|
}, |
|
{ |
|
"epoch": 0.8884540117416829, |
|
"grad_norm": 0.8892528414726257, |
|
"learning_rate": 1.6181082192513352e-07, |
|
"loss": 0.4228, |
|
"step": 454 |
|
}, |
|
{ |
|
"epoch": 0.8904109589041096, |
|
"grad_norm": 1.5113455057144165, |
|
"learning_rate": 1.5624243187363442e-07, |
|
"loss": 0.4832, |
|
"step": 455 |
|
}, |
|
{ |
|
"epoch": 0.8923679060665362, |
|
"grad_norm": 1.2870134115219116, |
|
"learning_rate": 1.507684480352292e-07, |
|
"loss": 0.4141, |
|
"step": 456 |
|
}, |
|
{ |
|
"epoch": 0.8943248532289628, |
|
"grad_norm": 1.6229395866394043, |
|
"learning_rate": 1.4538909090118846e-07, |
|
"loss": 0.4619, |
|
"step": 457 |
|
}, |
|
{ |
|
"epoch": 0.8962818003913894, |
|
"grad_norm": 0.8794851899147034, |
|
"learning_rate": 1.4010457715123355e-07, |
|
"loss": 0.3665, |
|
"step": 458 |
|
}, |
|
{ |
|
"epoch": 0.898238747553816, |
|
"grad_norm": 0.8392042517662048, |
|
"learning_rate": 1.3491511964480703e-07, |
|
"loss": 0.4389, |
|
"step": 459 |
|
}, |
|
{ |
|
"epoch": 0.9001956947162426, |
|
"grad_norm": 1.3040436506271362, |
|
"learning_rate": 1.2982092741250145e-07, |
|
"loss": 0.3347, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.9021526418786693, |
|
"grad_norm": 2.594942331314087, |
|
"learning_rate": 1.2482220564763669e-07, |
|
"loss": 0.3493, |
|
"step": 461 |
|
}, |
|
{ |
|
"epoch": 0.9041095890410958, |
|
"grad_norm": 1.2146382331848145, |
|
"learning_rate": 1.1991915569799645e-07, |
|
"loss": 0.487, |
|
"step": 462 |
|
}, |
|
{ |
|
"epoch": 0.9060665362035225, |
|
"grad_norm": 0.9857767224311829, |
|
"learning_rate": 1.1511197505771843e-07, |
|
"loss": 0.3678, |
|
"step": 463 |
|
}, |
|
{ |
|
"epoch": 0.9080234833659491, |
|
"grad_norm": 0.9433605670928955, |
|
"learning_rate": 1.1040085735933681e-07, |
|
"loss": 0.4477, |
|
"step": 464 |
|
}, |
|
{ |
|
"epoch": 0.9099804305283757, |
|
"grad_norm": 1.0072382688522339, |
|
"learning_rate": 1.0578599236598708e-07, |
|
"loss": 0.4258, |
|
"step": 465 |
|
}, |
|
{ |
|
"epoch": 0.9119373776908023, |
|
"grad_norm": 0.977323591709137, |
|
"learning_rate": 1.0126756596375687e-07, |
|
"loss": 0.4071, |
|
"step": 466 |
|
}, |
|
{ |
|
"epoch": 0.913894324853229, |
|
"grad_norm": 0.924149751663208, |
|
"learning_rate": 9.684576015420277e-08, |
|
"loss": 0.477, |
|
"step": 467 |
|
}, |
|
{ |
|
"epoch": 0.9158512720156555, |
|
"grad_norm": 0.8529196381568909, |
|
"learning_rate": 9.252075304701929e-08, |
|
"loss": 0.4513, |
|
"step": 468 |
|
}, |
|
{ |
|
"epoch": 0.9178082191780822, |
|
"grad_norm": 1.0572128295898438, |
|
"learning_rate": 8.829271885286095e-08, |
|
"loss": 0.4472, |
|
"step": 469 |
|
}, |
|
{ |
|
"epoch": 0.9197651663405088, |
|
"grad_norm": 3.0187559127807617, |
|
"learning_rate": 8.416182787632871e-08, |
|
"loss": 0.3696, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.9217221135029354, |
|
"grad_norm": 1.2419676780700684, |
|
"learning_rate": 8.012824650910938e-08, |
|
"loss": 0.3411, |
|
"step": 471 |
|
}, |
|
{ |
|
"epoch": 0.923679060665362, |
|
"grad_norm": 0.8936371803283691, |
|
"learning_rate": 7.619213722327184e-08, |
|
"loss": 0.4494, |
|
"step": 472 |
|
}, |
|
{ |
|
"epoch": 0.9256360078277887, |
|
"grad_norm": 1.0433343648910522, |
|
"learning_rate": 7.235365856472443e-08, |
|
"loss": 0.4545, |
|
"step": 473 |
|
}, |
|
{ |
|
"epoch": 0.9275929549902152, |
|
"grad_norm": 0.9922037720680237, |
|
"learning_rate": 6.86129651468273e-08, |
|
"loss": 0.4118, |
|
"step": 474 |
|
}, |
|
{ |
|
"epoch": 0.9295499021526419, |
|
"grad_norm": 0.8298634886741638, |
|
"learning_rate": 6.497020764416633e-08, |
|
"loss": 0.4768, |
|
"step": 475 |
|
}, |
|
{ |
|
"epoch": 0.9315068493150684, |
|
"grad_norm": 0.8023221492767334, |
|
"learning_rate": 6.142553278648239e-08, |
|
"loss": 0.4451, |
|
"step": 476 |
|
}, |
|
{ |
|
"epoch": 0.9334637964774951, |
|
"grad_norm": 0.828525960445404, |
|
"learning_rate": 5.7979083352762146e-08, |
|
"loss": 0.3043, |
|
"step": 477 |
|
}, |
|
{ |
|
"epoch": 0.9354207436399217, |
|
"grad_norm": 1.59126615524292, |
|
"learning_rate": 5.463099816548578e-08, |
|
"loss": 0.3771, |
|
"step": 478 |
|
}, |
|
{ |
|
"epoch": 0.9373776908023483, |
|
"grad_norm": 1.2710837125778198, |
|
"learning_rate": 5.1381412085036994e-08, |
|
"loss": 0.4743, |
|
"step": 479 |
|
}, |
|
{ |
|
"epoch": 0.9393346379647749, |
|
"grad_norm": 0.953567624092102, |
|
"learning_rate": 4.823045600426901e-08, |
|
"loss": 0.4077, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.9412915851272016, |
|
"grad_norm": 0.9778720736503601, |
|
"learning_rate": 4.5178256843233235e-08, |
|
"loss": 0.4112, |
|
"step": 481 |
|
}, |
|
{ |
|
"epoch": 0.9432485322896281, |
|
"grad_norm": 0.8094834685325623, |
|
"learning_rate": 4.2224937544067254e-08, |
|
"loss": 0.4878, |
|
"step": 482 |
|
}, |
|
{ |
|
"epoch": 0.9452054794520548, |
|
"grad_norm": 0.8327929377555847, |
|
"learning_rate": 3.9370617066040726e-08, |
|
"loss": 0.3676, |
|
"step": 483 |
|
}, |
|
{ |
|
"epoch": 0.9471624266144814, |
|
"grad_norm": 0.8924036622047424, |
|
"learning_rate": 3.661541038076755e-08, |
|
"loss": 0.3628, |
|
"step": 484 |
|
}, |
|
{ |
|
"epoch": 0.949119373776908, |
|
"grad_norm": 1.062476634979248, |
|
"learning_rate": 3.395942846757067e-08, |
|
"loss": 0.3709, |
|
"step": 485 |
|
}, |
|
{ |
|
"epoch": 0.9510763209393346, |
|
"grad_norm": 0.9672690033912659, |
|
"learning_rate": 3.1402778309014284e-08, |
|
"loss": 0.4846, |
|
"step": 486 |
|
}, |
|
{ |
|
"epoch": 0.9530332681017613, |
|
"grad_norm": 0.9301928281784058, |
|
"learning_rate": 2.8945562886593948e-08, |
|
"loss": 0.4465, |
|
"step": 487 |
|
}, |
|
{ |
|
"epoch": 0.9549902152641878, |
|
"grad_norm": 1.6346007585525513, |
|
"learning_rate": 2.6587881176588782e-08, |
|
"loss": 0.3958, |
|
"step": 488 |
|
}, |
|
{ |
|
"epoch": 0.9569471624266145, |
|
"grad_norm": 0.9479952454566956, |
|
"learning_rate": 2.4329828146074096e-08, |
|
"loss": 0.3922, |
|
"step": 489 |
|
}, |
|
{ |
|
"epoch": 0.958904109589041, |
|
"grad_norm": 1.1471753120422363, |
|
"learning_rate": 2.2171494749097243e-08, |
|
"loss": 0.462, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.9608610567514677, |
|
"grad_norm": 0.9728820323944092, |
|
"learning_rate": 2.011296792301165e-08, |
|
"loss": 0.4206, |
|
"step": 491 |
|
}, |
|
{ |
|
"epoch": 0.9628180039138943, |
|
"grad_norm": 0.8930822014808655, |
|
"learning_rate": 1.8154330584978785e-08, |
|
"loss": 0.4664, |
|
"step": 492 |
|
}, |
|
{ |
|
"epoch": 0.9647749510763209, |
|
"grad_norm": 1.0260281562805176, |
|
"learning_rate": 1.629566162862445e-08, |
|
"loss": 0.4395, |
|
"step": 493 |
|
}, |
|
{ |
|
"epoch": 0.9667318982387475, |
|
"grad_norm": 1.2178572416305542, |
|
"learning_rate": 1.453703592086353e-08, |
|
"loss": 0.4311, |
|
"step": 494 |
|
}, |
|
{ |
|
"epoch": 0.9686888454011742, |
|
"grad_norm": 0.8803574442863464, |
|
"learning_rate": 1.28785242988827e-08, |
|
"loss": 0.4175, |
|
"step": 495 |
|
}, |
|
{ |
|
"epoch": 0.9706457925636007, |
|
"grad_norm": 0.9738378524780273, |
|
"learning_rate": 1.132019356728853e-08, |
|
"loss": 0.4419, |
|
"step": 496 |
|
}, |
|
{ |
|
"epoch": 0.9726027397260274, |
|
"grad_norm": 0.8282538056373596, |
|
"learning_rate": 9.862106495415469e-09, |
|
"loss": 0.4128, |
|
"step": 497 |
|
}, |
|
{ |
|
"epoch": 0.974559686888454, |
|
"grad_norm": 1.130934715270996, |
|
"learning_rate": 8.504321814798433e-09, |
|
"loss": 0.3772, |
|
"step": 498 |
|
}, |
|
{ |
|
"epoch": 0.9765166340508806, |
|
"grad_norm": 2.3474204540252686, |
|
"learning_rate": 7.246894216806355e-09, |
|
"loss": 0.4271, |
|
"step": 499 |
|
}, |
|
{ |
|
"epoch": 0.9784735812133072, |
|
"grad_norm": 0.9170702695846558, |
|
"learning_rate": 6.089874350439507e-09, |
|
"loss": 0.4163, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.9804305283757339, |
|
"grad_norm": 1.3329914808273315, |
|
"learning_rate": 5.033308820289185e-09, |
|
"loss": 0.4318, |
|
"step": 501 |
|
}, |
|
{ |
|
"epoch": 0.9823874755381604, |
|
"grad_norm": 0.9551968574523926, |
|
"learning_rate": 4.07724018466088e-09, |
|
"loss": 0.3538, |
|
"step": 502 |
|
}, |
|
{ |
|
"epoch": 0.9843444227005871, |
|
"grad_norm": 0.9140384197235107, |
|
"learning_rate": 3.2217069538600932e-09, |
|
"loss": 0.4503, |
|
"step": 503 |
|
}, |
|
{ |
|
"epoch": 0.9863013698630136, |
|
"grad_norm": 1.072695016860962, |
|
"learning_rate": 2.4667435886402414e-09, |
|
"loss": 0.4374, |
|
"step": 504 |
|
}, |
|
{ |
|
"epoch": 0.9882583170254403, |
|
"grad_norm": 0.8060042262077332, |
|
"learning_rate": 1.8123804988159909e-09, |
|
"loss": 0.4142, |
|
"step": 505 |
|
}, |
|
{ |
|
"epoch": 0.9902152641878669, |
|
"grad_norm": 1.2433676719665527, |
|
"learning_rate": 1.2586440420372936e-09, |
|
"loss": 0.4401, |
|
"step": 506 |
|
}, |
|
{ |
|
"epoch": 0.9921722113502935, |
|
"grad_norm": 1.1050037145614624, |
|
"learning_rate": 8.0555652272718e-10, |
|
"loss": 0.4379, |
|
"step": 507 |
|
}, |
|
{ |
|
"epoch": 0.9921722113502935, |
|
"eval_accuracy": 0.8291501000753674, |
|
"eval_accuracy_first_token": 0.9575854700854701, |
|
"eval_accuracy_first_token_all": 0.9726387385994744, |
|
"eval_accuracy_first_token_all_total": 6469, |
|
"eval_accuracy_first_token_calculate": 0.9090909090909091, |
|
"eval_accuracy_first_token_calculate_total": 44, |
|
"eval_accuracy_first_token_execute": 1.0, |
|
"eval_accuracy_first_token_execute_total": 202, |
|
"eval_accuracy_first_token_get": 0.956140350877193, |
|
"eval_accuracy_first_token_get_total": 456, |
|
"eval_accuracy_first_token_python": 0.8909090909090909, |
|
"eval_accuracy_first_token_python_total": 990, |
|
"eval_loss": 0.5047600269317627, |
|
"eval_perplexity": 1.201347285698878, |
|
"eval_runtime": 525.3078, |
|
"eval_samples_per_second": 1.266, |
|
"eval_steps_per_second": 0.16, |
|
"eval_total_number_first_token": 9360, |
|
"step": 507 |
|
}, |
|
{ |
|
"epoch": 0.9941291585127201, |
|
"grad_norm": 0.9681710004806519, |
|
"learning_rate": 4.5313619118553256e-10, |
|
"loss": 0.4287, |
|
"step": 508 |
|
}, |
|
{ |
|
"epoch": 0.9960861056751468, |
|
"grad_norm": 0.8318100571632385, |
|
"learning_rate": 2.0139724285161976e-10, |
|
"loss": 0.4405, |
|
"step": 509 |
|
}, |
|
{ |
|
"epoch": 0.9980430528375733, |
|
"grad_norm": 0.8928787708282471, |
|
"learning_rate": 5.0349817733719165e-11, |
|
"loss": 0.3779, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 2.4657742977142334, |
|
"learning_rate": 0.0, |
|
"loss": 0.4483, |
|
"step": 511 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"step": 511, |
|
"total_flos": 529508264312832.0, |
|
"train_loss": 0.47377988486140676, |
|
"train_runtime": 61575.0259, |
|
"train_samples_per_second": 0.133, |
|
"train_steps_per_second": 0.008 |
|
} |
|
], |
|
"logging_steps": 1.0, |
|
"max_steps": 511, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 5.0, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": false, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 529508264312832.0, |
|
"train_batch_size": 1, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|