{ "best_metric": 0.03968534991145134, "best_model_checkpoint": "miner_id_24/checkpoint-150", "epoch": 0.27586206896551724, "eval_steps": 50, "global_step": 200, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.001379310344827586, "grad_norm": 3.2597131729125977, "learning_rate": 6e-06, "loss": 1.8762, "step": 1 }, { "epoch": 0.001379310344827586, "eval_loss": 1.2149479389190674, "eval_runtime": 26.091, "eval_samples_per_second": 46.798, "eval_steps_per_second": 11.728, "step": 1 }, { "epoch": 0.002758620689655172, "grad_norm": 4.694916725158691, "learning_rate": 1.2e-05, "loss": 1.338, "step": 2 }, { "epoch": 0.004137931034482759, "grad_norm": 6.192763328552246, "learning_rate": 1.8e-05, "loss": 1.6252, "step": 3 }, { "epoch": 0.005517241379310344, "grad_norm": 7.979697227478027, "learning_rate": 2.4e-05, "loss": 2.1471, "step": 4 }, { "epoch": 0.006896551724137931, "grad_norm": 9.736289978027344, "learning_rate": 3e-05, "loss": 2.8252, "step": 5 }, { "epoch": 0.008275862068965517, "grad_norm": 5.926011085510254, "learning_rate": 3.6e-05, "loss": 1.4922, "step": 6 }, { "epoch": 0.009655172413793104, "grad_norm": 5.782570838928223, "learning_rate": 4.2e-05, "loss": 1.2979, "step": 7 }, { "epoch": 0.011034482758620689, "grad_norm": 4.730049133300781, "learning_rate": 4.8e-05, "loss": 1.41, "step": 8 }, { "epoch": 0.012413793103448275, "grad_norm": 4.301328659057617, "learning_rate": 5.4000000000000005e-05, "loss": 1.1268, "step": 9 }, { "epoch": 0.013793103448275862, "grad_norm": 4.3723530769348145, "learning_rate": 6e-05, "loss": 1.1332, "step": 10 }, { "epoch": 0.015172413793103448, "grad_norm": 4.571831703186035, "learning_rate": 5.999589914977407e-05, "loss": 1.171, "step": 11 }, { "epoch": 0.016551724137931035, "grad_norm": 4.981822967529297, "learning_rate": 5.998359772022778e-05, "loss": 1.1291, "step": 12 }, { "epoch": 0.01793103448275862, "grad_norm": 4.491147041320801, "learning_rate": 5.996309907444915e-05, "loss": 1.2609, "step": 13 }, { "epoch": 0.019310344827586208, "grad_norm": 5.013556957244873, "learning_rate": 5.9934408816563236e-05, "loss": 1.094, "step": 14 }, { "epoch": 0.020689655172413793, "grad_norm": 4.86841344833374, "learning_rate": 5.98975347902001e-05, "loss": 1.092, "step": 15 }, { "epoch": 0.022068965517241378, "grad_norm": 6.520169734954834, "learning_rate": 5.9852487076350345e-05, "loss": 1.033, "step": 16 }, { "epoch": 0.023448275862068966, "grad_norm": 6.673559188842773, "learning_rate": 5.979927799060915e-05, "loss": 1.0463, "step": 17 }, { "epoch": 0.02482758620689655, "grad_norm": 6.042698383331299, "learning_rate": 5.9737922079809257e-05, "loss": 0.9002, "step": 18 }, { "epoch": 0.02620689655172414, "grad_norm": 7.816788673400879, "learning_rate": 5.9668436118044054e-05, "loss": 0.8878, "step": 19 }, { "epoch": 0.027586206896551724, "grad_norm": 5.338583946228027, "learning_rate": 5.959083910208167e-05, "loss": 0.7796, "step": 20 }, { "epoch": 0.028965517241379312, "grad_norm": 5.218435287475586, "learning_rate": 5.9505152246171474e-05, "loss": 0.6179, "step": 21 }, { "epoch": 0.030344827586206897, "grad_norm": 5.052286148071289, "learning_rate": 5.941139897624428e-05, "loss": 0.5596, "step": 22 }, { "epoch": 0.031724137931034485, "grad_norm": 7.385002613067627, "learning_rate": 5.9309604923507984e-05, "loss": 0.6379, "step": 23 }, { "epoch": 0.03310344827586207, "grad_norm": 6.220843315124512, "learning_rate": 5.9199797917440176e-05, "loss": 0.5378, "step": 24 }, { "epoch": 0.034482758620689655, "grad_norm": 4.937102317810059, "learning_rate": 5.908200797817991e-05, "loss": 0.3842, "step": 25 }, { "epoch": 0.03586206896551724, "grad_norm": 6.7553911209106445, "learning_rate": 5.895626730832046e-05, "loss": 0.7545, "step": 26 }, { "epoch": 0.037241379310344824, "grad_norm": 5.3821234703063965, "learning_rate": 5.882261028410545e-05, "loss": 0.4282, "step": 27 }, { "epoch": 0.038620689655172416, "grad_norm": 9.57083511352539, "learning_rate": 5.8681073446030734e-05, "loss": 0.357, "step": 28 }, { "epoch": 0.04, "grad_norm": 6.343085289001465, "learning_rate": 5.853169548885461e-05, "loss": 0.3994, "step": 29 }, { "epoch": 0.041379310344827586, "grad_norm": 8.39541244506836, "learning_rate": 5.8374517251019035e-05, "loss": 0.5362, "step": 30 }, { "epoch": 0.04275862068965517, "grad_norm": 6.480747222900391, "learning_rate": 5.820958170348484e-05, "loss": 0.3538, "step": 31 }, { "epoch": 0.044137931034482755, "grad_norm": 7.175205707550049, "learning_rate": 5.8036933937983825e-05, "loss": 0.5323, "step": 32 }, { "epoch": 0.04551724137931035, "grad_norm": 14.051421165466309, "learning_rate": 5.7856621154691217e-05, "loss": 0.8048, "step": 33 }, { "epoch": 0.04689655172413793, "grad_norm": 6.995046138763428, "learning_rate": 5.766869264932154e-05, "loss": 0.3382, "step": 34 }, { "epoch": 0.04827586206896552, "grad_norm": 7.1437668800354, "learning_rate": 5.747319979965172e-05, "loss": 0.388, "step": 35 }, { "epoch": 0.0496551724137931, "grad_norm": 6.067580223083496, "learning_rate": 5.727019605147488e-05, "loss": 0.43, "step": 36 }, { "epoch": 0.05103448275862069, "grad_norm": 6.2847700119018555, "learning_rate": 5.7059736903988775e-05, "loss": 0.3548, "step": 37 }, { "epoch": 0.05241379310344828, "grad_norm": 6.924132347106934, "learning_rate": 5.684187989462291e-05, "loss": 0.3532, "step": 38 }, { "epoch": 0.05379310344827586, "grad_norm": 6.123856544494629, "learning_rate": 5.661668458330836e-05, "loss": 0.2675, "step": 39 }, { "epoch": 0.05517241379310345, "grad_norm": 4.196285247802734, "learning_rate": 5.638421253619467e-05, "loss": 0.0934, "step": 40 }, { "epoch": 0.05655172413793103, "grad_norm": 6.830177307128906, "learning_rate": 5.614452730881832e-05, "loss": 0.2334, "step": 41 }, { "epoch": 0.057931034482758624, "grad_norm": 3.183481216430664, "learning_rate": 5.589769442872722e-05, "loss": 0.1075, "step": 42 }, { "epoch": 0.05931034482758621, "grad_norm": 3.9168667793273926, "learning_rate": 5.5643781377566175e-05, "loss": 0.1061, "step": 43 }, { "epoch": 0.060689655172413794, "grad_norm": 2.812781810760498, "learning_rate": 5.538285757262806e-05, "loss": 0.0804, "step": 44 }, { "epoch": 0.06206896551724138, "grad_norm": 10.25263786315918, "learning_rate": 5.5114994347875856e-05, "loss": 0.3843, "step": 45 }, { "epoch": 0.06344827586206897, "grad_norm": 7.238382339477539, "learning_rate": 5.48402649344406e-05, "loss": 0.3073, "step": 46 }, { "epoch": 0.06482758620689655, "grad_norm": 6.004594802856445, "learning_rate": 5.455874444060078e-05, "loss": 0.2398, "step": 47 }, { "epoch": 0.06620689655172414, "grad_norm": 9.107511520385742, "learning_rate": 5.427050983124843e-05, "loss": 0.2184, "step": 48 }, { "epoch": 0.06758620689655172, "grad_norm": 6.167825222015381, "learning_rate": 5.397563990684774e-05, "loss": 0.2221, "step": 49 }, { "epoch": 0.06896551724137931, "grad_norm": 17.819231033325195, "learning_rate": 5.367421528189181e-05, "loss": 0.4287, "step": 50 }, { "epoch": 0.06896551724137931, "eval_loss": 0.18357078731060028, "eval_runtime": 26.6864, "eval_samples_per_second": 45.754, "eval_steps_per_second": 11.467, "step": 50 }, { "epoch": 0.0703448275862069, "grad_norm": 29.97521209716797, "learning_rate": 5.336631836286338e-05, "loss": 3.3529, "step": 51 }, { "epoch": 0.07172413793103448, "grad_norm": 26.448806762695312, "learning_rate": 5.3052033325705774e-05, "loss": 1.9551, "step": 52 }, { "epoch": 0.07310344827586207, "grad_norm": 21.095224380493164, "learning_rate": 5.2731446092810044e-05, "loss": 1.2991, "step": 53 }, { "epoch": 0.07448275862068965, "grad_norm": 9.618408203125, "learning_rate": 5.240464430952462e-05, "loss": 1.0447, "step": 54 }, { "epoch": 0.07586206896551724, "grad_norm": 19.800874710083008, "learning_rate": 5.207171732019395e-05, "loss": 2.5338, "step": 55 }, { "epoch": 0.07724137931034483, "grad_norm": 5.092163562774658, "learning_rate": 5.1732756143732675e-05, "loss": 0.5511, "step": 56 }, { "epoch": 0.07862068965517241, "grad_norm": 4.90592622756958, "learning_rate": 5.1387853448741916e-05, "loss": 0.4434, "step": 57 }, { "epoch": 0.08, "grad_norm": 4.161886215209961, "learning_rate": 5.103710352817465e-05, "loss": 0.4564, "step": 58 }, { "epoch": 0.08137931034482758, "grad_norm": 3.1255943775177, "learning_rate": 5.068060227355698e-05, "loss": 0.2998, "step": 59 }, { "epoch": 0.08275862068965517, "grad_norm": 3.8610427379608154, "learning_rate": 5.0318447148772234e-05, "loss": 0.4897, "step": 60 }, { "epoch": 0.08413793103448276, "grad_norm": 2.8609635829925537, "learning_rate": 4.995073716341545e-05, "loss": 0.3771, "step": 61 }, { "epoch": 0.08551724137931034, "grad_norm": 2.479064702987671, "learning_rate": 4.957757284572506e-05, "loss": 0.2395, "step": 62 }, { "epoch": 0.08689655172413793, "grad_norm": 2.5700461864471436, "learning_rate": 4.91990562150995e-05, "loss": 0.2465, "step": 63 }, { "epoch": 0.08827586206896551, "grad_norm": 4.032325267791748, "learning_rate": 4.881529075420611e-05, "loss": 0.3796, "step": 64 }, { "epoch": 0.0896551724137931, "grad_norm": 3.90286922454834, "learning_rate": 4.8426381380690036e-05, "loss": 0.4224, "step": 65 }, { "epoch": 0.0910344827586207, "grad_norm": 3.585496187210083, "learning_rate": 4.8032434418490753e-05, "loss": 0.3608, "step": 66 }, { "epoch": 0.09241379310344827, "grad_norm": 2.7982876300811768, "learning_rate": 4.7633557568774194e-05, "loss": 0.2261, "step": 67 }, { "epoch": 0.09379310344827586, "grad_norm": 3.4788730144500732, "learning_rate": 4.722985988048831e-05, "loss": 0.2269, "step": 68 }, { "epoch": 0.09517241379310344, "grad_norm": 3.0776095390319824, "learning_rate": 4.6821451720550184e-05, "loss": 0.3505, "step": 69 }, { "epoch": 0.09655172413793103, "grad_norm": 1.9606295824050903, "learning_rate": 4.640844474367282e-05, "loss": 0.165, "step": 70 }, { "epoch": 0.09793103448275862, "grad_norm": 3.669142961502075, "learning_rate": 4.5990951861839815e-05, "loss": 0.2614, "step": 71 }, { "epoch": 0.0993103448275862, "grad_norm": 2.891281843185425, "learning_rate": 4.5569087213436455e-05, "loss": 0.2596, "step": 72 }, { "epoch": 0.1006896551724138, "grad_norm": 1.9978502988815308, "learning_rate": 4.514296613204532e-05, "loss": 0.1158, "step": 73 }, { "epoch": 0.10206896551724139, "grad_norm": 4.080682754516602, "learning_rate": 4.471270511491525e-05, "loss": 0.2223, "step": 74 }, { "epoch": 0.10344827586206896, "grad_norm": 2.488248586654663, "learning_rate": 4.427842179111221e-05, "loss": 0.0983, "step": 75 }, { "epoch": 0.10482758620689656, "grad_norm": 4.52821683883667, "learning_rate": 4.3840234889360634e-05, "loss": 0.1799, "step": 76 }, { "epoch": 0.10620689655172413, "grad_norm": 3.5614824295043945, "learning_rate": 4.33982642055842e-05, "loss": 0.19, "step": 77 }, { "epoch": 0.10758620689655173, "grad_norm": 3.7923061847686768, "learning_rate": 4.2952630570154785e-05, "loss": 0.1115, "step": 78 }, { "epoch": 0.10896551724137932, "grad_norm": 3.957282066345215, "learning_rate": 4.250345581485871e-05, "loss": 0.2632, "step": 79 }, { "epoch": 0.1103448275862069, "grad_norm": 3.155132532119751, "learning_rate": 4.205086273958909e-05, "loss": 0.1195, "step": 80 }, { "epoch": 0.11172413793103449, "grad_norm": 2.5200023651123047, "learning_rate": 4.1594975078773565e-05, "loss": 0.1794, "step": 81 }, { "epoch": 0.11310344827586206, "grad_norm": 1.611479640007019, "learning_rate": 4.113591746754662e-05, "loss": 0.0998, "step": 82 }, { "epoch": 0.11448275862068966, "grad_norm": 3.2705655097961426, "learning_rate": 4.06738154076755e-05, "loss": 0.2304, "step": 83 }, { "epoch": 0.11586206896551725, "grad_norm": 5.466758728027344, "learning_rate": 4.020879523324929e-05, "loss": 0.1846, "step": 84 }, { "epoch": 0.11724137931034483, "grad_norm": 3.167026996612549, "learning_rate": 3.974098407614051e-05, "loss": 0.1012, "step": 85 }, { "epoch": 0.11862068965517242, "grad_norm": 2.0375046730041504, "learning_rate": 3.927050983124842e-05, "loss": 0.1, "step": 86 }, { "epoch": 0.12, "grad_norm": 4.988479137420654, "learning_rate": 3.8797501121533946e-05, "loss": 0.2156, "step": 87 }, { "epoch": 0.12137931034482759, "grad_norm": 5.211511611938477, "learning_rate": 3.832208726285534e-05, "loss": 0.1878, "step": 88 }, { "epoch": 0.12275862068965518, "grad_norm": 2.6948442459106445, "learning_rate": 3.784439822861459e-05, "loss": 0.1897, "step": 89 }, { "epoch": 0.12413793103448276, "grad_norm": 2.251394033432007, "learning_rate": 3.7364564614223976e-05, "loss": 0.0533, "step": 90 }, { "epoch": 0.12551724137931033, "grad_norm": 7.236598491668701, "learning_rate": 3.688271760140255e-05, "loss": 0.2186, "step": 91 }, { "epoch": 0.12689655172413794, "grad_norm": 0.982333779335022, "learning_rate": 3.6398988922312406e-05, "loss": 0.032, "step": 92 }, { "epoch": 0.12827586206896552, "grad_norm": 2.438767433166504, "learning_rate": 3.591351082354441e-05, "loss": 0.042, "step": 93 }, { "epoch": 0.1296551724137931, "grad_norm": 4.39130163192749, "learning_rate": 3.54264160299633e-05, "loss": 0.0791, "step": 94 }, { "epoch": 0.1310344827586207, "grad_norm": 8.375447273254395, "learning_rate": 3.493783770842202e-05, "loss": 0.1072, "step": 95 }, { "epoch": 0.13241379310344828, "grad_norm": 0.44400498270988464, "learning_rate": 3.444790943135526e-05, "loss": 0.0148, "step": 96 }, { "epoch": 0.13379310344827586, "grad_norm": 3.429780960083008, "learning_rate": 3.3956765140262074e-05, "loss": 0.096, "step": 97 }, { "epoch": 0.13517241379310344, "grad_norm": 0.8855041265487671, "learning_rate": 3.346453910908759e-05, "loss": 0.0253, "step": 98 }, { "epoch": 0.13655172413793104, "grad_norm": 3.90129017829895, "learning_rate": 3.297136590751389e-05, "loss": 0.1212, "step": 99 }, { "epoch": 0.13793103448275862, "grad_norm": 2.5597951412200928, "learning_rate": 3.247738036416998e-05, "loss": 0.0491, "step": 100 }, { "epoch": 0.13793103448275862, "eval_loss": 0.056027818471193314, "eval_runtime": 26.2388, "eval_samples_per_second": 46.534, "eval_steps_per_second": 11.662, "step": 100 }, { "epoch": 0.1393103448275862, "grad_norm": 3.206486225128174, "learning_rate": 3.1982717529770985e-05, "loss": 2.3094, "step": 101 }, { "epoch": 0.1406896551724138, "grad_norm": 1.8436448574066162, "learning_rate": 3.148751264019667e-05, "loss": 0.4255, "step": 102 }, { "epoch": 0.14206896551724138, "grad_norm": 1.9917385578155518, "learning_rate": 3.099190107951924e-05, "loss": 0.3296, "step": 103 }, { "epoch": 0.14344827586206896, "grad_norm": 1.9825700521469116, "learning_rate": 3.049601834299076e-05, "loss": 0.3324, "step": 104 }, { "epoch": 0.14482758620689656, "grad_norm": 12.273948669433594, "learning_rate": 3e-05, "loss": 1.675, "step": 105 }, { "epoch": 0.14620689655172414, "grad_norm": 1.9092867374420166, "learning_rate": 2.9503981657009246e-05, "loss": 0.2856, "step": 106 }, { "epoch": 0.14758620689655172, "grad_norm": 1.991979956626892, "learning_rate": 2.9008098920480752e-05, "loss": 0.2462, "step": 107 }, { "epoch": 0.1489655172413793, "grad_norm": 2.3256442546844482, "learning_rate": 2.851248735980333e-05, "loss": 0.3043, "step": 108 }, { "epoch": 0.1503448275862069, "grad_norm": 1.5754172801971436, "learning_rate": 2.801728247022902e-05, "loss": 0.1755, "step": 109 }, { "epoch": 0.15172413793103448, "grad_norm": 2.5417425632476807, "learning_rate": 2.7522619635830034e-05, "loss": 0.178, "step": 110 }, { "epoch": 0.15310344827586206, "grad_norm": 2.442079782485962, "learning_rate": 2.702863409248612e-05, "loss": 0.2021, "step": 111 }, { "epoch": 0.15448275862068966, "grad_norm": 3.0164618492126465, "learning_rate": 2.6535460890912416e-05, "loss": 0.3017, "step": 112 }, { "epoch": 0.15586206896551724, "grad_norm": 1.9608697891235352, "learning_rate": 2.604323485973793e-05, "loss": 0.1605, "step": 113 }, { "epoch": 0.15724137931034482, "grad_norm": 2.4695162773132324, "learning_rate": 2.555209056864474e-05, "loss": 0.2018, "step": 114 }, { "epoch": 0.15862068965517243, "grad_norm": 2.8998382091522217, "learning_rate": 2.5062162291577978e-05, "loss": 0.1556, "step": 115 }, { "epoch": 0.16, "grad_norm": 2.075309991836548, "learning_rate": 2.4573583970036712e-05, "loss": 0.1801, "step": 116 }, { "epoch": 0.16137931034482758, "grad_norm": 1.1833781003952026, "learning_rate": 2.4086489176455595e-05, "loss": 0.0601, "step": 117 }, { "epoch": 0.16275862068965516, "grad_norm": 3.300199270248413, "learning_rate": 2.36010110776876e-05, "loss": 0.2278, "step": 118 }, { "epoch": 0.16413793103448276, "grad_norm": 2.723283290863037, "learning_rate": 2.3117282398597456e-05, "loss": 0.2208, "step": 119 }, { "epoch": 0.16551724137931034, "grad_norm": 1.7539467811584473, "learning_rate": 2.263543538577603e-05, "loss": 0.0827, "step": 120 }, { "epoch": 0.16689655172413792, "grad_norm": 1.8702696561813354, "learning_rate": 2.215560177138541e-05, "loss": 0.1367, "step": 121 }, { "epoch": 0.16827586206896553, "grad_norm": 2.4578003883361816, "learning_rate": 2.167791273714467e-05, "loss": 0.0948, "step": 122 }, { "epoch": 0.1696551724137931, "grad_norm": 9.225005149841309, "learning_rate": 2.1202498878466062e-05, "loss": 0.1515, "step": 123 }, { "epoch": 0.17103448275862068, "grad_norm": 3.836768865585327, "learning_rate": 2.072949016875158e-05, "loss": 0.137, "step": 124 }, { "epoch": 0.1724137931034483, "grad_norm": 6.218253135681152, "learning_rate": 2.0259015923859498e-05, "loss": 0.1587, "step": 125 }, { "epoch": 0.17379310344827587, "grad_norm": 3.252035140991211, "learning_rate": 1.979120476675071e-05, "loss": 0.0892, "step": 126 }, { "epoch": 0.17517241379310344, "grad_norm": 2.1638646125793457, "learning_rate": 1.9326184592324503e-05, "loss": 0.1058, "step": 127 }, { "epoch": 0.17655172413793102, "grad_norm": 3.323601007461548, "learning_rate": 1.8864082532453373e-05, "loss": 0.1896, "step": 128 }, { "epoch": 0.17793103448275863, "grad_norm": 2.099111318588257, "learning_rate": 1.840502492122644e-05, "loss": 0.1162, "step": 129 }, { "epoch": 0.1793103448275862, "grad_norm": 1.051591157913208, "learning_rate": 1.7949137260410924e-05, "loss": 0.0621, "step": 130 }, { "epoch": 0.18068965517241378, "grad_norm": 2.248985767364502, "learning_rate": 1.7496544185141295e-05, "loss": 0.081, "step": 131 }, { "epoch": 0.1820689655172414, "grad_norm": 1.8897676467895508, "learning_rate": 1.7047369429845216e-05, "loss": 0.0603, "step": 132 }, { "epoch": 0.18344827586206897, "grad_norm": 5.641902923583984, "learning_rate": 1.6601735794415806e-05, "loss": 0.219, "step": 133 }, { "epoch": 0.18482758620689654, "grad_norm": 4.376874923706055, "learning_rate": 1.615976511063937e-05, "loss": 0.1193, "step": 134 }, { "epoch": 0.18620689655172415, "grad_norm": 1.680614709854126, "learning_rate": 1.5721578208887793e-05, "loss": 0.0728, "step": 135 }, { "epoch": 0.18758620689655173, "grad_norm": 2.368492603302002, "learning_rate": 1.5287294885084766e-05, "loss": 0.0776, "step": 136 }, { "epoch": 0.1889655172413793, "grad_norm": 2.124453067779541, "learning_rate": 1.4857033867954697e-05, "loss": 0.0478, "step": 137 }, { "epoch": 0.19034482758620688, "grad_norm": 2.201846122741699, "learning_rate": 1.4430912786563554e-05, "loss": 0.077, "step": 138 }, { "epoch": 0.1917241379310345, "grad_norm": 4.287417888641357, "learning_rate": 1.4009048138160195e-05, "loss": 0.1849, "step": 139 }, { "epoch": 0.19310344827586207, "grad_norm": 7.006134510040283, "learning_rate": 1.3591555256327199e-05, "loss": 0.2051, "step": 140 }, { "epoch": 0.19448275862068964, "grad_norm": 2.679762363433838, "learning_rate": 1.3178548279449822e-05, "loss": 0.038, "step": 141 }, { "epoch": 0.19586206896551725, "grad_norm": 2.5773937702178955, "learning_rate": 1.2770140119511693e-05, "loss": 0.0315, "step": 142 }, { "epoch": 0.19724137931034483, "grad_norm": 5.307798385620117, "learning_rate": 1.2366442431225809e-05, "loss": 0.1446, "step": 143 }, { "epoch": 0.1986206896551724, "grad_norm": 1.9738516807556152, "learning_rate": 1.1967565581509248e-05, "loss": 0.0449, "step": 144 }, { "epoch": 0.2, "grad_norm": 4.396138668060303, "learning_rate": 1.1573618619309965e-05, "loss": 0.0507, "step": 145 }, { "epoch": 0.2013793103448276, "grad_norm": 2.1322338581085205, "learning_rate": 1.1184709245793889e-05, "loss": 0.0352, "step": 146 }, { "epoch": 0.20275862068965517, "grad_norm": 0.7680739164352417, "learning_rate": 1.0800943784900502e-05, "loss": 0.0123, "step": 147 }, { "epoch": 0.20413793103448277, "grad_norm": 4.772315502166748, "learning_rate": 1.042242715427494e-05, "loss": 0.1208, "step": 148 }, { "epoch": 0.20551724137931035, "grad_norm": 0.9251326322555542, "learning_rate": 1.004926283658455e-05, "loss": 0.0166, "step": 149 }, { "epoch": 0.20689655172413793, "grad_norm": 13.54332447052002, "learning_rate": 9.681552851227774e-06, "loss": 0.1272, "step": 150 }, { "epoch": 0.20689655172413793, "eval_loss": 0.03968534991145134, "eval_runtime": 26.3996, "eval_samples_per_second": 46.251, "eval_steps_per_second": 11.591, "step": 150 }, { "epoch": 0.2082758620689655, "grad_norm": 2.429658889770508, "learning_rate": 9.319397726443026e-06, "loss": 1.8887, "step": 151 }, { "epoch": 0.2096551724137931, "grad_norm": 1.5520846843719482, "learning_rate": 8.962896471825342e-06, "loss": 0.307, "step": 152 }, { "epoch": 0.2110344827586207, "grad_norm": 1.7691882848739624, "learning_rate": 8.61214655125809e-06, "loss": 0.28, "step": 153 }, { "epoch": 0.21241379310344827, "grad_norm": 2.1819674968719482, "learning_rate": 8.267243856267331e-06, "loss": 0.2835, "step": 154 }, { "epoch": 0.21379310344827587, "grad_norm": 7.068450450897217, "learning_rate": 7.928282679806052e-06, "loss": 1.096, "step": 155 }, { "epoch": 0.21517241379310345, "grad_norm": 1.7071144580841064, "learning_rate": 7.595355690475393e-06, "loss": 0.1968, "step": 156 }, { "epoch": 0.21655172413793103, "grad_norm": 2.26689076423645, "learning_rate": 7.268553907189964e-06, "loss": 0.1573, "step": 157 }, { "epoch": 0.21793103448275863, "grad_norm": 2.0882985591888428, "learning_rate": 6.947966674294236e-06, "loss": 0.2273, "step": 158 }, { "epoch": 0.2193103448275862, "grad_norm": 1.6875180006027222, "learning_rate": 6.6336816371366305e-06, "loss": 0.1827, "step": 159 }, { "epoch": 0.2206896551724138, "grad_norm": 2.291029214859009, "learning_rate": 6.325784718108196e-06, "loss": 0.1672, "step": 160 }, { "epoch": 0.22206896551724137, "grad_norm": 1.801910400390625, "learning_rate": 6.0243600931522595e-06, "loss": 0.146, "step": 161 }, { "epoch": 0.22344827586206897, "grad_norm": 2.1168177127838135, "learning_rate": 5.72949016875158e-06, "loss": 0.218, "step": 162 }, { "epoch": 0.22482758620689655, "grad_norm": 2.684204578399658, "learning_rate": 5.44125555939923e-06, "loss": 0.1932, "step": 163 }, { "epoch": 0.22620689655172413, "grad_norm": 2.362295150756836, "learning_rate": 5.159735065559399e-06, "loss": 0.1881, "step": 164 }, { "epoch": 0.22758620689655173, "grad_norm": 1.549054741859436, "learning_rate": 4.885005652124144e-06, "loss": 0.1069, "step": 165 }, { "epoch": 0.2289655172413793, "grad_norm": 3.44657301902771, "learning_rate": 4.617142427371934e-06, "loss": 0.1535, "step": 166 }, { "epoch": 0.2303448275862069, "grad_norm": 1.7551734447479248, "learning_rate": 4.3562186224338265e-06, "loss": 0.1201, "step": 167 }, { "epoch": 0.2317241379310345, "grad_norm": 1.292171835899353, "learning_rate": 4.102305571272783e-06, "loss": 0.0825, "step": 168 }, { "epoch": 0.23310344827586207, "grad_norm": 2.1873793601989746, "learning_rate": 3.855472691181678e-06, "loss": 0.1023, "step": 169 }, { "epoch": 0.23448275862068965, "grad_norm": 1.6460893154144287, "learning_rate": 3.615787463805331e-06, "loss": 0.1173, "step": 170 }, { "epoch": 0.23586206896551723, "grad_norm": 2.9875564575195312, "learning_rate": 3.383315416691646e-06, "loss": 0.2615, "step": 171 }, { "epoch": 0.23724137931034484, "grad_norm": 2.0621020793914795, "learning_rate": 3.158120105377096e-06, "loss": 0.0811, "step": 172 }, { "epoch": 0.2386206896551724, "grad_norm": 3.524592876434326, "learning_rate": 2.940263096011233e-06, "loss": 0.1396, "step": 173 }, { "epoch": 0.24, "grad_norm": 2.933391571044922, "learning_rate": 2.729803948525125e-06, "loss": 0.1669, "step": 174 }, { "epoch": 0.2413793103448276, "grad_norm": 1.03538179397583, "learning_rate": 2.526800200348275e-06, "loss": 0.0893, "step": 175 }, { "epoch": 0.24275862068965517, "grad_norm": 8.298017501831055, "learning_rate": 2.3313073506784575e-06, "loss": 0.1639, "step": 176 }, { "epoch": 0.24413793103448275, "grad_norm": 3.220946788787842, "learning_rate": 2.143378845308791e-06, "loss": 0.1279, "step": 177 }, { "epoch": 0.24551724137931036, "grad_norm": 2.422441244125366, "learning_rate": 1.9630660620161777e-06, "loss": 0.1243, "step": 178 }, { "epoch": 0.24689655172413794, "grad_norm": 3.2675623893737793, "learning_rate": 1.790418296515165e-06, "loss": 0.1033, "step": 179 }, { "epoch": 0.2482758620689655, "grad_norm": 1.4966555833816528, "learning_rate": 1.625482748980961e-06, "loss": 0.0431, "step": 180 }, { "epoch": 0.2496551724137931, "grad_norm": 2.3309779167175293, "learning_rate": 1.4683045111453942e-06, "loss": 0.0809, "step": 181 }, { "epoch": 0.25103448275862067, "grad_norm": 2.5142300128936768, "learning_rate": 1.3189265539692707e-06, "loss": 0.1171, "step": 182 }, { "epoch": 0.2524137931034483, "grad_norm": 1.699469804763794, "learning_rate": 1.1773897158945557e-06, "loss": 0.0726, "step": 183 }, { "epoch": 0.2537931034482759, "grad_norm": 1.9541224241256714, "learning_rate": 1.0437326916795432e-06, "loss": 0.0663, "step": 184 }, { "epoch": 0.25517241379310346, "grad_norm": 1.9770375490188599, "learning_rate": 9.179920218200888e-07, "loss": 0.053, "step": 185 }, { "epoch": 0.25655172413793104, "grad_norm": 4.276590347290039, "learning_rate": 8.002020825598277e-07, "loss": 0.2432, "step": 186 }, { "epoch": 0.2579310344827586, "grad_norm": 0.9335222840309143, "learning_rate": 6.90395076492022e-07, "loss": 0.0455, "step": 187 }, { "epoch": 0.2593103448275862, "grad_norm": 1.479485273361206, "learning_rate": 5.886010237557194e-07, "loss": 0.052, "step": 188 }, { "epoch": 0.26068965517241377, "grad_norm": 4.499995708465576, "learning_rate": 4.94847753828529e-07, "loss": 0.1523, "step": 189 }, { "epoch": 0.2620689655172414, "grad_norm": 1.5931576490402222, "learning_rate": 4.091608979183303e-07, "loss": 0.049, "step": 190 }, { "epoch": 0.263448275862069, "grad_norm": 3.1008856296539307, "learning_rate": 3.315638819559452e-07, "loss": 0.0821, "step": 191 }, { "epoch": 0.26482758620689656, "grad_norm": 0.6905267834663391, "learning_rate": 2.6207792019074414e-07, "loss": 0.0176, "step": 192 }, { "epoch": 0.26620689655172414, "grad_norm": 1.2080904245376587, "learning_rate": 2.0072200939085573e-07, "loss": 0.0839, "step": 193 }, { "epoch": 0.2675862068965517, "grad_norm": 3.8367063999176025, "learning_rate": 1.475129236496575e-07, "loss": 0.0677, "step": 194 }, { "epoch": 0.2689655172413793, "grad_norm": 6.552917003631592, "learning_rate": 1.0246520979990459e-07, "loss": 0.2188, "step": 195 }, { "epoch": 0.27034482758620687, "grad_norm": 6.491734981536865, "learning_rate": 6.559118343676396e-08, "loss": 0.1625, "step": 196 }, { "epoch": 0.2717241379310345, "grad_norm": 1.543799877166748, "learning_rate": 3.690092555085789e-08, "loss": 0.0266, "step": 197 }, { "epoch": 0.2731034482758621, "grad_norm": 4.993340015411377, "learning_rate": 1.640227977221853e-08, "loss": 0.1796, "step": 198 }, { "epoch": 0.27448275862068966, "grad_norm": 11.29518985748291, "learning_rate": 4.1008502259298755e-09, "loss": 0.1721, "step": 199 }, { "epoch": 0.27586206896551724, "grad_norm": 0.9339697957038879, "learning_rate": 0.0, "loss": 0.0189, "step": 200 }, { "epoch": 0.27586206896551724, "eval_loss": 0.04178377613425255, "eval_runtime": 26.3479, "eval_samples_per_second": 46.342, "eval_steps_per_second": 11.614, "step": 200 } ], "logging_steps": 1, "max_steps": 200, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 50, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 4, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 1 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 5.472576998473728e+16, "train_batch_size": 8, "trial_name": null, "trial_params": null }