{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 5.0,
  "eval_steps": 500,
  "global_step": 4450,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.02247191011235955,
      "grad_norm": 3.921875,
      "learning_rate": 4.49438202247191e-06,
      "loss": 2.2098,
      "step": 20
    },
    {
      "epoch": 0.0449438202247191,
      "grad_norm": 2.03125,
      "learning_rate": 8.98876404494382e-06,
      "loss": 2.1052,
      "step": 40
    },
    {
      "epoch": 0.06741573033707865,
      "grad_norm": 1.21875,
      "learning_rate": 1.348314606741573e-05,
      "loss": 2.1605,
      "step": 60
    },
    {
      "epoch": 0.0898876404494382,
      "grad_norm": 1.0234375,
      "learning_rate": 1.797752808988764e-05,
      "loss": 1.8331,
      "step": 80
    },
    {
      "epoch": 0.11235955056179775,
      "grad_norm": 0.61328125,
      "learning_rate": 2.2471910112359552e-05,
      "loss": 1.8186,
      "step": 100
    },
    {
      "epoch": 0.1348314606741573,
      "grad_norm": 0.703125,
      "learning_rate": 2.696629213483146e-05,
      "loss": 1.5533,
      "step": 120
    },
    {
      "epoch": 0.15730337078651685,
      "grad_norm": 0.4921875,
      "learning_rate": 3.1460674157303374e-05,
      "loss": 1.5419,
      "step": 140
    },
    {
      "epoch": 0.1797752808988764,
      "grad_norm": 0.470703125,
      "learning_rate": 3.595505617977528e-05,
      "loss": 1.3374,
      "step": 160
    },
    {
      "epoch": 0.20224719101123595,
      "grad_norm": 0.52734375,
      "learning_rate": 4.044943820224719e-05,
      "loss": 1.3418,
      "step": 180
    },
    {
      "epoch": 0.2247191011235955,
      "grad_norm": 0.59375,
      "learning_rate": 4.4943820224719104e-05,
      "loss": 1.3921,
      "step": 200
    },
    {
      "epoch": 0.24719101123595505,
      "grad_norm": 0.67578125,
      "learning_rate": 4.943820224719101e-05,
      "loss": 1.1401,
      "step": 220
    },
    {
      "epoch": 0.2696629213483146,
      "grad_norm": 0.72265625,
      "learning_rate": 5.393258426966292e-05,
      "loss": 1.2139,
      "step": 240
    },
    {
      "epoch": 0.29213483146067415,
      "grad_norm": 0.49609375,
      "learning_rate": 5.8426966292134835e-05,
      "loss": 1.171,
      "step": 260
    },
    {
      "epoch": 0.3146067415730337,
      "grad_norm": 0.94921875,
      "learning_rate": 6.292134831460675e-05,
      "loss": 1.1424,
      "step": 280
    },
    {
      "epoch": 0.33707865168539325,
      "grad_norm": 0.8046875,
      "learning_rate": 6.741573033707866e-05,
      "loss": 1.2171,
      "step": 300
    },
    {
      "epoch": 0.3595505617977528,
      "grad_norm": 1.09375,
      "learning_rate": 7.191011235955056e-05,
      "loss": 1.1575,
      "step": 320
    },
    {
      "epoch": 0.38202247191011235,
      "grad_norm": 1.4453125,
      "learning_rate": 7.640449438202247e-05,
      "loss": 1.2041,
      "step": 340
    },
    {
      "epoch": 0.4044943820224719,
      "grad_norm": 1.0,
      "learning_rate": 8.089887640449438e-05,
      "loss": 1.106,
      "step": 360
    },
    {
      "epoch": 0.42696629213483145,
      "grad_norm": 1.0390625,
      "learning_rate": 8.53932584269663e-05,
      "loss": 1.0728,
      "step": 380
    },
    {
      "epoch": 0.449438202247191,
      "grad_norm": 1.0859375,
      "learning_rate": 8.988764044943821e-05,
      "loss": 0.9622,
      "step": 400
    },
    {
      "epoch": 0.47191011235955055,
      "grad_norm": 0.99609375,
      "learning_rate": 9.438202247191012e-05,
      "loss": 1.0835,
      "step": 420
    },
    {
      "epoch": 0.4943820224719101,
      "grad_norm": 0.8828125,
      "learning_rate": 9.887640449438202e-05,
      "loss": 1.0557,
      "step": 440
    },
    {
      "epoch": 0.5168539325842697,
      "grad_norm": 1.0859375,
      "learning_rate": 0.00010337078651685395,
      "loss": 1.1037,
      "step": 460
    },
    {
      "epoch": 0.5393258426966292,
      "grad_norm": 1.3203125,
      "learning_rate": 0.00010786516853932584,
      "loss": 1.058,
      "step": 480
    },
    {
      "epoch": 0.5617977528089888,
      "grad_norm": 1.03125,
      "learning_rate": 0.00011235955056179777,
      "loss": 1.1904,
      "step": 500
    },
    {
      "epoch": 0.5617977528089888,
      "eval_loss": 1.0617437362670898,
      "eval_runtime": 206.3616,
      "eval_samples_per_second": 1.856,
      "eval_steps_per_second": 1.856,
      "step": 500
    },
    {
      "epoch": 0.5842696629213483,
      "grad_norm": 0.60546875,
      "learning_rate": 0.00011685393258426967,
      "loss": 1.1192,
      "step": 520
    },
    {
      "epoch": 0.6067415730337079,
      "grad_norm": 0.96484375,
      "learning_rate": 0.00012134831460674158,
      "loss": 1.0356,
      "step": 540
    },
    {
      "epoch": 0.6292134831460674,
      "grad_norm": 1.53125,
      "learning_rate": 0.0001258426966292135,
      "loss": 1.0108,
      "step": 560
    },
    {
      "epoch": 0.651685393258427,
      "grad_norm": 1.453125,
      "learning_rate": 0.0001303370786516854,
      "loss": 0.8835,
      "step": 580
    },
    {
      "epoch": 0.6741573033707865,
      "grad_norm": 0.84765625,
      "learning_rate": 0.00013483146067415732,
      "loss": 0.9925,
      "step": 600
    },
    {
      "epoch": 0.6966292134831461,
      "grad_norm": 1.0625,
      "learning_rate": 0.00013932584269662923,
      "loss": 0.9548,
      "step": 620
    },
    {
      "epoch": 0.7191011235955056,
      "grad_norm": 0.9375,
      "learning_rate": 0.00014382022471910112,
      "loss": 1.113,
      "step": 640
    },
    {
      "epoch": 0.7415730337078652,
      "grad_norm": 0.8125,
      "learning_rate": 0.00014831460674157306,
      "loss": 0.9504,
      "step": 660
    },
    {
      "epoch": 0.7640449438202247,
      "grad_norm": 0.6953125,
      "learning_rate": 0.00015280898876404494,
      "loss": 1.1266,
      "step": 680
    },
    {
      "epoch": 0.7865168539325843,
      "grad_norm": 1.2890625,
      "learning_rate": 0.00015730337078651685,
      "loss": 1.0707,
      "step": 700
    },
    {
      "epoch": 0.8089887640449438,
      "grad_norm": 0.88671875,
      "learning_rate": 0.00016179775280898877,
      "loss": 0.9222,
      "step": 720
    },
    {
      "epoch": 0.8314606741573034,
      "grad_norm": 0.79296875,
      "learning_rate": 0.00016629213483146068,
      "loss": 1.1053,
      "step": 740
    },
    {
      "epoch": 0.8539325842696629,
      "grad_norm": 0.83984375,
      "learning_rate": 0.0001707865168539326,
      "loss": 1.25,
      "step": 760
    },
    {
      "epoch": 0.8764044943820225,
      "grad_norm": 0.46875,
      "learning_rate": 0.0001752808988764045,
      "loss": 0.933,
      "step": 780
    },
    {
      "epoch": 0.898876404494382,
      "grad_norm": 0.66015625,
      "learning_rate": 0.00017977528089887642,
      "loss": 0.885,
      "step": 800
    },
    {
      "epoch": 0.9213483146067416,
      "grad_norm": 1.4140625,
      "learning_rate": 0.00018426966292134833,
      "loss": 0.9786,
      "step": 820
    },
    {
      "epoch": 0.9438202247191011,
      "grad_norm": 0.96875,
      "learning_rate": 0.00018876404494382024,
      "loss": 0.9101,
      "step": 840
    },
    {
      "epoch": 0.9662921348314607,
      "grad_norm": 0.80859375,
      "learning_rate": 0.00019325842696629215,
      "loss": 0.9005,
      "step": 860
    },
    {
      "epoch": 0.9887640449438202,
      "grad_norm": 0.99609375,
      "learning_rate": 0.00019775280898876404,
      "loss": 0.9485,
      "step": 880
    },
    {
      "epoch": 1.0112359550561798,
      "grad_norm": 0.9921875,
      "learning_rate": 0.00019999610626011892,
      "loss": 1.0071,
      "step": 900
    },
    {
      "epoch": 1.0337078651685394,
      "grad_norm": 0.84375,
      "learning_rate": 0.00019996495816039186,
      "loss": 0.9138,
      "step": 920
    },
    {
      "epoch": 1.0561797752808988,
      "grad_norm": 0.69140625,
      "learning_rate": 0.00019990267166335664,
      "loss": 0.7752,
      "step": 940
    },
    {
      "epoch": 1.0786516853932584,
      "grad_norm": 0.92578125,
      "learning_rate": 0.00019980926617082901,
      "loss": 0.9331,
      "step": 960
    },
    {
      "epoch": 1.101123595505618,
      "grad_norm": 1.046875,
      "learning_rate": 0.00019968477077797781,
      "loss": 1.0037,
      "step": 980
    },
    {
      "epoch": 1.1235955056179776,
      "grad_norm": 0.56640625,
      "learning_rate": 0.00019952922426426207,
      "loss": 0.765,
      "step": 1000
    },
    {
      "epoch": 1.1235955056179776,
      "eval_loss": 0.944207489490509,
      "eval_runtime": 206.3524,
      "eval_samples_per_second": 1.856,
      "eval_steps_per_second": 1.856,
      "step": 1000
    },
    {
      "epoch": 1.146067415730337,
      "grad_norm": 1.09375,
      "learning_rate": 0.00019934267508135164,
      "loss": 0.861,
      "step": 1020
    },
    {
      "epoch": 1.1685393258426966,
      "grad_norm": 1.25,
      "learning_rate": 0.00019912518133803465,
      "loss": 0.8251,
      "step": 1040
    },
    {
      "epoch": 1.1910112359550562,
      "grad_norm": 0.609375,
      "learning_rate": 0.00019887681078211707,
      "loss": 0.9779,
      "step": 1060
    },
    {
      "epoch": 1.2134831460674158,
      "grad_norm": 0.765625,
      "learning_rate": 0.00019859764077931978,
      "loss": 0.8112,
      "step": 1080
    },
    {
      "epoch": 1.2359550561797752,
      "grad_norm": 0.890625,
      "learning_rate": 0.00019828775828917964,
      "loss": 0.9084,
      "step": 1100
    },
    {
      "epoch": 1.2584269662921348,
      "grad_norm": 1.15625,
      "learning_rate": 0.00019794725983796218,
      "loss": 0.8429,
      "step": 1120
    },
    {
      "epoch": 1.2808988764044944,
      "grad_norm": 0.7890625,
      "learning_rate": 0.00019757625148859441,
      "loss": 0.8029,
      "step": 1140
    },
    {
      "epoch": 1.303370786516854,
      "grad_norm": 1.0078125,
      "learning_rate": 0.00019717484880762685,
      "loss": 0.9478,
      "step": 1160
    },
    {
      "epoch": 1.3258426966292136,
      "grad_norm": 0.9765625,
      "learning_rate": 0.00019674317682923532,
      "loss": 0.6985,
      "step": 1180
    },
    {
      "epoch": 1.348314606741573,
      "grad_norm": 0.9609375,
      "learning_rate": 0.00019628137001627383,
      "loss": 0.8653,
      "step": 1200
    },
    {
      "epoch": 1.3707865168539326,
      "grad_norm": 0.82421875,
      "learning_rate": 0.00019578957221839014,
      "loss": 0.891,
      "step": 1220
    },
    {
      "epoch": 1.3932584269662922,
      "grad_norm": 1.0390625,
      "learning_rate": 0.00019526793662721768,
      "loss": 0.861,
      "step": 1240
    },
    {
      "epoch": 1.4157303370786516,
      "grad_norm": 0.62890625,
      "learning_rate": 0.00019471662572865736,
      "loss": 0.7591,
      "step": 1260
    },
    {
      "epoch": 1.4382022471910112,
      "grad_norm": 0.8828125,
      "learning_rate": 0.00019413581125226438,
      "loss": 0.7109,
      "step": 1280
    },
    {
      "epoch": 1.4606741573033708,
      "grad_norm": 0.6875,
      "learning_rate": 0.00019352567411775565,
      "loss": 0.8947,
      "step": 1300
    },
    {
      "epoch": 1.4831460674157304,
      "grad_norm": 0.88671875,
      "learning_rate": 0.00019288640437865445,
      "loss": 0.8514,
      "step": 1320
    },
    {
      "epoch": 1.50561797752809,
      "grad_norm": 0.69140625,
      "learning_rate": 0.0001922182011630902,
      "loss": 0.7379,
      "step": 1340
    },
    {
      "epoch": 1.5280898876404494,
      "grad_norm": 0.63671875,
      "learning_rate": 0.00019152127261177126,
      "loss": 0.6778,
      "step": 1360
    },
    {
      "epoch": 1.550561797752809,
      "grad_norm": 0.65234375,
      "learning_rate": 0.00019079583581315076,
      "loss": 0.6591,
      "step": 1380
    },
    {
      "epoch": 1.5730337078651684,
      "grad_norm": 0.96484375,
      "learning_rate": 0.0001900421167358048,
      "loss": 0.8635,
      "step": 1400
    },
    {
      "epoch": 1.595505617977528,
      "grad_norm": 0.8125,
      "learning_rate": 0.00018926035015804488,
      "loss": 0.924,
      "step": 1420
    },
    {
      "epoch": 1.6179775280898876,
      "grad_norm": 1.1953125,
      "learning_rate": 0.00018845077959478613,
      "loss": 0.8554,
      "step": 1440
    },
    {
      "epoch": 1.6404494382022472,
      "grad_norm": 0.6171875,
      "learning_rate": 0.00018761365722169403,
      "loss": 0.9471,
      "step": 1460
    },
    {
      "epoch": 1.6629213483146068,
      "grad_norm": 0.64453125,
      "learning_rate": 0.00018674924379663338,
      "loss": 0.9187,
      "step": 1480
    },
    {
      "epoch": 1.6853932584269664,
      "grad_norm": 0.56640625,
      "learning_rate": 0.00018585780857844418,
      "loss": 0.782,
      "step": 1500
    },
    {
      "epoch": 1.6853932584269664,
      "eval_loss": 0.8689672350883484,
      "eval_runtime": 206.6921,
      "eval_samples_per_second": 1.853,
      "eval_steps_per_second": 1.853,
      "step": 1500
    },
    {
      "epoch": 1.7078651685393258,
      "grad_norm": 0.890625,
      "learning_rate": 0.00018493962924306912,
      "loss": 0.8983,
      "step": 1520
    },
    {
      "epoch": 1.7303370786516854,
      "grad_norm": 0.83984375,
      "learning_rate": 0.0001839949917970596,
      "loss": 0.5218,
      "step": 1540
    },
    {
      "epoch": 1.7528089887640448,
      "grad_norm": 0.921875,
      "learning_rate": 0.00018302419048848667,
      "loss": 0.6711,
      "step": 1560
    },
    {
      "epoch": 1.7752808988764044,
      "grad_norm": 0.7578125,
      "learning_rate": 0.0001820275277152846,
      "loss": 0.7932,
      "step": 1580
    },
    {
      "epoch": 1.797752808988764,
      "grad_norm": 0.97265625,
      "learning_rate": 0.00018100531393105623,
      "loss": 0.7181,
      "step": 1600
    },
    {
      "epoch": 1.8202247191011236,
      "grad_norm": 0.95703125,
      "learning_rate": 0.00017995786754836863,
      "loss": 0.8525,
      "step": 1620
    },
    {
      "epoch": 1.8426966292134832,
      "grad_norm": 0.6484375,
      "learning_rate": 0.00017888551483956987,
      "loss": 0.6968,
      "step": 1640
    },
    {
      "epoch": 1.8651685393258428,
      "grad_norm": 0.78515625,
      "learning_rate": 0.00017778858983515743,
      "loss": 0.902,
      "step": 1660
    },
    {
      "epoch": 1.8876404494382022,
      "grad_norm": 0.875,
      "learning_rate": 0.00017666743421972987,
      "loss": 0.954,
      "step": 1680
    },
    {
      "epoch": 1.9101123595505618,
      "grad_norm": 1.0546875,
      "learning_rate": 0.0001755223972255546,
      "loss": 0.791,
      "step": 1700
    },
    {
      "epoch": 1.9325842696629212,
      "grad_norm": 0.765625,
      "learning_rate": 0.00017435383552378428,
      "loss": 0.77,
      "step": 1720
    },
    {
      "epoch": 1.9550561797752808,
      "grad_norm": 1.25,
      "learning_rate": 0.0001731621131133564,
      "loss": 0.6294,
      "step": 1740
    },
    {
      "epoch": 1.9775280898876404,
      "grad_norm": 0.70703125,
      "learning_rate": 0.00017194760120760986,
      "loss": 0.7982,
      "step": 1760
    },
    {
      "epoch": 2.0,
      "grad_norm": 0.67578125,
      "learning_rate": 0.00017071067811865476,
      "loss": 0.7643,
      "step": 1780
    },
    {
      "epoch": 2.0224719101123596,
      "grad_norm": 1.0234375,
      "learning_rate": 0.0001694517291395307,
      "loss": 0.5279,
      "step": 1800
    },
    {
      "epoch": 2.044943820224719,
      "grad_norm": 1.078125,
      "learning_rate": 0.00016817114642419067,
      "loss": 0.6667,
      "step": 1820
    },
    {
      "epoch": 2.067415730337079,
      "grad_norm": 0.9375,
      "learning_rate": 0.00016686932886534781,
      "loss": 0.6427,
      "step": 1840
    },
    {
      "epoch": 2.0898876404494384,
      "grad_norm": 0.7890625,
      "learning_rate": 0.00016554668197022295,
      "loss": 0.633,
      "step": 1860
    },
    {
      "epoch": 2.1123595505617976,
      "grad_norm": 0.7421875,
      "learning_rate": 0.00016420361773423204,
      "loss": 0.5623,
      "step": 1880
    },
    {
      "epoch": 2.134831460674157,
      "grad_norm": 1.078125,
      "learning_rate": 0.00016284055451265246,
      "loss": 0.6311,
      "step": 1900
    },
    {
      "epoch": 2.157303370786517,
      "grad_norm": 1.25,
      "learning_rate": 0.00016145791689030795,
      "loss": 0.7469,
      "step": 1920
    },
    {
      "epoch": 2.1797752808988764,
      "grad_norm": 0.68359375,
      "learning_rate": 0.0001600561355493137,
      "loss": 0.7196,
      "step": 1940
    },
    {
      "epoch": 2.202247191011236,
      "grad_norm": 0.95703125,
      "learning_rate": 0.0001586356471349215,
      "loss": 0.6328,
      "step": 1960
    },
    {
      "epoch": 2.2247191011235956,
      "grad_norm": 0.9765625,
      "learning_rate": 0.00015719689411950808,
      "loss": 0.6349,
      "step": 1980
    },
    {
      "epoch": 2.247191011235955,
      "grad_norm": 1.140625,
      "learning_rate": 0.00015574032466474775,
      "loss": 0.5591,
      "step": 2000
    },
    {
      "epoch": 2.247191011235955,
      "eval_loss": 0.8647096753120422,
      "eval_runtime": 206.6878,
      "eval_samples_per_second": 1.853,
      "eval_steps_per_second": 1.853,
      "step": 2000
    },
    {
      "epoch": 2.2696629213483144,
      "grad_norm": 1.1328125,
      "learning_rate": 0.00015426639248201313,
      "loss": 0.5206,
      "step": 2020
    },
    {
      "epoch": 2.292134831460674,
      "grad_norm": 0.8125,
      "learning_rate": 0.0001527755566910474,
      "loss": 0.7186,
      "step": 2040
    },
    {
      "epoch": 2.3146067415730336,
      "grad_norm": 0.97265625,
      "learning_rate": 0.00015126828167695146,
      "loss": 0.6533,
      "step": 2060
    },
    {
      "epoch": 2.337078651685393,
      "grad_norm": 1.4609375,
      "learning_rate": 0.0001497450369455312,
      "loss": 0.6324,
      "step": 2080
    },
    {
      "epoch": 2.359550561797753,
      "grad_norm": 0.875,
      "learning_rate": 0.00014820629697704965,
      "loss": 0.5276,
      "step": 2100
    },
    {
      "epoch": 2.3820224719101124,
      "grad_norm": 0.74609375,
      "learning_rate": 0.00014665254107842964,
      "loss": 0.612,
      "step": 2120
    },
    {
      "epoch": 2.404494382022472,
      "grad_norm": 0.92578125,
      "learning_rate": 0.00014508425323395317,
      "loss": 0.614,
      "step": 2140
    },
    {
      "epoch": 2.4269662921348316,
      "grad_norm": 0.9921875,
      "learning_rate": 0.0001435019219545034,
      "loss": 0.4988,
      "step": 2160
    },
    {
      "epoch": 2.449438202247191,
      "grad_norm": 0.8515625,
      "learning_rate": 0.00014190604012539684,
      "loss": 0.6777,
      "step": 2180
    },
    {
      "epoch": 2.4719101123595504,
      "grad_norm": 0.96875,
      "learning_rate": 0.00014029710485285324,
      "loss": 0.662,
      "step": 2200
    },
    {
      "epoch": 2.49438202247191,
      "grad_norm": 0.74609375,
      "learning_rate": 0.00013867561730915016,
      "loss": 0.6087,
      "step": 2220
    },
    {
      "epoch": 2.5168539325842696,
      "grad_norm": 0.875,
      "learning_rate": 0.0001370420825765114,
      "loss": 0.56,
      "step": 2240
    },
    {
      "epoch": 2.539325842696629,
      "grad_norm": 1.2421875,
      "learning_rate": 0.00013539700948977717,
      "loss": 0.572,
      "step": 2260
    },
    {
      "epoch": 2.561797752808989,
      "grad_norm": 1.421875,
      "learning_rate": 0.00013374091047790585,
      "loss": 0.7334,
      "step": 2280
    },
    {
      "epoch": 2.5842696629213484,
      "grad_norm": 1.3125,
      "learning_rate": 0.00013207430140435556,
      "loss": 0.5377,
      "step": 2300
    },
    {
      "epoch": 2.606741573033708,
      "grad_norm": 0.86328125,
      "learning_rate": 0.00013039770140639654,
      "loss": 0.6306,
      "step": 2320
    },
    {
      "epoch": 2.629213483146067,
      "grad_norm": 1.3046875,
      "learning_rate": 0.00012871163273340307,
      "loss": 0.582,
      "step": 2340
    },
    {
      "epoch": 2.6516853932584272,
      "grad_norm": 0.83203125,
      "learning_rate": 0.00012701662058417688,
      "loss": 0.6326,
      "step": 2360
    },
    {
      "epoch": 2.6741573033707864,
      "grad_norm": 1.1640625,
      "learning_rate": 0.00012531319294335086,
      "loss": 0.6907,
      "step": 2380
    },
    {
      "epoch": 2.696629213483146,
      "grad_norm": 1.2734375,
      "learning_rate": 0.00012360188041692582,
      "loss": 0.656,
      "step": 2400
    },
    {
      "epoch": 2.7191011235955056,
      "grad_norm": 1.234375,
      "learning_rate": 0.00012188321606699016,
      "loss": 0.5817,
      "step": 2420
    },
    {
      "epoch": 2.741573033707865,
      "grad_norm": 0.97265625,
      "learning_rate": 0.00012015773524567479,
      "loss": 0.5046,
      "step": 2440
    },
    {
      "epoch": 2.764044943820225,
      "grad_norm": 1.3515625,
      "learning_rate": 0.00011842597542839462,
      "loss": 0.6293,
      "step": 2460
    },
    {
      "epoch": 2.7865168539325844,
      "grad_norm": 1.1796875,
      "learning_rate": 0.00011668847604642861,
      "loss": 0.6067,
      "step": 2480
    },
    {
      "epoch": 2.808988764044944,
      "grad_norm": 1.1484375,
      "learning_rate": 0.00011494577831889067,
      "loss": 0.5669,
      "step": 2500
    },
    {
      "epoch": 2.808988764044944,
      "eval_loss": 0.8295581340789795,
      "eval_runtime": 206.704,
      "eval_samples_per_second": 1.853,
      "eval_steps_per_second": 1.853,
      "step": 2500
    },
    {
      "epoch": 2.831460674157303,
      "grad_norm": 1.1484375,
      "learning_rate": 0.00011319842508414365,
      "loss": 0.5429,
      "step": 2520
    },
    {
      "epoch": 2.853932584269663,
      "grad_norm": 1.03125,
      "learning_rate": 0.00011144696063070883,
      "loss": 0.5481,
      "step": 2540
    },
    {
      "epoch": 2.8764044943820224,
      "grad_norm": 0.78515625,
      "learning_rate": 0.00010969193052772396,
      "loss": 0.5308,
      "step": 2560
    },
    {
      "epoch": 2.898876404494382,
      "grad_norm": 0.703125,
      "learning_rate": 0.00010793388145500198,
      "loss": 0.4527,
      "step": 2580
    },
    {
      "epoch": 2.9213483146067416,
      "grad_norm": 1.390625,
      "learning_rate": 0.00010617336103274424,
      "loss": 0.5333,
      "step": 2600
    },
    {
      "epoch": 2.943820224719101,
      "grad_norm": 0.9296875,
      "learning_rate": 0.00010441091765096047,
      "loss": 0.5886,
      "step": 2620
    },
    {
      "epoch": 2.966292134831461,
      "grad_norm": 1.296875,
      "learning_rate": 0.0001026471002986491,
      "loss": 0.626,
      "step": 2640
    },
    {
      "epoch": 2.98876404494382,
      "grad_norm": 0.953125,
      "learning_rate": 0.00010088245839279082,
      "loss": 0.6703,
      "step": 2660
    },
    {
      "epoch": 3.0112359550561796,
      "grad_norm": 0.64453125,
      "learning_rate": 9.911754160720923e-05,
      "loss": 0.4819,
      "step": 2680
    },
    {
      "epoch": 3.033707865168539,
      "grad_norm": 0.71875,
      "learning_rate": 9.735289970135095e-05,
      "loss": 0.4379,
      "step": 2700
    },
    {
      "epoch": 3.056179775280899,
      "grad_norm": 1.7734375,
      "learning_rate": 9.558908234903954e-05,
      "loss": 0.3811,
      "step": 2720
    },
    {
      "epoch": 3.0786516853932584,
      "grad_norm": 0.6953125,
      "learning_rate": 9.382663896725578e-05,
      "loss": 0.3855,
      "step": 2740
    },
    {
      "epoch": 3.101123595505618,
      "grad_norm": 1.4140625,
      "learning_rate": 9.206611854499805e-05,
      "loss": 0.4749,
      "step": 2760
    },
    {
      "epoch": 3.1235955056179776,
      "grad_norm": 0.609375,
      "learning_rate": 9.030806947227607e-05,
      "loss": 0.501,
      "step": 2780
    },
    {
      "epoch": 3.146067415730337,
      "grad_norm": 1.1015625,
      "learning_rate": 8.855303936929117e-05,
      "loss": 0.4239,
      "step": 2800
    },
    {
      "epoch": 3.168539325842697,
      "grad_norm": 0.84375,
      "learning_rate": 8.680157491585636e-05,
      "loss": 0.5388,
      "step": 2820
    },
    {
      "epoch": 3.191011235955056,
      "grad_norm": 1.2578125,
      "learning_rate": 8.505422168110934e-05,
      "loss": 0.3715,
      "step": 2840
    },
    {
      "epoch": 3.2134831460674156,
      "grad_norm": 0.94921875,
      "learning_rate": 8.331152395357141e-05,
      "loss": 0.4274,
      "step": 2860
    },
    {
      "epoch": 3.235955056179775,
      "grad_norm": 1.046875,
      "learning_rate": 8.157402457160539e-05,
      "loss": 0.4368,
      "step": 2880
    },
    {
      "epoch": 3.258426966292135,
      "grad_norm": 1.2421875,
      "learning_rate": 7.984226475432522e-05,
      "loss": 0.4026,
      "step": 2900
    },
    {
      "epoch": 3.2808988764044944,
      "grad_norm": 0.8359375,
      "learning_rate": 7.811678393300987e-05,
      "loss": 0.3971,
      "step": 2920
    },
    {
      "epoch": 3.303370786516854,
      "grad_norm": 0.94921875,
      "learning_rate": 7.63981195830742e-05,
      "loss": 0.395,
      "step": 2940
    },
    {
      "epoch": 3.3258426966292136,
      "grad_norm": 1.328125,
      "learning_rate": 7.468680705664914e-05,
      "loss": 0.4165,
      "step": 2960
    },
    {
      "epoch": 3.348314606741573,
      "grad_norm": 0.88671875,
      "learning_rate": 7.298337941582314e-05,
      "loss": 0.4071,
      "step": 2980
    },
    {
      "epoch": 3.370786516853933,
      "grad_norm": 1.4296875,
      "learning_rate": 7.128836726659696e-05,
      "loss": 0.4205,
      "step": 3000
    },
    {
      "epoch": 3.370786516853933,
      "eval_loss": 0.8820343613624573,
      "eval_runtime": 206.8369,
      "eval_samples_per_second": 1.852,
      "eval_steps_per_second": 1.852,
      "step": 3000
    },
    {
      "epoch": 3.393258426966292,
      "grad_norm": 1.46875,
      "learning_rate": 6.960229859360353e-05,
      "loss": 0.3759,
      "step": 3020
    },
    {
      "epoch": 3.4157303370786516,
      "grad_norm": 1.2265625,
      "learning_rate": 6.792569859564445e-05,
      "loss": 0.4457,
      "step": 3040
    },
    {
      "epoch": 3.438202247191011,
      "grad_norm": 1.5390625,
      "learning_rate": 6.625908952209418e-05,
      "loss": 0.4088,
      "step": 3060
    },
    {
      "epoch": 3.460674157303371,
      "grad_norm": 0.92578125,
      "learning_rate": 6.460299051022285e-05,
      "loss": 0.4221,
      "step": 3080
    },
    {
      "epoch": 3.4831460674157304,
      "grad_norm": 1.34375,
      "learning_rate": 6.295791742348865e-05,
      "loss": 0.4304,
      "step": 3100
    },
    {
      "epoch": 3.50561797752809,
      "grad_norm": 1.0,
      "learning_rate": 6.132438269084985e-05,
      "loss": 0.3612,
      "step": 3120
    },
    {
      "epoch": 3.5280898876404496,
      "grad_norm": 1.4609375,
      "learning_rate": 5.970289514714677e-05,
      "loss": 0.4692,
      "step": 3140
    },
    {
      "epoch": 3.550561797752809,
      "grad_norm": 1.578125,
      "learning_rate": 5.8093959874603176e-05,
      "loss": 0.4579,
      "step": 3160
    },
    {
      "epoch": 3.5730337078651684,
      "grad_norm": 0.66796875,
      "learning_rate": 5.649807804549663e-05,
      "loss": 0.3754,
      "step": 3180
    },
    {
      "epoch": 3.595505617977528,
      "grad_norm": 1.1328125,
      "learning_rate": 5.491574676604682e-05,
      "loss": 0.3685,
      "step": 3200
    },
    {
      "epoch": 3.6179775280898876,
      "grad_norm": 1.09375,
      "learning_rate": 5.334745892157035e-05,
      "loss": 0.3809,
      "step": 3220
    },
    {
      "epoch": 3.640449438202247,
      "grad_norm": 1.4453125,
      "learning_rate": 5.179370302295037e-05,
      "loss": 0.4809,
      "step": 3240
    },
    {
      "epoch": 3.662921348314607,
      "grad_norm": 1.015625,
      "learning_rate": 5.02549630544688e-05,
      "loss": 0.3798,
      "step": 3260
    },
    {
      "epoch": 3.6853932584269664,
      "grad_norm": 1.1796875,
      "learning_rate": 4.8731718323048516e-05,
      "loss": 0.4153,
      "step": 3280
    },
    {
      "epoch": 3.7078651685393256,
      "grad_norm": 0.98046875,
      "learning_rate": 4.722444330895256e-05,
      "loss": 0.4612,
      "step": 3300
    },
    {
      "epoch": 3.7303370786516856,
      "grad_norm": 1.4921875,
      "learning_rate": 4.573360751798689e-05,
      "loss": 0.469,
      "step": 3320
    },
    {
      "epoch": 3.752808988764045,
      "grad_norm": 1.8203125,
      "learning_rate": 4.425967533525229e-05,
      "loss": 0.4523,
      "step": 3340
    },
    {
      "epoch": 3.7752808988764044,
      "grad_norm": 1.359375,
      "learning_rate": 4.2803105880491925e-05,
      "loss": 0.4214,
      "step": 3360
    },
    {
      "epoch": 3.797752808988764,
      "grad_norm": 1.734375,
      "learning_rate": 4.136435286507849e-05,
      "loss": 0.4981,
      "step": 3380
    },
    {
      "epoch": 3.8202247191011236,
      "grad_norm": 1.3515625,
      "learning_rate": 3.994386445068632e-05,
      "loss": 0.4029,
      "step": 3400
    },
    {
      "epoch": 3.842696629213483,
      "grad_norm": 1.4609375,
      "learning_rate": 3.854208310969204e-05,
      "loss": 0.3747,
      "step": 3420
    },
    {
      "epoch": 3.865168539325843,
      "grad_norm": 1.1015625,
      "learning_rate": 3.715944548734755e-05,
      "loss": 0.4113,
      "step": 3440
    },
    {
      "epoch": 3.8876404494382024,
      "grad_norm": 0.52734375,
      "learning_rate": 3.5796382265767937e-05,
      "loss": 0.3896,
      "step": 3460
    },
    {
      "epoch": 3.9101123595505616,
      "grad_norm": 1.0625,
      "learning_rate": 3.445331802977709e-05,
      "loss": 0.4709,
      "step": 3480
    },
    {
      "epoch": 3.932584269662921,
      "grad_norm": 1.4296875,
      "learning_rate": 3.313067113465222e-05,
      "loss": 0.3812,
      "step": 3500
    },
    {
      "epoch": 3.932584269662921,
      "eval_loss": 0.8859002590179443,
      "eval_runtime": 206.8365,
      "eval_samples_per_second": 1.852,
      "eval_steps_per_second": 1.852,
      "step": 3500
    },
    {
      "epoch": 3.955056179775281,
      "grad_norm": 1.09375,
      "learning_rate": 3.182885357580934e-05,
      "loss": 0.3906,
      "step": 3520
    },
    {
      "epoch": 3.9775280898876404,
      "grad_norm": 1.0390625,
      "learning_rate": 3.054827086046931e-05,
      "loss": 0.3987,
      "step": 3540
    },
    {
      "epoch": 4.0,
      "grad_norm": 0.671875,
      "learning_rate": 2.9289321881345254e-05,
      "loss": 0.3499,
      "step": 3560
    },
    {
      "epoch": 4.022471910112359,
      "grad_norm": 0.79296875,
      "learning_rate": 2.8052398792390154e-05,
      "loss": 0.3292,
      "step": 3580
    },
    {
      "epoch": 4.044943820224719,
      "grad_norm": 0.8125,
      "learning_rate": 2.6837886886643614e-05,
      "loss": 0.3343,
      "step": 3600
    },
    {
      "epoch": 4.067415730337078,
      "grad_norm": 0.66015625,
      "learning_rate": 2.5646164476215716e-05,
      "loss": 0.3236,
      "step": 3620
    },
    {
      "epoch": 4.089887640449438,
      "grad_norm": 1.25,
      "learning_rate": 2.447760277444543e-05,
      "loss": 0.2892,
      "step": 3640
    },
    {
      "epoch": 4.112359550561798,
      "grad_norm": 1.3515625,
      "learning_rate": 2.3332565780270165e-05,
      "loss": 0.2801,
      "step": 3660
    },
    {
      "epoch": 4.134831460674158,
      "grad_norm": 1.3671875,
      "learning_rate": 2.2211410164842606e-05,
      "loss": 0.3082,
      "step": 3680
    },
    {
      "epoch": 4.157303370786517,
      "grad_norm": 1.46875,
      "learning_rate": 2.1114485160430132e-05,
      "loss": 0.3128,
      "step": 3700
    },
    {
      "epoch": 4.179775280898877,
      "grad_norm": 0.8515625,
      "learning_rate": 2.0042132451631378e-05,
      "loss": 0.3846,
      "step": 3720
    },
    {
      "epoch": 4.202247191011236,
      "grad_norm": 1.1328125,
      "learning_rate": 1.899468606894379e-05,
      "loss": 0.2718,
      "step": 3740
    },
    {
      "epoch": 4.224719101123595,
      "grad_norm": 0.80859375,
      "learning_rate": 1.7972472284715415e-05,
      "loss": 0.302,
      "step": 3760
    },
    {
      "epoch": 4.247191011235955,
      "grad_norm": 1.6953125,
      "learning_rate": 1.6975809511513353e-05,
      "loss": 0.3785,
      "step": 3780
    },
    {
      "epoch": 4.269662921348314,
      "grad_norm": 1.0078125,
      "learning_rate": 1.600500820294041e-05,
      "loss": 0.3845,
      "step": 3800
    },
    {
      "epoch": 4.292134831460674,
      "grad_norm": 1.6796875,
      "learning_rate": 1.5060370756930919e-05,
      "loss": 0.327,
      "step": 3820
    },
    {
      "epoch": 4.314606741573034,
      "grad_norm": 1.359375,
      "learning_rate": 1.414219142155585e-05,
      "loss": 0.3589,
      "step": 3840
    },
    {
      "epoch": 4.337078651685394,
      "grad_norm": 0.75,
      "learning_rate": 1.3250756203366632e-05,
      "loss": 0.4057,
      "step": 3860
    },
    {
      "epoch": 4.359550561797753,
      "grad_norm": 0.96484375,
      "learning_rate": 1.2386342778305993e-05,
      "loss": 0.3862,
      "step": 3880
    },
    {
      "epoch": 4.382022471910112,
      "grad_norm": 1.4921875,
      "learning_rate": 1.1549220405213878e-05,
      "loss": 0.3319,
      "step": 3900
    },
    {
      "epoch": 4.404494382022472,
      "grad_norm": 1.2890625,
      "learning_rate": 1.0739649841955136e-05,
      "loss": 0.2832,
      "step": 3920
    },
    {
      "epoch": 4.426966292134831,
      "grad_norm": 1.234375,
      "learning_rate": 9.957883264195223e-06,
      "loss": 0.2732,
      "step": 3940
    },
    {
      "epoch": 4.449438202247191,
      "grad_norm": 0.828125,
      "learning_rate": 9.20416418684924e-06,
      "loss": 0.2587,
      "step": 3960
    },
    {
      "epoch": 4.47191011235955,
      "grad_norm": 1.5234375,
      "learning_rate": 8.478727388228735e-06,
      "loss": 0.3469,
      "step": 3980
    },
    {
      "epoch": 4.49438202247191,
      "grad_norm": 0.90234375,
      "learning_rate": 7.781798836909826e-06,
      "loss": 0.3323,
      "step": 4000
    },
    {
      "epoch": 4.49438202247191,
      "eval_loss": 0.9360187649726868,
      "eval_runtime": 206.9644,
      "eval_samples_per_second": 1.851,
      "eval_steps_per_second": 1.851,
      "step": 4000
    },
    {
      "epoch": 4.51685393258427,
      "grad_norm": 2.046875,
      "learning_rate": 7.11359562134557e-06,
      "loss": 0.3441,
      "step": 4020
    },
    {
      "epoch": 4.539325842696629,
      "grad_norm": 1.296875,
      "learning_rate": 6.4743258822443695e-06,
      "loss": 0.3196,
      "step": 4040
    },
    {
      "epoch": 4.561797752808989,
      "grad_norm": 1.515625,
      "learning_rate": 5.8641887477356215e-06,
      "loss": 0.3226,
      "step": 4060
    },
    {
      "epoch": 4.584269662921348,
      "grad_norm": 1.6875,
      "learning_rate": 5.283374271342645e-06,
      "loss": 0.2859,
      "step": 4080
    },
    {
      "epoch": 4.606741573033708,
      "grad_norm": 0.87109375,
      "learning_rate": 4.732063372782336e-06,
      "loss": 0.3164,
      "step": 4100
    },
    {
      "epoch": 4.629213483146067,
      "grad_norm": 1.2734375,
      "learning_rate": 4.210427781609861e-06,
      "loss": 0.3275,
      "step": 4120
    },
    {
      "epoch": 4.651685393258427,
      "grad_norm": 1.25,
      "learning_rate": 3.718629983726185e-06,
      "loss": 0.367,
      "step": 4140
    },
    {
      "epoch": 4.674157303370786,
      "grad_norm": 1.6796875,
      "learning_rate": 3.256823170764689e-06,
      "loss": 0.3445,
      "step": 4160
    },
    {
      "epoch": 4.696629213483146,
      "grad_norm": 0.9453125,
      "learning_rate": 2.8251511923731655e-06,
      "loss": 0.4628,
      "step": 4180
    },
    {
      "epoch": 4.719101123595506,
      "grad_norm": 0.8515625,
      "learning_rate": 2.423748511405577e-06,
      "loss": 0.3252,
      "step": 4200
    },
    {
      "epoch": 4.741573033707866,
      "grad_norm": 0.90234375,
      "learning_rate": 2.052740162037814e-06,
      "loss": 0.3783,
      "step": 4220
    },
    {
      "epoch": 4.764044943820225,
      "grad_norm": 0.7421875,
      "learning_rate": 1.7122417108203726e-06,
      "loss": 0.294,
      "step": 4240
    },
    {
      "epoch": 4.786516853932584,
      "grad_norm": 0.75390625,
      "learning_rate": 1.4023592206802382e-06,
      "loss": 0.3194,
      "step": 4260
    },
    {
      "epoch": 4.808988764044944,
      "grad_norm": 1.375,
      "learning_rate": 1.1231892178829472e-06,
      "loss": 0.3145,
      "step": 4280
    },
    {
      "epoch": 4.831460674157303,
      "grad_norm": 1.4140625,
      "learning_rate": 8.74818661965382e-07,
      "loss": 0.3372,
      "step": 4300
    },
    {
      "epoch": 4.853932584269663,
      "grad_norm": 1.6640625,
      "learning_rate": 6.573249186483721e-07,
      "loss": 0.2791,
      "step": 4320
    },
    {
      "epoch": 4.876404494382022,
      "grad_norm": 0.8359375,
      "learning_rate": 4.707757357379383e-07,
      "loss": 0.2428,
      "step": 4340
    },
    {
      "epoch": 4.898876404494382,
      "grad_norm": 1.3671875,
      "learning_rate": 3.152292220222064e-07,
      "loss": 0.3225,
      "step": 4360
    },
    {
      "epoch": 4.921348314606742,
      "grad_norm": 1.0625,
      "learning_rate": 1.9073382917097483e-07,
      "loss": 0.3164,
      "step": 4380
    },
    {
      "epoch": 4.943820224719101,
      "grad_norm": 1.0546875,
      "learning_rate": 9.732833664334307e-08,
      "loss": 0.3571,
      "step": 4400
    },
    {
      "epoch": 4.966292134831461,
      "grad_norm": 0.87890625,
      "learning_rate": 3.5041839608151996e-08,
      "loss": 0.3002,
      "step": 4420
    },
    {
      "epoch": 4.98876404494382,
      "grad_norm": 0.6640625,
      "learning_rate": 3.893739881088987e-09,
      "loss": 0.366,
      "step": 4440
    },
    {
      "epoch": 5.0,
      "step": 4450,
      "total_flos": 1.024663401529344e+17,
      "train_loss": 0.6764815047617708,
      "train_runtime": 9686.8536,
      "train_samples_per_second": 0.459,
      "train_steps_per_second": 0.459
    }
  ],
  "logging_steps": 20,
  "max_steps": 4450,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 5,
  "save_steps": 100,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 1.024663401529344e+17,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}