|
{
|
|
"best_metric": null,
|
|
"best_model_checkpoint": null,
|
|
"epoch": 0.9999172801720573,
|
|
"eval_steps": 2300,
|
|
"global_step": 7555,
|
|
"is_hyper_param_search": false,
|
|
"is_local_process_zero": true,
|
|
"is_world_process_zero": true,
|
|
"log_history": [
|
|
{
|
|
"epoch": 0.0,
|
|
"grad_norm": 9.696115493774414,
|
|
"learning_rate": 4.347826086956522e-08,
|
|
"loss": 1.6976,
|
|
"step": 10
|
|
},
|
|
{
|
|
"epoch": 0.0,
|
|
"grad_norm": 9.331488609313965,
|
|
"learning_rate": 8.695652173913044e-08,
|
|
"loss": 1.711,
|
|
"step": 20
|
|
},
|
|
{
|
|
"epoch": 0.0,
|
|
"grad_norm": 9.760047912597656,
|
|
"learning_rate": 1.3043478260869566e-07,
|
|
"loss": 1.7084,
|
|
"step": 30
|
|
},
|
|
{
|
|
"epoch": 0.01,
|
|
"grad_norm": 8.302080154418945,
|
|
"learning_rate": 1.7391304347826088e-07,
|
|
"loss": 1.6865,
|
|
"step": 40
|
|
},
|
|
{
|
|
"epoch": 0.01,
|
|
"grad_norm": 7.753058433532715,
|
|
"learning_rate": 2.173913043478261e-07,
|
|
"loss": 1.6633,
|
|
"step": 50
|
|
},
|
|
{
|
|
"epoch": 0.01,
|
|
"grad_norm": 7.2695817947387695,
|
|
"learning_rate": 2.608695652173913e-07,
|
|
"loss": 1.654,
|
|
"step": 60
|
|
},
|
|
{
|
|
"epoch": 0.01,
|
|
"grad_norm": 6.353436470031738,
|
|
"learning_rate": 3.0434782608695656e-07,
|
|
"loss": 1.6029,
|
|
"step": 70
|
|
},
|
|
{
|
|
"epoch": 0.01,
|
|
"grad_norm": 5.489682674407959,
|
|
"learning_rate": 3.4782608695652175e-07,
|
|
"loss": 1.5619,
|
|
"step": 80
|
|
},
|
|
{
|
|
"epoch": 0.01,
|
|
"grad_norm": 5.3502631187438965,
|
|
"learning_rate": 3.91304347826087e-07,
|
|
"loss": 1.522,
|
|
"step": 90
|
|
},
|
|
{
|
|
"epoch": 0.01,
|
|
"grad_norm": 5.116358280181885,
|
|
"learning_rate": 4.347826086956522e-07,
|
|
"loss": 1.4462,
|
|
"step": 100
|
|
},
|
|
{
|
|
"epoch": 0.01,
|
|
"grad_norm": 4.4354634284973145,
|
|
"learning_rate": 4.782608695652174e-07,
|
|
"loss": 1.4002,
|
|
"step": 110
|
|
},
|
|
{
|
|
"epoch": 0.02,
|
|
"grad_norm": 5.6585693359375,
|
|
"learning_rate": 5.217391304347826e-07,
|
|
"loss": 1.3227,
|
|
"step": 120
|
|
},
|
|
{
|
|
"epoch": 0.02,
|
|
"grad_norm": 4.757506847381592,
|
|
"learning_rate": 5.652173913043478e-07,
|
|
"loss": 1.2458,
|
|
"step": 130
|
|
},
|
|
{
|
|
"epoch": 0.02,
|
|
"grad_norm": 4.914801120758057,
|
|
"learning_rate": 6.086956521739131e-07,
|
|
"loss": 1.2131,
|
|
"step": 140
|
|
},
|
|
{
|
|
"epoch": 0.02,
|
|
"grad_norm": 9.453673362731934,
|
|
"learning_rate": 6.521739130434783e-07,
|
|
"loss": 1.15,
|
|
"step": 150
|
|
},
|
|
{
|
|
"epoch": 0.02,
|
|
"grad_norm": 6.355442047119141,
|
|
"learning_rate": 6.956521739130435e-07,
|
|
"loss": 1.0977,
|
|
"step": 160
|
|
},
|
|
{
|
|
"epoch": 0.02,
|
|
"grad_norm": 6.3270087242126465,
|
|
"learning_rate": 7.391304347826088e-07,
|
|
"loss": 1.0666,
|
|
"step": 170
|
|
},
|
|
{
|
|
"epoch": 0.02,
|
|
"grad_norm": 6.119747638702393,
|
|
"learning_rate": 7.82608695652174e-07,
|
|
"loss": 1.044,
|
|
"step": 180
|
|
},
|
|
{
|
|
"epoch": 0.03,
|
|
"grad_norm": 8.558585166931152,
|
|
"learning_rate": 8.260869565217392e-07,
|
|
"loss": 1.0215,
|
|
"step": 190
|
|
},
|
|
{
|
|
"epoch": 0.03,
|
|
"grad_norm": 5.8033766746521,
|
|
"learning_rate": 8.695652173913044e-07,
|
|
"loss": 0.996,
|
|
"step": 200
|
|
},
|
|
{
|
|
"epoch": 0.03,
|
|
"grad_norm": 6.511552810668945,
|
|
"learning_rate": 9.130434782608697e-07,
|
|
"loss": 0.9866,
|
|
"step": 210
|
|
},
|
|
{
|
|
"epoch": 0.03,
|
|
"grad_norm": 7.597463607788086,
|
|
"learning_rate": 9.565217391304349e-07,
|
|
"loss": 0.9724,
|
|
"step": 220
|
|
},
|
|
{
|
|
"epoch": 0.03,
|
|
"grad_norm": 6.662442207336426,
|
|
"learning_rate": 1.0000000000000002e-06,
|
|
"loss": 0.9711,
|
|
"step": 230
|
|
},
|
|
{
|
|
"epoch": 0.03,
|
|
"grad_norm": 5.849515914916992,
|
|
"learning_rate": 1.0434782608695653e-06,
|
|
"loss": 0.9607,
|
|
"step": 240
|
|
},
|
|
{
|
|
"epoch": 0.03,
|
|
"grad_norm": 6.360912322998047,
|
|
"learning_rate": 1.0869565217391306e-06,
|
|
"loss": 0.946,
|
|
"step": 250
|
|
},
|
|
{
|
|
"epoch": 0.03,
|
|
"grad_norm": 4.921197891235352,
|
|
"learning_rate": 1.1304347826086956e-06,
|
|
"loss": 0.9355,
|
|
"step": 260
|
|
},
|
|
{
|
|
"epoch": 0.04,
|
|
"grad_norm": 4.507955074310303,
|
|
"learning_rate": 1.173913043478261e-06,
|
|
"loss": 0.9248,
|
|
"step": 270
|
|
},
|
|
{
|
|
"epoch": 0.04,
|
|
"grad_norm": 4.760683059692383,
|
|
"learning_rate": 1.2173913043478262e-06,
|
|
"loss": 0.9315,
|
|
"step": 280
|
|
},
|
|
{
|
|
"epoch": 0.04,
|
|
"grad_norm": 5.035203456878662,
|
|
"learning_rate": 1.2608695652173913e-06,
|
|
"loss": 0.9209,
|
|
"step": 290
|
|
},
|
|
{
|
|
"epoch": 0.04,
|
|
"grad_norm": 4.834679126739502,
|
|
"learning_rate": 1.3043478260869566e-06,
|
|
"loss": 0.9104,
|
|
"step": 300
|
|
},
|
|
{
|
|
"epoch": 0.04,
|
|
"grad_norm": 3.9745352268218994,
|
|
"learning_rate": 1.347826086956522e-06,
|
|
"loss": 0.9025,
|
|
"step": 310
|
|
},
|
|
{
|
|
"epoch": 0.04,
|
|
"grad_norm": 3.602553606033325,
|
|
"learning_rate": 1.391304347826087e-06,
|
|
"loss": 0.8954,
|
|
"step": 320
|
|
},
|
|
{
|
|
"epoch": 0.04,
|
|
"grad_norm": 3.429502487182617,
|
|
"learning_rate": 1.4347826086956523e-06,
|
|
"loss": 0.889,
|
|
"step": 330
|
|
},
|
|
{
|
|
"epoch": 0.04,
|
|
"grad_norm": 4.117110252380371,
|
|
"learning_rate": 1.4782608695652176e-06,
|
|
"loss": 0.8732,
|
|
"step": 340
|
|
},
|
|
{
|
|
"epoch": 0.05,
|
|
"grad_norm": 3.424821615219116,
|
|
"learning_rate": 1.521739130434783e-06,
|
|
"loss": 0.8819,
|
|
"step": 350
|
|
},
|
|
{
|
|
"epoch": 0.05,
|
|
"grad_norm": 4.870906829833984,
|
|
"learning_rate": 1.565217391304348e-06,
|
|
"loss": 0.8829,
|
|
"step": 360
|
|
},
|
|
{
|
|
"epoch": 0.05,
|
|
"grad_norm": 6.471639156341553,
|
|
"learning_rate": 1.608695652173913e-06,
|
|
"loss": 0.8632,
|
|
"step": 370
|
|
},
|
|
{
|
|
"epoch": 0.05,
|
|
"grad_norm": 4.399796962738037,
|
|
"learning_rate": 1.6521739130434784e-06,
|
|
"loss": 0.8543,
|
|
"step": 380
|
|
},
|
|
{
|
|
"epoch": 0.05,
|
|
"grad_norm": 3.4351227283477783,
|
|
"learning_rate": 1.6956521739130435e-06,
|
|
"loss": 0.847,
|
|
"step": 390
|
|
},
|
|
{
|
|
"epoch": 0.05,
|
|
"grad_norm": 3.1236326694488525,
|
|
"learning_rate": 1.7391304347826088e-06,
|
|
"loss": 0.8445,
|
|
"step": 400
|
|
},
|
|
{
|
|
"epoch": 0.05,
|
|
"grad_norm": 3.5134594440460205,
|
|
"learning_rate": 1.782608695652174e-06,
|
|
"loss": 0.8377,
|
|
"step": 410
|
|
},
|
|
{
|
|
"epoch": 0.06,
|
|
"grad_norm": 3.1803650856018066,
|
|
"learning_rate": 1.8260869565217394e-06,
|
|
"loss": 0.835,
|
|
"step": 420
|
|
},
|
|
{
|
|
"epoch": 0.06,
|
|
"grad_norm": 5.259217739105225,
|
|
"learning_rate": 1.8695652173913044e-06,
|
|
"loss": 0.8368,
|
|
"step": 430
|
|
},
|
|
{
|
|
"epoch": 0.06,
|
|
"grad_norm": 4.378271579742432,
|
|
"learning_rate": 1.9130434782608697e-06,
|
|
"loss": 0.8377,
|
|
"step": 440
|
|
},
|
|
{
|
|
"epoch": 0.06,
|
|
"grad_norm": 3.6259055137634277,
|
|
"learning_rate": 1.956521739130435e-06,
|
|
"loss": 0.8383,
|
|
"step": 450
|
|
},
|
|
{
|
|
"epoch": 0.06,
|
|
"grad_norm": 4.089799404144287,
|
|
"learning_rate": 2.0000000000000003e-06,
|
|
"loss": 0.8309,
|
|
"step": 460
|
|
},
|
|
{
|
|
"epoch": 0.06,
|
|
"grad_norm": 3.041405200958252,
|
|
"learning_rate": 2.0434782608695656e-06,
|
|
"loss": 0.8178,
|
|
"step": 470
|
|
},
|
|
{
|
|
"epoch": 0.06,
|
|
"grad_norm": 3.9156289100646973,
|
|
"learning_rate": 2.0869565217391305e-06,
|
|
"loss": 0.8091,
|
|
"step": 480
|
|
},
|
|
{
|
|
"epoch": 0.06,
|
|
"grad_norm": 3.1488077640533447,
|
|
"learning_rate": 2.130434782608696e-06,
|
|
"loss": 0.8287,
|
|
"step": 490
|
|
},
|
|
{
|
|
"epoch": 0.07,
|
|
"grad_norm": 3.1481711864471436,
|
|
"learning_rate": 2.173913043478261e-06,
|
|
"loss": 0.8109,
|
|
"step": 500
|
|
},
|
|
{
|
|
"epoch": 0.07,
|
|
"grad_norm": 3.6406595706939697,
|
|
"learning_rate": 2.2173913043478264e-06,
|
|
"loss": 0.8071,
|
|
"step": 510
|
|
},
|
|
{
|
|
"epoch": 0.07,
|
|
"grad_norm": 3.674004554748535,
|
|
"learning_rate": 2.2608695652173913e-06,
|
|
"loss": 0.8033,
|
|
"step": 520
|
|
},
|
|
{
|
|
"epoch": 0.07,
|
|
"grad_norm": 2.566720485687256,
|
|
"learning_rate": 2.3043478260869566e-06,
|
|
"loss": 0.8138,
|
|
"step": 530
|
|
},
|
|
{
|
|
"epoch": 0.07,
|
|
"grad_norm": 3.0154435634613037,
|
|
"learning_rate": 2.347826086956522e-06,
|
|
"loss": 0.8147,
|
|
"step": 540
|
|
},
|
|
{
|
|
"epoch": 0.07,
|
|
"grad_norm": 3.2751858234405518,
|
|
"learning_rate": 2.391304347826087e-06,
|
|
"loss": 0.7845,
|
|
"step": 550
|
|
},
|
|
{
|
|
"epoch": 0.07,
|
|
"grad_norm": 5.122045993804932,
|
|
"learning_rate": 2.4347826086956525e-06,
|
|
"loss": 0.7984,
|
|
"step": 560
|
|
},
|
|
{
|
|
"epoch": 0.08,
|
|
"grad_norm": 4.417994499206543,
|
|
"learning_rate": 2.4782608695652178e-06,
|
|
"loss": 0.7981,
|
|
"step": 570
|
|
},
|
|
{
|
|
"epoch": 0.08,
|
|
"grad_norm": 2.893526315689087,
|
|
"learning_rate": 2.5217391304347826e-06,
|
|
"loss": 0.799,
|
|
"step": 580
|
|
},
|
|
{
|
|
"epoch": 0.08,
|
|
"grad_norm": 3.6591286659240723,
|
|
"learning_rate": 2.5652173913043484e-06,
|
|
"loss": 0.7965,
|
|
"step": 590
|
|
},
|
|
{
|
|
"epoch": 0.08,
|
|
"grad_norm": 2.970930337905884,
|
|
"learning_rate": 2.6086956521739132e-06,
|
|
"loss": 0.8057,
|
|
"step": 600
|
|
},
|
|
{
|
|
"epoch": 0.08,
|
|
"grad_norm": 2.9321494102478027,
|
|
"learning_rate": 2.6521739130434785e-06,
|
|
"loss": 0.7862,
|
|
"step": 610
|
|
},
|
|
{
|
|
"epoch": 0.08,
|
|
"grad_norm": 3.5950493812561035,
|
|
"learning_rate": 2.695652173913044e-06,
|
|
"loss": 0.7844,
|
|
"step": 620
|
|
},
|
|
{
|
|
"epoch": 0.08,
|
|
"grad_norm": 2.530301332473755,
|
|
"learning_rate": 2.7391304347826087e-06,
|
|
"loss": 0.7863,
|
|
"step": 630
|
|
},
|
|
{
|
|
"epoch": 0.08,
|
|
"grad_norm": 2.930530548095703,
|
|
"learning_rate": 2.782608695652174e-06,
|
|
"loss": 0.7914,
|
|
"step": 640
|
|
},
|
|
{
|
|
"epoch": 0.09,
|
|
"grad_norm": 2.9592342376708984,
|
|
"learning_rate": 2.8260869565217393e-06,
|
|
"loss": 0.7738,
|
|
"step": 650
|
|
},
|
|
{
|
|
"epoch": 0.09,
|
|
"grad_norm": 2.425995349884033,
|
|
"learning_rate": 2.8695652173913046e-06,
|
|
"loss": 0.7941,
|
|
"step": 660
|
|
},
|
|
{
|
|
"epoch": 0.09,
|
|
"grad_norm": 2.796645402908325,
|
|
"learning_rate": 2.9130434782608695e-06,
|
|
"loss": 0.7969,
|
|
"step": 670
|
|
},
|
|
{
|
|
"epoch": 0.09,
|
|
"grad_norm": 2.917015314102173,
|
|
"learning_rate": 2.956521739130435e-06,
|
|
"loss": 0.7803,
|
|
"step": 680
|
|
},
|
|
{
|
|
"epoch": 0.09,
|
|
"grad_norm": 2.644934892654419,
|
|
"learning_rate": 3e-06,
|
|
"loss": 0.7925,
|
|
"step": 690
|
|
},
|
|
{
|
|
"epoch": 0.09,
|
|
"grad_norm": 3.2409515380859375,
|
|
"learning_rate": 3.043478260869566e-06,
|
|
"loss": 0.7863,
|
|
"step": 700
|
|
},
|
|
{
|
|
"epoch": 0.09,
|
|
"grad_norm": 2.5315630435943604,
|
|
"learning_rate": 3.0869565217391307e-06,
|
|
"loss": 0.7798,
|
|
"step": 710
|
|
},
|
|
{
|
|
"epoch": 0.1,
|
|
"grad_norm": 2.755002498626709,
|
|
"learning_rate": 3.130434782608696e-06,
|
|
"loss": 0.7736,
|
|
"step": 720
|
|
},
|
|
{
|
|
"epoch": 0.1,
|
|
"grad_norm": 2.5441417694091797,
|
|
"learning_rate": 3.1739130434782613e-06,
|
|
"loss": 0.7653,
|
|
"step": 730
|
|
},
|
|
{
|
|
"epoch": 0.1,
|
|
"grad_norm": 2.50203537940979,
|
|
"learning_rate": 3.217391304347826e-06,
|
|
"loss": 0.7751,
|
|
"step": 740
|
|
},
|
|
{
|
|
"epoch": 0.1,
|
|
"grad_norm": 2.8558547496795654,
|
|
"learning_rate": 3.2608695652173914e-06,
|
|
"loss": 0.7832,
|
|
"step": 750
|
|
},
|
|
{
|
|
"epoch": 0.1,
|
|
"grad_norm": 2.7601280212402344,
|
|
"learning_rate": 3.3043478260869567e-06,
|
|
"loss": 0.7865,
|
|
"step": 760
|
|
},
|
|
{
|
|
"epoch": 0.1,
|
|
"grad_norm": 4.165918827056885,
|
|
"learning_rate": 3.347826086956522e-06,
|
|
"loss": 0.7804,
|
|
"step": 770
|
|
},
|
|
{
|
|
"epoch": 0.1,
|
|
"grad_norm": 3.20556378364563,
|
|
"learning_rate": 3.391304347826087e-06,
|
|
"loss": 0.7787,
|
|
"step": 780
|
|
},
|
|
{
|
|
"epoch": 0.1,
|
|
"grad_norm": 2.259490489959717,
|
|
"learning_rate": 3.4347826086956526e-06,
|
|
"loss": 0.7776,
|
|
"step": 790
|
|
},
|
|
{
|
|
"epoch": 0.11,
|
|
"grad_norm": 2.482300281524658,
|
|
"learning_rate": 3.4782608695652175e-06,
|
|
"loss": 0.7909,
|
|
"step": 800
|
|
},
|
|
{
|
|
"epoch": 0.11,
|
|
"grad_norm": 2.6870365142822266,
|
|
"learning_rate": 3.5217391304347832e-06,
|
|
"loss": 0.7747,
|
|
"step": 810
|
|
},
|
|
{
|
|
"epoch": 0.11,
|
|
"grad_norm": 2.509525775909424,
|
|
"learning_rate": 3.565217391304348e-06,
|
|
"loss": 0.7675,
|
|
"step": 820
|
|
},
|
|
{
|
|
"epoch": 0.11,
|
|
"grad_norm": 2.8357813358306885,
|
|
"learning_rate": 3.6086956521739134e-06,
|
|
"loss": 0.7525,
|
|
"step": 830
|
|
},
|
|
{
|
|
"epoch": 0.11,
|
|
"grad_norm": 3.0093586444854736,
|
|
"learning_rate": 3.6521739130434787e-06,
|
|
"loss": 0.7871,
|
|
"step": 840
|
|
},
|
|
{
|
|
"epoch": 0.11,
|
|
"grad_norm": 2.5198683738708496,
|
|
"learning_rate": 3.6956521739130436e-06,
|
|
"loss": 0.7602,
|
|
"step": 850
|
|
},
|
|
{
|
|
"epoch": 0.11,
|
|
"grad_norm": 2.4067280292510986,
|
|
"learning_rate": 3.739130434782609e-06,
|
|
"loss": 0.7709,
|
|
"step": 860
|
|
},
|
|
{
|
|
"epoch": 0.12,
|
|
"grad_norm": 2.968722343444824,
|
|
"learning_rate": 3.782608695652174e-06,
|
|
"loss": 0.7638,
|
|
"step": 870
|
|
},
|
|
{
|
|
"epoch": 0.12,
|
|
"grad_norm": 2.5267333984375,
|
|
"learning_rate": 3.8260869565217395e-06,
|
|
"loss": 0.7618,
|
|
"step": 880
|
|
},
|
|
{
|
|
"epoch": 0.12,
|
|
"grad_norm": 2.519435405731201,
|
|
"learning_rate": 3.869565217391304e-06,
|
|
"loss": 0.7862,
|
|
"step": 890
|
|
},
|
|
{
|
|
"epoch": 0.12,
|
|
"grad_norm": 2.373142957687378,
|
|
"learning_rate": 3.91304347826087e-06,
|
|
"loss": 0.7785,
|
|
"step": 900
|
|
},
|
|
{
|
|
"epoch": 0.12,
|
|
"grad_norm": 2.939995288848877,
|
|
"learning_rate": 3.956521739130435e-06,
|
|
"loss": 0.7752,
|
|
"step": 910
|
|
},
|
|
{
|
|
"epoch": 0.12,
|
|
"grad_norm": 2.4909372329711914,
|
|
"learning_rate": 4.000000000000001e-06,
|
|
"loss": 0.779,
|
|
"step": 920
|
|
},
|
|
{
|
|
"epoch": 0.12,
|
|
"grad_norm": 2.6996891498565674,
|
|
"learning_rate": 4.0434782608695655e-06,
|
|
"loss": 0.7735,
|
|
"step": 930
|
|
},
|
|
{
|
|
"epoch": 0.12,
|
|
"grad_norm": 2.628506660461426,
|
|
"learning_rate": 4.086956521739131e-06,
|
|
"loss": 0.7551,
|
|
"step": 940
|
|
},
|
|
{
|
|
"epoch": 0.13,
|
|
"grad_norm": 2.350477457046509,
|
|
"learning_rate": 4.130434782608696e-06,
|
|
"loss": 0.7685,
|
|
"step": 950
|
|
},
|
|
{
|
|
"epoch": 0.13,
|
|
"grad_norm": 2.2088937759399414,
|
|
"learning_rate": 4.173913043478261e-06,
|
|
"loss": 0.7714,
|
|
"step": 960
|
|
},
|
|
{
|
|
"epoch": 0.13,
|
|
"grad_norm": 3.055957555770874,
|
|
"learning_rate": 4.217391304347827e-06,
|
|
"loss": 0.7633,
|
|
"step": 970
|
|
},
|
|
{
|
|
"epoch": 0.13,
|
|
"grad_norm": 2.987377882003784,
|
|
"learning_rate": 4.260869565217392e-06,
|
|
"loss": 0.7639,
|
|
"step": 980
|
|
},
|
|
{
|
|
"epoch": 0.13,
|
|
"grad_norm": 2.8698835372924805,
|
|
"learning_rate": 4.304347826086957e-06,
|
|
"loss": 0.7748,
|
|
"step": 990
|
|
},
|
|
{
|
|
"epoch": 0.13,
|
|
"grad_norm": 2.728653907775879,
|
|
"learning_rate": 4.347826086956522e-06,
|
|
"loss": 0.7531,
|
|
"step": 1000
|
|
},
|
|
{
|
|
"epoch": 0.13,
|
|
"grad_norm": 3.0617196559906006,
|
|
"learning_rate": 4.391304347826087e-06,
|
|
"loss": 0.7637,
|
|
"step": 1010
|
|
},
|
|
{
|
|
"epoch": 0.13,
|
|
"grad_norm": 9.645702362060547,
|
|
"learning_rate": 4.434782608695653e-06,
|
|
"loss": 0.7734,
|
|
"step": 1020
|
|
},
|
|
{
|
|
"epoch": 0.14,
|
|
"grad_norm": 3.174217462539673,
|
|
"learning_rate": 4.478260869565218e-06,
|
|
"loss": 0.7637,
|
|
"step": 1030
|
|
},
|
|
{
|
|
"epoch": 0.14,
|
|
"grad_norm": 2.565565586090088,
|
|
"learning_rate": 4.5217391304347826e-06,
|
|
"loss": 0.7616,
|
|
"step": 1040
|
|
},
|
|
{
|
|
"epoch": 0.14,
|
|
"grad_norm": 2.3000173568725586,
|
|
"learning_rate": 4.565217391304348e-06,
|
|
"loss": 0.7605,
|
|
"step": 1050
|
|
},
|
|
{
|
|
"epoch": 0.14,
|
|
"grad_norm": 2.2204582691192627,
|
|
"learning_rate": 4.608695652173913e-06,
|
|
"loss": 0.7613,
|
|
"step": 1060
|
|
},
|
|
{
|
|
"epoch": 0.14,
|
|
"grad_norm": 2.5566813945770264,
|
|
"learning_rate": 4.652173913043478e-06,
|
|
"loss": 0.745,
|
|
"step": 1070
|
|
},
|
|
{
|
|
"epoch": 0.14,
|
|
"grad_norm": 2.7924296855926514,
|
|
"learning_rate": 4.695652173913044e-06,
|
|
"loss": 0.7643,
|
|
"step": 1080
|
|
},
|
|
{
|
|
"epoch": 0.14,
|
|
"grad_norm": 2.78627610206604,
|
|
"learning_rate": 4.739130434782609e-06,
|
|
"loss": 0.765,
|
|
"step": 1090
|
|
},
|
|
{
|
|
"epoch": 0.15,
|
|
"grad_norm": 2.9410696029663086,
|
|
"learning_rate": 4.782608695652174e-06,
|
|
"loss": 0.7534,
|
|
"step": 1100
|
|
},
|
|
{
|
|
"epoch": 0.15,
|
|
"grad_norm": 2.6935067176818848,
|
|
"learning_rate": 4.826086956521739e-06,
|
|
"loss": 0.7501,
|
|
"step": 1110
|
|
},
|
|
{
|
|
"epoch": 0.15,
|
|
"grad_norm": 2.8043696880340576,
|
|
"learning_rate": 4.869565217391305e-06,
|
|
"loss": 0.7576,
|
|
"step": 1120
|
|
},
|
|
{
|
|
"epoch": 0.15,
|
|
"grad_norm": 2.8394672870635986,
|
|
"learning_rate": 4.91304347826087e-06,
|
|
"loss": 0.7536,
|
|
"step": 1130
|
|
},
|
|
{
|
|
"epoch": 0.15,
|
|
"grad_norm": 2.5857579708099365,
|
|
"learning_rate": 4.9565217391304355e-06,
|
|
"loss": 0.7545,
|
|
"step": 1140
|
|
},
|
|
{
|
|
"epoch": 0.15,
|
|
"grad_norm": 2.707064151763916,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.7686,
|
|
"step": 1150
|
|
},
|
|
{
|
|
"epoch": 0.15,
|
|
"grad_norm": 2.7348179817199707,
|
|
"learning_rate": 5.043478260869565e-06,
|
|
"loss": 0.7594,
|
|
"step": 1160
|
|
},
|
|
{
|
|
"epoch": 0.15,
|
|
"grad_norm": 2.8637123107910156,
|
|
"learning_rate": 5.08695652173913e-06,
|
|
"loss": 0.7719,
|
|
"step": 1170
|
|
},
|
|
{
|
|
"epoch": 0.16,
|
|
"grad_norm": 2.605658769607544,
|
|
"learning_rate": 5.130434782608697e-06,
|
|
"loss": 0.7609,
|
|
"step": 1180
|
|
},
|
|
{
|
|
"epoch": 0.16,
|
|
"grad_norm": 2.538159132003784,
|
|
"learning_rate": 5.173913043478262e-06,
|
|
"loss": 0.7447,
|
|
"step": 1190
|
|
},
|
|
{
|
|
"epoch": 0.16,
|
|
"grad_norm": 2.6438486576080322,
|
|
"learning_rate": 5.2173913043478265e-06,
|
|
"loss": 0.7474,
|
|
"step": 1200
|
|
},
|
|
{
|
|
"epoch": 0.16,
|
|
"grad_norm": 2.5470008850097656,
|
|
"learning_rate": 5.260869565217391e-06,
|
|
"loss": 0.755,
|
|
"step": 1210
|
|
},
|
|
{
|
|
"epoch": 0.16,
|
|
"grad_norm": 2.9301180839538574,
|
|
"learning_rate": 5.304347826086957e-06,
|
|
"loss": 0.7507,
|
|
"step": 1220
|
|
},
|
|
{
|
|
"epoch": 0.16,
|
|
"grad_norm": 2.509558916091919,
|
|
"learning_rate": 5.347826086956523e-06,
|
|
"loss": 0.7568,
|
|
"step": 1230
|
|
},
|
|
{
|
|
"epoch": 0.16,
|
|
"grad_norm": 2.386697292327881,
|
|
"learning_rate": 5.391304347826088e-06,
|
|
"loss": 0.7615,
|
|
"step": 1240
|
|
},
|
|
{
|
|
"epoch": 0.17,
|
|
"grad_norm": 2.8356692790985107,
|
|
"learning_rate": 5.4347826086956525e-06,
|
|
"loss": 0.7498,
|
|
"step": 1250
|
|
},
|
|
{
|
|
"epoch": 0.17,
|
|
"grad_norm": 2.812669038772583,
|
|
"learning_rate": 5.478260869565217e-06,
|
|
"loss": 0.7467,
|
|
"step": 1260
|
|
},
|
|
{
|
|
"epoch": 0.17,
|
|
"grad_norm": 2.591529369354248,
|
|
"learning_rate": 5.521739130434783e-06,
|
|
"loss": 0.7537,
|
|
"step": 1270
|
|
},
|
|
{
|
|
"epoch": 0.17,
|
|
"grad_norm": 2.7662971019744873,
|
|
"learning_rate": 5.565217391304348e-06,
|
|
"loss": 0.7508,
|
|
"step": 1280
|
|
},
|
|
{
|
|
"epoch": 0.17,
|
|
"grad_norm": 2.4197189807891846,
|
|
"learning_rate": 5.608695652173914e-06,
|
|
"loss": 0.7415,
|
|
"step": 1290
|
|
},
|
|
{
|
|
"epoch": 0.17,
|
|
"grad_norm": 2.972205877304077,
|
|
"learning_rate": 5.652173913043479e-06,
|
|
"loss": 0.7587,
|
|
"step": 1300
|
|
},
|
|
{
|
|
"epoch": 0.17,
|
|
"grad_norm": 3.1979939937591553,
|
|
"learning_rate": 5.695652173913044e-06,
|
|
"loss": 0.7369,
|
|
"step": 1310
|
|
},
|
|
{
|
|
"epoch": 0.17,
|
|
"grad_norm": 2.4516711235046387,
|
|
"learning_rate": 5.739130434782609e-06,
|
|
"loss": 0.7438,
|
|
"step": 1320
|
|
},
|
|
{
|
|
"epoch": 0.18,
|
|
"grad_norm": 2.620466470718384,
|
|
"learning_rate": 5.782608695652174e-06,
|
|
"loss": 0.7465,
|
|
"step": 1330
|
|
},
|
|
{
|
|
"epoch": 0.18,
|
|
"grad_norm": 2.7041525840759277,
|
|
"learning_rate": 5.826086956521739e-06,
|
|
"loss": 0.7522,
|
|
"step": 1340
|
|
},
|
|
{
|
|
"epoch": 0.18,
|
|
"grad_norm": 2.6662778854370117,
|
|
"learning_rate": 5.8695652173913055e-06,
|
|
"loss": 0.7567,
|
|
"step": 1350
|
|
},
|
|
{
|
|
"epoch": 0.18,
|
|
"grad_norm": 5.283855438232422,
|
|
"learning_rate": 5.91304347826087e-06,
|
|
"loss": 0.7307,
|
|
"step": 1360
|
|
},
|
|
{
|
|
"epoch": 0.18,
|
|
"grad_norm": 2.551743745803833,
|
|
"learning_rate": 5.956521739130435e-06,
|
|
"loss": 0.7466,
|
|
"step": 1370
|
|
},
|
|
{
|
|
"epoch": 0.18,
|
|
"grad_norm": 3.212951898574829,
|
|
"learning_rate": 6e-06,
|
|
"loss": 0.752,
|
|
"step": 1380
|
|
},
|
|
{
|
|
"epoch": 0.18,
|
|
"grad_norm": 2.417921304702759,
|
|
"learning_rate": 6.043478260869565e-06,
|
|
"loss": 0.7531,
|
|
"step": 1390
|
|
},
|
|
{
|
|
"epoch": 0.19,
|
|
"grad_norm": 2.751988649368286,
|
|
"learning_rate": 6.086956521739132e-06,
|
|
"loss": 0.7393,
|
|
"step": 1400
|
|
},
|
|
{
|
|
"epoch": 0.19,
|
|
"grad_norm": 2.8188741207122803,
|
|
"learning_rate": 6.1304347826086965e-06,
|
|
"loss": 0.748,
|
|
"step": 1410
|
|
},
|
|
{
|
|
"epoch": 0.19,
|
|
"grad_norm": 2.727118730545044,
|
|
"learning_rate": 6.173913043478261e-06,
|
|
"loss": 0.7404,
|
|
"step": 1420
|
|
},
|
|
{
|
|
"epoch": 0.19,
|
|
"grad_norm": 2.9528021812438965,
|
|
"learning_rate": 6.217391304347826e-06,
|
|
"loss": 0.7476,
|
|
"step": 1430
|
|
},
|
|
{
|
|
"epoch": 0.19,
|
|
"grad_norm": 3.184258460998535,
|
|
"learning_rate": 6.260869565217392e-06,
|
|
"loss": 0.7439,
|
|
"step": 1440
|
|
},
|
|
{
|
|
"epoch": 0.19,
|
|
"grad_norm": 3.2081708908081055,
|
|
"learning_rate": 6.304347826086958e-06,
|
|
"loss": 0.7328,
|
|
"step": 1450
|
|
},
|
|
{
|
|
"epoch": 0.19,
|
|
"grad_norm": 2.437472343444824,
|
|
"learning_rate": 6.3478260869565225e-06,
|
|
"loss": 0.7447,
|
|
"step": 1460
|
|
},
|
|
{
|
|
"epoch": 0.19,
|
|
"grad_norm": 2.4201667308807373,
|
|
"learning_rate": 6.391304347826087e-06,
|
|
"loss": 0.7333,
|
|
"step": 1470
|
|
},
|
|
{
|
|
"epoch": 0.2,
|
|
"grad_norm": 3.096134901046753,
|
|
"learning_rate": 6.434782608695652e-06,
|
|
"loss": 0.7592,
|
|
"step": 1480
|
|
},
|
|
{
|
|
"epoch": 0.2,
|
|
"grad_norm": 2.744535446166992,
|
|
"learning_rate": 6.478260869565218e-06,
|
|
"loss": 0.7469,
|
|
"step": 1490
|
|
},
|
|
{
|
|
"epoch": 0.2,
|
|
"grad_norm": 2.768773317337036,
|
|
"learning_rate": 6.521739130434783e-06,
|
|
"loss": 0.7434,
|
|
"step": 1500
|
|
},
|
|
{
|
|
"epoch": 0.2,
|
|
"grad_norm": 3.7912373542785645,
|
|
"learning_rate": 6.565217391304349e-06,
|
|
"loss": 0.7597,
|
|
"step": 1510
|
|
},
|
|
{
|
|
"epoch": 0.2,
|
|
"grad_norm": 3.1697614192962646,
|
|
"learning_rate": 6.6086956521739135e-06,
|
|
"loss": 0.7484,
|
|
"step": 1520
|
|
},
|
|
{
|
|
"epoch": 0.2,
|
|
"grad_norm": 3.172487735748291,
|
|
"learning_rate": 6.652173913043479e-06,
|
|
"loss": 0.7327,
|
|
"step": 1530
|
|
},
|
|
{
|
|
"epoch": 0.2,
|
|
"grad_norm": 2.5283539295196533,
|
|
"learning_rate": 6.695652173913044e-06,
|
|
"loss": 0.743,
|
|
"step": 1540
|
|
},
|
|
{
|
|
"epoch": 0.21,
|
|
"grad_norm": 3.1751644611358643,
|
|
"learning_rate": 6.739130434782609e-06,
|
|
"loss": 0.723,
|
|
"step": 1550
|
|
},
|
|
{
|
|
"epoch": 0.21,
|
|
"grad_norm": 2.524111747741699,
|
|
"learning_rate": 6.782608695652174e-06,
|
|
"loss": 0.7248,
|
|
"step": 1560
|
|
},
|
|
{
|
|
"epoch": 0.21,
|
|
"grad_norm": 5.5174455642700195,
|
|
"learning_rate": 6.8260869565217395e-06,
|
|
"loss": 0.7399,
|
|
"step": 1570
|
|
},
|
|
{
|
|
"epoch": 0.21,
|
|
"grad_norm": 2.582502841949463,
|
|
"learning_rate": 6.869565217391305e-06,
|
|
"loss": 0.7428,
|
|
"step": 1580
|
|
},
|
|
{
|
|
"epoch": 0.21,
|
|
"grad_norm": 2.751222848892212,
|
|
"learning_rate": 6.91304347826087e-06,
|
|
"loss": 0.7353,
|
|
"step": 1590
|
|
},
|
|
{
|
|
"epoch": 0.21,
|
|
"grad_norm": 2.983644485473633,
|
|
"learning_rate": 6.956521739130435e-06,
|
|
"loss": 0.753,
|
|
"step": 1600
|
|
},
|
|
{
|
|
"epoch": 0.21,
|
|
"grad_norm": 2.416503667831421,
|
|
"learning_rate": 7e-06,
|
|
"loss": 0.7323,
|
|
"step": 1610
|
|
},
|
|
{
|
|
"epoch": 0.21,
|
|
"grad_norm": 2.5844953060150146,
|
|
"learning_rate": 7.0434782608695665e-06,
|
|
"loss": 0.7514,
|
|
"step": 1620
|
|
},
|
|
{
|
|
"epoch": 0.22,
|
|
"grad_norm": 2.449826717376709,
|
|
"learning_rate": 7.086956521739131e-06,
|
|
"loss": 0.7514,
|
|
"step": 1630
|
|
},
|
|
{
|
|
"epoch": 0.22,
|
|
"grad_norm": 2.574061393737793,
|
|
"learning_rate": 7.130434782608696e-06,
|
|
"loss": 0.7418,
|
|
"step": 1640
|
|
},
|
|
{
|
|
"epoch": 0.22,
|
|
"grad_norm": 2.707425355911255,
|
|
"learning_rate": 7.173913043478261e-06,
|
|
"loss": 0.7402,
|
|
"step": 1650
|
|
},
|
|
{
|
|
"epoch": 0.22,
|
|
"grad_norm": 2.7220213413238525,
|
|
"learning_rate": 7.217391304347827e-06,
|
|
"loss": 0.7501,
|
|
"step": 1660
|
|
},
|
|
{
|
|
"epoch": 0.22,
|
|
"grad_norm": 2.730178117752075,
|
|
"learning_rate": 7.2608695652173925e-06,
|
|
"loss": 0.7351,
|
|
"step": 1670
|
|
},
|
|
{
|
|
"epoch": 0.22,
|
|
"grad_norm": 2.536191940307617,
|
|
"learning_rate": 7.304347826086957e-06,
|
|
"loss": 0.7362,
|
|
"step": 1680
|
|
},
|
|
{
|
|
"epoch": 0.22,
|
|
"grad_norm": 2.93157958984375,
|
|
"learning_rate": 7.347826086956522e-06,
|
|
"loss": 0.7369,
|
|
"step": 1690
|
|
},
|
|
{
|
|
"epoch": 0.22,
|
|
"grad_norm": 2.8212029933929443,
|
|
"learning_rate": 7.391304347826087e-06,
|
|
"loss": 0.7325,
|
|
"step": 1700
|
|
},
|
|
{
|
|
"epoch": 0.23,
|
|
"grad_norm": 3.0014121532440186,
|
|
"learning_rate": 7.434782608695653e-06,
|
|
"loss": 0.7383,
|
|
"step": 1710
|
|
},
|
|
{
|
|
"epoch": 0.23,
|
|
"grad_norm": 3.5077619552612305,
|
|
"learning_rate": 7.478260869565218e-06,
|
|
"loss": 0.7317,
|
|
"step": 1720
|
|
},
|
|
{
|
|
"epoch": 0.23,
|
|
"grad_norm": 2.7584381103515625,
|
|
"learning_rate": 7.5217391304347835e-06,
|
|
"loss": 0.7214,
|
|
"step": 1730
|
|
},
|
|
{
|
|
"epoch": 0.23,
|
|
"grad_norm": 3.4156510829925537,
|
|
"learning_rate": 7.565217391304348e-06,
|
|
"loss": 0.7367,
|
|
"step": 1740
|
|
},
|
|
{
|
|
"epoch": 0.23,
|
|
"grad_norm": 3.4717941284179688,
|
|
"learning_rate": 7.608695652173914e-06,
|
|
"loss": 0.7234,
|
|
"step": 1750
|
|
},
|
|
{
|
|
"epoch": 0.23,
|
|
"grad_norm": 2.6128644943237305,
|
|
"learning_rate": 7.652173913043479e-06,
|
|
"loss": 0.7486,
|
|
"step": 1760
|
|
},
|
|
{
|
|
"epoch": 0.23,
|
|
"grad_norm": 2.3647897243499756,
|
|
"learning_rate": 7.695652173913044e-06,
|
|
"loss": 0.7186,
|
|
"step": 1770
|
|
},
|
|
{
|
|
"epoch": 0.24,
|
|
"grad_norm": 2.6185524463653564,
|
|
"learning_rate": 7.739130434782609e-06,
|
|
"loss": 0.7405,
|
|
"step": 1780
|
|
},
|
|
{
|
|
"epoch": 0.24,
|
|
"grad_norm": 3.2258949279785156,
|
|
"learning_rate": 7.782608695652174e-06,
|
|
"loss": 0.7399,
|
|
"step": 1790
|
|
},
|
|
{
|
|
"epoch": 0.24,
|
|
"grad_norm": 3.954819679260254,
|
|
"learning_rate": 7.82608695652174e-06,
|
|
"loss": 0.7277,
|
|
"step": 1800
|
|
},
|
|
{
|
|
"epoch": 0.24,
|
|
"grad_norm": 3.0589473247528076,
|
|
"learning_rate": 7.869565217391305e-06,
|
|
"loss": 0.7282,
|
|
"step": 1810
|
|
},
|
|
{
|
|
"epoch": 0.24,
|
|
"grad_norm": 2.6480607986450195,
|
|
"learning_rate": 7.91304347826087e-06,
|
|
"loss": 0.7262,
|
|
"step": 1820
|
|
},
|
|
{
|
|
"epoch": 0.24,
|
|
"grad_norm": 2.735381603240967,
|
|
"learning_rate": 7.956521739130435e-06,
|
|
"loss": 0.7216,
|
|
"step": 1830
|
|
},
|
|
{
|
|
"epoch": 0.24,
|
|
"grad_norm": 5.60382080078125,
|
|
"learning_rate": 8.000000000000001e-06,
|
|
"loss": 0.7275,
|
|
"step": 1840
|
|
},
|
|
{
|
|
"epoch": 0.24,
|
|
"grad_norm": 2.710845947265625,
|
|
"learning_rate": 8.043478260869566e-06,
|
|
"loss": 0.739,
|
|
"step": 1850
|
|
},
|
|
{
|
|
"epoch": 0.25,
|
|
"grad_norm": 2.4441914558410645,
|
|
"learning_rate": 8.086956521739131e-06,
|
|
"loss": 0.7145,
|
|
"step": 1860
|
|
},
|
|
{
|
|
"epoch": 0.25,
|
|
"grad_norm": 2.7932469844818115,
|
|
"learning_rate": 8.130434782608696e-06,
|
|
"loss": 0.7226,
|
|
"step": 1870
|
|
},
|
|
{
|
|
"epoch": 0.25,
|
|
"grad_norm": 2.782019853591919,
|
|
"learning_rate": 8.173913043478263e-06,
|
|
"loss": 0.7139,
|
|
"step": 1880
|
|
},
|
|
{
|
|
"epoch": 0.25,
|
|
"grad_norm": 3.049837350845337,
|
|
"learning_rate": 8.217391304347827e-06,
|
|
"loss": 0.7393,
|
|
"step": 1890
|
|
},
|
|
{
|
|
"epoch": 0.25,
|
|
"grad_norm": 2.894196033477783,
|
|
"learning_rate": 8.260869565217392e-06,
|
|
"loss": 0.7163,
|
|
"step": 1900
|
|
},
|
|
{
|
|
"epoch": 0.25,
|
|
"grad_norm": 2.4531071186065674,
|
|
"learning_rate": 8.304347826086957e-06,
|
|
"loss": 0.7204,
|
|
"step": 1910
|
|
},
|
|
{
|
|
"epoch": 0.25,
|
|
"grad_norm": 3.320891857147217,
|
|
"learning_rate": 8.347826086956522e-06,
|
|
"loss": 0.7248,
|
|
"step": 1920
|
|
},
|
|
{
|
|
"epoch": 0.26,
|
|
"grad_norm": 3.3719470500946045,
|
|
"learning_rate": 8.391304347826089e-06,
|
|
"loss": 0.7315,
|
|
"step": 1930
|
|
},
|
|
{
|
|
"epoch": 0.26,
|
|
"grad_norm": 2.7417898178100586,
|
|
"learning_rate": 8.434782608695653e-06,
|
|
"loss": 0.7256,
|
|
"step": 1940
|
|
},
|
|
{
|
|
"epoch": 0.26,
|
|
"grad_norm": 2.6355440616607666,
|
|
"learning_rate": 8.478260869565218e-06,
|
|
"loss": 0.7305,
|
|
"step": 1950
|
|
},
|
|
{
|
|
"epoch": 0.26,
|
|
"grad_norm": 2.6812551021575928,
|
|
"learning_rate": 8.521739130434783e-06,
|
|
"loss": 0.7157,
|
|
"step": 1960
|
|
},
|
|
{
|
|
"epoch": 0.26,
|
|
"grad_norm": 3.1449575424194336,
|
|
"learning_rate": 8.56521739130435e-06,
|
|
"loss": 0.7186,
|
|
"step": 1970
|
|
},
|
|
{
|
|
"epoch": 0.26,
|
|
"grad_norm": 4.587336540222168,
|
|
"learning_rate": 8.608695652173915e-06,
|
|
"loss": 0.7241,
|
|
"step": 1980
|
|
},
|
|
{
|
|
"epoch": 0.26,
|
|
"grad_norm": 3.474202871322632,
|
|
"learning_rate": 8.65217391304348e-06,
|
|
"loss": 0.7318,
|
|
"step": 1990
|
|
},
|
|
{
|
|
"epoch": 0.26,
|
|
"grad_norm": 4.36326789855957,
|
|
"learning_rate": 8.695652173913044e-06,
|
|
"loss": 0.7336,
|
|
"step": 2000
|
|
},
|
|
{
|
|
"epoch": 0.27,
|
|
"grad_norm": 2.8643243312835693,
|
|
"learning_rate": 8.73913043478261e-06,
|
|
"loss": 0.7282,
|
|
"step": 2010
|
|
},
|
|
{
|
|
"epoch": 0.27,
|
|
"grad_norm": 2.8812708854675293,
|
|
"learning_rate": 8.782608695652174e-06,
|
|
"loss": 0.7158,
|
|
"step": 2020
|
|
},
|
|
{
|
|
"epoch": 0.27,
|
|
"grad_norm": 2.9906275272369385,
|
|
"learning_rate": 8.82608695652174e-06,
|
|
"loss": 0.7163,
|
|
"step": 2030
|
|
},
|
|
{
|
|
"epoch": 0.27,
|
|
"grad_norm": 2.6248161792755127,
|
|
"learning_rate": 8.869565217391306e-06,
|
|
"loss": 0.736,
|
|
"step": 2040
|
|
},
|
|
{
|
|
"epoch": 0.27,
|
|
"grad_norm": 2.564918041229248,
|
|
"learning_rate": 8.91304347826087e-06,
|
|
"loss": 0.733,
|
|
"step": 2050
|
|
},
|
|
{
|
|
"epoch": 0.27,
|
|
"grad_norm": 2.4388699531555176,
|
|
"learning_rate": 8.956521739130435e-06,
|
|
"loss": 0.7192,
|
|
"step": 2060
|
|
},
|
|
{
|
|
"epoch": 0.27,
|
|
"grad_norm": 2.6738839149475098,
|
|
"learning_rate": 9e-06,
|
|
"loss": 0.7107,
|
|
"step": 2070
|
|
},
|
|
{
|
|
"epoch": 0.28,
|
|
"grad_norm": 4.980138778686523,
|
|
"learning_rate": 9.043478260869565e-06,
|
|
"loss": 0.716,
|
|
"step": 2080
|
|
},
|
|
{
|
|
"epoch": 0.28,
|
|
"grad_norm": 2.9591591358184814,
|
|
"learning_rate": 9.086956521739132e-06,
|
|
"loss": 0.7197,
|
|
"step": 2090
|
|
},
|
|
{
|
|
"epoch": 0.28,
|
|
"grad_norm": 2.6318516731262207,
|
|
"learning_rate": 9.130434782608697e-06,
|
|
"loss": 0.7179,
|
|
"step": 2100
|
|
},
|
|
{
|
|
"epoch": 0.28,
|
|
"grad_norm": 2.6253883838653564,
|
|
"learning_rate": 9.173913043478261e-06,
|
|
"loss": 0.7184,
|
|
"step": 2110
|
|
},
|
|
{
|
|
"epoch": 0.28,
|
|
"grad_norm": 2.5605149269104004,
|
|
"learning_rate": 9.217391304347826e-06,
|
|
"loss": 0.7305,
|
|
"step": 2120
|
|
},
|
|
{
|
|
"epoch": 0.28,
|
|
"grad_norm": 4.129536151885986,
|
|
"learning_rate": 9.260869565217391e-06,
|
|
"loss": 0.7088,
|
|
"step": 2130
|
|
},
|
|
{
|
|
"epoch": 0.28,
|
|
"grad_norm": 2.9836220741271973,
|
|
"learning_rate": 9.304347826086956e-06,
|
|
"loss": 0.7239,
|
|
"step": 2140
|
|
},
|
|
{
|
|
"epoch": 0.28,
|
|
"grad_norm": 2.778383731842041,
|
|
"learning_rate": 9.347826086956523e-06,
|
|
"loss": 0.7067,
|
|
"step": 2150
|
|
},
|
|
{
|
|
"epoch": 0.29,
|
|
"grad_norm": 2.8585681915283203,
|
|
"learning_rate": 9.391304347826087e-06,
|
|
"loss": 0.7085,
|
|
"step": 2160
|
|
},
|
|
{
|
|
"epoch": 0.29,
|
|
"grad_norm": 2.595531940460205,
|
|
"learning_rate": 9.434782608695652e-06,
|
|
"loss": 0.7101,
|
|
"step": 2170
|
|
},
|
|
{
|
|
"epoch": 0.29,
|
|
"grad_norm": 3.7232606410980225,
|
|
"learning_rate": 9.478260869565217e-06,
|
|
"loss": 0.7333,
|
|
"step": 2180
|
|
},
|
|
{
|
|
"epoch": 0.29,
|
|
"grad_norm": 2.381574869155884,
|
|
"learning_rate": 9.521739130434784e-06,
|
|
"loss": 0.7236,
|
|
"step": 2190
|
|
},
|
|
{
|
|
"epoch": 0.29,
|
|
"grad_norm": 3.042024612426758,
|
|
"learning_rate": 9.565217391304349e-06,
|
|
"loss": 0.7261,
|
|
"step": 2200
|
|
},
|
|
{
|
|
"epoch": 0.29,
|
|
"grad_norm": 2.2856943607330322,
|
|
"learning_rate": 9.608695652173914e-06,
|
|
"loss": 0.7021,
|
|
"step": 2210
|
|
},
|
|
{
|
|
"epoch": 0.29,
|
|
"grad_norm": 3.454638719558716,
|
|
"learning_rate": 9.652173913043478e-06,
|
|
"loss": 0.711,
|
|
"step": 2220
|
|
},
|
|
{
|
|
"epoch": 0.3,
|
|
"grad_norm": 2.605741500854492,
|
|
"learning_rate": 9.695652173913043e-06,
|
|
"loss": 0.7078,
|
|
"step": 2230
|
|
},
|
|
{
|
|
"epoch": 0.3,
|
|
"grad_norm": 3.4367051124572754,
|
|
"learning_rate": 9.73913043478261e-06,
|
|
"loss": 0.7208,
|
|
"step": 2240
|
|
},
|
|
{
|
|
"epoch": 0.3,
|
|
"grad_norm": 2.4078874588012695,
|
|
"learning_rate": 9.782608695652175e-06,
|
|
"loss": 0.7148,
|
|
"step": 2250
|
|
},
|
|
{
|
|
"epoch": 0.3,
|
|
"grad_norm": 2.6727590560913086,
|
|
"learning_rate": 9.82608695652174e-06,
|
|
"loss": 0.7276,
|
|
"step": 2260
|
|
},
|
|
{
|
|
"epoch": 0.3,
|
|
"grad_norm": 3.0248544216156006,
|
|
"learning_rate": 9.869565217391304e-06,
|
|
"loss": 0.7133,
|
|
"step": 2270
|
|
},
|
|
{
|
|
"epoch": 0.3,
|
|
"grad_norm": 2.7144577503204346,
|
|
"learning_rate": 9.913043478260871e-06,
|
|
"loss": 0.7101,
|
|
"step": 2280
|
|
},
|
|
{
|
|
"epoch": 0.3,
|
|
"grad_norm": 2.7497832775115967,
|
|
"learning_rate": 9.956521739130436e-06,
|
|
"loss": 0.692,
|
|
"step": 2290
|
|
},
|
|
{
|
|
"epoch": 0.3,
|
|
"grad_norm": 2.990417957305908,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.6961,
|
|
"step": 2300
|
|
},
|
|
{
|
|
"epoch": 0.3,
|
|
"eval_loss": 0.7328751683235168,
|
|
"eval_runtime": 199.3608,
|
|
"eval_samples_per_second": 55.176,
|
|
"eval_steps_per_second": 6.897,
|
|
"step": 2300
|
|
},
|
|
{
|
|
"epoch": 0.31,
|
|
"grad_norm": 2.8692731857299805,
|
|
"learning_rate": 9.999994241637783e-06,
|
|
"loss": 0.715,
|
|
"step": 2310
|
|
},
|
|
{
|
|
"epoch": 0.31,
|
|
"grad_norm": 2.5786540508270264,
|
|
"learning_rate": 9.999976966564394e-06,
|
|
"loss": 0.7163,
|
|
"step": 2320
|
|
},
|
|
{
|
|
"epoch": 0.31,
|
|
"grad_norm": 3.327714443206787,
|
|
"learning_rate": 9.999948174819623e-06,
|
|
"loss": 0.7135,
|
|
"step": 2330
|
|
},
|
|
{
|
|
"epoch": 0.31,
|
|
"grad_norm": 2.9145894050598145,
|
|
"learning_rate": 9.999907866469787e-06,
|
|
"loss": 0.7054,
|
|
"step": 2340
|
|
},
|
|
{
|
|
"epoch": 0.31,
|
|
"grad_norm": 2.5164294242858887,
|
|
"learning_rate": 9.999856041607732e-06,
|
|
"loss": 0.7149,
|
|
"step": 2350
|
|
},
|
|
{
|
|
"epoch": 0.31,
|
|
"grad_norm": 3.212944984436035,
|
|
"learning_rate": 9.999792700352826e-06,
|
|
"loss": 0.7022,
|
|
"step": 2360
|
|
},
|
|
{
|
|
"epoch": 0.31,
|
|
"grad_norm": 2.477055072784424,
|
|
"learning_rate": 9.99971784285097e-06,
|
|
"loss": 0.7057,
|
|
"step": 2370
|
|
},
|
|
{
|
|
"epoch": 0.31,
|
|
"grad_norm": 2.563532590866089,
|
|
"learning_rate": 9.99963146927458e-06,
|
|
"loss": 0.7117,
|
|
"step": 2380
|
|
},
|
|
{
|
|
"epoch": 0.32,
|
|
"grad_norm": 3.0468506813049316,
|
|
"learning_rate": 9.999533579822611e-06,
|
|
"loss": 0.7152,
|
|
"step": 2390
|
|
},
|
|
{
|
|
"epoch": 0.32,
|
|
"grad_norm": 2.904538154602051,
|
|
"learning_rate": 9.99942417472053e-06,
|
|
"loss": 0.72,
|
|
"step": 2400
|
|
},
|
|
{
|
|
"epoch": 0.32,
|
|
"grad_norm": 2.625180244445801,
|
|
"learning_rate": 9.999303254220342e-06,
|
|
"loss": 0.7097,
|
|
"step": 2410
|
|
},
|
|
{
|
|
"epoch": 0.32,
|
|
"grad_norm": 2.9058775901794434,
|
|
"learning_rate": 9.999170818600562e-06,
|
|
"loss": 0.7254,
|
|
"step": 2420
|
|
},
|
|
{
|
|
"epoch": 0.32,
|
|
"grad_norm": 3.0165164470672607,
|
|
"learning_rate": 9.999026868166238e-06,
|
|
"loss": 0.7132,
|
|
"step": 2430
|
|
},
|
|
{
|
|
"epoch": 0.32,
|
|
"grad_norm": 4.671907424926758,
|
|
"learning_rate": 9.998871403248936e-06,
|
|
"loss": 0.7191,
|
|
"step": 2440
|
|
},
|
|
{
|
|
"epoch": 0.32,
|
|
"grad_norm": 2.6668975353240967,
|
|
"learning_rate": 9.998704424206747e-06,
|
|
"loss": 0.7066,
|
|
"step": 2450
|
|
},
|
|
{
|
|
"epoch": 0.33,
|
|
"grad_norm": 3.66217041015625,
|
|
"learning_rate": 9.998525931424279e-06,
|
|
"loss": 0.6917,
|
|
"step": 2460
|
|
},
|
|
{
|
|
"epoch": 0.33,
|
|
"grad_norm": 2.8258302211761475,
|
|
"learning_rate": 9.998335925312666e-06,
|
|
"loss": 0.6889,
|
|
"step": 2470
|
|
},
|
|
{
|
|
"epoch": 0.33,
|
|
"grad_norm": 2.8596441745758057,
|
|
"learning_rate": 9.998134406309555e-06,
|
|
"loss": 0.6997,
|
|
"step": 2480
|
|
},
|
|
{
|
|
"epoch": 0.33,
|
|
"grad_norm": 2.9443325996398926,
|
|
"learning_rate": 9.997921374879112e-06,
|
|
"loss": 0.7082,
|
|
"step": 2490
|
|
},
|
|
{
|
|
"epoch": 0.33,
|
|
"grad_norm": 4.481228828430176,
|
|
"learning_rate": 9.997696831512027e-06,
|
|
"loss": 0.7007,
|
|
"step": 2500
|
|
},
|
|
{
|
|
"epoch": 0.33,
|
|
"grad_norm": 3.7334978580474854,
|
|
"learning_rate": 9.997460776725497e-06,
|
|
"loss": 0.708,
|
|
"step": 2510
|
|
},
|
|
{
|
|
"epoch": 0.33,
|
|
"grad_norm": 2.843071937561035,
|
|
"learning_rate": 9.997213211063236e-06,
|
|
"loss": 0.7201,
|
|
"step": 2520
|
|
},
|
|
{
|
|
"epoch": 0.33,
|
|
"grad_norm": 3.9357028007507324,
|
|
"learning_rate": 9.99695413509548e-06,
|
|
"loss": 0.7152,
|
|
"step": 2530
|
|
},
|
|
{
|
|
"epoch": 0.34,
|
|
"grad_norm": 2.709559440612793,
|
|
"learning_rate": 9.996683549418964e-06,
|
|
"loss": 0.7071,
|
|
"step": 2540
|
|
},
|
|
{
|
|
"epoch": 0.34,
|
|
"grad_norm": 3.1510403156280518,
|
|
"learning_rate": 9.996401454656941e-06,
|
|
"loss": 0.6963,
|
|
"step": 2550
|
|
},
|
|
{
|
|
"epoch": 0.34,
|
|
"grad_norm": 2.642859697341919,
|
|
"learning_rate": 9.996107851459175e-06,
|
|
"loss": 0.7107,
|
|
"step": 2560
|
|
},
|
|
{
|
|
"epoch": 0.34,
|
|
"grad_norm": 3.1064438819885254,
|
|
"learning_rate": 9.995802740501933e-06,
|
|
"loss": 0.7045,
|
|
"step": 2570
|
|
},
|
|
{
|
|
"epoch": 0.34,
|
|
"grad_norm": 2.6383016109466553,
|
|
"learning_rate": 9.995486122487992e-06,
|
|
"loss": 0.6912,
|
|
"step": 2580
|
|
},
|
|
{
|
|
"epoch": 0.34,
|
|
"grad_norm": 3.1221187114715576,
|
|
"learning_rate": 9.995157998146633e-06,
|
|
"loss": 0.7,
|
|
"step": 2590
|
|
},
|
|
{
|
|
"epoch": 0.34,
|
|
"grad_norm": 2.3788633346557617,
|
|
"learning_rate": 9.994818368233639e-06,
|
|
"loss": 0.7152,
|
|
"step": 2600
|
|
},
|
|
{
|
|
"epoch": 0.35,
|
|
"grad_norm": 2.5052289962768555,
|
|
"learning_rate": 9.994467233531294e-06,
|
|
"loss": 0.7041,
|
|
"step": 2610
|
|
},
|
|
{
|
|
"epoch": 0.35,
|
|
"grad_norm": 2.9792628288269043,
|
|
"learning_rate": 9.994104594848383e-06,
|
|
"loss": 0.707,
|
|
"step": 2620
|
|
},
|
|
{
|
|
"epoch": 0.35,
|
|
"grad_norm": 2.6907389163970947,
|
|
"learning_rate": 9.993730453020187e-06,
|
|
"loss": 0.6965,
|
|
"step": 2630
|
|
},
|
|
{
|
|
"epoch": 0.35,
|
|
"grad_norm": 2.660902738571167,
|
|
"learning_rate": 9.993344808908486e-06,
|
|
"loss": 0.6978,
|
|
"step": 2640
|
|
},
|
|
{
|
|
"epoch": 0.35,
|
|
"grad_norm": 2.5266551971435547,
|
|
"learning_rate": 9.992947663401548e-06,
|
|
"loss": 0.6938,
|
|
"step": 2650
|
|
},
|
|
{
|
|
"epoch": 0.35,
|
|
"grad_norm": 2.86554217338562,
|
|
"learning_rate": 9.99253901741414e-06,
|
|
"loss": 0.7002,
|
|
"step": 2660
|
|
},
|
|
{
|
|
"epoch": 0.35,
|
|
"grad_norm": 3.3354568481445312,
|
|
"learning_rate": 9.992118871887513e-06,
|
|
"loss": 0.7191,
|
|
"step": 2670
|
|
},
|
|
{
|
|
"epoch": 0.35,
|
|
"grad_norm": 2.7831523418426514,
|
|
"learning_rate": 9.991687227789407e-06,
|
|
"loss": 0.7031,
|
|
"step": 2680
|
|
},
|
|
{
|
|
"epoch": 0.36,
|
|
"grad_norm": 2.882188558578491,
|
|
"learning_rate": 9.991244086114046e-06,
|
|
"loss": 0.6944,
|
|
"step": 2690
|
|
},
|
|
{
|
|
"epoch": 0.36,
|
|
"grad_norm": 2.857550859451294,
|
|
"learning_rate": 9.990789447882136e-06,
|
|
"loss": 0.694,
|
|
"step": 2700
|
|
},
|
|
{
|
|
"epoch": 0.36,
|
|
"grad_norm": 2.44887638092041,
|
|
"learning_rate": 9.990323314140872e-06,
|
|
"loss": 0.7152,
|
|
"step": 2710
|
|
},
|
|
{
|
|
"epoch": 0.36,
|
|
"grad_norm": 4.176514148712158,
|
|
"learning_rate": 9.989845685963917e-06,
|
|
"loss": 0.7048,
|
|
"step": 2720
|
|
},
|
|
{
|
|
"epoch": 0.36,
|
|
"grad_norm": 2.6085798740386963,
|
|
"learning_rate": 9.989356564451415e-06,
|
|
"loss": 0.6918,
|
|
"step": 2730
|
|
},
|
|
{
|
|
"epoch": 0.36,
|
|
"grad_norm": 2.8457624912261963,
|
|
"learning_rate": 9.988855950729979e-06,
|
|
"loss": 0.6992,
|
|
"step": 2740
|
|
},
|
|
{
|
|
"epoch": 0.36,
|
|
"grad_norm": 2.9820759296417236,
|
|
"learning_rate": 9.988343845952697e-06,
|
|
"loss": 0.708,
|
|
"step": 2750
|
|
},
|
|
{
|
|
"epoch": 0.37,
|
|
"grad_norm": 3.1150028705596924,
|
|
"learning_rate": 9.987820251299121e-06,
|
|
"loss": 0.6925,
|
|
"step": 2760
|
|
},
|
|
{
|
|
"epoch": 0.37,
|
|
"grad_norm": 2.9244368076324463,
|
|
"learning_rate": 9.987285167975274e-06,
|
|
"loss": 0.6865,
|
|
"step": 2770
|
|
},
|
|
{
|
|
"epoch": 0.37,
|
|
"grad_norm": 2.4057462215423584,
|
|
"learning_rate": 9.986738597213633e-06,
|
|
"loss": 0.7015,
|
|
"step": 2780
|
|
},
|
|
{
|
|
"epoch": 0.37,
|
|
"grad_norm": 3.322909116744995,
|
|
"learning_rate": 9.986180540273143e-06,
|
|
"loss": 0.6832,
|
|
"step": 2790
|
|
},
|
|
{
|
|
"epoch": 0.37,
|
|
"grad_norm": 3.0608110427856445,
|
|
"learning_rate": 9.985610998439198e-06,
|
|
"loss": 0.6943,
|
|
"step": 2800
|
|
},
|
|
{
|
|
"epoch": 0.37,
|
|
"grad_norm": 4.037482261657715,
|
|
"learning_rate": 9.98502997302365e-06,
|
|
"loss": 0.6892,
|
|
"step": 2810
|
|
},
|
|
{
|
|
"epoch": 0.37,
|
|
"grad_norm": 3.36313796043396,
|
|
"learning_rate": 9.984437465364802e-06,
|
|
"loss": 0.6965,
|
|
"step": 2820
|
|
},
|
|
{
|
|
"epoch": 0.37,
|
|
"grad_norm": 2.5192835330963135,
|
|
"learning_rate": 9.983833476827404e-06,
|
|
"loss": 0.7066,
|
|
"step": 2830
|
|
},
|
|
{
|
|
"epoch": 0.38,
|
|
"grad_norm": 2.7486279010772705,
|
|
"learning_rate": 9.983218008802648e-06,
|
|
"loss": 0.699,
|
|
"step": 2840
|
|
},
|
|
{
|
|
"epoch": 0.38,
|
|
"grad_norm": 2.654358148574829,
|
|
"learning_rate": 9.982591062708172e-06,
|
|
"loss": 0.6979,
|
|
"step": 2850
|
|
},
|
|
{
|
|
"epoch": 0.38,
|
|
"grad_norm": 3.2748641967773438,
|
|
"learning_rate": 9.981952639988046e-06,
|
|
"loss": 0.6991,
|
|
"step": 2860
|
|
},
|
|
{
|
|
"epoch": 0.38,
|
|
"grad_norm": 2.578864812850952,
|
|
"learning_rate": 9.98130274211278e-06,
|
|
"loss": 0.7049,
|
|
"step": 2870
|
|
},
|
|
{
|
|
"epoch": 0.38,
|
|
"grad_norm": 2.6727378368377686,
|
|
"learning_rate": 9.98064137057931e-06,
|
|
"loss": 0.7018,
|
|
"step": 2880
|
|
},
|
|
{
|
|
"epoch": 0.38,
|
|
"grad_norm": 3.929105043411255,
|
|
"learning_rate": 9.979968526911006e-06,
|
|
"loss": 0.7024,
|
|
"step": 2890
|
|
},
|
|
{
|
|
"epoch": 0.38,
|
|
"grad_norm": 3.152024030685425,
|
|
"learning_rate": 9.979284212657658e-06,
|
|
"loss": 0.6998,
|
|
"step": 2900
|
|
},
|
|
{
|
|
"epoch": 0.39,
|
|
"grad_norm": 2.9024596214294434,
|
|
"learning_rate": 9.978588429395475e-06,
|
|
"loss": 0.6984,
|
|
"step": 2910
|
|
},
|
|
{
|
|
"epoch": 0.39,
|
|
"grad_norm": 2.836294651031494,
|
|
"learning_rate": 9.97788117872709e-06,
|
|
"loss": 0.6908,
|
|
"step": 2920
|
|
},
|
|
{
|
|
"epoch": 0.39,
|
|
"grad_norm": 2.5680007934570312,
|
|
"learning_rate": 9.977162462281544e-06,
|
|
"loss": 0.6976,
|
|
"step": 2930
|
|
},
|
|
{
|
|
"epoch": 0.39,
|
|
"grad_norm": 2.9260566234588623,
|
|
"learning_rate": 9.976432281714289e-06,
|
|
"loss": 0.7054,
|
|
"step": 2940
|
|
},
|
|
{
|
|
"epoch": 0.39,
|
|
"grad_norm": 2.2062673568725586,
|
|
"learning_rate": 9.97569063870718e-06,
|
|
"loss": 0.6856,
|
|
"step": 2950
|
|
},
|
|
{
|
|
"epoch": 0.39,
|
|
"grad_norm": 4.240058422088623,
|
|
"learning_rate": 9.97493753496848e-06,
|
|
"loss": 0.7103,
|
|
"step": 2960
|
|
},
|
|
{
|
|
"epoch": 0.39,
|
|
"grad_norm": 2.477383852005005,
|
|
"learning_rate": 9.974172972232845e-06,
|
|
"loss": 0.6985,
|
|
"step": 2970
|
|
},
|
|
{
|
|
"epoch": 0.39,
|
|
"grad_norm": 3.088667392730713,
|
|
"learning_rate": 9.973396952261327e-06,
|
|
"loss": 0.6934,
|
|
"step": 2980
|
|
},
|
|
{
|
|
"epoch": 0.4,
|
|
"grad_norm": 3.2433605194091797,
|
|
"learning_rate": 9.972609476841368e-06,
|
|
"loss": 0.6985,
|
|
"step": 2990
|
|
},
|
|
{
|
|
"epoch": 0.4,
|
|
"grad_norm": 2.7958269119262695,
|
|
"learning_rate": 9.971810547786794e-06,
|
|
"loss": 0.6962,
|
|
"step": 3000
|
|
},
|
|
{
|
|
"epoch": 0.4,
|
|
"grad_norm": 2.777493953704834,
|
|
"learning_rate": 9.971000166937815e-06,
|
|
"loss": 0.6986,
|
|
"step": 3010
|
|
},
|
|
{
|
|
"epoch": 0.4,
|
|
"grad_norm": 3.0154001712799072,
|
|
"learning_rate": 9.970178336161018e-06,
|
|
"loss": 0.6812,
|
|
"step": 3020
|
|
},
|
|
{
|
|
"epoch": 0.4,
|
|
"grad_norm": 2.762068033218384,
|
|
"learning_rate": 9.969345057349365e-06,
|
|
"loss": 0.6936,
|
|
"step": 3030
|
|
},
|
|
{
|
|
"epoch": 0.4,
|
|
"grad_norm": 2.7011806964874268,
|
|
"learning_rate": 9.96850033242218e-06,
|
|
"loss": 0.6913,
|
|
"step": 3040
|
|
},
|
|
{
|
|
"epoch": 0.4,
|
|
"grad_norm": 2.9354515075683594,
|
|
"learning_rate": 9.967644163325157e-06,
|
|
"loss": 0.6717,
|
|
"step": 3050
|
|
},
|
|
{
|
|
"epoch": 0.4,
|
|
"grad_norm": 2.9377188682556152,
|
|
"learning_rate": 9.96677655203035e-06,
|
|
"loss": 0.703,
|
|
"step": 3060
|
|
},
|
|
{
|
|
"epoch": 0.41,
|
|
"grad_norm": 3.62003231048584,
|
|
"learning_rate": 9.965897500536167e-06,
|
|
"loss": 0.6982,
|
|
"step": 3070
|
|
},
|
|
{
|
|
"epoch": 0.41,
|
|
"grad_norm": 3.100145101547241,
|
|
"learning_rate": 9.965007010867366e-06,
|
|
"loss": 0.6869,
|
|
"step": 3080
|
|
},
|
|
{
|
|
"epoch": 0.41,
|
|
"grad_norm": 3.1306726932525635,
|
|
"learning_rate": 9.964105085075053e-06,
|
|
"loss": 0.6998,
|
|
"step": 3090
|
|
},
|
|
{
|
|
"epoch": 0.41,
|
|
"grad_norm": 2.943037986755371,
|
|
"learning_rate": 9.963191725236672e-06,
|
|
"loss": 0.6983,
|
|
"step": 3100
|
|
},
|
|
{
|
|
"epoch": 0.41,
|
|
"grad_norm": 2.641789197921753,
|
|
"learning_rate": 9.962266933456008e-06,
|
|
"loss": 0.7036,
|
|
"step": 3110
|
|
},
|
|
{
|
|
"epoch": 0.41,
|
|
"grad_norm": 2.6184604167938232,
|
|
"learning_rate": 9.961330711863175e-06,
|
|
"loss": 0.6847,
|
|
"step": 3120
|
|
},
|
|
{
|
|
"epoch": 0.41,
|
|
"grad_norm": 3.769002914428711,
|
|
"learning_rate": 9.960383062614614e-06,
|
|
"loss": 0.6908,
|
|
"step": 3130
|
|
},
|
|
{
|
|
"epoch": 0.42,
|
|
"grad_norm": 2.812992811203003,
|
|
"learning_rate": 9.959423987893086e-06,
|
|
"loss": 0.694,
|
|
"step": 3140
|
|
},
|
|
{
|
|
"epoch": 0.42,
|
|
"grad_norm": 2.8881213665008545,
|
|
"learning_rate": 9.958453489907673e-06,
|
|
"loss": 0.6891,
|
|
"step": 3150
|
|
},
|
|
{
|
|
"epoch": 0.42,
|
|
"grad_norm": 2.4800655841827393,
|
|
"learning_rate": 9.957471570893767e-06,
|
|
"loss": 0.6945,
|
|
"step": 3160
|
|
},
|
|
{
|
|
"epoch": 0.42,
|
|
"grad_norm": 2.597376585006714,
|
|
"learning_rate": 9.956478233113066e-06,
|
|
"loss": 0.6879,
|
|
"step": 3170
|
|
},
|
|
{
|
|
"epoch": 0.42,
|
|
"grad_norm": 2.7456142902374268,
|
|
"learning_rate": 9.955473478853567e-06,
|
|
"loss": 0.6835,
|
|
"step": 3180
|
|
},
|
|
{
|
|
"epoch": 0.42,
|
|
"grad_norm": 3.177309513092041,
|
|
"learning_rate": 9.954457310429569e-06,
|
|
"loss": 0.6912,
|
|
"step": 3190
|
|
},
|
|
{
|
|
"epoch": 0.42,
|
|
"grad_norm": 2.0441253185272217,
|
|
"learning_rate": 9.953429730181653e-06,
|
|
"loss": 0.6797,
|
|
"step": 3200
|
|
},
|
|
{
|
|
"epoch": 0.42,
|
|
"grad_norm": 2.5114927291870117,
|
|
"learning_rate": 9.952390740476698e-06,
|
|
"loss": 0.6952,
|
|
"step": 3210
|
|
},
|
|
{
|
|
"epoch": 0.43,
|
|
"grad_norm": 2.6748149394989014,
|
|
"learning_rate": 9.951340343707852e-06,
|
|
"loss": 0.6844,
|
|
"step": 3220
|
|
},
|
|
{
|
|
"epoch": 0.43,
|
|
"grad_norm": 3.3017261028289795,
|
|
"learning_rate": 9.95027854229454e-06,
|
|
"loss": 0.6827,
|
|
"step": 3230
|
|
},
|
|
{
|
|
"epoch": 0.43,
|
|
"grad_norm": 2.5286686420440674,
|
|
"learning_rate": 9.94920533868246e-06,
|
|
"loss": 0.6895,
|
|
"step": 3240
|
|
},
|
|
{
|
|
"epoch": 0.43,
|
|
"grad_norm": 2.7270824909210205,
|
|
"learning_rate": 9.948120735343566e-06,
|
|
"loss": 0.6841,
|
|
"step": 3250
|
|
},
|
|
{
|
|
"epoch": 0.43,
|
|
"grad_norm": 3.2802062034606934,
|
|
"learning_rate": 9.947024734776076e-06,
|
|
"loss": 0.6866,
|
|
"step": 3260
|
|
},
|
|
{
|
|
"epoch": 0.43,
|
|
"grad_norm": 2.7635345458984375,
|
|
"learning_rate": 9.945917339504457e-06,
|
|
"loss": 0.702,
|
|
"step": 3270
|
|
},
|
|
{
|
|
"epoch": 0.43,
|
|
"grad_norm": 2.5334112644195557,
|
|
"learning_rate": 9.944798552079422e-06,
|
|
"loss": 0.7038,
|
|
"step": 3280
|
|
},
|
|
{
|
|
"epoch": 0.44,
|
|
"grad_norm": 3.57198429107666,
|
|
"learning_rate": 9.943668375077926e-06,
|
|
"loss": 0.6817,
|
|
"step": 3290
|
|
},
|
|
{
|
|
"epoch": 0.44,
|
|
"grad_norm": 3.0424792766571045,
|
|
"learning_rate": 9.942526811103153e-06,
|
|
"loss": 0.6894,
|
|
"step": 3300
|
|
},
|
|
{
|
|
"epoch": 0.44,
|
|
"grad_norm": 2.894702672958374,
|
|
"learning_rate": 9.94137386278452e-06,
|
|
"loss": 0.6905,
|
|
"step": 3310
|
|
},
|
|
{
|
|
"epoch": 0.44,
|
|
"grad_norm": 3.1093649864196777,
|
|
"learning_rate": 9.940209532777666e-06,
|
|
"loss": 0.7126,
|
|
"step": 3320
|
|
},
|
|
{
|
|
"epoch": 0.44,
|
|
"grad_norm": 2.57892107963562,
|
|
"learning_rate": 9.939033823764443e-06,
|
|
"loss": 0.6727,
|
|
"step": 3330
|
|
},
|
|
{
|
|
"epoch": 0.44,
|
|
"grad_norm": 2.641435146331787,
|
|
"learning_rate": 9.937846738452914e-06,
|
|
"loss": 0.6899,
|
|
"step": 3340
|
|
},
|
|
{
|
|
"epoch": 0.44,
|
|
"grad_norm": 2.499209403991699,
|
|
"learning_rate": 9.93664827957735e-06,
|
|
"loss": 0.6804,
|
|
"step": 3350
|
|
},
|
|
{
|
|
"epoch": 0.44,
|
|
"grad_norm": 2.339439868927002,
|
|
"learning_rate": 9.93543844989821e-06,
|
|
"loss": 0.6684,
|
|
"step": 3360
|
|
},
|
|
{
|
|
"epoch": 0.45,
|
|
"grad_norm": 3.1188161373138428,
|
|
"learning_rate": 9.93421725220215e-06,
|
|
"loss": 0.6885,
|
|
"step": 3370
|
|
},
|
|
{
|
|
"epoch": 0.45,
|
|
"grad_norm": 2.80849552154541,
|
|
"learning_rate": 9.932984689302012e-06,
|
|
"loss": 0.6861,
|
|
"step": 3380
|
|
},
|
|
{
|
|
"epoch": 0.45,
|
|
"grad_norm": 2.6763992309570312,
|
|
"learning_rate": 9.93174076403681e-06,
|
|
"loss": 0.6995,
|
|
"step": 3390
|
|
},
|
|
{
|
|
"epoch": 0.45,
|
|
"grad_norm": 2.9029862880706787,
|
|
"learning_rate": 9.930485479271735e-06,
|
|
"loss": 0.6881,
|
|
"step": 3400
|
|
},
|
|
{
|
|
"epoch": 0.45,
|
|
"grad_norm": 2.9763121604919434,
|
|
"learning_rate": 9.929218837898143e-06,
|
|
"loss": 0.6877,
|
|
"step": 3410
|
|
},
|
|
{
|
|
"epoch": 0.45,
|
|
"grad_norm": 2.593538999557495,
|
|
"learning_rate": 9.92794084283354e-06,
|
|
"loss": 0.6901,
|
|
"step": 3420
|
|
},
|
|
{
|
|
"epoch": 0.45,
|
|
"grad_norm": 3.309509038925171,
|
|
"learning_rate": 9.926651497021595e-06,
|
|
"loss": 0.6841,
|
|
"step": 3430
|
|
},
|
|
{
|
|
"epoch": 0.46,
|
|
"grad_norm": 3.0623817443847656,
|
|
"learning_rate": 9.925350803432112e-06,
|
|
"loss": 0.664,
|
|
"step": 3440
|
|
},
|
|
{
|
|
"epoch": 0.46,
|
|
"grad_norm": 4.454458236694336,
|
|
"learning_rate": 9.924038765061042e-06,
|
|
"loss": 0.6818,
|
|
"step": 3450
|
|
},
|
|
{
|
|
"epoch": 0.46,
|
|
"grad_norm": 3.123232126235962,
|
|
"learning_rate": 9.922715384930455e-06,
|
|
"loss": 0.6685,
|
|
"step": 3460
|
|
},
|
|
{
|
|
"epoch": 0.46,
|
|
"grad_norm": 3.939410448074341,
|
|
"learning_rate": 9.921380666088558e-06,
|
|
"loss": 0.6869,
|
|
"step": 3470
|
|
},
|
|
{
|
|
"epoch": 0.46,
|
|
"grad_norm": 5.342121601104736,
|
|
"learning_rate": 9.920034611609667e-06,
|
|
"loss": 0.6801,
|
|
"step": 3480
|
|
},
|
|
{
|
|
"epoch": 0.46,
|
|
"grad_norm": 3.224928379058838,
|
|
"learning_rate": 9.918677224594207e-06,
|
|
"loss": 0.6746,
|
|
"step": 3490
|
|
},
|
|
{
|
|
"epoch": 0.46,
|
|
"grad_norm": 2.875549793243408,
|
|
"learning_rate": 9.917308508168712e-06,
|
|
"loss": 0.6964,
|
|
"step": 3500
|
|
},
|
|
{
|
|
"epoch": 0.46,
|
|
"grad_norm": 3.0345466136932373,
|
|
"learning_rate": 9.915928465485805e-06,
|
|
"loss": 0.6727,
|
|
"step": 3510
|
|
},
|
|
{
|
|
"epoch": 0.47,
|
|
"grad_norm": 2.9075253009796143,
|
|
"learning_rate": 9.914537099724204e-06,
|
|
"loss": 0.6823,
|
|
"step": 3520
|
|
},
|
|
{
|
|
"epoch": 0.47,
|
|
"grad_norm": 3.458336591720581,
|
|
"learning_rate": 9.913134414088698e-06,
|
|
"loss": 0.6884,
|
|
"step": 3530
|
|
},
|
|
{
|
|
"epoch": 0.47,
|
|
"grad_norm": 3.051724910736084,
|
|
"learning_rate": 9.911720411810163e-06,
|
|
"loss": 0.7009,
|
|
"step": 3540
|
|
},
|
|
{
|
|
"epoch": 0.47,
|
|
"grad_norm": 3.8520309925079346,
|
|
"learning_rate": 9.91029509614553e-06,
|
|
"loss": 0.6858,
|
|
"step": 3550
|
|
},
|
|
{
|
|
"epoch": 0.47,
|
|
"grad_norm": 3.14030122756958,
|
|
"learning_rate": 9.908858470377793e-06,
|
|
"loss": 0.6847,
|
|
"step": 3560
|
|
},
|
|
{
|
|
"epoch": 0.47,
|
|
"grad_norm": 3.072479009628296,
|
|
"learning_rate": 9.907410537815997e-06,
|
|
"loss": 0.7003,
|
|
"step": 3570
|
|
},
|
|
{
|
|
"epoch": 0.47,
|
|
"grad_norm": 4.00950813293457,
|
|
"learning_rate": 9.905951301795231e-06,
|
|
"loss": 0.673,
|
|
"step": 3580
|
|
},
|
|
{
|
|
"epoch": 0.48,
|
|
"grad_norm": 2.5979695320129395,
|
|
"learning_rate": 9.904480765676617e-06,
|
|
"loss": 0.685,
|
|
"step": 3590
|
|
},
|
|
{
|
|
"epoch": 0.48,
|
|
"grad_norm": 2.7463061809539795,
|
|
"learning_rate": 9.902998932847308e-06,
|
|
"loss": 0.6966,
|
|
"step": 3600
|
|
},
|
|
{
|
|
"epoch": 0.48,
|
|
"grad_norm": 2.2453413009643555,
|
|
"learning_rate": 9.901505806720474e-06,
|
|
"loss": 0.6906,
|
|
"step": 3610
|
|
},
|
|
{
|
|
"epoch": 0.48,
|
|
"grad_norm": 3.755852699279785,
|
|
"learning_rate": 9.9000013907353e-06,
|
|
"loss": 0.7008,
|
|
"step": 3620
|
|
},
|
|
{
|
|
"epoch": 0.48,
|
|
"grad_norm": 3.0272433757781982,
|
|
"learning_rate": 9.89848568835698e-06,
|
|
"loss": 0.6912,
|
|
"step": 3630
|
|
},
|
|
{
|
|
"epoch": 0.48,
|
|
"grad_norm": 2.8252182006835938,
|
|
"learning_rate": 9.896958703076693e-06,
|
|
"loss": 0.6806,
|
|
"step": 3640
|
|
},
|
|
{
|
|
"epoch": 0.48,
|
|
"grad_norm": 3.7654056549072266,
|
|
"learning_rate": 9.895420438411616e-06,
|
|
"loss": 0.6778,
|
|
"step": 3650
|
|
},
|
|
{
|
|
"epoch": 0.48,
|
|
"grad_norm": 2.8534739017486572,
|
|
"learning_rate": 9.8938708979049e-06,
|
|
"loss": 0.6842,
|
|
"step": 3660
|
|
},
|
|
{
|
|
"epoch": 0.49,
|
|
"grad_norm": 3.0817458629608154,
|
|
"learning_rate": 9.892310085125675e-06,
|
|
"loss": 0.686,
|
|
"step": 3670
|
|
},
|
|
{
|
|
"epoch": 0.49,
|
|
"grad_norm": 3.03659725189209,
|
|
"learning_rate": 9.890738003669029e-06,
|
|
"loss": 0.6858,
|
|
"step": 3680
|
|
},
|
|
{
|
|
"epoch": 0.49,
|
|
"grad_norm": 3.211000919342041,
|
|
"learning_rate": 9.889154657156008e-06,
|
|
"loss": 0.6809,
|
|
"step": 3690
|
|
},
|
|
{
|
|
"epoch": 0.49,
|
|
"grad_norm": 3.543001413345337,
|
|
"learning_rate": 9.887560049233606e-06,
|
|
"loss": 0.6956,
|
|
"step": 3700
|
|
},
|
|
{
|
|
"epoch": 0.49,
|
|
"grad_norm": 2.351623058319092,
|
|
"learning_rate": 9.885954183574753e-06,
|
|
"loss": 0.678,
|
|
"step": 3710
|
|
},
|
|
{
|
|
"epoch": 0.49,
|
|
"grad_norm": 3.029533624649048,
|
|
"learning_rate": 9.884337063878313e-06,
|
|
"loss": 0.6772,
|
|
"step": 3720
|
|
},
|
|
{
|
|
"epoch": 0.49,
|
|
"grad_norm": 3.0510919094085693,
|
|
"learning_rate": 9.882708693869071e-06,
|
|
"loss": 0.6707,
|
|
"step": 3730
|
|
},
|
|
{
|
|
"epoch": 0.49,
|
|
"grad_norm": 2.8181586265563965,
|
|
"learning_rate": 9.881069077297724e-06,
|
|
"loss": 0.6768,
|
|
"step": 3740
|
|
},
|
|
{
|
|
"epoch": 0.5,
|
|
"grad_norm": 3.0697293281555176,
|
|
"learning_rate": 9.879418217940872e-06,
|
|
"loss": 0.6893,
|
|
"step": 3750
|
|
},
|
|
{
|
|
"epoch": 0.5,
|
|
"grad_norm": 2.8345251083374023,
|
|
"learning_rate": 9.877756119601018e-06,
|
|
"loss": 0.7028,
|
|
"step": 3760
|
|
},
|
|
{
|
|
"epoch": 0.5,
|
|
"grad_norm": 3.112302780151367,
|
|
"learning_rate": 9.876082786106546e-06,
|
|
"loss": 0.6914,
|
|
"step": 3770
|
|
},
|
|
{
|
|
"epoch": 0.5,
|
|
"grad_norm": 2.735736608505249,
|
|
"learning_rate": 9.87439822131172e-06,
|
|
"loss": 0.6748,
|
|
"step": 3780
|
|
},
|
|
{
|
|
"epoch": 0.5,
|
|
"grad_norm": 2.8680014610290527,
|
|
"learning_rate": 9.87270242909667e-06,
|
|
"loss": 0.6785,
|
|
"step": 3790
|
|
},
|
|
{
|
|
"epoch": 0.5,
|
|
"grad_norm": 2.8195078372955322,
|
|
"learning_rate": 9.870995413367397e-06,
|
|
"loss": 0.675,
|
|
"step": 3800
|
|
},
|
|
{
|
|
"epoch": 0.5,
|
|
"grad_norm": 2.9144508838653564,
|
|
"learning_rate": 9.86927717805574e-06,
|
|
"loss": 0.685,
|
|
"step": 3810
|
|
},
|
|
{
|
|
"epoch": 0.51,
|
|
"grad_norm": 2.886000871658325,
|
|
"learning_rate": 9.867547727119396e-06,
|
|
"loss": 0.6904,
|
|
"step": 3820
|
|
},
|
|
{
|
|
"epoch": 0.51,
|
|
"grad_norm": 3.047471284866333,
|
|
"learning_rate": 9.865807064541878e-06,
|
|
"loss": 0.6943,
|
|
"step": 3830
|
|
},
|
|
{
|
|
"epoch": 0.51,
|
|
"grad_norm": 2.9526615142822266,
|
|
"learning_rate": 9.864055194332538e-06,
|
|
"loss": 0.6815,
|
|
"step": 3840
|
|
},
|
|
{
|
|
"epoch": 0.51,
|
|
"grad_norm": 3.2787699699401855,
|
|
"learning_rate": 9.862292120526536e-06,
|
|
"loss": 0.6791,
|
|
"step": 3850
|
|
},
|
|
{
|
|
"epoch": 0.51,
|
|
"grad_norm": 2.6856937408447266,
|
|
"learning_rate": 9.860517847184837e-06,
|
|
"loss": 0.6978,
|
|
"step": 3860
|
|
},
|
|
{
|
|
"epoch": 0.51,
|
|
"grad_norm": 2.927518367767334,
|
|
"learning_rate": 9.858732378394207e-06,
|
|
"loss": 0.6904,
|
|
"step": 3870
|
|
},
|
|
{
|
|
"epoch": 0.51,
|
|
"grad_norm": 4.556071758270264,
|
|
"learning_rate": 9.856935718267196e-06,
|
|
"loss": 0.6889,
|
|
"step": 3880
|
|
},
|
|
{
|
|
"epoch": 0.51,
|
|
"grad_norm": 2.559556245803833,
|
|
"learning_rate": 9.855127870942131e-06,
|
|
"loss": 0.69,
|
|
"step": 3890
|
|
},
|
|
{
|
|
"epoch": 0.52,
|
|
"grad_norm": 3.2897391319274902,
|
|
"learning_rate": 9.85330884058311e-06,
|
|
"loss": 0.6872,
|
|
"step": 3900
|
|
},
|
|
{
|
|
"epoch": 0.52,
|
|
"grad_norm": 2.722827196121216,
|
|
"learning_rate": 9.851478631379982e-06,
|
|
"loss": 0.6865,
|
|
"step": 3910
|
|
},
|
|
{
|
|
"epoch": 0.52,
|
|
"grad_norm": 3.322338581085205,
|
|
"learning_rate": 9.849637247548356e-06,
|
|
"loss": 0.6919,
|
|
"step": 3920
|
|
},
|
|
{
|
|
"epoch": 0.52,
|
|
"grad_norm": 2.7876293659210205,
|
|
"learning_rate": 9.847784693329571e-06,
|
|
"loss": 0.6665,
|
|
"step": 3930
|
|
},
|
|
{
|
|
"epoch": 0.52,
|
|
"grad_norm": 2.6033077239990234,
|
|
"learning_rate": 9.845920972990702e-06,
|
|
"loss": 0.6801,
|
|
"step": 3940
|
|
},
|
|
{
|
|
"epoch": 0.52,
|
|
"grad_norm": 2.751955032348633,
|
|
"learning_rate": 9.844046090824533e-06,
|
|
"loss": 0.667,
|
|
"step": 3950
|
|
},
|
|
{
|
|
"epoch": 0.52,
|
|
"grad_norm": 2.8029513359069824,
|
|
"learning_rate": 9.842160051149568e-06,
|
|
"loss": 0.6841,
|
|
"step": 3960
|
|
},
|
|
{
|
|
"epoch": 0.53,
|
|
"grad_norm": 3.345020294189453,
|
|
"learning_rate": 9.840262858310007e-06,
|
|
"loss": 0.684,
|
|
"step": 3970
|
|
},
|
|
{
|
|
"epoch": 0.53,
|
|
"grad_norm": 2.843904495239258,
|
|
"learning_rate": 9.83835451667574e-06,
|
|
"loss": 0.6878,
|
|
"step": 3980
|
|
},
|
|
{
|
|
"epoch": 0.53,
|
|
"grad_norm": 2.4852325916290283,
|
|
"learning_rate": 9.836435030642335e-06,
|
|
"loss": 0.7087,
|
|
"step": 3990
|
|
},
|
|
{
|
|
"epoch": 0.53,
|
|
"grad_norm": 3.3790409564971924,
|
|
"learning_rate": 9.834504404631032e-06,
|
|
"loss": 0.6913,
|
|
"step": 4000
|
|
},
|
|
{
|
|
"epoch": 0.53,
|
|
"grad_norm": 3.407959461212158,
|
|
"learning_rate": 9.832562643088724e-06,
|
|
"loss": 0.6912,
|
|
"step": 4010
|
|
},
|
|
{
|
|
"epoch": 0.53,
|
|
"grad_norm": 2.723505973815918,
|
|
"learning_rate": 9.830609750487963e-06,
|
|
"loss": 0.6927,
|
|
"step": 4020
|
|
},
|
|
{
|
|
"epoch": 0.53,
|
|
"grad_norm": 3.1616594791412354,
|
|
"learning_rate": 9.82864573132693e-06,
|
|
"loss": 0.6714,
|
|
"step": 4030
|
|
},
|
|
{
|
|
"epoch": 0.53,
|
|
"grad_norm": 3.0670251846313477,
|
|
"learning_rate": 9.826670590129442e-06,
|
|
"loss": 0.6685,
|
|
"step": 4040
|
|
},
|
|
{
|
|
"epoch": 0.54,
|
|
"grad_norm": 2.4355831146240234,
|
|
"learning_rate": 9.824684331444926e-06,
|
|
"loss": 0.6839,
|
|
"step": 4050
|
|
},
|
|
{
|
|
"epoch": 0.54,
|
|
"grad_norm": 2.7502357959747314,
|
|
"learning_rate": 9.822686959848425e-06,
|
|
"loss": 0.6925,
|
|
"step": 4060
|
|
},
|
|
{
|
|
"epoch": 0.54,
|
|
"grad_norm": 3.705983877182007,
|
|
"learning_rate": 9.820678479940573e-06,
|
|
"loss": 0.6715,
|
|
"step": 4070
|
|
},
|
|
{
|
|
"epoch": 0.54,
|
|
"grad_norm": 2.4137468338012695,
|
|
"learning_rate": 9.818658896347591e-06,
|
|
"loss": 0.6882,
|
|
"step": 4080
|
|
},
|
|
{
|
|
"epoch": 0.54,
|
|
"grad_norm": 3.260124921798706,
|
|
"learning_rate": 9.81662821372128e-06,
|
|
"loss": 0.684,
|
|
"step": 4090
|
|
},
|
|
{
|
|
"epoch": 0.54,
|
|
"grad_norm": 3.2827420234680176,
|
|
"learning_rate": 9.814586436738998e-06,
|
|
"loss": 0.675,
|
|
"step": 4100
|
|
},
|
|
{
|
|
"epoch": 0.54,
|
|
"grad_norm": 2.2685201168060303,
|
|
"learning_rate": 9.812533570103663e-06,
|
|
"loss": 0.6636,
|
|
"step": 4110
|
|
},
|
|
{
|
|
"epoch": 0.55,
|
|
"grad_norm": 3.1365833282470703,
|
|
"learning_rate": 9.810469618543737e-06,
|
|
"loss": 0.6911,
|
|
"step": 4120
|
|
},
|
|
{
|
|
"epoch": 0.55,
|
|
"grad_norm": 2.9192795753479004,
|
|
"learning_rate": 9.808394586813209e-06,
|
|
"loss": 0.6955,
|
|
"step": 4130
|
|
},
|
|
{
|
|
"epoch": 0.55,
|
|
"grad_norm": 2.4191370010375977,
|
|
"learning_rate": 9.806308479691595e-06,
|
|
"loss": 0.6769,
|
|
"step": 4140
|
|
},
|
|
{
|
|
"epoch": 0.55,
|
|
"grad_norm": 4.024157524108887,
|
|
"learning_rate": 9.804211301983919e-06,
|
|
"loss": 0.6837,
|
|
"step": 4150
|
|
},
|
|
{
|
|
"epoch": 0.55,
|
|
"grad_norm": 2.6173815727233887,
|
|
"learning_rate": 9.802103058520704e-06,
|
|
"loss": 0.6703,
|
|
"step": 4160
|
|
},
|
|
{
|
|
"epoch": 0.55,
|
|
"grad_norm": 2.637032985687256,
|
|
"learning_rate": 9.799983754157961e-06,
|
|
"loss": 0.681,
|
|
"step": 4170
|
|
},
|
|
{
|
|
"epoch": 0.55,
|
|
"grad_norm": 2.4752278327941895,
|
|
"learning_rate": 9.797853393777182e-06,
|
|
"loss": 0.6667,
|
|
"step": 4180
|
|
},
|
|
{
|
|
"epoch": 0.55,
|
|
"grad_norm": 3.132769823074341,
|
|
"learning_rate": 9.795711982285317e-06,
|
|
"loss": 0.6903,
|
|
"step": 4190
|
|
},
|
|
{
|
|
"epoch": 0.56,
|
|
"grad_norm": 2.7234723567962646,
|
|
"learning_rate": 9.793559524614779e-06,
|
|
"loss": 0.6763,
|
|
"step": 4200
|
|
},
|
|
{
|
|
"epoch": 0.56,
|
|
"grad_norm": 2.6191039085388184,
|
|
"learning_rate": 9.791396025723418e-06,
|
|
"loss": 0.6732,
|
|
"step": 4210
|
|
},
|
|
{
|
|
"epoch": 0.56,
|
|
"grad_norm": 2.8403711318969727,
|
|
"learning_rate": 9.78922149059452e-06,
|
|
"loss": 0.676,
|
|
"step": 4220
|
|
},
|
|
{
|
|
"epoch": 0.56,
|
|
"grad_norm": 3.4012792110443115,
|
|
"learning_rate": 9.787035924236789e-06,
|
|
"loss": 0.6576,
|
|
"step": 4230
|
|
},
|
|
{
|
|
"epoch": 0.56,
|
|
"grad_norm": 3.256134271621704,
|
|
"learning_rate": 9.784839331684338e-06,
|
|
"loss": 0.7017,
|
|
"step": 4240
|
|
},
|
|
{
|
|
"epoch": 0.56,
|
|
"grad_norm": 2.8171510696411133,
|
|
"learning_rate": 9.782631717996675e-06,
|
|
"loss": 0.6764,
|
|
"step": 4250
|
|
},
|
|
{
|
|
"epoch": 0.56,
|
|
"grad_norm": 3.248218297958374,
|
|
"learning_rate": 9.780413088258698e-06,
|
|
"loss": 0.6807,
|
|
"step": 4260
|
|
},
|
|
{
|
|
"epoch": 0.57,
|
|
"grad_norm": 3.497915267944336,
|
|
"learning_rate": 9.778183447580675e-06,
|
|
"loss": 0.6714,
|
|
"step": 4270
|
|
},
|
|
{
|
|
"epoch": 0.57,
|
|
"grad_norm": 3.140228509902954,
|
|
"learning_rate": 9.775942801098241e-06,
|
|
"loss": 0.7066,
|
|
"step": 4280
|
|
},
|
|
{
|
|
"epoch": 0.57,
|
|
"grad_norm": 3.035100221633911,
|
|
"learning_rate": 9.773691153972375e-06,
|
|
"loss": 0.6803,
|
|
"step": 4290
|
|
},
|
|
{
|
|
"epoch": 0.57,
|
|
"grad_norm": 2.7118821144104004,
|
|
"learning_rate": 9.771428511389395e-06,
|
|
"loss": 0.6755,
|
|
"step": 4300
|
|
},
|
|
{
|
|
"epoch": 0.57,
|
|
"grad_norm": 3.035815477371216,
|
|
"learning_rate": 9.76915487856095e-06,
|
|
"loss": 0.6707,
|
|
"step": 4310
|
|
},
|
|
{
|
|
"epoch": 0.57,
|
|
"grad_norm": 3.3597121238708496,
|
|
"learning_rate": 9.766870260724e-06,
|
|
"loss": 0.6781,
|
|
"step": 4320
|
|
},
|
|
{
|
|
"epoch": 0.57,
|
|
"grad_norm": 3.151930809020996,
|
|
"learning_rate": 9.764574663140807e-06,
|
|
"loss": 0.6644,
|
|
"step": 4330
|
|
},
|
|
{
|
|
"epoch": 0.57,
|
|
"grad_norm": 2.971132278442383,
|
|
"learning_rate": 9.762268091098926e-06,
|
|
"loss": 0.6747,
|
|
"step": 4340
|
|
},
|
|
{
|
|
"epoch": 0.58,
|
|
"grad_norm": 2.8724772930145264,
|
|
"learning_rate": 9.759950549911185e-06,
|
|
"loss": 0.6802,
|
|
"step": 4350
|
|
},
|
|
{
|
|
"epoch": 0.58,
|
|
"grad_norm": 3.2583391666412354,
|
|
"learning_rate": 9.757622044915682e-06,
|
|
"loss": 0.6958,
|
|
"step": 4360
|
|
},
|
|
{
|
|
"epoch": 0.58,
|
|
"grad_norm": 2.8724751472473145,
|
|
"learning_rate": 9.755282581475769e-06,
|
|
"loss": 0.6673,
|
|
"step": 4370
|
|
},
|
|
{
|
|
"epoch": 0.58,
|
|
"grad_norm": 2.53116774559021,
|
|
"learning_rate": 9.752932164980033e-06,
|
|
"loss": 0.6771,
|
|
"step": 4380
|
|
},
|
|
{
|
|
"epoch": 0.58,
|
|
"grad_norm": 2.5583338737487793,
|
|
"learning_rate": 9.750570800842298e-06,
|
|
"loss": 0.6835,
|
|
"step": 4390
|
|
},
|
|
{
|
|
"epoch": 0.58,
|
|
"grad_norm": 2.7628743648529053,
|
|
"learning_rate": 9.748198494501598e-06,
|
|
"loss": 0.6759,
|
|
"step": 4400
|
|
},
|
|
{
|
|
"epoch": 0.58,
|
|
"grad_norm": 2.4657537937164307,
|
|
"learning_rate": 9.74581525142217e-06,
|
|
"loss": 0.6912,
|
|
"step": 4410
|
|
},
|
|
{
|
|
"epoch": 0.58,
|
|
"grad_norm": 2.4423508644104004,
|
|
"learning_rate": 9.74342107709345e-06,
|
|
"loss": 0.6584,
|
|
"step": 4420
|
|
},
|
|
{
|
|
"epoch": 0.59,
|
|
"grad_norm": 2.9297876358032227,
|
|
"learning_rate": 9.741015977030046e-06,
|
|
"loss": 0.6898,
|
|
"step": 4430
|
|
},
|
|
{
|
|
"epoch": 0.59,
|
|
"grad_norm": 4.085515022277832,
|
|
"learning_rate": 9.73859995677173e-06,
|
|
"loss": 0.6586,
|
|
"step": 4440
|
|
},
|
|
{
|
|
"epoch": 0.59,
|
|
"grad_norm": 3.312915802001953,
|
|
"learning_rate": 9.736173021883433e-06,
|
|
"loss": 0.6819,
|
|
"step": 4450
|
|
},
|
|
{
|
|
"epoch": 0.59,
|
|
"grad_norm": 2.7743465900421143,
|
|
"learning_rate": 9.733735177955219e-06,
|
|
"loss": 0.6621,
|
|
"step": 4460
|
|
},
|
|
{
|
|
"epoch": 0.59,
|
|
"grad_norm": 3.45530366897583,
|
|
"learning_rate": 9.73128643060229e-06,
|
|
"loss": 0.6838,
|
|
"step": 4470
|
|
},
|
|
{
|
|
"epoch": 0.59,
|
|
"grad_norm": 2.886432409286499,
|
|
"learning_rate": 9.728826785464948e-06,
|
|
"loss": 0.6859,
|
|
"step": 4480
|
|
},
|
|
{
|
|
"epoch": 0.59,
|
|
"grad_norm": 4.044523239135742,
|
|
"learning_rate": 9.72635624820861e-06,
|
|
"loss": 0.6832,
|
|
"step": 4490
|
|
},
|
|
{
|
|
"epoch": 0.6,
|
|
"grad_norm": 2.746330499649048,
|
|
"learning_rate": 9.72387482452377e-06,
|
|
"loss": 0.6917,
|
|
"step": 4500
|
|
},
|
|
{
|
|
"epoch": 0.6,
|
|
"grad_norm": 3.0390594005584717,
|
|
"learning_rate": 9.72138252012601e-06,
|
|
"loss": 0.6656,
|
|
"step": 4510
|
|
},
|
|
{
|
|
"epoch": 0.6,
|
|
"grad_norm": 2.8167319297790527,
|
|
"learning_rate": 9.71887934075596e-06,
|
|
"loss": 0.6885,
|
|
"step": 4520
|
|
},
|
|
{
|
|
"epoch": 0.6,
|
|
"grad_norm": 2.6761083602905273,
|
|
"learning_rate": 9.716365292179309e-06,
|
|
"loss": 0.6942,
|
|
"step": 4530
|
|
},
|
|
{
|
|
"epoch": 0.6,
|
|
"grad_norm": 2.7335264682769775,
|
|
"learning_rate": 9.713840380186774e-06,
|
|
"loss": 0.684,
|
|
"step": 4540
|
|
},
|
|
{
|
|
"epoch": 0.6,
|
|
"grad_norm": 3.458198070526123,
|
|
"learning_rate": 9.711304610594104e-06,
|
|
"loss": 0.692,
|
|
"step": 4550
|
|
},
|
|
{
|
|
"epoch": 0.6,
|
|
"grad_norm": 2.902730703353882,
|
|
"learning_rate": 9.708757989242046e-06,
|
|
"loss": 0.6638,
|
|
"step": 4560
|
|
},
|
|
{
|
|
"epoch": 0.6,
|
|
"grad_norm": 2.5396640300750732,
|
|
"learning_rate": 9.706200521996348e-06,
|
|
"loss": 0.69,
|
|
"step": 4570
|
|
},
|
|
{
|
|
"epoch": 0.61,
|
|
"grad_norm": 2.7966129779815674,
|
|
"learning_rate": 9.703632214747742e-06,
|
|
"loss": 0.6832,
|
|
"step": 4580
|
|
},
|
|
{
|
|
"epoch": 0.61,
|
|
"grad_norm": 2.746511697769165,
|
|
"learning_rate": 9.701053073411923e-06,
|
|
"loss": 0.6749,
|
|
"step": 4590
|
|
},
|
|
{
|
|
"epoch": 0.61,
|
|
"grad_norm": 3.5651538372039795,
|
|
"learning_rate": 9.698463103929542e-06,
|
|
"loss": 0.6722,
|
|
"step": 4600
|
|
},
|
|
{
|
|
"epoch": 0.61,
|
|
"eval_loss": 0.7272596955299377,
|
|
"eval_runtime": 198.5906,
|
|
"eval_samples_per_second": 55.39,
|
|
"eval_steps_per_second": 6.924,
|
|
"step": 4600
|
|
},
|
|
{
|
|
"epoch": 0.61,
|
|
"grad_norm": 2.358306407928467,
|
|
"learning_rate": 9.695862312266195e-06,
|
|
"loss": 0.6808,
|
|
"step": 4610
|
|
},
|
|
{
|
|
"epoch": 0.61,
|
|
"grad_norm": 2.843318223953247,
|
|
"learning_rate": 9.6932507044124e-06,
|
|
"loss": 0.6852,
|
|
"step": 4620
|
|
},
|
|
{
|
|
"epoch": 0.61,
|
|
"grad_norm": 2.2861886024475098,
|
|
"learning_rate": 9.690628286383593e-06,
|
|
"loss": 0.6736,
|
|
"step": 4630
|
|
},
|
|
{
|
|
"epoch": 0.61,
|
|
"grad_norm": 2.2561638355255127,
|
|
"learning_rate": 9.687995064220102e-06,
|
|
"loss": 0.6789,
|
|
"step": 4640
|
|
},
|
|
{
|
|
"epoch": 0.62,
|
|
"grad_norm": 2.8437678813934326,
|
|
"learning_rate": 9.685351043987151e-06,
|
|
"loss": 0.6758,
|
|
"step": 4650
|
|
},
|
|
{
|
|
"epoch": 0.62,
|
|
"grad_norm": 2.547785758972168,
|
|
"learning_rate": 9.682696231774829e-06,
|
|
"loss": 0.6855,
|
|
"step": 4660
|
|
},
|
|
{
|
|
"epoch": 0.62,
|
|
"grad_norm": 2.6620736122131348,
|
|
"learning_rate": 9.680030633698083e-06,
|
|
"loss": 0.6711,
|
|
"step": 4670
|
|
},
|
|
{
|
|
"epoch": 0.62,
|
|
"grad_norm": 3.146510362625122,
|
|
"learning_rate": 9.677354255896706e-06,
|
|
"loss": 0.6641,
|
|
"step": 4680
|
|
},
|
|
{
|
|
"epoch": 0.62,
|
|
"grad_norm": 2.8901259899139404,
|
|
"learning_rate": 9.674667104535318e-06,
|
|
"loss": 0.6898,
|
|
"step": 4690
|
|
},
|
|
{
|
|
"epoch": 0.62,
|
|
"grad_norm": 2.88307523727417,
|
|
"learning_rate": 9.671969185803357e-06,
|
|
"loss": 0.6848,
|
|
"step": 4700
|
|
},
|
|
{
|
|
"epoch": 0.62,
|
|
"grad_norm": 2.6471335887908936,
|
|
"learning_rate": 9.669260505915057e-06,
|
|
"loss": 0.668,
|
|
"step": 4710
|
|
},
|
|
{
|
|
"epoch": 0.62,
|
|
"grad_norm": 2.9674232006073,
|
|
"learning_rate": 9.666541071109446e-06,
|
|
"loss": 0.6849,
|
|
"step": 4720
|
|
},
|
|
{
|
|
"epoch": 0.63,
|
|
"grad_norm": 2.370706081390381,
|
|
"learning_rate": 9.66381088765032e-06,
|
|
"loss": 0.6819,
|
|
"step": 4730
|
|
},
|
|
{
|
|
"epoch": 0.63,
|
|
"grad_norm": 2.7703073024749756,
|
|
"learning_rate": 9.661069961826228e-06,
|
|
"loss": 0.6674,
|
|
"step": 4740
|
|
},
|
|
{
|
|
"epoch": 0.63,
|
|
"grad_norm": 2.9082021713256836,
|
|
"learning_rate": 9.658318299950473e-06,
|
|
"loss": 0.6833,
|
|
"step": 4750
|
|
},
|
|
{
|
|
"epoch": 0.63,
|
|
"grad_norm": 3.0396716594696045,
|
|
"learning_rate": 9.65555590836108e-06,
|
|
"loss": 0.6657,
|
|
"step": 4760
|
|
},
|
|
{
|
|
"epoch": 0.63,
|
|
"grad_norm": 2.304875373840332,
|
|
"learning_rate": 9.652782793420789e-06,
|
|
"loss": 0.6964,
|
|
"step": 4770
|
|
},
|
|
{
|
|
"epoch": 0.63,
|
|
"grad_norm": 2.8251640796661377,
|
|
"learning_rate": 9.64999896151704e-06,
|
|
"loss": 0.6774,
|
|
"step": 4780
|
|
},
|
|
{
|
|
"epoch": 0.63,
|
|
"grad_norm": 3.527707815170288,
|
|
"learning_rate": 9.647204419061957e-06,
|
|
"loss": 0.6778,
|
|
"step": 4790
|
|
},
|
|
{
|
|
"epoch": 0.64,
|
|
"grad_norm": 2.566895008087158,
|
|
"learning_rate": 9.644399172492337e-06,
|
|
"loss": 0.681,
|
|
"step": 4800
|
|
},
|
|
{
|
|
"epoch": 0.64,
|
|
"grad_norm": 3.0783915519714355,
|
|
"learning_rate": 9.641583228269629e-06,
|
|
"loss": 0.6744,
|
|
"step": 4810
|
|
},
|
|
{
|
|
"epoch": 0.64,
|
|
"grad_norm": 2.982912302017212,
|
|
"learning_rate": 9.638756592879923e-06,
|
|
"loss": 0.6849,
|
|
"step": 4820
|
|
},
|
|
{
|
|
"epoch": 0.64,
|
|
"grad_norm": 2.7487356662750244,
|
|
"learning_rate": 9.635919272833938e-06,
|
|
"loss": 0.6709,
|
|
"step": 4830
|
|
},
|
|
{
|
|
"epoch": 0.64,
|
|
"grad_norm": 3.3017807006835938,
|
|
"learning_rate": 9.633071274666998e-06,
|
|
"loss": 0.6698,
|
|
"step": 4840
|
|
},
|
|
{
|
|
"epoch": 0.64,
|
|
"grad_norm": 2.7575645446777344,
|
|
"learning_rate": 9.630212604939026e-06,
|
|
"loss": 0.6823,
|
|
"step": 4850
|
|
},
|
|
{
|
|
"epoch": 0.64,
|
|
"grad_norm": 3.032663345336914,
|
|
"learning_rate": 9.627343270234526e-06,
|
|
"loss": 0.6754,
|
|
"step": 4860
|
|
},
|
|
{
|
|
"epoch": 0.64,
|
|
"grad_norm": 2.4695844650268555,
|
|
"learning_rate": 9.624463277162563e-06,
|
|
"loss": 0.6793,
|
|
"step": 4870
|
|
},
|
|
{
|
|
"epoch": 0.65,
|
|
"grad_norm": 2.7239301204681396,
|
|
"learning_rate": 9.621572632356754e-06,
|
|
"loss": 0.7041,
|
|
"step": 4880
|
|
},
|
|
{
|
|
"epoch": 0.65,
|
|
"grad_norm": 2.497579336166382,
|
|
"learning_rate": 9.618671342475252e-06,
|
|
"loss": 0.694,
|
|
"step": 4890
|
|
},
|
|
{
|
|
"epoch": 0.65,
|
|
"grad_norm": 2.7662250995635986,
|
|
"learning_rate": 9.615759414200729e-06,
|
|
"loss": 0.6739,
|
|
"step": 4900
|
|
},
|
|
{
|
|
"epoch": 0.65,
|
|
"grad_norm": 3.1290366649627686,
|
|
"learning_rate": 9.61283685424036e-06,
|
|
"loss": 0.6802,
|
|
"step": 4910
|
|
},
|
|
{
|
|
"epoch": 0.65,
|
|
"grad_norm": 2.9241154193878174,
|
|
"learning_rate": 9.609903669325807e-06,
|
|
"loss": 0.6859,
|
|
"step": 4920
|
|
},
|
|
{
|
|
"epoch": 0.65,
|
|
"grad_norm": 2.5501949787139893,
|
|
"learning_rate": 9.606959866213206e-06,
|
|
"loss": 0.6608,
|
|
"step": 4930
|
|
},
|
|
{
|
|
"epoch": 0.65,
|
|
"grad_norm": 3.447155475616455,
|
|
"learning_rate": 9.604005451683154e-06,
|
|
"loss": 0.6813,
|
|
"step": 4940
|
|
},
|
|
{
|
|
"epoch": 0.66,
|
|
"grad_norm": 2.4963529109954834,
|
|
"learning_rate": 9.601040432540684e-06,
|
|
"loss": 0.6743,
|
|
"step": 4950
|
|
},
|
|
{
|
|
"epoch": 0.66,
|
|
"grad_norm": 2.8612847328186035,
|
|
"learning_rate": 9.598064815615259e-06,
|
|
"loss": 0.6614,
|
|
"step": 4960
|
|
},
|
|
{
|
|
"epoch": 0.66,
|
|
"grad_norm": 3.20935320854187,
|
|
"learning_rate": 9.59507860776075e-06,
|
|
"loss": 0.6781,
|
|
"step": 4970
|
|
},
|
|
{
|
|
"epoch": 0.66,
|
|
"grad_norm": 2.885199546813965,
|
|
"learning_rate": 9.592081815855425e-06,
|
|
"loss": 0.6738,
|
|
"step": 4980
|
|
},
|
|
{
|
|
"epoch": 0.66,
|
|
"grad_norm": 3.2153961658477783,
|
|
"learning_rate": 9.589074446801928e-06,
|
|
"loss": 0.68,
|
|
"step": 4990
|
|
},
|
|
{
|
|
"epoch": 0.66,
|
|
"grad_norm": 2.5133445262908936,
|
|
"learning_rate": 9.586056507527266e-06,
|
|
"loss": 0.6822,
|
|
"step": 5000
|
|
},
|
|
{
|
|
"epoch": 0.66,
|
|
"grad_norm": 3.0179901123046875,
|
|
"learning_rate": 9.583028004982798e-06,
|
|
"loss": 0.675,
|
|
"step": 5010
|
|
},
|
|
{
|
|
"epoch": 0.66,
|
|
"grad_norm": 3.4788663387298584,
|
|
"learning_rate": 9.579988946144205e-06,
|
|
"loss": 0.6832,
|
|
"step": 5020
|
|
},
|
|
{
|
|
"epoch": 0.67,
|
|
"grad_norm": 2.847745418548584,
|
|
"learning_rate": 9.57693933801149e-06,
|
|
"loss": 0.6662,
|
|
"step": 5030
|
|
},
|
|
{
|
|
"epoch": 0.67,
|
|
"grad_norm": 2.8910322189331055,
|
|
"learning_rate": 9.573879187608954e-06,
|
|
"loss": 0.6732,
|
|
"step": 5040
|
|
},
|
|
{
|
|
"epoch": 0.67,
|
|
"grad_norm": 2.582803726196289,
|
|
"learning_rate": 9.570808501985176e-06,
|
|
"loss": 0.6782,
|
|
"step": 5050
|
|
},
|
|
{
|
|
"epoch": 0.67,
|
|
"grad_norm": 2.1921629905700684,
|
|
"learning_rate": 9.567727288213005e-06,
|
|
"loss": 0.6881,
|
|
"step": 5060
|
|
},
|
|
{
|
|
"epoch": 0.67,
|
|
"grad_norm": 2.068142890930176,
|
|
"learning_rate": 9.56463555338954e-06,
|
|
"loss": 0.6717,
|
|
"step": 5070
|
|
},
|
|
{
|
|
"epoch": 0.67,
|
|
"grad_norm": 2.0534846782684326,
|
|
"learning_rate": 9.561533304636111e-06,
|
|
"loss": 0.6575,
|
|
"step": 5080
|
|
},
|
|
{
|
|
"epoch": 0.67,
|
|
"grad_norm": 2.8238446712493896,
|
|
"learning_rate": 9.558420549098269e-06,
|
|
"loss": 0.6842,
|
|
"step": 5090
|
|
},
|
|
{
|
|
"epoch": 0.67,
|
|
"grad_norm": 3.434664249420166,
|
|
"learning_rate": 9.55529729394576e-06,
|
|
"loss": 0.6789,
|
|
"step": 5100
|
|
},
|
|
{
|
|
"epoch": 0.68,
|
|
"grad_norm": 2.864886522293091,
|
|
"learning_rate": 9.552163546372521e-06,
|
|
"loss": 0.6707,
|
|
"step": 5110
|
|
},
|
|
{
|
|
"epoch": 0.68,
|
|
"grad_norm": 2.919512987136841,
|
|
"learning_rate": 9.549019313596652e-06,
|
|
"loss": 0.675,
|
|
"step": 5120
|
|
},
|
|
{
|
|
"epoch": 0.68,
|
|
"grad_norm": 2.967341899871826,
|
|
"learning_rate": 9.545864602860406e-06,
|
|
"loss": 0.6915,
|
|
"step": 5130
|
|
},
|
|
{
|
|
"epoch": 0.68,
|
|
"grad_norm": 4.124980449676514,
|
|
"learning_rate": 9.542699421430169e-06,
|
|
"loss": 0.6707,
|
|
"step": 5140
|
|
},
|
|
{
|
|
"epoch": 0.68,
|
|
"grad_norm": 2.38964581489563,
|
|
"learning_rate": 9.539523776596446e-06,
|
|
"loss": 0.6779,
|
|
"step": 5150
|
|
},
|
|
{
|
|
"epoch": 0.68,
|
|
"grad_norm": 2.722057580947876,
|
|
"learning_rate": 9.536337675673842e-06,
|
|
"loss": 0.6912,
|
|
"step": 5160
|
|
},
|
|
{
|
|
"epoch": 0.68,
|
|
"grad_norm": 4.076712131500244,
|
|
"learning_rate": 9.533141126001048e-06,
|
|
"loss": 0.6835,
|
|
"step": 5170
|
|
},
|
|
{
|
|
"epoch": 0.69,
|
|
"grad_norm": 2.521733522415161,
|
|
"learning_rate": 9.529934134940819e-06,
|
|
"loss": 0.6741,
|
|
"step": 5180
|
|
},
|
|
{
|
|
"epoch": 0.69,
|
|
"grad_norm": 2.929415464401245,
|
|
"learning_rate": 9.526716709879961e-06,
|
|
"loss": 0.6681,
|
|
"step": 5190
|
|
},
|
|
{
|
|
"epoch": 0.69,
|
|
"grad_norm": 2.9488470554351807,
|
|
"learning_rate": 9.523488858229313e-06,
|
|
"loss": 0.6695,
|
|
"step": 5200
|
|
},
|
|
{
|
|
"epoch": 0.69,
|
|
"grad_norm": 2.68656849861145,
|
|
"learning_rate": 9.520250587423733e-06,
|
|
"loss": 0.6791,
|
|
"step": 5210
|
|
},
|
|
{
|
|
"epoch": 0.69,
|
|
"grad_norm": 2.4024336338043213,
|
|
"learning_rate": 9.517001904922074e-06,
|
|
"loss": 0.6861,
|
|
"step": 5220
|
|
},
|
|
{
|
|
"epoch": 0.69,
|
|
"grad_norm": 2.476743221282959,
|
|
"learning_rate": 9.513742818207173e-06,
|
|
"loss": 0.6895,
|
|
"step": 5230
|
|
},
|
|
{
|
|
"epoch": 0.69,
|
|
"grad_norm": 2.6581668853759766,
|
|
"learning_rate": 9.510473334785828e-06,
|
|
"loss": 0.677,
|
|
"step": 5240
|
|
},
|
|
{
|
|
"epoch": 0.69,
|
|
"grad_norm": 2.596719264984131,
|
|
"learning_rate": 9.507193462188791e-06,
|
|
"loss": 0.6842,
|
|
"step": 5250
|
|
},
|
|
{
|
|
"epoch": 0.7,
|
|
"grad_norm": 2.334949016571045,
|
|
"learning_rate": 9.503903207970735e-06,
|
|
"loss": 0.6732,
|
|
"step": 5260
|
|
},
|
|
{
|
|
"epoch": 0.7,
|
|
"grad_norm": 2.867070436477661,
|
|
"learning_rate": 9.500602579710256e-06,
|
|
"loss": 0.6676,
|
|
"step": 5270
|
|
},
|
|
{
|
|
"epoch": 0.7,
|
|
"grad_norm": 3.4152629375457764,
|
|
"learning_rate": 9.497291585009834e-06,
|
|
"loss": 0.6618,
|
|
"step": 5280
|
|
},
|
|
{
|
|
"epoch": 0.7,
|
|
"grad_norm": 2.69191837310791,
|
|
"learning_rate": 9.493970231495836e-06,
|
|
"loss": 0.6822,
|
|
"step": 5290
|
|
},
|
|
{
|
|
"epoch": 0.7,
|
|
"grad_norm": 3.0387730598449707,
|
|
"learning_rate": 9.490638526818482e-06,
|
|
"loss": 0.6809,
|
|
"step": 5300
|
|
},
|
|
{
|
|
"epoch": 0.7,
|
|
"grad_norm": 2.516139268875122,
|
|
"learning_rate": 9.487296478651838e-06,
|
|
"loss": 0.682,
|
|
"step": 5310
|
|
},
|
|
{
|
|
"epoch": 0.7,
|
|
"grad_norm": 2.8808937072753906,
|
|
"learning_rate": 9.48394409469379e-06,
|
|
"loss": 0.6829,
|
|
"step": 5320
|
|
},
|
|
{
|
|
"epoch": 0.71,
|
|
"grad_norm": 3.1148269176483154,
|
|
"learning_rate": 9.480581382666041e-06,
|
|
"loss": 0.666,
|
|
"step": 5330
|
|
},
|
|
{
|
|
"epoch": 0.71,
|
|
"grad_norm": 2.933945894241333,
|
|
"learning_rate": 9.477208350314072e-06,
|
|
"loss": 0.6554,
|
|
"step": 5340
|
|
},
|
|
{
|
|
"epoch": 0.71,
|
|
"grad_norm": 2.9011828899383545,
|
|
"learning_rate": 9.47382500540714e-06,
|
|
"loss": 0.6724,
|
|
"step": 5350
|
|
},
|
|
{
|
|
"epoch": 0.71,
|
|
"grad_norm": 3.7872607707977295,
|
|
"learning_rate": 9.470431355738257e-06,
|
|
"loss": 0.6785,
|
|
"step": 5360
|
|
},
|
|
{
|
|
"epoch": 0.71,
|
|
"grad_norm": 2.19966197013855,
|
|
"learning_rate": 9.467027409124167e-06,
|
|
"loss": 0.6767,
|
|
"step": 5370
|
|
},
|
|
{
|
|
"epoch": 0.71,
|
|
"grad_norm": 2.590169668197632,
|
|
"learning_rate": 9.463613173405335e-06,
|
|
"loss": 0.6587,
|
|
"step": 5380
|
|
},
|
|
{
|
|
"epoch": 0.71,
|
|
"grad_norm": 3.284235954284668,
|
|
"learning_rate": 9.460188656445921e-06,
|
|
"loss": 0.6819,
|
|
"step": 5390
|
|
},
|
|
{
|
|
"epoch": 0.71,
|
|
"grad_norm": 2.668703556060791,
|
|
"learning_rate": 9.45675386613377e-06,
|
|
"loss": 0.6675,
|
|
"step": 5400
|
|
},
|
|
{
|
|
"epoch": 0.72,
|
|
"grad_norm": 3.212360382080078,
|
|
"learning_rate": 9.453308810380388e-06,
|
|
"loss": 0.6832,
|
|
"step": 5410
|
|
},
|
|
{
|
|
"epoch": 0.72,
|
|
"grad_norm": 3.038121461868286,
|
|
"learning_rate": 9.449853497120928e-06,
|
|
"loss": 0.6987,
|
|
"step": 5420
|
|
},
|
|
{
|
|
"epoch": 0.72,
|
|
"grad_norm": 2.2262122631073,
|
|
"learning_rate": 9.446387934314167e-06,
|
|
"loss": 0.6688,
|
|
"step": 5430
|
|
},
|
|
{
|
|
"epoch": 0.72,
|
|
"grad_norm": 2.7394657135009766,
|
|
"learning_rate": 9.442912129942491e-06,
|
|
"loss": 0.6788,
|
|
"step": 5440
|
|
},
|
|
{
|
|
"epoch": 0.72,
|
|
"grad_norm": 2.70332932472229,
|
|
"learning_rate": 9.439426092011877e-06,
|
|
"loss": 0.671,
|
|
"step": 5450
|
|
},
|
|
{
|
|
"epoch": 0.72,
|
|
"grad_norm": 2.807642936706543,
|
|
"learning_rate": 9.435929828551872e-06,
|
|
"loss": 0.6748,
|
|
"step": 5460
|
|
},
|
|
{
|
|
"epoch": 0.72,
|
|
"grad_norm": 3.0139355659484863,
|
|
"learning_rate": 9.432423347615578e-06,
|
|
"loss": 0.6723,
|
|
"step": 5470
|
|
},
|
|
{
|
|
"epoch": 0.73,
|
|
"grad_norm": 3.2098162174224854,
|
|
"learning_rate": 9.428906657279629e-06,
|
|
"loss": 0.6717,
|
|
"step": 5480
|
|
},
|
|
{
|
|
"epoch": 0.73,
|
|
"grad_norm": 2.6355557441711426,
|
|
"learning_rate": 9.425379765644174e-06,
|
|
"loss": 0.6816,
|
|
"step": 5490
|
|
},
|
|
{
|
|
"epoch": 0.73,
|
|
"grad_norm": 2.0204873085021973,
|
|
"learning_rate": 9.421842680832862e-06,
|
|
"loss": 0.6671,
|
|
"step": 5500
|
|
},
|
|
{
|
|
"epoch": 0.73,
|
|
"grad_norm": 3.1098544597625732,
|
|
"learning_rate": 9.418295410992821e-06,
|
|
"loss": 0.6911,
|
|
"step": 5510
|
|
},
|
|
{
|
|
"epoch": 0.73,
|
|
"grad_norm": 3.0662097930908203,
|
|
"learning_rate": 9.414737964294636e-06,
|
|
"loss": 0.6846,
|
|
"step": 5520
|
|
},
|
|
{
|
|
"epoch": 0.73,
|
|
"grad_norm": 3.711937665939331,
|
|
"learning_rate": 9.411170348932333e-06,
|
|
"loss": 0.6731,
|
|
"step": 5530
|
|
},
|
|
{
|
|
"epoch": 0.73,
|
|
"grad_norm": 2.982342481613159,
|
|
"learning_rate": 9.407592573123359e-06,
|
|
"loss": 0.6747,
|
|
"step": 5540
|
|
},
|
|
{
|
|
"epoch": 0.73,
|
|
"grad_norm": 2.78140926361084,
|
|
"learning_rate": 9.40400464510857e-06,
|
|
"loss": 0.6787,
|
|
"step": 5550
|
|
},
|
|
{
|
|
"epoch": 0.74,
|
|
"grad_norm": 3.521254539489746,
|
|
"learning_rate": 9.400406573152196e-06,
|
|
"loss": 0.6891,
|
|
"step": 5560
|
|
},
|
|
{
|
|
"epoch": 0.74,
|
|
"grad_norm": 2.704249620437622,
|
|
"learning_rate": 9.396798365541841e-06,
|
|
"loss": 0.6823,
|
|
"step": 5570
|
|
},
|
|
{
|
|
"epoch": 0.74,
|
|
"grad_norm": 2.6704928874969482,
|
|
"learning_rate": 9.393180030588454e-06,
|
|
"loss": 0.6814,
|
|
"step": 5580
|
|
},
|
|
{
|
|
"epoch": 0.74,
|
|
"grad_norm": 2.9716169834136963,
|
|
"learning_rate": 9.389551576626303e-06,
|
|
"loss": 0.6786,
|
|
"step": 5590
|
|
},
|
|
{
|
|
"epoch": 0.74,
|
|
"grad_norm": 2.880004405975342,
|
|
"learning_rate": 9.385913012012972e-06,
|
|
"loss": 0.6775,
|
|
"step": 5600
|
|
},
|
|
{
|
|
"epoch": 0.74,
|
|
"grad_norm": 2.5522663593292236,
|
|
"learning_rate": 9.382264345129329e-06,
|
|
"loss": 0.6827,
|
|
"step": 5610
|
|
},
|
|
{
|
|
"epoch": 0.74,
|
|
"grad_norm": 2.8483026027679443,
|
|
"learning_rate": 9.378605584379515e-06,
|
|
"loss": 0.656,
|
|
"step": 5620
|
|
},
|
|
{
|
|
"epoch": 0.75,
|
|
"grad_norm": 2.7561230659484863,
|
|
"learning_rate": 9.374936738190913e-06,
|
|
"loss": 0.6694,
|
|
"step": 5630
|
|
},
|
|
{
|
|
"epoch": 0.75,
|
|
"grad_norm": 3.149887800216675,
|
|
"learning_rate": 9.371257815014145e-06,
|
|
"loss": 0.6782,
|
|
"step": 5640
|
|
},
|
|
{
|
|
"epoch": 0.75,
|
|
"grad_norm": 2.629521369934082,
|
|
"learning_rate": 9.367568823323039e-06,
|
|
"loss": 0.6758,
|
|
"step": 5650
|
|
},
|
|
{
|
|
"epoch": 0.75,
|
|
"grad_norm": 3.5836355686187744,
|
|
"learning_rate": 9.363869771614615e-06,
|
|
"loss": 0.6738,
|
|
"step": 5660
|
|
},
|
|
{
|
|
"epoch": 0.75,
|
|
"grad_norm": 2.83819317817688,
|
|
"learning_rate": 9.360160668409063e-06,
|
|
"loss": 0.6734,
|
|
"step": 5670
|
|
},
|
|
{
|
|
"epoch": 0.75,
|
|
"grad_norm": 2.5035979747772217,
|
|
"learning_rate": 9.35644152224973e-06,
|
|
"loss": 0.6804,
|
|
"step": 5680
|
|
},
|
|
{
|
|
"epoch": 0.75,
|
|
"grad_norm": 2.4892172813415527,
|
|
"learning_rate": 9.35271234170309e-06,
|
|
"loss": 0.6599,
|
|
"step": 5690
|
|
},
|
|
{
|
|
"epoch": 0.75,
|
|
"grad_norm": 3.170015811920166,
|
|
"learning_rate": 9.348973135358734e-06,
|
|
"loss": 0.6771,
|
|
"step": 5700
|
|
},
|
|
{
|
|
"epoch": 0.76,
|
|
"grad_norm": 2.2458276748657227,
|
|
"learning_rate": 9.345223911829343e-06,
|
|
"loss": 0.6785,
|
|
"step": 5710
|
|
},
|
|
{
|
|
"epoch": 0.76,
|
|
"grad_norm": 2.567450761795044,
|
|
"learning_rate": 9.341464679750669e-06,
|
|
"loss": 0.6732,
|
|
"step": 5720
|
|
},
|
|
{
|
|
"epoch": 0.76,
|
|
"grad_norm": 2.638319730758667,
|
|
"learning_rate": 9.337695447781525e-06,
|
|
"loss": 0.6753,
|
|
"step": 5730
|
|
},
|
|
{
|
|
"epoch": 0.76,
|
|
"grad_norm": 3.1988255977630615,
|
|
"learning_rate": 9.333916224603747e-06,
|
|
"loss": 0.6776,
|
|
"step": 5740
|
|
},
|
|
{
|
|
"epoch": 0.76,
|
|
"grad_norm": 2.337787389755249,
|
|
"learning_rate": 9.330127018922195e-06,
|
|
"loss": 0.6766,
|
|
"step": 5750
|
|
},
|
|
{
|
|
"epoch": 0.76,
|
|
"grad_norm": 2.9823319911956787,
|
|
"learning_rate": 9.326327839464711e-06,
|
|
"loss": 0.6749,
|
|
"step": 5760
|
|
},
|
|
{
|
|
"epoch": 0.76,
|
|
"grad_norm": 2.39164662361145,
|
|
"learning_rate": 9.322518694982119e-06,
|
|
"loss": 0.6703,
|
|
"step": 5770
|
|
},
|
|
{
|
|
"epoch": 0.76,
|
|
"grad_norm": 2.7940139770507812,
|
|
"learning_rate": 9.318699594248192e-06,
|
|
"loss": 0.6612,
|
|
"step": 5780
|
|
},
|
|
{
|
|
"epoch": 0.77,
|
|
"grad_norm": 2.8293371200561523,
|
|
"learning_rate": 9.314870546059636e-06,
|
|
"loss": 0.6598,
|
|
"step": 5790
|
|
},
|
|
{
|
|
"epoch": 0.77,
|
|
"grad_norm": 2.8672988414764404,
|
|
"learning_rate": 9.311031559236067e-06,
|
|
"loss": 0.6811,
|
|
"step": 5800
|
|
},
|
|
{
|
|
"epoch": 0.77,
|
|
"grad_norm": 2.592622995376587,
|
|
"learning_rate": 9.307182642620001e-06,
|
|
"loss": 0.6857,
|
|
"step": 5810
|
|
},
|
|
{
|
|
"epoch": 0.77,
|
|
"grad_norm": 3.7279140949249268,
|
|
"learning_rate": 9.303323805076816e-06,
|
|
"loss": 0.6606,
|
|
"step": 5820
|
|
},
|
|
{
|
|
"epoch": 0.77,
|
|
"grad_norm": 3.0750820636749268,
|
|
"learning_rate": 9.299455055494747e-06,
|
|
"loss": 0.6766,
|
|
"step": 5830
|
|
},
|
|
{
|
|
"epoch": 0.77,
|
|
"grad_norm": 3.0468876361846924,
|
|
"learning_rate": 9.295576402784858e-06,
|
|
"loss": 0.6675,
|
|
"step": 5840
|
|
},
|
|
{
|
|
"epoch": 0.77,
|
|
"grad_norm": 3.2743566036224365,
|
|
"learning_rate": 9.291687855881027e-06,
|
|
"loss": 0.6842,
|
|
"step": 5850
|
|
},
|
|
{
|
|
"epoch": 0.78,
|
|
"grad_norm": 3.0007123947143555,
|
|
"learning_rate": 9.287789423739915e-06,
|
|
"loss": 0.6631,
|
|
"step": 5860
|
|
},
|
|
{
|
|
"epoch": 0.78,
|
|
"grad_norm": 2.861750602722168,
|
|
"learning_rate": 9.283881115340957e-06,
|
|
"loss": 0.6624,
|
|
"step": 5870
|
|
},
|
|
{
|
|
"epoch": 0.78,
|
|
"grad_norm": 2.6511833667755127,
|
|
"learning_rate": 9.279962939686333e-06,
|
|
"loss": 0.6735,
|
|
"step": 5880
|
|
},
|
|
{
|
|
"epoch": 0.78,
|
|
"grad_norm": 2.709005832672119,
|
|
"learning_rate": 9.276034905800957e-06,
|
|
"loss": 0.6769,
|
|
"step": 5890
|
|
},
|
|
{
|
|
"epoch": 0.78,
|
|
"grad_norm": 3.223933696746826,
|
|
"learning_rate": 9.272097022732444e-06,
|
|
"loss": 0.6818,
|
|
"step": 5900
|
|
},
|
|
{
|
|
"epoch": 0.78,
|
|
"grad_norm": 2.787783622741699,
|
|
"learning_rate": 9.268149299551095e-06,
|
|
"loss": 0.6856,
|
|
"step": 5910
|
|
},
|
|
{
|
|
"epoch": 0.78,
|
|
"grad_norm": 3.5154056549072266,
|
|
"learning_rate": 9.264191745349882e-06,
|
|
"loss": 0.6682,
|
|
"step": 5920
|
|
},
|
|
{
|
|
"epoch": 0.78,
|
|
"grad_norm": 3.194385528564453,
|
|
"learning_rate": 9.260224369244414e-06,
|
|
"loss": 0.6659,
|
|
"step": 5930
|
|
},
|
|
{
|
|
"epoch": 0.79,
|
|
"grad_norm": 2.8617637157440186,
|
|
"learning_rate": 9.256247180372927e-06,
|
|
"loss": 0.6855,
|
|
"step": 5940
|
|
},
|
|
{
|
|
"epoch": 0.79,
|
|
"grad_norm": 2.8789777755737305,
|
|
"learning_rate": 9.252260187896257e-06,
|
|
"loss": 0.6829,
|
|
"step": 5950
|
|
},
|
|
{
|
|
"epoch": 0.79,
|
|
"grad_norm": 2.897092580795288,
|
|
"learning_rate": 9.248263400997826e-06,
|
|
"loss": 0.6744,
|
|
"step": 5960
|
|
},
|
|
{
|
|
"epoch": 0.79,
|
|
"grad_norm": 2.86773943901062,
|
|
"learning_rate": 9.244256828883611e-06,
|
|
"loss": 0.6867,
|
|
"step": 5970
|
|
},
|
|
{
|
|
"epoch": 0.79,
|
|
"grad_norm": 2.738723039627075,
|
|
"learning_rate": 9.24024048078213e-06,
|
|
"loss": 0.6718,
|
|
"step": 5980
|
|
},
|
|
{
|
|
"epoch": 0.79,
|
|
"grad_norm": 2.6888554096221924,
|
|
"learning_rate": 9.236214365944418e-06,
|
|
"loss": 0.6711,
|
|
"step": 5990
|
|
},
|
|
{
|
|
"epoch": 0.79,
|
|
"grad_norm": 3.0452592372894287,
|
|
"learning_rate": 9.232178493644006e-06,
|
|
"loss": 0.6816,
|
|
"step": 6000
|
|
},
|
|
{
|
|
"epoch": 0.8,
|
|
"grad_norm": 2.9094676971435547,
|
|
"learning_rate": 9.228132873176899e-06,
|
|
"loss": 0.6817,
|
|
"step": 6010
|
|
},
|
|
{
|
|
"epoch": 0.8,
|
|
"grad_norm": 2.685194492340088,
|
|
"learning_rate": 9.224077513861556e-06,
|
|
"loss": 0.6684,
|
|
"step": 6020
|
|
},
|
|
{
|
|
"epoch": 0.8,
|
|
"grad_norm": 2.4088940620422363,
|
|
"learning_rate": 9.22001242503887e-06,
|
|
"loss": 0.6707,
|
|
"step": 6030
|
|
},
|
|
{
|
|
"epoch": 0.8,
|
|
"grad_norm": 2.5745279788970947,
|
|
"learning_rate": 9.21593761607214e-06,
|
|
"loss": 0.6866,
|
|
"step": 6040
|
|
},
|
|
{
|
|
"epoch": 0.8,
|
|
"grad_norm": 2.8593175411224365,
|
|
"learning_rate": 9.211853096347059e-06,
|
|
"loss": 0.6713,
|
|
"step": 6050
|
|
},
|
|
{
|
|
"epoch": 0.8,
|
|
"grad_norm": 2.8683552742004395,
|
|
"learning_rate": 9.207758875271683e-06,
|
|
"loss": 0.6566,
|
|
"step": 6060
|
|
},
|
|
{
|
|
"epoch": 0.8,
|
|
"grad_norm": 2.874685287475586,
|
|
"learning_rate": 9.203654962276415e-06,
|
|
"loss": 0.6791,
|
|
"step": 6070
|
|
},
|
|
{
|
|
"epoch": 0.8,
|
|
"grad_norm": 2.6957099437713623,
|
|
"learning_rate": 9.199541366813984e-06,
|
|
"loss": 0.6688,
|
|
"step": 6080
|
|
},
|
|
{
|
|
"epoch": 0.81,
|
|
"grad_norm": 3.0089595317840576,
|
|
"learning_rate": 9.195418098359417e-06,
|
|
"loss": 0.6708,
|
|
"step": 6090
|
|
},
|
|
{
|
|
"epoch": 0.81,
|
|
"grad_norm": 2.352421760559082,
|
|
"learning_rate": 9.191285166410023e-06,
|
|
"loss": 0.6637,
|
|
"step": 6100
|
|
},
|
|
{
|
|
"epoch": 0.81,
|
|
"grad_norm": 2.6234710216522217,
|
|
"learning_rate": 9.18714258048537e-06,
|
|
"loss": 0.6764,
|
|
"step": 6110
|
|
},
|
|
{
|
|
"epoch": 0.81,
|
|
"grad_norm": 3.9939608573913574,
|
|
"learning_rate": 9.182990350127265e-06,
|
|
"loss": 0.6553,
|
|
"step": 6120
|
|
},
|
|
{
|
|
"epoch": 0.81,
|
|
"grad_norm": 3.0175065994262695,
|
|
"learning_rate": 9.178828484899724e-06,
|
|
"loss": 0.6709,
|
|
"step": 6130
|
|
},
|
|
{
|
|
"epoch": 0.81,
|
|
"grad_norm": 3.120433807373047,
|
|
"learning_rate": 9.174656994388957e-06,
|
|
"loss": 0.6739,
|
|
"step": 6140
|
|
},
|
|
{
|
|
"epoch": 0.81,
|
|
"grad_norm": 2.466984748840332,
|
|
"learning_rate": 9.170475888203348e-06,
|
|
"loss": 0.6652,
|
|
"step": 6150
|
|
},
|
|
{
|
|
"epoch": 0.82,
|
|
"grad_norm": 2.471644639968872,
|
|
"learning_rate": 9.166285175973424e-06,
|
|
"loss": 0.6822,
|
|
"step": 6160
|
|
},
|
|
{
|
|
"epoch": 0.82,
|
|
"grad_norm": 2.809940814971924,
|
|
"learning_rate": 9.16208486735184e-06,
|
|
"loss": 0.6806,
|
|
"step": 6170
|
|
},
|
|
{
|
|
"epoch": 0.82,
|
|
"grad_norm": 3.266305446624756,
|
|
"learning_rate": 9.157874972013361e-06,
|
|
"loss": 0.6742,
|
|
"step": 6180
|
|
},
|
|
{
|
|
"epoch": 0.82,
|
|
"grad_norm": 2.68939208984375,
|
|
"learning_rate": 9.153655499654824e-06,
|
|
"loss": 0.6778,
|
|
"step": 6190
|
|
},
|
|
{
|
|
"epoch": 0.82,
|
|
"grad_norm": 2.9182255268096924,
|
|
"learning_rate": 9.149426459995127e-06,
|
|
"loss": 0.6688,
|
|
"step": 6200
|
|
},
|
|
{
|
|
"epoch": 0.82,
|
|
"grad_norm": 2.9277961254119873,
|
|
"learning_rate": 9.145187862775208e-06,
|
|
"loss": 0.676,
|
|
"step": 6210
|
|
},
|
|
{
|
|
"epoch": 0.82,
|
|
"grad_norm": 2.709533214569092,
|
|
"learning_rate": 9.140939717758022e-06,
|
|
"loss": 0.6713,
|
|
"step": 6220
|
|
},
|
|
{
|
|
"epoch": 0.82,
|
|
"grad_norm": 3.068077325820923,
|
|
"learning_rate": 9.136682034728508e-06,
|
|
"loss": 0.6623,
|
|
"step": 6230
|
|
},
|
|
{
|
|
"epoch": 0.83,
|
|
"grad_norm": 2.503319263458252,
|
|
"learning_rate": 9.13241482349358e-06,
|
|
"loss": 0.686,
|
|
"step": 6240
|
|
},
|
|
{
|
|
"epoch": 0.83,
|
|
"grad_norm": 2.557908773422241,
|
|
"learning_rate": 9.128138093882098e-06,
|
|
"loss": 0.674,
|
|
"step": 6250
|
|
},
|
|
{
|
|
"epoch": 0.83,
|
|
"grad_norm": 2.8420028686523438,
|
|
"learning_rate": 9.123851855744842e-06,
|
|
"loss": 0.6606,
|
|
"step": 6260
|
|
},
|
|
{
|
|
"epoch": 0.83,
|
|
"grad_norm": 3.4535250663757324,
|
|
"learning_rate": 9.119556118954503e-06,
|
|
"loss": 0.6702,
|
|
"step": 6270
|
|
},
|
|
{
|
|
"epoch": 0.83,
|
|
"grad_norm": 2.663339138031006,
|
|
"learning_rate": 9.115250893405637e-06,
|
|
"loss": 0.6788,
|
|
"step": 6280
|
|
},
|
|
{
|
|
"epoch": 0.83,
|
|
"grad_norm": 2.507500410079956,
|
|
"learning_rate": 9.110936189014668e-06,
|
|
"loss": 0.6631,
|
|
"step": 6290
|
|
},
|
|
{
|
|
"epoch": 0.83,
|
|
"grad_norm": 2.51786732673645,
|
|
"learning_rate": 9.106612015719845e-06,
|
|
"loss": 0.6617,
|
|
"step": 6300
|
|
},
|
|
{
|
|
"epoch": 0.84,
|
|
"grad_norm": 3.4296956062316895,
|
|
"learning_rate": 9.102278383481235e-06,
|
|
"loss": 0.6818,
|
|
"step": 6310
|
|
},
|
|
{
|
|
"epoch": 0.84,
|
|
"grad_norm": 2.9477152824401855,
|
|
"learning_rate": 9.097935302280682e-06,
|
|
"loss": 0.6797,
|
|
"step": 6320
|
|
},
|
|
{
|
|
"epoch": 0.84,
|
|
"grad_norm": 2.557518243789673,
|
|
"learning_rate": 9.093582782121805e-06,
|
|
"loss": 0.6741,
|
|
"step": 6330
|
|
},
|
|
{
|
|
"epoch": 0.84,
|
|
"grad_norm": 2.746699571609497,
|
|
"learning_rate": 9.089220833029957e-06,
|
|
"loss": 0.6732,
|
|
"step": 6340
|
|
},
|
|
{
|
|
"epoch": 0.84,
|
|
"grad_norm": 2.7738733291625977,
|
|
"learning_rate": 9.08484946505221e-06,
|
|
"loss": 0.672,
|
|
"step": 6350
|
|
},
|
|
{
|
|
"epoch": 0.84,
|
|
"grad_norm": 3.2371561527252197,
|
|
"learning_rate": 9.080468688257334e-06,
|
|
"loss": 0.6836,
|
|
"step": 6360
|
|
},
|
|
{
|
|
"epoch": 0.84,
|
|
"grad_norm": 2.3297362327575684,
|
|
"learning_rate": 9.07607851273577e-06,
|
|
"loss": 0.6739,
|
|
"step": 6370
|
|
},
|
|
{
|
|
"epoch": 0.84,
|
|
"grad_norm": 2.604583740234375,
|
|
"learning_rate": 9.0716789485996e-06,
|
|
"loss": 0.6861,
|
|
"step": 6380
|
|
},
|
|
{
|
|
"epoch": 0.85,
|
|
"grad_norm": 2.323979616165161,
|
|
"learning_rate": 9.067270005982545e-06,
|
|
"loss": 0.673,
|
|
"step": 6390
|
|
},
|
|
{
|
|
"epoch": 0.85,
|
|
"grad_norm": 3.385627269744873,
|
|
"learning_rate": 9.062851695039915e-06,
|
|
"loss": 0.6733,
|
|
"step": 6400
|
|
},
|
|
{
|
|
"epoch": 0.85,
|
|
"grad_norm": 3.809943914413452,
|
|
"learning_rate": 9.058424025948609e-06,
|
|
"loss": 0.6802,
|
|
"step": 6410
|
|
},
|
|
{
|
|
"epoch": 0.85,
|
|
"grad_norm": 3.2964417934417725,
|
|
"learning_rate": 9.053987008907071e-06,
|
|
"loss": 0.6912,
|
|
"step": 6420
|
|
},
|
|
{
|
|
"epoch": 0.85,
|
|
"grad_norm": 3.160759210586548,
|
|
"learning_rate": 9.049540654135285e-06,
|
|
"loss": 0.6672,
|
|
"step": 6430
|
|
},
|
|
{
|
|
"epoch": 0.85,
|
|
"grad_norm": 3.6694741249084473,
|
|
"learning_rate": 9.045084971874738e-06,
|
|
"loss": 0.6843,
|
|
"step": 6440
|
|
},
|
|
{
|
|
"epoch": 0.85,
|
|
"grad_norm": 3.6869866847991943,
|
|
"learning_rate": 9.040619972388402e-06,
|
|
"loss": 0.671,
|
|
"step": 6450
|
|
},
|
|
{
|
|
"epoch": 0.85,
|
|
"grad_norm": 2.74727463722229,
|
|
"learning_rate": 9.036145665960715e-06,
|
|
"loss": 0.6783,
|
|
"step": 6460
|
|
},
|
|
{
|
|
"epoch": 0.86,
|
|
"grad_norm": 2.88826322555542,
|
|
"learning_rate": 9.03166206289754e-06,
|
|
"loss": 0.6547,
|
|
"step": 6470
|
|
},
|
|
{
|
|
"epoch": 0.86,
|
|
"grad_norm": 2.761207103729248,
|
|
"learning_rate": 9.02716917352617e-06,
|
|
"loss": 0.6739,
|
|
"step": 6480
|
|
},
|
|
{
|
|
"epoch": 0.86,
|
|
"grad_norm": 2.660529375076294,
|
|
"learning_rate": 9.022667008195273e-06,
|
|
"loss": 0.6595,
|
|
"step": 6490
|
|
},
|
|
{
|
|
"epoch": 0.86,
|
|
"grad_norm": 2.8409204483032227,
|
|
"learning_rate": 9.018155577274891e-06,
|
|
"loss": 0.6881,
|
|
"step": 6500
|
|
},
|
|
{
|
|
"epoch": 0.86,
|
|
"grad_norm": 2.212315320968628,
|
|
"learning_rate": 9.013634891156404e-06,
|
|
"loss": 0.6872,
|
|
"step": 6510
|
|
},
|
|
{
|
|
"epoch": 0.86,
|
|
"grad_norm": 3.3113622665405273,
|
|
"learning_rate": 9.009104960252513e-06,
|
|
"loss": 0.6761,
|
|
"step": 6520
|
|
},
|
|
{
|
|
"epoch": 0.86,
|
|
"grad_norm": 2.797126293182373,
|
|
"learning_rate": 9.004565794997209e-06,
|
|
"loss": 0.6741,
|
|
"step": 6530
|
|
},
|
|
{
|
|
"epoch": 0.87,
|
|
"grad_norm": 2.9894332885742188,
|
|
"learning_rate": 9.000017405845755e-06,
|
|
"loss": 0.6835,
|
|
"step": 6540
|
|
},
|
|
{
|
|
"epoch": 0.87,
|
|
"grad_norm": 2.9207139015197754,
|
|
"learning_rate": 8.995459803274664e-06,
|
|
"loss": 0.674,
|
|
"step": 6550
|
|
},
|
|
{
|
|
"epoch": 0.87,
|
|
"grad_norm": 2.9169623851776123,
|
|
"learning_rate": 8.990892997781661e-06,
|
|
"loss": 0.6419,
|
|
"step": 6560
|
|
},
|
|
{
|
|
"epoch": 0.87,
|
|
"grad_norm": 2.9211723804473877,
|
|
"learning_rate": 8.986316999885678e-06,
|
|
"loss": 0.6581,
|
|
"step": 6570
|
|
},
|
|
{
|
|
"epoch": 0.87,
|
|
"grad_norm": 2.616330623626709,
|
|
"learning_rate": 8.981731820126816e-06,
|
|
"loss": 0.6741,
|
|
"step": 6580
|
|
},
|
|
{
|
|
"epoch": 0.87,
|
|
"grad_norm": 2.5580813884735107,
|
|
"learning_rate": 8.977137469066321e-06,
|
|
"loss": 0.6741,
|
|
"step": 6590
|
|
},
|
|
{
|
|
"epoch": 0.87,
|
|
"grad_norm": 2.785909414291382,
|
|
"learning_rate": 8.972533957286574e-06,
|
|
"loss": 0.6784,
|
|
"step": 6600
|
|
},
|
|
{
|
|
"epoch": 0.87,
|
|
"grad_norm": 3.340866804122925,
|
|
"learning_rate": 8.967921295391046e-06,
|
|
"loss": 0.6687,
|
|
"step": 6610
|
|
},
|
|
{
|
|
"epoch": 0.88,
|
|
"grad_norm": 2.7479071617126465,
|
|
"learning_rate": 8.963299494004292e-06,
|
|
"loss": 0.6738,
|
|
"step": 6620
|
|
},
|
|
{
|
|
"epoch": 0.88,
|
|
"grad_norm": 2.5367236137390137,
|
|
"learning_rate": 8.958668563771911e-06,
|
|
"loss": 0.6776,
|
|
"step": 6630
|
|
},
|
|
{
|
|
"epoch": 0.88,
|
|
"grad_norm": 2.654205322265625,
|
|
"learning_rate": 8.954028515360535e-06,
|
|
"loss": 0.6664,
|
|
"step": 6640
|
|
},
|
|
{
|
|
"epoch": 0.88,
|
|
"grad_norm": 2.851851463317871,
|
|
"learning_rate": 8.949379359457795e-06,
|
|
"loss": 0.6765,
|
|
"step": 6650
|
|
},
|
|
{
|
|
"epoch": 0.88,
|
|
"grad_norm": 2.946833848953247,
|
|
"learning_rate": 8.944721106772298e-06,
|
|
"loss": 0.6642,
|
|
"step": 6660
|
|
},
|
|
{
|
|
"epoch": 0.88,
|
|
"grad_norm": 3.0130770206451416,
|
|
"learning_rate": 8.94005376803361e-06,
|
|
"loss": 0.6775,
|
|
"step": 6670
|
|
},
|
|
{
|
|
"epoch": 0.88,
|
|
"grad_norm": 3.278046131134033,
|
|
"learning_rate": 8.935377353992222e-06,
|
|
"loss": 0.6853,
|
|
"step": 6680
|
|
},
|
|
{
|
|
"epoch": 0.89,
|
|
"grad_norm": 2.2684073448181152,
|
|
"learning_rate": 8.930691875419525e-06,
|
|
"loss": 0.6704,
|
|
"step": 6690
|
|
},
|
|
{
|
|
"epoch": 0.89,
|
|
"grad_norm": 3.093383550643921,
|
|
"learning_rate": 8.925997343107796e-06,
|
|
"loss": 0.6665,
|
|
"step": 6700
|
|
},
|
|
{
|
|
"epoch": 0.89,
|
|
"grad_norm": 2.568152666091919,
|
|
"learning_rate": 8.921293767870157e-06,
|
|
"loss": 0.6643,
|
|
"step": 6710
|
|
},
|
|
{
|
|
"epoch": 0.89,
|
|
"grad_norm": 3.2096123695373535,
|
|
"learning_rate": 8.91658116054057e-06,
|
|
"loss": 0.679,
|
|
"step": 6720
|
|
},
|
|
{
|
|
"epoch": 0.89,
|
|
"grad_norm": 2.912968397140503,
|
|
"learning_rate": 8.91185953197379e-06,
|
|
"loss": 0.6856,
|
|
"step": 6730
|
|
},
|
|
{
|
|
"epoch": 0.89,
|
|
"grad_norm": 2.980971097946167,
|
|
"learning_rate": 8.907128893045359e-06,
|
|
"loss": 0.6672,
|
|
"step": 6740
|
|
},
|
|
{
|
|
"epoch": 0.89,
|
|
"grad_norm": 3.0038115978240967,
|
|
"learning_rate": 8.902389254651568e-06,
|
|
"loss": 0.6738,
|
|
"step": 6750
|
|
},
|
|
{
|
|
"epoch": 0.89,
|
|
"grad_norm": 3.099684000015259,
|
|
"learning_rate": 8.897640627709441e-06,
|
|
"loss": 0.6924,
|
|
"step": 6760
|
|
},
|
|
{
|
|
"epoch": 0.9,
|
|
"grad_norm": 2.444758892059326,
|
|
"learning_rate": 8.892883023156703e-06,
|
|
"loss": 0.6689,
|
|
"step": 6770
|
|
},
|
|
{
|
|
"epoch": 0.9,
|
|
"grad_norm": 2.7014451026916504,
|
|
"learning_rate": 8.888116451951755e-06,
|
|
"loss": 0.683,
|
|
"step": 6780
|
|
},
|
|
{
|
|
"epoch": 0.9,
|
|
"grad_norm": 3.183800458908081,
|
|
"learning_rate": 8.88334092507366e-06,
|
|
"loss": 0.6799,
|
|
"step": 6790
|
|
},
|
|
{
|
|
"epoch": 0.9,
|
|
"grad_norm": 2.8283169269561768,
|
|
"learning_rate": 8.8785564535221e-06,
|
|
"loss": 0.6546,
|
|
"step": 6800
|
|
},
|
|
{
|
|
"epoch": 0.9,
|
|
"grad_norm": 2.629321575164795,
|
|
"learning_rate": 8.873763048317363e-06,
|
|
"loss": 0.6774,
|
|
"step": 6810
|
|
},
|
|
{
|
|
"epoch": 0.9,
|
|
"grad_norm": 2.320882558822632,
|
|
"learning_rate": 8.868960720500314e-06,
|
|
"loss": 0.6646,
|
|
"step": 6820
|
|
},
|
|
{
|
|
"epoch": 0.9,
|
|
"grad_norm": 3.840276002883911,
|
|
"learning_rate": 8.86414948113237e-06,
|
|
"loss": 0.6768,
|
|
"step": 6830
|
|
},
|
|
{
|
|
"epoch": 0.91,
|
|
"grad_norm": 2.9596545696258545,
|
|
"learning_rate": 8.85932934129548e-06,
|
|
"loss": 0.6873,
|
|
"step": 6840
|
|
},
|
|
{
|
|
"epoch": 0.91,
|
|
"grad_norm": 2.690166473388672,
|
|
"learning_rate": 8.854500312092081e-06,
|
|
"loss": 0.6867,
|
|
"step": 6850
|
|
},
|
|
{
|
|
"epoch": 0.91,
|
|
"grad_norm": 3.0237040519714355,
|
|
"learning_rate": 8.849662404645097e-06,
|
|
"loss": 0.664,
|
|
"step": 6860
|
|
},
|
|
{
|
|
"epoch": 0.91,
|
|
"grad_norm": 3.3142759799957275,
|
|
"learning_rate": 8.844815630097896e-06,
|
|
"loss": 0.6776,
|
|
"step": 6870
|
|
},
|
|
{
|
|
"epoch": 0.91,
|
|
"grad_norm": 2.4475510120391846,
|
|
"learning_rate": 8.839959999614272e-06,
|
|
"loss": 0.6755,
|
|
"step": 6880
|
|
},
|
|
{
|
|
"epoch": 0.91,
|
|
"grad_norm": 2.689811944961548,
|
|
"learning_rate": 8.835095524378413e-06,
|
|
"loss": 0.677,
|
|
"step": 6890
|
|
},
|
|
{
|
|
"epoch": 0.91,
|
|
"grad_norm": 2.6768815517425537,
|
|
"learning_rate": 8.83022221559489e-06,
|
|
"loss": 0.6867,
|
|
"step": 6900
|
|
},
|
|
{
|
|
"epoch": 0.91,
|
|
"eval_loss": 0.7290458083152771,
|
|
"eval_runtime": 197.9624,
|
|
"eval_samples_per_second": 55.566,
|
|
"eval_steps_per_second": 6.946,
|
|
"step": 6900
|
|
},
|
|
{
|
|
"epoch": 0.91,
|
|
"grad_norm": 2.7535643577575684,
|
|
"learning_rate": 8.82534008448861e-06,
|
|
"loss": 0.6751,
|
|
"step": 6910
|
|
},
|
|
{
|
|
"epoch": 0.92,
|
|
"grad_norm": 2.4862465858459473,
|
|
"learning_rate": 8.820449142304805e-06,
|
|
"loss": 0.6745,
|
|
"step": 6920
|
|
},
|
|
{
|
|
"epoch": 0.92,
|
|
"grad_norm": 2.895951747894287,
|
|
"learning_rate": 8.815549400309002e-06,
|
|
"loss": 0.6701,
|
|
"step": 6930
|
|
},
|
|
{
|
|
"epoch": 0.92,
|
|
"grad_norm": 2.7666430473327637,
|
|
"learning_rate": 8.810640869786994e-06,
|
|
"loss": 0.6522,
|
|
"step": 6940
|
|
},
|
|
{
|
|
"epoch": 0.92,
|
|
"grad_norm": 3.116729259490967,
|
|
"learning_rate": 8.805723562044825e-06,
|
|
"loss": 0.6613,
|
|
"step": 6950
|
|
},
|
|
{
|
|
"epoch": 0.92,
|
|
"grad_norm": 2.8136725425720215,
|
|
"learning_rate": 8.800797488408746e-06,
|
|
"loss": 0.6739,
|
|
"step": 6960
|
|
},
|
|
{
|
|
"epoch": 0.92,
|
|
"grad_norm": 2.925579309463501,
|
|
"learning_rate": 8.795862660225205e-06,
|
|
"loss": 0.655,
|
|
"step": 6970
|
|
},
|
|
{
|
|
"epoch": 0.92,
|
|
"grad_norm": 2.7187774181365967,
|
|
"learning_rate": 8.790919088860815e-06,
|
|
"loss": 0.662,
|
|
"step": 6980
|
|
},
|
|
{
|
|
"epoch": 0.93,
|
|
"grad_norm": 3.002408742904663,
|
|
"learning_rate": 8.785966785702323e-06,
|
|
"loss": 0.6677,
|
|
"step": 6990
|
|
},
|
|
{
|
|
"epoch": 0.93,
|
|
"grad_norm": 2.7283411026000977,
|
|
"learning_rate": 8.781005762156593e-06,
|
|
"loss": 0.6775,
|
|
"step": 7000
|
|
},
|
|
{
|
|
"epoch": 0.93,
|
|
"grad_norm": 2.955579996109009,
|
|
"learning_rate": 8.776036029650573e-06,
|
|
"loss": 0.6777,
|
|
"step": 7010
|
|
},
|
|
{
|
|
"epoch": 0.93,
|
|
"grad_norm": 2.8140358924865723,
|
|
"learning_rate": 8.77105759963127e-06,
|
|
"loss": 0.6734,
|
|
"step": 7020
|
|
},
|
|
{
|
|
"epoch": 0.93,
|
|
"grad_norm": 2.5485150814056396,
|
|
"learning_rate": 8.766070483565726e-06,
|
|
"loss": 0.6805,
|
|
"step": 7030
|
|
},
|
|
{
|
|
"epoch": 0.93,
|
|
"grad_norm": 2.6847705841064453,
|
|
"learning_rate": 8.76107469294099e-06,
|
|
"loss": 0.6564,
|
|
"step": 7040
|
|
},
|
|
{
|
|
"epoch": 0.93,
|
|
"grad_norm": 3.189195156097412,
|
|
"learning_rate": 8.756070239264089e-06,
|
|
"loss": 0.6794,
|
|
"step": 7050
|
|
},
|
|
{
|
|
"epoch": 0.93,
|
|
"grad_norm": 2.842616319656372,
|
|
"learning_rate": 8.75105713406201e-06,
|
|
"loss": 0.6784,
|
|
"step": 7060
|
|
},
|
|
{
|
|
"epoch": 0.94,
|
|
"grad_norm": 3.710517168045044,
|
|
"learning_rate": 8.746035388881655e-06,
|
|
"loss": 0.6786,
|
|
"step": 7070
|
|
},
|
|
{
|
|
"epoch": 0.94,
|
|
"grad_norm": 2.5461089611053467,
|
|
"learning_rate": 8.741005015289843e-06,
|
|
"loss": 0.6865,
|
|
"step": 7080
|
|
},
|
|
{
|
|
"epoch": 0.94,
|
|
"grad_norm": 3.374018669128418,
|
|
"learning_rate": 8.735966024873257e-06,
|
|
"loss": 0.6646,
|
|
"step": 7090
|
|
},
|
|
{
|
|
"epoch": 0.94,
|
|
"grad_norm": 3.168987989425659,
|
|
"learning_rate": 8.730918429238429e-06,
|
|
"loss": 0.6818,
|
|
"step": 7100
|
|
},
|
|
{
|
|
"epoch": 0.94,
|
|
"grad_norm": 3.4316864013671875,
|
|
"learning_rate": 8.72586224001171e-06,
|
|
"loss": 0.6523,
|
|
"step": 7110
|
|
},
|
|
{
|
|
"epoch": 0.94,
|
|
"grad_norm": 3.518183946609497,
|
|
"learning_rate": 8.720797468839255e-06,
|
|
"loss": 0.6692,
|
|
"step": 7120
|
|
},
|
|
{
|
|
"epoch": 0.94,
|
|
"grad_norm": 3.6533937454223633,
|
|
"learning_rate": 8.715724127386971e-06,
|
|
"loss": 0.6729,
|
|
"step": 7130
|
|
},
|
|
{
|
|
"epoch": 0.94,
|
|
"grad_norm": 2.501375913619995,
|
|
"learning_rate": 8.710642227340518e-06,
|
|
"loss": 0.6692,
|
|
"step": 7140
|
|
},
|
|
{
|
|
"epoch": 0.95,
|
|
"grad_norm": 3.3970723152160645,
|
|
"learning_rate": 8.705551780405264e-06,
|
|
"loss": 0.6726,
|
|
"step": 7150
|
|
},
|
|
{
|
|
"epoch": 0.95,
|
|
"grad_norm": 2.5831968784332275,
|
|
"learning_rate": 8.70045279830626e-06,
|
|
"loss": 0.684,
|
|
"step": 7160
|
|
},
|
|
{
|
|
"epoch": 0.95,
|
|
"grad_norm": 2.8925693035125732,
|
|
"learning_rate": 8.695345292788223e-06,
|
|
"loss": 0.6587,
|
|
"step": 7170
|
|
},
|
|
{
|
|
"epoch": 0.95,
|
|
"grad_norm": 3.127180576324463,
|
|
"learning_rate": 8.690229275615503e-06,
|
|
"loss": 0.67,
|
|
"step": 7180
|
|
},
|
|
{
|
|
"epoch": 0.95,
|
|
"grad_norm": 2.4886677265167236,
|
|
"learning_rate": 8.685104758572047e-06,
|
|
"loss": 0.6666,
|
|
"step": 7190
|
|
},
|
|
{
|
|
"epoch": 0.95,
|
|
"grad_norm": 4.1692070960998535,
|
|
"learning_rate": 8.679971753461388e-06,
|
|
"loss": 0.6668,
|
|
"step": 7200
|
|
},
|
|
{
|
|
"epoch": 0.95,
|
|
"grad_norm": 2.864614486694336,
|
|
"learning_rate": 8.674830272106604e-06,
|
|
"loss": 0.6658,
|
|
"step": 7210
|
|
},
|
|
{
|
|
"epoch": 0.96,
|
|
"grad_norm": 3.166633129119873,
|
|
"learning_rate": 8.669680326350303e-06,
|
|
"loss": 0.6726,
|
|
"step": 7220
|
|
},
|
|
{
|
|
"epoch": 0.96,
|
|
"grad_norm": 2.520665407180786,
|
|
"learning_rate": 8.664521928054585e-06,
|
|
"loss": 0.6693,
|
|
"step": 7230
|
|
},
|
|
{
|
|
"epoch": 0.96,
|
|
"grad_norm": 3.273986339569092,
|
|
"learning_rate": 8.659355089101021e-06,
|
|
"loss": 0.6762,
|
|
"step": 7240
|
|
},
|
|
{
|
|
"epoch": 0.96,
|
|
"grad_norm": 2.5142078399658203,
|
|
"learning_rate": 8.65417982139062e-06,
|
|
"loss": 0.6659,
|
|
"step": 7250
|
|
},
|
|
{
|
|
"epoch": 0.96,
|
|
"grad_norm": 3.154919385910034,
|
|
"learning_rate": 8.648996136843814e-06,
|
|
"loss": 0.671,
|
|
"step": 7260
|
|
},
|
|
{
|
|
"epoch": 0.96,
|
|
"grad_norm": 2.0966339111328125,
|
|
"learning_rate": 8.643804047400412e-06,
|
|
"loss": 0.6641,
|
|
"step": 7270
|
|
},
|
|
{
|
|
"epoch": 0.96,
|
|
"grad_norm": 3.209606885910034,
|
|
"learning_rate": 8.638603565019588e-06,
|
|
"loss": 0.6677,
|
|
"step": 7280
|
|
},
|
|
{
|
|
"epoch": 0.96,
|
|
"grad_norm": 2.470289468765259,
|
|
"learning_rate": 8.633394701679847e-06,
|
|
"loss": 0.6628,
|
|
"step": 7290
|
|
},
|
|
{
|
|
"epoch": 0.97,
|
|
"grad_norm": 2.4616212844848633,
|
|
"learning_rate": 8.628177469378995e-06,
|
|
"loss": 0.6772,
|
|
"step": 7300
|
|
},
|
|
{
|
|
"epoch": 0.97,
|
|
"grad_norm": 3.5428974628448486,
|
|
"learning_rate": 8.622951880134122e-06,
|
|
"loss": 0.6737,
|
|
"step": 7310
|
|
},
|
|
{
|
|
"epoch": 0.97,
|
|
"grad_norm": 3.179137945175171,
|
|
"learning_rate": 8.617717945981558e-06,
|
|
"loss": 0.6855,
|
|
"step": 7320
|
|
},
|
|
{
|
|
"epoch": 0.97,
|
|
"grad_norm": 3.3000833988189697,
|
|
"learning_rate": 8.612475678976861e-06,
|
|
"loss": 0.6805,
|
|
"step": 7330
|
|
},
|
|
{
|
|
"epoch": 0.97,
|
|
"grad_norm": 2.7179107666015625,
|
|
"learning_rate": 8.60722509119478e-06,
|
|
"loss": 0.6742,
|
|
"step": 7340
|
|
},
|
|
{
|
|
"epoch": 0.97,
|
|
"grad_norm": 2.58263897895813,
|
|
"learning_rate": 8.601966194729228e-06,
|
|
"loss": 0.6746,
|
|
"step": 7350
|
|
},
|
|
{
|
|
"epoch": 0.97,
|
|
"grad_norm": 2.772865056991577,
|
|
"learning_rate": 8.596699001693257e-06,
|
|
"loss": 0.6624,
|
|
"step": 7360
|
|
},
|
|
{
|
|
"epoch": 0.98,
|
|
"grad_norm": 2.8705618381500244,
|
|
"learning_rate": 8.59142352421903e-06,
|
|
"loss": 0.6768,
|
|
"step": 7370
|
|
},
|
|
{
|
|
"epoch": 0.98,
|
|
"grad_norm": 2.4928457736968994,
|
|
"learning_rate": 8.586139774457791e-06,
|
|
"loss": 0.6582,
|
|
"step": 7380
|
|
},
|
|
{
|
|
"epoch": 0.98,
|
|
"grad_norm": 2.9825477600097656,
|
|
"learning_rate": 8.58084776457984e-06,
|
|
"loss": 0.686,
|
|
"step": 7390
|
|
},
|
|
{
|
|
"epoch": 0.98,
|
|
"grad_norm": 2.839447021484375,
|
|
"learning_rate": 8.575547506774498e-06,
|
|
"loss": 0.6847,
|
|
"step": 7400
|
|
},
|
|
{
|
|
"epoch": 0.98,
|
|
"grad_norm": 4.549822807312012,
|
|
"learning_rate": 8.570239013250089e-06,
|
|
"loss": 0.6599,
|
|
"step": 7410
|
|
},
|
|
{
|
|
"epoch": 0.98,
|
|
"grad_norm": 2.622835874557495,
|
|
"learning_rate": 8.5649222962339e-06,
|
|
"loss": 0.6447,
|
|
"step": 7420
|
|
},
|
|
{
|
|
"epoch": 0.98,
|
|
"grad_norm": 2.558931350708008,
|
|
"learning_rate": 8.559597367972168e-06,
|
|
"loss": 0.6653,
|
|
"step": 7430
|
|
},
|
|
{
|
|
"epoch": 0.98,
|
|
"grad_norm": 2.8210113048553467,
|
|
"learning_rate": 8.554264240730042e-06,
|
|
"loss": 0.671,
|
|
"step": 7440
|
|
},
|
|
{
|
|
"epoch": 0.99,
|
|
"grad_norm": 2.7722108364105225,
|
|
"learning_rate": 8.548922926791545e-06,
|
|
"loss": 0.6872,
|
|
"step": 7450
|
|
},
|
|
{
|
|
"epoch": 0.99,
|
|
"grad_norm": 2.9724175930023193,
|
|
"learning_rate": 8.543573438459573e-06,
|
|
"loss": 0.6752,
|
|
"step": 7460
|
|
},
|
|
{
|
|
"epoch": 0.99,
|
|
"grad_norm": 2.862396717071533,
|
|
"learning_rate": 8.538215788055839e-06,
|
|
"loss": 0.6667,
|
|
"step": 7470
|
|
},
|
|
{
|
|
"epoch": 0.99,
|
|
"grad_norm": 3.055391788482666,
|
|
"learning_rate": 8.532849987920859e-06,
|
|
"loss": 0.6695,
|
|
"step": 7480
|
|
},
|
|
{
|
|
"epoch": 0.99,
|
|
"grad_norm": 2.379634141921997,
|
|
"learning_rate": 8.527476050413922e-06,
|
|
"loss": 0.6535,
|
|
"step": 7490
|
|
},
|
|
{
|
|
"epoch": 0.99,
|
|
"grad_norm": 3.219191551208496,
|
|
"learning_rate": 8.522093987913063e-06,
|
|
"loss": 0.6617,
|
|
"step": 7500
|
|
},
|
|
{
|
|
"epoch": 0.99,
|
|
"grad_norm": 3.1809182167053223,
|
|
"learning_rate": 8.516703812815024e-06,
|
|
"loss": 0.6672,
|
|
"step": 7510
|
|
},
|
|
{
|
|
"epoch": 1.0,
|
|
"grad_norm": 2.790281057357788,
|
|
"learning_rate": 8.511305537535238e-06,
|
|
"loss": 0.6722,
|
|
"step": 7520
|
|
},
|
|
{
|
|
"epoch": 1.0,
|
|
"grad_norm": 3.374549388885498,
|
|
"learning_rate": 8.505899174507793e-06,
|
|
"loss": 0.6622,
|
|
"step": 7530
|
|
},
|
|
{
|
|
"epoch": 1.0,
|
|
"grad_norm": 2.377967119216919,
|
|
"learning_rate": 8.500484736185412e-06,
|
|
"loss": 0.6531,
|
|
"step": 7540
|
|
},
|
|
{
|
|
"epoch": 1.0,
|
|
"grad_norm": 2.5226247310638428,
|
|
"learning_rate": 8.49506223503941e-06,
|
|
"loss": 0.6491,
|
|
"step": 7550
|
|
}
|
|
],
|
|
"logging_steps": 10,
|
|
"max_steps": 23000,
|
|
"num_input_tokens_seen": 0,
|
|
"num_train_epochs": 4,
|
|
"save_steps": 1,
|
|
"total_flos": 5.621498429758977e+19,
|
|
"train_batch_size": 1,
|
|
"trial_name": null,
|
|
"trial_params": null
|
|
}
|
|
|