{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 0, "global_step": 266, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0037593984962406013, "grad_norm": 0.5625, "learning_rate": 9.962406015037594e-06, "loss": 2.1309, "step": 1 }, { "epoch": 0.007518796992481203, "grad_norm": 0.546875, "learning_rate": 9.924812030075189e-06, "loss": 2.1148, "step": 2 }, { "epoch": 0.011278195488721804, "grad_norm": 0.5625, "learning_rate": 9.887218045112783e-06, "loss": 2.1468, "step": 3 }, { "epoch": 0.015037593984962405, "grad_norm": 0.51171875, "learning_rate": 9.849624060150376e-06, "loss": 2.079, "step": 4 }, { "epoch": 0.018796992481203006, "grad_norm": 0.498046875, "learning_rate": 9.812030075187971e-06, "loss": 2.0392, "step": 5 }, { "epoch": 0.022556390977443608, "grad_norm": 0.455078125, "learning_rate": 9.774436090225564e-06, "loss": 2.145, "step": 6 }, { "epoch": 0.02631578947368421, "grad_norm": 0.384765625, "learning_rate": 9.736842105263159e-06, "loss": 2.0199, "step": 7 }, { "epoch": 0.03007518796992481, "grad_norm": 0.3671875, "learning_rate": 9.699248120300752e-06, "loss": 2.0441, "step": 8 }, { "epoch": 0.03383458646616541, "grad_norm": 0.33203125, "learning_rate": 9.661654135338347e-06, "loss": 1.9529, "step": 9 }, { "epoch": 0.03759398496240601, "grad_norm": 0.33203125, "learning_rate": 9.62406015037594e-06, "loss": 2.0237, "step": 10 }, { "epoch": 0.041353383458646614, "grad_norm": 0.27734375, "learning_rate": 9.586466165413535e-06, "loss": 1.955, "step": 11 }, { "epoch": 0.045112781954887216, "grad_norm": 0.275390625, "learning_rate": 9.54887218045113e-06, "loss": 1.9995, "step": 12 }, { "epoch": 0.04887218045112782, "grad_norm": 0.28515625, "learning_rate": 9.511278195488722e-06, "loss": 1.9685, "step": 13 }, { "epoch": 0.05263157894736842, "grad_norm": 0.283203125, "learning_rate": 9.473684210526315e-06, "loss": 1.9314, "step": 14 }, { "epoch": 0.05639097744360902, "grad_norm": 0.2734375, "learning_rate": 9.43609022556391e-06, "loss": 1.9443, "step": 15 }, { "epoch": 0.06015037593984962, "grad_norm": 0.259765625, "learning_rate": 9.398496240601505e-06, "loss": 1.9512, "step": 16 }, { "epoch": 0.06390977443609022, "grad_norm": 0.251953125, "learning_rate": 9.360902255639098e-06, "loss": 1.926, "step": 17 }, { "epoch": 0.06766917293233082, "grad_norm": 0.244140625, "learning_rate": 9.323308270676693e-06, "loss": 1.8982, "step": 18 }, { "epoch": 0.07142857142857142, "grad_norm": 0.255859375, "learning_rate": 9.285714285714288e-06, "loss": 1.9947, "step": 19 }, { "epoch": 0.07518796992481203, "grad_norm": 0.2470703125, "learning_rate": 9.24812030075188e-06, "loss": 1.9373, "step": 20 }, { "epoch": 0.07894736842105263, "grad_norm": 0.2275390625, "learning_rate": 9.210526315789474e-06, "loss": 1.8399, "step": 21 }, { "epoch": 0.08270676691729323, "grad_norm": 0.205078125, "learning_rate": 9.172932330827068e-06, "loss": 1.8937, "step": 22 }, { "epoch": 0.08646616541353383, "grad_norm": 0.1923828125, "learning_rate": 9.135338345864663e-06, "loss": 1.8808, "step": 23 }, { "epoch": 0.09022556390977443, "grad_norm": 0.203125, "learning_rate": 9.097744360902256e-06, "loss": 1.8782, "step": 24 }, { "epoch": 0.09398496240601503, "grad_norm": 0.1923828125, "learning_rate": 9.06015037593985e-06, "loss": 1.8855, "step": 25 }, { "epoch": 0.09774436090225563, "grad_norm": 0.1865234375, "learning_rate": 9.022556390977444e-06, "loss": 1.9202, "step": 26 }, { "epoch": 0.10150375939849623, "grad_norm": 0.1689453125, "learning_rate": 8.984962406015039e-06, "loss": 1.8394, "step": 27 }, { "epoch": 0.10526315789473684, "grad_norm": 0.1806640625, "learning_rate": 8.947368421052632e-06, "loss": 1.8974, "step": 28 }, { "epoch": 0.10902255639097744, "grad_norm": 0.1640625, "learning_rate": 8.909774436090227e-06, "loss": 1.8492, "step": 29 }, { "epoch": 0.11278195488721804, "grad_norm": 0.162109375, "learning_rate": 8.87218045112782e-06, "loss": 1.8371, "step": 30 }, { "epoch": 0.11654135338345864, "grad_norm": 0.1728515625, "learning_rate": 8.834586466165414e-06, "loss": 1.8237, "step": 31 }, { "epoch": 0.12030075187969924, "grad_norm": 0.189453125, "learning_rate": 8.796992481203007e-06, "loss": 1.869, "step": 32 }, { "epoch": 0.12406015037593984, "grad_norm": 0.169921875, "learning_rate": 8.759398496240602e-06, "loss": 1.8077, "step": 33 }, { "epoch": 0.12781954887218044, "grad_norm": 0.1787109375, "learning_rate": 8.721804511278195e-06, "loss": 1.8324, "step": 34 }, { "epoch": 0.13157894736842105, "grad_norm": 0.1806640625, "learning_rate": 8.68421052631579e-06, "loss": 1.7985, "step": 35 }, { "epoch": 0.13533834586466165, "grad_norm": 0.169921875, "learning_rate": 8.646616541353385e-06, "loss": 1.8283, "step": 36 }, { "epoch": 0.13909774436090225, "grad_norm": 0.1630859375, "learning_rate": 8.609022556390978e-06, "loss": 1.7779, "step": 37 }, { "epoch": 0.14285714285714285, "grad_norm": 0.154296875, "learning_rate": 8.571428571428571e-06, "loss": 1.8007, "step": 38 }, { "epoch": 0.14661654135338345, "grad_norm": 0.1552734375, "learning_rate": 8.533834586466166e-06, "loss": 1.7902, "step": 39 }, { "epoch": 0.15037593984962405, "grad_norm": 0.19921875, "learning_rate": 8.49624060150376e-06, "loss": 1.7989, "step": 40 }, { "epoch": 0.15413533834586465, "grad_norm": 0.1611328125, "learning_rate": 8.458646616541353e-06, "loss": 1.7651, "step": 41 }, { "epoch": 0.15789473684210525, "grad_norm": 0.1591796875, "learning_rate": 8.421052631578948e-06, "loss": 1.7581, "step": 42 }, { "epoch": 0.16165413533834586, "grad_norm": 0.15234375, "learning_rate": 8.383458646616543e-06, "loss": 1.743, "step": 43 }, { "epoch": 0.16541353383458646, "grad_norm": 0.150390625, "learning_rate": 8.345864661654136e-06, "loss": 1.7409, "step": 44 }, { "epoch": 0.16917293233082706, "grad_norm": 0.134765625, "learning_rate": 8.308270676691729e-06, "loss": 1.719, "step": 45 }, { "epoch": 0.17293233082706766, "grad_norm": 0.1435546875, "learning_rate": 8.270676691729324e-06, "loss": 1.749, "step": 46 }, { "epoch": 0.17669172932330826, "grad_norm": 0.154296875, "learning_rate": 8.233082706766919e-06, "loss": 1.7724, "step": 47 }, { "epoch": 0.18045112781954886, "grad_norm": 0.1455078125, "learning_rate": 8.195488721804512e-06, "loss": 1.7485, "step": 48 }, { "epoch": 0.18421052631578946, "grad_norm": 0.2265625, "learning_rate": 8.157894736842106e-06, "loss": 1.7398, "step": 49 }, { "epoch": 0.18796992481203006, "grad_norm": 0.1416015625, "learning_rate": 8.1203007518797e-06, "loss": 1.7485, "step": 50 }, { "epoch": 0.19172932330827067, "grad_norm": 0.140625, "learning_rate": 8.082706766917294e-06, "loss": 1.7284, "step": 51 }, { "epoch": 0.19548872180451127, "grad_norm": 0.1533203125, "learning_rate": 8.045112781954887e-06, "loss": 1.6845, "step": 52 }, { "epoch": 0.19924812030075187, "grad_norm": 0.1357421875, "learning_rate": 8.007518796992482e-06, "loss": 1.7095, "step": 53 }, { "epoch": 0.20300751879699247, "grad_norm": 0.1279296875, "learning_rate": 7.969924812030075e-06, "loss": 1.7161, "step": 54 }, { "epoch": 0.20676691729323307, "grad_norm": 0.1396484375, "learning_rate": 7.93233082706767e-06, "loss": 1.7005, "step": 55 }, { "epoch": 0.21052631578947367, "grad_norm": 0.1298828125, "learning_rate": 7.894736842105265e-06, "loss": 1.6836, "step": 56 }, { "epoch": 0.21428571428571427, "grad_norm": 0.126953125, "learning_rate": 7.857142857142858e-06, "loss": 1.7109, "step": 57 }, { "epoch": 0.21804511278195488, "grad_norm": 0.166015625, "learning_rate": 7.81954887218045e-06, "loss": 1.7338, "step": 58 }, { "epoch": 0.22180451127819548, "grad_norm": 0.12890625, "learning_rate": 7.781954887218045e-06, "loss": 1.6894, "step": 59 }, { "epoch": 0.22556390977443608, "grad_norm": 0.1630859375, "learning_rate": 7.74436090225564e-06, "loss": 1.6767, "step": 60 }, { "epoch": 0.22932330827067668, "grad_norm": 0.125, "learning_rate": 7.706766917293233e-06, "loss": 1.6984, "step": 61 }, { "epoch": 0.23308270676691728, "grad_norm": 0.1337890625, "learning_rate": 7.669172932330828e-06, "loss": 1.6824, "step": 62 }, { "epoch": 0.23684210526315788, "grad_norm": 0.130859375, "learning_rate": 7.631578947368423e-06, "loss": 1.7111, "step": 63 }, { "epoch": 0.24060150375939848, "grad_norm": 0.12890625, "learning_rate": 7.593984962406016e-06, "loss": 1.68, "step": 64 }, { "epoch": 0.24436090225563908, "grad_norm": 0.13671875, "learning_rate": 7.55639097744361e-06, "loss": 1.7161, "step": 65 }, { "epoch": 0.24812030075187969, "grad_norm": 0.119140625, "learning_rate": 7.518796992481203e-06, "loss": 1.6781, "step": 66 }, { "epoch": 0.2518796992481203, "grad_norm": 0.1298828125, "learning_rate": 7.481203007518798e-06, "loss": 1.6976, "step": 67 }, { "epoch": 0.2556390977443609, "grad_norm": 0.12255859375, "learning_rate": 7.4436090225563915e-06, "loss": 1.6768, "step": 68 }, { "epoch": 0.2593984962406015, "grad_norm": 0.1337890625, "learning_rate": 7.406015037593985e-06, "loss": 1.6507, "step": 69 }, { "epoch": 0.2631578947368421, "grad_norm": 0.1259765625, "learning_rate": 7.368421052631579e-06, "loss": 1.6782, "step": 70 }, { "epoch": 0.2669172932330827, "grad_norm": 0.123046875, "learning_rate": 7.330827067669174e-06, "loss": 1.6697, "step": 71 }, { "epoch": 0.2706766917293233, "grad_norm": 0.1318359375, "learning_rate": 7.293233082706768e-06, "loss": 1.667, "step": 72 }, { "epoch": 0.2744360902255639, "grad_norm": 0.123046875, "learning_rate": 7.255639097744361e-06, "loss": 1.6558, "step": 73 }, { "epoch": 0.2781954887218045, "grad_norm": 0.12060546875, "learning_rate": 7.218045112781955e-06, "loss": 1.6728, "step": 74 }, { "epoch": 0.2819548872180451, "grad_norm": 0.171875, "learning_rate": 7.18045112781955e-06, "loss": 1.6172, "step": 75 }, { "epoch": 0.2857142857142857, "grad_norm": 0.12451171875, "learning_rate": 7.1428571428571436e-06, "loss": 1.6714, "step": 76 }, { "epoch": 0.2894736842105263, "grad_norm": 0.126953125, "learning_rate": 7.1052631578947375e-06, "loss": 1.6557, "step": 77 }, { "epoch": 0.2932330827067669, "grad_norm": 0.14453125, "learning_rate": 7.067669172932331e-06, "loss": 1.6452, "step": 78 }, { "epoch": 0.29699248120300753, "grad_norm": 0.11669921875, "learning_rate": 7.030075187969926e-06, "loss": 1.6577, "step": 79 }, { "epoch": 0.3007518796992481, "grad_norm": 0.11572265625, "learning_rate": 6.992481203007519e-06, "loss": 1.6411, "step": 80 }, { "epoch": 0.30451127819548873, "grad_norm": 0.1142578125, "learning_rate": 6.954887218045113e-06, "loss": 1.6033, "step": 81 }, { "epoch": 0.3082706766917293, "grad_norm": 0.1357421875, "learning_rate": 6.917293233082707e-06, "loss": 1.6587, "step": 82 }, { "epoch": 0.31203007518796994, "grad_norm": 0.11279296875, "learning_rate": 6.879699248120302e-06, "loss": 1.6648, "step": 83 }, { "epoch": 0.3157894736842105, "grad_norm": 0.10986328125, "learning_rate": 6.842105263157896e-06, "loss": 1.6384, "step": 84 }, { "epoch": 0.31954887218045114, "grad_norm": 0.1171875, "learning_rate": 6.8045112781954896e-06, "loss": 1.6531, "step": 85 }, { "epoch": 0.3233082706766917, "grad_norm": 0.1123046875, "learning_rate": 6.766917293233083e-06, "loss": 1.6455, "step": 86 }, { "epoch": 0.32706766917293234, "grad_norm": 0.11669921875, "learning_rate": 6.729323308270677e-06, "loss": 1.6568, "step": 87 }, { "epoch": 0.3308270676691729, "grad_norm": 0.11328125, "learning_rate": 6.691729323308271e-06, "loss": 1.6319, "step": 88 }, { "epoch": 0.33458646616541354, "grad_norm": 0.11328125, "learning_rate": 6.654135338345865e-06, "loss": 1.6211, "step": 89 }, { "epoch": 0.3383458646616541, "grad_norm": 0.12060546875, "learning_rate": 6.616541353383459e-06, "loss": 1.6466, "step": 90 }, { "epoch": 0.34210526315789475, "grad_norm": 0.12890625, "learning_rate": 6.578947368421054e-06, "loss": 1.6599, "step": 91 }, { "epoch": 0.3458646616541353, "grad_norm": 0.11279296875, "learning_rate": 6.541353383458648e-06, "loss": 1.6457, "step": 92 }, { "epoch": 0.34962406015037595, "grad_norm": 0.1328125, "learning_rate": 6.503759398496241e-06, "loss": 1.6366, "step": 93 }, { "epoch": 0.3533834586466165, "grad_norm": 0.1103515625, "learning_rate": 6.466165413533835e-06, "loss": 1.6281, "step": 94 }, { "epoch": 0.35714285714285715, "grad_norm": 0.12109375, "learning_rate": 6.4285714285714295e-06, "loss": 1.6371, "step": 95 }, { "epoch": 0.3609022556390977, "grad_norm": 0.10986328125, "learning_rate": 6.390977443609023e-06, "loss": 1.6179, "step": 96 }, { "epoch": 0.36466165413533835, "grad_norm": 0.10888671875, "learning_rate": 6.353383458646617e-06, "loss": 1.6234, "step": 97 }, { "epoch": 0.3684210526315789, "grad_norm": 0.1396484375, "learning_rate": 6.31578947368421e-06, "loss": 1.5895, "step": 98 }, { "epoch": 0.37218045112781956, "grad_norm": 0.1298828125, "learning_rate": 6.278195488721806e-06, "loss": 1.5959, "step": 99 }, { "epoch": 0.37593984962406013, "grad_norm": 0.1484375, "learning_rate": 6.240601503759399e-06, "loss": 1.6245, "step": 100 }, { "epoch": 0.37969924812030076, "grad_norm": 0.111328125, "learning_rate": 6.203007518796993e-06, "loss": 1.6061, "step": 101 }, { "epoch": 0.38345864661654133, "grad_norm": 0.1181640625, "learning_rate": 6.165413533834587e-06, "loss": 1.6062, "step": 102 }, { "epoch": 0.38721804511278196, "grad_norm": 0.1123046875, "learning_rate": 6.1278195488721816e-06, "loss": 1.6603, "step": 103 }, { "epoch": 0.39097744360902253, "grad_norm": 0.11083984375, "learning_rate": 6.0902255639097755e-06, "loss": 1.6465, "step": 104 }, { "epoch": 0.39473684210526316, "grad_norm": 0.11328125, "learning_rate": 6.0526315789473685e-06, "loss": 1.5833, "step": 105 }, { "epoch": 0.39849624060150374, "grad_norm": 0.125, "learning_rate": 6.015037593984962e-06, "loss": 1.5936, "step": 106 }, { "epoch": 0.40225563909774437, "grad_norm": 0.126953125, "learning_rate": 5.977443609022557e-06, "loss": 1.6684, "step": 107 }, { "epoch": 0.40601503759398494, "grad_norm": 0.115234375, "learning_rate": 5.939849624060151e-06, "loss": 1.616, "step": 108 }, { "epoch": 0.40977443609022557, "grad_norm": 0.12451171875, "learning_rate": 5.902255639097745e-06, "loss": 1.6036, "step": 109 }, { "epoch": 0.41353383458646614, "grad_norm": 0.1044921875, "learning_rate": 5.864661654135339e-06, "loss": 1.5928, "step": 110 }, { "epoch": 0.41729323308270677, "grad_norm": 0.12060546875, "learning_rate": 5.827067669172934e-06, "loss": 1.6031, "step": 111 }, { "epoch": 0.42105263157894735, "grad_norm": 0.1669921875, "learning_rate": 5.789473684210527e-06, "loss": 1.5906, "step": 112 }, { "epoch": 0.424812030075188, "grad_norm": 0.10693359375, "learning_rate": 5.751879699248121e-06, "loss": 1.5912, "step": 113 }, { "epoch": 0.42857142857142855, "grad_norm": 0.1142578125, "learning_rate": 5.7142857142857145e-06, "loss": 1.6194, "step": 114 }, { "epoch": 0.4323308270676692, "grad_norm": 0.12890625, "learning_rate": 5.676691729323309e-06, "loss": 1.6372, "step": 115 }, { "epoch": 0.43609022556390975, "grad_norm": 0.1484375, "learning_rate": 5.639097744360903e-06, "loss": 1.5576, "step": 116 }, { "epoch": 0.4398496240601504, "grad_norm": 0.1064453125, "learning_rate": 5.601503759398497e-06, "loss": 1.5777, "step": 117 }, { "epoch": 0.44360902255639095, "grad_norm": 0.1083984375, "learning_rate": 5.56390977443609e-06, "loss": 1.6032, "step": 118 }, { "epoch": 0.4473684210526316, "grad_norm": 0.11865234375, "learning_rate": 5.526315789473685e-06, "loss": 1.616, "step": 119 }, { "epoch": 0.45112781954887216, "grad_norm": 0.1201171875, "learning_rate": 5.488721804511279e-06, "loss": 1.5957, "step": 120 }, { "epoch": 0.4548872180451128, "grad_norm": 0.11279296875, "learning_rate": 5.451127819548873e-06, "loss": 1.6231, "step": 121 }, { "epoch": 0.45864661654135336, "grad_norm": 0.1279296875, "learning_rate": 5.413533834586467e-06, "loss": 1.5908, "step": 122 }, { "epoch": 0.462406015037594, "grad_norm": 0.1484375, "learning_rate": 5.375939849624061e-06, "loss": 1.6041, "step": 123 }, { "epoch": 0.46616541353383456, "grad_norm": 0.11865234375, "learning_rate": 5.338345864661654e-06, "loss": 1.5849, "step": 124 }, { "epoch": 0.4699248120300752, "grad_norm": 0.11865234375, "learning_rate": 5.300751879699248e-06, "loss": 1.594, "step": 125 }, { "epoch": 0.47368421052631576, "grad_norm": 0.1123046875, "learning_rate": 5.263157894736842e-06, "loss": 1.5721, "step": 126 }, { "epoch": 0.4774436090225564, "grad_norm": 0.1240234375, "learning_rate": 5.225563909774437e-06, "loss": 1.5949, "step": 127 }, { "epoch": 0.48120300751879697, "grad_norm": 0.1455078125, "learning_rate": 5.187969924812031e-06, "loss": 1.61, "step": 128 }, { "epoch": 0.4849624060150376, "grad_norm": 0.11865234375, "learning_rate": 5.150375939849625e-06, "loss": 1.5879, "step": 129 }, { "epoch": 0.48872180451127817, "grad_norm": 0.11279296875, "learning_rate": 5.112781954887218e-06, "loss": 1.5907, "step": 130 }, { "epoch": 0.4924812030075188, "grad_norm": 0.11328125, "learning_rate": 5.075187969924813e-06, "loss": 1.579, "step": 131 }, { "epoch": 0.49624060150375937, "grad_norm": 0.10791015625, "learning_rate": 5.0375939849624065e-06, "loss": 1.5808, "step": 132 }, { "epoch": 0.5, "grad_norm": 0.109375, "learning_rate": 5e-06, "loss": 1.5888, "step": 133 }, { "epoch": 0.5037593984962406, "grad_norm": 0.12255859375, "learning_rate": 4.962406015037594e-06, "loss": 1.5729, "step": 134 }, { "epoch": 0.5075187969924813, "grad_norm": 0.10888671875, "learning_rate": 4.924812030075188e-06, "loss": 1.5551, "step": 135 }, { "epoch": 0.5112781954887218, "grad_norm": 0.1103515625, "learning_rate": 4.887218045112782e-06, "loss": 1.5706, "step": 136 }, { "epoch": 0.5150375939849624, "grad_norm": 0.12109375, "learning_rate": 4.849624060150376e-06, "loss": 1.6047, "step": 137 }, { "epoch": 0.518796992481203, "grad_norm": 0.11328125, "learning_rate": 4.81203007518797e-06, "loss": 1.594, "step": 138 }, { "epoch": 0.5225563909774437, "grad_norm": 0.11865234375, "learning_rate": 4.774436090225565e-06, "loss": 1.6333, "step": 139 }, { "epoch": 0.5263157894736842, "grad_norm": 0.11865234375, "learning_rate": 4.736842105263158e-06, "loss": 1.5992, "step": 140 }, { "epoch": 0.5300751879699248, "grad_norm": 0.1474609375, "learning_rate": 4.6992481203007525e-06, "loss": 1.5606, "step": 141 }, { "epoch": 0.5338345864661654, "grad_norm": 0.10888671875, "learning_rate": 4.661654135338346e-06, "loss": 1.6227, "step": 142 }, { "epoch": 0.5375939849624061, "grad_norm": 0.11328125, "learning_rate": 4.62406015037594e-06, "loss": 1.5764, "step": 143 }, { "epoch": 0.5413533834586466, "grad_norm": 0.1748046875, "learning_rate": 4.586466165413534e-06, "loss": 1.6476, "step": 144 }, { "epoch": 0.5451127819548872, "grad_norm": 0.115234375, "learning_rate": 4.548872180451128e-06, "loss": 1.5807, "step": 145 }, { "epoch": 0.5488721804511278, "grad_norm": 0.11865234375, "learning_rate": 4.511278195488722e-06, "loss": 1.609, "step": 146 }, { "epoch": 0.5526315789473685, "grad_norm": 0.11083984375, "learning_rate": 4.473684210526316e-06, "loss": 1.5962, "step": 147 }, { "epoch": 0.556390977443609, "grad_norm": 0.11376953125, "learning_rate": 4.43609022556391e-06, "loss": 1.6281, "step": 148 }, { "epoch": 0.5601503759398496, "grad_norm": 0.109375, "learning_rate": 4.398496240601504e-06, "loss": 1.5872, "step": 149 }, { "epoch": 0.5639097744360902, "grad_norm": 0.109375, "learning_rate": 4.360902255639098e-06, "loss": 1.5688, "step": 150 }, { "epoch": 0.5676691729323309, "grad_norm": 0.1279296875, "learning_rate": 4.323308270676692e-06, "loss": 1.5516, "step": 151 }, { "epoch": 0.5714285714285714, "grad_norm": 0.10693359375, "learning_rate": 4.2857142857142855e-06, "loss": 1.5516, "step": 152 }, { "epoch": 0.575187969924812, "grad_norm": 0.111328125, "learning_rate": 4.24812030075188e-06, "loss": 1.5927, "step": 153 }, { "epoch": 0.5789473684210527, "grad_norm": 0.11181640625, "learning_rate": 4.210526315789474e-06, "loss": 1.5904, "step": 154 }, { "epoch": 0.5827067669172933, "grad_norm": 0.1171875, "learning_rate": 4.172932330827068e-06, "loss": 1.6308, "step": 155 }, { "epoch": 0.5864661654135338, "grad_norm": 0.115234375, "learning_rate": 4.135338345864662e-06, "loss": 1.5749, "step": 156 }, { "epoch": 0.5902255639097744, "grad_norm": 0.111328125, "learning_rate": 4.097744360902256e-06, "loss": 1.5763, "step": 157 }, { "epoch": 0.5939849624060151, "grad_norm": 0.11181640625, "learning_rate": 4.06015037593985e-06, "loss": 1.5667, "step": 158 }, { "epoch": 0.5977443609022557, "grad_norm": 0.1201171875, "learning_rate": 4.022556390977444e-06, "loss": 1.563, "step": 159 }, { "epoch": 0.6015037593984962, "grad_norm": 0.12451171875, "learning_rate": 3.9849624060150376e-06, "loss": 1.5973, "step": 160 }, { "epoch": 0.6052631578947368, "grad_norm": 0.1240234375, "learning_rate": 3.947368421052632e-06, "loss": 1.5963, "step": 161 }, { "epoch": 0.6090225563909775, "grad_norm": 0.10986328125, "learning_rate": 3.909774436090225e-06, "loss": 1.567, "step": 162 }, { "epoch": 0.6127819548872181, "grad_norm": 0.12890625, "learning_rate": 3.87218045112782e-06, "loss": 1.5866, "step": 163 }, { "epoch": 0.6165413533834586, "grad_norm": 0.11181640625, "learning_rate": 3.834586466165414e-06, "loss": 1.5829, "step": 164 }, { "epoch": 0.6203007518796992, "grad_norm": 0.1259765625, "learning_rate": 3.796992481203008e-06, "loss": 1.5601, "step": 165 }, { "epoch": 0.6240601503759399, "grad_norm": 0.1171875, "learning_rate": 3.7593984962406014e-06, "loss": 1.5983, "step": 166 }, { "epoch": 0.6278195488721805, "grad_norm": 0.1611328125, "learning_rate": 3.7218045112781957e-06, "loss": 1.5801, "step": 167 }, { "epoch": 0.631578947368421, "grad_norm": 0.11181640625, "learning_rate": 3.6842105263157896e-06, "loss": 1.5545, "step": 168 }, { "epoch": 0.6353383458646616, "grad_norm": 0.1103515625, "learning_rate": 3.646616541353384e-06, "loss": 1.5293, "step": 169 }, { "epoch": 0.6390977443609023, "grad_norm": 0.1103515625, "learning_rate": 3.6090225563909775e-06, "loss": 1.5764, "step": 170 }, { "epoch": 0.6428571428571429, "grad_norm": 0.11279296875, "learning_rate": 3.5714285714285718e-06, "loss": 1.5941, "step": 171 }, { "epoch": 0.6466165413533834, "grad_norm": 0.11181640625, "learning_rate": 3.5338345864661657e-06, "loss": 1.5842, "step": 172 }, { "epoch": 0.650375939849624, "grad_norm": 0.119140625, "learning_rate": 3.4962406015037596e-06, "loss": 1.5493, "step": 173 }, { "epoch": 0.6541353383458647, "grad_norm": 0.111328125, "learning_rate": 3.4586466165413535e-06, "loss": 1.5622, "step": 174 }, { "epoch": 0.6578947368421053, "grad_norm": 0.244140625, "learning_rate": 3.421052631578948e-06, "loss": 1.5905, "step": 175 }, { "epoch": 0.6616541353383458, "grad_norm": 0.1328125, "learning_rate": 3.3834586466165413e-06, "loss": 1.5928, "step": 176 }, { "epoch": 0.6654135338345865, "grad_norm": 0.12890625, "learning_rate": 3.3458646616541356e-06, "loss": 1.569, "step": 177 }, { "epoch": 0.6691729323308271, "grad_norm": 0.1083984375, "learning_rate": 3.3082706766917295e-06, "loss": 1.5706, "step": 178 }, { "epoch": 0.6729323308270677, "grad_norm": 0.109375, "learning_rate": 3.270676691729324e-06, "loss": 1.5598, "step": 179 }, { "epoch": 0.6766917293233082, "grad_norm": 0.134765625, "learning_rate": 3.2330827067669174e-06, "loss": 1.5968, "step": 180 }, { "epoch": 0.6804511278195489, "grad_norm": 0.119140625, "learning_rate": 3.1954887218045117e-06, "loss": 1.5789, "step": 181 }, { "epoch": 0.6842105263157895, "grad_norm": 0.11376953125, "learning_rate": 3.157894736842105e-06, "loss": 1.5565, "step": 182 }, { "epoch": 0.6879699248120301, "grad_norm": 0.1171875, "learning_rate": 3.1203007518796995e-06, "loss": 1.6115, "step": 183 }, { "epoch": 0.6917293233082706, "grad_norm": 0.123046875, "learning_rate": 3.0827067669172934e-06, "loss": 1.5572, "step": 184 }, { "epoch": 0.6954887218045113, "grad_norm": 0.111328125, "learning_rate": 3.0451127819548877e-06, "loss": 1.5763, "step": 185 }, { "epoch": 0.6992481203007519, "grad_norm": 0.1240234375, "learning_rate": 3.007518796992481e-06, "loss": 1.5728, "step": 186 }, { "epoch": 0.7030075187969925, "grad_norm": 0.11767578125, "learning_rate": 2.9699248120300755e-06, "loss": 1.5665, "step": 187 }, { "epoch": 0.706766917293233, "grad_norm": 0.12353515625, "learning_rate": 2.9323308270676694e-06, "loss": 1.5877, "step": 188 }, { "epoch": 0.7105263157894737, "grad_norm": 0.1572265625, "learning_rate": 2.8947368421052634e-06, "loss": 1.5883, "step": 189 }, { "epoch": 0.7142857142857143, "grad_norm": 0.11328125, "learning_rate": 2.8571428571428573e-06, "loss": 1.5978, "step": 190 }, { "epoch": 0.7180451127819549, "grad_norm": 0.1123046875, "learning_rate": 2.8195488721804516e-06, "loss": 1.5785, "step": 191 }, { "epoch": 0.7218045112781954, "grad_norm": 0.11962890625, "learning_rate": 2.781954887218045e-06, "loss": 1.5598, "step": 192 }, { "epoch": 0.7255639097744361, "grad_norm": 0.1123046875, "learning_rate": 2.7443609022556394e-06, "loss": 1.5426, "step": 193 }, { "epoch": 0.7293233082706767, "grad_norm": 0.1201171875, "learning_rate": 2.7067669172932333e-06, "loss": 1.5618, "step": 194 }, { "epoch": 0.7330827067669173, "grad_norm": 0.142578125, "learning_rate": 2.669172932330827e-06, "loss": 1.6137, "step": 195 }, { "epoch": 0.7368421052631579, "grad_norm": 0.15625, "learning_rate": 2.631578947368421e-06, "loss": 1.5443, "step": 196 }, { "epoch": 0.7406015037593985, "grad_norm": 0.130859375, "learning_rate": 2.5939849624060154e-06, "loss": 1.5559, "step": 197 }, { "epoch": 0.7443609022556391, "grad_norm": 0.177734375, "learning_rate": 2.556390977443609e-06, "loss": 1.5526, "step": 198 }, { "epoch": 0.7481203007518797, "grad_norm": 0.11962890625, "learning_rate": 2.5187969924812033e-06, "loss": 1.5839, "step": 199 }, { "epoch": 0.7518796992481203, "grad_norm": 0.1123046875, "learning_rate": 2.481203007518797e-06, "loss": 1.5576, "step": 200 }, { "epoch": 0.7556390977443609, "grad_norm": 0.1240234375, "learning_rate": 2.443609022556391e-06, "loss": 1.6115, "step": 201 }, { "epoch": 0.7593984962406015, "grad_norm": 0.1162109375, "learning_rate": 2.406015037593985e-06, "loss": 1.5613, "step": 202 }, { "epoch": 0.7631578947368421, "grad_norm": 0.11181640625, "learning_rate": 2.368421052631579e-06, "loss": 1.5773, "step": 203 }, { "epoch": 0.7669172932330827, "grad_norm": 0.11962890625, "learning_rate": 2.330827067669173e-06, "loss": 1.5458, "step": 204 }, { "epoch": 0.7706766917293233, "grad_norm": 0.1123046875, "learning_rate": 2.293233082706767e-06, "loss": 1.5696, "step": 205 }, { "epoch": 0.7744360902255639, "grad_norm": 0.10986328125, "learning_rate": 2.255639097744361e-06, "loss": 1.5509, "step": 206 }, { "epoch": 0.7781954887218046, "grad_norm": 0.11181640625, "learning_rate": 2.218045112781955e-06, "loss": 1.5625, "step": 207 }, { "epoch": 0.7819548872180451, "grad_norm": 0.10888671875, "learning_rate": 2.180451127819549e-06, "loss": 1.5507, "step": 208 }, { "epoch": 0.7857142857142857, "grad_norm": 0.12158203125, "learning_rate": 2.1428571428571427e-06, "loss": 1.5591, "step": 209 }, { "epoch": 0.7894736842105263, "grad_norm": 0.1533203125, "learning_rate": 2.105263157894737e-06, "loss": 1.5325, "step": 210 }, { "epoch": 0.793233082706767, "grad_norm": 0.1435546875, "learning_rate": 2.067669172932331e-06, "loss": 1.5841, "step": 211 }, { "epoch": 0.7969924812030075, "grad_norm": 0.11181640625, "learning_rate": 2.030075187969925e-06, "loss": 1.5669, "step": 212 }, { "epoch": 0.8007518796992481, "grad_norm": 0.1142578125, "learning_rate": 1.9924812030075188e-06, "loss": 1.5734, "step": 213 }, { "epoch": 0.8045112781954887, "grad_norm": 0.1474609375, "learning_rate": 1.9548872180451127e-06, "loss": 1.5846, "step": 214 }, { "epoch": 0.8082706766917294, "grad_norm": 0.115234375, "learning_rate": 1.917293233082707e-06, "loss": 1.5544, "step": 215 }, { "epoch": 0.8120300751879699, "grad_norm": 0.123046875, "learning_rate": 1.8796992481203007e-06, "loss": 1.5718, "step": 216 }, { "epoch": 0.8157894736842105, "grad_norm": 0.1318359375, "learning_rate": 1.8421052631578948e-06, "loss": 1.5483, "step": 217 }, { "epoch": 0.8195488721804511, "grad_norm": 0.11376953125, "learning_rate": 1.8045112781954887e-06, "loss": 1.5883, "step": 218 }, { "epoch": 0.8233082706766918, "grad_norm": 0.12158203125, "learning_rate": 1.7669172932330828e-06, "loss": 1.5785, "step": 219 }, { "epoch": 0.8270676691729323, "grad_norm": 0.11572265625, "learning_rate": 1.7293233082706767e-06, "loss": 1.5372, "step": 220 }, { "epoch": 0.8308270676691729, "grad_norm": 0.12890625, "learning_rate": 1.6917293233082707e-06, "loss": 1.5527, "step": 221 }, { "epoch": 0.8345864661654135, "grad_norm": 0.166015625, "learning_rate": 1.6541353383458648e-06, "loss": 1.5593, "step": 222 }, { "epoch": 0.8383458646616542, "grad_norm": 0.126953125, "learning_rate": 1.6165413533834587e-06, "loss": 1.545, "step": 223 }, { "epoch": 0.8421052631578947, "grad_norm": 0.11376953125, "learning_rate": 1.5789473684210526e-06, "loss": 1.5537, "step": 224 }, { "epoch": 0.8458646616541353, "grad_norm": 0.13671875, "learning_rate": 1.5413533834586467e-06, "loss": 1.5667, "step": 225 }, { "epoch": 0.849624060150376, "grad_norm": 0.12451171875, "learning_rate": 1.5037593984962406e-06, "loss": 1.5319, "step": 226 }, { "epoch": 0.8533834586466166, "grad_norm": 0.12451171875, "learning_rate": 1.4661654135338347e-06, "loss": 1.5617, "step": 227 }, { "epoch": 0.8571428571428571, "grad_norm": 0.146484375, "learning_rate": 1.4285714285714286e-06, "loss": 1.5843, "step": 228 }, { "epoch": 0.8609022556390977, "grad_norm": 0.1220703125, "learning_rate": 1.3909774436090225e-06, "loss": 1.5435, "step": 229 }, { "epoch": 0.8646616541353384, "grad_norm": 0.10888671875, "learning_rate": 1.3533834586466167e-06, "loss": 1.5431, "step": 230 }, { "epoch": 0.868421052631579, "grad_norm": 0.11328125, "learning_rate": 1.3157894736842106e-06, "loss": 1.5902, "step": 231 }, { "epoch": 0.8721804511278195, "grad_norm": 0.13671875, "learning_rate": 1.2781954887218045e-06, "loss": 1.5973, "step": 232 }, { "epoch": 0.8759398496240601, "grad_norm": 0.1162109375, "learning_rate": 1.2406015037593986e-06, "loss": 1.5482, "step": 233 }, { "epoch": 0.8796992481203008, "grad_norm": 0.11279296875, "learning_rate": 1.2030075187969925e-06, "loss": 1.5471, "step": 234 }, { "epoch": 0.8834586466165414, "grad_norm": 0.1513671875, "learning_rate": 1.1654135338345866e-06, "loss": 1.5542, "step": 235 }, { "epoch": 0.8872180451127819, "grad_norm": 0.166015625, "learning_rate": 1.1278195488721805e-06, "loss": 1.6103, "step": 236 }, { "epoch": 0.8909774436090225, "grad_norm": 0.119140625, "learning_rate": 1.0902255639097744e-06, "loss": 1.5798, "step": 237 }, { "epoch": 0.8947368421052632, "grad_norm": 0.1162109375, "learning_rate": 1.0526315789473685e-06, "loss": 1.5934, "step": 238 }, { "epoch": 0.8984962406015038, "grad_norm": 0.11083984375, "learning_rate": 1.0150375939849624e-06, "loss": 1.5938, "step": 239 }, { "epoch": 0.9022556390977443, "grad_norm": 0.130859375, "learning_rate": 9.774436090225563e-07, "loss": 1.5225, "step": 240 }, { "epoch": 0.9060150375939849, "grad_norm": 0.1376953125, "learning_rate": 9.398496240601504e-07, "loss": 1.5115, "step": 241 }, { "epoch": 0.9097744360902256, "grad_norm": 0.1279296875, "learning_rate": 9.022556390977444e-07, "loss": 1.5831, "step": 242 }, { "epoch": 0.9135338345864662, "grad_norm": 0.140625, "learning_rate": 8.646616541353384e-07, "loss": 1.5898, "step": 243 }, { "epoch": 0.9172932330827067, "grad_norm": 0.134765625, "learning_rate": 8.270676691729324e-07, "loss": 1.567, "step": 244 }, { "epoch": 0.9210526315789473, "grad_norm": 0.111328125, "learning_rate": 7.894736842105263e-07, "loss": 1.5691, "step": 245 }, { "epoch": 0.924812030075188, "grad_norm": 0.115234375, "learning_rate": 7.518796992481203e-07, "loss": 1.5861, "step": 246 }, { "epoch": 0.9285714285714286, "grad_norm": 0.11474609375, "learning_rate": 7.142857142857143e-07, "loss": 1.5823, "step": 247 }, { "epoch": 0.9323308270676691, "grad_norm": 0.1708984375, "learning_rate": 6.766917293233083e-07, "loss": 1.4993, "step": 248 }, { "epoch": 0.9360902255639098, "grad_norm": 0.16796875, "learning_rate": 6.390977443609022e-07, "loss": 1.6202, "step": 249 }, { "epoch": 0.9398496240601504, "grad_norm": 0.11376953125, "learning_rate": 6.015037593984962e-07, "loss": 1.5887, "step": 250 }, { "epoch": 0.943609022556391, "grad_norm": 0.1982421875, "learning_rate": 5.639097744360903e-07, "loss": 1.5946, "step": 251 }, { "epoch": 0.9473684210526315, "grad_norm": 0.125, "learning_rate": 5.263157894736843e-07, "loss": 1.597, "step": 252 }, { "epoch": 0.9511278195488722, "grad_norm": 0.1337890625, "learning_rate": 4.887218045112782e-07, "loss": 1.5722, "step": 253 }, { "epoch": 0.9548872180451128, "grad_norm": 0.142578125, "learning_rate": 4.511278195488722e-07, "loss": 1.5879, "step": 254 }, { "epoch": 0.9586466165413534, "grad_norm": 0.19140625, "learning_rate": 4.135338345864662e-07, "loss": 1.6135, "step": 255 }, { "epoch": 0.9624060150375939, "grad_norm": 0.115234375, "learning_rate": 3.7593984962406015e-07, "loss": 1.5847, "step": 256 }, { "epoch": 0.9661654135338346, "grad_norm": 0.11328125, "learning_rate": 3.3834586466165416e-07, "loss": 1.5615, "step": 257 }, { "epoch": 0.9699248120300752, "grad_norm": 0.1669921875, "learning_rate": 3.007518796992481e-07, "loss": 1.5342, "step": 258 }, { "epoch": 0.9736842105263158, "grad_norm": 0.1640625, "learning_rate": 2.6315789473684213e-07, "loss": 1.5971, "step": 259 }, { "epoch": 0.9774436090225563, "grad_norm": 0.1748046875, "learning_rate": 2.255639097744361e-07, "loss": 1.5573, "step": 260 }, { "epoch": 0.981203007518797, "grad_norm": 0.1591796875, "learning_rate": 1.8796992481203008e-07, "loss": 1.5388, "step": 261 }, { "epoch": 0.9849624060150376, "grad_norm": 0.11328125, "learning_rate": 1.5037593984962406e-07, "loss": 1.5811, "step": 262 }, { "epoch": 0.9887218045112782, "grad_norm": 0.1494140625, "learning_rate": 1.1278195488721805e-07, "loss": 1.5266, "step": 263 }, { "epoch": 0.9924812030075187, "grad_norm": 0.12060546875, "learning_rate": 7.518796992481203e-08, "loss": 1.5533, "step": 264 }, { "epoch": 0.9962406015037594, "grad_norm": 0.123046875, "learning_rate": 3.7593984962406015e-08, "loss": 1.5889, "step": 265 }, { "epoch": 1.0, "grad_norm": 0.11865234375, "learning_rate": 0.0, "loss": 1.5887, "step": 266 } ], "logging_steps": 1.0, "max_steps": 266, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 0, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 8.317663665780163e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null }