{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.4, "eval_steps": 500, "global_step": 160, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "Batch Mean": -3.22894287109375, "accuracy": 0.5625, "epoch": 0, "step": 0 }, { "epoch": 0.0025, "grad_norm": 5.802114963531494, "learning_rate": 2.5000000000000004e-07, "loss": 0.6948, "step": 1 }, { "Batch Mean": -3.20587158203125, "accuracy": 0.4921875, "epoch": 0.0025, "step": 1 }, { "epoch": 0.005, "grad_norm": 7.483729362487793, "learning_rate": 5.000000000000001e-07, "loss": 0.7075, "step": 2 }, { "Batch Mean": -3.23907470703125, "accuracy": 0.453125, "epoch": 0.005, "step": 2 }, { "epoch": 0.0075, "grad_norm": 6.9934587478637695, "learning_rate": 7.5e-07, "loss": 0.6985, "step": 3 }, { "Batch Mean": -3.25433349609375, "accuracy": 0.4609375, "epoch": 0.0075, "step": 3 }, { "epoch": 0.01, "grad_norm": 8.51505184173584, "learning_rate": 1.0000000000000002e-06, "loss": 0.71, "step": 4 }, { "Batch Mean": -3.229034423828125, "accuracy": 0.546875, "epoch": 0.01, "step": 4 }, { "epoch": 0.0125, "grad_norm": 6.16685676574707, "learning_rate": 1.25e-06, "loss": 0.6887, "step": 5 }, { "Batch Mean": -3.19061279296875, "accuracy": 0.453125, "epoch": 0.0125, "step": 5 }, { "epoch": 0.015, "grad_norm": 9.016836166381836, "learning_rate": 1.5e-06, "loss": 0.7021, "step": 6 }, { "Batch Mean": -3.174072265625, "accuracy": 0.5859375, "epoch": 0.015, "step": 6 }, { "epoch": 0.0175, "grad_norm": 8.049066543579102, "learning_rate": 1.75e-06, "loss": 0.6692, "step": 7 }, { "Batch Mean": -3.18316650390625, "accuracy": 0.4296875, "epoch": 0.0175, "step": 7 }, { "epoch": 0.02, "grad_norm": 6.920315265655518, "learning_rate": 2.0000000000000003e-06, "loss": 0.6972, "step": 8 }, { "Batch Mean": -3.1539306640625, "accuracy": 0.546875, "epoch": 0.02, "step": 8 }, { "epoch": 0.0225, "grad_norm": 11.55068302154541, "learning_rate": 2.25e-06, "loss": 0.6977, "step": 9 }, { "Batch Mean": -3.0648651123046875, "accuracy": 0.578125, "epoch": 0.0225, "step": 9 }, { "epoch": 0.025, "grad_norm": 8.16010570526123, "learning_rate": 2.5e-06, "loss": 0.6914, "step": 10 }, { "Batch Mean": -2.897817611694336, "accuracy": 0.5703125, "epoch": 0.025, "step": 10 }, { "epoch": 0.0275, "grad_norm": 11.839621543884277, "learning_rate": 2.7500000000000004e-06, "loss": 0.7092, "step": 11 }, { "Batch Mean": -2.694028377532959, "accuracy": 0.5625, "epoch": 0.0275, "step": 11 }, { "epoch": 0.03, "grad_norm": 15.166038513183594, "learning_rate": 3e-06, "loss": 0.7144, "step": 12 }, { "Batch Mean": -2.681365966796875, "accuracy": 0.625, "epoch": 0.03, "step": 12 }, { "epoch": 0.0325, "grad_norm": 8.728507995605469, "learning_rate": 3.2500000000000002e-06, "loss": 0.6493, "step": 13 }, { "Batch Mean": -2.1446685791015625, "accuracy": 0.6875, "epoch": 0.0325, "step": 13 }, { "epoch": 0.035, "grad_norm": 8.090866088867188, "learning_rate": 3.5e-06, "loss": 0.6339, "step": 14 }, { "Batch Mean": -1.779296875, "accuracy": 0.703125, "epoch": 0.035, "step": 14 }, { "epoch": 0.0375, "grad_norm": 6.903839588165283, "learning_rate": 3.7500000000000005e-06, "loss": 0.5809, "step": 15 }, { "Batch Mean": -1.4260149002075195, "accuracy": 0.6953125, "epoch": 0.0375, "step": 15 }, { "epoch": 0.04, "grad_norm": 7.504371166229248, "learning_rate": 4.000000000000001e-06, "loss": 0.5785, "step": 16 }, { "Batch Mean": -1.3612947463989258, "accuracy": 0.734375, "epoch": 0.04, "step": 16 }, { "epoch": 0.0425, "grad_norm": 8.880864143371582, "learning_rate": 4.25e-06, "loss": 0.6355, "step": 17 }, { "Batch Mean": -1.0974332094192505, "accuracy": 0.671875, "epoch": 0.0425, "step": 17 }, { "epoch": 0.045, "grad_norm": 7.95067834854126, "learning_rate": 4.5e-06, "loss": 0.5865, "step": 18 }, { "Batch Mean": -1.0067845582962036, "accuracy": 0.6796875, "epoch": 0.045, "step": 18 }, { "epoch": 0.0475, "grad_norm": 11.53207778930664, "learning_rate": 4.75e-06, "loss": 0.6069, "step": 19 }, { "Batch Mean": -1.4595129489898682, "accuracy": 0.6640625, "epoch": 0.0475, "step": 19 }, { "epoch": 0.05, "grad_norm": 9.007086753845215, "learning_rate": 5e-06, "loss": 0.5642, "step": 20 }, { "Batch Mean": -2.0267333984375, "accuracy": 0.625, "epoch": 0.05, "step": 20 }, { "epoch": 0.0525, "grad_norm": 10.302614212036133, "learning_rate": 4.986842105263158e-06, "loss": 0.6487, "step": 21 }, { "Batch Mean": -2.4782352447509766, "accuracy": 0.765625, "epoch": 0.0525, "step": 21 }, { "epoch": 0.055, "grad_norm": 9.321782112121582, "learning_rate": 4.973684210526316e-06, "loss": 0.5038, "step": 22 }, { "Batch Mean": -2.5980658531188965, "accuracy": 0.6796875, "epoch": 0.055, "step": 22 }, { "epoch": 0.0575, "grad_norm": 8.676246643066406, "learning_rate": 4.960526315789474e-06, "loss": 0.5891, "step": 23 }, { "Batch Mean": -2.863405227661133, "accuracy": 0.59375, "epoch": 0.0575, "step": 23 }, { "epoch": 0.06, "grad_norm": 9.5335111618042, "learning_rate": 4.947368421052632e-06, "loss": 0.6704, "step": 24 }, { "Batch Mean": -2.9449462890625, "accuracy": 0.6796875, "epoch": 0.06, "step": 24 }, { "epoch": 0.0625, "grad_norm": 6.348640441894531, "learning_rate": 4.9342105263157895e-06, "loss": 0.5517, "step": 25 }, { "Batch Mean": -2.93194580078125, "accuracy": 0.6796875, "epoch": 0.0625, "step": 25 }, { "epoch": 0.065, "grad_norm": 7.16481876373291, "learning_rate": 4.921052631578948e-06, "loss": 0.554, "step": 26 }, { "Batch Mean": -2.9799575805664062, "accuracy": 0.78125, "epoch": 0.065, "step": 26 }, { "epoch": 0.0675, "grad_norm": 6.40033483505249, "learning_rate": 4.907894736842106e-06, "loss": 0.5142, "step": 27 }, { "Batch Mean": -3.10003662109375, "accuracy": 0.671875, "epoch": 0.0675, "step": 27 }, { "epoch": 0.07, "grad_norm": 6.684910774230957, "learning_rate": 4.894736842105264e-06, "loss": 0.5953, "step": 28 }, { "Batch Mean": -3.2201080322265625, "accuracy": 0.6796875, "epoch": 0.07, "step": 28 }, { "epoch": 0.0725, "grad_norm": 6.370337009429932, "learning_rate": 4.881578947368422e-06, "loss": 0.5572, "step": 29 }, { "Batch Mean": -3.1351852416992188, "accuracy": 0.7421875, "epoch": 0.0725, "step": 29 }, { "epoch": 0.075, "grad_norm": 6.65242338180542, "learning_rate": 4.8684210526315795e-06, "loss": 0.5418, "step": 30 }, { "Batch Mean": -3.2759580612182617, "accuracy": 0.7265625, "epoch": 0.075, "step": 30 }, { "epoch": 0.0775, "grad_norm": 7.722072124481201, "learning_rate": 4.855263157894737e-06, "loss": 0.5225, "step": 31 }, { "Batch Mean": -3.0810060501098633, "accuracy": 0.7109375, "epoch": 0.0775, "step": 31 }, { "epoch": 0.08, "grad_norm": 8.74831771850586, "learning_rate": 4.842105263157895e-06, "loss": 0.5407, "step": 32 }, { "Batch Mean": -2.9466419219970703, "accuracy": 0.71875, "epoch": 0.08, "step": 32 }, { "epoch": 0.0825, "grad_norm": 8.841214179992676, "learning_rate": 4.828947368421053e-06, "loss": 0.5603, "step": 33 }, { "Batch Mean": -3.164520263671875, "accuracy": 0.671875, "epoch": 0.0825, "step": 33 }, { "epoch": 0.085, "grad_norm": 8.782054901123047, "learning_rate": 4.815789473684211e-06, "loss": 0.5638, "step": 34 }, { "Batch Mean": -3.0882205963134766, "accuracy": 0.7421875, "epoch": 0.085, "step": 34 }, { "epoch": 0.0875, "grad_norm": 7.124765872955322, "learning_rate": 4.802631578947369e-06, "loss": 0.4709, "step": 35 }, { "Batch Mean": -3.3660800457000732, "accuracy": 0.7734375, "epoch": 0.0875, "step": 35 }, { "epoch": 0.09, "grad_norm": 8.432646751403809, "learning_rate": 4.789473684210527e-06, "loss": 0.5142, "step": 36 }, { "Batch Mean": -3.3540000915527344, "accuracy": 0.65625, "epoch": 0.09, "step": 36 }, { "epoch": 0.0925, "grad_norm": 8.90056037902832, "learning_rate": 4.7763157894736844e-06, "loss": 0.5726, "step": 37 }, { "Batch Mean": -3.630018949508667, "accuracy": 0.7578125, "epoch": 0.0925, "step": 37 }, { "epoch": 0.095, "grad_norm": 10.051451683044434, "learning_rate": 4.763157894736842e-06, "loss": 0.5455, "step": 38 }, { "Batch Mean": -3.8007164001464844, "accuracy": 0.671875, "epoch": 0.095, "step": 38 }, { "epoch": 0.0975, "grad_norm": 8.25788688659668, "learning_rate": 4.75e-06, "loss": 0.6009, "step": 39 }, { "Batch Mean": -3.8345565795898438, "accuracy": 0.7421875, "epoch": 0.0975, "step": 39 }, { "epoch": 0.1, "grad_norm": 6.112834453582764, "learning_rate": 4.736842105263158e-06, "loss": 0.4995, "step": 40 }, { "Batch Mean": -3.868865966796875, "accuracy": 0.765625, "epoch": 0.1, "step": 40 }, { "epoch": 0.1025, "grad_norm": 6.251713752746582, "learning_rate": 4.723684210526316e-06, "loss": 0.5117, "step": 41 }, { "Batch Mean": -4.0014801025390625, "accuracy": 0.7109375, "epoch": 0.1025, "step": 41 }, { "epoch": 0.105, "grad_norm": 6.486876487731934, "learning_rate": 4.710526315789474e-06, "loss": 0.5281, "step": 42 }, { "Batch Mean": -3.8963470458984375, "accuracy": 0.71875, "epoch": 0.105, "step": 42 }, { "epoch": 0.1075, "grad_norm": 7.075796604156494, "learning_rate": 4.697368421052632e-06, "loss": 0.5201, "step": 43 }, { "Batch Mean": -3.6251320838928223, "accuracy": 0.765625, "epoch": 0.1075, "step": 43 }, { "epoch": 0.11, "grad_norm": 6.587506294250488, "learning_rate": 4.68421052631579e-06, "loss": 0.453, "step": 44 }, { "Batch Mean": -3.119325637817383, "accuracy": 0.75, "epoch": 0.11, "step": 44 }, { "epoch": 0.1125, "grad_norm": 6.736975193023682, "learning_rate": 4.671052631578948e-06, "loss": 0.4731, "step": 45 }, { "Batch Mean": -2.6836140155792236, "accuracy": 0.7890625, "epoch": 0.1125, "step": 45 }, { "epoch": 0.115, "grad_norm": 7.933563709259033, "learning_rate": 4.657894736842106e-06, "loss": 0.4237, "step": 46 }, { "Batch Mean": -2.348551034927368, "accuracy": 0.7109375, "epoch": 0.115, "step": 46 }, { "epoch": 0.1175, "grad_norm": 12.173978805541992, "learning_rate": 4.6447368421052635e-06, "loss": 0.562, "step": 47 }, { "Batch Mean": -2.281805992126465, "accuracy": 0.8125, "epoch": 0.1175, "step": 47 }, { "epoch": 0.12, "grad_norm": 8.108612060546875, "learning_rate": 4.631578947368421e-06, "loss": 0.4135, "step": 48 }, { "Batch Mean": -1.5221551656723022, "accuracy": 0.7265625, "epoch": 0.12, "step": 48 }, { "epoch": 0.1225, "grad_norm": 9.601967811584473, "learning_rate": 4.618421052631579e-06, "loss": 0.5164, "step": 49 }, { "Batch Mean": -1.7016830444335938, "accuracy": 0.7109375, "epoch": 0.1225, "step": 49 }, { "epoch": 0.125, "grad_norm": 12.160585403442383, "learning_rate": 4.605263157894737e-06, "loss": 0.5847, "step": 50 }, { "Batch Mean": -1.818105697631836, "accuracy": 0.7265625, "epoch": 0.125, "step": 50 }, { "epoch": 0.1275, "grad_norm": 10.746450424194336, "learning_rate": 4.592105263157895e-06, "loss": 0.512, "step": 51 }, { "Batch Mean": -1.5451364517211914, "accuracy": 0.734375, "epoch": 0.1275, "step": 51 }, { "epoch": 0.13, "grad_norm": 8.993597984313965, "learning_rate": 4.578947368421053e-06, "loss": 0.4932, "step": 52 }, { "Batch Mean": -1.592740535736084, "accuracy": 0.734375, "epoch": 0.13, "step": 52 }, { "epoch": 0.1325, "grad_norm": 11.777365684509277, "learning_rate": 4.565789473684211e-06, "loss": 0.5973, "step": 53 }, { "Batch Mean": -1.7593857049942017, "accuracy": 0.75, "epoch": 0.1325, "step": 53 }, { "epoch": 0.135, "grad_norm": 8.874958992004395, "learning_rate": 4.552631578947369e-06, "loss": 0.4976, "step": 54 }, { "Batch Mean": -1.6973432302474976, "accuracy": 0.703125, "epoch": 0.135, "step": 54 }, { "epoch": 0.1375, "grad_norm": 7.982842922210693, "learning_rate": 4.539473684210527e-06, "loss": 0.523, "step": 55 }, { "Batch Mean": -1.5194091796875, "accuracy": 0.7578125, "epoch": 0.1375, "step": 55 }, { "epoch": 0.14, "grad_norm": 7.006033897399902, "learning_rate": 4.526315789473685e-06, "loss": 0.4739, "step": 56 }, { "Batch Mean": -1.8582556247711182, "accuracy": 0.6640625, "epoch": 0.14, "step": 56 }, { "epoch": 0.1425, "grad_norm": 10.20836067199707, "learning_rate": 4.513157894736843e-06, "loss": 0.5806, "step": 57 }, { "Batch Mean": -1.5920772552490234, "accuracy": 0.7265625, "epoch": 0.1425, "step": 57 }, { "epoch": 0.145, "grad_norm": 7.661640167236328, "learning_rate": 4.5e-06, "loss": 0.542, "step": 58 }, { "Batch Mean": -1.8221396207809448, "accuracy": 0.7265625, "epoch": 0.145, "step": 58 }, { "epoch": 0.1475, "grad_norm": 5.577967166900635, "learning_rate": 4.4868421052631584e-06, "loss": 0.484, "step": 59 }, { "Batch Mean": -1.7805979251861572, "accuracy": 0.734375, "epoch": 0.1475, "step": 59 }, { "epoch": 0.15, "grad_norm": 6.151084899902344, "learning_rate": 4.473684210526316e-06, "loss": 0.5184, "step": 60 }, { "Batch Mean": -1.9499595165252686, "accuracy": 0.78125, "epoch": 0.15, "step": 60 }, { "epoch": 0.1525, "grad_norm": 6.0847954750061035, "learning_rate": 4.460526315789474e-06, "loss": 0.4684, "step": 61 }, { "Batch Mean": -1.6745810508728027, "accuracy": 0.71875, "epoch": 0.1525, "step": 61 }, { "epoch": 0.155, "grad_norm": 5.9063334465026855, "learning_rate": 4.447368421052632e-06, "loss": 0.4899, "step": 62 }, { "Batch Mean": -1.7260775566101074, "accuracy": 0.7734375, "epoch": 0.155, "step": 62 }, { "epoch": 0.1575, "grad_norm": 6.408818244934082, "learning_rate": 4.43421052631579e-06, "loss": 0.4802, "step": 63 }, { "Batch Mean": -1.6710443496704102, "accuracy": 0.7265625, "epoch": 0.1575, "step": 63 }, { "epoch": 0.16, "grad_norm": 6.913627624511719, "learning_rate": 4.4210526315789476e-06, "loss": 0.4864, "step": 64 }, { "Batch Mean": -1.5562989711761475, "accuracy": 0.7890625, "epoch": 0.16, "step": 64 }, { "epoch": 0.1625, "grad_norm": 6.550841808319092, "learning_rate": 4.407894736842105e-06, "loss": 0.5027, "step": 65 }, { "Batch Mean": -1.1634740829467773, "accuracy": 0.6875, "epoch": 0.1625, "step": 65 }, { "epoch": 0.165, "grad_norm": 8.442750930786133, "learning_rate": 4.394736842105263e-06, "loss": 0.5376, "step": 66 }, { "Batch Mean": -1.1364693641662598, "accuracy": 0.8046875, "epoch": 0.165, "step": 66 }, { "epoch": 0.1675, "grad_norm": 7.907973289489746, "learning_rate": 4.381578947368421e-06, "loss": 0.4159, "step": 67 }, { "Batch Mean": -0.5613183975219727, "accuracy": 0.7734375, "epoch": 0.1675, "step": 67 }, { "epoch": 0.17, "grad_norm": 8.844648361206055, "learning_rate": 4.368421052631579e-06, "loss": 0.4656, "step": 68 }, { "Batch Mean": -0.8619356155395508, "accuracy": 0.703125, "epoch": 0.17, "step": 68 }, { "epoch": 0.1725, "grad_norm": 9.813117027282715, "learning_rate": 4.3552631578947375e-06, "loss": 0.4931, "step": 69 }, { "Batch Mean": -0.7304730415344238, "accuracy": 0.7265625, "epoch": 0.1725, "step": 69 }, { "epoch": 0.175, "grad_norm": 9.341872215270996, "learning_rate": 4.342105263157895e-06, "loss": 0.5239, "step": 70 }, { "Batch Mean": -0.6615705490112305, "accuracy": 0.7109375, "epoch": 0.175, "step": 70 }, { "epoch": 0.1775, "grad_norm": 9.673095703125, "learning_rate": 4.328947368421053e-06, "loss": 0.5137, "step": 71 }, { "Batch Mean": -0.551055908203125, "accuracy": 0.765625, "epoch": 0.1775, "step": 71 }, { "epoch": 0.18, "grad_norm": 8.690629005432129, "learning_rate": 4.315789473684211e-06, "loss": 0.4509, "step": 72 }, { "Batch Mean": -0.6300144195556641, "accuracy": 0.8515625, "epoch": 0.18, "step": 72 }, { "epoch": 0.1825, "grad_norm": 6.654637336730957, "learning_rate": 4.302631578947369e-06, "loss": 0.3857, "step": 73 }, { "Batch Mean": -0.6115929484367371, "accuracy": 0.7734375, "epoch": 0.1825, "step": 73 }, { "epoch": 0.185, "grad_norm": 7.764965534210205, "learning_rate": 4.289473684210527e-06, "loss": 0.4766, "step": 74 }, { "Batch Mean": -0.9064078330993652, "accuracy": 0.7734375, "epoch": 0.185, "step": 74 }, { "epoch": 0.1875, "grad_norm": 7.712812423706055, "learning_rate": 4.276315789473684e-06, "loss": 0.4238, "step": 75 }, { "Batch Mean": -0.5394480228424072, "accuracy": 0.8203125, "epoch": 0.1875, "step": 75 }, { "epoch": 0.19, "grad_norm": 6.678032398223877, "learning_rate": 4.2631578947368425e-06, "loss": 0.3955, "step": 76 }, { "Batch Mean": -0.751708447933197, "accuracy": 0.7421875, "epoch": 0.19, "step": 76 }, { "epoch": 0.1925, "grad_norm": 7.722261428833008, "learning_rate": 4.25e-06, "loss": 0.4607, "step": 77 }, { "Batch Mean": -0.47676777839660645, "accuracy": 0.78125, "epoch": 0.1925, "step": 77 }, { "epoch": 0.195, "grad_norm": 7.482136249542236, "learning_rate": 4.236842105263158e-06, "loss": 0.4523, "step": 78 }, { "Batch Mean": -0.44623780250549316, "accuracy": 0.7109375, "epoch": 0.195, "step": 78 }, { "epoch": 0.1975, "grad_norm": 8.689245223999023, "learning_rate": 4.223684210526316e-06, "loss": 0.5507, "step": 79 }, { "Batch Mean": -0.44862329959869385, "accuracy": 0.7890625, "epoch": 0.1975, "step": 79 }, { "epoch": 0.2, "grad_norm": 8.543317794799805, "learning_rate": 4.210526315789474e-06, "loss": 0.481, "step": 80 }, { "Batch Mean": -0.8662099838256836, "accuracy": 0.7734375, "epoch": 0.2, "step": 80 }, { "epoch": 0.2025, "grad_norm": 8.948282241821289, "learning_rate": 4.197368421052632e-06, "loss": 0.5276, "step": 81 }, { "Batch Mean": -1.0495636463165283, "accuracy": 0.7109375, "epoch": 0.2025, "step": 81 }, { "epoch": 0.205, "grad_norm": 7.718348026275635, "learning_rate": 4.18421052631579e-06, "loss": 0.4894, "step": 82 }, { "Batch Mean": -0.7387936115264893, "accuracy": 0.8046875, "epoch": 0.205, "step": 82 }, { "epoch": 0.2075, "grad_norm": 6.261586666107178, "learning_rate": 4.171052631578948e-06, "loss": 0.3884, "step": 83 }, { "Batch Mean": -1.0069444179534912, "accuracy": 0.7890625, "epoch": 0.2075, "step": 83 }, { "epoch": 0.21, "grad_norm": 7.533053874969482, "learning_rate": 4.157894736842106e-06, "loss": 0.5043, "step": 84 }, { "Batch Mean": -1.2979249954223633, "accuracy": 0.71875, "epoch": 0.21, "step": 84 }, { "epoch": 0.2125, "grad_norm": 8.032295227050781, "learning_rate": 4.144736842105263e-06, "loss": 0.5288, "step": 85 }, { "Batch Mean": -1.8461058139801025, "accuracy": 0.7421875, "epoch": 0.2125, "step": 85 }, { "epoch": 0.215, "grad_norm": 8.22770881652832, "learning_rate": 4.1315789473684216e-06, "loss": 0.5492, "step": 86 }, { "Batch Mean": -1.119255542755127, "accuracy": 0.734375, "epoch": 0.215, "step": 86 }, { "epoch": 0.2175, "grad_norm": 6.965932846069336, "learning_rate": 4.118421052631579e-06, "loss": 0.5123, "step": 87 }, { "Batch Mean": -1.3000688552856445, "accuracy": 0.7734375, "epoch": 0.2175, "step": 87 }, { "epoch": 0.22, "grad_norm": 6.322815418243408, "learning_rate": 4.105263157894737e-06, "loss": 0.5046, "step": 88 }, { "Batch Mean": -1.6452465057373047, "accuracy": 0.78125, "epoch": 0.22, "step": 88 }, { "epoch": 0.2225, "grad_norm": 5.874916076660156, "learning_rate": 4.092105263157895e-06, "loss": 0.4862, "step": 89 }, { "Batch Mean": -1.3325897455215454, "accuracy": 0.703125, "epoch": 0.2225, "step": 89 }, { "epoch": 0.225, "grad_norm": 6.671369552612305, "learning_rate": 4.078947368421053e-06, "loss": 0.4861, "step": 90 }, { "Batch Mean": -1.272374153137207, "accuracy": 0.7734375, "epoch": 0.225, "step": 90 }, { "epoch": 0.2275, "grad_norm": 6.803928375244141, "learning_rate": 4.065789473684211e-06, "loss": 0.5159, "step": 91 }, { "Batch Mean": -1.2404394149780273, "accuracy": 0.7265625, "epoch": 0.2275, "step": 91 }, { "epoch": 0.23, "grad_norm": 6.205514907836914, "learning_rate": 4.052631578947368e-06, "loss": 0.4998, "step": 92 }, { "Batch Mean": -0.8970499038696289, "accuracy": 0.7890625, "epoch": 0.23, "step": 92 }, { "epoch": 0.2325, "grad_norm": 6.928950309753418, "learning_rate": 4.0394736842105265e-06, "loss": 0.4457, "step": 93 }, { "Batch Mean": -1.1840200424194336, "accuracy": 0.7265625, "epoch": 0.2325, "step": 93 }, { "epoch": 0.235, "grad_norm": 6.628427028656006, "learning_rate": 4.026315789473684e-06, "loss": 0.4956, "step": 94 }, { "Batch Mean": -0.9916634559631348, "accuracy": 0.7421875, "epoch": 0.235, "step": 94 }, { "epoch": 0.2375, "grad_norm": 6.572470188140869, "learning_rate": 4.013157894736842e-06, "loss": 0.4784, "step": 95 }, { "Batch Mean": -1.0057942867279053, "accuracy": 0.6875, "epoch": 0.2375, "step": 95 }, { "epoch": 0.24, "grad_norm": 7.420350551605225, "learning_rate": 4.000000000000001e-06, "loss": 0.5301, "step": 96 }, { "Batch Mean": -1.141215205192566, "accuracy": 0.8125, "epoch": 0.24, "step": 96 }, { "epoch": 0.2425, "grad_norm": 6.693457126617432, "learning_rate": 3.986842105263158e-06, "loss": 0.4688, "step": 97 }, { "Batch Mean": -1.14703369140625, "accuracy": 0.796875, "epoch": 0.2425, "step": 97 }, { "epoch": 0.245, "grad_norm": 7.45700740814209, "learning_rate": 3.9736842105263165e-06, "loss": 0.4147, "step": 98 }, { "Batch Mean": -1.1375855207443237, "accuracy": 0.8359375, "epoch": 0.245, "step": 98 }, { "epoch": 0.2475, "grad_norm": 6.689132213592529, "learning_rate": 3.960526315789474e-06, "loss": 0.3805, "step": 99 }, { "Batch Mean": -0.9851565361022949, "accuracy": 0.75, "epoch": 0.2475, "step": 99 }, { "epoch": 0.25, "grad_norm": 9.684271812438965, "learning_rate": 3.947368421052632e-06, "loss": 0.4405, "step": 100 }, { "Batch Mean": -1.0915021896362305, "accuracy": 0.7734375, "epoch": 0.25, "step": 100 }, { "epoch": 0.2525, "grad_norm": 8.271260261535645, "learning_rate": 3.93421052631579e-06, "loss": 0.4442, "step": 101 }, { "Batch Mean": -0.7684752941131592, "accuracy": 0.7890625, "epoch": 0.2525, "step": 101 }, { "epoch": 0.255, "grad_norm": 8.824223518371582, "learning_rate": 3.921052631578947e-06, "loss": 0.4561, "step": 102 }, { "Batch Mean": -0.8534479141235352, "accuracy": 0.734375, "epoch": 0.255, "step": 102 }, { "epoch": 0.2575, "grad_norm": 10.552571296691895, "learning_rate": 3.907894736842106e-06, "loss": 0.4812, "step": 103 }, { "Batch Mean": -0.4684869349002838, "accuracy": 0.7421875, "epoch": 0.2575, "step": 103 }, { "epoch": 0.26, "grad_norm": 9.715003967285156, "learning_rate": 3.894736842105263e-06, "loss": 0.468, "step": 104 }, { "Batch Mean": -0.3979635238647461, "accuracy": 0.7734375, "epoch": 0.26, "step": 104 }, { "epoch": 0.2625, "grad_norm": 10.01593017578125, "learning_rate": 3.8815789473684214e-06, "loss": 0.4727, "step": 105 }, { "Batch Mean": -0.1986093521118164, "accuracy": 0.796875, "epoch": 0.2625, "step": 105 }, { "epoch": 0.265, "grad_norm": 10.657917022705078, "learning_rate": 3.868421052631579e-06, "loss": 0.455, "step": 106 }, { "Batch Mean": -0.09849877655506134, "accuracy": 0.7734375, "epoch": 0.265, "step": 106 }, { "epoch": 0.2675, "grad_norm": 11.969587326049805, "learning_rate": 3.855263157894737e-06, "loss": 0.4715, "step": 107 }, { "Batch Mean": -0.21042925119400024, "accuracy": 0.8046875, "epoch": 0.2675, "step": 107 }, { "epoch": 0.27, "grad_norm": 10.609965324401855, "learning_rate": 3.842105263157895e-06, "loss": 0.4328, "step": 108 }, { "Batch Mean": -0.5359489917755127, "accuracy": 0.8046875, "epoch": 0.27, "step": 108 }, { "epoch": 0.2725, "grad_norm": 9.500229835510254, "learning_rate": 3.828947368421053e-06, "loss": 0.4106, "step": 109 }, { "Batch Mean": 0.3338432312011719, "accuracy": 0.796875, "epoch": 0.2725, "step": 109 }, { "epoch": 0.275, "grad_norm": 11.425292015075684, "learning_rate": 3.815789473684211e-06, "loss": 0.483, "step": 110 }, { "Batch Mean": -0.30494236946105957, "accuracy": 0.796875, "epoch": 0.275, "step": 110 }, { "epoch": 0.2775, "grad_norm": 9.747135162353516, "learning_rate": 3.802631578947369e-06, "loss": 0.457, "step": 111 }, { "Batch Mean": -0.5019316673278809, "accuracy": 0.75, "epoch": 0.2775, "step": 111 }, { "epoch": 0.28, "grad_norm": 10.03312873840332, "learning_rate": 3.789473684210527e-06, "loss": 0.489, "step": 112 }, { "Batch Mean": -0.3197094202041626, "accuracy": 0.7734375, "epoch": 0.28, "step": 112 }, { "epoch": 0.2825, "grad_norm": 7.567203044891357, "learning_rate": 3.7763157894736847e-06, "loss": 0.4097, "step": 113 }, { "Batch Mean": -0.6099193096160889, "accuracy": 0.7890625, "epoch": 0.2825, "step": 113 }, { "epoch": 0.285, "grad_norm": 9.13629150390625, "learning_rate": 3.7631578947368426e-06, "loss": 0.5063, "step": 114 }, { "Batch Mean": -0.5079336166381836, "accuracy": 0.78125, "epoch": 0.285, "step": 114 }, { "epoch": 0.2875, "grad_norm": 7.463940143585205, "learning_rate": 3.7500000000000005e-06, "loss": 0.4452, "step": 115 }, { "Batch Mean": -0.982884407043457, "accuracy": 0.8125, "epoch": 0.2875, "step": 115 }, { "epoch": 0.29, "grad_norm": 6.665597438812256, "learning_rate": 3.736842105263158e-06, "loss": 0.4275, "step": 116 }, { "Batch Mean": -1.1093603372573853, "accuracy": 0.7265625, "epoch": 0.29, "step": 116 }, { "epoch": 0.2925, "grad_norm": 7.881545066833496, "learning_rate": 3.723684210526316e-06, "loss": 0.5385, "step": 117 }, { "Batch Mean": -1.0912278890609741, "accuracy": 0.7421875, "epoch": 0.2925, "step": 117 }, { "epoch": 0.295, "grad_norm": 7.39991569519043, "learning_rate": 3.710526315789474e-06, "loss": 0.5116, "step": 118 }, { "Batch Mean": -1.1224448680877686, "accuracy": 0.7265625, "epoch": 0.295, "step": 118 }, { "epoch": 0.2975, "grad_norm": 6.270720481872559, "learning_rate": 3.6973684210526317e-06, "loss": 0.4805, "step": 119 }, { "Batch Mean": -0.8693461418151855, "accuracy": 0.7578125, "epoch": 0.2975, "step": 119 }, { "epoch": 0.3, "grad_norm": 5.773062705993652, "learning_rate": 3.6842105263157896e-06, "loss": 0.4403, "step": 120 }, { "Batch Mean": -1.0121287107467651, "accuracy": 0.7734375, "epoch": 0.3, "step": 120 }, { "epoch": 0.3025, "grad_norm": 6.426705837249756, "learning_rate": 3.6710526315789476e-06, "loss": 0.4496, "step": 121 }, { "Batch Mean": -1.0755189657211304, "accuracy": 0.765625, "epoch": 0.3025, "step": 121 }, { "epoch": 0.305, "grad_norm": 6.251639366149902, "learning_rate": 3.657894736842106e-06, "loss": 0.4839, "step": 122 }, { "Batch Mean": -0.6420345306396484, "accuracy": 0.8359375, "epoch": 0.305, "step": 122 }, { "epoch": 0.3075, "grad_norm": 6.248297691345215, "learning_rate": 3.644736842105264e-06, "loss": 0.4346, "step": 123 }, { "Batch Mean": -0.5942535400390625, "accuracy": 0.75, "epoch": 0.3075, "step": 123 }, { "epoch": 0.31, "grad_norm": 6.997790813446045, "learning_rate": 3.6315789473684217e-06, "loss": 0.4997, "step": 124 }, { "Batch Mean": -0.3599095344543457, "accuracy": 0.765625, "epoch": 0.31, "step": 124 }, { "epoch": 0.3125, "grad_norm": 6.403144836425781, "learning_rate": 3.618421052631579e-06, "loss": 0.4857, "step": 125 }, { "Batch Mean": -0.12914514541625977, "accuracy": 0.8046875, "epoch": 0.3125, "step": 125 }, { "epoch": 0.315, "grad_norm": 6.186731338500977, "learning_rate": 3.605263157894737e-06, "loss": 0.4258, "step": 126 }, { "Batch Mean": -0.026085376739501953, "accuracy": 0.8203125, "epoch": 0.315, "step": 126 }, { "epoch": 0.3175, "grad_norm": 6.993777275085449, "learning_rate": 3.592105263157895e-06, "loss": 0.4127, "step": 127 }, { "Batch Mean": -0.19666238129138947, "accuracy": 0.75, "epoch": 0.3175, "step": 127 }, { "epoch": 0.32, "grad_norm": 6.9689483642578125, "learning_rate": 3.578947368421053e-06, "loss": 0.4782, "step": 128 }, { "Batch Mean": -0.2396249771118164, "accuracy": 0.75, "epoch": 0.32, "step": 128 }, { "epoch": 0.3225, "grad_norm": 8.292496681213379, "learning_rate": 3.565789473684211e-06, "loss": 0.5033, "step": 129 }, { "Batch Mean": -0.08678841590881348, "accuracy": 0.828125, "epoch": 0.3225, "step": 129 }, { "epoch": 0.325, "grad_norm": 8.41964054107666, "learning_rate": 3.5526315789473687e-06, "loss": 0.4106, "step": 130 }, { "Batch Mean": -0.2747459411621094, "accuracy": 0.7578125, "epoch": 0.325, "step": 130 }, { "epoch": 0.3275, "grad_norm": 9.222663879394531, "learning_rate": 3.5394736842105266e-06, "loss": 0.4377, "step": 131 }, { "Batch Mean": 0.3878480792045593, "accuracy": 0.7421875, "epoch": 0.3275, "step": 131 }, { "epoch": 0.33, "grad_norm": 10.44568920135498, "learning_rate": 3.5263157894736846e-06, "loss": 0.507, "step": 132 }, { "Batch Mean": 0.42960524559020996, "accuracy": 0.7734375, "epoch": 0.33, "step": 132 }, { "epoch": 0.3325, "grad_norm": 8.473082542419434, "learning_rate": 3.513157894736842e-06, "loss": 0.468, "step": 133 }, { "Batch Mean": 0.5482549667358398, "accuracy": 0.75, "epoch": 0.3325, "step": 133 }, { "epoch": 0.335, "grad_norm": 9.393155097961426, "learning_rate": 3.5e-06, "loss": 0.5064, "step": 134 }, { "Batch Mean": 0.21044254302978516, "accuracy": 0.8125, "epoch": 0.335, "step": 134 }, { "epoch": 0.3375, "grad_norm": 7.462342739105225, "learning_rate": 3.486842105263158e-06, "loss": 0.4034, "step": 135 }, { "Batch Mean": 0.511605978012085, "accuracy": 0.765625, "epoch": 0.3375, "step": 135 }, { "epoch": 0.34, "grad_norm": 7.791304588317871, "learning_rate": 3.473684210526316e-06, "loss": 0.4827, "step": 136 }, { "Batch Mean": -0.08954060077667236, "accuracy": 0.7421875, "epoch": 0.34, "step": 136 }, { "epoch": 0.3425, "grad_norm": 9.468352317810059, "learning_rate": 3.460526315789474e-06, "loss": 0.5404, "step": 137 }, { "Batch Mean": -0.3172520399093628, "accuracy": 0.7578125, "epoch": 0.3425, "step": 137 }, { "epoch": 0.345, "grad_norm": 7.7005720138549805, "learning_rate": 3.447368421052632e-06, "loss": 0.4348, "step": 138 }, { "Batch Mean": -0.38188767433166504, "accuracy": 0.8046875, "epoch": 0.345, "step": 138 }, { "epoch": 0.3475, "grad_norm": 7.671250820159912, "learning_rate": 3.43421052631579e-06, "loss": 0.4548, "step": 139 }, { "Batch Mean": -0.5834815502166748, "accuracy": 0.7890625, "epoch": 0.3475, "step": 139 }, { "epoch": 0.35, "grad_norm": 6.371372699737549, "learning_rate": 3.421052631578948e-06, "loss": 0.4407, "step": 140 }, { "Batch Mean": -0.794266939163208, "accuracy": 0.78125, "epoch": 0.35, "step": 140 }, { "epoch": 0.3525, "grad_norm": 6.418947696685791, "learning_rate": 3.4078947368421057e-06, "loss": 0.4297, "step": 141 }, { "Batch Mean": -1.031827449798584, "accuracy": 0.78125, "epoch": 0.3525, "step": 141 }, { "epoch": 0.355, "grad_norm": 7.1692023277282715, "learning_rate": 3.3947368421052636e-06, "loss": 0.4884, "step": 142 }, { "Batch Mean": -1.0245031118392944, "accuracy": 0.8125, "epoch": 0.355, "step": 142 }, { "epoch": 0.3575, "grad_norm": 6.299808025360107, "learning_rate": 3.381578947368421e-06, "loss": 0.4165, "step": 143 }, { "Batch Mean": -1.6456010341644287, "accuracy": 0.78125, "epoch": 0.3575, "step": 143 }, { "epoch": 0.36, "grad_norm": 7.132191181182861, "learning_rate": 3.368421052631579e-06, "loss": 0.496, "step": 144 }, { "Batch Mean": -1.4577171802520752, "accuracy": 0.8046875, "epoch": 0.36, "step": 144 }, { "epoch": 0.3625, "grad_norm": 7.309590816497803, "learning_rate": 3.355263157894737e-06, "loss": 0.4046, "step": 145 }, { "Batch Mean": -1.2381335496902466, "accuracy": 0.7890625, "epoch": 0.3625, "step": 145 }, { "epoch": 0.365, "grad_norm": 7.03560209274292, "learning_rate": 3.342105263157895e-06, "loss": 0.4463, "step": 146 }, { "Batch Mean": -1.3830366134643555, "accuracy": 0.7890625, "epoch": 0.365, "step": 146 }, { "epoch": 0.3675, "grad_norm": 7.277976036071777, "learning_rate": 3.3289473684210528e-06, "loss": 0.4238, "step": 147 }, { "Batch Mean": -1.2444829940795898, "accuracy": 0.84375, "epoch": 0.3675, "step": 147 }, { "epoch": 0.37, "grad_norm": 6.460512638092041, "learning_rate": 3.3157894736842107e-06, "loss": 0.3967, "step": 148 }, { "Batch Mean": -0.9982385039329529, "accuracy": 0.7734375, "epoch": 0.37, "step": 148 }, { "epoch": 0.3725, "grad_norm": 7.685225486755371, "learning_rate": 3.302631578947369e-06, "loss": 0.4706, "step": 149 }, { "Batch Mean": -0.9622928500175476, "accuracy": 0.78125, "epoch": 0.3725, "step": 149 }, { "epoch": 0.375, "grad_norm": 7.0924153327941895, "learning_rate": 3.289473684210527e-06, "loss": 0.425, "step": 150 }, { "Batch Mean": -0.6390509605407715, "accuracy": 0.7421875, "epoch": 0.375, "step": 150 }, { "epoch": 0.3775, "grad_norm": 8.69453239440918, "learning_rate": 3.276315789473685e-06, "loss": 0.5108, "step": 151 }, { "Batch Mean": -0.0986168384552002, "accuracy": 0.7578125, "epoch": 0.3775, "step": 151 }, { "epoch": 0.38, "grad_norm": 10.331669807434082, "learning_rate": 3.2631578947368423e-06, "loss": 0.5265, "step": 152 }, { "Batch Mean": -0.36638665199279785, "accuracy": 0.8203125, "epoch": 0.38, "step": 152 }, { "epoch": 0.3825, "grad_norm": 6.3816704750061035, "learning_rate": 3.2500000000000002e-06, "loss": 0.3436, "step": 153 }, { "Batch Mean": -0.700871467590332, "accuracy": 0.828125, "epoch": 0.3825, "step": 153 }, { "epoch": 0.385, "grad_norm": 7.452230453491211, "learning_rate": 3.236842105263158e-06, "loss": 0.3744, "step": 154 }, { "Batch Mean": -0.22220516204833984, "accuracy": 0.7734375, "epoch": 0.385, "step": 154 }, { "epoch": 0.3875, "grad_norm": 9.25493049621582, "learning_rate": 3.223684210526316e-06, "loss": 0.473, "step": 155 }, { "Batch Mean": -0.5362415313720703, "accuracy": 0.8203125, "epoch": 0.3875, "step": 155 }, { "epoch": 0.39, "grad_norm": 8.848881721496582, "learning_rate": 3.210526315789474e-06, "loss": 0.4727, "step": 156 }, { "Batch Mean": -0.39302539825439453, "accuracy": 0.7890625, "epoch": 0.39, "step": 156 }, { "epoch": 0.3925, "grad_norm": 8.796403884887695, "learning_rate": 3.197368421052632e-06, "loss": 0.4675, "step": 157 }, { "Batch Mean": -0.24126970767974854, "accuracy": 0.7734375, "epoch": 0.3925, "step": 157 }, { "epoch": 0.395, "grad_norm": 9.450688362121582, "learning_rate": 3.1842105263157898e-06, "loss": 0.4724, "step": 158 }, { "Batch Mean": -0.40082311630249023, "accuracy": 0.8046875, "epoch": 0.395, "step": 158 }, { "epoch": 0.3975, "grad_norm": 8.867382049560547, "learning_rate": 3.1710526315789477e-06, "loss": 0.4516, "step": 159 }, { "Batch Mean": -0.40192973613739014, "accuracy": 0.7421875, "epoch": 0.3975, "step": 159 }, { "epoch": 0.4, "grad_norm": 11.669089317321777, "learning_rate": 3.157894736842105e-06, "loss": 0.5932, "step": 160 } ], "logging_steps": 1, "max_steps": 400, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 80, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 64, "trial_name": null, "trial_params": null }