{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.25, "eval_steps": 500, "global_step": 100, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "Batch Mean": 0.7002408504486084, "accuracy": 0.53125, "epoch": 0, "step": 0 }, { "epoch": 0.0025, "grad_norm": 4.865061283111572, "learning_rate": 1.5000000000000002e-07, "loss": 0.6999, "step": 1 }, { "Batch Mean": 0.7006568908691406, "accuracy": 0.5078125, "epoch": 0.0025, "step": 1 }, { "epoch": 0.005, "grad_norm": 5.116915702819824, "learning_rate": 3.0000000000000004e-07, "loss": 0.706, "step": 2 }, { "Batch Mean": 0.7036838531494141, "accuracy": 0.5625, "epoch": 0.005, "step": 2 }, { "epoch": 0.0075, "grad_norm": 4.952433109283447, "learning_rate": 4.5e-07, "loss": 0.6957, "step": 3 }, { "Batch Mean": 0.7087974548339844, "accuracy": 0.5546875, "epoch": 0.0075, "step": 3 }, { "epoch": 0.01, "grad_norm": 5.185088157653809, "learning_rate": 6.000000000000001e-07, "loss": 0.6974, "step": 4 }, { "Batch Mean": 0.7068939208984375, "accuracy": 0.5234375, "epoch": 0.01, "step": 4 }, { "epoch": 0.0125, "grad_norm": 5.04327392578125, "learning_rate": 7.5e-07, "loss": 0.7047, "step": 5 }, { "Batch Mean": 0.6315479278564453, "accuracy": 0.5234375, "epoch": 0.0125, "step": 5 }, { "epoch": 0.015, "grad_norm": 4.728168487548828, "learning_rate": 9e-07, "loss": 0.6932, "step": 6 }, { "Batch Mean": 0.5291571617126465, "accuracy": 0.5234375, "epoch": 0.015, "step": 6 }, { "epoch": 0.0175, "grad_norm": 4.0146050453186035, "learning_rate": 1.05e-06, "loss": 0.691, "step": 7 }, { "Batch Mean": 0.42229437828063965, "accuracy": 0.5546875, "epoch": 0.0175, "step": 7 }, { "epoch": 0.02, "grad_norm": 3.488922119140625, "learning_rate": 1.2000000000000002e-06, "loss": 0.6947, "step": 8 }, { "Batch Mean": 0.07428494095802307, "accuracy": 0.515625, "epoch": 0.02, "step": 8 }, { "epoch": 0.0225, "grad_norm": 2.5626139640808105, "learning_rate": 1.35e-06, "loss": 0.6965, "step": 9 }, { "Batch Mean": -0.1413576304912567, "accuracy": 0.5859375, "epoch": 0.0225, "step": 9 }, { "epoch": 0.025, "grad_norm": 2.776390790939331, "learning_rate": 1.5e-06, "loss": 0.6784, "step": 10 }, { "Batch Mean": -0.2741769552230835, "accuracy": 0.5234375, "epoch": 0.025, "step": 10 }, { "epoch": 0.0275, "grad_norm": 3.4468233585357666, "learning_rate": 1.65e-06, "loss": 0.6844, "step": 11 }, { "Batch Mean": -0.45577430725097656, "accuracy": 0.5859375, "epoch": 0.0275, "step": 11 }, { "epoch": 0.03, "grad_norm": 4.281182765960693, "learning_rate": 1.8e-06, "loss": 0.6753, "step": 12 }, { "Batch Mean": -1.0260009765625, "accuracy": 0.65625, "epoch": 0.03, "step": 12 }, { "epoch": 0.0325, "grad_norm": 6.948849201202393, "learning_rate": 1.95e-06, "loss": 0.6734, "step": 13 }, { "Batch Mean": -1.1147994995117188, "accuracy": 0.6171875, "epoch": 0.0325, "step": 13 }, { "epoch": 0.035, "grad_norm": 7.589419841766357, "learning_rate": 2.1e-06, "loss": 0.677, "step": 14 }, { "Batch Mean": -1.0315093994140625, "accuracy": 0.65625, "epoch": 0.035, "step": 14 }, { "epoch": 0.0375, "grad_norm": 7.2146382331848145, "learning_rate": 2.25e-06, "loss": 0.677, "step": 15 }, { "Batch Mean": -0.8089427947998047, "accuracy": 0.65625, "epoch": 0.0375, "step": 15 }, { "epoch": 0.04, "grad_norm": 6.308068752288818, "learning_rate": 2.4000000000000003e-06, "loss": 0.6503, "step": 16 }, { "Batch Mean": -0.3669371008872986, "accuracy": 0.671875, "epoch": 0.04, "step": 16 }, { "epoch": 0.0425, "grad_norm": 4.996833324432373, "learning_rate": 2.55e-06, "loss": 0.617, "step": 17 }, { "Batch Mean": 0.2619136571884155, "accuracy": 0.6796875, "epoch": 0.0425, "step": 17 }, { "epoch": 0.045, "grad_norm": 4.9718918800354, "learning_rate": 2.7e-06, "loss": 0.5981, "step": 18 }, { "Batch Mean": 1.0361242294311523, "accuracy": 0.625, "epoch": 0.045, "step": 18 }, { "epoch": 0.0475, "grad_norm": 9.635746955871582, "learning_rate": 2.85e-06, "loss": 0.6236, "step": 19 }, { "Batch Mean": 1.5605192184448242, "accuracy": 0.65625, "epoch": 0.0475, "step": 19 }, { "epoch": 0.05, "grad_norm": 14.90079116821289, "learning_rate": 3e-06, "loss": 0.6511, "step": 20 }, { "Batch Mean": 1.7601673603057861, "accuracy": 0.6796875, "epoch": 0.05, "step": 20 }, { "epoch": 0.0525, "grad_norm": 16.801076889038086, "learning_rate": 2.992105263157895e-06, "loss": 0.6712, "step": 21 }, { "Batch Mean": 1.1987818479537964, "accuracy": 0.7109375, "epoch": 0.0525, "step": 21 }, { "epoch": 0.055, "grad_norm": 11.280301094055176, "learning_rate": 2.9842105263157896e-06, "loss": 0.6033, "step": 22 }, { "Batch Mean": 0.5978413820266724, "accuracy": 0.6875, "epoch": 0.055, "step": 22 }, { "epoch": 0.0575, "grad_norm": 7.414430141448975, "learning_rate": 2.9763157894736843e-06, "loss": 0.5819, "step": 23 }, { "Batch Mean": -0.20422333478927612, "accuracy": 0.6640625, "epoch": 0.0575, "step": 23 }, { "epoch": 0.06, "grad_norm": 5.717103004455566, "learning_rate": 2.968421052631579e-06, "loss": 0.638, "step": 24 }, { "Batch Mean": -0.7162615060806274, "accuracy": 0.75, "epoch": 0.06, "step": 24 }, { "epoch": 0.0625, "grad_norm": 7.513117790222168, "learning_rate": 2.960526315789474e-06, "loss": 0.5851, "step": 25 }, { "Batch Mean": -1.054888129234314, "accuracy": 0.6796875, "epoch": 0.0625, "step": 25 }, { "epoch": 0.065, "grad_norm": 8.300326347351074, "learning_rate": 2.9526315789473685e-06, "loss": 0.6286, "step": 26 }, { "Batch Mean": -1.1319353580474854, "accuracy": 0.703125, "epoch": 0.065, "step": 26 }, { "epoch": 0.0675, "grad_norm": 9.116662979125977, "learning_rate": 2.9447368421052633e-06, "loss": 0.6003, "step": 27 }, { "Batch Mean": -0.7403234839439392, "accuracy": 0.7265625, "epoch": 0.0675, "step": 27 }, { "epoch": 0.07, "grad_norm": 6.799551486968994, "learning_rate": 2.936842105263158e-06, "loss": 0.5383, "step": 28 }, { "Batch Mean": -0.25867950916290283, "accuracy": 0.703125, "epoch": 0.07, "step": 28 }, { "epoch": 0.0725, "grad_norm": 5.493288040161133, "learning_rate": 2.9289473684210528e-06, "loss": 0.5683, "step": 29 }, { "Batch Mean": 0.4318051338195801, "accuracy": 0.6796875, "epoch": 0.0725, "step": 29 }, { "epoch": 0.075, "grad_norm": 6.373619556427002, "learning_rate": 2.9210526315789475e-06, "loss": 0.6126, "step": 30 }, { "Batch Mean": 0.8685734272003174, "accuracy": 0.7265625, "epoch": 0.075, "step": 30 }, { "epoch": 0.0775, "grad_norm": 7.750959396362305, "learning_rate": 2.9131578947368423e-06, "loss": 0.5984, "step": 31 }, { "Batch Mean": 1.1348557472229004, "accuracy": 0.6484375, "epoch": 0.0775, "step": 31 }, { "epoch": 0.08, "grad_norm": 8.742609977722168, "learning_rate": 2.905263157894737e-06, "loss": 0.6016, "step": 32 }, { "Batch Mean": 0.6863436698913574, "accuracy": 0.7421875, "epoch": 0.08, "step": 32 }, { "epoch": 0.0825, "grad_norm": 6.586100101470947, "learning_rate": 2.8973684210526318e-06, "loss": 0.5485, "step": 33 }, { "Batch Mean": 0.23023658990859985, "accuracy": 0.765625, "epoch": 0.0825, "step": 33 }, { "epoch": 0.085, "grad_norm": 5.231872081756592, "learning_rate": 2.8894736842105265e-06, "loss": 0.4999, "step": 34 }, { "Batch Mean": -0.2048710286617279, "accuracy": 0.734375, "epoch": 0.085, "step": 34 }, { "epoch": 0.0875, "grad_norm": 5.551333427429199, "learning_rate": 2.8815789473684213e-06, "loss": 0.5541, "step": 35 }, { "Batch Mean": -0.4559789299964905, "accuracy": 0.7421875, "epoch": 0.0875, "step": 35 }, { "epoch": 0.09, "grad_norm": 5.972931385040283, "learning_rate": 2.873684210526316e-06, "loss": 0.5698, "step": 36 }, { "Batch Mean": -0.7627459764480591, "accuracy": 0.7734375, "epoch": 0.09, "step": 36 }, { "epoch": 0.0925, "grad_norm": 7.165839672088623, "learning_rate": 2.8657894736842103e-06, "loss": 0.5157, "step": 37 }, { "Batch Mean": -0.6894429326057434, "accuracy": 0.671875, "epoch": 0.0925, "step": 37 }, { "epoch": 0.095, "grad_norm": 6.630836009979248, "learning_rate": 2.857894736842105e-06, "loss": 0.5955, "step": 38 }, { "Batch Mean": -0.2740752696990967, "accuracy": 0.734375, "epoch": 0.095, "step": 38 }, { "epoch": 0.0975, "grad_norm": 6.181252956390381, "learning_rate": 2.85e-06, "loss": 0.5037, "step": 39 }, { "Batch Mean": -0.09357815980911255, "accuracy": 0.734375, "epoch": 0.0975, "step": 39 }, { "epoch": 0.1, "grad_norm": 6.351346969604492, "learning_rate": 2.8421052631578946e-06, "loss": 0.5152, "step": 40 }, { "Batch Mean": 0.33483320474624634, "accuracy": 0.796875, "epoch": 0.1, "step": 40 }, { "epoch": 0.1025, "grad_norm": 6.3938117027282715, "learning_rate": 2.8342105263157897e-06, "loss": 0.4573, "step": 41 }, { "Batch Mean": 0.503718376159668, "accuracy": 0.7578125, "epoch": 0.1025, "step": 41 }, { "epoch": 0.105, "grad_norm": 6.795258522033691, "learning_rate": 2.8263157894736845e-06, "loss": 0.4923, "step": 42 }, { "Batch Mean": 0.4224682152271271, "accuracy": 0.6875, "epoch": 0.105, "step": 42 }, { "epoch": 0.1075, "grad_norm": 9.03122329711914, "learning_rate": 2.8184210526315792e-06, "loss": 0.5803, "step": 43 }, { "Batch Mean": 0.11721980571746826, "accuracy": 0.8046875, "epoch": 0.1075, "step": 43 }, { "epoch": 0.11, "grad_norm": 6.323685646057129, "learning_rate": 2.810526315789474e-06, "loss": 0.4518, "step": 44 }, { "Batch Mean": 0.11213397979736328, "accuracy": 0.7734375, "epoch": 0.11, "step": 44 }, { "epoch": 0.1125, "grad_norm": 6.636176586151123, "learning_rate": 2.8026315789473687e-06, "loss": 0.4747, "step": 45 }, { "Batch Mean": -0.19074296951293945, "accuracy": 0.7421875, "epoch": 0.1125, "step": 45 }, { "epoch": 0.115, "grad_norm": 8.862186431884766, "learning_rate": 2.7947368421052635e-06, "loss": 0.5295, "step": 46 }, { "Batch Mean": -0.26628145575523376, "accuracy": 0.7578125, "epoch": 0.115, "step": 46 }, { "epoch": 0.1175, "grad_norm": 9.089022636413574, "learning_rate": 2.7868421052631578e-06, "loss": 0.5132, "step": 47 }, { "Batch Mean": -0.11882030963897705, "accuracy": 0.765625, "epoch": 0.1175, "step": 47 }, { "epoch": 0.12, "grad_norm": 8.50251293182373, "learning_rate": 2.7789473684210525e-06, "loss": 0.4904, "step": 48 }, { "Batch Mean": -0.09545481204986572, "accuracy": 0.7890625, "epoch": 0.12, "step": 48 }, { "epoch": 0.1225, "grad_norm": 8.091400146484375, "learning_rate": 2.7710526315789473e-06, "loss": 0.4615, "step": 49 }, { "Batch Mean": -0.01603543758392334, "accuracy": 0.6328125, "epoch": 0.1225, "step": 49 }, { "epoch": 0.125, "grad_norm": 11.12177562713623, "learning_rate": 2.763157894736842e-06, "loss": 0.6245, "step": 50 }, { "Batch Mean": -0.03519377112388611, "accuracy": 0.6953125, "epoch": 0.125, "step": 50 }, { "epoch": 0.1275, "grad_norm": 8.476814270019531, "learning_rate": 2.7552631578947368e-06, "loss": 0.5094, "step": 51 }, { "Batch Mean": -0.07445716857910156, "accuracy": 0.765625, "epoch": 0.1275, "step": 51 }, { "epoch": 0.13, "grad_norm": 8.475494384765625, "learning_rate": 2.7473684210526315e-06, "loss": 0.464, "step": 52 }, { "Batch Mean": -0.08127522468566895, "accuracy": 0.7890625, "epoch": 0.13, "step": 52 }, { "epoch": 0.1325, "grad_norm": 7.754316806793213, "learning_rate": 2.7394736842105263e-06, "loss": 0.4618, "step": 53 }, { "Batch Mean": 0.05236625671386719, "accuracy": 0.75, "epoch": 0.1325, "step": 53 }, { "epoch": 0.135, "grad_norm": 7.3552751541137695, "learning_rate": 2.7315789473684214e-06, "loss": 0.5202, "step": 54 }, { "Batch Mean": -0.08030200004577637, "accuracy": 0.765625, "epoch": 0.135, "step": 54 }, { "epoch": 0.1375, "grad_norm": 8.413111686706543, "learning_rate": 2.723684210526316e-06, "loss": 0.4902, "step": 55 }, { "Batch Mean": 0.20010590553283691, "accuracy": 0.765625, "epoch": 0.1375, "step": 55 }, { "epoch": 0.14, "grad_norm": 9.49488353729248, "learning_rate": 2.715789473684211e-06, "loss": 0.4945, "step": 56 }, { "Batch Mean": 0.1068873256444931, "accuracy": 0.765625, "epoch": 0.14, "step": 56 }, { "epoch": 0.1425, "grad_norm": 9.978996276855469, "learning_rate": 2.7078947368421052e-06, "loss": 0.5936, "step": 57 }, { "Batch Mean": 0.12400929629802704, "accuracy": 0.8046875, "epoch": 0.1425, "step": 57 }, { "epoch": 0.145, "grad_norm": 8.332430839538574, "learning_rate": 2.7e-06, "loss": 0.4549, "step": 58 }, { "Batch Mean": 0.05874582380056381, "accuracy": 0.8125, "epoch": 0.145, "step": 58 }, { "epoch": 0.1475, "grad_norm": 7.287342548370361, "learning_rate": 2.6921052631578947e-06, "loss": 0.3996, "step": 59 }, { "Batch Mean": 0.17911481857299805, "accuracy": 0.828125, "epoch": 0.1475, "step": 59 }, { "epoch": 0.15, "grad_norm": 7.197307109832764, "learning_rate": 2.6842105263157895e-06, "loss": 0.4486, "step": 60 }, { "Batch Mean": -0.0680088996887207, "accuracy": 0.7265625, "epoch": 0.15, "step": 60 }, { "epoch": 0.1525, "grad_norm": 8.422717094421387, "learning_rate": 2.6763157894736842e-06, "loss": 0.5283, "step": 61 }, { "Batch Mean": -0.1332111358642578, "accuracy": 0.7890625, "epoch": 0.1525, "step": 61 }, { "epoch": 0.155, "grad_norm": 8.836848258972168, "learning_rate": 2.668421052631579e-06, "loss": 0.4547, "step": 62 }, { "Batch Mean": -0.005272388458251953, "accuracy": 0.734375, "epoch": 0.155, "step": 62 }, { "epoch": 0.1575, "grad_norm": 7.995077133178711, "learning_rate": 2.6605263157894737e-06, "loss": 0.5227, "step": 63 }, { "Batch Mean": 0.3416769504547119, "accuracy": 0.8359375, "epoch": 0.1575, "step": 63 }, { "epoch": 0.16, "grad_norm": 8.131783485412598, "learning_rate": 2.6526315789473685e-06, "loss": 0.4191, "step": 64 }, { "Batch Mean": 0.2797560691833496, "accuracy": 0.765625, "epoch": 0.16, "step": 64 }, { "epoch": 0.1625, "grad_norm": 9.04648208618164, "learning_rate": 2.644736842105263e-06, "loss": 0.4893, "step": 65 }, { "Batch Mean": -0.14584161341190338, "accuracy": 0.7734375, "epoch": 0.1625, "step": 65 }, { "epoch": 0.165, "grad_norm": 8.039698600769043, "learning_rate": 2.636842105263158e-06, "loss": 0.4753, "step": 66 }, { "Batch Mean": -0.37075191736221313, "accuracy": 0.7421875, "epoch": 0.165, "step": 66 }, { "epoch": 0.1675, "grad_norm": 7.643855094909668, "learning_rate": 2.6289473684210527e-06, "loss": 0.4424, "step": 67 }, { "Batch Mean": -0.24360448122024536, "accuracy": 0.828125, "epoch": 0.1675, "step": 67 }, { "epoch": 0.17, "grad_norm": 6.892768383026123, "learning_rate": 2.6210526315789474e-06, "loss": 0.4126, "step": 68 }, { "Batch Mean": -0.0484846830368042, "accuracy": 0.734375, "epoch": 0.17, "step": 68 }, { "epoch": 0.1725, "grad_norm": 10.497048377990723, "learning_rate": 2.613157894736842e-06, "loss": 0.5242, "step": 69 }, { "Batch Mean": 0.07249626517295837, "accuracy": 0.8125, "epoch": 0.1725, "step": 69 }, { "epoch": 0.175, "grad_norm": 6.990925312042236, "learning_rate": 2.605263157894737e-06, "loss": 0.4206, "step": 70 }, { "Batch Mean": -0.06337249279022217, "accuracy": 0.8125, "epoch": 0.175, "step": 70 }, { "epoch": 0.1775, "grad_norm": 7.180996417999268, "learning_rate": 2.5973684210526317e-06, "loss": 0.4412, "step": 71 }, { "Batch Mean": 0.06033170223236084, "accuracy": 0.8515625, "epoch": 0.1775, "step": 71 }, { "epoch": 0.18, "grad_norm": 6.108893871307373, "learning_rate": 2.5894736842105264e-06, "loss": 0.3576, "step": 72 }, { "Batch Mean": -0.04290449619293213, "accuracy": 0.765625, "epoch": 0.18, "step": 72 }, { "epoch": 0.1825, "grad_norm": 7.272516250610352, "learning_rate": 2.581578947368421e-06, "loss": 0.487, "step": 73 }, { "Batch Mean": 0.09309220314025879, "accuracy": 0.796875, "epoch": 0.1825, "step": 73 }, { "epoch": 0.185, "grad_norm": 6.689022064208984, "learning_rate": 2.573684210526316e-06, "loss": 0.4576, "step": 74 }, { "Batch Mean": -0.3521728515625, "accuracy": 0.8046875, "epoch": 0.185, "step": 74 }, { "epoch": 0.1875, "grad_norm": 7.446789264678955, "learning_rate": 2.5657894736842107e-06, "loss": 0.4537, "step": 75 }, { "Batch Mean": -0.1267460584640503, "accuracy": 0.7265625, "epoch": 0.1875, "step": 75 }, { "epoch": 0.19, "grad_norm": 8.600646018981934, "learning_rate": 2.5578947368421054e-06, "loss": 0.5066, "step": 76 }, { "Batch Mean": 0.02230433002114296, "accuracy": 0.7578125, "epoch": 0.19, "step": 76 }, { "epoch": 0.1925, "grad_norm": 8.842095375061035, "learning_rate": 2.55e-06, "loss": 0.4823, "step": 77 }, { "Batch Mean": 0.17012596130371094, "accuracy": 0.7890625, "epoch": 0.1925, "step": 77 }, { "epoch": 0.195, "grad_norm": 6.993001937866211, "learning_rate": 2.542105263157895e-06, "loss": 0.3959, "step": 78 }, { "Batch Mean": -0.11112558841705322, "accuracy": 0.78125, "epoch": 0.195, "step": 78 }, { "epoch": 0.1975, "grad_norm": 8.71827220916748, "learning_rate": 2.5342105263157892e-06, "loss": 0.4407, "step": 79 }, { "Batch Mean": 0.11298668384552002, "accuracy": 0.8515625, "epoch": 0.1975, "step": 79 }, { "epoch": 0.2, "grad_norm": 7.829047679901123, "learning_rate": 2.526315789473684e-06, "loss": 0.3905, "step": 80 }, { "Batch Mean": 0.010145187377929688, "accuracy": 0.78125, "epoch": 0.2, "step": 80 }, { "epoch": 0.2025, "grad_norm": 10.1448335647583, "learning_rate": 2.5184210526315787e-06, "loss": 0.4667, "step": 81 }, { "Batch Mean": 0.1327219009399414, "accuracy": 0.7890625, "epoch": 0.2025, "step": 81 }, { "epoch": 0.205, "grad_norm": 7.784322738647461, "learning_rate": 2.510526315789474e-06, "loss": 0.408, "step": 82 }, { "Batch Mean": 0.3474133014678955, "accuracy": 0.7421875, "epoch": 0.205, "step": 82 }, { "epoch": 0.2075, "grad_norm": 8.116996765136719, "learning_rate": 2.5026315789473686e-06, "loss": 0.438, "step": 83 }, { "Batch Mean": 0.2660253047943115, "accuracy": 0.75, "epoch": 0.2075, "step": 83 }, { "epoch": 0.21, "grad_norm": 7.8381452560424805, "learning_rate": 2.4947368421052634e-06, "loss": 0.4639, "step": 84 }, { "Batch Mean": 0.004191964864730835, "accuracy": 0.8125, "epoch": 0.21, "step": 84 }, { "epoch": 0.2125, "grad_norm": 6.7916412353515625, "learning_rate": 2.486842105263158e-06, "loss": 0.3937, "step": 85 }, { "Batch Mean": -0.4819910526275635, "accuracy": 0.75, "epoch": 0.2125, "step": 85 }, { "epoch": 0.215, "grad_norm": 8.073772430419922, "learning_rate": 2.478947368421053e-06, "loss": 0.4866, "step": 86 }, { "Batch Mean": -0.5541934967041016, "accuracy": 0.75, "epoch": 0.215, "step": 86 }, { "epoch": 0.2175, "grad_norm": 8.677484512329102, "learning_rate": 2.4710526315789476e-06, "loss": 0.503, "step": 87 }, { "Batch Mean": 0.14371705055236816, "accuracy": 0.7890625, "epoch": 0.2175, "step": 87 }, { "epoch": 0.22, "grad_norm": 7.931331634521484, "learning_rate": 2.4631578947368424e-06, "loss": 0.4483, "step": 88 }, { "Batch Mean": 0.013556957244873047, "accuracy": 0.859375, "epoch": 0.22, "step": 88 }, { "epoch": 0.2225, "grad_norm": 6.920699119567871, "learning_rate": 2.4552631578947367e-06, "loss": 0.3674, "step": 89 }, { "Batch Mean": -0.11897921562194824, "accuracy": 0.8203125, "epoch": 0.2225, "step": 89 }, { "epoch": 0.225, "grad_norm": 7.135400295257568, "learning_rate": 2.4473684210526314e-06, "loss": 0.3953, "step": 90 }, { "Batch Mean": 0.33566761016845703, "accuracy": 0.7890625, "epoch": 0.225, "step": 90 }, { "epoch": 0.2275, "grad_norm": 10.429315567016602, "learning_rate": 2.439473684210526e-06, "loss": 0.4943, "step": 91 }, { "Batch Mean": 0.4449765682220459, "accuracy": 0.84375, "epoch": 0.2275, "step": 91 }, { "epoch": 0.23, "grad_norm": 8.419116020202637, "learning_rate": 2.431578947368421e-06, "loss": 0.3647, "step": 92 }, { "Batch Mean": -0.06833112239837646, "accuracy": 0.8359375, "epoch": 0.23, "step": 92 }, { "epoch": 0.2325, "grad_norm": 7.288305282592773, "learning_rate": 2.4236842105263157e-06, "loss": 0.3719, "step": 93 }, { "Batch Mean": -0.36795759201049805, "accuracy": 0.8046875, "epoch": 0.2325, "step": 93 }, { "epoch": 0.235, "grad_norm": 8.255443572998047, "learning_rate": 2.4157894736842104e-06, "loss": 0.3983, "step": 94 }, { "Batch Mean": -0.3381902277469635, "accuracy": 0.8515625, "epoch": 0.235, "step": 94 }, { "epoch": 0.2375, "grad_norm": 8.25379467010498, "learning_rate": 2.4078947368421056e-06, "loss": 0.3602, "step": 95 }, { "Batch Mean": -0.2966504693031311, "accuracy": 0.75, "epoch": 0.2375, "step": 95 }, { "epoch": 0.24, "grad_norm": 9.554913520812988, "learning_rate": 2.4000000000000003e-06, "loss": 0.4499, "step": 96 }, { "Batch Mean": -0.02951526641845703, "accuracy": 0.796875, "epoch": 0.24, "step": 96 }, { "epoch": 0.2425, "grad_norm": 8.300037384033203, "learning_rate": 2.392105263157895e-06, "loss": 0.41, "step": 97 }, { "Batch Mean": 0.4994839131832123, "accuracy": 0.84375, "epoch": 0.2425, "step": 97 }, { "epoch": 0.245, "grad_norm": 9.282505989074707, "learning_rate": 2.38421052631579e-06, "loss": 0.3631, "step": 98 }, { "Batch Mean": 0.5059927701950073, "accuracy": 0.84375, "epoch": 0.245, "step": 98 }, { "epoch": 0.2475, "grad_norm": 10.97496509552002, "learning_rate": 2.376315789473684e-06, "loss": 0.4266, "step": 99 }, { "Batch Mean": 0.5012010335922241, "accuracy": 0.7890625, "epoch": 0.2475, "step": 99 }, { "epoch": 0.25, "grad_norm": 10.482470512390137, "learning_rate": 2.368421052631579e-06, "loss": 0.4402, "step": 100 } ], "logging_steps": 1, "max_steps": 400, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 16, "trial_name": null, "trial_params": null }