diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,23062 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 3.2249183938333266, + "eval_steps": 10000, + "global_step": 82000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "grad_norm": 19.548603911230614, + "learning_rate": 3e-09, + "loss": 1.3664, + "step": 25 + }, + { + "epoch": 0.0, + "grad_norm": 17.74263326459735, + "learning_rate": 1.55e-08, + "loss": 1.3938, + "step": 50 + }, + { + "epoch": 0.0, + "grad_norm": 19.768157816871117, + "learning_rate": 2.8000000000000003e-08, + "loss": 1.3875, + "step": 75 + }, + { + "epoch": 0.0, + "grad_norm": 18.18791431777122, + "learning_rate": 4.05e-08, + "loss": 1.2869, + "step": 100 + }, + { + "epoch": 0.0, + "grad_norm": 20.288361662728292, + "learning_rate": 5.3000000000000005e-08, + "loss": 1.3836, + "step": 125 + }, + { + "epoch": 0.01, + "grad_norm": 19.62961424039089, + "learning_rate": 6.550000000000001e-08, + "loss": 1.3198, + "step": 150 + }, + { + "epoch": 0.01, + "grad_norm": 18.510724860772605, + "learning_rate": 7.8e-08, + "loss": 1.3883, + "step": 175 + }, + { + "epoch": 0.01, + "grad_norm": 18.00659579211933, + "learning_rate": 9.050000000000001e-08, + "loss": 1.2896, + "step": 200 + }, + { + "epoch": 0.01, + "grad_norm": 17.149030358259694, + "learning_rate": 1.0300000000000001e-07, + "loss": 1.2758, + "step": 225 + }, + { + "epoch": 0.01, + "grad_norm": 17.719461286476776, + "learning_rate": 1.1550000000000001e-07, + "loss": 1.1934, + "step": 250 + }, + { + "epoch": 0.01, + "grad_norm": 18.479532395253678, + "learning_rate": 1.28e-07, + "loss": 1.2667, + "step": 275 + }, + { + "epoch": 0.01, + "grad_norm": 18.189268236304354, + "learning_rate": 1.4050000000000002e-07, + "loss": 1.1188, + "step": 300 + }, + { + "epoch": 0.01, + "grad_norm": 18.485150666624385, + "learning_rate": 1.53e-07, + "loss": 1.0843, + "step": 325 + }, + { + "epoch": 0.01, + "grad_norm": 16.161860822592118, + "learning_rate": 1.655e-07, + "loss": 0.9748, + "step": 350 + }, + { + "epoch": 0.01, + "grad_norm": 16.77864844044536, + "learning_rate": 1.78e-07, + "loss": 0.7618, + "step": 375 + }, + { + "epoch": 0.02, + "grad_norm": 13.284839665193802, + "learning_rate": 1.9050000000000002e-07, + "loss": 0.6402, + "step": 400 + }, + { + "epoch": 0.02, + "grad_norm": 12.563585737686541, + "learning_rate": 2.03e-07, + "loss": 0.6147, + "step": 425 + }, + { + "epoch": 0.02, + "grad_norm": 14.195526034976105, + "learning_rate": 2.1550000000000002e-07, + "loss": 0.5645, + "step": 450 + }, + { + "epoch": 0.02, + "grad_norm": 15.813975597971679, + "learning_rate": 2.2800000000000003e-07, + "loss": 0.564, + "step": 475 + }, + { + "epoch": 0.02, + "grad_norm": 13.223184796437847, + "learning_rate": 2.405e-07, + "loss": 0.5439, + "step": 500 + }, + { + "epoch": 0.02, + "grad_norm": 11.65715823434448, + "learning_rate": 2.53e-07, + "loss": 0.5262, + "step": 525 + }, + { + "epoch": 0.02, + "grad_norm": 12.140278336499202, + "learning_rate": 2.6550000000000004e-07, + "loss": 0.4831, + "step": 550 + }, + { + "epoch": 0.02, + "grad_norm": 10.595125220315392, + "learning_rate": 2.7800000000000003e-07, + "loss": 0.4605, + "step": 575 + }, + { + "epoch": 0.02, + "grad_norm": 10.287013706634347, + "learning_rate": 2.905e-07, + "loss": 0.4882, + "step": 600 + }, + { + "epoch": 0.02, + "grad_norm": 11.461372129814126, + "learning_rate": 3.0300000000000005e-07, + "loss": 0.4616, + "step": 625 + }, + { + "epoch": 0.03, + "grad_norm": 9.492888534763829, + "learning_rate": 3.1550000000000004e-07, + "loss": 0.4572, + "step": 650 + }, + { + "epoch": 0.03, + "grad_norm": 11.17219796215868, + "learning_rate": 3.280000000000001e-07, + "loss": 0.4422, + "step": 675 + }, + { + "epoch": 0.03, + "grad_norm": 9.431269877118583, + "learning_rate": 3.405e-07, + "loss": 0.4441, + "step": 700 + }, + { + "epoch": 0.03, + "grad_norm": 9.077076398765083, + "learning_rate": 3.53e-07, + "loss": 0.4324, + "step": 725 + }, + { + "epoch": 0.03, + "grad_norm": 9.42463034481312, + "learning_rate": 3.6550000000000004e-07, + "loss": 0.4757, + "step": 750 + }, + { + "epoch": 0.03, + "grad_norm": 9.002220474801542, + "learning_rate": 3.78e-07, + "loss": 0.4235, + "step": 775 + }, + { + "epoch": 0.03, + "grad_norm": 9.249177989263307, + "learning_rate": 3.9050000000000006e-07, + "loss": 0.3844, + "step": 800 + }, + { + "epoch": 0.03, + "grad_norm": 9.222274718732235, + "learning_rate": 4.0300000000000005e-07, + "loss": 0.4303, + "step": 825 + }, + { + "epoch": 0.03, + "grad_norm": 9.29593645258072, + "learning_rate": 4.155e-07, + "loss": 0.3783, + "step": 850 + }, + { + "epoch": 0.03, + "grad_norm": 8.0820275737306, + "learning_rate": 4.28e-07, + "loss": 0.3866, + "step": 875 + }, + { + "epoch": 0.04, + "grad_norm": 9.284235367838265, + "learning_rate": 4.405e-07, + "loss": 0.3863, + "step": 900 + }, + { + "epoch": 0.04, + "grad_norm": 7.990489183242782, + "learning_rate": 4.5300000000000005e-07, + "loss": 0.3868, + "step": 925 + }, + { + "epoch": 0.04, + "grad_norm": 10.826149813494483, + "learning_rate": 4.6550000000000003e-07, + "loss": 0.3962, + "step": 950 + }, + { + "epoch": 0.04, + "grad_norm": 12.652924783202419, + "learning_rate": 4.78e-07, + "loss": 0.4151, + "step": 975 + }, + { + "epoch": 0.04, + "grad_norm": 7.231122893213122, + "learning_rate": 4.905000000000001e-07, + "loss": 0.4139, + "step": 1000 + }, + { + "epoch": 0.04, + "grad_norm": 8.284357637366583, + "learning_rate": 5.03e-07, + "loss": 0.4003, + "step": 1025 + }, + { + "epoch": 0.04, + "grad_norm": 9.764967682696836, + "learning_rate": 5.155e-07, + "loss": 0.3424, + "step": 1050 + }, + { + "epoch": 0.04, + "grad_norm": 8.623487096739643, + "learning_rate": 5.280000000000001e-07, + "loss": 0.3726, + "step": 1075 + }, + { + "epoch": 0.04, + "grad_norm": 9.118786193989378, + "learning_rate": 5.405000000000001e-07, + "loss": 0.3628, + "step": 1100 + }, + { + "epoch": 0.04, + "grad_norm": 8.266772083450553, + "learning_rate": 5.53e-07, + "loss": 0.3585, + "step": 1125 + }, + { + "epoch": 0.05, + "grad_norm": 7.9161160573400835, + "learning_rate": 5.655e-07, + "loss": 0.3638, + "step": 1150 + }, + { + "epoch": 0.05, + "grad_norm": 8.957547049197, + "learning_rate": 5.78e-07, + "loss": 0.3543, + "step": 1175 + }, + { + "epoch": 0.05, + "grad_norm": 7.64987127577368, + "learning_rate": 5.905e-07, + "loss": 0.3635, + "step": 1200 + }, + { + "epoch": 0.05, + "grad_norm": 7.6784759653959345, + "learning_rate": 6.030000000000001e-07, + "loss": 0.3822, + "step": 1225 + }, + { + "epoch": 0.05, + "grad_norm": 8.353852424599333, + "learning_rate": 6.155000000000001e-07, + "loss": 0.3715, + "step": 1250 + }, + { + "epoch": 0.05, + "grad_norm": 7.886970263184239, + "learning_rate": 6.28e-07, + "loss": 0.3751, + "step": 1275 + }, + { + "epoch": 0.05, + "grad_norm": 7.1464634900527635, + "learning_rate": 6.405e-07, + "loss": 0.3637, + "step": 1300 + }, + { + "epoch": 0.05, + "grad_norm": 8.365126116348344, + "learning_rate": 6.53e-07, + "loss": 0.3717, + "step": 1325 + }, + { + "epoch": 0.05, + "grad_norm": 8.264665698841982, + "learning_rate": 6.655e-07, + "loss": 0.3815, + "step": 1350 + }, + { + "epoch": 0.05, + "grad_norm": 8.817525071240404, + "learning_rate": 6.78e-07, + "loss": 0.3192, + "step": 1375 + }, + { + "epoch": 0.06, + "grad_norm": 8.541821815933655, + "learning_rate": 6.905000000000001e-07, + "loss": 0.3682, + "step": 1400 + }, + { + "epoch": 0.06, + "grad_norm": 8.775244243834967, + "learning_rate": 7.030000000000001e-07, + "loss": 0.3583, + "step": 1425 + }, + { + "epoch": 0.06, + "grad_norm": 8.699420120871023, + "learning_rate": 7.155000000000001e-07, + "loss": 0.353, + "step": 1450 + }, + { + "epoch": 0.06, + "grad_norm": 7.3256513159082886, + "learning_rate": 7.280000000000001e-07, + "loss": 0.3475, + "step": 1475 + }, + { + "epoch": 0.06, + "grad_norm": 8.954227102446838, + "learning_rate": 7.405000000000002e-07, + "loss": 0.3531, + "step": 1500 + }, + { + "epoch": 0.06, + "grad_norm": 7.268304372701325, + "learning_rate": 7.530000000000001e-07, + "loss": 0.3641, + "step": 1525 + }, + { + "epoch": 0.06, + "grad_norm": 8.594146689998704, + "learning_rate": 7.655000000000001e-07, + "loss": 0.3539, + "step": 1550 + }, + { + "epoch": 0.06, + "grad_norm": 8.380277920462982, + "learning_rate": 7.78e-07, + "loss": 0.3398, + "step": 1575 + }, + { + "epoch": 0.06, + "grad_norm": 8.249645134391981, + "learning_rate": 7.905e-07, + "loss": 0.3317, + "step": 1600 + }, + { + "epoch": 0.06, + "grad_norm": 9.168651318042285, + "learning_rate": 8.03e-07, + "loss": 0.3347, + "step": 1625 + }, + { + "epoch": 0.06, + "grad_norm": 7.779448193291338, + "learning_rate": 8.155000000000001e-07, + "loss": 0.3574, + "step": 1650 + }, + { + "epoch": 0.07, + "grad_norm": 8.413939794725913, + "learning_rate": 8.280000000000001e-07, + "loss": 0.3444, + "step": 1675 + }, + { + "epoch": 0.07, + "grad_norm": 8.011851874436536, + "learning_rate": 8.405e-07, + "loss": 0.346, + "step": 1700 + }, + { + "epoch": 0.07, + "grad_norm": 7.4563682729389615, + "learning_rate": 8.53e-07, + "loss": 0.3496, + "step": 1725 + }, + { + "epoch": 0.07, + "grad_norm": 8.042812209726682, + "learning_rate": 8.655000000000001e-07, + "loss": 0.3251, + "step": 1750 + }, + { + "epoch": 0.07, + "grad_norm": 8.982650134613024, + "learning_rate": 8.780000000000001e-07, + "loss": 0.3312, + "step": 1775 + }, + { + "epoch": 0.07, + "grad_norm": 6.208414233430044, + "learning_rate": 8.905000000000001e-07, + "loss": 0.3214, + "step": 1800 + }, + { + "epoch": 0.07, + "grad_norm": 7.3406319348355105, + "learning_rate": 9.030000000000001e-07, + "loss": 0.351, + "step": 1825 + }, + { + "epoch": 0.07, + "grad_norm": 10.869539965379989, + "learning_rate": 9.155000000000002e-07, + "loss": 0.3277, + "step": 1850 + }, + { + "epoch": 0.07, + "grad_norm": 7.619686331747831, + "learning_rate": 9.28e-07, + "loss": 0.3419, + "step": 1875 + }, + { + "epoch": 0.07, + "grad_norm": 7.690019286737048, + "learning_rate": 9.405e-07, + "loss": 0.3313, + "step": 1900 + }, + { + "epoch": 0.08, + "grad_norm": 8.40223765276817, + "learning_rate": 9.53e-07, + "loss": 0.3259, + "step": 1925 + }, + { + "epoch": 0.08, + "grad_norm": 9.092936931428083, + "learning_rate": 9.655000000000001e-07, + "loss": 0.325, + "step": 1950 + }, + { + "epoch": 0.08, + "grad_norm": 8.59220421800555, + "learning_rate": 9.78e-07, + "loss": 0.3364, + "step": 1975 + }, + { + "epoch": 0.08, + "grad_norm": 7.215626189268482, + "learning_rate": 9.905e-07, + "loss": 0.3372, + "step": 2000 + }, + { + "epoch": 0.08, + "grad_norm": 7.7827726836036035, + "learning_rate": 1.0030000000000002e-06, + "loss": 0.2945, + "step": 2025 + }, + { + "epoch": 0.08, + "grad_norm": 8.846888472808441, + "learning_rate": 1.0155e-06, + "loss": 0.3352, + "step": 2050 + }, + { + "epoch": 0.08, + "grad_norm": 7.524600018661036, + "learning_rate": 1.0280000000000002e-06, + "loss": 0.325, + "step": 2075 + }, + { + "epoch": 0.08, + "grad_norm": 8.029269384160894, + "learning_rate": 1.0405e-06, + "loss": 0.3445, + "step": 2100 + }, + { + "epoch": 0.08, + "grad_norm": 6.541151743266427, + "learning_rate": 1.0530000000000001e-06, + "loss": 0.3487, + "step": 2125 + }, + { + "epoch": 0.08, + "grad_norm": 9.597600892762339, + "learning_rate": 1.0655000000000002e-06, + "loss": 0.3384, + "step": 2150 + }, + { + "epoch": 0.09, + "grad_norm": 8.130328659028095, + "learning_rate": 1.078e-06, + "loss": 0.3482, + "step": 2175 + }, + { + "epoch": 0.09, + "grad_norm": 8.4990076386556, + "learning_rate": 1.0905e-06, + "loss": 0.3026, + "step": 2200 + }, + { + "epoch": 0.09, + "grad_norm": 7.422295132082161, + "learning_rate": 1.103e-06, + "loss": 0.3057, + "step": 2225 + }, + { + "epoch": 0.09, + "grad_norm": 8.017101554617902, + "learning_rate": 1.1155e-06, + "loss": 0.3233, + "step": 2250 + }, + { + "epoch": 0.09, + "grad_norm": 8.25880061391202, + "learning_rate": 1.128e-06, + "loss": 0.3046, + "step": 2275 + }, + { + "epoch": 0.09, + "grad_norm": 7.812137488778935, + "learning_rate": 1.1405000000000001e-06, + "loss": 0.3125, + "step": 2300 + }, + { + "epoch": 0.09, + "grad_norm": 9.88024638341902, + "learning_rate": 1.153e-06, + "loss": 0.3322, + "step": 2325 + }, + { + "epoch": 0.09, + "grad_norm": 9.480678112460733, + "learning_rate": 1.1655000000000001e-06, + "loss": 0.3506, + "step": 2350 + }, + { + "epoch": 0.09, + "grad_norm": 7.625242578507233, + "learning_rate": 1.1780000000000002e-06, + "loss": 0.3081, + "step": 2375 + }, + { + "epoch": 0.09, + "grad_norm": 8.471850154248505, + "learning_rate": 1.1905e-06, + "loss": 0.3411, + "step": 2400 + }, + { + "epoch": 0.1, + "grad_norm": 7.700212579759201, + "learning_rate": 1.2030000000000002e-06, + "loss": 0.2955, + "step": 2425 + }, + { + "epoch": 0.1, + "grad_norm": 7.640668754634842, + "learning_rate": 1.2155e-06, + "loss": 0.3147, + "step": 2450 + }, + { + "epoch": 0.1, + "grad_norm": 6.142799254759943, + "learning_rate": 1.2280000000000001e-06, + "loss": 0.3223, + "step": 2475 + }, + { + "epoch": 0.1, + "grad_norm": 9.273025415582895, + "learning_rate": 1.2405e-06, + "loss": 0.3183, + "step": 2500 + }, + { + "epoch": 0.1, + "grad_norm": 8.15480159136244, + "learning_rate": 1.2530000000000001e-06, + "loss": 0.3042, + "step": 2525 + }, + { + "epoch": 0.1, + "grad_norm": 7.986248278662067, + "learning_rate": 1.2655e-06, + "loss": 0.2898, + "step": 2550 + }, + { + "epoch": 0.1, + "grad_norm": 6.9880022706315765, + "learning_rate": 1.278e-06, + "loss": 0.3342, + "step": 2575 + }, + { + "epoch": 0.1, + "grad_norm": 6.797582197495177, + "learning_rate": 1.2905000000000002e-06, + "loss": 0.3204, + "step": 2600 + }, + { + "epoch": 0.1, + "grad_norm": 9.63428878101681, + "learning_rate": 1.303e-06, + "loss": 0.3106, + "step": 2625 + }, + { + "epoch": 0.1, + "grad_norm": 7.016874596706454, + "learning_rate": 1.3155000000000002e-06, + "loss": 0.3034, + "step": 2650 + }, + { + "epoch": 0.11, + "grad_norm": 8.761116991346277, + "learning_rate": 1.328e-06, + "loss": 0.3125, + "step": 2675 + }, + { + "epoch": 0.11, + "grad_norm": 9.147210364841152, + "learning_rate": 1.3405000000000001e-06, + "loss": 0.3212, + "step": 2700 + }, + { + "epoch": 0.11, + "grad_norm": 7.198210069471468, + "learning_rate": 1.3530000000000002e-06, + "loss": 0.3005, + "step": 2725 + }, + { + "epoch": 0.11, + "grad_norm": 9.62021097760392, + "learning_rate": 1.3655e-06, + "loss": 0.3095, + "step": 2750 + }, + { + "epoch": 0.11, + "grad_norm": 8.498105116813674, + "learning_rate": 1.3780000000000002e-06, + "loss": 0.2968, + "step": 2775 + }, + { + "epoch": 0.11, + "grad_norm": 9.096234425980665, + "learning_rate": 1.3905000000000003e-06, + "loss": 0.3013, + "step": 2800 + }, + { + "epoch": 0.11, + "grad_norm": 9.529807578803446, + "learning_rate": 1.4030000000000002e-06, + "loss": 0.3285, + "step": 2825 + }, + { + "epoch": 0.11, + "grad_norm": 8.090935769510644, + "learning_rate": 1.4155000000000003e-06, + "loss": 0.3186, + "step": 2850 + }, + { + "epoch": 0.11, + "grad_norm": 8.142203590894807, + "learning_rate": 1.4280000000000001e-06, + "loss": 0.3283, + "step": 2875 + }, + { + "epoch": 0.11, + "grad_norm": 7.5553560310705885, + "learning_rate": 1.4405000000000002e-06, + "loss": 0.3078, + "step": 2900 + }, + { + "epoch": 0.12, + "grad_norm": 7.030443253782761, + "learning_rate": 1.4530000000000003e-06, + "loss": 0.3259, + "step": 2925 + }, + { + "epoch": 0.12, + "grad_norm": 6.45791822784214, + "learning_rate": 1.4655000000000002e-06, + "loss": 0.3071, + "step": 2950 + }, + { + "epoch": 0.12, + "grad_norm": 7.169665682759633, + "learning_rate": 1.478e-06, + "loss": 0.3059, + "step": 2975 + }, + { + "epoch": 0.12, + "grad_norm": 9.903163712753615, + "learning_rate": 1.4905e-06, + "loss": 0.3201, + "step": 3000 + }, + { + "epoch": 0.12, + "grad_norm": 3.4235199170734267, + "learning_rate": 1.5025e-06, + "loss": 0.3193, + "step": 3025 + }, + { + "epoch": 0.12, + "grad_norm": 7.388562008472956, + "learning_rate": 1.5145e-06, + "loss": 0.3154, + "step": 3050 + }, + { + "epoch": 0.12, + "grad_norm": 7.32478874239276, + "learning_rate": 1.5270000000000002e-06, + "loss": 0.3341, + "step": 3075 + }, + { + "epoch": 0.12, + "grad_norm": 5.882897860372599, + "learning_rate": 1.5395000000000003e-06, + "loss": 0.2996, + "step": 3100 + }, + { + "epoch": 0.12, + "grad_norm": 8.830461306218142, + "learning_rate": 1.5520000000000001e-06, + "loss": 0.2997, + "step": 3125 + }, + { + "epoch": 0.12, + "grad_norm": 7.340070140671299, + "learning_rate": 1.5645000000000002e-06, + "loss": 0.2984, + "step": 3150 + }, + { + "epoch": 0.12, + "grad_norm": 6.491658642899819, + "learning_rate": 1.577e-06, + "loss": 0.3165, + "step": 3175 + }, + { + "epoch": 0.13, + "grad_norm": 8.69433147934288, + "learning_rate": 1.5895000000000002e-06, + "loss": 0.3031, + "step": 3200 + }, + { + "epoch": 0.13, + "grad_norm": 8.314351227889603, + "learning_rate": 1.6020000000000003e-06, + "loss": 0.3018, + "step": 3225 + }, + { + "epoch": 0.13, + "grad_norm": 8.310085290349688, + "learning_rate": 1.6145000000000002e-06, + "loss": 0.2997, + "step": 3250 + }, + { + "epoch": 0.13, + "grad_norm": 10.305879482033044, + "learning_rate": 1.6270000000000003e-06, + "loss": 0.3054, + "step": 3275 + }, + { + "epoch": 0.13, + "grad_norm": 7.40127326870802, + "learning_rate": 1.6395000000000004e-06, + "loss": 0.3134, + "step": 3300 + }, + { + "epoch": 0.13, + "grad_norm": 6.850845319267836, + "learning_rate": 1.6520000000000002e-06, + "loss": 0.3062, + "step": 3325 + }, + { + "epoch": 0.13, + "grad_norm": 7.047475032917652, + "learning_rate": 1.6645e-06, + "loss": 0.2863, + "step": 3350 + }, + { + "epoch": 0.13, + "grad_norm": 8.725844976704757, + "learning_rate": 1.677e-06, + "loss": 0.2891, + "step": 3375 + }, + { + "epoch": 0.13, + "grad_norm": 7.78153953615269, + "learning_rate": 1.6895e-06, + "loss": 0.3248, + "step": 3400 + }, + { + "epoch": 0.13, + "grad_norm": 7.696888074356876, + "learning_rate": 1.702e-06, + "loss": 0.3188, + "step": 3425 + }, + { + "epoch": 0.14, + "grad_norm": 7.676273130360383, + "learning_rate": 1.7145e-06, + "loss": 0.2988, + "step": 3450 + }, + { + "epoch": 0.14, + "grad_norm": 6.006703021039653, + "learning_rate": 1.7270000000000002e-06, + "loss": 0.3086, + "step": 3475 + }, + { + "epoch": 0.14, + "grad_norm": 8.465714704473822, + "learning_rate": 1.7395e-06, + "loss": 0.3052, + "step": 3500 + }, + { + "epoch": 0.14, + "grad_norm": 8.170356640181524, + "learning_rate": 1.7520000000000001e-06, + "loss": 0.2844, + "step": 3525 + }, + { + "epoch": 0.14, + "grad_norm": 7.411466808472138, + "learning_rate": 1.7645e-06, + "loss": 0.2994, + "step": 3550 + }, + { + "epoch": 0.14, + "grad_norm": 8.624191447880747, + "learning_rate": 1.777e-06, + "loss": 0.3387, + "step": 3575 + }, + { + "epoch": 0.14, + "grad_norm": 7.697489245961635, + "learning_rate": 1.7895000000000002e-06, + "loss": 0.2976, + "step": 3600 + }, + { + "epoch": 0.14, + "grad_norm": 8.358874355083804, + "learning_rate": 1.802e-06, + "loss": 0.3221, + "step": 3625 + }, + { + "epoch": 0.14, + "grad_norm": 7.701960236286209, + "learning_rate": 1.8145000000000002e-06, + "loss": 0.2778, + "step": 3650 + }, + { + "epoch": 0.14, + "grad_norm": 7.093145317658292, + "learning_rate": 1.8270000000000003e-06, + "loss": 0.2809, + "step": 3675 + }, + { + "epoch": 0.15, + "grad_norm": 7.5735424824748065, + "learning_rate": 1.8395000000000001e-06, + "loss": 0.3094, + "step": 3700 + }, + { + "epoch": 0.15, + "grad_norm": 7.111126859679747, + "learning_rate": 1.8520000000000002e-06, + "loss": 0.2811, + "step": 3725 + }, + { + "epoch": 0.15, + "grad_norm": 7.603638409237136, + "learning_rate": 1.8645e-06, + "loss": 0.3065, + "step": 3750 + }, + { + "epoch": 0.15, + "grad_norm": 9.4063632414015, + "learning_rate": 1.8770000000000002e-06, + "loss": 0.3027, + "step": 3775 + }, + { + "epoch": 0.15, + "grad_norm": 7.494637512263619, + "learning_rate": 1.8895000000000003e-06, + "loss": 0.3136, + "step": 3800 + }, + { + "epoch": 0.15, + "grad_norm": 7.048441681493421, + "learning_rate": 1.9020000000000002e-06, + "loss": 0.3337, + "step": 3825 + }, + { + "epoch": 0.15, + "grad_norm": 4.438034199516792, + "learning_rate": 1.9145e-06, + "loss": 0.269, + "step": 3850 + }, + { + "epoch": 0.15, + "grad_norm": 7.358960525686304, + "learning_rate": 1.9270000000000004e-06, + "loss": 0.3209, + "step": 3875 + }, + { + "epoch": 0.15, + "grad_norm": 6.925504929075971, + "learning_rate": 1.9395000000000002e-06, + "loss": 0.2988, + "step": 3900 + }, + { + "epoch": 0.15, + "grad_norm": 6.739147879633613, + "learning_rate": 1.952e-06, + "loss": 0.2722, + "step": 3925 + }, + { + "epoch": 0.16, + "grad_norm": 4.776015161103489, + "learning_rate": 1.9645000000000004e-06, + "loss": 0.2798, + "step": 3950 + }, + { + "epoch": 0.16, + "grad_norm": 8.056155062567179, + "learning_rate": 1.977e-06, + "loss": 0.3053, + "step": 3975 + }, + { + "epoch": 0.16, + "grad_norm": 6.121259851550022, + "learning_rate": 1.9895e-06, + "loss": 0.2838, + "step": 4000 + }, + { + "epoch": 0.16, + "grad_norm": 6.30300122513391, + "learning_rate": 2.002e-06, + "loss": 0.3092, + "step": 4025 + }, + { + "epoch": 0.16, + "grad_norm": 7.035093216193755, + "learning_rate": 2.0145e-06, + "loss": 0.2989, + "step": 4050 + }, + { + "epoch": 0.16, + "grad_norm": 7.312870093933501, + "learning_rate": 2.0270000000000002e-06, + "loss": 0.2909, + "step": 4075 + }, + { + "epoch": 0.16, + "grad_norm": 7.1161318992697105, + "learning_rate": 2.0395e-06, + "loss": 0.3155, + "step": 4100 + }, + { + "epoch": 0.16, + "grad_norm": 8.985023001843178, + "learning_rate": 2.052e-06, + "loss": 0.2811, + "step": 4125 + }, + { + "epoch": 0.16, + "grad_norm": 8.490009552023306, + "learning_rate": 2.0645000000000003e-06, + "loss": 0.2943, + "step": 4150 + }, + { + "epoch": 0.16, + "grad_norm": 6.036226568442008, + "learning_rate": 2.077e-06, + "loss": 0.3051, + "step": 4175 + }, + { + "epoch": 0.17, + "grad_norm": 7.160882683593939, + "learning_rate": 2.0895e-06, + "loss": 0.3128, + "step": 4200 + }, + { + "epoch": 0.17, + "grad_norm": 6.899026627729313, + "learning_rate": 2.102e-06, + "loss": 0.3007, + "step": 4225 + }, + { + "epoch": 0.17, + "grad_norm": 6.667556679571071, + "learning_rate": 2.1145000000000003e-06, + "loss": 0.294, + "step": 4250 + }, + { + "epoch": 0.17, + "grad_norm": 8.45891735416569, + "learning_rate": 2.127e-06, + "loss": 0.3072, + "step": 4275 + }, + { + "epoch": 0.17, + "grad_norm": 7.247465785827324, + "learning_rate": 2.1395e-06, + "loss": 0.2987, + "step": 4300 + }, + { + "epoch": 0.17, + "grad_norm": 6.925041488918864, + "learning_rate": 2.1520000000000003e-06, + "loss": 0.3038, + "step": 4325 + }, + { + "epoch": 0.17, + "grad_norm": 7.697636253311961, + "learning_rate": 2.1645e-06, + "loss": 0.2896, + "step": 4350 + }, + { + "epoch": 0.17, + "grad_norm": 5.671845123322663, + "learning_rate": 2.177e-06, + "loss": 0.3013, + "step": 4375 + }, + { + "epoch": 0.17, + "grad_norm": 14.42965685715927, + "learning_rate": 2.1895000000000004e-06, + "loss": 0.2935, + "step": 4400 + }, + { + "epoch": 0.17, + "grad_norm": 7.619313005480762, + "learning_rate": 2.2020000000000003e-06, + "loss": 0.2822, + "step": 4425 + }, + { + "epoch": 0.18, + "grad_norm": 5.96836481945949, + "learning_rate": 2.2145e-06, + "loss": 0.3013, + "step": 4450 + }, + { + "epoch": 0.18, + "grad_norm": 8.013522956861229, + "learning_rate": 2.2270000000000004e-06, + "loss": 0.2998, + "step": 4475 + }, + { + "epoch": 0.18, + "grad_norm": 6.564283110102572, + "learning_rate": 2.2395000000000003e-06, + "loss": 0.3069, + "step": 4500 + }, + { + "epoch": 0.18, + "grad_norm": 7.412696409963094, + "learning_rate": 2.252e-06, + "loss": 0.2911, + "step": 4525 + }, + { + "epoch": 0.18, + "grad_norm": 6.532716257217559, + "learning_rate": 2.2645000000000005e-06, + "loss": 0.293, + "step": 4550 + }, + { + "epoch": 0.18, + "grad_norm": 6.44868242354979, + "learning_rate": 2.2770000000000004e-06, + "loss": 0.2785, + "step": 4575 + }, + { + "epoch": 0.18, + "grad_norm": 6.10635902763617, + "learning_rate": 2.2895e-06, + "loss": 0.2966, + "step": 4600 + }, + { + "epoch": 0.18, + "grad_norm": 8.249805780154833, + "learning_rate": 2.302e-06, + "loss": 0.3102, + "step": 4625 + }, + { + "epoch": 0.18, + "grad_norm": 8.204571957255101, + "learning_rate": 2.3145e-06, + "loss": 0.309, + "step": 4650 + }, + { + "epoch": 0.18, + "grad_norm": 7.803854804655206, + "learning_rate": 2.327e-06, + "loss": 0.2814, + "step": 4675 + }, + { + "epoch": 0.18, + "grad_norm": 6.440776762453003, + "learning_rate": 2.3395000000000002e-06, + "loss": 0.3056, + "step": 4700 + }, + { + "epoch": 0.19, + "grad_norm": 8.15879339209754, + "learning_rate": 2.352e-06, + "loss": 0.2964, + "step": 4725 + }, + { + "epoch": 0.19, + "grad_norm": 5.891657048350974, + "learning_rate": 2.3645e-06, + "loss": 0.2815, + "step": 4750 + }, + { + "epoch": 0.19, + "grad_norm": 7.240599749344253, + "learning_rate": 2.3770000000000003e-06, + "loss": 0.2854, + "step": 4775 + }, + { + "epoch": 0.19, + "grad_norm": 5.975988999533506, + "learning_rate": 2.3895e-06, + "loss": 0.2888, + "step": 4800 + }, + { + "epoch": 0.19, + "grad_norm": 8.013535222990837, + "learning_rate": 2.402e-06, + "loss": 0.2954, + "step": 4825 + }, + { + "epoch": 0.19, + "grad_norm": 11.029743044779613, + "learning_rate": 2.4145000000000003e-06, + "loss": 0.2884, + "step": 4850 + }, + { + "epoch": 0.19, + "grad_norm": 6.956958714016763, + "learning_rate": 2.4270000000000002e-06, + "loss": 0.2969, + "step": 4875 + }, + { + "epoch": 0.19, + "grad_norm": 7.823749748330595, + "learning_rate": 2.4395e-06, + "loss": 0.3081, + "step": 4900 + }, + { + "epoch": 0.19, + "grad_norm": 6.2241357450122425, + "learning_rate": 2.4520000000000004e-06, + "loss": 0.2992, + "step": 4925 + }, + { + "epoch": 0.19, + "grad_norm": 9.16911141354443, + "learning_rate": 2.4645000000000003e-06, + "loss": 0.3245, + "step": 4950 + }, + { + "epoch": 0.2, + "grad_norm": 6.060217712197034, + "learning_rate": 2.477e-06, + "loss": 0.3118, + "step": 4975 + }, + { + "epoch": 0.2, + "grad_norm": 7.425673795449125, + "learning_rate": 2.4895e-06, + "loss": 0.2662, + "step": 5000 + }, + { + "epoch": 0.2, + "grad_norm": 6.595958329136231, + "learning_rate": 2.502e-06, + "loss": 0.3158, + "step": 5025 + }, + { + "epoch": 0.2, + "grad_norm": 3.6550675982874363, + "learning_rate": 2.5140000000000004e-06, + "loss": 0.28, + "step": 5050 + }, + { + "epoch": 0.2, + "grad_norm": 7.79782451562284, + "learning_rate": 2.5265e-06, + "loss": 0.2879, + "step": 5075 + }, + { + "epoch": 0.2, + "grad_norm": 8.451642556039346, + "learning_rate": 2.539e-06, + "loss": 0.305, + "step": 5100 + }, + { + "epoch": 0.2, + "grad_norm": 6.916829339590843, + "learning_rate": 2.5515e-06, + "loss": 0.271, + "step": 5125 + }, + { + "epoch": 0.2, + "grad_norm": 6.245252699817357, + "learning_rate": 2.5640000000000004e-06, + "loss": 0.2763, + "step": 5150 + }, + { + "epoch": 0.2, + "grad_norm": 7.212402917259702, + "learning_rate": 2.5765000000000002e-06, + "loss": 0.2763, + "step": 5175 + }, + { + "epoch": 0.2, + "grad_norm": 6.823573409166045, + "learning_rate": 2.5890000000000005e-06, + "loss": 0.3001, + "step": 5200 + }, + { + "epoch": 0.21, + "grad_norm": 7.940683369097169, + "learning_rate": 2.6015e-06, + "loss": 0.3116, + "step": 5225 + }, + { + "epoch": 0.21, + "grad_norm": 6.3976420123516755, + "learning_rate": 2.6140000000000003e-06, + "loss": 0.2606, + "step": 5250 + }, + { + "epoch": 0.21, + "grad_norm": 7.791330529626596, + "learning_rate": 2.6265e-06, + "loss": 0.2826, + "step": 5275 + }, + { + "epoch": 0.21, + "grad_norm": 8.076143804816086, + "learning_rate": 2.6390000000000005e-06, + "loss": 0.2833, + "step": 5300 + }, + { + "epoch": 0.21, + "grad_norm": 10.190822919416679, + "learning_rate": 2.6515000000000004e-06, + "loss": 0.3373, + "step": 5325 + }, + { + "epoch": 0.21, + "grad_norm": 6.8692012310423465, + "learning_rate": 2.6640000000000007e-06, + "loss": 0.3403, + "step": 5350 + }, + { + "epoch": 0.21, + "grad_norm": 7.442368544772186, + "learning_rate": 2.6765e-06, + "loss": 0.2975, + "step": 5375 + }, + { + "epoch": 0.21, + "grad_norm": 7.524857704750757, + "learning_rate": 2.689e-06, + "loss": 0.2758, + "step": 5400 + }, + { + "epoch": 0.21, + "grad_norm": 8.21051104357922, + "learning_rate": 2.7015000000000003e-06, + "loss": 0.2862, + "step": 5425 + }, + { + "epoch": 0.21, + "grad_norm": 7.6759388918523985, + "learning_rate": 2.7139999999999998e-06, + "loss": 0.2744, + "step": 5450 + }, + { + "epoch": 0.22, + "grad_norm": 7.630174118449949, + "learning_rate": 2.7265e-06, + "loss": 0.2988, + "step": 5475 + }, + { + "epoch": 0.22, + "grad_norm": 6.357716937591224, + "learning_rate": 2.739e-06, + "loss": 0.2827, + "step": 5500 + }, + { + "epoch": 0.22, + "grad_norm": 6.779528956514759, + "learning_rate": 2.7515000000000003e-06, + "loss": 0.2589, + "step": 5525 + }, + { + "epoch": 0.22, + "grad_norm": 9.02168399367326, + "learning_rate": 2.764e-06, + "loss": 0.2819, + "step": 5550 + }, + { + "epoch": 0.22, + "grad_norm": 6.823990587562972, + "learning_rate": 2.7765000000000004e-06, + "loss": 0.3137, + "step": 5575 + }, + { + "epoch": 0.22, + "grad_norm": 7.019705933767146, + "learning_rate": 2.789e-06, + "loss": 0.3069, + "step": 5600 + }, + { + "epoch": 0.22, + "grad_norm": 7.7318630027454995, + "learning_rate": 2.8015e-06, + "loss": 0.2665, + "step": 5625 + }, + { + "epoch": 0.22, + "grad_norm": 8.11880507789445, + "learning_rate": 2.814e-06, + "loss": 0.2872, + "step": 5650 + }, + { + "epoch": 0.22, + "grad_norm": 5.7977850369642505, + "learning_rate": 2.8265000000000004e-06, + "loss": 0.283, + "step": 5675 + }, + { + "epoch": 0.22, + "grad_norm": 6.341136733880947, + "learning_rate": 2.8390000000000003e-06, + "loss": 0.3007, + "step": 5700 + }, + { + "epoch": 0.23, + "grad_norm": 5.970881329915707, + "learning_rate": 2.8515000000000006e-06, + "loss": 0.2784, + "step": 5725 + }, + { + "epoch": 0.23, + "grad_norm": 7.729318544130669, + "learning_rate": 2.864e-06, + "loss": 0.3107, + "step": 5750 + }, + { + "epoch": 0.23, + "grad_norm": 6.546507761712345, + "learning_rate": 2.8765000000000003e-06, + "loss": 0.2822, + "step": 5775 + }, + { + "epoch": 0.23, + "grad_norm": 4.978696715726333, + "learning_rate": 2.889e-06, + "loss": 0.288, + "step": 5800 + }, + { + "epoch": 0.23, + "grad_norm": 9.228311134929747, + "learning_rate": 2.9015000000000005e-06, + "loss": 0.2903, + "step": 5825 + }, + { + "epoch": 0.23, + "grad_norm": 7.909081531042711, + "learning_rate": 2.914e-06, + "loss": 0.3001, + "step": 5850 + }, + { + "epoch": 0.23, + "grad_norm": 9.066473702285178, + "learning_rate": 2.9265000000000003e-06, + "loss": 0.3, + "step": 5875 + }, + { + "epoch": 0.23, + "grad_norm": 4.981022806434303, + "learning_rate": 2.939e-06, + "loss": 0.2623, + "step": 5900 + }, + { + "epoch": 0.23, + "grad_norm": 7.451955876526818, + "learning_rate": 2.9515000000000005e-06, + "loss": 0.3001, + "step": 5925 + }, + { + "epoch": 0.23, + "grad_norm": 5.213926487478616, + "learning_rate": 2.9640000000000003e-06, + "loss": 0.2856, + "step": 5950 + }, + { + "epoch": 0.24, + "grad_norm": 7.590418657876296, + "learning_rate": 2.9765000000000006e-06, + "loss": 0.3048, + "step": 5975 + }, + { + "epoch": 0.24, + "grad_norm": 6.618581560821221, + "learning_rate": 2.989e-06, + "loss": 0.2802, + "step": 6000 + }, + { + "epoch": 0.24, + "grad_norm": 7.747177181686857, + "learning_rate": 3.0015e-06, + "loss": 0.3047, + "step": 6025 + }, + { + "epoch": 0.24, + "grad_norm": 2.702779308707192, + "learning_rate": 3.0140000000000003e-06, + "loss": 0.2845, + "step": 6050 + }, + { + "epoch": 0.24, + "grad_norm": 7.466401914069188, + "learning_rate": 3.026e-06, + "loss": 0.2889, + "step": 6075 + }, + { + "epoch": 0.24, + "grad_norm": 8.750175939483835, + "learning_rate": 3.0385000000000002e-06, + "loss": 0.2998, + "step": 6100 + }, + { + "epoch": 0.24, + "grad_norm": 6.052264883280283, + "learning_rate": 3.051e-06, + "loss": 0.3153, + "step": 6125 + }, + { + "epoch": 0.24, + "grad_norm": 6.1350193715664165, + "learning_rate": 3.0635000000000004e-06, + "loss": 0.2599, + "step": 6150 + }, + { + "epoch": 0.24, + "grad_norm": 4.948910715297065, + "learning_rate": 3.0760000000000003e-06, + "loss": 0.2711, + "step": 6175 + }, + { + "epoch": 0.24, + "grad_norm": 6.252587211408633, + "learning_rate": 3.0885000000000006e-06, + "loss": 0.295, + "step": 6200 + }, + { + "epoch": 0.24, + "grad_norm": 6.165520107137274, + "learning_rate": 3.101e-06, + "loss": 0.2763, + "step": 6225 + }, + { + "epoch": 0.25, + "grad_norm": 7.4996014769823915, + "learning_rate": 3.1135000000000003e-06, + "loss": 0.2849, + "step": 6250 + }, + { + "epoch": 0.25, + "grad_norm": 6.964362624237183, + "learning_rate": 3.1260000000000002e-06, + "loss": 0.3028, + "step": 6275 + }, + { + "epoch": 0.25, + "grad_norm": 7.169773589103691, + "learning_rate": 3.1385000000000005e-06, + "loss": 0.278, + "step": 6300 + }, + { + "epoch": 0.25, + "grad_norm": 8.406835628812196, + "learning_rate": 3.151e-06, + "loss": 0.2558, + "step": 6325 + }, + { + "epoch": 0.25, + "grad_norm": 8.084464446709841, + "learning_rate": 3.1635000000000003e-06, + "loss": 0.284, + "step": 6350 + }, + { + "epoch": 0.25, + "grad_norm": 6.694245128142803, + "learning_rate": 3.176e-06, + "loss": 0.2611, + "step": 6375 + }, + { + "epoch": 0.25, + "grad_norm": 6.564775176663976, + "learning_rate": 3.1885000000000005e-06, + "loss": 0.277, + "step": 6400 + }, + { + "epoch": 0.25, + "grad_norm": 7.356766955805626, + "learning_rate": 3.2010000000000004e-06, + "loss": 0.2585, + "step": 6425 + }, + { + "epoch": 0.25, + "grad_norm": 6.649934835176111, + "learning_rate": 3.2135000000000007e-06, + "loss": 0.2556, + "step": 6450 + }, + { + "epoch": 0.25, + "grad_norm": 6.635433701155168, + "learning_rate": 3.226e-06, + "loss": 0.291, + "step": 6475 + }, + { + "epoch": 0.26, + "grad_norm": 6.724811843119688, + "learning_rate": 3.2385000000000004e-06, + "loss": 0.2774, + "step": 6500 + }, + { + "epoch": 0.26, + "grad_norm": 8.629267878070488, + "learning_rate": 3.2510000000000003e-06, + "loss": 0.288, + "step": 6525 + }, + { + "epoch": 0.26, + "grad_norm": 7.037964444193315, + "learning_rate": 3.2635e-06, + "loss": 0.2723, + "step": 6550 + }, + { + "epoch": 0.26, + "grad_norm": 7.182370121813334, + "learning_rate": 3.2760000000000005e-06, + "loss": 0.3106, + "step": 6575 + }, + { + "epoch": 0.26, + "grad_norm": 7.1528459015462245, + "learning_rate": 3.2885e-06, + "loss": 0.2692, + "step": 6600 + }, + { + "epoch": 0.26, + "grad_norm": 6.500105672722975, + "learning_rate": 3.3010000000000002e-06, + "loss": 0.3031, + "step": 6625 + }, + { + "epoch": 0.26, + "grad_norm": 5.834985512213181, + "learning_rate": 3.3135e-06, + "loss": 0.2664, + "step": 6650 + }, + { + "epoch": 0.26, + "grad_norm": 8.19309701510752, + "learning_rate": 3.3260000000000004e-06, + "loss": 0.2723, + "step": 6675 + }, + { + "epoch": 0.26, + "grad_norm": 8.139308511625616, + "learning_rate": 3.3385e-06, + "loss": 0.2839, + "step": 6700 + }, + { + "epoch": 0.26, + "grad_norm": 7.3241076676376435, + "learning_rate": 3.351e-06, + "loss": 0.2806, + "step": 6725 + }, + { + "epoch": 0.27, + "grad_norm": 6.98434229153364, + "learning_rate": 3.3635e-06, + "loss": 0.28, + "step": 6750 + }, + { + "epoch": 0.27, + "grad_norm": 7.129264792301614, + "learning_rate": 3.3760000000000004e-06, + "loss": 0.282, + "step": 6775 + }, + { + "epoch": 0.27, + "grad_norm": 8.170862565343718, + "learning_rate": 3.3885000000000003e-06, + "loss": 0.2858, + "step": 6800 + }, + { + "epoch": 0.27, + "grad_norm": 7.838114805347313, + "learning_rate": 3.4010000000000006e-06, + "loss": 0.2821, + "step": 6825 + }, + { + "epoch": 0.27, + "grad_norm": 7.030750778829712, + "learning_rate": 3.4135e-06, + "loss": 0.2925, + "step": 6850 + }, + { + "epoch": 0.27, + "grad_norm": 7.186794776187053, + "learning_rate": 3.4260000000000003e-06, + "loss": 0.2813, + "step": 6875 + }, + { + "epoch": 0.27, + "grad_norm": 6.6235944995301645, + "learning_rate": 3.4385e-06, + "loss": 0.2884, + "step": 6900 + }, + { + "epoch": 0.27, + "grad_norm": 6.471229594532035, + "learning_rate": 3.4510000000000005e-06, + "loss": 0.2759, + "step": 6925 + }, + { + "epoch": 0.27, + "grad_norm": 7.0819247117845, + "learning_rate": 3.4635000000000004e-06, + "loss": 0.2738, + "step": 6950 + }, + { + "epoch": 0.27, + "grad_norm": 5.998542491050096, + "learning_rate": 3.4760000000000007e-06, + "loss": 0.2878, + "step": 6975 + }, + { + "epoch": 0.28, + "grad_norm": 6.461630412552012, + "learning_rate": 3.4885e-06, + "loss": 0.3053, + "step": 7000 + }, + { + "epoch": 0.28, + "grad_norm": 6.586574763749033, + "learning_rate": 3.5010000000000004e-06, + "loss": 0.2919, + "step": 7025 + }, + { + "epoch": 0.28, + "grad_norm": 7.792709877852477, + "learning_rate": 3.5135000000000003e-06, + "loss": 0.2641, + "step": 7050 + }, + { + "epoch": 0.28, + "grad_norm": 6.911279814176992, + "learning_rate": 3.5255e-06, + "loss": 0.2902, + "step": 7075 + }, + { + "epoch": 0.28, + "grad_norm": 6.067894245771899, + "learning_rate": 3.5380000000000003e-06, + "loss": 0.2668, + "step": 7100 + }, + { + "epoch": 0.28, + "grad_norm": 7.1730720185264785, + "learning_rate": 3.5505e-06, + "loss": 0.2602, + "step": 7125 + }, + { + "epoch": 0.28, + "grad_norm": 6.9493924947782295, + "learning_rate": 3.5630000000000004e-06, + "loss": 0.2878, + "step": 7150 + }, + { + "epoch": 0.28, + "grad_norm": 5.022243763558953, + "learning_rate": 3.5755e-06, + "loss": 0.2792, + "step": 7175 + }, + { + "epoch": 0.28, + "grad_norm": 7.59839736126787, + "learning_rate": 3.588e-06, + "loss": 0.2716, + "step": 7200 + }, + { + "epoch": 0.28, + "grad_norm": 8.033887406465633, + "learning_rate": 3.6005e-06, + "loss": 0.3156, + "step": 7225 + }, + { + "epoch": 0.29, + "grad_norm": 6.642200365588752, + "learning_rate": 3.6130000000000004e-06, + "loss": 0.2713, + "step": 7250 + }, + { + "epoch": 0.29, + "grad_norm": 6.796078080375151, + "learning_rate": 3.6255000000000003e-06, + "loss": 0.2759, + "step": 7275 + }, + { + "epoch": 0.29, + "grad_norm": 6.564454612601875, + "learning_rate": 3.6380000000000006e-06, + "loss": 0.2598, + "step": 7300 + }, + { + "epoch": 0.29, + "grad_norm": 6.505268037249782, + "learning_rate": 3.6505e-06, + "loss": 0.2723, + "step": 7325 + }, + { + "epoch": 0.29, + "grad_norm": 6.1652467945628, + "learning_rate": 3.6630000000000003e-06, + "loss": 0.2671, + "step": 7350 + }, + { + "epoch": 0.29, + "grad_norm": 6.08578394589941, + "learning_rate": 3.6755000000000002e-06, + "loss": 0.2741, + "step": 7375 + }, + { + "epoch": 0.29, + "grad_norm": 8.409058891602932, + "learning_rate": 3.6880000000000005e-06, + "loss": 0.2785, + "step": 7400 + }, + { + "epoch": 0.29, + "grad_norm": 6.599948826653968, + "learning_rate": 3.7005000000000004e-06, + "loss": 0.2881, + "step": 7425 + }, + { + "epoch": 0.29, + "grad_norm": 7.3632193157404755, + "learning_rate": 3.7130000000000007e-06, + "loss": 0.2754, + "step": 7450 + }, + { + "epoch": 0.29, + "grad_norm": 4.580890830497889, + "learning_rate": 3.7255e-06, + "loss": 0.2763, + "step": 7475 + }, + { + "epoch": 0.3, + "grad_norm": 7.429129214199958, + "learning_rate": 3.7380000000000005e-06, + "loss": 0.27, + "step": 7500 + }, + { + "epoch": 0.3, + "grad_norm": 6.94826201614837, + "learning_rate": 3.7505000000000003e-06, + "loss": 0.2718, + "step": 7525 + }, + { + "epoch": 0.3, + "grad_norm": 7.004602730126053, + "learning_rate": 3.7630000000000006e-06, + "loss": 0.2996, + "step": 7550 + }, + { + "epoch": 0.3, + "grad_norm": 6.70321711644467, + "learning_rate": 3.7755e-06, + "loss": 0.2533, + "step": 7575 + }, + { + "epoch": 0.3, + "grad_norm": 5.926960517500858, + "learning_rate": 3.7880000000000004e-06, + "loss": 0.266, + "step": 7600 + }, + { + "epoch": 0.3, + "grad_norm": 7.099979599842278, + "learning_rate": 3.8005000000000003e-06, + "loss": 0.278, + "step": 7625 + }, + { + "epoch": 0.3, + "grad_norm": 8.394314379220583, + "learning_rate": 3.813e-06, + "loss": 0.2794, + "step": 7650 + }, + { + "epoch": 0.3, + "grad_norm": 7.984983861009723, + "learning_rate": 3.8255e-06, + "loss": 0.2889, + "step": 7675 + }, + { + "epoch": 0.3, + "grad_norm": 6.498279020164819, + "learning_rate": 3.838e-06, + "loss": 0.2793, + "step": 7700 + }, + { + "epoch": 0.3, + "grad_norm": 6.672802599011468, + "learning_rate": 3.850500000000001e-06, + "loss": 0.2954, + "step": 7725 + }, + { + "epoch": 0.3, + "grad_norm": 7.429422338538009, + "learning_rate": 3.863e-06, + "loss": 0.3035, + "step": 7750 + }, + { + "epoch": 0.31, + "grad_norm": 6.278266658380528, + "learning_rate": 3.8755e-06, + "loss": 0.2887, + "step": 7775 + }, + { + "epoch": 0.31, + "grad_norm": 5.933838932503291, + "learning_rate": 3.888e-06, + "loss": 0.27, + "step": 7800 + }, + { + "epoch": 0.31, + "grad_norm": 5.341557859998104, + "learning_rate": 3.9005e-06, + "loss": 0.2874, + "step": 7825 + }, + { + "epoch": 0.31, + "grad_norm": 6.368800106240496, + "learning_rate": 3.9130000000000005e-06, + "loss": 0.2697, + "step": 7850 + }, + { + "epoch": 0.31, + "grad_norm": 8.531746542220086, + "learning_rate": 3.925500000000001e-06, + "loss": 0.2704, + "step": 7875 + }, + { + "epoch": 0.31, + "grad_norm": 5.948769494377511, + "learning_rate": 3.938e-06, + "loss": 0.266, + "step": 7900 + }, + { + "epoch": 0.31, + "grad_norm": 7.0940656271040545, + "learning_rate": 3.9505000000000005e-06, + "loss": 0.2859, + "step": 7925 + }, + { + "epoch": 0.31, + "grad_norm": 5.508339585723285, + "learning_rate": 3.963e-06, + "loss": 0.2851, + "step": 7950 + }, + { + "epoch": 0.31, + "grad_norm": 6.14929249851529, + "learning_rate": 3.9755e-06, + "loss": 0.2823, + "step": 7975 + }, + { + "epoch": 0.31, + "grad_norm": 5.3779482123770475, + "learning_rate": 3.988000000000001e-06, + "loss": 0.2698, + "step": 8000 + }, + { + "epoch": 0.32, + "grad_norm": 6.074063079478101, + "learning_rate": 4.000500000000001e-06, + "loss": 0.2819, + "step": 8025 + }, + { + "epoch": 0.32, + "grad_norm": 6.362443834444672, + "learning_rate": 4.013e-06, + "loss": 0.2623, + "step": 8050 + }, + { + "epoch": 0.32, + "grad_norm": 6.51778892694598, + "learning_rate": 4.0250000000000004e-06, + "loss": 0.2563, + "step": 8075 + }, + { + "epoch": 0.32, + "grad_norm": 5.3055285454811845, + "learning_rate": 4.037500000000001e-06, + "loss": 0.2698, + "step": 8100 + }, + { + "epoch": 0.32, + "grad_norm": 5.9267504904096375, + "learning_rate": 4.05e-06, + "loss": 0.2522, + "step": 8125 + }, + { + "epoch": 0.32, + "grad_norm": 7.966682070450956, + "learning_rate": 4.0625000000000005e-06, + "loss": 0.2976, + "step": 8150 + }, + { + "epoch": 0.32, + "grad_norm": 6.548744530829551, + "learning_rate": 4.075e-06, + "loss": 0.2904, + "step": 8175 + }, + { + "epoch": 0.32, + "grad_norm": 6.753040626050456, + "learning_rate": 4.0875e-06, + "loss": 0.2843, + "step": 8200 + }, + { + "epoch": 0.32, + "grad_norm": 7.200730763133159, + "learning_rate": 4.1e-06, + "loss": 0.2648, + "step": 8225 + }, + { + "epoch": 0.32, + "grad_norm": 6.411993308970435, + "learning_rate": 4.1125e-06, + "loss": 0.2949, + "step": 8250 + }, + { + "epoch": 0.33, + "grad_norm": 6.226554884823071, + "learning_rate": 4.125e-06, + "loss": 0.2606, + "step": 8275 + }, + { + "epoch": 0.33, + "grad_norm": 6.5307361977359895, + "learning_rate": 4.137500000000001e-06, + "loss": 0.275, + "step": 8300 + }, + { + "epoch": 0.33, + "grad_norm": 12.482533214728566, + "learning_rate": 4.15e-06, + "loss": 0.2997, + "step": 8325 + }, + { + "epoch": 0.33, + "grad_norm": 6.674706679093602, + "learning_rate": 4.1625e-06, + "loss": 0.2955, + "step": 8350 + }, + { + "epoch": 0.33, + "grad_norm": 6.992206297283636, + "learning_rate": 4.175e-06, + "loss": 0.2743, + "step": 8375 + }, + { + "epoch": 0.33, + "grad_norm": 6.042025737938246, + "learning_rate": 4.1875e-06, + "loss": 0.2671, + "step": 8400 + }, + { + "epoch": 0.33, + "grad_norm": 6.055250933615206, + "learning_rate": 4.2000000000000004e-06, + "loss": 0.2577, + "step": 8425 + }, + { + "epoch": 0.33, + "grad_norm": 5.836279970090582, + "learning_rate": 4.212500000000001e-06, + "loss": 0.2959, + "step": 8450 + }, + { + "epoch": 0.33, + "grad_norm": 6.347072802266105, + "learning_rate": 4.225e-06, + "loss": 0.3041, + "step": 8475 + }, + { + "epoch": 0.33, + "grad_norm": 6.126442118336176, + "learning_rate": 4.2375000000000005e-06, + "loss": 0.2882, + "step": 8500 + }, + { + "epoch": 0.34, + "grad_norm": 6.18958310982247, + "learning_rate": 4.25e-06, + "loss": 0.3015, + "step": 8525 + }, + { + "epoch": 0.34, + "grad_norm": 6.197927352860307, + "learning_rate": 4.2625e-06, + "loss": 0.2791, + "step": 8550 + }, + { + "epoch": 0.34, + "grad_norm": 7.522494327067332, + "learning_rate": 4.2750000000000006e-06, + "loss": 0.2736, + "step": 8575 + }, + { + "epoch": 0.34, + "grad_norm": 6.237403417855199, + "learning_rate": 4.287500000000001e-06, + "loss": 0.2924, + "step": 8600 + }, + { + "epoch": 0.34, + "grad_norm": 5.3182172892190565, + "learning_rate": 4.3e-06, + "loss": 0.2824, + "step": 8625 + }, + { + "epoch": 0.34, + "grad_norm": 9.816379144060154, + "learning_rate": 4.312500000000001e-06, + "loss": 0.2843, + "step": 8650 + }, + { + "epoch": 0.34, + "grad_norm": 4.895548452165421, + "learning_rate": 4.325e-06, + "loss": 0.284, + "step": 8675 + }, + { + "epoch": 0.34, + "grad_norm": 6.654941433869707, + "learning_rate": 4.3375e-06, + "loss": 0.2705, + "step": 8700 + }, + { + "epoch": 0.34, + "grad_norm": 10.469132180677638, + "learning_rate": 4.350000000000001e-06, + "loss": 0.2729, + "step": 8725 + }, + { + "epoch": 0.34, + "grad_norm": 6.44234959433828, + "learning_rate": 4.362500000000001e-06, + "loss": 0.2644, + "step": 8750 + }, + { + "epoch": 0.35, + "grad_norm": 5.838324842695495, + "learning_rate": 4.3750000000000005e-06, + "loss": 0.2676, + "step": 8775 + }, + { + "epoch": 0.35, + "grad_norm": 9.085540132092943, + "learning_rate": 4.3875e-06, + "loss": 0.2869, + "step": 8800 + }, + { + "epoch": 0.35, + "grad_norm": 6.795754152011421, + "learning_rate": 4.4e-06, + "loss": 0.276, + "step": 8825 + }, + { + "epoch": 0.35, + "grad_norm": 7.102982969922736, + "learning_rate": 4.4125000000000005e-06, + "loss": 0.2595, + "step": 8850 + }, + { + "epoch": 0.35, + "grad_norm": 7.700990648549398, + "learning_rate": 4.425e-06, + "loss": 0.2812, + "step": 8875 + }, + { + "epoch": 0.35, + "grad_norm": 4.312933654551669, + "learning_rate": 4.4375e-06, + "loss": 0.2852, + "step": 8900 + }, + { + "epoch": 0.35, + "grad_norm": 8.379757647366656, + "learning_rate": 4.450000000000001e-06, + "loss": 0.2779, + "step": 8925 + }, + { + "epoch": 0.35, + "grad_norm": 6.792550628543418, + "learning_rate": 4.4625e-06, + "loss": 0.2793, + "step": 8950 + }, + { + "epoch": 0.35, + "grad_norm": 5.945063099925784, + "learning_rate": 4.475e-06, + "loss": 0.291, + "step": 8975 + }, + { + "epoch": 0.35, + "grad_norm": 6.365832685836148, + "learning_rate": 4.4875e-06, + "loss": 0.275, + "step": 9000 + }, + { + "epoch": 0.36, + "grad_norm": 6.188639752527868, + "learning_rate": 4.5e-06, + "loss": 0.3043, + "step": 9025 + }, + { + "epoch": 0.36, + "grad_norm": 5.383012165041406, + "learning_rate": 4.5125e-06, + "loss": 0.2791, + "step": 9050 + }, + { + "epoch": 0.36, + "grad_norm": 5.5648061171749745, + "learning_rate": 4.5245000000000005e-06, + "loss": 0.2812, + "step": 9075 + }, + { + "epoch": 0.36, + "grad_norm": 6.718625044735663, + "learning_rate": 4.537e-06, + "loss": 0.2817, + "step": 9100 + }, + { + "epoch": 0.36, + "grad_norm": 7.578986129922235, + "learning_rate": 4.5495e-06, + "loss": 0.3059, + "step": 9125 + }, + { + "epoch": 0.36, + "grad_norm": 4.950562990896707, + "learning_rate": 4.5620000000000005e-06, + "loss": 0.2991, + "step": 9150 + }, + { + "epoch": 0.36, + "grad_norm": 6.6274318125177105, + "learning_rate": 4.574500000000001e-06, + "loss": 0.2905, + "step": 9175 + }, + { + "epoch": 0.36, + "grad_norm": 7.868725872074689, + "learning_rate": 4.587e-06, + "loss": 0.3046, + "step": 9200 + }, + { + "epoch": 0.36, + "grad_norm": 7.514705728312666, + "learning_rate": 4.599500000000001e-06, + "loss": 0.2778, + "step": 9225 + }, + { + "epoch": 0.36, + "grad_norm": 6.37394309344846, + "learning_rate": 4.612e-06, + "loss": 0.2796, + "step": 9250 + }, + { + "epoch": 0.36, + "grad_norm": 6.710149784112031, + "learning_rate": 4.6245e-06, + "loss": 0.2921, + "step": 9275 + }, + { + "epoch": 0.37, + "grad_norm": 7.545639520621682, + "learning_rate": 4.637000000000001e-06, + "loss": 0.268, + "step": 9300 + }, + { + "epoch": 0.37, + "grad_norm": 5.688131287423445, + "learning_rate": 4.6495e-06, + "loss": 0.2524, + "step": 9325 + }, + { + "epoch": 0.37, + "grad_norm": 6.135284919662475, + "learning_rate": 4.6620000000000004e-06, + "loss": 0.276, + "step": 9350 + }, + { + "epoch": 0.37, + "grad_norm": 6.688180517230278, + "learning_rate": 4.6745e-06, + "loss": 0.2862, + "step": 9375 + }, + { + "epoch": 0.37, + "grad_norm": 6.795335976277467, + "learning_rate": 4.687e-06, + "loss": 0.2888, + "step": 9400 + }, + { + "epoch": 0.37, + "grad_norm": 5.746370462450612, + "learning_rate": 4.6995000000000005e-06, + "loss": 0.2662, + "step": 9425 + }, + { + "epoch": 0.37, + "grad_norm": 6.769522511707671, + "learning_rate": 4.712000000000001e-06, + "loss": 0.2984, + "step": 9450 + }, + { + "epoch": 0.37, + "grad_norm": 5.346844756223433, + "learning_rate": 4.7245e-06, + "loss": 0.2596, + "step": 9475 + }, + { + "epoch": 0.37, + "grad_norm": 7.3781982999908795, + "learning_rate": 4.7370000000000006e-06, + "loss": 0.2792, + "step": 9500 + }, + { + "epoch": 0.37, + "grad_norm": 6.069856946236931, + "learning_rate": 4.7495e-06, + "loss": 0.2762, + "step": 9525 + }, + { + "epoch": 0.38, + "grad_norm": 5.564725582025151, + "learning_rate": 4.762e-06, + "loss": 0.2868, + "step": 9550 + }, + { + "epoch": 0.38, + "grad_norm": 4.831607241976059, + "learning_rate": 4.774500000000001e-06, + "loss": 0.2633, + "step": 9575 + }, + { + "epoch": 0.38, + "grad_norm": 5.993570224205531, + "learning_rate": 4.787000000000001e-06, + "loss": 0.2907, + "step": 9600 + }, + { + "epoch": 0.38, + "grad_norm": 5.473582625195408, + "learning_rate": 4.7995e-06, + "loss": 0.2939, + "step": 9625 + }, + { + "epoch": 0.38, + "grad_norm": 4.8096853747215675, + "learning_rate": 4.812000000000001e-06, + "loss": 0.2694, + "step": 9650 + }, + { + "epoch": 0.38, + "grad_norm": 7.871920620924059, + "learning_rate": 4.8245e-06, + "loss": 0.2796, + "step": 9675 + }, + { + "epoch": 0.38, + "grad_norm": 5.914335893785346, + "learning_rate": 4.8370000000000004e-06, + "loss": 0.2843, + "step": 9700 + }, + { + "epoch": 0.38, + "grad_norm": 5.459462930577671, + "learning_rate": 4.8495e-06, + "loss": 0.273, + "step": 9725 + }, + { + "epoch": 0.38, + "grad_norm": 7.001699192626203, + "learning_rate": 4.862e-06, + "loss": 0.2588, + "step": 9750 + }, + { + "epoch": 0.38, + "grad_norm": 6.407971853644161, + "learning_rate": 4.8745000000000005e-06, + "loss": 0.2705, + "step": 9775 + }, + { + "epoch": 0.39, + "grad_norm": 5.359611800340205, + "learning_rate": 4.887000000000001e-06, + "loss": 0.2818, + "step": 9800 + }, + { + "epoch": 0.39, + "grad_norm": 5.385058355738006, + "learning_rate": 4.8995e-06, + "loss": 0.2821, + "step": 9825 + }, + { + "epoch": 0.39, + "grad_norm": 5.8879065529077135, + "learning_rate": 4.9120000000000006e-06, + "loss": 0.2879, + "step": 9850 + }, + { + "epoch": 0.39, + "grad_norm": 5.521614128029462, + "learning_rate": 4.9245e-06, + "loss": 0.2701, + "step": 9875 + }, + { + "epoch": 0.39, + "grad_norm": 6.22001469157038, + "learning_rate": 4.937e-06, + "loss": 0.2814, + "step": 9900 + }, + { + "epoch": 0.39, + "grad_norm": 5.645954802312944, + "learning_rate": 4.949500000000001e-06, + "loss": 0.2788, + "step": 9925 + }, + { + "epoch": 0.39, + "grad_norm": 5.975705959262324, + "learning_rate": 4.962e-06, + "loss": 0.2745, + "step": 9950 + }, + { + "epoch": 0.39, + "grad_norm": 6.7617079563479345, + "learning_rate": 4.9745e-06, + "loss": 0.2882, + "step": 9975 + }, + { + "epoch": 0.39, + "grad_norm": 6.349127933186041, + "learning_rate": 4.987e-06, + "loss": 0.2717, + "step": 10000 + }, + { + "epoch": 0.39, + "eval_loss": 0.414306640625, + "eval_runtime": 11566.661, + "eval_samples_per_second": 0.818, + "eval_steps_per_second": 0.051, + "eval_wer": 0.1341312356224764, + "step": 10000 + }, + { + "epoch": 0.39, + "grad_norm": 7.011108784934462, + "learning_rate": 4.9995e-06, + "loss": 0.2786, + "step": 10025 + }, + { + "epoch": 0.4, + "grad_norm": 6.8292619191203014, + "learning_rate": 4.999959866220736e-06, + "loss": 0.2727, + "step": 10050 + }, + { + "epoch": 0.4, + "grad_norm": 8.723216450366968, + "learning_rate": 4.999919732441472e-06, + "loss": 0.2894, + "step": 10075 + }, + { + "epoch": 0.4, + "grad_norm": 7.24048520411389, + "learning_rate": 4.999877926421406e-06, + "loss": 0.2752, + "step": 10100 + }, + { + "epoch": 0.4, + "grad_norm": 6.342763382928159, + "learning_rate": 4.999836120401338e-06, + "loss": 0.2667, + "step": 10125 + }, + { + "epoch": 0.4, + "grad_norm": 7.50752715283013, + "learning_rate": 4.999794314381271e-06, + "loss": 0.2939, + "step": 10150 + }, + { + "epoch": 0.4, + "grad_norm": 6.204906569121375, + "learning_rate": 4.9997525083612046e-06, + "loss": 0.2718, + "step": 10175 + }, + { + "epoch": 0.4, + "grad_norm": 4.752575646886806, + "learning_rate": 4.999710702341137e-06, + "loss": 0.2747, + "step": 10200 + }, + { + "epoch": 0.4, + "grad_norm": 5.796030259684187, + "learning_rate": 4.999668896321071e-06, + "loss": 0.2902, + "step": 10225 + }, + { + "epoch": 0.4, + "grad_norm": 6.053195409587687, + "learning_rate": 4.9996270903010035e-06, + "loss": 0.3108, + "step": 10250 + }, + { + "epoch": 0.4, + "grad_norm": 6.059127631079693, + "learning_rate": 4.999585284280937e-06, + "loss": 0.2775, + "step": 10275 + }, + { + "epoch": 0.41, + "grad_norm": 6.383155938148739, + "learning_rate": 4.99954347826087e-06, + "loss": 0.3066, + "step": 10300 + }, + { + "epoch": 0.41, + "grad_norm": 6.561402837665798, + "learning_rate": 4.999501672240803e-06, + "loss": 0.2798, + "step": 10325 + }, + { + "epoch": 0.41, + "grad_norm": 7.225986990831043, + "learning_rate": 4.999459866220736e-06, + "loss": 0.2957, + "step": 10350 + }, + { + "epoch": 0.41, + "grad_norm": 5.381995878211679, + "learning_rate": 4.9994180602006695e-06, + "loss": 0.2824, + "step": 10375 + }, + { + "epoch": 0.41, + "grad_norm": 6.422499531575439, + "learning_rate": 4.999376254180602e-06, + "loss": 0.2682, + "step": 10400 + }, + { + "epoch": 0.41, + "grad_norm": 6.429046461471528, + "learning_rate": 4.999334448160536e-06, + "loss": 0.2909, + "step": 10425 + }, + { + "epoch": 0.41, + "grad_norm": 5.823072159394408, + "learning_rate": 4.9992926421404685e-06, + "loss": 0.2588, + "step": 10450 + }, + { + "epoch": 0.41, + "grad_norm": 6.64555406872812, + "learning_rate": 4.999250836120402e-06, + "loss": 0.2675, + "step": 10475 + }, + { + "epoch": 0.41, + "grad_norm": 7.285451019452604, + "learning_rate": 4.999209030100335e-06, + "loss": 0.2743, + "step": 10500 + }, + { + "epoch": 0.41, + "grad_norm": 5.804120728206725, + "learning_rate": 4.999167224080268e-06, + "loss": 0.288, + "step": 10525 + }, + { + "epoch": 0.42, + "grad_norm": 5.098440922169357, + "learning_rate": 4.999125418060201e-06, + "loss": 0.272, + "step": 10550 + }, + { + "epoch": 0.42, + "grad_norm": 6.546611829047939, + "learning_rate": 4.9990836120401345e-06, + "loss": 0.2883, + "step": 10575 + }, + { + "epoch": 0.42, + "grad_norm": 5.622357797847333, + "learning_rate": 4.999041806020067e-06, + "loss": 0.2577, + "step": 10600 + }, + { + "epoch": 0.42, + "grad_norm": 6.001323909432522, + "learning_rate": 4.999000000000001e-06, + "loss": 0.2925, + "step": 10625 + }, + { + "epoch": 0.42, + "grad_norm": 6.211276512696046, + "learning_rate": 4.9989581939799335e-06, + "loss": 0.2711, + "step": 10650 + }, + { + "epoch": 0.42, + "grad_norm": 6.565052395271628, + "learning_rate": 4.998916387959867e-06, + "loss": 0.2988, + "step": 10675 + }, + { + "epoch": 0.42, + "grad_norm": 6.296027910196759, + "learning_rate": 4.9988745819398e-06, + "loss": 0.2773, + "step": 10700 + }, + { + "epoch": 0.42, + "grad_norm": 6.344517029275454, + "learning_rate": 4.998832775919733e-06, + "loss": 0.257, + "step": 10725 + }, + { + "epoch": 0.42, + "grad_norm": 6.746065840605949, + "learning_rate": 4.998790969899666e-06, + "loss": 0.2832, + "step": 10750 + }, + { + "epoch": 0.42, + "grad_norm": 6.779174959859481, + "learning_rate": 4.9987491638795995e-06, + "loss": 0.281, + "step": 10775 + }, + { + "epoch": 0.42, + "grad_norm": 6.109047891868748, + "learning_rate": 4.998707357859532e-06, + "loss": 0.2972, + "step": 10800 + }, + { + "epoch": 0.43, + "grad_norm": 5.426495294412173, + "learning_rate": 4.998665551839466e-06, + "loss": 0.2689, + "step": 10825 + }, + { + "epoch": 0.43, + "grad_norm": 8.24421870740098, + "learning_rate": 4.9986237458193984e-06, + "loss": 0.2659, + "step": 10850 + }, + { + "epoch": 0.43, + "grad_norm": 7.081586642876751, + "learning_rate": 4.998581939799331e-06, + "loss": 0.2672, + "step": 10875 + }, + { + "epoch": 0.43, + "grad_norm": 5.126266886484015, + "learning_rate": 4.998540133779265e-06, + "loss": 0.2675, + "step": 10900 + }, + { + "epoch": 0.43, + "grad_norm": 5.726554912145362, + "learning_rate": 4.998498327759197e-06, + "loss": 0.2881, + "step": 10925 + }, + { + "epoch": 0.43, + "grad_norm": 5.749427772911198, + "learning_rate": 4.998456521739131e-06, + "loss": 0.297, + "step": 10950 + }, + { + "epoch": 0.43, + "grad_norm": 5.598962835292076, + "learning_rate": 4.998414715719064e-06, + "loss": 0.2978, + "step": 10975 + }, + { + "epoch": 0.43, + "grad_norm": 6.200710604353594, + "learning_rate": 4.998372909698997e-06, + "loss": 0.2692, + "step": 11000 + }, + { + "epoch": 0.43, + "grad_norm": 6.702673268567435, + "learning_rate": 4.99833110367893e-06, + "loss": 0.2944, + "step": 11025 + }, + { + "epoch": 0.43, + "grad_norm": 6.625028895002169, + "learning_rate": 4.998289297658863e-06, + "loss": 0.2885, + "step": 11050 + }, + { + "epoch": 0.44, + "grad_norm": 7.820444832366389, + "learning_rate": 4.998249163879598e-06, + "loss": 0.2942, + "step": 11075 + }, + { + "epoch": 0.44, + "grad_norm": 5.567604834669745, + "learning_rate": 4.998207357859532e-06, + "loss": 0.2742, + "step": 11100 + }, + { + "epoch": 0.44, + "grad_norm": 6.423735455535033, + "learning_rate": 4.998165551839465e-06, + "loss": 0.2846, + "step": 11125 + }, + { + "epoch": 0.44, + "grad_norm": 4.999616850642319, + "learning_rate": 4.998123745819398e-06, + "loss": 0.2782, + "step": 11150 + }, + { + "epoch": 0.44, + "grad_norm": 7.030451571678514, + "learning_rate": 4.998081939799332e-06, + "loss": 0.3123, + "step": 11175 + }, + { + "epoch": 0.44, + "grad_norm": 7.9115247740062875, + "learning_rate": 4.9980401337792644e-06, + "loss": 0.2536, + "step": 11200 + }, + { + "epoch": 0.44, + "grad_norm": 6.072557616156972, + "learning_rate": 4.997998327759198e-06, + "loss": 0.2915, + "step": 11225 + }, + { + "epoch": 0.44, + "grad_norm": 6.711391798767144, + "learning_rate": 4.997956521739131e-06, + "loss": 0.2736, + "step": 11250 + }, + { + "epoch": 0.44, + "grad_norm": 5.645455721566767, + "learning_rate": 4.997914715719064e-06, + "loss": 0.288, + "step": 11275 + }, + { + "epoch": 0.44, + "grad_norm": 7.186122138761158, + "learning_rate": 4.997872909698997e-06, + "loss": 0.2697, + "step": 11300 + }, + { + "epoch": 0.45, + "grad_norm": 9.00329145846993, + "learning_rate": 4.9978311036789305e-06, + "loss": 0.2924, + "step": 11325 + }, + { + "epoch": 0.45, + "grad_norm": 6.312139760803962, + "learning_rate": 4.997789297658863e-06, + "loss": 0.2756, + "step": 11350 + }, + { + "epoch": 0.45, + "grad_norm": 5.8874897753934965, + "learning_rate": 4.997747491638797e-06, + "loss": 0.2703, + "step": 11375 + }, + { + "epoch": 0.45, + "grad_norm": 8.211561229810027, + "learning_rate": 4.997705685618729e-06, + "loss": 0.2724, + "step": 11400 + }, + { + "epoch": 0.45, + "grad_norm": 6.444390049063497, + "learning_rate": 4.997663879598663e-06, + "loss": 0.2877, + "step": 11425 + }, + { + "epoch": 0.45, + "grad_norm": 6.170573267490551, + "learning_rate": 4.997622073578596e-06, + "loss": 0.259, + "step": 11450 + }, + { + "epoch": 0.45, + "grad_norm": 4.632322155697068, + "learning_rate": 4.997580267558529e-06, + "loss": 0.2823, + "step": 11475 + }, + { + "epoch": 0.45, + "grad_norm": 6.152734308462785, + "learning_rate": 4.997538461538462e-06, + "loss": 0.2607, + "step": 11500 + }, + { + "epoch": 0.45, + "grad_norm": 6.107228957537137, + "learning_rate": 4.9974966555183954e-06, + "loss": 0.2746, + "step": 11525 + }, + { + "epoch": 0.45, + "grad_norm": 6.472917184920453, + "learning_rate": 4.997454849498328e-06, + "loss": 0.2728, + "step": 11550 + }, + { + "epoch": 0.46, + "grad_norm": 5.83478467904063, + "learning_rate": 4.997413043478262e-06, + "loss": 0.2751, + "step": 11575 + }, + { + "epoch": 0.46, + "grad_norm": 4.279889559058508, + "learning_rate": 4.997371237458194e-06, + "loss": 0.2659, + "step": 11600 + }, + { + "epoch": 0.46, + "grad_norm": 7.228962539059828, + "learning_rate": 4.997329431438128e-06, + "loss": 0.2946, + "step": 11625 + }, + { + "epoch": 0.46, + "grad_norm": 6.329870520493551, + "learning_rate": 4.997287625418061e-06, + "loss": 0.2752, + "step": 11650 + }, + { + "epoch": 0.46, + "grad_norm": 6.644595157995969, + "learning_rate": 4.997245819397994e-06, + "loss": 0.2782, + "step": 11675 + }, + { + "epoch": 0.46, + "grad_norm": 7.922993574833506, + "learning_rate": 4.997204013377927e-06, + "loss": 0.277, + "step": 11700 + }, + { + "epoch": 0.46, + "grad_norm": 5.510974001369662, + "learning_rate": 4.99716220735786e-06, + "loss": 0.2752, + "step": 11725 + }, + { + "epoch": 0.46, + "grad_norm": 4.9210546984842365, + "learning_rate": 4.997120401337793e-06, + "loss": 0.2634, + "step": 11750 + }, + { + "epoch": 0.46, + "grad_norm": 6.079776486981066, + "learning_rate": 4.997078595317727e-06, + "loss": 0.2647, + "step": 11775 + }, + { + "epoch": 0.46, + "grad_norm": 6.305360379026096, + "learning_rate": 4.997036789297659e-06, + "loss": 0.2988, + "step": 11800 + }, + { + "epoch": 0.47, + "grad_norm": 7.735797578726933, + "learning_rate": 4.996994983277592e-06, + "loss": 0.2484, + "step": 11825 + }, + { + "epoch": 0.47, + "grad_norm": 7.43170191366388, + "learning_rate": 4.996953177257526e-06, + "loss": 0.2619, + "step": 11850 + }, + { + "epoch": 0.47, + "grad_norm": 6.402374846991611, + "learning_rate": 4.996911371237458e-06, + "loss": 0.2746, + "step": 11875 + }, + { + "epoch": 0.47, + "grad_norm": 6.531743806890973, + "learning_rate": 4.996869565217392e-06, + "loss": 0.2665, + "step": 11900 + }, + { + "epoch": 0.47, + "grad_norm": 4.839865759331419, + "learning_rate": 4.9968277591973245e-06, + "loss": 0.2702, + "step": 11925 + }, + { + "epoch": 0.47, + "grad_norm": 5.825681200003497, + "learning_rate": 4.996785953177258e-06, + "loss": 0.2531, + "step": 11950 + }, + { + "epoch": 0.47, + "grad_norm": 7.125879741698381, + "learning_rate": 4.996744147157191e-06, + "loss": 0.2704, + "step": 11975 + }, + { + "epoch": 0.47, + "grad_norm": 5.663308145081337, + "learning_rate": 4.996702341137124e-06, + "loss": 0.2653, + "step": 12000 + }, + { + "epoch": 0.47, + "grad_norm": 6.184543925229218, + "learning_rate": 4.996660535117057e-06, + "loss": 0.2786, + "step": 12025 + }, + { + "epoch": 0.47, + "grad_norm": 7.865850646784692, + "learning_rate": 4.996618729096991e-06, + "loss": 0.2781, + "step": 12050 + }, + { + "epoch": 0.48, + "grad_norm": 6.13013469457717, + "learning_rate": 4.9965785953177256e-06, + "loss": 0.2725, + "step": 12075 + }, + { + "epoch": 0.48, + "grad_norm": 5.6434695533203305, + "learning_rate": 4.996536789297659e-06, + "loss": 0.2852, + "step": 12100 + }, + { + "epoch": 0.48, + "grad_norm": 6.9799882279278265, + "learning_rate": 4.996494983277592e-06, + "loss": 0.2699, + "step": 12125 + }, + { + "epoch": 0.48, + "grad_norm": 6.060616926983353, + "learning_rate": 4.996453177257525e-06, + "loss": 0.2752, + "step": 12150 + }, + { + "epoch": 0.48, + "grad_norm": 4.746693589734832, + "learning_rate": 4.996411371237458e-06, + "loss": 0.2624, + "step": 12175 + }, + { + "epoch": 0.48, + "grad_norm": 6.229583616728782, + "learning_rate": 4.996369565217392e-06, + "loss": 0.2515, + "step": 12200 + }, + { + "epoch": 0.48, + "grad_norm": 4.999145743899038, + "learning_rate": 4.996327759197324e-06, + "loss": 0.2775, + "step": 12225 + }, + { + "epoch": 0.48, + "grad_norm": 6.927319188266453, + "learning_rate": 4.996285953177258e-06, + "loss": 0.2846, + "step": 12250 + }, + { + "epoch": 0.48, + "grad_norm": 6.874400599413486, + "learning_rate": 4.996244147157191e-06, + "loss": 0.2758, + "step": 12275 + }, + { + "epoch": 0.48, + "grad_norm": 6.024826067369364, + "learning_rate": 4.996202341137124e-06, + "loss": 0.2733, + "step": 12300 + }, + { + "epoch": 0.48, + "grad_norm": 5.724310991701934, + "learning_rate": 4.996160535117058e-06, + "loss": 0.2661, + "step": 12325 + }, + { + "epoch": 0.49, + "grad_norm": 6.190847168134574, + "learning_rate": 4.99611872909699e-06, + "loss": 0.3009, + "step": 12350 + }, + { + "epoch": 0.49, + "grad_norm": 5.981795774000632, + "learning_rate": 4.996076923076924e-06, + "loss": 0.2714, + "step": 12375 + }, + { + "epoch": 0.49, + "grad_norm": 5.982768726819535, + "learning_rate": 4.9960351170568566e-06, + "loss": 0.2452, + "step": 12400 + }, + { + "epoch": 0.49, + "grad_norm": 6.30852279239966, + "learning_rate": 4.99599331103679e-06, + "loss": 0.2584, + "step": 12425 + }, + { + "epoch": 0.49, + "grad_norm": 6.858438111584143, + "learning_rate": 4.995951505016723e-06, + "loss": 0.2505, + "step": 12450 + }, + { + "epoch": 0.49, + "grad_norm": 6.174552548362399, + "learning_rate": 4.995909698996656e-06, + "loss": 0.2532, + "step": 12475 + }, + { + "epoch": 0.49, + "grad_norm": 7.0465648379078125, + "learning_rate": 4.995867892976589e-06, + "loss": 0.265, + "step": 12500 + }, + { + "epoch": 0.49, + "grad_norm": 6.988153680687835, + "learning_rate": 4.995826086956523e-06, + "loss": 0.2432, + "step": 12525 + }, + { + "epoch": 0.49, + "grad_norm": 6.264922296051626, + "learning_rate": 4.995784280936455e-06, + "loss": 0.2657, + "step": 12550 + }, + { + "epoch": 0.49, + "grad_norm": 6.449845098914649, + "learning_rate": 4.995742474916389e-06, + "loss": 0.2708, + "step": 12575 + }, + { + "epoch": 0.5, + "grad_norm": 7.009503417055907, + "learning_rate": 4.995702341137124e-06, + "loss": 0.2522, + "step": 12600 + }, + { + "epoch": 0.5, + "grad_norm": 5.1804236684247345, + "learning_rate": 4.995660535117057e-06, + "loss": 0.2539, + "step": 12625 + }, + { + "epoch": 0.5, + "grad_norm": 5.543935093810235, + "learning_rate": 4.99561872909699e-06, + "loss": 0.2672, + "step": 12650 + }, + { + "epoch": 0.5, + "grad_norm": 6.471815945203869, + "learning_rate": 4.995576923076924e-06, + "loss": 0.2952, + "step": 12675 + }, + { + "epoch": 0.5, + "grad_norm": 6.501833406890489, + "learning_rate": 4.995535117056856e-06, + "loss": 0.2667, + "step": 12700 + }, + { + "epoch": 0.5, + "grad_norm": 6.343214654642585, + "learning_rate": 4.99549331103679e-06, + "loss": 0.2872, + "step": 12725 + }, + { + "epoch": 0.5, + "grad_norm": 5.271797540434532, + "learning_rate": 4.9954515050167226e-06, + "loss": 0.2683, + "step": 12750 + }, + { + "epoch": 0.5, + "grad_norm": 6.599139231991323, + "learning_rate": 4.995409698996656e-06, + "loss": 0.263, + "step": 12775 + }, + { + "epoch": 0.5, + "grad_norm": 6.002476348463217, + "learning_rate": 4.995367892976589e-06, + "loss": 0.2783, + "step": 12800 + }, + { + "epoch": 0.5, + "grad_norm": 5.525297588148386, + "learning_rate": 4.995326086956522e-06, + "loss": 0.2871, + "step": 12825 + }, + { + "epoch": 0.51, + "grad_norm": 5.708477915446077, + "learning_rate": 4.995284280936455e-06, + "loss": 0.2691, + "step": 12850 + }, + { + "epoch": 0.51, + "grad_norm": 6.6843516447099445, + "learning_rate": 4.995242474916389e-06, + "loss": 0.2871, + "step": 12875 + }, + { + "epoch": 0.51, + "grad_norm": 5.115815653336138, + "learning_rate": 4.995200668896321e-06, + "loss": 0.2425, + "step": 12900 + }, + { + "epoch": 0.51, + "grad_norm": 6.516585844765604, + "learning_rate": 4.995158862876255e-06, + "loss": 0.2576, + "step": 12925 + }, + { + "epoch": 0.51, + "grad_norm": 5.532827849171224, + "learning_rate": 4.995117056856188e-06, + "loss": 0.2921, + "step": 12950 + }, + { + "epoch": 0.51, + "grad_norm": 5.0067926091682144, + "learning_rate": 4.99507525083612e-06, + "loss": 0.2559, + "step": 12975 + }, + { + "epoch": 0.51, + "grad_norm": 5.750130129977456, + "learning_rate": 4.995033444816054e-06, + "loss": 0.264, + "step": 13000 + }, + { + "epoch": 0.51, + "grad_norm": 5.321056688567065, + "learning_rate": 4.9949916387959865e-06, + "loss": 0.2627, + "step": 13025 + }, + { + "epoch": 0.51, + "grad_norm": 6.063866279377709, + "learning_rate": 4.99494983277592e-06, + "loss": 0.2723, + "step": 13050 + }, + { + "epoch": 0.51, + "grad_norm": 5.685284661380941, + "learning_rate": 4.994908026755853e-06, + "loss": 0.2791, + "step": 13075 + }, + { + "epoch": 0.52, + "grad_norm": 5.645607817332848, + "learning_rate": 4.994866220735786e-06, + "loss": 0.2569, + "step": 13100 + }, + { + "epoch": 0.52, + "grad_norm": 5.818177466809242, + "learning_rate": 4.994824414715719e-06, + "loss": 0.2652, + "step": 13125 + }, + { + "epoch": 0.52, + "grad_norm": 5.567556521092891, + "learning_rate": 4.9947826086956525e-06, + "loss": 0.2722, + "step": 13150 + }, + { + "epoch": 0.52, + "grad_norm": 6.049853304178241, + "learning_rate": 4.994740802675585e-06, + "loss": 0.2521, + "step": 13175 + }, + { + "epoch": 0.52, + "grad_norm": 5.525962556248073, + "learning_rate": 4.994698996655519e-06, + "loss": 0.2782, + "step": 13200 + }, + { + "epoch": 0.52, + "grad_norm": 5.2979035414331035, + "learning_rate": 4.9946571906354515e-06, + "loss": 0.2672, + "step": 13225 + }, + { + "epoch": 0.52, + "grad_norm": 5.494928513364084, + "learning_rate": 4.994615384615385e-06, + "loss": 0.2705, + "step": 13250 + }, + { + "epoch": 0.52, + "grad_norm": 6.084597225504904, + "learning_rate": 4.994573578595318e-06, + "loss": 0.2883, + "step": 13275 + }, + { + "epoch": 0.52, + "grad_norm": 4.072120776961249, + "learning_rate": 4.994531772575251e-06, + "loss": 0.2588, + "step": 13300 + }, + { + "epoch": 0.52, + "grad_norm": 6.741904082206531, + "learning_rate": 4.994489966555184e-06, + "loss": 0.2687, + "step": 13325 + }, + { + "epoch": 0.53, + "grad_norm": 6.687864286901733, + "learning_rate": 4.9944481605351175e-06, + "loss": 0.2618, + "step": 13350 + }, + { + "epoch": 0.53, + "grad_norm": 6.279456079341556, + "learning_rate": 4.994406354515051e-06, + "loss": 0.2788, + "step": 13375 + }, + { + "epoch": 0.53, + "grad_norm": 4.034957788594076, + "learning_rate": 4.994364548494984e-06, + "loss": 0.2425, + "step": 13400 + }, + { + "epoch": 0.53, + "grad_norm": 5.552783387380151, + "learning_rate": 4.994322742474917e-06, + "loss": 0.251, + "step": 13425 + }, + { + "epoch": 0.53, + "grad_norm": 7.177948746144479, + "learning_rate": 4.99428093645485e-06, + "loss": 0.2782, + "step": 13450 + }, + { + "epoch": 0.53, + "grad_norm": 5.068512146023676, + "learning_rate": 4.9942391304347835e-06, + "loss": 0.2608, + "step": 13475 + }, + { + "epoch": 0.53, + "grad_norm": 5.744283896920329, + "learning_rate": 4.994197324414716e-06, + "loss": 0.2564, + "step": 13500 + }, + { + "epoch": 0.53, + "grad_norm": 7.126944376370206, + "learning_rate": 4.99415551839465e-06, + "loss": 0.2673, + "step": 13525 + }, + { + "epoch": 0.53, + "grad_norm": 4.907130426240397, + "learning_rate": 4.9941137123745825e-06, + "loss": 0.2547, + "step": 13550 + }, + { + "epoch": 0.53, + "grad_norm": 5.4639597192774865, + "learning_rate": 4.994071906354516e-06, + "loss": 0.2795, + "step": 13575 + }, + { + "epoch": 0.54, + "grad_norm": 3.672162745544116, + "learning_rate": 4.994030100334449e-06, + "loss": 0.289, + "step": 13600 + }, + { + "epoch": 0.54, + "grad_norm": 6.72396760446355, + "learning_rate": 4.993988294314382e-06, + "loss": 0.2711, + "step": 13625 + }, + { + "epoch": 0.54, + "grad_norm": 6.555801546793623, + "learning_rate": 4.993946488294315e-06, + "loss": 0.2666, + "step": 13650 + }, + { + "epoch": 0.54, + "grad_norm": 3.9331030728145016, + "learning_rate": 4.993904682274248e-06, + "loss": 0.243, + "step": 13675 + }, + { + "epoch": 0.54, + "grad_norm": 5.11331974297062, + "learning_rate": 4.99386287625418e-06, + "loss": 0.2442, + "step": 13700 + }, + { + "epoch": 0.54, + "grad_norm": 8.512882588773397, + "learning_rate": 4.993821070234114e-06, + "loss": 0.2622, + "step": 13725 + }, + { + "epoch": 0.54, + "grad_norm": 6.5372514495460905, + "learning_rate": 4.993779264214047e-06, + "loss": 0.299, + "step": 13750 + }, + { + "epoch": 0.54, + "grad_norm": 5.888509838266094, + "learning_rate": 4.99373745819398e-06, + "loss": 0.2575, + "step": 13775 + }, + { + "epoch": 0.54, + "grad_norm": 5.748557291602811, + "learning_rate": 4.993695652173914e-06, + "loss": 0.2668, + "step": 13800 + }, + { + "epoch": 0.54, + "grad_norm": 5.6182807926195215, + "learning_rate": 4.993653846153846e-06, + "loss": 0.2569, + "step": 13825 + }, + { + "epoch": 0.54, + "grad_norm": 6.067375657021155, + "learning_rate": 4.99361204013378e-06, + "loss": 0.2417, + "step": 13850 + }, + { + "epoch": 0.55, + "grad_norm": 7.0814238478732365, + "learning_rate": 4.993570234113713e-06, + "loss": 0.284, + "step": 13875 + }, + { + "epoch": 0.55, + "grad_norm": 5.849375990146647, + "learning_rate": 4.993528428093646e-06, + "loss": 0.2586, + "step": 13900 + }, + { + "epoch": 0.55, + "grad_norm": 6.420173287461275, + "learning_rate": 4.993486622073579e-06, + "loss": 0.2685, + "step": 13925 + }, + { + "epoch": 0.55, + "grad_norm": 6.510360107745434, + "learning_rate": 4.9934448160535124e-06, + "loss": 0.2622, + "step": 13950 + }, + { + "epoch": 0.55, + "grad_norm": 5.744495569021254, + "learning_rate": 4.993403010033445e-06, + "loss": 0.2654, + "step": 13975 + }, + { + "epoch": 0.55, + "grad_norm": 5.241624718908685, + "learning_rate": 4.993361204013379e-06, + "loss": 0.2575, + "step": 14000 + }, + { + "epoch": 0.55, + "grad_norm": 5.964854463517818, + "learning_rate": 4.993319397993311e-06, + "loss": 0.2601, + "step": 14025 + }, + { + "epoch": 0.55, + "grad_norm": 5.98667622104724, + "learning_rate": 4.993277591973245e-06, + "loss": 0.2631, + "step": 14050 + }, + { + "epoch": 0.55, + "grad_norm": 6.991177855529151, + "learning_rate": 4.993235785953178e-06, + "loss": 0.2711, + "step": 14075 + }, + { + "epoch": 0.55, + "grad_norm": 5.205578681842165, + "learning_rate": 4.993193979933111e-06, + "loss": 0.3043, + "step": 14100 + }, + { + "epoch": 0.56, + "grad_norm": 5.597102209370411, + "learning_rate": 4.993152173913044e-06, + "loss": 0.292, + "step": 14125 + }, + { + "epoch": 0.56, + "grad_norm": 4.747292937249573, + "learning_rate": 4.993110367892977e-06, + "loss": 0.2496, + "step": 14150 + }, + { + "epoch": 0.56, + "grad_norm": 5.567122798131745, + "learning_rate": 4.99306856187291e-06, + "loss": 0.2898, + "step": 14175 + }, + { + "epoch": 0.56, + "grad_norm": 5.425348552599907, + "learning_rate": 4.993026755852844e-06, + "loss": 0.2675, + "step": 14200 + }, + { + "epoch": 0.56, + "grad_norm": 5.923945998272752, + "learning_rate": 4.992984949832776e-06, + "loss": 0.2693, + "step": 14225 + }, + { + "epoch": 0.56, + "grad_norm": 6.085703696247624, + "learning_rate": 4.99294314381271e-06, + "loss": 0.2661, + "step": 14250 + }, + { + "epoch": 0.56, + "grad_norm": 5.73816088361352, + "learning_rate": 4.992901337792643e-06, + "loss": 0.2694, + "step": 14275 + }, + { + "epoch": 0.56, + "grad_norm": 6.809164188200779, + "learning_rate": 4.992859531772576e-06, + "loss": 0.2719, + "step": 14300 + }, + { + "epoch": 0.56, + "grad_norm": 6.3598512977073, + "learning_rate": 4.992817725752509e-06, + "loss": 0.2774, + "step": 14325 + }, + { + "epoch": 0.56, + "grad_norm": 5.216046308562436, + "learning_rate": 4.992775919732442e-06, + "loss": 0.247, + "step": 14350 + }, + { + "epoch": 0.57, + "grad_norm": 4.329199858263562, + "learning_rate": 4.992734113712375e-06, + "loss": 0.2868, + "step": 14375 + }, + { + "epoch": 0.57, + "grad_norm": 5.210460931264835, + "learning_rate": 4.992692307692308e-06, + "loss": 0.2683, + "step": 14400 + }, + { + "epoch": 0.57, + "grad_norm": 5.7667612517073765, + "learning_rate": 4.992650501672241e-06, + "loss": 0.2535, + "step": 14425 + }, + { + "epoch": 0.57, + "grad_norm": 6.9612302730938405, + "learning_rate": 4.992608695652174e-06, + "loss": 0.2529, + "step": 14450 + }, + { + "epoch": 0.57, + "grad_norm": 6.596575617259898, + "learning_rate": 4.9925668896321076e-06, + "loss": 0.253, + "step": 14475 + }, + { + "epoch": 0.57, + "grad_norm": 4.62301372614655, + "learning_rate": 4.99252508361204e-06, + "loss": 0.2504, + "step": 14500 + }, + { + "epoch": 0.57, + "grad_norm": 5.950147967461413, + "learning_rate": 4.992483277591974e-06, + "loss": 0.2682, + "step": 14525 + }, + { + "epoch": 0.57, + "grad_norm": 6.881464976782473, + "learning_rate": 4.9924414715719065e-06, + "loss": 0.2473, + "step": 14550 + }, + { + "epoch": 0.57, + "grad_norm": 6.705896744318513, + "learning_rate": 4.99239966555184e-06, + "loss": 0.2468, + "step": 14575 + }, + { + "epoch": 0.57, + "grad_norm": 3.6827213444842526, + "learning_rate": 4.992357859531773e-06, + "loss": 0.3114, + "step": 14600 + }, + { + "epoch": 0.58, + "grad_norm": 6.249966633309746, + "learning_rate": 4.992317725752509e-06, + "loss": 0.2633, + "step": 14625 + }, + { + "epoch": 0.58, + "grad_norm": 5.42278315835385, + "learning_rate": 4.992275919732441e-06, + "loss": 0.2854, + "step": 14650 + }, + { + "epoch": 0.58, + "grad_norm": 5.655143231460296, + "learning_rate": 4.992234113712375e-06, + "loss": 0.2655, + "step": 14675 + }, + { + "epoch": 0.58, + "grad_norm": 7.858506902274973, + "learning_rate": 4.9921923076923075e-06, + "loss": 0.2913, + "step": 14700 + }, + { + "epoch": 0.58, + "grad_norm": 5.606517868907838, + "learning_rate": 4.992150501672241e-06, + "loss": 0.2562, + "step": 14725 + }, + { + "epoch": 0.58, + "grad_norm": 7.8892802718869275, + "learning_rate": 4.992108695652174e-06, + "loss": 0.2653, + "step": 14750 + }, + { + "epoch": 0.58, + "grad_norm": 6.395026098564766, + "learning_rate": 4.992066889632107e-06, + "loss": 0.2717, + "step": 14775 + }, + { + "epoch": 0.58, + "grad_norm": 6.043189975643226, + "learning_rate": 4.99202508361204e-06, + "loss": 0.2587, + "step": 14800 + }, + { + "epoch": 0.58, + "grad_norm": 7.36609725720899, + "learning_rate": 4.9919832775919736e-06, + "loss": 0.272, + "step": 14825 + }, + { + "epoch": 0.58, + "grad_norm": 4.645686026598199, + "learning_rate": 4.991941471571906e-06, + "loss": 0.2333, + "step": 14850 + }, + { + "epoch": 0.59, + "grad_norm": 5.55507184083137, + "learning_rate": 4.99189966555184e-06, + "loss": 0.2509, + "step": 14875 + }, + { + "epoch": 0.59, + "grad_norm": 5.345846578476869, + "learning_rate": 4.991857859531773e-06, + "loss": 0.2613, + "step": 14900 + }, + { + "epoch": 0.59, + "grad_norm": 5.935348438197945, + "learning_rate": 4.991816053511706e-06, + "loss": 0.2816, + "step": 14925 + }, + { + "epoch": 0.59, + "grad_norm": 5.803960018432536, + "learning_rate": 4.99177424749164e-06, + "loss": 0.2548, + "step": 14950 + }, + { + "epoch": 0.59, + "grad_norm": 4.42912623970475, + "learning_rate": 4.991732441471572e-06, + "loss": 0.2529, + "step": 14975 + }, + { + "epoch": 0.59, + "grad_norm": 4.975233188396705, + "learning_rate": 4.991690635451506e-06, + "loss": 0.2881, + "step": 15000 + }, + { + "epoch": 0.59, + "grad_norm": 5.22507074373668, + "learning_rate": 4.9916488294314385e-06, + "loss": 0.2552, + "step": 15025 + }, + { + "epoch": 0.59, + "grad_norm": 6.14001014159123, + "learning_rate": 4.991607023411372e-06, + "loss": 0.2684, + "step": 15050 + }, + { + "epoch": 0.59, + "grad_norm": 6.034722792372274, + "learning_rate": 4.991565217391305e-06, + "loss": 0.2564, + "step": 15075 + }, + { + "epoch": 0.59, + "grad_norm": 7.342705074428043, + "learning_rate": 4.991523411371238e-06, + "loss": 0.2542, + "step": 15100 + }, + { + "epoch": 0.6, + "grad_norm": 5.154437791807671, + "learning_rate": 4.991481605351171e-06, + "loss": 0.2728, + "step": 15125 + }, + { + "epoch": 0.6, + "grad_norm": 4.466502948123444, + "learning_rate": 4.9914397993311046e-06, + "loss": 0.2759, + "step": 15150 + }, + { + "epoch": 0.6, + "grad_norm": 5.730270409539509, + "learning_rate": 4.991397993311037e-06, + "loss": 0.2671, + "step": 15175 + }, + { + "epoch": 0.6, + "grad_norm": 4.13478451695111, + "learning_rate": 4.991356187290971e-06, + "loss": 0.2639, + "step": 15200 + }, + { + "epoch": 0.6, + "grad_norm": 6.63611014055058, + "learning_rate": 4.9913143812709035e-06, + "loss": 0.2511, + "step": 15225 + }, + { + "epoch": 0.6, + "grad_norm": 6.486679281838134, + "learning_rate": 4.991272575250837e-06, + "loss": 0.2615, + "step": 15250 + }, + { + "epoch": 0.6, + "grad_norm": 6.609027754294971, + "learning_rate": 4.99123076923077e-06, + "loss": 0.2521, + "step": 15275 + }, + { + "epoch": 0.6, + "grad_norm": 7.300744039135444, + "learning_rate": 4.991188963210703e-06, + "loss": 0.2734, + "step": 15300 + }, + { + "epoch": 0.6, + "grad_norm": 5.302779818219576, + "learning_rate": 4.991147157190635e-06, + "loss": 0.2771, + "step": 15325 + }, + { + "epoch": 0.6, + "grad_norm": 6.909444498848482, + "learning_rate": 4.991105351170569e-06, + "loss": 0.2675, + "step": 15350 + }, + { + "epoch": 0.6, + "grad_norm": 5.43122899967809, + "learning_rate": 4.991063545150502e-06, + "loss": 0.2618, + "step": 15375 + }, + { + "epoch": 0.61, + "grad_norm": 6.228246630467435, + "learning_rate": 4.991021739130435e-06, + "loss": 0.2687, + "step": 15400 + }, + { + "epoch": 0.61, + "grad_norm": 6.064148750557926, + "learning_rate": 4.9909799331103685e-06, + "loss": 0.2428, + "step": 15425 + }, + { + "epoch": 0.61, + "grad_norm": 5.255096086616142, + "learning_rate": 4.990938127090301e-06, + "loss": 0.2667, + "step": 15450 + }, + { + "epoch": 0.61, + "grad_norm": 7.709327247522713, + "learning_rate": 4.990896321070235e-06, + "loss": 0.2732, + "step": 15475 + }, + { + "epoch": 0.61, + "grad_norm": 6.184508161358503, + "learning_rate": 4.9908545150501674e-06, + "loss": 0.2719, + "step": 15500 + }, + { + "epoch": 0.61, + "grad_norm": 4.922987038531951, + "learning_rate": 4.990812709030101e-06, + "loss": 0.2797, + "step": 15525 + }, + { + "epoch": 0.61, + "grad_norm": 5.626376622087191, + "learning_rate": 4.990770903010034e-06, + "loss": 0.2479, + "step": 15550 + }, + { + "epoch": 0.61, + "grad_norm": 5.514060439157896, + "learning_rate": 4.990729096989967e-06, + "loss": 0.2405, + "step": 15575 + }, + { + "epoch": 0.61, + "grad_norm": 5.373488145308544, + "learning_rate": 4.9906872909699e-06, + "loss": 0.248, + "step": 15600 + }, + { + "epoch": 0.61, + "grad_norm": 6.741275477526847, + "learning_rate": 4.990647157190636e-06, + "loss": 0.253, + "step": 15625 + }, + { + "epoch": 0.62, + "grad_norm": 6.095305164848746, + "learning_rate": 4.9906053511705685e-06, + "loss": 0.2635, + "step": 15650 + }, + { + "epoch": 0.62, + "grad_norm": 6.587285383976124, + "learning_rate": 4.990563545150502e-06, + "loss": 0.2713, + "step": 15675 + }, + { + "epoch": 0.62, + "grad_norm": 5.9812853760264115, + "learning_rate": 4.990521739130435e-06, + "loss": 0.2796, + "step": 15700 + }, + { + "epoch": 0.62, + "grad_norm": 6.944806936412515, + "learning_rate": 4.990479933110368e-06, + "loss": 0.2599, + "step": 15725 + }, + { + "epoch": 0.62, + "grad_norm": 6.171130869044126, + "learning_rate": 4.990438127090301e-06, + "loss": 0.2489, + "step": 15750 + }, + { + "epoch": 0.62, + "grad_norm": 6.206236751191819, + "learning_rate": 4.9903963210702345e-06, + "loss": 0.2717, + "step": 15775 + }, + { + "epoch": 0.62, + "grad_norm": 6.974957589959287, + "learning_rate": 4.990354515050167e-06, + "loss": 0.2632, + "step": 15800 + }, + { + "epoch": 0.62, + "grad_norm": 4.464737013981678, + "learning_rate": 4.990312709030101e-06, + "loss": 0.2562, + "step": 15825 + }, + { + "epoch": 0.62, + "grad_norm": 7.039455268506516, + "learning_rate": 4.9902709030100334e-06, + "loss": 0.2589, + "step": 15850 + }, + { + "epoch": 0.62, + "grad_norm": 6.93453838570339, + "learning_rate": 4.990229096989967e-06, + "loss": 0.254, + "step": 15875 + }, + { + "epoch": 0.63, + "grad_norm": 7.429570899110624, + "learning_rate": 4.9901872909699e-06, + "loss": 0.2755, + "step": 15900 + }, + { + "epoch": 0.63, + "grad_norm": 6.473116184036116, + "learning_rate": 4.990145484949833e-06, + "loss": 0.255, + "step": 15925 + }, + { + "epoch": 0.63, + "grad_norm": 5.245501608124624, + "learning_rate": 4.990103678929766e-06, + "loss": 0.2323, + "step": 15950 + }, + { + "epoch": 0.63, + "grad_norm": 5.1289839856825346, + "learning_rate": 4.9900618729096995e-06, + "loss": 0.2623, + "step": 15975 + }, + { + "epoch": 0.63, + "grad_norm": 4.717814516086991, + "learning_rate": 4.990020066889632e-06, + "loss": 0.2414, + "step": 16000 + }, + { + "epoch": 0.63, + "grad_norm": 5.751851025204562, + "learning_rate": 4.989978260869566e-06, + "loss": 0.2621, + "step": 16025 + }, + { + "epoch": 0.63, + "grad_norm": 4.6353456530210755, + "learning_rate": 4.989936454849499e-06, + "loss": 0.2621, + "step": 16050 + }, + { + "epoch": 0.63, + "grad_norm": 6.035407794665361, + "learning_rate": 4.989894648829432e-06, + "loss": 0.2706, + "step": 16075 + }, + { + "epoch": 0.63, + "grad_norm": 7.175886486573118, + "learning_rate": 4.9898528428093655e-06, + "loss": 0.2556, + "step": 16100 + }, + { + "epoch": 0.63, + "grad_norm": 5.306973626946458, + "learning_rate": 4.989811036789298e-06, + "loss": 0.2532, + "step": 16125 + }, + { + "epoch": 0.64, + "grad_norm": 5.739524436385124, + "learning_rate": 4.989769230769232e-06, + "loss": 0.243, + "step": 16150 + }, + { + "epoch": 0.64, + "grad_norm": 5.743554132202793, + "learning_rate": 4.9897274247491644e-06, + "loss": 0.2571, + "step": 16175 + }, + { + "epoch": 0.64, + "grad_norm": 5.507860974296257, + "learning_rate": 4.989685618729098e-06, + "loss": 0.2498, + "step": 16200 + }, + { + "epoch": 0.64, + "grad_norm": 7.521567618683169, + "learning_rate": 4.989643812709031e-06, + "loss": 0.2406, + "step": 16225 + }, + { + "epoch": 0.64, + "grad_norm": 5.287306100434563, + "learning_rate": 4.989602006688964e-06, + "loss": 0.2548, + "step": 16250 + }, + { + "epoch": 0.64, + "grad_norm": 5.605554498785768, + "learning_rate": 4.989560200668896e-06, + "loss": 0.2735, + "step": 16275 + }, + { + "epoch": 0.64, + "grad_norm": 7.763241090962355, + "learning_rate": 4.98951839464883e-06, + "loss": 0.2306, + "step": 16300 + }, + { + "epoch": 0.64, + "grad_norm": 5.236303105187178, + "learning_rate": 4.989476588628762e-06, + "loss": 0.2406, + "step": 16325 + }, + { + "epoch": 0.64, + "grad_norm": 6.893192357925432, + "learning_rate": 4.989434782608696e-06, + "loss": 0.2643, + "step": 16350 + }, + { + "epoch": 0.64, + "grad_norm": 5.7801672791546945, + "learning_rate": 4.9893929765886286e-06, + "loss": 0.2557, + "step": 16375 + }, + { + "epoch": 0.65, + "grad_norm": 6.509427372226329, + "learning_rate": 4.989351170568562e-06, + "loss": 0.2408, + "step": 16400 + }, + { + "epoch": 0.65, + "grad_norm": 6.145083984273696, + "learning_rate": 4.989309364548495e-06, + "loss": 0.2475, + "step": 16425 + }, + { + "epoch": 0.65, + "grad_norm": 4.961929779970909, + "learning_rate": 4.989267558528428e-06, + "loss": 0.2543, + "step": 16450 + }, + { + "epoch": 0.65, + "grad_norm": 6.237616244924249, + "learning_rate": 4.989225752508362e-06, + "loss": 0.2693, + "step": 16475 + }, + { + "epoch": 0.65, + "grad_norm": 5.071941176628237, + "learning_rate": 4.989183946488295e-06, + "loss": 0.2672, + "step": 16500 + }, + { + "epoch": 0.65, + "grad_norm": 5.99881469452368, + "learning_rate": 4.989142140468228e-06, + "loss": 0.252, + "step": 16525 + }, + { + "epoch": 0.65, + "grad_norm": 6.824243326080856, + "learning_rate": 4.989100334448161e-06, + "loss": 0.2457, + "step": 16550 + }, + { + "epoch": 0.65, + "grad_norm": 6.86991972909777, + "learning_rate": 4.989058528428094e-06, + "loss": 0.266, + "step": 16575 + }, + { + "epoch": 0.65, + "grad_norm": 5.545283659754445, + "learning_rate": 4.989016722408027e-06, + "loss": 0.2781, + "step": 16600 + }, + { + "epoch": 0.65, + "grad_norm": 4.925067126921487, + "learning_rate": 4.988976588628763e-06, + "loss": 0.2954, + "step": 16625 + }, + { + "epoch": 0.66, + "grad_norm": 5.58821701479221, + "learning_rate": 4.988934782608696e-06, + "loss": 0.2494, + "step": 16650 + }, + { + "epoch": 0.66, + "grad_norm": 5.185520155813638, + "learning_rate": 4.988892976588629e-06, + "loss": 0.2307, + "step": 16675 + }, + { + "epoch": 0.66, + "grad_norm": 5.039375868942344, + "learning_rate": 4.988851170568562e-06, + "loss": 0.2454, + "step": 16700 + }, + { + "epoch": 0.66, + "grad_norm": 5.8875232231482615, + "learning_rate": 4.988809364548495e-06, + "loss": 0.2556, + "step": 16725 + }, + { + "epoch": 0.66, + "grad_norm": 4.585235280866713, + "learning_rate": 4.988767558528428e-06, + "loss": 0.2687, + "step": 16750 + }, + { + "epoch": 0.66, + "grad_norm": 5.796602212489863, + "learning_rate": 4.988725752508362e-06, + "loss": 0.2573, + "step": 16775 + }, + { + "epoch": 0.66, + "grad_norm": 6.1627805195183685, + "learning_rate": 4.988683946488294e-06, + "loss": 0.2932, + "step": 16800 + }, + { + "epoch": 0.66, + "grad_norm": 5.552009029354894, + "learning_rate": 4.988642140468228e-06, + "loss": 0.2707, + "step": 16825 + }, + { + "epoch": 0.66, + "grad_norm": 5.466725002249184, + "learning_rate": 4.988600334448161e-06, + "loss": 0.2336, + "step": 16850 + }, + { + "epoch": 0.66, + "grad_norm": 5.186062157421561, + "learning_rate": 4.988558528428094e-06, + "loss": 0.2538, + "step": 16875 + }, + { + "epoch": 0.66, + "grad_norm": 6.080865810052987, + "learning_rate": 4.988516722408027e-06, + "loss": 0.2464, + "step": 16900 + }, + { + "epoch": 0.67, + "grad_norm": 6.570893467223135, + "learning_rate": 4.98847491638796e-06, + "loss": 0.2555, + "step": 16925 + }, + { + "epoch": 0.67, + "grad_norm": 5.9998578151996504, + "learning_rate": 4.988433110367893e-06, + "loss": 0.2685, + "step": 16950 + }, + { + "epoch": 0.67, + "grad_norm": 5.714104620985236, + "learning_rate": 4.988391304347827e-06, + "loss": 0.2579, + "step": 16975 + }, + { + "epoch": 0.67, + "grad_norm": 6.204684812582076, + "learning_rate": 4.988349498327759e-06, + "loss": 0.2814, + "step": 17000 + }, + { + "epoch": 0.67, + "grad_norm": 7.184079656953484, + "learning_rate": 4.988307692307693e-06, + "loss": 0.2413, + "step": 17025 + }, + { + "epoch": 0.67, + "grad_norm": 5.268387433526686, + "learning_rate": 4.988265886287626e-06, + "loss": 0.2817, + "step": 17050 + }, + { + "epoch": 0.67, + "grad_norm": 6.257289805906091, + "learning_rate": 4.988224080267559e-06, + "loss": 0.2595, + "step": 17075 + }, + { + "epoch": 0.67, + "grad_norm": 5.0383919249393045, + "learning_rate": 4.988182274247492e-06, + "loss": 0.2666, + "step": 17100 + }, + { + "epoch": 0.67, + "grad_norm": 5.291305025444263, + "learning_rate": 4.988140468227425e-06, + "loss": 0.2654, + "step": 17125 + }, + { + "epoch": 0.67, + "grad_norm": 5.340130958791915, + "learning_rate": 4.988098662207359e-06, + "loss": 0.2525, + "step": 17150 + }, + { + "epoch": 0.68, + "grad_norm": 4.911613542246811, + "learning_rate": 4.988056856187292e-06, + "loss": 0.2788, + "step": 17175 + }, + { + "epoch": 0.68, + "grad_norm": 5.477361220204573, + "learning_rate": 4.988015050167225e-06, + "loss": 0.2522, + "step": 17200 + }, + { + "epoch": 0.68, + "grad_norm": 4.792368739302577, + "learning_rate": 4.987973244147157e-06, + "loss": 0.2724, + "step": 17225 + }, + { + "epoch": 0.68, + "grad_norm": 5.35695367909829, + "learning_rate": 4.9879314381270906e-06, + "loss": 0.2761, + "step": 17250 + }, + { + "epoch": 0.68, + "grad_norm": 5.629047017884108, + "learning_rate": 4.987889632107023e-06, + "loss": 0.2635, + "step": 17275 + }, + { + "epoch": 0.68, + "grad_norm": 5.331465117440273, + "learning_rate": 4.987847826086957e-06, + "loss": 0.2414, + "step": 17300 + }, + { + "epoch": 0.68, + "grad_norm": 5.367627157757305, + "learning_rate": 4.9878060200668895e-06, + "loss": 0.2572, + "step": 17325 + }, + { + "epoch": 0.68, + "grad_norm": 5.1172264981864295, + "learning_rate": 4.987764214046823e-06, + "loss": 0.2577, + "step": 17350 + }, + { + "epoch": 0.68, + "grad_norm": 5.355305084531797, + "learning_rate": 4.987724080267559e-06, + "loss": 0.2792, + "step": 17375 + }, + { + "epoch": 0.68, + "grad_norm": 5.842697216641398, + "learning_rate": 4.987682274247492e-06, + "loss": 0.2617, + "step": 17400 + }, + { + "epoch": 0.69, + "grad_norm": 5.743056099274135, + "learning_rate": 4.987640468227425e-06, + "loss": 0.2488, + "step": 17425 + }, + { + "epoch": 0.69, + "grad_norm": 8.36774370696684, + "learning_rate": 4.987598662207358e-06, + "loss": 0.2777, + "step": 17450 + }, + { + "epoch": 0.69, + "grad_norm": 5.700174425799506, + "learning_rate": 4.987556856187291e-06, + "loss": 0.2445, + "step": 17475 + }, + { + "epoch": 0.69, + "grad_norm": 5.5291577524920905, + "learning_rate": 4.987515050167224e-06, + "loss": 0.2391, + "step": 17500 + }, + { + "epoch": 0.69, + "grad_norm": 5.0434604227152455, + "learning_rate": 4.987473244147158e-06, + "loss": 0.2868, + "step": 17525 + }, + { + "epoch": 0.69, + "grad_norm": 6.471288503953715, + "learning_rate": 4.98743143812709e-06, + "loss": 0.294, + "step": 17550 + }, + { + "epoch": 0.69, + "grad_norm": 5.967607026828377, + "learning_rate": 4.987389632107024e-06, + "loss": 0.2582, + "step": 17575 + }, + { + "epoch": 0.69, + "grad_norm": 5.493557381929855, + "learning_rate": 4.9873478260869566e-06, + "loss": 0.2337, + "step": 17600 + }, + { + "epoch": 0.69, + "grad_norm": 5.557469040585438, + "learning_rate": 4.98730602006689e-06, + "loss": 0.2703, + "step": 17625 + }, + { + "epoch": 0.69, + "grad_norm": 5.523565121186858, + "learning_rate": 4.987264214046823e-06, + "loss": 0.2577, + "step": 17650 + }, + { + "epoch": 0.7, + "grad_norm": 6.148642857874085, + "learning_rate": 4.987222408026756e-06, + "loss": 0.252, + "step": 17675 + }, + { + "epoch": 0.7, + "grad_norm": 5.875580443467786, + "learning_rate": 4.987180602006689e-06, + "loss": 0.278, + "step": 17700 + }, + { + "epoch": 0.7, + "grad_norm": 5.99454936628081, + "learning_rate": 4.987138795986623e-06, + "loss": 0.2721, + "step": 17725 + }, + { + "epoch": 0.7, + "grad_norm": 4.555180518575604, + "learning_rate": 4.987096989966555e-06, + "loss": 0.2533, + "step": 17750 + }, + { + "epoch": 0.7, + "grad_norm": 5.395100579611774, + "learning_rate": 4.987055183946489e-06, + "loss": 0.2625, + "step": 17775 + }, + { + "epoch": 0.7, + "grad_norm": 5.563405837664269, + "learning_rate": 4.9870133779264215e-06, + "loss": 0.2622, + "step": 17800 + }, + { + "epoch": 0.7, + "grad_norm": 6.043632738488831, + "learning_rate": 4.986971571906355e-06, + "loss": 0.2845, + "step": 17825 + }, + { + "epoch": 0.7, + "grad_norm": 7.119469778219465, + "learning_rate": 4.986929765886288e-06, + "loss": 0.251, + "step": 17850 + }, + { + "epoch": 0.7, + "grad_norm": 8.75507347596451, + "learning_rate": 4.986887959866221e-06, + "loss": 0.251, + "step": 17875 + }, + { + "epoch": 0.7, + "grad_norm": 5.826211048039959, + "learning_rate": 4.986846153846154e-06, + "loss": 0.2691, + "step": 17900 + }, + { + "epoch": 0.71, + "grad_norm": 5.010072111561531, + "learning_rate": 4.9868043478260876e-06, + "loss": 0.2366, + "step": 17925 + }, + { + "epoch": 0.71, + "grad_norm": 4.859353986359138, + "learning_rate": 4.98676254180602e-06, + "loss": 0.2688, + "step": 17950 + }, + { + "epoch": 0.71, + "grad_norm": 4.436740266722265, + "learning_rate": 4.986720735785954e-06, + "loss": 0.2287, + "step": 17975 + }, + { + "epoch": 0.71, + "grad_norm": 7.699368648294974, + "learning_rate": 4.9866789297658865e-06, + "loss": 0.2623, + "step": 18000 + }, + { + "epoch": 0.71, + "grad_norm": 4.286788999671542, + "learning_rate": 4.98663712374582e-06, + "loss": 0.2675, + "step": 18025 + }, + { + "epoch": 0.71, + "grad_norm": 6.383383566653747, + "learning_rate": 4.986595317725753e-06, + "loss": 0.2483, + "step": 18050 + }, + { + "epoch": 0.71, + "grad_norm": 6.558217801754021, + "learning_rate": 4.986553511705686e-06, + "loss": 0.2542, + "step": 18075 + }, + { + "epoch": 0.71, + "grad_norm": 4.38291780085364, + "learning_rate": 4.986511705685619e-06, + "loss": 0.2497, + "step": 18100 + }, + { + "epoch": 0.71, + "grad_norm": 5.520472839091616, + "learning_rate": 4.9864698996655525e-06, + "loss": 0.2574, + "step": 18125 + }, + { + "epoch": 0.71, + "grad_norm": 6.77899108452888, + "learning_rate": 4.986428093645485e-06, + "loss": 0.264, + "step": 18150 + }, + { + "epoch": 0.72, + "grad_norm": 6.778959356886794, + "learning_rate": 4.986386287625418e-06, + "loss": 0.2524, + "step": 18175 + }, + { + "epoch": 0.72, + "grad_norm": 5.918405607392925, + "learning_rate": 4.9863444816053515e-06, + "loss": 0.2635, + "step": 18200 + }, + { + "epoch": 0.72, + "grad_norm": 5.031548207897417, + "learning_rate": 4.986302675585284e-06, + "loss": 0.2426, + "step": 18225 + }, + { + "epoch": 0.72, + "grad_norm": 5.352776823368371, + "learning_rate": 4.986260869565218e-06, + "loss": 0.2621, + "step": 18250 + }, + { + "epoch": 0.72, + "grad_norm": 6.416316266136449, + "learning_rate": 4.9862190635451504e-06, + "loss": 0.2337, + "step": 18275 + }, + { + "epoch": 0.72, + "grad_norm": 5.902376325347049, + "learning_rate": 4.986177257525084e-06, + "loss": 0.2764, + "step": 18300 + }, + { + "epoch": 0.72, + "grad_norm": 5.577162546382801, + "learning_rate": 4.986135451505017e-06, + "loss": 0.2647, + "step": 18325 + }, + { + "epoch": 0.72, + "grad_norm": 5.429483200140283, + "learning_rate": 4.98609364548495e-06, + "loss": 0.2539, + "step": 18350 + }, + { + "epoch": 0.72, + "grad_norm": 4.472015436354258, + "learning_rate": 4.986051839464883e-06, + "loss": 0.2194, + "step": 18375 + }, + { + "epoch": 0.72, + "grad_norm": 5.304795171586728, + "learning_rate": 4.9860100334448165e-06, + "loss": 0.2506, + "step": 18400 + }, + { + "epoch": 0.72, + "grad_norm": 5.5728897025692525, + "learning_rate": 4.985968227424749e-06, + "loss": 0.2471, + "step": 18425 + }, + { + "epoch": 0.73, + "grad_norm": 5.522313447060331, + "learning_rate": 4.985926421404683e-06, + "loss": 0.2708, + "step": 18450 + }, + { + "epoch": 0.73, + "grad_norm": 6.393407912312494, + "learning_rate": 4.985884615384615e-06, + "loss": 0.2504, + "step": 18475 + }, + { + "epoch": 0.73, + "grad_norm": 4.290470046509902, + "learning_rate": 4.985842809364549e-06, + "loss": 0.2418, + "step": 18500 + }, + { + "epoch": 0.73, + "grad_norm": 3.8204122499023567, + "learning_rate": 4.985801003344482e-06, + "loss": 0.2303, + "step": 18525 + }, + { + "epoch": 0.73, + "grad_norm": 4.705257208518913, + "learning_rate": 4.985759197324415e-06, + "loss": 0.2552, + "step": 18550 + }, + { + "epoch": 0.73, + "grad_norm": 6.239833066925982, + "learning_rate": 4.985717391304348e-06, + "loss": 0.2349, + "step": 18575 + }, + { + "epoch": 0.73, + "grad_norm": 5.007178185695814, + "learning_rate": 4.9856755852842814e-06, + "loss": 0.2746, + "step": 18600 + }, + { + "epoch": 0.73, + "grad_norm": 5.604459846104763, + "learning_rate": 4.985633779264214e-06, + "loss": 0.2556, + "step": 18625 + }, + { + "epoch": 0.73, + "grad_norm": 4.923135167645596, + "learning_rate": 4.985591973244148e-06, + "loss": 0.2602, + "step": 18650 + }, + { + "epoch": 0.73, + "grad_norm": 6.086240722877576, + "learning_rate": 4.985550167224081e-06, + "loss": 0.2445, + "step": 18675 + }, + { + "epoch": 0.74, + "grad_norm": 6.46646029604406, + "learning_rate": 4.985508361204014e-06, + "loss": 0.253, + "step": 18700 + }, + { + "epoch": 0.74, + "grad_norm": 5.479558717023913, + "learning_rate": 4.9854665551839475e-06, + "loss": 0.2642, + "step": 18725 + }, + { + "epoch": 0.74, + "grad_norm": 6.161950185397198, + "learning_rate": 4.98542474916388e-06, + "loss": 0.2664, + "step": 18750 + }, + { + "epoch": 0.74, + "grad_norm": 7.149826660631525, + "learning_rate": 4.985382943143814e-06, + "loss": 0.2735, + "step": 18775 + }, + { + "epoch": 0.74, + "grad_norm": 5.098059065958003, + "learning_rate": 4.985341137123746e-06, + "loss": 0.2626, + "step": 18800 + }, + { + "epoch": 0.74, + "grad_norm": 6.7322272581947376, + "learning_rate": 4.98529933110368e-06, + "loss": 0.2706, + "step": 18825 + }, + { + "epoch": 0.74, + "grad_norm": 6.485082925954047, + "learning_rate": 4.985257525083613e-06, + "loss": 0.265, + "step": 18850 + }, + { + "epoch": 0.74, + "grad_norm": 3.8109358920665106, + "learning_rate": 4.985215719063545e-06, + "loss": 0.2364, + "step": 18875 + }, + { + "epoch": 0.74, + "grad_norm": 5.94763670161086, + "learning_rate": 4.985173913043478e-06, + "loss": 0.238, + "step": 18900 + }, + { + "epoch": 0.74, + "grad_norm": 6.52671801682198, + "learning_rate": 4.985132107023412e-06, + "loss": 0.2535, + "step": 18925 + }, + { + "epoch": 0.75, + "grad_norm": 5.79617606507355, + "learning_rate": 4.985090301003344e-06, + "loss": 0.2319, + "step": 18950 + }, + { + "epoch": 0.75, + "grad_norm": 4.661160862165919, + "learning_rate": 4.985048494983278e-06, + "loss": 0.25, + "step": 18975 + }, + { + "epoch": 0.75, + "grad_norm": 5.823787487368952, + "learning_rate": 4.9850066889632105e-06, + "loss": 0.2494, + "step": 19000 + }, + { + "epoch": 0.75, + "grad_norm": 6.041962533822711, + "learning_rate": 4.984964882943144e-06, + "loss": 0.2534, + "step": 19025 + }, + { + "epoch": 0.75, + "grad_norm": 6.137209039165049, + "learning_rate": 4.984923076923077e-06, + "loss": 0.2485, + "step": 19050 + }, + { + "epoch": 0.75, + "grad_norm": 6.157303270364183, + "learning_rate": 4.98488127090301e-06, + "loss": 0.2495, + "step": 19075 + }, + { + "epoch": 0.75, + "grad_norm": 5.804140711641237, + "learning_rate": 4.984839464882944e-06, + "loss": 0.2547, + "step": 19100 + }, + { + "epoch": 0.75, + "grad_norm": 7.783724624022202, + "learning_rate": 4.9847976588628766e-06, + "loss": 0.2346, + "step": 19125 + }, + { + "epoch": 0.75, + "grad_norm": 5.69877397127529, + "learning_rate": 4.98475585284281e-06, + "loss": 0.2892, + "step": 19150 + }, + { + "epoch": 0.75, + "grad_norm": 6.811523610920984, + "learning_rate": 4.984714046822743e-06, + "loss": 0.2576, + "step": 19175 + }, + { + "epoch": 0.76, + "grad_norm": 5.632437319589204, + "learning_rate": 4.984672240802676e-06, + "loss": 0.2688, + "step": 19200 + }, + { + "epoch": 0.76, + "grad_norm": 5.287424341539971, + "learning_rate": 4.984630434782609e-06, + "loss": 0.2582, + "step": 19225 + }, + { + "epoch": 0.76, + "grad_norm": 5.27438775796841, + "learning_rate": 4.984588628762543e-06, + "loss": 0.2517, + "step": 19250 + }, + { + "epoch": 0.76, + "grad_norm": 6.312578396357088, + "learning_rate": 4.984546822742475e-06, + "loss": 0.2659, + "step": 19275 + }, + { + "epoch": 0.76, + "grad_norm": 6.561038478654943, + "learning_rate": 4.984505016722409e-06, + "loss": 0.2669, + "step": 19300 + }, + { + "epoch": 0.76, + "grad_norm": 5.883831211163112, + "learning_rate": 4.9844632107023416e-06, + "loss": 0.2644, + "step": 19325 + }, + { + "epoch": 0.76, + "grad_norm": 4.7206392552751435, + "learning_rate": 4.984421404682275e-06, + "loss": 0.2419, + "step": 19350 + }, + { + "epoch": 0.76, + "grad_norm": 5.772561430178686, + "learning_rate": 4.98438127090301e-06, + "loss": 0.2545, + "step": 19375 + }, + { + "epoch": 0.76, + "grad_norm": 5.808370166675897, + "learning_rate": 4.984339464882944e-06, + "loss": 0.2463, + "step": 19400 + }, + { + "epoch": 0.76, + "grad_norm": 5.8556307395440825, + "learning_rate": 4.984297658862876e-06, + "loss": 0.2811, + "step": 19425 + }, + { + "epoch": 0.77, + "grad_norm": 6.305858961213936, + "learning_rate": 4.98425585284281e-06, + "loss": 0.2662, + "step": 19450 + }, + { + "epoch": 0.77, + "grad_norm": 6.59470367618029, + "learning_rate": 4.9842140468227426e-06, + "loss": 0.2878, + "step": 19475 + }, + { + "epoch": 0.77, + "grad_norm": 6.3608941320554395, + "learning_rate": 4.984172240802676e-06, + "loss": 0.2566, + "step": 19500 + }, + { + "epoch": 0.77, + "grad_norm": 5.757478418973163, + "learning_rate": 4.984130434782609e-06, + "loss": 0.2448, + "step": 19525 + }, + { + "epoch": 0.77, + "grad_norm": 4.286220221553772, + "learning_rate": 4.984088628762542e-06, + "loss": 0.2359, + "step": 19550 + }, + { + "epoch": 0.77, + "grad_norm": 5.690642809026132, + "learning_rate": 4.984046822742475e-06, + "loss": 0.2417, + "step": 19575 + }, + { + "epoch": 0.77, + "grad_norm": 4.924557118890291, + "learning_rate": 4.984005016722409e-06, + "loss": 0.2663, + "step": 19600 + }, + { + "epoch": 0.77, + "grad_norm": 6.77094033240899, + "learning_rate": 4.983963210702341e-06, + "loss": 0.2548, + "step": 19625 + }, + { + "epoch": 0.77, + "grad_norm": 6.358984573786143, + "learning_rate": 4.983921404682275e-06, + "loss": 0.2619, + "step": 19650 + }, + { + "epoch": 0.77, + "grad_norm": 5.7311555484725965, + "learning_rate": 4.9838795986622075e-06, + "loss": 0.2821, + "step": 19675 + }, + { + "epoch": 0.78, + "grad_norm": 5.141037303387008, + "learning_rate": 4.983837792642141e-06, + "loss": 0.2697, + "step": 19700 + }, + { + "epoch": 0.78, + "grad_norm": 5.218327752735989, + "learning_rate": 4.983795986622074e-06, + "loss": 0.2596, + "step": 19725 + }, + { + "epoch": 0.78, + "grad_norm": 6.478272133179351, + "learning_rate": 4.983754180602007e-06, + "loss": 0.2788, + "step": 19750 + }, + { + "epoch": 0.78, + "grad_norm": 7.3508943589897315, + "learning_rate": 4.983712374581941e-06, + "loss": 0.2787, + "step": 19775 + }, + { + "epoch": 0.78, + "grad_norm": 5.109200564645706, + "learning_rate": 4.983670568561874e-06, + "loss": 0.226, + "step": 19800 + }, + { + "epoch": 0.78, + "grad_norm": 5.18181645808782, + "learning_rate": 4.983628762541806e-06, + "loss": 0.2721, + "step": 19825 + }, + { + "epoch": 0.78, + "grad_norm": 5.996776147055529, + "learning_rate": 4.983586956521739e-06, + "loss": 0.2465, + "step": 19850 + }, + { + "epoch": 0.78, + "grad_norm": 5.761328825995187, + "learning_rate": 4.9835451505016725e-06, + "loss": 0.2453, + "step": 19875 + }, + { + "epoch": 0.78, + "grad_norm": 5.828048723654273, + "learning_rate": 4.983503344481605e-06, + "loss": 0.2522, + "step": 19900 + }, + { + "epoch": 0.78, + "grad_norm": 5.580918856632557, + "learning_rate": 4.983461538461539e-06, + "loss": 0.2744, + "step": 19925 + }, + { + "epoch": 0.78, + "grad_norm": 6.91882009627752, + "learning_rate": 4.9834197324414715e-06, + "loss": 0.2666, + "step": 19950 + }, + { + "epoch": 0.79, + "grad_norm": 5.32076952903993, + "learning_rate": 4.983377926421405e-06, + "loss": 0.265, + "step": 19975 + }, + { + "epoch": 0.79, + "grad_norm": 4.971506292016713, + "learning_rate": 4.983336120401338e-06, + "loss": 0.2646, + "step": 20000 + }, + { + "epoch": 0.79, + "eval_loss": 0.4140625, + "eval_runtime": 11517.8974, + "eval_samples_per_second": 0.822, + "eval_steps_per_second": 0.051, + "eval_wer": 0.1283560960700095, + "step": 20000 + }, + { + "epoch": 0.79, + "grad_norm": 7.597765424127864, + "learning_rate": 4.983294314381271e-06, + "loss": 0.2477, + "step": 20025 + }, + { + "epoch": 0.79, + "grad_norm": 6.125850278326169, + "learning_rate": 4.983252508361204e-06, + "loss": 0.2679, + "step": 20050 + }, + { + "epoch": 0.79, + "grad_norm": 5.937530192709086, + "learning_rate": 4.9832107023411375e-06, + "loss": 0.2524, + "step": 20075 + }, + { + "epoch": 0.79, + "grad_norm": 6.879107953793692, + "learning_rate": 4.98316889632107e-06, + "loss": 0.2629, + "step": 20100 + }, + { + "epoch": 0.79, + "grad_norm": 4.726138480598868, + "learning_rate": 4.983127090301004e-06, + "loss": 0.2541, + "step": 20125 + }, + { + "epoch": 0.79, + "grad_norm": 8.90017454045631, + "learning_rate": 4.9830852842809364e-06, + "loss": 0.263, + "step": 20150 + }, + { + "epoch": 0.79, + "grad_norm": 6.581872676278387, + "learning_rate": 4.98304347826087e-06, + "loss": 0.2568, + "step": 20175 + }, + { + "epoch": 0.79, + "grad_norm": 4.54004763181376, + "learning_rate": 4.9830016722408035e-06, + "loss": 0.237, + "step": 20200 + }, + { + "epoch": 0.8, + "grad_norm": 5.106351027498988, + "learning_rate": 4.982959866220736e-06, + "loss": 0.2521, + "step": 20225 + }, + { + "epoch": 0.8, + "grad_norm": 6.218144145112456, + "learning_rate": 4.98291806020067e-06, + "loss": 0.2719, + "step": 20250 + }, + { + "epoch": 0.8, + "grad_norm": 4.68621541939387, + "learning_rate": 4.9828762541806025e-06, + "loss": 0.2378, + "step": 20275 + }, + { + "epoch": 0.8, + "grad_norm": 4.990952354340057, + "learning_rate": 4.982834448160536e-06, + "loss": 0.2557, + "step": 20300 + }, + { + "epoch": 0.8, + "grad_norm": 5.0326266609134045, + "learning_rate": 4.982792642140469e-06, + "loss": 0.2745, + "step": 20325 + }, + { + "epoch": 0.8, + "grad_norm": 5.628421286827086, + "learning_rate": 4.982750836120402e-06, + "loss": 0.2832, + "step": 20350 + }, + { + "epoch": 0.8, + "grad_norm": 5.890967827336047, + "learning_rate": 4.982710702341137e-06, + "loss": 0.2428, + "step": 20375 + }, + { + "epoch": 0.8, + "grad_norm": 5.757261412656723, + "learning_rate": 4.982668896321071e-06, + "loss": 0.2363, + "step": 20400 + }, + { + "epoch": 0.8, + "grad_norm": 5.790150924630483, + "learning_rate": 4.9826270903010035e-06, + "loss": 0.2841, + "step": 20425 + }, + { + "epoch": 0.8, + "grad_norm": 5.334436845925394, + "learning_rate": 4.982585284280937e-06, + "loss": 0.2335, + "step": 20450 + }, + { + "epoch": 0.81, + "grad_norm": 6.496338521482485, + "learning_rate": 4.98254347826087e-06, + "loss": 0.256, + "step": 20475 + }, + { + "epoch": 0.81, + "grad_norm": 5.1183227106736355, + "learning_rate": 4.982501672240803e-06, + "loss": 0.2316, + "step": 20500 + }, + { + "epoch": 0.81, + "grad_norm": 5.6851818591062955, + "learning_rate": 4.982459866220736e-06, + "loss": 0.2858, + "step": 20525 + }, + { + "epoch": 0.81, + "grad_norm": 6.243803882140677, + "learning_rate": 4.9824180602006695e-06, + "loss": 0.2509, + "step": 20550 + }, + { + "epoch": 0.81, + "grad_norm": 5.557348957276566, + "learning_rate": 4.982376254180602e-06, + "loss": 0.255, + "step": 20575 + }, + { + "epoch": 0.81, + "grad_norm": 7.057972306154081, + "learning_rate": 4.982334448160536e-06, + "loss": 0.2673, + "step": 20600 + }, + { + "epoch": 0.81, + "grad_norm": 5.269049715317146, + "learning_rate": 4.9822926421404685e-06, + "loss": 0.2332, + "step": 20625 + }, + { + "epoch": 0.81, + "grad_norm": 5.0641172747571215, + "learning_rate": 4.982250836120402e-06, + "loss": 0.2438, + "step": 20650 + }, + { + "epoch": 0.81, + "grad_norm": 4.545638271569368, + "learning_rate": 4.982209030100335e-06, + "loss": 0.2507, + "step": 20675 + }, + { + "epoch": 0.81, + "grad_norm": 5.833842229029364, + "learning_rate": 4.982167224080268e-06, + "loss": 0.2692, + "step": 20700 + }, + { + "epoch": 0.82, + "grad_norm": 5.91131799728637, + "learning_rate": 4.982125418060201e-06, + "loss": 0.2708, + "step": 20725 + }, + { + "epoch": 0.82, + "grad_norm": 6.81240765396023, + "learning_rate": 4.982083612040134e-06, + "loss": 0.2362, + "step": 20750 + }, + { + "epoch": 0.82, + "grad_norm": 5.808809996905685, + "learning_rate": 4.982041806020067e-06, + "loss": 0.2398, + "step": 20775 + }, + { + "epoch": 0.82, + "grad_norm": 6.063650018965221, + "learning_rate": 4.982e-06, + "loss": 0.2631, + "step": 20800 + }, + { + "epoch": 0.82, + "grad_norm": 5.759033532600636, + "learning_rate": 4.9819581939799334e-06, + "loss": 0.2227, + "step": 20825 + }, + { + "epoch": 0.82, + "grad_norm": 5.455666288438305, + "learning_rate": 4.981916387959866e-06, + "loss": 0.2445, + "step": 20850 + }, + { + "epoch": 0.82, + "grad_norm": 6.701923172622271, + "learning_rate": 4.9818745819398e-06, + "loss": 0.2575, + "step": 20875 + }, + { + "epoch": 0.82, + "grad_norm": 4.9618139565457255, + "learning_rate": 4.981832775919732e-06, + "loss": 0.2406, + "step": 20900 + }, + { + "epoch": 0.82, + "grad_norm": 7.68671913016998, + "learning_rate": 4.981790969899666e-06, + "loss": 0.2665, + "step": 20925 + }, + { + "epoch": 0.82, + "grad_norm": 4.722574500390105, + "learning_rate": 4.981749163879599e-06, + "loss": 0.2569, + "step": 20950 + }, + { + "epoch": 0.83, + "grad_norm": 4.572541737429397, + "learning_rate": 4.981707357859532e-06, + "loss": 0.239, + "step": 20975 + }, + { + "epoch": 0.83, + "grad_norm": 5.415209470257432, + "learning_rate": 4.981665551839465e-06, + "loss": 0.2451, + "step": 21000 + }, + { + "epoch": 0.83, + "grad_norm": 6.020681059671357, + "learning_rate": 4.9816237458193984e-06, + "loss": 0.2469, + "step": 21025 + }, + { + "epoch": 0.83, + "grad_norm": 6.046597623490536, + "learning_rate": 4.981581939799331e-06, + "loss": 0.2429, + "step": 21050 + }, + { + "epoch": 0.83, + "grad_norm": 5.68155367027518, + "learning_rate": 4.981540133779265e-06, + "loss": 0.2215, + "step": 21075 + }, + { + "epoch": 0.83, + "grad_norm": 6.289254570068965, + "learning_rate": 4.981498327759197e-06, + "loss": 0.2596, + "step": 21100 + }, + { + "epoch": 0.83, + "grad_norm": 5.531907347250682, + "learning_rate": 4.981456521739131e-06, + "loss": 0.2724, + "step": 21125 + }, + { + "epoch": 0.83, + "grad_norm": 3.70719685809795, + "learning_rate": 4.981414715719064e-06, + "loss": 0.246, + "step": 21150 + }, + { + "epoch": 0.83, + "grad_norm": 5.033777182035272, + "learning_rate": 4.981372909698997e-06, + "loss": 0.2603, + "step": 21175 + }, + { + "epoch": 0.83, + "grad_norm": 6.997263007017261, + "learning_rate": 4.98133110367893e-06, + "loss": 0.2549, + "step": 21200 + }, + { + "epoch": 0.84, + "grad_norm": 7.397447058530203, + "learning_rate": 4.981289297658863e-06, + "loss": 0.2252, + "step": 21225 + }, + { + "epoch": 0.84, + "grad_norm": 6.387609355487834, + "learning_rate": 4.981247491638796e-06, + "loss": 0.2424, + "step": 21250 + }, + { + "epoch": 0.84, + "grad_norm": 5.86389992482405, + "learning_rate": 4.981207357859532e-06, + "loss": 0.2808, + "step": 21275 + }, + { + "epoch": 0.84, + "grad_norm": 5.886969996279023, + "learning_rate": 4.9811655518394655e-06, + "loss": 0.2445, + "step": 21300 + }, + { + "epoch": 0.84, + "grad_norm": 4.930646590051299, + "learning_rate": 4.981123745819398e-06, + "loss": 0.2385, + "step": 21325 + }, + { + "epoch": 0.84, + "grad_norm": 6.658704856684161, + "learning_rate": 4.981081939799332e-06, + "loss": 0.2654, + "step": 21350 + }, + { + "epoch": 0.84, + "grad_norm": 5.85826584313124, + "learning_rate": 4.981040133779264e-06, + "loss": 0.2787, + "step": 21375 + }, + { + "epoch": 0.84, + "grad_norm": 6.2136211167108915, + "learning_rate": 4.980998327759198e-06, + "loss": 0.2496, + "step": 21400 + }, + { + "epoch": 0.84, + "grad_norm": 6.292493595501454, + "learning_rate": 4.980956521739131e-06, + "loss": 0.2398, + "step": 21425 + }, + { + "epoch": 0.84, + "grad_norm": 6.058955089554257, + "learning_rate": 4.980914715719064e-06, + "loss": 0.2399, + "step": 21450 + }, + { + "epoch": 0.84, + "grad_norm": 5.676342839883485, + "learning_rate": 4.980872909698997e-06, + "loss": 0.2323, + "step": 21475 + }, + { + "epoch": 0.85, + "grad_norm": 4.982168817113219, + "learning_rate": 4.9808311036789305e-06, + "loss": 0.2391, + "step": 21500 + }, + { + "epoch": 0.85, + "grad_norm": 6.193462179348854, + "learning_rate": 4.980789297658863e-06, + "loss": 0.2386, + "step": 21525 + }, + { + "epoch": 0.85, + "grad_norm": 5.014739296688169, + "learning_rate": 4.980747491638797e-06, + "loss": 0.2514, + "step": 21550 + }, + { + "epoch": 0.85, + "grad_norm": 4.930490666345008, + "learning_rate": 4.980705685618729e-06, + "loss": 0.2643, + "step": 21575 + }, + { + "epoch": 0.85, + "grad_norm": 6.295124932611308, + "learning_rate": 4.980663879598663e-06, + "loss": 0.2689, + "step": 21600 + }, + { + "epoch": 0.85, + "grad_norm": 5.907807211766026, + "learning_rate": 4.980622073578596e-06, + "loss": 0.2589, + "step": 21625 + }, + { + "epoch": 0.85, + "grad_norm": 6.178379073295326, + "learning_rate": 4.980580267558529e-06, + "loss": 0.2515, + "step": 21650 + }, + { + "epoch": 0.85, + "grad_norm": 5.000008304241975, + "learning_rate": 4.980538461538462e-06, + "loss": 0.2544, + "step": 21675 + }, + { + "epoch": 0.85, + "grad_norm": 5.7552188199845675, + "learning_rate": 4.980496655518395e-06, + "loss": 0.2474, + "step": 21700 + }, + { + "epoch": 0.85, + "grad_norm": 6.291094061313578, + "learning_rate": 4.980454849498328e-06, + "loss": 0.2488, + "step": 21725 + }, + { + "epoch": 0.86, + "grad_norm": 4.7158702845231, + "learning_rate": 4.980413043478261e-06, + "loss": 0.2377, + "step": 21750 + }, + { + "epoch": 0.86, + "grad_norm": 4.371680856314009, + "learning_rate": 4.980371237458194e-06, + "loss": 0.2695, + "step": 21775 + }, + { + "epoch": 0.86, + "grad_norm": 5.362994192905961, + "learning_rate": 4.980329431438127e-06, + "loss": 0.2622, + "step": 21800 + }, + { + "epoch": 0.86, + "grad_norm": 6.382584959830635, + "learning_rate": 4.980287625418061e-06, + "loss": 0.2458, + "step": 21825 + }, + { + "epoch": 0.86, + "grad_norm": 5.793487011383763, + "learning_rate": 4.980245819397993e-06, + "loss": 0.288, + "step": 21850 + }, + { + "epoch": 0.86, + "grad_norm": 4.399312922304265, + "learning_rate": 4.980204013377927e-06, + "loss": 0.2615, + "step": 21875 + }, + { + "epoch": 0.86, + "grad_norm": 6.081833307673779, + "learning_rate": 4.9801622073578596e-06, + "loss": 0.2694, + "step": 21900 + }, + { + "epoch": 0.86, + "grad_norm": 4.915184300784432, + "learning_rate": 4.980120401337793e-06, + "loss": 0.2545, + "step": 21925 + }, + { + "epoch": 0.86, + "grad_norm": 5.794109389200768, + "learning_rate": 4.980078595317726e-06, + "loss": 0.2422, + "step": 21950 + }, + { + "epoch": 0.86, + "grad_norm": 6.638448141225639, + "learning_rate": 4.980036789297659e-06, + "loss": 0.2736, + "step": 21975 + }, + { + "epoch": 0.87, + "grad_norm": 4.2006435060994844, + "learning_rate": 4.979994983277592e-06, + "loss": 0.2537, + "step": 22000 + }, + { + "epoch": 0.87, + "grad_norm": 4.600066690029428, + "learning_rate": 4.979953177257526e-06, + "loss": 0.2654, + "step": 22025 + }, + { + "epoch": 0.87, + "grad_norm": 5.687952294883638, + "learning_rate": 4.979911371237458e-06, + "loss": 0.2593, + "step": 22050 + }, + { + "epoch": 0.87, + "grad_norm": 5.627401660974489, + "learning_rate": 4.979869565217392e-06, + "loss": 0.2711, + "step": 22075 + }, + { + "epoch": 0.87, + "grad_norm": 5.065685866336755, + "learning_rate": 4.9798277591973245e-06, + "loss": 0.2387, + "step": 22100 + }, + { + "epoch": 0.87, + "grad_norm": 4.682468578553179, + "learning_rate": 4.979785953177258e-06, + "loss": 0.2351, + "step": 22125 + }, + { + "epoch": 0.87, + "grad_norm": 5.893827612677344, + "learning_rate": 4.979744147157191e-06, + "loss": 0.2501, + "step": 22150 + }, + { + "epoch": 0.87, + "grad_norm": 5.300974420339823, + "learning_rate": 4.979702341137124e-06, + "loss": 0.2292, + "step": 22175 + }, + { + "epoch": 0.87, + "grad_norm": 6.750330015989807, + "learning_rate": 4.979660535117057e-06, + "loss": 0.2534, + "step": 22200 + }, + { + "epoch": 0.87, + "grad_norm": 4.921714939272224, + "learning_rate": 4.9796187290969906e-06, + "loss": 0.2378, + "step": 22225 + }, + { + "epoch": 0.88, + "grad_norm": 6.104235634143888, + "learning_rate": 4.979576923076923e-06, + "loss": 0.239, + "step": 22250 + }, + { + "epoch": 0.88, + "grad_norm": 6.365003485750156, + "learning_rate": 4.979535117056857e-06, + "loss": 0.2401, + "step": 22275 + }, + { + "epoch": 0.88, + "grad_norm": 5.8943944369540064, + "learning_rate": 4.9794933110367895e-06, + "loss": 0.2703, + "step": 22300 + }, + { + "epoch": 0.88, + "grad_norm": 5.740334114124547, + "learning_rate": 4.979451505016723e-06, + "loss": 0.2448, + "step": 22325 + }, + { + "epoch": 0.88, + "grad_norm": 6.26645969958187, + "learning_rate": 4.979409698996656e-06, + "loss": 0.237, + "step": 22350 + }, + { + "epoch": 0.88, + "grad_norm": 5.188296175194914, + "learning_rate": 4.979367892976589e-06, + "loss": 0.2439, + "step": 22375 + }, + { + "epoch": 0.88, + "grad_norm": 5.28258281943557, + "learning_rate": 4.979326086956522e-06, + "loss": 0.2319, + "step": 22400 + }, + { + "epoch": 0.88, + "grad_norm": 4.876641118702411, + "learning_rate": 4.979284280936455e-06, + "loss": 0.2256, + "step": 22425 + }, + { + "epoch": 0.88, + "grad_norm": 5.9872075168697, + "learning_rate": 4.979242474916388e-06, + "loss": 0.2401, + "step": 22450 + }, + { + "epoch": 0.88, + "grad_norm": 5.709817054749593, + "learning_rate": 4.979200668896321e-06, + "loss": 0.2318, + "step": 22475 + }, + { + "epoch": 0.89, + "grad_norm": 5.821509746602487, + "learning_rate": 4.9791588628762545e-06, + "loss": 0.2556, + "step": 22500 + }, + { + "epoch": 0.89, + "grad_norm": 4.68934415618322, + "learning_rate": 4.979117056856187e-06, + "loss": 0.2558, + "step": 22525 + }, + { + "epoch": 0.89, + "grad_norm": 5.509947218331295, + "learning_rate": 4.979075250836121e-06, + "loss": 0.2365, + "step": 22550 + }, + { + "epoch": 0.89, + "grad_norm": 4.58010564782228, + "learning_rate": 4.9790334448160534e-06, + "loss": 0.2585, + "step": 22575 + }, + { + "epoch": 0.89, + "grad_norm": 5.350109880497084, + "learning_rate": 4.978991638795987e-06, + "loss": 0.2496, + "step": 22600 + }, + { + "epoch": 0.89, + "grad_norm": 5.52596483622084, + "learning_rate": 4.97894983277592e-06, + "loss": 0.2513, + "step": 22625 + }, + { + "epoch": 0.89, + "grad_norm": 4.5425308104361335, + "learning_rate": 4.978908026755853e-06, + "loss": 0.2317, + "step": 22650 + }, + { + "epoch": 0.89, + "grad_norm": 5.042990505291717, + "learning_rate": 4.978866220735786e-06, + "loss": 0.2425, + "step": 22675 + }, + { + "epoch": 0.89, + "grad_norm": 5.28155864508552, + "learning_rate": 4.9788244147157195e-06, + "loss": 0.2433, + "step": 22700 + }, + { + "epoch": 0.89, + "grad_norm": 5.05219957106238, + "learning_rate": 4.978782608695652e-06, + "loss": 0.2582, + "step": 22725 + }, + { + "epoch": 0.89, + "grad_norm": 6.3750393384628925, + "learning_rate": 4.978740802675586e-06, + "loss": 0.24, + "step": 22750 + }, + { + "epoch": 0.9, + "grad_norm": 4.981026672978073, + "learning_rate": 4.978698996655518e-06, + "loss": 0.2564, + "step": 22775 + }, + { + "epoch": 0.9, + "grad_norm": 4.8161776421109055, + "learning_rate": 4.978657190635452e-06, + "loss": 0.239, + "step": 22800 + }, + { + "epoch": 0.9, + "grad_norm": 7.376213905893997, + "learning_rate": 4.978615384615385e-06, + "loss": 0.2611, + "step": 22825 + }, + { + "epoch": 0.9, + "grad_norm": 5.15776727013437, + "learning_rate": 4.978573578595318e-06, + "loss": 0.2576, + "step": 22850 + }, + { + "epoch": 0.9, + "grad_norm": 5.04185750311223, + "learning_rate": 4.978531772575252e-06, + "loss": 0.2482, + "step": 22875 + }, + { + "epoch": 0.9, + "grad_norm": 5.950731538074712, + "learning_rate": 4.9784899665551844e-06, + "loss": 0.269, + "step": 22900 + }, + { + "epoch": 0.9, + "grad_norm": 6.205593383648058, + "learning_rate": 4.978448160535118e-06, + "loss": 0.2401, + "step": 22925 + }, + { + "epoch": 0.9, + "grad_norm": 5.548735869447201, + "learning_rate": 4.978406354515051e-06, + "loss": 0.2462, + "step": 22950 + }, + { + "epoch": 0.9, + "grad_norm": 5.329571742913044, + "learning_rate": 4.978364548494984e-06, + "loss": 0.2636, + "step": 22975 + }, + { + "epoch": 0.9, + "grad_norm": 5.536247610819822, + "learning_rate": 4.978322742474917e-06, + "loss": 0.2512, + "step": 23000 + }, + { + "epoch": 0.91, + "grad_norm": 5.573520221822453, + "learning_rate": 4.9782809364548505e-06, + "loss": 0.2551, + "step": 23025 + }, + { + "epoch": 0.91, + "grad_norm": 5.059324762232372, + "learning_rate": 4.978239130434783e-06, + "loss": 0.2677, + "step": 23050 + }, + { + "epoch": 0.91, + "grad_norm": 6.4312911338529535, + "learning_rate": 4.978197324414717e-06, + "loss": 0.2647, + "step": 23075 + }, + { + "epoch": 0.91, + "grad_norm": 5.905908003456633, + "learning_rate": 4.978155518394649e-06, + "loss": 0.241, + "step": 23100 + }, + { + "epoch": 0.91, + "grad_norm": 6.305362513811501, + "learning_rate": 4.978113712374582e-06, + "loss": 0.2759, + "step": 23125 + }, + { + "epoch": 0.91, + "grad_norm": 6.334507557305194, + "learning_rate": 4.978071906354515e-06, + "loss": 0.2471, + "step": 23150 + }, + { + "epoch": 0.91, + "grad_norm": 5.167193712946236, + "learning_rate": 4.978030100334448e-06, + "loss": 0.2454, + "step": 23175 + }, + { + "epoch": 0.91, + "grad_norm": 5.575127667668046, + "learning_rate": 4.977988294314381e-06, + "loss": 0.2166, + "step": 23200 + }, + { + "epoch": 0.91, + "grad_norm": 5.829621265497541, + "learning_rate": 4.977948160535118e-06, + "loss": 0.2363, + "step": 23225 + }, + { + "epoch": 0.91, + "grad_norm": 5.561630435288059, + "learning_rate": 4.9779063545150504e-06, + "loss": 0.2368, + "step": 23250 + }, + { + "epoch": 0.92, + "grad_norm": 4.602771537076456, + "learning_rate": 4.977864548494984e-06, + "loss": 0.264, + "step": 23275 + }, + { + "epoch": 0.92, + "grad_norm": 4.8921357782406565, + "learning_rate": 4.977822742474917e-06, + "loss": 0.2425, + "step": 23300 + }, + { + "epoch": 0.92, + "grad_norm": 6.761815993674995, + "learning_rate": 4.97778093645485e-06, + "loss": 0.2628, + "step": 23325 + }, + { + "epoch": 0.92, + "grad_norm": 4.789432658329115, + "learning_rate": 4.977739130434783e-06, + "loss": 0.2415, + "step": 23350 + }, + { + "epoch": 0.92, + "grad_norm": 5.212070910404648, + "learning_rate": 4.977697324414716e-06, + "loss": 0.2606, + "step": 23375 + }, + { + "epoch": 0.92, + "grad_norm": 5.645169198733312, + "learning_rate": 4.977655518394649e-06, + "loss": 0.2295, + "step": 23400 + }, + { + "epoch": 0.92, + "grad_norm": 5.941660028505237, + "learning_rate": 4.977613712374582e-06, + "loss": 0.259, + "step": 23425 + }, + { + "epoch": 0.92, + "grad_norm": 6.006340050777933, + "learning_rate": 4.977571906354515e-06, + "loss": 0.2489, + "step": 23450 + }, + { + "epoch": 0.92, + "grad_norm": 5.444718325419201, + "learning_rate": 4.977530100334448e-06, + "loss": 0.2252, + "step": 23475 + }, + { + "epoch": 0.92, + "grad_norm": 5.315731137511242, + "learning_rate": 4.977488294314382e-06, + "loss": 0.2444, + "step": 23500 + }, + { + "epoch": 0.93, + "grad_norm": 5.147536052111137, + "learning_rate": 4.977446488294314e-06, + "loss": 0.253, + "step": 23525 + }, + { + "epoch": 0.93, + "grad_norm": 4.637894947771309, + "learning_rate": 4.977404682274248e-06, + "loss": 0.2411, + "step": 23550 + }, + { + "epoch": 0.93, + "grad_norm": 7.3547145173015, + "learning_rate": 4.977362876254181e-06, + "loss": 0.249, + "step": 23575 + }, + { + "epoch": 0.93, + "grad_norm": 6.191389830163696, + "learning_rate": 4.977321070234114e-06, + "loss": 0.2977, + "step": 23600 + }, + { + "epoch": 0.93, + "grad_norm": 4.747541667430566, + "learning_rate": 4.977279264214047e-06, + "loss": 0.2449, + "step": 23625 + }, + { + "epoch": 0.93, + "grad_norm": 4.520805102077198, + "learning_rate": 4.97723745819398e-06, + "loss": 0.2308, + "step": 23650 + }, + { + "epoch": 0.93, + "grad_norm": 5.417279655484847, + "learning_rate": 4.977195652173913e-06, + "loss": 0.2282, + "step": 23675 + }, + { + "epoch": 0.93, + "grad_norm": 5.814135551224237, + "learning_rate": 4.977153846153847e-06, + "loss": 0.2407, + "step": 23700 + }, + { + "epoch": 0.93, + "grad_norm": 5.799363124129896, + "learning_rate": 4.977112040133779e-06, + "loss": 0.2464, + "step": 23725 + }, + { + "epoch": 0.93, + "grad_norm": 5.926212906223406, + "learning_rate": 4.977070234113713e-06, + "loss": 0.2543, + "step": 23750 + }, + { + "epoch": 0.94, + "grad_norm": 4.757663142452678, + "learning_rate": 4.977028428093646e-06, + "loss": 0.2452, + "step": 23775 + }, + { + "epoch": 0.94, + "grad_norm": 6.6489188278172895, + "learning_rate": 4.976986622073579e-06, + "loss": 0.2517, + "step": 23800 + }, + { + "epoch": 0.94, + "grad_norm": 5.577979099426105, + "learning_rate": 4.976944816053512e-06, + "loss": 0.2605, + "step": 23825 + }, + { + "epoch": 0.94, + "grad_norm": 6.9466236367411005, + "learning_rate": 4.976903010033445e-06, + "loss": 0.2655, + "step": 23850 + }, + { + "epoch": 0.94, + "grad_norm": 5.7414326777762685, + "learning_rate": 4.976861204013378e-06, + "loss": 0.2304, + "step": 23875 + }, + { + "epoch": 0.94, + "grad_norm": 5.522499469061028, + "learning_rate": 4.976819397993312e-06, + "loss": 0.2454, + "step": 23900 + }, + { + "epoch": 0.94, + "grad_norm": 5.343226074167374, + "learning_rate": 4.976777591973244e-06, + "loss": 0.2555, + "step": 23925 + }, + { + "epoch": 0.94, + "grad_norm": 6.908837038372152, + "learning_rate": 4.976735785953178e-06, + "loss": 0.2479, + "step": 23950 + }, + { + "epoch": 0.94, + "grad_norm": 5.9308077824937016, + "learning_rate": 4.976693979933111e-06, + "loss": 0.2571, + "step": 23975 + }, + { + "epoch": 0.94, + "grad_norm": 6.187889109702333, + "learning_rate": 4.976652173913044e-06, + "loss": 0.2705, + "step": 24000 + }, + { + "epoch": 0.95, + "grad_norm": 5.591103804137682, + "learning_rate": 4.976610367892978e-06, + "loss": 0.2448, + "step": 24025 + }, + { + "epoch": 0.95, + "grad_norm": 4.396768803047713, + "learning_rate": 4.97656856187291e-06, + "loss": 0.2243, + "step": 24050 + }, + { + "epoch": 0.95, + "grad_norm": 4.7909268110643275, + "learning_rate": 4.976526755852843e-06, + "loss": 0.2332, + "step": 24075 + }, + { + "epoch": 0.95, + "grad_norm": 3.7576206413020232, + "learning_rate": 4.976484949832776e-06, + "loss": 0.2549, + "step": 24100 + }, + { + "epoch": 0.95, + "grad_norm": 5.4611210915597415, + "learning_rate": 4.976443143812709e-06, + "loss": 0.2458, + "step": 24125 + }, + { + "epoch": 0.95, + "grad_norm": 6.089932740094683, + "learning_rate": 4.976401337792642e-06, + "loss": 0.2503, + "step": 24150 + }, + { + "epoch": 0.95, + "grad_norm": 4.268253661077563, + "learning_rate": 4.9763595317725755e-06, + "loss": 0.225, + "step": 24175 + }, + { + "epoch": 0.95, + "grad_norm": 5.204555335361072, + "learning_rate": 4.976317725752508e-06, + "loss": 0.2713, + "step": 24200 + }, + { + "epoch": 0.95, + "grad_norm": 6.021816391599869, + "learning_rate": 4.976275919732442e-06, + "loss": 0.2336, + "step": 24225 + }, + { + "epoch": 0.95, + "grad_norm": 4.8455887000789835, + "learning_rate": 4.9762341137123745e-06, + "loss": 0.2279, + "step": 24250 + }, + { + "epoch": 0.95, + "grad_norm": 7.225031289810486, + "learning_rate": 4.976192307692308e-06, + "loss": 0.2275, + "step": 24275 + }, + { + "epoch": 0.96, + "grad_norm": 5.159430367632941, + "learning_rate": 4.976150501672241e-06, + "loss": 0.2492, + "step": 24300 + }, + { + "epoch": 0.96, + "grad_norm": 4.127979739902747, + "learning_rate": 4.976108695652174e-06, + "loss": 0.2336, + "step": 24325 + }, + { + "epoch": 0.96, + "grad_norm": 6.238071839251816, + "learning_rate": 4.976066889632107e-06, + "loss": 0.2801, + "step": 24350 + }, + { + "epoch": 0.96, + "grad_norm": 4.157082993930431, + "learning_rate": 4.9760250836120405e-06, + "loss": 0.2307, + "step": 24375 + }, + { + "epoch": 0.96, + "grad_norm": 5.751821236126511, + "learning_rate": 4.975983277591974e-06, + "loss": 0.2466, + "step": 24400 + }, + { + "epoch": 0.96, + "grad_norm": 5.723512280716872, + "learning_rate": 4.975941471571907e-06, + "loss": 0.2533, + "step": 24425 + }, + { + "epoch": 0.96, + "grad_norm": 6.529999483690673, + "learning_rate": 4.97589966555184e-06, + "loss": 0.2305, + "step": 24450 + }, + { + "epoch": 0.96, + "grad_norm": 4.640020922858098, + "learning_rate": 4.975857859531773e-06, + "loss": 0.24, + "step": 24475 + }, + { + "epoch": 0.96, + "grad_norm": 5.647549375619507, + "learning_rate": 4.9758160535117065e-06, + "loss": 0.2444, + "step": 24500 + }, + { + "epoch": 0.96, + "grad_norm": 4.391480590113418, + "learning_rate": 4.975774247491639e-06, + "loss": 0.2585, + "step": 24525 + }, + { + "epoch": 0.97, + "grad_norm": 7.489122500542812, + "learning_rate": 4.975732441471573e-06, + "loss": 0.2475, + "step": 24550 + }, + { + "epoch": 0.97, + "grad_norm": 7.499688320980486, + "learning_rate": 4.9756906354515055e-06, + "loss": 0.2402, + "step": 24575 + }, + { + "epoch": 0.97, + "grad_norm": 4.331955267278932, + "learning_rate": 4.975648829431439e-06, + "loss": 0.2444, + "step": 24600 + }, + { + "epoch": 0.97, + "grad_norm": 7.408758125870933, + "learning_rate": 4.975607023411372e-06, + "loss": 0.227, + "step": 24625 + }, + { + "epoch": 0.97, + "grad_norm": 4.756801869357502, + "learning_rate": 4.975565217391305e-06, + "loss": 0.2472, + "step": 24650 + }, + { + "epoch": 0.97, + "grad_norm": 5.472101388539881, + "learning_rate": 4.975523411371238e-06, + "loss": 0.2178, + "step": 24675 + }, + { + "epoch": 0.97, + "grad_norm": 7.244728796693067, + "learning_rate": 4.9754816053511715e-06, + "loss": 0.2409, + "step": 24700 + }, + { + "epoch": 0.97, + "grad_norm": 7.422212395250679, + "learning_rate": 4.975439799331104e-06, + "loss": 0.2553, + "step": 24725 + }, + { + "epoch": 0.97, + "grad_norm": 6.733100754021741, + "learning_rate": 4.975397993311038e-06, + "loss": 0.2423, + "step": 24750 + }, + { + "epoch": 0.97, + "grad_norm": 5.907380286889155, + "learning_rate": 4.97535618729097e-06, + "loss": 0.246, + "step": 24775 + }, + { + "epoch": 0.98, + "grad_norm": 5.336386993051637, + "learning_rate": 4.975314381270903e-06, + "loss": 0.2437, + "step": 24800 + }, + { + "epoch": 0.98, + "grad_norm": 6.3744020887867645, + "learning_rate": 4.975272575250837e-06, + "loss": 0.2537, + "step": 24825 + }, + { + "epoch": 0.98, + "grad_norm": 5.130606979870505, + "learning_rate": 4.975230769230769e-06, + "loss": 0.2185, + "step": 24850 + }, + { + "epoch": 0.98, + "grad_norm": 6.393159176636358, + "learning_rate": 4.975188963210703e-06, + "loss": 0.2447, + "step": 24875 + }, + { + "epoch": 0.98, + "grad_norm": 4.37010512506881, + "learning_rate": 4.975147157190636e-06, + "loss": 0.2537, + "step": 24900 + }, + { + "epoch": 0.98, + "grad_norm": 5.781150540673855, + "learning_rate": 4.975105351170569e-06, + "loss": 0.2322, + "step": 24925 + }, + { + "epoch": 0.98, + "grad_norm": 4.670847613893096, + "learning_rate": 4.975063545150502e-06, + "loss": 0.2464, + "step": 24950 + }, + { + "epoch": 0.98, + "grad_norm": 6.727990256628477, + "learning_rate": 4.9750217391304354e-06, + "loss": 0.2581, + "step": 24975 + }, + { + "epoch": 0.98, + "grad_norm": 4.544263592893811, + "learning_rate": 4.974979933110368e-06, + "loss": 0.2628, + "step": 25000 + }, + { + "epoch": 0.98, + "grad_norm": 5.69470940676318, + "learning_rate": 4.974938127090302e-06, + "loss": 0.2223, + "step": 25025 + }, + { + "epoch": 0.99, + "grad_norm": 5.792583847102043, + "learning_rate": 4.974896321070234e-06, + "loss": 0.2343, + "step": 25050 + }, + { + "epoch": 0.99, + "grad_norm": 5.870490355918843, + "learning_rate": 4.974854515050168e-06, + "loss": 0.2473, + "step": 25075 + }, + { + "epoch": 0.99, + "grad_norm": 6.202473423301913, + "learning_rate": 4.974812709030101e-06, + "loss": 0.2507, + "step": 25100 + }, + { + "epoch": 0.99, + "grad_norm": 5.7239036684055895, + "learning_rate": 4.974770903010034e-06, + "loss": 0.2539, + "step": 25125 + }, + { + "epoch": 0.99, + "grad_norm": 5.222922132018913, + "learning_rate": 4.974729096989967e-06, + "loss": 0.2239, + "step": 25150 + }, + { + "epoch": 0.99, + "grad_norm": 5.957670491347016, + "learning_rate": 4.9746872909699e-06, + "loss": 0.2482, + "step": 25175 + }, + { + "epoch": 0.99, + "grad_norm": 4.8279149756498105, + "learning_rate": 4.974645484949833e-06, + "loss": 0.241, + "step": 25200 + }, + { + "epoch": 0.99, + "grad_norm": 5.200294070704066, + "learning_rate": 4.974605351170569e-06, + "loss": 0.242, + "step": 25225 + }, + { + "epoch": 0.99, + "grad_norm": 4.670126129827119, + "learning_rate": 4.974563545150502e-06, + "loss": 0.2406, + "step": 25250 + }, + { + "epoch": 0.99, + "grad_norm": 5.544898567700344, + "learning_rate": 4.974521739130435e-06, + "loss": 0.2338, + "step": 25275 + }, + { + "epoch": 1.0, + "grad_norm": 4.806206312623487, + "learning_rate": 4.974479933110368e-06, + "loss": 0.2265, + "step": 25300 + }, + { + "epoch": 1.0, + "grad_norm": 5.53793647458314, + "learning_rate": 4.9744381270903014e-06, + "loss": 0.2345, + "step": 25325 + }, + { + "epoch": 1.0, + "grad_norm": 5.608502420190892, + "learning_rate": 4.974396321070234e-06, + "loss": 0.2385, + "step": 25350 + }, + { + "epoch": 1.0, + "grad_norm": 5.1286787349789815, + "learning_rate": 4.974354515050168e-06, + "loss": 0.2453, + "step": 25375 + }, + { + "epoch": 1.0, + "grad_norm": 5.469570857307241, + "learning_rate": 4.9743127090301e-06, + "loss": 0.2294, + "step": 25400 + }, + { + "epoch": 1.0, + "grad_norm": 1.3534162143803563, + "learning_rate": 4.974270903010034e-06, + "loss": 0.2361, + "step": 25425 + }, + { + "epoch": 1.0, + "grad_norm": 1.706718780670229, + "learning_rate": 4.974229096989967e-06, + "loss": 0.2079, + "step": 25450 + }, + { + "epoch": 1.0, + "grad_norm": 1.4059544274203903, + "learning_rate": 4.9741872909699e-06, + "loss": 0.2031, + "step": 25475 + }, + { + "epoch": 1.0, + "grad_norm": 1.7037901786451766, + "learning_rate": 4.974145484949834e-06, + "loss": 0.2252, + "step": 25500 + }, + { + "epoch": 1.0, + "grad_norm": 1.3558198308855107, + "learning_rate": 4.974103678929766e-06, + "loss": 0.2154, + "step": 25525 + }, + { + "epoch": 1.01, + "grad_norm": 1.6682224121680143, + "learning_rate": 4.9740618729097e-06, + "loss": 0.2129, + "step": 25550 + }, + { + "epoch": 1.01, + "grad_norm": 1.4577637750260048, + "learning_rate": 4.974020066889633e-06, + "loss": 0.2024, + "step": 25575 + }, + { + "epoch": 1.01, + "grad_norm": 1.5087352378433831, + "learning_rate": 4.973978260869566e-06, + "loss": 0.2109, + "step": 25600 + }, + { + "epoch": 1.01, + "grad_norm": 1.6402724021534647, + "learning_rate": 4.973936454849499e-06, + "loss": 0.212, + "step": 25625 + }, + { + "epoch": 1.01, + "grad_norm": 1.9264263538960484, + "learning_rate": 4.9738946488294324e-06, + "loss": 0.2143, + "step": 25650 + }, + { + "epoch": 1.01, + "grad_norm": 1.9174315090045257, + "learning_rate": 4.973852842809365e-06, + "loss": 0.2117, + "step": 25675 + }, + { + "epoch": 1.01, + "grad_norm": 1.8429762901411832, + "learning_rate": 4.973811036789299e-06, + "loss": 0.2059, + "step": 25700 + }, + { + "epoch": 1.01, + "grad_norm": 1.5033416778790167, + "learning_rate": 4.9737692307692305e-06, + "loss": 0.182, + "step": 25725 + }, + { + "epoch": 1.01, + "grad_norm": 1.330321894521951, + "learning_rate": 4.973727424749164e-06, + "loss": 0.1881, + "step": 25750 + }, + { + "epoch": 1.01, + "grad_norm": 1.8292089071205324, + "learning_rate": 4.973685618729097e-06, + "loss": 0.1975, + "step": 25775 + }, + { + "epoch": 1.01, + "grad_norm": 1.2598690654621236, + "learning_rate": 4.97364381270903e-06, + "loss": 0.2053, + "step": 25800 + }, + { + "epoch": 1.02, + "grad_norm": 1.435229607084148, + "learning_rate": 4.973602006688963e-06, + "loss": 0.2169, + "step": 25825 + }, + { + "epoch": 1.02, + "grad_norm": 2.1185880545936135, + "learning_rate": 4.973560200668897e-06, + "loss": 0.1958, + "step": 25850 + }, + { + "epoch": 1.02, + "grad_norm": 1.3672188771049028, + "learning_rate": 4.973518394648829e-06, + "loss": 0.2104, + "step": 25875 + }, + { + "epoch": 1.02, + "grad_norm": 1.863357164049792, + "learning_rate": 4.973476588628763e-06, + "loss": 0.1857, + "step": 25900 + }, + { + "epoch": 1.02, + "grad_norm": 1.9990414028836165, + "learning_rate": 4.973434782608696e-06, + "loss": 0.1921, + "step": 25925 + }, + { + "epoch": 1.02, + "grad_norm": 1.3154262807639836, + "learning_rate": 4.973392976588629e-06, + "loss": 0.2266, + "step": 25950 + }, + { + "epoch": 1.02, + "grad_norm": 1.3950612420595643, + "learning_rate": 4.973351170568563e-06, + "loss": 0.2064, + "step": 25975 + }, + { + "epoch": 1.02, + "grad_norm": 1.408288078068823, + "learning_rate": 4.973309364548495e-06, + "loss": 0.2071, + "step": 26000 + }, + { + "epoch": 1.02, + "grad_norm": 1.9525039250801453, + "learning_rate": 4.973267558528429e-06, + "loss": 0.2118, + "step": 26025 + }, + { + "epoch": 1.02, + "grad_norm": 1.8278710821331015, + "learning_rate": 4.9732257525083616e-06, + "loss": 0.2262, + "step": 26050 + }, + { + "epoch": 1.03, + "grad_norm": 1.6148367474628422, + "learning_rate": 4.973183946488295e-06, + "loss": 0.1902, + "step": 26075 + }, + { + "epoch": 1.03, + "grad_norm": 1.5038758535892578, + "learning_rate": 4.973142140468228e-06, + "loss": 0.2051, + "step": 26100 + }, + { + "epoch": 1.03, + "grad_norm": 1.55344960183899, + "learning_rate": 4.973100334448161e-06, + "loss": 0.2026, + "step": 26125 + }, + { + "epoch": 1.03, + "grad_norm": 2.1894008457226986, + "learning_rate": 4.973058528428094e-06, + "loss": 0.2085, + "step": 26150 + }, + { + "epoch": 1.03, + "grad_norm": 1.585075493380135, + "learning_rate": 4.973016722408028e-06, + "loss": 0.2036, + "step": 26175 + }, + { + "epoch": 1.03, + "grad_norm": 1.8233869368121183, + "learning_rate": 4.97297491638796e-06, + "loss": 0.2235, + "step": 26200 + }, + { + "epoch": 1.03, + "grad_norm": 1.8237982393202128, + "learning_rate": 4.972934782608696e-06, + "loss": 0.2225, + "step": 26225 + }, + { + "epoch": 1.03, + "grad_norm": 1.870096249638426, + "learning_rate": 4.972892976588629e-06, + "loss": 0.2277, + "step": 26250 + }, + { + "epoch": 1.03, + "grad_norm": 2.084326368520519, + "learning_rate": 4.972851170568562e-06, + "loss": 0.2166, + "step": 26275 + }, + { + "epoch": 1.03, + "grad_norm": 1.786309173531602, + "learning_rate": 4.972809364548495e-06, + "loss": 0.1952, + "step": 26300 + }, + { + "epoch": 1.04, + "grad_norm": 2.118377964073037, + "learning_rate": 4.972767558528429e-06, + "loss": 0.2035, + "step": 26325 + }, + { + "epoch": 1.04, + "grad_norm": 1.4408933539946762, + "learning_rate": 4.972725752508361e-06, + "loss": 0.2101, + "step": 26350 + }, + { + "epoch": 1.04, + "grad_norm": 1.5247018069177714, + "learning_rate": 4.972683946488295e-06, + "loss": 0.2146, + "step": 26375 + }, + { + "epoch": 1.04, + "grad_norm": 1.4099242258025098, + "learning_rate": 4.9726421404682275e-06, + "loss": 0.2125, + "step": 26400 + }, + { + "epoch": 1.04, + "grad_norm": 1.385534734677316, + "learning_rate": 4.972600334448161e-06, + "loss": 0.2031, + "step": 26425 + }, + { + "epoch": 1.04, + "grad_norm": 1.3518252038997889, + "learning_rate": 4.972558528428094e-06, + "loss": 0.2054, + "step": 26450 + }, + { + "epoch": 1.04, + "grad_norm": 1.7923556960029041, + "learning_rate": 4.972516722408027e-06, + "loss": 0.2048, + "step": 26475 + }, + { + "epoch": 1.04, + "grad_norm": 1.6394896847363243, + "learning_rate": 4.97247491638796e-06, + "loss": 0.2172, + "step": 26500 + }, + { + "epoch": 1.04, + "grad_norm": 2.9050366746717975, + "learning_rate": 4.972433110367894e-06, + "loss": 0.2072, + "step": 26525 + }, + { + "epoch": 1.04, + "grad_norm": 1.4611510590944903, + "learning_rate": 4.972391304347826e-06, + "loss": 0.2026, + "step": 26550 + }, + { + "epoch": 1.05, + "grad_norm": 1.757163035064764, + "learning_rate": 4.97234949832776e-06, + "loss": 0.2169, + "step": 26575 + }, + { + "epoch": 1.05, + "grad_norm": 1.2821175785153787, + "learning_rate": 4.9723076923076925e-06, + "loss": 0.2181, + "step": 26600 + }, + { + "epoch": 1.05, + "grad_norm": 1.4184262153912761, + "learning_rate": 4.972265886287626e-06, + "loss": 0.1958, + "step": 26625 + }, + { + "epoch": 1.05, + "grad_norm": 1.7602753726816016, + "learning_rate": 4.97222408026756e-06, + "loss": 0.2009, + "step": 26650 + }, + { + "epoch": 1.05, + "grad_norm": 2.5974760210540015, + "learning_rate": 4.9721822742474915e-06, + "loss": 0.2243, + "step": 26675 + }, + { + "epoch": 1.05, + "grad_norm": 1.645642258789146, + "learning_rate": 4.972140468227425e-06, + "loss": 0.2038, + "step": 26700 + }, + { + "epoch": 1.05, + "grad_norm": 1.4401284677266035, + "learning_rate": 4.972098662207358e-06, + "loss": 0.1898, + "step": 26725 + }, + { + "epoch": 1.05, + "grad_norm": 1.430617922584249, + "learning_rate": 4.972056856187291e-06, + "loss": 0.2111, + "step": 26750 + }, + { + "epoch": 1.05, + "grad_norm": 1.7904074848447293, + "learning_rate": 4.972015050167224e-06, + "loss": 0.2129, + "step": 26775 + }, + { + "epoch": 1.05, + "grad_norm": 2.406346980994871, + "learning_rate": 4.9719732441471575e-06, + "loss": 0.2045, + "step": 26800 + }, + { + "epoch": 1.06, + "grad_norm": 2.2583526864827905, + "learning_rate": 4.97193143812709e-06, + "loss": 0.2266, + "step": 26825 + }, + { + "epoch": 1.06, + "grad_norm": 1.7655942648716196, + "learning_rate": 4.971889632107024e-06, + "loss": 0.1951, + "step": 26850 + }, + { + "epoch": 1.06, + "grad_norm": 1.467109180429437, + "learning_rate": 4.9718478260869564e-06, + "loss": 0.2284, + "step": 26875 + }, + { + "epoch": 1.06, + "grad_norm": 1.7516265119310985, + "learning_rate": 4.97180602006689e-06, + "loss": 0.2234, + "step": 26900 + }, + { + "epoch": 1.06, + "grad_norm": 1.5583524171970238, + "learning_rate": 4.971764214046823e-06, + "loss": 0.2205, + "step": 26925 + }, + { + "epoch": 1.06, + "grad_norm": 1.5551307670089498, + "learning_rate": 4.971722408026756e-06, + "loss": 0.2013, + "step": 26950 + }, + { + "epoch": 1.06, + "grad_norm": 1.7858242769151977, + "learning_rate": 4.971680602006689e-06, + "loss": 0.2037, + "step": 26975 + }, + { + "epoch": 1.06, + "grad_norm": 1.6377397935158822, + "learning_rate": 4.9716387959866225e-06, + "loss": 0.2002, + "step": 27000 + }, + { + "epoch": 1.06, + "grad_norm": 1.9240347144018508, + "learning_rate": 4.971596989966555e-06, + "loss": 0.2101, + "step": 27025 + }, + { + "epoch": 1.06, + "grad_norm": 1.9628276445796573, + "learning_rate": 4.971555183946489e-06, + "loss": 0.2132, + "step": 27050 + }, + { + "epoch": 1.07, + "grad_norm": 0.9744874761939027, + "learning_rate": 4.971513377926422e-06, + "loss": 0.2101, + "step": 27075 + }, + { + "epoch": 1.07, + "grad_norm": 1.6399308156848618, + "learning_rate": 4.971471571906355e-06, + "loss": 0.2181, + "step": 27100 + }, + { + "epoch": 1.07, + "grad_norm": 1.1026445818397668, + "learning_rate": 4.9714297658862885e-06, + "loss": 0.197, + "step": 27125 + }, + { + "epoch": 1.07, + "grad_norm": 2.2558271888800925, + "learning_rate": 4.971387959866221e-06, + "loss": 0.2128, + "step": 27150 + }, + { + "epoch": 1.07, + "grad_norm": 1.8603882142931765, + "learning_rate": 4.971346153846155e-06, + "loss": 0.1947, + "step": 27175 + }, + { + "epoch": 1.07, + "grad_norm": 1.7503812536338823, + "learning_rate": 4.9713043478260875e-06, + "loss": 0.1932, + "step": 27200 + }, + { + "epoch": 1.07, + "grad_norm": 1.567055034127128, + "learning_rate": 4.971264214046823e-06, + "loss": 0.1959, + "step": 27225 + }, + { + "epoch": 1.07, + "grad_norm": 2.133487976541747, + "learning_rate": 4.971222408026756e-06, + "loss": 0.2218, + "step": 27250 + }, + { + "epoch": 1.07, + "grad_norm": 1.7226015136938018, + "learning_rate": 4.9711806020066895e-06, + "loss": 0.2246, + "step": 27275 + }, + { + "epoch": 1.07, + "grad_norm": 1.5324982024611993, + "learning_rate": 4.971138795986622e-06, + "loss": 0.2249, + "step": 27300 + }, + { + "epoch": 1.07, + "grad_norm": 1.2666638439290532, + "learning_rate": 4.971096989966556e-06, + "loss": 0.2023, + "step": 27325 + }, + { + "epoch": 1.08, + "grad_norm": 1.7936378450293402, + "learning_rate": 4.9710551839464885e-06, + "loss": 0.2209, + "step": 27350 + }, + { + "epoch": 1.08, + "grad_norm": 1.4685172845320271, + "learning_rate": 4.971013377926422e-06, + "loss": 0.212, + "step": 27375 + }, + { + "epoch": 1.08, + "grad_norm": 1.3475774524686015, + "learning_rate": 4.970971571906355e-06, + "loss": 0.2038, + "step": 27400 + }, + { + "epoch": 1.08, + "grad_norm": 1.1356769724282703, + "learning_rate": 4.970929765886288e-06, + "loss": 0.2052, + "step": 27425 + }, + { + "epoch": 1.08, + "grad_norm": 2.3461405690608945, + "learning_rate": 4.970887959866221e-06, + "loss": 0.2077, + "step": 27450 + }, + { + "epoch": 1.08, + "grad_norm": 1.2787743115282497, + "learning_rate": 4.9708461538461545e-06, + "loss": 0.1933, + "step": 27475 + }, + { + "epoch": 1.08, + "grad_norm": 1.710480020197042, + "learning_rate": 4.970804347826087e-06, + "loss": 0.1945, + "step": 27500 + }, + { + "epoch": 1.08, + "grad_norm": 1.74469357398459, + "learning_rate": 4.970762541806021e-06, + "loss": 0.2212, + "step": 27525 + }, + { + "epoch": 1.08, + "grad_norm": 1.553720300641038, + "learning_rate": 4.9707207357859535e-06, + "loss": 0.2071, + "step": 27550 + }, + { + "epoch": 1.08, + "grad_norm": 1.5925560178851168, + "learning_rate": 4.970678929765887e-06, + "loss": 0.2082, + "step": 27575 + }, + { + "epoch": 1.09, + "grad_norm": 2.1786027786262885, + "learning_rate": 4.97063712374582e-06, + "loss": 0.2185, + "step": 27600 + }, + { + "epoch": 1.09, + "grad_norm": 1.7290929692337984, + "learning_rate": 4.970595317725752e-06, + "loss": 0.1983, + "step": 27625 + }, + { + "epoch": 1.09, + "grad_norm": 1.9473623643856743, + "learning_rate": 4.970553511705686e-06, + "loss": 0.1961, + "step": 27650 + }, + { + "epoch": 1.09, + "grad_norm": 1.4076384394311243, + "learning_rate": 4.970511705685619e-06, + "loss": 0.197, + "step": 27675 + }, + { + "epoch": 1.09, + "grad_norm": 1.9993400148837797, + "learning_rate": 4.970469899665552e-06, + "loss": 0.1954, + "step": 27700 + }, + { + "epoch": 1.09, + "grad_norm": 1.271427684128234, + "learning_rate": 4.970428093645485e-06, + "loss": 0.2048, + "step": 27725 + }, + { + "epoch": 1.09, + "grad_norm": 2.591987460804486, + "learning_rate": 4.9703862876254184e-06, + "loss": 0.1951, + "step": 27750 + }, + { + "epoch": 1.09, + "grad_norm": 1.5791865767097701, + "learning_rate": 4.970344481605351e-06, + "loss": 0.2057, + "step": 27775 + }, + { + "epoch": 1.09, + "grad_norm": 2.1896274986314004, + "learning_rate": 4.970302675585285e-06, + "loss": 0.2098, + "step": 27800 + }, + { + "epoch": 1.09, + "grad_norm": 1.6903447291808138, + "learning_rate": 4.970260869565217e-06, + "loss": 0.1987, + "step": 27825 + }, + { + "epoch": 1.1, + "grad_norm": 1.490242793429607, + "learning_rate": 4.970219063545151e-06, + "loss": 0.1942, + "step": 27850 + }, + { + "epoch": 1.1, + "grad_norm": 1.2304358371703357, + "learning_rate": 4.970177257525084e-06, + "loss": 0.2152, + "step": 27875 + }, + { + "epoch": 1.1, + "grad_norm": 1.890308908845425, + "learning_rate": 4.970135451505017e-06, + "loss": 0.2066, + "step": 27900 + }, + { + "epoch": 1.1, + "grad_norm": 1.5472152303564, + "learning_rate": 4.97009364548495e-06, + "loss": 0.2295, + "step": 27925 + }, + { + "epoch": 1.1, + "grad_norm": 1.8402540061996924, + "learning_rate": 4.970051839464883e-06, + "loss": 0.2103, + "step": 27950 + }, + { + "epoch": 1.1, + "grad_norm": 1.5521269697737237, + "learning_rate": 4.970010033444816e-06, + "loss": 0.1894, + "step": 27975 + }, + { + "epoch": 1.1, + "grad_norm": 1.1789685652154698, + "learning_rate": 4.96996822742475e-06, + "loss": 0.1998, + "step": 28000 + }, + { + "epoch": 1.1, + "grad_norm": 1.971887880644852, + "learning_rate": 4.969926421404682e-06, + "loss": 0.2217, + "step": 28025 + }, + { + "epoch": 1.1, + "grad_norm": 1.6583518083353077, + "learning_rate": 4.969884615384616e-06, + "loss": 0.2077, + "step": 28050 + }, + { + "epoch": 1.1, + "grad_norm": 1.2231691799156768, + "learning_rate": 4.969842809364549e-06, + "loss": 0.2074, + "step": 28075 + }, + { + "epoch": 1.11, + "grad_norm": 1.4124682639555552, + "learning_rate": 4.969801003344482e-06, + "loss": 0.2002, + "step": 28100 + }, + { + "epoch": 1.11, + "grad_norm": 1.5315963028247968, + "learning_rate": 4.969759197324415e-06, + "loss": 0.2116, + "step": 28125 + }, + { + "epoch": 1.11, + "grad_norm": 0.774063539155429, + "learning_rate": 4.969717391304348e-06, + "loss": 0.2003, + "step": 28150 + }, + { + "epoch": 1.11, + "grad_norm": 1.3548270043032278, + "learning_rate": 4.969675585284282e-06, + "loss": 0.2266, + "step": 28175 + }, + { + "epoch": 1.11, + "grad_norm": 1.5113091523893083, + "learning_rate": 4.969633779264215e-06, + "loss": 0.2182, + "step": 28200 + }, + { + "epoch": 1.11, + "grad_norm": 0.9251487332267831, + "learning_rate": 4.9695936454849505e-06, + "loss": 0.2202, + "step": 28225 + }, + { + "epoch": 1.11, + "grad_norm": 1.8324301345828689, + "learning_rate": 4.969551839464883e-06, + "loss": 0.2255, + "step": 28250 + }, + { + "epoch": 1.11, + "grad_norm": 1.4775932179452143, + "learning_rate": 4.969510033444817e-06, + "loss": 0.1994, + "step": 28275 + }, + { + "epoch": 1.11, + "grad_norm": 1.6333919121168157, + "learning_rate": 4.969468227424749e-06, + "loss": 0.2081, + "step": 28300 + }, + { + "epoch": 1.11, + "grad_norm": 0.864925684984239, + "learning_rate": 4.969426421404683e-06, + "loss": 0.1797, + "step": 28325 + }, + { + "epoch": 1.12, + "grad_norm": 1.9096612346707407, + "learning_rate": 4.969384615384616e-06, + "loss": 0.206, + "step": 28350 + }, + { + "epoch": 1.12, + "grad_norm": 1.9576478394690897, + "learning_rate": 4.969342809364549e-06, + "loss": 0.2197, + "step": 28375 + }, + { + "epoch": 1.12, + "grad_norm": 1.5136863296140723, + "learning_rate": 4.969301003344482e-06, + "loss": 0.199, + "step": 28400 + }, + { + "epoch": 1.12, + "grad_norm": 1.2010364526097577, + "learning_rate": 4.9692591973244154e-06, + "loss": 0.2107, + "step": 28425 + }, + { + "epoch": 1.12, + "grad_norm": 1.407440767715517, + "learning_rate": 4.969217391304348e-06, + "loss": 0.1996, + "step": 28450 + }, + { + "epoch": 1.12, + "grad_norm": 2.037763973567924, + "learning_rate": 4.969175585284282e-06, + "loss": 0.1935, + "step": 28475 + }, + { + "epoch": 1.12, + "grad_norm": 1.7243218825000752, + "learning_rate": 4.969133779264214e-06, + "loss": 0.2144, + "step": 28500 + }, + { + "epoch": 1.12, + "grad_norm": 1.973852579552326, + "learning_rate": 4.969091973244148e-06, + "loss": 0.1863, + "step": 28525 + }, + { + "epoch": 1.12, + "grad_norm": 1.5798095723052332, + "learning_rate": 4.969050167224081e-06, + "loss": 0.2177, + "step": 28550 + }, + { + "epoch": 1.12, + "grad_norm": 1.4624355487653509, + "learning_rate": 4.969008361204013e-06, + "loss": 0.2158, + "step": 28575 + }, + { + "epoch": 1.13, + "grad_norm": 1.2634512898863046, + "learning_rate": 4.968966555183947e-06, + "loss": 0.1805, + "step": 28600 + }, + { + "epoch": 1.13, + "grad_norm": 1.292016189175459, + "learning_rate": 4.9689247491638796e-06, + "loss": 0.1972, + "step": 28625 + }, + { + "epoch": 1.13, + "grad_norm": 1.887658715804907, + "learning_rate": 4.968882943143813e-06, + "loss": 0.1994, + "step": 28650 + }, + { + "epoch": 1.13, + "grad_norm": 1.5728950499609098, + "learning_rate": 4.968841137123746e-06, + "loss": 0.2084, + "step": 28675 + }, + { + "epoch": 1.13, + "grad_norm": 1.5170270635605654, + "learning_rate": 4.968799331103679e-06, + "loss": 0.1836, + "step": 28700 + }, + { + "epoch": 1.13, + "grad_norm": 1.182392469384069, + "learning_rate": 4.968757525083612e-06, + "loss": 0.1927, + "step": 28725 + }, + { + "epoch": 1.13, + "grad_norm": 1.467566207323012, + "learning_rate": 4.968715719063546e-06, + "loss": 0.211, + "step": 28750 + }, + { + "epoch": 1.13, + "grad_norm": 1.272554103873533, + "learning_rate": 4.968673913043478e-06, + "loss": 0.2185, + "step": 28775 + }, + { + "epoch": 1.13, + "grad_norm": 1.8133824165206656, + "learning_rate": 4.968632107023412e-06, + "loss": 0.2006, + "step": 28800 + }, + { + "epoch": 1.13, + "grad_norm": 1.1311871034158731, + "learning_rate": 4.9685903010033445e-06, + "loss": 0.2058, + "step": 28825 + }, + { + "epoch": 1.13, + "grad_norm": 1.5883722748526226, + "learning_rate": 4.968548494983278e-06, + "loss": 0.2064, + "step": 28850 + }, + { + "epoch": 1.14, + "grad_norm": 2.5904261682283662, + "learning_rate": 4.968506688963211e-06, + "loss": 0.2024, + "step": 28875 + }, + { + "epoch": 1.14, + "grad_norm": 1.1478828955712104, + "learning_rate": 4.968464882943144e-06, + "loss": 0.235, + "step": 28900 + }, + { + "epoch": 1.14, + "grad_norm": 1.630161077340849, + "learning_rate": 4.968423076923077e-06, + "loss": 0.1974, + "step": 28925 + }, + { + "epoch": 1.14, + "grad_norm": 1.8662904772136388, + "learning_rate": 4.9683812709030106e-06, + "loss": 0.2017, + "step": 28950 + }, + { + "epoch": 1.14, + "grad_norm": 1.8506149538880028, + "learning_rate": 4.968339464882943e-06, + "loss": 0.2079, + "step": 28975 + }, + { + "epoch": 1.14, + "grad_norm": 1.7198221954597992, + "learning_rate": 4.968297658862877e-06, + "loss": 0.2012, + "step": 29000 + }, + { + "epoch": 1.14, + "grad_norm": 2.2468499981047616, + "learning_rate": 4.9682558528428095e-06, + "loss": 0.2241, + "step": 29025 + }, + { + "epoch": 1.14, + "grad_norm": 1.871449485888433, + "learning_rate": 4.968214046822743e-06, + "loss": 0.2259, + "step": 29050 + }, + { + "epoch": 1.14, + "grad_norm": 1.2013415537996155, + "learning_rate": 4.968172240802676e-06, + "loss": 0.1966, + "step": 29075 + }, + { + "epoch": 1.14, + "grad_norm": 1.8646445635350821, + "learning_rate": 4.968130434782609e-06, + "loss": 0.2081, + "step": 29100 + }, + { + "epoch": 1.15, + "grad_norm": 1.7362617133719327, + "learning_rate": 4.968088628762542e-06, + "loss": 0.2033, + "step": 29125 + }, + { + "epoch": 1.15, + "grad_norm": 1.9583229334368888, + "learning_rate": 4.9680468227424756e-06, + "loss": 0.2195, + "step": 29150 + }, + { + "epoch": 1.15, + "grad_norm": 1.9756496260469272, + "learning_rate": 4.968005016722408e-06, + "loss": 0.2138, + "step": 29175 + }, + { + "epoch": 1.15, + "grad_norm": 1.8136961850820263, + "learning_rate": 4.967963210702342e-06, + "loss": 0.1989, + "step": 29200 + }, + { + "epoch": 1.15, + "grad_norm": 0.8737358384053303, + "learning_rate": 4.9679214046822745e-06, + "loss": 0.1914, + "step": 29225 + }, + { + "epoch": 1.15, + "grad_norm": 1.8604986074744494, + "learning_rate": 4.96788127090301e-06, + "loss": 0.2203, + "step": 29250 + }, + { + "epoch": 1.15, + "grad_norm": 1.349940979840139, + "learning_rate": 4.967839464882944e-06, + "loss": 0.2278, + "step": 29275 + }, + { + "epoch": 1.15, + "grad_norm": 1.8282393434449213, + "learning_rate": 4.9677976588628766e-06, + "loss": 0.2236, + "step": 29300 + }, + { + "epoch": 1.15, + "grad_norm": 2.5180697118545403, + "learning_rate": 4.96775585284281e-06, + "loss": 0.2302, + "step": 29325 + }, + { + "epoch": 1.15, + "grad_norm": 1.8114203235294122, + "learning_rate": 4.967714046822743e-06, + "loss": 0.1866, + "step": 29350 + }, + { + "epoch": 1.16, + "grad_norm": 1.3440834890531017, + "learning_rate": 4.967672240802676e-06, + "loss": 0.2201, + "step": 29375 + }, + { + "epoch": 1.16, + "grad_norm": 2.3107235122075616, + "learning_rate": 4.967630434782609e-06, + "loss": 0.2028, + "step": 29400 + }, + { + "epoch": 1.16, + "grad_norm": 1.329479003913603, + "learning_rate": 4.967588628762543e-06, + "loss": 0.1991, + "step": 29425 + }, + { + "epoch": 1.16, + "grad_norm": 1.7727406488523492, + "learning_rate": 4.967546822742475e-06, + "loss": 0.2009, + "step": 29450 + }, + { + "epoch": 1.16, + "grad_norm": 1.6058323726648749, + "learning_rate": 4.967505016722409e-06, + "loss": 0.2118, + "step": 29475 + }, + { + "epoch": 1.16, + "grad_norm": 1.2674018423135895, + "learning_rate": 4.9674632107023415e-06, + "loss": 0.2054, + "step": 29500 + }, + { + "epoch": 1.16, + "grad_norm": 1.7825070235459493, + "learning_rate": 4.967421404682274e-06, + "loss": 0.2254, + "step": 29525 + }, + { + "epoch": 1.16, + "grad_norm": 2.043836906677806, + "learning_rate": 4.967379598662208e-06, + "loss": 0.2188, + "step": 29550 + }, + { + "epoch": 1.16, + "grad_norm": 2.0876415569608375, + "learning_rate": 4.9673377926421405e-06, + "loss": 0.2041, + "step": 29575 + }, + { + "epoch": 1.16, + "grad_norm": 1.804802164781188, + "learning_rate": 4.967295986622074e-06, + "loss": 0.1919, + "step": 29600 + }, + { + "epoch": 1.17, + "grad_norm": 2.183654886861711, + "learning_rate": 4.967254180602007e-06, + "loss": 0.1988, + "step": 29625 + }, + { + "epoch": 1.17, + "grad_norm": 1.8097074308078962, + "learning_rate": 4.96721237458194e-06, + "loss": 0.2106, + "step": 29650 + }, + { + "epoch": 1.17, + "grad_norm": 1.8124071979254552, + "learning_rate": 4.967170568561873e-06, + "loss": 0.2138, + "step": 29675 + }, + { + "epoch": 1.17, + "grad_norm": 1.4488631236080394, + "learning_rate": 4.9671287625418065e-06, + "loss": 0.2091, + "step": 29700 + }, + { + "epoch": 1.17, + "grad_norm": 1.3220655136855233, + "learning_rate": 4.967086956521739e-06, + "loss": 0.2245, + "step": 29725 + }, + { + "epoch": 1.17, + "grad_norm": 1.5358184563094297, + "learning_rate": 4.967045150501673e-06, + "loss": 0.1831, + "step": 29750 + }, + { + "epoch": 1.17, + "grad_norm": 1.5721622318412813, + "learning_rate": 4.9670033444816055e-06, + "loss": 0.2164, + "step": 29775 + }, + { + "epoch": 1.17, + "grad_norm": 1.3569247099530573, + "learning_rate": 4.966961538461539e-06, + "loss": 0.2083, + "step": 29800 + }, + { + "epoch": 1.17, + "grad_norm": 1.919524305810182, + "learning_rate": 4.966919732441472e-06, + "loss": 0.2012, + "step": 29825 + }, + { + "epoch": 1.17, + "grad_norm": 1.6943687241906875, + "learning_rate": 4.966877926421405e-06, + "loss": 0.223, + "step": 29850 + }, + { + "epoch": 1.18, + "grad_norm": 1.6313740896270725, + "learning_rate": 4.966836120401338e-06, + "loss": 0.2194, + "step": 29875 + }, + { + "epoch": 1.18, + "grad_norm": 1.6895490084997424, + "learning_rate": 4.9667943143812715e-06, + "loss": 0.2077, + "step": 29900 + }, + { + "epoch": 1.18, + "grad_norm": 1.601228861172109, + "learning_rate": 4.966752508361204e-06, + "loss": 0.1814, + "step": 29925 + }, + { + "epoch": 1.18, + "grad_norm": 2.219366909148872, + "learning_rate": 4.966710702341138e-06, + "loss": 0.2117, + "step": 29950 + }, + { + "epoch": 1.18, + "grad_norm": 1.2499897794948742, + "learning_rate": 4.9666688963210704e-06, + "loss": 0.1791, + "step": 29975 + }, + { + "epoch": 1.18, + "grad_norm": 1.3071238159646315, + "learning_rate": 4.966627090301004e-06, + "loss": 0.2244, + "step": 30000 + }, + { + "epoch": 1.18, + "eval_loss": 0.5361328125, + "eval_runtime": 11518.7802, + "eval_samples_per_second": 0.822, + "eval_steps_per_second": 0.051, + "eval_wer": 0.12534787574602255, + "step": 30000 + }, + { + "epoch": 1.18, + "grad_norm": 1.7823343242326686, + "learning_rate": 4.966585284280937e-06, + "loss": 0.2065, + "step": 30025 + }, + { + "epoch": 1.18, + "grad_norm": 1.5221486291926747, + "learning_rate": 4.96654347826087e-06, + "loss": 0.1987, + "step": 30050 + }, + { + "epoch": 1.18, + "grad_norm": 1.319001734849566, + "learning_rate": 4.966501672240803e-06, + "loss": 0.1983, + "step": 30075 + }, + { + "epoch": 1.18, + "grad_norm": 1.7989713632094604, + "learning_rate": 4.9664598662207365e-06, + "loss": 0.2139, + "step": 30100 + }, + { + "epoch": 1.19, + "grad_norm": 1.6123524206023738, + "learning_rate": 4.966418060200669e-06, + "loss": 0.2362, + "step": 30125 + }, + { + "epoch": 1.19, + "grad_norm": 1.1978865463050377, + "learning_rate": 4.966376254180603e-06, + "loss": 0.2037, + "step": 30150 + }, + { + "epoch": 1.19, + "grad_norm": 1.3585992261357787, + "learning_rate": 4.966334448160535e-06, + "loss": 0.2092, + "step": 30175 + }, + { + "epoch": 1.19, + "grad_norm": 1.5811291808324892, + "learning_rate": 4.966292642140468e-06, + "loss": 0.195, + "step": 30200 + }, + { + "epoch": 1.19, + "grad_norm": 1.6853682024289205, + "learning_rate": 4.966250836120402e-06, + "loss": 0.2238, + "step": 30225 + }, + { + "epoch": 1.19, + "grad_norm": 1.7722744670231796, + "learning_rate": 4.9662107023411375e-06, + "loss": 0.2075, + "step": 30250 + }, + { + "epoch": 1.19, + "grad_norm": 1.881184080308395, + "learning_rate": 4.966168896321071e-06, + "loss": 0.1813, + "step": 30275 + }, + { + "epoch": 1.19, + "grad_norm": 1.0946512002383884, + "learning_rate": 4.966127090301004e-06, + "loss": 0.2055, + "step": 30300 + }, + { + "epoch": 1.19, + "grad_norm": 1.3313241329178456, + "learning_rate": 4.966085284280937e-06, + "loss": 0.2141, + "step": 30325 + }, + { + "epoch": 1.19, + "grad_norm": 1.3531092149433521, + "learning_rate": 4.96604347826087e-06, + "loss": 0.1969, + "step": 30350 + }, + { + "epoch": 1.19, + "grad_norm": 1.9444146528281916, + "learning_rate": 4.9660016722408035e-06, + "loss": 0.2217, + "step": 30375 + }, + { + "epoch": 1.2, + "grad_norm": 1.1850621451492553, + "learning_rate": 4.965959866220736e-06, + "loss": 0.2144, + "step": 30400 + }, + { + "epoch": 1.2, + "grad_norm": 1.8003297169728758, + "learning_rate": 4.96591806020067e-06, + "loss": 0.1884, + "step": 30425 + }, + { + "epoch": 1.2, + "grad_norm": 2.301931614004386, + "learning_rate": 4.9658762541806025e-06, + "loss": 0.2166, + "step": 30450 + }, + { + "epoch": 1.2, + "grad_norm": 1.4598479583328665, + "learning_rate": 4.965834448160535e-06, + "loss": 0.2008, + "step": 30475 + }, + { + "epoch": 1.2, + "grad_norm": 1.4011597406764777, + "learning_rate": 4.965792642140469e-06, + "loss": 0.2057, + "step": 30500 + }, + { + "epoch": 1.2, + "grad_norm": 2.0890831441737023, + "learning_rate": 4.965750836120401e-06, + "loss": 0.2002, + "step": 30525 + }, + { + "epoch": 1.2, + "grad_norm": 1.0317922862792932, + "learning_rate": 4.965709030100335e-06, + "loss": 0.2098, + "step": 30550 + }, + { + "epoch": 1.2, + "grad_norm": 2.3339129809340755, + "learning_rate": 4.965667224080268e-06, + "loss": 0.2073, + "step": 30575 + }, + { + "epoch": 1.2, + "grad_norm": 1.3910047992276036, + "learning_rate": 4.965625418060201e-06, + "loss": 0.2241, + "step": 30600 + }, + { + "epoch": 1.2, + "grad_norm": 1.937110026502965, + "learning_rate": 4.965583612040134e-06, + "loss": 0.2048, + "step": 30625 + }, + { + "epoch": 1.21, + "grad_norm": 2.4355058481059215, + "learning_rate": 4.9655418060200674e-06, + "loss": 0.2099, + "step": 30650 + }, + { + "epoch": 1.21, + "grad_norm": 1.5529814044707928, + "learning_rate": 4.9655e-06, + "loss": 0.1749, + "step": 30675 + }, + { + "epoch": 1.21, + "grad_norm": 2.247459234736527, + "learning_rate": 4.965458193979934e-06, + "loss": 0.2111, + "step": 30700 + }, + { + "epoch": 1.21, + "grad_norm": 1.839503804605756, + "learning_rate": 4.965416387959866e-06, + "loss": 0.2074, + "step": 30725 + }, + { + "epoch": 1.21, + "grad_norm": 0.8945095656682283, + "learning_rate": 4.9653745819398e-06, + "loss": 0.1872, + "step": 30750 + }, + { + "epoch": 1.21, + "grad_norm": 1.8914319288406296, + "learning_rate": 4.965332775919733e-06, + "loss": 0.2194, + "step": 30775 + }, + { + "epoch": 1.21, + "grad_norm": 1.8360889606904163, + "learning_rate": 4.965290969899666e-06, + "loss": 0.2243, + "step": 30800 + }, + { + "epoch": 1.21, + "grad_norm": 1.7850880114912815, + "learning_rate": 4.965249163879599e-06, + "loss": 0.2205, + "step": 30825 + }, + { + "epoch": 1.21, + "grad_norm": 1.596884049166259, + "learning_rate": 4.9652073578595324e-06, + "loss": 0.221, + "step": 30850 + }, + { + "epoch": 1.21, + "grad_norm": 1.9903428122299165, + "learning_rate": 4.965165551839465e-06, + "loss": 0.2093, + "step": 30875 + }, + { + "epoch": 1.22, + "grad_norm": 1.5969796027518697, + "learning_rate": 4.965123745819399e-06, + "loss": 0.2066, + "step": 30900 + }, + { + "epoch": 1.22, + "grad_norm": 1.4232465281818925, + "learning_rate": 4.965081939799331e-06, + "loss": 0.1918, + "step": 30925 + }, + { + "epoch": 1.22, + "grad_norm": 1.6069161329849821, + "learning_rate": 4.965040133779265e-06, + "loss": 0.2285, + "step": 30950 + }, + { + "epoch": 1.22, + "grad_norm": 1.2697223955884902, + "learning_rate": 4.964998327759198e-06, + "loss": 0.2016, + "step": 30975 + }, + { + "epoch": 1.22, + "grad_norm": 1.2930314056749161, + "learning_rate": 4.964956521739131e-06, + "loss": 0.2227, + "step": 31000 + }, + { + "epoch": 1.22, + "grad_norm": 1.1212062680858963, + "learning_rate": 4.964914715719064e-06, + "loss": 0.2011, + "step": 31025 + }, + { + "epoch": 1.22, + "grad_norm": 1.9658032132564844, + "learning_rate": 4.964872909698997e-06, + "loss": 0.2285, + "step": 31050 + }, + { + "epoch": 1.22, + "grad_norm": 2.3693017625531883, + "learning_rate": 4.96483110367893e-06, + "loss": 0.216, + "step": 31075 + }, + { + "epoch": 1.22, + "grad_norm": 1.9956647662239217, + "learning_rate": 4.964789297658864e-06, + "loss": 0.2185, + "step": 31100 + }, + { + "epoch": 1.22, + "grad_norm": 1.202282939519141, + "learning_rate": 4.964747491638796e-06, + "loss": 0.2048, + "step": 31125 + }, + { + "epoch": 1.23, + "grad_norm": 1.6666987073505568, + "learning_rate": 4.964705685618729e-06, + "loss": 0.2308, + "step": 31150 + }, + { + "epoch": 1.23, + "grad_norm": 1.9958955477076095, + "learning_rate": 4.964663879598663e-06, + "loss": 0.2031, + "step": 31175 + }, + { + "epoch": 1.23, + "grad_norm": 2.4079118493249374, + "learning_rate": 4.964622073578595e-06, + "loss": 0.207, + "step": 31200 + }, + { + "epoch": 1.23, + "grad_norm": 1.6437168762178533, + "learning_rate": 4.964580267558529e-06, + "loss": 0.2088, + "step": 31225 + }, + { + "epoch": 1.23, + "grad_norm": 2.0817719152582503, + "learning_rate": 4.964540133779265e-06, + "loss": 0.1928, + "step": 31250 + }, + { + "epoch": 1.23, + "grad_norm": 0.9167781534885544, + "learning_rate": 4.964498327759198e-06, + "loss": 0.1914, + "step": 31275 + }, + { + "epoch": 1.23, + "grad_norm": 1.9066328276379372, + "learning_rate": 4.964456521739131e-06, + "loss": 0.2069, + "step": 31300 + }, + { + "epoch": 1.23, + "grad_norm": 1.9776175873759618, + "learning_rate": 4.9644147157190645e-06, + "loss": 0.2121, + "step": 31325 + }, + { + "epoch": 1.23, + "grad_norm": 1.3144155588840976, + "learning_rate": 4.964372909698997e-06, + "loss": 0.1927, + "step": 31350 + }, + { + "epoch": 1.23, + "grad_norm": 1.4071128683786893, + "learning_rate": 4.96433110367893e-06, + "loss": 0.1953, + "step": 31375 + }, + { + "epoch": 1.24, + "grad_norm": 1.0973251877720371, + "learning_rate": 4.964289297658863e-06, + "loss": 0.2166, + "step": 31400 + }, + { + "epoch": 1.24, + "grad_norm": 1.6240742135179362, + "learning_rate": 4.964247491638796e-06, + "loss": 0.2036, + "step": 31425 + }, + { + "epoch": 1.24, + "grad_norm": 1.9782177053431977, + "learning_rate": 4.96420568561873e-06, + "loss": 0.2265, + "step": 31450 + }, + { + "epoch": 1.24, + "grad_norm": 1.5819617657187213, + "learning_rate": 4.964163879598662e-06, + "loss": 0.1893, + "step": 31475 + }, + { + "epoch": 1.24, + "grad_norm": 2.0167586079498867, + "learning_rate": 4.964122073578596e-06, + "loss": 0.2175, + "step": 31500 + }, + { + "epoch": 1.24, + "grad_norm": 1.6880449215025803, + "learning_rate": 4.964080267558529e-06, + "loss": 0.2047, + "step": 31525 + }, + { + "epoch": 1.24, + "grad_norm": 1.3911746414902157, + "learning_rate": 4.964038461538462e-06, + "loss": 0.2083, + "step": 31550 + }, + { + "epoch": 1.24, + "grad_norm": 1.6077306397848325, + "learning_rate": 4.963996655518395e-06, + "loss": 0.2038, + "step": 31575 + }, + { + "epoch": 1.24, + "grad_norm": 0.8547104842544427, + "learning_rate": 4.963954849498328e-06, + "loss": 0.1909, + "step": 31600 + }, + { + "epoch": 1.24, + "grad_norm": 1.2230790698786718, + "learning_rate": 4.963913043478261e-06, + "loss": 0.2144, + "step": 31625 + }, + { + "epoch": 1.25, + "grad_norm": 1.6684024560293576, + "learning_rate": 4.963871237458195e-06, + "loss": 0.2259, + "step": 31650 + }, + { + "epoch": 1.25, + "grad_norm": 1.4888511188162146, + "learning_rate": 4.963829431438127e-06, + "loss": 0.1831, + "step": 31675 + }, + { + "epoch": 1.25, + "grad_norm": 2.078967899199761, + "learning_rate": 4.963787625418061e-06, + "loss": 0.2105, + "step": 31700 + }, + { + "epoch": 1.25, + "grad_norm": 1.510441172072911, + "learning_rate": 4.9637458193979936e-06, + "loss": 0.1981, + "step": 31725 + }, + { + "epoch": 1.25, + "grad_norm": 1.646878944946446, + "learning_rate": 4.963704013377927e-06, + "loss": 0.2097, + "step": 31750 + }, + { + "epoch": 1.25, + "grad_norm": 1.8328540352865563, + "learning_rate": 4.96366220735786e-06, + "loss": 0.1886, + "step": 31775 + }, + { + "epoch": 1.25, + "grad_norm": 1.1374340269320808, + "learning_rate": 4.963620401337793e-06, + "loss": 0.191, + "step": 31800 + }, + { + "epoch": 1.25, + "grad_norm": 1.9988391709765514, + "learning_rate": 4.963578595317726e-06, + "loss": 0.2059, + "step": 31825 + }, + { + "epoch": 1.25, + "grad_norm": 1.1328657514428446, + "learning_rate": 4.96353678929766e-06, + "loss": 0.2285, + "step": 31850 + }, + { + "epoch": 1.25, + "grad_norm": 1.9394059197287883, + "learning_rate": 4.963494983277592e-06, + "loss": 0.2108, + "step": 31875 + }, + { + "epoch": 1.25, + "grad_norm": 1.0461897291306157, + "learning_rate": 4.963453177257526e-06, + "loss": 0.2116, + "step": 31900 + }, + { + "epoch": 1.26, + "grad_norm": 1.3526856502795168, + "learning_rate": 4.9634113712374585e-06, + "loss": 0.2267, + "step": 31925 + }, + { + "epoch": 1.26, + "grad_norm": 1.8917535020913774, + "learning_rate": 4.963369565217392e-06, + "loss": 0.2163, + "step": 31950 + }, + { + "epoch": 1.26, + "grad_norm": 1.4229763247970235, + "learning_rate": 4.963327759197325e-06, + "loss": 0.1974, + "step": 31975 + }, + { + "epoch": 1.26, + "grad_norm": 2.091442768892304, + "learning_rate": 4.963285953177258e-06, + "loss": 0.2062, + "step": 32000 + }, + { + "epoch": 1.26, + "grad_norm": 1.513372953462432, + "learning_rate": 4.963244147157191e-06, + "loss": 0.2136, + "step": 32025 + }, + { + "epoch": 1.26, + "grad_norm": 1.43008542750968, + "learning_rate": 4.9632023411371246e-06, + "loss": 0.2091, + "step": 32050 + }, + { + "epoch": 1.26, + "grad_norm": 2.199026863323161, + "learning_rate": 4.963160535117057e-06, + "loss": 0.2209, + "step": 32075 + }, + { + "epoch": 1.26, + "grad_norm": 1.5908105703435922, + "learning_rate": 4.96311872909699e-06, + "loss": 0.2005, + "step": 32100 + }, + { + "epoch": 1.26, + "grad_norm": 2.4953338699146124, + "learning_rate": 4.9630769230769235e-06, + "loss": 0.2449, + "step": 32125 + }, + { + "epoch": 1.26, + "grad_norm": 1.502883147552933, + "learning_rate": 4.963035117056856e-06, + "loss": 0.2131, + "step": 32150 + }, + { + "epoch": 1.27, + "grad_norm": 1.751577299753483, + "learning_rate": 4.96299331103679e-06, + "loss": 0.217, + "step": 32175 + }, + { + "epoch": 1.27, + "grad_norm": 1.394809862880557, + "learning_rate": 4.9629515050167225e-06, + "loss": 0.2091, + "step": 32200 + }, + { + "epoch": 1.27, + "grad_norm": 2.2060001394612345, + "learning_rate": 4.962909698996656e-06, + "loss": 0.2069, + "step": 32225 + }, + { + "epoch": 1.27, + "grad_norm": 1.9708415738072285, + "learning_rate": 4.962869565217392e-06, + "loss": 0.2026, + "step": 32250 + }, + { + "epoch": 1.27, + "grad_norm": 2.001205087461328, + "learning_rate": 4.962827759197325e-06, + "loss": 0.2032, + "step": 32275 + }, + { + "epoch": 1.27, + "grad_norm": 1.266515112142646, + "learning_rate": 4.962785953177258e-06, + "loss": 0.2161, + "step": 32300 + }, + { + "epoch": 1.27, + "grad_norm": 1.5658738110557742, + "learning_rate": 4.962744147157191e-06, + "loss": 0.196, + "step": 32325 + }, + { + "epoch": 1.27, + "grad_norm": 1.9588945387829928, + "learning_rate": 4.962702341137124e-06, + "loss": 0.2175, + "step": 32350 + }, + { + "epoch": 1.27, + "grad_norm": 1.5897518353484381, + "learning_rate": 4.962660535117057e-06, + "loss": 0.2134, + "step": 32375 + }, + { + "epoch": 1.27, + "grad_norm": 1.2728706316819134, + "learning_rate": 4.9626187290969906e-06, + "loss": 0.1953, + "step": 32400 + }, + { + "epoch": 1.28, + "grad_norm": 2.391470189830352, + "learning_rate": 4.962576923076923e-06, + "loss": 0.21, + "step": 32425 + }, + { + "epoch": 1.28, + "grad_norm": 1.5455063030077856, + "learning_rate": 4.962535117056857e-06, + "loss": 0.1955, + "step": 32450 + }, + { + "epoch": 1.28, + "grad_norm": 1.8416205159711543, + "learning_rate": 4.9624933110367895e-06, + "loss": 0.1971, + "step": 32475 + }, + { + "epoch": 1.28, + "grad_norm": 1.6448546575874654, + "learning_rate": 4.962451505016723e-06, + "loss": 0.2074, + "step": 32500 + }, + { + "epoch": 1.28, + "grad_norm": 3.3357850095647246, + "learning_rate": 4.962409698996656e-06, + "loss": 0.2314, + "step": 32525 + }, + { + "epoch": 1.28, + "grad_norm": 1.9861129354509446, + "learning_rate": 4.962367892976589e-06, + "loss": 0.2014, + "step": 32550 + }, + { + "epoch": 1.28, + "grad_norm": 1.3127887409214314, + "learning_rate": 4.962326086956522e-06, + "loss": 0.2087, + "step": 32575 + }, + { + "epoch": 1.28, + "grad_norm": 1.5428255216234126, + "learning_rate": 4.9622842809364555e-06, + "loss": 0.1927, + "step": 32600 + }, + { + "epoch": 1.28, + "grad_norm": 2.265547190895961, + "learning_rate": 4.962242474916388e-06, + "loss": 0.2127, + "step": 32625 + }, + { + "epoch": 1.28, + "grad_norm": 1.1891946036863335, + "learning_rate": 4.962200668896322e-06, + "loss": 0.1993, + "step": 32650 + }, + { + "epoch": 1.29, + "grad_norm": 1.7958715126981217, + "learning_rate": 4.9621588628762545e-06, + "loss": 0.214, + "step": 32675 + }, + { + "epoch": 1.29, + "grad_norm": 1.6405593238103398, + "learning_rate": 4.962117056856188e-06, + "loss": 0.2097, + "step": 32700 + }, + { + "epoch": 1.29, + "grad_norm": 1.5373448233299254, + "learning_rate": 4.962075250836121e-06, + "loss": 0.2151, + "step": 32725 + }, + { + "epoch": 1.29, + "grad_norm": 2.0068111663697987, + "learning_rate": 4.962033444816054e-06, + "loss": 0.1893, + "step": 32750 + }, + { + "epoch": 1.29, + "grad_norm": 2.0679584686902626, + "learning_rate": 4.961991638795987e-06, + "loss": 0.2236, + "step": 32775 + }, + { + "epoch": 1.29, + "grad_norm": 1.2454742075576373, + "learning_rate": 4.9619498327759205e-06, + "loss": 0.1778, + "step": 32800 + }, + { + "epoch": 1.29, + "grad_norm": 1.494891691231001, + "learning_rate": 4.961908026755853e-06, + "loss": 0.2111, + "step": 32825 + }, + { + "epoch": 1.29, + "grad_norm": 1.972901323601928, + "learning_rate": 4.961866220735787e-06, + "loss": 0.2329, + "step": 32850 + }, + { + "epoch": 1.29, + "grad_norm": 1.4863004691478272, + "learning_rate": 4.9618244147157195e-06, + "loss": 0.2036, + "step": 32875 + }, + { + "epoch": 1.29, + "grad_norm": 1.2875445717253988, + "learning_rate": 4.961782608695653e-06, + "loss": 0.2026, + "step": 32900 + }, + { + "epoch": 1.3, + "grad_norm": 1.5038045805937086, + "learning_rate": 4.961740802675586e-06, + "loss": 0.207, + "step": 32925 + }, + { + "epoch": 1.3, + "grad_norm": 1.649781568126332, + "learning_rate": 4.961698996655519e-06, + "loss": 0.1957, + "step": 32950 + }, + { + "epoch": 1.3, + "grad_norm": 1.451803928656282, + "learning_rate": 4.961657190635452e-06, + "loss": 0.2139, + "step": 32975 + }, + { + "epoch": 1.3, + "grad_norm": 2.096939736947867, + "learning_rate": 4.9616153846153855e-06, + "loss": 0.2167, + "step": 33000 + }, + { + "epoch": 1.3, + "grad_norm": 1.5804173291083985, + "learning_rate": 4.961573578595318e-06, + "loss": 0.2084, + "step": 33025 + }, + { + "epoch": 1.3, + "grad_norm": 1.900713897980342, + "learning_rate": 4.961531772575251e-06, + "loss": 0.2094, + "step": 33050 + }, + { + "epoch": 1.3, + "grad_norm": 1.8047019720202782, + "learning_rate": 4.9614899665551844e-06, + "loss": 0.2045, + "step": 33075 + }, + { + "epoch": 1.3, + "grad_norm": 1.663316027429542, + "learning_rate": 4.961448160535117e-06, + "loss": 0.2075, + "step": 33100 + }, + { + "epoch": 1.3, + "grad_norm": 1.9090718281643422, + "learning_rate": 4.961406354515051e-06, + "loss": 0.1984, + "step": 33125 + }, + { + "epoch": 1.3, + "grad_norm": 1.8038054052477845, + "learning_rate": 4.961364548494983e-06, + "loss": 0.2071, + "step": 33150 + }, + { + "epoch": 1.31, + "grad_norm": 1.9119519029509469, + "learning_rate": 4.961322742474917e-06, + "loss": 0.2132, + "step": 33175 + }, + { + "epoch": 1.31, + "grad_norm": 1.9349267181687344, + "learning_rate": 4.96128093645485e-06, + "loss": 0.2159, + "step": 33200 + }, + { + "epoch": 1.31, + "grad_norm": 1.550384787253934, + "learning_rate": 4.961239130434783e-06, + "loss": 0.1922, + "step": 33225 + }, + { + "epoch": 1.31, + "grad_norm": 2.1539016786904788, + "learning_rate": 4.961198996655519e-06, + "loss": 0.2158, + "step": 33250 + }, + { + "epoch": 1.31, + "grad_norm": 2.1496086242086543, + "learning_rate": 4.961157190635452e-06, + "loss": 0.2176, + "step": 33275 + }, + { + "epoch": 1.31, + "grad_norm": 1.6905461372752362, + "learning_rate": 4.961115384615384e-06, + "loss": 0.1866, + "step": 33300 + }, + { + "epoch": 1.31, + "grad_norm": 1.1509723343168803, + "learning_rate": 4.961073578595318e-06, + "loss": 0.2218, + "step": 33325 + }, + { + "epoch": 1.31, + "grad_norm": 1.0867727056995953, + "learning_rate": 4.961031772575251e-06, + "loss": 0.2073, + "step": 33350 + }, + { + "epoch": 1.31, + "grad_norm": 1.5010059727175604, + "learning_rate": 4.960989966555184e-06, + "loss": 0.2009, + "step": 33375 + }, + { + "epoch": 1.31, + "grad_norm": 2.3178677208541214, + "learning_rate": 4.960948160535117e-06, + "loss": 0.2053, + "step": 33400 + }, + { + "epoch": 1.31, + "grad_norm": 1.5525133283022474, + "learning_rate": 4.9609063545150504e-06, + "loss": 0.1934, + "step": 33425 + }, + { + "epoch": 1.32, + "grad_norm": 1.5592614964414115, + "learning_rate": 4.960864548494984e-06, + "loss": 0.2024, + "step": 33450 + }, + { + "epoch": 1.32, + "grad_norm": 1.1553832312538863, + "learning_rate": 4.960822742474917e-06, + "loss": 0.2109, + "step": 33475 + }, + { + "epoch": 1.32, + "grad_norm": 1.8607289492966907, + "learning_rate": 4.96078093645485e-06, + "loss": 0.1962, + "step": 33500 + }, + { + "epoch": 1.32, + "grad_norm": 1.503789861400716, + "learning_rate": 4.960739130434783e-06, + "loss": 0.2088, + "step": 33525 + }, + { + "epoch": 1.32, + "grad_norm": 1.8929282998591728, + "learning_rate": 4.9606973244147165e-06, + "loss": 0.2209, + "step": 33550 + }, + { + "epoch": 1.32, + "grad_norm": 1.0496277495832924, + "learning_rate": 4.960655518394649e-06, + "loss": 0.1954, + "step": 33575 + }, + { + "epoch": 1.32, + "grad_norm": 1.4165638236647178, + "learning_rate": 4.960613712374583e-06, + "loss": 0.184, + "step": 33600 + }, + { + "epoch": 1.32, + "grad_norm": 1.3598210314527397, + "learning_rate": 4.960571906354515e-06, + "loss": 0.2002, + "step": 33625 + }, + { + "epoch": 1.32, + "grad_norm": 1.4783079603213292, + "learning_rate": 4.960530100334449e-06, + "loss": 0.2033, + "step": 33650 + }, + { + "epoch": 1.32, + "grad_norm": 1.9540334215512212, + "learning_rate": 4.960488294314382e-06, + "loss": 0.2249, + "step": 33675 + }, + { + "epoch": 1.33, + "grad_norm": 1.1034996218987867, + "learning_rate": 4.960446488294315e-06, + "loss": 0.1903, + "step": 33700 + }, + { + "epoch": 1.33, + "grad_norm": 2.3139815085334834, + "learning_rate": 4.960404682274248e-06, + "loss": 0.1939, + "step": 33725 + }, + { + "epoch": 1.33, + "grad_norm": 0.9968667345143176, + "learning_rate": 4.9603628762541814e-06, + "loss": 0.2057, + "step": 33750 + }, + { + "epoch": 1.33, + "grad_norm": 1.5320555945517205, + "learning_rate": 4.960321070234114e-06, + "loss": 0.2073, + "step": 33775 + }, + { + "epoch": 1.33, + "grad_norm": 1.282025684245721, + "learning_rate": 4.960279264214048e-06, + "loss": 0.2133, + "step": 33800 + }, + { + "epoch": 1.33, + "grad_norm": 1.777269893458473, + "learning_rate": 4.96023745819398e-06, + "loss": 0.2243, + "step": 33825 + }, + { + "epoch": 1.33, + "grad_norm": 1.5943182163866918, + "learning_rate": 4.960195652173914e-06, + "loss": 0.2153, + "step": 33850 + }, + { + "epoch": 1.33, + "grad_norm": 1.379151340617046, + "learning_rate": 4.960153846153847e-06, + "loss": 0.2043, + "step": 33875 + }, + { + "epoch": 1.33, + "grad_norm": 1.1521103544665638, + "learning_rate": 4.96011204013378e-06, + "loss": 0.2238, + "step": 33900 + }, + { + "epoch": 1.33, + "grad_norm": 1.2182907874261988, + "learning_rate": 4.960070234113713e-06, + "loss": 0.2097, + "step": 33925 + }, + { + "epoch": 1.34, + "grad_norm": 1.387589834089496, + "learning_rate": 4.960028428093646e-06, + "loss": 0.1899, + "step": 33950 + }, + { + "epoch": 1.34, + "grad_norm": 1.5191996191680612, + "learning_rate": 4.959986622073579e-06, + "loss": 0.2107, + "step": 33975 + }, + { + "epoch": 1.34, + "grad_norm": 1.4337854691253606, + "learning_rate": 4.959944816053512e-06, + "loss": 0.2131, + "step": 34000 + }, + { + "epoch": 1.34, + "grad_norm": 1.8589806989817563, + "learning_rate": 4.959903010033445e-06, + "loss": 0.2162, + "step": 34025 + }, + { + "epoch": 1.34, + "grad_norm": 2.303077233635094, + "learning_rate": 4.959861204013378e-06, + "loss": 0.203, + "step": 34050 + }, + { + "epoch": 1.34, + "grad_norm": 1.5672615001886725, + "learning_rate": 4.959819397993312e-06, + "loss": 0.2073, + "step": 34075 + }, + { + "epoch": 1.34, + "grad_norm": 1.163916859493643, + "learning_rate": 4.959777591973244e-06, + "loss": 0.2, + "step": 34100 + }, + { + "epoch": 1.34, + "grad_norm": 1.6051761264160214, + "learning_rate": 4.959735785953178e-06, + "loss": 0.2052, + "step": 34125 + }, + { + "epoch": 1.34, + "grad_norm": 1.3556812408093033, + "learning_rate": 4.9596939799331106e-06, + "loss": 0.2308, + "step": 34150 + }, + { + "epoch": 1.34, + "grad_norm": 1.3137216096393576, + "learning_rate": 4.959652173913044e-06, + "loss": 0.2077, + "step": 34175 + }, + { + "epoch": 1.35, + "grad_norm": 1.5157679496546062, + "learning_rate": 4.959610367892977e-06, + "loss": 0.2038, + "step": 34200 + }, + { + "epoch": 1.35, + "grad_norm": 2.236311175171304, + "learning_rate": 4.95956856187291e-06, + "loss": 0.1943, + "step": 34225 + }, + { + "epoch": 1.35, + "grad_norm": 1.4890581893083967, + "learning_rate": 4.959528428093645e-06, + "loss": 0.2026, + "step": 34250 + }, + { + "epoch": 1.35, + "grad_norm": 2.0697802575341915, + "learning_rate": 4.959486622073579e-06, + "loss": 0.2175, + "step": 34275 + }, + { + "epoch": 1.35, + "grad_norm": 2.0491147263632876, + "learning_rate": 4.9594448160535116e-06, + "loss": 0.2117, + "step": 34300 + }, + { + "epoch": 1.35, + "grad_norm": 1.46097970439395, + "learning_rate": 4.959403010033445e-06, + "loss": 0.1839, + "step": 34325 + }, + { + "epoch": 1.35, + "grad_norm": 2.014542005118715, + "learning_rate": 4.959361204013378e-06, + "loss": 0.2171, + "step": 34350 + }, + { + "epoch": 1.35, + "grad_norm": 1.3888128070932415, + "learning_rate": 4.959319397993311e-06, + "loss": 0.1863, + "step": 34375 + }, + { + "epoch": 1.35, + "grad_norm": 1.7106357207309615, + "learning_rate": 4.959277591973244e-06, + "loss": 0.1865, + "step": 34400 + }, + { + "epoch": 1.35, + "grad_norm": 1.7455015346010385, + "learning_rate": 4.959235785953178e-06, + "loss": 0.2059, + "step": 34425 + }, + { + "epoch": 1.36, + "grad_norm": 0.9415841930783881, + "learning_rate": 4.95919397993311e-06, + "loss": 0.2034, + "step": 34450 + }, + { + "epoch": 1.36, + "grad_norm": 2.3452049148270158, + "learning_rate": 4.959152173913044e-06, + "loss": 0.2055, + "step": 34475 + }, + { + "epoch": 1.36, + "grad_norm": 1.4039121644682209, + "learning_rate": 4.9591103678929765e-06, + "loss": 0.1914, + "step": 34500 + }, + { + "epoch": 1.36, + "grad_norm": 1.4796081378428405, + "learning_rate": 4.95906856187291e-06, + "loss": 0.2172, + "step": 34525 + }, + { + "epoch": 1.36, + "grad_norm": 1.9045711516273565, + "learning_rate": 4.959026755852843e-06, + "loss": 0.1927, + "step": 34550 + }, + { + "epoch": 1.36, + "grad_norm": 1.7547093039225692, + "learning_rate": 4.958984949832776e-06, + "loss": 0.228, + "step": 34575 + }, + { + "epoch": 1.36, + "grad_norm": 1.9785998947274708, + "learning_rate": 4.95894314381271e-06, + "loss": 0.2026, + "step": 34600 + }, + { + "epoch": 1.36, + "grad_norm": 1.2354236137870933, + "learning_rate": 4.958901337792643e-06, + "loss": 0.214, + "step": 34625 + }, + { + "epoch": 1.36, + "grad_norm": 1.3622120646886287, + "learning_rate": 4.958859531772576e-06, + "loss": 0.1868, + "step": 34650 + }, + { + "epoch": 1.36, + "grad_norm": 1.573347646006675, + "learning_rate": 4.958817725752509e-06, + "loss": 0.1997, + "step": 34675 + }, + { + "epoch": 1.37, + "grad_norm": 1.8322249477184158, + "learning_rate": 4.958775919732442e-06, + "loss": 0.2188, + "step": 34700 + }, + { + "epoch": 1.37, + "grad_norm": 1.4524561976880421, + "learning_rate": 4.958734113712375e-06, + "loss": 0.2097, + "step": 34725 + }, + { + "epoch": 1.37, + "grad_norm": 1.259548939098752, + "learning_rate": 4.958692307692309e-06, + "loss": 0.2152, + "step": 34750 + }, + { + "epoch": 1.37, + "grad_norm": 1.477510113801705, + "learning_rate": 4.958650501672241e-06, + "loss": 0.1898, + "step": 34775 + }, + { + "epoch": 1.37, + "grad_norm": 1.2690260150220294, + "learning_rate": 4.958608695652175e-06, + "loss": 0.2182, + "step": 34800 + }, + { + "epoch": 1.37, + "grad_norm": 1.5844701742556588, + "learning_rate": 4.9585668896321076e-06, + "loss": 0.1961, + "step": 34825 + }, + { + "epoch": 1.37, + "grad_norm": 2.273853459237293, + "learning_rate": 4.958525083612041e-06, + "loss": 0.1922, + "step": 34850 + }, + { + "epoch": 1.37, + "grad_norm": 1.8175047287317434, + "learning_rate": 4.958483277591974e-06, + "loss": 0.2038, + "step": 34875 + }, + { + "epoch": 1.37, + "grad_norm": 1.836042614757162, + "learning_rate": 4.958441471571907e-06, + "loss": 0.2168, + "step": 34900 + }, + { + "epoch": 1.37, + "grad_norm": 1.3796229553862953, + "learning_rate": 4.958399665551839e-06, + "loss": 0.2062, + "step": 34925 + }, + { + "epoch": 1.37, + "grad_norm": 2.3102116543430355, + "learning_rate": 4.958357859531773e-06, + "loss": 0.2078, + "step": 34950 + }, + { + "epoch": 1.38, + "grad_norm": 1.360851412118792, + "learning_rate": 4.9583160535117054e-06, + "loss": 0.1949, + "step": 34975 + }, + { + "epoch": 1.38, + "grad_norm": 1.2283119602827541, + "learning_rate": 4.958274247491639e-06, + "loss": 0.2074, + "step": 35000 + }, + { + "epoch": 1.38, + "grad_norm": 2.13008899942473, + "learning_rate": 4.9582324414715725e-06, + "loss": 0.2098, + "step": 35025 + }, + { + "epoch": 1.38, + "grad_norm": 1.033370788229089, + "learning_rate": 4.958190635451505e-06, + "loss": 0.2019, + "step": 35050 + }, + { + "epoch": 1.38, + "grad_norm": 1.2650220644179653, + "learning_rate": 4.958148829431439e-06, + "loss": 0.2119, + "step": 35075 + }, + { + "epoch": 1.38, + "grad_norm": 1.6882292388102216, + "learning_rate": 4.9581070234113715e-06, + "loss": 0.204, + "step": 35100 + }, + { + "epoch": 1.38, + "grad_norm": 1.5380929836431863, + "learning_rate": 4.958065217391305e-06, + "loss": 0.2127, + "step": 35125 + }, + { + "epoch": 1.38, + "grad_norm": 2.1509889226964374, + "learning_rate": 4.958023411371238e-06, + "loss": 0.1943, + "step": 35150 + }, + { + "epoch": 1.38, + "grad_norm": 2.4326488451072428, + "learning_rate": 4.957981605351171e-06, + "loss": 0.1855, + "step": 35175 + }, + { + "epoch": 1.38, + "grad_norm": 1.383235843774178, + "learning_rate": 4.957939799331104e-06, + "loss": 0.2132, + "step": 35200 + }, + { + "epoch": 1.39, + "grad_norm": 1.5270733379843522, + "learning_rate": 4.9578979933110375e-06, + "loss": 0.1998, + "step": 35225 + }, + { + "epoch": 1.39, + "grad_norm": 1.4047334942033511, + "learning_rate": 4.9578578595317725e-06, + "loss": 0.2344, + "step": 35250 + }, + { + "epoch": 1.39, + "grad_norm": 1.3056747609702148, + "learning_rate": 4.957816053511706e-06, + "loss": 0.1981, + "step": 35275 + }, + { + "epoch": 1.39, + "grad_norm": 1.8432393756629464, + "learning_rate": 4.957774247491639e-06, + "loss": 0.1912, + "step": 35300 + }, + { + "epoch": 1.39, + "grad_norm": 1.255197218187179, + "learning_rate": 4.957732441471572e-06, + "loss": 0.1947, + "step": 35325 + }, + { + "epoch": 1.39, + "grad_norm": 1.873243680140684, + "learning_rate": 4.957690635451505e-06, + "loss": 0.2078, + "step": 35350 + }, + { + "epoch": 1.39, + "grad_norm": 2.048000455141224, + "learning_rate": 4.9576488294314385e-06, + "loss": 0.2059, + "step": 35375 + }, + { + "epoch": 1.39, + "grad_norm": 3.4680651759429137, + "learning_rate": 4.957607023411371e-06, + "loss": 0.22, + "step": 35400 + }, + { + "epoch": 1.39, + "grad_norm": 1.53131994025086, + "learning_rate": 4.957565217391305e-06, + "loss": 0.2256, + "step": 35425 + }, + { + "epoch": 1.39, + "grad_norm": 1.8548841688370339, + "learning_rate": 4.9575234113712375e-06, + "loss": 0.2143, + "step": 35450 + }, + { + "epoch": 1.4, + "grad_norm": 1.6403964177995343, + "learning_rate": 4.957481605351171e-06, + "loss": 0.2128, + "step": 35475 + }, + { + "epoch": 1.4, + "grad_norm": 1.6156960137162244, + "learning_rate": 4.957439799331104e-06, + "loss": 0.2203, + "step": 35500 + }, + { + "epoch": 1.4, + "grad_norm": 1.5593538805691032, + "learning_rate": 4.957397993311037e-06, + "loss": 0.2111, + "step": 35525 + }, + { + "epoch": 1.4, + "grad_norm": 1.3940280246709056, + "learning_rate": 4.95735618729097e-06, + "loss": 0.2053, + "step": 35550 + }, + { + "epoch": 1.4, + "grad_norm": 2.2706106350468436, + "learning_rate": 4.9573143812709035e-06, + "loss": 0.2074, + "step": 35575 + }, + { + "epoch": 1.4, + "grad_norm": 0.9178553441649687, + "learning_rate": 4.957272575250836e-06, + "loss": 0.1981, + "step": 35600 + }, + { + "epoch": 1.4, + "grad_norm": 1.4261776676677842, + "learning_rate": 4.95723076923077e-06, + "loss": 0.1853, + "step": 35625 + }, + { + "epoch": 1.4, + "grad_norm": 1.1696659611524045, + "learning_rate": 4.9571889632107024e-06, + "loss": 0.2134, + "step": 35650 + }, + { + "epoch": 1.4, + "grad_norm": 0.7602557743841436, + "learning_rate": 4.957147157190636e-06, + "loss": 0.2126, + "step": 35675 + }, + { + "epoch": 1.4, + "grad_norm": 2.492907454571274, + "learning_rate": 4.9571053511705695e-06, + "loss": 0.216, + "step": 35700 + }, + { + "epoch": 1.41, + "grad_norm": 2.367209562020745, + "learning_rate": 4.957063545150502e-06, + "loss": 0.2195, + "step": 35725 + }, + { + "epoch": 1.41, + "grad_norm": 1.8227019149029202, + "learning_rate": 4.957021739130436e-06, + "loss": 0.1937, + "step": 35750 + }, + { + "epoch": 1.41, + "grad_norm": 1.2682518351707983, + "learning_rate": 4.9569799331103685e-06, + "loss": 0.2019, + "step": 35775 + }, + { + "epoch": 1.41, + "grad_norm": 1.262927556471107, + "learning_rate": 4.956938127090302e-06, + "loss": 0.1928, + "step": 35800 + }, + { + "epoch": 1.41, + "grad_norm": 1.42063661058105, + "learning_rate": 4.956896321070235e-06, + "loss": 0.2019, + "step": 35825 + }, + { + "epoch": 1.41, + "grad_norm": 2.4089099655471107, + "learning_rate": 4.956854515050168e-06, + "loss": 0.2014, + "step": 35850 + }, + { + "epoch": 1.41, + "grad_norm": 1.0092033649001106, + "learning_rate": 4.9568127090301e-06, + "loss": 0.2002, + "step": 35875 + }, + { + "epoch": 1.41, + "grad_norm": 1.9427722122886302, + "learning_rate": 4.956770903010034e-06, + "loss": 0.2208, + "step": 35900 + }, + { + "epoch": 1.41, + "grad_norm": 1.6812779714137775, + "learning_rate": 4.956729096989966e-06, + "loss": 0.1991, + "step": 35925 + }, + { + "epoch": 1.41, + "grad_norm": 1.3752830801753009, + "learning_rate": 4.9566872909699e-06, + "loss": 0.2051, + "step": 35950 + }, + { + "epoch": 1.42, + "grad_norm": 1.6857256233320776, + "learning_rate": 4.956645484949833e-06, + "loss": 0.206, + "step": 35975 + }, + { + "epoch": 1.42, + "grad_norm": 1.84815296780552, + "learning_rate": 4.956603678929766e-06, + "loss": 0.2146, + "step": 36000 + }, + { + "epoch": 1.42, + "grad_norm": 1.2132768250520185, + "learning_rate": 4.956561872909699e-06, + "loss": 0.1858, + "step": 36025 + }, + { + "epoch": 1.42, + "grad_norm": 1.3922235347411074, + "learning_rate": 4.956520066889632e-06, + "loss": 0.1841, + "step": 36050 + }, + { + "epoch": 1.42, + "grad_norm": 1.4109203186054706, + "learning_rate": 4.956478260869565e-06, + "loss": 0.2042, + "step": 36075 + }, + { + "epoch": 1.42, + "grad_norm": 1.076333380375572, + "learning_rate": 4.956436454849499e-06, + "loss": 0.2103, + "step": 36100 + }, + { + "epoch": 1.42, + "grad_norm": 2.28510984363013, + "learning_rate": 4.956394648829432e-06, + "loss": 0.1989, + "step": 36125 + }, + { + "epoch": 1.42, + "grad_norm": 0.9777457396941752, + "learning_rate": 4.956352842809365e-06, + "loss": 0.2305, + "step": 36150 + }, + { + "epoch": 1.42, + "grad_norm": 1.5988408503297, + "learning_rate": 4.9563110367892984e-06, + "loss": 0.2053, + "step": 36175 + }, + { + "epoch": 1.42, + "grad_norm": 1.6674285290812338, + "learning_rate": 4.956269230769231e-06, + "loss": 0.2098, + "step": 36200 + }, + { + "epoch": 1.43, + "grad_norm": 2.1275665989695556, + "learning_rate": 4.956227424749165e-06, + "loss": 0.2157, + "step": 36225 + }, + { + "epoch": 1.43, + "grad_norm": 1.298315469696719, + "learning_rate": 4.9561872909699e-06, + "loss": 0.1852, + "step": 36250 + }, + { + "epoch": 1.43, + "grad_norm": 1.4806491205789225, + "learning_rate": 4.956145484949833e-06, + "loss": 0.2023, + "step": 36275 + }, + { + "epoch": 1.43, + "grad_norm": 1.7496689306471362, + "learning_rate": 4.956103678929766e-06, + "loss": 0.2057, + "step": 36300 + }, + { + "epoch": 1.43, + "grad_norm": 1.723557779898001, + "learning_rate": 4.9560618729096995e-06, + "loss": 0.217, + "step": 36325 + }, + { + "epoch": 1.43, + "grad_norm": 1.700269519444899, + "learning_rate": 4.956020066889632e-06, + "loss": 0.1761, + "step": 36350 + }, + { + "epoch": 1.43, + "grad_norm": 3.320993952778013, + "learning_rate": 4.955978260869566e-06, + "loss": 0.1874, + "step": 36375 + }, + { + "epoch": 1.43, + "grad_norm": 1.5904614343317816, + "learning_rate": 4.955936454849498e-06, + "loss": 0.2055, + "step": 36400 + }, + { + "epoch": 1.43, + "grad_norm": 1.1408285586096996, + "learning_rate": 4.955894648829432e-06, + "loss": 0.2046, + "step": 36425 + }, + { + "epoch": 1.43, + "grad_norm": 1.7747940283874366, + "learning_rate": 4.955852842809365e-06, + "loss": 0.1819, + "step": 36450 + }, + { + "epoch": 1.43, + "grad_norm": 1.4295834467887427, + "learning_rate": 4.955811036789298e-06, + "loss": 0.1995, + "step": 36475 + }, + { + "epoch": 1.44, + "grad_norm": 1.2445542660386075, + "learning_rate": 4.955769230769231e-06, + "loss": 0.1994, + "step": 36500 + }, + { + "epoch": 1.44, + "grad_norm": 1.4077690160385432, + "learning_rate": 4.9557274247491644e-06, + "loss": 0.2321, + "step": 36525 + }, + { + "epoch": 1.44, + "grad_norm": 1.0359673240598635, + "learning_rate": 4.955685618729097e-06, + "loss": 0.2202, + "step": 36550 + }, + { + "epoch": 1.44, + "grad_norm": 1.4659278382451848, + "learning_rate": 4.955643812709031e-06, + "loss": 0.2263, + "step": 36575 + }, + { + "epoch": 1.44, + "grad_norm": 1.3295779187540848, + "learning_rate": 4.955602006688963e-06, + "loss": 0.2192, + "step": 36600 + }, + { + "epoch": 1.44, + "grad_norm": 1.1071332705463832, + "learning_rate": 4.955560200668897e-06, + "loss": 0.2132, + "step": 36625 + }, + { + "epoch": 1.44, + "grad_norm": 2.265229352325454, + "learning_rate": 4.95551839464883e-06, + "loss": 0.1988, + "step": 36650 + }, + { + "epoch": 1.44, + "grad_norm": 0.9800704407638031, + "learning_rate": 4.955476588628763e-06, + "loss": 0.2032, + "step": 36675 + }, + { + "epoch": 1.44, + "grad_norm": 0.9925356070357615, + "learning_rate": 4.955434782608696e-06, + "loss": 0.215, + "step": 36700 + }, + { + "epoch": 1.44, + "grad_norm": 1.5204227004534854, + "learning_rate": 4.955392976588629e-06, + "loss": 0.2107, + "step": 36725 + }, + { + "epoch": 1.45, + "grad_norm": 2.4452324885329624, + "learning_rate": 4.955351170568562e-06, + "loss": 0.2164, + "step": 36750 + }, + { + "epoch": 1.45, + "grad_norm": 1.959056822372053, + "learning_rate": 4.955309364548496e-06, + "loss": 0.217, + "step": 36775 + }, + { + "epoch": 1.45, + "grad_norm": 2.3043549385517452, + "learning_rate": 4.955267558528428e-06, + "loss": 0.2211, + "step": 36800 + }, + { + "epoch": 1.45, + "grad_norm": 2.2806395544186833, + "learning_rate": 4.955225752508361e-06, + "loss": 0.1887, + "step": 36825 + }, + { + "epoch": 1.45, + "grad_norm": 1.3016123333945022, + "learning_rate": 4.955183946488295e-06, + "loss": 0.2025, + "step": 36850 + }, + { + "epoch": 1.45, + "grad_norm": 1.4292394564351278, + "learning_rate": 4.955142140468227e-06, + "loss": 0.2076, + "step": 36875 + }, + { + "epoch": 1.45, + "grad_norm": 1.9148503359389857, + "learning_rate": 4.955100334448161e-06, + "loss": 0.2046, + "step": 36900 + }, + { + "epoch": 1.45, + "grad_norm": 1.6141787965473307, + "learning_rate": 4.9550585284280935e-06, + "loss": 0.2014, + "step": 36925 + }, + { + "epoch": 1.45, + "grad_norm": 1.9101519644967175, + "learning_rate": 4.955016722408027e-06, + "loss": 0.2111, + "step": 36950 + }, + { + "epoch": 1.45, + "grad_norm": 1.7403141468024392, + "learning_rate": 4.95497491638796e-06, + "loss": 0.2077, + "step": 36975 + }, + { + "epoch": 1.46, + "grad_norm": 1.7207730170828357, + "learning_rate": 4.954933110367893e-06, + "loss": 0.1927, + "step": 37000 + }, + { + "epoch": 1.46, + "grad_norm": 1.8605826806771688, + "learning_rate": 4.954891304347826e-06, + "loss": 0.2184, + "step": 37025 + }, + { + "epoch": 1.46, + "grad_norm": 1.6175520560976995, + "learning_rate": 4.9548494983277596e-06, + "loss": 0.2098, + "step": 37050 + }, + { + "epoch": 1.46, + "grad_norm": 1.8085251236732929, + "learning_rate": 4.954807692307692e-06, + "loss": 0.2257, + "step": 37075 + }, + { + "epoch": 1.46, + "grad_norm": 1.3604634040424115, + "learning_rate": 4.954765886287626e-06, + "loss": 0.2019, + "step": 37100 + }, + { + "epoch": 1.46, + "grad_norm": 1.6502918851829131, + "learning_rate": 4.9547240802675585e-06, + "loss": 0.1919, + "step": 37125 + }, + { + "epoch": 1.46, + "grad_norm": 1.7617138039619495, + "learning_rate": 4.954682274247492e-06, + "loss": 0.2058, + "step": 37150 + }, + { + "epoch": 1.46, + "grad_norm": 1.4810928710156535, + "learning_rate": 4.954640468227425e-06, + "loss": 0.1996, + "step": 37175 + }, + { + "epoch": 1.46, + "grad_norm": 1.5350605129741175, + "learning_rate": 4.954598662207358e-06, + "loss": 0.2055, + "step": 37200 + }, + { + "epoch": 1.46, + "grad_norm": 2.0090394477173805, + "learning_rate": 4.954556856187292e-06, + "loss": 0.1838, + "step": 37225 + }, + { + "epoch": 1.47, + "grad_norm": 1.2115676280987648, + "learning_rate": 4.954516722408027e-06, + "loss": 0.2182, + "step": 37250 + }, + { + "epoch": 1.47, + "grad_norm": 2.2809098857963606, + "learning_rate": 4.95447491638796e-06, + "loss": 0.2116, + "step": 37275 + }, + { + "epoch": 1.47, + "grad_norm": 1.7286107298277917, + "learning_rate": 4.954433110367893e-06, + "loss": 0.1862, + "step": 37300 + }, + { + "epoch": 1.47, + "grad_norm": 2.3745604372273177, + "learning_rate": 4.954391304347827e-06, + "loss": 0.2003, + "step": 37325 + }, + { + "epoch": 1.47, + "grad_norm": 2.179529589103856, + "learning_rate": 4.954349498327759e-06, + "loss": 0.2023, + "step": 37350 + }, + { + "epoch": 1.47, + "grad_norm": 1.379480802423559, + "learning_rate": 4.954307692307693e-06, + "loss": 0.2044, + "step": 37375 + }, + { + "epoch": 1.47, + "grad_norm": 2.704416220877015, + "learning_rate": 4.9542658862876256e-06, + "loss": 0.2179, + "step": 37400 + }, + { + "epoch": 1.47, + "grad_norm": 1.2195378522173297, + "learning_rate": 4.954224080267559e-06, + "loss": 0.2084, + "step": 37425 + }, + { + "epoch": 1.47, + "grad_norm": 1.548215462518736, + "learning_rate": 4.954182274247492e-06, + "loss": 0.2095, + "step": 37450 + }, + { + "epoch": 1.47, + "grad_norm": 1.488083163244097, + "learning_rate": 4.954140468227425e-06, + "loss": 0.1998, + "step": 37475 + }, + { + "epoch": 1.48, + "grad_norm": 1.735700092430531, + "learning_rate": 4.954098662207358e-06, + "loss": 0.2143, + "step": 37500 + }, + { + "epoch": 1.48, + "grad_norm": 1.3001115190622823, + "learning_rate": 4.954056856187292e-06, + "loss": 0.2107, + "step": 37525 + }, + { + "epoch": 1.48, + "grad_norm": 1.724639356179923, + "learning_rate": 4.954015050167224e-06, + "loss": 0.2267, + "step": 37550 + }, + { + "epoch": 1.48, + "grad_norm": 1.836013236024464, + "learning_rate": 4.953973244147158e-06, + "loss": 0.223, + "step": 37575 + }, + { + "epoch": 1.48, + "grad_norm": 1.63422094494171, + "learning_rate": 4.9539314381270905e-06, + "loss": 0.1909, + "step": 37600 + }, + { + "epoch": 1.48, + "grad_norm": 1.6384726135540142, + "learning_rate": 4.953889632107024e-06, + "loss": 0.2025, + "step": 37625 + }, + { + "epoch": 1.48, + "grad_norm": 1.9438112258099434, + "learning_rate": 4.953847826086957e-06, + "loss": 0.1905, + "step": 37650 + }, + { + "epoch": 1.48, + "grad_norm": 2.0403963480844216, + "learning_rate": 4.95380602006689e-06, + "loss": 0.2039, + "step": 37675 + }, + { + "epoch": 1.48, + "grad_norm": 1.5742246476102364, + "learning_rate": 4.953764214046823e-06, + "loss": 0.2102, + "step": 37700 + }, + { + "epoch": 1.48, + "grad_norm": 1.485529188887855, + "learning_rate": 4.953722408026757e-06, + "loss": 0.2192, + "step": 37725 + }, + { + "epoch": 1.49, + "grad_norm": 1.761172543764633, + "learning_rate": 4.953680602006689e-06, + "loss": 0.2076, + "step": 37750 + }, + { + "epoch": 1.49, + "grad_norm": 0.9982263034997709, + "learning_rate": 4.953638795986622e-06, + "loss": 0.2182, + "step": 37775 + }, + { + "epoch": 1.49, + "grad_norm": 2.2204246293357652, + "learning_rate": 4.9535969899665555e-06, + "loss": 0.2172, + "step": 37800 + }, + { + "epoch": 1.49, + "grad_norm": 1.3041545197815172, + "learning_rate": 4.953555183946488e-06, + "loss": 0.21, + "step": 37825 + }, + { + "epoch": 1.49, + "grad_norm": 2.6384467244877707, + "learning_rate": 4.953513377926422e-06, + "loss": 0.2086, + "step": 37850 + }, + { + "epoch": 1.49, + "grad_norm": 1.8368676858921458, + "learning_rate": 4.9534715719063545e-06, + "loss": 0.2193, + "step": 37875 + }, + { + "epoch": 1.49, + "grad_norm": 1.68655526333874, + "learning_rate": 4.953429765886288e-06, + "loss": 0.2122, + "step": 37900 + }, + { + "epoch": 1.49, + "grad_norm": 2.283888377497305, + "learning_rate": 4.953387959866221e-06, + "loss": 0.2013, + "step": 37925 + }, + { + "epoch": 1.49, + "grad_norm": 1.1147280818501166, + "learning_rate": 4.953346153846154e-06, + "loss": 0.2119, + "step": 37950 + }, + { + "epoch": 1.49, + "grad_norm": 2.131251084192741, + "learning_rate": 4.953304347826087e-06, + "loss": 0.1857, + "step": 37975 + }, + { + "epoch": 1.49, + "grad_norm": 1.879560812566554, + "learning_rate": 4.9532625418060205e-06, + "loss": 0.2138, + "step": 38000 + }, + { + "epoch": 1.5, + "grad_norm": 1.1426136680022196, + "learning_rate": 4.953220735785953e-06, + "loss": 0.1844, + "step": 38025 + }, + { + "epoch": 1.5, + "grad_norm": 1.5973757490635936, + "learning_rate": 4.953178929765887e-06, + "loss": 0.2084, + "step": 38050 + }, + { + "epoch": 1.5, + "grad_norm": 2.8939082265559706, + "learning_rate": 4.9531371237458194e-06, + "loss": 0.2129, + "step": 38075 + }, + { + "epoch": 1.5, + "grad_norm": 1.496482622396757, + "learning_rate": 4.953095317725753e-06, + "loss": 0.2145, + "step": 38100 + }, + { + "epoch": 1.5, + "grad_norm": 1.593350227368928, + "learning_rate": 4.953053511705686e-06, + "loss": 0.2064, + "step": 38125 + }, + { + "epoch": 1.5, + "grad_norm": 1.4347207475008577, + "learning_rate": 4.953011705685619e-06, + "loss": 0.2101, + "step": 38150 + }, + { + "epoch": 1.5, + "grad_norm": 1.5414470961110491, + "learning_rate": 4.952969899665552e-06, + "loss": 0.1994, + "step": 38175 + }, + { + "epoch": 1.5, + "grad_norm": 1.2999288110788667, + "learning_rate": 4.9529280936454855e-06, + "loss": 0.19, + "step": 38200 + }, + { + "epoch": 1.5, + "grad_norm": 1.3896836661728793, + "learning_rate": 4.952886287625418e-06, + "loss": 0.2014, + "step": 38225 + }, + { + "epoch": 1.5, + "grad_norm": 1.395012428279921, + "learning_rate": 4.952846153846154e-06, + "loss": 0.1805, + "step": 38250 + }, + { + "epoch": 1.51, + "grad_norm": 1.4366727872711265, + "learning_rate": 4.9528043478260875e-06, + "loss": 0.19, + "step": 38275 + }, + { + "epoch": 1.51, + "grad_norm": 1.2645834734452694, + "learning_rate": 4.95276254180602e-06, + "loss": 0.1943, + "step": 38300 + }, + { + "epoch": 1.51, + "grad_norm": 1.2963361249917345, + "learning_rate": 4.952720735785954e-06, + "loss": 0.1991, + "step": 38325 + }, + { + "epoch": 1.51, + "grad_norm": 1.4866651771754051, + "learning_rate": 4.9526789297658865e-06, + "loss": 0.2245, + "step": 38350 + }, + { + "epoch": 1.51, + "grad_norm": 0.9546225113518195, + "learning_rate": 4.95263712374582e-06, + "loss": 0.222, + "step": 38375 + }, + { + "epoch": 1.51, + "grad_norm": 1.7859246566220357, + "learning_rate": 4.952595317725753e-06, + "loss": 0.2001, + "step": 38400 + }, + { + "epoch": 1.51, + "grad_norm": 1.919164444138617, + "learning_rate": 4.952553511705686e-06, + "loss": 0.1928, + "step": 38425 + }, + { + "epoch": 1.51, + "grad_norm": 1.3516142246793494, + "learning_rate": 4.952511705685619e-06, + "loss": 0.2054, + "step": 38450 + }, + { + "epoch": 1.51, + "grad_norm": 1.8640375790937023, + "learning_rate": 4.9524698996655525e-06, + "loss": 0.1886, + "step": 38475 + }, + { + "epoch": 1.51, + "grad_norm": 1.8124716196702333, + "learning_rate": 4.952428093645485e-06, + "loss": 0.2149, + "step": 38500 + }, + { + "epoch": 1.52, + "grad_norm": 1.5193585343880305, + "learning_rate": 4.952386287625419e-06, + "loss": 0.1966, + "step": 38525 + }, + { + "epoch": 1.52, + "grad_norm": 1.832736275735568, + "learning_rate": 4.9523444816053515e-06, + "loss": 0.2038, + "step": 38550 + }, + { + "epoch": 1.52, + "grad_norm": 1.736944723639645, + "learning_rate": 4.952302675585285e-06, + "loss": 0.1933, + "step": 38575 + }, + { + "epoch": 1.52, + "grad_norm": 1.630971173062863, + "learning_rate": 4.952260869565218e-06, + "loss": 0.2004, + "step": 38600 + }, + { + "epoch": 1.52, + "grad_norm": 1.8873355888793588, + "learning_rate": 4.952219063545151e-06, + "loss": 0.2176, + "step": 38625 + }, + { + "epoch": 1.52, + "grad_norm": 1.3335092351991724, + "learning_rate": 4.952177257525084e-06, + "loss": 0.2189, + "step": 38650 + }, + { + "epoch": 1.52, + "grad_norm": 2.1938959308869856, + "learning_rate": 4.9521354515050175e-06, + "loss": 0.2158, + "step": 38675 + }, + { + "epoch": 1.52, + "grad_norm": 1.1451216064786816, + "learning_rate": 4.95209364548495e-06, + "loss": 0.1996, + "step": 38700 + }, + { + "epoch": 1.52, + "grad_norm": 1.1085029403514461, + "learning_rate": 4.952051839464883e-06, + "loss": 0.2116, + "step": 38725 + }, + { + "epoch": 1.52, + "grad_norm": 1.4562454164261576, + "learning_rate": 4.9520100334448164e-06, + "loss": 0.2174, + "step": 38750 + }, + { + "epoch": 1.53, + "grad_norm": 1.9786520420055989, + "learning_rate": 4.951968227424749e-06, + "loss": 0.1918, + "step": 38775 + }, + { + "epoch": 1.53, + "grad_norm": 1.5097439606342058, + "learning_rate": 4.951926421404683e-06, + "loss": 0.229, + "step": 38800 + }, + { + "epoch": 1.53, + "grad_norm": 2.7567248665862696, + "learning_rate": 4.951884615384615e-06, + "loss": 0.2072, + "step": 38825 + }, + { + "epoch": 1.53, + "grad_norm": 1.7139284656959257, + "learning_rate": 4.951842809364549e-06, + "loss": 0.2102, + "step": 38850 + }, + { + "epoch": 1.53, + "grad_norm": 1.7839798240238915, + "learning_rate": 4.951801003344482e-06, + "loss": 0.2117, + "step": 38875 + }, + { + "epoch": 1.53, + "grad_norm": 1.3293739028446008, + "learning_rate": 4.951759197324415e-06, + "loss": 0.1878, + "step": 38900 + }, + { + "epoch": 1.53, + "grad_norm": 1.6080849310086829, + "learning_rate": 4.951717391304348e-06, + "loss": 0.2101, + "step": 38925 + }, + { + "epoch": 1.53, + "grad_norm": 1.250962569095591, + "learning_rate": 4.951675585284281e-06, + "loss": 0.1938, + "step": 38950 + }, + { + "epoch": 1.53, + "grad_norm": 2.1190166838811497, + "learning_rate": 4.951633779264214e-06, + "loss": 0.2159, + "step": 38975 + }, + { + "epoch": 1.53, + "grad_norm": 2.4326007393935587, + "learning_rate": 4.951591973244148e-06, + "loss": 0.1985, + "step": 39000 + }, + { + "epoch": 1.54, + "grad_norm": 1.6414580007592534, + "learning_rate": 4.95155016722408e-06, + "loss": 0.2048, + "step": 39025 + }, + { + "epoch": 1.54, + "grad_norm": 1.9216076329480545, + "learning_rate": 4.951508361204014e-06, + "loss": 0.2124, + "step": 39050 + }, + { + "epoch": 1.54, + "grad_norm": 1.2728391425359964, + "learning_rate": 4.951466555183947e-06, + "loss": 0.2193, + "step": 39075 + }, + { + "epoch": 1.54, + "grad_norm": 1.5009755698819605, + "learning_rate": 4.95142474916388e-06, + "loss": 0.2223, + "step": 39100 + }, + { + "epoch": 1.54, + "grad_norm": 1.9770053209281309, + "learning_rate": 4.951382943143813e-06, + "loss": 0.2027, + "step": 39125 + }, + { + "epoch": 1.54, + "grad_norm": 1.6300988064698791, + "learning_rate": 4.951341137123746e-06, + "loss": 0.2077, + "step": 39150 + }, + { + "epoch": 1.54, + "grad_norm": 1.561710138010411, + "learning_rate": 4.951299331103679e-06, + "loss": 0.1915, + "step": 39175 + }, + { + "epoch": 1.54, + "grad_norm": 2.705460436343843, + "learning_rate": 4.951257525083613e-06, + "loss": 0.192, + "step": 39200 + }, + { + "epoch": 1.54, + "grad_norm": 1.5991161773874567, + "learning_rate": 4.951215719063545e-06, + "loss": 0.2001, + "step": 39225 + }, + { + "epoch": 1.54, + "grad_norm": 1.2368222120398074, + "learning_rate": 4.951175585284281e-06, + "loss": 0.1984, + "step": 39250 + }, + { + "epoch": 1.55, + "grad_norm": 2.787832230883491, + "learning_rate": 4.951133779264215e-06, + "loss": 0.2138, + "step": 39275 + }, + { + "epoch": 1.55, + "grad_norm": 1.7658617749710668, + "learning_rate": 4.951091973244147e-06, + "loss": 0.2084, + "step": 39300 + }, + { + "epoch": 1.55, + "grad_norm": 1.4069418293510692, + "learning_rate": 4.951050167224081e-06, + "loss": 0.209, + "step": 39325 + }, + { + "epoch": 1.55, + "grad_norm": 2.167347269613874, + "learning_rate": 4.951008361204014e-06, + "loss": 0.2259, + "step": 39350 + }, + { + "epoch": 1.55, + "grad_norm": 1.4967667199328691, + "learning_rate": 4.950966555183947e-06, + "loss": 0.1938, + "step": 39375 + }, + { + "epoch": 1.55, + "grad_norm": 1.1203864757858373, + "learning_rate": 4.95092474916388e-06, + "loss": 0.1996, + "step": 39400 + }, + { + "epoch": 1.55, + "grad_norm": 1.8801469194282086, + "learning_rate": 4.9508829431438135e-06, + "loss": 0.2039, + "step": 39425 + }, + { + "epoch": 1.55, + "grad_norm": 1.327003950516436, + "learning_rate": 4.950841137123746e-06, + "loss": 0.2206, + "step": 39450 + }, + { + "epoch": 1.55, + "grad_norm": 2.41131914892425, + "learning_rate": 4.95079933110368e-06, + "loss": 0.2014, + "step": 39475 + }, + { + "epoch": 1.55, + "grad_norm": 2.0819564181136587, + "learning_rate": 4.950757525083612e-06, + "loss": 0.213, + "step": 39500 + }, + { + "epoch": 1.55, + "grad_norm": 1.6673113392240486, + "learning_rate": 4.950715719063546e-06, + "loss": 0.2184, + "step": 39525 + }, + { + "epoch": 1.56, + "grad_norm": 1.380558230210356, + "learning_rate": 4.950673913043479e-06, + "loss": 0.1943, + "step": 39550 + }, + { + "epoch": 1.56, + "grad_norm": 2.1697122354109033, + "learning_rate": 4.950632107023412e-06, + "loss": 0.1906, + "step": 39575 + }, + { + "epoch": 1.56, + "grad_norm": 1.9418778704353494, + "learning_rate": 4.950590301003345e-06, + "loss": 0.189, + "step": 39600 + }, + { + "epoch": 1.56, + "grad_norm": 1.4589182357598085, + "learning_rate": 4.950548494983278e-06, + "loss": 0.1947, + "step": 39625 + }, + { + "epoch": 1.56, + "grad_norm": 1.8193230995535195, + "learning_rate": 4.950506688963211e-06, + "loss": 0.2207, + "step": 39650 + }, + { + "epoch": 1.56, + "grad_norm": 1.3338701052042354, + "learning_rate": 4.950464882943144e-06, + "loss": 0.2067, + "step": 39675 + }, + { + "epoch": 1.56, + "grad_norm": 1.7410998436865661, + "learning_rate": 4.950423076923077e-06, + "loss": 0.1927, + "step": 39700 + }, + { + "epoch": 1.56, + "grad_norm": 1.3042207681102427, + "learning_rate": 4.95038127090301e-06, + "loss": 0.2183, + "step": 39725 + }, + { + "epoch": 1.56, + "grad_norm": 1.7924918618547276, + "learning_rate": 4.950339464882944e-06, + "loss": 0.2164, + "step": 39750 + }, + { + "epoch": 1.56, + "grad_norm": 1.9474636138675756, + "learning_rate": 4.950297658862876e-06, + "loss": 0.205, + "step": 39775 + }, + { + "epoch": 1.57, + "grad_norm": 1.6557124540827683, + "learning_rate": 4.95025585284281e-06, + "loss": 0.2181, + "step": 39800 + }, + { + "epoch": 1.57, + "grad_norm": 1.2497598066214064, + "learning_rate": 4.9502140468227426e-06, + "loss": 0.2372, + "step": 39825 + }, + { + "epoch": 1.57, + "grad_norm": 1.6087866416925465, + "learning_rate": 4.950172240802676e-06, + "loss": 0.2086, + "step": 39850 + }, + { + "epoch": 1.57, + "grad_norm": 1.4725315047956657, + "learning_rate": 4.950130434782609e-06, + "loss": 0.2029, + "step": 39875 + }, + { + "epoch": 1.57, + "grad_norm": 1.4054406114906426, + "learning_rate": 4.950088628762542e-06, + "loss": 0.1899, + "step": 39900 + }, + { + "epoch": 1.57, + "grad_norm": 1.705726548553069, + "learning_rate": 4.950046822742475e-06, + "loss": 0.1959, + "step": 39925 + }, + { + "epoch": 1.57, + "grad_norm": 1.3147943647253022, + "learning_rate": 4.950005016722409e-06, + "loss": 0.2099, + "step": 39950 + }, + { + "epoch": 1.57, + "grad_norm": 1.199900310959841, + "learning_rate": 4.949963210702341e-06, + "loss": 0.1991, + "step": 39975 + }, + { + "epoch": 1.57, + "grad_norm": 1.5347734934426658, + "learning_rate": 4.949921404682275e-06, + "loss": 0.2056, + "step": 40000 + }, + { + "epoch": 1.57, + "eval_loss": 0.471435546875, + "eval_runtime": 11603.9924, + "eval_samples_per_second": 0.816, + "eval_steps_per_second": 0.051, + "eval_wer": 0.12227530846323376, + "step": 40000 + }, + { + "epoch": 1.57, + "grad_norm": 1.2625514121666555, + "learning_rate": 4.9498795986622075e-06, + "loss": 0.2043, + "step": 40025 + }, + { + "epoch": 1.58, + "grad_norm": 1.598268805422133, + "learning_rate": 4.949837792642141e-06, + "loss": 0.2222, + "step": 40050 + }, + { + "epoch": 1.58, + "grad_norm": 1.8626188809803597, + "learning_rate": 4.949795986622074e-06, + "loss": 0.2062, + "step": 40075 + }, + { + "epoch": 1.58, + "grad_norm": 1.2772100379671945, + "learning_rate": 4.949754180602007e-06, + "loss": 0.1935, + "step": 40100 + }, + { + "epoch": 1.58, + "grad_norm": 1.623164374874199, + "learning_rate": 4.94971237458194e-06, + "loss": 0.2072, + "step": 40125 + }, + { + "epoch": 1.58, + "grad_norm": 1.8510882784804008, + "learning_rate": 4.9496705685618736e-06, + "loss": 0.2103, + "step": 40150 + }, + { + "epoch": 1.58, + "grad_norm": 1.1111748213696224, + "learning_rate": 4.949628762541806e-06, + "loss": 0.2119, + "step": 40175 + }, + { + "epoch": 1.58, + "grad_norm": 1.8817631319158592, + "learning_rate": 4.94958695652174e-06, + "loss": 0.2284, + "step": 40200 + }, + { + "epoch": 1.58, + "grad_norm": 1.7672470719864912, + "learning_rate": 4.9495451505016725e-06, + "loss": 0.2094, + "step": 40225 + }, + { + "epoch": 1.58, + "grad_norm": 1.5564127286984561, + "learning_rate": 4.949505016722408e-06, + "loss": 0.1975, + "step": 40250 + }, + { + "epoch": 1.58, + "grad_norm": 1.6106298300859476, + "learning_rate": 4.949463210702342e-06, + "loss": 0.2036, + "step": 40275 + }, + { + "epoch": 1.59, + "grad_norm": 1.9484881833986716, + "learning_rate": 4.949421404682275e-06, + "loss": 0.2186, + "step": 40300 + }, + { + "epoch": 1.59, + "grad_norm": 1.9152229538502286, + "learning_rate": 4.949379598662208e-06, + "loss": 0.195, + "step": 40325 + }, + { + "epoch": 1.59, + "grad_norm": 2.014374946325913, + "learning_rate": 4.949337792642141e-06, + "loss": 0.2268, + "step": 40350 + }, + { + "epoch": 1.59, + "grad_norm": 1.4380015003183522, + "learning_rate": 4.949295986622074e-06, + "loss": 0.2022, + "step": 40375 + }, + { + "epoch": 1.59, + "grad_norm": 1.8807119242703392, + "learning_rate": 4.949254180602007e-06, + "loss": 0.2071, + "step": 40400 + }, + { + "epoch": 1.59, + "grad_norm": 1.2257784971118444, + "learning_rate": 4.949212374581941e-06, + "loss": 0.2172, + "step": 40425 + }, + { + "epoch": 1.59, + "grad_norm": 1.3513506180545665, + "learning_rate": 4.949170568561873e-06, + "loss": 0.2005, + "step": 40450 + }, + { + "epoch": 1.59, + "grad_norm": 1.7195640433114703, + "learning_rate": 4.949128762541807e-06, + "loss": 0.2044, + "step": 40475 + }, + { + "epoch": 1.59, + "grad_norm": 1.8785362354699997, + "learning_rate": 4.9490869565217396e-06, + "loss": 0.2008, + "step": 40500 + }, + { + "epoch": 1.59, + "grad_norm": 1.4693582668589034, + "learning_rate": 4.949045150501673e-06, + "loss": 0.1974, + "step": 40525 + }, + { + "epoch": 1.6, + "grad_norm": 2.126779893482259, + "learning_rate": 4.949003344481606e-06, + "loss": 0.2031, + "step": 40550 + }, + { + "epoch": 1.6, + "grad_norm": 2.0166603572842985, + "learning_rate": 4.9489615384615385e-06, + "loss": 0.203, + "step": 40575 + }, + { + "epoch": 1.6, + "grad_norm": 1.530039181431065, + "learning_rate": 4.948919732441472e-06, + "loss": 0.205, + "step": 40600 + }, + { + "epoch": 1.6, + "grad_norm": 1.2765710499831335, + "learning_rate": 4.948877926421405e-06, + "loss": 0.1917, + "step": 40625 + }, + { + "epoch": 1.6, + "grad_norm": 1.361058220611067, + "learning_rate": 4.948836120401338e-06, + "loss": 0.1953, + "step": 40650 + }, + { + "epoch": 1.6, + "grad_norm": 2.077832653814155, + "learning_rate": 4.948794314381271e-06, + "loss": 0.2052, + "step": 40675 + }, + { + "epoch": 1.6, + "grad_norm": 1.4160926776241496, + "learning_rate": 4.9487525083612045e-06, + "loss": 0.2133, + "step": 40700 + }, + { + "epoch": 1.6, + "grad_norm": 1.0655600758672423, + "learning_rate": 4.948710702341137e-06, + "loss": 0.1919, + "step": 40725 + }, + { + "epoch": 1.6, + "grad_norm": 1.6603254897395308, + "learning_rate": 4.948668896321071e-06, + "loss": 0.2079, + "step": 40750 + }, + { + "epoch": 1.6, + "grad_norm": 1.768524529622312, + "learning_rate": 4.9486270903010035e-06, + "loss": 0.2002, + "step": 40775 + }, + { + "epoch": 1.61, + "grad_norm": 1.7012943976613921, + "learning_rate": 4.948585284280937e-06, + "loss": 0.2011, + "step": 40800 + }, + { + "epoch": 1.61, + "grad_norm": 1.7082794174545288, + "learning_rate": 4.94854347826087e-06, + "loss": 0.2062, + "step": 40825 + }, + { + "epoch": 1.61, + "grad_norm": 1.928279567494617, + "learning_rate": 4.948501672240803e-06, + "loss": 0.1939, + "step": 40850 + }, + { + "epoch": 1.61, + "grad_norm": 1.9975469181079872, + "learning_rate": 4.948459866220736e-06, + "loss": 0.195, + "step": 40875 + }, + { + "epoch": 1.61, + "grad_norm": 0.9802397774014037, + "learning_rate": 4.9484180602006695e-06, + "loss": 0.2166, + "step": 40900 + }, + { + "epoch": 1.61, + "grad_norm": 1.443205417331069, + "learning_rate": 4.948376254180602e-06, + "loss": 0.219, + "step": 40925 + }, + { + "epoch": 1.61, + "grad_norm": 1.8443352124627206, + "learning_rate": 4.948334448160536e-06, + "loss": 0.1943, + "step": 40950 + }, + { + "epoch": 1.61, + "grad_norm": 1.193610360908578, + "learning_rate": 4.9482926421404685e-06, + "loss": 0.197, + "step": 40975 + }, + { + "epoch": 1.61, + "grad_norm": 1.4623521124140761, + "learning_rate": 4.948250836120402e-06, + "loss": 0.2081, + "step": 41000 + }, + { + "epoch": 1.61, + "grad_norm": 1.3374220657812355, + "learning_rate": 4.948209030100335e-06, + "loss": 0.1975, + "step": 41025 + }, + { + "epoch": 1.61, + "grad_norm": 1.1336669393881718, + "learning_rate": 4.948167224080268e-06, + "loss": 0.203, + "step": 41050 + }, + { + "epoch": 1.62, + "grad_norm": 2.2493898272913384, + "learning_rate": 4.948125418060201e-06, + "loss": 0.1948, + "step": 41075 + }, + { + "epoch": 1.62, + "grad_norm": 1.2972621198738996, + "learning_rate": 4.9480836120401345e-06, + "loss": 0.2192, + "step": 41100 + }, + { + "epoch": 1.62, + "grad_norm": 2.388290835184551, + "learning_rate": 4.948041806020067e-06, + "loss": 0.2166, + "step": 41125 + }, + { + "epoch": 1.62, + "grad_norm": 1.5916665380726456, + "learning_rate": 4.948000000000001e-06, + "loss": 0.2136, + "step": 41150 + }, + { + "epoch": 1.62, + "grad_norm": 1.8625346417846564, + "learning_rate": 4.9479581939799334e-06, + "loss": 0.2046, + "step": 41175 + }, + { + "epoch": 1.62, + "grad_norm": 1.2016947650644971, + "learning_rate": 4.947916387959867e-06, + "loss": 0.1867, + "step": 41200 + }, + { + "epoch": 1.62, + "grad_norm": 1.5740502812375239, + "learning_rate": 4.9478745819398e-06, + "loss": 0.1927, + "step": 41225 + }, + { + "epoch": 1.62, + "grad_norm": 1.3675578577012184, + "learning_rate": 4.9478344481605355e-06, + "loss": 0.1967, + "step": 41250 + }, + { + "epoch": 1.62, + "grad_norm": 1.3133012100842971, + "learning_rate": 4.947792642140469e-06, + "loss": 0.2053, + "step": 41275 + }, + { + "epoch": 1.62, + "grad_norm": 1.8309449231056476, + "learning_rate": 4.947750836120402e-06, + "loss": 0.2045, + "step": 41300 + }, + { + "epoch": 1.63, + "grad_norm": 2.000483291522688, + "learning_rate": 4.947709030100335e-06, + "loss": 0.2107, + "step": 41325 + }, + { + "epoch": 1.63, + "grad_norm": 1.8991039424577518, + "learning_rate": 4.947667224080268e-06, + "loss": 0.1881, + "step": 41350 + }, + { + "epoch": 1.63, + "grad_norm": 1.7183900225861886, + "learning_rate": 4.9476254180602015e-06, + "loss": 0.1982, + "step": 41375 + }, + { + "epoch": 1.63, + "grad_norm": 1.5200229992542378, + "learning_rate": 4.947583612040134e-06, + "loss": 0.1985, + "step": 41400 + }, + { + "epoch": 1.63, + "grad_norm": 1.420445600987573, + "learning_rate": 4.947541806020068e-06, + "loss": 0.2126, + "step": 41425 + }, + { + "epoch": 1.63, + "grad_norm": 2.1764873951692207, + "learning_rate": 4.9475000000000005e-06, + "loss": 0.21, + "step": 41450 + }, + { + "epoch": 1.63, + "grad_norm": 1.2261555787798102, + "learning_rate": 4.947458193979934e-06, + "loss": 0.21, + "step": 41475 + }, + { + "epoch": 1.63, + "grad_norm": 1.7188400172880676, + "learning_rate": 4.947416387959867e-06, + "loss": 0.1842, + "step": 41500 + }, + { + "epoch": 1.63, + "grad_norm": 1.0411835673772099, + "learning_rate": 4.9473745819397994e-06, + "loss": 0.1884, + "step": 41525 + }, + { + "epoch": 1.63, + "grad_norm": 0.8882216836071882, + "learning_rate": 4.947332775919733e-06, + "loss": 0.1994, + "step": 41550 + }, + { + "epoch": 1.64, + "grad_norm": 1.1029055988375176, + "learning_rate": 4.947290969899666e-06, + "loss": 0.1998, + "step": 41575 + }, + { + "epoch": 1.64, + "grad_norm": 1.3852789220034836, + "learning_rate": 4.947249163879599e-06, + "loss": 0.2331, + "step": 41600 + }, + { + "epoch": 1.64, + "grad_norm": 1.324731226538153, + "learning_rate": 4.947207357859532e-06, + "loss": 0.2266, + "step": 41625 + }, + { + "epoch": 1.64, + "grad_norm": 2.4657036693086165, + "learning_rate": 4.9471655518394655e-06, + "loss": 0.215, + "step": 41650 + }, + { + "epoch": 1.64, + "grad_norm": 1.527084020270202, + "learning_rate": 4.947123745819398e-06, + "loss": 0.1884, + "step": 41675 + }, + { + "epoch": 1.64, + "grad_norm": 1.6846948970525009, + "learning_rate": 4.947081939799332e-06, + "loss": 0.2197, + "step": 41700 + }, + { + "epoch": 1.64, + "grad_norm": 1.3851303237662287, + "learning_rate": 4.947040133779264e-06, + "loss": 0.2075, + "step": 41725 + }, + { + "epoch": 1.64, + "grad_norm": 1.734382175640918, + "learning_rate": 4.946998327759198e-06, + "loss": 0.1901, + "step": 41750 + }, + { + "epoch": 1.64, + "grad_norm": 1.8727000776687595, + "learning_rate": 4.946956521739131e-06, + "loss": 0.2099, + "step": 41775 + }, + { + "epoch": 1.64, + "grad_norm": 1.3410587525533615, + "learning_rate": 4.946914715719064e-06, + "loss": 0.2144, + "step": 41800 + }, + { + "epoch": 1.65, + "grad_norm": 2.232771457346267, + "learning_rate": 4.946872909698997e-06, + "loss": 0.2167, + "step": 41825 + }, + { + "epoch": 1.65, + "grad_norm": 1.5134483112849184, + "learning_rate": 4.9468311036789304e-06, + "loss": 0.2034, + "step": 41850 + }, + { + "epoch": 1.65, + "grad_norm": 1.2186748135314416, + "learning_rate": 4.946789297658863e-06, + "loss": 0.2016, + "step": 41875 + }, + { + "epoch": 1.65, + "grad_norm": 1.5576513070546023, + "learning_rate": 4.946747491638797e-06, + "loss": 0.2147, + "step": 41900 + }, + { + "epoch": 1.65, + "grad_norm": 2.350090298252575, + "learning_rate": 4.946705685618729e-06, + "loss": 0.2087, + "step": 41925 + }, + { + "epoch": 1.65, + "grad_norm": 1.5693305584665407, + "learning_rate": 4.946663879598663e-06, + "loss": 0.2105, + "step": 41950 + }, + { + "epoch": 1.65, + "grad_norm": 1.664226564042428, + "learning_rate": 4.946622073578596e-06, + "loss": 0.2092, + "step": 41975 + }, + { + "epoch": 1.65, + "grad_norm": 2.5203247472068897, + "learning_rate": 4.946580267558529e-06, + "loss": 0.21, + "step": 42000 + }, + { + "epoch": 1.65, + "grad_norm": 1.6732043500656888, + "learning_rate": 4.946538461538462e-06, + "loss": 0.2172, + "step": 42025 + }, + { + "epoch": 1.65, + "grad_norm": 2.1170712647253107, + "learning_rate": 4.946496655518395e-06, + "loss": 0.2095, + "step": 42050 + }, + { + "epoch": 1.66, + "grad_norm": 2.27085798758864, + "learning_rate": 4.946454849498328e-06, + "loss": 0.1996, + "step": 42075 + }, + { + "epoch": 1.66, + "grad_norm": 2.121975214434564, + "learning_rate": 4.946413043478262e-06, + "loss": 0.1915, + "step": 42100 + }, + { + "epoch": 1.66, + "grad_norm": 1.8454823137618726, + "learning_rate": 4.946371237458194e-06, + "loss": 0.2185, + "step": 42125 + }, + { + "epoch": 1.66, + "grad_norm": 1.7165066242154234, + "learning_rate": 4.946329431438128e-06, + "loss": 0.2285, + "step": 42150 + }, + { + "epoch": 1.66, + "grad_norm": 1.1935629947391357, + "learning_rate": 4.946287625418061e-06, + "loss": 0.2055, + "step": 42175 + }, + { + "epoch": 1.66, + "grad_norm": 1.6263960825451074, + "learning_rate": 4.946245819397994e-06, + "loss": 0.1963, + "step": 42200 + }, + { + "epoch": 1.66, + "grad_norm": 1.5830643088692502, + "learning_rate": 4.946204013377927e-06, + "loss": 0.1938, + "step": 42225 + }, + { + "epoch": 1.66, + "grad_norm": 1.188351267211218, + "learning_rate": 4.946163879598663e-06, + "loss": 0.1972, + "step": 42250 + }, + { + "epoch": 1.66, + "grad_norm": 1.5228388266516044, + "learning_rate": 4.946122073578596e-06, + "loss": 0.2146, + "step": 42275 + }, + { + "epoch": 1.66, + "grad_norm": 1.2417331494377257, + "learning_rate": 4.946080267558529e-06, + "loss": 0.1901, + "step": 42300 + }, + { + "epoch": 1.67, + "grad_norm": 1.2050316831091845, + "learning_rate": 4.9460384615384625e-06, + "loss": 0.1964, + "step": 42325 + }, + { + "epoch": 1.67, + "grad_norm": 0.7456359383137927, + "learning_rate": 4.945996655518395e-06, + "loss": 0.2114, + "step": 42350 + }, + { + "epoch": 1.67, + "grad_norm": 1.1280620070130793, + "learning_rate": 4.945954849498329e-06, + "loss": 0.204, + "step": 42375 + }, + { + "epoch": 1.67, + "grad_norm": 1.8154667054830138, + "learning_rate": 4.945913043478261e-06, + "loss": 0.2017, + "step": 42400 + }, + { + "epoch": 1.67, + "grad_norm": 1.6384097778731235, + "learning_rate": 4.945871237458195e-06, + "loss": 0.2066, + "step": 42425 + }, + { + "epoch": 1.67, + "grad_norm": 1.8156792744274872, + "learning_rate": 4.945829431438128e-06, + "loss": 0.1958, + "step": 42450 + }, + { + "epoch": 1.67, + "grad_norm": 1.1599616876765457, + "learning_rate": 4.94578762541806e-06, + "loss": 0.2014, + "step": 42475 + }, + { + "epoch": 1.67, + "grad_norm": 1.4247687110866212, + "learning_rate": 4.945745819397993e-06, + "loss": 0.2277, + "step": 42500 + }, + { + "epoch": 1.67, + "grad_norm": 1.8355672996659873, + "learning_rate": 4.945704013377927e-06, + "loss": 0.2099, + "step": 42525 + }, + { + "epoch": 1.67, + "grad_norm": 2.425217424300148, + "learning_rate": 4.94566220735786e-06, + "loss": 0.2016, + "step": 42550 + }, + { + "epoch": 1.67, + "grad_norm": 1.7198208972430582, + "learning_rate": 4.945620401337793e-06, + "loss": 0.202, + "step": 42575 + }, + { + "epoch": 1.68, + "grad_norm": 1.1552545949551052, + "learning_rate": 4.945578595317726e-06, + "loss": 0.221, + "step": 42600 + }, + { + "epoch": 1.68, + "grad_norm": 1.4029219339538905, + "learning_rate": 4.945536789297659e-06, + "loss": 0.1987, + "step": 42625 + }, + { + "epoch": 1.68, + "grad_norm": 2.3882183271546116, + "learning_rate": 4.945494983277593e-06, + "loss": 0.2081, + "step": 42650 + }, + { + "epoch": 1.68, + "grad_norm": 0.9199663729648839, + "learning_rate": 4.945453177257525e-06, + "loss": 0.1973, + "step": 42675 + }, + { + "epoch": 1.68, + "grad_norm": 0.8901908136062112, + "learning_rate": 4.945411371237459e-06, + "loss": 0.2012, + "step": 42700 + }, + { + "epoch": 1.68, + "grad_norm": 1.3827221258424418, + "learning_rate": 4.945369565217392e-06, + "loss": 0.2079, + "step": 42725 + }, + { + "epoch": 1.68, + "grad_norm": 1.7149862183278763, + "learning_rate": 4.945327759197325e-06, + "loss": 0.1983, + "step": 42750 + }, + { + "epoch": 1.68, + "grad_norm": 1.9186732501542165, + "learning_rate": 4.945285953177258e-06, + "loss": 0.2022, + "step": 42775 + }, + { + "epoch": 1.68, + "grad_norm": 1.633325078705575, + "learning_rate": 4.945244147157191e-06, + "loss": 0.2094, + "step": 42800 + }, + { + "epoch": 1.68, + "grad_norm": 1.397331371465462, + "learning_rate": 4.945202341137124e-06, + "loss": 0.2227, + "step": 42825 + }, + { + "epoch": 1.69, + "grad_norm": 1.4918868776955483, + "learning_rate": 4.945160535117058e-06, + "loss": 0.214, + "step": 42850 + }, + { + "epoch": 1.69, + "grad_norm": 1.595591918880962, + "learning_rate": 4.94511872909699e-06, + "loss": 0.2151, + "step": 42875 + }, + { + "epoch": 1.69, + "grad_norm": 1.7222991867626707, + "learning_rate": 4.945076923076924e-06, + "loss": 0.2128, + "step": 42900 + }, + { + "epoch": 1.69, + "grad_norm": 1.1691471473743391, + "learning_rate": 4.9450351170568566e-06, + "loss": 0.2075, + "step": 42925 + }, + { + "epoch": 1.69, + "grad_norm": 1.4662345851129484, + "learning_rate": 4.94499331103679e-06, + "loss": 0.2125, + "step": 42950 + }, + { + "epoch": 1.69, + "grad_norm": 1.5717643240108634, + "learning_rate": 4.944951505016723e-06, + "loss": 0.1802, + "step": 42975 + }, + { + "epoch": 1.69, + "grad_norm": 1.3287909014275803, + "learning_rate": 4.944909698996656e-06, + "loss": 0.1952, + "step": 43000 + }, + { + "epoch": 1.69, + "grad_norm": 1.097159089815716, + "learning_rate": 4.944867892976589e-06, + "loss": 0.202, + "step": 43025 + }, + { + "epoch": 1.69, + "grad_norm": 1.9895232385883712, + "learning_rate": 4.944826086956523e-06, + "loss": 0.2109, + "step": 43050 + }, + { + "epoch": 1.69, + "grad_norm": 1.5103690475667175, + "learning_rate": 4.944784280936455e-06, + "loss": 0.189, + "step": 43075 + }, + { + "epoch": 1.7, + "grad_norm": 1.882631524404233, + "learning_rate": 4.944742474916389e-06, + "loss": 0.1967, + "step": 43100 + }, + { + "epoch": 1.7, + "grad_norm": 1.3046423330005568, + "learning_rate": 4.9447006688963215e-06, + "loss": 0.1788, + "step": 43125 + }, + { + "epoch": 1.7, + "grad_norm": 1.693461277845761, + "learning_rate": 4.944658862876255e-06, + "loss": 0.1978, + "step": 43150 + }, + { + "epoch": 1.7, + "grad_norm": 1.453539084307599, + "learning_rate": 4.944617056856188e-06, + "loss": 0.2291, + "step": 43175 + }, + { + "epoch": 1.7, + "grad_norm": 1.7794757369908873, + "learning_rate": 4.9445752508361205e-06, + "loss": 0.2148, + "step": 43200 + }, + { + "epoch": 1.7, + "grad_norm": 1.7997181495063956, + "learning_rate": 4.944533444816054e-06, + "loss": 0.1912, + "step": 43225 + }, + { + "epoch": 1.7, + "grad_norm": 2.103310345158487, + "learning_rate": 4.94449331103679e-06, + "loss": 0.2029, + "step": 43250 + }, + { + "epoch": 1.7, + "grad_norm": 1.3283148483923, + "learning_rate": 4.944451505016723e-06, + "loss": 0.2016, + "step": 43275 + }, + { + "epoch": 1.7, + "grad_norm": 1.5231191363195111, + "learning_rate": 4.944409698996656e-06, + "loss": 0.194, + "step": 43300 + }, + { + "epoch": 1.7, + "grad_norm": 1.727077984512469, + "learning_rate": 4.94436789297659e-06, + "loss": 0.2031, + "step": 43325 + }, + { + "epoch": 1.71, + "grad_norm": 1.453989318654602, + "learning_rate": 4.944326086956522e-06, + "loss": 0.2083, + "step": 43350 + }, + { + "epoch": 1.71, + "grad_norm": 1.5469548856275384, + "learning_rate": 4.944284280936456e-06, + "loss": 0.2176, + "step": 43375 + }, + { + "epoch": 1.71, + "grad_norm": 3.160825934896527, + "learning_rate": 4.944242474916388e-06, + "loss": 0.2022, + "step": 43400 + }, + { + "epoch": 1.71, + "grad_norm": 1.5719087440594064, + "learning_rate": 4.944200668896321e-06, + "loss": 0.2125, + "step": 43425 + }, + { + "epoch": 1.71, + "grad_norm": 1.3656354570889466, + "learning_rate": 4.944158862876254e-06, + "loss": 0.2117, + "step": 43450 + }, + { + "epoch": 1.71, + "grad_norm": 1.5367941234175355, + "learning_rate": 4.9441170568561875e-06, + "loss": 0.2176, + "step": 43475 + }, + { + "epoch": 1.71, + "grad_norm": 2.4417364583297174, + "learning_rate": 4.94407525083612e-06, + "loss": 0.1879, + "step": 43500 + }, + { + "epoch": 1.71, + "grad_norm": 1.2630478840085972, + "learning_rate": 4.944033444816054e-06, + "loss": 0.2, + "step": 43525 + }, + { + "epoch": 1.71, + "grad_norm": 1.112072451582671, + "learning_rate": 4.9439916387959865e-06, + "loss": 0.1968, + "step": 43550 + }, + { + "epoch": 1.71, + "grad_norm": 1.7713642185306702, + "learning_rate": 4.94394983277592e-06, + "loss": 0.2267, + "step": 43575 + }, + { + "epoch": 1.72, + "grad_norm": 1.2056972754693054, + "learning_rate": 4.943908026755853e-06, + "loss": 0.2014, + "step": 43600 + }, + { + "epoch": 1.72, + "grad_norm": 2.092655557657814, + "learning_rate": 4.943866220735786e-06, + "loss": 0.2009, + "step": 43625 + }, + { + "epoch": 1.72, + "grad_norm": 1.7225505908943808, + "learning_rate": 4.94382441471572e-06, + "loss": 0.2268, + "step": 43650 + }, + { + "epoch": 1.72, + "grad_norm": 1.5368215742705937, + "learning_rate": 4.9437826086956525e-06, + "loss": 0.2055, + "step": 43675 + }, + { + "epoch": 1.72, + "grad_norm": 1.3197247012408562, + "learning_rate": 4.943740802675586e-06, + "loss": 0.2343, + "step": 43700 + }, + { + "epoch": 1.72, + "grad_norm": 1.3772135831744492, + "learning_rate": 4.943698996655519e-06, + "loss": 0.1976, + "step": 43725 + }, + { + "epoch": 1.72, + "grad_norm": 1.4326058066713432, + "learning_rate": 4.943657190635452e-06, + "loss": 0.2031, + "step": 43750 + }, + { + "epoch": 1.72, + "grad_norm": 1.5888807750342064, + "learning_rate": 4.943615384615385e-06, + "loss": 0.2074, + "step": 43775 + }, + { + "epoch": 1.72, + "grad_norm": 1.5396007104076448, + "learning_rate": 4.9435735785953185e-06, + "loss": 0.2067, + "step": 43800 + }, + { + "epoch": 1.72, + "grad_norm": 2.4011127551195726, + "learning_rate": 4.943531772575251e-06, + "loss": 0.1787, + "step": 43825 + }, + { + "epoch": 1.73, + "grad_norm": 0.833780754470428, + "learning_rate": 4.943489966555185e-06, + "loss": 0.1943, + "step": 43850 + }, + { + "epoch": 1.73, + "grad_norm": 1.4268589028719112, + "learning_rate": 4.9434481605351175e-06, + "loss": 0.184, + "step": 43875 + }, + { + "epoch": 1.73, + "grad_norm": 2.1879663087707617, + "learning_rate": 4.943406354515051e-06, + "loss": 0.2, + "step": 43900 + }, + { + "epoch": 1.73, + "grad_norm": 1.4837138515357116, + "learning_rate": 4.943364548494984e-06, + "loss": 0.1938, + "step": 43925 + }, + { + "epoch": 1.73, + "grad_norm": 1.302090890735732, + "learning_rate": 4.943322742474917e-06, + "loss": 0.2179, + "step": 43950 + }, + { + "epoch": 1.73, + "grad_norm": 1.861136682417232, + "learning_rate": 4.94328093645485e-06, + "loss": 0.2182, + "step": 43975 + }, + { + "epoch": 1.73, + "grad_norm": 2.335286311587073, + "learning_rate": 4.9432391304347835e-06, + "loss": 0.2214, + "step": 44000 + }, + { + "epoch": 1.73, + "grad_norm": 1.6769155432405607, + "learning_rate": 4.943197324414716e-06, + "loss": 0.2042, + "step": 44025 + }, + { + "epoch": 1.73, + "grad_norm": 2.200106902410679, + "learning_rate": 4.94315551839465e-06, + "loss": 0.1987, + "step": 44050 + }, + { + "epoch": 1.73, + "grad_norm": 1.7563502007021174, + "learning_rate": 4.9431137123745825e-06, + "loss": 0.1951, + "step": 44075 + }, + { + "epoch": 1.73, + "grad_norm": 1.5530867625540488, + "learning_rate": 4.943071906354516e-06, + "loss": 0.2139, + "step": 44100 + }, + { + "epoch": 1.74, + "grad_norm": 1.3534943489075693, + "learning_rate": 4.943030100334449e-06, + "loss": 0.1773, + "step": 44125 + }, + { + "epoch": 1.74, + "grad_norm": 1.4957898910507397, + "learning_rate": 4.942988294314381e-06, + "loss": 0.228, + "step": 44150 + }, + { + "epoch": 1.74, + "grad_norm": 2.8785662557550045, + "learning_rate": 4.942946488294315e-06, + "loss": 0.2081, + "step": 44175 + }, + { + "epoch": 1.74, + "grad_norm": 1.712164070763034, + "learning_rate": 4.942904682274248e-06, + "loss": 0.2042, + "step": 44200 + }, + { + "epoch": 1.74, + "grad_norm": 1.52945812067655, + "learning_rate": 4.942862876254181e-06, + "loss": 0.2277, + "step": 44225 + }, + { + "epoch": 1.74, + "grad_norm": 1.3722433996508723, + "learning_rate": 4.942822742474917e-06, + "loss": 0.2299, + "step": 44250 + }, + { + "epoch": 1.74, + "grad_norm": 1.5393189925564164, + "learning_rate": 4.94278093645485e-06, + "loss": 0.2175, + "step": 44275 + }, + { + "epoch": 1.74, + "grad_norm": 1.5822179394649376, + "learning_rate": 4.942739130434783e-06, + "loss": 0.2038, + "step": 44300 + }, + { + "epoch": 1.74, + "grad_norm": 2.3792366307810666, + "learning_rate": 4.942697324414717e-06, + "loss": 0.2047, + "step": 44325 + }, + { + "epoch": 1.74, + "grad_norm": 1.70816553270609, + "learning_rate": 4.942655518394649e-06, + "loss": 0.2217, + "step": 44350 + }, + { + "epoch": 1.75, + "grad_norm": 1.2792536160078478, + "learning_rate": 4.942613712374582e-06, + "loss": 0.2131, + "step": 44375 + }, + { + "epoch": 1.75, + "grad_norm": 1.4575673021789084, + "learning_rate": 4.942571906354515e-06, + "loss": 0.2131, + "step": 44400 + }, + { + "epoch": 1.75, + "grad_norm": 1.8400518563065749, + "learning_rate": 4.9425301003344485e-06, + "loss": 0.2037, + "step": 44425 + }, + { + "epoch": 1.75, + "grad_norm": 1.5635883512841116, + "learning_rate": 4.942488294314381e-06, + "loss": 0.2091, + "step": 44450 + }, + { + "epoch": 1.75, + "grad_norm": 2.0007327369281773, + "learning_rate": 4.942446488294315e-06, + "loss": 0.2127, + "step": 44475 + }, + { + "epoch": 1.75, + "grad_norm": 2.119742302055427, + "learning_rate": 4.942404682274247e-06, + "loss": 0.2012, + "step": 44500 + }, + { + "epoch": 1.75, + "grad_norm": 1.9344147417555144, + "learning_rate": 4.942362876254181e-06, + "loss": 0.2013, + "step": 44525 + }, + { + "epoch": 1.75, + "grad_norm": 1.328539603876774, + "learning_rate": 4.942321070234114e-06, + "loss": 0.1986, + "step": 44550 + }, + { + "epoch": 1.75, + "grad_norm": 1.7040707858619548, + "learning_rate": 4.942279264214047e-06, + "loss": 0.1954, + "step": 44575 + }, + { + "epoch": 1.75, + "grad_norm": 2.4008583236352887, + "learning_rate": 4.94223745819398e-06, + "loss": 0.2226, + "step": 44600 + }, + { + "epoch": 1.76, + "grad_norm": 1.148723618960234, + "learning_rate": 4.9421956521739134e-06, + "loss": 0.2067, + "step": 44625 + }, + { + "epoch": 1.76, + "grad_norm": 1.4833158950358802, + "learning_rate": 4.942153846153846e-06, + "loss": 0.2012, + "step": 44650 + }, + { + "epoch": 1.76, + "grad_norm": 1.8801569799377404, + "learning_rate": 4.94211204013378e-06, + "loss": 0.204, + "step": 44675 + }, + { + "epoch": 1.76, + "grad_norm": 1.6972529337434588, + "learning_rate": 4.942070234113712e-06, + "loss": 0.2183, + "step": 44700 + }, + { + "epoch": 1.76, + "grad_norm": 1.7424024925861386, + "learning_rate": 4.942028428093646e-06, + "loss": 0.196, + "step": 44725 + }, + { + "epoch": 1.76, + "grad_norm": 1.419744820585979, + "learning_rate": 4.9419866220735795e-06, + "loss": 0.2205, + "step": 44750 + }, + { + "epoch": 1.76, + "grad_norm": 1.773705854022055, + "learning_rate": 4.941944816053512e-06, + "loss": 0.1951, + "step": 44775 + }, + { + "epoch": 1.76, + "grad_norm": 1.6575512331545263, + "learning_rate": 4.941903010033446e-06, + "loss": 0.1998, + "step": 44800 + }, + { + "epoch": 1.76, + "grad_norm": 0.8632058992042098, + "learning_rate": 4.941861204013378e-06, + "loss": 0.2272, + "step": 44825 + }, + { + "epoch": 1.76, + "grad_norm": 1.8833068240202187, + "learning_rate": 4.941819397993312e-06, + "loss": 0.23, + "step": 44850 + }, + { + "epoch": 1.77, + "grad_norm": 1.5386804945912553, + "learning_rate": 4.941777591973245e-06, + "loss": 0.2026, + "step": 44875 + }, + { + "epoch": 1.77, + "grad_norm": 1.1855200203299385, + "learning_rate": 4.941735785953178e-06, + "loss": 0.2161, + "step": 44900 + }, + { + "epoch": 1.77, + "grad_norm": 1.9564640886071363, + "learning_rate": 4.941693979933111e-06, + "loss": 0.1789, + "step": 44925 + }, + { + "epoch": 1.77, + "grad_norm": 2.340358116387867, + "learning_rate": 4.9416521739130444e-06, + "loss": 0.1937, + "step": 44950 + }, + { + "epoch": 1.77, + "grad_norm": 2.1872850428854087, + "learning_rate": 4.941610367892977e-06, + "loss": 0.2088, + "step": 44975 + }, + { + "epoch": 1.77, + "grad_norm": 1.3941237415424022, + "learning_rate": 4.941568561872911e-06, + "loss": 0.215, + "step": 45000 + }, + { + "epoch": 1.77, + "grad_norm": 1.2422150166796995, + "learning_rate": 4.941526755852843e-06, + "loss": 0.1978, + "step": 45025 + }, + { + "epoch": 1.77, + "grad_norm": 1.2834289450493046, + "learning_rate": 4.941484949832777e-06, + "loss": 0.2019, + "step": 45050 + }, + { + "epoch": 1.77, + "grad_norm": 1.3887225318643923, + "learning_rate": 4.941443143812709e-06, + "loss": 0.2005, + "step": 45075 + }, + { + "epoch": 1.77, + "grad_norm": 1.5957356966876162, + "learning_rate": 4.941401337792642e-06, + "loss": 0.2116, + "step": 45100 + }, + { + "epoch": 1.78, + "grad_norm": 1.7998947945044745, + "learning_rate": 4.941359531772575e-06, + "loss": 0.2113, + "step": 45125 + }, + { + "epoch": 1.78, + "grad_norm": 1.2863221238150597, + "learning_rate": 4.9413177257525086e-06, + "loss": 0.1909, + "step": 45150 + }, + { + "epoch": 1.78, + "grad_norm": 1.291509095965881, + "learning_rate": 4.941275919732442e-06, + "loss": 0.1993, + "step": 45175 + }, + { + "epoch": 1.78, + "grad_norm": 2.1958530909710876, + "learning_rate": 4.941234113712375e-06, + "loss": 0.2115, + "step": 45200 + }, + { + "epoch": 1.78, + "grad_norm": 1.5439304960130933, + "learning_rate": 4.941192307692308e-06, + "loss": 0.1959, + "step": 45225 + }, + { + "epoch": 1.78, + "grad_norm": 1.9055756132780866, + "learning_rate": 4.941152173913044e-06, + "loss": 0.2022, + "step": 45250 + }, + { + "epoch": 1.78, + "grad_norm": 1.2942694636126806, + "learning_rate": 4.941110367892977e-06, + "loss": 0.1945, + "step": 45275 + }, + { + "epoch": 1.78, + "grad_norm": 1.6378120736690176, + "learning_rate": 4.94106856187291e-06, + "loss": 0.2001, + "step": 45300 + }, + { + "epoch": 1.78, + "grad_norm": 1.7309152118782125, + "learning_rate": 4.941026755852843e-06, + "loss": 0.189, + "step": 45325 + }, + { + "epoch": 1.78, + "grad_norm": 1.5926592940812094, + "learning_rate": 4.940984949832776e-06, + "loss": 0.1886, + "step": 45350 + }, + { + "epoch": 1.79, + "grad_norm": 1.4814435621208106, + "learning_rate": 4.940943143812709e-06, + "loss": 0.2175, + "step": 45375 + }, + { + "epoch": 1.79, + "grad_norm": 2.3340113593410514, + "learning_rate": 4.940901337792642e-06, + "loss": 0.2064, + "step": 45400 + }, + { + "epoch": 1.79, + "grad_norm": 1.7179296631796945, + "learning_rate": 4.940859531772576e-06, + "loss": 0.2027, + "step": 45425 + }, + { + "epoch": 1.79, + "grad_norm": 1.3525854738210719, + "learning_rate": 4.940817725752508e-06, + "loss": 0.2069, + "step": 45450 + }, + { + "epoch": 1.79, + "grad_norm": 1.1243049994361427, + "learning_rate": 4.940775919732442e-06, + "loss": 0.1942, + "step": 45475 + }, + { + "epoch": 1.79, + "grad_norm": 1.2883069022850577, + "learning_rate": 4.9407341137123746e-06, + "loss": 0.1776, + "step": 45500 + }, + { + "epoch": 1.79, + "grad_norm": 1.3987378560793549, + "learning_rate": 4.940692307692308e-06, + "loss": 0.2085, + "step": 45525 + }, + { + "epoch": 1.79, + "grad_norm": 1.8141337973143972, + "learning_rate": 4.940650501672241e-06, + "loss": 0.2077, + "step": 45550 + }, + { + "epoch": 1.79, + "grad_norm": 2.1709295579225634, + "learning_rate": 4.940608695652174e-06, + "loss": 0.1949, + "step": 45575 + }, + { + "epoch": 1.79, + "grad_norm": 1.5535520468286816, + "learning_rate": 4.940566889632107e-06, + "loss": 0.1988, + "step": 45600 + }, + { + "epoch": 1.79, + "grad_norm": 1.9291991595641431, + "learning_rate": 4.940525083612041e-06, + "loss": 0.2142, + "step": 45625 + }, + { + "epoch": 1.8, + "grad_norm": 1.944908850889749, + "learning_rate": 4.940483277591973e-06, + "loss": 0.2189, + "step": 45650 + }, + { + "epoch": 1.8, + "grad_norm": 1.8924849523796625, + "learning_rate": 4.940441471571907e-06, + "loss": 0.2042, + "step": 45675 + }, + { + "epoch": 1.8, + "grad_norm": 1.0381586631748612, + "learning_rate": 4.9403996655518395e-06, + "loss": 0.1973, + "step": 45700 + }, + { + "epoch": 1.8, + "grad_norm": 1.596229546211035, + "learning_rate": 4.940357859531773e-06, + "loss": 0.1971, + "step": 45725 + }, + { + "epoch": 1.8, + "grad_norm": 1.478086237165827, + "learning_rate": 4.940316053511706e-06, + "loss": 0.2172, + "step": 45750 + }, + { + "epoch": 1.8, + "grad_norm": 1.559672032174473, + "learning_rate": 4.940274247491639e-06, + "loss": 0.2215, + "step": 45775 + }, + { + "epoch": 1.8, + "grad_norm": 1.2609063742885178, + "learning_rate": 4.940232441471572e-06, + "loss": 0.1815, + "step": 45800 + }, + { + "epoch": 1.8, + "grad_norm": 1.2924032797525853, + "learning_rate": 4.9401906354515056e-06, + "loss": 0.2064, + "step": 45825 + }, + { + "epoch": 1.8, + "grad_norm": 2.4174361862563374, + "learning_rate": 4.940148829431439e-06, + "loss": 0.217, + "step": 45850 + }, + { + "epoch": 1.8, + "grad_norm": 1.373477111385322, + "learning_rate": 4.940107023411372e-06, + "loss": 0.1953, + "step": 45875 + }, + { + "epoch": 1.81, + "grad_norm": 1.8558451130861622, + "learning_rate": 4.940065217391305e-06, + "loss": 0.2054, + "step": 45900 + }, + { + "epoch": 1.81, + "grad_norm": 1.6990765889691408, + "learning_rate": 4.940023411371238e-06, + "loss": 0.1899, + "step": 45925 + }, + { + "epoch": 1.81, + "grad_norm": 1.8060030463753445, + "learning_rate": 4.939981605351172e-06, + "loss": 0.2202, + "step": 45950 + }, + { + "epoch": 1.81, + "grad_norm": 1.0036006697315363, + "learning_rate": 4.939939799331104e-06, + "loss": 0.1892, + "step": 45975 + }, + { + "epoch": 1.81, + "grad_norm": 1.5889893762168747, + "learning_rate": 4.939897993311037e-06, + "loss": 0.2014, + "step": 46000 + }, + { + "epoch": 1.81, + "grad_norm": 1.2885407324111298, + "learning_rate": 4.93985618729097e-06, + "loss": 0.2022, + "step": 46025 + }, + { + "epoch": 1.81, + "grad_norm": 1.4911043166689495, + "learning_rate": 4.939814381270903e-06, + "loss": 0.219, + "step": 46050 + }, + { + "epoch": 1.81, + "grad_norm": 1.2755317856393762, + "learning_rate": 4.939772575250836e-06, + "loss": 0.2054, + "step": 46075 + }, + { + "epoch": 1.81, + "grad_norm": 1.994187232620831, + "learning_rate": 4.9397307692307695e-06, + "loss": 0.1891, + "step": 46100 + }, + { + "epoch": 1.81, + "grad_norm": 1.5475701584616484, + "learning_rate": 4.939688963210702e-06, + "loss": 0.2022, + "step": 46125 + }, + { + "epoch": 1.82, + "grad_norm": 1.4436373558259408, + "learning_rate": 4.939647157190636e-06, + "loss": 0.197, + "step": 46150 + }, + { + "epoch": 1.82, + "grad_norm": 1.7292220715855722, + "learning_rate": 4.9396053511705684e-06, + "loss": 0.2127, + "step": 46175 + }, + { + "epoch": 1.82, + "grad_norm": 2.00863798888671, + "learning_rate": 4.939563545150502e-06, + "loss": 0.1874, + "step": 46200 + }, + { + "epoch": 1.82, + "grad_norm": 1.575347210475048, + "learning_rate": 4.939521739130435e-06, + "loss": 0.2142, + "step": 46225 + }, + { + "epoch": 1.82, + "grad_norm": 1.8865538136561686, + "learning_rate": 4.9394816053511705e-06, + "loss": 0.194, + "step": 46250 + }, + { + "epoch": 1.82, + "grad_norm": 1.3867064113440684, + "learning_rate": 4.939439799331104e-06, + "loss": 0.2238, + "step": 46275 + }, + { + "epoch": 1.82, + "grad_norm": 1.6636489722849195, + "learning_rate": 4.939397993311037e-06, + "loss": 0.2124, + "step": 46300 + }, + { + "epoch": 1.82, + "grad_norm": 1.796035846087043, + "learning_rate": 4.93935618729097e-06, + "loss": 0.2086, + "step": 46325 + }, + { + "epoch": 1.82, + "grad_norm": 1.6152861037902768, + "learning_rate": 4.939314381270903e-06, + "loss": 0.2194, + "step": 46350 + }, + { + "epoch": 1.82, + "grad_norm": 0.872021389069065, + "learning_rate": 4.9392725752508365e-06, + "loss": 0.1983, + "step": 46375 + }, + { + "epoch": 1.83, + "grad_norm": 1.3627173247663622, + "learning_rate": 4.939230769230769e-06, + "loss": 0.2228, + "step": 46400 + }, + { + "epoch": 1.83, + "grad_norm": 2.314259549099962, + "learning_rate": 4.939188963210703e-06, + "loss": 0.1979, + "step": 46425 + }, + { + "epoch": 1.83, + "grad_norm": 1.6032460211094444, + "learning_rate": 4.9391471571906355e-06, + "loss": 0.2278, + "step": 46450 + }, + { + "epoch": 1.83, + "grad_norm": 1.775553525039903, + "learning_rate": 4.939105351170569e-06, + "loss": 0.2044, + "step": 46475 + }, + { + "epoch": 1.83, + "grad_norm": 1.800363727354035, + "learning_rate": 4.939063545150502e-06, + "loss": 0.2143, + "step": 46500 + }, + { + "epoch": 1.83, + "grad_norm": 1.2465956627042085, + "learning_rate": 4.939021739130435e-06, + "loss": 0.2043, + "step": 46525 + }, + { + "epoch": 1.83, + "grad_norm": 1.8944500939180655, + "learning_rate": 4.938979933110368e-06, + "loss": 0.2063, + "step": 46550 + }, + { + "epoch": 1.83, + "grad_norm": 1.5117363359679743, + "learning_rate": 4.9389381270903015e-06, + "loss": 0.2122, + "step": 46575 + }, + { + "epoch": 1.83, + "grad_norm": 1.5887958169703909, + "learning_rate": 4.938896321070234e-06, + "loss": 0.221, + "step": 46600 + }, + { + "epoch": 1.83, + "grad_norm": 1.648221177617564, + "learning_rate": 4.938854515050168e-06, + "loss": 0.1771, + "step": 46625 + }, + { + "epoch": 1.84, + "grad_norm": 1.55324428745445, + "learning_rate": 4.9388127090301005e-06, + "loss": 0.1883, + "step": 46650 + }, + { + "epoch": 1.84, + "grad_norm": 1.7723178618451372, + "learning_rate": 4.938770903010034e-06, + "loss": 0.1927, + "step": 46675 + }, + { + "epoch": 1.84, + "grad_norm": 2.153448369196902, + "learning_rate": 4.938729096989967e-06, + "loss": 0.2044, + "step": 46700 + }, + { + "epoch": 1.84, + "grad_norm": 2.1268320701701797, + "learning_rate": 4.9386872909699e-06, + "loss": 0.2428, + "step": 46725 + }, + { + "epoch": 1.84, + "grad_norm": 1.3631634906977346, + "learning_rate": 4.938645484949833e-06, + "loss": 0.1857, + "step": 46750 + }, + { + "epoch": 1.84, + "grad_norm": 1.6168270762112813, + "learning_rate": 4.9386036789297665e-06, + "loss": 0.2011, + "step": 46775 + }, + { + "epoch": 1.84, + "grad_norm": 1.6696110343047001, + "learning_rate": 4.938561872909699e-06, + "loss": 0.2204, + "step": 46800 + }, + { + "epoch": 1.84, + "grad_norm": 1.192494551736515, + "learning_rate": 4.938520066889633e-06, + "loss": 0.2038, + "step": 46825 + }, + { + "epoch": 1.84, + "grad_norm": 1.7077939692501043, + "learning_rate": 4.9384782608695654e-06, + "loss": 0.1955, + "step": 46850 + }, + { + "epoch": 1.84, + "grad_norm": 1.8191368453499213, + "learning_rate": 4.938436454849499e-06, + "loss": 0.208, + "step": 46875 + }, + { + "epoch": 1.85, + "grad_norm": 1.5432423506062156, + "learning_rate": 4.938394648829432e-06, + "loss": 0.2058, + "step": 46900 + }, + { + "epoch": 1.85, + "grad_norm": 1.9468989423937535, + "learning_rate": 4.938352842809365e-06, + "loss": 0.1768, + "step": 46925 + }, + { + "epoch": 1.85, + "grad_norm": 1.7161970238767805, + "learning_rate": 4.938311036789298e-06, + "loss": 0.2131, + "step": 46950 + }, + { + "epoch": 1.85, + "grad_norm": 1.5485071588534483, + "learning_rate": 4.938269230769231e-06, + "loss": 0.2218, + "step": 46975 + }, + { + "epoch": 1.85, + "grad_norm": 2.53263925456021, + "learning_rate": 4.938227424749164e-06, + "loss": 0.2235, + "step": 47000 + }, + { + "epoch": 1.85, + "grad_norm": 1.8192841664528814, + "learning_rate": 4.938185618729097e-06, + "loss": 0.1992, + "step": 47025 + }, + { + "epoch": 1.85, + "grad_norm": 2.115023350287966, + "learning_rate": 4.93814381270903e-06, + "loss": 0.2213, + "step": 47050 + }, + { + "epoch": 1.85, + "grad_norm": 1.5380163293793399, + "learning_rate": 4.938102006688963e-06, + "loss": 0.1992, + "step": 47075 + }, + { + "epoch": 1.85, + "grad_norm": 1.4449128177392452, + "learning_rate": 4.938060200668897e-06, + "loss": 0.2181, + "step": 47100 + }, + { + "epoch": 1.85, + "grad_norm": 1.8029578697927904, + "learning_rate": 4.938018394648829e-06, + "loss": 0.1921, + "step": 47125 + }, + { + "epoch": 1.85, + "grad_norm": 2.0395632871911147, + "learning_rate": 4.937976588628763e-06, + "loss": 0.1895, + "step": 47150 + }, + { + "epoch": 1.86, + "grad_norm": 1.3878968532309697, + "learning_rate": 4.937934782608696e-06, + "loss": 0.1842, + "step": 47175 + }, + { + "epoch": 1.86, + "grad_norm": 1.3434122603027006, + "learning_rate": 4.937892976588629e-06, + "loss": 0.2056, + "step": 47200 + }, + { + "epoch": 1.86, + "grad_norm": 1.3490671768973181, + "learning_rate": 4.937851170568562e-06, + "loss": 0.1866, + "step": 47225 + }, + { + "epoch": 1.86, + "grad_norm": 1.613883740225785, + "learning_rate": 4.937811036789298e-06, + "loss": 0.1979, + "step": 47250 + }, + { + "epoch": 1.86, + "grad_norm": 1.7926615672094046, + "learning_rate": 4.937769230769231e-06, + "loss": 0.1977, + "step": 47275 + }, + { + "epoch": 1.86, + "grad_norm": 2.484911009082714, + "learning_rate": 4.937727424749164e-06, + "loss": 0.2022, + "step": 47300 + }, + { + "epoch": 1.86, + "grad_norm": 1.9218949249621484, + "learning_rate": 4.9376856187290975e-06, + "loss": 0.2221, + "step": 47325 + }, + { + "epoch": 1.86, + "grad_norm": 1.4316456230383503, + "learning_rate": 4.937645484949833e-06, + "loss": 0.1902, + "step": 47350 + }, + { + "epoch": 1.86, + "grad_norm": 1.7052129122552293, + "learning_rate": 4.937603678929767e-06, + "loss": 0.2221, + "step": 47375 + }, + { + "epoch": 1.86, + "grad_norm": 1.2637211421190688, + "learning_rate": 4.9375618729096995e-06, + "loss": 0.2058, + "step": 47400 + }, + { + "epoch": 1.87, + "grad_norm": 1.8451450491794519, + "learning_rate": 4.937520066889632e-06, + "loss": 0.1993, + "step": 47425 + }, + { + "epoch": 1.87, + "grad_norm": 0.9283921636952341, + "learning_rate": 4.937478260869566e-06, + "loss": 0.2055, + "step": 47450 + }, + { + "epoch": 1.87, + "grad_norm": 1.8414663077620397, + "learning_rate": 4.9374364548494985e-06, + "loss": 0.2203, + "step": 47475 + }, + { + "epoch": 1.87, + "grad_norm": 2.0753580914227365, + "learning_rate": 4.937394648829432e-06, + "loss": 0.213, + "step": 47500 + }, + { + "epoch": 1.87, + "grad_norm": 1.2078240932003506, + "learning_rate": 4.937352842809365e-06, + "loss": 0.2136, + "step": 47525 + }, + { + "epoch": 1.87, + "grad_norm": 1.7304545694227498, + "learning_rate": 4.937311036789298e-06, + "loss": 0.215, + "step": 47550 + }, + { + "epoch": 1.87, + "grad_norm": 1.6773895264603955, + "learning_rate": 4.937269230769231e-06, + "loss": 0.184, + "step": 47575 + }, + { + "epoch": 1.87, + "grad_norm": 1.6133216723159176, + "learning_rate": 4.9372274247491645e-06, + "loss": 0.2233, + "step": 47600 + }, + { + "epoch": 1.87, + "grad_norm": 2.3083125789789483, + "learning_rate": 4.937185618729097e-06, + "loss": 0.2224, + "step": 47625 + }, + { + "epoch": 1.87, + "grad_norm": 1.6499407481703168, + "learning_rate": 4.937143812709031e-06, + "loss": 0.2013, + "step": 47650 + }, + { + "epoch": 1.88, + "grad_norm": 1.9096100207893898, + "learning_rate": 4.9371020066889635e-06, + "loss": 0.1966, + "step": 47675 + }, + { + "epoch": 1.88, + "grad_norm": 2.0102219012043174, + "learning_rate": 4.937060200668897e-06, + "loss": 0.2247, + "step": 47700 + }, + { + "epoch": 1.88, + "grad_norm": 1.8440109791151151, + "learning_rate": 4.93701839464883e-06, + "loss": 0.1832, + "step": 47725 + }, + { + "epoch": 1.88, + "grad_norm": 1.6405657863329357, + "learning_rate": 4.936976588628763e-06, + "loss": 0.2021, + "step": 47750 + }, + { + "epoch": 1.88, + "grad_norm": 1.5352039277718714, + "learning_rate": 4.936934782608696e-06, + "loss": 0.2106, + "step": 47775 + }, + { + "epoch": 1.88, + "grad_norm": 1.218168105886781, + "learning_rate": 4.9368929765886295e-06, + "loss": 0.2028, + "step": 47800 + }, + { + "epoch": 1.88, + "grad_norm": 0.9303573799655196, + "learning_rate": 4.936851170568562e-06, + "loss": 0.208, + "step": 47825 + }, + { + "epoch": 1.88, + "grad_norm": 1.3720528426394387, + "learning_rate": 4.936809364548496e-06, + "loss": 0.213, + "step": 47850 + }, + { + "epoch": 1.88, + "grad_norm": 1.330231745796533, + "learning_rate": 4.9367675585284284e-06, + "loss": 0.194, + "step": 47875 + }, + { + "epoch": 1.88, + "grad_norm": 1.6102889821188229, + "learning_rate": 4.936725752508362e-06, + "loss": 0.2059, + "step": 47900 + }, + { + "epoch": 1.89, + "grad_norm": 1.7900179616799938, + "learning_rate": 4.936683946488295e-06, + "loss": 0.2099, + "step": 47925 + }, + { + "epoch": 1.89, + "grad_norm": 1.876167524982635, + "learning_rate": 4.936642140468228e-06, + "loss": 0.2108, + "step": 47950 + }, + { + "epoch": 1.89, + "grad_norm": 1.3715764517785491, + "learning_rate": 4.936600334448161e-06, + "loss": 0.2118, + "step": 47975 + }, + { + "epoch": 1.89, + "grad_norm": 1.242091850621187, + "learning_rate": 4.9365585284280945e-06, + "loss": 0.2091, + "step": 48000 + }, + { + "epoch": 1.89, + "grad_norm": 2.381253912271751, + "learning_rate": 4.936516722408027e-06, + "loss": 0.1934, + "step": 48025 + }, + { + "epoch": 1.89, + "grad_norm": 1.412172971145708, + "learning_rate": 4.936474916387961e-06, + "loss": 0.2054, + "step": 48050 + }, + { + "epoch": 1.89, + "grad_norm": 1.5116219256655863, + "learning_rate": 4.936433110367893e-06, + "loss": 0.2155, + "step": 48075 + }, + { + "epoch": 1.89, + "grad_norm": 1.2887142644122933, + "learning_rate": 4.936391304347827e-06, + "loss": 0.2091, + "step": 48100 + }, + { + "epoch": 1.89, + "grad_norm": 1.6820263408725942, + "learning_rate": 4.93634949832776e-06, + "loss": 0.1912, + "step": 48125 + }, + { + "epoch": 1.89, + "grad_norm": 1.5815843596491972, + "learning_rate": 4.936307692307692e-06, + "loss": 0.2055, + "step": 48150 + }, + { + "epoch": 1.9, + "grad_norm": 1.6081837772093868, + "learning_rate": 4.936265886287626e-06, + "loss": 0.2117, + "step": 48175 + }, + { + "epoch": 1.9, + "grad_norm": 1.4467885553172026, + "learning_rate": 4.936224080267559e-06, + "loss": 0.216, + "step": 48200 + }, + { + "epoch": 1.9, + "grad_norm": 1.1519428579352176, + "learning_rate": 4.936182274247492e-06, + "loss": 0.2017, + "step": 48225 + }, + { + "epoch": 1.9, + "grad_norm": 1.5300606843300384, + "learning_rate": 4.936140468227425e-06, + "loss": 0.205, + "step": 48250 + }, + { + "epoch": 1.9, + "grad_norm": 1.720404121032897, + "learning_rate": 4.936098662207358e-06, + "loss": 0.2007, + "step": 48275 + }, + { + "epoch": 1.9, + "grad_norm": 1.6425840844489246, + "learning_rate": 4.936056856187291e-06, + "loss": 0.1853, + "step": 48300 + }, + { + "epoch": 1.9, + "grad_norm": 1.459659103020749, + "learning_rate": 4.936015050167225e-06, + "loss": 0.2116, + "step": 48325 + }, + { + "epoch": 1.9, + "grad_norm": 1.3712585119025817, + "learning_rate": 4.935973244147157e-06, + "loss": 0.2106, + "step": 48350 + }, + { + "epoch": 1.9, + "grad_norm": 2.2759914401336694, + "learning_rate": 4.935931438127091e-06, + "loss": 0.2099, + "step": 48375 + }, + { + "epoch": 1.9, + "grad_norm": 1.1003315934242315, + "learning_rate": 4.935889632107024e-06, + "loss": 0.2006, + "step": 48400 + }, + { + "epoch": 1.91, + "grad_norm": 1.1531531222871791, + "learning_rate": 4.935847826086957e-06, + "loss": 0.1993, + "step": 48425 + }, + { + "epoch": 1.91, + "grad_norm": 1.9522070688750959, + "learning_rate": 4.93580602006689e-06, + "loss": 0.2029, + "step": 48450 + }, + { + "epoch": 1.91, + "grad_norm": 0.9647196482698353, + "learning_rate": 4.935764214046823e-06, + "loss": 0.1777, + "step": 48475 + }, + { + "epoch": 1.91, + "grad_norm": 2.357874217165264, + "learning_rate": 4.935722408026756e-06, + "loss": 0.202, + "step": 48500 + }, + { + "epoch": 1.91, + "grad_norm": 1.706086540465037, + "learning_rate": 4.93568060200669e-06, + "loss": 0.2099, + "step": 48525 + }, + { + "epoch": 1.91, + "grad_norm": 1.368230675138682, + "learning_rate": 4.935638795986622e-06, + "loss": 0.1893, + "step": 48550 + }, + { + "epoch": 1.91, + "grad_norm": 2.1989144444420323, + "learning_rate": 4.935596989966556e-06, + "loss": 0.234, + "step": 48575 + }, + { + "epoch": 1.91, + "grad_norm": 1.3439068076959173, + "learning_rate": 4.9355551839464886e-06, + "loss": 0.1919, + "step": 48600 + }, + { + "epoch": 1.91, + "grad_norm": 1.5347802892044207, + "learning_rate": 4.935513377926422e-06, + "loss": 0.2087, + "step": 48625 + }, + { + "epoch": 1.91, + "grad_norm": 1.3853060269687627, + "learning_rate": 4.935471571906355e-06, + "loss": 0.2153, + "step": 48650 + }, + { + "epoch": 1.91, + "grad_norm": 1.2705654843241438, + "learning_rate": 4.935429765886288e-06, + "loss": 0.1942, + "step": 48675 + }, + { + "epoch": 1.92, + "grad_norm": 1.507043182236023, + "learning_rate": 4.935387959866221e-06, + "loss": 0.2052, + "step": 48700 + }, + { + "epoch": 1.92, + "grad_norm": 0.8852640991515132, + "learning_rate": 4.935346153846155e-06, + "loss": 0.2126, + "step": 48725 + }, + { + "epoch": 1.92, + "grad_norm": 1.2880272534231267, + "learning_rate": 4.935304347826087e-06, + "loss": 0.1823, + "step": 48750 + }, + { + "epoch": 1.92, + "grad_norm": 1.3419848634582663, + "learning_rate": 4.935262541806021e-06, + "loss": 0.2085, + "step": 48775 + }, + { + "epoch": 1.92, + "grad_norm": 1.2454273911509375, + "learning_rate": 4.9352207357859535e-06, + "loss": 0.198, + "step": 48800 + }, + { + "epoch": 1.92, + "grad_norm": 1.1964741196519708, + "learning_rate": 4.935178929765886e-06, + "loss": 0.194, + "step": 48825 + }, + { + "epoch": 1.92, + "grad_norm": 1.482373609512503, + "learning_rate": 4.93513712374582e-06, + "loss": 0.1923, + "step": 48850 + }, + { + "epoch": 1.92, + "grad_norm": 1.4343822110286697, + "learning_rate": 4.9350953177257525e-06, + "loss": 0.201, + "step": 48875 + }, + { + "epoch": 1.92, + "grad_norm": 1.9110162286931605, + "learning_rate": 4.935053511705686e-06, + "loss": 0.2177, + "step": 48900 + }, + { + "epoch": 1.92, + "grad_norm": 1.4055468589424738, + "learning_rate": 4.935011705685619e-06, + "loss": 0.1975, + "step": 48925 + }, + { + "epoch": 1.93, + "grad_norm": 1.1095321871083568, + "learning_rate": 4.934969899665552e-06, + "loss": 0.2038, + "step": 48950 + }, + { + "epoch": 1.93, + "grad_norm": 1.6489177988410992, + "learning_rate": 4.934928093645485e-06, + "loss": 0.2045, + "step": 48975 + }, + { + "epoch": 1.93, + "grad_norm": 1.0969848442231984, + "learning_rate": 4.9348862876254185e-06, + "loss": 0.1798, + "step": 49000 + }, + { + "epoch": 1.93, + "grad_norm": 2.431163706761367, + "learning_rate": 4.934844481605351e-06, + "loss": 0.2148, + "step": 49025 + }, + { + "epoch": 1.93, + "grad_norm": 1.1269539448904287, + "learning_rate": 4.934802675585285e-06, + "loss": 0.1913, + "step": 49050 + }, + { + "epoch": 1.93, + "grad_norm": 2.009600750564663, + "learning_rate": 4.9347608695652175e-06, + "loss": 0.1902, + "step": 49075 + }, + { + "epoch": 1.93, + "grad_norm": 1.6341443792861832, + "learning_rate": 4.934719063545151e-06, + "loss": 0.2069, + "step": 49100 + }, + { + "epoch": 1.93, + "grad_norm": 1.4131602535107552, + "learning_rate": 4.934677257525084e-06, + "loss": 0.1899, + "step": 49125 + }, + { + "epoch": 1.93, + "grad_norm": 0.8377130239703243, + "learning_rate": 4.934635451505017e-06, + "loss": 0.2038, + "step": 49150 + }, + { + "epoch": 1.93, + "grad_norm": 1.9368805923358994, + "learning_rate": 4.93459364548495e-06, + "loss": 0.2089, + "step": 49175 + }, + { + "epoch": 1.94, + "grad_norm": 1.6406415730986539, + "learning_rate": 4.9345518394648835e-06, + "loss": 0.2023, + "step": 49200 + }, + { + "epoch": 1.94, + "grad_norm": 1.5142583337522033, + "learning_rate": 4.934510033444816e-06, + "loss": 0.2124, + "step": 49225 + }, + { + "epoch": 1.94, + "grad_norm": 1.4175403686258068, + "learning_rate": 4.93446822742475e-06, + "loss": 0.1889, + "step": 49250 + }, + { + "epoch": 1.94, + "grad_norm": 1.680924590381479, + "learning_rate": 4.9344264214046824e-06, + "loss": 0.18, + "step": 49275 + }, + { + "epoch": 1.94, + "grad_norm": 2.4182165052837803, + "learning_rate": 4.934384615384616e-06, + "loss": 0.199, + "step": 49300 + }, + { + "epoch": 1.94, + "grad_norm": 1.5324006988373318, + "learning_rate": 4.934342809364549e-06, + "loss": 0.2107, + "step": 49325 + }, + { + "epoch": 1.94, + "grad_norm": 1.8984134230654337, + "learning_rate": 4.9343026755852845e-06, + "loss": 0.2195, + "step": 49350 + }, + { + "epoch": 1.94, + "grad_norm": 1.8449500063553705, + "learning_rate": 4.934260869565218e-06, + "loss": 0.1778, + "step": 49375 + }, + { + "epoch": 1.94, + "grad_norm": 1.9070885133107418, + "learning_rate": 4.934219063545151e-06, + "loss": 0.2199, + "step": 49400 + }, + { + "epoch": 1.94, + "grad_norm": 1.528503320386569, + "learning_rate": 4.934177257525084e-06, + "loss": 0.2003, + "step": 49425 + }, + { + "epoch": 1.95, + "grad_norm": 1.6667344856090824, + "learning_rate": 4.934135451505017e-06, + "loss": 0.2131, + "step": 49450 + }, + { + "epoch": 1.95, + "grad_norm": 1.4152939504446955, + "learning_rate": 4.9340936454849505e-06, + "loss": 0.1976, + "step": 49475 + }, + { + "epoch": 1.95, + "grad_norm": 1.2226774657729198, + "learning_rate": 4.934051839464883e-06, + "loss": 0.194, + "step": 49500 + }, + { + "epoch": 1.95, + "grad_norm": 1.1414769595580594, + "learning_rate": 4.934010033444817e-06, + "loss": 0.2142, + "step": 49525 + }, + { + "epoch": 1.95, + "grad_norm": 1.3683179118744992, + "learning_rate": 4.9339682274247495e-06, + "loss": 0.209, + "step": 49550 + }, + { + "epoch": 1.95, + "grad_norm": 0.8797811276296436, + "learning_rate": 4.933926421404683e-06, + "loss": 0.1757, + "step": 49575 + }, + { + "epoch": 1.95, + "grad_norm": 1.6240088021552774, + "learning_rate": 4.933884615384616e-06, + "loss": 0.2065, + "step": 49600 + }, + { + "epoch": 1.95, + "grad_norm": 1.3496244482020072, + "learning_rate": 4.933842809364549e-06, + "loss": 0.1884, + "step": 49625 + }, + { + "epoch": 1.95, + "grad_norm": 1.9065579862207658, + "learning_rate": 4.933801003344482e-06, + "loss": 0.2219, + "step": 49650 + }, + { + "epoch": 1.95, + "grad_norm": 1.6823942734846893, + "learning_rate": 4.9337591973244155e-06, + "loss": 0.1922, + "step": 49675 + }, + { + "epoch": 1.96, + "grad_norm": 1.610332780860586, + "learning_rate": 4.933717391304348e-06, + "loss": 0.2257, + "step": 49700 + }, + { + "epoch": 1.96, + "grad_norm": 1.4749993288326266, + "learning_rate": 4.933675585284282e-06, + "loss": 0.2057, + "step": 49725 + }, + { + "epoch": 1.96, + "grad_norm": 1.1293208475215297, + "learning_rate": 4.9336337792642145e-06, + "loss": 0.2238, + "step": 49750 + }, + { + "epoch": 1.96, + "grad_norm": 0.9704102752018713, + "learning_rate": 4.933591973244147e-06, + "loss": 0.1962, + "step": 49775 + }, + { + "epoch": 1.96, + "grad_norm": 1.3546928779201595, + "learning_rate": 4.933550167224081e-06, + "loss": 0.1999, + "step": 49800 + }, + { + "epoch": 1.96, + "grad_norm": 1.647795435094045, + "learning_rate": 4.933508361204013e-06, + "loss": 0.187, + "step": 49825 + }, + { + "epoch": 1.96, + "grad_norm": 1.1719791408575768, + "learning_rate": 4.933466555183947e-06, + "loss": 0.1982, + "step": 49850 + }, + { + "epoch": 1.96, + "grad_norm": 1.410677691655115, + "learning_rate": 4.93342474916388e-06, + "loss": 0.2037, + "step": 49875 + }, + { + "epoch": 1.96, + "grad_norm": 1.9028353705603924, + "learning_rate": 4.933382943143813e-06, + "loss": 0.1955, + "step": 49900 + }, + { + "epoch": 1.96, + "grad_norm": 1.7600379755892537, + "learning_rate": 4.933341137123746e-06, + "loss": 0.2016, + "step": 49925 + }, + { + "epoch": 1.97, + "grad_norm": 1.340426249993996, + "learning_rate": 4.9332993311036794e-06, + "loss": 0.213, + "step": 49950 + }, + { + "epoch": 1.97, + "grad_norm": 1.7078496902487155, + "learning_rate": 4.933257525083612e-06, + "loss": 0.2066, + "step": 49975 + }, + { + "epoch": 1.97, + "grad_norm": 2.3783688063483632, + "learning_rate": 4.933215719063546e-06, + "loss": 0.2034, + "step": 50000 + }, + { + "epoch": 1.97, + "eval_loss": 0.49365234375, + "eval_runtime": 11498.4763, + "eval_samples_per_second": 0.823, + "eval_steps_per_second": 0.051, + "eval_wer": 0.11952447597445426, + "step": 50000 + }, + { + "epoch": 1.97, + "grad_norm": 1.9977781030077004, + "learning_rate": 4.933173913043478e-06, + "loss": 0.1802, + "step": 50025 + }, + { + "epoch": 1.97, + "grad_norm": 1.5811010265422427, + "learning_rate": 4.933132107023412e-06, + "loss": 0.2066, + "step": 50050 + }, + { + "epoch": 1.97, + "grad_norm": 1.8367235353607692, + "learning_rate": 4.933090301003345e-06, + "loss": 0.2188, + "step": 50075 + }, + { + "epoch": 1.97, + "grad_norm": 1.4262908481666672, + "learning_rate": 4.933048494983278e-06, + "loss": 0.1738, + "step": 50100 + }, + { + "epoch": 1.97, + "grad_norm": 0.8182130900532735, + "learning_rate": 4.933006688963211e-06, + "loss": 0.2088, + "step": 50125 + }, + { + "epoch": 1.97, + "grad_norm": 1.0473337182584366, + "learning_rate": 4.932964882943144e-06, + "loss": 0.2347, + "step": 50150 + }, + { + "epoch": 1.97, + "grad_norm": 1.2163618200194586, + "learning_rate": 4.932923076923077e-06, + "loss": 0.2046, + "step": 50175 + }, + { + "epoch": 1.97, + "grad_norm": 1.7969452660629082, + "learning_rate": 4.932881270903011e-06, + "loss": 0.1909, + "step": 50200 + }, + { + "epoch": 1.98, + "grad_norm": 1.8467324578026039, + "learning_rate": 4.932839464882943e-06, + "loss": 0.2108, + "step": 50225 + }, + { + "epoch": 1.98, + "grad_norm": 2.217318536045435, + "learning_rate": 4.932797658862877e-06, + "loss": 0.2038, + "step": 50250 + }, + { + "epoch": 1.98, + "grad_norm": 1.4419104306585004, + "learning_rate": 4.93275585284281e-06, + "loss": 0.2081, + "step": 50275 + }, + { + "epoch": 1.98, + "grad_norm": 1.6288965207748185, + "learning_rate": 4.932714046822743e-06, + "loss": 0.2051, + "step": 50300 + }, + { + "epoch": 1.98, + "grad_norm": 1.3830015306405978, + "learning_rate": 4.932672240802676e-06, + "loss": 0.1817, + "step": 50325 + }, + { + "epoch": 1.98, + "grad_norm": 1.85804110304336, + "learning_rate": 4.932632107023412e-06, + "loss": 0.2122, + "step": 50350 + }, + { + "epoch": 1.98, + "grad_norm": 1.268213957087889, + "learning_rate": 4.932590301003345e-06, + "loss": 0.2063, + "step": 50375 + }, + { + "epoch": 1.98, + "grad_norm": 1.985988233626404, + "learning_rate": 4.932548494983278e-06, + "loss": 0.2142, + "step": 50400 + }, + { + "epoch": 1.98, + "grad_norm": 2.0433158965290197, + "learning_rate": 4.9325066889632115e-06, + "loss": 0.195, + "step": 50425 + }, + { + "epoch": 1.98, + "grad_norm": 1.619354197211885, + "learning_rate": 4.932464882943144e-06, + "loss": 0.1964, + "step": 50450 + }, + { + "epoch": 1.99, + "grad_norm": 1.4976821988213502, + "learning_rate": 4.932423076923078e-06, + "loss": 0.2149, + "step": 50475 + }, + { + "epoch": 1.99, + "grad_norm": 2.420182183432332, + "learning_rate": 4.93238127090301e-06, + "loss": 0.2245, + "step": 50500 + }, + { + "epoch": 1.99, + "grad_norm": 1.9459403489592213, + "learning_rate": 4.932339464882944e-06, + "loss": 0.2198, + "step": 50525 + }, + { + "epoch": 1.99, + "grad_norm": 1.5861503958567087, + "learning_rate": 4.932297658862877e-06, + "loss": 0.2204, + "step": 50550 + }, + { + "epoch": 1.99, + "grad_norm": 0.8998163673028098, + "learning_rate": 4.93225585284281e-06, + "loss": 0.2106, + "step": 50575 + }, + { + "epoch": 1.99, + "grad_norm": 1.331165993976876, + "learning_rate": 4.932214046822743e-06, + "loss": 0.1854, + "step": 50600 + }, + { + "epoch": 1.99, + "grad_norm": 1.733723317933618, + "learning_rate": 4.9321722408026764e-06, + "loss": 0.1989, + "step": 50625 + }, + { + "epoch": 1.99, + "grad_norm": 1.78159021767067, + "learning_rate": 4.932130434782609e-06, + "loss": 0.2156, + "step": 50650 + }, + { + "epoch": 1.99, + "grad_norm": 1.8765020453068406, + "learning_rate": 4.932088628762543e-06, + "loss": 0.1994, + "step": 50675 + }, + { + "epoch": 1.99, + "grad_norm": 1.2239774542089124, + "learning_rate": 4.932046822742475e-06, + "loss": 0.1955, + "step": 50700 + }, + { + "epoch": 2.0, + "grad_norm": 1.182462271207217, + "learning_rate": 4.932005016722408e-06, + "loss": 0.1945, + "step": 50725 + }, + { + "epoch": 2.0, + "grad_norm": 1.8092880077254656, + "learning_rate": 4.931963210702342e-06, + "loss": 0.1952, + "step": 50750 + }, + { + "epoch": 2.0, + "grad_norm": 1.6754850868214068, + "learning_rate": 4.931921404682274e-06, + "loss": 0.2224, + "step": 50775 + }, + { + "epoch": 2.0, + "grad_norm": 1.0066490763989873, + "learning_rate": 4.931879598662208e-06, + "loss": 0.1988, + "step": 50800 + }, + { + "epoch": 2.0, + "grad_norm": 1.3175579753366817, + "learning_rate": 4.9318377926421406e-06, + "loss": 0.2053, + "step": 50825 + }, + { + "epoch": 2.0, + "grad_norm": 1.3893938429753403, + "learning_rate": 4.931795986622074e-06, + "loss": 0.1887, + "step": 50850 + }, + { + "epoch": 2.0, + "grad_norm": 1.6195076889793714, + "learning_rate": 4.931754180602007e-06, + "loss": 0.1549, + "step": 50875 + }, + { + "epoch": 2.0, + "grad_norm": 2.5831100199621515, + "learning_rate": 4.93171237458194e-06, + "loss": 0.1559, + "step": 50900 + }, + { + "epoch": 2.0, + "grad_norm": 2.280031099022124, + "learning_rate": 4.931670568561873e-06, + "loss": 0.1634, + "step": 50925 + }, + { + "epoch": 2.0, + "grad_norm": 2.0793384529410677, + "learning_rate": 4.931628762541807e-06, + "loss": 0.1563, + "step": 50950 + }, + { + "epoch": 2.01, + "grad_norm": 1.668781784899802, + "learning_rate": 4.931586956521739e-06, + "loss": 0.1525, + "step": 50975 + }, + { + "epoch": 2.01, + "grad_norm": 2.532539607395984, + "learning_rate": 4.931545150501673e-06, + "loss": 0.1557, + "step": 51000 + }, + { + "epoch": 2.01, + "grad_norm": 1.748971811774227, + "learning_rate": 4.9315033444816056e-06, + "loss": 0.1626, + "step": 51025 + }, + { + "epoch": 2.01, + "grad_norm": 1.759658061175033, + "learning_rate": 4.931461538461539e-06, + "loss": 0.1551, + "step": 51050 + }, + { + "epoch": 2.01, + "grad_norm": 1.899714433160585, + "learning_rate": 4.931419732441472e-06, + "loss": 0.1474, + "step": 51075 + }, + { + "epoch": 2.01, + "grad_norm": 3.3506408589981036, + "learning_rate": 4.931377926421405e-06, + "loss": 0.1537, + "step": 51100 + }, + { + "epoch": 2.01, + "grad_norm": 2.445193092719799, + "learning_rate": 4.931336120401338e-06, + "loss": 0.1553, + "step": 51125 + }, + { + "epoch": 2.01, + "grad_norm": 2.018152535354776, + "learning_rate": 4.931294314381272e-06, + "loss": 0.1452, + "step": 51150 + }, + { + "epoch": 2.01, + "grad_norm": 1.2715640087113764, + "learning_rate": 4.931252508361204e-06, + "loss": 0.159, + "step": 51175 + }, + { + "epoch": 2.01, + "grad_norm": 2.6259858096583195, + "learning_rate": 4.931210702341138e-06, + "loss": 0.1432, + "step": 51200 + }, + { + "epoch": 2.02, + "grad_norm": 2.0134734462630113, + "learning_rate": 4.9311688963210705e-06, + "loss": 0.1746, + "step": 51225 + }, + { + "epoch": 2.02, + "grad_norm": 2.53530897566775, + "learning_rate": 4.931127090301004e-06, + "loss": 0.1572, + "step": 51250 + }, + { + "epoch": 2.02, + "grad_norm": 2.319671554271341, + "learning_rate": 4.931085284280937e-06, + "loss": 0.1702, + "step": 51275 + }, + { + "epoch": 2.02, + "grad_norm": 2.1657373621605718, + "learning_rate": 4.93104347826087e-06, + "loss": 0.1778, + "step": 51300 + }, + { + "epoch": 2.02, + "grad_norm": 1.8207220923214296, + "learning_rate": 4.931001672240803e-06, + "loss": 0.1752, + "step": 51325 + }, + { + "epoch": 2.02, + "grad_norm": 1.6051555436818306, + "learning_rate": 4.930961538461539e-06, + "loss": 0.1463, + "step": 51350 + }, + { + "epoch": 2.02, + "grad_norm": 3.2631241428045374, + "learning_rate": 4.930919732441472e-06, + "loss": 0.1562, + "step": 51375 + }, + { + "epoch": 2.02, + "grad_norm": 2.733310564982773, + "learning_rate": 4.930877926421405e-06, + "loss": 0.1436, + "step": 51400 + }, + { + "epoch": 2.02, + "grad_norm": 3.0421003211716813, + "learning_rate": 4.930836120401339e-06, + "loss": 0.1439, + "step": 51425 + }, + { + "epoch": 2.02, + "grad_norm": 1.8208998532386054, + "learning_rate": 4.930794314381271e-06, + "loss": 0.1587, + "step": 51450 + }, + { + "epoch": 2.03, + "grad_norm": 1.619487202524097, + "learning_rate": 4.930752508361205e-06, + "loss": 0.1508, + "step": 51475 + }, + { + "epoch": 2.03, + "grad_norm": 2.4972195849639807, + "learning_rate": 4.930710702341138e-06, + "loss": 0.1439, + "step": 51500 + }, + { + "epoch": 2.03, + "grad_norm": 2.0416854490311533, + "learning_rate": 4.930668896321071e-06, + "loss": 0.1682, + "step": 51525 + }, + { + "epoch": 2.03, + "grad_norm": 2.565121011610061, + "learning_rate": 4.930627090301004e-06, + "loss": 0.1571, + "step": 51550 + }, + { + "epoch": 2.03, + "grad_norm": 1.9390353127611322, + "learning_rate": 4.930585284280937e-06, + "loss": 0.144, + "step": 51575 + }, + { + "epoch": 2.03, + "grad_norm": 3.0758813481142617, + "learning_rate": 4.93054347826087e-06, + "loss": 0.1611, + "step": 51600 + }, + { + "epoch": 2.03, + "grad_norm": 1.59428448676563, + "learning_rate": 4.930501672240804e-06, + "loss": 0.1634, + "step": 51625 + }, + { + "epoch": 2.03, + "grad_norm": 2.038301554771562, + "learning_rate": 4.930459866220736e-06, + "loss": 0.1527, + "step": 51650 + }, + { + "epoch": 2.03, + "grad_norm": 2.0376499179826846, + "learning_rate": 4.930418060200669e-06, + "loss": 0.165, + "step": 51675 + }, + { + "epoch": 2.03, + "grad_norm": 1.6312057183566488, + "learning_rate": 4.9303762541806026e-06, + "loss": 0.1456, + "step": 51700 + }, + { + "epoch": 2.03, + "grad_norm": 1.310992199986981, + "learning_rate": 4.930334448160535e-06, + "loss": 0.1473, + "step": 51725 + }, + { + "epoch": 2.04, + "grad_norm": 2.3033985635608203, + "learning_rate": 4.930292642140469e-06, + "loss": 0.155, + "step": 51750 + }, + { + "epoch": 2.04, + "grad_norm": 1.832532339873309, + "learning_rate": 4.9302508361204015e-06, + "loss": 0.1594, + "step": 51775 + }, + { + "epoch": 2.04, + "grad_norm": 1.8899396483286703, + "learning_rate": 4.930209030100335e-06, + "loss": 0.1534, + "step": 51800 + }, + { + "epoch": 2.04, + "grad_norm": 1.5216428673701485, + "learning_rate": 4.930167224080268e-06, + "loss": 0.1608, + "step": 51825 + }, + { + "epoch": 2.04, + "grad_norm": 3.8041744241802538, + "learning_rate": 4.930125418060201e-06, + "loss": 0.1779, + "step": 51850 + }, + { + "epoch": 2.04, + "grad_norm": 2.479253142386971, + "learning_rate": 4.930083612040134e-06, + "loss": 0.1623, + "step": 51875 + }, + { + "epoch": 2.04, + "grad_norm": 2.0835171942963253, + "learning_rate": 4.9300418060200675e-06, + "loss": 0.1685, + "step": 51900 + }, + { + "epoch": 2.04, + "grad_norm": 2.4988154900909576, + "learning_rate": 4.93e-06, + "loss": 0.1615, + "step": 51925 + }, + { + "epoch": 2.04, + "grad_norm": 1.8221176766564573, + "learning_rate": 4.929958193979934e-06, + "loss": 0.1502, + "step": 51950 + }, + { + "epoch": 2.04, + "grad_norm": 1.7415285030938414, + "learning_rate": 4.9299163879598665e-06, + "loss": 0.1559, + "step": 51975 + }, + { + "epoch": 2.05, + "grad_norm": 2.704013115530803, + "learning_rate": 4.9298745819398e-06, + "loss": 0.1685, + "step": 52000 + }, + { + "epoch": 2.05, + "grad_norm": 3.4942317648547885, + "learning_rate": 4.929832775919733e-06, + "loss": 0.1544, + "step": 52025 + }, + { + "epoch": 2.05, + "grad_norm": 1.4905741598961249, + "learning_rate": 4.929790969899666e-06, + "loss": 0.1526, + "step": 52050 + }, + { + "epoch": 2.05, + "grad_norm": 2.6196280886358214, + "learning_rate": 4.929749163879599e-06, + "loss": 0.161, + "step": 52075 + }, + { + "epoch": 2.05, + "grad_norm": 1.8163085825589482, + "learning_rate": 4.9297073578595325e-06, + "loss": 0.1769, + "step": 52100 + }, + { + "epoch": 2.05, + "grad_norm": 1.929047133011546, + "learning_rate": 4.929665551839465e-06, + "loss": 0.1672, + "step": 52125 + }, + { + "epoch": 2.05, + "grad_norm": 2.483643296525586, + "learning_rate": 4.929623745819399e-06, + "loss": 0.1441, + "step": 52150 + }, + { + "epoch": 2.05, + "grad_norm": 2.844006335610527, + "learning_rate": 4.929583612040134e-06, + "loss": 0.1718, + "step": 52175 + }, + { + "epoch": 2.05, + "grad_norm": 2.3894318119446356, + "learning_rate": 4.929541806020067e-06, + "loss": 0.1538, + "step": 52200 + }, + { + "epoch": 2.05, + "grad_norm": 2.965395513590732, + "learning_rate": 4.9295e-06, + "loss": 0.1496, + "step": 52225 + }, + { + "epoch": 2.06, + "grad_norm": 2.304504000730068, + "learning_rate": 4.9294581939799335e-06, + "loss": 0.1669, + "step": 52250 + }, + { + "epoch": 2.06, + "grad_norm": 1.0721485878168964, + "learning_rate": 4.929416387959867e-06, + "loss": 0.1425, + "step": 52275 + }, + { + "epoch": 2.06, + "grad_norm": 1.8325103490337078, + "learning_rate": 4.9293745819398e-06, + "loss": 0.1522, + "step": 52300 + }, + { + "epoch": 2.06, + "grad_norm": 1.9327528432617735, + "learning_rate": 4.929332775919733e-06, + "loss": 0.162, + "step": 52325 + }, + { + "epoch": 2.06, + "grad_norm": 1.3305621320480137, + "learning_rate": 4.929290969899666e-06, + "loss": 0.1493, + "step": 52350 + }, + { + "epoch": 2.06, + "grad_norm": 1.944031143750634, + "learning_rate": 4.9292491638795996e-06, + "loss": 0.1526, + "step": 52375 + }, + { + "epoch": 2.06, + "grad_norm": 2.3706222006628677, + "learning_rate": 4.929207357859532e-06, + "loss": 0.1537, + "step": 52400 + }, + { + "epoch": 2.06, + "grad_norm": 2.3458044853455924, + "learning_rate": 4.929165551839466e-06, + "loss": 0.1633, + "step": 52425 + }, + { + "epoch": 2.06, + "grad_norm": 1.74893596241187, + "learning_rate": 4.9291237458193985e-06, + "loss": 0.1573, + "step": 52450 + }, + { + "epoch": 2.06, + "grad_norm": 2.422402658813327, + "learning_rate": 4.929081939799332e-06, + "loss": 0.1769, + "step": 52475 + }, + { + "epoch": 2.07, + "grad_norm": 2.1581876952824732, + "learning_rate": 4.929040133779265e-06, + "loss": 0.1667, + "step": 52500 + }, + { + "epoch": 2.07, + "grad_norm": 1.549994934584253, + "learning_rate": 4.928998327759198e-06, + "loss": 0.1634, + "step": 52525 + }, + { + "epoch": 2.07, + "grad_norm": 1.9136228137791405, + "learning_rate": 4.928956521739131e-06, + "loss": 0.1531, + "step": 52550 + }, + { + "epoch": 2.07, + "grad_norm": 1.710298426938268, + "learning_rate": 4.9289147157190645e-06, + "loss": 0.1536, + "step": 52575 + }, + { + "epoch": 2.07, + "grad_norm": 2.5543413568354856, + "learning_rate": 4.928872909698996e-06, + "loss": 0.1705, + "step": 52600 + }, + { + "epoch": 2.07, + "grad_norm": 1.7127212550512338, + "learning_rate": 4.92883110367893e-06, + "loss": 0.1708, + "step": 52625 + }, + { + "epoch": 2.07, + "grad_norm": 2.3037309062234965, + "learning_rate": 4.928789297658863e-06, + "loss": 0.1741, + "step": 52650 + }, + { + "epoch": 2.07, + "grad_norm": 1.9495570743735675, + "learning_rate": 4.928747491638796e-06, + "loss": 0.1771, + "step": 52675 + }, + { + "epoch": 2.07, + "grad_norm": 2.238430814642686, + "learning_rate": 4.92870568561873e-06, + "loss": 0.1583, + "step": 52700 + }, + { + "epoch": 2.07, + "grad_norm": 2.320952142850099, + "learning_rate": 4.9286638795986624e-06, + "loss": 0.1593, + "step": 52725 + }, + { + "epoch": 2.08, + "grad_norm": 2.1755627909046047, + "learning_rate": 4.928622073578596e-06, + "loss": 0.1469, + "step": 52750 + }, + { + "epoch": 2.08, + "grad_norm": 2.4189873372395727, + "learning_rate": 4.928580267558529e-06, + "loss": 0.1456, + "step": 52775 + }, + { + "epoch": 2.08, + "grad_norm": 2.2511565610144335, + "learning_rate": 4.928538461538462e-06, + "loss": 0.1436, + "step": 52800 + }, + { + "epoch": 2.08, + "grad_norm": 2.1374086292631, + "learning_rate": 4.928496655518395e-06, + "loss": 0.143, + "step": 52825 + }, + { + "epoch": 2.08, + "grad_norm": 2.461021396003331, + "learning_rate": 4.9284548494983285e-06, + "loss": 0.1702, + "step": 52850 + }, + { + "epoch": 2.08, + "grad_norm": 1.9960814769649182, + "learning_rate": 4.928413043478261e-06, + "loss": 0.1594, + "step": 52875 + }, + { + "epoch": 2.08, + "grad_norm": 1.568571889804149, + "learning_rate": 4.928371237458195e-06, + "loss": 0.1537, + "step": 52900 + }, + { + "epoch": 2.08, + "grad_norm": 2.8880582494240996, + "learning_rate": 4.928329431438127e-06, + "loss": 0.1487, + "step": 52925 + }, + { + "epoch": 2.08, + "grad_norm": 1.4868852996183224, + "learning_rate": 4.928287625418061e-06, + "loss": 0.1567, + "step": 52950 + }, + { + "epoch": 2.08, + "grad_norm": 1.6979293077536246, + "learning_rate": 4.928245819397994e-06, + "loss": 0.1661, + "step": 52975 + }, + { + "epoch": 2.09, + "grad_norm": 2.0399515236372667, + "learning_rate": 4.928204013377927e-06, + "loss": 0.1573, + "step": 53000 + }, + { + "epoch": 2.09, + "grad_norm": 2.2471778177012527, + "learning_rate": 4.92816220735786e-06, + "loss": 0.15, + "step": 53025 + }, + { + "epoch": 2.09, + "grad_norm": 1.8607165071886655, + "learning_rate": 4.9281204013377934e-06, + "loss": 0.158, + "step": 53050 + }, + { + "epoch": 2.09, + "grad_norm": 2.68403495926998, + "learning_rate": 4.928078595317726e-06, + "loss": 0.1455, + "step": 53075 + }, + { + "epoch": 2.09, + "grad_norm": 2.1127210702982535, + "learning_rate": 4.92803678929766e-06, + "loss": 0.1405, + "step": 53100 + }, + { + "epoch": 2.09, + "grad_norm": 2.843643320066027, + "learning_rate": 4.927994983277592e-06, + "loss": 0.1716, + "step": 53125 + }, + { + "epoch": 2.09, + "grad_norm": 2.8317086013921706, + "learning_rate": 4.927953177257526e-06, + "loss": 0.157, + "step": 53150 + }, + { + "epoch": 2.09, + "grad_norm": 2.8083401347718393, + "learning_rate": 4.927911371237459e-06, + "loss": 0.1612, + "step": 53175 + }, + { + "epoch": 2.09, + "grad_norm": 1.8263439878473045, + "learning_rate": 4.927869565217392e-06, + "loss": 0.1766, + "step": 53200 + }, + { + "epoch": 2.09, + "grad_norm": 2.485098578437788, + "learning_rate": 4.927827759197325e-06, + "loss": 0.1884, + "step": 53225 + }, + { + "epoch": 2.09, + "grad_norm": 3.5075459930953996, + "learning_rate": 4.927785953177258e-06, + "loss": 0.1412, + "step": 53250 + }, + { + "epoch": 2.1, + "grad_norm": 2.395353082662295, + "learning_rate": 4.927744147157191e-06, + "loss": 0.1559, + "step": 53275 + }, + { + "epoch": 2.1, + "grad_norm": 2.1892457117001767, + "learning_rate": 4.927702341137125e-06, + "loss": 0.1704, + "step": 53300 + }, + { + "epoch": 2.1, + "grad_norm": 2.220899724188775, + "learning_rate": 4.927660535117057e-06, + "loss": 0.1717, + "step": 53325 + }, + { + "epoch": 2.1, + "grad_norm": 1.894092777332042, + "learning_rate": 4.92761872909699e-06, + "loss": 0.1421, + "step": 53350 + }, + { + "epoch": 2.1, + "grad_norm": 1.6879891749223022, + "learning_rate": 4.927576923076924e-06, + "loss": 0.1481, + "step": 53375 + }, + { + "epoch": 2.1, + "grad_norm": 1.849208892249495, + "learning_rate": 4.927535117056856e-06, + "loss": 0.1536, + "step": 53400 + }, + { + "epoch": 2.1, + "grad_norm": 3.234496990961037, + "learning_rate": 4.92749331103679e-06, + "loss": 0.1578, + "step": 53425 + }, + { + "epoch": 2.1, + "grad_norm": 1.4492358664142355, + "learning_rate": 4.9274515050167225e-06, + "loss": 0.1511, + "step": 53450 + }, + { + "epoch": 2.1, + "grad_norm": 2.685397894450621, + "learning_rate": 4.927409698996656e-06, + "loss": 0.174, + "step": 53475 + }, + { + "epoch": 2.1, + "grad_norm": 2.3684425125628716, + "learning_rate": 4.927367892976589e-06, + "loss": 0.1614, + "step": 53500 + }, + { + "epoch": 2.11, + "grad_norm": 2.11753748476298, + "learning_rate": 4.927326086956522e-06, + "loss": 0.1559, + "step": 53525 + }, + { + "epoch": 2.11, + "grad_norm": 1.9354680446303794, + "learning_rate": 4.927284280936455e-06, + "loss": 0.1663, + "step": 53550 + }, + { + "epoch": 2.11, + "grad_norm": 2.227156794826095, + "learning_rate": 4.9272424749163886e-06, + "loss": 0.1799, + "step": 53575 + }, + { + "epoch": 2.11, + "grad_norm": 2.620633749751813, + "learning_rate": 4.927200668896321e-06, + "loss": 0.1648, + "step": 53600 + }, + { + "epoch": 2.11, + "grad_norm": 1.9582576590163816, + "learning_rate": 4.927158862876255e-06, + "loss": 0.1611, + "step": 53625 + }, + { + "epoch": 2.11, + "grad_norm": 1.888954284267945, + "learning_rate": 4.9271170568561875e-06, + "loss": 0.1493, + "step": 53650 + }, + { + "epoch": 2.11, + "grad_norm": 1.5525653783661335, + "learning_rate": 4.927075250836121e-06, + "loss": 0.1539, + "step": 53675 + }, + { + "epoch": 2.11, + "grad_norm": 2.3535390824131723, + "learning_rate": 4.927033444816054e-06, + "loss": 0.1635, + "step": 53700 + }, + { + "epoch": 2.11, + "grad_norm": 2.773296192539899, + "learning_rate": 4.926991638795987e-06, + "loss": 0.1585, + "step": 53725 + }, + { + "epoch": 2.11, + "grad_norm": 1.784025818674246, + "learning_rate": 4.92694983277592e-06, + "loss": 0.1533, + "step": 53750 + }, + { + "epoch": 2.12, + "grad_norm": 1.671456960230062, + "learning_rate": 4.9269080267558536e-06, + "loss": 0.1546, + "step": 53775 + }, + { + "epoch": 2.12, + "grad_norm": 2.523737848859212, + "learning_rate": 4.926866220735786e-06, + "loss": 0.1707, + "step": 53800 + }, + { + "epoch": 2.12, + "grad_norm": 1.530100074434391, + "learning_rate": 4.92682441471572e-06, + "loss": 0.1684, + "step": 53825 + }, + { + "epoch": 2.12, + "grad_norm": 1.3561056846922916, + "learning_rate": 4.9267826086956525e-06, + "loss": 0.1888, + "step": 53850 + }, + { + "epoch": 2.12, + "grad_norm": 2.0777323154601883, + "learning_rate": 4.926740802675586e-06, + "loss": 0.1646, + "step": 53875 + }, + { + "epoch": 2.12, + "grad_norm": 1.493446217332173, + "learning_rate": 4.926698996655519e-06, + "loss": 0.1517, + "step": 53900 + }, + { + "epoch": 2.12, + "grad_norm": 2.1136296419506113, + "learning_rate": 4.926657190635452e-06, + "loss": 0.1633, + "step": 53925 + }, + { + "epoch": 2.12, + "grad_norm": 2.3989158546724845, + "learning_rate": 4.926615384615385e-06, + "loss": 0.1561, + "step": 53950 + }, + { + "epoch": 2.12, + "grad_norm": 3.0172378893049374, + "learning_rate": 4.9265735785953185e-06, + "loss": 0.1728, + "step": 53975 + }, + { + "epoch": 2.12, + "grad_norm": 1.9134652124787792, + "learning_rate": 4.926531772575251e-06, + "loss": 0.1601, + "step": 54000 + }, + { + "epoch": 2.13, + "grad_norm": 2.279368931878358, + "learning_rate": 4.926489966555184e-06, + "loss": 0.1619, + "step": 54025 + }, + { + "epoch": 2.13, + "grad_norm": 1.4996559021361437, + "learning_rate": 4.9264481605351175e-06, + "loss": 0.1535, + "step": 54050 + }, + { + "epoch": 2.13, + "grad_norm": 1.7199074418315037, + "learning_rate": 4.92640635451505e-06, + "loss": 0.1526, + "step": 54075 + }, + { + "epoch": 2.13, + "grad_norm": 2.1648899905797956, + "learning_rate": 4.926364548494984e-06, + "loss": 0.1653, + "step": 54100 + }, + { + "epoch": 2.13, + "grad_norm": 1.7286400940104754, + "learning_rate": 4.926322742474916e-06, + "loss": 0.1557, + "step": 54125 + }, + { + "epoch": 2.13, + "grad_norm": 1.5889299691023648, + "learning_rate": 4.92628093645485e-06, + "loss": 0.1623, + "step": 54150 + }, + { + "epoch": 2.13, + "grad_norm": 2.194237907627962, + "learning_rate": 4.926240802675586e-06, + "loss": 0.1694, + "step": 54175 + }, + { + "epoch": 2.13, + "grad_norm": 1.8557622264070135, + "learning_rate": 4.926198996655519e-06, + "loss": 0.1708, + "step": 54200 + }, + { + "epoch": 2.13, + "grad_norm": 1.6786879794483356, + "learning_rate": 4.926157190635452e-06, + "loss": 0.151, + "step": 54225 + }, + { + "epoch": 2.13, + "grad_norm": 2.9939730018473427, + "learning_rate": 4.926115384615385e-06, + "loss": 0.1538, + "step": 54250 + }, + { + "epoch": 2.14, + "grad_norm": 1.9875443858741775, + "learning_rate": 4.926073578595318e-06, + "loss": 0.176, + "step": 54275 + }, + { + "epoch": 2.14, + "grad_norm": 1.7973867233787022, + "learning_rate": 4.926031772575251e-06, + "loss": 0.145, + "step": 54300 + }, + { + "epoch": 2.14, + "grad_norm": 2.216337336553059, + "learning_rate": 4.9259899665551845e-06, + "loss": 0.1767, + "step": 54325 + }, + { + "epoch": 2.14, + "grad_norm": 3.4129098940982208, + "learning_rate": 4.925948160535117e-06, + "loss": 0.1485, + "step": 54350 + }, + { + "epoch": 2.14, + "grad_norm": 2.0007981016491767, + "learning_rate": 4.925906354515051e-06, + "loss": 0.1441, + "step": 54375 + }, + { + "epoch": 2.14, + "grad_norm": 2.8583478535737, + "learning_rate": 4.9258645484949835e-06, + "loss": 0.1594, + "step": 54400 + }, + { + "epoch": 2.14, + "grad_norm": 2.314179244280809, + "learning_rate": 4.925822742474917e-06, + "loss": 0.155, + "step": 54425 + }, + { + "epoch": 2.14, + "grad_norm": 2.0132476893206763, + "learning_rate": 4.92578093645485e-06, + "loss": 0.1656, + "step": 54450 + }, + { + "epoch": 2.14, + "grad_norm": 1.8635957755225794, + "learning_rate": 4.925739130434783e-06, + "loss": 0.1602, + "step": 54475 + }, + { + "epoch": 2.14, + "grad_norm": 2.834088448909386, + "learning_rate": 4.925697324414716e-06, + "loss": 0.1691, + "step": 54500 + }, + { + "epoch": 2.15, + "grad_norm": 2.2049287035923615, + "learning_rate": 4.9256555183946495e-06, + "loss": 0.1581, + "step": 54525 + }, + { + "epoch": 2.15, + "grad_norm": 3.2612117086630983, + "learning_rate": 4.925613712374582e-06, + "loss": 0.1581, + "step": 54550 + }, + { + "epoch": 2.15, + "grad_norm": 1.7241251316995652, + "learning_rate": 4.925571906354516e-06, + "loss": 0.1623, + "step": 54575 + }, + { + "epoch": 2.15, + "grad_norm": 2.2161375845886737, + "learning_rate": 4.9255301003344484e-06, + "loss": 0.173, + "step": 54600 + }, + { + "epoch": 2.15, + "grad_norm": 1.6279708477585182, + "learning_rate": 4.925488294314382e-06, + "loss": 0.1622, + "step": 54625 + }, + { + "epoch": 2.15, + "grad_norm": 1.6368572644697492, + "learning_rate": 4.925446488294315e-06, + "loss": 0.1665, + "step": 54650 + }, + { + "epoch": 2.15, + "grad_norm": 1.2374445674653936, + "learning_rate": 4.925404682274248e-06, + "loss": 0.1645, + "step": 54675 + }, + { + "epoch": 2.15, + "grad_norm": 2.777041275353947, + "learning_rate": 4.925362876254181e-06, + "loss": 0.1631, + "step": 54700 + }, + { + "epoch": 2.15, + "grad_norm": 2.4610825173728323, + "learning_rate": 4.9253210702341145e-06, + "loss": 0.1594, + "step": 54725 + }, + { + "epoch": 2.15, + "grad_norm": 2.0784666529738263, + "learning_rate": 4.925279264214047e-06, + "loss": 0.1563, + "step": 54750 + }, + { + "epoch": 2.15, + "grad_norm": 1.620773172947401, + "learning_rate": 4.925237458193981e-06, + "loss": 0.1534, + "step": 54775 + }, + { + "epoch": 2.16, + "grad_norm": 1.5041303743520913, + "learning_rate": 4.925195652173913e-06, + "loss": 0.1823, + "step": 54800 + }, + { + "epoch": 2.16, + "grad_norm": 2.3136541774000023, + "learning_rate": 4.925153846153847e-06, + "loss": 0.1643, + "step": 54825 + }, + { + "epoch": 2.16, + "grad_norm": 2.2238847850927437, + "learning_rate": 4.92511204013378e-06, + "loss": 0.1761, + "step": 54850 + }, + { + "epoch": 2.16, + "grad_norm": 2.713279366369079, + "learning_rate": 4.925070234113713e-06, + "loss": 0.1657, + "step": 54875 + }, + { + "epoch": 2.16, + "grad_norm": 2.6333303154824503, + "learning_rate": 4.925028428093646e-06, + "loss": 0.1627, + "step": 54900 + }, + { + "epoch": 2.16, + "grad_norm": 2.863859374260147, + "learning_rate": 4.9249866220735795e-06, + "loss": 0.1524, + "step": 54925 + }, + { + "epoch": 2.16, + "grad_norm": 1.7876622608964243, + "learning_rate": 4.924944816053512e-06, + "loss": 0.1603, + "step": 54950 + }, + { + "epoch": 2.16, + "grad_norm": 1.7726951565012345, + "learning_rate": 4.924903010033445e-06, + "loss": 0.1471, + "step": 54975 + }, + { + "epoch": 2.16, + "grad_norm": 1.823226550047555, + "learning_rate": 4.924861204013378e-06, + "loss": 0.1562, + "step": 55000 + }, + { + "epoch": 2.16, + "grad_norm": 1.6079458847263772, + "learning_rate": 4.924819397993311e-06, + "loss": 0.1687, + "step": 55025 + }, + { + "epoch": 2.17, + "grad_norm": 2.505583643161287, + "learning_rate": 4.924777591973245e-06, + "loss": 0.1666, + "step": 55050 + }, + { + "epoch": 2.17, + "grad_norm": 2.2835391606286155, + "learning_rate": 4.924735785953177e-06, + "loss": 0.1619, + "step": 55075 + }, + { + "epoch": 2.17, + "grad_norm": 2.0288694878473654, + "learning_rate": 4.924693979933111e-06, + "loss": 0.1612, + "step": 55100 + }, + { + "epoch": 2.17, + "grad_norm": 1.8297162400569662, + "learning_rate": 4.924652173913044e-06, + "loss": 0.1679, + "step": 55125 + }, + { + "epoch": 2.17, + "grad_norm": 1.5215046066698381, + "learning_rate": 4.924610367892977e-06, + "loss": 0.1636, + "step": 55150 + }, + { + "epoch": 2.17, + "grad_norm": 2.0415805165337577, + "learning_rate": 4.924570234113713e-06, + "loss": 0.1617, + "step": 55175 + }, + { + "epoch": 2.17, + "grad_norm": 1.885372598413116, + "learning_rate": 4.924528428093646e-06, + "loss": 0.1503, + "step": 55200 + }, + { + "epoch": 2.17, + "grad_norm": 1.693220968582674, + "learning_rate": 4.924486622073578e-06, + "loss": 0.174, + "step": 55225 + }, + { + "epoch": 2.17, + "grad_norm": 1.6448831869675682, + "learning_rate": 4.924444816053512e-06, + "loss": 0.1568, + "step": 55250 + }, + { + "epoch": 2.17, + "grad_norm": 1.6427824071243027, + "learning_rate": 4.924403010033445e-06, + "loss": 0.1409, + "step": 55275 + }, + { + "epoch": 2.18, + "grad_norm": 3.5051468060927804, + "learning_rate": 4.924361204013378e-06, + "loss": 0.16, + "step": 55300 + }, + { + "epoch": 2.18, + "grad_norm": 1.3554161547534496, + "learning_rate": 4.924319397993311e-06, + "loss": 0.1635, + "step": 55325 + }, + { + "epoch": 2.18, + "grad_norm": 2.2438531362592653, + "learning_rate": 4.924277591973244e-06, + "loss": 0.1696, + "step": 55350 + }, + { + "epoch": 2.18, + "grad_norm": 1.993729955191998, + "learning_rate": 4.924235785953178e-06, + "loss": 0.1414, + "step": 55375 + }, + { + "epoch": 2.18, + "grad_norm": 2.812931969711888, + "learning_rate": 4.924193979933111e-06, + "loss": 0.1535, + "step": 55400 + }, + { + "epoch": 2.18, + "grad_norm": 2.2778307356395966, + "learning_rate": 4.924152173913044e-06, + "loss": 0.1682, + "step": 55425 + }, + { + "epoch": 2.18, + "grad_norm": 2.1191913216519205, + "learning_rate": 4.924110367892977e-06, + "loss": 0.1556, + "step": 55450 + }, + { + "epoch": 2.18, + "grad_norm": 2.300298131748246, + "learning_rate": 4.9240685618729104e-06, + "loss": 0.1451, + "step": 55475 + }, + { + "epoch": 2.18, + "grad_norm": 1.9976266390195618, + "learning_rate": 4.924026755852843e-06, + "loss": 0.1544, + "step": 55500 + }, + { + "epoch": 2.18, + "grad_norm": 1.9317993781076177, + "learning_rate": 4.923984949832777e-06, + "loss": 0.1729, + "step": 55525 + }, + { + "epoch": 2.19, + "grad_norm": 2.718753249528282, + "learning_rate": 4.923943143812709e-06, + "loss": 0.1441, + "step": 55550 + }, + { + "epoch": 2.19, + "grad_norm": 2.2565840107062485, + "learning_rate": 4.923901337792643e-06, + "loss": 0.1674, + "step": 55575 + }, + { + "epoch": 2.19, + "grad_norm": 2.383013144810345, + "learning_rate": 4.923859531772576e-06, + "loss": 0.1605, + "step": 55600 + }, + { + "epoch": 2.19, + "grad_norm": 3.3092883642771405, + "learning_rate": 4.923817725752509e-06, + "loss": 0.1669, + "step": 55625 + }, + { + "epoch": 2.19, + "grad_norm": 2.211414188184337, + "learning_rate": 4.923775919732442e-06, + "loss": 0.1389, + "step": 55650 + }, + { + "epoch": 2.19, + "grad_norm": 2.488338988031535, + "learning_rate": 4.923734113712375e-06, + "loss": 0.168, + "step": 55675 + }, + { + "epoch": 2.19, + "grad_norm": 2.300045920710584, + "learning_rate": 4.923692307692308e-06, + "loss": 0.1383, + "step": 55700 + }, + { + "epoch": 2.19, + "grad_norm": 2.6881951652660336, + "learning_rate": 4.923650501672242e-06, + "loss": 0.1754, + "step": 55725 + }, + { + "epoch": 2.19, + "grad_norm": 2.930605332694203, + "learning_rate": 4.923608695652174e-06, + "loss": 0.1585, + "step": 55750 + }, + { + "epoch": 2.19, + "grad_norm": 1.7962333691225325, + "learning_rate": 4.923566889632108e-06, + "loss": 0.1539, + "step": 55775 + }, + { + "epoch": 2.2, + "grad_norm": 1.6517198727704034, + "learning_rate": 4.923525083612041e-06, + "loss": 0.1631, + "step": 55800 + }, + { + "epoch": 2.2, + "grad_norm": 3.2519972863357274, + "learning_rate": 4.923483277591974e-06, + "loss": 0.1734, + "step": 55825 + }, + { + "epoch": 2.2, + "grad_norm": 3.322960523681727, + "learning_rate": 4.923441471571907e-06, + "loss": 0.1626, + "step": 55850 + }, + { + "epoch": 2.2, + "grad_norm": 2.3228889974625515, + "learning_rate": 4.92339966555184e-06, + "loss": 0.1629, + "step": 55875 + }, + { + "epoch": 2.2, + "grad_norm": 2.0280038892158694, + "learning_rate": 4.923357859531773e-06, + "loss": 0.1828, + "step": 55900 + }, + { + "epoch": 2.2, + "grad_norm": 1.8535550307672954, + "learning_rate": 4.923316053511706e-06, + "loss": 0.1631, + "step": 55925 + }, + { + "epoch": 2.2, + "grad_norm": 3.309157550953824, + "learning_rate": 4.923274247491639e-06, + "loss": 0.1722, + "step": 55950 + }, + { + "epoch": 2.2, + "grad_norm": 2.351125286313886, + "learning_rate": 4.923232441471572e-06, + "loss": 0.1695, + "step": 55975 + }, + { + "epoch": 2.2, + "grad_norm": 2.297372457039874, + "learning_rate": 4.9231906354515056e-06, + "loss": 0.1553, + "step": 56000 + }, + { + "epoch": 2.2, + "grad_norm": 2.0925449099759983, + "learning_rate": 4.923148829431438e-06, + "loss": 0.1732, + "step": 56025 + }, + { + "epoch": 2.21, + "grad_norm": 1.6960095376972097, + "learning_rate": 4.923107023411372e-06, + "loss": 0.1474, + "step": 56050 + }, + { + "epoch": 2.21, + "grad_norm": 1.9693615031084075, + "learning_rate": 4.9230652173913045e-06, + "loss": 0.1656, + "step": 56075 + }, + { + "epoch": 2.21, + "grad_norm": 2.754391153522097, + "learning_rate": 4.923023411371238e-06, + "loss": 0.1621, + "step": 56100 + }, + { + "epoch": 2.21, + "grad_norm": 2.6063940292761676, + "learning_rate": 4.922981605351171e-06, + "loss": 0.142, + "step": 56125 + }, + { + "epoch": 2.21, + "grad_norm": 2.000603817581123, + "learning_rate": 4.922939799331104e-06, + "loss": 0.1588, + "step": 56150 + }, + { + "epoch": 2.21, + "grad_norm": 1.8577242463400019, + "learning_rate": 4.922899665551839e-06, + "loss": 0.1448, + "step": 56175 + }, + { + "epoch": 2.21, + "grad_norm": 2.3453900092605062, + "learning_rate": 4.922857859531773e-06, + "loss": 0.1489, + "step": 56200 + }, + { + "epoch": 2.21, + "grad_norm": 1.7724438857218991, + "learning_rate": 4.9228160535117055e-06, + "loss": 0.1619, + "step": 56225 + }, + { + "epoch": 2.21, + "grad_norm": 1.3524805703480405, + "learning_rate": 4.922774247491639e-06, + "loss": 0.1687, + "step": 56250 + }, + { + "epoch": 2.21, + "grad_norm": 3.3408597681628738, + "learning_rate": 4.922732441471572e-06, + "loss": 0.1643, + "step": 56275 + }, + { + "epoch": 2.21, + "grad_norm": 1.7170094914128877, + "learning_rate": 4.922690635451505e-06, + "loss": 0.1667, + "step": 56300 + }, + { + "epoch": 2.22, + "grad_norm": 1.9939337796903036, + "learning_rate": 4.922648829431438e-06, + "loss": 0.1436, + "step": 56325 + }, + { + "epoch": 2.22, + "grad_norm": 3.413924259574225, + "learning_rate": 4.9226070234113716e-06, + "loss": 0.1715, + "step": 56350 + }, + { + "epoch": 2.22, + "grad_norm": 2.322985702949605, + "learning_rate": 4.922565217391304e-06, + "loss": 0.1658, + "step": 56375 + }, + { + "epoch": 2.22, + "grad_norm": 2.370524435039194, + "learning_rate": 4.922523411371238e-06, + "loss": 0.1577, + "step": 56400 + }, + { + "epoch": 2.22, + "grad_norm": 1.983387072213947, + "learning_rate": 4.9224816053511705e-06, + "loss": 0.1768, + "step": 56425 + }, + { + "epoch": 2.22, + "grad_norm": 3.188374061799889, + "learning_rate": 4.922439799331104e-06, + "loss": 0.1555, + "step": 56450 + }, + { + "epoch": 2.22, + "grad_norm": 1.5451448494834452, + "learning_rate": 4.922397993311038e-06, + "loss": 0.1627, + "step": 56475 + }, + { + "epoch": 2.22, + "grad_norm": 1.444626924036314, + "learning_rate": 4.92235618729097e-06, + "loss": 0.1518, + "step": 56500 + }, + { + "epoch": 2.22, + "grad_norm": 2.0333573434866046, + "learning_rate": 4.922314381270904e-06, + "loss": 0.1682, + "step": 56525 + }, + { + "epoch": 2.22, + "grad_norm": 2.404693100907314, + "learning_rate": 4.9222725752508365e-06, + "loss": 0.1605, + "step": 56550 + }, + { + "epoch": 2.23, + "grad_norm": 1.8865735811248825, + "learning_rate": 4.92223076923077e-06, + "loss": 0.17, + "step": 56575 + }, + { + "epoch": 2.23, + "grad_norm": 1.9171365472921278, + "learning_rate": 4.922188963210703e-06, + "loss": 0.1693, + "step": 56600 + }, + { + "epoch": 2.23, + "grad_norm": 2.4763194923869865, + "learning_rate": 4.922147157190636e-06, + "loss": 0.1562, + "step": 56625 + }, + { + "epoch": 2.23, + "grad_norm": 2.764068022978239, + "learning_rate": 4.922105351170569e-06, + "loss": 0.1527, + "step": 56650 + }, + { + "epoch": 2.23, + "grad_norm": 2.1369551320651, + "learning_rate": 4.9220635451505026e-06, + "loss": 0.1488, + "step": 56675 + }, + { + "epoch": 2.23, + "grad_norm": 2.325422901668469, + "learning_rate": 4.922021739130435e-06, + "loss": 0.1719, + "step": 56700 + }, + { + "epoch": 2.23, + "grad_norm": 2.1041649166600105, + "learning_rate": 4.921979933110369e-06, + "loss": 0.1768, + "step": 56725 + }, + { + "epoch": 2.23, + "grad_norm": 1.7655589341104052, + "learning_rate": 4.9219381270903015e-06, + "loss": 0.1777, + "step": 56750 + }, + { + "epoch": 2.23, + "grad_norm": 1.8380477181947645, + "learning_rate": 4.921896321070235e-06, + "loss": 0.1644, + "step": 56775 + }, + { + "epoch": 2.23, + "grad_norm": 2.3126435559314795, + "learning_rate": 4.921854515050168e-06, + "loss": 0.1731, + "step": 56800 + }, + { + "epoch": 2.24, + "grad_norm": 2.56461416141031, + "learning_rate": 4.921812709030101e-06, + "loss": 0.159, + "step": 56825 + }, + { + "epoch": 2.24, + "grad_norm": 2.1602569083850414, + "learning_rate": 4.921770903010033e-06, + "loss": 0.1584, + "step": 56850 + }, + { + "epoch": 2.24, + "grad_norm": 1.8962818131934016, + "learning_rate": 4.921729096989967e-06, + "loss": 0.1576, + "step": 56875 + }, + { + "epoch": 2.24, + "grad_norm": 1.8524281706098542, + "learning_rate": 4.9216872909699e-06, + "loss": 0.1561, + "step": 56900 + }, + { + "epoch": 2.24, + "grad_norm": 1.9143396372190553, + "learning_rate": 4.921645484949833e-06, + "loss": 0.1692, + "step": 56925 + }, + { + "epoch": 2.24, + "grad_norm": 2.822879321951725, + "learning_rate": 4.9216036789297665e-06, + "loss": 0.1613, + "step": 56950 + }, + { + "epoch": 2.24, + "grad_norm": 1.894258326831287, + "learning_rate": 4.921561872909699e-06, + "loss": 0.1541, + "step": 56975 + }, + { + "epoch": 2.24, + "grad_norm": 1.7074729590893851, + "learning_rate": 4.921520066889633e-06, + "loss": 0.1555, + "step": 57000 + }, + { + "epoch": 2.24, + "grad_norm": 1.612601267245271, + "learning_rate": 4.9214782608695654e-06, + "loss": 0.145, + "step": 57025 + }, + { + "epoch": 2.24, + "grad_norm": 2.3502778459294635, + "learning_rate": 4.921436454849499e-06, + "loss": 0.1602, + "step": 57050 + }, + { + "epoch": 2.25, + "grad_norm": 1.6450546458329638, + "learning_rate": 4.921394648829432e-06, + "loss": 0.1717, + "step": 57075 + }, + { + "epoch": 2.25, + "grad_norm": 1.6578146368646642, + "learning_rate": 4.921352842809365e-06, + "loss": 0.156, + "step": 57100 + }, + { + "epoch": 2.25, + "grad_norm": 2.2414245528366807, + "learning_rate": 4.921311036789298e-06, + "loss": 0.1353, + "step": 57125 + }, + { + "epoch": 2.25, + "grad_norm": 2.8998348774283498, + "learning_rate": 4.9212692307692315e-06, + "loss": 0.171, + "step": 57150 + }, + { + "epoch": 2.25, + "grad_norm": 2.9985448953876657, + "learning_rate": 4.9212290969899665e-06, + "loss": 0.1467, + "step": 57175 + }, + { + "epoch": 2.25, + "grad_norm": 2.399374920196816, + "learning_rate": 4.9211872909699e-06, + "loss": 0.1614, + "step": 57200 + }, + { + "epoch": 2.25, + "grad_norm": 2.430911112893449, + "learning_rate": 4.921145484949833e-06, + "loss": 0.1678, + "step": 57225 + }, + { + "epoch": 2.25, + "grad_norm": 2.119242897772184, + "learning_rate": 4.921103678929766e-06, + "loss": 0.1608, + "step": 57250 + }, + { + "epoch": 2.25, + "grad_norm": 2.4857313645191823, + "learning_rate": 4.921061872909699e-06, + "loss": 0.1827, + "step": 57275 + }, + { + "epoch": 2.25, + "grad_norm": 2.904099021212478, + "learning_rate": 4.9210200668896325e-06, + "loss": 0.182, + "step": 57300 + }, + { + "epoch": 2.26, + "grad_norm": 1.548156179946889, + "learning_rate": 4.920978260869565e-06, + "loss": 0.1628, + "step": 57325 + }, + { + "epoch": 2.26, + "grad_norm": 2.3494390107702494, + "learning_rate": 4.920936454849499e-06, + "loss": 0.1479, + "step": 57350 + }, + { + "epoch": 2.26, + "grad_norm": 1.8864858832812406, + "learning_rate": 4.9208946488294314e-06, + "loss": 0.159, + "step": 57375 + }, + { + "epoch": 2.26, + "grad_norm": 1.3675349817558708, + "learning_rate": 4.920852842809365e-06, + "loss": 0.1698, + "step": 57400 + }, + { + "epoch": 2.26, + "grad_norm": 1.5225907794607534, + "learning_rate": 4.920811036789298e-06, + "loss": 0.1651, + "step": 57425 + }, + { + "epoch": 2.26, + "grad_norm": 2.1154683368095637, + "learning_rate": 4.920769230769231e-06, + "loss": 0.1577, + "step": 57450 + }, + { + "epoch": 2.26, + "grad_norm": 1.8981664952230868, + "learning_rate": 4.920727424749164e-06, + "loss": 0.1719, + "step": 57475 + }, + { + "epoch": 2.26, + "grad_norm": 2.53863923398963, + "learning_rate": 4.9206856187290975e-06, + "loss": 0.1644, + "step": 57500 + }, + { + "epoch": 2.26, + "grad_norm": 2.105776922935975, + "learning_rate": 4.92064381270903e-06, + "loss": 0.1545, + "step": 57525 + }, + { + "epoch": 2.26, + "grad_norm": 2.0750252396438587, + "learning_rate": 4.920602006688964e-06, + "loss": 0.155, + "step": 57550 + }, + { + "epoch": 2.27, + "grad_norm": 3.1660226968910576, + "learning_rate": 4.920560200668897e-06, + "loss": 0.1731, + "step": 57575 + }, + { + "epoch": 2.27, + "grad_norm": 3.025064147146057, + "learning_rate": 4.92051839464883e-06, + "loss": 0.1658, + "step": 57600 + }, + { + "epoch": 2.27, + "grad_norm": 2.4539822980767205, + "learning_rate": 4.9204765886287635e-06, + "loss": 0.1496, + "step": 57625 + }, + { + "epoch": 2.27, + "grad_norm": 1.8245524096248626, + "learning_rate": 4.920434782608696e-06, + "loss": 0.1479, + "step": 57650 + }, + { + "epoch": 2.27, + "grad_norm": 2.1986903832701565, + "learning_rate": 4.92039297658863e-06, + "loss": 0.1757, + "step": 57675 + }, + { + "epoch": 2.27, + "grad_norm": 1.737008195248559, + "learning_rate": 4.9203511705685624e-06, + "loss": 0.1574, + "step": 57700 + }, + { + "epoch": 2.27, + "grad_norm": 2.7053823545914026, + "learning_rate": 4.920309364548496e-06, + "loss": 0.169, + "step": 57725 + }, + { + "epoch": 2.27, + "grad_norm": 2.4113450770376206, + "learning_rate": 4.920267558528429e-06, + "loss": 0.1667, + "step": 57750 + }, + { + "epoch": 2.27, + "grad_norm": 1.970065828181729, + "learning_rate": 4.920225752508362e-06, + "loss": 0.1661, + "step": 57775 + }, + { + "epoch": 2.27, + "grad_norm": 3.0890846295834673, + "learning_rate": 4.920183946488294e-06, + "loss": 0.1638, + "step": 57800 + }, + { + "epoch": 2.27, + "grad_norm": 2.076681262223452, + "learning_rate": 4.920142140468228e-06, + "loss": 0.1864, + "step": 57825 + }, + { + "epoch": 2.28, + "grad_norm": 2.343128244131371, + "learning_rate": 4.92010033444816e-06, + "loss": 0.1687, + "step": 57850 + }, + { + "epoch": 2.28, + "grad_norm": 2.9276428639103855, + "learning_rate": 4.920058528428094e-06, + "loss": 0.1614, + "step": 57875 + }, + { + "epoch": 2.28, + "grad_norm": 1.761415206373535, + "learning_rate": 4.9200167224080266e-06, + "loss": 0.1636, + "step": 57900 + }, + { + "epoch": 2.28, + "grad_norm": 1.2563054496648425, + "learning_rate": 4.91997491638796e-06, + "loss": 0.1586, + "step": 57925 + }, + { + "epoch": 2.28, + "grad_norm": 2.639280854740686, + "learning_rate": 4.919933110367893e-06, + "loss": 0.1663, + "step": 57950 + }, + { + "epoch": 2.28, + "grad_norm": 1.4109976776005013, + "learning_rate": 4.919891304347826e-06, + "loss": 0.1607, + "step": 57975 + }, + { + "epoch": 2.28, + "grad_norm": 2.6597424343430185, + "learning_rate": 4.91984949832776e-06, + "loss": 0.142, + "step": 58000 + }, + { + "epoch": 2.28, + "grad_norm": 2.734114761003077, + "learning_rate": 4.919807692307693e-06, + "loss": 0.1666, + "step": 58025 + }, + { + "epoch": 2.28, + "grad_norm": 1.396277658035052, + "learning_rate": 4.919765886287626e-06, + "loss": 0.1505, + "step": 58050 + }, + { + "epoch": 2.28, + "grad_norm": 2.5980057974065858, + "learning_rate": 4.919724080267559e-06, + "loss": 0.1425, + "step": 58075 + }, + { + "epoch": 2.29, + "grad_norm": 1.260543291214527, + "learning_rate": 4.919682274247492e-06, + "loss": 0.1507, + "step": 58100 + }, + { + "epoch": 2.29, + "grad_norm": 1.759475488284363, + "learning_rate": 4.919640468227425e-06, + "loss": 0.1804, + "step": 58125 + }, + { + "epoch": 2.29, + "grad_norm": 2.8038300402487817, + "learning_rate": 4.919598662207359e-06, + "loss": 0.1644, + "step": 58150 + }, + { + "epoch": 2.29, + "grad_norm": 2.260869107549195, + "learning_rate": 4.919558528428094e-06, + "loss": 0.1508, + "step": 58175 + }, + { + "epoch": 2.29, + "grad_norm": 2.414584150357879, + "learning_rate": 4.919516722408027e-06, + "loss": 0.1709, + "step": 58200 + }, + { + "epoch": 2.29, + "grad_norm": 1.5049432654576578, + "learning_rate": 4.91947491638796e-06, + "loss": 0.1667, + "step": 58225 + }, + { + "epoch": 2.29, + "grad_norm": 2.2825260278718655, + "learning_rate": 4.919433110367893e-06, + "loss": 0.1566, + "step": 58250 + }, + { + "epoch": 2.29, + "grad_norm": 1.6614178100576562, + "learning_rate": 4.919391304347826e-06, + "loss": 0.1568, + "step": 58275 + }, + { + "epoch": 2.29, + "grad_norm": 1.7269557876148456, + "learning_rate": 4.91934949832776e-06, + "loss": 0.1552, + "step": 58300 + }, + { + "epoch": 2.29, + "grad_norm": 2.1426447315621164, + "learning_rate": 4.919307692307692e-06, + "loss": 0.1544, + "step": 58325 + }, + { + "epoch": 2.3, + "grad_norm": 2.847512216664601, + "learning_rate": 4.919265886287626e-06, + "loss": 0.1763, + "step": 58350 + }, + { + "epoch": 2.3, + "grad_norm": 2.177179572888644, + "learning_rate": 4.919224080267559e-06, + "loss": 0.1649, + "step": 58375 + }, + { + "epoch": 2.3, + "grad_norm": 2.3301719269253813, + "learning_rate": 4.919182274247492e-06, + "loss": 0.1629, + "step": 58400 + }, + { + "epoch": 2.3, + "grad_norm": 2.9371628068840936, + "learning_rate": 4.919140468227425e-06, + "loss": 0.1582, + "step": 58425 + }, + { + "epoch": 2.3, + "grad_norm": 2.1562146801351836, + "learning_rate": 4.919098662207358e-06, + "loss": 0.1905, + "step": 58450 + }, + { + "epoch": 2.3, + "grad_norm": 2.191229103225186, + "learning_rate": 4.919056856187291e-06, + "loss": 0.1694, + "step": 58475 + }, + { + "epoch": 2.3, + "grad_norm": 1.6064059646004025, + "learning_rate": 4.919015050167225e-06, + "loss": 0.1488, + "step": 58500 + }, + { + "epoch": 2.3, + "grad_norm": 4.323530192845432, + "learning_rate": 4.918973244147157e-06, + "loss": 0.1528, + "step": 58525 + }, + { + "epoch": 2.3, + "grad_norm": 2.095986879359018, + "learning_rate": 4.918931438127091e-06, + "loss": 0.1711, + "step": 58550 + }, + { + "epoch": 2.3, + "grad_norm": 2.10375244642294, + "learning_rate": 4.918889632107024e-06, + "loss": 0.1797, + "step": 58575 + }, + { + "epoch": 2.31, + "grad_norm": 2.4047401055803066, + "learning_rate": 4.918847826086957e-06, + "loss": 0.1688, + "step": 58600 + }, + { + "epoch": 2.31, + "grad_norm": 2.310141666785082, + "learning_rate": 4.91880602006689e-06, + "loss": 0.1661, + "step": 58625 + }, + { + "epoch": 2.31, + "grad_norm": 1.86452149655283, + "learning_rate": 4.918764214046823e-06, + "loss": 0.1591, + "step": 58650 + }, + { + "epoch": 2.31, + "grad_norm": 2.187797758585452, + "learning_rate": 4.918722408026757e-06, + "loss": 0.1584, + "step": 58675 + }, + { + "epoch": 2.31, + "grad_norm": 1.8069341965191519, + "learning_rate": 4.91868060200669e-06, + "loss": 0.1555, + "step": 58700 + }, + { + "epoch": 2.31, + "grad_norm": 2.18250105172939, + "learning_rate": 4.918638795986623e-06, + "loss": 0.1623, + "step": 58725 + }, + { + "epoch": 2.31, + "grad_norm": 2.6732097415682214, + "learning_rate": 4.918596989966555e-06, + "loss": 0.1664, + "step": 58750 + }, + { + "epoch": 2.31, + "grad_norm": 1.2977159672754395, + "learning_rate": 4.9185551839464886e-06, + "loss": 0.166, + "step": 58775 + }, + { + "epoch": 2.31, + "grad_norm": 2.007392545610804, + "learning_rate": 4.918513377926421e-06, + "loss": 0.1634, + "step": 58800 + }, + { + "epoch": 2.31, + "grad_norm": 2.7077236084984597, + "learning_rate": 4.918471571906355e-06, + "loss": 0.1491, + "step": 58825 + }, + { + "epoch": 2.32, + "grad_norm": 2.424611894537868, + "learning_rate": 4.9184297658862875e-06, + "loss": 0.1587, + "step": 58850 + }, + { + "epoch": 2.32, + "grad_norm": 1.3860933115368321, + "learning_rate": 4.918387959866221e-06, + "loss": 0.1496, + "step": 58875 + }, + { + "epoch": 2.32, + "grad_norm": 2.6511674403642846, + "learning_rate": 4.918346153846154e-06, + "loss": 0.1678, + "step": 58900 + }, + { + "epoch": 2.32, + "grad_norm": 2.1685696271223653, + "learning_rate": 4.918304347826087e-06, + "loss": 0.1411, + "step": 58925 + }, + { + "epoch": 2.32, + "grad_norm": 2.0485582790269428, + "learning_rate": 4.91826254180602e-06, + "loss": 0.1757, + "step": 58950 + }, + { + "epoch": 2.32, + "grad_norm": 2.6032184254228543, + "learning_rate": 4.9182207357859535e-06, + "loss": 0.1625, + "step": 58975 + }, + { + "epoch": 2.32, + "grad_norm": 2.027124650130354, + "learning_rate": 4.918178929765886e-06, + "loss": 0.1421, + "step": 59000 + }, + { + "epoch": 2.32, + "grad_norm": 2.8814506846542645, + "learning_rate": 4.91813712374582e-06, + "loss": 0.1628, + "step": 59025 + }, + { + "epoch": 2.32, + "grad_norm": 1.1579894725073154, + "learning_rate": 4.9180953177257525e-06, + "loss": 0.1654, + "step": 59050 + }, + { + "epoch": 2.32, + "grad_norm": 2.111540003358208, + "learning_rate": 4.918053511705686e-06, + "loss": 0.1695, + "step": 59075 + }, + { + "epoch": 2.33, + "grad_norm": 2.7137729677726354, + "learning_rate": 4.9180117056856196e-06, + "loss": 0.1486, + "step": 59100 + }, + { + "epoch": 2.33, + "grad_norm": 1.7812965649458703, + "learning_rate": 4.917969899665552e-06, + "loss": 0.1638, + "step": 59125 + }, + { + "epoch": 2.33, + "grad_norm": 2.011544514420453, + "learning_rate": 4.917928093645486e-06, + "loss": 0.1733, + "step": 59150 + }, + { + "epoch": 2.33, + "grad_norm": 2.390700805022534, + "learning_rate": 4.917887959866221e-06, + "loss": 0.1751, + "step": 59175 + }, + { + "epoch": 2.33, + "grad_norm": 2.969735384499303, + "learning_rate": 4.917846153846154e-06, + "loss": 0.1729, + "step": 59200 + }, + { + "epoch": 2.33, + "grad_norm": 1.913785187777396, + "learning_rate": 4.917804347826087e-06, + "loss": 0.153, + "step": 59225 + }, + { + "epoch": 2.33, + "grad_norm": 1.8021064858925475, + "learning_rate": 4.917762541806021e-06, + "loss": 0.1502, + "step": 59250 + }, + { + "epoch": 2.33, + "grad_norm": 2.833978772258698, + "learning_rate": 4.917720735785953e-06, + "loss": 0.1724, + "step": 59275 + }, + { + "epoch": 2.33, + "grad_norm": 2.2849194247990177, + "learning_rate": 4.917678929765887e-06, + "loss": 0.1698, + "step": 59300 + }, + { + "epoch": 2.33, + "grad_norm": 1.6327536122141721, + "learning_rate": 4.9176371237458195e-06, + "loss": 0.1479, + "step": 59325 + }, + { + "epoch": 2.33, + "grad_norm": 1.8262023075338034, + "learning_rate": 4.917595317725753e-06, + "loss": 0.1642, + "step": 59350 + }, + { + "epoch": 2.34, + "grad_norm": 1.9784854693898795, + "learning_rate": 4.917553511705686e-06, + "loss": 0.158, + "step": 59375 + }, + { + "epoch": 2.34, + "grad_norm": 2.359260864971092, + "learning_rate": 4.917511705685619e-06, + "loss": 0.1571, + "step": 59400 + }, + { + "epoch": 2.34, + "grad_norm": 2.2374038047494427, + "learning_rate": 4.917469899665552e-06, + "loss": 0.1471, + "step": 59425 + }, + { + "epoch": 2.34, + "grad_norm": 2.0946720369210645, + "learning_rate": 4.9174280936454856e-06, + "loss": 0.1658, + "step": 59450 + }, + { + "epoch": 2.34, + "grad_norm": 2.395192692613216, + "learning_rate": 4.917386287625418e-06, + "loss": 0.1662, + "step": 59475 + }, + { + "epoch": 2.34, + "grad_norm": 2.107283464209178, + "learning_rate": 4.917344481605352e-06, + "loss": 0.1646, + "step": 59500 + }, + { + "epoch": 2.34, + "grad_norm": 3.4601379596453037, + "learning_rate": 4.9173026755852845e-06, + "loss": 0.1577, + "step": 59525 + }, + { + "epoch": 2.34, + "grad_norm": 2.7885302397106813, + "learning_rate": 4.917260869565218e-06, + "loss": 0.1491, + "step": 59550 + }, + { + "epoch": 2.34, + "grad_norm": 1.9441052075639202, + "learning_rate": 4.917219063545151e-06, + "loss": 0.1518, + "step": 59575 + }, + { + "epoch": 2.34, + "grad_norm": 2.435857624262805, + "learning_rate": 4.917177257525084e-06, + "loss": 0.1631, + "step": 59600 + }, + { + "epoch": 2.35, + "grad_norm": 2.4963602725155, + "learning_rate": 4.917135451505017e-06, + "loss": 0.1588, + "step": 59625 + }, + { + "epoch": 2.35, + "grad_norm": 2.324784100257762, + "learning_rate": 4.9170936454849505e-06, + "loss": 0.1539, + "step": 59650 + }, + { + "epoch": 2.35, + "grad_norm": 2.0187377882547612, + "learning_rate": 4.917051839464883e-06, + "loss": 0.1695, + "step": 59675 + }, + { + "epoch": 2.35, + "grad_norm": 2.32940588948083, + "learning_rate": 4.917010033444816e-06, + "loss": 0.1516, + "step": 59700 + }, + { + "epoch": 2.35, + "grad_norm": 1.836519988909741, + "learning_rate": 4.9169682274247495e-06, + "loss": 0.1747, + "step": 59725 + }, + { + "epoch": 2.35, + "grad_norm": 2.196327753251213, + "learning_rate": 4.916926421404682e-06, + "loss": 0.1765, + "step": 59750 + }, + { + "epoch": 2.35, + "grad_norm": 2.2094447994990345, + "learning_rate": 4.916884615384616e-06, + "loss": 0.1525, + "step": 59775 + }, + { + "epoch": 2.35, + "grad_norm": 2.5435910280268885, + "learning_rate": 4.916842809364548e-06, + "loss": 0.1588, + "step": 59800 + }, + { + "epoch": 2.35, + "grad_norm": 2.1170442526101163, + "learning_rate": 4.916801003344482e-06, + "loss": 0.1652, + "step": 59825 + }, + { + "epoch": 2.35, + "grad_norm": 4.0496458307816825, + "learning_rate": 4.916759197324415e-06, + "loss": 0.1676, + "step": 59850 + }, + { + "epoch": 2.36, + "grad_norm": 2.5764839005683617, + "learning_rate": 4.916717391304348e-06, + "loss": 0.1524, + "step": 59875 + }, + { + "epoch": 2.36, + "grad_norm": 1.80735502088511, + "learning_rate": 4.916675585284281e-06, + "loss": 0.1646, + "step": 59900 + }, + { + "epoch": 2.36, + "grad_norm": 3.4931943985698877, + "learning_rate": 4.9166337792642145e-06, + "loss": 0.1688, + "step": 59925 + }, + { + "epoch": 2.36, + "grad_norm": 2.620793187205274, + "learning_rate": 4.916591973244147e-06, + "loss": 0.1645, + "step": 59950 + }, + { + "epoch": 2.36, + "grad_norm": 2.2352409464681644, + "learning_rate": 4.916550167224081e-06, + "loss": 0.1485, + "step": 59975 + }, + { + "epoch": 2.36, + "grad_norm": 1.8848876549630622, + "learning_rate": 4.916508361204013e-06, + "loss": 0.1717, + "step": 60000 + }, + { + "epoch": 2.36, + "eval_loss": 0.5126953125, + "eval_runtime": 11570.5057, + "eval_samples_per_second": 0.818, + "eval_steps_per_second": 0.051, + "eval_wer": 0.11778710808680405, + "step": 60000 + }, + { + "epoch": 2.36, + "grad_norm": 2.5457840080713066, + "learning_rate": 4.916466555183947e-06, + "loss": 0.163, + "step": 60025 + }, + { + "epoch": 2.36, + "grad_norm": 1.7137502472167718, + "learning_rate": 4.91642474916388e-06, + "loss": 0.153, + "step": 60050 + }, + { + "epoch": 2.36, + "grad_norm": 2.0294624199856943, + "learning_rate": 4.916382943143813e-06, + "loss": 0.1589, + "step": 60075 + }, + { + "epoch": 2.36, + "grad_norm": 2.3906210473393097, + "learning_rate": 4.916341137123746e-06, + "loss": 0.1652, + "step": 60100 + }, + { + "epoch": 2.37, + "grad_norm": 1.2050739597355082, + "learning_rate": 4.9162993311036794e-06, + "loss": 0.1638, + "step": 60125 + }, + { + "epoch": 2.37, + "grad_norm": 3.2900598636843386, + "learning_rate": 4.916257525083612e-06, + "loss": 0.1728, + "step": 60150 + }, + { + "epoch": 2.37, + "grad_norm": 2.1524349935755915, + "learning_rate": 4.916217391304348e-06, + "loss": 0.1489, + "step": 60175 + }, + { + "epoch": 2.37, + "grad_norm": 2.954050934593037, + "learning_rate": 4.9161755852842815e-06, + "loss": 0.1858, + "step": 60200 + }, + { + "epoch": 2.37, + "grad_norm": 1.9504239110160009, + "learning_rate": 4.916133779264214e-06, + "loss": 0.1584, + "step": 60225 + }, + { + "epoch": 2.37, + "grad_norm": 5.834701243563905, + "learning_rate": 4.916091973244148e-06, + "loss": 0.1653, + "step": 60250 + }, + { + "epoch": 2.37, + "grad_norm": 2.504682679956055, + "learning_rate": 4.9160501672240805e-06, + "loss": 0.1728, + "step": 60275 + }, + { + "epoch": 2.37, + "grad_norm": 2.6111928108040954, + "learning_rate": 4.916008361204014e-06, + "loss": 0.1695, + "step": 60300 + }, + { + "epoch": 2.37, + "grad_norm": 2.7115359266814565, + "learning_rate": 4.915966555183947e-06, + "loss": 0.1659, + "step": 60325 + }, + { + "epoch": 2.37, + "grad_norm": 2.7986136435971476, + "learning_rate": 4.91592474916388e-06, + "loss": 0.1561, + "step": 60350 + }, + { + "epoch": 2.38, + "grad_norm": 2.2374505980103256, + "learning_rate": 4.915882943143813e-06, + "loss": 0.1811, + "step": 60375 + }, + { + "epoch": 2.38, + "grad_norm": 2.677702691876237, + "learning_rate": 4.9158411371237465e-06, + "loss": 0.1578, + "step": 60400 + }, + { + "epoch": 2.38, + "grad_norm": 1.710984414065094, + "learning_rate": 4.915799331103679e-06, + "loss": 0.1766, + "step": 60425 + }, + { + "epoch": 2.38, + "grad_norm": 1.8611113380470368, + "learning_rate": 4.915757525083613e-06, + "loss": 0.1788, + "step": 60450 + }, + { + "epoch": 2.38, + "grad_norm": 2.6445493558289472, + "learning_rate": 4.9157157190635454e-06, + "loss": 0.1777, + "step": 60475 + }, + { + "epoch": 2.38, + "grad_norm": 2.476843097247296, + "learning_rate": 4.915673913043479e-06, + "loss": 0.1554, + "step": 60500 + }, + { + "epoch": 2.38, + "grad_norm": 2.009772580934335, + "learning_rate": 4.915632107023412e-06, + "loss": 0.176, + "step": 60525 + }, + { + "epoch": 2.38, + "grad_norm": 1.799057734334839, + "learning_rate": 4.915590301003345e-06, + "loss": 0.1687, + "step": 60550 + }, + { + "epoch": 2.38, + "grad_norm": 2.574514219545785, + "learning_rate": 4.915548494983278e-06, + "loss": 0.1772, + "step": 60575 + }, + { + "epoch": 2.38, + "grad_norm": 1.7987424850971203, + "learning_rate": 4.9155066889632115e-06, + "loss": 0.1662, + "step": 60600 + }, + { + "epoch": 2.39, + "grad_norm": 2.08168165567613, + "learning_rate": 4.915464882943144e-06, + "loss": 0.1705, + "step": 60625 + }, + { + "epoch": 2.39, + "grad_norm": 2.306741935160784, + "learning_rate": 4.915423076923077e-06, + "loss": 0.1644, + "step": 60650 + }, + { + "epoch": 2.39, + "grad_norm": 1.9982050594746743, + "learning_rate": 4.91538127090301e-06, + "loss": 0.1637, + "step": 60675 + }, + { + "epoch": 2.39, + "grad_norm": 2.3306033444527405, + "learning_rate": 4.915339464882943e-06, + "loss": 0.166, + "step": 60700 + }, + { + "epoch": 2.39, + "grad_norm": 2.239829674488769, + "learning_rate": 4.915297658862877e-06, + "loss": 0.1721, + "step": 60725 + }, + { + "epoch": 2.39, + "grad_norm": 2.0716748422338145, + "learning_rate": 4.915255852842809e-06, + "loss": 0.1665, + "step": 60750 + }, + { + "epoch": 2.39, + "grad_norm": 2.3627489051626163, + "learning_rate": 4.915214046822743e-06, + "loss": 0.1656, + "step": 60775 + }, + { + "epoch": 2.39, + "grad_norm": 2.7947561118872883, + "learning_rate": 4.915172240802676e-06, + "loss": 0.1821, + "step": 60800 + }, + { + "epoch": 2.39, + "grad_norm": 2.481319736763784, + "learning_rate": 4.915130434782609e-06, + "loss": 0.1608, + "step": 60825 + }, + { + "epoch": 2.39, + "grad_norm": 1.5617043799686217, + "learning_rate": 4.915088628762542e-06, + "loss": 0.1556, + "step": 60850 + }, + { + "epoch": 2.39, + "grad_norm": 2.552707926658039, + "learning_rate": 4.915046822742475e-06, + "loss": 0.1748, + "step": 60875 + }, + { + "epoch": 2.4, + "grad_norm": 2.4439378238092067, + "learning_rate": 4.915005016722408e-06, + "loss": 0.1682, + "step": 60900 + }, + { + "epoch": 2.4, + "grad_norm": 1.998043095324892, + "learning_rate": 4.914963210702342e-06, + "loss": 0.1976, + "step": 60925 + }, + { + "epoch": 2.4, + "grad_norm": 2.2108208593309526, + "learning_rate": 4.914921404682274e-06, + "loss": 0.1552, + "step": 60950 + }, + { + "epoch": 2.4, + "grad_norm": 1.9371077350313313, + "learning_rate": 4.914879598662208e-06, + "loss": 0.1467, + "step": 60975 + }, + { + "epoch": 2.4, + "grad_norm": 1.7039801639447034, + "learning_rate": 4.9148377926421406e-06, + "loss": 0.1518, + "step": 61000 + }, + { + "epoch": 2.4, + "grad_norm": 1.915961676427643, + "learning_rate": 4.914795986622074e-06, + "loss": 0.1638, + "step": 61025 + }, + { + "epoch": 2.4, + "grad_norm": 1.9113673117236556, + "learning_rate": 4.914754180602007e-06, + "loss": 0.1676, + "step": 61050 + }, + { + "epoch": 2.4, + "grad_norm": 2.4662369168057308, + "learning_rate": 4.91471237458194e-06, + "loss": 0.1757, + "step": 61075 + }, + { + "epoch": 2.4, + "grad_norm": 2.0060579273255077, + "learning_rate": 4.914670568561873e-06, + "loss": 0.1525, + "step": 61100 + }, + { + "epoch": 2.4, + "grad_norm": 1.6452207663404552, + "learning_rate": 4.914628762541807e-06, + "loss": 0.186, + "step": 61125 + }, + { + "epoch": 2.41, + "grad_norm": 2.043111214852014, + "learning_rate": 4.914586956521739e-06, + "loss": 0.1721, + "step": 61150 + }, + { + "epoch": 2.41, + "grad_norm": 2.2998189089040224, + "learning_rate": 4.914546822742475e-06, + "loss": 0.173, + "step": 61175 + }, + { + "epoch": 2.41, + "grad_norm": 2.229036607651321, + "learning_rate": 4.914505016722409e-06, + "loss": 0.1677, + "step": 61200 + }, + { + "epoch": 2.41, + "grad_norm": 2.2040697290855484, + "learning_rate": 4.914463210702341e-06, + "loss": 0.1565, + "step": 61225 + }, + { + "epoch": 2.41, + "grad_norm": 1.975337279121286, + "learning_rate": 4.914421404682275e-06, + "loss": 0.1567, + "step": 61250 + }, + { + "epoch": 2.41, + "grad_norm": 2.1901711072719783, + "learning_rate": 4.914379598662208e-06, + "loss": 0.1615, + "step": 61275 + }, + { + "epoch": 2.41, + "grad_norm": 2.410651010100104, + "learning_rate": 4.914337792642141e-06, + "loss": 0.1596, + "step": 61300 + }, + { + "epoch": 2.41, + "grad_norm": 2.473836763687653, + "learning_rate": 4.914295986622074e-06, + "loss": 0.173, + "step": 61325 + }, + { + "epoch": 2.41, + "grad_norm": 2.0361562862320617, + "learning_rate": 4.914254180602007e-06, + "loss": 0.1705, + "step": 61350 + }, + { + "epoch": 2.41, + "grad_norm": 2.6879914919584564, + "learning_rate": 4.91421237458194e-06, + "loss": 0.1607, + "step": 61375 + }, + { + "epoch": 2.42, + "grad_norm": 2.157305173574173, + "learning_rate": 4.914170568561874e-06, + "loss": 0.1512, + "step": 61400 + }, + { + "epoch": 2.42, + "grad_norm": 1.9317567624460439, + "learning_rate": 4.914128762541806e-06, + "loss": 0.1569, + "step": 61425 + }, + { + "epoch": 2.42, + "grad_norm": 2.361584589222622, + "learning_rate": 4.91408695652174e-06, + "loss": 0.1638, + "step": 61450 + }, + { + "epoch": 2.42, + "grad_norm": 1.8644040646184097, + "learning_rate": 4.914045150501673e-06, + "loss": 0.1687, + "step": 61475 + }, + { + "epoch": 2.42, + "grad_norm": 2.198548588498155, + "learning_rate": 4.914003344481606e-06, + "loss": 0.154, + "step": 61500 + }, + { + "epoch": 2.42, + "grad_norm": 2.607394816578382, + "learning_rate": 4.913961538461539e-06, + "loss": 0.1646, + "step": 61525 + }, + { + "epoch": 2.42, + "grad_norm": 2.7313264618704025, + "learning_rate": 4.913919732441472e-06, + "loss": 0.1666, + "step": 61550 + }, + { + "epoch": 2.42, + "grad_norm": 1.8872158667961136, + "learning_rate": 4.913877926421405e-06, + "loss": 0.1594, + "step": 61575 + }, + { + "epoch": 2.42, + "grad_norm": 2.005983861142906, + "learning_rate": 4.913836120401338e-06, + "loss": 0.1671, + "step": 61600 + }, + { + "epoch": 2.42, + "grad_norm": 1.6811726737767227, + "learning_rate": 4.913794314381271e-06, + "loss": 0.1773, + "step": 61625 + }, + { + "epoch": 2.43, + "grad_norm": 2.4459555545243172, + "learning_rate": 4.913752508361204e-06, + "loss": 0.1666, + "step": 61650 + }, + { + "epoch": 2.43, + "grad_norm": 2.1227618361315246, + "learning_rate": 4.9137107023411376e-06, + "loss": 0.1716, + "step": 61675 + }, + { + "epoch": 2.43, + "grad_norm": 2.1285845172063067, + "learning_rate": 4.91366889632107e-06, + "loss": 0.1703, + "step": 61700 + }, + { + "epoch": 2.43, + "grad_norm": 3.090357456437784, + "learning_rate": 4.913627090301004e-06, + "loss": 0.1602, + "step": 61725 + }, + { + "epoch": 2.43, + "grad_norm": 2.0482938851822245, + "learning_rate": 4.9135852842809365e-06, + "loss": 0.1505, + "step": 61750 + }, + { + "epoch": 2.43, + "grad_norm": 2.5875726035509103, + "learning_rate": 4.91354347826087e-06, + "loss": 0.1724, + "step": 61775 + }, + { + "epoch": 2.43, + "grad_norm": 1.6512222827222043, + "learning_rate": 4.913501672240803e-06, + "loss": 0.1617, + "step": 61800 + }, + { + "epoch": 2.43, + "grad_norm": 1.8674513598205074, + "learning_rate": 4.913459866220736e-06, + "loss": 0.1602, + "step": 61825 + }, + { + "epoch": 2.43, + "grad_norm": 2.3103847629985763, + "learning_rate": 4.913419732441472e-06, + "loss": 0.1747, + "step": 61850 + }, + { + "epoch": 2.43, + "grad_norm": 2.3023261514395763, + "learning_rate": 4.913377926421405e-06, + "loss": 0.1581, + "step": 61875 + }, + { + "epoch": 2.44, + "grad_norm": 1.449709317162717, + "learning_rate": 4.913336120401338e-06, + "loss": 0.1742, + "step": 61900 + }, + { + "epoch": 2.44, + "grad_norm": 2.2658196543121774, + "learning_rate": 4.913294314381271e-06, + "loss": 0.1684, + "step": 61925 + }, + { + "epoch": 2.44, + "grad_norm": 2.2361849806007785, + "learning_rate": 4.913252508361205e-06, + "loss": 0.1745, + "step": 61950 + }, + { + "epoch": 2.44, + "grad_norm": 1.8218866051780491, + "learning_rate": 4.913210702341137e-06, + "loss": 0.1522, + "step": 61975 + }, + { + "epoch": 2.44, + "grad_norm": 3.1296586951775764, + "learning_rate": 4.913168896321071e-06, + "loss": 0.1695, + "step": 62000 + }, + { + "epoch": 2.44, + "grad_norm": 1.7770083377004013, + "learning_rate": 4.9131270903010036e-06, + "loss": 0.1667, + "step": 62025 + }, + { + "epoch": 2.44, + "grad_norm": 2.9390687351900326, + "learning_rate": 4.913085284280937e-06, + "loss": 0.1539, + "step": 62050 + }, + { + "epoch": 2.44, + "grad_norm": 1.7051591263423855, + "learning_rate": 4.91304347826087e-06, + "loss": 0.1547, + "step": 62075 + }, + { + "epoch": 2.44, + "grad_norm": 3.1409256857260597, + "learning_rate": 4.913001672240803e-06, + "loss": 0.176, + "step": 62100 + }, + { + "epoch": 2.44, + "grad_norm": 2.794092126918202, + "learning_rate": 4.912959866220736e-06, + "loss": 0.1676, + "step": 62125 + }, + { + "epoch": 2.45, + "grad_norm": 2.165282938697665, + "learning_rate": 4.91291806020067e-06, + "loss": 0.1711, + "step": 62150 + }, + { + "epoch": 2.45, + "grad_norm": 1.8794889473406264, + "learning_rate": 4.912876254180602e-06, + "loss": 0.1683, + "step": 62175 + }, + { + "epoch": 2.45, + "grad_norm": 1.753755884035197, + "learning_rate": 4.912834448160536e-06, + "loss": 0.1782, + "step": 62200 + }, + { + "epoch": 2.45, + "grad_norm": 1.691916719938932, + "learning_rate": 4.9127926421404685e-06, + "loss": 0.1764, + "step": 62225 + }, + { + "epoch": 2.45, + "grad_norm": 2.4777575790892183, + "learning_rate": 4.912750836120402e-06, + "loss": 0.1778, + "step": 62250 + }, + { + "epoch": 2.45, + "grad_norm": 2.082308380129261, + "learning_rate": 4.912709030100335e-06, + "loss": 0.1734, + "step": 62275 + }, + { + "epoch": 2.45, + "grad_norm": 2.5276306526812324, + "learning_rate": 4.912667224080268e-06, + "loss": 0.166, + "step": 62300 + }, + { + "epoch": 2.45, + "grad_norm": 1.0447729926081506, + "learning_rate": 4.912625418060201e-06, + "loss": 0.1519, + "step": 62325 + }, + { + "epoch": 2.45, + "grad_norm": 2.1623636996029796, + "learning_rate": 4.912583612040135e-06, + "loss": 0.1607, + "step": 62350 + }, + { + "epoch": 2.45, + "grad_norm": 2.06902259972889, + "learning_rate": 4.912541806020067e-06, + "loss": 0.1686, + "step": 62375 + }, + { + "epoch": 2.45, + "grad_norm": 2.4778763891548006, + "learning_rate": 4.912500000000001e-06, + "loss": 0.1682, + "step": 62400 + }, + { + "epoch": 2.46, + "grad_norm": 1.9091976161593263, + "learning_rate": 4.9124581939799335e-06, + "loss": 0.172, + "step": 62425 + }, + { + "epoch": 2.46, + "grad_norm": 2.913027598613694, + "learning_rate": 4.912416387959867e-06, + "loss": 0.1601, + "step": 62450 + }, + { + "epoch": 2.46, + "grad_norm": 2.007501580547803, + "learning_rate": 4.9123745819398e-06, + "loss": 0.1453, + "step": 62475 + }, + { + "epoch": 2.46, + "grad_norm": 1.8561817066776358, + "learning_rate": 4.9123327759197325e-06, + "loss": 0.1705, + "step": 62500 + }, + { + "epoch": 2.46, + "grad_norm": 2.161749847169862, + "learning_rate": 4.912290969899666e-06, + "loss": 0.1737, + "step": 62525 + }, + { + "epoch": 2.46, + "grad_norm": 2.1232595660705242, + "learning_rate": 4.912249163879599e-06, + "loss": 0.1574, + "step": 62550 + }, + { + "epoch": 2.46, + "grad_norm": 1.7837035969587884, + "learning_rate": 4.912207357859532e-06, + "loss": 0.1544, + "step": 62575 + }, + { + "epoch": 2.46, + "grad_norm": 2.8829737147072323, + "learning_rate": 4.912165551839465e-06, + "loss": 0.1546, + "step": 62600 + }, + { + "epoch": 2.46, + "grad_norm": 1.7869648211061147, + "learning_rate": 4.9121237458193985e-06, + "loss": 0.1605, + "step": 62625 + }, + { + "epoch": 2.46, + "grad_norm": 2.693915151097951, + "learning_rate": 4.912081939799331e-06, + "loss": 0.1612, + "step": 62650 + }, + { + "epoch": 2.47, + "grad_norm": 1.8652334580681769, + "learning_rate": 4.912040133779265e-06, + "loss": 0.1682, + "step": 62675 + }, + { + "epoch": 2.47, + "grad_norm": 3.4003444849061477, + "learning_rate": 4.9119983277591974e-06, + "loss": 0.1905, + "step": 62700 + }, + { + "epoch": 2.47, + "grad_norm": 2.3689247940924316, + "learning_rate": 4.911956521739131e-06, + "loss": 0.1614, + "step": 62725 + }, + { + "epoch": 2.47, + "grad_norm": 2.08385825279254, + "learning_rate": 4.911914715719064e-06, + "loss": 0.1698, + "step": 62750 + }, + { + "epoch": 2.47, + "grad_norm": 1.6497607238941634, + "learning_rate": 4.911872909698997e-06, + "loss": 0.16, + "step": 62775 + }, + { + "epoch": 2.47, + "grad_norm": 1.5394626117283912, + "learning_rate": 4.91183110367893e-06, + "loss": 0.1627, + "step": 62800 + }, + { + "epoch": 2.47, + "grad_norm": 2.20644693606779, + "learning_rate": 4.9117892976588635e-06, + "loss": 0.1673, + "step": 62825 + }, + { + "epoch": 2.47, + "grad_norm": 1.0547792480076508, + "learning_rate": 4.911747491638796e-06, + "loss": 0.1673, + "step": 62850 + }, + { + "epoch": 2.47, + "grad_norm": 2.143794378453494, + "learning_rate": 4.91170568561873e-06, + "loss": 0.154, + "step": 62875 + }, + { + "epoch": 2.47, + "grad_norm": 2.8088862278469335, + "learning_rate": 4.911663879598662e-06, + "loss": 0.1577, + "step": 62900 + }, + { + "epoch": 2.48, + "grad_norm": 2.5161033945231814, + "learning_rate": 4.911622073578596e-06, + "loss": 0.1738, + "step": 62925 + }, + { + "epoch": 2.48, + "grad_norm": 2.6053312739319967, + "learning_rate": 4.911580267558529e-06, + "loss": 0.1561, + "step": 62950 + }, + { + "epoch": 2.48, + "grad_norm": 3.093633337419838, + "learning_rate": 4.911538461538462e-06, + "loss": 0.1611, + "step": 62975 + }, + { + "epoch": 2.48, + "grad_norm": 1.5048885696579846, + "learning_rate": 4.911496655518395e-06, + "loss": 0.1537, + "step": 63000 + }, + { + "epoch": 2.48, + "grad_norm": 1.4850024515720803, + "learning_rate": 4.9114548494983285e-06, + "loss": 0.1749, + "step": 63025 + }, + { + "epoch": 2.48, + "grad_norm": 2.267747673130082, + "learning_rate": 4.911413043478261e-06, + "loss": 0.1647, + "step": 63050 + }, + { + "epoch": 2.48, + "grad_norm": 2.013113252947993, + "learning_rate": 4.911371237458195e-06, + "loss": 0.1516, + "step": 63075 + }, + { + "epoch": 2.48, + "grad_norm": 1.373805904984799, + "learning_rate": 4.911329431438127e-06, + "loss": 0.1644, + "step": 63100 + }, + { + "epoch": 2.48, + "grad_norm": 2.012563965344033, + "learning_rate": 4.911287625418061e-06, + "loss": 0.1786, + "step": 63125 + }, + { + "epoch": 2.48, + "grad_norm": 2.510353666710276, + "learning_rate": 4.911245819397994e-06, + "loss": 0.1585, + "step": 63150 + }, + { + "epoch": 2.49, + "grad_norm": 1.894923289500306, + "learning_rate": 4.911204013377927e-06, + "loss": 0.1642, + "step": 63175 + }, + { + "epoch": 2.49, + "grad_norm": 2.9141467079712533, + "learning_rate": 4.91116220735786e-06, + "loss": 0.1633, + "step": 63200 + }, + { + "epoch": 2.49, + "grad_norm": 2.1543105842708106, + "learning_rate": 4.911120401337793e-06, + "loss": 0.1817, + "step": 63225 + }, + { + "epoch": 2.49, + "grad_norm": 2.196066091671809, + "learning_rate": 4.911078595317726e-06, + "loss": 0.1654, + "step": 63250 + }, + { + "epoch": 2.49, + "grad_norm": 2.574609823964885, + "learning_rate": 4.911036789297659e-06, + "loss": 0.165, + "step": 63275 + }, + { + "epoch": 2.49, + "grad_norm": 1.7239441756225606, + "learning_rate": 4.910994983277592e-06, + "loss": 0.1449, + "step": 63300 + }, + { + "epoch": 2.49, + "grad_norm": 2.155397841796658, + "learning_rate": 4.910953177257525e-06, + "loss": 0.1639, + "step": 63325 + }, + { + "epoch": 2.49, + "grad_norm": 2.0676428516731504, + "learning_rate": 4.910911371237459e-06, + "loss": 0.1544, + "step": 63350 + }, + { + "epoch": 2.49, + "grad_norm": 2.908000877442016, + "learning_rate": 4.910869565217391e-06, + "loss": 0.1663, + "step": 63375 + }, + { + "epoch": 2.49, + "grad_norm": 1.2534265614289273, + "learning_rate": 4.910827759197325e-06, + "loss": 0.1467, + "step": 63400 + }, + { + "epoch": 2.5, + "grad_norm": 2.6717733780311184, + "learning_rate": 4.9107859531772576e-06, + "loss": 0.1734, + "step": 63425 + }, + { + "epoch": 2.5, + "grad_norm": 2.115600821956127, + "learning_rate": 4.910744147157191e-06, + "loss": 0.1538, + "step": 63450 + }, + { + "epoch": 2.5, + "grad_norm": 1.7077889016665388, + "learning_rate": 4.910702341137124e-06, + "loss": 0.1645, + "step": 63475 + }, + { + "epoch": 2.5, + "grad_norm": 2.079355530724143, + "learning_rate": 4.910660535117057e-06, + "loss": 0.1635, + "step": 63500 + }, + { + "epoch": 2.5, + "grad_norm": 1.6935640568962875, + "learning_rate": 4.91061872909699e-06, + "loss": 0.1669, + "step": 63525 + }, + { + "epoch": 2.5, + "grad_norm": 2.501349861253381, + "learning_rate": 4.910576923076924e-06, + "loss": 0.1618, + "step": 63550 + }, + { + "epoch": 2.5, + "grad_norm": 2.258413469914614, + "learning_rate": 4.910535117056856e-06, + "loss": 0.1594, + "step": 63575 + }, + { + "epoch": 2.5, + "grad_norm": 3.048614342796082, + "learning_rate": 4.91049331103679e-06, + "loss": 0.1712, + "step": 63600 + }, + { + "epoch": 2.5, + "grad_norm": 3.199999060663906, + "learning_rate": 4.9104515050167225e-06, + "loss": 0.1532, + "step": 63625 + }, + { + "epoch": 2.5, + "grad_norm": 1.0916681640164947, + "learning_rate": 4.910409698996656e-06, + "loss": 0.1793, + "step": 63650 + }, + { + "epoch": 2.51, + "grad_norm": 1.5617092875552199, + "learning_rate": 4.910367892976589e-06, + "loss": 0.1502, + "step": 63675 + }, + { + "epoch": 2.51, + "grad_norm": 2.431000733781789, + "learning_rate": 4.910326086956522e-06, + "loss": 0.1618, + "step": 63700 + }, + { + "epoch": 2.51, + "grad_norm": 2.03889062924083, + "learning_rate": 4.910284280936455e-06, + "loss": 0.1408, + "step": 63725 + }, + { + "epoch": 2.51, + "grad_norm": 3.1713570023080253, + "learning_rate": 4.9102424749163886e-06, + "loss": 0.1626, + "step": 63750 + }, + { + "epoch": 2.51, + "grad_norm": 1.4983183955799593, + "learning_rate": 4.910200668896321e-06, + "loss": 0.1668, + "step": 63775 + }, + { + "epoch": 2.51, + "grad_norm": 1.588454811441986, + "learning_rate": 4.910158862876255e-06, + "loss": 0.1438, + "step": 63800 + }, + { + "epoch": 2.51, + "grad_norm": 1.8740606752064477, + "learning_rate": 4.9101170568561875e-06, + "loss": 0.1613, + "step": 63825 + }, + { + "epoch": 2.51, + "grad_norm": 1.863894051691388, + "learning_rate": 4.910076923076923e-06, + "loss": 0.1711, + "step": 63850 + }, + { + "epoch": 2.51, + "grad_norm": 2.9926581104511647, + "learning_rate": 4.910035117056857e-06, + "loss": 0.1759, + "step": 63875 + }, + { + "epoch": 2.51, + "grad_norm": 1.9004004513452737, + "learning_rate": 4.90999331103679e-06, + "loss": 0.1798, + "step": 63900 + }, + { + "epoch": 2.51, + "grad_norm": 2.7288616511230606, + "learning_rate": 4.909951505016723e-06, + "loss": 0.1736, + "step": 63925 + }, + { + "epoch": 2.52, + "grad_norm": 2.0606551832809346, + "learning_rate": 4.909909698996656e-06, + "loss": 0.1632, + "step": 63950 + }, + { + "epoch": 2.52, + "grad_norm": 2.2478370244128305, + "learning_rate": 4.909867892976589e-06, + "loss": 0.1657, + "step": 63975 + }, + { + "epoch": 2.52, + "grad_norm": 1.6587952879693575, + "learning_rate": 4.909826086956522e-06, + "loss": 0.1754, + "step": 64000 + }, + { + "epoch": 2.52, + "grad_norm": 2.3087314952369464, + "learning_rate": 4.909784280936456e-06, + "loss": 0.1749, + "step": 64025 + }, + { + "epoch": 2.52, + "grad_norm": 3.886888875975629, + "learning_rate": 4.909742474916388e-06, + "loss": 0.1704, + "step": 64050 + }, + { + "epoch": 2.52, + "grad_norm": 2.5427882532308765, + "learning_rate": 4.909700668896322e-06, + "loss": 0.1669, + "step": 64075 + }, + { + "epoch": 2.52, + "grad_norm": 2.6760075354807005, + "learning_rate": 4.9096588628762546e-06, + "loss": 0.1585, + "step": 64100 + }, + { + "epoch": 2.52, + "grad_norm": 2.1830101383040494, + "learning_rate": 4.909617056856188e-06, + "loss": 0.1607, + "step": 64125 + }, + { + "epoch": 2.52, + "grad_norm": 2.215983424795264, + "learning_rate": 4.909575250836121e-06, + "loss": 0.1649, + "step": 64150 + }, + { + "epoch": 2.52, + "grad_norm": 2.471626141069725, + "learning_rate": 4.9095334448160535e-06, + "loss": 0.1478, + "step": 64175 + }, + { + "epoch": 2.53, + "grad_norm": 2.65952247452563, + "learning_rate": 4.909491638795987e-06, + "loss": 0.1804, + "step": 64200 + }, + { + "epoch": 2.53, + "grad_norm": 1.6232088149469335, + "learning_rate": 4.90944983277592e-06, + "loss": 0.1594, + "step": 64225 + }, + { + "epoch": 2.53, + "grad_norm": 2.030118235294256, + "learning_rate": 4.909408026755853e-06, + "loss": 0.1521, + "step": 64250 + }, + { + "epoch": 2.53, + "grad_norm": 2.872547185240935, + "learning_rate": 4.909366220735786e-06, + "loss": 0.1704, + "step": 64275 + }, + { + "epoch": 2.53, + "grad_norm": 2.5078969587091784, + "learning_rate": 4.9093244147157195e-06, + "loss": 0.1573, + "step": 64300 + }, + { + "epoch": 2.53, + "grad_norm": 1.9992021677403815, + "learning_rate": 4.909282608695652e-06, + "loss": 0.1498, + "step": 64325 + }, + { + "epoch": 2.53, + "grad_norm": 2.503697200714061, + "learning_rate": 4.909240802675586e-06, + "loss": 0.1568, + "step": 64350 + }, + { + "epoch": 2.53, + "grad_norm": 1.9626698849941484, + "learning_rate": 4.9091989966555185e-06, + "loss": 0.1539, + "step": 64375 + }, + { + "epoch": 2.53, + "grad_norm": 2.490978407286913, + "learning_rate": 4.909157190635452e-06, + "loss": 0.1551, + "step": 64400 + }, + { + "epoch": 2.53, + "grad_norm": 1.783432543381823, + "learning_rate": 4.909115384615385e-06, + "loss": 0.1824, + "step": 64425 + }, + { + "epoch": 2.54, + "grad_norm": 2.0354994054583373, + "learning_rate": 4.909073578595318e-06, + "loss": 0.1626, + "step": 64450 + }, + { + "epoch": 2.54, + "grad_norm": 2.9963627176197933, + "learning_rate": 4.909031772575251e-06, + "loss": 0.1672, + "step": 64475 + }, + { + "epoch": 2.54, + "grad_norm": 2.57838119622678, + "learning_rate": 4.9089899665551845e-06, + "loss": 0.1816, + "step": 64500 + }, + { + "epoch": 2.54, + "grad_norm": 2.4899750928360356, + "learning_rate": 4.908948160535117e-06, + "loss": 0.1634, + "step": 64525 + }, + { + "epoch": 2.54, + "grad_norm": 2.1326207915213065, + "learning_rate": 4.908906354515051e-06, + "loss": 0.163, + "step": 64550 + }, + { + "epoch": 2.54, + "grad_norm": 1.7329610880996327, + "learning_rate": 4.9088645484949835e-06, + "loss": 0.1631, + "step": 64575 + }, + { + "epoch": 2.54, + "grad_norm": 1.597797070784199, + "learning_rate": 4.908822742474917e-06, + "loss": 0.1481, + "step": 64600 + }, + { + "epoch": 2.54, + "grad_norm": 1.593120783537612, + "learning_rate": 4.90878093645485e-06, + "loss": 0.1619, + "step": 64625 + }, + { + "epoch": 2.54, + "grad_norm": 1.3340178656573458, + "learning_rate": 4.908739130434783e-06, + "loss": 0.1619, + "step": 64650 + }, + { + "epoch": 2.54, + "grad_norm": 3.027825908379043, + "learning_rate": 4.908697324414716e-06, + "loss": 0.1784, + "step": 64675 + }, + { + "epoch": 2.55, + "grad_norm": 2.3061907558400665, + "learning_rate": 4.9086555183946495e-06, + "loss": 0.1728, + "step": 64700 + }, + { + "epoch": 2.55, + "grad_norm": 1.8145354193239998, + "learning_rate": 4.908613712374582e-06, + "loss": 0.1677, + "step": 64725 + }, + { + "epoch": 2.55, + "grad_norm": 3.2098623467674083, + "learning_rate": 4.908571906354516e-06, + "loss": 0.1799, + "step": 64750 + }, + { + "epoch": 2.55, + "grad_norm": 1.9804590850895585, + "learning_rate": 4.9085301003344484e-06, + "loss": 0.1655, + "step": 64775 + }, + { + "epoch": 2.55, + "grad_norm": 2.647426146096992, + "learning_rate": 4.908488294314382e-06, + "loss": 0.1464, + "step": 64800 + }, + { + "epoch": 2.55, + "grad_norm": 2.9055118736851995, + "learning_rate": 4.908446488294315e-06, + "loss": 0.1771, + "step": 64825 + }, + { + "epoch": 2.55, + "grad_norm": 1.8514368286597422, + "learning_rate": 4.9084063545150505e-06, + "loss": 0.174, + "step": 64850 + }, + { + "epoch": 2.55, + "grad_norm": 2.9887629605918113, + "learning_rate": 4.908364548494984e-06, + "loss": 0.1621, + "step": 64875 + }, + { + "epoch": 2.55, + "grad_norm": 2.4961759409869595, + "learning_rate": 4.908322742474917e-06, + "loss": 0.1472, + "step": 64900 + }, + { + "epoch": 2.55, + "grad_norm": 1.4943853813301795, + "learning_rate": 4.90828093645485e-06, + "loss": 0.159, + "step": 64925 + }, + { + "epoch": 2.56, + "grad_norm": 2.1198505134386583, + "learning_rate": 4.908239130434783e-06, + "loss": 0.1427, + "step": 64950 + }, + { + "epoch": 2.56, + "grad_norm": 1.9455591466937527, + "learning_rate": 4.9081973244147165e-06, + "loss": 0.1655, + "step": 64975 + }, + { + "epoch": 2.56, + "grad_norm": 2.276710046339653, + "learning_rate": 4.908155518394649e-06, + "loss": 0.1622, + "step": 65000 + }, + { + "epoch": 2.56, + "grad_norm": 2.112276112652573, + "learning_rate": 4.908113712374583e-06, + "loss": 0.1697, + "step": 65025 + }, + { + "epoch": 2.56, + "grad_norm": 2.465633305808334, + "learning_rate": 4.9080719063545155e-06, + "loss": 0.154, + "step": 65050 + }, + { + "epoch": 2.56, + "grad_norm": 2.393310551281838, + "learning_rate": 4.908030100334449e-06, + "loss": 0.1471, + "step": 65075 + }, + { + "epoch": 2.56, + "grad_norm": 2.4821237816549835, + "learning_rate": 4.907988294314382e-06, + "loss": 0.1585, + "step": 65100 + }, + { + "epoch": 2.56, + "grad_norm": 2.3884124180340605, + "learning_rate": 4.9079464882943144e-06, + "loss": 0.1573, + "step": 65125 + }, + { + "epoch": 2.56, + "grad_norm": 2.8117247832441796, + "learning_rate": 4.907904682274248e-06, + "loss": 0.1629, + "step": 65150 + }, + { + "epoch": 2.56, + "grad_norm": 2.8371308882763624, + "learning_rate": 4.907862876254181e-06, + "loss": 0.164, + "step": 65175 + }, + { + "epoch": 2.57, + "grad_norm": 2.677839446744097, + "learning_rate": 4.907821070234114e-06, + "loss": 0.1646, + "step": 65200 + }, + { + "epoch": 2.57, + "grad_norm": 2.1656191975374735, + "learning_rate": 4.907779264214047e-06, + "loss": 0.1666, + "step": 65225 + }, + { + "epoch": 2.57, + "grad_norm": 1.9170574359950836, + "learning_rate": 4.9077374581939805e-06, + "loss": 0.1752, + "step": 65250 + }, + { + "epoch": 2.57, + "grad_norm": 2.2651721052092806, + "learning_rate": 4.907695652173913e-06, + "loss": 0.1652, + "step": 65275 + }, + { + "epoch": 2.57, + "grad_norm": 2.848088945866107, + "learning_rate": 4.907653846153847e-06, + "loss": 0.1677, + "step": 65300 + }, + { + "epoch": 2.57, + "grad_norm": 2.3172246339739884, + "learning_rate": 4.907612040133779e-06, + "loss": 0.1718, + "step": 65325 + }, + { + "epoch": 2.57, + "grad_norm": 2.8405472169057213, + "learning_rate": 4.907570234113713e-06, + "loss": 0.1616, + "step": 65350 + }, + { + "epoch": 2.57, + "grad_norm": 1.917029554946475, + "learning_rate": 4.907528428093646e-06, + "loss": 0.1669, + "step": 65375 + }, + { + "epoch": 2.57, + "grad_norm": 2.2302027109018785, + "learning_rate": 4.907486622073579e-06, + "loss": 0.1707, + "step": 65400 + }, + { + "epoch": 2.57, + "grad_norm": 2.1756400588268625, + "learning_rate": 4.907444816053512e-06, + "loss": 0.1797, + "step": 65425 + }, + { + "epoch": 2.57, + "grad_norm": 2.441329759624097, + "learning_rate": 4.9074030100334454e-06, + "loss": 0.1632, + "step": 65450 + }, + { + "epoch": 2.58, + "grad_norm": 2.6632498992499993, + "learning_rate": 4.907361204013378e-06, + "loss": 0.185, + "step": 65475 + }, + { + "epoch": 2.58, + "grad_norm": 1.7352876275637894, + "learning_rate": 4.907319397993312e-06, + "loss": 0.1627, + "step": 65500 + }, + { + "epoch": 2.58, + "grad_norm": 2.4227494238068776, + "learning_rate": 4.907277591973244e-06, + "loss": 0.1605, + "step": 65525 + }, + { + "epoch": 2.58, + "grad_norm": 2.076807380471068, + "learning_rate": 4.907235785953178e-06, + "loss": 0.1692, + "step": 65550 + }, + { + "epoch": 2.58, + "grad_norm": 1.9389772507642347, + "learning_rate": 4.907193979933111e-06, + "loss": 0.1554, + "step": 65575 + }, + { + "epoch": 2.58, + "grad_norm": 2.548984641939041, + "learning_rate": 4.907152173913044e-06, + "loss": 0.1627, + "step": 65600 + }, + { + "epoch": 2.58, + "grad_norm": 2.6788514190264143, + "learning_rate": 4.907110367892977e-06, + "loss": 0.1436, + "step": 65625 + }, + { + "epoch": 2.58, + "grad_norm": 3.1995893686446286, + "learning_rate": 4.90706856187291e-06, + "loss": 0.178, + "step": 65650 + }, + { + "epoch": 2.58, + "grad_norm": 1.6593985754679415, + "learning_rate": 4.907026755852843e-06, + "loss": 0.1685, + "step": 65675 + }, + { + "epoch": 2.58, + "grad_norm": 2.317242944867583, + "learning_rate": 4.906984949832777e-06, + "loss": 0.1656, + "step": 65700 + }, + { + "epoch": 2.59, + "grad_norm": 2.8102006179505628, + "learning_rate": 4.906943143812709e-06, + "loss": 0.1587, + "step": 65725 + }, + { + "epoch": 2.59, + "grad_norm": 2.201863264339552, + "learning_rate": 4.906901337792643e-06, + "loss": 0.1607, + "step": 65750 + }, + { + "epoch": 2.59, + "grad_norm": 1.702427793526987, + "learning_rate": 4.906859531772576e-06, + "loss": 0.1755, + "step": 65775 + }, + { + "epoch": 2.59, + "grad_norm": 2.6686775607203446, + "learning_rate": 4.906817725752509e-06, + "loss": 0.1396, + "step": 65800 + }, + { + "epoch": 2.59, + "grad_norm": 2.6544320471662886, + "learning_rate": 4.906775919732442e-06, + "loss": 0.1568, + "step": 65825 + }, + { + "epoch": 2.59, + "grad_norm": 0.7099841241182607, + "learning_rate": 4.906735785953178e-06, + "loss": 0.1543, + "step": 65850 + }, + { + "epoch": 2.59, + "grad_norm": 1.9112198873388229, + "learning_rate": 4.906693979933111e-06, + "loss": 0.1526, + "step": 65875 + }, + { + "epoch": 2.59, + "grad_norm": 2.3657780976079748, + "learning_rate": 4.906652173913044e-06, + "loss": 0.1566, + "step": 65900 + }, + { + "epoch": 2.59, + "grad_norm": 1.9480778857317624, + "learning_rate": 4.9066103678929775e-06, + "loss": 0.1585, + "step": 65925 + }, + { + "epoch": 2.59, + "grad_norm": 2.942927544287174, + "learning_rate": 4.90656856187291e-06, + "loss": 0.1671, + "step": 65950 + }, + { + "epoch": 2.6, + "grad_norm": 1.9627043489734612, + "learning_rate": 4.906526755852844e-06, + "loss": 0.1725, + "step": 65975 + }, + { + "epoch": 2.6, + "grad_norm": 1.696701666683548, + "learning_rate": 4.906486622073579e-06, + "loss": 0.1576, + "step": 66000 + }, + { + "epoch": 2.6, + "grad_norm": 1.4217429930751502, + "learning_rate": 4.906444816053512e-06, + "loss": 0.164, + "step": 66025 + }, + { + "epoch": 2.6, + "grad_norm": 2.549382249171763, + "learning_rate": 4.906403010033445e-06, + "loss": 0.1618, + "step": 66050 + }, + { + "epoch": 2.6, + "grad_norm": 2.2060418166923053, + "learning_rate": 4.9063612040133785e-06, + "loss": 0.1752, + "step": 66075 + }, + { + "epoch": 2.6, + "grad_norm": 2.526241507559611, + "learning_rate": 4.906319397993311e-06, + "loss": 0.1661, + "step": 66100 + }, + { + "epoch": 2.6, + "grad_norm": 2.024390593256092, + "learning_rate": 4.906277591973245e-06, + "loss": 0.1674, + "step": 66125 + }, + { + "epoch": 2.6, + "grad_norm": 2.5548120313238436, + "learning_rate": 4.9062357859531774e-06, + "loss": 0.1555, + "step": 66150 + }, + { + "epoch": 2.6, + "grad_norm": 1.7203300785919162, + "learning_rate": 4.906193979933111e-06, + "loss": 0.1654, + "step": 66175 + }, + { + "epoch": 2.6, + "grad_norm": 2.1977595422357936, + "learning_rate": 4.9061521739130445e-06, + "loss": 0.1771, + "step": 66200 + }, + { + "epoch": 2.61, + "grad_norm": 1.7535093547919756, + "learning_rate": 4.906110367892977e-06, + "loss": 0.1603, + "step": 66225 + }, + { + "epoch": 2.61, + "grad_norm": 2.154775437570024, + "learning_rate": 4.906068561872911e-06, + "loss": 0.1671, + "step": 66250 + }, + { + "epoch": 2.61, + "grad_norm": 2.957733586615425, + "learning_rate": 4.906026755852843e-06, + "loss": 0.1551, + "step": 66275 + }, + { + "epoch": 2.61, + "grad_norm": 2.017968097634726, + "learning_rate": 4.905984949832776e-06, + "loss": 0.1718, + "step": 66300 + }, + { + "epoch": 2.61, + "grad_norm": 2.1178933664306134, + "learning_rate": 4.905943143812709e-06, + "loss": 0.1418, + "step": 66325 + }, + { + "epoch": 2.61, + "grad_norm": 2.2771629555356636, + "learning_rate": 4.905901337792642e-06, + "loss": 0.1591, + "step": 66350 + }, + { + "epoch": 2.61, + "grad_norm": 2.114269650050633, + "learning_rate": 4.905859531772575e-06, + "loss": 0.157, + "step": 66375 + }, + { + "epoch": 2.61, + "grad_norm": 1.8226274914884266, + "learning_rate": 4.905817725752509e-06, + "loss": 0.1586, + "step": 66400 + }, + { + "epoch": 2.61, + "grad_norm": 2.2937073243363506, + "learning_rate": 4.905775919732441e-06, + "loss": 0.1605, + "step": 66425 + }, + { + "epoch": 2.61, + "grad_norm": 1.521627303225354, + "learning_rate": 4.905734113712375e-06, + "loss": 0.1731, + "step": 66450 + }, + { + "epoch": 2.62, + "grad_norm": 2.1538938575588253, + "learning_rate": 4.905692307692308e-06, + "loss": 0.1662, + "step": 66475 + }, + { + "epoch": 2.62, + "grad_norm": 2.1191327411063714, + "learning_rate": 4.905650501672241e-06, + "loss": 0.1563, + "step": 66500 + }, + { + "epoch": 2.62, + "grad_norm": 1.9074138182924498, + "learning_rate": 4.905608695652174e-06, + "loss": 0.155, + "step": 66525 + }, + { + "epoch": 2.62, + "grad_norm": 2.914625498132579, + "learning_rate": 4.905566889632107e-06, + "loss": 0.1624, + "step": 66550 + }, + { + "epoch": 2.62, + "grad_norm": 2.6539242263530385, + "learning_rate": 4.90552508361204e-06, + "loss": 0.1714, + "step": 66575 + }, + { + "epoch": 2.62, + "grad_norm": 2.088674751899696, + "learning_rate": 4.905483277591974e-06, + "loss": 0.1714, + "step": 66600 + }, + { + "epoch": 2.62, + "grad_norm": 2.2135373112639694, + "learning_rate": 4.905441471571907e-06, + "loss": 0.1584, + "step": 66625 + }, + { + "epoch": 2.62, + "grad_norm": 1.4192960242700194, + "learning_rate": 4.90539966555184e-06, + "loss": 0.1591, + "step": 66650 + }, + { + "epoch": 2.62, + "grad_norm": 1.7390386910080664, + "learning_rate": 4.905357859531773e-06, + "loss": 0.1769, + "step": 66675 + }, + { + "epoch": 2.62, + "grad_norm": 2.232094694142498, + "learning_rate": 4.905316053511706e-06, + "loss": 0.1455, + "step": 66700 + }, + { + "epoch": 2.63, + "grad_norm": 3.0138843173390297, + "learning_rate": 4.90527424749164e-06, + "loss": 0.1514, + "step": 66725 + }, + { + "epoch": 2.63, + "grad_norm": 2.0070506284026393, + "learning_rate": 4.905232441471572e-06, + "loss": 0.1866, + "step": 66750 + }, + { + "epoch": 2.63, + "grad_norm": 1.4365348345881692, + "learning_rate": 4.905190635451506e-06, + "loss": 0.1739, + "step": 66775 + }, + { + "epoch": 2.63, + "grad_norm": 2.0958303312111766, + "learning_rate": 4.905148829431439e-06, + "loss": 0.1554, + "step": 66800 + }, + { + "epoch": 2.63, + "grad_norm": 2.4725145319946504, + "learning_rate": 4.905107023411372e-06, + "loss": 0.1675, + "step": 66825 + }, + { + "epoch": 2.63, + "grad_norm": 2.718854874162693, + "learning_rate": 4.905065217391305e-06, + "loss": 0.1548, + "step": 66850 + }, + { + "epoch": 2.63, + "grad_norm": 3.0116226602644884, + "learning_rate": 4.905023411371238e-06, + "loss": 0.1478, + "step": 66875 + }, + { + "epoch": 2.63, + "grad_norm": 2.8899799082771738, + "learning_rate": 4.904981605351171e-06, + "loss": 0.161, + "step": 66900 + }, + { + "epoch": 2.63, + "grad_norm": 2.3658188452538966, + "learning_rate": 4.904939799331105e-06, + "loss": 0.1599, + "step": 66925 + }, + { + "epoch": 2.63, + "grad_norm": 2.509051426616327, + "learning_rate": 4.904897993311037e-06, + "loss": 0.1614, + "step": 66950 + }, + { + "epoch": 2.63, + "grad_norm": 3.211165300889533, + "learning_rate": 4.904856187290971e-06, + "loss": 0.1553, + "step": 66975 + }, + { + "epoch": 2.64, + "grad_norm": 1.6733992628510892, + "learning_rate": 4.904814381270903e-06, + "loss": 0.1711, + "step": 67000 + }, + { + "epoch": 2.64, + "grad_norm": 2.3085595836756876, + "learning_rate": 4.904772575250836e-06, + "loss": 0.167, + "step": 67025 + }, + { + "epoch": 2.64, + "grad_norm": 2.9511587121489784, + "learning_rate": 4.90473076923077e-06, + "loss": 0.1802, + "step": 67050 + }, + { + "epoch": 2.64, + "grad_norm": 1.7009959258288003, + "learning_rate": 4.9046889632107025e-06, + "loss": 0.1695, + "step": 67075 + }, + { + "epoch": 2.64, + "grad_norm": 1.861255834759058, + "learning_rate": 4.904647157190636e-06, + "loss": 0.1708, + "step": 67100 + }, + { + "epoch": 2.64, + "grad_norm": 2.332246734287938, + "learning_rate": 4.904605351170569e-06, + "loss": 0.1647, + "step": 67125 + }, + { + "epoch": 2.64, + "grad_norm": 3.0582558350326523, + "learning_rate": 4.904563545150502e-06, + "loss": 0.1568, + "step": 67150 + }, + { + "epoch": 2.64, + "grad_norm": 2.838261030151132, + "learning_rate": 4.904521739130435e-06, + "loss": 0.1527, + "step": 67175 + }, + { + "epoch": 2.64, + "grad_norm": 2.9315235968632813, + "learning_rate": 4.9044799331103686e-06, + "loss": 0.1829, + "step": 67200 + }, + { + "epoch": 2.64, + "grad_norm": 2.167459167955874, + "learning_rate": 4.904438127090301e-06, + "loss": 0.1524, + "step": 67225 + }, + { + "epoch": 2.65, + "grad_norm": 3.8651054355321226, + "learning_rate": 4.904396321070235e-06, + "loss": 0.1765, + "step": 67250 + }, + { + "epoch": 2.65, + "grad_norm": 1.974004440932649, + "learning_rate": 4.9043545150501675e-06, + "loss": 0.179, + "step": 67275 + }, + { + "epoch": 2.65, + "grad_norm": 1.7036311477514448, + "learning_rate": 4.904312709030101e-06, + "loss": 0.1502, + "step": 67300 + }, + { + "epoch": 2.65, + "grad_norm": 2.3390162169708817, + "learning_rate": 4.904270903010034e-06, + "loss": 0.1632, + "step": 67325 + }, + { + "epoch": 2.65, + "grad_norm": 2.763794342227092, + "learning_rate": 4.904229096989967e-06, + "loss": 0.177, + "step": 67350 + }, + { + "epoch": 2.65, + "grad_norm": 2.5359226898254685, + "learning_rate": 4.9041872909699e-06, + "loss": 0.1675, + "step": 67375 + }, + { + "epoch": 2.65, + "grad_norm": 3.1775710087532154, + "learning_rate": 4.9041454849498335e-06, + "loss": 0.1738, + "step": 67400 + }, + { + "epoch": 2.65, + "grad_norm": 2.345033829775269, + "learning_rate": 4.904103678929766e-06, + "loss": 0.1675, + "step": 67425 + }, + { + "epoch": 2.65, + "grad_norm": 2.336885577490091, + "learning_rate": 4.9040618729097e-06, + "loss": 0.1517, + "step": 67450 + }, + { + "epoch": 2.65, + "grad_norm": 2.4565329151898867, + "learning_rate": 4.9040200668896325e-06, + "loss": 0.1669, + "step": 67475 + }, + { + "epoch": 2.66, + "grad_norm": 2.5585596954556937, + "learning_rate": 4.903978260869566e-06, + "loss": 0.1685, + "step": 67500 + }, + { + "epoch": 2.66, + "grad_norm": 1.7937763982225357, + "learning_rate": 4.903936454849499e-06, + "loss": 0.1479, + "step": 67525 + }, + { + "epoch": 2.66, + "grad_norm": 1.6019104450697434, + "learning_rate": 4.903894648829432e-06, + "loss": 0.1623, + "step": 67550 + }, + { + "epoch": 2.66, + "grad_norm": 2.4719412168073753, + "learning_rate": 4.903852842809365e-06, + "loss": 0.1754, + "step": 67575 + }, + { + "epoch": 2.66, + "grad_norm": 2.370947438562601, + "learning_rate": 4.9038110367892985e-06, + "loss": 0.1644, + "step": 67600 + }, + { + "epoch": 2.66, + "grad_norm": 3.131913579404752, + "learning_rate": 4.903769230769231e-06, + "loss": 0.169, + "step": 67625 + }, + { + "epoch": 2.66, + "grad_norm": 1.743261368460137, + "learning_rate": 4.903727424749165e-06, + "loss": 0.1668, + "step": 67650 + }, + { + "epoch": 2.66, + "grad_norm": 1.3480808608244685, + "learning_rate": 4.9036856187290975e-06, + "loss": 0.164, + "step": 67675 + }, + { + "epoch": 2.66, + "grad_norm": 1.8534317999026895, + "learning_rate": 4.90364381270903e-06, + "loss": 0.1497, + "step": 67700 + }, + { + "epoch": 2.66, + "grad_norm": 2.960335057951582, + "learning_rate": 4.903602006688964e-06, + "loss": 0.1727, + "step": 67725 + }, + { + "epoch": 2.67, + "grad_norm": 2.423990432992723, + "learning_rate": 4.903560200668896e-06, + "loss": 0.1529, + "step": 67750 + }, + { + "epoch": 2.67, + "grad_norm": 2.0462023472811737, + "learning_rate": 4.90351839464883e-06, + "loss": 0.1727, + "step": 67775 + }, + { + "epoch": 2.67, + "grad_norm": 2.6099807901854786, + "learning_rate": 4.903476588628763e-06, + "loss": 0.1578, + "step": 67800 + }, + { + "epoch": 2.67, + "grad_norm": 2.170286265230943, + "learning_rate": 4.903434782608696e-06, + "loss": 0.1584, + "step": 67825 + }, + { + "epoch": 2.67, + "grad_norm": 2.3937380773545813, + "learning_rate": 4.903392976588629e-06, + "loss": 0.1791, + "step": 67850 + }, + { + "epoch": 2.67, + "grad_norm": 1.5374488563219373, + "learning_rate": 4.9033511705685624e-06, + "loss": 0.1427, + "step": 67875 + }, + { + "epoch": 2.67, + "grad_norm": 3.2583070199750472, + "learning_rate": 4.903309364548495e-06, + "loss": 0.1636, + "step": 67900 + }, + { + "epoch": 2.67, + "grad_norm": 1.8468048091641243, + "learning_rate": 4.903267558528429e-06, + "loss": 0.1519, + "step": 67925 + }, + { + "epoch": 2.67, + "grad_norm": 2.6291588159527373, + "learning_rate": 4.903225752508361e-06, + "loss": 0.1758, + "step": 67950 + }, + { + "epoch": 2.67, + "grad_norm": 3.7050360639335347, + "learning_rate": 4.903183946488295e-06, + "loss": 0.1749, + "step": 67975 + }, + { + "epoch": 2.68, + "grad_norm": 1.7682431370811424, + "learning_rate": 4.90314381270903e-06, + "loss": 0.1573, + "step": 68000 + }, + { + "epoch": 2.68, + "grad_norm": 2.3824688003115226, + "learning_rate": 4.9031020066889635e-06, + "loss": 0.1468, + "step": 68025 + }, + { + "epoch": 2.68, + "grad_norm": 1.7485707314388916, + "learning_rate": 4.903060200668896e-06, + "loss": 0.1615, + "step": 68050 + }, + { + "epoch": 2.68, + "grad_norm": 1.7128641702363026, + "learning_rate": 4.90301839464883e-06, + "loss": 0.1592, + "step": 68075 + }, + { + "epoch": 2.68, + "grad_norm": 2.075482147796586, + "learning_rate": 4.902976588628762e-06, + "loss": 0.1352, + "step": 68100 + }, + { + "epoch": 2.68, + "grad_norm": 1.6209920469074521, + "learning_rate": 4.902934782608696e-06, + "loss": 0.1712, + "step": 68125 + }, + { + "epoch": 2.68, + "grad_norm": 2.2059154225617035, + "learning_rate": 4.902892976588629e-06, + "loss": 0.1582, + "step": 68150 + }, + { + "epoch": 2.68, + "grad_norm": 1.5679563332864526, + "learning_rate": 4.902851170568562e-06, + "loss": 0.1644, + "step": 68175 + }, + { + "epoch": 2.68, + "grad_norm": 1.3764731345936694, + "learning_rate": 4.902811036789298e-06, + "loss": 0.1656, + "step": 68200 + }, + { + "epoch": 2.68, + "grad_norm": 0.9139328926308926, + "learning_rate": 4.902769230769231e-06, + "loss": 0.167, + "step": 68225 + }, + { + "epoch": 2.68, + "grad_norm": 2.2718914414232003, + "learning_rate": 4.902727424749164e-06, + "loss": 0.1524, + "step": 68250 + }, + { + "epoch": 2.69, + "grad_norm": 2.0111405659125294, + "learning_rate": 4.902685618729097e-06, + "loss": 0.1858, + "step": 68275 + }, + { + "epoch": 2.69, + "grad_norm": 2.2865058081144105, + "learning_rate": 4.9026438127090305e-06, + "loss": 0.1537, + "step": 68300 + }, + { + "epoch": 2.69, + "grad_norm": 2.1726475243792116, + "learning_rate": 4.902602006688963e-06, + "loss": 0.1633, + "step": 68325 + }, + { + "epoch": 2.69, + "grad_norm": 2.8799278645516595, + "learning_rate": 4.902560200668897e-06, + "loss": 0.1432, + "step": 68350 + }, + { + "epoch": 2.69, + "grad_norm": 3.1468444524866888, + "learning_rate": 4.9025183946488294e-06, + "loss": 0.1729, + "step": 68375 + }, + { + "epoch": 2.69, + "grad_norm": 1.6869292960241524, + "learning_rate": 4.902476588628763e-06, + "loss": 0.1692, + "step": 68400 + }, + { + "epoch": 2.69, + "grad_norm": 2.2943763214968387, + "learning_rate": 4.902434782608696e-06, + "loss": 0.1523, + "step": 68425 + }, + { + "epoch": 2.69, + "grad_norm": 0.9078772412947974, + "learning_rate": 4.902392976588629e-06, + "loss": 0.1489, + "step": 68450 + }, + { + "epoch": 2.69, + "grad_norm": 2.624638040883813, + "learning_rate": 4.902351170568562e-06, + "loss": 0.1663, + "step": 68475 + }, + { + "epoch": 2.69, + "grad_norm": 2.0599344273565725, + "learning_rate": 4.9023093645484955e-06, + "loss": 0.1476, + "step": 68500 + }, + { + "epoch": 2.7, + "grad_norm": 2.132918351620154, + "learning_rate": 4.902267558528428e-06, + "loss": 0.1706, + "step": 68525 + }, + { + "epoch": 2.7, + "grad_norm": 2.2024926639954443, + "learning_rate": 4.902225752508362e-06, + "loss": 0.1638, + "step": 68550 + }, + { + "epoch": 2.7, + "grad_norm": 2.00228229652154, + "learning_rate": 4.9021839464882944e-06, + "loss": 0.1745, + "step": 68575 + }, + { + "epoch": 2.7, + "grad_norm": 2.4173513208178314, + "learning_rate": 4.902142140468228e-06, + "loss": 0.1667, + "step": 68600 + }, + { + "epoch": 2.7, + "grad_norm": 1.4535956046540373, + "learning_rate": 4.902100334448161e-06, + "loss": 0.1872, + "step": 68625 + }, + { + "epoch": 2.7, + "grad_norm": 1.5770697206236601, + "learning_rate": 4.902058528428094e-06, + "loss": 0.1584, + "step": 68650 + }, + { + "epoch": 2.7, + "grad_norm": 2.1907711996947206, + "learning_rate": 4.902016722408027e-06, + "loss": 0.1614, + "step": 68675 + }, + { + "epoch": 2.7, + "grad_norm": 1.5996563388129679, + "learning_rate": 4.9019749163879605e-06, + "loss": 0.1528, + "step": 68700 + }, + { + "epoch": 2.7, + "grad_norm": 1.5217021335901246, + "learning_rate": 4.901933110367893e-06, + "loss": 0.1583, + "step": 68725 + }, + { + "epoch": 2.7, + "grad_norm": 2.830636659279085, + "learning_rate": 4.901891304347827e-06, + "loss": 0.16, + "step": 68750 + }, + { + "epoch": 2.71, + "grad_norm": 3.132431011223764, + "learning_rate": 4.901849498327759e-06, + "loss": 0.1755, + "step": 68775 + }, + { + "epoch": 2.71, + "grad_norm": 2.954810412804257, + "learning_rate": 4.901807692307693e-06, + "loss": 0.1593, + "step": 68800 + }, + { + "epoch": 2.71, + "grad_norm": 1.9873971899648006, + "learning_rate": 4.901765886287626e-06, + "loss": 0.1576, + "step": 68825 + }, + { + "epoch": 2.71, + "grad_norm": 1.8546844581500215, + "learning_rate": 4.901724080267559e-06, + "loss": 0.1486, + "step": 68850 + }, + { + "epoch": 2.71, + "grad_norm": 2.699256024899676, + "learning_rate": 4.901682274247492e-06, + "loss": 0.1497, + "step": 68875 + }, + { + "epoch": 2.71, + "grad_norm": 2.1603180169557, + "learning_rate": 4.901640468227425e-06, + "loss": 0.159, + "step": 68900 + }, + { + "epoch": 2.71, + "grad_norm": 1.9479327767344141, + "learning_rate": 4.901598662207358e-06, + "loss": 0.1679, + "step": 68925 + }, + { + "epoch": 2.71, + "grad_norm": 1.3827271970959603, + "learning_rate": 4.901556856187291e-06, + "loss": 0.1696, + "step": 68950 + }, + { + "epoch": 2.71, + "grad_norm": 1.806564358309027, + "learning_rate": 4.901515050167224e-06, + "loss": 0.1659, + "step": 68975 + }, + { + "epoch": 2.71, + "grad_norm": 2.0332614028768843, + "learning_rate": 4.901473244147157e-06, + "loss": 0.174, + "step": 69000 + }, + { + "epoch": 2.72, + "grad_norm": 2.079579653117786, + "learning_rate": 4.901431438127091e-06, + "loss": 0.1801, + "step": 69025 + }, + { + "epoch": 2.72, + "grad_norm": 2.6221209766597116, + "learning_rate": 4.901389632107023e-06, + "loss": 0.1836, + "step": 69050 + }, + { + "epoch": 2.72, + "grad_norm": 2.0314819729265583, + "learning_rate": 4.901347826086957e-06, + "loss": 0.1723, + "step": 69075 + }, + { + "epoch": 2.72, + "grad_norm": 2.561945317804322, + "learning_rate": 4.9013060200668896e-06, + "loss": 0.1716, + "step": 69100 + }, + { + "epoch": 2.72, + "grad_norm": 2.22404829656543, + "learning_rate": 4.901264214046823e-06, + "loss": 0.1616, + "step": 69125 + }, + { + "epoch": 2.72, + "grad_norm": 3.0273108921916183, + "learning_rate": 4.901222408026756e-06, + "loss": 0.1553, + "step": 69150 + }, + { + "epoch": 2.72, + "grad_norm": 1.9271862459705924, + "learning_rate": 4.901180602006689e-06, + "loss": 0.1535, + "step": 69175 + }, + { + "epoch": 2.72, + "grad_norm": 1.8153291263804212, + "learning_rate": 4.901138795986622e-06, + "loss": 0.167, + "step": 69200 + }, + { + "epoch": 2.72, + "grad_norm": 3.3250437575275225, + "learning_rate": 4.901096989966556e-06, + "loss": 0.1723, + "step": 69225 + }, + { + "epoch": 2.72, + "grad_norm": 2.767320385273223, + "learning_rate": 4.901055183946488e-06, + "loss": 0.1731, + "step": 69250 + }, + { + "epoch": 2.73, + "grad_norm": 2.0717377921678923, + "learning_rate": 4.901013377926422e-06, + "loss": 0.1831, + "step": 69275 + }, + { + "epoch": 2.73, + "grad_norm": 2.044160851544343, + "learning_rate": 4.900971571906355e-06, + "loss": 0.1707, + "step": 69300 + }, + { + "epoch": 2.73, + "grad_norm": 2.3873404711946207, + "learning_rate": 4.900929765886288e-06, + "loss": 0.1549, + "step": 69325 + }, + { + "epoch": 2.73, + "grad_norm": 2.2695647905827445, + "learning_rate": 4.900887959866222e-06, + "loss": 0.1818, + "step": 69350 + }, + { + "epoch": 2.73, + "grad_norm": 1.4378326071784568, + "learning_rate": 4.900846153846154e-06, + "loss": 0.1617, + "step": 69375 + }, + { + "epoch": 2.73, + "grad_norm": 1.9957570406745557, + "learning_rate": 4.900804347826088e-06, + "loss": 0.1649, + "step": 69400 + }, + { + "epoch": 2.73, + "grad_norm": 2.2610619693808034, + "learning_rate": 4.9007625418060206e-06, + "loss": 0.1614, + "step": 69425 + }, + { + "epoch": 2.73, + "grad_norm": 1.9526855098138778, + "learning_rate": 4.900720735785954e-06, + "loss": 0.1709, + "step": 69450 + }, + { + "epoch": 2.73, + "grad_norm": 1.8988605064657225, + "learning_rate": 4.900678929765887e-06, + "loss": 0.1671, + "step": 69475 + }, + { + "epoch": 2.73, + "grad_norm": 2.2746147588855834, + "learning_rate": 4.90063712374582e-06, + "loss": 0.1577, + "step": 69500 + }, + { + "epoch": 2.74, + "grad_norm": 2.3770617254178164, + "learning_rate": 4.900595317725753e-06, + "loss": 0.1719, + "step": 69525 + }, + { + "epoch": 2.74, + "grad_norm": 1.6617922742946107, + "learning_rate": 4.900553511705687e-06, + "loss": 0.1661, + "step": 69550 + }, + { + "epoch": 2.74, + "grad_norm": 1.783213648386689, + "learning_rate": 4.900511705685619e-06, + "loss": 0.1591, + "step": 69575 + }, + { + "epoch": 2.74, + "grad_norm": 2.981878987936314, + "learning_rate": 4.900469899665552e-06, + "loss": 0.1658, + "step": 69600 + }, + { + "epoch": 2.74, + "grad_norm": 1.6476488160209035, + "learning_rate": 4.900428093645485e-06, + "loss": 0.1536, + "step": 69625 + }, + { + "epoch": 2.74, + "grad_norm": 2.325940941846499, + "learning_rate": 4.900386287625418e-06, + "loss": 0.169, + "step": 69650 + }, + { + "epoch": 2.74, + "grad_norm": 2.4505268528811524, + "learning_rate": 4.900344481605351e-06, + "loss": 0.1702, + "step": 69675 + }, + { + "epoch": 2.74, + "grad_norm": 1.9311266368341664, + "learning_rate": 4.9003026755852845e-06, + "loss": 0.1548, + "step": 69700 + }, + { + "epoch": 2.74, + "grad_norm": 1.612976220389442, + "learning_rate": 4.900260869565218e-06, + "loss": 0.1612, + "step": 69725 + }, + { + "epoch": 2.74, + "grad_norm": 2.8961876952169523, + "learning_rate": 4.900219063545151e-06, + "loss": 0.1839, + "step": 69750 + }, + { + "epoch": 2.74, + "grad_norm": 1.7490468429363928, + "learning_rate": 4.900177257525084e-06, + "loss": 0.1601, + "step": 69775 + }, + { + "epoch": 2.75, + "grad_norm": 2.701145780751548, + "learning_rate": 4.900135451505017e-06, + "loss": 0.1826, + "step": 69800 + }, + { + "epoch": 2.75, + "grad_norm": 1.8448000328926708, + "learning_rate": 4.9000936454849505e-06, + "loss": 0.1698, + "step": 69825 + }, + { + "epoch": 2.75, + "grad_norm": 1.6674747031267765, + "learning_rate": 4.900051839464883e-06, + "loss": 0.1673, + "step": 69850 + }, + { + "epoch": 2.75, + "grad_norm": 2.9567258027871106, + "learning_rate": 4.900010033444817e-06, + "loss": 0.1816, + "step": 69875 + }, + { + "epoch": 2.75, + "grad_norm": 0.9510130741773436, + "learning_rate": 4.8999682274247495e-06, + "loss": 0.1765, + "step": 69900 + }, + { + "epoch": 2.75, + "grad_norm": 2.3497872557880353, + "learning_rate": 4.899926421404683e-06, + "loss": 0.1721, + "step": 69925 + }, + { + "epoch": 2.75, + "grad_norm": 2.5324795074581137, + "learning_rate": 4.899884615384616e-06, + "loss": 0.1654, + "step": 69950 + }, + { + "epoch": 2.75, + "grad_norm": 1.497124922083692, + "learning_rate": 4.899842809364549e-06, + "loss": 0.1709, + "step": 69975 + }, + { + "epoch": 2.75, + "grad_norm": 2.0770713159583654, + "learning_rate": 4.899801003344482e-06, + "loss": 0.1692, + "step": 70000 + }, + { + "epoch": 2.75, + "eval_loss": 0.60400390625, + "eval_runtime": 11576.636, + "eval_samples_per_second": 0.818, + "eval_steps_per_second": 0.051, + "eval_wer": 0.11463410710551293, + "step": 70000 + }, + { + "epoch": 2.75, + "grad_norm": 3.404522336853661, + "learning_rate": 4.8997591973244155e-06, + "loss": 0.1641, + "step": 70025 + }, + { + "epoch": 2.76, + "grad_norm": 2.604856475398737, + "learning_rate": 4.899717391304348e-06, + "loss": 0.163, + "step": 70050 + }, + { + "epoch": 2.76, + "grad_norm": 1.9673677744338196, + "learning_rate": 4.899675585284282e-06, + "loss": 0.1639, + "step": 70075 + }, + { + "epoch": 2.76, + "grad_norm": 2.522347353607375, + "learning_rate": 4.8996337792642144e-06, + "loss": 0.1478, + "step": 70100 + }, + { + "epoch": 2.76, + "grad_norm": 2.0517890231441367, + "learning_rate": 4.899591973244148e-06, + "loss": 0.1658, + "step": 70125 + }, + { + "epoch": 2.76, + "grad_norm": 2.0923956047064167, + "learning_rate": 4.899550167224081e-06, + "loss": 0.1676, + "step": 70150 + }, + { + "epoch": 2.76, + "grad_norm": 2.301297191893959, + "learning_rate": 4.899508361204014e-06, + "loss": 0.1651, + "step": 70175 + }, + { + "epoch": 2.76, + "grad_norm": 2.583427012599905, + "learning_rate": 4.899468227424749e-06, + "loss": 0.1932, + "step": 70200 + }, + { + "epoch": 2.76, + "grad_norm": 2.2125202227042093, + "learning_rate": 4.899426421404683e-06, + "loss": 0.1605, + "step": 70225 + }, + { + "epoch": 2.76, + "grad_norm": 2.443241511177795, + "learning_rate": 4.8993846153846155e-06, + "loss": 0.1649, + "step": 70250 + }, + { + "epoch": 2.76, + "grad_norm": 2.2941758774610634, + "learning_rate": 4.899342809364549e-06, + "loss": 0.1624, + "step": 70275 + }, + { + "epoch": 2.77, + "grad_norm": 2.041645811171096, + "learning_rate": 4.899301003344482e-06, + "loss": 0.1718, + "step": 70300 + }, + { + "epoch": 2.77, + "grad_norm": 3.4730930100201176, + "learning_rate": 4.899259197324415e-06, + "loss": 0.175, + "step": 70325 + }, + { + "epoch": 2.77, + "grad_norm": 2.557520966431368, + "learning_rate": 4.899217391304348e-06, + "loss": 0.177, + "step": 70350 + }, + { + "epoch": 2.77, + "grad_norm": 2.0185704278932226, + "learning_rate": 4.8991755852842815e-06, + "loss": 0.1474, + "step": 70375 + }, + { + "epoch": 2.77, + "grad_norm": 3.1796598269669145, + "learning_rate": 4.899133779264215e-06, + "loss": 0.1533, + "step": 70400 + }, + { + "epoch": 2.77, + "grad_norm": 2.000036451479218, + "learning_rate": 4.899091973244148e-06, + "loss": 0.1616, + "step": 70425 + }, + { + "epoch": 2.77, + "grad_norm": 2.416211515477164, + "learning_rate": 4.899050167224081e-06, + "loss": 0.1909, + "step": 70450 + }, + { + "epoch": 2.77, + "grad_norm": 2.3646053101731757, + "learning_rate": 4.899008361204014e-06, + "loss": 0.1586, + "step": 70475 + }, + { + "epoch": 2.77, + "grad_norm": 2.2500982911348384, + "learning_rate": 4.8989665551839475e-06, + "loss": 0.169, + "step": 70500 + }, + { + "epoch": 2.77, + "grad_norm": 1.9615764608426884, + "learning_rate": 4.898924749163879e-06, + "loss": 0.1608, + "step": 70525 + }, + { + "epoch": 2.78, + "grad_norm": 2.359861228050328, + "learning_rate": 4.898882943143813e-06, + "loss": 0.1743, + "step": 70550 + }, + { + "epoch": 2.78, + "grad_norm": 1.986395507242446, + "learning_rate": 4.898841137123746e-06, + "loss": 0.1612, + "step": 70575 + }, + { + "epoch": 2.78, + "grad_norm": 2.650973993750791, + "learning_rate": 4.898799331103679e-06, + "loss": 0.1907, + "step": 70600 + }, + { + "epoch": 2.78, + "grad_norm": 1.5054656396604489, + "learning_rate": 4.898757525083612e-06, + "loss": 0.1614, + "step": 70625 + }, + { + "epoch": 2.78, + "grad_norm": 2.099185235357897, + "learning_rate": 4.898715719063545e-06, + "loss": 0.1599, + "step": 70650 + }, + { + "epoch": 2.78, + "grad_norm": 1.752903980750435, + "learning_rate": 4.898673913043478e-06, + "loss": 0.1727, + "step": 70675 + }, + { + "epoch": 2.78, + "grad_norm": 2.215584387164041, + "learning_rate": 4.898632107023412e-06, + "loss": 0.1555, + "step": 70700 + }, + { + "epoch": 2.78, + "grad_norm": 2.7082133233014423, + "learning_rate": 4.898590301003344e-06, + "loss": 0.1586, + "step": 70725 + }, + { + "epoch": 2.78, + "grad_norm": 1.8680573169122017, + "learning_rate": 4.898548494983278e-06, + "loss": 0.1922, + "step": 70750 + }, + { + "epoch": 2.78, + "grad_norm": 1.5818802598899575, + "learning_rate": 4.898506688963211e-06, + "loss": 0.1632, + "step": 70775 + }, + { + "epoch": 2.79, + "grad_norm": 2.6198897893488717, + "learning_rate": 4.898464882943144e-06, + "loss": 0.1765, + "step": 70800 + }, + { + "epoch": 2.79, + "grad_norm": 2.112835040259786, + "learning_rate": 4.898423076923078e-06, + "loss": 0.174, + "step": 70825 + }, + { + "epoch": 2.79, + "grad_norm": 2.5854543693695047, + "learning_rate": 4.89838127090301e-06, + "loss": 0.1454, + "step": 70850 + }, + { + "epoch": 2.79, + "grad_norm": 1.339170672832636, + "learning_rate": 4.898339464882944e-06, + "loss": 0.1358, + "step": 70875 + }, + { + "epoch": 2.79, + "grad_norm": 1.6926476355982136, + "learning_rate": 4.898297658862877e-06, + "loss": 0.1496, + "step": 70900 + }, + { + "epoch": 2.79, + "grad_norm": 1.5075863109608603, + "learning_rate": 4.89825585284281e-06, + "loss": 0.1552, + "step": 70925 + }, + { + "epoch": 2.79, + "grad_norm": 2.2298242138770483, + "learning_rate": 4.898214046822743e-06, + "loss": 0.1658, + "step": 70950 + }, + { + "epoch": 2.79, + "grad_norm": 2.56403234832974, + "learning_rate": 4.8981722408026764e-06, + "loss": 0.1713, + "step": 70975 + }, + { + "epoch": 2.79, + "grad_norm": 2.266105914417862, + "learning_rate": 4.898130434782609e-06, + "loss": 0.1721, + "step": 71000 + }, + { + "epoch": 2.79, + "grad_norm": 2.625059941253109, + "learning_rate": 4.898088628762543e-06, + "loss": 0.1688, + "step": 71025 + }, + { + "epoch": 2.8, + "grad_norm": 2.192109072550119, + "learning_rate": 4.898046822742475e-06, + "loss": 0.1651, + "step": 71050 + }, + { + "epoch": 2.8, + "grad_norm": 4.035841787276603, + "learning_rate": 4.898005016722409e-06, + "loss": 0.164, + "step": 71075 + }, + { + "epoch": 2.8, + "grad_norm": 3.4732146936336017, + "learning_rate": 4.897963210702342e-06, + "loss": 0.1783, + "step": 71100 + }, + { + "epoch": 2.8, + "grad_norm": 1.9311210420583675, + "learning_rate": 4.897921404682275e-06, + "loss": 0.1717, + "step": 71125 + }, + { + "epoch": 2.8, + "grad_norm": 1.8756949688926736, + "learning_rate": 4.897879598662208e-06, + "loss": 0.1659, + "step": 71150 + }, + { + "epoch": 2.8, + "grad_norm": 2.422990879289581, + "learning_rate": 4.897837792642141e-06, + "loss": 0.1725, + "step": 71175 + }, + { + "epoch": 2.8, + "grad_norm": 2.8900976364859154, + "learning_rate": 4.897797658862876e-06, + "loss": 0.1763, + "step": 71200 + }, + { + "epoch": 2.8, + "grad_norm": 2.381504042282298, + "learning_rate": 4.89775585284281e-06, + "loss": 0.1592, + "step": 71225 + }, + { + "epoch": 2.8, + "grad_norm": 1.8401869313416392, + "learning_rate": 4.897714046822743e-06, + "loss": 0.1719, + "step": 71250 + }, + { + "epoch": 2.8, + "grad_norm": 1.2582691293928905, + "learning_rate": 4.897672240802676e-06, + "loss": 0.1661, + "step": 71275 + }, + { + "epoch": 2.8, + "grad_norm": 1.3065157469687312, + "learning_rate": 4.897630434782609e-06, + "loss": 0.1574, + "step": 71300 + }, + { + "epoch": 2.81, + "grad_norm": 2.8882287169254925, + "learning_rate": 4.8975886287625424e-06, + "loss": 0.1781, + "step": 71325 + }, + { + "epoch": 2.81, + "grad_norm": 1.8066680445158254, + "learning_rate": 4.897546822742475e-06, + "loss": 0.1514, + "step": 71350 + }, + { + "epoch": 2.81, + "grad_norm": 2.1135129603595155, + "learning_rate": 4.897505016722409e-06, + "loss": 0.168, + "step": 71375 + }, + { + "epoch": 2.81, + "grad_norm": 2.4731969275899046, + "learning_rate": 4.897463210702341e-06, + "loss": 0.1679, + "step": 71400 + }, + { + "epoch": 2.81, + "grad_norm": 2.1670796062973263, + "learning_rate": 4.897421404682275e-06, + "loss": 0.1721, + "step": 71425 + }, + { + "epoch": 2.81, + "grad_norm": 2.1632396701972914, + "learning_rate": 4.897379598662208e-06, + "loss": 0.1728, + "step": 71450 + }, + { + "epoch": 2.81, + "grad_norm": 1.8934608853543096, + "learning_rate": 4.89733779264214e-06, + "loss": 0.1585, + "step": 71475 + }, + { + "epoch": 2.81, + "grad_norm": 1.8398068522285902, + "learning_rate": 4.897295986622074e-06, + "loss": 0.1697, + "step": 71500 + }, + { + "epoch": 2.81, + "grad_norm": 1.8325654805684302, + "learning_rate": 4.8972541806020066e-06, + "loss": 0.1704, + "step": 71525 + }, + { + "epoch": 2.81, + "grad_norm": 2.3199751539353652, + "learning_rate": 4.89721237458194e-06, + "loss": 0.161, + "step": 71550 + }, + { + "epoch": 2.82, + "grad_norm": 1.147554620731119, + "learning_rate": 4.897170568561873e-06, + "loss": 0.1531, + "step": 71575 + }, + { + "epoch": 2.82, + "grad_norm": 2.0353602477046118, + "learning_rate": 4.897128762541806e-06, + "loss": 0.163, + "step": 71600 + }, + { + "epoch": 2.82, + "grad_norm": 2.371029050435062, + "learning_rate": 4.897086956521739e-06, + "loss": 0.1813, + "step": 71625 + }, + { + "epoch": 2.82, + "grad_norm": 2.1762205039573836, + "learning_rate": 4.897045150501673e-06, + "loss": 0.1736, + "step": 71650 + }, + { + "epoch": 2.82, + "grad_norm": 1.7956537476456738, + "learning_rate": 4.897003344481605e-06, + "loss": 0.1716, + "step": 71675 + }, + { + "epoch": 2.82, + "grad_norm": 2.2524746958016393, + "learning_rate": 4.896961538461539e-06, + "loss": 0.1801, + "step": 71700 + }, + { + "epoch": 2.82, + "grad_norm": 2.136139836516011, + "learning_rate": 4.8969197324414715e-06, + "loss": 0.1635, + "step": 71725 + }, + { + "epoch": 2.82, + "grad_norm": 2.307095824383635, + "learning_rate": 4.896877926421405e-06, + "loss": 0.1487, + "step": 71750 + }, + { + "epoch": 2.82, + "grad_norm": 1.9994033511834624, + "learning_rate": 4.896836120401338e-06, + "loss": 0.1698, + "step": 71775 + }, + { + "epoch": 2.82, + "grad_norm": 2.0458057349311662, + "learning_rate": 4.896794314381271e-06, + "loss": 0.1664, + "step": 71800 + }, + { + "epoch": 2.83, + "grad_norm": 2.7828406872393536, + "learning_rate": 4.896752508361204e-06, + "loss": 0.1835, + "step": 71825 + }, + { + "epoch": 2.83, + "grad_norm": 2.3490321431100463, + "learning_rate": 4.8967107023411376e-06, + "loss": 0.1772, + "step": 71850 + }, + { + "epoch": 2.83, + "grad_norm": 2.1239436779644456, + "learning_rate": 4.89666889632107e-06, + "loss": 0.1698, + "step": 71875 + }, + { + "epoch": 2.83, + "grad_norm": 2.55078053521957, + "learning_rate": 4.896627090301004e-06, + "loss": 0.1865, + "step": 71900 + }, + { + "epoch": 2.83, + "grad_norm": 2.0722007158937537, + "learning_rate": 4.896585284280937e-06, + "loss": 0.1534, + "step": 71925 + }, + { + "epoch": 2.83, + "grad_norm": 3.2040867285924763, + "learning_rate": 4.89654347826087e-06, + "loss": 0.1615, + "step": 71950 + }, + { + "epoch": 2.83, + "grad_norm": 2.2719001769767058, + "learning_rate": 4.896501672240804e-06, + "loss": 0.16, + "step": 71975 + }, + { + "epoch": 2.83, + "grad_norm": 2.003118329948211, + "learning_rate": 4.896459866220736e-06, + "loss": 0.157, + "step": 72000 + }, + { + "epoch": 2.83, + "grad_norm": 2.933295145712109, + "learning_rate": 4.89641806020067e-06, + "loss": 0.1549, + "step": 72025 + }, + { + "epoch": 2.83, + "grad_norm": 1.89512122355957, + "learning_rate": 4.8963762541806025e-06, + "loss": 0.174, + "step": 72050 + }, + { + "epoch": 2.84, + "grad_norm": 1.9653910668438528, + "learning_rate": 4.896334448160536e-06, + "loss": 0.1537, + "step": 72075 + }, + { + "epoch": 2.84, + "grad_norm": 1.8822786208686735, + "learning_rate": 4.896292642140469e-06, + "loss": 0.1483, + "step": 72100 + }, + { + "epoch": 2.84, + "grad_norm": 1.6974680148122512, + "learning_rate": 4.896250836120402e-06, + "loss": 0.1842, + "step": 72125 + }, + { + "epoch": 2.84, + "grad_norm": 2.1232240441546324, + "learning_rate": 4.896209030100335e-06, + "loss": 0.1708, + "step": 72150 + }, + { + "epoch": 2.84, + "grad_norm": 2.5147761693595014, + "learning_rate": 4.896167224080269e-06, + "loss": 0.1734, + "step": 72175 + }, + { + "epoch": 2.84, + "grad_norm": 1.5814979109687117, + "learning_rate": 4.8961270903010036e-06, + "loss": 0.1753, + "step": 72200 + }, + { + "epoch": 2.84, + "grad_norm": 1.9271487770120417, + "learning_rate": 4.896085284280937e-06, + "loss": 0.16, + "step": 72225 + }, + { + "epoch": 2.84, + "grad_norm": 1.6807905811978427, + "learning_rate": 4.89604347826087e-06, + "loss": 0.1638, + "step": 72250 + }, + { + "epoch": 2.84, + "grad_norm": 3.0101811846023243, + "learning_rate": 4.896001672240803e-06, + "loss": 0.1482, + "step": 72275 + }, + { + "epoch": 2.84, + "grad_norm": 2.1706275732886335, + "learning_rate": 4.895959866220736e-06, + "loss": 0.1694, + "step": 72300 + }, + { + "epoch": 2.85, + "grad_norm": 1.8417541316276584, + "learning_rate": 4.89591806020067e-06, + "loss": 0.1714, + "step": 72325 + }, + { + "epoch": 2.85, + "grad_norm": 1.6630468895130261, + "learning_rate": 4.895876254180602e-06, + "loss": 0.1551, + "step": 72350 + }, + { + "epoch": 2.85, + "grad_norm": 3.309966479941043, + "learning_rate": 4.895834448160536e-06, + "loss": 0.1719, + "step": 72375 + }, + { + "epoch": 2.85, + "grad_norm": 1.7480764555332018, + "learning_rate": 4.8957926421404685e-06, + "loss": 0.1615, + "step": 72400 + }, + { + "epoch": 2.85, + "grad_norm": 1.8651253589940062, + "learning_rate": 4.895750836120401e-06, + "loss": 0.1673, + "step": 72425 + }, + { + "epoch": 2.85, + "grad_norm": 1.367768522432886, + "learning_rate": 4.895709030100335e-06, + "loss": 0.1614, + "step": 72450 + }, + { + "epoch": 2.85, + "grad_norm": 2.47582620859887, + "learning_rate": 4.8956672240802675e-06, + "loss": 0.1607, + "step": 72475 + }, + { + "epoch": 2.85, + "grad_norm": 3.023070237184101, + "learning_rate": 4.895625418060201e-06, + "loss": 0.1719, + "step": 72500 + }, + { + "epoch": 2.85, + "grad_norm": 2.177541987436765, + "learning_rate": 4.895583612040134e-06, + "loss": 0.1612, + "step": 72525 + }, + { + "epoch": 2.85, + "grad_norm": 2.257043109303166, + "learning_rate": 4.895541806020067e-06, + "loss": 0.1707, + "step": 72550 + }, + { + "epoch": 2.86, + "grad_norm": 2.0137969172613395, + "learning_rate": 4.8955e-06, + "loss": 0.1643, + "step": 72575 + }, + { + "epoch": 2.86, + "grad_norm": 1.9763477092821808, + "learning_rate": 4.8954581939799335e-06, + "loss": 0.1709, + "step": 72600 + }, + { + "epoch": 2.86, + "grad_norm": 1.6234222767388702, + "learning_rate": 4.895416387959866e-06, + "loss": 0.1556, + "step": 72625 + }, + { + "epoch": 2.86, + "grad_norm": 1.8976361029757107, + "learning_rate": 4.8953745819398e-06, + "loss": 0.1585, + "step": 72650 + }, + { + "epoch": 2.86, + "grad_norm": 1.739742768865686, + "learning_rate": 4.8953327759197325e-06, + "loss": 0.1721, + "step": 72675 + }, + { + "epoch": 2.86, + "grad_norm": 2.1472365854087303, + "learning_rate": 4.895290969899666e-06, + "loss": 0.1681, + "step": 72700 + }, + { + "epoch": 2.86, + "grad_norm": 2.4942342308769527, + "learning_rate": 4.895249163879599e-06, + "loss": 0.1653, + "step": 72725 + }, + { + "epoch": 2.86, + "grad_norm": 2.213462965668188, + "learning_rate": 4.895207357859532e-06, + "loss": 0.1533, + "step": 72750 + }, + { + "epoch": 2.86, + "grad_norm": 1.757886571286951, + "learning_rate": 4.895165551839465e-06, + "loss": 0.1628, + "step": 72775 + }, + { + "epoch": 2.86, + "grad_norm": 3.161382454759714, + "learning_rate": 4.8951237458193985e-06, + "loss": 0.1746, + "step": 72800 + }, + { + "epoch": 2.86, + "grad_norm": 2.898760234748562, + "learning_rate": 4.895081939799331e-06, + "loss": 0.1644, + "step": 72825 + }, + { + "epoch": 2.87, + "grad_norm": 1.9181331685697647, + "learning_rate": 4.895040133779265e-06, + "loss": 0.1744, + "step": 72850 + }, + { + "epoch": 2.87, + "grad_norm": 1.2622034792223826, + "learning_rate": 4.8949983277591974e-06, + "loss": 0.1627, + "step": 72875 + }, + { + "epoch": 2.87, + "grad_norm": 2.0311044399909335, + "learning_rate": 4.894956521739131e-06, + "loss": 0.1644, + "step": 72900 + }, + { + "epoch": 2.87, + "grad_norm": 1.7264522011793653, + "learning_rate": 4.894914715719064e-06, + "loss": 0.162, + "step": 72925 + }, + { + "epoch": 2.87, + "grad_norm": 2.5284533556414153, + "learning_rate": 4.894872909698997e-06, + "loss": 0.1708, + "step": 72950 + }, + { + "epoch": 2.87, + "grad_norm": 2.3584564833935526, + "learning_rate": 4.89483110367893e-06, + "loss": 0.1553, + "step": 72975 + }, + { + "epoch": 2.87, + "grad_norm": 2.6811106192462244, + "learning_rate": 4.8947892976588635e-06, + "loss": 0.1768, + "step": 73000 + }, + { + "epoch": 2.87, + "grad_norm": 2.880359819740993, + "learning_rate": 4.894747491638796e-06, + "loss": 0.1638, + "step": 73025 + }, + { + "epoch": 2.87, + "grad_norm": 1.8098189143936174, + "learning_rate": 4.89470568561873e-06, + "loss": 0.1694, + "step": 73050 + }, + { + "epoch": 2.87, + "grad_norm": 2.1960220587817347, + "learning_rate": 4.894663879598663e-06, + "loss": 0.1529, + "step": 73075 + }, + { + "epoch": 2.88, + "grad_norm": 1.879513441542289, + "learning_rate": 4.894622073578596e-06, + "loss": 0.1602, + "step": 73100 + }, + { + "epoch": 2.88, + "grad_norm": 2.354208570461897, + "learning_rate": 4.894580267558529e-06, + "loss": 0.154, + "step": 73125 + }, + { + "epoch": 2.88, + "grad_norm": 1.2171654991734855, + "learning_rate": 4.894538461538461e-06, + "loss": 0.1699, + "step": 73150 + }, + { + "epoch": 2.88, + "grad_norm": 2.2940717027810593, + "learning_rate": 4.894496655518395e-06, + "loss": 0.1675, + "step": 73175 + }, + { + "epoch": 2.88, + "grad_norm": 1.7813725219024068, + "learning_rate": 4.894456521739131e-06, + "loss": 0.1726, + "step": 73200 + }, + { + "epoch": 2.88, + "grad_norm": 1.5471775285839873, + "learning_rate": 4.894414715719064e-06, + "loss": 0.1506, + "step": 73225 + }, + { + "epoch": 2.88, + "grad_norm": 2.3179111030180164, + "learning_rate": 4.894372909698997e-06, + "loss": 0.1549, + "step": 73250 + }, + { + "epoch": 2.88, + "grad_norm": 2.381683834465129, + "learning_rate": 4.8943311036789305e-06, + "loss": 0.1578, + "step": 73275 + }, + { + "epoch": 2.88, + "grad_norm": 2.8932283117028095, + "learning_rate": 4.894289297658863e-06, + "loss": 0.1596, + "step": 73300 + }, + { + "epoch": 2.88, + "grad_norm": 3.0652628362276526, + "learning_rate": 4.894247491638797e-06, + "loss": 0.1757, + "step": 73325 + }, + { + "epoch": 2.89, + "grad_norm": 2.2515578022167846, + "learning_rate": 4.8942056856187295e-06, + "loss": 0.1531, + "step": 73350 + }, + { + "epoch": 2.89, + "grad_norm": 1.9302780505682737, + "learning_rate": 4.894163879598662e-06, + "loss": 0.1751, + "step": 73375 + }, + { + "epoch": 2.89, + "grad_norm": 2.3296661369560576, + "learning_rate": 4.894122073578596e-06, + "loss": 0.1579, + "step": 73400 + }, + { + "epoch": 2.89, + "grad_norm": 2.317270323434963, + "learning_rate": 4.894080267558528e-06, + "loss": 0.1521, + "step": 73425 + }, + { + "epoch": 2.89, + "grad_norm": 1.8791542432395285, + "learning_rate": 4.894038461538462e-06, + "loss": 0.1666, + "step": 73450 + }, + { + "epoch": 2.89, + "grad_norm": 3.1141465597924354, + "learning_rate": 4.893996655518395e-06, + "loss": 0.1779, + "step": 73475 + }, + { + "epoch": 2.89, + "grad_norm": 1.83550212334721, + "learning_rate": 4.893954849498328e-06, + "loss": 0.1573, + "step": 73500 + }, + { + "epoch": 2.89, + "grad_norm": 1.869911326816695, + "learning_rate": 4.893913043478261e-06, + "loss": 0.1505, + "step": 73525 + }, + { + "epoch": 2.89, + "grad_norm": 1.875677676542583, + "learning_rate": 4.8938712374581944e-06, + "loss": 0.1749, + "step": 73550 + }, + { + "epoch": 2.89, + "grad_norm": 2.877401745073335, + "learning_rate": 4.893829431438127e-06, + "loss": 0.1756, + "step": 73575 + }, + { + "epoch": 2.9, + "grad_norm": 2.8537228949707183, + "learning_rate": 4.893787625418061e-06, + "loss": 0.1553, + "step": 73600 + }, + { + "epoch": 2.9, + "grad_norm": 3.132534812602467, + "learning_rate": 4.893745819397993e-06, + "loss": 0.1593, + "step": 73625 + }, + { + "epoch": 2.9, + "grad_norm": 1.5367775675098951, + "learning_rate": 4.893704013377927e-06, + "loss": 0.1658, + "step": 73650 + }, + { + "epoch": 2.9, + "grad_norm": 2.51031029076394, + "learning_rate": 4.89366220735786e-06, + "loss": 0.1599, + "step": 73675 + }, + { + "epoch": 2.9, + "grad_norm": 1.9305209052598873, + "learning_rate": 4.893620401337793e-06, + "loss": 0.1642, + "step": 73700 + }, + { + "epoch": 2.9, + "grad_norm": 1.8180335977528552, + "learning_rate": 4.893578595317726e-06, + "loss": 0.1697, + "step": 73725 + }, + { + "epoch": 2.9, + "grad_norm": 1.7564659182371012, + "learning_rate": 4.893536789297659e-06, + "loss": 0.1408, + "step": 73750 + }, + { + "epoch": 2.9, + "grad_norm": 2.707073171305364, + "learning_rate": 4.893494983277592e-06, + "loss": 0.1761, + "step": 73775 + }, + { + "epoch": 2.9, + "grad_norm": 2.0197959277765496, + "learning_rate": 4.893453177257526e-06, + "loss": 0.1544, + "step": 73800 + }, + { + "epoch": 2.9, + "grad_norm": 1.592459155442505, + "learning_rate": 4.893411371237458e-06, + "loss": 0.169, + "step": 73825 + }, + { + "epoch": 2.91, + "grad_norm": 2.1128575826279286, + "learning_rate": 4.893369565217392e-06, + "loss": 0.1821, + "step": 73850 + }, + { + "epoch": 2.91, + "grad_norm": 2.330641493994605, + "learning_rate": 4.893327759197325e-06, + "loss": 0.1599, + "step": 73875 + }, + { + "epoch": 2.91, + "grad_norm": 2.289273202147065, + "learning_rate": 4.893285953177258e-06, + "loss": 0.1665, + "step": 73900 + }, + { + "epoch": 2.91, + "grad_norm": 2.3967166690376724, + "learning_rate": 4.893244147157191e-06, + "loss": 0.1537, + "step": 73925 + }, + { + "epoch": 2.91, + "grad_norm": 1.7436469370309389, + "learning_rate": 4.893202341137124e-06, + "loss": 0.1681, + "step": 73950 + }, + { + "epoch": 2.91, + "grad_norm": 2.593160901601273, + "learning_rate": 4.893160535117057e-06, + "loss": 0.1549, + "step": 73975 + }, + { + "epoch": 2.91, + "grad_norm": 1.7292694124000343, + "learning_rate": 4.893118729096991e-06, + "loss": 0.1704, + "step": 74000 + }, + { + "epoch": 2.91, + "grad_norm": 2.7513280274403336, + "learning_rate": 4.893076923076923e-06, + "loss": 0.1762, + "step": 74025 + }, + { + "epoch": 2.91, + "grad_norm": 2.245015646917485, + "learning_rate": 4.893035117056857e-06, + "loss": 0.1545, + "step": 74050 + }, + { + "epoch": 2.91, + "grad_norm": 2.0277778975395058, + "learning_rate": 4.89299331103679e-06, + "loss": 0.1585, + "step": 74075 + }, + { + "epoch": 2.92, + "grad_norm": 2.6304291462822813, + "learning_rate": 4.892951505016722e-06, + "loss": 0.1635, + "step": 74100 + }, + { + "epoch": 2.92, + "grad_norm": 2.1099036734597085, + "learning_rate": 4.892909698996656e-06, + "loss": 0.165, + "step": 74125 + }, + { + "epoch": 2.92, + "grad_norm": 2.3841230259715886, + "learning_rate": 4.8928678929765885e-06, + "loss": 0.1541, + "step": 74150 + }, + { + "epoch": 2.92, + "grad_norm": 1.4945382544604393, + "learning_rate": 4.892826086956522e-06, + "loss": 0.159, + "step": 74175 + }, + { + "epoch": 2.92, + "grad_norm": 3.160768317692789, + "learning_rate": 4.892785953177258e-06, + "loss": 0.1554, + "step": 74200 + }, + { + "epoch": 2.92, + "grad_norm": 2.307190734455112, + "learning_rate": 4.8927441471571914e-06, + "loss": 0.1572, + "step": 74225 + }, + { + "epoch": 2.92, + "grad_norm": 2.5864605527863525, + "learning_rate": 4.892702341137124e-06, + "loss": 0.1478, + "step": 74250 + }, + { + "epoch": 2.92, + "grad_norm": 2.3159481412550114, + "learning_rate": 4.892660535117058e-06, + "loss": 0.1712, + "step": 74275 + }, + { + "epoch": 2.92, + "grad_norm": 1.7017971769327145, + "learning_rate": 4.89261872909699e-06, + "loss": 0.1576, + "step": 74300 + }, + { + "epoch": 2.92, + "grad_norm": 2.916033378211683, + "learning_rate": 4.892576923076923e-06, + "loss": 0.1415, + "step": 74325 + }, + { + "epoch": 2.92, + "grad_norm": 2.589428720427529, + "learning_rate": 4.892535117056857e-06, + "loss": 0.1514, + "step": 74350 + }, + { + "epoch": 2.93, + "grad_norm": 2.4781537139443732, + "learning_rate": 4.892493311036789e-06, + "loss": 0.1753, + "step": 74375 + }, + { + "epoch": 2.93, + "grad_norm": 2.087506703608687, + "learning_rate": 4.892451505016723e-06, + "loss": 0.1665, + "step": 74400 + }, + { + "epoch": 2.93, + "grad_norm": 1.4517621145289306, + "learning_rate": 4.8924096989966556e-06, + "loss": 0.1673, + "step": 74425 + }, + { + "epoch": 2.93, + "grad_norm": 1.549222778728874, + "learning_rate": 4.892367892976589e-06, + "loss": 0.1623, + "step": 74450 + }, + { + "epoch": 2.93, + "grad_norm": 2.6905760289098284, + "learning_rate": 4.892326086956522e-06, + "loss": 0.1688, + "step": 74475 + }, + { + "epoch": 2.93, + "grad_norm": 1.7294195336741673, + "learning_rate": 4.892284280936455e-06, + "loss": 0.1792, + "step": 74500 + }, + { + "epoch": 2.93, + "grad_norm": 2.2205866806833905, + "learning_rate": 4.892242474916388e-06, + "loss": 0.1703, + "step": 74525 + }, + { + "epoch": 2.93, + "grad_norm": 1.512048417276666, + "learning_rate": 4.892200668896322e-06, + "loss": 0.1727, + "step": 74550 + }, + { + "epoch": 2.93, + "grad_norm": 1.6165734990093756, + "learning_rate": 4.892158862876254e-06, + "loss": 0.1689, + "step": 74575 + }, + { + "epoch": 2.93, + "grad_norm": 1.7105503061939633, + "learning_rate": 4.892117056856188e-06, + "loss": 0.1645, + "step": 74600 + }, + { + "epoch": 2.94, + "grad_norm": 2.241186103825974, + "learning_rate": 4.8920752508361206e-06, + "loss": 0.174, + "step": 74625 + }, + { + "epoch": 2.94, + "grad_norm": 3.0478956139162046, + "learning_rate": 4.892033444816054e-06, + "loss": 0.1732, + "step": 74650 + }, + { + "epoch": 2.94, + "grad_norm": 2.2023056589981613, + "learning_rate": 4.891991638795987e-06, + "loss": 0.1704, + "step": 74675 + }, + { + "epoch": 2.94, + "grad_norm": 2.1750560553924156, + "learning_rate": 4.89194983277592e-06, + "loss": 0.1757, + "step": 74700 + }, + { + "epoch": 2.94, + "grad_norm": 1.7338152881128504, + "learning_rate": 4.891908026755853e-06, + "loss": 0.1625, + "step": 74725 + }, + { + "epoch": 2.94, + "grad_norm": 1.5947374991763381, + "learning_rate": 4.891866220735787e-06, + "loss": 0.1742, + "step": 74750 + }, + { + "epoch": 2.94, + "grad_norm": 1.9887138661903652, + "learning_rate": 4.891824414715719e-06, + "loss": 0.1811, + "step": 74775 + }, + { + "epoch": 2.94, + "grad_norm": 2.436010532096877, + "learning_rate": 4.891782608695653e-06, + "loss": 0.1537, + "step": 74800 + }, + { + "epoch": 2.94, + "grad_norm": 2.926286576820829, + "learning_rate": 4.8917408026755855e-06, + "loss": 0.16, + "step": 74825 + }, + { + "epoch": 2.94, + "grad_norm": 1.824564621763424, + "learning_rate": 4.891698996655519e-06, + "loss": 0.1629, + "step": 74850 + }, + { + "epoch": 2.95, + "grad_norm": 2.683636286974673, + "learning_rate": 4.891657190635452e-06, + "loss": 0.1557, + "step": 74875 + }, + { + "epoch": 2.95, + "grad_norm": 2.1385204177371224, + "learning_rate": 4.891615384615385e-06, + "loss": 0.1675, + "step": 74900 + }, + { + "epoch": 2.95, + "grad_norm": 1.5571459930052227, + "learning_rate": 4.891573578595318e-06, + "loss": 0.1468, + "step": 74925 + }, + { + "epoch": 2.95, + "grad_norm": 2.19393355253032, + "learning_rate": 4.8915317725752516e-06, + "loss": 0.1668, + "step": 74950 + }, + { + "epoch": 2.95, + "grad_norm": 2.1955950411829646, + "learning_rate": 4.891489966555184e-06, + "loss": 0.1705, + "step": 74975 + }, + { + "epoch": 2.95, + "grad_norm": 1.8842544208995302, + "learning_rate": 4.891448160535118e-06, + "loss": 0.1853, + "step": 75000 + }, + { + "epoch": 2.95, + "grad_norm": 1.9456707154959463, + "learning_rate": 4.8914063545150505e-06, + "loss": 0.1676, + "step": 75025 + }, + { + "epoch": 2.95, + "grad_norm": 1.720712384593594, + "learning_rate": 4.891364548494983e-06, + "loss": 0.1691, + "step": 75050 + }, + { + "epoch": 2.95, + "grad_norm": 2.732269103665272, + "learning_rate": 4.891322742474917e-06, + "loss": 0.157, + "step": 75075 + }, + { + "epoch": 2.95, + "grad_norm": 2.093993948698275, + "learning_rate": 4.8912809364548494e-06, + "loss": 0.1579, + "step": 75100 + }, + { + "epoch": 2.96, + "grad_norm": 2.2596660812012637, + "learning_rate": 4.891239130434783e-06, + "loss": 0.1612, + "step": 75125 + }, + { + "epoch": 2.96, + "grad_norm": 1.607362977009435, + "learning_rate": 4.891197324414716e-06, + "loss": 0.1539, + "step": 75150 + }, + { + "epoch": 2.96, + "grad_norm": 1.750020200184701, + "learning_rate": 4.891155518394649e-06, + "loss": 0.1653, + "step": 75175 + }, + { + "epoch": 2.96, + "grad_norm": 2.65398912558923, + "learning_rate": 4.891115384615385e-06, + "loss": 0.1536, + "step": 75200 + }, + { + "epoch": 2.96, + "grad_norm": 2.8119697714667136, + "learning_rate": 4.891073578595319e-06, + "loss": 0.1533, + "step": 75225 + }, + { + "epoch": 2.96, + "grad_norm": 2.249098596865318, + "learning_rate": 4.891031772575251e-06, + "loss": 0.1636, + "step": 75250 + }, + { + "epoch": 2.96, + "grad_norm": 2.380834954836527, + "learning_rate": 4.890989966555184e-06, + "loss": 0.1592, + "step": 75275 + }, + { + "epoch": 2.96, + "grad_norm": 2.074689075288214, + "learning_rate": 4.8909481605351176e-06, + "loss": 0.1662, + "step": 75300 + }, + { + "epoch": 2.96, + "grad_norm": 2.2019387377694404, + "learning_rate": 4.89090635451505e-06, + "loss": 0.1697, + "step": 75325 + }, + { + "epoch": 2.96, + "grad_norm": 2.4940970023751756, + "learning_rate": 4.890864548494984e-06, + "loss": 0.1524, + "step": 75350 + }, + { + "epoch": 2.97, + "grad_norm": 3.0020918616337213, + "learning_rate": 4.8908227424749165e-06, + "loss": 0.167, + "step": 75375 + }, + { + "epoch": 2.97, + "grad_norm": 2.04633813344427, + "learning_rate": 4.89078093645485e-06, + "loss": 0.1857, + "step": 75400 + }, + { + "epoch": 2.97, + "grad_norm": 1.8578961402822785, + "learning_rate": 4.890739130434783e-06, + "loss": 0.1712, + "step": 75425 + }, + { + "epoch": 2.97, + "grad_norm": 2.568127490750955, + "learning_rate": 4.890697324414716e-06, + "loss": 0.1824, + "step": 75450 + }, + { + "epoch": 2.97, + "grad_norm": 1.7273995103916189, + "learning_rate": 4.890655518394649e-06, + "loss": 0.1633, + "step": 75475 + }, + { + "epoch": 2.97, + "grad_norm": 1.8705828417286539, + "learning_rate": 4.8906137123745825e-06, + "loss": 0.1555, + "step": 75500 + }, + { + "epoch": 2.97, + "grad_norm": 1.2009745533949896, + "learning_rate": 4.890571906354515e-06, + "loss": 0.1711, + "step": 75525 + }, + { + "epoch": 2.97, + "grad_norm": 2.4521956369026072, + "learning_rate": 4.890530100334449e-06, + "loss": 0.1702, + "step": 75550 + }, + { + "epoch": 2.97, + "grad_norm": 1.6330318191278905, + "learning_rate": 4.8904882943143815e-06, + "loss": 0.1622, + "step": 75575 + }, + { + "epoch": 2.97, + "grad_norm": 1.8895480021883273, + "learning_rate": 4.890446488294315e-06, + "loss": 0.1484, + "step": 75600 + }, + { + "epoch": 2.98, + "grad_norm": 2.990507862504963, + "learning_rate": 4.890404682274248e-06, + "loss": 0.1704, + "step": 75625 + }, + { + "epoch": 2.98, + "grad_norm": 2.254372856378548, + "learning_rate": 4.890362876254181e-06, + "loss": 0.1781, + "step": 75650 + }, + { + "epoch": 2.98, + "grad_norm": 2.038112544222414, + "learning_rate": 4.890321070234114e-06, + "loss": 0.1593, + "step": 75675 + }, + { + "epoch": 2.98, + "grad_norm": 3.040909300567587, + "learning_rate": 4.8902792642140475e-06, + "loss": 0.1729, + "step": 75700 + }, + { + "epoch": 2.98, + "grad_norm": 2.7786710261443406, + "learning_rate": 4.89023745819398e-06, + "loss": 0.1461, + "step": 75725 + }, + { + "epoch": 2.98, + "grad_norm": 2.3393555386740212, + "learning_rate": 4.890195652173914e-06, + "loss": 0.1595, + "step": 75750 + }, + { + "epoch": 2.98, + "grad_norm": 5.199257311504345, + "learning_rate": 4.8901538461538465e-06, + "loss": 0.1753, + "step": 75775 + }, + { + "epoch": 2.98, + "grad_norm": 1.703194333068814, + "learning_rate": 4.89011204013378e-06, + "loss": 0.1589, + "step": 75800 + }, + { + "epoch": 2.98, + "grad_norm": 2.780370147699592, + "learning_rate": 4.890070234113713e-06, + "loss": 0.1715, + "step": 75825 + }, + { + "epoch": 2.98, + "grad_norm": 2.3580394840206327, + "learning_rate": 4.890028428093646e-06, + "loss": 0.1559, + "step": 75850 + }, + { + "epoch": 2.98, + "grad_norm": 2.1728983607540133, + "learning_rate": 4.889986622073579e-06, + "loss": 0.1591, + "step": 75875 + }, + { + "epoch": 2.99, + "grad_norm": 2.277580980241663, + "learning_rate": 4.8899448160535125e-06, + "loss": 0.1659, + "step": 75900 + }, + { + "epoch": 2.99, + "grad_norm": 2.3986761264261127, + "learning_rate": 4.889903010033445e-06, + "loss": 0.1672, + "step": 75925 + }, + { + "epoch": 2.99, + "grad_norm": 1.931152925984652, + "learning_rate": 4.889861204013378e-06, + "loss": 0.1657, + "step": 75950 + }, + { + "epoch": 2.99, + "grad_norm": 1.68257502997896, + "learning_rate": 4.8898193979933114e-06, + "loss": 0.1614, + "step": 75975 + }, + { + "epoch": 2.99, + "grad_norm": 1.470089170915271, + "learning_rate": 4.889777591973244e-06, + "loss": 0.1782, + "step": 76000 + }, + { + "epoch": 2.99, + "grad_norm": 3.2551584773611544, + "learning_rate": 4.889735785953178e-06, + "loss": 0.1503, + "step": 76025 + }, + { + "epoch": 2.99, + "grad_norm": 2.1093960304591755, + "learning_rate": 4.88969397993311e-06, + "loss": 0.1657, + "step": 76050 + }, + { + "epoch": 2.99, + "grad_norm": 2.672877955706056, + "learning_rate": 4.889652173913044e-06, + "loss": 0.1649, + "step": 76075 + }, + { + "epoch": 2.99, + "grad_norm": 3.1470669315639115, + "learning_rate": 4.889610367892977e-06, + "loss": 0.1746, + "step": 76100 + }, + { + "epoch": 2.99, + "grad_norm": 3.0579894482513397, + "learning_rate": 4.88956856187291e-06, + "loss": 0.1657, + "step": 76125 + }, + { + "epoch": 3.0, + "grad_norm": 2.947301172535142, + "learning_rate": 4.889526755852843e-06, + "loss": 0.1516, + "step": 76150 + }, + { + "epoch": 3.0, + "grad_norm": 3.3863874587051725, + "learning_rate": 4.889484949832776e-06, + "loss": 0.1697, + "step": 76175 + }, + { + "epoch": 3.0, + "grad_norm": 1.9291008913623715, + "learning_rate": 4.889444816053512e-06, + "loss": 0.1776, + "step": 76200 + }, + { + "epoch": 3.0, + "grad_norm": 2.319491892854814, + "learning_rate": 4.889403010033445e-06, + "loss": 0.1781, + "step": 76225 + }, + { + "epoch": 3.0, + "grad_norm": 3.488180039037966, + "learning_rate": 4.889362876254181e-06, + "loss": 0.1857, + "step": 76250 + }, + { + "epoch": 3.0, + "grad_norm": 2.412768379326752, + "learning_rate": 4.889321070234114e-06, + "loss": 0.1438, + "step": 76275 + }, + { + "epoch": 3.0, + "grad_norm": 3.8203948129859513, + "learning_rate": 4.889279264214047e-06, + "loss": 0.1328, + "step": 76300 + }, + { + "epoch": 3.0, + "grad_norm": 2.8638845252884635, + "learning_rate": 4.8892374581939806e-06, + "loss": 0.1195, + "step": 76325 + }, + { + "epoch": 3.0, + "grad_norm": 3.0186144490512676, + "learning_rate": 4.889195652173913e-06, + "loss": 0.1231, + "step": 76350 + }, + { + "epoch": 3.0, + "grad_norm": 2.40564050431268, + "learning_rate": 4.889153846153847e-06, + "loss": 0.1288, + "step": 76375 + }, + { + "epoch": 3.01, + "grad_norm": 3.223902399031433, + "learning_rate": 4.88911204013378e-06, + "loss": 0.1253, + "step": 76400 + }, + { + "epoch": 3.01, + "grad_norm": 2.524120816578796, + "learning_rate": 4.889070234113712e-06, + "loss": 0.1186, + "step": 76425 + }, + { + "epoch": 3.01, + "grad_norm": 4.33608755350878, + "learning_rate": 4.889028428093646e-06, + "loss": 0.1302, + "step": 76450 + }, + { + "epoch": 3.01, + "grad_norm": 3.0812374326190763, + "learning_rate": 4.8889866220735784e-06, + "loss": 0.1154, + "step": 76475 + }, + { + "epoch": 3.01, + "grad_norm": 3.8611643250629357, + "learning_rate": 4.888944816053512e-06, + "loss": 0.1182, + "step": 76500 + }, + { + "epoch": 3.01, + "grad_norm": 4.562201565976749, + "learning_rate": 4.888903010033445e-06, + "loss": 0.1163, + "step": 76525 + }, + { + "epoch": 3.01, + "grad_norm": 5.0482039643712335, + "learning_rate": 4.888861204013378e-06, + "loss": 0.1294, + "step": 76550 + }, + { + "epoch": 3.01, + "grad_norm": 2.7435608656097723, + "learning_rate": 4.888819397993311e-06, + "loss": 0.1384, + "step": 76575 + }, + { + "epoch": 3.01, + "grad_norm": 2.2602926708641857, + "learning_rate": 4.8887775919732445e-06, + "loss": 0.1116, + "step": 76600 + }, + { + "epoch": 3.01, + "grad_norm": 5.6874792345843135, + "learning_rate": 4.888735785953177e-06, + "loss": 0.1224, + "step": 76625 + }, + { + "epoch": 3.02, + "grad_norm": 2.720275109336373, + "learning_rate": 4.888693979933111e-06, + "loss": 0.123, + "step": 76650 + }, + { + "epoch": 3.02, + "grad_norm": 4.395574166086428, + "learning_rate": 4.888652173913043e-06, + "loss": 0.1168, + "step": 76675 + }, + { + "epoch": 3.02, + "grad_norm": 2.703916615126388, + "learning_rate": 4.888610367892977e-06, + "loss": 0.1214, + "step": 76700 + }, + { + "epoch": 3.02, + "grad_norm": 3.181411929507769, + "learning_rate": 4.88856856187291e-06, + "loss": 0.1138, + "step": 76725 + }, + { + "epoch": 3.02, + "grad_norm": 3.509556295339099, + "learning_rate": 4.888526755852843e-06, + "loss": 0.1232, + "step": 76750 + }, + { + "epoch": 3.02, + "grad_norm": 3.8902899973523626, + "learning_rate": 4.888484949832776e-06, + "loss": 0.1212, + "step": 76775 + }, + { + "epoch": 3.02, + "grad_norm": 2.0686794778491895, + "learning_rate": 4.8884431438127095e-06, + "loss": 0.123, + "step": 76800 + }, + { + "epoch": 3.02, + "grad_norm": 3.16510341619209, + "learning_rate": 4.888401337792643e-06, + "loss": 0.1183, + "step": 76825 + }, + { + "epoch": 3.02, + "grad_norm": 4.458808729537471, + "learning_rate": 4.888359531772576e-06, + "loss": 0.1277, + "step": 76850 + }, + { + "epoch": 3.02, + "grad_norm": 2.6194750814850947, + "learning_rate": 4.888317725752509e-06, + "loss": 0.1167, + "step": 76875 + }, + { + "epoch": 3.03, + "grad_norm": 2.4407102227373794, + "learning_rate": 4.888275919732442e-06, + "loss": 0.1298, + "step": 76900 + }, + { + "epoch": 3.03, + "grad_norm": 2.6333427547264407, + "learning_rate": 4.8882341137123755e-06, + "loss": 0.1372, + "step": 76925 + }, + { + "epoch": 3.03, + "grad_norm": 2.4980445842112227, + "learning_rate": 4.888192307692308e-06, + "loss": 0.1213, + "step": 76950 + }, + { + "epoch": 3.03, + "grad_norm": 3.410736806496833, + "learning_rate": 4.888150501672242e-06, + "loss": 0.1324, + "step": 76975 + }, + { + "epoch": 3.03, + "grad_norm": 1.9758670528816349, + "learning_rate": 4.8881086956521744e-06, + "loss": 0.1302, + "step": 77000 + }, + { + "epoch": 3.03, + "grad_norm": 3.4608232211672334, + "learning_rate": 4.888066889632108e-06, + "loss": 0.1157, + "step": 77025 + }, + { + "epoch": 3.03, + "grad_norm": 3.9971640082189137, + "learning_rate": 4.888025083612041e-06, + "loss": 0.1116, + "step": 77050 + }, + { + "epoch": 3.03, + "grad_norm": 3.7059743493192046, + "learning_rate": 4.887983277591974e-06, + "loss": 0.1168, + "step": 77075 + }, + { + "epoch": 3.03, + "grad_norm": 3.525869534264302, + "learning_rate": 4.887941471571907e-06, + "loss": 0.1272, + "step": 77100 + }, + { + "epoch": 3.03, + "grad_norm": 3.3799153930239405, + "learning_rate": 4.88789966555184e-06, + "loss": 0.1373, + "step": 77125 + }, + { + "epoch": 3.04, + "grad_norm": 2.979968630998421, + "learning_rate": 4.887857859531772e-06, + "loss": 0.1173, + "step": 77150 + }, + { + "epoch": 3.04, + "grad_norm": 3.116452957430253, + "learning_rate": 4.887816053511706e-06, + "loss": 0.1313, + "step": 77175 + }, + { + "epoch": 3.04, + "grad_norm": 4.502809671470845, + "learning_rate": 4.8877742474916386e-06, + "loss": 0.1211, + "step": 77200 + }, + { + "epoch": 3.04, + "grad_norm": 3.3430787138363804, + "learning_rate": 4.887732441471572e-06, + "loss": 0.1137, + "step": 77225 + }, + { + "epoch": 3.04, + "grad_norm": 3.4815832417449935, + "learning_rate": 4.887690635451506e-06, + "loss": 0.124, + "step": 77250 + }, + { + "epoch": 3.04, + "grad_norm": 2.8682967863246294, + "learning_rate": 4.887648829431438e-06, + "loss": 0.1292, + "step": 77275 + }, + { + "epoch": 3.04, + "grad_norm": 3.5200313476756526, + "learning_rate": 4.887607023411372e-06, + "loss": 0.1175, + "step": 77300 + }, + { + "epoch": 3.04, + "grad_norm": 3.7267567794932743, + "learning_rate": 4.887565217391305e-06, + "loss": 0.1252, + "step": 77325 + }, + { + "epoch": 3.04, + "grad_norm": 3.9971687272656258, + "learning_rate": 4.887523411371238e-06, + "loss": 0.1274, + "step": 77350 + }, + { + "epoch": 3.04, + "grad_norm": 4.970670446361493, + "learning_rate": 4.887481605351171e-06, + "loss": 0.1329, + "step": 77375 + }, + { + "epoch": 3.04, + "grad_norm": 2.4841506585451643, + "learning_rate": 4.887439799331104e-06, + "loss": 0.1214, + "step": 77400 + }, + { + "epoch": 3.05, + "grad_norm": 4.295568567553117, + "learning_rate": 4.887397993311037e-06, + "loss": 0.1232, + "step": 77425 + }, + { + "epoch": 3.05, + "grad_norm": 4.063655366491959, + "learning_rate": 4.887356187290971e-06, + "loss": 0.1326, + "step": 77450 + }, + { + "epoch": 3.05, + "grad_norm": 2.8435059530091262, + "learning_rate": 4.887314381270903e-06, + "loss": 0.1242, + "step": 77475 + }, + { + "epoch": 3.05, + "grad_norm": 2.1310397377270602, + "learning_rate": 4.887272575250837e-06, + "loss": 0.1181, + "step": 77500 + }, + { + "epoch": 3.05, + "grad_norm": 3.3838400346860897, + "learning_rate": 4.8872307692307696e-06, + "loss": 0.1243, + "step": 77525 + }, + { + "epoch": 3.05, + "grad_norm": 3.642961468985254, + "learning_rate": 4.887188963210703e-06, + "loss": 0.1402, + "step": 77550 + }, + { + "epoch": 3.05, + "grad_norm": 2.593211405878639, + "learning_rate": 4.887147157190636e-06, + "loss": 0.1219, + "step": 77575 + }, + { + "epoch": 3.05, + "grad_norm": 4.739286033182659, + "learning_rate": 4.887105351170569e-06, + "loss": 0.1313, + "step": 77600 + }, + { + "epoch": 3.05, + "grad_norm": 3.05581489965152, + "learning_rate": 4.887063545150502e-06, + "loss": 0.1201, + "step": 77625 + }, + { + "epoch": 3.05, + "grad_norm": 3.0899269653481745, + "learning_rate": 4.887021739130436e-06, + "loss": 0.1247, + "step": 77650 + }, + { + "epoch": 3.06, + "grad_norm": 3.758598812637565, + "learning_rate": 4.886979933110368e-06, + "loss": 0.1213, + "step": 77675 + }, + { + "epoch": 3.06, + "grad_norm": 3.6037831919337875, + "learning_rate": 4.886938127090302e-06, + "loss": 0.128, + "step": 77700 + }, + { + "epoch": 3.06, + "grad_norm": 4.451175536710578, + "learning_rate": 4.8868963210702345e-06, + "loss": 0.1311, + "step": 77725 + }, + { + "epoch": 3.06, + "grad_norm": 3.09548461829267, + "learning_rate": 4.886854515050168e-06, + "loss": 0.1207, + "step": 77750 + }, + { + "epoch": 3.06, + "grad_norm": 3.689827710615845, + "learning_rate": 4.886812709030101e-06, + "loss": 0.1152, + "step": 77775 + }, + { + "epoch": 3.06, + "grad_norm": 3.7771578128301226, + "learning_rate": 4.886770903010034e-06, + "loss": 0.1242, + "step": 77800 + }, + { + "epoch": 3.06, + "grad_norm": 4.031573600779883, + "learning_rate": 4.886729096989967e-06, + "loss": 0.1324, + "step": 77825 + }, + { + "epoch": 3.06, + "grad_norm": 3.426503558716008, + "learning_rate": 4.8866872909699e-06, + "loss": 0.1173, + "step": 77850 + }, + { + "epoch": 3.06, + "grad_norm": 2.4533360047703456, + "learning_rate": 4.886645484949833e-06, + "loss": 0.1177, + "step": 77875 + }, + { + "epoch": 3.06, + "grad_norm": 3.6081638322635046, + "learning_rate": 4.886603678929766e-06, + "loss": 0.1318, + "step": 77900 + }, + { + "epoch": 3.07, + "grad_norm": 3.651931508167466, + "learning_rate": 4.8865618729096995e-06, + "loss": 0.1278, + "step": 77925 + }, + { + "epoch": 3.07, + "grad_norm": 3.133415821529739, + "learning_rate": 4.886520066889632e-06, + "loss": 0.1227, + "step": 77950 + }, + { + "epoch": 3.07, + "grad_norm": 2.6432438963375477, + "learning_rate": 4.886478260869566e-06, + "loss": 0.1226, + "step": 77975 + }, + { + "epoch": 3.07, + "grad_norm": 4.072468610495336, + "learning_rate": 4.8864364548494985e-06, + "loss": 0.1336, + "step": 78000 + }, + { + "epoch": 3.07, + "grad_norm": 3.607977716821972, + "learning_rate": 4.886394648829432e-06, + "loss": 0.1259, + "step": 78025 + }, + { + "epoch": 3.07, + "grad_norm": 3.192830764597763, + "learning_rate": 4.886352842809365e-06, + "loss": 0.1216, + "step": 78050 + }, + { + "epoch": 3.07, + "grad_norm": 3.583472664299709, + "learning_rate": 4.886311036789298e-06, + "loss": 0.1296, + "step": 78075 + }, + { + "epoch": 3.07, + "grad_norm": 3.1812193947409777, + "learning_rate": 4.886269230769231e-06, + "loss": 0.1129, + "step": 78100 + }, + { + "epoch": 3.07, + "grad_norm": 3.5681957955883603, + "learning_rate": 4.8862274247491645e-06, + "loss": 0.1257, + "step": 78125 + }, + { + "epoch": 3.07, + "grad_norm": 4.3655757139060585, + "learning_rate": 4.886185618729097e-06, + "loss": 0.1176, + "step": 78150 + }, + { + "epoch": 3.08, + "grad_norm": 3.627286736907382, + "learning_rate": 4.886143812709031e-06, + "loss": 0.128, + "step": 78175 + }, + { + "epoch": 3.08, + "grad_norm": 4.110026082315242, + "learning_rate": 4.8861020066889634e-06, + "loss": 0.1314, + "step": 78200 + }, + { + "epoch": 3.08, + "grad_norm": 3.540369512342808, + "learning_rate": 4.886060200668897e-06, + "loss": 0.1242, + "step": 78225 + }, + { + "epoch": 3.08, + "grad_norm": 3.0234131517149936, + "learning_rate": 4.886020066889632e-06, + "loss": 0.1132, + "step": 78250 + }, + { + "epoch": 3.08, + "grad_norm": 3.4765331389271035, + "learning_rate": 4.8859782608695655e-06, + "loss": 0.1241, + "step": 78275 + }, + { + "epoch": 3.08, + "grad_norm": 3.7796806155271914, + "learning_rate": 4.885936454849498e-06, + "loss": 0.1158, + "step": 78300 + }, + { + "epoch": 3.08, + "grad_norm": 2.8841477452018838, + "learning_rate": 4.885894648829432e-06, + "loss": 0.1493, + "step": 78325 + }, + { + "epoch": 3.08, + "grad_norm": 4.4433914203454625, + "learning_rate": 4.885852842809365e-06, + "loss": 0.1196, + "step": 78350 + }, + { + "epoch": 3.08, + "grad_norm": 2.75884085554516, + "learning_rate": 4.885811036789298e-06, + "loss": 0.1188, + "step": 78375 + }, + { + "epoch": 3.08, + "grad_norm": 4.273808052650839, + "learning_rate": 4.8857692307692316e-06, + "loss": 0.1217, + "step": 78400 + }, + { + "epoch": 3.09, + "grad_norm": 3.897334135564084, + "learning_rate": 4.885727424749164e-06, + "loss": 0.1286, + "step": 78425 + }, + { + "epoch": 3.09, + "grad_norm": 4.615939251349839, + "learning_rate": 4.885685618729098e-06, + "loss": 0.1362, + "step": 78450 + }, + { + "epoch": 3.09, + "grad_norm": 3.730304960317534, + "learning_rate": 4.8856438127090305e-06, + "loss": 0.1344, + "step": 78475 + }, + { + "epoch": 3.09, + "grad_norm": 3.5702453678071944, + "learning_rate": 4.885602006688964e-06, + "loss": 0.1254, + "step": 78500 + }, + { + "epoch": 3.09, + "grad_norm": 2.9459198258987245, + "learning_rate": 4.885560200668897e-06, + "loss": 0.1155, + "step": 78525 + }, + { + "epoch": 3.09, + "grad_norm": 2.9422269730368202, + "learning_rate": 4.88551839464883e-06, + "loss": 0.1341, + "step": 78550 + }, + { + "epoch": 3.09, + "grad_norm": 4.345971398090705, + "learning_rate": 4.885476588628763e-06, + "loss": 0.125, + "step": 78575 + }, + { + "epoch": 3.09, + "grad_norm": 3.4203734766216543, + "learning_rate": 4.8854347826086965e-06, + "loss": 0.1219, + "step": 78600 + }, + { + "epoch": 3.09, + "grad_norm": 2.7450396553454763, + "learning_rate": 4.885392976588629e-06, + "loss": 0.1273, + "step": 78625 + }, + { + "epoch": 3.09, + "grad_norm": 2.6718974934824007, + "learning_rate": 4.885351170568563e-06, + "loss": 0.1431, + "step": 78650 + }, + { + "epoch": 3.1, + "grad_norm": 2.853838303655648, + "learning_rate": 4.8853093645484955e-06, + "loss": 0.125, + "step": 78675 + }, + { + "epoch": 3.1, + "grad_norm": 2.9709753361358064, + "learning_rate": 4.885267558528429e-06, + "loss": 0.1244, + "step": 78700 + }, + { + "epoch": 3.1, + "grad_norm": 4.023857278065838, + "learning_rate": 4.885225752508362e-06, + "loss": 0.1293, + "step": 78725 + }, + { + "epoch": 3.1, + "grad_norm": 4.373864745184071, + "learning_rate": 4.885183946488295e-06, + "loss": 0.1319, + "step": 78750 + }, + { + "epoch": 3.1, + "grad_norm": 2.508205959991612, + "learning_rate": 4.885142140468228e-06, + "loss": 0.1273, + "step": 78775 + }, + { + "epoch": 3.1, + "grad_norm": 2.6411187927711772, + "learning_rate": 4.885100334448161e-06, + "loss": 0.1311, + "step": 78800 + }, + { + "epoch": 3.1, + "grad_norm": 3.3179749804960834, + "learning_rate": 4.885058528428094e-06, + "loss": 0.1206, + "step": 78825 + }, + { + "epoch": 3.1, + "grad_norm": 3.7366552197499647, + "learning_rate": 4.885016722408027e-06, + "loss": 0.1246, + "step": 78850 + }, + { + "epoch": 3.1, + "grad_norm": 3.118992258293641, + "learning_rate": 4.8849749163879605e-06, + "loss": 0.1126, + "step": 78875 + }, + { + "epoch": 3.1, + "grad_norm": 2.056430415904338, + "learning_rate": 4.884933110367893e-06, + "loss": 0.1069, + "step": 78900 + }, + { + "epoch": 3.1, + "grad_norm": 3.785563939242501, + "learning_rate": 4.884891304347827e-06, + "loss": 0.1279, + "step": 78925 + }, + { + "epoch": 3.11, + "grad_norm": 3.522011901121123, + "learning_rate": 4.884849498327759e-06, + "loss": 0.1221, + "step": 78950 + }, + { + "epoch": 3.11, + "grad_norm": 2.815780165397175, + "learning_rate": 4.884807692307693e-06, + "loss": 0.1252, + "step": 78975 + }, + { + "epoch": 3.11, + "grad_norm": 3.6460270553802965, + "learning_rate": 4.884765886287626e-06, + "loss": 0.1291, + "step": 79000 + }, + { + "epoch": 3.11, + "grad_norm": 3.267409610114439, + "learning_rate": 4.884724080267559e-06, + "loss": 0.131, + "step": 79025 + }, + { + "epoch": 3.11, + "grad_norm": 3.512502522504111, + "learning_rate": 4.884682274247492e-06, + "loss": 0.1193, + "step": 79050 + }, + { + "epoch": 3.11, + "grad_norm": 4.876716295837919, + "learning_rate": 4.8846404682274254e-06, + "loss": 0.1279, + "step": 79075 + }, + { + "epoch": 3.11, + "grad_norm": 4.068141572812643, + "learning_rate": 4.884598662207358e-06, + "loss": 0.1191, + "step": 79100 + }, + { + "epoch": 3.11, + "grad_norm": 2.57695746518627, + "learning_rate": 4.884556856187292e-06, + "loss": 0.1218, + "step": 79125 + }, + { + "epoch": 3.11, + "grad_norm": 2.8362615333679826, + "learning_rate": 4.884515050167224e-06, + "loss": 0.1169, + "step": 79150 + }, + { + "epoch": 3.11, + "grad_norm": 3.7181091078142487, + "learning_rate": 4.884473244147158e-06, + "loss": 0.1271, + "step": 79175 + }, + { + "epoch": 3.12, + "grad_norm": 2.562858519582811, + "learning_rate": 4.884431438127091e-06, + "loss": 0.132, + "step": 79200 + }, + { + "epoch": 3.12, + "grad_norm": 3.226476454470988, + "learning_rate": 4.884389632107024e-06, + "loss": 0.1202, + "step": 79225 + }, + { + "epoch": 3.12, + "grad_norm": 3.251373898366226, + "learning_rate": 4.884349498327759e-06, + "loss": 0.1468, + "step": 79250 + }, + { + "epoch": 3.12, + "grad_norm": 3.198441179981739, + "learning_rate": 4.884307692307693e-06, + "loss": 0.1273, + "step": 79275 + }, + { + "epoch": 3.12, + "grad_norm": 3.0629650236020756, + "learning_rate": 4.884265886287625e-06, + "loss": 0.1231, + "step": 79300 + }, + { + "epoch": 3.12, + "grad_norm": 3.2827139225909976, + "learning_rate": 4.884224080267559e-06, + "loss": 0.1404, + "step": 79325 + }, + { + "epoch": 3.12, + "grad_norm": 3.144707188115353, + "learning_rate": 4.884182274247492e-06, + "loss": 0.1298, + "step": 79350 + }, + { + "epoch": 3.12, + "grad_norm": 2.9421101700328185, + "learning_rate": 4.884140468227425e-06, + "loss": 0.1405, + "step": 79375 + }, + { + "epoch": 3.12, + "grad_norm": 4.59682086895325, + "learning_rate": 4.884098662207358e-06, + "loss": 0.1441, + "step": 79400 + }, + { + "epoch": 3.12, + "grad_norm": 3.391160492130251, + "learning_rate": 4.884056856187291e-06, + "loss": 0.1375, + "step": 79425 + }, + { + "epoch": 3.13, + "grad_norm": 3.180088795295559, + "learning_rate": 4.884015050167225e-06, + "loss": 0.1287, + "step": 79450 + }, + { + "epoch": 3.13, + "grad_norm": 4.570107888816917, + "learning_rate": 4.883973244147158e-06, + "loss": 0.1255, + "step": 79475 + }, + { + "epoch": 3.13, + "grad_norm": 3.0789019235619275, + "learning_rate": 4.883931438127091e-06, + "loss": 0.1254, + "step": 79500 + }, + { + "epoch": 3.13, + "grad_norm": 3.5793084776409003, + "learning_rate": 4.883889632107024e-06, + "loss": 0.1181, + "step": 79525 + }, + { + "epoch": 3.13, + "grad_norm": 3.797918053156867, + "learning_rate": 4.8838478260869575e-06, + "loss": 0.1248, + "step": 79550 + }, + { + "epoch": 3.13, + "grad_norm": 3.027925691307738, + "learning_rate": 4.88380602006689e-06, + "loss": 0.1218, + "step": 79575 + }, + { + "epoch": 3.13, + "grad_norm": 5.0368441312464, + "learning_rate": 4.883764214046824e-06, + "loss": 0.1157, + "step": 79600 + }, + { + "epoch": 3.13, + "grad_norm": 3.8288632838026757, + "learning_rate": 4.883722408026756e-06, + "loss": 0.1156, + "step": 79625 + }, + { + "epoch": 3.13, + "grad_norm": 2.597067146027184, + "learning_rate": 4.88368060200669e-06, + "loss": 0.1297, + "step": 79650 + }, + { + "epoch": 3.13, + "grad_norm": 3.500369841183393, + "learning_rate": 4.883638795986623e-06, + "loss": 0.1239, + "step": 79675 + }, + { + "epoch": 3.14, + "grad_norm": 2.9150129908770857, + "learning_rate": 4.883596989966556e-06, + "loss": 0.1264, + "step": 79700 + }, + { + "epoch": 3.14, + "grad_norm": 3.720662911601689, + "learning_rate": 4.883555183946488e-06, + "loss": 0.1289, + "step": 79725 + }, + { + "epoch": 3.14, + "grad_norm": 3.340766578368334, + "learning_rate": 4.883513377926422e-06, + "loss": 0.1416, + "step": 79750 + }, + { + "epoch": 3.14, + "grad_norm": 3.3314964950611614, + "learning_rate": 4.883471571906354e-06, + "loss": 0.1245, + "step": 79775 + }, + { + "epoch": 3.14, + "grad_norm": 4.481987339852263, + "learning_rate": 4.883429765886288e-06, + "loss": 0.1423, + "step": 79800 + }, + { + "epoch": 3.14, + "grad_norm": 2.6168443489008637, + "learning_rate": 4.8833879598662205e-06, + "loss": 0.1343, + "step": 79825 + }, + { + "epoch": 3.14, + "grad_norm": 3.9455102466510907, + "learning_rate": 4.883346153846154e-06, + "loss": 0.1227, + "step": 79850 + }, + { + "epoch": 3.14, + "grad_norm": 2.6209413472131025, + "learning_rate": 4.883304347826088e-06, + "loss": 0.1405, + "step": 79875 + }, + { + "epoch": 3.14, + "grad_norm": 4.187224029783121, + "learning_rate": 4.88326254180602e-06, + "loss": 0.1346, + "step": 79900 + }, + { + "epoch": 3.14, + "grad_norm": 3.9706536290794507, + "learning_rate": 4.883220735785954e-06, + "loss": 0.1334, + "step": 79925 + }, + { + "epoch": 3.15, + "grad_norm": 3.1222225652663607, + "learning_rate": 4.8831789297658866e-06, + "loss": 0.1165, + "step": 79950 + }, + { + "epoch": 3.15, + "grad_norm": 3.1495103383983034, + "learning_rate": 4.88313712374582e-06, + "loss": 0.119, + "step": 79975 + }, + { + "epoch": 3.15, + "grad_norm": 3.5192842049873176, + "learning_rate": 4.883095317725753e-06, + "loss": 0.121, + "step": 80000 + }, + { + "epoch": 3.15, + "eval_loss": 0.5361328125, + "eval_runtime": 11580.0151, + "eval_samples_per_second": 0.818, + "eval_steps_per_second": 0.051, + "eval_wer": 0.11300934639576597, + "step": 80000 + }, + { + "epoch": 3.15, + "grad_norm": 3.382723320522212, + "learning_rate": 1.4347222222222223e-07, + "loss": 0.1257, + "step": 80025 + }, + { + "epoch": 3.15, + "grad_norm": 3.7499859028087914, + "learning_rate": 1.417361111111111e-07, + "loss": 0.1128, + "step": 80050 + }, + { + "epoch": 3.15, + "grad_norm": 2.9592785016890195, + "learning_rate": 1.4e-07, + "loss": 0.1195, + "step": 80075 + }, + { + "epoch": 3.15, + "grad_norm": 3.4893216360464137, + "learning_rate": 1.3826388888888889e-07, + "loss": 0.1313, + "step": 80100 + }, + { + "epoch": 3.15, + "grad_norm": 3.719554257846966, + "learning_rate": 1.365277777777778e-07, + "loss": 0.1113, + "step": 80125 + }, + { + "epoch": 3.15, + "grad_norm": 3.3150805950550217, + "learning_rate": 1.347916666666667e-07, + "loss": 0.124, + "step": 80150 + }, + { + "epoch": 3.15, + "grad_norm": 3.810734588250568, + "learning_rate": 1.3305555555555557e-07, + "loss": 0.1217, + "step": 80175 + }, + { + "epoch": 3.15, + "grad_norm": 2.616845456697401, + "learning_rate": 1.3131944444444444e-07, + "loss": 0.1277, + "step": 80200 + }, + { + "epoch": 3.16, + "grad_norm": 3.2366291515709578, + "learning_rate": 1.2958333333333335e-07, + "loss": 0.1152, + "step": 80225 + }, + { + "epoch": 3.16, + "grad_norm": 3.7270587123667935, + "learning_rate": 1.2791666666666668e-07, + "loss": 0.1214, + "step": 80250 + }, + { + "epoch": 3.16, + "grad_norm": 3.79323438662421, + "learning_rate": 1.2618055555555558e-07, + "loss": 0.1164, + "step": 80275 + }, + { + "epoch": 3.16, + "grad_norm": 3.388340458302277, + "learning_rate": 1.2444444444444446e-07, + "loss": 0.1053, + "step": 80300 + }, + { + "epoch": 3.16, + "grad_norm": 3.0068550159409613, + "learning_rate": 1.2270833333333333e-07, + "loss": 0.1363, + "step": 80325 + }, + { + "epoch": 3.16, + "grad_norm": 2.6705820472591024, + "learning_rate": 1.2097222222222223e-07, + "loss": 0.1184, + "step": 80350 + }, + { + "epoch": 3.16, + "grad_norm": 3.6711884295544013, + "learning_rate": 1.192361111111111e-07, + "loss": 0.1183, + "step": 80375 + }, + { + "epoch": 3.16, + "grad_norm": 4.303935705314459, + "learning_rate": 1.1750000000000001e-07, + "loss": 0.1312, + "step": 80400 + }, + { + "epoch": 3.16, + "grad_norm": 3.0047973076097407, + "learning_rate": 1.157638888888889e-07, + "loss": 0.1308, + "step": 80425 + }, + { + "epoch": 3.16, + "grad_norm": 3.809379593211185, + "learning_rate": 1.1402777777777778e-07, + "loss": 0.1167, + "step": 80450 + }, + { + "epoch": 3.16, + "grad_norm": 2.9141385074256534, + "learning_rate": 1.1229166666666668e-07, + "loss": 0.1215, + "step": 80475 + }, + { + "epoch": 3.17, + "grad_norm": 3.6712555964990172, + "learning_rate": 1.1055555555555557e-07, + "loss": 0.1307, + "step": 80500 + }, + { + "epoch": 3.17, + "grad_norm": 2.821579337006869, + "learning_rate": 1.0881944444444445e-07, + "loss": 0.1114, + "step": 80525 + }, + { + "epoch": 3.17, + "grad_norm": 3.7286248730869658, + "learning_rate": 1.0708333333333335e-07, + "loss": 0.1346, + "step": 80550 + }, + { + "epoch": 3.17, + "grad_norm": 3.0656348499738315, + "learning_rate": 1.0534722222222223e-07, + "loss": 0.1149, + "step": 80575 + }, + { + "epoch": 3.17, + "grad_norm": 4.331509026966981, + "learning_rate": 1.0361111111111111e-07, + "loss": 0.1107, + "step": 80600 + }, + { + "epoch": 3.17, + "grad_norm": 1.8774186348904527, + "learning_rate": 1.0187500000000002e-07, + "loss": 0.1117, + "step": 80625 + }, + { + "epoch": 3.17, + "grad_norm": 2.706357801131824, + "learning_rate": 1.0013888888888889e-07, + "loss": 0.1129, + "step": 80650 + }, + { + "epoch": 3.17, + "grad_norm": 3.5291983010831305, + "learning_rate": 9.840277777777778e-08, + "loss": 0.1276, + "step": 80675 + }, + { + "epoch": 3.17, + "grad_norm": 2.5884193020093105, + "learning_rate": 9.666666666666669e-08, + "loss": 0.1228, + "step": 80700 + }, + { + "epoch": 3.17, + "grad_norm": 3.065588647509654, + "learning_rate": 9.493055555555556e-08, + "loss": 0.1282, + "step": 80725 + }, + { + "epoch": 3.18, + "grad_norm": 3.4441302429520957, + "learning_rate": 9.319444444444445e-08, + "loss": 0.1109, + "step": 80750 + }, + { + "epoch": 3.18, + "grad_norm": 3.568478894528398, + "learning_rate": 9.145833333333335e-08, + "loss": 0.115, + "step": 80775 + }, + { + "epoch": 3.18, + "grad_norm": 3.2298861894081052, + "learning_rate": 8.972222222222223e-08, + "loss": 0.1079, + "step": 80800 + }, + { + "epoch": 3.18, + "grad_norm": 3.0351733682866784, + "learning_rate": 8.798611111111112e-08, + "loss": 0.1155, + "step": 80825 + }, + { + "epoch": 3.18, + "grad_norm": 2.8423943298154777, + "learning_rate": 8.625000000000001e-08, + "loss": 0.1058, + "step": 80850 + }, + { + "epoch": 3.18, + "grad_norm": 4.239375206161924, + "learning_rate": 8.45138888888889e-08, + "loss": 0.1202, + "step": 80875 + }, + { + "epoch": 3.18, + "grad_norm": 3.9414210967413044, + "learning_rate": 8.277777777777779e-08, + "loss": 0.1226, + "step": 80900 + }, + { + "epoch": 3.18, + "grad_norm": 2.6741087247987356, + "learning_rate": 8.104166666666668e-08, + "loss": 0.1131, + "step": 80925 + }, + { + "epoch": 3.18, + "grad_norm": 2.470938835323759, + "learning_rate": 7.930555555555557e-08, + "loss": 0.1006, + "step": 80950 + }, + { + "epoch": 3.18, + "grad_norm": 3.1023252839310533, + "learning_rate": 7.756944444444445e-08, + "loss": 0.1192, + "step": 80975 + }, + { + "epoch": 3.19, + "grad_norm": 3.1292889463150133, + "learning_rate": 7.583333333333334e-08, + "loss": 0.1111, + "step": 81000 + }, + { + "epoch": 3.19, + "grad_norm": 2.8191984119816373, + "learning_rate": 7.409722222222222e-08, + "loss": 0.1214, + "step": 81025 + }, + { + "epoch": 3.19, + "grad_norm": 2.699005654746743, + "learning_rate": 7.236111111111111e-08, + "loss": 0.1083, + "step": 81050 + }, + { + "epoch": 3.19, + "grad_norm": 2.5511639322171162, + "learning_rate": 7.062500000000001e-08, + "loss": 0.1028, + "step": 81075 + }, + { + "epoch": 3.19, + "grad_norm": 4.017135101608909, + "learning_rate": 6.888888888888889e-08, + "loss": 0.108, + "step": 81100 + }, + { + "epoch": 3.19, + "grad_norm": 3.354633095244795, + "learning_rate": 6.715277777777778e-08, + "loss": 0.1187, + "step": 81125 + }, + { + "epoch": 3.19, + "grad_norm": 2.968280201052923, + "learning_rate": 6.541666666666668e-08, + "loss": 0.1117, + "step": 81150 + }, + { + "epoch": 3.19, + "grad_norm": 3.441489150840829, + "learning_rate": 6.368055555555556e-08, + "loss": 0.1168, + "step": 81175 + }, + { + "epoch": 3.19, + "grad_norm": 3.397176047085528, + "learning_rate": 6.194444444444445e-08, + "loss": 0.1167, + "step": 81200 + }, + { + "epoch": 3.19, + "grad_norm": 3.7413484965620807, + "learning_rate": 6.020833333333335e-08, + "loss": 0.1245, + "step": 81225 + }, + { + "epoch": 3.2, + "grad_norm": 3.368881934301906, + "learning_rate": 5.8541666666666666e-08, + "loss": 0.1164, + "step": 81250 + }, + { + "epoch": 3.2, + "grad_norm": 3.5376665390407798, + "learning_rate": 5.680555555555556e-08, + "loss": 0.1125, + "step": 81275 + }, + { + "epoch": 3.2, + "grad_norm": 3.071272749793487, + "learning_rate": 5.506944444444445e-08, + "loss": 0.1043, + "step": 81300 + }, + { + "epoch": 3.2, + "grad_norm": 2.8957813289832415, + "learning_rate": 5.3333333333333334e-08, + "loss": 0.1227, + "step": 81325 + }, + { + "epoch": 3.2, + "grad_norm": 3.177849245107898, + "learning_rate": 5.1597222222222224e-08, + "loss": 0.1151, + "step": 81350 + }, + { + "epoch": 3.2, + "grad_norm": 3.036992433502032, + "learning_rate": 4.986111111111112e-08, + "loss": 0.1067, + "step": 81375 + }, + { + "epoch": 3.2, + "grad_norm": 3.2947567806935134, + "learning_rate": 4.8125e-08, + "loss": 0.1169, + "step": 81400 + }, + { + "epoch": 3.2, + "grad_norm": 2.6033716960489306, + "learning_rate": 4.638888888888889e-08, + "loss": 0.1308, + "step": 81425 + }, + { + "epoch": 3.2, + "grad_norm": 3.271050187274925, + "learning_rate": 4.465277777777779e-08, + "loss": 0.1216, + "step": 81450 + }, + { + "epoch": 3.2, + "grad_norm": 3.228010138283361, + "learning_rate": 4.291666666666667e-08, + "loss": 0.1178, + "step": 81475 + }, + { + "epoch": 3.21, + "grad_norm": 1.4784343520529, + "learning_rate": 4.118055555555556e-08, + "loss": 0.1104, + "step": 81500 + }, + { + "epoch": 3.21, + "grad_norm": 3.007137030167928, + "learning_rate": 3.944444444444445e-08, + "loss": 0.1063, + "step": 81525 + }, + { + "epoch": 3.21, + "grad_norm": 3.391635916189553, + "learning_rate": 3.770833333333334e-08, + "loss": 0.1057, + "step": 81550 + }, + { + "epoch": 3.21, + "grad_norm": 2.9441226132043647, + "learning_rate": 3.597222222222222e-08, + "loss": 0.1265, + "step": 81575 + }, + { + "epoch": 3.21, + "grad_norm": 2.4559004956821036, + "learning_rate": 3.423611111111112e-08, + "loss": 0.107, + "step": 81600 + }, + { + "epoch": 3.21, + "grad_norm": 3.1518532804348625, + "learning_rate": 3.25e-08, + "loss": 0.1375, + "step": 81625 + }, + { + "epoch": 3.21, + "grad_norm": 2.8691924917711353, + "learning_rate": 3.076388888888889e-08, + "loss": 0.1073, + "step": 81650 + }, + { + "epoch": 3.21, + "grad_norm": 2.866450101574571, + "learning_rate": 2.902777777777778e-08, + "loss": 0.1081, + "step": 81675 + }, + { + "epoch": 3.21, + "grad_norm": 4.055904824711259, + "learning_rate": 2.7291666666666668e-08, + "loss": 0.1108, + "step": 81700 + }, + { + "epoch": 3.21, + "grad_norm": 3.35548877818344, + "learning_rate": 2.555555555555556e-08, + "loss": 0.1175, + "step": 81725 + }, + { + "epoch": 3.22, + "grad_norm": 3.0025022533511128, + "learning_rate": 2.3819444444444447e-08, + "loss": 0.1196, + "step": 81750 + }, + { + "epoch": 3.22, + "grad_norm": 3.9601682959534625, + "learning_rate": 2.2083333333333336e-08, + "loss": 0.121, + "step": 81775 + }, + { + "epoch": 3.22, + "grad_norm": 3.5576863922110955, + "learning_rate": 2.0347222222222222e-08, + "loss": 0.1156, + "step": 81800 + }, + { + "epoch": 3.22, + "grad_norm": 3.0524980328514615, + "learning_rate": 1.861111111111111e-08, + "loss": 0.107, + "step": 81825 + }, + { + "epoch": 3.22, + "grad_norm": 2.5465332535964564, + "learning_rate": 1.6875e-08, + "loss": 0.1104, + "step": 81850 + }, + { + "epoch": 3.22, + "grad_norm": 3.9758255525233475, + "learning_rate": 1.513888888888889e-08, + "loss": 0.1192, + "step": 81875 + }, + { + "epoch": 3.22, + "grad_norm": 3.156736393762649, + "learning_rate": 1.3402777777777778e-08, + "loss": 0.1246, + "step": 81900 + }, + { + "epoch": 3.22, + "grad_norm": 2.193575947789099, + "learning_rate": 1.1666666666666669e-08, + "loss": 0.1183, + "step": 81925 + }, + { + "epoch": 3.22, + "grad_norm": 4.202801227471746, + "learning_rate": 9.930555555555556e-09, + "loss": 0.1035, + "step": 81950 + }, + { + "epoch": 3.22, + "grad_norm": 4.617004459055916, + "learning_rate": 8.194444444444446e-09, + "loss": 0.1142, + "step": 81975 + }, + { + "epoch": 3.22, + "grad_norm": 3.2809986802623974, + "learning_rate": 6.458333333333334e-09, + "loss": 0.1231, + "step": 82000 + }, + { + "epoch": 3.22, + "step": 82000, + "total_flos": 7.219553262695875e+18, + "train_loss": 0.0028538422119326707, + "train_runtime": 23952.0519, + "train_samples_per_second": 109.552, + "train_steps_per_second": 3.424 + } + ], + "logging_steps": 25, + "max_steps": 82000, + "num_input_tokens_seen": 0, + "num_train_epochs": 4, + "save_steps": 5000, + "total_flos": 7.219553262695875e+18, + "train_batch_size": 16, + "trial_name": null, + "trial_params": null +}