{ "best_metric": null, "best_model_checkpoint": null, "epoch": 98.96907216494846, "eval_steps": 500, "global_step": 2400, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.41237113402061853, "grad_norm": 5.916716575622559, "learning_rate": 2.0833333333333334e-06, "loss": 1.6297, "step": 10 }, { "epoch": 0.8247422680412371, "grad_norm": 5.051618576049805, "learning_rate": 4.166666666666667e-06, "loss": 1.613, "step": 20 }, { "epoch": 0.9896907216494846, "eval_accuracy": 0.2927536231884058, "eval_loss": 1.5833344459533691, "eval_precision": 0.3247879943590829, "eval_recall": 0.2927536231884058, "eval_runtime": 2.9495, "eval_samples_per_second": 116.97, "eval_steps_per_second": 3.729, "step": 24 }, { "epoch": 1.2371134020618557, "grad_norm": 4.8794169425964355, "learning_rate": 6.25e-06, "loss": 1.5792, "step": 30 }, { "epoch": 1.6494845360824741, "grad_norm": 6.336801052093506, "learning_rate": 8.333333333333334e-06, "loss": 1.5494, "step": 40 }, { "epoch": 1.9793814432989691, "eval_accuracy": 0.3681159420289855, "eval_loss": 1.4944071769714355, "eval_precision": 0.440954469667821, "eval_recall": 0.3681159420289855, "eval_runtime": 1.7863, "eval_samples_per_second": 193.135, "eval_steps_per_second": 6.158, "step": 48 }, { "epoch": 2.0618556701030926, "grad_norm": 8.574434280395508, "learning_rate": 1.0416666666666668e-05, "loss": 1.5014, "step": 50 }, { "epoch": 2.4742268041237114, "grad_norm": 6.564225673675537, "learning_rate": 1.25e-05, "loss": 1.4422, "step": 60 }, { "epoch": 2.88659793814433, "grad_norm": 5.804593086242676, "learning_rate": 1.4583333333333335e-05, "loss": 1.3989, "step": 70 }, { "epoch": 2.9690721649484537, "eval_accuracy": 0.5159420289855072, "eval_loss": 1.3423842191696167, "eval_precision": 0.52619860815513, "eval_recall": 0.5159420289855072, "eval_runtime": 1.8303, "eval_samples_per_second": 188.493, "eval_steps_per_second": 6.01, "step": 72 }, { "epoch": 3.2989690721649483, "grad_norm": 6.893215656280518, "learning_rate": 1.6666666666666667e-05, "loss": 1.2968, "step": 80 }, { "epoch": 3.711340206185567, "grad_norm": 12.37126350402832, "learning_rate": 1.8750000000000002e-05, "loss": 1.2238, "step": 90 }, { "epoch": 4.0, "eval_accuracy": 0.6260869565217392, "eval_loss": 1.1162269115447998, "eval_precision": 0.6665610702002287, "eval_recall": 0.6260869565217392, "eval_runtime": 1.8634, "eval_samples_per_second": 185.144, "eval_steps_per_second": 5.903, "step": 97 }, { "epoch": 4.123711340206185, "grad_norm": 6.501392841339111, "learning_rate": 2.0833333333333336e-05, "loss": 1.1194, "step": 100 }, { "epoch": 4.536082474226804, "grad_norm": 14.653229713439941, "learning_rate": 2.2916666666666667e-05, "loss": 1.0499, "step": 110 }, { "epoch": 4.948453608247423, "grad_norm": 15.2618408203125, "learning_rate": 2.5e-05, "loss": 0.9585, "step": 120 }, { "epoch": 4.989690721649485, "eval_accuracy": 0.6985507246376812, "eval_loss": 0.8966168761253357, "eval_precision": 0.7013922738306568, "eval_recall": 0.6985507246376812, "eval_runtime": 1.8339, "eval_samples_per_second": 188.12, "eval_steps_per_second": 5.998, "step": 121 }, { "epoch": 5.360824742268041, "grad_norm": 12.275806427001953, "learning_rate": 2.7083333333333332e-05, "loss": 0.8986, "step": 130 }, { "epoch": 5.77319587628866, "grad_norm": 15.373220443725586, "learning_rate": 2.916666666666667e-05, "loss": 0.8934, "step": 140 }, { "epoch": 5.979381443298969, "eval_accuracy": 0.7507246376811594, "eval_loss": 0.763816773891449, "eval_precision": 0.7489666881245252, "eval_recall": 0.7507246376811594, "eval_runtime": 1.9332, "eval_samples_per_second": 178.459, "eval_steps_per_second": 5.69, "step": 145 }, { "epoch": 6.185567010309279, "grad_norm": 15.394486427307129, "learning_rate": 3.125e-05, "loss": 0.8326, "step": 150 }, { "epoch": 6.597938144329897, "grad_norm": 14.27376937866211, "learning_rate": 3.3333333333333335e-05, "loss": 0.7589, "step": 160 }, { "epoch": 6.969072164948454, "eval_accuracy": 0.7652173913043478, "eval_loss": 0.6776081919670105, "eval_precision": 0.771906259033061, "eval_recall": 0.7652173913043478, "eval_runtime": 1.836, "eval_samples_per_second": 187.91, "eval_steps_per_second": 5.991, "step": 169 }, { "epoch": 7.010309278350515, "grad_norm": 21.43760871887207, "learning_rate": 3.541666666666667e-05, "loss": 0.7404, "step": 170 }, { "epoch": 7.422680412371134, "grad_norm": 15.207581520080566, "learning_rate": 3.7500000000000003e-05, "loss": 0.653, "step": 180 }, { "epoch": 7.835051546391752, "grad_norm": 25.153663635253906, "learning_rate": 3.958333333333333e-05, "loss": 0.6746, "step": 190 }, { "epoch": 8.0, "eval_accuracy": 0.7623188405797101, "eval_loss": 0.6126735210418701, "eval_precision": 0.7628428431334807, "eval_recall": 0.7623188405797101, "eval_runtime": 1.8501, "eval_samples_per_second": 186.474, "eval_steps_per_second": 5.946, "step": 194 }, { "epoch": 8.24742268041237, "grad_norm": 23.2750301361084, "learning_rate": 4.166666666666667e-05, "loss": 0.6516, "step": 200 }, { "epoch": 8.65979381443299, "grad_norm": 21.777841567993164, "learning_rate": 4.375e-05, "loss": 0.6048, "step": 210 }, { "epoch": 8.989690721649485, "eval_accuracy": 0.8202898550724638, "eval_loss": 0.5220813751220703, "eval_precision": 0.8216835971752063, "eval_recall": 0.8202898550724638, "eval_runtime": 1.8243, "eval_samples_per_second": 189.114, "eval_steps_per_second": 6.03, "step": 218 }, { "epoch": 9.072164948453608, "grad_norm": 15.630614280700684, "learning_rate": 4.5833333333333334e-05, "loss": 0.5723, "step": 220 }, { "epoch": 9.484536082474227, "grad_norm": 13.571239471435547, "learning_rate": 4.791666666666667e-05, "loss": 0.5436, "step": 230 }, { "epoch": 9.896907216494846, "grad_norm": 24.206087112426758, "learning_rate": 5e-05, "loss": 0.531, "step": 240 }, { "epoch": 9.97938144329897, "eval_accuracy": 0.8115942028985508, "eval_loss": 0.4930874705314636, "eval_precision": 0.8203605371226137, "eval_recall": 0.8115942028985508, "eval_runtime": 1.788, "eval_samples_per_second": 192.958, "eval_steps_per_second": 6.152, "step": 242 }, { "epoch": 10.309278350515465, "grad_norm": 17.16573715209961, "learning_rate": 4.976851851851852e-05, "loss": 0.5034, "step": 250 }, { "epoch": 10.721649484536082, "grad_norm": 19.933942794799805, "learning_rate": 4.9537037037037035e-05, "loss": 0.57, "step": 260 }, { "epoch": 10.969072164948454, "eval_accuracy": 0.8318840579710145, "eval_loss": 0.44795188307762146, "eval_precision": 0.8344579895060443, "eval_recall": 0.8318840579710145, "eval_runtime": 1.8183, "eval_samples_per_second": 189.733, "eval_steps_per_second": 6.049, "step": 266 }, { "epoch": 11.1340206185567, "grad_norm": 25.91600799560547, "learning_rate": 4.930555555555556e-05, "loss": 0.4791, "step": 270 }, { "epoch": 11.54639175257732, "grad_norm": 23.493484497070312, "learning_rate": 4.9074074074074075e-05, "loss": 0.4372, "step": 280 }, { "epoch": 11.958762886597938, "grad_norm": 14.273780822753906, "learning_rate": 4.8842592592592595e-05, "loss": 0.4624, "step": 290 }, { "epoch": 12.0, "eval_accuracy": 0.8463768115942029, "eval_loss": 0.42139920592308044, "eval_precision": 0.846014277166443, "eval_recall": 0.8463768115942029, "eval_runtime": 1.7884, "eval_samples_per_second": 192.914, "eval_steps_per_second": 6.151, "step": 291 }, { "epoch": 12.371134020618557, "grad_norm": 26.43771743774414, "learning_rate": 4.8611111111111115e-05, "loss": 0.4509, "step": 300 }, { "epoch": 12.783505154639176, "grad_norm": 29.501718521118164, "learning_rate": 4.837962962962963e-05, "loss": 0.417, "step": 310 }, { "epoch": 12.989690721649485, "eval_accuracy": 0.8492753623188406, "eval_loss": 0.44392213225364685, "eval_precision": 0.8485676738054103, "eval_recall": 0.8492753623188406, "eval_runtime": 1.762, "eval_samples_per_second": 195.797, "eval_steps_per_second": 6.243, "step": 315 }, { "epoch": 13.195876288659793, "grad_norm": 16.380001068115234, "learning_rate": 4.814814814814815e-05, "loss": 0.4042, "step": 320 }, { "epoch": 13.608247422680412, "grad_norm": 26.098731994628906, "learning_rate": 4.791666666666667e-05, "loss": 0.3814, "step": 330 }, { "epoch": 13.97938144329897, "eval_accuracy": 0.8463768115942029, "eval_loss": 0.41379421949386597, "eval_precision": 0.8477774513274812, "eval_recall": 0.8463768115942029, "eval_runtime": 1.7998, "eval_samples_per_second": 191.689, "eval_steps_per_second": 6.112, "step": 339 }, { "epoch": 14.02061855670103, "grad_norm": 13.136883735656738, "learning_rate": 4.768518518518519e-05, "loss": 0.4209, "step": 340 }, { "epoch": 14.43298969072165, "grad_norm": 18.104930877685547, "learning_rate": 4.745370370370371e-05, "loss": 0.3817, "step": 350 }, { "epoch": 14.845360824742269, "grad_norm": 27.79136848449707, "learning_rate": 4.722222222222222e-05, "loss": 0.3737, "step": 360 }, { "epoch": 14.969072164948454, "eval_accuracy": 0.8463768115942029, "eval_loss": 0.41388532519340515, "eval_precision": 0.8466409143288909, "eval_recall": 0.8463768115942029, "eval_runtime": 1.8854, "eval_samples_per_second": 182.983, "eval_steps_per_second": 5.834, "step": 363 }, { "epoch": 15.257731958762886, "grad_norm": 33.14027786254883, "learning_rate": 4.699074074074074e-05, "loss": 0.3782, "step": 370 }, { "epoch": 15.670103092783505, "grad_norm": 10.574623107910156, "learning_rate": 4.675925925925926e-05, "loss": 0.3971, "step": 380 }, { "epoch": 16.0, "eval_accuracy": 0.863768115942029, "eval_loss": 0.4119352400302887, "eval_precision": 0.8664915871553495, "eval_recall": 0.863768115942029, "eval_runtime": 1.8638, "eval_samples_per_second": 185.11, "eval_steps_per_second": 5.902, "step": 388 }, { "epoch": 16.082474226804123, "grad_norm": 14.796497344970703, "learning_rate": 4.652777777777778e-05, "loss": 0.3227, "step": 390 }, { "epoch": 16.49484536082474, "grad_norm": 13.750545501708984, "learning_rate": 4.62962962962963e-05, "loss": 0.306, "step": 400 }, { "epoch": 16.90721649484536, "grad_norm": 15.056818962097168, "learning_rate": 4.6064814814814814e-05, "loss": 0.343, "step": 410 }, { "epoch": 16.989690721649485, "eval_accuracy": 0.8608695652173913, "eval_loss": 0.4421471655368805, "eval_precision": 0.8659298079116737, "eval_recall": 0.8608695652173913, "eval_runtime": 1.7876, "eval_samples_per_second": 192.996, "eval_steps_per_second": 6.154, "step": 412 }, { "epoch": 17.31958762886598, "grad_norm": 19.41351318359375, "learning_rate": 4.5833333333333334e-05, "loss": 0.3383, "step": 420 }, { "epoch": 17.7319587628866, "grad_norm": 22.833810806274414, "learning_rate": 4.5601851851851854e-05, "loss": 0.3311, "step": 430 }, { "epoch": 17.97938144329897, "eval_accuracy": 0.8492753623188406, "eval_loss": 0.45808833837509155, "eval_precision": 0.8503668982654489, "eval_recall": 0.8492753623188406, "eval_runtime": 1.8173, "eval_samples_per_second": 189.846, "eval_steps_per_second": 6.053, "step": 436 }, { "epoch": 18.144329896907216, "grad_norm": 9.80312442779541, "learning_rate": 4.5370370370370374e-05, "loss": 0.301, "step": 440 }, { "epoch": 18.556701030927837, "grad_norm": 17.442903518676758, "learning_rate": 4.5138888888888894e-05, "loss": 0.2594, "step": 450 }, { "epoch": 18.969072164948454, "grad_norm": 25.01900863647461, "learning_rate": 4.490740740740741e-05, "loss": 0.2652, "step": 460 }, { "epoch": 18.969072164948454, "eval_accuracy": 0.8405797101449275, "eval_loss": 0.4563068747520447, "eval_precision": 0.8441116322796441, "eval_recall": 0.8405797101449275, "eval_runtime": 1.8121, "eval_samples_per_second": 190.387, "eval_steps_per_second": 6.07, "step": 460 }, { "epoch": 19.38144329896907, "grad_norm": 22.951929092407227, "learning_rate": 4.467592592592593e-05, "loss": 0.2726, "step": 470 }, { "epoch": 19.79381443298969, "grad_norm": 17.189971923828125, "learning_rate": 4.4444444444444447e-05, "loss": 0.3026, "step": 480 }, { "epoch": 20.0, "eval_accuracy": 0.8521739130434782, "eval_loss": 0.4535578489303589, "eval_precision": 0.8549145070160367, "eval_recall": 0.8521739130434782, "eval_runtime": 1.8156, "eval_samples_per_second": 190.019, "eval_steps_per_second": 6.059, "step": 485 }, { "epoch": 20.20618556701031, "grad_norm": 19.29929542541504, "learning_rate": 4.4212962962962966e-05, "loss": 0.2808, "step": 490 }, { "epoch": 20.61855670103093, "grad_norm": 23.201435089111328, "learning_rate": 4.3981481481481486e-05, "loss": 0.2562, "step": 500 }, { "epoch": 20.989690721649485, "eval_accuracy": 0.8463768115942029, "eval_loss": 0.44093257188796997, "eval_precision": 0.8493084398986088, "eval_recall": 0.8463768115942029, "eval_runtime": 1.9468, "eval_samples_per_second": 177.217, "eval_steps_per_second": 5.65, "step": 509 }, { "epoch": 21.030927835051546, "grad_norm": 12.947028160095215, "learning_rate": 4.375e-05, "loss": 0.2739, "step": 510 }, { "epoch": 21.443298969072163, "grad_norm": 21.544536590576172, "learning_rate": 4.351851851851852e-05, "loss": 0.2383, "step": 520 }, { "epoch": 21.855670103092784, "grad_norm": 12.224617958068848, "learning_rate": 4.328703703703704e-05, "loss": 0.2282, "step": 530 }, { "epoch": 21.97938144329897, "eval_accuracy": 0.8434782608695652, "eval_loss": 0.4388555884361267, "eval_precision": 0.8451190974708183, "eval_recall": 0.8434782608695652, "eval_runtime": 1.7718, "eval_samples_per_second": 194.721, "eval_steps_per_second": 6.208, "step": 533 }, { "epoch": 22.2680412371134, "grad_norm": 17.55919647216797, "learning_rate": 4.305555555555556e-05, "loss": 0.2505, "step": 540 }, { "epoch": 22.68041237113402, "grad_norm": 10.570196151733398, "learning_rate": 4.282407407407408e-05, "loss": 0.2374, "step": 550 }, { "epoch": 22.969072164948454, "eval_accuracy": 0.8579710144927536, "eval_loss": 0.4452122747898102, "eval_precision": 0.8589461524849866, "eval_recall": 0.8579710144927536, "eval_runtime": 1.8751, "eval_samples_per_second": 183.989, "eval_steps_per_second": 5.866, "step": 557 }, { "epoch": 23.09278350515464, "grad_norm": 25.781587600708008, "learning_rate": 4.259259259259259e-05, "loss": 0.2355, "step": 560 }, { "epoch": 23.50515463917526, "grad_norm": 22.854766845703125, "learning_rate": 4.236111111111111e-05, "loss": 0.2553, "step": 570 }, { "epoch": 23.917525773195877, "grad_norm": 15.405595779418945, "learning_rate": 4.212962962962963e-05, "loss": 0.216, "step": 580 }, { "epoch": 24.0, "eval_accuracy": 0.8579710144927536, "eval_loss": 0.4375264048576355, "eval_precision": 0.858123097800969, "eval_recall": 0.8579710144927536, "eval_runtime": 1.8051, "eval_samples_per_second": 191.128, "eval_steps_per_second": 6.094, "step": 582 }, { "epoch": 24.329896907216494, "grad_norm": 15.453635215759277, "learning_rate": 4.1898148148148145e-05, "loss": 0.2019, "step": 590 }, { "epoch": 24.742268041237114, "grad_norm": 12.363275527954102, "learning_rate": 4.166666666666667e-05, "loss": 0.2127, "step": 600 }, { "epoch": 24.989690721649485, "eval_accuracy": 0.8579710144927536, "eval_loss": 0.44218453764915466, "eval_precision": 0.8587798835624924, "eval_recall": 0.8579710144927536, "eval_runtime": 1.9062, "eval_samples_per_second": 180.991, "eval_steps_per_second": 5.771, "step": 606 }, { "epoch": 25.15463917525773, "grad_norm": 15.13847827911377, "learning_rate": 4.1435185185185185e-05, "loss": 0.2301, "step": 610 }, { "epoch": 25.567010309278352, "grad_norm": 20.761062622070312, "learning_rate": 4.1203703703703705e-05, "loss": 0.1807, "step": 620 }, { "epoch": 25.97938144329897, "grad_norm": 17.889150619506836, "learning_rate": 4.0972222222222225e-05, "loss": 0.2004, "step": 630 }, { "epoch": 25.97938144329897, "eval_accuracy": 0.8521739130434782, "eval_loss": 0.46348363161087036, "eval_precision": 0.8519325944084339, "eval_recall": 0.8521739130434782, "eval_runtime": 1.7728, "eval_samples_per_second": 194.609, "eval_steps_per_second": 6.205, "step": 630 }, { "epoch": 26.391752577319586, "grad_norm": 23.56374168395996, "learning_rate": 4.074074074074074e-05, "loss": 0.2427, "step": 640 }, { "epoch": 26.804123711340207, "grad_norm": 9.772664070129395, "learning_rate": 4.0509259259259265e-05, "loss": 0.2029, "step": 650 }, { "epoch": 26.969072164948454, "eval_accuracy": 0.8492753623188406, "eval_loss": 0.5214529037475586, "eval_precision": 0.8545500895204992, "eval_recall": 0.8492753623188406, "eval_runtime": 1.9291, "eval_samples_per_second": 178.841, "eval_steps_per_second": 5.702, "step": 654 }, { "epoch": 27.216494845360824, "grad_norm": 14.480449676513672, "learning_rate": 4.027777777777778e-05, "loss": 0.1903, "step": 660 }, { "epoch": 27.628865979381445, "grad_norm": 16.415973663330078, "learning_rate": 4.00462962962963e-05, "loss": 0.1794, "step": 670 }, { "epoch": 28.0, "eval_accuracy": 0.863768115942029, "eval_loss": 0.47563326358795166, "eval_precision": 0.8669166767891824, "eval_recall": 0.863768115942029, "eval_runtime": 1.7555, "eval_samples_per_second": 196.529, "eval_steps_per_second": 6.266, "step": 679 }, { "epoch": 28.04123711340206, "grad_norm": 8.689855575561523, "learning_rate": 3.981481481481482e-05, "loss": 0.1822, "step": 680 }, { "epoch": 28.45360824742268, "grad_norm": 12.505402565002441, "learning_rate": 3.958333333333333e-05, "loss": 0.1828, "step": 690 }, { "epoch": 28.8659793814433, "grad_norm": 15.491950988769531, "learning_rate": 3.935185185185186e-05, "loss": 0.1835, "step": 700 }, { "epoch": 28.989690721649485, "eval_accuracy": 0.8608695652173913, "eval_loss": 0.4727528393268585, "eval_precision": 0.8649801117780185, "eval_recall": 0.8608695652173913, "eval_runtime": 1.8858, "eval_samples_per_second": 182.95, "eval_steps_per_second": 5.833, "step": 703 }, { "epoch": 29.278350515463917, "grad_norm": 16.289226531982422, "learning_rate": 3.912037037037037e-05, "loss": 0.1907, "step": 710 }, { "epoch": 29.690721649484537, "grad_norm": 13.304434776306152, "learning_rate": 3.888888888888889e-05, "loss": 0.1781, "step": 720 }, { "epoch": 29.97938144329897, "eval_accuracy": 0.855072463768116, "eval_loss": 0.4636934697628021, "eval_precision": 0.8568131435327558, "eval_recall": 0.855072463768116, "eval_runtime": 1.8681, "eval_samples_per_second": 184.683, "eval_steps_per_second": 5.888, "step": 727 }, { "epoch": 30.103092783505154, "grad_norm": 6.991786003112793, "learning_rate": 3.865740740740741e-05, "loss": 0.1829, "step": 730 }, { "epoch": 30.51546391752577, "grad_norm": 10.514315605163574, "learning_rate": 3.8425925925925924e-05, "loss": 0.1627, "step": 740 }, { "epoch": 30.927835051546392, "grad_norm": 9.121224403381348, "learning_rate": 3.8194444444444444e-05, "loss": 0.1671, "step": 750 }, { "epoch": 30.969072164948454, "eval_accuracy": 0.8579710144927536, "eval_loss": 0.485573947429657, "eval_precision": 0.8599276434444294, "eval_recall": 0.8579710144927536, "eval_runtime": 1.9437, "eval_samples_per_second": 177.497, "eval_steps_per_second": 5.659, "step": 751 }, { "epoch": 31.34020618556701, "grad_norm": 13.762226104736328, "learning_rate": 3.7962962962962964e-05, "loss": 0.1721, "step": 760 }, { "epoch": 31.75257731958763, "grad_norm": 10.415836334228516, "learning_rate": 3.7731481481481484e-05, "loss": 0.1762, "step": 770 }, { "epoch": 32.0, "eval_accuracy": 0.8666666666666667, "eval_loss": 0.5007998943328857, "eval_precision": 0.8684023473901008, "eval_recall": 0.8666666666666667, "eval_runtime": 1.769, "eval_samples_per_second": 195.026, "eval_steps_per_second": 6.218, "step": 776 }, { "epoch": 32.16494845360825, "grad_norm": 10.8311767578125, "learning_rate": 3.7500000000000003e-05, "loss": 0.1707, "step": 780 }, { "epoch": 32.577319587628864, "grad_norm": 12.070932388305664, "learning_rate": 3.726851851851852e-05, "loss": 0.1673, "step": 790 }, { "epoch": 32.98969072164948, "grad_norm": 8.654770851135254, "learning_rate": 3.7037037037037037e-05, "loss": 0.1867, "step": 800 }, { "epoch": 32.98969072164948, "eval_accuracy": 0.8579710144927536, "eval_loss": 0.5058211088180542, "eval_precision": 0.8584843785997619, "eval_recall": 0.8579710144927536, "eval_runtime": 1.8394, "eval_samples_per_second": 187.561, "eval_steps_per_second": 5.98, "step": 800 }, { "epoch": 33.402061855670105, "grad_norm": 8.323944091796875, "learning_rate": 3.6805555555555556e-05, "loss": 0.1553, "step": 810 }, { "epoch": 33.81443298969072, "grad_norm": 14.134881973266602, "learning_rate": 3.6574074074074076e-05, "loss": 0.1409, "step": 820 }, { "epoch": 33.97938144329897, "eval_accuracy": 0.8405797101449275, "eval_loss": 0.5489646792411804, "eval_precision": 0.8408524440704116, "eval_recall": 0.8405797101449275, "eval_runtime": 1.7738, "eval_samples_per_second": 194.496, "eval_steps_per_second": 6.201, "step": 824 }, { "epoch": 34.22680412371134, "grad_norm": 17.74443244934082, "learning_rate": 3.6342592592592596e-05, "loss": 0.1498, "step": 830 }, { "epoch": 34.63917525773196, "grad_norm": 14.35798454284668, "learning_rate": 3.611111111111111e-05, "loss": 0.1315, "step": 840 }, { "epoch": 34.96907216494845, "eval_accuracy": 0.8347826086956521, "eval_loss": 0.528394877910614, "eval_precision": 0.8356368409524089, "eval_recall": 0.8347826086956521, "eval_runtime": 1.8034, "eval_samples_per_second": 191.304, "eval_steps_per_second": 6.1, "step": 848 }, { "epoch": 35.05154639175258, "grad_norm": 15.67455005645752, "learning_rate": 3.587962962962963e-05, "loss": 0.163, "step": 850 }, { "epoch": 35.4639175257732, "grad_norm": 6.1969828605651855, "learning_rate": 3.564814814814815e-05, "loss": 0.1406, "step": 860 }, { "epoch": 35.876288659793815, "grad_norm": 14.651385307312012, "learning_rate": 3.541666666666667e-05, "loss": 0.1315, "step": 870 }, { "epoch": 36.0, "eval_accuracy": 0.8463768115942029, "eval_loss": 0.5415348410606384, "eval_precision": 0.8487979974677805, "eval_recall": 0.8463768115942029, "eval_runtime": 1.7509, "eval_samples_per_second": 197.042, "eval_steps_per_second": 6.282, "step": 873 }, { "epoch": 36.28865979381443, "grad_norm": 15.739358901977539, "learning_rate": 3.518518518518519e-05, "loss": 0.1944, "step": 880 }, { "epoch": 36.70103092783505, "grad_norm": 16.889202117919922, "learning_rate": 3.49537037037037e-05, "loss": 0.1974, "step": 890 }, { "epoch": 36.98969072164948, "eval_accuracy": 0.8492753623188406, "eval_loss": 0.519416332244873, "eval_precision": 0.8536148561469765, "eval_recall": 0.8492753623188406, "eval_runtime": 1.7833, "eval_samples_per_second": 193.461, "eval_steps_per_second": 6.168, "step": 897 }, { "epoch": 37.11340206185567, "grad_norm": 10.011604309082031, "learning_rate": 3.472222222222222e-05, "loss": 0.1605, "step": 900 }, { "epoch": 37.52577319587629, "grad_norm": 18.694128036499023, "learning_rate": 3.449074074074074e-05, "loss": 0.1515, "step": 910 }, { "epoch": 37.93814432989691, "grad_norm": 9.140711784362793, "learning_rate": 3.425925925925926e-05, "loss": 0.1337, "step": 920 }, { "epoch": 37.97938144329897, "eval_accuracy": 0.8608695652173913, "eval_loss": 0.5088416337966919, "eval_precision": 0.8602982452483552, "eval_recall": 0.8608695652173913, "eval_runtime": 1.7456, "eval_samples_per_second": 197.634, "eval_steps_per_second": 6.301, "step": 921 }, { "epoch": 38.350515463917525, "grad_norm": 12.548330307006836, "learning_rate": 3.402777777777778e-05, "loss": 0.1439, "step": 930 }, { "epoch": 38.76288659793814, "grad_norm": 12.762455940246582, "learning_rate": 3.3796296296296295e-05, "loss": 0.173, "step": 940 }, { "epoch": 38.96907216494845, "eval_accuracy": 0.8666666666666667, "eval_loss": 0.4912014305591583, "eval_precision": 0.867978256170476, "eval_recall": 0.8666666666666667, "eval_runtime": 1.8067, "eval_samples_per_second": 190.96, "eval_steps_per_second": 6.089, "step": 945 }, { "epoch": 39.175257731958766, "grad_norm": 12.083857536315918, "learning_rate": 3.3564814814814815e-05, "loss": 0.1477, "step": 950 }, { "epoch": 39.58762886597938, "grad_norm": 17.14080238342285, "learning_rate": 3.3333333333333335e-05, "loss": 0.1285, "step": 960 }, { "epoch": 40.0, "grad_norm": 13.190485000610352, "learning_rate": 3.3101851851851855e-05, "loss": 0.1409, "step": 970 }, { "epoch": 40.0, "eval_accuracy": 0.8492753623188406, "eval_loss": 0.5222660899162292, "eval_precision": 0.8501727809182621, "eval_recall": 0.8492753623188406, "eval_runtime": 1.8482, "eval_samples_per_second": 186.669, "eval_steps_per_second": 5.952, "step": 970 }, { "epoch": 40.41237113402062, "grad_norm": 8.88687801361084, "learning_rate": 3.2870370370370375e-05, "loss": 0.151, "step": 980 }, { "epoch": 40.824742268041234, "grad_norm": 7.21800422668457, "learning_rate": 3.263888888888889e-05, "loss": 0.1379, "step": 990 }, { "epoch": 40.98969072164948, "eval_accuracy": 0.8492753623188406, "eval_loss": 0.5204349160194397, "eval_precision": 0.8486749182344644, "eval_recall": 0.8492753623188406, "eval_runtime": 1.8062, "eval_samples_per_second": 191.006, "eval_steps_per_second": 6.09, "step": 994 }, { "epoch": 41.23711340206186, "grad_norm": 10.057676315307617, "learning_rate": 3.240740740740741e-05, "loss": 0.1079, "step": 1000 }, { "epoch": 41.649484536082475, "grad_norm": 13.667500495910645, "learning_rate": 3.217592592592593e-05, "loss": 0.1437, "step": 1010 }, { "epoch": 41.97938144329897, "eval_accuracy": 0.8521739130434782, "eval_loss": 0.5860036611557007, "eval_precision": 0.8550665818648641, "eval_recall": 0.8521739130434782, "eval_runtime": 1.7468, "eval_samples_per_second": 197.503, "eval_steps_per_second": 6.297, "step": 1018 }, { "epoch": 42.06185567010309, "grad_norm": 6.985457420349121, "learning_rate": 3.194444444444444e-05, "loss": 0.1521, "step": 1020 }, { "epoch": 42.47422680412371, "grad_norm": 16.70668601989746, "learning_rate": 3.171296296296297e-05, "loss": 0.1393, "step": 1030 }, { "epoch": 42.88659793814433, "grad_norm": 6.907033920288086, "learning_rate": 3.148148148148148e-05, "loss": 0.1022, "step": 1040 }, { "epoch": 42.96907216494845, "eval_accuracy": 0.8463768115942029, "eval_loss": 0.5460776686668396, "eval_precision": 0.8491763964495722, "eval_recall": 0.8463768115942029, "eval_runtime": 1.7961, "eval_samples_per_second": 192.078, "eval_steps_per_second": 6.124, "step": 1042 }, { "epoch": 43.29896907216495, "grad_norm": 9.046392440795898, "learning_rate": 3.125e-05, "loss": 0.1385, "step": 1050 }, { "epoch": 43.71134020618557, "grad_norm": 10.188021659851074, "learning_rate": 3.101851851851852e-05, "loss": 0.1181, "step": 1060 }, { "epoch": 44.0, "eval_accuracy": 0.855072463768116, "eval_loss": 0.541079044342041, "eval_precision": 0.856643419178803, "eval_recall": 0.855072463768116, "eval_runtime": 1.7664, "eval_samples_per_second": 195.31, "eval_steps_per_second": 6.227, "step": 1067 }, { "epoch": 44.123711340206185, "grad_norm": 8.506319046020508, "learning_rate": 3.0787037037037034e-05, "loss": 0.1411, "step": 1070 }, { "epoch": 44.5360824742268, "grad_norm": 15.423176765441895, "learning_rate": 3.055555555555556e-05, "loss": 0.1346, "step": 1080 }, { "epoch": 44.94845360824742, "grad_norm": 6.524370193481445, "learning_rate": 3.0324074074074077e-05, "loss": 0.1212, "step": 1090 }, { "epoch": 44.98969072164948, "eval_accuracy": 0.8579710144927536, "eval_loss": 0.5293735861778259, "eval_precision": 0.8580282602145957, "eval_recall": 0.8579710144927536, "eval_runtime": 1.8173, "eval_samples_per_second": 189.843, "eval_steps_per_second": 6.053, "step": 1091 }, { "epoch": 45.36082474226804, "grad_norm": 12.142955780029297, "learning_rate": 3.0092592592592593e-05, "loss": 0.105, "step": 1100 }, { "epoch": 45.77319587628866, "grad_norm": 11.581314086914062, "learning_rate": 2.9861111111111113e-05, "loss": 0.1049, "step": 1110 }, { "epoch": 45.97938144329897, "eval_accuracy": 0.8492753623188406, "eval_loss": 0.566691517829895, "eval_precision": 0.8491712997027965, "eval_recall": 0.8492753623188406, "eval_runtime": 1.799, "eval_samples_per_second": 191.772, "eval_steps_per_second": 6.114, "step": 1115 }, { "epoch": 46.18556701030928, "grad_norm": 15.353252410888672, "learning_rate": 2.962962962962963e-05, "loss": 0.1335, "step": 1120 }, { "epoch": 46.597938144329895, "grad_norm": 11.990909576416016, "learning_rate": 2.9398148148148146e-05, "loss": 0.1132, "step": 1130 }, { "epoch": 46.96907216494845, "eval_accuracy": 0.8463768115942029, "eval_loss": 0.5908281207084656, "eval_precision": 0.8491182494977805, "eval_recall": 0.8463768115942029, "eval_runtime": 1.8291, "eval_samples_per_second": 188.615, "eval_steps_per_second": 6.014, "step": 1139 }, { "epoch": 47.01030927835052, "grad_norm": 7.466699600219727, "learning_rate": 2.916666666666667e-05, "loss": 0.1229, "step": 1140 }, { "epoch": 47.422680412371136, "grad_norm": 4.299150466918945, "learning_rate": 2.8935185185185186e-05, "loss": 0.1181, "step": 1150 }, { "epoch": 47.83505154639175, "grad_norm": 8.699248313903809, "learning_rate": 2.8703703703703706e-05, "loss": 0.1313, "step": 1160 }, { "epoch": 48.0, "eval_accuracy": 0.8521739130434782, "eval_loss": 0.5995594263076782, "eval_precision": 0.8581686976058893, "eval_recall": 0.8521739130434782, "eval_runtime": 1.7851, "eval_samples_per_second": 193.27, "eval_steps_per_second": 6.162, "step": 1164 }, { "epoch": 48.24742268041237, "grad_norm": 7.394286632537842, "learning_rate": 2.8472222222222223e-05, "loss": 0.1287, "step": 1170 }, { "epoch": 48.65979381443299, "grad_norm": 10.575745582580566, "learning_rate": 2.824074074074074e-05, "loss": 0.1312, "step": 1180 }, { "epoch": 48.98969072164948, "eval_accuracy": 0.8579710144927536, "eval_loss": 0.542959451675415, "eval_precision": 0.8607254186783246, "eval_recall": 0.8579710144927536, "eval_runtime": 1.7426, "eval_samples_per_second": 197.985, "eval_steps_per_second": 6.313, "step": 1188 }, { "epoch": 49.07216494845361, "grad_norm": 14.257989883422852, "learning_rate": 2.8009259259259263e-05, "loss": 0.1341, "step": 1190 }, { "epoch": 49.48453608247423, "grad_norm": 9.95071029663086, "learning_rate": 2.777777777777778e-05, "loss": 0.138, "step": 1200 }, { "epoch": 49.896907216494846, "grad_norm": 10.54672622680664, "learning_rate": 2.75462962962963e-05, "loss": 0.0996, "step": 1210 }, { "epoch": 49.97938144329897, "eval_accuracy": 0.8521739130434782, "eval_loss": 0.5776570439338684, "eval_precision": 0.8561151948364225, "eval_recall": 0.8521739130434782, "eval_runtime": 1.8283, "eval_samples_per_second": 188.7, "eval_steps_per_second": 6.017, "step": 1212 }, { "epoch": 50.30927835051546, "grad_norm": 9.269867897033691, "learning_rate": 2.7314814814814816e-05, "loss": 0.1183, "step": 1220 }, { "epoch": 50.72164948453608, "grad_norm": 3.963714361190796, "learning_rate": 2.7083333333333332e-05, "loss": 0.1389, "step": 1230 }, { "epoch": 50.96907216494845, "eval_accuracy": 0.8434782608695652, "eval_loss": 0.5757654905319214, "eval_precision": 0.8486477905744771, "eval_recall": 0.8434782608695652, "eval_runtime": 1.8064, "eval_samples_per_second": 190.984, "eval_steps_per_second": 6.089, "step": 1236 }, { "epoch": 51.134020618556704, "grad_norm": 24.62941551208496, "learning_rate": 2.6851851851851855e-05, "loss": 0.1188, "step": 1240 }, { "epoch": 51.54639175257732, "grad_norm": 14.212287902832031, "learning_rate": 2.6620370370370372e-05, "loss": 0.1257, "step": 1250 }, { "epoch": 51.95876288659794, "grad_norm": 10.230920791625977, "learning_rate": 2.6388888888888892e-05, "loss": 0.1079, "step": 1260 }, { "epoch": 52.0, "eval_accuracy": 0.8579710144927536, "eval_loss": 0.5540273785591125, "eval_precision": 0.8611434608590304, "eval_recall": 0.8579710144927536, "eval_runtime": 1.7965, "eval_samples_per_second": 192.043, "eval_steps_per_second": 6.123, "step": 1261 }, { "epoch": 52.371134020618555, "grad_norm": 12.681902885437012, "learning_rate": 2.615740740740741e-05, "loss": 0.0964, "step": 1270 }, { "epoch": 52.78350515463917, "grad_norm": 14.907917022705078, "learning_rate": 2.5925925925925925e-05, "loss": 0.0972, "step": 1280 }, { "epoch": 52.98969072164948, "eval_accuracy": 0.855072463768116, "eval_loss": 0.5599762797355652, "eval_precision": 0.8559313253403165, "eval_recall": 0.855072463768116, "eval_runtime": 1.8665, "eval_samples_per_second": 184.836, "eval_steps_per_second": 5.893, "step": 1285 }, { "epoch": 53.1958762886598, "grad_norm": 13.571532249450684, "learning_rate": 2.5694444444444445e-05, "loss": 0.1164, "step": 1290 }, { "epoch": 53.608247422680414, "grad_norm": 14.119112014770508, "learning_rate": 2.5462962962962965e-05, "loss": 0.0985, "step": 1300 }, { "epoch": 53.97938144329897, "eval_accuracy": 0.863768115942029, "eval_loss": 0.5391947627067566, "eval_precision": 0.865555829019492, "eval_recall": 0.863768115942029, "eval_runtime": 1.8914, "eval_samples_per_second": 182.408, "eval_steps_per_second": 5.816, "step": 1309 }, { "epoch": 54.02061855670103, "grad_norm": 11.18630599975586, "learning_rate": 2.5231481481481485e-05, "loss": 0.1139, "step": 1310 }, { "epoch": 54.43298969072165, "grad_norm": 14.511212348937988, "learning_rate": 2.5e-05, "loss": 0.1117, "step": 1320 }, { "epoch": 54.845360824742265, "grad_norm": 4.760071277618408, "learning_rate": 2.4768518518518518e-05, "loss": 0.1112, "step": 1330 }, { "epoch": 54.96907216494845, "eval_accuracy": 0.863768115942029, "eval_loss": 0.5410789847373962, "eval_precision": 0.8655836794521399, "eval_recall": 0.863768115942029, "eval_runtime": 1.8766, "eval_samples_per_second": 183.845, "eval_steps_per_second": 5.862, "step": 1333 }, { "epoch": 55.25773195876289, "grad_norm": 8.37569808959961, "learning_rate": 2.4537037037037038e-05, "loss": 0.1062, "step": 1340 }, { "epoch": 55.670103092783506, "grad_norm": 10.700220108032227, "learning_rate": 2.4305555555555558e-05, "loss": 0.1308, "step": 1350 }, { "epoch": 56.0, "eval_accuracy": 0.863768115942029, "eval_loss": 0.5445396900177002, "eval_precision": 0.8653666576853845, "eval_recall": 0.863768115942029, "eval_runtime": 1.8208, "eval_samples_per_second": 189.479, "eval_steps_per_second": 6.041, "step": 1358 }, { "epoch": 56.08247422680412, "grad_norm": 19.0463924407959, "learning_rate": 2.4074074074074074e-05, "loss": 0.1081, "step": 1360 }, { "epoch": 56.49484536082474, "grad_norm": 6.819794654846191, "learning_rate": 2.3842592592592594e-05, "loss": 0.1072, "step": 1370 }, { "epoch": 56.90721649484536, "grad_norm": 6.308873176574707, "learning_rate": 2.361111111111111e-05, "loss": 0.1005, "step": 1380 }, { "epoch": 56.98969072164948, "eval_accuracy": 0.855072463768116, "eval_loss": 0.5554308891296387, "eval_precision": 0.8551462662985753, "eval_recall": 0.855072463768116, "eval_runtime": 1.868, "eval_samples_per_second": 184.69, "eval_steps_per_second": 5.889, "step": 1382 }, { "epoch": 57.31958762886598, "grad_norm": 5.025654315948486, "learning_rate": 2.337962962962963e-05, "loss": 0.088, "step": 1390 }, { "epoch": 57.7319587628866, "grad_norm": 10.021939277648926, "learning_rate": 2.314814814814815e-05, "loss": 0.0871, "step": 1400 }, { "epoch": 57.97938144329897, "eval_accuracy": 0.8405797101449275, "eval_loss": 0.5966009497642517, "eval_precision": 0.8440749450064067, "eval_recall": 0.8405797101449275, "eval_runtime": 1.7974, "eval_samples_per_second": 191.939, "eval_steps_per_second": 6.12, "step": 1406 }, { "epoch": 58.144329896907216, "grad_norm": 16.077518463134766, "learning_rate": 2.2916666666666667e-05, "loss": 0.089, "step": 1410 }, { "epoch": 58.55670103092783, "grad_norm": 14.556241035461426, "learning_rate": 2.2685185185185187e-05, "loss": 0.1072, "step": 1420 }, { "epoch": 58.96907216494845, "grad_norm": 9.045204162597656, "learning_rate": 2.2453703703703703e-05, "loss": 0.1102, "step": 1430 }, { "epoch": 58.96907216494845, "eval_accuracy": 0.8521739130434782, "eval_loss": 0.5807223916053772, "eval_precision": 0.8543040805400182, "eval_recall": 0.8521739130434782, "eval_runtime": 1.8412, "eval_samples_per_second": 187.376, "eval_steps_per_second": 5.974, "step": 1430 }, { "epoch": 59.381443298969074, "grad_norm": 12.29312515258789, "learning_rate": 2.2222222222222223e-05, "loss": 0.1021, "step": 1440 }, { "epoch": 59.79381443298969, "grad_norm": 13.808602333068848, "learning_rate": 2.1990740740740743e-05, "loss": 0.1028, "step": 1450 }, { "epoch": 60.0, "eval_accuracy": 0.8434782608695652, "eval_loss": 0.5653913021087646, "eval_precision": 0.8490636359945823, "eval_recall": 0.8434782608695652, "eval_runtime": 1.8195, "eval_samples_per_second": 189.615, "eval_steps_per_second": 6.046, "step": 1455 }, { "epoch": 60.20618556701031, "grad_norm": 8.929511070251465, "learning_rate": 2.175925925925926e-05, "loss": 0.1103, "step": 1460 }, { "epoch": 60.618556701030926, "grad_norm": 14.425239562988281, "learning_rate": 2.152777777777778e-05, "loss": 0.107, "step": 1470 }, { "epoch": 60.98969072164948, "eval_accuracy": 0.8434782608695652, "eval_loss": 0.577854573726654, "eval_precision": 0.8460752319344831, "eval_recall": 0.8434782608695652, "eval_runtime": 1.8265, "eval_samples_per_second": 188.883, "eval_steps_per_second": 6.022, "step": 1479 }, { "epoch": 61.03092783505155, "grad_norm": 10.870781898498535, "learning_rate": 2.1296296296296296e-05, "loss": 0.0954, "step": 1480 }, { "epoch": 61.44329896907217, "grad_norm": 10.188617706298828, "learning_rate": 2.1064814814814816e-05, "loss": 0.0942, "step": 1490 }, { "epoch": 61.855670103092784, "grad_norm": 6.4580302238464355, "learning_rate": 2.0833333333333336e-05, "loss": 0.0848, "step": 1500 }, { "epoch": 61.97938144329897, "eval_accuracy": 0.855072463768116, "eval_loss": 0.5842954516410828, "eval_precision": 0.8569219850916401, "eval_recall": 0.855072463768116, "eval_runtime": 1.8368, "eval_samples_per_second": 187.828, "eval_steps_per_second": 5.989, "step": 1503 }, { "epoch": 62.2680412371134, "grad_norm": 13.236536979675293, "learning_rate": 2.0601851851851853e-05, "loss": 0.0993, "step": 1510 }, { "epoch": 62.68041237113402, "grad_norm": 11.377030372619629, "learning_rate": 2.037037037037037e-05, "loss": 0.0976, "step": 1520 }, { "epoch": 62.96907216494845, "eval_accuracy": 0.8434782608695652, "eval_loss": 0.6161760687828064, "eval_precision": 0.8454310204706964, "eval_recall": 0.8434782608695652, "eval_runtime": 1.7609, "eval_samples_per_second": 195.923, "eval_steps_per_second": 6.247, "step": 1527 }, { "epoch": 63.09278350515464, "grad_norm": 9.68355655670166, "learning_rate": 2.013888888888889e-05, "loss": 0.0788, "step": 1530 }, { "epoch": 63.50515463917526, "grad_norm": 6.282276153564453, "learning_rate": 1.990740740740741e-05, "loss": 0.103, "step": 1540 }, { "epoch": 63.91752577319588, "grad_norm": 4.893520832061768, "learning_rate": 1.967592592592593e-05, "loss": 0.0977, "step": 1550 }, { "epoch": 64.0, "eval_accuracy": 0.8463768115942029, "eval_loss": 0.5822046995162964, "eval_precision": 0.8468574730482583, "eval_recall": 0.8463768115942029, "eval_runtime": 1.8068, "eval_samples_per_second": 190.942, "eval_steps_per_second": 6.088, "step": 1552 }, { "epoch": 64.3298969072165, "grad_norm": 10.216239929199219, "learning_rate": 1.9444444444444445e-05, "loss": 0.1112, "step": 1560 }, { "epoch": 64.74226804123711, "grad_norm": 22.551631927490234, "learning_rate": 1.9212962962962962e-05, "loss": 0.1256, "step": 1570 }, { "epoch": 64.98969072164948, "eval_accuracy": 0.8492753623188406, "eval_loss": 0.575657308101654, "eval_precision": 0.851359361697526, "eval_recall": 0.8492753623188406, "eval_runtime": 1.8317, "eval_samples_per_second": 188.346, "eval_steps_per_second": 6.005, "step": 1576 }, { "epoch": 65.15463917525773, "grad_norm": 6.853829383850098, "learning_rate": 1.8981481481481482e-05, "loss": 0.096, "step": 1580 }, { "epoch": 65.56701030927834, "grad_norm": 14.361750602722168, "learning_rate": 1.8750000000000002e-05, "loss": 0.0942, "step": 1590 }, { "epoch": 65.97938144329896, "grad_norm": 9.966873168945312, "learning_rate": 1.8518518518518518e-05, "loss": 0.0883, "step": 1600 }, { "epoch": 65.97938144329896, "eval_accuracy": 0.8463768115942029, "eval_loss": 0.5716322660446167, "eval_precision": 0.8466640969128532, "eval_recall": 0.8463768115942029, "eval_runtime": 1.7836, "eval_samples_per_second": 193.433, "eval_steps_per_second": 6.167, "step": 1600 }, { "epoch": 66.3917525773196, "grad_norm": 9.780498504638672, "learning_rate": 1.8287037037037038e-05, "loss": 0.0791, "step": 1610 }, { "epoch": 66.80412371134021, "grad_norm": 10.076851844787598, "learning_rate": 1.8055555555555555e-05, "loss": 0.0808, "step": 1620 }, { "epoch": 66.96907216494846, "eval_accuracy": 0.855072463768116, "eval_loss": 0.5726441144943237, "eval_precision": 0.8562372477793413, "eval_recall": 0.855072463768116, "eval_runtime": 1.782, "eval_samples_per_second": 193.608, "eval_steps_per_second": 6.173, "step": 1624 }, { "epoch": 67.21649484536083, "grad_norm": 10.814988136291504, "learning_rate": 1.7824074074074075e-05, "loss": 0.0604, "step": 1630 }, { "epoch": 67.62886597938144, "grad_norm": 14.779629707336426, "learning_rate": 1.7592592592592595e-05, "loss": 0.1034, "step": 1640 }, { "epoch": 68.0, "eval_accuracy": 0.855072463768116, "eval_loss": 0.5412786602973938, "eval_precision": 0.8548742107305042, "eval_recall": 0.855072463768116, "eval_runtime": 1.8607, "eval_samples_per_second": 185.418, "eval_steps_per_second": 5.912, "step": 1649 }, { "epoch": 68.04123711340206, "grad_norm": 7.925902843475342, "learning_rate": 1.736111111111111e-05, "loss": 0.098, "step": 1650 }, { "epoch": 68.45360824742268, "grad_norm": 8.179915428161621, "learning_rate": 1.712962962962963e-05, "loss": 0.0871, "step": 1660 }, { "epoch": 68.8659793814433, "grad_norm": 8.375000953674316, "learning_rate": 1.6898148148148148e-05, "loss": 0.0845, "step": 1670 }, { "epoch": 68.98969072164948, "eval_accuracy": 0.8434782608695652, "eval_loss": 0.5826108455657959, "eval_precision": 0.8476663926581475, "eval_recall": 0.8434782608695652, "eval_runtime": 1.8967, "eval_samples_per_second": 181.896, "eval_steps_per_second": 5.8, "step": 1673 }, { "epoch": 69.27835051546391, "grad_norm": 8.613913536071777, "learning_rate": 1.6666666666666667e-05, "loss": 0.0911, "step": 1680 }, { "epoch": 69.69072164948453, "grad_norm": 9.535558700561523, "learning_rate": 1.6435185185185187e-05, "loss": 0.0916, "step": 1690 }, { "epoch": 69.97938144329896, "eval_accuracy": 0.8521739130434782, "eval_loss": 0.566058337688446, "eval_precision": 0.8522049189345976, "eval_recall": 0.8521739130434782, "eval_runtime": 1.7731, "eval_samples_per_second": 194.574, "eval_steps_per_second": 6.204, "step": 1697 }, { "epoch": 70.10309278350516, "grad_norm": 7.769627571105957, "learning_rate": 1.6203703703703704e-05, "loss": 0.1011, "step": 1700 }, { "epoch": 70.51546391752578, "grad_norm": 9.350245475769043, "learning_rate": 1.597222222222222e-05, "loss": 0.0896, "step": 1710 }, { "epoch": 70.9278350515464, "grad_norm": 11.536579132080078, "learning_rate": 1.574074074074074e-05, "loss": 0.0912, "step": 1720 }, { "epoch": 70.96907216494846, "eval_accuracy": 0.8492753623188406, "eval_loss": 0.5770707130432129, "eval_precision": 0.84979303172866, "eval_recall": 0.8492753623188406, "eval_runtime": 1.817, "eval_samples_per_second": 189.875, "eval_steps_per_second": 6.054, "step": 1721 }, { "epoch": 71.34020618556701, "grad_norm": 15.122323989868164, "learning_rate": 1.550925925925926e-05, "loss": 0.0995, "step": 1730 }, { "epoch": 71.75257731958763, "grad_norm": 12.938358306884766, "learning_rate": 1.527777777777778e-05, "loss": 0.0863, "step": 1740 }, { "epoch": 72.0, "eval_accuracy": 0.855072463768116, "eval_loss": 0.5769326686859131, "eval_precision": 0.8550354692908756, "eval_recall": 0.855072463768116, "eval_runtime": 1.8313, "eval_samples_per_second": 188.386, "eval_steps_per_second": 6.007, "step": 1746 }, { "epoch": 72.16494845360825, "grad_norm": 6.935812950134277, "learning_rate": 1.5046296296296297e-05, "loss": 0.0731, "step": 1750 }, { "epoch": 72.57731958762886, "grad_norm": 10.120232582092285, "learning_rate": 1.4814814814814815e-05, "loss": 0.1101, "step": 1760 }, { "epoch": 72.98969072164948, "grad_norm": 5.746927738189697, "learning_rate": 1.4583333333333335e-05, "loss": 0.083, "step": 1770 }, { "epoch": 72.98969072164948, "eval_accuracy": 0.8492753623188406, "eval_loss": 0.5860167145729065, "eval_precision": 0.8486187988428825, "eval_recall": 0.8492753623188406, "eval_runtime": 1.8602, "eval_samples_per_second": 185.466, "eval_steps_per_second": 5.913, "step": 1770 }, { "epoch": 73.4020618556701, "grad_norm": 14.205853462219238, "learning_rate": 1.4351851851851853e-05, "loss": 0.1003, "step": 1780 }, { "epoch": 73.81443298969072, "grad_norm": 6.671767711639404, "learning_rate": 1.412037037037037e-05, "loss": 0.0839, "step": 1790 }, { "epoch": 73.97938144329896, "eval_accuracy": 0.855072463768116, "eval_loss": 0.5647125244140625, "eval_precision": 0.8550673486786019, "eval_recall": 0.855072463768116, "eval_runtime": 1.843, "eval_samples_per_second": 187.195, "eval_steps_per_second": 5.969, "step": 1794 }, { "epoch": 74.22680412371135, "grad_norm": 6.19529914855957, "learning_rate": 1.388888888888889e-05, "loss": 0.0798, "step": 1800 }, { "epoch": 74.63917525773196, "grad_norm": 13.039739608764648, "learning_rate": 1.3657407407407408e-05, "loss": 0.0903, "step": 1810 }, { "epoch": 74.96907216494846, "eval_accuracy": 0.855072463768116, "eval_loss": 0.601210355758667, "eval_precision": 0.8534831427546733, "eval_recall": 0.855072463768116, "eval_runtime": 1.7476, "eval_samples_per_second": 197.417, "eval_steps_per_second": 6.294, "step": 1818 }, { "epoch": 75.05154639175258, "grad_norm": 6.386416435241699, "learning_rate": 1.3425925925925928e-05, "loss": 0.0872, "step": 1820 }, { "epoch": 75.4639175257732, "grad_norm": 7.484694957733154, "learning_rate": 1.3194444444444446e-05, "loss": 0.0751, "step": 1830 }, { "epoch": 75.87628865979381, "grad_norm": 10.781839370727539, "learning_rate": 1.2962962962962962e-05, "loss": 0.074, "step": 1840 }, { "epoch": 76.0, "eval_accuracy": 0.8463768115942029, "eval_loss": 0.6048101186752319, "eval_precision": 0.8461499789126601, "eval_recall": 0.8463768115942029, "eval_runtime": 1.7696, "eval_samples_per_second": 194.962, "eval_steps_per_second": 6.216, "step": 1843 }, { "epoch": 76.28865979381443, "grad_norm": 17.32390022277832, "learning_rate": 1.2731481481481482e-05, "loss": 0.0943, "step": 1850 }, { "epoch": 76.70103092783505, "grad_norm": 12.162288665771484, "learning_rate": 1.25e-05, "loss": 0.0907, "step": 1860 }, { "epoch": 76.98969072164948, "eval_accuracy": 0.8492753623188406, "eval_loss": 0.5806660056114197, "eval_precision": 0.8495330403324792, "eval_recall": 0.8492753623188406, "eval_runtime": 1.7482, "eval_samples_per_second": 197.35, "eval_steps_per_second": 6.292, "step": 1867 }, { "epoch": 77.11340206185567, "grad_norm": 6.960859298706055, "learning_rate": 1.2268518518518519e-05, "loss": 0.0748, "step": 1870 }, { "epoch": 77.52577319587628, "grad_norm": 14.269356727600098, "learning_rate": 1.2037037037037037e-05, "loss": 0.0781, "step": 1880 }, { "epoch": 77.9381443298969, "grad_norm": 6.466542720794678, "learning_rate": 1.1805555555555555e-05, "loss": 0.0613, "step": 1890 }, { "epoch": 77.97938144329896, "eval_accuracy": 0.8376811594202899, "eval_loss": 0.5774852633476257, "eval_precision": 0.8381818122940702, "eval_recall": 0.8376811594202899, "eval_runtime": 1.7656, "eval_samples_per_second": 195.404, "eval_steps_per_second": 6.23, "step": 1891 }, { "epoch": 78.35051546391753, "grad_norm": 16.949039459228516, "learning_rate": 1.1574074074074075e-05, "loss": 0.0783, "step": 1900 }, { "epoch": 78.76288659793815, "grad_norm": 5.50955057144165, "learning_rate": 1.1342592592592593e-05, "loss": 0.0964, "step": 1910 }, { "epoch": 78.96907216494846, "eval_accuracy": 0.8666666666666667, "eval_loss": 0.5758916735649109, "eval_precision": 0.8675733846947259, "eval_recall": 0.8666666666666667, "eval_runtime": 1.7818, "eval_samples_per_second": 193.62, "eval_steps_per_second": 6.173, "step": 1915 }, { "epoch": 79.17525773195877, "grad_norm": 7.778840065002441, "learning_rate": 1.1111111111111112e-05, "loss": 0.0775, "step": 1920 }, { "epoch": 79.58762886597938, "grad_norm": 10.63167667388916, "learning_rate": 1.087962962962963e-05, "loss": 0.0849, "step": 1930 }, { "epoch": 80.0, "grad_norm": 10.529654502868652, "learning_rate": 1.0648148148148148e-05, "loss": 0.0735, "step": 1940 }, { "epoch": 80.0, "eval_accuracy": 0.855072463768116, "eval_loss": 0.5961835384368896, "eval_precision": 0.8565539653910103, "eval_recall": 0.855072463768116, "eval_runtime": 1.7657, "eval_samples_per_second": 195.391, "eval_steps_per_second": 6.23, "step": 1940 }, { "epoch": 80.41237113402062, "grad_norm": 10.91960334777832, "learning_rate": 1.0416666666666668e-05, "loss": 0.0803, "step": 1950 }, { "epoch": 80.82474226804123, "grad_norm": 6.953213691711426, "learning_rate": 1.0185185185185185e-05, "loss": 0.0663, "step": 1960 }, { "epoch": 80.98969072164948, "eval_accuracy": 0.8434782608695652, "eval_loss": 0.5768997669219971, "eval_precision": 0.8441240738989768, "eval_recall": 0.8434782608695652, "eval_runtime": 1.8615, "eval_samples_per_second": 185.334, "eval_steps_per_second": 5.909, "step": 1964 }, { "epoch": 81.23711340206185, "grad_norm": 14.6912841796875, "learning_rate": 9.953703703703704e-06, "loss": 0.0756, "step": 1970 }, { "epoch": 81.64948453608247, "grad_norm": 11.421167373657227, "learning_rate": 9.722222222222223e-06, "loss": 0.0719, "step": 1980 }, { "epoch": 81.97938144329896, "eval_accuracy": 0.8492753623188406, "eval_loss": 0.5826414823532104, "eval_precision": 0.8506964547245877, "eval_recall": 0.8492753623188406, "eval_runtime": 1.8427, "eval_samples_per_second": 187.221, "eval_steps_per_second": 5.969, "step": 1988 }, { "epoch": 82.0618556701031, "grad_norm": 16.955421447753906, "learning_rate": 9.490740740740741e-06, "loss": 0.0756, "step": 1990 }, { "epoch": 82.47422680412372, "grad_norm": 13.900518417358398, "learning_rate": 9.259259259259259e-06, "loss": 0.0683, "step": 2000 }, { "epoch": 82.88659793814433, "grad_norm": 9.04283618927002, "learning_rate": 9.027777777777777e-06, "loss": 0.0718, "step": 2010 }, { "epoch": 82.96907216494846, "eval_accuracy": 0.8579710144927536, "eval_loss": 0.5879714488983154, "eval_precision": 0.8590052571684228, "eval_recall": 0.8579710144927536, "eval_runtime": 1.7802, "eval_samples_per_second": 193.802, "eval_steps_per_second": 6.179, "step": 2012 }, { "epoch": 83.29896907216495, "grad_norm": 8.817221641540527, "learning_rate": 8.796296296296297e-06, "loss": 0.0699, "step": 2020 }, { "epoch": 83.71134020618557, "grad_norm": 9.379308700561523, "learning_rate": 8.564814814814816e-06, "loss": 0.0925, "step": 2030 }, { "epoch": 84.0, "eval_accuracy": 0.8492753623188406, "eval_loss": 0.5986330509185791, "eval_precision": 0.8512692229678578, "eval_recall": 0.8492753623188406, "eval_runtime": 1.7681, "eval_samples_per_second": 195.129, "eval_steps_per_second": 6.221, "step": 2037 }, { "epoch": 84.12371134020619, "grad_norm": 8.215590476989746, "learning_rate": 8.333333333333334e-06, "loss": 0.0617, "step": 2040 }, { "epoch": 84.5360824742268, "grad_norm": 5.024844169616699, "learning_rate": 8.101851851851852e-06, "loss": 0.0729, "step": 2050 }, { "epoch": 84.94845360824742, "grad_norm": 9.782211303710938, "learning_rate": 7.87037037037037e-06, "loss": 0.0621, "step": 2060 }, { "epoch": 84.98969072164948, "eval_accuracy": 0.8492753623188406, "eval_loss": 0.5914923548698425, "eval_precision": 0.8496762597563219, "eval_recall": 0.8492753623188406, "eval_runtime": 1.7614, "eval_samples_per_second": 195.868, "eval_steps_per_second": 6.245, "step": 2061 }, { "epoch": 85.36082474226804, "grad_norm": 7.3921942710876465, "learning_rate": 7.63888888888889e-06, "loss": 0.0621, "step": 2070 }, { "epoch": 85.77319587628865, "grad_norm": 10.206525802612305, "learning_rate": 7.4074074074074075e-06, "loss": 0.059, "step": 2080 }, { "epoch": 85.97938144329896, "eval_accuracy": 0.8579710144927536, "eval_loss": 0.577899694442749, "eval_precision": 0.8577329472646936, "eval_recall": 0.8579710144927536, "eval_runtime": 1.8903, "eval_samples_per_second": 182.511, "eval_steps_per_second": 5.819, "step": 2085 }, { "epoch": 86.18556701030928, "grad_norm": 18.180044174194336, "learning_rate": 7.1759259259259266e-06, "loss": 0.0663, "step": 2090 }, { "epoch": 86.5979381443299, "grad_norm": 10.320213317871094, "learning_rate": 6.944444444444445e-06, "loss": 0.0806, "step": 2100 }, { "epoch": 86.96907216494846, "eval_accuracy": 0.8492753623188406, "eval_loss": 0.5928123593330383, "eval_precision": 0.850145540799145, "eval_recall": 0.8492753623188406, "eval_runtime": 1.8068, "eval_samples_per_second": 190.946, "eval_steps_per_second": 6.088, "step": 2109 }, { "epoch": 87.01030927835052, "grad_norm": 13.640397071838379, "learning_rate": 6.712962962962964e-06, "loss": 0.0581, "step": 2110 }, { "epoch": 87.42268041237114, "grad_norm": 9.787714004516602, "learning_rate": 6.481481481481481e-06, "loss": 0.0641, "step": 2120 }, { "epoch": 87.83505154639175, "grad_norm": 7.827996730804443, "learning_rate": 6.25e-06, "loss": 0.0617, "step": 2130 }, { "epoch": 88.0, "eval_accuracy": 0.8521739130434782, "eval_loss": 0.606200098991394, "eval_precision": 0.8519519771693684, "eval_recall": 0.8521739130434782, "eval_runtime": 1.7968, "eval_samples_per_second": 192.013, "eval_steps_per_second": 6.122, "step": 2134 }, { "epoch": 88.24742268041237, "grad_norm": 10.409219741821289, "learning_rate": 6.0185185185185185e-06, "loss": 0.0677, "step": 2140 }, { "epoch": 88.65979381443299, "grad_norm": 13.120059967041016, "learning_rate": 5.787037037037038e-06, "loss": 0.0651, "step": 2150 }, { "epoch": 88.98969072164948, "eval_accuracy": 0.8521739130434782, "eval_loss": 0.6067116260528564, "eval_precision": 0.8518690976003952, "eval_recall": 0.8521739130434782, "eval_runtime": 1.8144, "eval_samples_per_second": 190.144, "eval_steps_per_second": 6.063, "step": 2158 }, { "epoch": 89.0721649484536, "grad_norm": 8.974705696105957, "learning_rate": 5.555555555555556e-06, "loss": 0.0672, "step": 2160 }, { "epoch": 89.48453608247422, "grad_norm": 13.397907257080078, "learning_rate": 5.324074074074074e-06, "loss": 0.0727, "step": 2170 }, { "epoch": 89.89690721649484, "grad_norm": 4.159496784210205, "learning_rate": 5.092592592592592e-06, "loss": 0.0754, "step": 2180 }, { "epoch": 89.97938144329896, "eval_accuracy": 0.855072463768116, "eval_loss": 0.6107772588729858, "eval_precision": 0.8553431503660337, "eval_recall": 0.855072463768116, "eval_runtime": 1.7776, "eval_samples_per_second": 194.084, "eval_steps_per_second": 6.188, "step": 2182 }, { "epoch": 90.30927835051547, "grad_norm": 11.130279541015625, "learning_rate": 4.861111111111111e-06, "loss": 0.079, "step": 2190 }, { "epoch": 90.72164948453609, "grad_norm": 13.203577995300293, "learning_rate": 4.6296296296296296e-06, "loss": 0.0682, "step": 2200 }, { "epoch": 90.96907216494846, "eval_accuracy": 0.8492753623188406, "eval_loss": 0.618496298789978, "eval_precision": 0.8488872700953353, "eval_recall": 0.8492753623188406, "eval_runtime": 1.7798, "eval_samples_per_second": 193.847, "eval_steps_per_second": 6.181, "step": 2206 }, { "epoch": 91.1340206185567, "grad_norm": 10.04045581817627, "learning_rate": 4.398148148148149e-06, "loss": 0.0699, "step": 2210 }, { "epoch": 91.54639175257732, "grad_norm": 2.500128984451294, "learning_rate": 4.166666666666667e-06, "loss": 0.0664, "step": 2220 }, { "epoch": 91.95876288659794, "grad_norm": 9.432464599609375, "learning_rate": 3.935185185185185e-06, "loss": 0.0763, "step": 2230 }, { "epoch": 92.0, "eval_accuracy": 0.8579710144927536, "eval_loss": 0.6168191432952881, "eval_precision": 0.8575139456543875, "eval_recall": 0.8579710144927536, "eval_runtime": 1.8002, "eval_samples_per_second": 191.65, "eval_steps_per_second": 6.111, "step": 2231 }, { "epoch": 92.37113402061856, "grad_norm": 9.279271125793457, "learning_rate": 3.7037037037037037e-06, "loss": 0.0742, "step": 2240 }, { "epoch": 92.78350515463917, "grad_norm": 19.246337890625, "learning_rate": 3.4722222222222224e-06, "loss": 0.0703, "step": 2250 }, { "epoch": 92.98969072164948, "eval_accuracy": 0.8521739130434782, "eval_loss": 0.6258795261383057, "eval_precision": 0.8520768323971984, "eval_recall": 0.8521739130434782, "eval_runtime": 1.8416, "eval_samples_per_second": 187.341, "eval_steps_per_second": 5.973, "step": 2255 }, { "epoch": 93.19587628865979, "grad_norm": 5.38301420211792, "learning_rate": 3.2407407407407406e-06, "loss": 0.0559, "step": 2260 }, { "epoch": 93.6082474226804, "grad_norm": 7.105731964111328, "learning_rate": 3.0092592592592593e-06, "loss": 0.0861, "step": 2270 }, { "epoch": 93.97938144329896, "eval_accuracy": 0.855072463768116, "eval_loss": 0.6128158569335938, "eval_precision": 0.8553431503660337, "eval_recall": 0.855072463768116, "eval_runtime": 1.776, "eval_samples_per_second": 194.252, "eval_steps_per_second": 6.194, "step": 2279 }, { "epoch": 94.02061855670104, "grad_norm": 14.296255111694336, "learning_rate": 2.777777777777778e-06, "loss": 0.089, "step": 2280 }, { "epoch": 94.43298969072166, "grad_norm": 11.694154739379883, "learning_rate": 2.546296296296296e-06, "loss": 0.07, "step": 2290 }, { "epoch": 94.84536082474227, "grad_norm": 8.240065574645996, "learning_rate": 2.3148148148148148e-06, "loss": 0.0807, "step": 2300 }, { "epoch": 94.96907216494846, "eval_accuracy": 0.855072463768116, "eval_loss": 0.6139995455741882, "eval_precision": 0.8546533219302098, "eval_recall": 0.855072463768116, "eval_runtime": 1.763, "eval_samples_per_second": 195.691, "eval_steps_per_second": 6.239, "step": 2303 }, { "epoch": 95.25773195876289, "grad_norm": 6.740184307098389, "learning_rate": 2.0833333333333334e-06, "loss": 0.0814, "step": 2310 }, { "epoch": 95.6701030927835, "grad_norm": 9.714829444885254, "learning_rate": 1.8518518518518519e-06, "loss": 0.0621, "step": 2320 }, { "epoch": 96.0, "eval_accuracy": 0.8521739130434782, "eval_loss": 0.6132925748825073, "eval_precision": 0.8531657869027159, "eval_recall": 0.8521739130434782, "eval_runtime": 1.8081, "eval_samples_per_second": 190.808, "eval_steps_per_second": 6.084, "step": 2328 }, { "epoch": 96.08247422680412, "grad_norm": 11.212587356567383, "learning_rate": 1.6203703703703703e-06, "loss": 0.065, "step": 2330 }, { "epoch": 96.49484536082474, "grad_norm": 5.428162097930908, "learning_rate": 1.388888888888889e-06, "loss": 0.0621, "step": 2340 }, { "epoch": 96.90721649484536, "grad_norm": 15.444799423217773, "learning_rate": 1.1574074074074074e-06, "loss": 0.0831, "step": 2350 }, { "epoch": 96.98969072164948, "eval_accuracy": 0.8492753623188406, "eval_loss": 0.6100958585739136, "eval_precision": 0.8507158478342087, "eval_recall": 0.8492753623188406, "eval_runtime": 1.7991, "eval_samples_per_second": 191.765, "eval_steps_per_second": 6.114, "step": 2352 }, { "epoch": 97.31958762886597, "grad_norm": 12.789685249328613, "learning_rate": 9.259259259259259e-07, "loss": 0.0584, "step": 2360 }, { "epoch": 97.73195876288659, "grad_norm": 9.271283149719238, "learning_rate": 6.944444444444445e-07, "loss": 0.0625, "step": 2370 }, { "epoch": 97.97938144329896, "eval_accuracy": 0.8492753623188406, "eval_loss": 0.6096817851066589, "eval_precision": 0.8507158478342087, "eval_recall": 0.8492753623188406, "eval_runtime": 1.8191, "eval_samples_per_second": 189.651, "eval_steps_per_second": 6.047, "step": 2376 }, { "epoch": 98.14432989690722, "grad_norm": 10.486361503601074, "learning_rate": 4.6296296296296297e-07, "loss": 0.0563, "step": 2380 }, { "epoch": 98.55670103092784, "grad_norm": 4.260477066040039, "learning_rate": 2.3148148148148148e-07, "loss": 0.0648, "step": 2390 }, { "epoch": 98.96907216494846, "grad_norm": 8.932230949401855, "learning_rate": 0.0, "loss": 0.0571, "step": 2400 }, { "epoch": 98.96907216494846, "eval_accuracy": 0.8492753623188406, "eval_loss": 0.6083797812461853, "eval_precision": 0.8507158478342087, "eval_recall": 0.8492753623188406, "eval_runtime": 1.7521, "eval_samples_per_second": 196.912, "eval_steps_per_second": 6.278, "step": 2400 }, { "epoch": 98.96907216494846, "step": 2400, "total_flos": 7.732715563096474e+18, "train_loss": 0.2344164727628231, "train_runtime": 4723.8268, "train_samples_per_second": 65.709, "train_steps_per_second": 0.508 } ], "logging_steps": 10, "max_steps": 2400, "num_input_tokens_seen": 0, "num_train_epochs": 100, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": false, "should_training_stop": false }, "attributes": {} } }, "total_flos": 7.732715563096474e+18, "train_batch_size": 32, "trial_name": null, "trial_params": null }