diff --git "a/8x_A100_40GB/nohup.out" "b/8x_A100_40GB/nohup.out" --- "a/8x_A100_40GB/nohup.out" +++ "b/8x_A100_40GB/nohup.out" @@ -10993,3 +10993,1422 @@ step 9349/19560 | loss 3.373840 (-1.36z)| norm 0.2708 (-0.47z)| lr 3.39e-04 | 32 step 9350/19560 | loss 3.441908 (+0.36z)| norm 0.2816 (+0.17z)| lr 3.39e-04 | 323.16 ms | 52.2% bf16 MFU | 1623486 tok/s step 9351/19560 | loss 3.381757 (-1.16z)| norm 0.2806 (+0.12z)| lr 3.39e-04 | 322.62 ms | 52.3% bf16 MFU | 1623568 tok/s step 9352/19560 | loss 3.416733 (-0.27z)| norm 0.2697 (-0.53z)| lr 3.39e-04 | 323.56 ms | 52.2% bf16 MFU | 1623408 tok/s +step 9353/19560 | loss 3.409210 (-0.46z)| norm 0.2755 (-0.17z)| lr 3.39e-04 | 322.87 ms | 52.3% bf16 MFU | 1623429 tok/s +step 9354/19560 | loss 3.422941 (-0.12z)| norm 0.2769 (-0.09z)| lr 3.39e-04 | 323.16 ms | 52.2% bf16 MFU | 1623376 tok/s +step 9355/19560 | loss 3.361411 (-1.65z)| norm 0.2803 (+0.10z)| lr 3.39e-04 | 323.16 ms | 52.2% bf16 MFU | 1623325 tok/s +step 9356/19560 | loss 3.409941 (-0.42z)| norm 0.3015 (+1.36z)| lr 3.39e-04 | 322.51 ms | 52.3% bf16 MFU | 1623441 tok/s +step 9357/19560 | loss 3.412963 (-0.36z)| norm 0.2787 (-0.01z)| lr 3.39e-04 | 323.34 ms | 52.2% bf16 MFU | 1623341 tok/s +step 9358/19560 | loss 3.401350 (-0.65z)| norm 0.2948 (+0.97z)| lr 3.39e-04 | 323.19 ms | 52.2% bf16 MFU | 1623287 tok/s +step 9359/19560 | loss 3.406418 (-0.51z)| norm 0.3045 (+1.52z)| lr 3.38e-04 | 322.38 ms | 52.4% bf16 MFU | 1623438 tok/s +step 9360/19560 | loss 3.482491 (+1.42z)| norm 0.2540 (-1.47z)| lr 3.38e-04 | 323.06 ms | 52.2% bf16 MFU | 1623411 tok/s +step 9361/19560 | loss 3.415249 (-0.28z)| norm 0.2976 (+1.10z)| lr 3.38e-04 | 323.34 ms | 52.2% bf16 MFU | 1623313 tok/s +step 9362/19560 | loss 3.458653 (+0.82z)| norm 0.2604 (-1.09z)| lr 3.38e-04 | 322.92 ms | 52.3% bf16 MFU | 1623327 tok/s +step 9363/19560 | loss 3.468327 (+1.07z)| norm 0.2734 (-0.32z)| lr 3.38e-04 | 323.02 ms | 52.2% bf16 MFU | 1623315 tok/s +step 9364/19560 | loss 3.516856 (+2.25z)| norm 0.2843 (+0.32z)| lr 3.38e-04 | 323.32 ms | 52.2% bf16 MFU | 1623227 tok/s +step 9365/19560 | loss 3.435175 (+0.20z)| norm 0.2723 (-0.39z)| lr 3.38e-04 | 323.13 ms | 52.2% bf16 MFU | 1623193 tok/s +step 9366/19560 | loss 3.392496 (-0.86z)| norm 0.2853 (+0.40z)| lr 3.38e-04 | 323.03 ms | 52.2% bf16 MFU | 1623184 tok/s +step 9367/19560 | loss 3.390326 (-0.91z)| norm 0.2549 (-1.41z)| lr 3.38e-04 | 322.70 ms | 52.3% bf16 MFU | 1623261 tok/s +step 9368/19560 | loss 3.473814 (+1.18z)| norm 0.2647 (-0.82z)| lr 3.38e-04 | 322.54 ms | 52.3% bf16 MFU | 1623372 tok/s +step 9369/19560 | loss 3.458562 (+0.78z)| norm 0.2694 (-0.54z)| lr 3.38e-04 | 323.03 ms | 52.2% bf16 MFU | 1623355 tok/s +step 9370/19560 | loss 3.462722 (+0.87z)| norm 0.2637 (-0.88z)| lr 3.38e-04 | 323.62 ms | 52.2% bf16 MFU | 1623190 tok/s +step 9371/19560 | loss 3.389178 (-0.97z)| norm 0.2624 (-0.94z)| lr 3.38e-04 | 322.64 ms | 52.3% bf16 MFU | 1623281 tok/s +step 9372/19560 | loss 3.401764 (-0.64z)| norm 0.2531 (-1.48z)| lr 3.38e-04 | 322.36 ms | 52.4% bf16 MFU | 1623437 tok/s +step 9373/19560 | loss 3.531305 (+2.53z)| norm 0.2926 (+0.90z)| lr 3.38e-04 | 323.30 ms | 52.2% bf16 MFU | 1623350 tok/s +step 9374/19560 | loss 3.499157 (+1.71z)| norm 0.2692 (-0.50z)| lr 3.38e-04 | 322.95 ms | 52.3% bf16 MFU | 1623353 tok/s +step 9375/19560 | loss 3.446391 (+0.43z)| norm 0.2590 (-1.10z)| lr 3.38e-04 | 322.97 ms | 52.3% bf16 MFU | 1623352 tok/s +step 9376/19560 | loss 3.399298 (-0.73z)| norm 0.2674 (-0.59z)| lr 3.38e-04 | 322.76 ms | 52.3% bf16 MFU | 1623405 tok/s +step 9377/19560 | loss 3.394077 (-0.84z)| norm 0.2645 (-0.77z)| lr 3.38e-04 | 322.49 ms | 52.3% bf16 MFU | 1623522 tok/s +step 9378/19560 | loss 3.365987 (-1.51z)| norm 0.2751 (-0.13z)| lr 3.38e-04 | 322.65 ms | 52.3% bf16 MFU | 1623594 tok/s +step 9379/19560 | loss 3.392663 (-0.85z)| norm 0.2570 (-1.25z)| lr 3.37e-04 | 322.76 ms | 52.3% bf16 MFU | 1623633 tok/s +step 9380/19560 | loss 3.421810 (-0.15z)| norm 0.2831 (+0.35z)| lr 3.37e-04 | 322.29 ms | 52.4% bf16 MFU | 1623790 tok/s +step 9381/19560 | loss 3.430404 (+0.06z)| norm 0.2790 (+0.08z)| lr 3.37e-04 | 322.53 ms | 52.3% bf16 MFU | 1623877 tok/s +step 9382/19560 | loss 3.417909 (-0.25z)| norm 0.2601 (-1.07z)| lr 3.37e-04 | 323.57 ms | 52.2% bf16 MFU | 1623699 tok/s +step 9383/19560 | loss 3.405516 (-0.55z)| norm 0.2620 (-0.95z)| lr 3.37e-04 | 322.59 ms | 52.3% bf16 MFU | 1623777 tok/s +step 9384/19560 | loss 3.417656 (-0.26z)| norm 0.2822 (+0.28z)| lr 3.37e-04 | 322.86 ms | 52.3% bf16 MFU | 1623783 tok/s +step 9385/19560 | loss 3.398551 (-0.73z)| norm 0.2791 (+0.10z)| lr 3.37e-04 | 322.70 ms | 52.3% bf16 MFU | 1623829 tok/s +step 9386/19560 | loss 3.366122 (-1.49z)| norm 0.2790 (+0.09z)| lr 3.37e-04 | 322.70 ms | 52.3% bf16 MFU | 1623872 tok/s +step 9387/19560 | loss 3.399670 (-0.67z)| norm 0.2779 (+0.01z)| lr 3.37e-04 | 323.52 ms | 52.2% bf16 MFU | 1623707 tok/s +step 9388/19560 | loss 3.408010 (-0.47z)| norm 0.2697 (-0.48z)| lr 3.37e-04 | 322.75 ms | 52.3% bf16 MFU | 1623743 tok/s +step 9389/19560 | loss 3.368790 (-1.40z)| norm 0.2706 (-0.43z)| lr 3.37e-04 | 322.55 ms | 52.3% bf16 MFU | 1623829 tok/s +step 9390/19560 | loss 3.385008 (-1.01z)| norm 0.2707 (-0.41z)| lr 3.37e-04 | 322.55 ms | 52.3% bf16 MFU | 1623910 tok/s +step 9391/19560 | loss 3.479974 (+1.29z)| norm 0.2598 (-1.09z)| lr 3.37e-04 | 322.21 ms | 52.4% bf16 MFU | 1624073 tok/s +step 9392/19560 | loss 3.413160 (-0.35z)| norm 0.2688 (-0.52z)| lr 3.37e-04 | 322.96 ms | 52.3% bf16 MFU | 1624037 tok/s +step 9393/19560 | loss 3.408783 (-0.45z)| norm 0.2395 (-2.26z)| lr 3.37e-04 | 322.77 ms | 52.3% bf16 MFU | 1624052 tok/s +step 9394/19560 | loss 3.439651 (+0.31z)| norm 0.2867 (+0.60z)| lr 3.37e-04 | 322.45 ms | 52.3% bf16 MFU | 1624148 tok/s +step 9395/19560 | loss 3.398242 (-0.72z)| norm 0.2630 (-0.83z)| lr 3.37e-04 | 322.98 ms | 52.3% bf16 MFU | 1624106 tok/s +step 9396/19560 | loss 3.384566 (-1.04z)| norm 0.2377 (-2.30z)| lr 3.37e-04 | 322.57 ms | 52.3% bf16 MFU | 1624167 tok/s +step 9397/19560 | loss 3.398888 (-0.68z)| norm 0.2698 (-0.38z)| lr 3.37e-04 | 323.17 ms | 52.2% bf16 MFU | 1624076 tok/s +step 9398/19560 | loss 3.446718 (+0.50z)| norm 0.2464 (-1.74z)| lr 3.37e-04 | 322.45 ms | 52.3% bf16 MFU | 1624170 tok/s +step 9399/19560 | loss 3.483299 (+1.38z)| norm 0.2461 (-1.74z)| lr 3.36e-04 | 322.45 ms | 52.3% bf16 MFU | 1624260 tok/s +step 9400/19560 | loss 3.433453 (+0.15z)| norm 0.2692 (-0.39z)| lr 3.36e-04 | 322.78 ms | 52.3% bf16 MFU | 1624261 tok/s +step 9401/19560 | loss 3.425845 (-0.06z)| norm 0.2363 (-2.26z)| lr 3.36e-04 | 323.02 ms | 52.2% bf16 MFU | 1624202 tok/s +step 9402/19560 | loss 3.444388 (+0.41z)| norm 0.2742 (-0.08z)| lr 3.36e-04 | 322.89 ms | 52.3% bf16 MFU | 1624179 tok/s +step 9403/19560 | loss 3.391920 (-0.89z)| norm 0.2726 (-0.18z)| lr 3.36e-04 | 322.10 ms | 52.4% bf16 MFU | 1624355 tok/s +step 9404/19560 | loss 3.391119 (-0.90z)| norm 0.2521 (-1.36z)| lr 3.36e-04 | 322.41 ms | 52.3% bf16 MFU | 1624444 tok/s +step 9405/19560 | loss 3.478980 (+1.27z)| norm 0.2788 (+0.17z)| lr 3.36e-04 | 322.54 ms | 52.3% bf16 MFU | 1624498 tok/s +step 9406/19560 | loss 3.472588 (+1.10z)| norm 0.2843 (+0.50z)| lr 3.36e-04 | 322.43 ms | 52.3% bf16 MFU | 1624577 tok/s +step 9407/19560 | loss 3.441433 (+0.34z)| norm 0.2930 (+1.06z)| lr 3.36e-04 | 322.36 ms | 52.4% bf16 MFU | 1624668 tok/s +step 9408/19560 | loss 3.421423 (-0.15z)| norm 0.2808 (+0.31z)| lr 3.36e-04 | 322.49 ms | 52.3% bf16 MFU | 1624723 tok/s +step 9409/19560 | loss 3.420683 (-0.16z)| norm 0.2779 (+0.14z)| lr 3.36e-04 | 322.55 ms | 52.3% bf16 MFU | 1624758 tok/s +step 9410/19560 | loss 3.418828 (-0.20z)| norm 0.3056 (+1.80z)| lr 3.36e-04 | 322.61 ms | 52.3% bf16 MFU | 1624777 tok/s +step 9411/19560 | loss 3.454445 (+0.68z)| norm 0.2952 (+1.16z)| lr 3.36e-04 | 322.43 ms | 52.3% bf16 MFU | 1624841 tok/s +step 9412/19560 | loss 3.457003 (+0.73z)| norm 0.2788 (+0.19z)| lr 3.36e-04 | 322.62 ms | 52.3% bf16 MFU | 1624854 tok/s +step 9413/19560 | loss 3.340709 (-2.11z)| norm 0.3122 (+2.20z)| lr 3.36e-04 | 323.16 ms | 52.2% bf16 MFU | 1624730 tok/s +step 9414/19560 | loss 3.435354 (+0.20z)| norm 0.2526 (-1.40z)| lr 3.36e-04 | 322.63 ms | 52.3% bf16 MFU | 1624747 tok/s +step 9415/19560 | loss 3.374478 (-1.28z)| norm 0.2839 (+0.50z)| lr 3.36e-04 | 322.74 ms | 52.3% bf16 MFU | 1624735 tok/s +step 9416/19560 | loss 3.395993 (-0.76z)| norm 0.2609 (-0.91z)| lr 3.36e-04 | 322.48 ms | 52.3% bf16 MFU | 1624787 tok/s +step 9417/19560 | loss 3.441001 (+0.34z)| norm 0.2832 (+0.46z)| lr 3.36e-04 | 322.36 ms | 52.4% bf16 MFU | 1624867 tok/s +step 9418/19560 | loss 3.459995 (+0.79z)| norm 0.2900 (+0.87z)| lr 3.36e-04 | 322.56 ms | 52.3% bf16 MFU | 1624894 tok/s +step 9419/19560 | loss 3.443259 (+0.38z)| norm 0.2689 (-0.43z)| lr 3.35e-04 | 322.89 ms | 52.3% bf16 MFU | 1624835 tok/s +step 9420/19560 | loss 3.370189 (-1.42z)| norm 0.2564 (-1.19z)| lr 3.35e-04 | 322.65 ms | 52.3% bf16 MFU | 1624840 tok/s +step 9421/19560 | loss 3.402342 (-0.62z)| norm 0.2702 (-0.34z)| lr 3.35e-04 | 322.50 ms | 52.3% bf16 MFU | 1624883 tok/s +step 9422/19560 | loss 3.385766 (-1.02z)| norm 0.2843 (+0.51z)| lr 3.35e-04 | 322.61 ms | 52.3% bf16 MFU | 1624895 tok/s +step 9423/19560 | loss 3.429618 (+0.05z)| norm 0.2556 (-1.24z)| lr 3.35e-04 | 322.32 ms | 52.4% bf16 MFU | 1624980 tok/s +step 9424/19560 | loss 3.502450 (+1.81z)| norm 0.2836 (+0.47z)| lr 3.35e-04 | 322.93 ms | 52.3% bf16 MFU | 1624906 tok/s +step 9425/19560 | loss 3.564057 (+3.16z)| norm 0.2613 (-0.89z)| lr 3.35e-04 | 322.80 ms | 52.3% bf16 MFU | 1624869 tok/s +step 9426/19560 | loss 3.432077 (+0.05z)| norm 0.2718 (-0.27z)| lr 3.35e-04 | 322.54 ms | 52.3% bf16 MFU | 1624901 tok/s +step 9427/19560 | loss 3.393736 (-0.84z)| norm 0.2678 (-0.52z)| lr 3.35e-04 | 322.59 ms | 52.3% bf16 MFU | 1624917 tok/s +step 9428/19560 | loss 3.428361 (-0.02z)| norm 0.2731 (-0.20z)| lr 3.35e-04 | 322.67 ms | 52.3% bf16 MFU | 1624914 tok/s +step 9429/19560 | loss 3.431750 (+0.05z)| norm 0.2528 (-1.48z)| lr 3.35e-04 | 322.52 ms | 52.3% bf16 MFU | 1624947 tok/s +step 9430/19560 | loss 3.443682 (+0.33z)| norm 0.2759 (-0.03z)| lr 3.35e-04 | 322.50 ms | 52.3% bf16 MFU | 1624985 tok/s +step 9431/19560 | loss 3.410707 (-0.48z)| norm 0.2359 (-2.45z)| lr 3.35e-04 | 322.52 ms | 52.3% bf16 MFU | 1625017 tok/s +step 9432/19560 | loss 3.477991 (+1.14z)| norm 0.2963 (+1.22z)| lr 3.35e-04 | 322.47 ms | 52.3% bf16 MFU | 1625059 tok/s +step 9433/19560 | loss 3.358592 (-1.73z)| norm 0.2626 (-0.81z)| lr 3.35e-04 | 322.63 ms | 52.3% bf16 MFU | 1625059 tok/s +step 9434/19560 | loss 3.444680 (+0.33z)| norm 0.2774 (+0.08z)| lr 3.35e-04 | 323.12 ms | 52.2% bf16 MFU | 1624935 tok/s +step 9435/19560 | loss 3.425455 (-0.12z)| norm 0.2741 (-0.13z)| lr 3.35e-04 | 322.17 ms | 52.4% bf16 MFU | 1625057 tok/s +step 9436/19560 | loss 3.422276 (-0.19z)| norm 0.2813 (+0.34z)| lr 3.35e-04 | 322.83 ms | 52.3% bf16 MFU | 1625006 tok/s +step 9437/19560 | loss 3.421096 (-0.22z)| norm 0.2694 (-0.41z)| lr 3.35e-04 | 322.68 ms | 52.3% bf16 MFU | 1624996 tok/s +step 9438/19560 | loss 3.432518 (+0.06z)| norm 0.3164 (+2.50z)| lr 3.35e-04 | 322.31 ms | 52.4% bf16 MFU | 1625080 tok/s +step 9439/19560 | loss 3.445301 (+0.38z)| norm 0.2706 (-0.32z)| lr 3.35e-04 | 322.75 ms | 52.3% bf16 MFU | 1625049 tok/s +step 9440/19560 | loss 3.462773 (+0.87z)| norm 0.2674 (-0.51z)| lr 3.34e-04 | 323.13 ms | 52.2% bf16 MFU | 1624924 tok/s +step 9441/19560 | loss 3.484311 (+1.40z)| norm 0.3059 (+1.88z)| lr 3.34e-04 | 322.33 ms | 52.4% bf16 MFU | 1625006 tok/s +step 9442/19560 | loss 3.398172 (-0.79z)| norm 0.2740 (-0.11z)| lr 3.34e-04 | 322.39 ms | 52.3% bf16 MFU | 1625068 tok/s +step 9443/19560 | loss 3.488088 (+1.49z)| norm 0.2973 (+1.34z)| lr 3.34e-04 | 322.81 ms | 52.3% bf16 MFU | 1625022 tok/s +step 9444/19560 | loss 3.411378 (-0.45z)| norm 0.3081 (+1.97z)| lr 3.34e-04 | 322.77 ms | 52.3% bf16 MFU | 1624987 tok/s +step 9445/19560 | loss 3.364311 (-1.62z)| norm 0.2872 (+0.70z)| lr 3.34e-04 | 322.83 ms | 52.3% bf16 MFU | 1624940 tok/s +step 9446/19560 | loss 3.414347 (-0.36z)| norm 0.3146 (+2.31z)| lr 3.34e-04 | 322.73 ms | 52.3% bf16 MFU | 1624921 tok/s +step 9447/19560 | loss 3.487561 (+1.57z)| norm 0.3160 (+2.33z)| lr 3.34e-04 | 322.81 ms | 52.3% bf16 MFU | 1624881 tok/s +step 9448/19560 | loss 3.450596 (+0.61z)| norm 0.2810 (+0.27z)| lr 3.34e-04 | 322.77 ms | 52.3% bf16 MFU | 1624854 tok/s +step 9449/19560 | loss 3.452591 (+0.67z)| norm 0.2977 (+1.25z)| lr 3.34e-04 | 322.31 ms | 52.4% bf16 MFU | 1624944 tok/s +step 9450/19560 | loss 3.432281 (+0.13z)| norm 0.2647 (-0.68z)| lr 3.34e-04 | 322.68 ms | 52.3% bf16 MFU | 1624936 tok/s +step 9451/19560 | loss 3.429868 (+0.06z)| norm 0.3088 (+1.93z)| lr 3.34e-04 | 322.95 ms | 52.3% bf16 MFU | 1624862 tok/s +step 9452/19560 | loss 3.467342 (+1.05z)| norm 0.2773 (+0.08z)| lr 3.34e-04 | 322.47 ms | 52.3% bf16 MFU | 1624912 tok/s +step 9453/19560 | loss 3.381235 (-1.23z)| norm 0.2795 (+0.22z)| lr 3.34e-04 | 322.92 ms | 52.3% bf16 MFU | 1624846 tok/s +step 9454/19560 | loss 3.419372 (-0.20z)| norm 0.2797 (+0.24z)| lr 3.34e-04 | 322.98 ms | 52.3% bf16 MFU | 1624769 tok/s +step 9455/19560 | loss 3.433297 (+0.17z)| norm 0.2793 (+0.21z)| lr 3.34e-04 | 322.58 ms | 52.3% bf16 MFU | 1624796 tok/s +step 9456/19560 | loss 3.448240 (+0.56z)| norm 0.2668 (-0.54z)| lr 3.34e-04 | 322.34 ms | 52.4% bf16 MFU | 1624880 tok/s +step 9457/19560 | loss 3.472896 (+1.21z)| norm 0.2819 (+0.38z)| lr 3.34e-04 | 322.93 ms | 52.3% bf16 MFU | 1624812 tok/s +step 9458/19560 | loss 3.389090 (-1.02z)| norm 0.3017 (+1.59z)| lr 3.34e-04 | 322.82 ms | 52.3% bf16 MFU | 1624776 tok/s +step 9459/19560 | loss 3.414068 (-0.35z)| norm 0.2844 (+0.52z)| lr 3.34e-04 | 322.26 ms | 52.4% bf16 MFU | 1624883 tok/s +step 9460/19560 | loss 3.422533 (-0.12z)| norm 0.3049 (+1.74z)| lr 3.33e-04 | 322.77 ms | 52.3% bf16 MFU | 1624855 tok/s +step 9461/19560 | loss 3.446362 (+0.51z)| norm 0.2727 (-0.21z)| lr 3.33e-04 | 322.69 ms | 52.3% bf16 MFU | 1624848 tok/s +step 9462/19560 | loss 3.412710 (-0.39z)| norm 0.2787 (+0.15z)| lr 3.33e-04 | 322.47 ms | 52.3% bf16 MFU | 1624898 tok/s +step 9463/19560 | loss 3.506064 (+2.06z)| norm 0.2792 (+0.18z)| lr 3.33e-04 | 323.10 ms | 52.2% bf16 MFU | 1624788 tok/s +step 9464/19560 | loss 3.395244 (-0.85z)| norm 0.2961 (+1.20z)| lr 3.33e-04 | 322.39 ms | 52.3% bf16 MFU | 1624860 tok/s +step 9465/19560 | loss 3.402099 (-0.67z)| norm 0.2477 (-1.71z)| lr 3.33e-04 | 322.63 ms | 52.3% bf16 MFU | 1624869 tok/s +step 9466/19560 | loss 3.391122 (-0.95z)| norm 0.3037 (+1.63z)| lr 3.33e-04 | 322.53 ms | 52.3% bf16 MFU | 1624902 tok/s +step 9467/19560 | loss 3.465262 (+0.99z)| norm 0.2829 (+0.39z)| lr 3.33e-04 | 322.75 ms | 52.3% bf16 MFU | 1624880 tok/s +step 9468/19560 | loss 3.398269 (-0.76z)| norm 0.2879 (+0.68z)| lr 3.33e-04 | 322.75 ms | 52.3% bf16 MFU | 1624859 tok/s +step 9469/19560 | loss 3.430142 (+0.08z)| norm 0.2687 (-0.45z)| lr 3.33e-04 | 322.57 ms | 52.3% bf16 MFU | 1624882 tok/s +step 9470/19560 | loss 3.417952 (-0.24z)| norm 0.2757 (-0.03z)| lr 3.33e-04 | 322.78 ms | 52.3% bf16 MFU | 1624853 tok/s +step 9471/19560 | loss 3.451258 (+0.64z)| norm 0.2632 (-0.77z)| lr 3.33e-04 | 323.50 ms | 52.2% bf16 MFU | 1624645 tok/s +step 9472/19560 | loss 3.421206 (-0.16z)| norm 0.2822 (+0.36z)| lr 3.33e-04 | 322.81 ms | 52.3% bf16 MFU | 1624621 tok/s +step 9473/19560 | loss 3.405468 (-0.57z)| norm 0.2669 (-0.55z)| lr 3.33e-04 | 323.29 ms | 52.2% bf16 MFU | 1624475 tok/s +step 9474/19560 | loss 3.392671 (-0.90z)| norm 0.2717 (-0.26z)| lr 3.33e-04 | 322.85 ms | 52.3% bf16 MFU | 1624447 tok/s +step 9475/19560 | loss 3.420556 (-0.16z)| norm 0.2725 (-0.21z)| lr 3.33e-04 | 322.96 ms | 52.3% bf16 MFU | 1624395 tok/s +step 9476/19560 | loss 3.370253 (-1.46z)| norm 0.3004 (+1.44z)| lr 3.33e-04 | 322.56 ms | 52.3% bf16 MFU | 1624446 tok/s +step 9477/19560 | loss 3.523122 (+2.48z)| norm 0.2938 (+1.04z)| lr 3.33e-04 | 322.47 ms | 52.3% bf16 MFU | 1624516 tok/s +step 9478/19560 | loss 3.456463 (+0.76z)| norm 0.3110 (+2.01z)| lr 3.33e-04 | 322.31 ms | 52.4% bf16 MFU | 1624624 tok/s +step 9479/19560 | loss 3.396101 (-0.80z)| norm 0.2805 (+0.23z)| lr 3.33e-04 | 322.71 ms | 52.3% bf16 MFU | 1624624 tok/s +step 9480/19560 | loss 3.379694 (-1.21z)| norm 0.2859 (+0.54z)| lr 3.32e-04 | 322.48 ms | 52.3% bf16 MFU | 1624684 tok/s +step 9481/19560 | loss 3.370990 (-1.42z)| norm 0.2971 (+1.18z)| lr 3.32e-04 | 323.10 ms | 52.2% bf16 MFU | 1624582 tok/s +step 9482/19560 | loss 3.437275 (+0.27z)| norm 0.2822 (+0.31z)| lr 3.32e-04 | 322.87 ms | 52.3% bf16 MFU | 1624545 tok/s +step 9483/19560 | loss 3.503578 (+1.92z)| norm 0.3011 (+1.39z)| lr 3.32e-04 | 322.46 ms | 52.3% bf16 MFU | 1624614 tok/s +step 9484/19560 | loss 3.432920 (+0.12z)| norm 0.2929 (+0.92z)| lr 3.32e-04 | 322.55 ms | 52.3% bf16 MFU | 1624654 tok/s +step 9485/19560 | loss 3.457134 (+0.73z)| norm 0.3087 (+1.80z)| lr 3.32e-04 | 323.06 ms | 52.2% bf16 MFU | 1624566 tok/s +step 9486/19560 | loss 3.386091 (-1.07z)| norm 0.2799 (+0.16z)| lr 3.32e-04 | 322.12 ms | 52.4% bf16 MFU | 1624718 tok/s +step 9487/19560 | loss 3.378986 (-1.24z)| norm 0.2861 (+0.53z)| lr 3.32e-04 | 322.39 ms | 52.3% bf16 MFU | 1624794 tok/s +step 9488/19560 | loss 3.395498 (-0.81z)| norm 0.2801 (+0.17z)| lr 3.32e-04 | 322.82 ms | 52.3% bf16 MFU | 1624758 tok/s +step 9489/19560 | loss 3.416463 (-0.28z)| norm 0.2705 (-0.38z)| lr 3.32e-04 | 323.39 ms | 52.2% bf16 MFU | 1624581 tok/s +step 9490/19560 | loss 3.479631 (+1.31z)| norm 0.2818 (+0.28z)| lr 3.32e-04 | 322.58 ms | 52.3% bf16 MFU | 1624616 tok/s +step 9491/19560 | loss 3.437658 (+0.26z)| norm 0.2676 (-0.55z)| lr 3.32e-04 | 322.81 ms | 52.3% bf16 MFU | 1624592 tok/s +step 9492/19560 | loss 3.362331 (-1.63z)| norm 0.2812 (+0.25z)| lr 3.32e-04 | 322.64 ms | 52.3% bf16 MFU | 1624612 tok/s +step 9493/19560 | loss 3.424560 (-0.04z)| norm 0.2568 (-1.17z)| lr 3.32e-04 | 322.96 ms | 52.3% bf16 MFU | 1624550 tok/s +step 9494/19560 | loss 3.413147 (-0.33z)| norm 0.2741 (-0.16z)| lr 3.32e-04 | 322.86 ms | 52.3% bf16 MFU | 1624516 tok/s +step 9495/19560 | loss 3.348840 (-1.95z)| norm 0.3289 (+2.93z)| lr 3.32e-04 | 323.19 ms | 52.2% bf16 MFU | 1624401 tok/s +step 9496/19560 | loss 3.489193 (+1.60z)| norm 0.3127 (+1.96z)| lr 3.32e-04 | 322.79 ms | 52.3% bf16 MFU | 1624392 tok/s +step 9497/19560 | loss 3.370372 (-1.38z)| norm 0.2914 (+0.75z)| lr 3.32e-04 | 322.65 ms | 52.3% bf16 MFU | 1624418 tok/s +step 9498/19560 | loss 3.414044 (-0.27z)| norm 0.2753 (-0.15z)| lr 3.32e-04 | 322.67 ms | 52.3% bf16 MFU | 1624438 tok/s +step 9499/19560 | loss 3.340934 (-2.07z)| norm 0.2967 (+1.03z)| lr 3.32e-04 | 322.23 ms | 52.4% bf16 MFU | 1624570 tok/s +step 9500/19560 | loss 3.457988 (+0.82z)| norm 0.2829 (+0.25z)| lr 3.31e-04 | 323.65 ms | 52.1% bf16 MFU | 1624338 tok/s +val loss 3.408037 +evaluating HellaSwag: 0/79 evaluating HellaSwag: 10/79 evaluating HellaSwag: 20/79 evaluating HellaSwag: 30/79 evaluating HellaSwag: 40/79 evaluating HellaSwag: 50/79 evaluating HellaSwag: 60/79 evaluating HellaSwag: 70/79 HellaSwag: 2870/10042 = 0.285800 +step 9501/19560 | loss 3.398712 (-0.64z)| norm 0.2971 (+1.04z)| lr 3.31e-04 | 322.53 ms | 52.3% bf16 MFU | 1624398 tok/s +step 9502/19560 | loss 3.445105 (+0.56z)| norm 0.2628 (-0.88z)| lr 3.31e-04 | 322.60 ms | 52.3% bf16 MFU | 1624438 tok/s +step 9503/19560 | loss 3.410753 (-0.32z)| norm 0.2675 (-0.63z)| lr 3.31e-04 | 323.18 ms | 52.2% bf16 MFU | 1624329 tok/s +step 9504/19560 | loss 3.390606 (-0.84z)| norm 0.2527 (-1.44z)| lr 3.31e-04 | 323.72 ms | 52.1% bf16 MFU | 1624091 tok/s +step 9505/19560 | loss 3.410942 (-0.32z)| norm 0.2714 (-0.40z)| lr 3.31e-04 | 322.98 ms | 52.3% bf16 MFU | 1624050 tok/s +step 9506/19560 | loss 3.357731 (-1.69z)| norm 0.2807 (+0.12z)| lr 3.31e-04 | 323.17 ms | 52.2% bf16 MFU | 1623963 tok/s +step 9507/19560 | loss 3.389888 (-0.86z)| norm 0.2544 (-1.35z)| lr 3.31e-04 | 322.80 ms | 52.3% bf16 MFU | 1623974 tok/s +step 9508/19560 | loss 3.437574 (+0.37z)| norm 0.2530 (-1.41z)| lr 3.31e-04 | 323.01 ms | 52.2% bf16 MFU | 1623932 tok/s +step 9509/19560 | loss 3.419964 (-0.08z)| norm 0.2771 (-0.07z)| lr 3.31e-04 | 323.19 ms | 52.2% bf16 MFU | 1623848 tok/s +step 9510/19560 | loss 3.413815 (-0.24z)| norm 0.2705 (-0.44z)| lr 3.31e-04 | 322.96 ms | 52.3% bf16 MFU | 1623824 tok/s +step 9511/19560 | loss 3.431769 (+0.22z)| norm 0.2834 (+0.27z)| lr 3.31e-04 | 323.27 ms | 52.2% bf16 MFU | 1623725 tok/s +step 9512/19560 | loss 3.430604 (+0.18z)| norm 0.2881 (+0.53z)| lr 3.31e-04 | 323.14 ms | 52.2% bf16 MFU | 1623662 tok/s +step 9513/19560 | loss 3.381630 (-1.07z)| norm 0.2529 (-1.41z)| lr 3.31e-04 | 323.10 ms | 52.2% bf16 MFU | 1623614 tok/s +step 9514/19560 | loss 3.487438 (+1.62z)| norm 0.2912 (+0.70z)| lr 3.31e-04 | 323.11 ms | 52.2% bf16 MFU | 1623565 tok/s +step 9515/19560 | loss 3.356934 (-1.70z)| norm 0.2852 (+0.37z)| lr 3.31e-04 | 322.93 ms | 52.3% bf16 MFU | 1623563 tok/s +step 9516/19560 | loss 3.425980 (+0.05z)| norm 0.2887 (+0.55z)| lr 3.31e-04 | 322.67 ms | 52.3% bf16 MFU | 1623628 tok/s +step 9517/19560 | loss 3.378067 (-1.17z)| norm 0.3037 (+1.36z)| lr 3.31e-04 | 322.79 ms | 52.3% bf16 MFU | 1623658 tok/s +step 9518/19560 | loss 3.417153 (-0.19z)| norm 0.2831 (+0.22z)| lr 3.31e-04 | 322.97 ms | 52.3% bf16 MFU | 1623641 tok/s +step 9519/19560 | loss 3.363410 (-1.53z)| norm 0.2638 (-0.84z)| lr 3.31e-04 | 323.29 ms | 52.2% bf16 MFU | 1623545 tok/s +step 9520/19560 | loss 3.470274 (+1.18z)| norm 0.2994 (+1.10z)| lr 3.30e-04 | 323.38 ms | 52.2% bf16 MFU | 1623431 tok/s +step 9521/19560 | loss 3.402656 (-0.54z)| norm 0.2699 (-0.54z)| lr 3.30e-04 | 322.39 ms | 52.4% bf16 MFU | 1623573 tok/s +step 9522/19560 | loss 3.420963 (-0.07z)| norm 0.2783 (-0.07z)| lr 3.30e-04 | 322.99 ms | 52.3% bf16 MFU | 1623557 tok/s +step 9523/19560 | loss 3.404759 (-0.48z)| norm 0.2726 (-0.39z)| lr 3.30e-04 | 323.75 ms | 52.1% bf16 MFU | 1623351 tok/s +step 9524/19560 | loss 3.417156 (-0.18z)| norm 0.2863 (+0.36z)| lr 3.30e-04 | 323.12 ms | 52.2% bf16 MFU | 1623311 tok/s +step 9525/19560 | loss 3.452929 (+0.72z)| norm 0.2840 (+0.23z)| lr 3.30e-04 | 322.81 ms | 52.3% bf16 MFU | 1623353 tok/s +step 9526/19560 | loss 3.382871 (-1.05z)| norm 0.2656 (-0.85z)| lr 3.30e-04 | 322.90 ms | 52.3% bf16 MFU | 1623369 tok/s +step 9527/19560 | loss 3.452681 (+0.74z)| norm 0.2880 (+0.44z)| lr 3.30e-04 | 323.31 ms | 52.2% bf16 MFU | 1623282 tok/s +step 9528/19560 | loss 3.415507 (-0.21z)| norm 0.2782 (-0.14z)| lr 3.30e-04 | 322.65 ms | 52.3% bf16 MFU | 1623366 tok/s +step 9529/19560 | loss 3.358057 (-1.65z)| norm 0.3546 (+4.13z)| lr 3.30e-04 | 322.86 ms | 52.3% bf16 MFU | 1623391 tok/s +step 9530/19560 | loss 3.426531 (+0.09z)| norm 0.3087 (+1.51z)| lr 3.30e-04 | 323.10 ms | 52.2% bf16 MFU | 1623355 tok/s +step 9531/19560 | loss 3.471906 (+1.22z)| norm 0.2744 (-0.42z)| lr 3.30e-04 | 322.90 ms | 52.3% bf16 MFU | 1623372 tok/s +step 9532/19560 | loss 3.513239 (+2.20z)| norm 0.2928 (+0.60z)| lr 3.30e-04 | 322.35 ms | 52.4% bf16 MFU | 1623526 tok/s +step 9533/19560 | loss 3.499510 (+1.85z)| norm 0.2848 (+0.15z)| lr 3.30e-04 | 322.78 ms | 52.3% bf16 MFU | 1623564 tok/s +step 9534/19560 | loss 3.420678 (-0.09z)| norm 0.2687 (-0.75z)| lr 3.30e-04 | 322.77 ms | 52.3% bf16 MFU | 1623602 tok/s +step 9535/19560 | loss 3.407461 (-0.41z)| norm 0.2589 (-1.28z)| lr 3.30e-04 | 322.81 ms | 52.3% bf16 MFU | 1623630 tok/s +step 9536/19560 | loss 3.378646 (-1.11z)| norm 0.2682 (-0.75z)| lr 3.30e-04 | 322.81 ms | 52.3% bf16 MFU | 1623656 tok/s +step 9537/19560 | loss 3.447454 (+0.58z)| norm 0.2508 (-1.70z)| lr 3.30e-04 | 323.17 ms | 52.2% bf16 MFU | 1623590 tok/s +step 9538/19560 | loss 3.448501 (+0.60z)| norm 0.2877 (+0.36z)| lr 3.30e-04 | 322.98 ms | 52.3% bf16 MFU | 1623574 tok/s +step 9539/19560 | loss 3.383805 (-0.98z)| norm 0.2726 (-0.48z)| lr 3.30e-04 | 322.44 ms | 52.3% bf16 MFU | 1623695 tok/s +step 9540/19560 | loss 3.386996 (-0.89z)| norm 0.2851 (+0.22z)| lr 3.29e-04 | 322.47 ms | 52.3% bf16 MFU | 1623801 tok/s +step 9541/19560 | loss 3.405843 (-0.44z)| norm 0.2619 (-1.06z)| lr 3.29e-04 | 323.20 ms | 52.2% bf16 MFU | 1623721 tok/s +step 9542/19560 | loss 3.408195 (-0.38z)| norm 0.3054 (+1.36z)| lr 3.29e-04 | 322.83 ms | 52.3% bf16 MFU | 1623736 tok/s +step 9543/19560 | loss 3.497842 (+1.82z)| norm 0.2833 (+0.12z)| lr 3.29e-04 | 322.95 ms | 52.3% bf16 MFU | 1623721 tok/s +step 9544/19560 | loss 3.473863 (+1.21z)| norm 0.2939 (+0.70z)| lr 3.29e-04 | 322.90 ms | 52.3% bf16 MFU | 1623719 tok/s +step 9545/19560 | loss 3.450199 (+0.62z)| norm 0.2683 (-0.74z)| lr 3.29e-04 | 322.36 ms | 52.4% bf16 MFU | 1623853 tok/s +step 9546/19560 | loss 3.388040 (-0.90z)| norm 0.2980 (+0.93z)| lr 3.29e-04 | 322.72 ms | 52.3% bf16 MFU | 1623890 tok/s +step 9547/19560 | loss 3.427848 (+0.09z)| norm 0.3167 (+1.94z)| lr 3.29e-04 | 322.79 ms | 52.3% bf16 MFU | 1623908 tok/s +step 9548/19560 | loss 3.439544 (+0.37z)| norm 0.2899 (+0.44z)| lr 3.29e-04 | 323.39 ms | 52.2% bf16 MFU | 1623774 tok/s +step 9549/19560 | loss 3.352725 (-1.76z)| norm 0.2819 (-0.01z)| lr 3.29e-04 | 322.71 ms | 52.3% bf16 MFU | 1623818 tok/s +step 9550/19560 | loss 3.443189 (+0.45z)| norm 0.3039 (+1.21z)| lr 3.29e-04 | 322.73 ms | 52.3% bf16 MFU | 1623855 tok/s +step 9551/19560 | loss 3.378059 (-1.14z)| norm 0.2841 (+0.09z)| lr 3.29e-04 | 322.84 ms | 52.3% bf16 MFU | 1623862 tok/s +step 9552/19560 | loss 3.483960 (+1.47z)| norm 0.3115 (+1.60z)| lr 3.29e-04 | 322.51 ms | 52.3% bf16 MFU | 1623951 tok/s +step 9553/19560 | loss 3.404797 (-0.47z)| norm 0.2963 (+0.74z)| lr 3.29e-04 | 322.64 ms | 52.3% bf16 MFU | 1624004 tok/s +step 9554/19560 | loss 3.389563 (-0.86z)| norm 0.2833 (+0.01z)| lr 3.29e-04 | 322.65 ms | 52.3% bf16 MFU | 1624052 tok/s +step 9555/19560 | loss 3.433996 (+0.28z)| norm 0.2644 (-1.05z)| lr 3.29e-04 | 322.86 ms | 52.3% bf16 MFU | 1624044 tok/s +step 9556/19560 | loss 3.442869 (+0.51z)| norm 0.2988 (+0.87z)| lr 3.29e-04 | 322.45 ms | 52.3% bf16 MFU | 1624139 tok/s +step 9557/19560 | loss 3.474622 (+1.31z)| norm 0.2587 (-1.38z)| lr 3.29e-04 | 322.97 ms | 52.3% bf16 MFU | 1624098 tok/s +step 9558/19560 | loss 3.495613 (+1.82z)| norm 0.2848 (+0.08z)| lr 3.29e-04 | 322.49 ms | 52.3% bf16 MFU | 1624179 tok/s +step 9559/19560 | loss 3.334553 (-2.21z)| norm 0.2526 (-1.76z)| lr 3.29e-04 | 322.65 ms | 52.3% bf16 MFU | 1624218 tok/s +step 9560/19560 | loss 3.402310 (-0.51z)| norm 0.2607 (-1.28z)| lr 3.28e-04 | 322.43 ms | 52.3% bf16 MFU | 1624310 tok/s +step 9561/19560 | loss 3.378043 (-1.13z)| norm 0.2859 (+0.14z)| lr 3.28e-04 | 322.47 ms | 52.3% bf16 MFU | 1624387 tok/s +step 9562/19560 | loss 3.457706 (+0.87z)| norm 0.2830 (-0.03z)| lr 3.28e-04 | 323.06 ms | 52.2% bf16 MFU | 1624312 tok/s +step 9563/19560 | loss 3.415602 (-0.18z)| norm 0.2741 (-0.53z)| lr 3.28e-04 | 322.16 ms | 52.4% bf16 MFU | 1624466 tok/s +step 9564/19560 | loss 3.451457 (+0.71z)| norm 0.2803 (-0.18z)| lr 3.28e-04 | 322.10 ms | 52.4% bf16 MFU | 1624630 tok/s +step 9565/19560 | loss 3.379339 (-1.09z)| norm 0.2815 (-0.12z)| lr 3.28e-04 | 322.62 ms | 52.3% bf16 MFU | 1624652 tok/s +step 9566/19560 | loss 3.456500 (+0.83z)| norm 0.2536 (-1.69z)| lr 3.28e-04 | 322.86 ms | 52.3% bf16 MFU | 1624614 tok/s +step 9567/19560 | loss 3.421494 (-0.03z)| norm 0.2576 (-1.45z)| lr 3.28e-04 | 322.47 ms | 52.3% bf16 MFU | 1624676 tok/s +step 9568/19560 | loss 3.437414 (+0.37z)| norm 0.2525 (-1.72z)| lr 3.28e-04 | 322.32 ms | 52.4% bf16 MFU | 1624773 tok/s +step 9569/19560 | loss 3.432430 (+0.26z)| norm 0.2607 (-1.23z)| lr 3.28e-04 | 322.35 ms | 52.4% bf16 MFU | 1624856 tok/s +step 9570/19560 | loss 3.503955 (+2.01z)| norm 0.2553 (-1.52z)| lr 3.28e-04 | 322.45 ms | 52.3% bf16 MFU | 1624911 tok/s +step 9571/19560 | loss 3.390919 (-0.79z)| norm 0.2698 (-0.69z)| lr 3.28e-04 | 322.32 ms | 52.4% bf16 MFU | 1624996 tok/s +step 9572/19560 | loss 3.393704 (-0.71z)| norm 0.2661 (-0.89z)| lr 3.28e-04 | 322.86 ms | 52.3% bf16 MFU | 1624941 tok/s +step 9573/19560 | loss 3.419688 (-0.07z)| norm 0.2611 (-1.15z)| lr 3.28e-04 | 322.44 ms | 52.3% bf16 MFU | 1624994 tok/s +step 9574/19560 | loss 3.440633 (+0.45z)| norm 0.2997 (+1.03z)| lr 3.28e-04 | 322.67 ms | 52.3% bf16 MFU | 1624987 tok/s +step 9575/19560 | loss 3.408379 (-0.35z)| norm 0.2487 (-1.83z)| lr 3.28e-04 | 322.32 ms | 52.4% bf16 MFU | 1625069 tok/s +step 9576/19560 | loss 3.450395 (+0.72z)| norm 0.2820 (+0.06z)| lr 3.28e-04 | 322.22 ms | 52.4% bf16 MFU | 1625171 tok/s +step 9577/19560 | loss 3.389889 (-0.81z)| norm 0.2592 (-1.22z)| lr 3.28e-04 | 322.37 ms | 52.4% bf16 MFU | 1625230 tok/s +step 9578/19560 | loss 3.357328 (-1.61z)| norm 0.2739 (-0.39z)| lr 3.28e-04 | 322.49 ms | 52.3% bf16 MFU | 1625255 tok/s +step 9579/19560 | loss 3.438962 (+0.45z)| norm 0.2754 (-0.29z)| lr 3.28e-04 | 322.96 ms | 52.3% bf16 MFU | 1625162 tok/s +step 9580/19560 | loss 3.403343 (-0.44z)| norm 0.3030 (+1.27z)| lr 3.27e-04 | 323.16 ms | 52.2% bf16 MFU | 1625023 tok/s +step 9581/19560 | loss 3.392036 (-0.73z)| norm 0.2803 (-0.02z)| lr 3.27e-04 | 322.34 ms | 52.4% bf16 MFU | 1625098 tok/s +step 9582/19560 | loss 3.433764 (+0.33z)| norm 0.2541 (-1.49z)| lr 3.27e-04 | 322.82 ms | 52.3% bf16 MFU | 1625048 tok/s +step 9583/19560 | loss 3.405005 (-0.40z)| norm 0.2921 (+0.65z)| lr 3.27e-04 | 322.52 ms | 52.3% bf16 MFU | 1625075 tok/s +step 9584/19560 | loss 3.427612 (+0.18z)| norm 0.2695 (-0.63z)| lr 3.27e-04 | 322.48 ms | 52.3% bf16 MFU | 1625112 tok/s +step 9585/19560 | loss 3.419753 (-0.01z)| norm 0.2765 (-0.23z)| lr 3.27e-04 | 322.31 ms | 52.4% bf16 MFU | 1625189 tok/s +step 9586/19560 | loss 3.376671 (-1.10z)| norm 0.2745 (-0.33z)| lr 3.27e-04 | 322.62 ms | 52.3% bf16 MFU | 1625184 tok/s +step 9587/19560 | loss 3.430777 (+0.27z)| norm 0.2897 (+0.53z)| lr 3.27e-04 | 322.72 ms | 52.3% bf16 MFU | 1625155 tok/s +step 9588/19560 | loss 3.386595 (-0.84z)| norm 0.2476 (-1.82z)| lr 3.27e-04 | 323.10 ms | 52.2% bf16 MFU | 1625031 tok/s +step 9589/19560 | loss 3.412277 (-0.19z)| norm 0.2861 (+0.34z)| lr 3.27e-04 | 322.28 ms | 52.4% bf16 MFU | 1625120 tok/s +step 9590/19560 | loss 3.412431 (-0.18z)| norm 0.2816 (+0.09z)| lr 3.27e-04 | 322.19 ms | 52.4% bf16 MFU | 1625226 tok/s +step 9591/19560 | loss 3.411419 (-0.19z)| norm 0.2841 (+0.22z)| lr 3.27e-04 | 322.69 ms | 52.3% bf16 MFU | 1625203 tok/s +step 9592/19560 | loss 3.439967 (+0.54z)| norm 0.2766 (-0.19z)| lr 3.27e-04 | 322.73 ms | 52.3% bf16 MFU | 1625171 tok/s +step 9593/19560 | loss 3.431129 (+0.31z)| norm 0.2789 (-0.07z)| lr 3.27e-04 | 322.63 ms | 52.3% bf16 MFU | 1625164 tok/s +step 9594/19560 | loss 3.416606 (-0.08z)| norm 0.2683 (-0.67z)| lr 3.27e-04 | 322.77 ms | 52.3% bf16 MFU | 1625123 tok/s +step 9595/19560 | loss 3.357013 (-1.60z)| norm 0.2670 (-0.74z)| lr 3.27e-04 | 322.55 ms | 52.3% bf16 MFU | 1625140 tok/s +step 9596/19560 | loss 3.469311 (+1.29z)| norm 0.3049 (+1.43z)| lr 3.27e-04 | 322.29 ms | 52.4% bf16 MFU | 1625222 tok/s +step 9597/19560 | loss 3.319048 (-2.50z)| norm 0.2748 (-0.30z)| lr 3.27e-04 | 322.93 ms | 52.3% bf16 MFU | 1625136 tok/s +step 9598/19560 | loss 3.426394 (+0.20z)| norm 0.2720 (-0.45z)| lr 3.27e-04 | 322.51 ms | 52.3% bf16 MFU | 1625162 tok/s +step 9599/19560 | loss 3.398706 (-0.49z)| norm 0.2724 (-0.44z)| lr 3.27e-04 | 322.29 ms | 52.4% bf16 MFU | 1625243 tok/s +step 9600/19560 | loss 3.516841 (+2.41z)| norm 0.2780 (-0.11z)| lr 3.27e-04 | 322.74 ms | 52.3% bf16 MFU | 1625206 tok/s +step 9601/19560 | loss 3.467596 (+1.18z)| norm 0.2989 (+1.07z)| lr 3.26e-04 | 322.48 ms | 52.3% bf16 MFU | 1625235 tok/s +step 9602/19560 | loss 3.374290 (-1.10z)| norm 0.2793 (-0.06z)| lr 3.26e-04 | 322.68 ms | 52.3% bf16 MFU | 1625213 tok/s +step 9603/19560 | loss 3.390387 (-0.70z)| norm 0.2762 (-0.24z)| lr 3.26e-04 | 322.42 ms | 52.3% bf16 MFU | 1625256 tok/s +step 9604/19560 | loss 3.341550 (-1.86z)| norm 0.2918 (+0.67z)| lr 3.26e-04 | 322.54 ms | 52.3% bf16 MFU | 1625269 tok/s +step 9605/19560 | loss 3.390992 (-0.66z)| norm 0.3027 (+1.28z)| lr 3.26e-04 | 323.32 ms | 52.2% bf16 MFU | 1625084 tok/s +step 9606/19560 | loss 3.572043 (+3.61z)| norm 0.2770 (-0.18z)| lr 3.26e-04 | 322.11 ms | 52.4% bf16 MFU | 1625213 tok/s +step 9607/19560 | loss 3.410288 (-0.20z)| norm 0.3096 (+1.68z)| lr 3.26e-04 | 322.81 ms | 52.3% bf16 MFU | 1625159 tok/s +step 9608/19560 | loss 3.407198 (-0.28z)| norm 0.2905 (+0.58z)| lr 3.26e-04 | 322.48 ms | 52.3% bf16 MFU | 1625191 tok/s +step 9609/19560 | loss 3.412926 (-0.15z)| norm 0.2706 (-0.54z)| lr 3.26e-04 | 322.49 ms | 52.3% bf16 MFU | 1625218 tok/s +step 9610/19560 | loss 3.496989 (+1.81z)| norm 0.3049 (+1.40z)| lr 3.26e-04 | 322.72 ms | 52.3% bf16 MFU | 1625186 tok/s +step 9611/19560 | loss 3.463747 (+1.05z)| norm 0.2823 (+0.12z)| lr 3.26e-04 | 322.26 ms | 52.4% bf16 MFU | 1625271 tok/s +step 9612/19560 | loss 3.417464 (-0.04z)| norm 0.2945 (+0.82z)| lr 3.26e-04 | 322.62 ms | 52.3% bf16 MFU | 1625261 tok/s +step 9613/19560 | loss 3.419121 (+0.00z)| norm 0.2960 (+0.92z)| lr 3.26e-04 | 322.87 ms | 52.3% bf16 MFU | 1625191 tok/s +step 9614/19560 | loss 3.437354 (+0.43z)| norm 0.2916 (+0.66z)| lr 3.26e-04 | 322.59 ms | 52.3% bf16 MFU | 1625194 tok/s +step 9615/19560 | loss 3.409099 (-0.25z)| norm 0.2623 (-1.01z)| lr 3.26e-04 | 322.54 ms | 52.3% bf16 MFU | 1625209 tok/s +step 9616/19560 | loss 3.368961 (-1.20z)| norm 0.2766 (-0.19z)| lr 3.26e-04 | 322.57 ms | 52.3% bf16 MFU | 1625216 tok/s +step 9617/19560 | loss 3.448697 (+0.69z)| norm 0.2681 (-0.68z)| lr 3.26e-04 | 323.11 ms | 52.2% bf16 MFU | 1625086 tok/s +step 9618/19560 | loss 3.414952 (-0.10z)| norm 0.2377 (-2.34z)| lr 3.26e-04 | 322.85 ms | 52.3% bf16 MFU | 1625028 tok/s +step 9619/19560 | loss 3.419129 (+0.00z)| norm 0.2704 (-0.52z)| lr 3.26e-04 | 322.51 ms | 52.3% bf16 MFU | 1625060 tok/s +step 9620/19560 | loss 3.391438 (-0.67z)| norm 0.2648 (-0.82z)| lr 3.26e-04 | 322.71 ms | 52.3% bf16 MFU | 1625039 tok/s +step 9621/19560 | loss 3.486199 (+1.59z)| norm 0.2896 (+0.55z)| lr 3.25e-04 | 322.54 ms | 52.3% bf16 MFU | 1625061 tok/s +step 9622/19560 | loss 3.402745 (-0.40z)| norm 0.2591 (-1.15z)| lr 3.25e-04 | 322.88 ms | 52.3% bf16 MFU | 1624998 tok/s +step 9623/19560 | loss 3.464547 (+1.06z)| norm 0.2860 (+0.39z)| lr 3.25e-04 | 322.58 ms | 52.3% bf16 MFU | 1625012 tok/s +step 9624/19560 | loss 3.343124 (-1.83z)| norm 0.2689 (-0.59z)| lr 3.25e-04 | 323.18 ms | 52.2% bf16 MFU | 1624875 tok/s +step 9625/19560 | loss 3.628503 (+4.56z)| norm 0.2757 (-0.18z)| lr 3.25e-04 | 322.59 ms | 52.3% bf16 MFU | 1624893 tok/s +step 9626/19560 | loss 3.359375 (-1.35z)| norm 0.3009 (+1.27z)| lr 3.25e-04 | 322.64 ms | 52.3% bf16 MFU | 1624899 tok/s +step 9627/19560 | loss 3.435404 (+0.30z)| norm 0.2887 (+0.57z)| lr 3.25e-04 | 322.62 ms | 52.3% bf16 MFU | 1624909 tok/s +step 9628/19560 | loss 3.428100 (+0.15z)| norm 0.2860 (+0.41z)| lr 3.25e-04 | 322.82 ms | 52.3% bf16 MFU | 1624869 tok/s +step 9629/19560 | loss 3.367707 (-1.18z)| norm 0.3015 (+1.31z)| lr 3.25e-04 | 322.41 ms | 52.3% bf16 MFU | 1624934 tok/s +step 9630/19560 | loss 3.447815 (+0.59z)| norm 0.2877 (+0.50z)| lr 3.25e-04 | 322.41 ms | 52.3% bf16 MFU | 1624996 tok/s +step 9631/19560 | loss 3.406980 (-0.31z)| norm 0.2917 (+0.72z)| lr 3.25e-04 | 322.42 ms | 52.3% bf16 MFU | 1625052 tok/s +step 9632/19560 | loss 3.407991 (-0.30z)| norm 0.2939 (+0.83z)| lr 3.25e-04 | 322.67 ms | 52.3% bf16 MFU | 1625042 tok/s +step 9633/19560 | loss 3.389342 (-0.70z)| norm 0.2712 (-0.50z)| lr 3.25e-04 | 322.64 ms | 52.3% bf16 MFU | 1625038 tok/s +step 9634/19560 | loss 3.400355 (-0.47z)| norm 0.3093 (+1.70z)| lr 3.25e-04 | 322.54 ms | 52.3% bf16 MFU | 1625061 tok/s +step 9635/19560 | loss 3.383321 (-0.85z)| norm 0.2712 (-0.52z)| lr 3.25e-04 | 322.81 ms | 52.3% bf16 MFU | 1625016 tok/s +step 9636/19560 | loss 3.382001 (-0.87z)| norm 0.3639 (+4.49z)| lr 3.25e-04 | 322.84 ms | 52.3% bf16 MFU | 1624964 tok/s +step 9637/19560 | loss 3.516180 (+2.06z)| norm 0.3050 (+1.28z)| lr 3.25e-04 | 322.32 ms | 52.4% bf16 MFU | 1625046 tok/s +step 9638/19560 | loss 3.457141 (+0.76z)| norm 0.3384 (+2.95z)| lr 3.25e-04 | 322.24 ms | 52.4% bf16 MFU | 1625144 tok/s +step 9639/19560 | loss 3.370181 (-1.11z)| norm 0.2743 (-0.38z)| lr 3.25e-04 | 323.46 ms | 52.2% bf16 MFU | 1624931 tok/s +step 9640/19560 | loss 3.413703 (-0.17z)| norm 0.3029 (+1.10z)| lr 3.25e-04 | 322.23 ms | 52.4% bf16 MFU | 1625038 tok/s +step 9641/19560 | loss 3.484541 (+1.34z)| norm 0.2839 (+0.10z)| lr 3.24e-04 | 322.33 ms | 52.4% bf16 MFU | 1625114 tok/s +step 9642/19560 | loss 3.422968 (+0.02z)| norm 0.2885 (+0.34z)| lr 3.24e-04 | 322.21 ms | 52.4% bf16 MFU | 1625217 tok/s +step 9643/19560 | loss 3.493830 (+1.54z)| norm 0.2742 (-0.40z)| lr 3.24e-04 | 322.79 ms | 52.3% bf16 MFU | 1625167 tok/s +step 9644/19560 | loss 3.426471 (+0.08z)| norm 0.2541 (-1.43z)| lr 3.24e-04 | 322.25 ms | 52.4% bf16 MFU | 1625257 tok/s +step 9645/19560 | loss 3.377286 (-0.99z)| norm 0.2498 (-1.62z)| lr 3.24e-04 | 323.40 ms | 52.2% bf16 MFU | 1625053 tok/s +step 9646/19560 | loss 3.345280 (-1.65z)| norm 0.2523 (-1.46z)| lr 3.24e-04 | 322.98 ms | 52.3% bf16 MFU | 1624964 tok/s +step 9647/19560 | loss 3.440675 (+0.38z)| norm 0.2634 (-0.90z)| lr 3.24e-04 | 322.79 ms | 52.3% bf16 MFU | 1624928 tok/s +step 9648/19560 | loss 3.407854 (-0.32z)| norm 0.2807 (-0.01z)| lr 3.24e-04 | 322.80 ms | 52.3% bf16 MFU | 1624891 tok/s +step 9649/19560 | loss 3.383898 (-0.83z)| norm 0.2565 (-1.24z)| lr 3.24e-04 | 322.59 ms | 52.3% bf16 MFU | 1624908 tok/s +step 9650/19560 | loss 3.406762 (-0.33z)| norm 0.2887 (+0.41z)| lr 3.24e-04 | 322.55 ms | 52.3% bf16 MFU | 1624936 tok/s +step 9651/19560 | loss 3.403361 (-0.41z)| norm 0.2572 (-1.19z)| lr 3.24e-04 | 322.99 ms | 52.3% bf16 MFU | 1624850 tok/s +step 9652/19560 | loss 3.411327 (-0.23z)| norm 0.2658 (-0.75z)| lr 3.24e-04 | 322.70 ms | 52.3% bf16 MFU | 1624840 tok/s +step 9653/19560 | loss 3.391505 (-0.65z)| norm 0.2662 (-0.72z)| lr 3.24e-04 | 322.63 ms | 52.3% bf16 MFU | 1624850 tok/s +step 9654/19560 | loss 3.421204 (-0.02z)| norm 0.2601 (-1.02z)| lr 3.24e-04 | 322.87 ms | 52.3% bf16 MFU | 1624799 tok/s +step 9655/19560 | loss 3.348032 (-1.57z)| norm 0.2600 (-1.01z)| lr 3.24e-04 | 322.51 ms | 52.3% bf16 MFU | 1624842 tok/s +step 9656/19560 | loss 3.421371 (+0.00z)| norm 0.2930 (+0.64z)| lr 3.24e-04 | 323.18 ms | 52.2% bf16 MFU | 1624714 tok/s +step 9657/19560 | loss 3.482184 (+1.29z)| norm 0.2802 (+0.03z)| lr 3.24e-04 | 322.19 ms | 52.4% bf16 MFU | 1624840 tok/s +step 9658/19560 | loss 3.466781 (+0.95z)| norm 0.3122 (+1.73z)| lr 3.24e-04 | 323.38 ms | 52.2% bf16 MFU | 1624663 tok/s +step 9659/19560 | loss 3.446273 (+0.52z)| norm 0.2983 (+0.98z)| lr 3.24e-04 | 322.70 ms | 52.3% bf16 MFU | 1624663 tok/s +step 9660/19560 | loss 3.509075 (+1.87z)| norm 0.2971 (+0.91z)| lr 3.24e-04 | 322.57 ms | 52.3% bf16 MFU | 1624698 tok/s +step 9661/19560 | loss 3.387250 (-0.74z)| norm 0.2634 (-0.86z)| lr 3.23e-04 | 323.53 ms | 52.2% bf16 MFU | 1624490 tok/s +step 9662/19560 | loss 3.385260 (-0.78z)| norm 0.2866 (+0.36z)| lr 3.23e-04 | 322.98 ms | 52.3% bf16 MFU | 1624428 tok/s +step 9663/19560 | loss 3.382156 (-0.84z)| norm 0.2481 (-1.66z)| lr 3.23e-04 | 323.63 ms | 52.1% bf16 MFU | 1624208 tok/s +step 9664/19560 | loss 3.544446 (+2.58z)| norm 0.2874 (+0.40z)| lr 3.23e-04 | 322.52 ms | 52.3% bf16 MFU | 1624277 tok/s +step 9665/19560 | loss 3.454575 (+0.68z)| norm 0.2761 (-0.21z)| lr 3.23e-04 | 322.79 ms | 52.3% bf16 MFU | 1624274 tok/s +step 9666/19560 | loss 3.428499 (+0.14z)| norm 0.3032 (+1.21z)| lr 3.23e-04 | 323.85 ms | 52.1% bf16 MFU | 1624006 tok/s +step 9667/19560 | loss 3.432108 (+0.20z)| norm 0.2841 (+0.20z)| lr 3.23e-04 | 322.32 ms | 52.4% bf16 MFU | 1624136 tok/s +step 9668/19560 | loss 3.344786 (-1.62z)| norm 0.3172 (+1.90z)| lr 3.23e-04 | 323.11 ms | 52.2% bf16 MFU | 1624062 tok/s +step 9669/19560 | loss 3.395907 (-0.55z)| norm 0.2625 (-0.94z)| lr 3.23e-04 | 322.67 ms | 52.3% bf16 MFU | 1624100 tok/s +step 9670/19560 | loss 3.527834 (+2.16z)| norm 0.2867 (+0.33z)| lr 3.23e-04 | 323.02 ms | 52.2% bf16 MFU | 1624050 tok/s +step 9671/19560 | loss 3.380484 (-0.86z)| norm 0.2688 (-0.60z)| lr 3.23e-04 | 322.93 ms | 52.3% bf16 MFU | 1624024 tok/s +step 9672/19560 | loss 3.407234 (-0.30z)| norm 0.2720 (-0.43z)| lr 3.23e-04 | 323.24 ms | 52.2% bf16 MFU | 1623923 tok/s +step 9673/19560 | loss 3.430533 (+0.19z)| norm 0.2854 (+0.27z)| lr 3.23e-04 | 322.92 ms | 52.3% bf16 MFU | 1623906 tok/s +step 9674/19560 | loss 3.497948 (+1.56z)| norm 0.2529 (-1.41z)| lr 3.23e-04 | 322.96 ms | 52.3% bf16 MFU | 1623881 tok/s +step 9675/19560 | loss 3.403937 (-0.37z)| norm 0.2775 (-0.11z)| lr 3.23e-04 | 322.42 ms | 52.3% bf16 MFU | 1623992 tok/s +step 9676/19560 | loss 3.420984 (-0.02z)| norm 0.2536 (-1.35z)| lr 3.23e-04 | 322.97 ms | 52.3% bf16 MFU | 1623958 tok/s +step 9677/19560 | loss 3.356939 (-1.35z)| norm 0.2667 (-0.66z)| lr 3.23e-04 | 322.54 ms | 52.3% bf16 MFU | 1624034 tok/s +step 9678/19560 | loss 3.396925 (-0.51z)| norm 0.2757 (-0.17z)| lr 3.23e-04 | 323.22 ms | 52.2% bf16 MFU | 1623936 tok/s +step 9679/19560 | loss 3.410880 (-0.23z)| norm 0.2490 (-1.56z)| lr 3.23e-04 | 322.97 ms | 52.3% bf16 MFU | 1623907 tok/s +step 9680/19560 | loss 3.391210 (-0.62z)| norm 0.2485 (-1.56z)| lr 3.23e-04 | 322.88 ms | 52.3% bf16 MFU | 1623900 tok/s +step 9681/19560 | loss 3.440197 (+0.39z)| norm 0.2706 (-0.39z)| lr 3.22e-04 | 323.18 ms | 52.2% bf16 MFU | 1623819 tok/s +step 9682/19560 | loss 3.419582 (-0.04z)| norm 0.2502 (-1.44z)| lr 3.22e-04 | 323.11 ms | 52.2% bf16 MFU | 1623760 tok/s +step 9683/19560 | loss 3.387608 (-0.70z)| norm 0.2807 (+0.15z)| lr 3.22e-04 | 322.68 ms | 52.3% bf16 MFU | 1623811 tok/s +step 9684/19560 | loss 3.390433 (-0.63z)| norm 0.2535 (-1.26z)| lr 3.22e-04 | 322.59 ms | 52.3% bf16 MFU | 1623882 tok/s +step 9685/19560 | loss 3.405862 (-0.30z)| norm 0.2776 (-0.00z)| lr 3.22e-04 | 323.40 ms | 52.2% bf16 MFU | 1623746 tok/s +step 9686/19560 | loss 3.457834 (+0.80z)| norm 0.2981 (+1.06z)| lr 3.22e-04 | 322.75 ms | 52.3% bf16 MFU | 1623780 tok/s +step 9687/19560 | loss 3.391748 (-0.61z)| norm 0.2680 (-0.52z)| lr 3.22e-04 | 322.61 ms | 52.3% bf16 MFU | 1623850 tok/s +step 9688/19560 | loss 3.372512 (-1.01z)| norm 0.2710 (-0.37z)| lr 3.22e-04 | 323.93 ms | 52.1% bf16 MFU | 1623584 tok/s +step 9689/19560 | loss 3.452368 (+0.67z)| norm 0.2890 (+0.58z)| lr 3.22e-04 | 322.31 ms | 52.4% bf16 MFU | 1623737 tok/s +step 9690/19560 | loss 3.388012 (-0.69z)| norm 0.2825 (+0.24z)| lr 3.22e-04 | 323.05 ms | 52.2% bf16 MFU | 1623697 tok/s +step 9691/19560 | loss 3.331596 (-1.85z)| norm 0.2819 (+0.20z)| lr 3.22e-04 | 322.94 ms | 52.3% bf16 MFU | 1623686 tok/s +step 9692/19560 | loss 3.409013 (-0.22z)| norm 0.2670 (-0.58z)| lr 3.22e-04 | 323.13 ms | 52.2% bf16 MFU | 1623627 tok/s +step 9693/19560 | loss 3.429181 (+0.20z)| norm 0.2715 (-0.33z)| lr 3.22e-04 | 322.90 ms | 52.3% bf16 MFU | 1623630 tok/s +step 9694/19560 | loss 3.402130 (-0.36z)| norm 0.2761 (-0.10z)| lr 3.22e-04 | 322.54 ms | 52.3% bf16 MFU | 1623724 tok/s +step 9695/19560 | loss 3.372706 (-0.97z)| norm 0.2732 (-0.26z)| lr 3.22e-04 | 323.41 ms | 52.2% bf16 MFU | 1623595 tok/s +step 9696/19560 | loss 3.409619 (-0.19z)| norm 0.2725 (-0.31z)| lr 3.22e-04 | 322.92 ms | 52.3% bf16 MFU | 1623594 tok/s +step 9697/19560 | loss 3.382456 (-0.75z)| norm 0.2696 (-0.47z)| lr 3.22e-04 | 323.66 ms | 52.1% bf16 MFU | 1623407 tok/s +step 9698/19560 | loss 3.392912 (-0.52z)| norm 0.2684 (-0.55z)| lr 3.22e-04 | 322.42 ms | 52.3% bf16 MFU | 1623542 tok/s +step 9699/19560 | loss 3.379471 (-0.80z)| norm 0.2678 (-0.58z)| lr 3.22e-04 | 322.66 ms | 52.3% bf16 MFU | 1623610 tok/s +step 9700/19560 | loss 3.389682 (-0.59z)| norm 0.2626 (-0.86z)| lr 3.22e-04 | 322.71 ms | 52.3% bf16 MFU | 1623662 tok/s +step 9701/19560 | loss 3.406093 (-0.24z)| norm 0.2873 (+0.47z)| lr 3.21e-04 | 322.68 ms | 52.3% bf16 MFU | 1623718 tok/s +step 9702/19560 | loss 3.435103 (+0.38z)| norm 0.2516 (-1.44z)| lr 3.21e-04 | 322.90 ms | 52.3% bf16 MFU | 1623716 tok/s +step 9703/19560 | loss 3.409084 (-0.17z)| norm 0.2750 (-0.19z)| lr 3.21e-04 | 322.79 ms | 52.3% bf16 MFU | 1623742 tok/s +step 9704/19560 | loss 3.437378 (+0.43z)| norm 0.2608 (-0.95z)| lr 3.21e-04 | 322.16 ms | 52.4% bf16 MFU | 1623926 tok/s +step 9705/19560 | loss 3.447916 (+0.65z)| norm 0.2825 (+0.22z)| lr 3.21e-04 | 322.79 ms | 52.3% bf16 MFU | 1623941 tok/s +step 9706/19560 | loss 3.406907 (-0.23z)| norm 0.2653 (-0.71z)| lr 3.21e-04 | 323.43 ms | 52.2% bf16 MFU | 1623796 tok/s +step 9707/19560 | loss 3.376355 (-0.88z)| norm 0.2908 (+0.66z)| lr 3.21e-04 | 323.18 ms | 52.2% bf16 MFU | 1623720 tok/s +step 9708/19560 | loss 3.438224 (+0.44z)| norm 0.2794 (+0.06z)| lr 3.21e-04 | 322.48 ms | 52.3% bf16 MFU | 1623824 tok/s +step 9709/19560 | loss 3.457088 (+0.83z)| norm 0.3003 (+1.18z)| lr 3.21e-04 | 322.67 ms | 52.3% bf16 MFU | 1623876 tok/s +step 9710/19560 | loss 3.429690 (+0.25z)| norm 0.2498 (-1.56z)| lr 3.21e-04 | 323.05 ms | 52.2% bf16 MFU | 1623828 tok/s +step 9711/19560 | loss 3.380527 (-0.80z)| norm 0.2702 (-0.44z)| lr 3.21e-04 | 323.07 ms | 52.2% bf16 MFU | 1623779 tok/s +step 9712/19560 | loss 3.469054 (+1.08z)| norm 0.2650 (-0.72z)| lr 3.21e-04 | 322.70 ms | 52.3% bf16 MFU | 1623824 tok/s +step 9713/19560 | loss 3.597787 (+3.58z)| norm 0.2897 (+0.61z)| lr 3.21e-04 | 322.43 ms | 52.3% bf16 MFU | 1623935 tok/s +step 9714/19560 | loss 3.447893 (+0.56z)| norm 0.2594 (-1.02z)| lr 3.21e-04 | 322.60 ms | 52.3% bf16 MFU | 1623999 tok/s +step 9715/19560 | loss 3.379702 (-0.81z)| norm 0.2988 (+1.10z)| lr 3.21e-04 | 322.44 ms | 52.3% bf16 MFU | 1624098 tok/s +step 9716/19560 | loss 3.459443 (+0.78z)| norm 0.2882 (+0.51z)| lr 3.21e-04 | 322.86 ms | 52.3% bf16 MFU | 1624088 tok/s +step 9717/19560 | loss 3.397753 (-0.45z)| norm 0.2888 (+0.55z)| lr 3.21e-04 | 322.42 ms | 52.3% bf16 MFU | 1624189 tok/s +step 9718/19560 | loss 3.401414 (-0.38z)| norm 0.2677 (-0.59z)| lr 3.21e-04 | 323.04 ms | 52.2% bf16 MFU | 1624128 tok/s +step 9719/19560 | loss 3.440849 (+0.41z)| norm 0.2937 (+0.81z)| lr 3.21e-04 | 322.03 ms | 52.4% bf16 MFU | 1624326 tok/s +step 9720/19560 | loss 3.421198 (+0.02z)| norm 0.2625 (-0.86z)| lr 3.21e-04 | 322.46 ms | 52.3% bf16 MFU | 1624406 tok/s +step 9721/19560 | loss 3.382502 (-0.75z)| norm 0.3047 (+1.38z)| lr 3.20e-04 | 323.09 ms | 52.2% bf16 MFU | 1624323 tok/s +step 9722/19560 | loss 3.443902 (+0.48z)| norm 0.2632 (-0.83z)| lr 3.20e-04 | 322.51 ms | 52.3% bf16 MFU | 1624389 tok/s +step 9723/19560 | loss 3.404350 (-0.32z)| norm 0.2906 (+0.62z)| lr 3.20e-04 | 323.13 ms | 52.2% bf16 MFU | 1624297 tok/s +step 9724/19560 | loss 3.427516 (+0.15z)| norm 0.2628 (-0.85z)| lr 3.20e-04 | 322.95 ms | 52.3% bf16 MFU | 1624255 tok/s +step 9725/19560 | loss 3.411612 (-0.19z)| norm 0.2753 (-0.17z)| lr 3.20e-04 | 322.60 ms | 52.3% bf16 MFU | 1624302 tok/s +step 9726/19560 | loss 3.386694 (-0.69z)| norm 0.2727 (-0.31z)| lr 3.20e-04 | 322.81 ms | 52.3% bf16 MFU | 1624293 tok/s +step 9727/19560 | loss 3.452976 (+0.65z)| norm 0.2725 (-0.32z)| lr 3.20e-04 | 323.27 ms | 52.2% bf16 MFU | 1624168 tok/s +step 9728/19560 | loss 3.366651 (-1.10z)| norm 0.2801 (+0.08z)| lr 3.20e-04 | 323.17 ms | 52.2% bf16 MFU | 1624076 tok/s +step 9729/19560 | loss 3.452117 (+0.67z)| norm 0.2694 (-0.48z)| lr 3.20e-04 | 322.73 ms | 52.3% bf16 MFU | 1624098 tok/s +step 9730/19560 | loss 3.385055 (-0.72z)| norm 0.2822 (+0.21z)| lr 3.20e-04 | 322.73 ms | 52.3% bf16 MFU | 1624119 tok/s +step 9731/19560 | loss 3.397027 (-0.47z)| norm 0.3044 (+1.38z)| lr 3.20e-04 | 322.76 ms | 52.3% bf16 MFU | 1624133 tok/s +step 9732/19560 | loss 3.437696 (+0.36z)| norm 0.2668 (-0.62z)| lr 3.20e-04 | 322.82 ms | 52.3% bf16 MFU | 1624131 tok/s +step 9733/19560 | loss 3.407078 (-0.29z)| norm 0.2964 (+0.97z)| lr 3.20e-04 | 323.34 ms | 52.2% bf16 MFU | 1624000 tok/s +step 9734/19560 | loss 3.429811 (+0.22z)| norm 0.2631 (-0.81z)| lr 3.20e-04 | 322.17 ms | 52.4% bf16 MFU | 1624168 tok/s +step 9735/19560 | loss 3.392042 (-0.60z)| norm 0.2616 (-0.88z)| lr 3.20e-04 | 322.30 ms | 52.4% bf16 MFU | 1624296 tok/s +step 9736/19560 | loss 3.362450 (-1.23z)| norm 0.2873 (+0.51z)| lr 3.20e-04 | 322.84 ms | 52.3% bf16 MFU | 1624279 tok/s +step 9737/19560 | loss 3.436856 (+0.38z)| norm 0.2639 (-0.75z)| lr 3.20e-04 | 322.43 ms | 52.3% bf16 MFU | 1624369 tok/s +step 9738/19560 | loss 3.478938 (+1.30z)| norm 0.2924 (+0.80z)| lr 3.20e-04 | 322.76 ms | 52.3% bf16 MFU | 1624369 tok/s +step 9739/19560 | loss 3.462500 (+0.95z)| norm 0.2893 (+0.62z)| lr 3.20e-04 | 322.72 ms | 52.3% bf16 MFU | 1624380 tok/s +step 9740/19560 | loss 3.457505 (+0.83z)| norm 0.2816 (+0.22z)| lr 3.20e-04 | 322.91 ms | 52.3% bf16 MFU | 1624343 tok/s +step 9741/19560 | loss 3.374072 (-0.98z)| norm 0.2576 (-1.07z)| lr 3.19e-04 | 322.84 ms | 52.3% bf16 MFU | 1624325 tok/s +step 9742/19560 | loss 3.470052 (+1.09z)| norm 0.2770 (-0.01z)| lr 3.19e-04 | 322.27 ms | 52.4% bf16 MFU | 1624452 tok/s +step 9743/19560 | loss 3.423038 (+0.08z)| norm 0.3106 (+1.77z)| lr 3.19e-04 | 322.30 ms | 52.4% bf16 MFU | 1624565 tok/s +step 9744/19560 | loss 3.457068 (+0.80z)| norm 0.2994 (+1.16z)| lr 3.19e-04 | 322.98 ms | 52.3% bf16 MFU | 1624502 tok/s +step 9745/19560 | loss 3.375030 (-0.96z)| norm 0.2679 (-0.53z)| lr 3.19e-04 | 322.80 ms | 52.3% bf16 MFU | 1624486 tok/s +step 9746/19560 | loss 3.417289 (-0.05z)| norm 0.2807 (+0.14z)| lr 3.19e-04 | 322.73 ms | 52.3% bf16 MFU | 1624490 tok/s +step 9747/19560 | loss 3.440297 (+0.44z)| norm 0.3131 (+1.86z)| lr 3.19e-04 | 322.32 ms | 52.4% bf16 MFU | 1624597 tok/s +step 9748/19560 | loss 3.471790 (+1.10z)| norm 0.3091 (+1.62z)| lr 3.19e-04 | 322.56 ms | 52.3% bf16 MFU | 1624638 tok/s +step 9749/19560 | loss 3.403504 (-0.35z)| norm 0.2839 (+0.28z)| lr 3.19e-04 | 322.62 ms | 52.3% bf16 MFU | 1624662 tok/s +step 9750/19560 | loss 3.408429 (-0.25z)| norm 0.3074 (+1.50z)| lr 3.19e-04 | 322.43 ms | 52.3% bf16 MFU | 1624731 tok/s +val loss 3.402709 +ting HellaSwag: 60/79 evaluating HellaSwag: 70/79 evaluating HellaSwag: 0/79 evaluating HellaSwag: 10/79 evaluating HellaSwag: 20/79 evaluating HellaSwag: 30/79 evaluating HellaSwag: 40/79 evaluating HellaSwag: 50/79 evaluating HellaSwag: 60/79 evaluating HellaSwag: 70/79 evaluating HellaSwag: 0/79 evaluating HellaSwag: 10/79 evaluating HellaSwag: 20/79 evaluating HellaSwag: 30/79 evaluating HellaSwag: 40/79 evaluating HellaSwag: 50/79 evaluating HellaSwag: 60/79 evaluating HellaSwag: 70/79 evaluating HellaSwag: 0/79 evaluating HellaSwag: 10/79 evaluating HellaSwag: 20/79 evaluating HellaSwag: 30/79 evaluating HellaSwag: 40/79 evaluating HellaSwag: 50/79 evaluating HellaSwag: 60/79 evaluating HellaSwag: 70/79 evaluating HellaSwag: 0/79 evaluating HellaSwag: 10/79 evaluating HellaSwag: 20/79 evaluating HellaSwag: 30/79 evaluating HellaSwag: 40/79 evaluating HellaSwag: 50/79 evaluating HellaSwag: 60/79 evaluating HellaSwag: 70/79 evaluating HellaSwag: 0/79 evaluating HellaSwag: 10/79 evaluating HellaSwag: 20/79ting HellaSwag: 60/79 evaluating HellaSwag: 70/79 evaluating HellaSwag: 0/79 evaluating HellaSwag: 10/79 evaluating HellaSwag: 20/79 evaluating HellaSwag: 30/79 evaluating HellaSwag: 40/79 evaluating HellaSwag: 50/79 evaluating HellaSwag: 60/79 evaluating HellaSwag: 70/79 evaluating HellaSwag: 0/79 evaluating HellaSwag: 10/79 evaluating HellaSwag: 20/79 evaluating HellaSwag: 30/79 evaluating HellaSwag: 40/79 evaluating HellaSwag: 50/79 evaluating HellaSwag: 60/79 evaluating HellaSwag: 70/79 evaluating HellaSwag: 0/79 evaluating HellaSwag: 10/79 evaluating HellaSwag: 20/79 evaluating HellaSwag: 30/79 evaluating HellaSwag: 40/79 evaluating HellaSwag: 50/79 evaluating HellaSwag: 60/79 evaluating HellaSwag: 70/79 evaluating HellaSwag: 0/79 evaluating HellaSwag: 10/79 evaluating HellaSwag: 20/79 evaluating HellaSwag: 30/79 evaluating HellaSwag: 40/79 evaluating HellaSwag: 50/79 evaluating HellaSwag: 60/79 evaluating HellaSwag: 70/79 evaluating HellaSwag: 0/79 evaluating HellaSwag: 10/79 evaluating HellaSwag: 20/79ting HellaSwag: 60/79 evaluating HellaSwag: 70/79 evaluating HellaSwag: 0/79 evaluating HellaSwag: 10/79 evaluating HellaSwag: 20/79 evaluating HellaSwag: 30/79 evaluating HellaSwag: 40/79 evaluating HellaSwag: 50/79 evaluating HellaSwag: 60/79 evaluating HellaSwag: 70/79 evaluating HellaSwag: 0/79 evaluating HellaSwag: 10/79 evaluating HellaSwag: 20/79 evaluating HellaSwag: 30/79 evaluating HellaSwag: 40/79 evaluating HellaSwag: 50/79 evaluating HellaSwag: 60/79 evaluating HellaSwag: 70/79 evaluating HellaSwag: 0/79 evaluating HellaSwag: 10/79 evaluating HellaSwag: 20/79 evaluating HellaSwag: 30/79 evaluating HellaSwag: 40/79 evaluating HellaSwag: 50/79 evaluating HellaSwag: 60/79 evaluating HellaSwag: 70/79 evaluating HellaSwag: 0/79 evaluating HellaSwag: 10/79 evaluating HellaSwag: 20/79 evaluating HellaSwag: 30/79 evaluating HellaSwag: 40/79 evaluating HellaSwag: 50/79 evaluating HellaSwag: 60/79 evaluating HellaSwag: 70/79 evaluating HellaSwag: 0/79 evaluating HellaSwag: 10/79 evaluating HellaSwag: 20/79ting HellaSwag: 60/79 evaluating HellaSwag: 70/79 evaluating HellaSwag: 0/79 evaluating HellaSwag: 10/79 evaluating HellaSwag: 20/79 evaluating HellaSwag: 30/79 evaluating HellaSwag: 40/79 evaluating HellaSwag: 50/79 evaluating HellaSwag: 60/79 evaluating HellaSwag: 70/79 evaluating HellaSwag: 0/79 evaluating HellaSwag: 10/79 evaluating HellaSwag: 20/79 evaluating HellaSwag: 30/79 evaluating HellaSwag: 40/79 evaluating HellaSwag: 50/79 evaluating HellaSwag: 60/79 evaluating HellaSwag: 70/79 evaluating HellaSwag: 0/79 evaluating HellaSwag: 10/79 evaluating HellaSwag: 20/79 evaluating HellaSwag: 30/79 evaluating HellaSwag: 40/79 evaluating HellaSwag: 50/79 evaluating HellaSwag: 60/79 evaluating HellaSwag: 70/79 evaluating HellaSwag: 0/79 evaluating HellaSwag: 10/79 evaluating HellaSwag: 20/79 evaluating HellaSwag: 30/79 evaluating HellaSwag: 40/79 evaluating HellaSwag: 50/79 evaluating HellaSwag: 60/79 evaluating HellaSwag: 70/79 evaluating HellaSwag: 0/79 evaluating HellaSwag: 10/79 evaluating HellaSwag: 20/79ting HellaSwag: 60/79 evaluating HellaSwag: 70/79 evaluating HellaSwag: 0/79 evaluating HellaSwag: 10/79 evaluating HellaSwag: 20/79 evaluating HellaSwag: 30/79 evaluating HellaSwag: 40/79 evaluating HellaSwag: 50/79 evaluating HellaSwag: 60/79 evaluating HellaSwag: 70/79 evaluating HellaSwag: 0/79 evaluating HellaSwag: 10/79 evaluating HellaSwag: 20/79 evaluating HellaSwag: 30/79 evaluating HellaSwag: 40/79 evaluating HellaSwag: 50/79 evaluating HellaSwag: 60/79 evaluating HellaSwag: 70/79 evaluating HellaSwag: 0/79 evaluating HellaSwag: 10/79 evaluating HellaSwag: 20/79 evaluating HellaSwag: 30/79 evaluating HellaSwag: 40/79 evaluating HellaSwag: 50/79 evaluating HellaSwag: 60/79 evaluating HellaSwag: 70/79 evaluating HellaSwag: 0/79 evaluating HellaSwag: 10/79 evaluating HellaSwag: 20/79 evaluating HellaSwag: 30/79 evaluating HellaSwag: 40/79 evaluating HellaSwag: 50/79 evaluating HellaSwag: 60/79 evaluating HellaSwag: 70/79 evaluating HellaSwag: 0/79 evaluating HellaSwag: 10/79 evaluating HellaSwag: 20/79ting HellaSwag: 60/79 evaluating HellaSwag: 70/79 evaluating HellaSwag: 0/79 evaluating HellaSwag: 10/79 evaluating HellaSwag: 20/79 evaluating HellaSwag: 30/79 evaluating HellaSwag: 40/79 evaluating HellaSwag: 50/79 evaluating HellaSwag: 60/79 evaluating HellaSwag: 70/79 evaluating HellaSwag: 0/79 evaluating HellaSwag: 10/79 evaluating HellaSwag: 20/79 evaluating HellaSwag: 30/79 evaluating HellaSwag: 40/79 evaluating HellaSwag: 50/79 evaluating HellaSwag: 60/79 evaluating HellaSwag: 70/79 evaluating HellaSwag: 0/79 evaluating HellaSwag: 10/79 evaluating HellaSwag: 20/79 evaluating HellaSwag: 30/79 evaluating HellaSwag: 40/79 evaluating HellaSwag: 50/79 evaluating HellaSwag: 60/79 evaluating HellaSwag: 70/79 evaluating HellaSwag: 0/79 evaluating HellaSwag: 10/79 evaluating HellaSwag: 20/79 evaluating HellaSwag: 30/79 evaluating HellaSwag: 40/79 evaluating HellaSwag: 50/79 evaluating HellaSwag: 60/79 evaluating HellaSwag: 70/79 evaluating HellaSwag: 0/79 evaluating HellaSwag: 10/79 evaluating HellaSwag: 20/79ting HellaSwag: 60/79 evaluating HellaSwag: 70/79 evaluating HellaSwag: 0/79 evaluating HellaSwag: 10/79 evaluating HellaSwag: 20/79 evaluating HellaSwag: 30/79 evaluating HellaSwag: 40/79 evaluating HellaSwag: 50/79 evaluating HellaSwag: 60/79 evaluating HellaSwag: 70/79 evaluating HellaSwag: 0/79 evaluating HellaSwag: 10/79 evaluating HellaSwag: 20/79 evaluating HellaSwag: 30/79 evaluating HellaSwag: 40/79 evaluating HellaSwag: 50/79 evaluating HellaSwag: 60/79 evaluating HellaSwag: 70/79 evaluating HellaSwag: 0/79 evaluating HellaSwag: 10/79 evaluating HellaSwag: 20/79 evaluating HellaSwag: 30/79 evaluating HellaSwag: 40/79 evaluating HellaSwag: 50/79 evaluating HellaSwag: 60/79 evaluating HellaSwag: 70/79 evaluating HellaSwag: 0/79 evaluating HellaSwag: 10/79 evaluating HellaSwag: 20/79 evaluating HellaSwag: 30/79 evaluating HellaSwag: 40/79 evaluating HellaSwag: 50/79 evaluating HellaSwag: 60/79 evaluating HellaSwag: 70/79 evaluating HellaSwag: 0/79 evaluating HellaSwag: 10/79 evaluating HellaSwag: 20/79evaluating HellaSwag: 0/79 evaluating HellaSwag: 10/79 evaluating HellaSwag: 20/79 evaluating HellaSwag: 30/79 evaluating HellaSwag: 40/79 evaluating HellaSwag: 50/79 evaluating HellaSwag: 60/79 evaluating HellaSwag: 70/79 HellaSwag: 2888/10042 = 0.287592 +step 9751/19560 | loss 3.484550 (+1.39z)| norm 0.3038 (+1.29z)| lr 3.19e-04 | 322.95 ms | 52.3% bf16 MFU | 1624667 tok/s +step 9752/19560 | loss 3.447915 (+0.59z)| norm 0.3209 (+2.14z)| lr 3.19e-04 | 322.43 ms | 52.3% bf16 MFU | 1624737 tok/s +step 9753/19560 | loss 3.419355 (+0.01z)| norm 0.3153 (+1.81z)| lr 3.19e-04 | 322.50 ms | 52.3% bf16 MFU | 1624785 tok/s +step 9754/19560 | loss 3.411340 (-0.20z)| norm 0.3134 (+1.70z)| lr 3.19e-04 | 322.40 ms | 52.3% bf16 MFU | 1624857 tok/s +step 9755/19560 | loss 3.394213 (-0.60z)| norm 0.2845 (+0.23z)| lr 3.19e-04 | 322.88 ms | 52.3% bf16 MFU | 1624803 tok/s +step 9756/19560 | loss 3.407912 (-0.27z)| norm 0.2915 (+0.58z)| lr 3.19e-04 | 322.87 ms | 52.3% bf16 MFU | 1624756 tok/s +step 9757/19560 | loss 3.397938 (-0.51z)| norm 0.2873 (+0.37z)| lr 3.19e-04 | 323.33 ms | 52.2% bf16 MFU | 1624594 tok/s +step 9758/19560 | loss 3.341270 (-1.84z)| norm 0.2647 (-0.77z)| lr 3.19e-04 | 322.67 ms | 52.3% bf16 MFU | 1624607 tok/s +step 9759/19560 | loss 3.457714 (+0.92z)| norm 0.2775 (-0.11z)| lr 3.19e-04 | 322.24 ms | 52.4% bf16 MFU | 1624729 tok/s +step 9760/19560 | loss 3.384669 (-0.80z)| norm 0.2673 (-0.62z)| lr 3.19e-04 | 322.70 ms | 52.3% bf16 MFU | 1624728 tok/s +step 9761/19560 | loss 3.414160 (-0.11z)| norm 0.2800 (+0.02z)| lr 3.18e-04 | 322.72 ms | 52.3% bf16 MFU | 1624721 tok/s +step 9762/19560 | loss 3.408455 (-0.25z)| norm 0.2669 (-0.63z)| lr 3.18e-04 | 322.75 ms | 52.3% bf16 MFU | 1624707 tok/s +step 9763/19560 | loss 3.404265 (-0.35z)| norm 0.2663 (-0.66z)| lr 3.18e-04 | 322.50 ms | 52.3% bf16 MFU | 1624756 tok/s +step 9764/19560 | loss 3.420811 (+0.03z)| norm 0.2882 (+0.54z)| lr 3.18e-04 | 322.11 ms | 52.4% bf16 MFU | 1624901 tok/s +step 9765/19560 | loss 3.420801 (+0.05z)| norm 0.2962 (+0.99z)| lr 3.18e-04 | 322.54 ms | 52.3% bf16 MFU | 1624932 tok/s +step 9766/19560 | loss 3.420154 (+0.04z)| norm 0.2722 (-0.34z)| lr 3.18e-04 | 322.68 ms | 52.3% bf16 MFU | 1624924 tok/s +step 9767/19560 | loss 3.353555 (-1.57z)| norm 0.2751 (-0.17z)| lr 3.18e-04 | 322.54 ms | 52.3% bf16 MFU | 1624953 tok/s +step 9768/19560 | loss 3.504362 (+2.05z)| norm 0.2488 (-1.68z)| lr 3.18e-04 | 322.52 ms | 52.3% bf16 MFU | 1624986 tok/s +step 9769/19560 | loss 3.431028 (+0.30z)| norm 0.2572 (-1.17z)| lr 3.18e-04 | 322.75 ms | 52.3% bf16 MFU | 1624958 tok/s +step 9770/19560 | loss 3.450794 (+0.77z)| norm 0.2733 (-0.23z)| lr 3.18e-04 | 323.03 ms | 52.2% bf16 MFU | 1624862 tok/s +step 9771/19560 | loss 3.409138 (-0.22z)| norm 0.2550 (-1.28z)| lr 3.18e-04 | 322.23 ms | 52.4% bf16 MFU | 1624973 tok/s +step 9772/19560 | loss 3.530210 (+2.64z)| norm 0.2960 (+1.08z)| lr 3.18e-04 | 322.67 ms | 52.3% bf16 MFU | 1624965 tok/s +step 9773/19560 | loss 3.396685 (-0.53z)| norm 0.2780 (+0.02z)| lr 3.18e-04 | 323.15 ms | 52.2% bf16 MFU | 1624838 tok/s +step 9774/19560 | loss 3.445486 (+0.62z)| norm 0.2920 (+0.83z)| lr 3.18e-04 | 322.93 ms | 52.3% bf16 MFU | 1624774 tok/s +step 9775/19560 | loss 3.424430 (+0.11z)| norm 0.2929 (+0.87z)| lr 3.18e-04 | 322.68 ms | 52.3% bf16 MFU | 1624775 tok/s +step 9776/19560 | loss 3.428991 (+0.22z)| norm 0.2583 (-1.16z)| lr 3.18e-04 | 322.71 ms | 52.3% bf16 MFU | 1624769 tok/s +step 9777/19560 | loss 3.465631 (+1.09z)| norm 0.2702 (-0.47z)| lr 3.18e-04 | 322.73 ms | 52.3% bf16 MFU | 1624759 tok/s +step 9778/19560 | loss 3.369946 (-1.20z)| norm 0.2471 (-1.80z)| lr 3.18e-04 | 322.18 ms | 52.4% bf16 MFU | 1624887 tok/s +step 9779/19560 | loss 3.480680 (+1.42z)| norm 0.2638 (-0.82z)| lr 3.18e-04 | 322.91 ms | 52.3% bf16 MFU | 1624825 tok/s +step 9780/19560 | loss 3.446385 (+0.60z)| norm 0.2617 (-0.94z)| lr 3.18e-04 | 322.55 ms | 52.3% bf16 MFU | 1624857 tok/s +step 9781/19560 | loss 3.441490 (+0.48z)| norm 0.2608 (-0.99z)| lr 3.17e-04 | 322.07 ms | 52.4% bf16 MFU | 1625007 tok/s +step 9782/19560 | loss 3.421106 (-0.01z)| norm 0.2614 (-0.96z)| lr 3.17e-04 | 322.74 ms | 52.3% bf16 MFU | 1624981 tok/s +step 9783/19560 | loss 3.507353 (+2.00z)| norm 0.2726 (-0.31z)| lr 3.17e-04 | 322.58 ms | 52.3% bf16 MFU | 1624995 tok/s +step 9784/19560 | loss 3.364250 (-1.36z)| norm 0.2653 (-0.73z)| lr 3.17e-04 | 322.93 ms | 52.3% bf16 MFU | 1624921 tok/s +step 9785/19560 | loss 3.327962 (-2.16z)| norm 0.2644 (-0.77z)| lr 3.17e-04 | 322.63 ms | 52.3% bf16 MFU | 1624926 tok/s +step 9786/19560 | loss 3.459730 (+0.90z)| norm 0.2409 (-2.11z)| lr 3.17e-04 | 322.33 ms | 52.4% bf16 MFU | 1625008 tok/s +step 9787/19560 | loss 3.374418 (-1.07z)| norm 0.2839 (+0.41z)| lr 3.17e-04 | 322.45 ms | 52.3% bf16 MFU | 1625056 tok/s +step 9788/19560 | loss 3.355455 (-1.49z)| norm 0.2664 (-0.60z)| lr 3.17e-04 | 322.69 ms | 52.3% bf16 MFU | 1625041 tok/s +step 9789/19560 | loss 3.385365 (-0.79z)| norm 0.2596 (-1.01z)| lr 3.17e-04 | 322.76 ms | 52.3% bf16 MFU | 1625009 tok/s +step 9790/19560 | loss 3.398296 (-0.49z)| norm 0.2797 (+0.19z)| lr 3.17e-04 | 322.41 ms | 52.3% bf16 MFU | 1625066 tok/s +step 9791/19560 | loss 3.419753 (+0.00z)| norm 0.2457 (-1.82z)| lr 3.17e-04 | 322.08 ms | 52.4% bf16 MFU | 1625204 tok/s +step 9792/19560 | loss 3.447514 (+0.69z)| norm 0.2841 (+0.45z)| lr 3.17e-04 | 323.17 ms | 52.2% bf16 MFU | 1625059 tok/s +step 9793/19560 | loss 3.483580 (+1.55z)| norm 0.2854 (+0.52z)| lr 3.17e-04 | 322.75 ms | 52.3% bf16 MFU | 1625028 tok/s +step 9794/19560 | loss 3.324063 (-2.23z)| norm 0.2568 (-1.15z)| lr 3.17e-04 | 322.36 ms | 52.4% bf16 MFU | 1625097 tok/s +step 9795/19560 | loss 3.409392 (-0.21z)| norm 0.2708 (-0.32z)| lr 3.17e-04 | 322.81 ms | 52.3% bf16 MFU | 1625048 tok/s +step 9796/19560 | loss 3.352537 (-1.55z)| norm 0.2516 (-1.45z)| lr 3.17e-04 | 322.81 ms | 52.3% bf16 MFU | 1625004 tok/s +step 9797/19560 | loss 3.460135 (+0.98z)| norm 0.2874 (+0.69z)| lr 3.17e-04 | 322.95 ms | 52.3% bf16 MFU | 1624925 tok/s +step 9798/19560 | loss 3.373882 (-1.05z)| norm 0.2722 (-0.21z)| lr 3.17e-04 | 322.40 ms | 52.3% bf16 MFU | 1624990 tok/s +step 9799/19560 | loss 3.382027 (-0.86z)| norm 0.2876 (+0.70z)| lr 3.17e-04 | 322.85 ms | 52.3% bf16 MFU | 1624937 tok/s +step 9800/19560 | loss 3.345006 (-1.72z)| norm 0.2690 (-0.41z)| lr 3.17e-04 | 322.59 ms | 52.3% bf16 MFU | 1624952 tok/s +step 9801/19560 | loss 3.398746 (-0.43z)| norm 0.2673 (-0.51z)| lr 3.16e-04 | 322.46 ms | 52.3% bf16 MFU | 1625000 tok/s +step 9802/19560 | loss 3.458173 (+1.01z)| norm 0.2531 (-1.36z)| lr 3.16e-04 | 322.39 ms | 52.3% bf16 MFU | 1625061 tok/s +step 9803/19560 | loss 3.329202 (-2.06z)| norm 0.3017 (+1.54z)| lr 3.16e-04 | 322.47 ms | 52.3% bf16 MFU | 1625100 tok/s +step 9804/19560 | loss 3.350571 (-1.52z)| norm 0.2669 (-0.54z)| lr 3.16e-04 | 322.61 ms | 52.3% bf16 MFU | 1625103 tok/s +step 9805/19560 | loss 3.352553 (-1.47z)| norm 0.2939 (+1.06z)| lr 3.16e-04 | 322.72 ms | 52.3% bf16 MFU | 1625078 tok/s +step 9806/19560 | loss 3.397637 (-0.41z)| norm 0.3597 (+4.53z)| lr 3.16e-04 | 322.71 ms | 52.3% bf16 MFU | 1625055 tok/s +step 9807/19560 | loss 3.412928 (-0.06z)| norm 0.2681 (-0.49z)| lr 3.16e-04 | 322.82 ms | 52.3% bf16 MFU | 1625007 tok/s +step 9808/19560 | loss 3.397870 (-0.41z)| norm 0.2621 (-0.83z)| lr 3.16e-04 | 322.59 ms | 52.3% bf16 MFU | 1625020 tok/s +step 9809/19560 | loss 3.345418 (-1.61z)| norm 0.2963 (+1.05z)| lr 3.16e-04 | 322.55 ms | 52.3% bf16 MFU | 1625041 tok/s +step 9810/19560 | loss 3.446320 (+0.73z)| norm 0.2492 (-1.55z)| lr 3.16e-04 | 322.44 ms | 52.3% bf16 MFU | 1625090 tok/s +step 9811/19560 | loss 3.423201 (+0.19z)| norm 0.2731 (-0.23z)| lr 3.16e-04 | 322.57 ms | 52.3% bf16 MFU | 1625103 tok/s +step 9812/19560 | loss 3.404634 (-0.25z)| norm 0.2710 (-0.35z)| lr 3.16e-04 | 322.69 ms | 52.3% bf16 MFU | 1625086 tok/s +step 9813/19560 | loss 3.366769 (-1.12z)| norm 0.2806 (+0.18z)| lr 3.16e-04 | 322.16 ms | 52.4% bf16 MFU | 1625202 tok/s +step 9814/19560 | loss 3.413543 (-0.02z)| norm 0.2824 (+0.29z)| lr 3.16e-04 | 323.07 ms | 52.2% bf16 MFU | 1625085 tok/s +step 9815/19560 | loss 3.495379 (+1.84z)| norm 0.2867 (+0.52z)| lr 3.16e-04 | 322.83 ms | 52.3% bf16 MFU | 1625032 tok/s +step 9816/19560 | loss 3.359944 (-1.27z)| norm 0.2649 (-0.70z)| lr 3.16e-04 | 322.75 ms | 52.3% bf16 MFU | 1625002 tok/s +step 9817/19560 | loss 3.432825 (+0.41z)| norm 0.2738 (-0.20z)| lr 3.16e-04 | 322.54 ms | 52.3% bf16 MFU | 1625028 tok/s +step 9818/19560 | loss 3.433241 (+0.41z)| norm 0.2657 (-0.64z)| lr 3.16e-04 | 322.56 ms | 52.3% bf16 MFU | 1625045 tok/s +step 9819/19560 | loss 3.345512 (-1.62z)| norm 0.2729 (-0.23z)| lr 3.16e-04 | 322.26 ms | 52.4% bf16 MFU | 1625138 tok/s +step 9820/19560 | loss 3.421173 (+0.13z)| norm 0.2829 (+0.32z)| lr 3.16e-04 | 322.89 ms | 52.3% bf16 MFU | 1625069 tok/s +step 9821/19560 | loss 3.380313 (-0.81z)| norm 0.2596 (-0.98z)| lr 3.15e-04 | 322.41 ms | 52.3% bf16 MFU | 1625124 tok/s +step 9822/19560 | loss 3.376333 (-0.89z)| norm 0.3140 (+2.01z)| lr 3.15e-04 | 323.70 ms | 52.1% bf16 MFU | 1624851 tok/s +step 9823/19560 | loss 3.414340 (-0.02z)| norm 0.2833 (+0.32z)| lr 3.15e-04 | 322.77 ms | 52.3% bf16 MFU | 1624826 tok/s +step 9824/19560 | loss 3.360106 (-1.26z)| norm 0.2739 (-0.20z)| lr 3.15e-04 | 322.70 ms | 52.3% bf16 MFU | 1624819 tok/s +step 9825/19560 | loss 3.405941 (-0.21z)| norm 0.2893 (+0.64z)| lr 3.15e-04 | 322.75 ms | 52.3% bf16 MFU | 1624801 tok/s +step 9826/19560 | loss 3.448039 (+0.74z)| norm 0.2886 (+0.59z)| lr 3.15e-04 | 322.63 ms | 52.3% bf16 MFU | 1624812 tok/s +step 9827/19560 | loss 3.367552 (-1.10z)| norm 0.2858 (+0.43z)| lr 3.15e-04 | 322.68 ms | 52.3% bf16 MFU | 1624811 tok/s +step 9828/19560 | loss 3.430592 (+0.34z)| norm 0.2803 (+0.12z)| lr 3.15e-04 | 322.47 ms | 52.3% bf16 MFU | 1624864 tok/s +step 9829/19560 | loss 3.426294 (+0.24z)| norm 0.2742 (-0.21z)| lr 3.15e-04 | 323.00 ms | 52.3% bf16 MFU | 1624779 tok/s +step 9830/19560 | loss 3.473379 (+1.30z)| norm 0.3309 (+2.81z)| lr 3.15e-04 | 322.27 ms | 52.4% bf16 MFU | 1624882 tok/s +step 9831/19560 | loss 3.409755 (-0.15z)| norm 0.3597 (+4.04z)| lr 3.15e-04 | 323.02 ms | 52.2% bf16 MFU | 1624793 tok/s +step 9832/19560 | loss 3.417242 (+0.02z)| norm 0.3734 (+4.33z)| lr 3.15e-04 | 322.50 ms | 52.3% bf16 MFU | 1624837 tok/s +step 9833/19560 | loss 3.373582 (-0.96z)| norm 0.3193 (+1.78z)| lr 3.15e-04 | 323.24 ms | 52.2% bf16 MFU | 1624695 tok/s +step 9834/19560 | loss 3.379063 (-0.83z)| norm 0.2822 (+0.08z)| lr 3.15e-04 | 322.62 ms | 52.3% bf16 MFU | 1624715 tok/s +step 9835/19560 | loss 3.409308 (-0.14z)| norm 0.3126 (+1.45z)| lr 3.15e-04 | 323.14 ms | 52.2% bf16 MFU | 1624604 tok/s +step 9836/19560 | loss 3.382964 (-0.73z)| norm 0.2880 (+0.33z)| lr 3.15e-04 | 323.17 ms | 52.2% bf16 MFU | 1624490 tok/s +step 9837/19560 | loss 3.366533 (-1.09z)| norm 0.2927 (+0.55z)| lr 3.15e-04 | 322.54 ms | 52.3% bf16 MFU | 1624542 tok/s +step 9838/19560 | loss 3.385661 (-0.65z)| norm 0.2881 (+0.33z)| lr 3.15e-04 | 322.77 ms | 52.3% bf16 MFU | 1624531 tok/s +step 9839/19560 | loss 3.476488 (+1.39z)| norm 0.3076 (+1.20z)| lr 3.15e-04 | 322.69 ms | 52.3% bf16 MFU | 1624543 tok/s +step 9840/19560 | loss 3.537253 (+2.69z)| norm 0.2756 (-0.27z)| lr 3.15e-04 | 323.03 ms | 52.2% bf16 MFU | 1624468 tok/s +step 9841/19560 | loss 3.398174 (-0.37z)| norm 0.2958 (+0.66z)| lr 3.14e-04 | 323.44 ms | 52.2% bf16 MFU | 1624294 tok/s +step 9842/19560 | loss 3.437152 (+0.55z)| norm 0.2833 (+0.08z)| lr 3.14e-04 | 322.82 ms | 52.3% bf16 MFU | 1624284 tok/s +step 9843/19560 | loss 3.375061 (-0.92z)| norm 0.2824 (+0.04z)| lr 3.14e-04 | 322.37 ms | 52.4% bf16 MFU | 1624388 tok/s +step 9844/19560 | loss 3.430334 (+0.40z)| norm 0.2814 (-0.00z)| lr 3.14e-04 | 322.91 ms | 52.3% bf16 MFU | 1624350 tok/s +step 9845/19560 | loss 3.359562 (-1.27z)| norm 0.2661 (-0.70z)| lr 3.14e-04 | 322.98 ms | 52.3% bf16 MFU | 1624298 tok/s +step 9846/19560 | loss 3.381593 (-0.74z)| norm 0.2854 (+0.19z)| lr 3.14e-04 | 322.44 ms | 52.3% bf16 MFU | 1624384 tok/s +step 9847/19560 | loss 3.400727 (-0.28z)| norm 0.2915 (+0.47z)| lr 3.14e-04 | 322.73 ms | 52.3% bf16 MFU | 1624392 tok/s +step 9848/19560 | loss 3.523227 (+2.52z)| norm 0.3050 (+1.07z)| lr 3.14e-04 | 322.76 ms | 52.3% bf16 MFU | 1624392 tok/s +step 9849/19560 | loss 3.383839 (-0.68z)| norm 0.3067 (+1.15z)| lr 3.14e-04 | 322.98 ms | 52.3% bf16 MFU | 1624336 tok/s +step 9850/19560 | loss 3.387843 (-0.58z)| norm 0.2874 (+0.25z)| lr 3.14e-04 | 322.81 ms | 52.3% bf16 MFU | 1624325 tok/s +step 9851/19560 | loss 3.470280 (+1.30z)| norm 0.2920 (+0.47z)| lr 3.14e-04 | 323.40 ms | 52.2% bf16 MFU | 1624167 tok/s +step 9852/19560 | loss 3.400861 (-0.29z)| norm 0.2889 (+0.31z)| lr 3.14e-04 | 322.53 ms | 52.3% bf16 MFU | 1624236 tok/s +step 9853/19560 | loss 3.404794 (-0.20z)| norm 0.2792 (-0.14z)| lr 3.14e-04 | 322.80 ms | 52.3% bf16 MFU | 1624233 tok/s +step 9854/19560 | loss 3.419583 (+0.14z)| norm 0.2666 (-0.72z)| lr 3.14e-04 | 322.79 ms | 52.3% bf16 MFU | 1624233 tok/s +step 9855/19560 | loss 3.490555 (+1.74z)| norm 0.2751 (-0.32z)| lr 3.14e-04 | 322.65 ms | 52.3% bf16 MFU | 1624269 tok/s +step 9856/19560 | loss 3.356803 (-1.29z)| norm 0.2637 (-0.84z)| lr 3.14e-04 | 322.67 ms | 52.3% bf16 MFU | 1624297 tok/s +step 9857/19560 | loss 3.428078 (+0.33z)| norm 0.2717 (-0.48z)| lr 3.14e-04 | 322.88 ms | 52.3% bf16 MFU | 1624272 tok/s +step 9858/19560 | loss 3.387767 (-0.59z)| norm 0.2711 (-0.50z)| lr 3.14e-04 | 322.79 ms | 52.3% bf16 MFU | 1624269 tok/s +step 9859/19560 | loss 3.425839 (+0.27z)| norm 0.2758 (-0.27z)| lr 3.14e-04 | 322.35 ms | 52.4% bf16 MFU | 1624379 tok/s +step 9860/19560 | loss 3.419418 (+0.13z)| norm 0.2961 (+0.65z)| lr 3.14e-04 | 323.63 ms | 52.2% bf16 MFU | 1624163 tok/s +step 9861/19560 | loss 3.358366 (-1.24z)| norm 0.2915 (+0.44z)| lr 3.13e-04 | 322.62 ms | 52.3% bf16 MFU | 1624208 tok/s +step 9862/19560 | loss 3.374186 (-0.87z)| norm 0.2866 (+0.21z)| lr 3.13e-04 | 322.49 ms | 52.3% bf16 MFU | 1624286 tok/s +step 9863/19560 | loss 3.423794 (+0.24z)| norm 0.3176 (+1.62z)| lr 3.13e-04 | 323.11 ms | 52.2% bf16 MFU | 1624203 tok/s +step 9864/19560 | loss 3.425268 (+0.26z)| norm 0.3007 (+0.83z)| lr 3.13e-04 | 322.83 ms | 52.3% bf16 MFU | 1624196 tok/s +step 9865/19560 | loss 3.444249 (+0.69z)| norm 0.2981 (+0.70z)| lr 3.13e-04 | 322.59 ms | 52.3% bf16 MFU | 1624248 tok/s +step 9866/19560 | loss 3.360647 (-1.19z)| norm 0.3222 (+1.78z)| lr 3.13e-04 | 323.07 ms | 52.2% bf16 MFU | 1624176 tok/s +step 9867/19560 | loss 3.356771 (-1.25z)| norm 0.2841 (+0.04z)| lr 3.13e-04 | 322.58 ms | 52.3% bf16 MFU | 1624233 tok/s +step 9868/19560 | loss 3.410315 (-0.03z)| norm 0.3019 (+0.85z)| lr 3.13e-04 | 322.76 ms | 52.3% bf16 MFU | 1624240 tok/s +step 9869/19560 | loss 3.424704 (+0.29z)| norm 0.2682 (-0.69z)| lr 3.13e-04 | 323.66 ms | 52.1% bf16 MFU | 1624023 tok/s +step 9870/19560 | loss 3.363992 (-1.08z)| norm 0.3106 (+1.22z)| lr 3.13e-04 | 322.57 ms | 52.3% bf16 MFU | 1624090 tok/s +step 9871/19560 | loss 3.420181 (+0.21z)| norm 0.3168 (+1.50z)| lr 3.13e-04 | 322.75 ms | 52.3% bf16 MFU | 1624108 tok/s +step 9872/19560 | loss 3.437461 (+0.61z)| norm 0.2778 (-0.26z)| lr 3.13e-04 | 322.79 ms | 52.3% bf16 MFU | 1624116 tok/s +step 9873/19560 | loss 3.414689 (+0.08z)| norm 0.3314 (+2.11z)| lr 3.13e-04 | 322.60 ms | 52.3% bf16 MFU | 1624169 tok/s +step 9874/19560 | loss 3.457574 (+1.05z)| norm 0.2795 (-0.20z)| lr 3.13e-04 | 323.06 ms | 52.2% bf16 MFU | 1624104 tok/s +step 9875/19560 | loss 3.406516 (-0.11z)| norm 0.3188 (+1.54z)| lr 3.13e-04 | 322.58 ms | 52.3% bf16 MFU | 1624163 tok/s +step 9876/19560 | loss 3.446620 (+0.82z)| norm 0.2873 (+0.15z)| lr 3.13e-04 | 322.65 ms | 52.3% bf16 MFU | 1624202 tok/s +step 9877/19560 | loss 3.464963 (+1.22z)| norm 0.2875 (+0.16z)| lr 3.13e-04 | 323.26 ms | 52.2% bf16 MFU | 1624086 tok/s +step 9878/19560 | loss 3.459225 (+1.08z)| norm 0.2959 (+0.54z)| lr 3.13e-04 | 322.34 ms | 52.4% bf16 MFU | 1624207 tok/s +step 9879/19560 | loss 3.366185 (-1.03z)| norm 0.2875 (+0.17z)| lr 3.13e-04 | 322.88 ms | 52.3% bf16 MFU | 1624186 tok/s +step 9880/19560 | loss 3.466543 (+1.27z)| norm 0.3043 (+0.94z)| lr 3.13e-04 | 323.06 ms | 52.2% bf16 MFU | 1624121 tok/s +step 9881/19560 | loss 3.367527 (-0.99z)| norm 0.2625 (-0.94z)| lr 3.12e-04 | 322.81 ms | 52.3% bf16 MFU | 1624122 tok/s +step 9882/19560 | loss 3.398580 (-0.28z)| norm 0.2801 (-0.12z)| lr 3.12e-04 | 323.43 ms | 52.2% bf16 MFU | 1623968 tok/s +step 9883/19560 | loss 3.461655 (+1.15z)| norm 0.3160 (+1.49z)| lr 3.12e-04 | 322.58 ms | 52.3% bf16 MFU | 1624035 tok/s +step 9884/19560 | loss 3.332651 (-1.75z)| norm 0.2793 (-0.17z)| lr 3.12e-04 | 322.72 ms | 52.3% bf16 MFU | 1624062 tok/s +step 9885/19560 | loss 3.410095 (-0.02z)| norm 0.2909 (+0.36z)| lr 3.12e-04 | 322.98 ms | 52.3% bf16 MFU | 1624022 tok/s +step 9886/19560 | loss 3.347725 (-1.42z)| norm 0.2864 (+0.15z)| lr 3.12e-04 | 322.91 ms | 52.3% bf16 MFU | 1624003 tok/s +step 9887/19560 | loss 3.371966 (-0.86z)| norm 0.2729 (-0.47z)| lr 3.12e-04 | 323.60 ms | 52.2% bf16 MFU | 1623811 tok/s +step 9888/19560 | loss 3.391099 (-0.43z)| norm 0.2780 (-0.24z)| lr 3.12e-04 | 323.11 ms | 52.2% bf16 MFU | 1623753 tok/s +step 9889/19560 | loss 3.347435 (-1.39z)| norm 0.2849 (+0.07z)| lr 3.12e-04 | 323.30 ms | 52.2% bf16 MFU | 1623649 tok/s +step 9890/19560 | loss 3.417593 (+0.17z)| norm 0.3067 (+1.05z)| lr 3.12e-04 | 322.63 ms | 52.3% bf16 MFU | 1623718 tok/s +step 9891/19560 | loss 3.331573 (-1.72z)| norm 0.3136 (+1.34z)| lr 3.12e-04 | 322.79 ms | 52.3% bf16 MFU | 1623743 tok/s +step 9892/19560 | loss 3.397535 (-0.25z)| norm 0.3167 (+1.46z)| lr 3.12e-04 | 323.14 ms | 52.2% bf16 MFU | 1623680 tok/s +step 9893/19560 | loss 3.411121 (+0.05z)| norm 0.3018 (+0.79z)| lr 3.12e-04 | 323.06 ms | 52.2% bf16 MFU | 1623640 tok/s +step 9894/19560 | loss 3.392609 (-0.36z)| norm 0.2778 (-0.29z)| lr 3.12e-04 | 322.79 ms | 52.3% bf16 MFU | 1623671 tok/s +step 9895/19560 | loss 3.437742 (+0.63z)| norm 0.2873 (+0.13z)| lr 3.12e-04 | 322.97 ms | 52.3% bf16 MFU | 1623655 tok/s +step 9896/19560 | loss 3.362039 (-1.04z)| norm 0.2651 (-0.88z)| lr 3.12e-04 | 322.55 ms | 52.3% bf16 MFU | 1623745 tok/s +step 9897/19560 | loss 3.358058 (-1.11z)| norm 0.2921 (+0.33z)| lr 3.12e-04 | 322.26 ms | 52.4% bf16 MFU | 1623904 tok/s +step 9898/19560 | loss 3.398170 (-0.21z)| norm 0.2618 (-1.04z)| lr 3.12e-04 | 322.95 ms | 52.3% bf16 MFU | 1623879 tok/s +step 9899/19560 | loss 3.467883 (+1.34z)| norm 0.2917 (+0.31z)| lr 3.12e-04 | 322.35 ms | 52.4% bf16 MFU | 1624007 tok/s +step 9900/19560 | loss 3.362881 (-1.00z)| norm 0.2575 (-1.23z)| lr 3.12e-04 | 322.95 ms | 52.3% bf16 MFU | 1623979 tok/s +step 9901/19560 | loss 3.380918 (-0.58z)| norm 0.3085 (+1.07z)| lr 3.11e-04 | 322.90 ms | 52.3% bf16 MFU | 1623964 tok/s +step 9902/19560 | loss 3.366138 (-0.91z)| norm 0.2588 (-1.16z)| lr 3.11e-04 | 322.24 ms | 52.4% bf16 MFU | 1624116 tok/s +step 9903/19560 | loss 3.408998 (+0.08z)| norm 0.2855 (+0.04z)| lr 3.11e-04 | 323.30 ms | 52.2% bf16 MFU | 1623994 tok/s +step 9904/19560 | loss 3.390144 (-0.35z)| norm 0.2631 (-0.97z)| lr 3.11e-04 | 322.24 ms | 52.4% bf16 MFU | 1624144 tok/s +step 9905/19560 | loss 3.416725 (+0.27z)| norm 0.2815 (-0.15z)| lr 3.11e-04 | 322.62 ms | 52.3% bf16 MFU | 1624192 tok/s +step 9906/19560 | loss 3.304705 (-2.26z)| norm 0.2643 (-0.93z)| lr 3.11e-04 | 323.17 ms | 52.2% bf16 MFU | 1624100 tok/s +step 9907/19560 | loss 3.365031 (-0.88z)| norm 0.2573 (-1.25z)| lr 3.11e-04 | 322.33 ms | 52.4% bf16 MFU | 1624222 tok/s +step 9908/19560 | loss 3.492005 (+2.00z)| norm 0.2702 (-0.67z)| lr 3.11e-04 | 322.21 ms | 52.4% bf16 MFU | 1624368 tok/s +step 9909/19560 | loss 3.434887 (+0.70z)| norm 0.2847 (-0.02z)| lr 3.11e-04 | 322.59 ms | 52.3% bf16 MFU | 1624410 tok/s +step 9910/19560 | loss 3.339010 (-1.44z)| norm 0.3032 (+0.82z)| lr 3.11e-04 | 323.20 ms | 52.2% bf16 MFU | 1624298 tok/s +step 9911/19560 | loss 3.387230 (-0.34z)| norm 0.2893 (+0.17z)| lr 3.11e-04 | 322.48 ms | 52.3% bf16 MFU | 1624374 tok/s +step 9912/19560 | loss 3.375338 (-0.62z)| norm 0.2883 (+0.12z)| lr 3.11e-04 | 322.40 ms | 52.3% bf16 MFU | 1624465 tok/s +step 9913/19560 | loss 3.398224 (-0.11z)| norm 0.2787 (-0.33z)| lr 3.11e-04 | 322.43 ms | 52.3% bf16 MFU | 1624544 tok/s +step 9914/19560 | loss 3.434472 (+0.74z)| norm 0.3026 (+0.76z)| lr 3.11e-04 | 323.15 ms | 52.2% bf16 MFU | 1624438 tok/s +step 9915/19560 | loss 3.429431 (+0.62z)| norm 0.3264 (+1.84z)| lr 3.11e-04 | 322.29 ms | 52.4% bf16 MFU | 1624554 tok/s +step 9916/19560 | loss 3.381327 (-0.52z)| norm 0.2973 (+0.49z)| lr 3.11e-04 | 322.19 ms | 52.4% bf16 MFU | 1624690 tok/s +step 9917/19560 | loss 3.404878 (+0.03z)| norm 0.3162 (+1.34z)| lr 3.11e-04 | 322.63 ms | 52.3% bf16 MFU | 1624707 tok/s +step 9918/19560 | loss 3.430924 (+0.64z)| norm 0.3068 (+0.89z)| lr 3.11e-04 | 322.70 ms | 52.3% bf16 MFU | 1624707 tok/s +step 9919/19560 | loss 3.493377 (+2.06z)| norm 0.3074 (+0.91z)| lr 3.11e-04 | 322.80 ms | 52.3% bf16 MFU | 1624682 tok/s +step 9920/19560 | loss 3.354851 (-1.12z)| norm 0.3100 (+1.02z)| lr 3.11e-04 | 322.36 ms | 52.4% bf16 MFU | 1624768 tok/s +step 9921/19560 | loss 3.478767 (+1.74z)| norm 0.3436 (+2.50z)| lr 3.10e-04 | 322.75 ms | 52.3% bf16 MFU | 1624752 tok/s +step 9922/19560 | loss 3.416354 (+0.28z)| norm 0.3227 (+1.52z)| lr 3.10e-04 | 323.25 ms | 52.2% bf16 MFU | 1624611 tok/s +step 9923/19560 | loss 3.400590 (-0.08z)| norm 0.3232 (+1.52z)| lr 3.10e-04 | 322.57 ms | 52.3% bf16 MFU | 1624648 tok/s +step 9924/19560 | loss 3.392999 (-0.27z)| norm 0.2985 (+0.39z)| lr 3.10e-04 | 322.25 ms | 52.4% bf16 MFU | 1624763 tok/s +step 9925/19560 | loss 3.424168 (+0.47z)| norm 0.2960 (+0.27z)| lr 3.10e-04 | 322.24 ms | 52.4% bf16 MFU | 1624876 tok/s +step 9926/19560 | loss 3.442568 (+0.89z)| norm 0.3035 (+0.60z)| lr 3.10e-04 | 322.91 ms | 52.3% bf16 MFU | 1624813 tok/s +step 9927/19560 | loss 3.415967 (+0.26z)| norm 0.2726 (-0.80z)| lr 3.10e-04 | 322.60 ms | 52.3% bf16 MFU | 1624831 tok/s +step 9928/19560 | loss 3.379057 (-0.62z)| norm 0.2876 (-0.12z)| lr 3.10e-04 | 322.60 ms | 52.3% bf16 MFU | 1624849 tok/s +step 9929/19560 | loss 3.404902 (-0.01z)| norm 0.2826 (-0.36z)| lr 3.10e-04 | 322.36 ms | 52.4% bf16 MFU | 1624927 tok/s +step 9930/19560 | loss 3.409867 (+0.12z)| norm 0.2644 (-1.21z)| lr 3.10e-04 | 322.24 ms | 52.4% bf16 MFU | 1625032 tok/s +step 9931/19560 | loss 3.400942 (-0.11z)| norm 0.2712 (-0.88z)| lr 3.10e-04 | 323.12 ms | 52.2% bf16 MFU | 1624909 tok/s +step 9932/19560 | loss 3.366341 (-0.96z)| norm 0.2605 (-1.37z)| lr 3.10e-04 | 322.93 ms | 52.3% bf16 MFU | 1624839 tok/s +step 9933/19560 | loss 3.415756 (+0.24z)| norm 0.2722 (-0.82z)| lr 3.10e-04 | 322.39 ms | 52.4% bf16 MFU | 1624910 tok/s +step 9934/19560 | loss 3.416163 (+0.24z)| norm 0.2531 (-1.70z)| lr 3.10e-04 | 322.41 ms | 52.3% bf16 MFU | 1624973 tok/s +step 9935/19560 | loss 3.512588 (+2.52z)| norm 0.2860 (-0.16z)| lr 3.10e-04 | 323.17 ms | 52.2% bf16 MFU | 1624841 tok/s +step 9936/19560 | loss 3.390676 (-0.39z)| norm 0.2707 (-0.89z)| lr 3.10e-04 | 322.78 ms | 52.3% bf16 MFU | 1624814 tok/s +step 9937/19560 | loss 3.428324 (+0.50z)| norm 0.2524 (-1.73z)| lr 3.10e-04 | 322.81 ms | 52.3% bf16 MFU | 1624780 tok/s +step 9938/19560 | loss 3.396899 (-0.25z)| norm 0.2626 (-1.26z)| lr 3.10e-04 | 322.38 ms | 52.4% bf16 MFU | 1624856 tok/s +step 9939/19560 | loss 3.405452 (-0.04z)| norm 0.2511 (-1.78z)| lr 3.10e-04 | 322.68 ms | 52.3% bf16 MFU | 1624852 tok/s +step 9940/19560 | loss 3.362383 (-1.07z)| norm 0.2704 (-0.87z)| lr 3.10e-04 | 322.53 ms | 52.3% bf16 MFU | 1624887 tok/s +step 9941/19560 | loss 3.406262 (-0.02z)| norm 0.2684 (-0.96z)| lr 3.09e-04 | 322.82 ms | 52.3% bf16 MFU | 1624846 tok/s +step 9942/19560 | loss 3.401573 (-0.13z)| norm 0.3061 (+0.80z)| lr 3.09e-04 | 322.12 ms | 52.4% bf16 MFU | 1624984 tok/s +step 9943/19560 | loss 3.523767 (+2.78z)| norm 0.2693 (-0.91z)| lr 3.09e-04 | 322.46 ms | 52.3% bf16 MFU | 1625030 tok/s +step 9944/19560 | loss 3.401977 (-0.13z)| norm 0.3169 (+1.28z)| lr 3.09e-04 | 322.27 ms | 52.4% bf16 MFU | 1625121 tok/s +step 9945/19560 | loss 3.437684 (+0.72z)| norm 0.2748 (-0.68z)| lr 3.09e-04 | 322.59 ms | 52.3% bf16 MFU | 1625127 tok/s +step 9946/19560 | loss 3.424180 (+0.40z)| norm 0.2847 (-0.22z)| lr 3.09e-04 | 322.29 ms | 52.4% bf16 MFU | 1625208 tok/s +step 9947/19560 | loss 3.398819 (-0.22z)| norm 0.2908 (+0.05z)| lr 3.09e-04 | 322.45 ms | 52.3% bf16 MFU | 1625246 tok/s +step 9948/19560 | loss 3.372057 (-0.86z)| norm 0.2727 (-0.79z)| lr 3.09e-04 | 322.41 ms | 52.3% bf16 MFU | 1625291 tok/s +step 9949/19560 | loss 3.420967 (+0.32z)| norm 0.2781 (-0.55z)| lr 3.09e-04 | 322.66 ms | 52.3% bf16 MFU | 1625271 tok/s +step 9950/19560 | loss 3.407738 (-0.01z)| norm 0.3113 (+1.02z)| lr 3.09e-04 | 323.12 ms | 52.2% bf16 MFU | 1625136 tok/s +step 9951/19560 | loss 3.344256 (-1.52z)| norm 0.2911 (+0.06z)| lr 3.09e-04 | 322.45 ms | 52.3% bf16 MFU | 1625176 tok/s +step 9952/19560 | loss 3.418759 (+0.26z)| norm 0.2801 (-0.46z)| lr 3.09e-04 | 322.57 ms | 52.3% bf16 MFU | 1625184 tok/s +step 9953/19560 | loss 3.408945 (+0.02z)| norm 0.2946 (+0.22z)| lr 3.09e-04 | 322.34 ms | 52.4% bf16 MFU | 1625249 tok/s +step 9954/19560 | loss 3.379551 (-0.67z)| norm 0.2709 (-0.89z)| lr 3.09e-04 | 322.66 ms | 52.3% bf16 MFU | 1625231 tok/s +step 9955/19560 | loss 3.356708 (-1.22z)| norm 0.9020 (+10.48z)| lr 3.09e-04 | 322.87 ms | 52.3% bf16 MFU | 1625161 tok/s +step 9956/19560 | loss 3.392318 (-0.36z)| norm 0.3005 (+0.10z)| lr 3.09e-04 | 322.74 ms | 52.3% bf16 MFU | 1625127 tok/s +step 9957/19560 | loss 3.414131 (+0.17z)| norm 0.2877 (-0.12z)| lr 3.09e-04 | 322.16 ms | 52.4% bf16 MFU | 1625240 tok/s +step 9958/19560 | loss 3.365431 (-0.99z)| norm 0.2711 (-0.40z)| lr 3.09e-04 | 322.64 ms | 52.3% bf16 MFU | 1625227 tok/s +step 9959/19560 | loss 3.393036 (-0.32z)| norm 0.2807 (-0.23z)| lr 3.09e-04 | 322.50 ms | 52.3% bf16 MFU | 1625250 tok/s +step 9960/19560 | loss 3.536603 (+3.03z)| norm 0.2787 (-0.25z)| lr 3.09e-04 | 322.30 ms | 52.4% bf16 MFU | 1625323 tok/s +step 9961/19560 | loss 3.421930 (+0.34z)| norm 0.2657 (-0.47z)| lr 3.08e-04 | 322.98 ms | 52.3% bf16 MFU | 1625221 tok/s +step 9962/19560 | loss 3.382137 (-0.59z)| norm 0.2780 (-0.25z)| lr 3.08e-04 | 322.48 ms | 52.3% bf16 MFU | 1625251 tok/s +step 9963/19560 | loss 3.396649 (-0.25z)| norm 0.2628 (-0.51z)| lr 3.08e-04 | 322.62 ms | 52.3% bf16 MFU | 1625244 tok/s +step 9964/19560 | loss 3.333853 (-1.69z)| norm 0.2763 (-0.28z)| lr 3.08e-04 | 322.80 ms | 52.3% bf16 MFU | 1625191 tok/s +step 9965/19560 | loss 3.375028 (-0.74z)| norm 0.2676 (-0.42z)| lr 3.08e-04 | 322.48 ms | 52.3% bf16 MFU | 1625220 tok/s +step 9966/19560 | loss 3.397249 (-0.23z)| norm 0.2780 (-0.24z)| lr 3.08e-04 | 322.74 ms | 52.3% bf16 MFU | 1625183 tok/s +step 9967/19560 | loss 3.418505 (+0.28z)| norm 0.2784 (-0.23z)| lr 3.08e-04 | 322.51 ms | 52.3% bf16 MFU | 1625205 tok/s +step 9968/19560 | loss 3.435609 (+0.73z)| norm 0.2733 (-0.32z)| lr 3.08e-04 | 322.24 ms | 52.4% bf16 MFU | 1625295 tok/s +step 9969/19560 | loss 3.352612 (-1.28z)| norm 0.3023 (+0.19z)| lr 3.08e-04 | 322.48 ms | 52.3% bf16 MFU | 1625319 tok/s +step 9970/19560 | loss 3.352092 (-1.27z)| norm 0.2885 (-0.05z)| lr 3.08e-04 | 322.80 ms | 52.3% bf16 MFU | 1625263 tok/s +step 9971/19560 | loss 3.360571 (-1.06z)| norm 0.2886 (-0.05z)| lr 3.08e-04 | 322.51 ms | 52.3% bf16 MFU | 1625283 tok/s +step 9972/19560 | loss 3.437040 (+0.78z)| norm 0.2819 (-0.17z)| lr 3.08e-04 | 322.18 ms | 52.4% bf16 MFU | 1625385 tok/s +step 9973/19560 | loss 3.343439 (-1.46z)| norm 0.2579 (-0.59z)| lr 3.08e-04 | 322.83 ms | 52.3% bf16 MFU | 1625318 tok/s +step 9974/19560 | loss 3.402462 (-0.06z)| norm 0.2903 (-0.02z)| lr 3.08e-04 | 322.48 ms | 52.3% bf16 MFU | 1625341 tok/s +step 9975/19560 | loss 3.423048 (+0.43z)| norm 0.2708 (-0.36z)| lr 3.08e-04 | 322.37 ms | 52.4% bf16 MFU | 1625392 tok/s +step 9976/19560 | loss 3.428644 (+0.60z)| norm 0.2742 (-0.30z)| lr 3.08e-04 | 322.41 ms | 52.3% bf16 MFU | 1625431 tok/s +step 9977/19560 | loss 3.581136 (+4.04z)| norm 0.3130 (+0.38z)| lr 3.08e-04 | 322.74 ms | 52.3% bf16 MFU | 1625385 tok/s +step 9978/19560 | loss 3.362245 (-1.00z)| norm 0.2759 (-0.27z)| lr 3.08e-04 | 322.42 ms | 52.3% bf16 MFU | 1625422 tok/s +step 9979/19560 | loss 3.403642 (-0.03z)| norm 0.3013 (+0.17z)| lr 3.08e-04 | 322.32 ms | 52.4% bf16 MFU | 1625482 tok/s +step 9980/19560 | loss 3.415594 (+0.24z)| norm 0.2702 (-0.36z)| lr 3.08e-04 | 322.55 ms | 52.3% bf16 MFU | 1625480 tok/s +step 9981/19560 | loss 3.425350 (+0.46z)| norm 0.2484 (-0.74z)| lr 3.07e-04 | 321.95 ms | 52.4% bf16 MFU | 1625629 tok/s +step 9982/19560 | loss 3.392904 (-0.28z)| norm 0.2643 (-0.46z)| lr 3.07e-04 | 322.54 ms | 52.3% bf16 MFU | 1625622 tok/s +step 9983/19560 | loss 3.385045 (-0.45z)| norm 0.2469 (-0.76z)| lr 3.07e-04 | 322.68 ms | 52.3% bf16 MFU | 1625580 tok/s +step 9984/19560 | loss 3.629923 (+4.78z)| norm 0.2690 (-0.38z)| lr 3.07e-04 | 322.50 ms | 52.3% bf16 MFU | 1625587 tok/s +step 9985/19560 | loss 3.426475 (+0.43z)| norm 0.2825 (-0.14z)| lr 3.07e-04 | 322.96 ms | 52.3% bf16 MFU | 1625478 tok/s +step 9986/19560 | loss 3.410400 (+0.08z)| norm 0.2780 (-0.22z)| lr 3.07e-04 | 322.60 ms | 52.3% bf16 MFU | 1625465 tok/s +step 9987/19560 | loss 3.374501 (-0.68z)| norm 0.2708 (-0.35z)| lr 3.07e-04 | 322.10 ms | 52.4% bf16 MFU | 1625578 tok/s +step 9988/19560 | loss 3.425613 (+0.41z)| norm 0.2708 (-0.34z)| lr 3.07e-04 | 322.45 ms | 52.3% bf16 MFU | 1625597 tok/s +step 9989/19560 | loss 3.423953 (+0.37z)| norm 0.2903 (-0.00z)| lr 3.07e-04 | 322.63 ms | 52.3% bf16 MFU | 1625569 tok/s +step 9990/19560 | loss 3.391456 (-0.33z)| norm 0.2650 (-0.44z)| lr 3.07e-04 | 322.99 ms | 52.3% bf16 MFU | 1625451 tok/s +step 9991/19560 | loss 3.378156 (-0.61z)| norm 0.2902 (+0.00z)| lr 3.07e-04 | 322.40 ms | 52.3% bf16 MFU | 1625489 tok/s +step 9992/19560 | loss 3.381907 (-0.52z)| norm 0.2736 (-0.28z)| lr 3.07e-04 | 322.27 ms | 52.4% bf16 MFU | 1625558 tok/s +step 9993/19560 | loss 3.437747 (+0.68z)| norm 0.2677 (-0.38z)| lr 3.07e-04 | 322.51 ms | 52.3% bf16 MFU | 1625562 tok/s +step 9994/19560 | loss 3.405512 (-0.02z)| norm 0.2808 (-0.15z)| lr 3.07e-04 | 322.58 ms | 52.3% bf16 MFU | 1625548 tok/s +step 9995/19560 | loss 3.379626 (-0.58z)| norm 0.2708 (-0.32z)| lr 3.07e-04 | 322.38 ms | 52.4% bf16 MFU | 1625585 tok/s +step 9996/19560 | loss 3.405016 (-0.03z)| norm 0.2650 (-0.42z)| lr 3.07e-04 | 322.63 ms | 52.3% bf16 MFU | 1625558 tok/s +step 9997/19560 | loss 3.406706 (+0.01z)| norm 0.2766 (-0.22z)| lr 3.07e-04 | 322.38 ms | 52.4% bf16 MFU | 1625595 tok/s +step 9998/19560 | loss 3.384115 (-0.49z)| norm 0.2680 (-0.36z)| lr 3.07e-04 | 322.96 ms | 52.3% bf16 MFU | 1625486 tok/s +step 9999/19560 | loss 3.374701 (-0.68z)| norm 0.3041 (+0.27z)| lr 3.07e-04 | 322.52 ms | 52.3% bf16 MFU | 1625492 tok/s +step 10000/19560 | loss 3.419989 (+0.30z)| norm 0.2602 (-0.49z)| lr 3.07e-04 | 322.77 ms | 52.3% bf16 MFU | 1625436 tok/s +val loss 3.396394 + evaluating HellaSwag: 30/79 evaluating HellaSwag: 40/79 evaluating HellaSwag: 50/79 evaluating HellaSwag: 60/79 evaluating HellaSwag: 70/79 evaluating HellaSwag: 0/79 evaluating HellaSwag: 10/79 evaluating HellaSwag: 20/79 evaluating HellaSwag: 30/79 evaluating HellaSwag: 40/79 evaluating HellaSwag: 50/79 evaluating HellaSwag: 60/79 evaluating HellaSwag: 70/79 Writing state to log124M/state_00010000_00007.bin + evaluating HellaSwag: 30/79 evaluating HellaSwag: 40/79 evaluating HellaSwag: 50/79 evaluating HellaSwag: 60/79 evaluating HellaSwag: 70/79 evaluating HellaSwag: 0/79 evaluating HellaSwag: 10/79 evaluating HellaSwag: 20/79 evaluating HellaSwag: 30/79 evaluating HellaSwag: 40/79 evaluating HellaSwag: 50/79 evaluating HellaSwag: 60/79 evaluating HellaSwag: 70/79 Writing state to log124M/state_00010000_00004.bin +evaluating HellaSwag: 0/79 evaluating HellaSwag: 10/79 evaluating HellaSwag: 20/79 evaluating HellaSwag: 30/79 evaluating HellaSwag: 40/79 evaluating HellaSwag: 50/79 evaluating HellaSwag: 60/79 evaluating HellaSwag: 70/79 HellaSwag: 2880/10042 = 0.286795 +Writing checkpoint at step 10000 +Writing model to log124M/model_00010000.bin + evaluating HellaSwag: 30/79 evaluating HellaSwag: 40/79 evaluating HellaSwag: 50/79 evaluating HellaSwag: 60/79 evaluating HellaSwag: 70/79 evaluating HellaSwag: 0/79 evaluating HellaSwag: 10/79 evaluating HellaSwag: 20/79 evaluating HellaSwag: 30/79 evaluating HellaSwag: 40/79 evaluating HellaSwag: 50/79 evaluating HellaSwag: 60/79 evaluating HellaSwag: 70/79 Writing state to log124M/state_00010000_00003.bin + evaluating HellaSwag: 30/79 evaluating HellaSwag: 40/79 evaluating HellaSwag: 50/79 evaluating HellaSwag: 60/79 evaluating HellaSwag: 70/79 evaluating HellaSwag: 0/79 evaluating HellaSwag: 10/79 evaluating HellaSwag: 20/79 evaluating HellaSwag: 30/79 evaluating HellaSwag: 40/79 evaluating HellaSwag: 50/79 evaluating HellaSwag: 60/79 evaluating HellaSwag: 70/79 Writing state to log124M/state_00010000_00005.bin + evaluating HellaSwag: 30/79 evaluating HellaSwag: 40/79 evaluating HellaSwag: 50/79 evaluating HellaSwag: 60/79 evaluating HellaSwag: 70/79 evaluating HellaSwag: 0/79 evaluating HellaSwag: 10/79 evaluating HellaSwag: 20/79 evaluating HellaSwag: 30/79 evaluating HellaSwag: 40/79 evaluating HellaSwag: 50/79 evaluating HellaSwag: 60/79 evaluating HellaSwag: 70/79 Writing state to log124M/state_00010000_00002.bin + evaluating HellaSwag: 30/79 evaluating HellaSwag: 40/79 evaluating HellaSwag: 50/79 evaluating HellaSwag: 60/79 evaluating HellaSwag: 70/79 evaluating HellaSwag: 0/79 evaluating HellaSwag: 10/79 evaluating HellaSwag: 20/79 evaluating HellaSwag: 30/79 evaluating HellaSwag: 40/79 evaluating HellaSwag: 50/79 evaluating HellaSwag: 60/79 evaluating HellaSwag: 70/79 Writing state to log124M/state_00010000_00001.bin + evaluating HellaSwag: 30/79 evaluating HellaSwag: 40/79 evaluating HellaSwag: 50/79 evaluating HellaSwag: 60/79 evaluating HellaSwag: 70/79 evaluating HellaSwag: 0/79 evaluating HellaSwag: 10/79 evaluating HellaSwag: 20/79 evaluating HellaSwag: 30/79 evaluating HellaSwag: 40/79 evaluating HellaSwag: 50/79 evaluating HellaSwag: 60/79 evaluating HellaSwag: 70/79 Writing state to log124M/state_00010000_00006.bin +Writing state to log124M/state_00010000_00000.bin +step 10001/19560 | loss 3.387290 (-0.40z)| norm 0.3112 (+0.40z)| lr 3.06e-04 | 318.39 ms | 53.0% bf16 MFU | 1626498 tok/s +step 10002/19560 | loss 3.454981 (+1.06z)| norm 0.3007 (+0.21z)| lr 3.06e-04 | 321.28 ms | 52.5% bf16 MFU | 1626766 tok/s +step 10003/19560 | loss 3.372087 (-0.73z)| norm 0.2700 (-0.32z)| lr 3.06e-04 | 323.64 ms | 52.1% bf16 MFU | 1626425 tok/s +step 10004/19560 | loss 3.443544 (+0.82z)| norm 0.3114 (+0.40z)| lr 3.06e-04 | 322.44 ms | 52.3% bf16 MFU | 1626405 tok/s +step 10005/19560 | loss 3.457691 (+1.13z)| norm 0.2748 (-0.23z)| lr 3.06e-04 | 322.32 ms | 52.4% bf16 MFU | 1626416 tok/s +step 10006/19560 | loss 3.385425 (-0.43z)| norm 0.2844 (-0.07z)| lr 3.06e-04 | 323.77 ms | 52.1% bf16 MFU | 1626062 tok/s +step 10007/19560 | loss 3.411654 (+0.14z)| norm 0.2965 (+0.14z)| lr 3.06e-04 | 322.46 ms | 52.3% bf16 MFU | 1626053 tok/s +step 10008/19560 | loss 3.370457 (-0.75z)| norm 0.3042 (+0.28z)| lr 3.06e-04 | 322.65 ms | 52.3% bf16 MFU | 1625998 tok/s +step 10009/19560 | loss 3.442785 (+0.83z)| norm 0.2777 (-0.19z)| lr 3.06e-04 | 322.50 ms | 52.3% bf16 MFU | 1625982 tok/s +step 10010/19560 | loss 3.423753 (+0.40z)| norm 0.2865 (-0.03z)| lr 3.06e-04 | 322.70 ms | 52.3% bf16 MFU | 1625917 tok/s +step 10011/19560 | loss 3.422318 (+0.38z)| norm 0.2755 (-0.22z)| lr 3.06e-04 | 322.45 ms | 52.3% bf16 MFU | 1625918 tok/s +step 10012/19560 | loss 3.437294 (+0.70z)| norm 0.2946 (+0.11z)| lr 3.06e-04 | 322.81 ms | 52.3% bf16 MFU | 1625830 tok/s +step 10013/19560 | loss 3.367734 (-0.84z)| norm 0.2624 (-0.44z)| lr 3.06e-04 | 323.10 ms | 52.2% bf16 MFU | 1625673 tok/s +step 10014/19560 | loss 3.418018 (+0.27z)| norm 0.2853 (-0.05z)| lr 3.06e-04 | 322.65 ms | 52.3% bf16 MFU | 1625636 tok/s +step 10015/19560 | loss 3.367171 (-0.87z)| norm 0.2496 (-0.66z)| lr 3.06e-04 | 323.19 ms | 52.2% bf16 MFU | 1625466 tok/s +step 10016/19560 | loss 3.444044 (+0.84z)| norm 0.2672 (-0.36z)| lr 3.06e-04 | 323.16 ms | 52.2% bf16 MFU | 1625310 tok/s +step 10017/19560 | loss 3.392286 (-0.33z)| norm 0.3000 (+0.21z)| lr 3.06e-04 | 322.81 ms | 52.3% bf16 MFU | 1625251 tok/s +step 10018/19560 | loss 3.422940 (+0.36z)| norm 0.2612 (-0.45z)| lr 3.06e-04 | 322.69 ms | 52.3% bf16 MFU | 1625224 tok/s +step 10019/19560 | loss 3.440383 (+0.74z)| norm 0.2929 (+0.10z)| lr 3.06e-04 | 323.28 ms | 52.2% bf16 MFU | 1625051 tok/s +step 10020/19560 | loss 3.414966 (+0.16z)| norm 0.2794 (-0.13z)| lr 3.06e-04 | 322.93 ms | 52.3% bf16 MFU | 1624975 tok/s +step 10021/19560 | loss 3.444747 (+0.83z)| norm 0.2745 (-0.21z)| lr 3.05e-04 | 322.98 ms | 52.3% bf16 MFU | 1624890 tok/s +step 10022/19560 | loss 3.446858 (+0.86z)| norm 0.2697 (-0.30z)| lr 3.05e-04 | 323.05 ms | 52.2% bf16 MFU | 1624792 tok/s +step 10023/19560 | loss 3.587350 (+3.77z)| norm 0.2952 (+0.15z)| lr 3.05e-04 | 323.10 ms | 52.2% bf16 MFU | 1624687 tok/s +step 10024/19560 | loss 3.371219 (-0.82z)| norm 0.3197 (+0.56z)| lr 3.05e-04 | 323.36 ms | 52.2% bf16 MFU | 1624521 tok/s +step 10025/19560 | loss 3.383323 (-0.57z)| norm 0.2536 (-0.58z)| lr 3.05e-04 | 322.79 ms | 52.3% bf16 MFU | 1624508 tok/s +step 10026/19560 | loss 3.408937 (-0.02z)| norm 0.2836 (-0.06z)| lr 3.05e-04 | 322.90 ms | 52.3% bf16 MFU | 1624466 tok/s +step 10027/19560 | loss 3.450514 (+0.87z)| norm 0.2717 (-0.26z)| lr 3.05e-04 | 323.08 ms | 52.2% bf16 MFU | 1624382 tok/s +step 10028/19560 | loss 3.521168 (+2.31z)| norm 0.2735 (-0.23z)| lr 3.05e-04 | 323.87 ms | 52.1% bf16 MFU | 1624105 tok/s +step 10029/19560 | loss 3.420774 (+0.20z)| norm 0.2532 (-0.58z)| lr 3.05e-04 | 323.30 ms | 52.2% bf16 MFU | 1623985 tok/s +step 10030/19560 | loss 3.370452 (-0.86z)| norm 0.2551 (-0.55z)| lr 3.05e-04 | 322.61 ms | 52.3% bf16 MFU | 1624042 tok/s +step 10031/19560 | loss 3.416344 (+0.10z)| norm 0.2555 (-0.54z)| lr 3.05e-04 | 322.84 ms | 52.3% bf16 MFU | 1624040 tok/s +step 10032/19560 | loss 3.456600 (+0.94z)| norm 0.2553 (-0.54z)| lr 3.05e-04 | 322.91 ms | 52.3% bf16 MFU | 1624021 tok/s +step 10033/19560 | loss 3.351793 (-1.25z)| norm 0.2539 (-0.56z)| lr 3.05e-04 | 323.07 ms | 52.2% bf16 MFU | 1623962 tok/s +step 10034/19560 | loss 3.369354 (-0.91z)| norm 0.2448 (-0.71z)| lr 3.05e-04 | 323.20 ms | 52.2% bf16 MFU | 1623873 tok/s +step 10035/19560 | loss 3.373998 (-0.81z)| norm 0.2538 (-0.55z)| lr 3.05e-04 | 323.14 ms | 52.2% bf16 MFU | 1623804 tok/s +step 10036/19560 | loss 3.396340 (-0.32z)| norm 0.2614 (-0.42z)| lr 3.05e-04 | 322.91 ms | 52.3% bf16 MFU | 1623796 tok/s +step 10037/19560 | loss 3.411421 (+0.00z)| norm 0.2583 (-0.47z)| lr 3.05e-04 | 322.95 ms | 52.3% bf16 MFU | 1623778 tok/s +step 10038/19560 | loss 3.410031 (-0.04z)| norm 0.2619 (-0.40z)| lr 3.05e-04 | 322.90 ms | 52.3% bf16 MFU | 1623775 tok/s +step 10039/19560 | loss 3.385334 (-0.57z)| norm 0.2706 (-0.25z)| lr 3.05e-04 | 322.87 ms | 52.3% bf16 MFU | 1623777 tok/s +step 10040/19560 | loss 3.399068 (-0.28z)| norm 0.2881 (+0.05z)| lr 3.05e-04 | 323.15 ms | 52.2% bf16 MFU | 1623709 tok/s +step 10041/19560 | loss 3.414889 (+0.06z)| norm 0.2555 (-0.51z)| lr 3.04e-04 | 323.47 ms | 52.2% bf16 MFU | 1623565 tok/s +step 10042/19560 | loss 3.387003 (-0.54z)| norm 0.2735 (-0.19z)| lr 3.04e-04 | 322.45 ms | 52.3% bf16 MFU | 1623686 tok/s +step 10043/19560 | loss 3.410193 (-0.03z)| norm 0.2637 (-0.35z)| lr 3.04e-04 | 323.33 ms | 52.2% bf16 MFU | 1623577 tok/s +step 10044/19560 | loss 3.460828 (+1.06z)| norm 0.3017 (+0.30z)| lr 3.04e-04 | 322.32 ms | 52.4% bf16 MFU | 1623729 tok/s +step 10045/19560 | loss 3.390901 (-0.46z)| norm 0.3011 (+0.29z)| lr 3.04e-04 | 324.21 ms | 52.1% bf16 MFU | 1623399 tok/s +step 10046/19560 | loss 3.428874 (+0.37z)| norm 0.2507 (-0.57z)| lr 3.04e-04 | 322.72 ms | 52.3% bf16 MFU | 1623458 tok/s +step 10047/19560 | loss 3.383642 (-0.60z)| norm 0.2968 (+0.23z)| lr 3.04e-04 | 322.79 ms | 52.3% bf16 MFU | 1623496 tok/s +step 10048/19560 | loss 3.467684 (+1.22z)| norm 0.2901 (+0.11z)| lr 3.04e-04 | 323.78 ms | 52.1% bf16 MFU | 1623286 tok/s +step 10049/19560 | loss 3.366791 (-0.98z)| norm 0.2752 (-0.13z)| lr 3.04e-04 | 322.40 ms | 52.3% bf16 MFU | 1623432 tok/s +step 10050/19560 | loss 3.421862 (+0.23z)| norm 0.2925 (+0.17z)| lr 3.04e-04 | 322.67 ms | 52.3% bf16 MFU | 1623503 tok/s +step 10051/19560 | loss 3.395822 (-0.34z)| norm 0.2868 (+0.08z)| lr 3.04e-04 | 323.31 ms | 52.2% bf16 MFU | 1623409 tok/s +step 10052/19560 | loss 3.397180 (-0.31z)| norm 0.2727 (-0.17z)| lr 3.04e-04 | 322.80 ms | 52.3% bf16 MFU | 1623447 tok/s +step 10053/19560 | loss 3.426479 (+0.34z)| norm 0.2773 (-0.08z)| lr 3.04e-04 | 323.26 ms | 52.2% bf16 MFU | 1623369 tok/s +step 10054/19560 | loss 3.421022 (+0.22z)| norm 0.2964 (+0.25z)| lr 3.04e-04 | 322.49 ms | 52.3% bf16 MFU | 1623489 tok/s +step 10055/19560 | loss 3.362828 (-1.05z)| norm 0.2710 (-0.19z)| lr 3.04e-04 | 323.39 ms | 52.2% bf16 MFU | 1623376 tok/s +step 10056/19560 | loss 3.392787 (-0.40z)| norm 0.2570 (-0.43z)| lr 3.04e-04 | 322.43 ms | 52.3% bf16 MFU | 1623509 tok/s +step 10057/19560 | loss 3.380988 (-0.65z)| norm 0.2781 (-0.06z)| lr 3.04e-04 | 323.24 ms | 52.2% bf16 MFU | 1623432 tok/s +step 10058/19560 | loss 3.447933 (+0.81z)| norm 0.3076 (+0.44z)| lr 3.04e-04 | 323.54 ms | 52.2% bf16 MFU | 1623283 tok/s +step 10059/19560 | loss 3.422152 (+0.24z)| norm 0.2989 (+0.29z)| lr 3.04e-04 | 322.93 ms | 52.3% bf16 MFU | 1623295 tok/s +step 10060/19560 | loss 3.405226 (-0.13z)| norm 0.3333 (+0.88z)| lr 3.04e-04 | 323.42 ms | 52.2% bf16 MFU | 1623185 tok/s +step 10061/19560 | loss 3.429962 (+0.41z)| norm 0.2712 (-0.20z)| lr 3.03e-04 | 323.05 ms | 52.2% bf16 MFU | 1623171 tok/s +step 10062/19560 | loss 3.382407 (-0.63z)| norm 0.2940 (+0.19z)| lr 3.03e-04 | 323.05 ms | 52.2% bf16 MFU | 1623159 tok/s +step 10063/19560 | loss 3.480401 (+1.54z)| norm 0.2764 (-0.12z)| lr 3.03e-04 | 323.95 ms | 52.1% bf16 MFU | 1622921 tok/s +step 10064/19560 | loss 3.416005 (+0.11z)| norm 0.2879 (+0.08z)| lr 3.03e-04 | 323.11 ms | 52.2% bf16 MFU | 1622907 tok/s +step 10065/19560 | loss 3.366560 (-0.98z)| norm 0.2992 (+0.27z)| lr 3.03e-04 | 323.26 ms | 52.2% bf16 MFU | 1622856 tok/s +step 10066/19560 | loss 3.387463 (-0.51z)| norm 0.2522 (-0.54z)| lr 3.03e-04 | 322.97 ms | 52.3% bf16 MFU | 1622878 tok/s +step 10067/19560 | loss 3.345057 (-1.43z)| norm 0.3368 (+0.91z)| lr 3.03e-04 | 323.35 ms | 52.2% bf16 MFU | 1622805 tok/s +step 10068/19560 | loss 3.376671 (-0.74z)| norm 0.2683 (-0.28z)| lr 3.03e-04 | 323.51 ms | 52.2% bf16 MFU | 1622697 tok/s +step 10069/19560 | loss 3.385193 (-0.55z)| norm 0.3149 (+0.53z)| lr 3.03e-04 | 322.90 ms | 52.3% bf16 MFU | 1622747 tok/s +step 10070/19560 | loss 3.385457 (-0.54z)| norm 0.2723 (-0.21z)| lr 3.03e-04 | 323.75 ms | 52.1% bf16 MFU | 1622581 tok/s +step 10071/19560 | loss 3.369804 (-0.87z)| norm 0.3106 (+0.45z)| lr 3.03e-04 | 323.51 ms | 52.2% bf16 MFU | 1622484 tok/s +step 10072/19560 | loss 3.389155 (-0.44z)| norm 0.3128 (+0.49z)| lr 3.03e-04 | 322.81 ms | 52.3% bf16 MFU | 1622566 tok/s +step 10073/19560 | loss 3.382064 (-0.59z)| norm 0.2983 (+0.23z)| lr 3.03e-04 | 323.21 ms | 52.2% bf16 MFU | 1622543 tok/s +step 10074/19560 | loss 3.410367 (+0.05z)| norm 0.3139 (+0.50z)| lr 3.03e-04 | 322.20 ms | 52.4% bf16 MFU | 1622776 tok/s +step 10075/19560 | loss 3.402290 (-0.13z)| norm 0.2704 (-0.25z)| lr 3.03e-04 | 324.17 ms | 52.1% bf16 MFU | 1622505 tok/s +step 10076/19560 | loss 3.443523 (+0.78z)| norm 0.3027 (+0.31z)| lr 3.03e-04 | 323.14 ms | 52.2% bf16 MFU | 1622504 tok/s +step 10077/19560 | loss 3.446202 (+0.84z)| norm 0.2619 (-0.40z)| lr 3.03e-04 | 322.96 ms | 52.3% bf16 MFU | 1622547 tok/s +step 10078/19560 | loss 3.420761 (+0.26z)| norm 0.2702 (-0.25z)| lr 3.03e-04 | 323.45 ms | 52.2% bf16 MFU | 1622465 tok/s +step 10079/19560 | loss 3.461098 (+1.15z)| norm 0.2941 (+0.16z)| lr 3.03e-04 | 323.11 ms | 52.2% bf16 MFU | 1622474 tok/s +step 10080/19560 | loss 3.402225 (-0.17z)| norm 0.2449 (-0.68z)| lr 3.03e-04 | 322.96 ms | 52.3% bf16 MFU | 1622518 tok/s +step 10081/19560 | loss 3.382689 (-0.60z)| norm 0.2955 (+0.19z)| lr 3.02e-04 | 322.63 ms | 52.3% bf16 MFU | 1622644 tok/s +step 10082/19560 | loss 3.396438 (-0.30z)| norm 0.2608 (-0.41z)| lr 3.02e-04 | 324.18 ms | 52.1% bf16 MFU | 1622375 tok/s +step 10083/19560 | loss 3.364377 (-1.02z)| norm 0.2641 (-0.81z)| lr 3.02e-04 | 323.06 ms | 52.2% bf16 MFU | 1622400 tok/s +step 10084/19560 | loss 3.367034 (-0.95z)| norm 0.2877 (+0.45z)| lr 3.02e-04 | 322.88 ms | 52.3% bf16 MFU | 1622468 tok/s +step 10085/19560 | loss 3.395104 (-0.32z)| norm 0.2682 (-0.58z)| lr 3.02e-04 | 322.50 ms | 52.3% bf16 MFU | 1622629 tok/s +step 10086/19560 | loss 3.446591 (+0.82z)| norm 0.2666 (-0.66z)| lr 3.02e-04 | 323.11 ms | 52.2% bf16 MFU | 1622630 tok/s +step 10087/19560 | loss 3.411141 (+0.02z)| norm 0.2763 (-0.14z)| lr 3.02e-04 | 323.21 ms | 52.2% bf16 MFU | 1622606 tok/s +step 10088/19560 | loss 3.408317 (-0.02z)| norm 0.2799 (+0.05z)| lr 3.02e-04 | 323.00 ms | 52.3% bf16 MFU | 1622634 tok/s +step 10089/19560 | loss 3.382884 (-0.60z)| norm 0.2798 (+0.04z)| lr 3.02e-04 | 323.67 ms | 52.1% bf16 MFU | 1622492 tok/s +step 10090/19560 | loss 3.357160 (-1.19z)| norm 0.2820 (+0.16z)| lr 3.02e-04 | 323.18 ms | 52.2% bf16 MFU | 1622481 tok/s +step 10091/19560 | loss 3.414435 (+0.13z)| norm 0.2846 (+0.29z)| lr 3.02e-04 | 323.04 ms | 52.2% bf16 MFU | 1622507 tok/s +step 10092/19560 | loss 3.359769 (-1.15z)| norm 0.2912 (+0.63z)| lr 3.02e-04 | 323.08 ms | 52.2% bf16 MFU | 1622521 tok/s +step 10093/19560 | loss 3.429482 (+0.47z)| norm 0.2800 (+0.03z)| lr 3.02e-04 | 322.97 ms | 52.3% bf16 MFU | 1622562 tok/s +step 10094/19560 | loss 3.424188 (+0.34z)| norm 0.2968 (+0.92z)| lr 3.02e-04 | 322.69 ms | 52.3% bf16 MFU | 1622671 tok/s +step 10095/19560 | loss 3.367521 (-0.97z)| norm 0.2823 (+0.14z)| lr 3.02e-04 | 322.98 ms | 52.3% bf16 MFU | 1622701 tok/s +step 10096/19560 | loss 3.373178 (-0.83z)| norm 0.3346 (+2.81z)| lr 3.02e-04 | 322.78 ms | 52.3% bf16 MFU | 1622779 tok/s +step 10097/19560 | loss 3.415446 (+0.14z)| norm 0.2879 (+0.41z)| lr 3.02e-04 | 322.87 ms | 52.3% bf16 MFU | 1622833 tok/s +step 10098/19560 | loss 3.381071 (-0.67z)| norm 0.3358 (+2.79z)| lr 3.02e-04 | 323.21 ms | 52.2% bf16 MFU | 1622799 tok/s +step 10099/19560 | loss 3.397817 (-0.28z)| norm 0.2884 (+0.40z)| lr 3.02e-04 | 323.14 ms | 52.2% bf16 MFU | 1622784 tok/s +step 10100/19560 | loss 3.427428 (+0.42z)| norm 0.3165 (+1.78z)| lr 3.02e-04 | 323.39 ms | 52.2% bf16 MFU | 1622707 tok/s +step 10101/19560 | loss 3.402761 (-0.18z)| norm 0.2745 (-0.31z)| lr 3.01e-04 | 322.36 ms | 52.4% bf16 MFU | 1622892 tok/s +step 10102/19560 | loss 3.400195 (-0.24z)| norm 0.3105 (+1.46z)| lr 3.01e-04 | 323.10 ms | 52.2% bf16 MFU | 1622881 tok/s +step 10103/19560 | loss 3.388227 (-0.52z)| norm 0.2869 (+0.29z)| lr 3.01e-04 | 322.99 ms | 52.3% bf16 MFU | 1622899 tok/s +step 10104/19560 | loss 3.449624 (+0.94z)| norm 0.2592 (-1.08z)| lr 3.01e-04 | 323.39 ms | 52.2% bf16 MFU | 1622815 tok/s +step 10105/19560 | loss 3.356107 (-1.32z)| norm 0.3425 (+2.95z)| lr 3.01e-04 | 323.13 ms | 52.2% bf16 MFU | 1622802 tok/s +step 10106/19560 | loss 3.449329 (+1.02z)| norm 0.2609 (-0.97z)| lr 3.01e-04 | 322.57 ms | 52.3% bf16 MFU | 1622929 tok/s +step 10107/19560 | loss 3.452118 (+1.08z)| norm 0.2998 (+0.90z)| lr 3.01e-04 | 322.98 ms | 52.3% bf16 MFU | 1622947 tok/s +step 10108/19560 | loss 3.439776 (+0.76z)| norm 0.2721 (-0.43z)| lr 3.01e-04 | 322.98 ms | 52.3% bf16 MFU | 1622964 tok/s +step 10109/19560 | loss 3.433910 (+0.61z)| norm 0.2915 (+0.49z)| lr 3.01e-04 | 322.96 ms | 52.3% bf16 MFU | 1622986 tok/s +step 10110/19560 | loss 3.437836 (+0.70z)| norm 0.2781 (-0.17z)| lr 3.01e-04 | 322.73 ms | 52.3% bf16 MFU | 1623064 tok/s +step 10111/19560 | loss 3.459588 (+1.23z)| norm 0.2907 (+0.43z)| lr 3.01e-04 | 322.96 ms | 52.3% bf16 MFU | 1623080 tok/s +step 10112/19560 | loss 3.549150 (+3.75z)| norm 0.3011 (+0.93z)| lr 3.01e-04 | 322.70 ms | 52.3% bf16 MFU | 1623160 tok/s +step 10113/19560 | loss 3.380008 (-0.79z)| norm 0.2681 (-0.68z)| lr 3.01e-04 | 322.51 ms | 52.3% bf16 MFU | 1623284 tok/s +step 10114/19560 | loss 3.437291 (+0.74z)| norm 0.2954 (+0.65z)| lr 3.01e-04 | 322.50 ms | 52.3% bf16 MFU | 1623404 tok/s +step 10115/19560 | loss 3.407177 (-0.08z)| norm 0.2794 (-0.14z)| lr 3.01e-04 | 322.81 ms | 52.3% bf16 MFU | 1623440 tok/s +step 10116/19560 | loss 3.423483 (+0.36z)| norm 0.2870 (+0.23z)| lr 3.01e-04 | 323.12 ms | 52.2% bf16 MFU | 1623396 tok/s +step 10117/19560 | loss 3.380255 (-0.79z)| norm 0.2732 (-0.44z)| lr 3.01e-04 | 322.28 ms | 52.4% bf16 MFU | 1623566 tok/s +step 10118/19560 | loss 3.365291 (-1.18z)| norm 0.2897 (+0.36z)| lr 3.01e-04 | 323.35 ms | 52.2% bf16 MFU | 1623460 tok/s +step 10119/19560 | loss 3.443975 (+0.91z)| norm 0.2874 (+0.25z)| lr 3.01e-04 | 322.46 ms | 52.3% bf16 MFU | 1623581 tok/s +step 10120/19560 | loss 3.449111 (+1.03z)| norm 0.2775 (-0.24z)| lr 3.01e-04 | 323.12 ms | 52.2% bf16 MFU | 1623530 tok/s +step 10121/19560 | loss 3.371733 (-1.02z)| norm 0.2899 (+0.36z)| lr 3.00e-04 | 322.82 ms | 52.3% bf16 MFU | 1623557 tok/s +step 10122/19560 | loss 3.336457 (-1.92z)| norm 0.2802 (-0.12z)| lr 3.00e-04 | 322.24 ms | 52.4% bf16 MFU | 1623730 tok/s +step 10123/19560 | loss 3.341967 (-1.75z)| norm 0.2729 (-0.48z)| lr 3.00e-04 | 322.53 ms | 52.3% bf16 MFU | 1623821 tok/s +step 10124/19560 | loss 3.464136 (+1.41z)| norm 0.2843 (+0.08z)| lr 3.00e-04 | 322.74 ms | 52.3% bf16 MFU | 1623854 tok/s +step 10125/19560 | loss 3.364014 (-1.16z)| norm 0.2546 (-1.37z)| lr 3.00e-04 | 322.66 ms | 52.3% bf16 MFU | 1623907 tok/s +step 10126/19560 | loss 3.402504 (-0.18z)| norm 0.2792 (-0.17z)| lr 3.00e-04 | 322.87 ms | 52.3% bf16 MFU | 1623903 tok/s +step 10127/19560 | loss 3.395495 (-0.36z)| norm 0.2711 (-0.56z)| lr 3.00e-04 | 322.16 ms | 52.4% bf16 MFU | 1624077 tok/s +step 10128/19560 | loss 3.415736 (+0.16z)| norm 0.2793 (-0.16z)| lr 3.00e-04 | 322.83 ms | 52.3% bf16 MFU | 1624076 tok/s +step 10129/19560 | loss 3.366895 (-1.09z)| norm 0.2720 (-0.51z)| lr 3.00e-04 | 322.61 ms | 52.3% bf16 MFU | 1624131 tok/s +step 10130/19560 | loss 3.357682 (-1.31z)| norm 0.2836 (+0.07z)| lr 3.00e-04 | 322.39 ms | 52.3% bf16 MFU | 1624236 tok/s +step 10131/19560 | loss 3.371634 (-0.95z)| norm 0.2471 (-1.72z)| lr 3.00e-04 | 322.87 ms | 52.3% bf16 MFU | 1624216 tok/s +step 10132/19560 | loss 3.408292 (-0.00z)| norm 0.2620 (-0.97z)| lr 3.00e-04 | 322.37 ms | 52.4% bf16 MFU | 1624323 tok/s +step 10133/19560 | loss 3.432796 (+0.64z)| norm 0.2645 (-0.84z)| lr 3.00e-04 | 322.67 ms | 52.3% bf16 MFU | 1624348 tok/s +step 10134/19560 | loss 3.340226 (-1.73z)| norm 0.2865 (+0.25z)| lr 3.00e-04 | 322.61 ms | 52.3% bf16 MFU | 1624389 tok/s +step 10135/19560 | loss 3.395983 (-0.30z)| norm 0.2781 (-0.16z)| lr 3.00e-04 | 322.37 ms | 52.4% bf16 MFU | 1624489 tok/s +step 10136/19560 | loss 3.356447 (-1.30z)| norm 0.2684 (-0.63z)| lr 3.00e-04 | 322.32 ms | 52.4% bf16 MFU | 1624594 tok/s +step 10137/19560 | loss 3.343072 (-1.61z)| norm 0.2778 (-0.16z)| lr 3.00e-04 | 322.91 ms | 52.3% bf16 MFU | 1624547 tok/s +step 10138/19560 | loss 3.467633 (+1.52z)| norm 0.2844 (+0.16z)| lr 3.00e-04 | 322.49 ms | 52.3% bf16 MFU | 1624607 tok/s +step 10139/19560 | loss 3.443596 (+0.91z)| norm 0.2704 (-0.53z)| lr 3.00e-04 | 322.67 ms | 52.3% bf16 MFU | 1624618 tok/s +step 10140/19560 | loss 3.370003 (-0.92z)| norm 0.2696 (-0.56z)| lr 3.00e-04 | 323.12 ms | 52.2% bf16 MFU | 1624516 tok/s +step 10141/19560 | loss 3.394017 (-0.32z)| norm 0.2754 (-0.28z)| lr 3.00e-04 | 322.53 ms | 52.3% bf16 MFU | 1624568 tok/s +step 10142/19560 | loss 3.391545 (-0.38z)| norm 0.3039 (+1.13z)| lr 2.99e-04 | 323.02 ms | 52.2% bf16 MFU | 1624492 tok/s +step 10143/19560 | loss 3.355554 (-1.28z)| norm 0.2654 (-0.79z)| lr 2.99e-04 | 322.61 ms | 52.3% bf16 MFU | 1624525 tok/s +step 10144/19560 | loss 3.492547 (+2.11z)| norm 0.2871 (+0.29z)| lr 2.99e-04 | 323.10 ms | 52.2% bf16 MFU | 1624433 tok/s +step 10145/19560 | loss 3.346537 (-1.47z)| norm 0.2673 (-0.70z)| lr 2.99e-04 | 322.72 ms | 52.3% bf16 MFU | 1624442 tok/s +step 10146/19560 | loss 3.435891 (+0.71z)| norm 0.2580 (-1.16z)| lr 2.99e-04 | 322.71 ms | 52.3% bf16 MFU | 1624453 tok/s +step 10147/19560 | loss 3.404161 (-0.06z)| norm 0.2744 (-0.33z)| lr 2.99e-04 | 322.47 ms | 52.3% bf16 MFU | 1624522 tok/s +step 10148/19560 | loss 3.471923 (+1.58z)| norm 0.2660 (-0.74z)| lr 2.99e-04 | 322.61 ms | 52.3% bf16 MFU | 1624554 tok/s +step 10149/19560 | loss 3.408821 (+0.05z)| norm 0.2730 (-0.39z)| lr 2.99e-04 | 323.27 ms | 52.2% bf16 MFU | 1624417 tok/s +step 10150/19560 | loss 3.449536 (+1.04z)| norm 0.2678 (-0.65z)| lr 2.99e-04 | 322.62 ms | 52.3% bf16 MFU | 1624452 tok/s +step 10151/19560 | loss 3.421459 (+0.42z)| norm 0.2720 (-0.43z)| lr 2.99e-04 | 323.72 ms | 52.1% bf16 MFU | 1624208 tok/s +step 10152/19560 | loss 3.417914 (+0.32z)| norm 0.2747 (-0.28z)| lr 2.99e-04 | 322.30 ms | 52.4% bf16 MFU | 1624334 tok/s +step 10153/19560 | loss 3.386708 (-0.51z)| norm 0.2748 (-0.29z)| lr 2.99e-04 | 322.39 ms | 52.3% bf16 MFU | 1624429 tok/s +step 10154/19560 | loss 3.413061 (+0.19z)| norm 0.2530 (-1.38z)| lr 2.99e-04 | 323.10 ms | 52.2% bf16 MFU | 1624342 tok/s +step 10155/19560 | loss 3.407744 (+0.06z)| norm 0.2719 (-0.42z)| lr 2.99e-04 | 323.00 ms | 52.3% bf16 MFU | 1624285 tok/s +step 10156/19560 | loss 3.424638 (+0.55z)| norm 0.2476 (-1.63z)| lr 2.99e-04 | 323.00 ms | 52.3% bf16 MFU | 1624230 tok/s +step 10157/19560 | loss 3.404423 (-0.00z)| norm 0.2744 (-0.29z)| lr 2.99e-04 | 322.48 ms | 52.3% bf16 MFU | 1624310 tok/s +step 10158/19560 | loss 3.365352 (-1.08z)| norm 0.2588 (-1.09z)| lr 2.99e-04 | 322.50 ms | 52.3% bf16 MFU | 1624380 tok/s +step 10159/19560 | loss 3.404846 (+0.01z)| norm 0.2782 (-0.11z)| lr 2.99e-04 | 323.28 ms | 52.2% bf16 MFU | 1624250 tok/s +step 10160/19560 | loss 3.393644 (-0.29z)| norm 0.2752 (-0.27z)| lr 2.99e-04 | 322.39 ms | 52.4% bf16 MFU | 1624350 tok/s +step 10161/19560 | loss 3.375583 (-0.80z)| norm 0.2770 (-0.19z)| lr 2.99e-04 | 323.48 ms | 52.2% bf16 MFU | 1624171 tok/s +step 10162/19560 | loss 3.469923 (+1.81z)| norm 0.2537 (-1.41z)| lr 2.98e-04 | 322.66 ms | 52.3% bf16 MFU | 1624207 tok/s +step 10163/19560 | loss 3.364522 (-1.12z)| norm 0.2666 (-0.75z)| lr 2.98e-04 | 322.62 ms | 52.3% bf16 MFU | 1624252 tok/s +step 10164/19560 | loss 3.410353 (+0.15z)| norm 0.2318 (-2.51z)| lr 2.98e-04 | 321.82 ms | 52.4% bf16 MFU | 1624497 tok/s +step 10165/19560 | loss 3.400705 (-0.12z)| norm 0.2707 (-0.52z)| lr 2.98e-04 | 323.92 ms | 52.1% bf16 MFU | 1624201 tok/s +step 10166/19560 | loss 3.423496 (+0.51z)| norm 0.2766 (-0.22z)| lr 2.98e-04 | 322.84 ms | 52.3% bf16 MFU | 1624190 tok/s +step 10167/19560 | loss 3.384475 (-0.57z)| norm 0.2679 (-0.67z)| lr 2.98e-04 | 322.40 ms | 52.3% bf16 MFU | 1624290 tok/s +step 10168/19560 | loss 3.403498 (-0.04z)| norm 0.2882 (+0.38z)| lr 2.98e-04 | 322.77 ms | 52.3% bf16 MFU | 1624294 tok/s +step 10169/19560 | loss 3.441831 (+1.01z)| norm 0.2905 (+0.49z)| lr 2.98e-04 | 322.75 ms | 52.3% bf16 MFU | 1624301 tok/s +step 10170/19560 | loss 3.422220 (+0.46z)| norm 0.2995 (+0.95z)| lr 2.98e-04 | 323.10 ms | 52.2% bf16 MFU | 1624219 tok/s +step 10171/19560 | loss 3.366889 (-1.05z)| norm 0.2928 (+0.59z)| lr 2.98e-04 | 322.44 ms | 52.3% bf16 MFU | 1624309 tok/s +step 10172/19560 | loss 3.350686 (-1.48z)| norm 0.2901 (+0.45z)| lr 2.98e-04 | 323.25 ms | 52.2% bf16 MFU | 1624189 tok/s +step 10173/19560 | loss 3.329122 (-2.02z)| norm 0.2792 (-0.11z)| lr 2.98e-04 | 323.06 ms | 52.2% bf16 MFU | 1624123 tok/s +step 10174/19560 | loss 3.446707 (+1.15z)| norm 0.2837 (+0.11z)| lr 2.98e-04 | 322.20 ms | 52.4% bf16 MFU | 1624278 tok/s +step 10175/19560 | loss 3.386755 (-0.47z)| norm 0.2623 (-1.00z)| lr 2.98e-04 | 322.67 ms | 52.3% bf16 MFU | 1624308 tok/s +step 10176/19560 | loss 3.432918 (+0.80z)| norm 0.2907 (+0.50z)| lr 2.98e-04 | 323.09 ms | 52.2% bf16 MFU | 1624229 tok/s +step 10177/19560 | loss 3.406979 (+0.08z)| norm 0.2642 (-0.89z)| lr 2.98e-04 | 323.05 ms | 52.2% bf16 MFU | 1624165 tok/s +step 10178/19560 | loss 3.394552 (-0.25z)| norm 0.2662 (-0.78z)| lr 2.98e-04 | 322.56 ms | 52.3% bf16 MFU | 1624226 tok/s +step 10179/19560 | loss 3.350427 (-1.44z)| norm 0.2804 (-0.03z)| lr 2.98e-04 | 322.68 ms | 52.3% bf16 MFU | 1624254 tok/s +step 10180/19560 | loss 3.476495 (+1.94z)| norm 0.2619 (-0.99z)| lr 2.98e-04 | 322.38 ms | 52.4% bf16 MFU | 1624357 tok/s +step 10181/19560 | loss 3.338516 (-1.72z)| norm 0.2847 (+0.20z)| lr 2.98e-04 | 322.82 ms | 52.3% bf16 MFU | 1624344 tok/s +step 10182/19560 | loss 3.345769 (-1.50z)| norm 0.2797 (-0.05z)| lr 2.97e-04 | 322.81 ms | 52.3% bf16 MFU | 1624334 tok/s +step 10183/19560 | loss 3.429076 (+0.68z)| norm 0.3119 (+1.60z)| lr 2.97e-04 | 322.34 ms | 52.4% bf16 MFU | 1624442 tok/s +step 10184/19560 | loss 3.439487 (+0.94z)| norm 0.2853 (+0.21z)| lr 2.97e-04 | 322.72 ms | 52.3% bf16 MFU | 1624449 tok/s +step 10185/19560 | loss 3.452711 (+1.27z)| norm 0.2759 (-0.28z)| lr 2.97e-04 | 323.22 ms | 52.2% bf16 MFU | 1624330 tok/s +step 10186/19560 | loss 3.512410 (+2.74z)| norm 0.2755 (-0.29z)| lr 2.97e-04 | 322.74 ms | 52.3% bf16 MFU | 1624337 tok/s +step 10187/19560 | loss 3.467059 (+1.56z)| norm 0.2897 (+0.46z)| lr 2.97e-04 | 322.91 ms | 52.3% bf16 MFU | 1624303 tok/s +step 10188/19560 | loss 3.368812 (-0.91z)| norm 0.2679 (-0.68z)| lr 2.97e-04 | 322.39 ms | 52.4% bf16 MFU | 1624401 tok/s +step 10189/19560 | loss 3.383230 (-0.53z)| norm 0.2950 (+0.78z)| lr 2.97e-04 | 322.56 ms | 52.3% bf16 MFU | 1624451 tok/s +step 10190/19560 | loss 3.333957 (-1.75z)| norm 0.2693 (-0.61z)| lr 2.97e-04 | 323.21 ms | 52.2% bf16 MFU | 1624334 tok/s +step 10191/19560 | loss 3.556389 (+3.63z)| norm 0.2750 (-0.30z)| lr 2.97e-04 | 322.91 ms | 52.3% bf16 MFU | 1624298 tok/s +step 10192/19560 | loss 3.401381 (-0.08z)| norm 0.2665 (-0.75z)| lr 2.97e-04 | 322.47 ms | 52.3% bf16 MFU | 1624376 tok/s +step 10193/19560 | loss 3.352057 (-1.25z)| norm 0.2891 (+0.48z)| lr 2.97e-04 | 323.01 ms | 52.2% bf16 MFU | 1624314 tok/s +step 10194/19560 | loss 3.382709 (-0.52z)| norm 0.2817 (+0.07z)| lr 2.97e-04 | 322.91 ms | 52.3% bf16 MFU | 1624280 tok/s +step 10195/19560 | loss 3.402284 (-0.06z)| norm 0.3100 (+1.69z)| lr 2.97e-04 | 322.34 ms | 52.4% bf16 MFU | 1624393 tok/s +step 10196/19560 | loss 3.345845 (-1.40z)| norm 0.2859 (+0.32z)| lr 2.97e-04 | 322.45 ms | 52.3% bf16 MFU | 1624471 tok/s +step 10197/19560 | loss 3.368101 (-0.87z)| norm 0.2908 (+0.61z)| lr 2.97e-04 | 323.64 ms | 52.1% bf16 MFU | 1624245 tok/s +step 10198/19560 | loss 3.423469 (+0.45z)| norm 0.2597 (-1.17z)| lr 2.97e-04 | 322.57 ms | 52.3% bf16 MFU | 1624300 tok/s +step 10199/19560 | loss 3.382286 (-0.54z)| norm 0.2863 (+0.38z)| lr 2.97e-04 | 322.86 ms | 52.3% bf16 MFU | 1624279 tok/s +step 10200/19560 | loss 3.361192 (-1.03z)| norm 0.2673 (-0.72z)| lr 2.97e-04 | 322.95 ms | 52.3% bf16 MFU | 1624237 tok/s +step 10201/19560 | loss 3.381669 (-0.55z)| norm 0.2819 (+0.15z)| lr 2.97e-04 | 322.69 ms | 52.3% bf16 MFU | 1624261 tok/s +step 10202/19560 | loss 3.386502 (-0.43z)| norm 0.2964 (+1.03z)| lr 2.96e-04 | 322.69 ms | 52.3% bf16 MFU | 1624285 tok/s +step 10203/19560 | loss 3.401790 (-0.06z)| norm 0.2767 (-0.16z)| lr 2.96e-04 | 323.12 ms | 52.2% bf16 MFU | 1624199 tok/s +step 10204/19560 | loss 3.440336 (+0.85z)| norm 0.2866 (+0.44z)| lr 2.96e-04 | 322.83 ms | 52.3% bf16 MFU | 1624191 tok/s +step 10205/19560 | loss 3.372369 (-0.75z)| norm 0.2873 (+0.48z)| lr 2.96e-04 | 323.24 ms | 52.2% bf16 MFU | 1624080 tok/s +step 10206/19560 | loss 3.403750 (+0.00z)| norm 0.3538 (+4.14z)| lr 2.96e-04 | 322.39 ms | 52.4% bf16 MFU | 1624188 tok/s +step 10207/19560 | loss 3.411956 (+0.21z)| norm 0.3621 (+4.26z)| lr 2.96e-04 | 322.55 ms | 52.3% bf16 MFU | 1624250 tok/s +step 10208/19560 | loss 3.418391 (+0.36z)| norm 0.2967 (+0.83z)| lr 2.96e-04 | 323.58 ms | 52.2% bf16 MFU | 1624052 tok/s +step 10209/19560 | loss 3.371027 (-0.78z)| norm 0.2754 (-0.28z)| lr 2.96e-04 | 322.42 ms | 52.3% bf16 MFU | 1624154 tok/s +step 10210/19560 | loss 3.410616 (+0.17z)| norm 0.2802 (-0.04z)| lr 2.96e-04 | 322.90 ms | 52.3% bf16 MFU | 1624130 tok/s +step 10211/19560 | loss 3.330790 (-1.72z)| norm 0.2757 (-0.29z)| lr 2.96e-04 | 323.16 ms | 52.2% bf16 MFU | 1624043 tok/s +step 10212/19560 | loss 3.367306 (-0.85z)| norm 0.2772 (-0.20z)| lr 2.96e-04 | 322.35 ms | 52.4% bf16 MFU | 1624163 tok/s +step 10213/19560 | loss 3.421704 (+0.43z)| norm 0.2501 (-1.62z)| lr 2.96e-04 | 322.83 ms | 52.3% bf16 MFU | 1624156 tok/s +step 10214/19560 | loss 3.348931 (-1.27z)| norm 0.2821 (+0.06z)| lr 2.96e-04 | 322.62 ms | 52.3% bf16 MFU | 1624203 tok/s +step 10215/19560 | loss 3.397424 (-0.12z)| norm 0.2660 (-0.78z)| lr 2.96e-04 | 323.33 ms | 52.2% bf16 MFU | 1624070 tok/s +step 10216/19560 | loss 3.403931 (+0.03z)| norm 0.2796 (-0.07z)| lr 2.96e-04 | 322.18 ms | 52.4% bf16 MFU | 1624234 tok/s +step 10217/19560 | loss 3.400023 (-0.06z)| norm 0.2667 (-0.74z)| lr 2.96e-04 | 323.46 ms | 52.2% bf16 MFU | 1624065 tok/s +step 10218/19560 | loss 3.371965 (-0.73z)| norm 0.2595 (-1.11z)| lr 2.96e-04 | 323.45 ms | 52.2% bf16 MFU | 1623908 tok/s +step 10219/19560 | loss 3.379685 (-0.54z)| norm 0.2829 (+0.12z)| lr 2.96e-04 | 322.35 ms | 52.4% bf16 MFU | 1624036 tok/s +step 10220/19560 | loss 3.367248 (-0.84z)| norm 0.2580 (-1.16z)| lr 2.96e-04 | 322.76 ms | 52.3% bf16 MFU | 1624054 tok/s +step 10221/19560 | loss 3.407696 (+0.13z)| norm 0.2825 (+0.11z)| lr 2.96e-04 | 322.96 ms | 52.3% bf16 MFU | 1624022 tok/s +step 10222/19560 | loss 3.396454 (-0.14z)| norm 0.2677 (-0.65z)| lr 2.95e-04 | 322.71 ms | 52.3% bf16 MFU | 1624053 tok/s +step 10223/19560 | loss 3.376299 (-0.62z)| norm 0.2538 (-1.35z)| lr 2.95e-04 | 322.76 ms | 52.3% bf16 MFU | 1624070 tok/s +step 10224/19560 | loss 3.418604 (+0.38z)| norm 0.2669 (-0.66z)| lr 2.95e-04 | 324.21 ms | 52.1% bf16 MFU | 1623722 tok/s +step 10225/19560 | loss 3.403007 (+0.01z)| norm 0.2884 (+0.49z)| lr 2.95e-04 | 322.13 ms | 52.4% bf16 MFU | 1623915 tok/s +step 10226/19560 | loss 3.403640 (+0.02z)| norm 0.2784 (-0.03z)| lr 2.95e-04 | 322.81 ms | 52.3% bf16 MFU | 1623927 tok/s +step 10227/19560 | loss 3.412499 (+0.23z)| norm 0.2753 (-0.19z)| lr 2.95e-04 | 323.12 ms | 52.2% bf16 MFU | 1623860 tok/s +step 10228/19560 | loss 3.423842 (+0.50z)| norm 0.2777 (-0.05z)| lr 2.95e-04 | 323.13 ms | 52.2% bf16 MFU | 1623794 tok/s +step 10229/19560 | loss 3.432160 (+0.70z)| norm 0.2664 (-0.68z)| lr 2.95e-04 | 322.46 ms | 52.3% bf16 MFU | 1623900 tok/s +step 10230/19560 | loss 3.408857 (+0.14z)| norm 0.2733 (-0.28z)| lr 2.95e-04 | 322.74 ms | 52.3% bf16 MFU | 1623928 tok/s +step 10231/19560 | loss 3.416287 (+0.31z)| norm 0.2675 (-0.60z)| lr 2.95e-04 | 323.30 ms | 52.2% bf16 MFU | 1623816 tok/s +step 10232/19560 | loss 3.433891 (+0.74z)| norm 0.2583 (-1.13z)| lr 2.95e-04 | 323.29 ms | 52.2% bf16 MFU | 1623712 tok/s +step 10233/19560 | loss 3.438338 (+0.83z)| norm 0.2664 (-0.67z)| lr 2.95e-04 | 323.18 ms | 52.2% bf16 MFU | 1623640 tok/s +step 10234/19560 | loss 3.425485 (+0.53z)| norm 0.2528 (-1.47z)| lr 2.95e-04 | 322.55 ms | 52.3% bf16 MFU | 1623731 tok/s +step 10235/19560 | loss 3.460366 (+1.36z)| norm 0.2622 (-0.90z)| lr 2.95e-04 | 323.19 ms | 52.2% bf16 MFU | 1623656 tok/s +step 10236/19560 | loss 3.436856 (+0.80z)| norm 0.2610 (-0.96z)| lr 2.95e-04 | 322.68 ms | 52.3% bf16 MFU | 1623713 tok/s +step 10237/19560 | loss 3.513681 (+2.57z)| norm 0.2893 (+0.75z)| lr 2.95e-04 | 322.46 ms | 52.3% bf16 MFU | 1623822 tok/s +step 10238/19560 | loss 3.415410 (+0.27z)| norm 0.2579 (-1.13z)| lr 2.95e-04 | 323.19 ms | 52.2% bf16 MFU | 1623741 tok/s +step 10239/19560 | loss 3.400225 (-0.08z)| norm 0.3091 (+1.91z)| lr 2.95e-04 | 322.98 ms | 52.3% bf16 MFU | 1623718 tok/s +step 10240/19560 | loss 3.402577 (+0.00z)| norm 0.2681 (-0.51z)| lr 2.95e-04 | 323.29 ms | 52.2% bf16 MFU | 1623618 tok/s +step 10241/19560 | loss 3.352726 (-1.23z)| norm 0.2844 (+0.46z)| lr 2.95e-04 | 322.93 ms | 52.3% bf16 MFU | 1623614 tok/s +step 10242/19560 | loss 3.541848 (+3.30z)| norm 0.2753 (-0.08z)| lr 2.94e-04 | 323.26 ms | 52.2% bf16 MFU | 1623527 tok/s +step 10243/19560 | loss 3.385368 (-0.42z)| norm 0.2681 (-0.51z)| lr 2.94e-04 | 323.27 ms | 52.2% bf16 MFU | 1623441 tok/s +step 10244/19560 | loss 3.367690 (-0.83z)| norm 0.2781 (+0.10z)| lr 2.94e-04 | 323.10 ms | 52.2% bf16 MFU | 1623403 tok/s +step 10245/19560 | loss 3.394134 (-0.20z)| norm 0.2792 (+0.16z)| lr 2.94e-04 | 323.23 ms | 52.2% bf16 MFU | 1623335 tok/s +step 10246/19560 | loss 3.359515 (-1.02z)| norm 0.2716 (-0.29z)| lr 2.94e-04 | 322.60 ms | 52.3% bf16 MFU | 1623428 tok/s +step 10247/19560 | loss 3.410253 (+0.19z)| norm 0.2696 (-0.40z)| lr 2.94e-04 | 322.72 ms | 52.3% bf16 MFU | 1623485 tok/s +step 10248/19560 | loss 3.382840 (-0.45z)| norm 0.2829 (+0.40z)| lr 2.94e-04 | 323.23 ms | 52.2% bf16 MFU | 1623412 tok/s +step 10249/19560 | loss 3.399061 (-0.07z)| norm 0.2847 (+0.51z)| lr 2.94e-04 | 323.08 ms | 52.2% bf16 MFU | 1623382 tok/s +step 10250/19560 | loss 3.361068 (-0.99z)| norm 0.3005 (+1.44z)| lr 2.94e-04 | 323.04 ms | 52.2% bf16 MFU | 1623362 tok/s +val loss 3.391018 +evaluating HellaSwag: 0/79 evaluating HellaSwag: 10/79 evaluating HellaSwag: 20/79 evaluating HellaSwag: 30/79 evaluating HellaSwag: 40/79 evaluating HellaSwag: 50/79 evaluating HellaSwag: 60/79 evaluating HellaSwag: 70/79 HellaSwag: 2933/10042 = 0.292073 +step 10251/19560 | loss 3.392031 (-0.26z)| norm 0.3107 (+2.00z)| lr 2.94e-04 | 322.85 ms | 52.3% bf16 MFU | 1623392 tok/s +step 10252/19560 | loss 3.388530 (-0.33z)| norm 0.3287 (+2.94z)| lr 2.94e-04 | 322.70 ms | 52.3% bf16 MFU | 1623458 tok/s +step 10253/19560 | loss 3.535133 (+3.11z)| norm 0.3302 (+2.91z)| lr 2.94e-04 | 323.48 ms | 52.2% bf16 MFU | 1623324 tok/s +step 10254/19560 | loss 3.453739 (+1.17z)| norm 0.3181 (+2.18z)| lr 2.94e-04 | 323.48 ms | 52.2% bf16 MFU | 1623196 tok/s +step 10255/19560 | loss 3.399994 (-0.09z)| norm 0.3077 (+1.58z)| lr 2.94e-04 | 322.99 ms | 52.3% bf16 MFU | 1623198 tok/s +step 10256/19560 | loss 3.424707 (+0.49z)| norm 0.2824 (+0.22z)| lr 2.94e-04 | 322.49 ms | 52.3% bf16 MFU | 1623326 tok/s +step 10257/19560 | loss 3.381806 (-0.52z)| norm 0.3593 (+4.04z)| lr 2.94e-04 | 323.50 ms | 52.2% bf16 MFU | 1623193 tok/s +step 10258/19560 | loss 3.324053 (-1.86z)| norm 0.2804 (+0.08z)| lr 2.94e-04 | 322.41 ms | 52.3% bf16 MFU | 1623340 tok/s +step 10259/19560 | loss 3.406758 (+0.07z)| norm 0.3127 (+1.67z)| lr 2.94e-04 | 322.55 ms | 52.3% bf16 MFU | 1623444 tok/s +step 10260/19560 | loss 3.429786 (+0.60z)| norm 0.2992 (+0.98z)| lr 2.94e-04 | 322.05 ms | 52.4% bf16 MFU | 1623672 tok/s +step 10261/19560 | loss 3.392919 (-0.25z)| norm 0.2983 (+0.92z)| lr 2.94e-04 | 323.13 ms | 52.2% bf16 MFU | 1623615 tok/s +step 10262/19560 | loss 3.466471 (+1.44z)| norm 0.3024 (+1.11z)| lr 2.93e-04 | 323.28 ms | 52.2% bf16 MFU | 1623523 tok/s +step 10263/19560 | loss 3.338270 (-1.53z)| norm 0.2990 (+0.93z)| lr 2.93e-04 | 323.05 ms | 52.2% bf16 MFU | 1623493 tok/s +step 10264/19560 | loss 3.383070 (-0.50z)| norm 0.2754 (-0.24z)| lr 2.93e-04 | 322.49 ms | 52.3% bf16 MFU | 1623607 tok/s +step 10265/19560 | loss 3.351059 (-1.25z)| norm 0.3098 (+1.44z)| lr 2.93e-04 | 323.29 ms | 52.2% bf16 MFU | 1623513 tok/s +step 10266/19560 | loss 3.342610 (-1.42z)| norm 0.2735 (-0.34z)| lr 2.93e-04 | 323.28 ms | 52.2% bf16 MFU | 1623427 tok/s +step 10267/19560 | loss 3.408967 (+0.13z)| norm 0.3107 (+1.46z)| lr 2.93e-04 | 322.82 ms | 52.3% bf16 MFU | 1623460 tok/s +step 10268/19560 | loss 3.417200 (+0.32z)| norm 0.2811 (+0.01z)| lr 2.93e-04 | 323.23 ms | 52.2% bf16 MFU | 1623390 tok/s +step 10269/19560 | loss 3.385556 (-0.42z)| norm 0.3543 (+3.40z)| lr 2.93e-04 | 323.01 ms | 52.3% bf16 MFU | 1623378 tok/s +step 10270/19560 | loss 3.393845 (-0.23z)| norm 0.3200 (+1.78z)| lr 2.93e-04 | 322.95 ms | 52.3% bf16 MFU | 1623381 tok/s +step 10271/19560 | loss 3.323777 (-1.85z)| norm 0.3798 (+4.19z)| lr 2.93e-04 | 322.73 ms | 52.3% bf16 MFU | 1623438 tok/s +step 10272/19560 | loss 3.448701 (+1.07z)| norm 0.3256 (+1.82z)| lr 2.93e-04 | 322.90 ms | 52.3% bf16 MFU | 1623449 tok/s +step 10273/19560 | loss 3.371027 (-0.76z)| norm 0.3336 (+2.10z)| lr 2.93e-04 | 322.57 ms | 52.3% bf16 MFU | 1623543 tok/s +step 10274/19560 | loss 3.405609 (+0.06z)| norm 0.3236 (+1.65z)| lr 2.93e-04 | 322.77 ms | 52.3% bf16 MFU | 1623584 tok/s +step 10275/19560 | loss 3.400434 (-0.06z)| norm 0.2989 (+0.62z)| lr 2.93e-04 | 323.07 ms | 52.2% bf16 MFU | 1623545 tok/s +step 10276/19560 | loss 3.460319 (+1.36z)| norm 0.3689 (+3.34z)| lr 2.93e-04 | 322.79 ms | 52.3% bf16 MFU | 1623579 tok/s +step 10277/19560 | loss 3.452221 (+1.16z)| norm 0.3199 (+1.37z)| lr 2.93e-04 | 322.54 ms | 52.3% bf16 MFU | 1623674 tok/s +step 10278/19560 | loss 3.425927 (+0.54z)| norm 0.3186 (+1.30z)| lr 2.93e-04 | 322.78 ms | 52.3% bf16 MFU | 1623704 tok/s +step 10279/19560 | loss 3.350157 (-1.24z)| norm 0.2995 (+0.54z)| lr 2.93e-04 | 322.74 ms | 52.3% bf16 MFU | 1623744 tok/s +step 10280/19560 | loss 3.414237 (+0.28z)| norm 0.3004 (+0.57z)| lr 2.93e-04 | 322.98 ms | 52.3% bf16 MFU | 1623720 tok/s +step 10281/19560 | loss 3.412311 (+0.23z)| norm 0.3060 (+0.77z)| lr 2.93e-04 | 323.23 ms | 52.2% bf16 MFU | 1623637 tok/s +step 10282/19560 | loss 3.446305 (+1.02z)| norm 0.2822 (-0.17z)| lr 2.92e-04 | 322.70 ms | 52.3% bf16 MFU | 1623690 tok/s +step 10283/19560 | loss 3.420545 (+0.41z)| norm 0.2922 (+0.22z)| lr 2.92e-04 | 322.90 ms | 52.3% bf16 MFU | 1623688 tok/s +step 10284/19560 | loss 3.372627 (-0.71z)| norm 0.2774 (-0.38z)| lr 2.92e-04 | 322.86 ms | 52.3% bf16 MFU | 1623697 tok/s +step 10285/19560 | loss 3.395025 (-0.18z)| norm 0.2926 (+0.22z)| lr 2.92e-04 | 322.90 ms | 52.3% bf16 MFU | 1623698 tok/s +step 10286/19560 | loss 3.477884 (+1.73z)| norm 0.2920 (+0.19z)| lr 2.92e-04 | 322.89 ms | 52.3% bf16 MFU | 1623698 tok/s +step 10287/19560 | loss 3.331223 (-1.65z)| norm 0.2737 (-0.54z)| lr 2.92e-04 | 322.90 ms | 52.3% bf16 MFU | 1623697 tok/s +step 10288/19560 | loss 3.383301 (-0.45z)| norm 0.2820 (-0.21z)| lr 2.92e-04 | 322.94 ms | 52.3% bf16 MFU | 1623685 tok/s +step 10289/19560 | loss 3.328901 (-1.68z)| norm 0.2648 (-0.90z)| lr 2.92e-04 | 322.61 ms | 52.3% bf16 MFU | 1623759 tok/s +step 10290/19560 | loss 3.423574 (+0.49z)| norm 0.2545 (-1.31z)| lr 2.92e-04 | 323.07 ms | 52.2% bf16 MFU | 1623714 tok/s +step 10291/19560 | loss 3.433373 (+0.71z)| norm 0.2625 (-0.99z)| lr 2.92e-04 | 322.63 ms | 52.3% bf16 MFU | 1623780 tok/s +step 10292/19560 | loss 3.420125 (+0.40z)| norm 0.2567 (-1.24z)| lr 2.92e-04 | 323.00 ms | 52.3% bf16 MFU | 1623751 tok/s +step 10293/19560 | loss 3.357872 (-1.02z)| norm 0.2709 (-0.67z)| lr 2.92e-04 | 322.56 ms | 52.3% bf16 MFU | 1623833 tok/s +step 10294/19560 | loss 3.453434 (+1.16z)| norm 0.2678 (-0.79z)| lr 2.92e-04 | 322.63 ms | 52.3% bf16 MFU | 1623894 tok/s +step 10295/19560 | loss 3.351514 (-1.15z)| norm 0.2536 (-1.35z)| lr 2.92e-04 | 323.14 ms | 52.2% bf16 MFU | 1623824 tok/s +step 10296/19560 | loss 3.382116 (-0.45z)| norm 0.2529 (-1.36z)| lr 2.92e-04 | 322.72 ms | 52.3% bf16 MFU | 1623862 tok/s +step 10297/19560 | loss 3.408166 (+0.14z)| norm 0.2478 (-1.54z)| lr 2.92e-04 | 322.72 ms | 52.3% bf16 MFU | 1623898 tok/s +step 10298/19560 | loss 3.419576 (+0.40z)| norm 0.2595 (-1.06z)| lr 2.92e-04 | 323.03 ms | 52.2% bf16 MFU | 1623855 tok/s +step 10299/19560 | loss 3.384824 (-0.39z)| norm 0.2439 (-1.64z)| lr 2.92e-04 | 322.84 ms | 52.3% bf16 MFU | 1623861 tok/s +step 10300/19560 | loss 3.365147 (-0.85z)| norm 0.2605 (-0.98z)| lr 2.92e-04 | 322.62 ms | 52.3% bf16 MFU | 1623922 tok/s +step 10301/19560 | loss 3.427049 (+0.56z)| norm 0.2461 (-1.51z)| lr 2.92e-04 | 322.71 ms | 52.3% bf16 MFU | 1623958 tok/s +step 10302/19560 | loss 3.351948 (-1.16z)| norm 0.2758 (-0.37z)| lr 2.91e-04 | 323.38 ms | 52.2% bf16 MFU | 1623825 tok/s +step 10303/19560 | loss 3.402617 (+0.01z)| norm 0.2555 (-1.15z)| lr 2.91e-04 | 323.12 ms | 52.2% bf16 MFU | 1623764 tok/s +step 10304/19560 | loss 3.343785 (-1.33z)| norm 0.2619 (-0.89z)| lr 2.91e-04 | 322.76 ms | 52.3% bf16 MFU | 1623795 tok/s +step 10305/19560 | loss 3.404347 (+0.06z)| norm 0.2740 (-0.43z)| lr 2.91e-04 | 322.74 ms | 52.3% bf16 MFU | 1623829 tok/s +step 10306/19560 | loss 3.470975 (+1.56z)| norm 0.2622 (-0.88z)| lr 2.91e-04 | 323.47 ms | 52.2% bf16 MFU | 1623679 tok/s +step 10307/19560 | loss 3.511176 (+2.41z)| norm 0.2779 (-0.28z)| lr 2.91e-04 | 323.12 ms | 52.2% bf16 MFU | 1623624 tok/s +step 10308/19560 | loss 3.396020 (-0.15z)| norm 0.2496 (-1.35z)| lr 2.91e-04 | 323.12 ms | 52.2% bf16 MFU | 1623573 tok/s +step 10309/19560 | loss 3.420400 (+0.39z)| norm 0.2815 (-0.13z)| lr 2.91e-04 | 322.50 ms | 52.3% bf16 MFU | 1623679 tok/s +step 10310/19560 | loss 3.422299 (+0.42z)| norm 0.2518 (-1.25z)| lr 2.91e-04 | 322.59 ms | 52.3% bf16 MFU | 1623758 tok/s +step 10311/19560 | loss 3.387854 (-0.36z)| norm 0.3028 (+0.69z)| lr 2.91e-04 | 323.35 ms | 52.2% bf16 MFU | 1623642 tok/s +step 10312/19560 | loss 3.532535 (+2.84z)| norm 0.2527 (-1.20z)| lr 2.91e-04 | 322.99 ms | 52.3% bf16 MFU | 1623621 tok/s +step 10313/19560 | loss 3.453920 (+1.10z)| norm 0.2799 (-0.17z)| lr 2.91e-04 | 323.03 ms | 52.2% bf16 MFU | 1623592 tok/s +step 10314/19560 | loss 3.404608 (+0.02z)| norm 0.2610 (-0.88z)| lr 2.91e-04 | 323.15 ms | 52.2% bf16 MFU | 1623534 tok/s +step 10315/19560 | loss 3.425292 (+0.50z)| norm 0.2869 (+0.10z)| lr 2.91e-04 | 322.82 ms | 52.3% bf16 MFU | 1623563 tok/s +step 10316/19560 | loss 3.378580 (-0.57z)| norm 0.2883 (+0.15z)| lr 2.91e-04 | 322.52 ms | 52.3% bf16 MFU | 1623665 tok/s +step 10317/19560 | loss 3.378691 (-0.56z)| norm 0.2638 (-0.77z)| lr 2.91e-04 | 323.65 ms | 52.1% bf16 MFU | 1623478 tok/s +step 10318/19560 | loss 3.411563 (+0.18z)| norm 0.3367 (+1.94z)| lr 2.91e-04 | 322.80 ms | 52.3% bf16 MFU | 1623513 tok/s +step 10319/19560 | loss 3.579107 (+3.99z)| norm 0.2974 (+0.46z)| lr 2.91e-04 | 322.43 ms | 52.3% bf16 MFU | 1623640 tok/s +step 10320/19560 | loss 3.439759 (+0.80z)| norm 0.2973 (+0.45z)| lr 2.91e-04 | 323.02 ms | 52.2% bf16 MFU | 1623612 tok/s +step 10321/19560 | loss 3.409502 (+0.11z)| norm 0.3157 (+1.13z)| lr 2.91e-04 | 323.53 ms | 52.2% bf16 MFU | 1623457 tok/s +step 10322/19560 | loss 3.512889 (+2.39z)| norm 0.2808 (-0.17z)| lr 2.90e-04 | 322.25 ms | 52.4% bf16 MFU | 1623632 tok/s +step 10323/19560 | loss 3.519967 (+2.47z)| norm 0.3156 (+1.12z)| lr 2.90e-04 | 322.73 ms | 52.3% bf16 MFU | 1623676 tok/s +step 10324/19560 | loss 3.460085 (+1.14z)| norm 0.2943 (+0.33z)| lr 2.90e-04 | 323.12 ms | 52.2% bf16 MFU | 1623620 tok/s +step 10325/19560 | loss 3.383724 (-0.53z)| norm 0.2958 (+0.38z)| lr 2.90e-04 | 322.59 ms | 52.3% bf16 MFU | 1623702 tok/s +step 10326/19560 | loss 3.459568 (+1.12z)| norm 0.2996 (+0.51z)| lr 2.90e-04 | 323.62 ms | 52.2% bf16 MFU | 1623520 tok/s +step 10327/19560 | loss 3.383298 (-0.54z)| norm 0.3276 (+1.53z)| lr 2.90e-04 | 323.00 ms | 52.3% bf16 MFU | 1623503 tok/s +step 10328/19560 | loss 3.325290 (-1.78z)| norm 0.3196 (+1.21z)| lr 2.90e-04 | 322.97 ms | 52.3% bf16 MFU | 1623496 tok/s +step 10329/19560 | loss 3.509674 (+2.14z)| norm 0.3150 (+1.03z)| lr 2.90e-04 | 322.52 ms | 52.3% bf16 MFU | 1623600 tok/s +step 10330/19560 | loss 3.437422 (+0.60z)| norm 0.3290 (+1.52z)| lr 2.90e-04 | 322.59 ms | 52.3% bf16 MFU | 1623682 tok/s +step 10331/19560 | loss 3.369223 (-0.84z)| norm 0.3117 (+0.88z)| lr 2.90e-04 | 322.76 ms | 52.3% bf16 MFU | 1623718 tok/s +step 10332/19560 | loss 3.349545 (-1.24z)| norm 0.2805 (-0.24z)| lr 2.90e-04 | 323.31 ms | 52.2% bf16 MFU | 1623612 tok/s +step 10333/19560 | loss 3.398796 (-0.20z)| norm 0.2886 (+0.05z)| lr 2.90e-04 | 322.19 ms | 52.4% bf16 MFU | 1623796 tok/s +step 10334/19560 | loss 3.420596 (+0.25z)| norm 0.2954 (+0.31z)| lr 2.90e-04 | 322.76 ms | 52.3% bf16 MFU | 1623826 tok/s +step 10335/19560 | loss 3.376471 (-0.67z)| norm 0.3391 (+1.97z)| lr 2.90e-04 | 322.75 ms | 52.3% bf16 MFU | 1623857 tok/s +step 10336/19560 | loss 3.376290 (-0.67z)| norm 0.2627 (-0.89z)| lr 2.90e-04 | 322.75 ms | 52.3% bf16 MFU | 1623888 tok/s +step 10337/19560 | loss 3.410424 (+0.04z)| norm 0.2803 (-0.23z)| lr 2.90e-04 | 323.01 ms | 52.2% bf16 MFU | 1623848 tok/s +step 10338/19560 | loss 3.381178 (-0.57z)| norm 0.2938 (+0.27z)| lr 2.90e-04 | 322.93 ms | 52.3% bf16 MFU | 1623834 tok/s +step 10339/19560 | loss 3.437498 (+0.61z)| norm 0.2798 (-0.25z)| lr 2.90e-04 | 323.16 ms | 52.2% bf16 MFU | 1623762 tok/s +step 10340/19560 | loss 3.420348 (+0.24z)| norm 0.3010 (+0.53z)| lr 2.90e-04 | 322.56 ms | 52.3% bf16 MFU | 1623844 tok/s +step 10341/19560 | loss 3.387137 (-0.47z)| norm 0.2448 (-1.57z)| lr 2.90e-04 | 322.72 ms | 52.3% bf16 MFU | 1623881 tok/s +step 10342/19560 | loss 3.375831 (-0.72z)| norm 0.3215 (+1.28z)| lr 2.89e-04 | 322.71 ms | 52.3% bf16 MFU | 1623918 tok/s +step 10343/19560 | loss 3.397388 (-0.25z)| norm 0.2816 (-0.21z)| lr 2.89e-04 | 322.95 ms | 52.3% bf16 MFU | 1623893 tok/s +step 10344/19560 | loss 3.391769 (-0.37z)| norm 0.2754 (-0.44z)| lr 2.89e-04 | 322.28 ms | 52.4% bf16 MFU | 1624039 tok/s +step 10345/19560 | loss 3.383533 (-0.55z)| norm 0.2889 (+0.06z)| lr 2.89e-04 | 322.69 ms | 52.3% bf16 MFU | 1624073 tok/s +step 10346/19560 | loss 3.332181 (-1.62z)| norm 0.2534 (-1.26z)| lr 2.89e-04 | 323.16 ms | 52.2% bf16 MFU | 1623987 tok/s +step 10347/19560 | loss 3.413492 (+0.10z)| norm 0.2869 (-0.01z)| lr 2.89e-04 | 322.46 ms | 52.3% bf16 MFU | 1624082 tok/s +step 10348/19560 | loss 3.370379 (-0.82z)| norm 0.2846 (-0.11z)| lr 2.89e-04 | 322.91 ms | 52.3% bf16 MFU | 1624058 tok/s +step 10349/19560 | loss 3.343875 (-1.36z)| norm 0.2885 (+0.04z)| lr 2.89e-04 | 322.70 ms | 52.3% bf16 MFU | 1624091 tok/s +step 10350/19560 | loss 3.429615 (+0.44z)| norm 0.2699 (-0.66z)| lr 2.89e-04 | 322.52 ms | 52.3% bf16 MFU | 1624166 tok/s +step 10351/19560 | loss 3.381390 (-0.58z)| norm 0.3030 (+0.56z)| lr 2.89e-04 | 322.66 ms | 52.3% bf16 MFU | 1624203 tok/s +step 10352/19560 | loss 3.326123 (-1.71z)| norm 0.2647 (-0.87z)| lr 2.89e-04 | 322.55 ms | 52.3% bf16 MFU | 1624264 tok/s +step 10353/19560 | loss 3.393462 (-0.30z)| norm 0.2809 (-0.26z)| lr 2.89e-04 | 322.73 ms | 52.3% bf16 MFU | 1624278 tok/s +step 10354/19560 | loss 3.401007 (-0.15z)| norm 0.2795 (-0.31z)| lr 2.89e-04 | 322.38 ms | 52.4% bf16 MFU | 1624379 tok/s +step 10355/19560 | loss 3.376009 (-0.66z)| norm 0.2827 (-0.19z)| lr 2.89e-04 | 322.29 ms | 52.4% bf16 MFU | 1624498 tok/s +step 10356/19560 | loss 3.374834 (-0.68z)| norm 0.2722 (-0.59z)| lr 2.89e-04 | 322.66 ms | 52.3% bf16 MFU | 1624518 tok/s +step 10357/19560 | loss 3.435853 (+0.59z)| norm 0.2917 (+0.14z)| lr 2.89e-04 | 322.97 ms | 52.3% bf16 MFU | 1624458 tok/s +step 10358/19560 | loss 3.462766 (+1.14z)| norm 0.2655 (-0.85z)| lr 2.89e-04 | 322.67 ms | 52.3% bf16 MFU | 1624478 tok/s +step 10359/19560 | loss 3.363037 (-0.91z)| norm 0.2807 (-0.28z)| lr 2.89e-04 | 322.40 ms | 52.3% bf16 MFU | 1624565 tok/s +step 10360/19560 | loss 3.346876 (-1.23z)| norm 0.2609 (-1.03z)| lr 2.89e-04 | 322.86 ms | 52.3% bf16 MFU | 1624530 tok/s +step 10361/19560 | loss 3.395375 (-0.23z)| norm 0.2697 (-0.69z)| lr 2.89e-04 | 323.02 ms | 52.2% bf16 MFU | 1624456 tok/s +step 10362/19560 | loss 3.409004 (+0.06z)| norm 0.2953 (+0.26z)| lr 2.88e-04 | 321.72 ms | 52.5% bf16 MFU | 1624716 tok/s +step 10363/19560 | loss 3.305111 (-2.03z)| norm 0.2956 (+0.26z)| lr 2.88e-04 | 322.98 ms | 52.3% bf16 MFU | 1624644 tok/s +step 10364/19560 | loss 3.360792 (-0.88z)| norm 0.2778 (-0.43z)| lr 2.88e-04 | 322.91 ms | 52.3% bf16 MFU | 1624594 tok/s +step 10365/19560 | loss 3.421930 (+0.38z)| norm 0.3005 (+0.44z)| lr 2.88e-04 | 322.59 ms | 52.3% bf16 MFU | 1624627 tok/s +step 10366/19560 | loss 3.415609 (+0.25z)| norm 0.2658 (-0.89z)| lr 2.88e-04 | 322.76 ms | 52.3% bf16 MFU | 1624615 tok/s +step 10367/19560 | loss 3.389968 (-0.28z)| norm 0.2797 (-0.35z)| lr 2.88e-04 | 322.52 ms | 52.3% bf16 MFU | 1624666 tok/s +step 10368/19560 | loss 3.421594 (+0.37z)| norm 0.2594 (-1.12z)| lr 2.88e-04 | 322.99 ms | 52.3% bf16 MFU | 1624594 tok/s +step 10369/19560 | loss 3.323472 (-1.64z)| norm 0.2624 (-1.00z)| lr 2.88e-04 | 323.07 ms | 52.2% bf16 MFU | 1624505 tok/s +step 10370/19560 | loss 3.390385 (-0.25z)| norm 0.2701 (-0.70z)| lr 2.88e-04 | 322.77 ms | 52.3% bf16 MFU | 1624496 tok/s +step 10371/19560 | loss 3.335990 (-1.39z)| norm 0.2608 (-1.05z)| lr 2.88e-04 | 322.33 ms | 52.4% bf16 MFU | 1624600 tok/s +step 10372/19560 | loss 3.352979 (-1.03z)| norm 0.2808 (-0.29z)| lr 2.88e-04 | 323.09 ms | 52.2% bf16 MFU | 1624506 tok/s +step 10373/19560 | loss 3.449191 (+0.98z)| norm 0.2916 (+0.11z)| lr 2.88e-04 | 323.26 ms | 52.2% bf16 MFU | 1624374 tok/s +step 10374/19560 | loss 3.386197 (-0.34z)| norm 0.2927 (+0.15z)| lr 2.88e-04 | 322.47 ms | 52.3% bf16 MFU | 1624447 tok/s +step 10375/19560 | loss 3.409414 (+0.15z)| norm 0.2628 (-0.98z)| lr 2.88e-04 | 322.92 ms | 52.3% bf16 MFU | 1624403 tok/s +step 10376/19560 | loss 3.420921 (+0.38z)| norm 0.2738 (-0.56z)| lr 2.88e-04 | 322.60 ms | 52.3% bf16 MFU | 1624441 tok/s +step 10377/19560 | loss 3.351826 (-1.06z)| norm 0.2722 (-0.62z)| lr 2.88e-04 | 322.82 ms | 52.3% bf16 MFU | 1624424 tok/s +step 10378/19560 | loss 3.383608 (-0.40z)| norm 0.2567 (-1.19z)| lr 2.88e-04 | 322.58 ms | 52.3% bf16 MFU | 1624467 tok/s +step 10379/19560 | loss 3.370326 (-0.67z)| norm 0.2521 (-1.34z)| lr 2.88e-04 | 323.13 ms | 52.2% bf16 MFU | 1624369 tok/s +step 10380/19560 | loss 3.405961 (+0.07z)| norm 0.2622 (-0.95z)| lr 2.88e-04 | 322.60 ms | 52.3% bf16 MFU | 1624412 tok/s +step 10381/19560 | loss 3.420540 (+0.41z)| norm 0.2632 (-0.89z)| lr 2.88e-04 | 322.67 ms | 52.3% bf16 MFU | 1624434 tok/s +step 10382/19560 | loss 3.418366 (+0.37z)| norm 0.2749 (-0.44z)| lr 2.87e-04 | 323.09 ms | 52.2% bf16 MFU | 1624349 tok/s +step 10383/19560 | loss 3.386736 (-0.31z)| norm 0.2628 (-0.89z)| lr 2.87e-04 | 322.42 ms | 52.3% bf16 MFU | 1624436 tok/s +step 10384/19560 | loss 3.442666 (+0.89z)| norm 0.3039 (+0.68z)| lr 2.87e-04 | 322.49 ms | 52.3% bf16 MFU | 1624501 tok/s +step 10385/19560 | loss 3.434072 (+0.70z)| norm 0.2836 (-0.08z)| lr 2.87e-04 | 323.10 ms | 52.2% bf16 MFU | 1624411 tok/s +step 10386/19560 | loss 3.433186 (+0.66z)| norm 0.3017 (+0.63z)| lr 2.87e-04 | 322.97 ms | 52.3% bf16 MFU | 1624357 tok/s +step 10387/19560 | loss 3.351513 (-1.10z)| norm 0.2992 (+0.54z)| lr 2.87e-04 | 322.27 ms | 52.4% bf16 MFU | 1624482 tok/s +step 10388/19560 | loss 3.374281 (-0.60z)| norm 0.2935 (+0.31z)| lr 2.87e-04 | 322.97 ms | 52.3% bf16 MFU | 1624424 tok/s +step 10389/19560 | loss 3.478691 (+1.64z)| norm 0.2975 (+0.47z)| lr 2.87e-04 | 322.88 ms | 52.3% bf16 MFU | 1624391 tok/s +step 10390/19560 | loss 3.347431 (-1.16z)| norm 0.2957 (+0.40z)| lr 2.87e-04 | 323.05 ms | 52.2% bf16 MFU | 1624317 tok/s +step 10391/19560 | loss 3.368057 (-0.73z)| norm 0.3108 (+0.99z)| lr 2.87e-04 | 322.87 ms | 52.3% bf16 MFU | 1624294 tok/s +step 10392/19560 | loss 3.404609 (+0.06z)| norm 0.2683 (-0.68z)| lr 2.87e-04 | 322.69 ms | 52.3% bf16 MFU | 1624316 tok/s +step 10393/19560 | loss 3.428813 (+0.57z)| norm 0.2784 (-0.27z)| lr 2.87e-04 | 322.95 ms | 52.3% bf16 MFU | 1624272 tok/s +step 10394/19560 | loss 3.411651 (+0.19z)| norm 0.2597 (-1.01z)| lr 2.87e-04 | 323.69 ms | 52.1% bf16 MFU | 1624045 tok/s +step 10395/19560 | loss 3.395932 (-0.15z)| norm 0.2646 (-0.80z)| lr 2.87e-04 | 322.78 ms | 52.3% bf16 MFU | 1624057 tok/s +step 10396/19560 | loss 3.359146 (-0.95z)| norm 0.2532 (-1.23z)| lr 2.87e-04 | 322.90 ms | 52.3% bf16 MFU | 1624039 tok/s +step 10397/19560 | loss 3.448559 (+0.99z)| norm 0.2614 (-0.91z)| lr 2.87e-04 | 323.03 ms | 52.2% bf16 MFU | 1623990 tok/s +step 10398/19560 | loss 3.446652 (+0.94z)| norm 0.2631 (-0.83z)| lr 2.87e-04 | 322.87 ms | 52.3% bf16 MFU | 1623982 tok/s +step 10399/19560 | loss 3.462704 (+1.27z)| norm 0.2617 (-0.90z)| lr 2.87e-04 | 323.17 ms | 52.2% bf16 MFU | 1623898 tok/s +step 10400/19560 | loss 3.429826 (+0.55z)| norm 0.2639 (-0.79z)| lr 2.87e-04 | 323.59 ms | 52.2% bf16 MFU | 1623714 tok/s +step 10401/19560 | loss 3.402732 (-0.04z)| norm 0.2514 (-1.33z)| lr 2.87e-04 | 323.49 ms | 52.2% bf16 MFU | 1623564 tok/s +step 10402/19560 | loss 3.376420 (-0.61z)| norm 0.2776 (-0.16z)| lr 2.86e-04 | 322.46 ms | 52.3% bf16 MFU | 1623680 tok/s +step 10403/19560 | loss 3.397403 (-0.15z)| norm 0.2652 (-0.70z)| lr 2.86e-04 | 322.39 ms | 52.4% bf16 MFU | 1623809 tok/s +step 10404/19560 | loss 3.394708 (-0.20z)| norm 0.2719 (-0.39z)| lr 2.86e-04 | 323.91 ms | 52.1% bf16 MFU | 1623549 tok/s +step 10405/19560 | loss 3.343357 (-1.31z)| norm 0.2573 (-1.07z)| lr 2.86e-04 | 322.71 ms | 52.3% bf16 MFU | 1623604 tok/s +step 10406/19560 | loss 3.410275 (+0.16z)| norm 0.2821 (+0.14z)| lr 2.86e-04 | 323.07 ms | 52.2% bf16 MFU | 1623565 tok/s +step 10407/19560 | loss 3.388117 (-0.33z)| norm 0.2661 (-0.63z)| lr 2.86e-04 | 323.12 ms | 52.2% bf16 MFU | 1623517 tok/s +step 10408/19560 | loss 3.402112 (-0.02z)| norm 0.2934 (+0.71z)| lr 2.86e-04 | 323.05 ms | 52.2% bf16 MFU | 1623488 tok/s +step 10409/19560 | loss 3.362937 (-0.87z)| norm 0.2612 (-0.86z)| lr 2.86e-04 | 322.93 ms | 52.3% bf16 MFU | 1623490 tok/s +step 10410/19560 | loss 3.410594 (+0.18z)| norm 0.2935 (+0.73z)| lr 2.86e-04 | 323.29 ms | 52.2% bf16 MFU | 1623402 tok/s +step 10411/19560 | loss 3.342582 (-1.30z)| norm 0.2819 (+0.16z)| lr 2.86e-04 | 322.97 ms | 52.3% bf16 MFU | 1623398 tok/s +step 10412/19560 | loss 3.391511 (-0.23z)| norm 0.2721 (-0.32z)| lr 2.86e-04 | 322.92 ms | 52.3% bf16 MFU | 1623406 tok/s +step 10413/19560 | loss 3.411507 (+0.21z)| norm 0.2828 (+0.21z)| lr 2.86e-04 | 323.40 ms | 52.2% bf16 MFU | 1623294 tok/s +step 10414/19560 | loss 3.424620 (+0.51z)| norm 0.2589 (-0.95z)| lr 2.86e-04 | 322.98 ms | 52.3% bf16 MFU | 1623293 tok/s +step 10415/19560 | loss 3.359536 (-0.95z)| norm 0.2617 (-0.81z)| lr 2.86e-04 | 323.18 ms | 52.2% bf16 MFU | 1623241 tok/s +step 10416/19560 | loss 3.395655 (-0.14z)| norm 0.2880 (+0.48z)| lr 2.86e-04 | 323.11 ms | 52.2% bf16 MFU | 1623210 tok/s +step 10417/19560 | loss 3.399389 (-0.07z)| norm 0.2802 (+0.09z)| lr 2.86e-04 | 322.78 ms | 52.3% bf16 MFU | 1623263 tok/s +step 10418/19560 | loss 3.398056 (-0.10z)| norm 0.2768 (-0.09z)| lr 2.86e-04 | 322.75 ms | 52.3% bf16 MFU | 1623321 tok/s +step 10419/19560 | loss 3.399356 (-0.06z)| norm 0.2783 (-0.02z)| lr 2.86e-04 | 323.64 ms | 52.1% bf16 MFU | 1623154 tok/s +step 10420/19560 | loss 3.383693 (-0.41z)| norm 0.2608 (-0.89z)| lr 2.86e-04 | 323.56 ms | 52.2% bf16 MFU | 1623014 tok/s +step 10421/19560 | loss 3.404852 (+0.06z)| norm 0.2590 (-0.97z)| lr 2.86e-04 | 323.21 ms | 52.2% bf16 MFU | 1622970 tok/s +step 10422/19560 | loss 3.352409 (-1.11z)| norm 0.2632 (-0.76z)| lr 2.85e-04 | 322.95 ms | 52.3% bf16 MFU | 1622993 tok/s +step 10423/19560 | loss 3.395986 (-0.13z)| norm 0.2634 (-0.76z)| lr 2.85e-04 | 323.35 ms | 52.2% bf16 MFU | 1622914 tok/s +step 10424/19560 | loss 3.390321 (-0.26z)| norm 0.2656 (-0.66z)| lr 2.85e-04 | 322.53 ms | 52.3% bf16 MFU | 1623045 tok/s +step 10425/19560 | loss 3.407266 (+0.13z)| norm 0.2692 (-0.49z)| lr 2.85e-04 | 322.27 ms | 52.4% bf16 MFU | 1623237 tok/s +step 10426/19560 | loss 3.401907 (+0.01z)| norm 0.2776 (-0.07z)| lr 2.85e-04 | 323.83 ms | 52.1% bf16 MFU | 1623027 tok/s +step 10427/19560 | loss 3.375858 (-0.59z)| norm 0.2901 (+0.55z)| lr 2.85e-04 | 323.45 ms | 52.2% bf16 MFU | 1622922 tok/s +step 10428/19560 | loss 3.432302 (+0.69z)| norm 0.2541 (-1.29z)| lr 2.85e-04 | 322.52 ms | 52.3% bf16 MFU | 1623055 tok/s +step 10429/19560 | loss 3.358898 (-0.97z)| norm 0.2734 (-0.32z)| lr 2.85e-04 | 322.48 ms | 52.3% bf16 MFU | 1623193 tok/s +step 10430/19560 | loss 3.444359 (+0.96z)| norm 0.2910 (+0.58z)| lr 2.85e-04 | 323.26 ms | 52.2% bf16 MFU | 1623127 tok/s +step 10431/19560 | loss 3.400130 (-0.05z)| norm 0.2861 (+0.32z)| lr 2.85e-04 | 323.19 ms | 52.2% bf16 MFU | 1623083 tok/s +step 10432/19560 | loss 3.395468 (-0.17z)| norm 0.3035 (+1.20z)| lr 2.85e-04 | 322.88 ms | 52.3% bf16 MFU | 1623117 tok/s +step 10433/19560 | loss 3.453726 (+1.16z)| norm 0.2801 (-0.01z)| lr 2.85e-04 | 322.96 ms | 52.3% bf16 MFU | 1623131 tok/s +step 10434/19560 | loss 3.368471 (-0.78z)| norm 0.2816 (+0.06z)| lr 2.85e-04 | 322.81 ms | 52.3% bf16 MFU | 1623181 tok/s +step 10435/19560 | loss 3.306954 (-2.17z)| norm 0.2754 (-0.26z)| lr 2.85e-04 | 322.84 ms | 52.3% bf16 MFU | 1623222 tok/s +step 10436/19560 | loss 3.440907 (+0.92z)| norm 0.2646 (-0.83z)| lr 2.85e-04 | 322.64 ms | 52.3% bf16 MFU | 1623311 tok/s +step 10437/19560 | loss 3.433879 (+0.76z)| norm 0.2755 (-0.26z)| lr 2.85e-04 | 323.37 ms | 52.2% bf16 MFU | 1623211 tok/s +step 10438/19560 | loss 3.419759 (+0.43z)| norm 0.2662 (-0.76z)| lr 2.85e-04 | 322.79 ms | 52.3% bf16 MFU | 1623261 tok/s +step 10439/19560 | loss 3.445274 (+1.01z)| norm 0.2607 (-1.03z)| lr 2.85e-04 | 323.93 ms | 52.1% bf16 MFU | 1623024 tok/s +step 10440/19560 | loss 3.353015 (-1.12z)| norm 0.2764 (-0.21z)| lr 2.85e-04 | 323.02 ms | 52.2% bf16 MFU | 1623026 tok/s +step 10441/19560 | loss 3.383676 (-0.38z)| norm 0.2655 (-0.79z)| lr 2.85e-04 | 323.09 ms | 52.2% bf16 MFU | 1623010 tok/s +step 10442/19560 | loss 3.391340 (-0.19z)| norm 0.2872 (+0.35z)| lr 2.84e-04 | 322.97 ms | 52.3% bf16 MFU | 1623028 tok/s +step 10443/19560 | loss 3.335047 (-1.51z)| norm 0.2737 (-0.36z)| lr 2.84e-04 | 323.84 ms | 52.1% bf16 MFU | 1622824 tok/s +step 10444/19560 | loss 3.437990 (+0.92z)| norm 0.2579 (-1.18z)| lr 2.84e-04 | 323.32 ms | 52.2% bf16 MFU | 1622762 tok/s +step 10445/19560 | loss 3.464849 (+1.52z)| norm 0.2758 (-0.24z)| lr 2.84e-04 | 322.99 ms | 52.3% bf16 MFU | 1622786 tok/s +step 10446/19560 | loss 3.384938 (-0.35z)| norm 0.2496 (-1.63z)| lr 2.84e-04 | 323.32 ms | 52.2% bf16 MFU | 1622725 tok/s +step 10447/19560 | loss 3.440897 (+1.06z)| norm 0.2973 (+0.96z)| lr 2.84e-04 | 324.11 ms | 52.1% bf16 MFU | 1622471 tok/s +step 10448/19560 | loss 3.479510 (+2.00z)| norm 0.2669 (-0.68z)| lr 2.84e-04 | 322.54 ms | 52.3% bf16 MFU | 1622623 tok/s +step 10449/19560 | loss 3.340700 (-1.42z)| norm 0.2733 (-0.32z)| lr 2.84e-04 | 324.15 ms | 52.1% bf16 MFU | 1622362 tok/s +step 10450/19560 | loss 3.330201 (-1.68z)| norm 0.2616 (-0.95z)| lr 2.84e-04 | 323.38 ms | 52.2% bf16 MFU | 1622309 tok/s +step 10451/19560 | loss 3.415502 (+0.51z)| norm 0.2740 (-0.26z)| lr 2.84e-04 | 323.30 ms | 52.2% bf16 MFU | 1622278 tok/s +step 10452/19560 | loss 3.421350 (+0.67z)| norm 0.2655 (-0.72z)| lr 2.84e-04 | 324.07 ms | 52.1% bf16 MFU | 1622056 tok/s +step 10453/19560 | loss 3.430697 (+0.91z)| norm 0.2762 (-0.11z)| lr 2.84e-04 | 322.70 ms | 52.3% bf16 MFU | 1622187 tok/s +step 10454/19560 | loss 3.368309 (-0.72z)| norm 0.2784 (+0.02z)| lr 2.84e-04 | 322.99 ms | 52.3% bf16 MFU | 1622239 tok/s +step 10455/19560 | loss 3.388556 (-0.19z)| norm 0.2750 (-0.15z)| lr 2.84e-04 | 323.70 ms | 52.1% bf16 MFU | 1622110 tok/s +step 10456/19560 | loss 3.390249 (-0.16z)| norm 0.3064 (+1.71z)| lr 2.84e-04 | 323.26 ms | 52.2% bf16 MFU | 1622098 tok/s +step 10457/19560 | loss 3.384296 (-0.30z)| norm 0.2641 (-0.79z)| lr 2.84e-04 | 323.17 ms | 52.2% bf16 MFU | 1622111 tok/s +step 10458/19560 | loss 3.403877 (+0.25z)| norm 0.2894 (+0.79z)| lr 2.84e-04 | 322.98 ms | 52.3% bf16 MFU | 1622169 tok/s +step 10459/19560 | loss 3.465446 (+1.94z)| norm 0.2757 (-0.05z)| lr 2.84e-04 | 323.30 ms | 52.2% bf16 MFU | 1622143 tok/s +step 10460/19560 | loss 3.366281 (-0.82z)| norm 0.2820 (+0.35z)| lr 2.84e-04 | 323.15 ms | 52.2% bf16 MFU | 1622158 tok/s +step 10461/19560 | loss 3.375916 (-0.55z)| norm 0.2863 (+0.62z)| lr 2.84e-04 | 323.05 ms | 52.2% bf16 MFU | 1622197 tok/s +step 10462/19560 | loss 3.420750 (+0.70z)| norm 0.2761 (-0.01z)| lr 2.83e-04 | 323.01 ms | 52.3% bf16 MFU | 1622244 tok/s +step 10463/19560 | loss 3.412811 (+0.47z)| norm 0.2491 (-1.80z)| lr 2.83e-04 | 322.69 ms | 52.3% bf16 MFU | 1622369 tok/s +step 10464/19560 | loss 3.457704 (+1.69z)| norm 0.2646 (-0.75z)| lr 2.83e-04 | 323.12 ms | 52.2% bf16 MFU | 1622381 tok/s +step 10465/19560 | loss 3.378021 (-0.50z)| norm 0.2614 (-0.96z)| lr 2.83e-04 | 323.24 ms | 52.2% bf16 MFU | 1622362 tok/s +step 10466/19560 | loss 3.350063 (-1.26z)| norm 0.2585 (-1.14z)| lr 2.83e-04 | 323.23 ms | 52.2% bf16 MFU | 1622345 tok/s +step 10467/19560 | loss 3.379678 (-0.44z)| norm 0.2703 (-0.33z)| lr 2.83e-04 | 322.51 ms | 52.3% bf16 MFU | 1622511 tok/s +step 10468/19560 | loss 3.356075 (-1.07z)| norm 0.2678 (-0.48z)| lr 2.83e-04 | 323.17 ms | 52.2% bf16 MFU | 1622501 tok/s +step 10469/19560 | loss 3.372047 (-0.63z)| norm 0.2769 (+0.12z)| lr 2.83e-04 | 322.87 ms | 52.3% bf16 MFU | 1622568 tok/s +step 10470/19560 | loss 3.386011 (-0.25z)| norm 0.2441 (-2.18z)| lr 2.83e-04 | 322.57 ms | 52.3% bf16 MFU | 1622707 tok/s +step 10471/19560 | loss 3.407826 (+0.35z)| norm 0.2623 (-0.86z)| lr 2.83e-04 | 323.19 ms | 52.2% bf16 MFU | 1622684 tok/s +step 10472/19560 | loss 3.337674 (-1.55z)| norm 0.2522 (-1.56z)| lr 2.83e-04 | 323.05 ms | 52.2% bf16 MFU | 1622696 tok/s +step 10473/19560 | loss 3.451560 (+1.52z)| norm 0.2679 (-0.44z)| lr 2.83e-04 | 323.00 ms | 52.3% bf16 MFU | 1622719 tok/s +step 10474/19560 | loss 3.358530 (-1.00z)| norm 0.2649 (-0.66z)| lr 2.83e-04 | 322.81 ms | 52.3% bf16 MFU | 1622791 tok/s +step 10475/19560 | loss 3.379099 (-0.43z)| norm 0.2932 (+1.35z)| lr 2.83e-04 | 322.80 ms | 52.3% bf16 MFU | 1622861 tok/s +step 10476/19560 | loss 3.479181 (+2.22z)| norm 0.2867 (+0.89z)| lr 2.83e-04 | 322.69 ms | 52.3% bf16 MFU | 1622955 tok/s +step 10477/19560 | loss 3.396650 (+0.01z)| norm 0.2886 (+1.02z)| lr 2.83e-04 | 322.76 ms | 52.3% bf16 MFU | 1623027 tok/s +step 10478/19560 | loss 3.412914 (+0.45z)| norm 0.2678 (-0.46z)| lr 2.83e-04 | 323.31 ms | 52.2% bf16 MFU | 1622957 tok/s +step 10479/19560 | loss 3.380953 (-0.41z)| norm 0.3218 (+3.28z)| lr 2.83e-04 | 322.48 ms | 52.3% bf16 MFU | 1623100 tok/s +step 10480/19560 | loss 3.412162 (+0.42z)| norm 0.2848 (+0.71z)| lr 2.83e-04 | 322.70 ms | 52.3% bf16 MFU | 1623180 tok/s +step 10481/19560 | loss 3.407699 (+0.29z)| norm 0.2852 (+0.74z)| lr 2.83e-04 | 323.56 ms | 52.2% bf16 MFU | 1623040 tok/s +step 10482/19560 | loss 3.372974 (-0.65z)| norm 0.3041 (+2.00z)| lr 2.82e-04 | 322.86 ms | 52.3% bf16 MFU | 1623082 tok/s +step 10483/19560 | loss 3.391312 (-0.15z)| norm 0.3707 (+5.62z)| lr 2.82e-04 | 322.40 ms | 52.3% bf16 MFU | 1623237 tok/s +step 10484/19560 | loss 3.415012 (+0.49z)| norm 0.2829 (+0.44z)| lr 2.82e-04 | 323.27 ms | 52.2% bf16 MFU | 1623167 tok/s +step 10485/19560 | loss 3.403718 (+0.19z)| norm 0.3150 (+2.28z)| lr 2.82e-04 | 322.97 ms | 52.3% bf16 MFU | 1623175 tok/s +step 10486/19560 | loss 3.357488 (-1.07z)| norm 0.3122 (+2.06z)| lr 2.82e-04 | 322.39 ms | 52.4% bf16 MFU | 1623329 tok/s +step 10487/19560 | loss 3.421847 (+0.70z)| norm 0.3097 (+1.88z)| lr 2.82e-04 | 323.12 ms | 52.2% bf16 MFU | 1623291 tok/s +step 10488/19560 | loss 3.371572 (-0.70z)| norm 0.2781 (+0.10z)| lr 2.82e-04 | 322.77 ms | 52.3% bf16 MFU | 1623345 tok/s +step 10489/19560 | loss 3.367743 (-0.80z)| norm 0.3168 (+2.21z)| lr 2.82e-04 | 323.01 ms | 52.2% bf16 MFU | 1623334 tok/s +step 10490/19560 | loss 3.402845 (+0.18z)| norm 0.2715 (-0.28z)| lr 2.82e-04 | 322.85 ms | 52.3% bf16 MFU | 1623364 tok/s +step 10491/19560 | loss 3.376569 (-0.58z)| norm 0.3210 (+2.41z)| lr 2.82e-04 | 323.07 ms | 52.2% bf16 MFU | 1623338 tok/s +step 10492/19560 | loss 3.378131 (-0.55z)| norm 0.2741 (-0.15z)| lr 2.82e-04 | 323.27 ms | 52.2% bf16 MFU | 1623263 tok/s +step 10493/19560 | loss 3.419372 (+0.63z)| norm 0.2708 (-0.31z)| lr 2.82e-04 | 322.65 ms | 52.3% bf16 MFU | 1623348 tok/s +step 10494/19560 | loss 3.351162 (-1.29z)| norm 0.2839 (+0.39z)| lr 2.82e-04 | 323.18 ms | 52.2% bf16 MFU | 1623295 tok/s +step 10495/19560 | loss 3.383709 (-0.37z)| norm 0.2725 (-0.23z)| lr 2.82e-04 | 322.29 ms | 52.4% bf16 MFU | 1623468 tok/s +step 10496/19560 | loss 3.376975 (-0.55z)| norm 0.2668 (-0.54z)| lr 2.82e-04 | 323.18 ms | 52.2% bf16 MFU | 1623410 tok/s +step 10497/19560 | loss 3.334727 (-1.76z)| norm 0.2743 (-0.14z)| lr 2.82e-04 | 322.67 ms | 52.3% bf16 MFU | 1623481 tok/s +step 10498/19560 | loss 3.441298 (+1.26z)| norm 0.2577 (-1.04z)| lr 2.82e-04 | 322.98 ms | 52.3% bf16 MFU | 1623471 tok/s +step 10499/19560 | loss 3.370346 (-0.77z)| norm 0.2615 (-0.83z)| lr 2.82e-04 | 322.60 ms | 52.3% bf16 MFU | 1623558 tok/s +step 10500/19560 | loss 3.413627 (+0.46z)| norm 0.2464 (-1.63z)| lr 2.82e-04 | 322.54 ms | 52.3% bf16 MFU | 1623654 tok/s +val loss 3.384710 +evaluating HellaSwag: 0/79 evaluating HellaSwag: 10/79 evaluating HellaSwag: 20/79 evaluating HellaSwag: 30/79 evaluating HellaSwag: 40/79 evaluating HellaSwag: 50/79 evaluating HellaSwag: 60/79 evaluating HellaSwag: 70/79 HellaSwag: 2921/10042 = 0.290878 +step 10501/19560 | loss 3.435599 (+1.10z)| norm 0.2781 (+0.10z)| lr 2.82e-04 | 322.10 ms | 52.4% bf16 MFU | 1623857 tok/s +step 10502/19560 | loss 3.385600 (-0.34z)| norm 0.2614 (-0.80z)| lr 2.81e-04 | 322.94 ms | 52.3% bf16 MFU | 1623838 tok/s +step 10503/19560 | loss 3.422592 (+0.72z)| norm 0.2651 (-0.60z)| lr 2.81e-04 | 323.41 ms | 52.2% bf16 MFU | 1623702 tok/s +step 10504/19560 | loss 3.335410 (-1.76z)| norm 0.2615 (-0.79z)| lr 2.81e-04 | 323.10 ms | 52.2% bf16 MFU | 1623651 tok/s +step 10505/19560 | loss 3.414510 (+0.49z)| norm 0.2687 (-0.40z)| lr 2.81e-04 | 322.75 ms | 52.3% bf16 MFU | 1623689 tok/s +step 10506/19560 | loss 3.423794 (+0.75z)| norm 0.2663 (-0.53z)| lr 2.81e-04 | 322.79 ms | 52.3% bf16 MFU | 1623716 tok/s +step 10507/19560 | loss 3.404167 (+0.18z)| norm 0.2661 (-0.55z)| lr 2.81e-04 | 322.90 ms | 52.3% bf16 MFU | 1623715 tok/s +step 10508/19560 | loss 3.349413 (-1.37z)| norm 0.2554 (-1.13z)| lr 2.81e-04 | 323.20 ms | 52.2% bf16 MFU | 1623638 tok/s +step 10509/19560 | loss 3.428109 (+0.87z)| norm 0.2621 (-0.76z)| lr 2.81e-04 | 323.04 ms | 52.2% bf16 MFU | 1623606 tok/s +step 10510/19560 | loss 3.353843 (-1.23z)| norm 0.2651 (-0.59z)| lr 2.81e-04 | 322.91 ms | 52.3% bf16 MFU | 1623607 tok/s +step 10511/19560 | loss 3.362791 (-0.96z)| norm 0.2727 (-0.19z)| lr 2.81e-04 | 322.74 ms | 52.3% bf16 MFU | 1623652 tok/s +step 10512/19560 | loss 3.340265 (-1.57z)| norm 0.2434 (-1.76z)| lr 2.81e-04 | 323.12 ms | 52.2% bf16 MFU | 1623597 tok/s +step 10513/19560 | loss 3.416067 (+0.57z)| norm 0.2680 (-0.41z)| lr 2.81e-04 | 322.68 ms | 52.3% bf16 MFU | 1623656 tok/s +step 10514/19560 | loss 3.314539 (-2.24z)| norm 0.2678 (-0.41z)| lr 2.81e-04 | 322.65 ms | 52.3% bf16 MFU | 1623721 tok/s +step 10515/19560 | loss 3.427630 (+0.90z)| norm 0.2902 (+0.83z)| lr 2.81e-04 | 322.85 ms | 52.3% bf16 MFU | 1623731 tok/s +step 10516/19560 | loss 3.443986 (+1.33z)| norm 0.2972 (+1.22z)| lr 2.81e-04 | 322.95 ms | 52.3% bf16 MFU | 1623717 tok/s +step 10517/19560 | loss 3.417633 (+0.62z)| norm 0.2729 (-0.12z)| lr 2.81e-04 | 322.53 ms | 52.3% bf16 MFU | 1623808 tok/s +step 10518/19560 | loss 3.544463 (+3.94z)| norm 0.3044 (+1.62z)| lr 2.81e-04 | 323.18 ms | 52.2% bf16 MFU | 1623732 tok/s +step 10519/19560 | loss 3.323203 (-1.94z)| norm 0.3002 (+1.41z)| lr 2.81e-04 | 323.14 ms | 52.2% bf16 MFU | 1623669 tok/s +step 10520/19560 | loss 3.349583 (-1.23z)| norm 0.2776 (+0.14z)| lr 2.81e-04 | 322.26 ms | 52.4% bf16 MFU | 1623831 tok/s +step 10521/19560 | loss 3.390671 (-0.14z)| norm 0.2801 (+0.28z)| lr 2.81e-04 | 323.14 ms | 52.2% bf16 MFU | 1623763 tok/s +step 10522/19560 | loss 3.392869 (-0.08z)| norm 0.2803 (+0.28z)| lr 2.80e-04 | 322.65 ms | 52.3% bf16 MFU | 1623822 tok/s +step 10523/19560 | loss 3.325493 (-1.82z)| norm 0.2720 (-0.18z)| lr 2.80e-04 | 322.87 ms | 52.3% bf16 MFU | 1623823 tok/s +step 10524/19560 | loss 3.377193 (-0.48z)| norm 0.2621 (-0.74z)| lr 2.80e-04 | 322.94 ms | 52.3% bf16 MFU | 1623807 tok/s +step 10525/19560 | loss 3.366385 (-0.75z)| norm 0.2830 (+0.42z)| lr 2.80e-04 | 323.11 ms | 52.2% bf16 MFU | 1623748 tok/s +step 10526/19560 | loss 3.542967 (+3.68z)| norm 0.2876 (+0.67z)| lr 2.80e-04 | 322.69 ms | 52.3% bf16 MFU | 1623798 tok/s +step 10527/19560 | loss 3.397644 (+0.06z)| norm 0.2704 (-0.30z)| lr 2.80e-04 | 322.86 ms | 52.3% bf16 MFU | 1623803 tok/s +step 10528/19560 | loss 3.478710 (+2.07z)| norm 0.3018 (+1.44z)| lr 2.80e-04 | 323.38 ms | 52.2% bf16 MFU | 1623675 tok/s +step 10529/19560 | loss 3.420772 (+0.62z)| norm 0.2877 (+0.64z)| lr 2.80e-04 | 322.25 ms | 52.4% bf16 MFU | 1623838 tok/s +step 10530/19560 | loss 3.448711 (+1.30z)| norm 0.3030 (+1.48z)| lr 2.80e-04 | 323.00 ms | 52.3% bf16 MFU | 1623806 tok/s +step 10531/19560 | loss 3.353371 (-1.05z)| norm 0.3382 (+3.27z)| lr 2.80e-04 | 322.37 ms | 52.4% bf16 MFU | 1623935 tok/s +step 10532/19560 | loss 3.426147 (+0.74z)| norm 0.2836 (+0.34z)| lr 2.80e-04 | 322.59 ms | 52.3% bf16 MFU | 1624001 tok/s +step 10533/19560 | loss 3.571831 (+4.01z)| norm 0.3258 (+2.51z)| lr 2.80e-04 | 323.14 ms | 52.2% bf16 MFU | 1623925 tok/s +step 10534/19560 | loss 3.399270 (+0.03z)| norm 0.2719 (-0.30z)| lr 2.80e-04 | 323.01 ms | 52.2% bf16 MFU | 1623886 tok/s +step 10535/19560 | loss 3.403959 (+0.14z)| norm 0.2977 (+1.03z)| lr 2.80e-04 | 322.21 ms | 52.4% bf16 MFU | 1624049 tok/s +step 10536/19560 | loss 3.385666 (-0.28z)| norm 0.2928 (+0.78z)| lr 2.80e-04 | 323.26 ms | 52.2% bf16 MFU | 1623942 tok/s +step 10537/19560 | loss 3.357148 (-0.94z)| norm 0.2952 (+0.89z)| lr 2.80e-04 | 323.25 ms | 52.2% bf16 MFU | 1623841 tok/s +step 10538/19560 | loss 3.405941 (+0.19z)| norm 0.2824 (+0.23z)| lr 2.80e-04 | 322.39 ms | 52.4% bf16 MFU | 1623963 tok/s +step 10539/19560 | loss 3.344558 (-1.23z)| norm 0.2969 (+0.98z)| lr 2.80e-04 | 322.70 ms | 52.3% bf16 MFU | 1624000 tok/s +step 10540/19560 | loss 3.443327 (+1.04z)| norm 0.2883 (+0.52z)| lr 2.80e-04 | 322.30 ms | 52.4% bf16 MFU | 1624136 tok/s +step 10541/19560 | loss 3.371336 (-0.61z)| norm 0.2808 (+0.13z)| lr 2.80e-04 | 322.66 ms | 52.3% bf16 MFU | 1624173 tok/s +step 10542/19560 | loss 3.399459 (+0.04z)| norm 0.2840 (+0.29z)| lr 2.79e-04 | 323.12 ms | 52.2% bf16 MFU | 1624093 tok/s +step 10543/19560 | loss 3.486651 (+2.00z)| norm 0.2890 (+0.54z)| lr 2.79e-04 | 322.88 ms | 52.3% bf16 MFU | 1624078 tok/s +step 10544/19560 | loss 3.428638 (+0.67z)| norm 0.2936 (+0.78z)| lr 2.79e-04 | 322.92 ms | 52.3% bf16 MFU | 1624053 tok/s +step 10545/19560 | loss 3.504817 (+2.33z)| norm 0.3025 (+1.23z)| lr 2.79e-04 | 322.50 ms | 52.3% bf16 MFU | 1624135 tok/s +step 10546/19560 | loss 3.433175 (+0.73z)| norm 0.3043 (+1.30z)| lr 2.79e-04 | 322.83 ms | 52.3% bf16 MFU | 1624129 tok/s +step 10547/19560 | loss 3.416236 (+0.36z)| norm 0.2993 (+1.03z)| lr 2.79e-04 | 322.90 ms | 52.3% bf16 MFU | 1624107 tok/s +step 10548/19560 | loss 3.394439 (-0.13z)| norm 0.2814 (+0.10z)| lr 2.79e-04 | 322.51 ms | 52.3% bf16 MFU | 1624184 tok/s +step 10549/19560 | loss 3.411248 (+0.24z)| norm 0.2706 (-0.47z)| lr 2.79e-04 | 322.87 ms | 52.3% bf16 MFU | 1624166 tok/s +step 10550/19560 | loss 3.453293 (+1.15z)| norm 0.2850 (+0.27z)| lr 2.79e-04 | 322.71 ms | 52.3% bf16 MFU | 1624189 tok/s +step 10551/19560 | loss 3.347005 (-1.18z)| norm 0.2874 (+0.39z)| lr 2.79e-04 | 323.43 ms | 52.2% bf16 MFU | 1624031 tok/s +step 10552/19560 | loss 3.451589 (+1.10z)| norm 0.2703 (-0.50z)| lr 2.79e-04 | 322.37 ms | 52.4% bf16 MFU | 1624147 tok/s +step 10553/19560 | loss 3.520417 (+2.52z)| norm 0.2929 (+0.67z)| lr 2.79e-04 | 322.95 ms | 52.3% bf16 MFU | 1624111 tok/s +step 10554/19560 | loss 3.435887 (+0.71z)| norm 0.2899 (+0.50z)| lr 2.79e-04 | 322.39 ms | 52.3% bf16 MFU | 1624217 tok/s +step 10555/19560 | loss 3.412436 (+0.21z)| norm 0.2697 (-0.54z)| lr 2.79e-04 | 323.16 ms | 52.2% bf16 MFU | 1624126 tok/s +step 10556/19560 | loss 3.396241 (-0.13z)| norm 0.2993 (+0.99z)| lr 2.79e-04 | 322.60 ms | 52.3% bf16 MFU | 1624180 tok/s +step 10557/19560 | loss 3.429553 (+0.57z)| norm 0.2811 (+0.03z)| lr 2.79e-04 | 322.29 ms | 52.4% bf16 MFU | 1624308 tok/s +step 10558/19560 | loss 3.378884 (-0.50z)| norm 0.2939 (+0.70z)| lr 2.79e-04 | 323.33 ms | 52.2% bf16 MFU | 1624170 tok/s +step 10559/19560 | loss 3.396460 (-0.13z)| norm 0.2899 (+0.49z)| lr 2.79e-04 | 323.11 ms | 52.2% bf16 MFU | 1624092 tok/s +step 10560/19560 | loss 3.404680 (+0.05z)| norm 0.2650 (-0.80z)| lr 2.79e-04 | 323.26 ms | 52.2% bf16 MFU | 1623980 tok/s +step 10561/19560 | loss 3.388521 (-0.29z)| norm 0.2871 (+0.36z)| lr 2.79e-04 | 322.07 ms | 52.4% bf16 MFU | 1624174 tok/s +step 10562/19560 | loss 3.346339 (-1.19z)| norm 0.2716 (-0.45z)| lr 2.78e-04 | 323.68 ms | 52.1% bf16 MFU | 1623953 tok/s +step 10563/19560 | loss 3.350066 (-1.13z)| norm 0.2590 (-1.10z)| lr 2.78e-04 | 323.20 ms | 52.2% bf16 MFU | 1623865 tok/s +step 10564/19560 | loss 3.406353 (+0.10z)| norm 0.2567 (-1.21z)| lr 2.78e-04 | 322.84 ms | 52.3% bf16 MFU | 1623871 tok/s +step 10565/19560 | loss 3.341236 (-1.30z)| norm 0.2883 (+0.42z)| lr 2.78e-04 | 322.92 ms | 52.3% bf16 MFU | 1623856 tok/s +step 10566/19560 | loss 3.444220 (+0.93z)| norm 0.2634 (-0.86z)| lr 2.78e-04 | 322.57 ms | 52.3% bf16 MFU | 1623930 tok/s +step 10567/19560 | loss 3.485652 (+1.80z)| norm 0.3305 (+2.53z)| lr 2.78e-04 | 323.17 ms | 52.2% bf16 MFU | 1623850 tok/s +step 10568/19560 | loss 3.380045 (-0.47z)| norm 0.3281 (+2.33z)| lr 2.78e-04 | 323.16 ms | 52.2% bf16 MFU | 1623776 tok/s +step 10569/19560 | loss 3.385063 (-0.36z)| norm 0.2716 (-0.47z)| lr 2.78e-04 | 322.79 ms | 52.3% bf16 MFU | 1623800 tok/s +step 10570/19560 | loss 3.436783 (+0.74z)| norm 0.2883 (+0.36z)| lr 2.78e-04 | 323.07 ms | 52.2% bf16 MFU | 1623751 tok/s +step 10571/19560 | loss 3.387167 (-0.33z)| norm 0.2752 (-0.30z)| lr 2.78e-04 | 322.75 ms | 52.3% bf16 MFU | 1623784 tok/s +step 10572/19560 | loss 3.387872 (-0.31z)| norm 0.3018 (+1.01z)| lr 2.78e-04 | 323.62 ms | 52.2% bf16 MFU | 1623599 tok/s +step 10573/19560 | loss 3.464168 (+1.34z)| norm 0.2926 (+0.54z)| lr 2.78e-04 | 322.86 ms | 52.3% bf16 MFU | 1623613 tok/s +step 10574/19560 | loss 3.396019 (-0.13z)| norm 0.2612 (-1.02z)| lr 2.78e-04 | 322.83 ms | 52.3% bf16 MFU | 1623635 tok/s +step 10575/19560 | loss 3.444334 (+0.91z)| norm 0.3076 (+1.29z)| lr 2.78e-04 | 323.16 ms | 52.2% bf16 MFU | 1623573 tok/s +step 10576/19560 | loss 3.400764 (-0.02z)| norm 0.2519 (-1.47z)| lr 2.78e-04 | 323.19 ms | 52.2% bf16 MFU | 1623506 tok/s +step 10577/19560 | loss 3.409422 (+0.16z)| norm 0.2681 (-0.67z)| lr 2.78e-04 | 322.55 ms | 52.3% bf16 MFU | 1623604 tok/s +step 10578/19560 | loss 3.409169 (+0.14z)| norm 0.2590 (-1.12z)| lr 2.78e-04 | 323.22 ms | 52.2% bf16 MFU | 1623528 tok/s +step 10579/19560 | loss 3.393572 (-0.20z)| norm 0.2695 (-0.60z)| lr 2.78e-04 | 323.02 ms | 52.2% bf16 MFU | 1623504 tok/s +step 10580/19560 | loss 3.357503 (-0.99z)| norm 0.2530 (-1.40z)| lr 2.78e-04 | 323.25 ms | 52.2% bf16 MFU | 1623426 tok/s +step 10581/19560 | loss 3.553186 (+3.20z)| norm 0.2820 (+0.03z)| lr 2.78e-04 | 323.24 ms | 52.2% bf16 MFU | 1623353 tok/s +step 10582/19560 | loss 3.335814 (-1.42z)| norm 0.2620 (-0.95z)| lr 2.77e-04 | 322.39 ms | 52.4% bf16 MFU | 1623498 tok/s +step 10583/19560 | loss 3.383623 (-0.41z)| norm 0.2633 (-0.88z)| lr 2.77e-04 | 322.88 ms | 52.3% bf16 MFU | 1623514 tok/s +step 10584/19560 | loss 3.369281 (-0.71z)| norm 0.2592 (-1.06z)| lr 2.77e-04 | 323.03 ms | 52.2% bf16 MFU | 1623489 tok/s +step 10585/19560 | loss 3.471130 (+1.42z)| norm 0.2818 (+0.04z)| lr 2.77e-04 | 322.80 ms | 52.3% bf16 MFU | 1623525 tok/s +step 10586/19560 | loss 3.391852 (-0.24z)| norm 0.2874 (+0.31z)| lr 2.77e-04 | 323.10 ms | 52.2% bf16 MFU | 1623482 tok/s +step 10587/19560 | loss 3.410735 (+0.17z)| norm 0.2746 (-0.32z)| lr 2.77e-04 | 322.65 ms | 52.3% bf16 MFU | 1623556 tok/s +step 10588/19560 | loss 3.406009 (+0.06z)| norm 0.2792 (-0.09z)| lr 2.77e-04 | 323.05 ms | 52.2% bf16 MFU | 1623524 tok/s +step 10589/19560 | loss 3.431438 (+0.59z)| norm 0.2673 (-0.66z)| lr 2.77e-04 | 323.36 ms | 52.2% bf16 MFU | 1623418 tok/s +step 10590/19560 | loss 3.341713 (-1.29z)| norm 0.2742 (-0.32z)| lr 2.77e-04 | 322.49 ms | 52.3% bf16 MFU | 1623533 tok/s +step 10591/19560 | loss 3.436241 (+0.70z)| norm 0.2697 (-0.56z)| lr 2.77e-04 | 323.15 ms | 52.2% bf16 MFU | 1623477 tok/s +step 10592/19560 | loss 3.437225 (+0.72z)| norm 0.2867 (+0.27z)| lr 2.77e-04 | 323.63 ms | 52.1% bf16 MFU | 1623303 tok/s +step 10593/19560 | loss 3.381245 (-0.46z)| norm 0.2701 (-0.55z)| lr 2.77e-04 | 323.16 ms | 52.2% bf16 MFU | 1623257 tok/s +step 10594/19560 | loss 3.412632 (+0.19z)| norm 0.2760 (-0.27z)| lr 2.77e-04 | 322.79 ms | 52.3% bf16 MFU | 1623305 tok/s +step 10595/19560 | loss 3.377810 (-0.54z)| norm 0.2726 (-0.44z)| lr 2.77e-04 | 323.06 ms | 52.2% bf16 MFU | 1623283 tok/s +step 10596/19560 | loss 3.404307 (+0.01z)| norm 0.2905 (+0.44z)| lr 2.77e-04 | 323.01 ms | 52.2% bf16 MFU | 1623276 tok/s +step 10597/19560 | loss 3.398154 (-0.13z)| norm 0.2818 (+0.01z)| lr 2.77e-04 | 322.67 ms | 52.3% bf16 MFU | 1623353 tok/s +step 10598/19560 | loss 3.471868 (+1.42z)| norm 0.2645 (-0.87z)| lr 2.77e-04 | 323.04 ms | 52.2% bf16 MFU | 1623335 tok/s +step 10599/19560 | loss 3.352904 (-1.08z)| norm 0.2924 (+0.53z)| lr 2.77e-04 | 322.49 ms | 52.3% bf16 MFU | 1623456 tok/s +step 10600/19560 | loss 3.414057 (+0.19z)| norm 0.2698 (-0.63z)| lr 2.77e-04 | 322.68 ms | 52.3% bf16 MFU | 1623522 tok/s +step 10601/19560 | loss 3.401369 (-0.07z)| norm 0.2717 (-0.53z)| lr 2.77e-04 | 322.73 ms | 52.3% bf16 MFU | 1623573 tok/s +step 10602/19560 | loss 3.401865 (-0.06z)| norm 0.2626 (-1.00z)| lr 2.76e-04 | 322.84 ms | 52.3% bf16 MFU | 1623594 tok/s +step 10603/19560 | loss 3.427603 (+0.48z)| norm 0.2801 (-0.10z)| lr 2.76e-04 | 323.48 ms | 52.2% bf16 MFU | 1623453 tok/s +step 10604/19560 | loss 3.402611 (-0.04z)| norm 0.2742 (-0.40z)| lr 2.76e-04 | 322.51 ms | 52.3% bf16 MFU | 1623561 tok/s +step 10605/19560 | loss 3.451191 (+0.99z)| norm 0.2563 (-1.29z)| lr 2.76e-04 | 322.75 ms | 52.3% bf16 MFU | 1623606 tok/s +step 10606/19560 | loss 3.393765 (-0.24z)| norm 0.2783 (-0.18z)| lr 2.76e-04 | 323.12 ms | 52.2% bf16 MFU | 1623554 tok/s +step 10607/19560 | loss 3.404373 (-0.01z)| norm 0.2609 (-1.05z)| lr 2.76e-04 | 322.49 ms | 52.3% bf16 MFU | 1623663 tok/s +step 10608/19560 | loss 3.363288 (-0.89z)| norm 0.2660 (-0.78z)| lr 2.76e-04 | 322.36 ms | 52.4% bf16 MFU | 1623800 tok/s +step 10609/19560 | loss 3.349326 (-1.17z)| norm 0.2646 (-0.84z)| lr 2.76e-04 | 322.93 ms | 52.3% bf16 MFU | 1623787 tok/s +step 10610/19560 | loss 3.442530 (+0.81z)| norm 0.2863 (+0.28z)| lr 2.76e-04 | 322.58 ms | 52.3% bf16 MFU | 1623863 tok/s +step 10611/19560 | loss 3.387735 (-0.36z)| norm 0.2531 (-1.50z)| lr 2.76e-04 | 323.27 ms | 52.2% bf16 MFU | 1623761 tok/s +step 10612/19560 | loss 3.383370 (-0.45z)| norm 0.2725 (-0.41z)| lr 2.76e-04 | 322.80 ms | 52.3% bf16 MFU | 1623783 tok/s +step 10613/19560 | loss 3.339187 (-1.37z)| norm 0.2387 (-2.26z)| lr 2.76e-04 | 323.10 ms | 52.2% bf16 MFU | 1623729 tok/s +step 10614/19560 | loss 3.341651 (-1.31z)| norm 0.2563 (-1.26z)| lr 2.76e-04 | 323.59 ms | 52.2% bf16 MFU | 1623554 tok/s +step 10615/19560 | loss 3.427078 (+0.49z)| norm 0.2569 (-1.21z)| lr 2.76e-04 | 323.30 ms | 52.2% bf16 MFU | 1623460 tok/s +step 10616/19560 | loss 3.382172 (-0.46z)| norm 0.2537 (-1.37z)| lr 2.76e-04 | 323.27 ms | 52.2% bf16 MFU | 1623379 tok/s +step 10617/19560 | loss 3.476957 (+1.51z)| norm 0.2794 (+0.08z)| lr 2.76e-04 | 323.09 ms | 52.2% bf16 MFU | 1623348 tok/s +step 10618/19560 | loss 3.428929 (+0.50z)| norm 0.2928 (+0.84z)| lr 2.76e-04 | 323.00 ms | 52.3% bf16 MFU | 1623340 tok/s +step 10619/19560 | loss 3.412087 (+0.14z)| norm 0.2604 (-1.00z)| lr 2.76e-04 | 323.28 ms | 52.2% bf16 MFU | 1623262 tok/s +step 10620/19560 | loss 3.398341 (-0.15z)| norm 0.2784 (+0.04z)| lr 2.76e-04 | 323.10 ms | 52.2% bf16 MFU | 1623233 tok/s +step 10621/19560 | loss 3.370250 (-0.73z)| norm 0.2708 (-0.40z)| lr 2.76e-04 | 323.67 ms | 52.1% bf16 MFU | 1623063 tok/s +step 10622/19560 | loss 3.388854 (-0.35z)| norm 0.2801 (+0.14z)| lr 2.75e-04 | 323.15 ms | 52.2% bf16 MFU | 1623031 tok/s +step 10623/19560 | loss 3.327197 (-1.62z)| norm 0.2720 (-0.33z)| lr 2.75e-04 | 323.20 ms | 52.2% bf16 MFU | 1622989 tok/s +step 10624/19560 | loss 3.378855 (-0.54z)| norm 0.2718 (-0.34z)| lr 2.75e-04 | 323.52 ms | 52.2% bf16 MFU | 1622867 tok/s +step 10625/19560 | loss 3.413596 (+0.17z)| norm 0.2841 (+0.37z)| lr 2.75e-04 | 323.27 ms | 52.2% bf16 MFU | 1622816 tok/s +step 10626/19560 | loss 3.395108 (-0.21z)| norm 0.2526 (-1.45z)| lr 2.75e-04 | 323.43 ms | 52.2% bf16 MFU | 1622727 tok/s +step 10627/19560 | loss 3.374343 (-0.65z)| norm 0.2838 (+0.34z)| lr 2.75e-04 | 323.09 ms | 52.2% bf16 MFU | 1622728 tok/s +step 10628/19560 | loss 3.440241 (+0.73z)| norm 0.2607 (-1.01z)| lr 2.75e-04 | 323.81 ms | 52.1% bf16 MFU | 1622547 tok/s +step 10629/19560 | loss 3.449539 (+0.92z)| norm 0.2899 (+0.69z)| lr 2.75e-04 | 323.18 ms | 52.2% bf16 MFU | 1622534 tok/s +step 10630/19560 | loss 3.324717 (-1.67z)| norm 0.2511 (-1.56z)| lr 2.75e-04 | 323.62 ms | 52.2% bf16 MFU | 1622410 tok/s +step 10631/19560 | loss 3.404038 (-0.02z)| norm 0.2897 (+0.67z)| lr 2.75e-04 | 323.55 ms | 52.2% bf16 MFU | 1622311 tok/s +step 10632/19560 | loss 3.330280 (-1.55z)| norm 0.2690 (-0.54z)| lr 2.75e-04 | 323.09 ms | 52.2% bf16 MFU | 1622333 tok/s +step 10633/19560 | loss 3.546464 (+2.83z)| norm 0.2920 (+0.79z)| lr 2.75e-04 | 323.63 ms | 52.1% bf16 MFU | 1622217 tok/s +step 10634/19560 | loss 3.398584 (-0.14z)| norm 0.3100 (+1.80z)| lr 2.75e-04 | 322.82 ms | 52.3% bf16 MFU | 1622312 tok/s +step 10635/19560 | loss 3.441256 (+0.71z)| norm 0.3136 (+1.95z)| lr 2.75e-04 | 323.29 ms | 52.2% bf16 MFU | 1622283 tok/s +step 10636/19560 | loss 3.471297 (+1.29z)| norm 0.2924 (+0.74z)| lr 2.75e-04 | 323.23 ms | 52.2% bf16 MFU | 1622271 tok/s +step 10637/19560 | loss 3.466181 (+1.18z)| norm 0.2972 (+1.00z)| lr 2.75e-04 | 322.70 ms | 52.3% bf16 MFU | 1622392 tok/s +step 10638/19560 | loss 3.451528 (+0.87z)| norm 0.3103 (+1.71z)| lr 2.75e-04 | 322.79 ms | 52.3% bf16 MFU | 1622484 tok/s +step 10639/19560 | loss 3.411135 (+0.05z)| norm 0.2742 (-0.33z)| lr 2.75e-04 | 323.51 ms | 52.2% bf16 MFU | 1622391 tok/s +step 10640/19560 | loss 3.392468 (-0.33z)| norm 0.2987 (+1.04z)| lr 2.75e-04 | 323.07 ms | 52.2% bf16 MFU | 1622412 tok/s +step 10641/19560 | loss 3.367826 (-0.82z)| norm 0.2972 (+0.94z)| lr 2.75e-04 | 323.61 ms | 52.2% bf16 MFU | 1622298 tok/s +step 10642/19560 | loss 3.620166 (+4.00z)| norm 0.2702 (-0.60z)| lr 2.74e-04 | 322.74 ms | 52.3% bf16 MFU | 1622409 tok/s +step 10643/19560 | loss 3.464312 (+1.01z)| norm 0.2829 (+0.12z)| lr 2.74e-04 | 323.31 ms | 52.2% bf16 MFU | 1622370 tok/s +step 10644/19560 | loss 3.403600 (-0.14z)| norm 0.2876 (+0.40z)| lr 2.74e-04 | 323.04 ms | 52.2% bf16 MFU | 1622401 tok/s +step 10645/19560 | loss 3.385879 (-0.47z)| norm 0.2756 (-0.29z)| lr 2.74e-04 | 322.56 ms | 52.3% bf16 MFU | 1622550 tok/s +step 10646/19560 | loss 3.346598 (-1.21z)| norm 0.3007 (+1.16z)| lr 2.74e-04 | 323.40 ms | 52.2% bf16 MFU | 1622481 tok/s +step 10647/19560 | loss 3.388628 (-0.41z)| norm 0.2833 (+0.16z)| lr 2.74e-04 | 323.12 ms | 52.2% bf16 MFU | 1622486 tok/s +step 10648/19560 | loss 3.377681 (-0.63z)| norm 0.2819 (+0.08z)| lr 2.74e-04 | 323.04 ms | 52.2% bf16 MFU | 1622510 tok/s +step 10649/19560 | loss 3.525964 (+2.23z)| norm 0.3005 (+1.14z)| lr 2.74e-04 | 323.21 ms | 52.2% bf16 MFU | 1622490 tok/s +step 10650/19560 | loss 3.402624 (-0.16z)| norm 0.3021 (+1.22z)| lr 2.74e-04 | 323.02 ms | 52.2% bf16 MFU | 1622520 tok/s +step 10651/19560 | loss 3.355084 (-1.09z)| norm 0.2587 (-1.25z)| lr 2.74e-04 | 323.08 ms | 52.2% bf16 MFU | 1622533 tok/s +step 10652/19560 | loss 3.405024 (-0.12z)| norm 0.3044 (+1.32z)| lr 2.74e-04 | 322.60 ms | 52.3% bf16 MFU | 1622666 tok/s +step 10653/19560 | loss 3.356997 (-1.06z)| norm 0.2669 (-0.80z)| lr 2.74e-04 | 322.86 ms | 52.3% bf16 MFU | 1622726 tok/s +step 10654/19560 | loss 3.400246 (-0.20z)| norm 0.2924 (+0.65z)| lr 2.74e-04 | 323.41 ms | 52.2% bf16 MFU | 1622646 tok/s +step 10655/19560 | loss 3.400630 (-0.19z)| norm 0.2955 (+0.81z)| lr 2.74e-04 | 322.45 ms | 52.3% bf16 MFU | 1622811 tok/s +step 10656/19560 | loss 3.424180 (+0.29z)| norm 0.2847 (+0.21z)| lr 2.74e-04 | 322.80 ms | 52.3% bf16 MFU | 1622881 tok/s +step 10657/19560 | loss 3.376660 (-0.66z)| norm 0.3108 (+1.67z)| lr 2.74e-04 | 322.77 ms | 52.3% bf16 MFU | 1622953 tok/s +step 10658/19560 | loss 3.401601 (-0.15z)| norm 0.2908 (+0.55z)| lr 2.74e-04 | 323.20 ms | 52.2% bf16 MFU | 1622914 tok/s +step 10659/19560 | loss 3.380474 (-0.58z)| norm 0.2863 (+0.33z)| lr 2.74e-04 | 323.15 ms | 52.2% bf16 MFU | 1622888 tok/s +step 10660/19560 | loss 3.353937 (-1.10z)| norm 0.2674 (-0.77z)| lr 2.74e-04 | 322.30 ms | 52.4% bf16 MFU | 1623080 tok/s +step 10661/19560 | loss 3.427356 (+0.42z)| norm 0.2975 (+1.03z)| lr 2.74e-04 | 322.76 ms | 52.3% bf16 MFU | 1623146 tok/s +step 10662/19560 | loss 3.439938 (+0.67z)| norm 0.2901 (+0.58z)| lr 2.73e-04 | 322.48 ms | 52.3% bf16 MFU | 1623279 tok/s +step 10663/19560 | loss 3.372608 (-0.73z)| norm 0.2999 (+1.16z)| lr 2.73e-04 | 322.73 ms | 52.3% bf16 MFU | 1623343 tok/s +step 10664/19560 | loss 3.423221 (+0.32z)| norm 0.2917 (+0.67z)| lr 2.73e-04 | 322.69 ms | 52.3% bf16 MFU | 1623413 tok/s +step 10665/19560 | loss 3.415965 (+0.16z)| norm 0.2746 (-0.35z)| lr 2.73e-04 | 322.95 ms | 52.3% bf16 MFU | 1623415 tok/s +step 10666/19560 | loss 3.361058 (-0.98z)| norm 0.2887 (+0.50z)| lr 2.73e-04 | 323.15 ms | 52.2% bf16 MFU | 1623365 tok/s +step 10667/19560 | loss 3.461561 (+1.11z)| norm 0.2825 (+0.13z)| lr 2.73e-04 | 322.30 ms | 52.4% bf16 MFU | 1623533 tok/s +step 10668/19560 | loss 3.442847 (+0.71z)| norm 0.2926 (+0.74z)| lr 2.73e-04 | 322.58 ms | 52.3% bf16 MFU | 1623621 tok/s +step 10669/19560 | loss 3.358073 (-1.06z)| norm 0.2732 (-0.43z)| lr 2.73e-04 | 322.61 ms | 52.3% bf16 MFU | 1623698 tok/s +step 10670/19560 | loss 3.385269 (-0.49z)| norm 0.2894 (+0.55z)| lr 2.73e-04 | 323.65 ms | 52.1% bf16 MFU | 1623510 tok/s +step 10671/19560 | loss 3.352719 (-1.16z)| norm 0.2550 (-1.50z)| lr 2.73e-04 | 322.97 ms | 52.3% bf16 MFU | 1623501 tok/s +step 10672/19560 | loss 3.444701 (+0.78z)| norm 0.2926 (+0.75z)| lr 2.73e-04 | 322.75 ms | 52.3% bf16 MFU | 1623548 tok/s +step 10673/19560 | loss 3.373425 (-0.71z)| norm 0.2594 (-1.22z)| lr 2.73e-04 | 322.90 ms | 52.3% bf16 MFU | 1623554 tok/s +step 10674/19560 | loss 3.400724 (-0.12z)| norm 0.2605 (-1.14z)| lr 2.73e-04 | 322.73 ms | 52.3% bf16 MFU | 1623604 tok/s +step 10675/19560 | loss 3.391866 (-0.31z)| norm 0.2802 (+0.06z)| lr 2.73e-04 | 322.95 ms | 52.3% bf16 MFU | 1623596 tok/s +step 10676/19560 | loss 3.401643 (-0.10z)| norm 0.2660 (-0.79z)| lr 2.73e-04 | 322.98 ms | 52.3% bf16 MFU | 1623581 tok/s +step 10677/19560 | loss 3.428911 (+0.48z)| norm 0.2781 (-0.06z)| lr 2.73e-04 | 322.75 ms | 52.3% bf16 MFU | 1623623 tok/s +step 10678/19560 | loss 3.428447 (+0.47z)| norm 0.2708 (-0.50z)| lr 2.73e-04 | 322.51 ms | 52.3% bf16 MFU | 1623724 tok/s +step 10679/19560 | loss 3.377926 (-0.62z)| norm 0.2701 (-0.53z)| lr 2.73e-04 | 322.94 ms | 52.3% bf16 MFU | 1623711 tok/s +step 10680/19560 | loss 3.351926 (-1.16z)| norm 0.2562 (-1.36z)| lr 2.73e-04 | 322.80 ms | 52.3% bf16 MFU | 1623735 tok/s +step 10681/19560 | loss 3.377589 (-0.60z)| norm 0.2622 (-0.99z)| lr 2.73e-04 | 323.06 ms | 52.2% bf16 MFU | 1623692 tok/s +step 10682/19560 | loss 3.365414 (-0.85z)| norm 0.2535 (-1.48z)| lr 2.73e-04 | 322.63 ms | 52.3% bf16 MFU | 1623760 tok/s +step 10683/19560 | loss 3.404235 (+0.00z)| norm 0.2879 (+0.56z)| lr 2.72e-04 | 322.72 ms | 52.3% bf16 MFU | 1623802 tok/s +step 10684/19560 | loss 3.375926 (-0.61z)| norm 0.2656 (-0.75z)| lr 2.72e-04 | 322.87 ms | 52.3% bf16 MFU | 1623804 tok/s +step 10685/19560 | loss 3.362299 (-0.90z)| norm 0.2758 (-0.14z)| lr 2.72e-04 | 322.80 ms | 52.3% bf16 MFU | 1623822 tok/s +step 10686/19560 | loss 3.370076 (-0.73z)| norm 0.2861 (+0.48z)| lr 2.72e-04 | 322.82 ms | 52.3% bf16 MFU | 1623837 tok/s +step 10687/19560 | loss 3.414963 (+0.25z)| norm 0.2672 (-0.64z)| lr 2.72e-04 | 322.67 ms | 52.3% bf16 MFU | 1623886 tok/s +step 10688/19560 | loss 3.388234 (-0.33z)| norm 0.2543 (-1.40z)| lr 2.72e-04 | 323.06 ms | 52.2% bf16 MFU | 1623836 tok/s +step 10689/19560 | loss 3.394121 (-0.20z)| norm 0.2583 (-1.14z)| lr 2.72e-04 | 322.53 ms | 52.3% bf16 MFU | 1623922 tok/s +step 10690/19560 | loss 3.456031 (+1.13z)| norm 0.2657 (-0.70z)| lr 2.72e-04 | 322.99 ms | 52.3% bf16 MFU | 1623887 tok/s +step 10691/19560 | loss 3.352018 (-1.14z)| norm 0.2526 (-1.47z)| lr 2.72e-04 | 322.88 ms | 52.3% bf16 MFU | 1623882 tok/s +step 10692/19560 | loss 3.382294 (-0.48z)| norm 0.2785 (+0.05z)| lr 2.72e-04 | 322.65 ms | 52.3% bf16 MFU | 1623935 tok/s +step 10693/19560 | loss 3.428077 (+0.51z)| norm 0.2636 (-0.82z)| lr 2.72e-04 | 322.64 ms | 52.3% bf16 MFU | 1623988 tok/s +step 10694/19560 | loss 3.377477 (-0.59z)| norm 0.2839 (+0.38z)| lr 2.72e-04 | 322.89 ms | 52.3% bf16 MFU | 1623977 tok/s +step 10695/19560 | loss 3.332013 (-1.57z)| norm 0.2607 (-1.01z)| lr 2.72e-04 | 322.93 ms | 52.3% bf16 MFU | 1623954 tok/s +step 10696/19560 | loss 3.422458 (+0.42z)| norm 0.2915 (+0.95z)| lr 2.72e-04 | 322.72 ms | 52.3% bf16 MFU | 1623986 tok/s +step 10697/19560 | loss 3.383605 (-0.44z)| norm 0.2890 (+0.77z)| lr 2.72e-04 | 322.71 ms | 52.3% bf16 MFU | 1624018 tok/s +step 10698/19560 | loss 3.410459 (+0.16z)| norm 0.2669 (-0.63z)| lr 2.72e-04 | 322.57 ms | 52.3% bf16 MFU | 1624084 tok/s +step 10699/19560 | loss 3.367115 (-0.79z)| norm 0.2656 (-0.71z)| lr 2.72e-04 | 323.44 ms | 52.2% bf16 MFU | 1623928 tok/s +step 10700/19560 | loss 3.415569 (+0.27z)| norm 0.2940 (+1.12z)| lr 2.72e-04 | 322.66 ms | 52.3% bf16 MFU | 1623975 tok/s +step 10701/19560 | loss 3.436249 (+0.74z)| norm 0.3233 (+2.90z)| lr 2.72e-04 | 323.10 ms | 52.2% bf16 MFU | 1623910 tok/s +step 10702/19560 | loss 3.378518 (-0.54z)| norm 0.2839 (+0.43z)| lr 2.72e-04 | 323.20 ms | 52.2% bf16 MFU | 1623823 tok/s +step 10703/19560 | loss 3.455290 (+1.16z)| norm 0.3033 (+1.65z)| lr 2.71e-04 | 322.17 ms | 52.4% bf16 MFU | 1624001 tok/s +step 10704/19560 | loss 3.357970 (-0.99z)| norm 0.2919 (+0.92z)| lr 2.71e-04 | 322.69 ms | 52.3% bf16 MFU | 1624038 tok/s +step 10705/19560 | loss 3.436163 (+0.74z)| norm 0.2991 (+1.36z)| lr 2.71e-04 | 322.88 ms | 52.3% bf16 MFU | 1624025 tok/s +step 10706/19560 | loss 3.441918 (+0.86z)| norm 0.2746 (-0.19z)| lr 2.71e-04 | 322.62 ms | 52.3% bf16 MFU | 1624080 tok/s +step 10707/19560 | loss 3.375228 (-0.61z)| norm 0.2935 (+0.99z)| lr 2.71e-04 | 322.84 ms | 52.3% bf16 MFU | 1624076 tok/s +step 10708/19560 | loss 3.444368 (+0.90z)| norm 0.2766 (-0.09z)| lr 2.71e-04 | 322.71 ms | 52.3% bf16 MFU | 1624105 tok/s +step 10709/19560 | loss 3.400588 (-0.04z)| norm 0.2768 (-0.08z)| lr 2.71e-04 | 322.93 ms | 52.3% bf16 MFU | 1624077 tok/s +step 10710/19560 | loss 3.384720 (-0.42z)| norm 0.2651 (-0.82z)| lr 2.71e-04 | 322.75 ms | 52.3% bf16 MFU | 1624095 tok/s +step 10711/19560 | loss 3.454783 (+1.19z)| norm 0.3207 (+2.62z)| lr 2.71e-04 | 322.90 ms | 52.3% bf16 MFU | 1624076 tok/s +step 10712/19560 | loss 3.488813 (+1.93z)| norm 0.2760 (-0.17z)| lr 2.71e-04 | 322.72 ms | 52.3% bf16 MFU | 1624102 tok/s +step 10713/19560 | loss 3.382797 (-0.48z)| norm 0.2859 (+0.45z)| lr 2.71e-04 | 322.64 ms | 52.3% bf16 MFU | 1624147 tok/s +step 10714/19560 | loss 3.339990 (-1.44z)| norm 0.2694 (-0.57z)| lr 2.71e-04 | 322.17 ms | 52.4% bf16 MFU | 1624307 tok/s +step 10715/19560 | loss 3.362694 (-0.91z)| norm 0.2651 (-0.83z)| lr 2.71e-04 | 323.30 ms | 52.2% bf16 MFU | 1624174 tok/s +step 10716/19560 | loss 3.456952 (+1.22z)| norm 0.2745 (-0.24z)| lr 2.71e-04 | 322.67 ms | 52.3% bf16 MFU | 1624208 tok/s +step 10717/19560 | loss 3.299334 (-2.28z)| norm 0.2687 (-0.61z)| lr 2.71e-04 | 322.66 ms | 52.3% bf16 MFU | 1624243 tok/s +step 10718/19560 | loss 3.407059 (+0.10z)| norm 0.3283 (+2.97z)| lr 2.71e-04 | 322.03 ms | 52.4% bf16 MFU | 1624434 tok/s +step 10719/19560 | loss 3.452195 (+1.10z)| norm 0.2897 (+0.64z)| lr 2.71e-04 | 323.62 ms | 52.2% bf16 MFU | 1624216 tok/s +step 10720/19560 | loss 3.447496 (+0.99z)| norm 0.3097 (+1.81z)| lr 2.71e-04 | 322.81 ms | 52.3% bf16 MFU | 1624213 tok/s +step 10721/19560 | loss 3.378647 (-0.54z)| norm 0.2891 (+0.58z)| lr 2.71e-04 | 322.81 ms | 52.3% bf16 MFU | 1624209 tok/s +step 10722/19560 | loss 3.459506 (+1.24z)| norm 0.2951 (+0.93z)| lr 2.71e-04 | 322.50 ms | 52.3% bf16 MFU | 1624284 tok/s +step 10723/19560 | loss 3.382895 (-0.45z)| norm 0.2849 (+0.32z)| lr 2.70e-04 | 322.31 ms | 52.4% bf16 MFU | 1624402 tok/s +step 10724/19560 | loss 3.314046 (-1.93z)| norm 0.2428 (-2.12z)| lr 2.70e-04 | 322.64 ms | 52.3% bf16 MFU | 1624431 tok/s +step 10725/19560 | loss 3.387422 (-0.33z)| norm 0.2814 (+0.13z)| lr 2.70e-04 | 322.51 ms | 52.3% bf16 MFU | 1624491 tok/s +step 10726/19560 | loss 3.393522 (-0.18z)| norm 0.2703 (-0.52z)| lr 2.70e-04 | 322.52 ms | 52.3% bf16 MFU | 1624547 tok/s +step 10727/19560 | loss 3.346948 (-1.20z)| norm 0.2581 (-1.21z)| lr 2.70e-04 | 322.88 ms | 52.3% bf16 MFU | 1624508 tok/s +step 10728/19560 | loss 3.523655 (+2.58z)| norm 0.2869 (+0.46z)| lr 2.70e-04 | 322.55 ms | 52.3% bf16 MFU | 1624554 tok/s +step 10729/19560 | loss 3.394302 (-0.18z)| norm 0.2971 (+1.03z)| lr 2.70e-04 | 322.41 ms | 52.3% bf16 MFU | 1624633 tok/s +step 10730/19560 | loss 3.314786 (-1.84z)| norm 0.2635 (-0.92z)| lr 2.70e-04 | 322.56 ms | 52.3% bf16 MFU | 1624671 tok/s +step 10731/19560 | loss 3.408947 (+0.15z)| norm 0.3088 (+1.68z)| lr 2.70e-04 | 322.54 ms | 52.3% bf16 MFU | 1624712 tok/s +step 10732/19560 | loss 3.395818 (-0.12z)| norm 0.2898 (+0.58z)| lr 2.70e-04 | 323.29 ms | 52.2% bf16 MFU | 1624563 tok/s +step 10733/19560 | loss 3.393825 (-0.16z)| norm 0.2805 (+0.04z)| lr 2.70e-04 | 322.55 ms | 52.3% bf16 MFU | 1624608 tok/s +step 10734/19560 | loss 3.369970 (-0.66z)| norm 0.2651 (-0.84z)| lr 2.70e-04 | 322.62 ms | 52.3% bf16 MFU | 1624633 tok/s +step 10735/19560 | loss 3.329175 (-1.49z)| norm 0.2592 (-1.18z)| lr 2.70e-04 | 322.47 ms | 52.3% bf16 MFU | 1624694 tok/s +step 10736/19560 | loss 3.358791 (-0.87z)| norm 0.2746 (-0.30z)| lr 2.70e-04 | 323.00 ms | 52.3% bf16 MFU | 1624619 tok/s +step 10737/19560 | loss 3.391263 (-0.20z)| norm 0.2509 (-1.64z)| lr 2.70e-04 | 322.86 ms | 52.3% bf16 MFU | 1624581 tok/s +step 10738/19560 | loss 3.477076 (+1.59z)| norm 0.2631 (-0.94z)| lr 2.70e-04 | 322.95 ms | 52.3% bf16 MFU | 1624523 tok/s +step 10739/19560 | loss 3.399070 (-0.04z)| norm 0.2771 (-0.15z)| lr 2.70e-04 | 323.01 ms | 52.2% bf16 MFU | 1624454 tok/s +step 10740/19560 | loss 3.386901 (-0.30z)| norm 0.2538 (-1.47z)| lr 2.70e-04 | 323.07 ms | 52.2% bf16 MFU | 1624374 tok/s +step 10741/19560 | loss 3.408275 (+0.14z)| norm 0.2808 (+0.06z)| lr 2.70e-04 | 322.26 ms | 52.4% bf16 MFU | 1624500 tok/s +step 10742/19560 | loss 3.436077 (+0.71z)| norm 0.2718 (-0.48z)| lr 2.70e-04 | 322.48 ms | 52.3% bf16 MFU | 1624564 tok/s +step 10743/19560 | loss 3.466893 (+1.35z)| norm 0.2594 (-1.21z)| lr 2.69e-04 | 323.40 ms | 52.2% bf16 MFU | 1624395 tok/s +step 10744/19560 | loss 3.332922 (-1.45z)| norm 0.2825 (+0.14z)| lr 2.69e-04 | 322.46 ms | 52.3% bf16 MFU | 1624471 tok/s +step 10745/19560 | loss 3.430089 (+0.59z)| norm 0.2720 (-0.48z)| lr 2.69e-04 | 322.30 ms | 52.4% bf16 MFU | 1624582 tok/s +step 10746/19560 | loss 3.352609 (-1.02z)| norm 0.3014 (+1.25z)| lr 2.69e-04 | 322.96 ms | 52.3% bf16 MFU | 1624521 tok/s +step 10747/19560 | loss 3.384270 (-0.35z)| norm 0.2866 (+0.37z)| lr 2.69e-04 | 323.01 ms | 52.2% bf16 MFU | 1624451 tok/s +step 10748/19560 | loss 3.396918 (-0.09z)| norm 0.2656 (-0.87z)| lr 2.69e-04 | 322.84 ms | 52.3% bf16 MFU | 1624428 tok/s +step 10749/19560 | loss 3.348980 (-1.09z)| norm 0.2798 (-0.04z)| lr 2.69e-04 | 322.54 ms | 52.3% bf16 MFU | 1624481 tok/s +step 10750/19560 | loss 3.479374 (+1.61z)| norm 0.2975 (+1.00z)| lr 2.69e-04 | 322.91 ms | 52.3% bf16 MFU | 1624440 tok/s +val loss 3.378517