mtzig committed
Commit 197ed4b · verified · 1 parent: 70c76f4

Training in progress, step 200, checkpoint

last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:fe9917eefc699fb584c90fe89ad3475d9d653632a0d8800333b645f29471b58f
+ oid sha256:4e00c75665fad68257b13112a896f0e5bfe09116a5aa7d74e1a11ae36dfaf5ff
  size 13648688
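
Both the old and new versions above are Git LFS pointers: only the version, oid (sha256), and size fields live in the repo, while the ~13.6 MB adapter payload sits in LFS storage. A minimal sketch for checking a locally pulled copy against the new pointer (the local path is an assumption, not part of this commit):

```python
import hashlib
from pathlib import Path

# Hypothetical local path to the pulled file.
path = Path("last-checkpoint/adapter_model.safetensors")
expected_oid = "4e00c75665fad68257b13112a896f0e5bfe09116a5aa7d74e1a11ae36dfaf5ff"
expected_size = 13648688

data = path.read_bytes()
assert len(data) == expected_size, f"unexpected size: {len(data)}"
assert hashlib.sha256(data).hexdigest() == expected_oid, "sha256 mismatch"
print("adapter_model.safetensors matches its LFS pointer")
```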
last-checkpoint/global_step200/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b28310f9cdeec28ac828f4b272b8422f969423fa5606b6a5280dcae9804a9906
+ size 20450800
last-checkpoint/global_step200/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:336f8476bbfc2636bde52666b9fd678cb9ea4f3960967488f79a0e126589659b
+ size 20450800
last-checkpoint/global_step200/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:cdbd809b7cfdfc9e87ae66c687d52ef0e79bad9c1ffb974143c9207b719a8143
+ size 20450800
last-checkpoint/global_step200/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:2a3bd6c3c45c00ca288387009386aae02b69e0f61a21d8aa7f447ce9a0ac7b72
+ size 20450800
last-checkpoint/global_step200/zero_pp_rank_0_mp_rank_00_model_states.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:d11585c69f2eb7fc6ae25f8f7d341eb09bb43308fbb6c512ffd6a07473415410
+ size 152238
last-checkpoint/global_step200/zero_pp_rank_1_mp_rank_00_model_states.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:da48dbffc3ff23a696096bc8ab8bbd5b584bc72a5fe5c27c36305aa26dd9e47b
+ size 152238
last-checkpoint/global_step200/zero_pp_rank_2_mp_rank_00_model_states.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:2b2092bba3deb5b81e41ea98bc99a0f8aa82b9246b54ef2a0e9e7063d5f9aab6
+ size 152238
last-checkpoint/global_step200/zero_pp_rank_3_mp_rank_00_model_states.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:39ad00fd5370f273bbd2355e4bf561718800f099a2c153171eb9e0ff78a91ba3
+ size 152238
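
The eight files added under global_step200/ are DeepSpeed ZeRO shards for step 200: one bf16 optimizer-state file and one model-state file per data-parallel rank (ranks 0-3). If a single consolidated fp32 state dict is needed, DeepSpeed ships a zero_to_fp32 helper; a sketch, assuming DeepSpeed is installed and the checkpoint has been pulled into last-checkpoint/:

```python
from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint

# "last-checkpoint" is the directory holding the "latest" tag file and the
# global_step200/ shard directory from this commit.
state_dict = get_fp32_state_dict_from_zero_checkpoint(
    "last-checkpoint", tag="global_step200"
)
print(f"consolidated {len(state_dict)} tensors")
```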
last-checkpoint/latest CHANGED
@@ -1 +1 @@
- global_step100
+ global_step200
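
The latest file is a plain-text tag that tells DeepSpeed (and the Trainer resume path) which global_step directory to load; this commit moves it from global_step100 to global_step200. Reading it locally is a one-liner (path assumed):

```python
from pathlib import Path

tag = (Path("last-checkpoint") / "latest").read_text().strip()
print(tag)  # expected: global_step200
```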
last-checkpoint/rng_state_0.pth CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:d956a3852ddc13f142821bad54db6c6d693f7d72f4550279e09e4404f3bfdec1
+ oid sha256:e2c24a39de75f23f0d84f98b720b05e4f552cdd0306626c901205b2d9690be33
  size 14960
last-checkpoint/rng_state_1.pth CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:716c2e2e4da4262d03391c65c97fed6a6bf9dbd38227044c4cc078e9b1d3e484
+ oid sha256:75baba3fff49778fb1f9915f06b5fd052daf5b241c6df48d63ce4cc2fd74ad52
  size 14960
last-checkpoint/rng_state_2.pth CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:7b9c68b6d4332dd6959712f085dbab7211a28c68a4142f58967dafba061945d6
+ oid sha256:43726b0b6816cd7dc4fb0c6379613d398bab9cf9069c5ee8ac83eea24f4fa621
  size 14960
last-checkpoint/rng_state_3.pth CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:c8354c248eaac6c4d78f2cba932653de1707044637067c835c8c37099f88ac1c
+ oid sha256:c26a88d8955baf0e31629f8efe3e6d01a1336c21cf4cdd20becb43acefdfdd69
  size 14960
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:42250aa8b2737410eec03ac1598a8cb0d204fc4d857fc37b8834793ce14af0fd
+ oid sha256:863b742f77d90cf639cf492ec2d91dab7a9ebd0f58799b06186327b2c961991e
  size 1064
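
The per-rank rng_state_*.pth files and scheduler.pt hold the random-number-generator snapshots and learning-rate-scheduler state that make a resume bit-exact. A quick way to inspect them locally (paths assumed; weights_only=False is only appropriate for checkpoints you trust):

```python
import torch

# LR scheduler state (step count, last computed LR, ...).
sched = torch.load("last-checkpoint/scheduler.pt", map_location="cpu", weights_only=False)
print(sched)

# RNG snapshot for data-parallel rank 0 (typically python/numpy/cpu/cuda states).
rng0 = torch.load("last-checkpoint/rng_state_0.pth", map_location="cpu", weights_only=False)
print(list(rng0.keys()))
```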
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
  {
  "best_metric": null,
  "best_model_checkpoint": null,
- "epoch": 0.12296341838303104,
+ "epoch": 0.24592683676606208,
  "eval_steps": 40,
- "global_step": 100,
+ "global_step": 200,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
@@ -743,6 +743,742 @@
  "learning_rate": 1.9970093649572567e-05,
  "loss": 0.2257,
  "step": 100
746
+ },
747
+ {
748
+ "epoch": 0.12419305256686136,
749
+ "grad_norm": 0.18210033905044673,
750
+ "learning_rate": 1.9966680315970647e-05,
751
+ "loss": 0.2629,
752
+ "step": 101
753
+ },
754
+ {
755
+ "epoch": 0.12542268675069168,
756
+ "grad_norm": 0.22735479010308166,
757
+ "learning_rate": 1.996308289887366e-05,
758
+ "loss": 0.3055,
759
+ "step": 102
760
+ },
761
+ {
762
+ "epoch": 0.12665232093452197,
763
+ "grad_norm": 0.21880883838789394,
764
+ "learning_rate": 1.9959301464725507e-05,
765
+ "loss": 0.2896,
766
+ "step": 103
767
+ },
768
+ {
769
+ "epoch": 0.1278819551183523,
770
+ "grad_norm": 0.1568188763941854,
771
+ "learning_rate": 1.995533608336886e-05,
772
+ "loss": 0.1856,
773
+ "step": 104
774
+ },
775
+ {
776
+ "epoch": 0.1291115893021826,
777
+ "grad_norm": 0.21046557286925857,
778
+ "learning_rate": 1.995118682804388e-05,
779
+ "loss": 0.2328,
780
+ "step": 105
781
+ },
782
+ {
783
+ "epoch": 0.1303412234860129,
784
+ "grad_norm": 0.23586873079680978,
785
+ "learning_rate": 1.9946853775386857e-05,
786
+ "loss": 0.2608,
787
+ "step": 106
788
+ },
789
+ {
790
+ "epoch": 0.13157085766984322,
791
+ "grad_norm": 0.19655270590211088,
792
+ "learning_rate": 1.9942337005428805e-05,
793
+ "loss": 0.2982,
794
+ "step": 107
795
+ },
796
+ {
797
+ "epoch": 0.13280049185367354,
798
+ "grad_norm": 0.31014757888444056,
799
+ "learning_rate": 1.9937636601593965e-05,
800
+ "loss": 0.3041,
801
+ "step": 108
802
+ },
803
+ {
804
+ "epoch": 0.13403012603750383,
805
+ "grad_norm": 0.3337300802942654,
806
+ "learning_rate": 1.9932752650698285e-05,
807
+ "loss": 0.3085,
808
+ "step": 109
809
+ },
810
+ {
811
+ "epoch": 0.13525976022133415,
812
+ "grad_norm": 0.2637379526580471,
813
+ "learning_rate": 1.9927685242947804e-05,
814
+ "loss": 0.301,
815
+ "step": 110
816
+ },
817
+ {
818
+ "epoch": 0.13648939440516447,
819
+ "grad_norm": 0.22577400909500966,
820
+ "learning_rate": 1.9922434471936987e-05,
821
+ "loss": 0.2234,
822
+ "step": 111
823
+ },
824
+ {
825
+ "epoch": 0.13771902858899476,
826
+ "grad_norm": 0.21966929441456037,
827
+ "learning_rate": 1.9917000434647e-05,
828
+ "loss": 0.2743,
829
+ "step": 112
830
+ },
831
+ {
832
+ "epoch": 0.13894866277282508,
833
+ "grad_norm": 0.2540802763063313,
834
+ "learning_rate": 1.991138323144392e-05,
835
+ "loss": 0.2307,
836
+ "step": 113
837
+ },
838
+ {
839
+ "epoch": 0.1401782969566554,
840
+ "grad_norm": 0.3204592419266727,
841
+ "learning_rate": 1.990558296607687e-05,
842
+ "loss": 0.2156,
843
+ "step": 114
844
+ },
845
+ {
846
+ "epoch": 0.1414079311404857,
847
+ "grad_norm": 0.24090237444938584,
848
+ "learning_rate": 1.9899599745676123e-05,
849
+ "loss": 0.2809,
850
+ "step": 115
851
+ },
852
+ {
853
+ "epoch": 0.142637565324316,
854
+ "grad_norm": 0.2611220625407227,
855
+ "learning_rate": 1.9893433680751105e-05,
856
+ "loss": 0.2307,
857
+ "step": 116
858
+ },
859
+ {
860
+ "epoch": 0.14386719950814633,
861
+ "grad_norm": 0.37292140730199286,
862
+ "learning_rate": 1.9887084885188354e-05,
863
+ "loss": 0.2051,
864
+ "step": 117
865
+ },
866
+ {
867
+ "epoch": 0.14509683369197662,
868
+ "grad_norm": 0.21515044559789018,
869
+ "learning_rate": 1.9880553476249437e-05,
870
+ "loss": 0.2416,
871
+ "step": 118
872
+ },
873
+ {
874
+ "epoch": 0.14632646787580694,
875
+ "grad_norm": 0.27918682412366513,
876
+ "learning_rate": 1.9873839574568756e-05,
877
+ "loss": 0.2711,
878
+ "step": 119
879
+ },
880
+ {
881
+ "epoch": 0.14755610205963726,
882
+ "grad_norm": 0.23334267872599612,
883
+ "learning_rate": 1.9866943304151346e-05,
884
+ "loss": 0.305,
885
+ "step": 120
886
+ },
887
+ {
888
+ "epoch": 0.14755610205963726,
889
+ "eval_accuracy": 0.8021390374331551,
890
+ "eval_f1": 0.5316455696202531,
891
+ "eval_loss": 0.4852343797683716,
892
+ "eval_precision": 0.7241379310344828,
893
+ "eval_recall": 0.42,
894
+ "eval_runtime": 22.4489,
895
+ "eval_samples_per_second": 2.227,
896
+ "eval_steps_per_second": 0.178,
897
+ "step": 120
898
+ },
899
+ {
900
+ "epoch": 0.14878573624346758,
901
+ "grad_norm": 0.255894846030308,
902
+ "learning_rate": 1.9859864792370565e-05,
903
+ "loss": 0.2925,
904
+ "step": 121
905
+ },
906
+ {
907
+ "epoch": 0.15001537042729787,
908
+ "grad_norm": 0.2894368730807284,
909
+ "learning_rate": 1.985260416996575e-05,
910
+ "loss": 0.3092,
911
+ "step": 122
912
+ },
913
+ {
914
+ "epoch": 0.1512450046111282,
915
+ "grad_norm": 0.2819736938873003,
916
+ "learning_rate": 1.9845161571039805e-05,
917
+ "loss": 0.2827,
918
+ "step": 123
919
+ },
920
+ {
921
+ "epoch": 0.1524746387949585,
922
+ "grad_norm": 0.2849595623136817,
923
+ "learning_rate": 1.983753713305672e-05,
924
+ "loss": 0.3301,
925
+ "step": 124
926
+ },
927
+ {
928
+ "epoch": 0.1537042729787888,
929
+ "grad_norm": 0.36826872994416193,
930
+ "learning_rate": 1.982973099683902e-05,
931
+ "loss": 0.2657,
932
+ "step": 125
933
+ },
934
+ {
935
+ "epoch": 0.15493390716261912,
936
+ "grad_norm": 0.26692338354049955,
937
+ "learning_rate": 1.98217433065652e-05,
938
+ "loss": 0.1732,
939
+ "step": 126
940
+ },
941
+ {
942
+ "epoch": 0.15616354134644944,
943
+ "grad_norm": 0.2573459669567664,
944
+ "learning_rate": 1.9813574209767013e-05,
945
+ "loss": 0.2586,
946
+ "step": 127
947
+ },
948
+ {
949
+ "epoch": 0.15739317553027973,
950
+ "grad_norm": 0.3781515395257974,
951
+ "learning_rate": 1.9805223857326794e-05,
952
+ "loss": 0.2687,
953
+ "step": 128
954
+ },
955
+ {
956
+ "epoch": 0.15862280971411005,
957
+ "grad_norm": 0.2609213888795932,
958
+ "learning_rate": 1.9796692403474632e-05,
959
+ "loss": 0.243,
960
+ "step": 129
961
+ },
962
+ {
963
+ "epoch": 0.15985244389794037,
964
+ "grad_norm": 0.33148152190355884,
965
+ "learning_rate": 1.9787980005785553e-05,
966
+ "loss": 0.284,
967
+ "step": 130
968
+ },
969
+ {
970
+ "epoch": 0.16108207808177066,
971
+ "grad_norm": 0.24671664914766978,
972
+ "learning_rate": 1.977908682517658e-05,
973
+ "loss": 0.2743,
974
+ "step": 131
975
+ },
976
+ {
977
+ "epoch": 0.16231171226560098,
978
+ "grad_norm": 0.23136891800428114,
979
+ "learning_rate": 1.9770013025903797e-05,
980
+ "loss": 0.1988,
981
+ "step": 132
982
+ },
983
+ {
984
+ "epoch": 0.1635413464494313,
985
+ "grad_norm": 0.3625913614143042,
986
+ "learning_rate": 1.9760758775559275e-05,
987
+ "loss": 0.3571,
988
+ "step": 133
989
+ },
990
+ {
991
+ "epoch": 0.1647709806332616,
992
+ "grad_norm": 0.4654838486463352,
993
+ "learning_rate": 1.9751324245068008e-05,
994
+ "loss": 0.2483,
995
+ "step": 134
996
+ },
997
+ {
998
+ "epoch": 0.1660006148170919,
999
+ "grad_norm": 0.23389863720470436,
1000
+ "learning_rate": 1.974170960868474e-05,
1001
+ "loss": 0.2121,
1002
+ "step": 135
1003
+ },
1004
+ {
1005
+ "epoch": 0.16723024900092223,
1006
+ "grad_norm": 0.26709730418277217,
1007
+ "learning_rate": 1.973191504399076e-05,
1008
+ "loss": 0.2277,
1009
+ "step": 136
1010
+ },
1011
+ {
1012
+ "epoch": 0.16845988318475252,
1013
+ "grad_norm": 0.3065908377026066,
1014
+ "learning_rate": 1.97219407318906e-05,
1015
+ "loss": 0.2811,
1016
+ "step": 137
1017
+ },
1018
+ {
1019
+ "epoch": 0.16968951736858284,
1020
+ "grad_norm": 0.2561047976359151,
1021
+ "learning_rate": 1.9711786856608714e-05,
1022
+ "loss": 0.2702,
1023
+ "step": 138
1024
+ },
1025
+ {
1026
+ "epoch": 0.17091915155241316,
1027
+ "grad_norm": 0.3029014908375809,
1028
+ "learning_rate": 1.970145360568607e-05,
1029
+ "loss": 0.2703,
1030
+ "step": 139
1031
+ },
1032
+ {
1033
+ "epoch": 0.17214878573624345,
1034
+ "grad_norm": 0.2928125726746547,
1035
+ "learning_rate": 1.969094116997668e-05,
1036
+ "loss": 0.2925,
1037
+ "step": 140
1038
+ },
1039
+ {
1040
+ "epoch": 0.17337841992007377,
1041
+ "grad_norm": 0.417971783345298,
1042
+ "learning_rate": 1.968024974364408e-05,
1043
+ "loss": 0.2496,
1044
+ "step": 141
1045
+ },
1046
+ {
1047
+ "epoch": 0.1746080541039041,
1048
+ "grad_norm": 0.2496018935199012,
1049
+ "learning_rate": 1.9669379524157755e-05,
1050
+ "loss": 0.2279,
1051
+ "step": 142
1052
+ },
1053
+ {
1054
+ "epoch": 0.1758376882877344,
1055
+ "grad_norm": 0.3714257892665527,
1056
+ "learning_rate": 1.9658330712289456e-05,
1057
+ "loss": 0.295,
1058
+ "step": 143
1059
+ },
1060
+ {
1061
+ "epoch": 0.1770673224715647,
1062
+ "grad_norm": 0.2580656993421598,
1063
+ "learning_rate": 1.9647103512109535e-05,
1064
+ "loss": 0.253,
1065
+ "step": 144
1066
+ },
1067
+ {
1068
+ "epoch": 0.17829695665539502,
1069
+ "grad_norm": 0.2560142798427468,
1070
+ "learning_rate": 1.9635698130983153e-05,
1071
+ "loss": 0.251,
1072
+ "step": 145
1073
+ },
1074
+ {
1075
+ "epoch": 0.17952659083922534,
1076
+ "grad_norm": 0.2607452518822491,
1077
+ "learning_rate": 1.962411477956645e-05,
1078
+ "loss": 0.2395,
1079
+ "step": 146
1080
+ },
1081
+ {
1082
+ "epoch": 0.18075622502305563,
1083
+ "grad_norm": 0.25191931122780475,
1084
+ "learning_rate": 1.9612353671802658e-05,
1085
+ "loss": 0.2389,
1086
+ "step": 147
1087
+ },
1088
+ {
1089
+ "epoch": 0.18198585920688595,
1090
+ "grad_norm": 0.2991928933187448,
1091
+ "learning_rate": 1.960041502491815e-05,
1092
+ "loss": 0.3016,
1093
+ "step": 148
1094
+ },
1095
+ {
1096
+ "epoch": 0.18321549339071627,
1097
+ "grad_norm": 0.2775853628798545,
1098
+ "learning_rate": 1.9588299059418434e-05,
1099
+ "loss": 0.2558,
1100
+ "step": 149
1101
+ },
1102
+ {
1103
+ "epoch": 0.18444512757454656,
1104
+ "grad_norm": 0.25029731861303234,
1105
+ "learning_rate": 1.957600599908406e-05,
1106
+ "loss": 0.221,
1107
+ "step": 150
1108
+ },
1109
+ {
1110
+ "epoch": 0.18567476175837688,
1111
+ "grad_norm": 0.3473969966642877,
1112
+ "learning_rate": 1.9563536070966513e-05,
1113
+ "loss": 0.3639,
1114
+ "step": 151
1115
+ },
1116
+ {
1117
+ "epoch": 0.1869043959422072,
1118
+ "grad_norm": 0.21700372590221412,
1119
+ "learning_rate": 1.9550889505383996e-05,
1120
+ "loss": 0.2122,
1121
+ "step": 152
1122
+ },
1123
+ {
1124
+ "epoch": 0.1881340301260375,
1125
+ "grad_norm": 0.26057622594959473,
1126
+ "learning_rate": 1.9538066535917196e-05,
1127
+ "loss": 0.2631,
1128
+ "step": 153
1129
+ },
1130
+ {
1131
+ "epoch": 0.1893636643098678,
1132
+ "grad_norm": 0.31462240972233546,
1133
+ "learning_rate": 1.952506739940496e-05,
1134
+ "loss": 0.2576,
1135
+ "step": 154
1136
+ },
1137
+ {
1138
+ "epoch": 0.19059329849369813,
1139
+ "grad_norm": 0.26307248972641967,
1140
+ "learning_rate": 1.9511892335939904e-05,
1141
+ "loss": 0.2419,
1142
+ "step": 155
1143
+ },
1144
+ {
1145
+ "epoch": 0.19182293267752842,
1146
+ "grad_norm": 0.3849365062506917,
1147
+ "learning_rate": 1.9498541588864022e-05,
1148
+ "loss": 0.2316,
1149
+ "step": 156
1150
+ },
1151
+ {
1152
+ "epoch": 0.19305256686135874,
1153
+ "grad_norm": 0.2858572602811596,
1154
+ "learning_rate": 1.948501540476414e-05,
1155
+ "loss": 0.2242,
1156
+ "step": 157
1157
+ },
1158
+ {
1159
+ "epoch": 0.19428220104518906,
1160
+ "grad_norm": 0.20651885971329154,
1161
+ "learning_rate": 1.9471314033467413e-05,
1162
+ "loss": 0.2597,
1163
+ "step": 158
1164
+ },
1165
+ {
1166
+ "epoch": 0.19551183522901935,
1167
+ "grad_norm": 0.24219070077131208,
1168
+ "learning_rate": 1.945743772803666e-05,
1169
+ "loss": 0.1932,
1170
+ "step": 159
1171
+ },
1172
+ {
1173
+ "epoch": 0.19674146941284967,
1174
+ "grad_norm": 0.2870212902155916,
1175
+ "learning_rate": 1.9443386744765726e-05,
1176
+ "loss": 0.256,
1177
+ "step": 160
1178
+ },
1179
+ {
1180
+ "epoch": 0.19674146941284967,
1181
+ "eval_accuracy": 0.8021390374331551,
1182
+ "eval_f1": 0.4931506849315068,
1183
+ "eval_loss": 0.43281251192092896,
1184
+ "eval_precision": 0.782608695652174,
1185
+ "eval_recall": 0.36,
1186
+ "eval_runtime": 23.7371,
1187
+ "eval_samples_per_second": 2.106,
1188
+ "eval_steps_per_second": 0.169,
1189
+ "step": 160
1190
+ },
1191
+ {
1192
+ "epoch": 0.19797110359668,
1193
+ "grad_norm": 0.2614284036745401,
1194
+ "learning_rate": 1.942916134317473e-05,
1195
+ "loss": 0.2436,
1196
+ "step": 161
1197
+ },
1198
+ {
1199
+ "epoch": 0.1992007377805103,
1200
+ "grad_norm": 0.26937001401772626,
1201
+ "learning_rate": 1.9414761786005293e-05,
1202
+ "loss": 0.1725,
1203
+ "step": 162
1204
+ },
1205
+ {
1206
+ "epoch": 0.2004303719643406,
1207
+ "grad_norm": 0.28202676187309017,
1208
+ "learning_rate": 1.9400188339215657e-05,
1209
+ "loss": 0.2591,
1210
+ "step": 163
1211
+ },
1212
+ {
1213
+ "epoch": 0.20166000614817092,
1214
+ "grad_norm": 0.27016058532381143,
1215
+ "learning_rate": 1.9385441271975786e-05,
1216
+ "loss": 0.2003,
1217
+ "step": 164
1218
+ },
1219
+ {
1220
+ "epoch": 0.20288964033200124,
1221
+ "grad_norm": 0.20605906416711317,
1222
+ "learning_rate": 1.9370520856662406e-05,
1223
+ "loss": 0.1778,
1224
+ "step": 165
1225
+ },
1226
+ {
1227
+ "epoch": 0.20411927451583153,
1228
+ "grad_norm": 0.21687941485697337,
1229
+ "learning_rate": 1.9355427368853946e-05,
1230
+ "loss": 0.2145,
1231
+ "step": 166
1232
+ },
1233
+ {
1234
+ "epoch": 0.20534890869966185,
1235
+ "grad_norm": 0.4233260372619336,
1236
+ "learning_rate": 1.9340161087325483e-05,
1237
+ "loss": 0.1657,
1238
+ "step": 167
1239
+ },
1240
+ {
1241
+ "epoch": 0.20657854288349217,
1242
+ "grad_norm": 0.26680222767798134,
1243
+ "learning_rate": 1.932472229404356e-05,
1244
+ "loss": 0.1846,
1245
+ "step": 168
1246
+ },
1247
+ {
1248
+ "epoch": 0.20780817706732246,
1249
+ "grad_norm": 0.3380713604084801,
1250
+ "learning_rate": 1.9309111274161005e-05,
1251
+ "loss": 0.2896,
1252
+ "step": 169
1253
+ },
1254
+ {
1255
+ "epoch": 0.20903781125115278,
1256
+ "grad_norm": 0.3308557221991788,
1257
+ "learning_rate": 1.9293328316011645e-05,
1258
+ "loss": 0.2199,
1259
+ "step": 170
1260
+ },
1261
+ {
1262
+ "epoch": 0.2102674454349831,
1263
+ "grad_norm": 0.3136020181316013,
1264
+ "learning_rate": 1.927737371110499e-05,
1265
+ "loss": 0.213,
1266
+ "step": 171
1267
+ },
1268
+ {
1269
+ "epoch": 0.2114970796188134,
1270
+ "grad_norm": 0.35343090913220965,
1271
+ "learning_rate": 1.9261247754120846e-05,
1272
+ "loss": 0.2322,
1273
+ "step": 172
1274
+ },
1275
+ {
1276
+ "epoch": 0.2127267138026437,
1277
+ "grad_norm": 0.2824826725729394,
1278
+ "learning_rate": 1.924495074290388e-05,
1279
+ "loss": 0.2523,
1280
+ "step": 173
1281
+ },
1282
+ {
1283
+ "epoch": 0.21395634798647403,
1284
+ "grad_norm": 0.34041228997887535,
1285
+ "learning_rate": 1.92284829784581e-05,
1286
+ "loss": 0.2854,
1287
+ "step": 174
1288
+ },
1289
+ {
1290
+ "epoch": 0.21518598217030432,
1291
+ "grad_norm": 0.3318426940507861,
1292
+ "learning_rate": 1.9211844764941318e-05,
1293
+ "loss": 0.1669,
1294
+ "step": 175
1295
+ },
1296
+ {
1297
+ "epoch": 0.21641561635413464,
1298
+ "grad_norm": 0.3090750522717658,
1299
+ "learning_rate": 1.919503640965951e-05,
1300
+ "loss": 0.1843,
1301
+ "step": 176
1302
+ },
1303
+ {
1304
+ "epoch": 0.21764525053796496,
1305
+ "grad_norm": 0.297018364631407,
1306
+ "learning_rate": 1.917805822306117e-05,
1307
+ "loss": 0.2038,
1308
+ "step": 177
1309
+ },
1310
+ {
1311
+ "epoch": 0.21887488472179525,
1312
+ "grad_norm": 0.2679923976809244,
1313
+ "learning_rate": 1.916091051873154e-05,
1314
+ "loss": 0.14,
1315
+ "step": 178
1316
+ },
1317
+ {
1318
+ "epoch": 0.22010451890562557,
1319
+ "grad_norm": 0.3130518688020483,
1320
+ "learning_rate": 1.9143593613386845e-05,
1321
+ "loss": 0.1871,
1322
+ "step": 179
1323
+ },
1324
+ {
1325
+ "epoch": 0.2213341530894559,
1326
+ "grad_norm": 0.38588801765002845,
1327
+ "learning_rate": 1.9126107826868436e-05,
1328
+ "loss": 0.275,
1329
+ "step": 180
1330
+ },
1331
+ {
1332
+ "epoch": 0.22256378727328618,
1333
+ "grad_norm": 0.3157899959865116,
1334
+ "learning_rate": 1.9108453482136866e-05,
1335
+ "loss": 0.2098,
1336
+ "step": 181
1337
+ },
1338
+ {
1339
+ "epoch": 0.2237934214571165,
1340
+ "grad_norm": 0.3517837295222883,
1341
+ "learning_rate": 1.9090630905265963e-05,
1342
+ "loss": 0.2855,
1343
+ "step": 182
1344
+ },
1345
+ {
1346
+ "epoch": 0.22502305564094682,
1347
+ "grad_norm": 0.3092532190930959,
1348
+ "learning_rate": 1.9072640425436762e-05,
1349
+ "loss": 0.2278,
1350
+ "step": 183
1351
+ },
1352
+ {
1353
+ "epoch": 0.22625268982477714,
1354
+ "grad_norm": 0.38543714174275906,
1355
+ "learning_rate": 1.905448237493147e-05,
1356
+ "loss": 0.289,
1357
+ "step": 184
1358
+ },
1359
+ {
1360
+ "epoch": 0.22748232400860743,
1361
+ "grad_norm": 0.3569146643137087,
1362
+ "learning_rate": 1.9036157089127278e-05,
1363
+ "loss": 0.2716,
1364
+ "step": 185
1365
+ },
1366
+ {
1367
+ "epoch": 0.22871195819243775,
1368
+ "grad_norm": 0.40683148192292634,
1369
+ "learning_rate": 1.901766490649022e-05,
1370
+ "loss": 0.2983,
1371
+ "step": 186
1372
+ },
1373
+ {
1374
+ "epoch": 0.22994159237626807,
1375
+ "grad_norm": 0.4253940265898542,
1376
+ "learning_rate": 1.8999006168568883e-05,
1377
+ "loss": 0.2284,
1378
+ "step": 187
1379
+ },
1380
+ {
1381
+ "epoch": 0.23117122656009836,
1382
+ "grad_norm": 0.26669631723987736,
1383
+ "learning_rate": 1.8980181219988117e-05,
1384
+ "loss": 0.1757,
1385
+ "step": 188
1386
+ },
1387
+ {
1388
+ "epoch": 0.23240086074392868,
1389
+ "grad_norm": 0.2721222099304509,
1390
+ "learning_rate": 1.8961190408442662e-05,
1391
+ "loss": 0.2298,
1392
+ "step": 189
1393
+ },
1394
+ {
1395
+ "epoch": 0.233630494927759,
1396
+ "grad_norm": 0.36072785713712757,
1397
+ "learning_rate": 1.8942034084690727e-05,
1398
+ "loss": 0.2847,
1399
+ "step": 190
1400
+ },
1401
+ {
1402
+ "epoch": 0.2348601291115893,
1403
+ "grad_norm": 0.34669408899608534,
1404
+ "learning_rate": 1.8922712602547516e-05,
1405
+ "loss": 0.2453,
1406
+ "step": 191
1407
+ },
1408
+ {
1409
+ "epoch": 0.2360897632954196,
1410
+ "grad_norm": 0.3295391150787403,
1411
+ "learning_rate": 1.89032263188787e-05,
1412
+ "loss": 0.2633,
1413
+ "step": 192
1414
+ },
1415
+ {
1416
+ "epoch": 0.23731939747924993,
1417
+ "grad_norm": 0.2836153191531997,
1418
+ "learning_rate": 1.8883575593593793e-05,
1419
+ "loss": 0.2218,
1420
+ "step": 193
1421
+ },
1422
+ {
1423
+ "epoch": 0.23854903166308022,
1424
+ "grad_norm": 0.2680039523838409,
1425
+ "learning_rate": 1.8863760789639548e-05,
1426
+ "loss": 0.2422,
1427
+ "step": 194
1428
+ },
1429
+ {
1430
+ "epoch": 0.23977866584691054,
1431
+ "grad_norm": 0.18616539199723142,
1432
+ "learning_rate": 1.8843782272993225e-05,
1433
+ "loss": 0.1552,
1434
+ "step": 195
1435
+ },
1436
+ {
1437
+ "epoch": 0.24100830003074086,
1438
+ "grad_norm": 0.2810251721018552,
1439
+ "learning_rate": 1.8823640412655844e-05,
1440
+ "loss": 0.1982,
1441
+ "step": 196
1442
+ },
1443
+ {
1444
+ "epoch": 0.24223793421457115,
1445
+ "grad_norm": 0.2586309972920896,
1446
+ "learning_rate": 1.880333558064536e-05,
1447
+ "loss": 0.2115,
1448
+ "step": 197
1449
+ },
1450
+ {
1451
+ "epoch": 0.24346756839840147,
1452
+ "grad_norm": 0.2529365933222149,
1453
+ "learning_rate": 1.878286815198979e-05,
1454
+ "loss": 0.2142,
1455
+ "step": 198
1456
+ },
1457
+ {
1458
+ "epoch": 0.2446972025822318,
1459
+ "grad_norm": 0.376229567543972,
1460
+ "learning_rate": 1.876223850472032e-05,
1461
+ "loss": 0.2328,
1462
+ "step": 199
1463
+ },
1464
+ {
1465
+ "epoch": 0.24592683676606208,
1466
+ "grad_norm": 0.23504541587554245,
1467
+ "learning_rate": 1.8741447019864263e-05,
1468
+ "loss": 0.2062,
1469
+ "step": 200
1470
+ },
1471
+ {
1472
+ "epoch": 0.24592683676606208,
1473
+ "eval_accuracy": 0.786096256684492,
1474
+ "eval_f1": 0.42857142857142855,
1475
+ "eval_loss": 0.46992188692092896,
1476
+ "eval_precision": 0.75,
1477
+ "eval_recall": 0.3,
1478
+ "eval_runtime": 23.5849,
1479
+ "eval_samples_per_second": 2.12,
1480
+ "eval_steps_per_second": 0.17,
1481
+ "step": 200
1482
  }
1483
  ],
1484
  "logging_steps": 1,
 
@@ -762,7 +1498,7 @@
  "attributes": {}
  }
  },
- "total_flos": 98733826211840.0,
+ "total_flos": 197328810967040.0,
  "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null