hawkling committed on
Commit 165e556 · verified · 1 Parent(s): 46407f6

Model save

Files changed (1): trainer_state.json (+30 -30)
trainer_state.json CHANGED
@@ -10,175 +10,175 @@
   "log_history": [
     {
       "epoch": 0.04,
-      "grad_norm": 3.7410537349353388,
+      "grad_norm": 2.97260032828452,
       "learning_rate": 2e-05,
       "loss": 3.2188,
       "step": 1
     },
     {
       "epoch": 0.08,
-      "grad_norm": 3.724477625112808,
+      "grad_norm": 3.594459617966852,
       "learning_rate": 2e-05,
       "loss": 3.2031,
       "step": 2
     },
     {
       "epoch": 0.12,
-      "grad_norm": 4.847707450779626,
+      "grad_norm": 4.533624625545066,
       "learning_rate": 2e-05,
       "loss": 3.4062,
       "step": 3
     },
     {
       "epoch": 0.16,
-      "grad_norm": 5.609117941274957,
+      "grad_norm": 5.240356703299346,
       "learning_rate": 2e-05,
       "loss": 3.3906,
       "step": 4
     },
     {
       "epoch": 0.2,
-      "grad_norm": 5.086957233893569,
+      "grad_norm": 5.560506359364048,
       "learning_rate": 2e-05,
       "loss": 3.0156,
       "step": 5
     },
     {
       "epoch": 0.24,
-      "grad_norm": 5.227277254140017,
+      "grad_norm": 4.893164995962358,
       "learning_rate": 2e-05,
       "loss": 3.4844,
       "step": 6
     },
     {
       "epoch": 0.28,
-      "grad_norm": 4.351893594691178,
+      "grad_norm": 3.9500718110988577,
       "learning_rate": 2e-05,
       "loss": 3.3438,
       "step": 7
     },
     {
       "epoch": 0.32,
-      "grad_norm": 4.143312345927856,
+      "grad_norm": 3.769524663807585,
       "learning_rate": 2e-05,
       "loss": 3.2656,
       "step": 8
     },
     {
       "epoch": 0.36,
-      "grad_norm": 5.095207786632339,
+      "grad_norm": 4.641711022263081,
       "learning_rate": 2e-05,
       "loss": 3.5625,
       "step": 9
     },
     {
       "epoch": 0.4,
-      "grad_norm": 5.292856607844618,
+      "grad_norm": 5.045996826645826,
       "learning_rate": 2e-05,
       "loss": 3.3906,
       "step": 10
     },
     {
       "epoch": 0.44,
-      "grad_norm": 3.9969754196981433,
+      "grad_norm": 3.9379891091293446,
       "learning_rate": 2e-05,
       "loss": 3.5,
       "step": 11
     },
     {
       "epoch": 0.48,
-      "grad_norm": 6.349497527343764,
+      "grad_norm": 5.935094879674494,
       "learning_rate": 2e-05,
-      "loss": 3.0469,
+      "loss": 3.0312,
       "step": 12
     },
     {
       "epoch": 0.52,
-      "grad_norm": 5.2510974905297,
+      "grad_norm": 4.966274615752918,
       "learning_rate": 2e-05,
       "loss": 3.2031,
       "step": 13
     },
     {
       "epoch": 0.56,
-      "grad_norm": 4.472101838754883,
+      "grad_norm": 4.115242373102183,
       "learning_rate": 2e-05,
       "loss": 3.1875,
       "step": 14
     },
     {
       "epoch": 0.6,
-      "grad_norm": 4.03317126613206,
+      "grad_norm": 3.8744700920947444,
       "learning_rate": 2e-05,
       "loss": 3.3906,
       "step": 15
     },
     {
       "epoch": 0.64,
-      "grad_norm": 3.1679359694764315,
+      "grad_norm": 3.1817540324181763,
       "learning_rate": 2e-05,
       "loss": 3.2656,
       "step": 16
     },
     {
       "epoch": 0.68,
-      "grad_norm": 3.2017940861615557,
+      "grad_norm": 3.2254117611390924,
       "learning_rate": 2e-05,
       "loss": 3.0781,
       "step": 17
     },
     {
       "epoch": 0.72,
-      "grad_norm": 4.001252869474549,
+      "grad_norm": 4.082116872038495,
       "learning_rate": 2e-05,
       "loss": 3.3906,
       "step": 18
     },
     {
       "epoch": 0.76,
-      "grad_norm": 5.765014522502107,
+      "grad_norm": 5.513160685623603,
       "learning_rate": 2e-05,
       "loss": 3.0469,
       "step": 19
     },
     {
       "epoch": 0.8,
-      "grad_norm": 5.4796650093828685,
+      "grad_norm": 5.2179385595071155,
       "learning_rate": 2e-05,
       "loss": 3.2031,
       "step": 20
     },
     {
       "epoch": 0.84,
-      "grad_norm": 5.915029810230642,
+      "grad_norm": 5.4603287380267,
       "learning_rate": 2e-05,
       "loss": 3.5312,
       "step": 21
     },
     {
       "epoch": 0.88,
-      "grad_norm": 4.775032664581898,
+      "grad_norm": 4.596875242645009,
       "learning_rate": 2e-05,
       "loss": 3.0469,
       "step": 22
     },
     {
       "epoch": 0.92,
-      "grad_norm": 4.653673104315309,
+      "grad_norm": 4.546647529822756,
       "learning_rate": 2e-05,
       "loss": 3.0469,
       "step": 23
     },
     {
       "epoch": 0.96,
-      "grad_norm": 5.417622869264991,
+      "grad_norm": 5.146569623435821,
       "learning_rate": 2e-05,
       "loss": 3.2344,
       "step": 24
     },
     {
       "epoch": 1.0,
-      "grad_norm": 7.2444031678746645,
+      "grad_norm": 6.727524092488977,
       "learning_rate": 2e-05,
       "loss": 3.1562,
       "step": 25
@@ -187,10 +187,10 @@
       "epoch": 1.0,
       "step": 25,
       "total_flos": 478414897152.0,
-      "train_loss": 3.264375,
-      "train_runtime": 88.1114,
-      "train_samples_per_second": 2.236,
-      "train_steps_per_second": 0.284
+      "train_loss": 3.26375,
+      "train_runtime": 93.9312,
+      "train_samples_per_second": 2.097,
+      "train_steps_per_second": 0.266
     }
   ],
   "logging_steps": 1.0,