File size: 26,959 Bytes
f1cea8d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.0,
  "eval_steps": 100,
  "global_step": 355,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0,
      "grad_norm": 2.158629831791895,
      "learning_rate": 1.3888888888888887e-08,
      "logits/chosen": -2.804708957672119,
      "logits/rejected": -2.8150453567504883,
      "logps/chosen": -217.97438049316406,
      "logps/rejected": -216.58865356445312,
      "loss": 0.6931,
      "rewards/accuracies": 0.0,
      "rewards/chosen": 0.0,
      "rewards/margins": 0.0,
      "rewards/margins_max": 0.0,
      "rewards/margins_min": 0.0,
      "rewards/margins_std": 0.0,
      "rewards/rejected": 0.0,
      "step": 1
    },
    {
      "epoch": 0.03,
      "grad_norm": 8.21578821692664,
      "learning_rate": 1.3888888888888888e-07,
      "logits/chosen": -2.8844423294067383,
      "logits/rejected": -2.799159526824951,
      "logps/chosen": -366.7507629394531,
      "logps/rejected": -275.4356384277344,
      "loss": 0.6932,
      "rewards/accuracies": 0.4027777910232544,
      "rewards/chosen": -0.00016000178584363312,
      "rewards/margins": -0.0003212409501429647,
      "rewards/margins_max": 0.002417487557977438,
      "rewards/margins_min": -0.004127700813114643,
      "rewards/margins_std": 0.0029805246740579605,
      "rewards/rejected": 0.00016123917885124683,
      "step": 10
    },
    {
      "epoch": 0.06,
      "grad_norm": 2.0515925664507124,
      "learning_rate": 2.7777777777777776e-07,
      "logits/chosen": -2.74739670753479,
      "logits/rejected": -2.6935513019561768,
      "logps/chosen": -329.11138916015625,
      "logps/rejected": -216.7494659423828,
      "loss": 0.6929,
      "rewards/accuracies": 0.625,
      "rewards/chosen": 0.00035773837589658797,
      "rewards/margins": 0.00041456689359620214,
      "rewards/margins_max": 0.0031081512570381165,
      "rewards/margins_min": -0.002314414829015732,
      "rewards/margins_std": 0.0023837233893573284,
      "rewards/rejected": -5.682848859578371e-05,
      "step": 20
    },
    {
      "epoch": 0.08,
      "grad_norm": 2.310838256933408,
      "learning_rate": 4.1666666666666667e-07,
      "logits/chosen": -2.835951328277588,
      "logits/rejected": -2.7546262741088867,
      "logps/chosen": -329.10211181640625,
      "logps/rejected": -233.05508422851562,
      "loss": 0.6917,
      "rewards/accuracies": 0.75,
      "rewards/chosen": 0.0018310332670807838,
      "rewards/margins": 0.002559047192335129,
      "rewards/margins_max": 0.006475468166172504,
      "rewards/margins_min": -0.0006590075790882111,
      "rewards/margins_std": 0.0032629654742777348,
      "rewards/rejected": -0.0007280135178007185,
      "step": 30
    },
    {
      "epoch": 0.11,
      "grad_norm": 1.9718465521886885,
      "learning_rate": 4.998060489154965e-07,
      "logits/chosen": -2.8140110969543457,
      "logits/rejected": -2.76088285446167,
      "logps/chosen": -285.4794006347656,
      "logps/rejected": -227.7167205810547,
      "loss": 0.6905,
      "rewards/accuracies": 0.800000011920929,
      "rewards/chosen": 0.0035794698633253574,
      "rewards/margins": 0.0046408590860664845,
      "rewards/margins_max": 0.011065036058425903,
      "rewards/margins_min": -0.0008336328901350498,
      "rewards/margins_std": 0.005423419643193483,
      "rewards/rejected": -0.0010613898048177361,
      "step": 40
    },
    {
      "epoch": 0.14,
      "grad_norm": 2.3429157043740894,
      "learning_rate": 4.976275538042932e-07,
      "logits/chosen": -2.813694477081299,
      "logits/rejected": -2.7310705184936523,
      "logps/chosen": -317.00640869140625,
      "logps/rejected": -234.43209838867188,
      "loss": 0.688,
      "rewards/accuracies": 0.887499988079071,
      "rewards/chosen": 0.007789776660501957,
      "rewards/margins": 0.010368594899773598,
      "rewards/margins_max": 0.021328028291463852,
      "rewards/margins_min": 0.0012870692880824208,
      "rewards/margins_std": 0.009302936494350433,
      "rewards/rejected": -0.00257881754077971,
      "step": 50
    },
    {
      "epoch": 0.17,
      "grad_norm": 2.326595034093221,
      "learning_rate": 4.930493069997119e-07,
      "logits/chosen": -2.7512717247009277,
      "logits/rejected": -2.7030184268951416,
      "logps/chosen": -343.24273681640625,
      "logps/rejected": -264.2438049316406,
      "loss": 0.6845,
      "rewards/accuracies": 0.925000011920929,
      "rewards/chosen": 0.015377024188637733,
      "rewards/margins": 0.01811446249485016,
      "rewards/margins_max": 0.03716810420155525,
      "rewards/margins_min": 0.003869078354910016,
      "rewards/margins_std": 0.01510803122073412,
      "rewards/rejected": -0.0027374387718737125,
      "step": 60
    },
    {
      "epoch": 0.2,
      "grad_norm": 1.802371962995753,
      "learning_rate": 4.861156761634013e-07,
      "logits/chosen": -2.8008124828338623,
      "logits/rejected": -2.7141239643096924,
      "logps/chosen": -360.14227294921875,
      "logps/rejected": -237.1912841796875,
      "loss": 0.6809,
      "rewards/accuracies": 0.9375,
      "rewards/chosen": 0.02196549065411091,
      "rewards/margins": 0.026430394500494003,
      "rewards/margins_max": 0.05226575583219528,
      "rewards/margins_min": 0.005380354821681976,
      "rewards/margins_std": 0.02180148847401142,
      "rewards/rejected": -0.0044649080373346806,
      "step": 70
    },
    {
      "epoch": 0.23,
      "grad_norm": 2.0226092318731266,
      "learning_rate": 4.768938549177392e-07,
      "logits/chosen": -2.842362403869629,
      "logits/rejected": -2.778277635574341,
      "logps/chosen": -329.4476318359375,
      "logps/rejected": -288.3177795410156,
      "loss": 0.6774,
      "rewards/accuracies": 0.9624999761581421,
      "rewards/chosen": 0.02400829829275608,
      "rewards/margins": 0.03225512057542801,
      "rewards/margins_max": 0.06589716672897339,
      "rewards/margins_min": 0.006357196718454361,
      "rewards/margins_std": 0.027716059237718582,
      "rewards/rejected": -0.008246822282671928,
      "step": 80
    },
    {
      "epoch": 0.25,
      "grad_norm": 2.439721586727848,
      "learning_rate": 4.654732116743193e-07,
      "logits/chosen": -2.7840921878814697,
      "logits/rejected": -2.700878620147705,
      "logps/chosen": -336.05194091796875,
      "logps/rejected": -200.1630096435547,
      "loss": 0.672,
      "rewards/accuracies": 0.949999988079071,
      "rewards/chosen": 0.029071927070617676,
      "rewards/margins": 0.04093674570322037,
      "rewards/margins_max": 0.08617201447486877,
      "rewards/margins_min": 0.006826425436884165,
      "rewards/margins_std": 0.0362737737596035,
      "rewards/rejected": -0.011864816769957542,
      "step": 90
    },
    {
      "epoch": 0.28,
      "grad_norm": 2.2225465127306605,
      "learning_rate": 4.519644235671752e-07,
      "logits/chosen": -2.8582470417022705,
      "logits/rejected": -2.7655489444732666,
      "logps/chosen": -342.58416748046875,
      "logps/rejected": -265.08441162109375,
      "loss": 0.6666,
      "rewards/accuracies": 0.862500011920929,
      "rewards/chosen": 0.037609733641147614,
      "rewards/margins": 0.050220172852277756,
      "rewards/margins_max": 0.10150803625583649,
      "rewards/margins_min": 0.007549063768237829,
      "rewards/margins_std": 0.0440022274851799,
      "rewards/rejected": -0.01261043269187212,
      "step": 100
    },
    {
      "epoch": 0.28,
      "eval_logits/chosen": -2.7978174686431885,
      "eval_logits/rejected": -2.7595677375793457,
      "eval_logps/chosen": -285.2066650390625,
      "eval_logps/rejected": -259.86334228515625,
      "eval_loss": 0.6906961798667908,
      "eval_rewards/accuracies": 0.578000009059906,
      "eval_rewards/chosen": -0.00613220501691103,
      "eval_rewards/margins": 0.006711836438626051,
      "eval_rewards/margins_max": 0.04891812801361084,
      "eval_rewards/margins_min": -0.02950645610690117,
      "eval_rewards/margins_std": 0.025911005213856697,
      "eval_rewards/rejected": -0.012844040989875793,
      "eval_runtime": 428.4446,
      "eval_samples_per_second": 4.668,
      "eval_steps_per_second": 0.292,
      "step": 100
    },
    {
      "epoch": 0.31,
      "grad_norm": 2.4760394100510057,
      "learning_rate": 4.364984038837727e-07,
      "logits/chosen": -2.8690743446350098,
      "logits/rejected": -2.7577908039093018,
      "logps/chosen": -385.70233154296875,
      "logps/rejected": -288.461669921875,
      "loss": 0.6591,
      "rewards/accuracies": 0.9375,
      "rewards/chosen": 0.0566016249358654,
      "rewards/margins": 0.06779664754867554,
      "rewards/margins_max": 0.13572999835014343,
      "rewards/margins_min": 0.010938728228211403,
      "rewards/margins_std": 0.05764765292406082,
      "rewards/rejected": -0.011195014230906963,
      "step": 110
    },
    {
      "epoch": 0.34,
      "grad_norm": 2.035920276115207,
      "learning_rate": 4.1922503338800447e-07,
      "logits/chosen": -2.8610854148864746,
      "logits/rejected": -2.7858219146728516,
      "logps/chosen": -387.9818115234375,
      "logps/rejected": -267.68585205078125,
      "loss": 0.657,
      "rewards/accuracies": 0.925000011920929,
      "rewards/chosen": 0.06763629615306854,
      "rewards/margins": 0.07911892235279083,
      "rewards/margins_max": 0.16764040291309357,
      "rewards/margins_min": 0.013401249423623085,
      "rewards/margins_std": 0.07113669812679291,
      "rewards/rejected": -0.01148262806236744,
      "step": 120
    },
    {
      "epoch": 0.37,
      "grad_norm": 2.010676971608138,
      "learning_rate": 4.003117078299021e-07,
      "logits/chosen": -2.818753957748413,
      "logits/rejected": -2.741856098175049,
      "logps/chosen": -396.28985595703125,
      "logps/rejected": -302.45050048828125,
      "loss": 0.6454,
      "rewards/accuracies": 0.9750000238418579,
      "rewards/chosen": 0.08936750888824463,
      "rewards/margins": 0.10461701452732086,
      "rewards/margins_max": 0.20179173350334167,
      "rewards/margins_min": 0.02413741685450077,
      "rewards/margins_std": 0.08073713630437851,
      "rewards/rejected": -0.015249502845108509,
      "step": 130
    },
    {
      "epoch": 0.39,
      "grad_norm": 1.7425219980216828,
      "learning_rate": 3.799417157181075e-07,
      "logits/chosen": -2.7920029163360596,
      "logits/rejected": -2.7359843254089355,
      "logps/chosen": -364.29058837890625,
      "logps/rejected": -272.58355712890625,
      "loss": 0.6467,
      "rewards/accuracies": 0.9125000238418579,
      "rewards/chosen": 0.08406248688697815,
      "rewards/margins": 0.10730169713497162,
      "rewards/margins_max": 0.22186696529388428,
      "rewards/margins_min": 0.012349050492048264,
      "rewards/margins_std": 0.09653683751821518,
      "rewards/rejected": -0.02323923259973526,
      "step": 140
    },
    {
      "epoch": 0.42,
      "grad_norm": 2.0933384277297957,
      "learning_rate": 3.583124620760659e-07,
      "logits/chosen": -2.825629711151123,
      "logits/rejected": -2.7282826900482178,
      "logps/chosen": -315.4014892578125,
      "logps/rejected": -216.2842254638672,
      "loss": 0.6435,
      "rewards/accuracies": 0.9375,
      "rewards/chosen": 0.07449642568826675,
      "rewards/margins": 0.09953755140304565,
      "rewards/margins_max": 0.21898682415485382,
      "rewards/margins_min": 0.014027351513504982,
      "rewards/margins_std": 0.09459034353494644,
      "rewards/rejected": -0.0250411219894886,
      "step": 150
    },
    {
      "epoch": 0.45,
      "grad_norm": 1.769669348441601,
      "learning_rate": 3.356335553954679e-07,
      "logits/chosen": -2.74135684967041,
      "logits/rejected": -2.6822197437286377,
      "logps/chosen": -335.69464111328125,
      "logps/rejected": -237.88046264648438,
      "loss": 0.6336,
      "rewards/accuracies": 0.9750000238418579,
      "rewards/chosen": 0.09816019237041473,
      "rewards/margins": 0.1330377608537674,
      "rewards/margins_max": 0.2625694274902344,
      "rewards/margins_min": 0.02169904112815857,
      "rewards/margins_std": 0.1116378903388977,
      "rewards/rejected": -0.03487757220864296,
      "step": 160
    },
    {
      "epoch": 0.48,
      "grad_norm": 1.8260362870579057,
      "learning_rate": 3.121247763262235e-07,
      "logits/chosen": -2.8216443061828613,
      "logits/rejected": -2.7401599884033203,
      "logps/chosen": -364.33587646484375,
      "logps/rejected": -299.15887451171875,
      "loss": 0.635,
      "rewards/accuracies": 0.887499988079071,
      "rewards/chosen": 0.11069444566965103,
      "rewards/margins": 0.1395573914051056,
      "rewards/margins_max": 0.2983313202857971,
      "rewards/margins_min": 0.007289635483175516,
      "rewards/margins_std": 0.13639435172080994,
      "rewards/rejected": -0.02886294387280941,
      "step": 170
    },
    {
      "epoch": 0.51,
      "grad_norm": 2.082827875491237,
      "learning_rate": 2.880139477883347e-07,
      "logits/chosen": -2.789100408554077,
      "logits/rejected": -2.700629949569702,
      "logps/chosen": -339.28125,
      "logps/rejected": -296.9674377441406,
      "loss": 0.6302,
      "rewards/accuracies": 0.9125000238418579,
      "rewards/chosen": 0.08692200481891632,
      "rewards/margins": 0.11842750012874603,
      "rewards/margins_max": 0.23567883670330048,
      "rewards/margins_min": 0.011810391210019588,
      "rewards/margins_std": 0.10012297332286835,
      "rewards/rejected": -0.03150549530982971,
      "step": 180
    },
    {
      "epoch": 0.54,
      "grad_norm": 2.575609586836664,
      "learning_rate": 2.635347271463544e-07,
      "logits/chosen": -2.787972927093506,
      "logits/rejected": -2.6533846855163574,
      "logps/chosen": -349.08880615234375,
      "logps/rejected": -242.5450897216797,
      "loss": 0.6257,
      "rewards/accuracies": 0.987500011920929,
      "rewards/chosen": 0.10282168537378311,
      "rewards/margins": 0.14908696711063385,
      "rewards/margins_max": 0.28802552819252014,
      "rewards/margins_min": 0.025781046599149704,
      "rewards/margins_std": 0.1190432757139206,
      "rewards/rejected": -0.04626528546214104,
      "step": 190
    },
    {
      "epoch": 0.56,
      "grad_norm": 2.049618345880406,
      "learning_rate": 2.3892434184240534e-07,
      "logits/chosen": -2.857001543045044,
      "logits/rejected": -2.7506966590881348,
      "logps/chosen": -387.255126953125,
      "logps/rejected": -270.194091796875,
      "loss": 0.6251,
      "rewards/accuracies": 0.9750000238418579,
      "rewards/chosen": 0.11739673465490341,
      "rewards/margins": 0.1590125858783722,
      "rewards/margins_max": 0.32226094603538513,
      "rewards/margins_min": 0.03523118048906326,
      "rewards/margins_std": 0.12896928191184998,
      "rewards/rejected": -0.04161586984992027,
      "step": 200
    },
    {
      "epoch": 0.56,
      "eval_logits/chosen": -2.769979953765869,
      "eval_logits/rejected": -2.7318813800811768,
      "eval_logps/chosen": -288.19482421875,
      "eval_logps/rejected": -263.79888916015625,
      "eval_loss": 0.6876310110092163,
      "eval_rewards/accuracies": 0.5870000123977661,
      "eval_rewards/chosen": -0.03601397946476936,
      "eval_rewards/margins": 0.016185704618692398,
      "eval_rewards/margins_max": 0.11952462792396545,
      "eval_rewards/margins_min": -0.07521206140518188,
      "eval_rewards/margins_std": 0.0641048476099968,
      "eval_rewards/rejected": -0.05219968408346176,
      "eval_runtime": 427.8872,
      "eval_samples_per_second": 4.674,
      "eval_steps_per_second": 0.292,
      "step": 200
    },
    {
      "epoch": 0.59,
      "grad_norm": 1.9845466839870691,
      "learning_rate": 2.1442129043167873e-07,
      "logits/chosen": -2.751984119415283,
      "logits/rejected": -2.6815638542175293,
      "logps/chosen": -344.3485412597656,
      "logps/rejected": -262.97393798828125,
      "loss": 0.6188,
      "rewards/accuracies": 0.9750000238418579,
      "rewards/chosen": 0.10976312309503555,
      "rewards/margins": 0.16779468953609467,
      "rewards/margins_max": 0.34073972702026367,
      "rewards/margins_min": 0.03692127764225006,
      "rewards/margins_std": 0.14643600583076477,
      "rewards/rejected": -0.05803157761693001,
      "step": 210
    },
    {
      "epoch": 0.62,
      "grad_norm": 2.0373527988549145,
      "learning_rate": 1.9026303129961048e-07,
      "logits/chosen": -2.8502397537231445,
      "logits/rejected": -2.7268834114074707,
      "logps/chosen": -393.9187927246094,
      "logps/rejected": -280.2196960449219,
      "loss": 0.6142,
      "rewards/accuracies": 0.9375,
      "rewards/chosen": 0.13092514872550964,
      "rewards/margins": 0.18590331077575684,
      "rewards/margins_max": 0.3502216637134552,
      "rewards/margins_min": 0.03089449368417263,
      "rewards/margins_std": 0.14906269311904907,
      "rewards/rejected": -0.0549781434237957,
      "step": 220
    },
    {
      "epoch": 0.65,
      "grad_norm": 2.1828985591845402,
      "learning_rate": 1.6668368145931396e-07,
      "logits/chosen": -2.875049114227295,
      "logits/rejected": -2.744711399078369,
      "logps/chosen": -390.4495849609375,
      "logps/rejected": -268.98565673828125,
      "loss": 0.6067,
      "rewards/accuracies": 0.949999988079071,
      "rewards/chosen": 0.12363851070404053,
      "rewards/margins": 0.17946995794773102,
      "rewards/margins_max": 0.34318283200263977,
      "rewards/margins_min": 0.036444298923015594,
      "rewards/margins_std": 0.13844837248325348,
      "rewards/rejected": -0.05583144351840019,
      "step": 230
    },
    {
      "epoch": 0.68,
      "grad_norm": 1.8193698783653034,
      "learning_rate": 1.4391174773015834e-07,
      "logits/chosen": -2.802640199661255,
      "logits/rejected": -2.71109938621521,
      "logps/chosen": -333.38397216796875,
      "logps/rejected": -289.92462158203125,
      "loss": 0.6224,
      "rewards/accuracies": 0.9125000238418579,
      "rewards/chosen": 0.09798085689544678,
      "rewards/margins": 0.14519774913787842,
      "rewards/margins_max": 0.293338418006897,
      "rewards/margins_min": 0.01530275959521532,
      "rewards/margins_std": 0.12239019572734833,
      "rewards/rejected": -0.047216884791851044,
      "step": 240
    },
    {
      "epoch": 0.7,
      "grad_norm": 1.9709337022102749,
      "learning_rate": 1.2216791228457775e-07,
      "logits/chosen": -2.7975411415100098,
      "logits/rejected": -2.6804046630859375,
      "logps/chosen": -351.70257568359375,
      "logps/rejected": -260.0617370605469,
      "loss": 0.6084,
      "rewards/accuracies": 0.9624999761581421,
      "rewards/chosen": 0.12191393226385117,
      "rewards/margins": 0.1914171278476715,
      "rewards/margins_max": 0.36713144183158875,
      "rewards/margins_min": 0.05136305093765259,
      "rewards/margins_std": 0.142560213804245,
      "rewards/rejected": -0.06950321048498154,
      "step": 250
    },
    {
      "epoch": 0.73,
      "grad_norm": 1.752772587188972,
      "learning_rate": 1.0166289402331391e-07,
      "logits/chosen": -2.8487606048583984,
      "logits/rejected": -2.737738847732544,
      "logps/chosen": -345.0237731933594,
      "logps/rejected": -265.47198486328125,
      "loss": 0.6074,
      "rewards/accuracies": 0.9375,
      "rewards/chosen": 0.11421672999858856,
      "rewards/margins": 0.17133933305740356,
      "rewards/margins_max": 0.3663348853588104,
      "rewards/margins_min": 0.02202555350959301,
      "rewards/margins_std": 0.15875253081321716,
      "rewards/rejected": -0.057122599333524704,
      "step": 260
    },
    {
      "epoch": 0.76,
      "grad_norm": 2.056956615608033,
      "learning_rate": 8.259540650444734e-08,
      "logits/chosen": -2.8006067276000977,
      "logits/rejected": -2.7100348472595215,
      "logps/chosen": -365.325927734375,
      "logps/rejected": -270.2814636230469,
      "loss": 0.6098,
      "rewards/accuracies": 0.9125000238418579,
      "rewards/chosen": 0.1257423460483551,
      "rewards/margins": 0.20250901579856873,
      "rewards/margins_max": 0.3718946874141693,
      "rewards/margins_min": 0.04439568892121315,
      "rewards/margins_std": 0.1491011530160904,
      "rewards/rejected": -0.07676666229963303,
      "step": 270
    },
    {
      "epoch": 0.79,
      "grad_norm": 1.9417182779069821,
      "learning_rate": 6.515023221586721e-08,
      "logits/chosen": -2.7494287490844727,
      "logits/rejected": -2.7017343044281006,
      "logps/chosen": -320.38360595703125,
      "logps/rejected": -279.5456848144531,
      "loss": 0.6125,
      "rewards/accuracies": 0.8999999761581421,
      "rewards/chosen": 0.10271165519952774,
      "rewards/margins": 0.16274654865264893,
      "rewards/margins_max": 0.3613172173500061,
      "rewards/margins_min": 0.03712720423936844,
      "rewards/margins_std": 0.14939478039741516,
      "rewards/rejected": -0.060034893453121185,
      "step": 280
    },
    {
      "epoch": 0.82,
      "grad_norm": 2.159880856830845,
      "learning_rate": 4.949643185335287e-08,
      "logits/chosen": -2.7616562843322754,
      "logits/rejected": -2.6814732551574707,
      "logps/chosen": -331.0811462402344,
      "logps/rejected": -272.906982421875,
      "loss": 0.6168,
      "rewards/accuracies": 0.9375,
      "rewards/chosen": 0.0991094559431076,
      "rewards/margins": 0.16621682047843933,
      "rewards/margins_max": 0.34492212533950806,
      "rewards/margins_min": 0.02063518390059471,
      "rewards/margins_std": 0.14825591444969177,
      "rewards/rejected": -0.06710737198591232,
      "step": 290
    },
    {
      "epoch": 0.85,
      "grad_norm": 2.247267229121733,
      "learning_rate": 3.578570595810274e-08,
      "logits/chosen": -2.805422306060791,
      "logits/rejected": -2.7308857440948486,
      "logps/chosen": -351.537109375,
      "logps/rejected": -296.57861328125,
      "loss": 0.6029,
      "rewards/accuracies": 0.9375,
      "rewards/chosen": 0.12097059190273285,
      "rewards/margins": 0.19219207763671875,
      "rewards/margins_max": 0.3688820004463196,
      "rewards/margins_min": 0.05189325660467148,
      "rewards/margins_std": 0.14433594048023224,
      "rewards/rejected": -0.07122147083282471,
      "step": 300
    },
    {
      "epoch": 0.85,
      "eval_logits/chosen": -2.757275342941284,
      "eval_logits/rejected": -2.7190775871276855,
      "eval_logps/chosen": -289.97894287109375,
      "eval_logps/rejected": -266.02178955078125,
      "eval_loss": 0.6861926913261414,
      "eval_rewards/accuracies": 0.5870000123977661,
      "eval_rewards/chosen": -0.05385516211390495,
      "eval_rewards/margins": 0.020573224872350693,
      "eval_rewards/margins_max": 0.14790384471416473,
      "eval_rewards/margins_min": -0.09322728216648102,
      "eval_rewards/margins_std": 0.079450324177742,
      "eval_rewards/rejected": -0.07442838698625565,
      "eval_runtime": 427.9454,
      "eval_samples_per_second": 4.673,
      "eval_steps_per_second": 0.292,
      "step": 300
    },
    {
      "epoch": 0.87,
      "grad_norm": 1.8974226134430692,
      "learning_rate": 2.415092479103503e-08,
      "logits/chosen": -2.840935230255127,
      "logits/rejected": -2.709672212600708,
      "logps/chosen": -345.2643737792969,
      "logps/rejected": -222.6641082763672,
      "loss": 0.6093,
      "rewards/accuracies": 0.9624999761581421,
      "rewards/chosen": 0.1101798266172409,
      "rewards/margins": 0.1844477355480194,
      "rewards/margins_max": 0.3727789521217346,
      "rewards/margins_min": 0.046926215291023254,
      "rewards/margins_std": 0.1535920351743698,
      "rewards/rejected": -0.0742679089307785,
      "step": 310
    },
    {
      "epoch": 0.9,
      "grad_norm": 1.7695491697233519,
      "learning_rate": 1.4704840690808656e-08,
      "logits/chosen": -2.796245813369751,
      "logits/rejected": -2.7119815349578857,
      "logps/chosen": -339.24664306640625,
      "logps/rejected": -268.58984375,
      "loss": 0.6037,
      "rewards/accuracies": 0.949999988079071,
      "rewards/chosen": 0.1139553040266037,
      "rewards/margins": 0.17954252660274506,
      "rewards/margins_max": 0.37754225730895996,
      "rewards/margins_min": 0.0253077894449234,
      "rewards/margins_std": 0.16321782767772675,
      "rewards/rejected": -0.06558724492788315,
      "step": 320
    },
    {
      "epoch": 0.93,
      "grad_norm": 2.0645664151522136,
      "learning_rate": 7.538995394063995e-09,
      "logits/chosen": -2.8658013343811035,
      "logits/rejected": -2.760768175125122,
      "logps/chosen": -386.96258544921875,
      "logps/rejected": -275.399658203125,
      "loss": 0.6073,
      "rewards/accuracies": 0.887499988079071,
      "rewards/chosen": 0.13314954936504364,
      "rewards/margins": 0.20348164439201355,
      "rewards/margins_max": 0.3888625502586365,
      "rewards/margins_min": 0.05053550750017166,
      "rewards/margins_std": 0.15889115631580353,
      "rewards/rejected": -0.0703321173787117,
      "step": 330
    },
    {
      "epoch": 0.96,
      "grad_norm": 2.0081843762801617,
      "learning_rate": 2.7228329070159705e-09,
      "logits/chosen": -2.7621803283691406,
      "logits/rejected": -2.6747400760650635,
      "logps/chosen": -334.4164123535156,
      "logps/rejected": -258.71417236328125,
      "loss": 0.607,
      "rewards/accuracies": 0.8999999761581421,
      "rewards/chosen": 0.10603541135787964,
      "rewards/margins": 0.17292913794517517,
      "rewards/margins_max": 0.366424024105072,
      "rewards/margins_min": 0.027338892221450806,
      "rewards/margins_std": 0.15261869132518768,
      "rewards/rejected": -0.06689374148845673,
      "step": 340
    },
    {
      "epoch": 0.99,
      "grad_norm": 3.396556816184061,
      "learning_rate": 3.0302652553296226e-10,
      "logits/chosen": -2.754178285598755,
      "logits/rejected": -2.6804542541503906,
      "logps/chosen": -348.5409851074219,
      "logps/rejected": -294.7231750488281,
      "loss": 0.6046,
      "rewards/accuracies": 0.9624999761581421,
      "rewards/chosen": 0.10722661018371582,
      "rewards/margins": 0.18621695041656494,
      "rewards/margins_max": 0.36825960874557495,
      "rewards/margins_min": 0.045198000967502594,
      "rewards/margins_std": 0.14424237608909607,
      "rewards/rejected": -0.07899035513401031,
      "step": 350
    },
    {
      "epoch": 1.0,
      "step": 355,
      "total_flos": 0.0,
      "train_loss": 0.6394854995566355,
      "train_runtime": 4022.9516,
      "train_samples_per_second": 1.411,
      "train_steps_per_second": 0.088
    }
  ],
  "logging_steps": 10,
  "max_steps": 355,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 100,
  "total_flos": 0.0,
  "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null
}