File size: 30,334 Bytes
b73f124
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
{
  "best_metric": null,
  "best_model_checkpoint": null,
  "episode": 10240,
  "epoch": 0.14045290575664887,
  "eval_steps": 200.0,
  "global_step": 200,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "episode": 256,
      "epoch": 0.003511322643916222,
      "eps": 6,
      "loss/policy_avg": -0.07090990990400314,
      "loss/value_avg": 0.0,
      "lr": 3e-06,
      "objective/entropy": 49.42120361328125,
      "objective/kl": 0.006465356796979904,
      "objective/non_score_reward": -0.000646535714622587,
      "objective/rlhf_reward": -1.1137903928756714,
      "objective/scores": -1.109375,
      "policy/approxkl_avg": 27.096786499023438,
      "policy/clipfrac_avg": 0.732421875,
      "policy/entropy_avg": 0.92181396484375,
      "step": 5,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 12,
      "val/ratio": 1.0399832725524902,
      "val/ratio_var": 0.010045886039733887
    },
    {
      "episode": 512,
      "epoch": 0.007022645287832444,
      "eps": 6,
      "loss/policy_avg": -0.06497187167406082,
      "loss/value_avg": 0.0,
      "lr": 2.9923273657289e-06,
      "objective/entropy": 48.286014556884766,
      "objective/kl": 0.8119473457336426,
      "objective/non_score_reward": -0.08119472861289978,
      "objective/rlhf_reward": -1.266162633895874,
      "objective/scores": -1.1875,
      "policy/approxkl_avg": 18.666072845458984,
      "policy/clipfrac_avg": 0.7314453125,
      "policy/entropy_avg": 0.912261962890625,
      "step": 10,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 16,
      "val/ratio": 1.020957112312317,
      "val/ratio_var": 0.00411860179156065
    },
    {
      "episode": 768,
      "epoch": 0.010533967931748666,
      "eps": 6,
      "loss/policy_avg": -0.0872286781668663,
      "loss/value_avg": 0.0,
      "lr": 2.9846547314578008e-06,
      "objective/entropy": 49.34376525878906,
      "objective/kl": 1.9591996669769287,
      "objective/non_score_reward": -0.1959199756383896,
      "objective/rlhf_reward": -1.2858657836914062,
      "objective/scores": -1.09375,
      "policy/approxkl_avg": 20.772502899169922,
      "policy/clipfrac_avg": 0.73828125,
      "policy/entropy_avg": 0.927978515625,
      "step": 15,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 12,
      "val/ratio": 1.0191609859466553,
      "val/ratio_var": 0.00307083735242486
    },
    {
      "episode": 1024,
      "epoch": 0.014045290575664887,
      "eps": 6,
      "loss/policy_avg": -0.07566041499376297,
      "loss/value_avg": 0.0,
      "lr": 2.9769820971867007e-06,
      "objective/entropy": 53.13662338256836,
      "objective/kl": 2.4811532497406006,
      "objective/non_score_reward": -0.24811533093452454,
      "objective/rlhf_reward": -1.2548893690109253,
      "objective/scores": -1.0078125,
      "policy/approxkl_avg": 20.665164947509766,
      "policy/clipfrac_avg": 0.7314453125,
      "policy/entropy_avg": 0.989776611328125,
      "step": 20,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 11,
      "val/ratio": 1.011010766029358,
      "val/ratio_var": 0.004201602190732956
    },
    {
      "episode": 1280,
      "epoch": 0.01755661321958111,
      "eps": 6,
      "loss/policy_avg": -0.08593496680259705,
      "loss/value_avg": 0.0,
      "lr": 2.9693094629156014e-06,
      "objective/entropy": 53.72633743286133,
      "objective/kl": 3.3111624717712402,
      "objective/non_score_reward": -0.3311161994934082,
      "objective/rlhf_reward": -1.339456558227539,
      "objective/scores": -1.0078125,
      "policy/approxkl_avg": 25.559288024902344,
      "policy/clipfrac_avg": 0.7353515625,
      "policy/entropy_avg": 0.997894287109375,
      "step": 25,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 13,
      "val/ratio": 1.0134021043777466,
      "val/ratio_var": 0.0019979747012257576
    },
    {
      "episode": 1536,
      "epoch": 0.021067935863497332,
      "eps": 6,
      "loss/policy_avg": -0.09734417498111725,
      "loss/value_avg": 0.0,
      "lr": 2.9616368286445014e-06,
      "objective/entropy": 51.259735107421875,
      "objective/kl": 5.089182376861572,
      "objective/non_score_reward": -0.5089181661605835,
      "objective/rlhf_reward": -1.2202520370483398,
      "objective/scores": -0.7109375,
      "policy/approxkl_avg": 29.841636657714844,
      "policy/clipfrac_avg": 0.736328125,
      "policy/entropy_avg": 0.960479736328125,
      "step": 30,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 26,
      "val/ratio": 1.0178756713867188,
      "val/ratio_var": 0.009866585955023766
    },
    {
      "episode": 1792,
      "epoch": 0.024579258507413555,
      "eps": 6,
      "loss/policy_avg": -0.06831618398427963,
      "loss/value_avg": 0.0,
      "lr": 2.9539641943734013e-06,
      "objective/entropy": 40.643272399902344,
      "objective/kl": 6.974010944366455,
      "objective/non_score_reward": -0.6974011063575745,
      "objective/rlhf_reward": -1.2684605121612549,
      "objective/scores": -0.5703125,
      "policy/approxkl_avg": 35.33942413330078,
      "policy/clipfrac_avg": 0.6982421875,
      "policy/entropy_avg": 0.7505035400390625,
      "step": 35,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 16,
      "val/ratio": 1.00449800491333,
      "val/ratio_var": 0.0022142010275274515
    },
    {
      "episode": 2048,
      "epoch": 0.028090581151329775,
      "eps": 6,
      "loss/policy_avg": -0.04068079590797424,
      "loss/value_avg": 0.0,
      "lr": 2.946291560102302e-06,
      "objective/entropy": 23.142562866210938,
      "objective/kl": 8.180486679077148,
      "objective/non_score_reward": -0.8180487155914307,
      "objective/rlhf_reward": -1.0729957818984985,
      "objective/scores": -0.255859375,
      "policy/approxkl_avg": 23.68307876586914,
      "policy/clipfrac_avg": 0.5859375,
      "policy/entropy_avg": 0.4361400604248047,
      "step": 40,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 8,
      "val/ratio": 1.0077030658721924,
      "val/ratio_var": 0.0024766812566667795
    },
    {
      "episode": 2304,
      "epoch": 0.031601903795246,
      "eps": 6,
      "loss/policy_avg": -0.07307010889053345,
      "loss/value_avg": 0.0,
      "lr": 2.938618925831202e-06,
      "objective/entropy": 19.376842498779297,
      "objective/kl": 8.770210266113281,
      "objective/non_score_reward": -0.8770210146903992,
      "objective/rlhf_reward": -1.0002652406692505,
      "objective/scores": -0.12353515625,
      "policy/approxkl_avg": 31.00873565673828,
      "policy/clipfrac_avg": 0.5302734375,
      "policy/entropy_avg": 0.33237457275390625,
      "step": 45,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 20,
      "val/ratio": 0.996111273765564,
      "val/ratio_var": 0.001100091845728457
    },
    {
      "episode": 2560,
      "epoch": 0.03511322643916222,
      "eps": 6,
      "loss/policy_avg": -0.04584116116166115,
      "loss/value_avg": 0.0,
      "lr": 2.9309462915601027e-06,
      "objective/entropy": 11.984097480773926,
      "objective/kl": 8.4966402053833,
      "objective/non_score_reward": -0.849664032459259,
      "objective/rlhf_reward": -0.8017911911010742,
      "objective/scores": 0.0478515625,
      "policy/approxkl_avg": 22.561037063598633,
      "policy/clipfrac_avg": 0.451171875,
      "policy/entropy_avg": 0.19393539428710938,
      "step": 50,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 20,
      "val/ratio": 0.9952375888824463,
      "val/ratio_var": 0.000761833623982966
    },
    {
      "episode": 2816,
      "epoch": 0.03862454908307844,
      "eps": 5,
      "loss/policy_avg": -0.029720915481448174,
      "loss/value_avg": 0.0,
      "lr": 2.9232736572890026e-06,
      "objective/entropy": 4.9489898681640625,
      "objective/kl": 8.733837127685547,
      "objective/non_score_reward": -0.8733837604522705,
      "objective/rlhf_reward": -0.7492713928222656,
      "objective/scores": 0.1240234375,
      "policy/approxkl_avg": 16.253189086914062,
      "policy/clipfrac_avg": 0.341796875,
      "policy/entropy_avg": 0.07728099822998047,
      "step": 55,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 18,
      "val/ratio": 0.9972053170204163,
      "val/ratio_var": 0.00032430028659291565
    },
    {
      "episode": 3072,
      "epoch": 0.042135871726994664,
      "eps": 5,
      "loss/policy_avg": -0.01298562902957201,
      "loss/value_avg": 0.0,
      "lr": 2.9156010230179026e-06,
      "objective/entropy": 1.3101667165756226,
      "objective/kl": 8.699792861938477,
      "objective/non_score_reward": -0.8699792623519897,
      "objective/rlhf_reward": -0.5752952098846436,
      "objective/scores": 0.294921875,
      "policy/approxkl_avg": 2.27925968170166,
      "policy/clipfrac_avg": 0.236328125,
      "policy/entropy_avg": 0.02513742446899414,
      "step": 60,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 20,
      "val/ratio": 1.0017118453979492,
      "val/ratio_var": 0.00016639505338389426
    },
    {
      "episode": 3328,
      "epoch": 0.04564719437091089,
      "eps": 5,
      "loss/policy_avg": -0.02618303708732128,
      "loss/value_avg": 0.0,
      "lr": 2.9079283887468033e-06,
      "objective/entropy": 2.3685269355773926,
      "objective/kl": 9.208517074584961,
      "objective/non_score_reward": -0.9208516478538513,
      "objective/rlhf_reward": -0.5182289481163025,
      "objective/scores": 0.40234375,
      "policy/approxkl_avg": 2.6189699172973633,
      "policy/clipfrac_avg": 0.310546875,
      "policy/entropy_avg": 0.04020071029663086,
      "step": 65,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 20,
      "val/ratio": 1.003983497619629,
      "val/ratio_var": 0.0009448421187698841
    },
    {
      "episode": 3584,
      "epoch": 0.04915851701482711,
      "eps": 5,
      "loss/policy_avg": -0.02327096462249756,
      "loss/value_avg": 0.0,
      "lr": 2.9002557544757032e-06,
      "objective/entropy": 2.0416018962860107,
      "objective/kl": 9.701976776123047,
      "objective/non_score_reward": -0.9701976776123047,
      "objective/rlhf_reward": -0.49486449360847473,
      "objective/scores": 0.474609375,
      "policy/approxkl_avg": 1.271956443786621,
      "policy/clipfrac_avg": 0.2734375,
      "policy/entropy_avg": 0.041253089904785156,
      "step": 70,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 16,
      "val/ratio": 1.0039558410644531,
      "val/ratio_var": 0.00041477559716440737
    },
    {
      "episode": 3840,
      "epoch": 0.052669839658743334,
      "eps": 5,
      "loss/policy_avg": -0.033096276223659515,
      "loss/value_avg": 0.0,
      "lr": 2.892583120204604e-06,
      "objective/entropy": 2.7795495986938477,
      "objective/kl": 10.028523445129395,
      "objective/non_score_reward": -1.0028523206710815,
      "objective/rlhf_reward": -0.46555712819099426,
      "objective/scores": 0.5390625,
      "policy/approxkl_avg": 3.055203676223755,
      "policy/clipfrac_avg": 0.3427734375,
      "policy/entropy_avg": 0.053270816802978516,
      "step": 75,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 23,
      "val/ratio": 1.0012407302856445,
      "val/ratio_var": 0.00011274257121840492
    },
    {
      "episode": 4096,
      "epoch": 0.05618116230265955,
      "eps": 5,
      "loss/policy_avg": -0.01961323618888855,
      "loss/value_avg": 0.0,
      "lr": 2.884910485933504e-06,
      "objective/entropy": 2.5525641441345215,
      "objective/kl": 10.111019134521484,
      "objective/non_score_reward": -1.0111019611358643,
      "objective/rlhf_reward": -0.510233461856842,
      "objective/scores": 0.5,
      "policy/approxkl_avg": 1.331697940826416,
      "policy/clipfrac_avg": 0.2861328125,
      "policy/entropy_avg": 0.048857688903808594,
      "step": 80,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 25,
      "val/ratio": 1.011049509048462,
      "val/ratio_var": 0.004252108279615641
    },
    {
      "episode": 4352,
      "epoch": 0.05969248494657577,
      "eps": 5,
      "loss/policy_avg": -0.009127877652645111,
      "loss/value_avg": 0.0,
      "lr": 2.877237851662404e-06,
      "objective/entropy": 3.016789674758911,
      "objective/kl": 11.257818222045898,
      "objective/non_score_reward": -1.125781774520874,
      "objective/rlhf_reward": -0.4276960492134094,
      "objective/scores": 0.69921875,
      "policy/approxkl_avg": 1.4772686958312988,
      "policy/clipfrac_avg": 0.35546875,
      "policy/entropy_avg": 0.053719520568847656,
      "step": 85,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 6,
      "val/ratio": 1.0042904615402222,
      "val/ratio_var": 0.0008556774700991809
    },
    {
      "episode": 4608,
      "epoch": 0.063203807590492,
      "eps": 5,
      "loss/policy_avg": -0.025049656629562378,
      "loss/value_avg": 0.0,
      "lr": 2.8695652173913046e-06,
      "objective/entropy": 2.5907459259033203,
      "objective/kl": 10.457273483276367,
      "objective/non_score_reward": -1.0457274913787842,
      "objective/rlhf_reward": -0.3816419839859009,
      "objective/scores": 0.6640625,
      "policy/approxkl_avg": 2.3460922241210938,
      "policy/clipfrac_avg": 0.322265625,
      "policy/entropy_avg": 0.04626178741455078,
      "step": 90,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 11,
      "val/ratio": 1.0003862380981445,
      "val/ratio_var": 7.93520302977413e-05
    },
    {
      "episode": 4864,
      "epoch": 0.06671513023440821,
      "eps": 5,
      "loss/policy_avg": -0.01828361675143242,
      "loss/value_avg": 0.0,
      "lr": 2.8618925831202045e-06,
      "objective/entropy": 2.397810220718384,
      "objective/kl": 10.732559204101562,
      "objective/non_score_reward": -1.073256015777588,
      "objective/rlhf_reward": -0.35966813564300537,
      "objective/scores": 0.71484375,
      "policy/approxkl_avg": 1.1093428134918213,
      "policy/clipfrac_avg": 0.32421875,
      "policy/entropy_avg": 0.041881561279296875,
      "step": 95,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 15,
      "val/ratio": 1.0054664611816406,
      "val/ratio_var": 0.0017973663052543998
    },
    {
      "episode": 5120,
      "epoch": 0.07022645287832444,
      "eps": 5,
      "loss/policy_avg": -0.04088423401117325,
      "loss/value_avg": 0.0,
      "lr": 2.8542199488491053e-06,
      "objective/entropy": 2.343449592590332,
      "objective/kl": 11.780994415283203,
      "objective/non_score_reward": -1.1780993938446045,
      "objective/rlhf_reward": -0.4628324806690216,
      "objective/scores": 0.71484375,
      "policy/approxkl_avg": 0.894420325756073,
      "policy/clipfrac_avg": 0.46875,
      "policy/entropy_avg": 0.04486083984375,
      "step": 100,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 11,
      "val/ratio": 1.0009559392929077,
      "val/ratio_var": 4.804596756002866e-05
    },
    {
      "episode": 5376,
      "epoch": 0.07373777552224066,
      "eps": 5,
      "loss/policy_avg": -0.020697183907032013,
      "loss/value_avg": 0.0,
      "lr": 2.846547314578005e-06,
      "objective/entropy": 1.9023351669311523,
      "objective/kl": 10.29288101196289,
      "objective/non_score_reward": -1.0292882919311523,
      "objective/rlhf_reward": -0.29047834873199463,
      "objective/scores": 0.73828125,
      "policy/approxkl_avg": 0.9143690466880798,
      "policy/clipfrac_avg": 0.373046875,
      "policy/entropy_avg": 0.028568267822265625,
      "step": 105,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 10,
      "val/ratio": 1.000715732574463,
      "val/ratio_var": 4.201457340968773e-05
    },
    {
      "episode": 5632,
      "epoch": 0.07724909816615688,
      "eps": 5,
      "loss/policy_avg": -0.012633640319108963,
      "loss/value_avg": 0.0,
      "lr": 2.8388746803069055e-06,
      "objective/entropy": 1.3839142322540283,
      "objective/kl": 10.57151985168457,
      "objective/non_score_reward": -1.0571520328521729,
      "objective/rlhf_reward": -0.2935946583747864,
      "objective/scores": 0.765625,
      "policy/approxkl_avg": 0.6525547504425049,
      "policy/clipfrac_avg": 0.2646484375,
      "policy/entropy_avg": 0.0345916748046875,
      "step": 110,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 10,
      "val/ratio": 0.9999199509620667,
      "val/ratio_var": 2.6978697860613465e-05
    },
    {
      "episode": 5888,
      "epoch": 0.0807604208100731,
      "eps": 5,
      "loss/policy_avg": -0.026668714359402657,
      "loss/value_avg": 0.0,
      "lr": 2.831202046035806e-06,
      "objective/entropy": 2.17741322517395,
      "objective/kl": 11.39688491821289,
      "objective/non_score_reward": -1.139688491821289,
      "objective/rlhf_reward": -0.3027456998825073,
      "objective/scores": 0.8359375,
      "policy/approxkl_avg": 8.829752922058105,
      "policy/clipfrac_avg": 0.35546875,
      "policy/entropy_avg": 0.034277915954589844,
      "step": 115,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 8,
      "val/ratio": 1.0012441873550415,
      "val/ratio_var": 9.009366476675496e-05
    },
    {
      "episode": 6144,
      "epoch": 0.08427174345398933,
      "eps": 5,
      "loss/policy_avg": -0.011602860875427723,
      "loss/value_avg": 0.0,
      "lr": 2.823529411764706e-06,
      "objective/entropy": 1.418602466583252,
      "objective/kl": 10.246469497680664,
      "objective/non_score_reward": -1.0246469974517822,
      "objective/rlhf_reward": -0.22599510848522186,
      "objective/scores": 0.796875,
      "policy/approxkl_avg": 0.31790149211883545,
      "policy/clipfrac_avg": 0.2314453125,
      "policy/entropy_avg": 0.028847694396972656,
      "step": 120,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 9,
      "val/ratio": 1.0009679794311523,
      "val/ratio_var": 3.900106457876973e-05
    },
    {
      "episode": 6400,
      "epoch": 0.08778306609790555,
      "eps": 5,
      "loss/policy_avg": -0.0157505851238966,
      "loss/value_avg": 0.0,
      "lr": 2.8158567774936066e-06,
      "objective/entropy": 1.936393141746521,
      "objective/kl": 10.550077438354492,
      "objective/non_score_reward": -1.0550076961517334,
      "objective/rlhf_reward": -0.252943217754364,
      "objective/scores": 0.80078125,
      "policy/approxkl_avg": 6.545133113861084,
      "policy/clipfrac_avg": 0.341796875,
      "policy/entropy_avg": 0.039971351623535156,
      "step": 125,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 12,
      "val/ratio": 1.0001187324523926,
      "val/ratio_var": 0.00011527155584190041
    },
    {
      "episode": 6656,
      "epoch": 0.09129438874182177,
      "eps": 5,
      "loss/policy_avg": -0.00908716581761837,
      "loss/value_avg": 0.0,
      "lr": 2.8081841432225065e-06,
      "objective/entropy": 1.9167767763137817,
      "objective/kl": 10.831771850585938,
      "objective/non_score_reward": -1.0831772089004517,
      "objective/rlhf_reward": -0.24270595610141754,
      "objective/scores": 0.83984375,
      "policy/approxkl_avg": 13.507976531982422,
      "policy/clipfrac_avg": 0.25,
      "policy/entropy_avg": 0.034499168395996094,
      "step": 130,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 7,
      "val/ratio": 1.0004911422729492,
      "val/ratio_var": 0.00018595268193166703
    },
    {
      "episode": 6912,
      "epoch": 0.094805711385738,
      "eps": 5,
      "loss/policy_avg": -0.017197387292981148,
      "loss/value_avg": 0.0,
      "lr": 2.800511508951407e-06,
      "objective/entropy": 1.7237651348114014,
      "objective/kl": 11.095592498779297,
      "objective/non_score_reward": -1.1095592975616455,
      "objective/rlhf_reward": -0.21057555079460144,
      "objective/scores": 0.8984375,
      "policy/approxkl_avg": 2.7560040950775146,
      "policy/clipfrac_avg": 0.2841796875,
      "policy/entropy_avg": 0.032952308654785156,
      "step": 135,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 2,
      "val/ratio": 0.9994020462036133,
      "val/ratio_var": 3.074964843108319e-05
    },
    {
      "episode": 7168,
      "epoch": 0.09831703402965422,
      "eps": 5,
      "loss/policy_avg": -0.012010859325528145,
      "loss/value_avg": 0.0,
      "lr": 2.792838874680307e-06,
      "objective/entropy": 1.5862581729888916,
      "objective/kl": 10.674396514892578,
      "objective/non_score_reward": -1.0674396753311157,
      "objective/rlhf_reward": -0.14433012902736664,
      "objective/scores": 0.921875,
      "policy/approxkl_avg": 1.1186727285385132,
      "policy/clipfrac_avg": 0.2783203125,
      "policy/entropy_avg": 0.0295562744140625,
      "step": 140,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 13,
      "val/ratio": 1.0007727146148682,
      "val/ratio_var": 4.557183274300769e-05
    },
    {
      "episode": 7424,
      "epoch": 0.10182835667357044,
      "eps": 5,
      "loss/policy_avg": -0.013728385791182518,
      "loss/value_avg": 0.0,
      "lr": 2.785166240409207e-06,
      "objective/entropy": 1.5388869047164917,
      "objective/kl": 10.359582901000977,
      "objective/non_score_reward": -1.035958170890808,
      "objective/rlhf_reward": -0.14511710405349731,
      "objective/scores": 0.890625,
      "policy/approxkl_avg": 0.5204602479934692,
      "policy/clipfrac_avg": 0.283203125,
      "policy/entropy_avg": 0.028924942016601562,
      "step": 145,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 14,
      "val/ratio": 1.056097149848938,
      "val/ratio_var": 0.13372056186199188
    },
    {
      "episode": 7680,
      "epoch": 0.10533967931748667,
      "eps": 5,
      "loss/policy_avg": -0.014945434406399727,
      "loss/value_avg": 0.0,
      "lr": 2.7774936061381074e-06,
      "objective/entropy": 2.0769755840301514,
      "objective/kl": 11.147063255310059,
      "objective/non_score_reward": -1.11470627784729,
      "objective/rlhf_reward": -0.08940108120441437,
      "objective/scores": 1.0234375,
      "policy/approxkl_avg": 0.5961493253707886,
      "policy/clipfrac_avg": 0.3681640625,
      "policy/entropy_avg": 0.037804603576660156,
      "step": 150,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 13,
      "val/ratio": 1.0033739805221558,
      "val/ratio_var": 0.00030022990540601313
    },
    {
      "episode": 7936,
      "epoch": 0.10885100196140288,
      "eps": 5,
      "loss/policy_avg": -0.02276831492781639,
      "loss/value_avg": 0.0,
      "lr": 2.7698209718670078e-06,
      "objective/entropy": 2.1412830352783203,
      "objective/kl": 11.697949409484863,
      "objective/non_score_reward": -1.169795036315918,
      "objective/rlhf_reward": -0.13582009077072144,
      "objective/scores": 1.03125,
      "policy/approxkl_avg": 0.7155288457870483,
      "policy/clipfrac_avg": 0.3193359375,
      "policy/entropy_avg": 0.037835121154785156,
      "step": 155,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 13,
      "val/ratio": 1.0014090538024902,
      "val/ratio_var": 5.2470270020421594e-05
    },
    {
      "episode": 8192,
      "epoch": 0.1123623246053191,
      "eps": 5,
      "loss/policy_avg": -0.013076605275273323,
      "loss/value_avg": 0.0,
      "lr": 2.762148337595908e-06,
      "objective/entropy": 1.634714126586914,
      "objective/kl": 11.629154205322266,
      "objective/non_score_reward": -1.1629154682159424,
      "objective/rlhf_reward": -0.28488799929618835,
      "objective/scores": 0.87890625,
      "policy/approxkl_avg": 0.4181188941001892,
      "policy/clipfrac_avg": 0.3037109375,
      "policy/entropy_avg": 0.029273509979248047,
      "step": 160,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 15,
      "val/ratio": 1.0008339881896973,
      "val/ratio_var": 1.4662801731901709e-05
    },
    {
      "episode": 8448,
      "epoch": 0.11587364724923532,
      "eps": 5,
      "loss/policy_avg": -0.01651182770729065,
      "loss/value_avg": 0.0,
      "lr": 2.7544757033248085e-06,
      "objective/entropy": 1.9540742635726929,
      "objective/kl": 11.4830322265625,
      "objective/non_score_reward": -1.1483032703399658,
      "objective/rlhf_reward": -0.05983233451843262,
      "objective/scores": 1.0859375,
      "policy/approxkl_avg": 18.791297912597656,
      "policy/clipfrac_avg": 0.2880859375,
      "policy/entropy_avg": 0.03601264953613281,
      "step": 165,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 16,
      "val/ratio": 1.0220942497253418,
      "val/ratio_var": 0.02208283357322216
    },
    {
      "episode": 8704,
      "epoch": 0.11938496989315155,
      "eps": 5,
      "loss/policy_avg": -0.013821810483932495,
      "loss/value_avg": 0.0,
      "lr": 2.7468030690537084e-06,
      "objective/entropy": 1.6243339776992798,
      "objective/kl": 11.435280799865723,
      "objective/non_score_reward": -1.1435281038284302,
      "objective/rlhf_reward": -0.12443088740110397,
      "objective/scores": 1.015625,
      "policy/approxkl_avg": 0.29013216495513916,
      "policy/clipfrac_avg": 0.28125,
      "policy/entropy_avg": 0.03498649597167969,
      "step": 170,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 15,
      "val/ratio": 1.0027971267700195,
      "val/ratio_var": 0.0002298366161994636
    },
    {
      "episode": 8960,
      "epoch": 0.12289629253706777,
      "eps": 5,
      "loss/policy_avg": -0.011003649793565273,
      "loss/value_avg": 0.0,
      "lr": 2.7391304347826087e-06,
      "objective/entropy": 2.000375986099243,
      "objective/kl": 11.78514575958252,
      "objective/non_score_reward": -1.1785145998001099,
      "objective/rlhf_reward": -0.2609584331512451,
      "objective/scores": 0.91796875,
      "policy/approxkl_avg": 0.8603074550628662,
      "policy/clipfrac_avg": 0.2998046875,
      "policy/entropy_avg": 0.034775733947753906,
      "step": 175,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 19,
      "val/ratio": 1.0012288093566895,
      "val/ratio_var": 3.532394111971371e-05
    },
    {
      "episode": 9216,
      "epoch": 0.126407615180984,
      "eps": 5,
      "loss/policy_avg": -0.010885423980653286,
      "loss/value_avg": 0.0,
      "lr": 2.731457800511509e-06,
      "objective/entropy": 1.5240473747253418,
      "objective/kl": 12.420597076416016,
      "objective/non_score_reward": -1.2420598268508911,
      "objective/rlhf_reward": -0.16641265153884888,
      "objective/scores": 1.078125,
      "policy/approxkl_avg": 0.46217110753059387,
      "policy/clipfrac_avg": 0.2783203125,
      "policy/entropy_avg": 0.029424667358398438,
      "step": 180,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 20,
      "val/ratio": 1.0007582902908325,
      "val/ratio_var": 2.4759892767178826e-05
    },
    {
      "episode": 9472,
      "epoch": 0.12991893782490022,
      "eps": 5,
      "loss/policy_avg": -0.01097183395177126,
      "loss/value_avg": 0.0,
      "lr": 2.7237851662404094e-06,
      "objective/entropy": 1.6292238235473633,
      "objective/kl": 12.73173713684082,
      "objective/non_score_reward": -1.2731736898422241,
      "objective/rlhf_reward": -0.10916168242692947,
      "objective/scores": 1.1640625,
      "policy/approxkl_avg": 0.5525862574577332,
      "policy/clipfrac_avg": 0.310546875,
      "policy/entropy_avg": 0.031815528869628906,
      "step": 185,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 22,
      "val/ratio": 1.0027148723602295,
      "val/ratio_var": 0.00016600274830125272
    },
    {
      "episode": 9728,
      "epoch": 0.13343026046881643,
      "eps": 5,
      "loss/policy_avg": -0.010572239756584167,
      "loss/value_avg": 0.0,
      "lr": 2.7161125319693097e-06,
      "objective/entropy": 2.028618335723877,
      "objective/kl": 12.439943313598633,
      "objective/non_score_reward": -1.2439942359924316,
      "objective/rlhf_reward": -0.06748821586370468,
      "objective/scores": 1.171875,
      "policy/approxkl_avg": 0.4930054843425751,
      "policy/clipfrac_avg": 0.2841796875,
      "policy/entropy_avg": 0.03688812255859375,
      "step": 190,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 22,
      "val/ratio": 1.001340627670288,
      "val/ratio_var": 4.4035481550963596e-05
    },
    {
      "episode": 9984,
      "epoch": 0.13694158311273266,
      "eps": 5,
      "loss/policy_avg": -0.019254155457019806,
      "loss/value_avg": 0.0,
      "lr": 2.7084398976982097e-06,
      "objective/entropy": 2.295351266860962,
      "objective/kl": 13.32223892211914,
      "objective/non_score_reward": -1.332223892211914,
      "objective/rlhf_reward": -0.1836824268102646,
      "objective/scores": 1.1484375,
      "policy/approxkl_avg": 3.1426281929016113,
      "policy/clipfrac_avg": 0.3251953125,
      "policy/entropy_avg": 0.03939247131347656,
      "step": 195,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 17,
      "val/ratio": 1.0032271146774292,
      "val/ratio_var": 0.00019827872165478766
    },
    {
      "episode": 10240,
      "epoch": 0.14045290575664887,
      "eps": 5,
      "loss/policy_avg": -0.018122296780347824,
      "loss/value_avg": 0.0,
      "lr": 2.70076726342711e-06,
      "objective/entropy": 2.345075845718384,
      "objective/kl": 12.536066055297852,
      "objective/non_score_reward": -1.2536065578460693,
      "objective/rlhf_reward": -0.056986674666404724,
      "objective/scores": 1.1953125,
      "policy/approxkl_avg": 27.5201473236084,
      "policy/clipfrac_avg": 0.3046875,
      "policy/entropy_avg": 0.04156017303466797,
      "step": 200,
      "val/clipfrac_avg": 0.0,
      "val/num_eos_tokens": 20,
      "val/ratio": 0.9993807077407837,
      "val/ratio_var": 0.00011275127326371148
    }
  ],
  "logging_steps": 100,
  "max_steps": 391,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1.3716104077797742,
  "save_steps": 50,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": true,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 0,
  "train_batch_size": null,
  "trial_name": null,
  "trial_params": null
}