BraylonDash committed on
Commit
db0f377
·
verified ·
1 Parent(s): 7579e25

Model save

Browse files
README.md ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: apache-2.0
3
+ library_name: peft
4
+ tags:
5
+ - trl
6
+ - dpo
7
+ - generated_from_trainer
8
+ base_model: DUAL-GPO/phi-2-sft-lora-ultrachat-merged
9
+ model-index:
10
+ - name: phi-2-kto-i0
11
+ results: []
12
+ ---
13
+
14
+ <!-- This model card has been generated automatically according to the information the Trainer had access to. You
15
+ should probably proofread and complete it, then remove this comment. -->
16
+
17
+ # phi-2-kto-i0
18
+
19
+ This model is a fine-tuned version of [DUAL-GPO/phi-2-sft-lora-ultrachat-merged](https://huggingface.co/DUAL-GPO/phi-2-sft-lora-ultrachat-merged) on an unspecified dataset.
20
+
21
+ ## Model description
22
+
23
+ More information needed
24
+
25
+ ## Intended uses & limitations
26
+
27
+ More information needed
28
+
29
+ ## Training and evaluation data
30
+
31
+ More information needed
32
+
33
+ ## Training procedure
34
+
35
+ ### Training hyperparameters
36
+
37
+ The following hyperparameters were used during training:
38
+ - learning_rate: 5e-06
39
+ - train_batch_size: 4
40
+ - eval_batch_size: 4
41
+ - seed: 42
42
+ - distributed_type: multi-GPU
43
+ - num_devices: 2
44
+ - gradient_accumulation_steps: 4
45
+ - total_train_batch_size: 32
46
+ - total_eval_batch_size: 8
47
+ - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
48
+ - lr_scheduler_type: cosine
49
+ - lr_scheduler_warmup_ratio: 0.1
50
+ - num_epochs: 1
51
+
52
+ ### Training results
53
+
54
+
55
+
56
+ ### Framework versions
57
+
58
+ - PEFT 0.7.1
59
+ - Transformers 4.36.2
60
+ - Pytorch 2.1.2
61
+ - Datasets 2.14.6
62
+ - Tokenizers 0.15.2
adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:65be6d18e888c900d7755837f1226dfe9579b05eda2b61f9cc57a7850c058f27
3
  size 335579632
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0b481bae16a28ec9c1d5d85715b5a5373bcf2ceda3ead20d169ccf6d23ef0a42
3
  size 335579632
all_results.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 1.0,
3
+ "train_loss": 0.09940854217369519,
4
+ "train_runtime": 5146.4957,
5
+ "train_samples": 61135,
6
+ "train_samples_per_second": 11.879,
7
+ "train_steps_per_second": 0.371
8
+ }
emissions.csv ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ timestamp,project_name,run_id,duration,emissions,emissions_rate,cpu_power,gpu_power,ram_power,cpu_energy,gpu_energy,ram_energy,energy_consumed,country_name,country_iso_code,region,cloud_provider,cloud_region,os,python_version,codecarbon_version,cpu_count,cpu_model,gpu_count,gpu_model,longitude,latitude,ram_total_size,tracking_mode,on_cloud,pue
2
+ 2024-09-28T23:06:01,codecarbon,4ad68e38-f708-4965-8116-4cc9c3bca396,5146.503284215927,0.0029854159599475927,5.800862828755433e-07,42.5,519.923,188.74309015274048,0.0607566486707992,0.9262474175948083,0.2688400132142028,1.2558440794798107,Canada,CAN,quebec,,,Linux-5.15.0-84-generic-x86_64-with-glibc2.35,3.10.14,2.2.3,32,Intel(R) Xeon(R) W-3335 CPU @ 3.40GHz,4,4 x NVIDIA GeForce RTX 4090,-71.2,46.8,503.3149070739746,machine,N,1.0
runs/Sep28_21-39-26_gpu4-119-5/events.out.tfevents.1727523614.gpu4-119-5.12283.0 CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6ec31845e7575e7c44b4f47e9669c35c8ff5a7ed076194436c52f8a89cf8c258
3
- size 30238
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:aa5134d16c0b02dfad1fc80ca244c33555c4fb3dc31bec86ad9eb2901146c89e
3
+ size 31226
train_results.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 1.0,
3
+ "train_loss": 0.09940854217369519,
4
+ "train_runtime": 5146.4957,
5
+ "train_samples": 61135,
6
+ "train_samples_per_second": 11.879,
7
+ "train_steps_per_second": 0.371
8
+ }
trainer_state.json ADDED
@@ -0,0 +1,2718 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 0.9997382884061764,
5
+ "eval_steps": 500,
6
+ "global_step": 1910,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.0,
13
+ "learning_rate": 2.617801047120419e-08,
14
+ "logits/chosen": 0.8436492085456848,
15
+ "logits/rejected": 1.1560968160629272,
16
+ "logps/chosen": -330.2955322265625,
17
+ "logps/rejected": -239.8994140625,
18
+ "loss": 0.5,
19
+ "rewards/accuracies": 0.0,
20
+ "rewards/chosen": 0.0,
21
+ "rewards/margins": 0.0,
22
+ "rewards/rejected": 0.0,
23
+ "step": 1
24
+ },
25
+ {
26
+ "epoch": 0.01,
27
+ "learning_rate": 2.617801047120419e-07,
28
+ "logits/chosen": 1.0090492963790894,
29
+ "logits/rejected": 1.0627849102020264,
30
+ "logps/chosen": -279.4153137207031,
31
+ "logps/rejected": -249.27322387695312,
32
+ "loss": 0.5,
33
+ "rewards/accuracies": 0.375,
34
+ "rewards/chosen": -8.76396952662617e-05,
35
+ "rewards/margins": -9.456619591219351e-05,
36
+ "rewards/rejected": 6.926496553205652e-06,
37
+ "step": 10
38
+ },
39
+ {
40
+ "epoch": 0.01,
41
+ "learning_rate": 5.235602094240838e-07,
42
+ "logits/chosen": 1.0303412675857544,
43
+ "logits/rejected": 1.0532195568084717,
44
+ "logps/chosen": -321.72723388671875,
45
+ "logps/rejected": -270.56353759765625,
46
+ "loss": 0.5,
47
+ "rewards/accuracies": 0.4000000059604645,
48
+ "rewards/chosen": -6.834287341916934e-05,
49
+ "rewards/margins": -4.8897858505370095e-05,
50
+ "rewards/rejected": -1.9445011275820434e-05,
51
+ "step": 20
52
+ },
53
+ {
54
+ "epoch": 0.02,
55
+ "learning_rate": 7.853403141361258e-07,
56
+ "logits/chosen": 1.002454400062561,
57
+ "logits/rejected": 1.06557297706604,
58
+ "logps/chosen": -252.0704345703125,
59
+ "logps/rejected": -246.32705688476562,
60
+ "loss": 0.5,
61
+ "rewards/accuracies": 0.48750001192092896,
62
+ "rewards/chosen": 1.5753510524518788e-05,
63
+ "rewards/margins": 5.4146301408763975e-05,
64
+ "rewards/rejected": -3.83927981602028e-05,
65
+ "step": 30
66
+ },
67
+ {
68
+ "epoch": 0.02,
69
+ "learning_rate": 1.0471204188481676e-06,
70
+ "logits/chosen": 1.0041682720184326,
71
+ "logits/rejected": 1.1504443883895874,
72
+ "logps/chosen": -235.38217163085938,
73
+ "logps/rejected": -230.2617645263672,
74
+ "loss": 0.5,
75
+ "rewards/accuracies": 0.45625001192092896,
76
+ "rewards/chosen": 7.3400560722802766e-06,
77
+ "rewards/margins": 2.9947289021947654e-06,
78
+ "rewards/rejected": 4.3453355829115026e-06,
79
+ "step": 40
80
+ },
81
+ {
82
+ "epoch": 0.03,
83
+ "learning_rate": 1.3089005235602096e-06,
84
+ "logits/chosen": 0.9595837593078613,
85
+ "logits/rejected": 1.0130202770233154,
86
+ "logps/chosen": -294.26007080078125,
87
+ "logps/rejected": -249.2256317138672,
88
+ "loss": 0.5,
89
+ "rewards/accuracies": 0.48750001192092896,
90
+ "rewards/chosen": 0.00017269175441469997,
91
+ "rewards/margins": 9.17307916097343e-05,
92
+ "rewards/rejected": 8.096096280496567e-05,
93
+ "step": 50
94
+ },
95
+ {
96
+ "epoch": 0.03,
97
+ "learning_rate": 1.5706806282722515e-06,
98
+ "logits/chosen": 0.9245076179504395,
99
+ "logits/rejected": 1.023485779762268,
100
+ "logps/chosen": -242.47689819335938,
101
+ "logps/rejected": -230.57373046875,
102
+ "loss": 0.5,
103
+ "rewards/accuracies": 0.4749999940395355,
104
+ "rewards/chosen": 0.0002746728132478893,
105
+ "rewards/margins": 0.00012865502503700554,
106
+ "rewards/rejected": 0.00014601778821088374,
107
+ "step": 60
108
+ },
109
+ {
110
+ "epoch": 0.04,
111
+ "learning_rate": 1.8324607329842933e-06,
112
+ "logits/chosen": 0.9357272386550903,
113
+ "logits/rejected": 1.0410839319229126,
114
+ "logps/chosen": -257.8460388183594,
115
+ "logps/rejected": -238.37973022460938,
116
+ "loss": 0.5,
117
+ "rewards/accuracies": 0.5062500238418579,
118
+ "rewards/chosen": 0.00047300319420173764,
119
+ "rewards/margins": 0.00019578025967348367,
120
+ "rewards/rejected": 0.0002772229490801692,
121
+ "step": 70
122
+ },
123
+ {
124
+ "epoch": 0.04,
125
+ "learning_rate": 2.094240837696335e-06,
126
+ "logits/chosen": 1.0097007751464844,
127
+ "logits/rejected": 1.0268934965133667,
128
+ "logps/chosen": -263.69903564453125,
129
+ "logps/rejected": -256.5643615722656,
130
+ "loss": 0.4999,
131
+ "rewards/accuracies": 0.518750011920929,
132
+ "rewards/chosen": 0.0005936628440394998,
133
+ "rewards/margins": 0.00022301140415947884,
134
+ "rewards/rejected": 0.0003706514835357666,
135
+ "step": 80
136
+ },
137
+ {
138
+ "epoch": 0.05,
139
+ "learning_rate": 2.356020942408377e-06,
140
+ "logits/chosen": 0.9857368469238281,
141
+ "logits/rejected": 1.050782561302185,
142
+ "logps/chosen": -252.1823272705078,
143
+ "logps/rejected": -253.6891326904297,
144
+ "loss": 0.4999,
145
+ "rewards/accuracies": 0.6000000238418579,
146
+ "rewards/chosen": 0.000997263239696622,
147
+ "rewards/margins": 0.00041304732440039515,
148
+ "rewards/rejected": 0.0005842159152962267,
149
+ "step": 90
150
+ },
151
+ {
152
+ "epoch": 0.05,
153
+ "learning_rate": 2.617801047120419e-06,
154
+ "logits/chosen": 1.0416964292526245,
155
+ "logits/rejected": 1.0389362573623657,
156
+ "logps/chosen": -254.76235961914062,
157
+ "logps/rejected": -224.39559936523438,
158
+ "loss": 0.4998,
159
+ "rewards/accuracies": 0.48124998807907104,
160
+ "rewards/chosen": 0.0013410584069788456,
161
+ "rewards/margins": 0.0005450797034427524,
162
+ "rewards/rejected": 0.0007959787035360932,
163
+ "step": 100
164
+ },
165
+ {
166
+ "epoch": 0.06,
167
+ "learning_rate": 2.8795811518324613e-06,
168
+ "logits/chosen": 1.0654562711715698,
169
+ "logits/rejected": 1.1301515102386475,
170
+ "logps/chosen": -294.14031982421875,
171
+ "logps/rejected": -258.11077880859375,
172
+ "loss": 0.4998,
173
+ "rewards/accuracies": 0.574999988079071,
174
+ "rewards/chosen": 0.001866974518634379,
175
+ "rewards/margins": 0.0006641600048169494,
176
+ "rewards/rejected": 0.0012028145138174295,
177
+ "step": 110
178
+ },
179
+ {
180
+ "epoch": 0.06,
181
+ "learning_rate": 3.141361256544503e-06,
182
+ "logits/chosen": 0.9807151556015015,
183
+ "logits/rejected": 1.125035285949707,
184
+ "logps/chosen": -303.8504943847656,
185
+ "logps/rejected": -249.7647705078125,
186
+ "loss": 0.4997,
187
+ "rewards/accuracies": 0.6000000238418579,
188
+ "rewards/chosen": 0.002772308187559247,
189
+ "rewards/margins": 0.0014164599124342203,
190
+ "rewards/rejected": 0.001355848042294383,
191
+ "step": 120
192
+ },
193
+ {
194
+ "epoch": 0.07,
195
+ "learning_rate": 3.403141361256545e-06,
196
+ "logits/chosen": 1.096975564956665,
197
+ "logits/rejected": 1.1348248720169067,
198
+ "logps/chosen": -278.3834533691406,
199
+ "logps/rejected": -245.82968139648438,
200
+ "loss": 0.4995,
201
+ "rewards/accuracies": 0.6499999761581421,
202
+ "rewards/chosen": 0.003960819449275732,
203
+ "rewards/margins": 0.0022526984103024006,
204
+ "rewards/rejected": 0.0017081208061426878,
205
+ "step": 130
206
+ },
207
+ {
208
+ "epoch": 0.07,
209
+ "learning_rate": 3.6649214659685865e-06,
210
+ "logits/chosen": 1.0514932870864868,
211
+ "logits/rejected": 1.1338948011398315,
212
+ "logps/chosen": -275.76031494140625,
213
+ "logps/rejected": -258.5254821777344,
214
+ "loss": 0.4995,
215
+ "rewards/accuracies": 0.581250011920929,
216
+ "rewards/chosen": 0.004108738619834185,
217
+ "rewards/margins": 0.001808557310141623,
218
+ "rewards/rejected": 0.0023001814261078835,
219
+ "step": 140
220
+ },
221
+ {
222
+ "epoch": 0.08,
223
+ "learning_rate": 3.926701570680629e-06,
224
+ "logits/chosen": 0.9971652030944824,
225
+ "logits/rejected": 1.0917918682098389,
226
+ "logps/chosen": -291.89044189453125,
227
+ "logps/rejected": -254.80679321289062,
228
+ "loss": 0.4993,
229
+ "rewards/accuracies": 0.637499988079071,
230
+ "rewards/chosen": 0.0049656108021736145,
231
+ "rewards/margins": 0.002658768789842725,
232
+ "rewards/rejected": 0.0023068420123308897,
233
+ "step": 150
234
+ },
235
+ {
236
+ "epoch": 0.08,
237
+ "learning_rate": 4.18848167539267e-06,
238
+ "logits/chosen": 1.0320146083831787,
239
+ "logits/rejected": 1.053504228591919,
240
+ "logps/chosen": -285.04559326171875,
241
+ "logps/rejected": -244.1322784423828,
242
+ "loss": 0.4993,
243
+ "rewards/accuracies": 0.606249988079071,
244
+ "rewards/chosen": 0.005488889757543802,
245
+ "rewards/margins": 0.002789679216220975,
246
+ "rewards/rejected": 0.00269921007566154,
247
+ "step": 160
248
+ },
249
+ {
250
+ "epoch": 0.09,
251
+ "learning_rate": 4.450261780104713e-06,
252
+ "logits/chosen": 1.0273762941360474,
253
+ "logits/rejected": 1.062558650970459,
254
+ "logps/chosen": -287.9652099609375,
255
+ "logps/rejected": -232.247314453125,
256
+ "loss": 0.4993,
257
+ "rewards/accuracies": 0.6187499761581421,
258
+ "rewards/chosen": 0.007104066200554371,
259
+ "rewards/margins": 0.0033009883482009172,
260
+ "rewards/rejected": 0.0038030785508453846,
261
+ "step": 170
262
+ },
263
+ {
264
+ "epoch": 0.09,
265
+ "learning_rate": 4.712041884816754e-06,
266
+ "logits/chosen": 1.0532909631729126,
267
+ "logits/rejected": 1.1673284769058228,
268
+ "logps/chosen": -274.5193786621094,
269
+ "logps/rejected": -238.21286010742188,
270
+ "loss": 0.499,
271
+ "rewards/accuracies": 0.606249988079071,
272
+ "rewards/chosen": 0.00742004532366991,
273
+ "rewards/margins": 0.003918725997209549,
274
+ "rewards/rejected": 0.003501318860799074,
275
+ "step": 180
276
+ },
277
+ {
278
+ "epoch": 0.1,
279
+ "learning_rate": 4.9738219895287965e-06,
280
+ "logits/chosen": 1.1504939794540405,
281
+ "logits/rejected": 1.1638376712799072,
282
+ "logps/chosen": -237.76797485351562,
283
+ "logps/rejected": -211.50613403320312,
284
+ "loss": 0.499,
285
+ "rewards/accuracies": 0.543749988079071,
286
+ "rewards/chosen": 0.007698298431932926,
287
+ "rewards/margins": 0.003723274450749159,
288
+ "rewards/rejected": 0.00397502351552248,
289
+ "step": 190
290
+ },
291
+ {
292
+ "epoch": 0.1,
293
+ "learning_rate": 4.999661831436499e-06,
294
+ "logits/chosen": 1.0712188482284546,
295
+ "logits/rejected": 1.0771671533584595,
296
+ "logps/chosen": -288.3528747558594,
297
+ "logps/rejected": -265.5425109863281,
298
+ "loss": 0.4989,
299
+ "rewards/accuracies": 0.675000011920929,
300
+ "rewards/chosen": 0.008598363026976585,
301
+ "rewards/margins": 0.005429488606750965,
302
+ "rewards/rejected": 0.003168874653056264,
303
+ "step": 200
304
+ },
305
+ {
306
+ "epoch": 0.11,
307
+ "learning_rate": 4.9984929711403395e-06,
308
+ "logits/chosen": 1.1236344575881958,
309
+ "logits/rejected": 1.2009334564208984,
310
+ "logps/chosen": -254.3011932373047,
311
+ "logps/rejected": -224.9448699951172,
312
+ "loss": 0.4988,
313
+ "rewards/accuracies": 0.581250011920929,
314
+ "rewards/chosen": 0.007906198501586914,
315
+ "rewards/margins": 0.005368872079998255,
316
+ "rewards/rejected": 0.0025373264215886593,
317
+ "step": 210
318
+ },
319
+ {
320
+ "epoch": 0.12,
321
+ "learning_rate": 4.996489634487865e-06,
322
+ "logits/chosen": 1.0867538452148438,
323
+ "logits/rejected": 1.2004356384277344,
324
+ "logps/chosen": -258.08062744140625,
325
+ "logps/rejected": -240.8439483642578,
326
+ "loss": 0.4988,
327
+ "rewards/accuracies": 0.625,
328
+ "rewards/chosen": 0.007928581908345222,
329
+ "rewards/margins": 0.004648840986192226,
330
+ "rewards/rejected": 0.003279739525169134,
331
+ "step": 220
332
+ },
333
+ {
334
+ "epoch": 0.12,
335
+ "learning_rate": 4.9936524905772466e-06,
336
+ "logits/chosen": 1.0192543268203735,
337
+ "logits/rejected": 1.2005066871643066,
338
+ "logps/chosen": -274.07037353515625,
339
+ "logps/rejected": -256.2618713378906,
340
+ "loss": 0.4988,
341
+ "rewards/accuracies": 0.5375000238418579,
342
+ "rewards/chosen": 0.006800153758376837,
343
+ "rewards/margins": 0.003369166050106287,
344
+ "rewards/rejected": 0.0034309872426092625,
345
+ "step": 230
346
+ },
347
+ {
348
+ "epoch": 0.13,
349
+ "learning_rate": 4.9899824869915e-06,
350
+ "logits/chosen": 1.111426830291748,
351
+ "logits/rejected": 1.1554086208343506,
352
+ "logps/chosen": -243.208984375,
353
+ "logps/rejected": -205.7252655029297,
354
+ "loss": 0.4984,
355
+ "rewards/accuracies": 0.612500011920929,
356
+ "rewards/chosen": 0.007739508058875799,
357
+ "rewards/margins": 0.0072770556434988976,
358
+ "rewards/rejected": 0.0004624520370271057,
359
+ "step": 240
360
+ },
361
+ {
362
+ "epoch": 0.13,
363
+ "learning_rate": 4.985480849482012e-06,
364
+ "logits/chosen": 1.1005799770355225,
365
+ "logits/rejected": 1.230799913406372,
366
+ "logps/chosen": -272.18597412109375,
367
+ "logps/rejected": -257.9790954589844,
368
+ "loss": 0.4988,
369
+ "rewards/accuracies": 0.5249999761581421,
370
+ "rewards/chosen": 0.005292638670653105,
371
+ "rewards/margins": 0.0029095064383000135,
372
+ "rewards/rejected": 0.0023831322323530912,
373
+ "step": 250
374
+ },
375
+ {
376
+ "epoch": 0.14,
377
+ "learning_rate": 4.980149081559142e-06,
378
+ "logits/chosen": 1.0777183771133423,
379
+ "logits/rejected": 1.155970573425293,
380
+ "logps/chosen": -294.93328857421875,
381
+ "logps/rejected": -261.9263610839844,
382
+ "loss": 0.4982,
383
+ "rewards/accuracies": 0.6312500238418579,
384
+ "rewards/chosen": 0.009707033634185791,
385
+ "rewards/margins": 0.008258306421339512,
386
+ "rewards/rejected": 0.00144872663076967,
387
+ "step": 260
388
+ },
389
+ {
390
+ "epoch": 0.14,
391
+ "learning_rate": 4.9739889639900655e-06,
392
+ "logits/chosen": 1.1088669300079346,
393
+ "logits/rejected": 1.1434690952301025,
394
+ "logps/chosen": -254.5012664794922,
395
+ "logps/rejected": -254.6510009765625,
396
+ "loss": 0.4979,
397
+ "rewards/accuracies": 0.65625,
398
+ "rewards/chosen": 0.009705386124551296,
399
+ "rewards/margins": 0.009683574549853802,
400
+ "rewards/rejected": 2.181164381909184e-05,
401
+ "step": 270
402
+ },
403
+ {
404
+ "epoch": 0.15,
405
+ "learning_rate": 4.967002554204009e-06,
406
+ "logits/chosen": 1.0548467636108398,
407
+ "logits/rejected": 1.1509649753570557,
408
+ "logps/chosen": -245.9481964111328,
409
+ "logps/rejected": -229.8827362060547,
410
+ "loss": 0.4985,
411
+ "rewards/accuracies": 0.59375,
412
+ "rewards/chosen": 0.009255246259272099,
413
+ "rewards/margins": 0.006528814323246479,
414
+ "rewards/rejected": 0.0027264312375336885,
415
+ "step": 280
416
+ },
417
+ {
418
+ "epoch": 0.15,
419
+ "learning_rate": 4.959192185605089e-06,
420
+ "logits/chosen": 1.0842396020889282,
421
+ "logits/rejected": 1.1220932006835938,
422
+ "logps/chosen": -266.4988708496094,
423
+ "logps/rejected": -246.9526824951172,
424
+ "loss": 0.4988,
425
+ "rewards/accuracies": 0.574999988079071,
426
+ "rewards/chosen": 0.009208474308252335,
427
+ "rewards/margins": 0.007726150564849377,
428
+ "rewards/rejected": 0.0014823225792497396,
429
+ "step": 290
430
+ },
431
+ {
432
+ "epoch": 0.16,
433
+ "learning_rate": 4.950560466792969e-06,
434
+ "logits/chosen": 1.1049131155014038,
435
+ "logits/rejected": 1.1441484689712524,
436
+ "logps/chosen": -275.13421630859375,
437
+ "logps/rejected": -246.1587677001953,
438
+ "loss": 0.4984,
439
+ "rewards/accuracies": 0.574999988079071,
440
+ "rewards/chosen": 0.006010602228343487,
441
+ "rewards/margins": 0.00892153661698103,
442
+ "rewards/rejected": -0.0029109339229762554,
443
+ "step": 300
444
+ },
445
+ {
446
+ "epoch": 0.16,
447
+ "learning_rate": 4.9411102806916185e-06,
448
+ "logits/chosen": 1.021583080291748,
449
+ "logits/rejected": 1.047163963317871,
450
+ "logps/chosen": -323.06097412109375,
451
+ "logps/rejected": -254.7588653564453,
452
+ "loss": 0.4977,
453
+ "rewards/accuracies": 0.6499999761581421,
454
+ "rewards/chosen": 0.008619217202067375,
455
+ "rewards/margins": 0.012051543220877647,
456
+ "rewards/rejected": -0.003432326018810272,
457
+ "step": 310
458
+ },
459
+ {
460
+ "epoch": 0.17,
461
+ "learning_rate": 4.930844783586424e-06,
462
+ "logits/chosen": 1.024611473083496,
463
+ "logits/rejected": 1.0655776262283325,
464
+ "logps/chosen": -238.3491668701172,
465
+ "logps/rejected": -231.0393829345703,
466
+ "loss": 0.498,
467
+ "rewards/accuracies": 0.59375,
468
+ "rewards/chosen": 0.006629918701946735,
469
+ "rewards/margins": 0.010882768779993057,
470
+ "rewards/rejected": -0.004252850078046322,
471
+ "step": 320
472
+ },
473
+ {
474
+ "epoch": 0.17,
475
+ "learning_rate": 4.919767404070033e-06,
476
+ "logits/chosen": 1.04720139503479,
477
+ "logits/rejected": 1.0630711317062378,
478
+ "logps/chosen": -261.62982177734375,
479
+ "logps/rejected": -247.97607421875,
480
+ "loss": 0.4981,
481
+ "rewards/accuracies": 0.5874999761581421,
482
+ "rewards/chosen": 0.0036115895491093397,
483
+ "rewards/margins": 0.009240304119884968,
484
+ "rewards/rejected": -0.005628715269267559,
485
+ "step": 330
486
+ },
487
+ {
488
+ "epoch": 0.18,
489
+ "learning_rate": 4.907881841897216e-06,
490
+ "logits/chosen": 1.0087223052978516,
491
+ "logits/rejected": 1.059715986251831,
492
+ "logps/chosen": -314.62408447265625,
493
+ "logps/rejected": -248.10879516601562,
494
+ "loss": 0.4979,
495
+ "rewards/accuracies": 0.637499988079071,
496
+ "rewards/chosen": 0.003107100958004594,
497
+ "rewards/margins": 0.013965976424515247,
498
+ "rewards/rejected": -0.010858876630663872,
499
+ "step": 340
500
+ },
501
+ {
502
+ "epoch": 0.18,
503
+ "learning_rate": 4.89519206674919e-06,
504
+ "logits/chosen": 0.9633463621139526,
505
+ "logits/rejected": 1.0100409984588623,
506
+ "logps/chosen": -241.84793090820312,
507
+ "logps/rejected": -252.7783203125,
508
+ "loss": 0.4976,
509
+ "rewards/accuracies": 0.625,
510
+ "rewards/chosen": 0.0028146414551883936,
511
+ "rewards/margins": 0.013054436072707176,
512
+ "rewards/rejected": -0.010239794850349426,
513
+ "step": 350
514
+ },
515
+ {
516
+ "epoch": 0.19,
517
+ "learning_rate": 4.881702316907769e-06,
518
+ "logits/chosen": 0.9069837331771851,
519
+ "logits/rejected": 1.0270668268203735,
520
+ "logps/chosen": -210.9730987548828,
521
+ "logps/rejected": -243.6437225341797,
522
+ "loss": 0.4983,
523
+ "rewards/accuracies": 0.5062500238418579,
524
+ "rewards/chosen": 0.0008282591588795185,
525
+ "rewards/margins": 0.010188087821006775,
526
+ "rewards/rejected": -0.009359828196465969,
527
+ "step": 360
528
+ },
529
+ {
530
+ "epoch": 0.19,
531
+ "learning_rate": 4.86741709783982e-06,
532
+ "logits/chosen": 0.8630668520927429,
533
+ "logits/rejected": 0.9914480447769165,
534
+ "logps/chosen": -332.7330627441406,
535
+ "logps/rejected": -281.46807861328125,
536
+ "loss": 0.4977,
537
+ "rewards/accuracies": 0.574999988079071,
538
+ "rewards/chosen": 0.0035103503614664078,
539
+ "rewards/margins": 0.01303508598357439,
540
+ "rewards/rejected": -0.009524735622107983,
541
+ "step": 370
542
+ },
543
+ {
544
+ "epoch": 0.2,
545
+ "learning_rate": 4.852341180692471e-06,
546
+ "logits/chosen": 0.9135398864746094,
547
+ "logits/rejected": 0.9984884262084961,
548
+ "logps/chosen": -284.92620849609375,
549
+ "logps/rejected": -252.03970336914062,
550
+ "loss": 0.4976,
551
+ "rewards/accuracies": 0.606249988079071,
552
+ "rewards/chosen": 0.0040648458525538445,
553
+ "rewards/margins": 0.0157476756721735,
554
+ "rewards/rejected": -0.011682827956974506,
555
+ "step": 380
556
+ },
557
+ {
558
+ "epoch": 0.2,
559
+ "learning_rate": 4.836479600699579e-06,
560
+ "logits/chosen": 0.9406082034111023,
561
+ "logits/rejected": 0.9047748446464539,
562
+ "logps/chosen": -278.61248779296875,
563
+ "logps/rejected": -284.1888732910156,
564
+ "loss": 0.4972,
565
+ "rewards/accuracies": 0.643750011920929,
566
+ "rewards/chosen": 0.006144961342215538,
567
+ "rewards/margins": 0.017049867659807205,
568
+ "rewards/rejected": -0.010904906317591667,
569
+ "step": 390
570
+ },
571
+ {
572
+ "epoch": 0.21,
573
+ "learning_rate": 4.819837655500014e-06,
574
+ "logits/chosen": 0.8400663137435913,
575
+ "logits/rejected": 0.9222391843795776,
576
+ "logps/chosen": -230.8615264892578,
577
+ "logps/rejected": -221.2638397216797,
578
+ "loss": 0.4984,
579
+ "rewards/accuracies": 0.625,
580
+ "rewards/chosen": 0.0005404525436460972,
581
+ "rewards/margins": 0.011931750923395157,
582
+ "rewards/rejected": -0.011391298845410347,
583
+ "step": 400
584
+ },
585
+ {
586
+ "epoch": 0.21,
587
+ "learning_rate": 4.802420903368286e-06,
588
+ "logits/chosen": 0.8889272809028625,
589
+ "logits/rejected": 0.8912805318832397,
590
+ "logps/chosen": -268.0902099609375,
591
+ "logps/rejected": -250.4331512451172,
592
+ "loss": 0.4979,
593
+ "rewards/accuracies": 0.53125,
594
+ "rewards/chosen": 0.0009850022615864873,
595
+ "rewards/margins": 0.010063153691589832,
596
+ "rewards/rejected": -0.009078151546418667,
597
+ "step": 410
598
+ },
599
+ {
600
+ "epoch": 0.22,
601
+ "learning_rate": 4.784235161358124e-06,
602
+ "logits/chosen": 0.8787338137626648,
603
+ "logits/rejected": 0.9284510612487793,
604
+ "logps/chosen": -288.6819152832031,
605
+ "logps/rejected": -265.958984375,
606
+ "loss": 0.4971,
607
+ "rewards/accuracies": 0.6812499761581421,
608
+ "rewards/chosen": 0.0032099136151373386,
609
+ "rewards/margins": 0.021029185503721237,
610
+ "rewards/rejected": -0.017819274216890335,
611
+ "step": 420
612
+ },
613
+ {
614
+ "epoch": 0.23,
615
+ "learning_rate": 4.765286503359632e-06,
616
+ "logits/chosen": 0.8820232152938843,
617
+ "logits/rejected": 0.9475772976875305,
618
+ "logps/chosen": -270.6169738769531,
619
+ "logps/rejected": -259.78839111328125,
620
+ "loss": 0.4973,
621
+ "rewards/accuracies": 0.6499999761581421,
622
+ "rewards/chosen": -0.0021267482079565525,
623
+ "rewards/margins": 0.019616421312093735,
624
+ "rewards/rejected": -0.021743169054389,
625
+ "step": 430
626
+ },
627
+ {
628
+ "epoch": 0.23,
629
+ "learning_rate": 4.745581258070654e-06,
630
+ "logits/chosen": 0.7767919301986694,
631
+ "logits/rejected": 0.87933349609375,
632
+ "logps/chosen": -254.14315795898438,
633
+ "logps/rejected": -252.87222290039062,
634
+ "loss": 0.498,
635
+ "rewards/accuracies": 0.5625,
636
+ "rewards/chosen": -0.0043312786146998405,
637
+ "rewards/margins": 0.013770043849945068,
638
+ "rewards/rejected": -0.018101321533322334,
639
+ "step": 440
640
+ },
641
+ {
642
+ "epoch": 0.24,
643
+ "learning_rate": 4.725126006883047e-06,
644
+ "logits/chosen": 0.7937654256820679,
645
+ "logits/rejected": 0.8364180326461792,
646
+ "logps/chosen": -238.3746337890625,
647
+ "logps/rejected": -241.1796875,
648
+ "loss": 0.4977,
649
+ "rewards/accuracies": 0.5375000238418579,
650
+ "rewards/chosen": -0.00781493354588747,
651
+ "rewards/margins": 0.011845615692436695,
652
+ "rewards/rejected": -0.019660547375679016,
653
+ "step": 450
654
+ },
655
+ {
656
+ "epoch": 0.24,
657
+ "learning_rate": 4.70392758168454e-06,
658
+ "logits/chosen": 0.7985974550247192,
659
+ "logits/rejected": 0.8068701028823853,
660
+ "logps/chosen": -345.21343994140625,
661
+ "logps/rejected": -304.43817138671875,
662
+ "loss": 0.4965,
663
+ "rewards/accuracies": 0.6312500238418579,
664
+ "rewards/chosen": -0.007276026997715235,
665
+ "rewards/margins": 0.02650422975420952,
666
+ "rewards/rejected": -0.033780258148908615,
667
+ "step": 460
668
+ },
669
+ {
670
+ "epoch": 0.25,
671
+ "learning_rate": 4.68199306257695e-06,
672
+ "logits/chosen": 0.7760607004165649,
673
+ "logits/rejected": 0.773891806602478,
674
+ "logps/chosen": -327.35369873046875,
675
+ "logps/rejected": -314.1829528808594,
676
+ "loss": 0.4961,
677
+ "rewards/accuracies": 0.65625,
678
+ "rewards/chosen": -0.014054256491363049,
679
+ "rewards/margins": 0.03367748484015465,
680
+ "rewards/rejected": -0.04773174598813057,
681
+ "step": 470
682
+ },
683
+ {
684
+ "epoch": 0.25,
685
+ "learning_rate": 4.659329775511478e-06,
686
+ "logits/chosen": 0.7017660140991211,
687
+ "logits/rejected": 0.7137667536735535,
688
+ "logps/chosen": -287.37652587890625,
689
+ "logps/rejected": -271.36358642578125,
690
+ "loss": 0.497,
691
+ "rewards/accuracies": 0.581250011920929,
692
+ "rewards/chosen": -0.021392906084656715,
693
+ "rewards/margins": 0.025359559804201126,
694
+ "rewards/rejected": -0.04675246775150299,
695
+ "step": 480
696
+ },
697
+ {
698
+ "epoch": 0.26,
699
+ "learning_rate": 4.635945289841902e-06,
700
+ "logits/chosen": 0.5314046144485474,
701
+ "logits/rejected": 0.5452633500099182,
702
+ "logps/chosen": -337.0295104980469,
703
+ "logps/rejected": -379.64593505859375,
704
+ "loss": 0.4958,
705
+ "rewards/accuracies": 0.612500011920929,
706
+ "rewards/chosen": -0.05135764926671982,
707
+ "rewards/margins": 0.04310908168554306,
708
+ "rewards/rejected": -0.09446673840284348,
709
+ "step": 490
710
+ },
711
+ {
712
+ "epoch": 0.26,
713
+ "learning_rate": 4.611847415796476e-06,
714
+ "logits/chosen": 0.29375532269477844,
715
+ "logits/rejected": 0.2797163724899292,
716
+ "logps/chosen": -427.3785095214844,
717
+ "logps/rejected": -405.41461181640625,
718
+ "loss": 0.4932,
719
+ "rewards/accuracies": 0.5625,
720
+ "rewards/chosen": -0.12762612104415894,
721
+ "rewards/margins": 0.03379129245877266,
722
+ "rewards/rejected": -0.16141743957996368,
723
+ "step": 500
724
+ },
725
+ {
726
+ "epoch": 0.27,
727
+ "learning_rate": 4.587044201869378e-06,
728
+ "logits/chosen": -0.2227209359407425,
729
+ "logits/rejected": -0.20223090052604675,
730
+ "logps/chosen": -787.1062622070312,
731
+ "logps/rejected": -1045.249267578125,
732
+ "loss": 0.4818,
733
+ "rewards/accuracies": 0.5249999761581421,
734
+ "rewards/chosen": -0.5222705602645874,
735
+ "rewards/margins": 0.26146870851516724,
736
+ "rewards/rejected": -0.7837392687797546,
737
+ "step": 510
738
+ },
739
+ {
740
+ "epoch": 0.27,
741
+ "learning_rate": 4.561543932132574e-06,
742
+ "logits/chosen": -0.11980749666690826,
743
+ "logits/rejected": -0.12788312137126923,
744
+ "logps/chosen": -732.790283203125,
745
+ "logps/rejected": -833.4085693359375,
746
+ "loss": 0.4873,
747
+ "rewards/accuracies": 0.5062500238418579,
748
+ "rewards/chosen": -0.45401230454444885,
749
+ "rewards/margins": 0.1505609005689621,
750
+ "rewards/rejected": -0.6045731902122498,
751
+ "step": 520
752
+ },
753
+ {
754
+ "epoch": 0.28,
755
+ "learning_rate": 4.535355123469009e-06,
756
+ "logits/chosen": -0.14909827709197998,
757
+ "logits/rejected": -0.18795037269592285,
758
+ "logps/chosen": -696.719482421875,
759
+ "logps/rejected": -1046.4390869140625,
760
+ "loss": 0.4836,
761
+ "rewards/accuracies": 0.581250011920929,
762
+ "rewards/chosen": -0.42250218987464905,
763
+ "rewards/margins": 0.3782690167427063,
764
+ "rewards/rejected": -0.8007712364196777,
765
+ "step": 530
766
+ },
767
+ {
768
+ "epoch": 0.28,
769
+ "learning_rate": 4.508486522728037e-06,
770
+ "logits/chosen": -0.18408063054084778,
771
+ "logits/rejected": -0.14851421117782593,
772
+ "logps/chosen": -893.5338134765625,
773
+ "logps/rejected": -1111.295654296875,
774
+ "loss": 0.4841,
775
+ "rewards/accuracies": 0.550000011920929,
776
+ "rewards/chosen": -0.629914402961731,
777
+ "rewards/margins": 0.23182418942451477,
778
+ "rewards/rejected": -0.8617385625839233,
779
+ "step": 540
780
+ },
781
+ {
782
+ "epoch": 0.29,
783
+ "learning_rate": 4.480947103804044e-06,
784
+ "logits/chosen": -0.20195765793323517,
785
+ "logits/rejected": -0.2249602973461151,
786
+ "logps/chosen": -970.3370971679688,
787
+ "logps/rejected": -1377.3724365234375,
788
+ "loss": 0.4747,
789
+ "rewards/accuracies": 0.625,
790
+ "rewards/chosen": -0.7055306434631348,
791
+ "rewards/margins": 0.46008825302124023,
792
+ "rewards/rejected": -1.165618896484375,
793
+ "step": 550
794
+ },
795
+ {
796
+ "epoch": 0.29,
797
+ "learning_rate": 4.452746064639239e-06,
798
+ "logits/chosen": -0.27148136496543884,
799
+ "logits/rejected": -0.24398574233055115,
800
+ "logps/chosen": -1213.016357421875,
801
+ "logps/rejected": -1345.276123046875,
802
+ "loss": 0.4846,
803
+ "rewards/accuracies": 0.48750001192092896,
804
+ "rewards/chosen": -0.8822624087333679,
805
+ "rewards/margins": 0.20509858429431915,
806
+ "rewards/rejected": -1.0873609781265259,
807
+ "step": 560
808
+ },
809
+ {
810
+ "epoch": 0.3,
811
+ "learning_rate": 4.423892824151617e-06,
812
+ "logits/chosen": -0.32366353273391724,
813
+ "logits/rejected": -0.3419601321220398,
814
+ "logps/chosen": -1556.5191650390625,
815
+ "logps/rejected": -1787.370361328125,
816
+ "loss": 0.4843,
817
+ "rewards/accuracies": 0.550000011920929,
818
+ "rewards/chosen": -1.2833073139190674,
819
+ "rewards/margins": 0.28631919622421265,
820
+ "rewards/rejected": -1.5696265697479248,
821
+ "step": 570
822
+ },
823
+ {
824
+ "epoch": 0.3,
825
+ "learning_rate": 4.3943970190891164e-06,
826
+ "logits/chosen": -0.23246267437934875,
827
+ "logits/rejected": -0.25014322996139526,
828
+ "logps/chosen": -1218.4473876953125,
829
+ "logps/rejected": -1182.8861083984375,
830
+ "loss": 0.4809,
831
+ "rewards/accuracies": 0.48124998807907104,
832
+ "rewards/chosen": -0.9385588765144348,
833
+ "rewards/margins": 0.02590467967092991,
834
+ "rewards/rejected": -0.9644634127616882,
835
+ "step": 580
836
+ },
837
+ {
838
+ "epoch": 0.31,
839
+ "learning_rate": 4.364268500811025e-06,
840
+ "logits/chosen": -0.17416557669639587,
841
+ "logits/rejected": -0.17902135848999023,
842
+ "logps/chosen": -985.0808715820312,
843
+ "logps/rejected": -1348.008056640625,
844
+ "loss": 0.4847,
845
+ "rewards/accuracies": 0.606249988079071,
846
+ "rewards/chosen": -0.6908355951309204,
847
+ "rewards/margins": 0.41670989990234375,
848
+ "rewards/rejected": -1.1075454950332642,
849
+ "step": 590
850
+ },
851
+ {
852
+ "epoch": 0.31,
853
+ "learning_rate": 4.333517331997704e-06,
854
+ "logits/chosen": -0.14922045171260834,
855
+ "logits/rejected": -0.19132760167121887,
856
+ "logps/chosen": -1128.3173828125,
857
+ "logps/rejected": -1489.837158203125,
858
+ "loss": 0.4745,
859
+ "rewards/accuracies": 0.5562499761581421,
860
+ "rewards/chosen": -0.8316856622695923,
861
+ "rewards/margins": 0.4227636754512787,
862
+ "rewards/rejected": -1.2544492483139038,
863
+ "step": 600
864
+ },
865
+ {
866
+ "epoch": 0.32,
867
+ "learning_rate": 4.302153783289737e-06,
868
+ "logits/chosen": -0.1274535059928894,
869
+ "logits/rejected": -0.17803938686847687,
870
+ "logps/chosen": -1048.890625,
871
+ "logps/rejected": -1506.1158447265625,
872
+ "loss": 0.4743,
873
+ "rewards/accuracies": 0.625,
874
+ "rewards/chosen": -0.7957647442817688,
875
+ "rewards/margins": 0.4669608175754547,
876
+ "rewards/rejected": -1.2627254724502563,
877
+ "step": 610
878
+ },
879
+ {
880
+ "epoch": 0.32,
881
+ "learning_rate": 4.270188329857613e-06,
882
+ "logits/chosen": -0.14815063774585724,
883
+ "logits/rejected": -0.15499570965766907,
884
+ "logps/chosen": -1084.8118896484375,
885
+ "logps/rejected": -1618.885009765625,
886
+ "loss": 0.4711,
887
+ "rewards/accuracies": 0.5625,
888
+ "rewards/chosen": -0.7856907844543457,
889
+ "rewards/margins": 0.5791957974433899,
890
+ "rewards/rejected": -1.3648868799209595,
891
+ "step": 620
892
+ },
893
+ {
894
+ "epoch": 0.33,
895
+ "learning_rate": 4.237631647903115e-06,
896
+ "logits/chosen": -0.024261217564344406,
897
+ "logits/rejected": -0.038342759013175964,
898
+ "logps/chosen": -723.5900268554688,
899
+ "logps/rejected": -1155.1717529296875,
900
+ "loss": 0.4678,
901
+ "rewards/accuracies": 0.5874999761581421,
902
+ "rewards/chosen": -0.46949324011802673,
903
+ "rewards/margins": 0.45854002237319946,
904
+ "rewards/rejected": -0.9280332326889038,
905
+ "step": 630
906
+ },
907
+ {
908
+ "epoch": 0.33,
909
+ "learning_rate": 4.204494611093548e-06,
910
+ "logits/chosen": -0.05518772080540657,
911
+ "logits/rejected": -0.100825235247612,
912
+ "logps/chosen": -1270.6005859375,
913
+ "logps/rejected": -1703.8551025390625,
914
+ "loss": 0.4819,
915
+ "rewards/accuracies": 0.48124998807907104,
916
+ "rewards/chosen": -0.9448369145393372,
917
+ "rewards/margins": 0.4941697120666504,
918
+ "rewards/rejected": -1.4390065670013428,
919
+ "step": 640
920
+ },
921
+ {
922
+ "epoch": 0.34,
923
+ "learning_rate": 4.170788286930024e-06,
924
+ "logits/chosen": -0.06449203193187714,
925
+ "logits/rejected": -0.1527264416217804,
926
+ "logps/chosen": -1250.4991455078125,
927
+ "logps/rejected": -1752.0111083984375,
928
+ "loss": 0.4822,
929
+ "rewards/accuracies": 0.550000011920929,
930
+ "rewards/chosen": -0.9863438606262207,
931
+ "rewards/margins": 0.5237391591072083,
932
+ "rewards/rejected": -1.5100830793380737,
933
+ "step": 650
934
+ },
935
+ {
936
+ "epoch": 0.35,
937
+ "learning_rate": 4.136523933051005e-06,
938
+ "logits/chosen": -0.10980840772390366,
939
+ "logits/rejected": -0.13391873240470886,
940
+ "logps/chosen": -1053.7823486328125,
941
+ "logps/rejected": -1614.2884521484375,
942
+ "loss": 0.4762,
943
+ "rewards/accuracies": 0.518750011920929,
944
+ "rewards/chosen": -0.8304306864738464,
945
+ "rewards/margins": 0.5787540078163147,
946
+ "rewards/rejected": -1.4091846942901611,
947
+ "step": 660
948
+ },
949
+ {
950
+ "epoch": 0.35,
951
+ "learning_rate": 4.101712993472348e-06,
952
+ "logits/chosen": -0.10138118267059326,
953
+ "logits/rejected": -0.13220438361167908,
954
+ "logps/chosen": -1581.559326171875,
955
+ "logps/rejected": -1862.4993896484375,
956
+ "loss": 0.481,
957
+ "rewards/accuracies": 0.512499988079071,
958
+ "rewards/chosen": -1.2885770797729492,
959
+ "rewards/margins": 0.32578420639038086,
960
+ "rewards/rejected": -1.6143611669540405,
961
+ "step": 670
962
+ },
963
+ {
964
+ "epoch": 0.36,
965
+ "learning_rate": 4.066367094765091e-06,
966
+ "logits/chosen": -0.06212924048304558,
967
+ "logits/rejected": -0.09771373122930527,
968
+ "logps/chosen": -1470.7352294921875,
969
+ "logps/rejected": -1844.652587890625,
970
+ "loss": 0.4783,
971
+ "rewards/accuracies": 0.53125,
972
+ "rewards/chosen": -1.204660177230835,
973
+ "rewards/margins": 0.3980388641357422,
974
+ "rewards/rejected": -1.6026990413665771,
975
+ "step": 680
976
+ },
977
+ {
978
+ "epoch": 0.36,
979
+ "learning_rate": 4.030498042172277e-06,
980
+ "logits/chosen": 0.01754361391067505,
981
+ "logits/rejected": -0.048445507884025574,
982
+ "logps/chosen": -979.1268310546875,
983
+ "logps/rejected": -1244.6566162109375,
984
+ "loss": 0.4726,
985
+ "rewards/accuracies": 0.5687500238418579,
986
+ "rewards/chosen": -0.7009618878364563,
987
+ "rewards/margins": 0.3003775477409363,
988
+ "rewards/rejected": -1.0013394355773926,
989
+ "step": 690
990
+ },
991
+ {
992
+ "epoch": 0.37,
993
+ "learning_rate": 3.994117815666095e-06,
994
+ "logits/chosen": -0.04728760942816734,
995
+ "logits/rejected": -0.0919174998998642,
996
+ "logps/chosen": -1344.916015625,
997
+ "logps/rejected": -1900.5986328125,
998
+ "loss": 0.472,
999
+ "rewards/accuracies": 0.5249999761581421,
1000
+ "rewards/chosen": -1.0581436157226562,
1001
+ "rewards/margins": 0.5791832208633423,
1002
+ "rewards/rejected": -1.6373268365859985,
1003
+ "step": 700
1004
+ },
1005
+ {
1006
+ "epoch": 0.37,
1007
+ "learning_rate": 3.957238565946672e-06,
1008
+ "logits/chosen": 0.004687662236392498,
1009
+ "logits/rejected": -0.06074858829379082,
1010
+ "logps/chosen": -1193.521240234375,
1011
+ "logps/rejected": -2065.345947265625,
1012
+ "loss": 0.4653,
1013
+ "rewards/accuracies": 0.637499988079071,
1014
+ "rewards/chosen": -0.8945425152778625,
1015
+ "rewards/margins": 0.8969429731369019,
1016
+ "rewards/rejected": -1.7914857864379883,
1017
+ "step": 710
1018
+ },
1019
+ {
1020
+ "epoch": 0.38,
1021
+ "learning_rate": 3.919872610383831e-06,
1022
+ "logits/chosen": 0.07505255192518234,
1023
+ "logits/rejected": -0.015723228454589844,
1024
+ "logps/chosen": -1065.49365234375,
1025
+ "logps/rejected": -1707.6328125,
1026
+ "loss": 0.4739,
1027
+ "rewards/accuracies": 0.550000011920929,
1028
+ "rewards/chosen": -0.7956100702285767,
1029
+ "rewards/margins": 0.6891741752624512,
1030
+ "rewards/rejected": -1.4847842454910278,
1031
+ "step": 720
1032
+ },
1033
+ {
1034
+ "epoch": 0.38,
1035
+ "learning_rate": 3.882032428903195e-06,
1036
+ "logits/chosen": 0.02505052089691162,
1037
+ "logits/rejected": -0.009700920432806015,
1038
+ "logps/chosen": -1372.3634033203125,
1039
+ "logps/rejected": -2129.860595703125,
1040
+ "loss": 0.4656,
1041
+ "rewards/accuracies": 0.5562499761581421,
1042
+ "rewards/chosen": -1.0954824686050415,
1043
+ "rewards/margins": 0.7768491506576538,
1044
+ "rewards/rejected": -1.8723316192626953,
1045
+ "step": 730
1046
+ },
1047
+ {
1048
+ "epoch": 0.39,
1049
+ "learning_rate": 3.84373065981799e-06,
1050
+ "logits/chosen": 0.1249980553984642,
1051
+ "logits/rejected": 0.04747745767235756,
1052
+ "logps/chosen": -956.18115234375,
1053
+ "logps/rejected": -1541.792724609375,
1054
+ "loss": 0.4693,
1055
+ "rewards/accuracies": 0.625,
1056
+ "rewards/chosen": -0.6708589792251587,
1057
+ "rewards/margins": 0.6129963994026184,
1058
+ "rewards/rejected": -1.2838553190231323,
1059
+ "step": 740
1060
+ },
1061
+ {
1062
+ "epoch": 0.39,
1063
+ "learning_rate": 3.8049800956079552e-06,
1064
+ "logits/chosen": 0.23526708781719208,
1065
+ "logits/rejected": 0.19636312127113342,
1066
+ "logps/chosen": -1106.01513671875,
1067
+ "logps/rejected": -1326.5162353515625,
1068
+ "loss": 0.4752,
1069
+ "rewards/accuracies": 0.518750011920929,
1070
+ "rewards/chosen": -0.8387149572372437,
1071
+ "rewards/margins": 0.24964456260204315,
1072
+ "rewards/rejected": -1.0883597135543823,
1073
+ "step": 750
1074
+ },
1075
+ {
1076
+ "epoch": 0.4,
1077
+ "learning_rate": 3.765793678646753e-06,
1078
+ "logits/chosen": 0.19188269972801208,
1079
+ "logits/rejected": 0.1782020926475525,
1080
+ "logps/chosen": -802.7251586914062,
1081
+ "logps/rejected": -1634.812255859375,
1082
+ "loss": 0.4647,
1083
+ "rewards/accuracies": 0.606249988079071,
1084
+ "rewards/chosen": -0.5356382727622986,
1085
+ "rewards/margins": 0.8580523729324341,
1086
+ "rewards/rejected": -1.3936904668807983,
1087
+ "step": 760
1088
+ },
1089
+ {
1090
+ "epoch": 0.4,
1091
+ "learning_rate": 3.726184496879323e-06,
1092
+ "logits/chosen": 0.14159968495368958,
1093
+ "logits/rejected": 0.08811040967702866,
1094
+ "logps/chosen": -1127.4029541015625,
1095
+ "logps/rejected": -1502.1641845703125,
1096
+ "loss": 0.4756,
1097
+ "rewards/accuracies": 0.550000011920929,
1098
+ "rewards/chosen": -0.8773125410079956,
1099
+ "rewards/margins": 0.3974476158618927,
1100
+ "rewards/rejected": -1.274760365486145,
1101
+ "step": 770
1102
+ },
1103
+ {
1104
+ "epoch": 0.41,
1105
+ "learning_rate": 3.686165779450619e-06,
1106
+ "logits/chosen": 0.1939581334590912,
1107
+ "logits/rejected": 0.1522776186466217,
1108
+ "logps/chosen": -968.0919799804688,
1109
+ "logps/rejected": -1507.5386962890625,
1110
+ "loss": 0.4793,
1111
+ "rewards/accuracies": 0.48750001192092896,
1112
+ "rewards/chosen": -0.7149516344070435,
1113
+ "rewards/margins": 0.5672934055328369,
1114
+ "rewards/rejected": -1.2822450399398804,
1115
+ "step": 780
1116
+ },
1117
+ {
1118
+ "epoch": 0.41,
1119
+ "learning_rate": 3.645750892287178e-06,
1120
+ "logits/chosen": 0.1306479275226593,
1121
+ "logits/rejected": 0.05887848883867264,
1122
+ "logps/chosen": -1289.082275390625,
1123
+ "logps/rejected": -1864.7164306640625,
1124
+ "loss": 0.4721,
1125
+ "rewards/accuracies": 0.5562499761581421,
1126
+ "rewards/chosen": -0.9888286590576172,
1127
+ "rewards/margins": 0.6331573724746704,
1128
+ "rewards/rejected": -1.6219860315322876,
1129
+ "step": 790
1130
+ },
1131
+ {
1132
+ "epoch": 0.42,
1133
+ "learning_rate": 3.604953333633009e-06,
1134
+ "logits/chosen": 0.205116868019104,
1135
+ "logits/rejected": 0.15303435921669006,
1136
+ "logps/chosen": -848.7705078125,
1137
+ "logps/rejected": -1336.090576171875,
1138
+ "loss": 0.4708,
1139
+ "rewards/accuracies": 0.574999988079071,
1140
+ "rewards/chosen": -0.5991231799125671,
1141
+ "rewards/margins": 0.5247097015380859,
1142
+ "rewards/rejected": -1.1238329410552979,
1143
+ "step": 800
1144
+ },
1145
+ {
1146
+ "epoch": 0.42,
1147
+ "learning_rate": 3.56378672954129e-06,
1148
+ "logits/chosen": 0.22229023277759552,
1149
+ "logits/rejected": 0.17705193161964417,
1150
+ "logps/chosen": -1094.6126708984375,
1151
+ "logps/rejected": -1681.7445068359375,
1152
+ "loss": 0.469,
1153
+ "rewards/accuracies": 0.5562499761581421,
1154
+ "rewards/chosen": -0.8267385363578796,
1155
+ "rewards/margins": 0.6382580995559692,
1156
+ "rewards/rejected": -1.4649966955184937,
1157
+ "step": 810
1158
+ },
1159
+ {
1160
+ "epoch": 0.43,
1161
+ "learning_rate": 3.5222648293233806e-06,
1162
+ "logits/chosen": 0.1940724402666092,
1163
+ "logits/rejected": 0.1474287211894989,
1164
+ "logps/chosen": -1133.5128173828125,
1165
+ "logps/rejected": -1901.333984375,
1166
+ "loss": 0.4687,
1167
+ "rewards/accuracies": 0.5687500238418579,
1168
+ "rewards/chosen": -0.8616431951522827,
1169
+ "rewards/margins": 0.8072026968002319,
1170
+ "rewards/rejected": -1.6688458919525146,
1171
+ "step": 820
1172
+ },
1173
+ {
1174
+ "epoch": 0.43,
1175
+ "learning_rate": 3.4804015009566573e-06,
1176
+ "logits/chosen": 0.14867620170116425,
1177
+ "logits/rejected": 0.050886522978544235,
1178
+ "logps/chosen": -1169.879638671875,
1179
+ "logps/rejected": -2415.080078125,
1180
+ "loss": 0.4639,
1181
+ "rewards/accuracies": 0.6499999761581421,
1182
+ "rewards/chosen": -0.9197257161140442,
1183
+ "rewards/margins": 1.2611197233200073,
1184
+ "rewards/rejected": -2.180845260620117,
1185
+ "step": 830
1186
+ },
1187
+ {
1188
+ "epoch": 0.44,
1189
+ "learning_rate": 3.4382107264527244e-06,
1190
+ "logits/chosen": 0.16670770943164825,
1191
+ "logits/rejected": 0.11358609050512314,
1192
+ "logps/chosen": -1215.7694091796875,
1193
+ "logps/rejected": -1938.170654296875,
1194
+ "loss": 0.4701,
1195
+ "rewards/accuracies": 0.5562499761581421,
1196
+ "rewards/chosen": -0.9481611251831055,
1197
+ "rewards/margins": 0.7456313967704773,
1198
+ "rewards/rejected": -1.6937923431396484,
1199
+ "step": 840
1200
+ },
1201
+ {
1202
+ "epoch": 0.44,
1203
+ "learning_rate": 3.3957065971875387e-06,
1204
+ "logits/chosen": 0.24467554688453674,
1205
+ "logits/rejected": 0.1815129816532135,
1206
+ "logps/chosen": -1700.726806640625,
1207
+ "logps/rejected": -2238.2724609375,
1208
+ "loss": 0.4738,
1209
+ "rewards/accuracies": 0.4749999940395355,
1210
+ "rewards/chosen": -1.4334746599197388,
1211
+ "rewards/margins": 0.5637392997741699,
1212
+ "rewards/rejected": -1.9972139596939087,
1213
+ "step": 850
1214
+ },
1215
+ {
1216
+ "epoch": 0.45,
1217
+ "learning_rate": 3.352903309194999e-06,
1218
+ "logits/chosen": 0.25681573152542114,
1219
+ "logits/rejected": 0.22445912659168243,
1220
+ "logps/chosen": -1175.2008056640625,
1221
+ "logps/rejected": -1852.9886474609375,
1222
+ "loss": 0.476,
1223
+ "rewards/accuracies": 0.6000000238418579,
1224
+ "rewards/chosen": -0.9056652784347534,
1225
+ "rewards/margins": 0.6970826387405396,
1226
+ "rewards/rejected": -1.6027476787567139,
1227
+ "step": 860
1228
+ },
1229
+ {
1230
+ "epoch": 0.46,
1231
+ "learning_rate": 3.309815158425591e-06,
1232
+ "logits/chosen": 0.35658639669418335,
1233
+ "logits/rejected": 0.23468701541423798,
1234
+ "logps/chosen": -1126.1968994140625,
1235
+ "logps/rejected": -1490.5289306640625,
1236
+ "loss": 0.4765,
1237
+ "rewards/accuracies": 0.53125,
1238
+ "rewards/chosen": -0.8638699650764465,
1239
+ "rewards/margins": 0.4017399847507477,
1240
+ "rewards/rejected": -1.265609860420227,
1241
+ "step": 870
1242
+ },
1243
+ {
1244
+ "epoch": 0.46,
1245
+ "learning_rate": 3.266456535971654e-06,
1246
+ "logits/chosen": 0.29603832960128784,
1247
+ "logits/rejected": 0.2804957330226898,
1248
+ "logps/chosen": -1391.908447265625,
1249
+ "logps/rejected": -1630.26220703125,
1250
+ "loss": 0.4842,
1251
+ "rewards/accuracies": 0.5625,
1252
+ "rewards/chosen": -1.103570580482483,
1253
+ "rewards/margins": 0.2950761914253235,
1254
+ "rewards/rejected": -1.3986468315124512,
1255
+ "step": 880
1256
+ },
1257
+ {
1258
+ "epoch": 0.47,
1259
+ "learning_rate": 3.2228419232608692e-06,
1260
+ "logits/chosen": 0.25324004888534546,
1261
+ "logits/rejected": 0.19424840807914734,
1262
+ "logps/chosen": -1254.45947265625,
1263
+ "logps/rejected": -1625.465087890625,
1264
+ "loss": 0.483,
1265
+ "rewards/accuracies": 0.4437499940395355,
1266
+ "rewards/chosen": -1.0103174448013306,
1267
+ "rewards/margins": 0.3879779279232025,
1268
+ "rewards/rejected": -1.3982954025268555,
1269
+ "step": 890
1270
+ },
1271
+ {
1272
+ "epoch": 0.47,
1273
+ "learning_rate": 3.1789858872195888e-06,
1274
+ "logits/chosen": 0.35612553358078003,
1275
+ "logits/rejected": 0.2640685737133026,
1276
+ "logps/chosen": -1018.1083984375,
1277
+ "logps/rejected": -1447.966796875,
1278
+ "loss": 0.4713,
1279
+ "rewards/accuracies": 0.612500011920929,
1280
+ "rewards/chosen": -0.7736637592315674,
1281
+ "rewards/margins": 0.4553070068359375,
1282
+ "rewards/rejected": -1.2289707660675049,
1283
+ "step": 900
1284
+ },
1285
+ {
1286
+ "epoch": 0.48,
1287
+ "learning_rate": 3.1349030754075945e-06,
1288
+ "logits/chosen": 0.32709187269210815,
1289
+ "logits/rejected": 0.27523329854011536,
1290
+ "logps/chosen": -996.7443237304688,
1291
+ "logps/rejected": -1309.497802734375,
1292
+ "loss": 0.4674,
1293
+ "rewards/accuracies": 0.550000011920929,
1294
+ "rewards/chosen": -0.6882850527763367,
1295
+ "rewards/margins": 0.3720100224018097,
1296
+ "rewards/rejected": -1.0602951049804688,
1297
+ "step": 910
1298
+ },
1299
+ {
1300
+ "epoch": 0.48,
1301
+ "learning_rate": 3.0906082111259313e-06,
1302
+ "logits/chosen": 0.28385213017463684,
1303
+ "logits/rejected": 0.26248598098754883,
1304
+ "logps/chosen": -1238.9512939453125,
1305
+ "logps/rejected": -1446.0545654296875,
1306
+ "loss": 0.4729,
1307
+ "rewards/accuracies": 0.53125,
1308
+ "rewards/chosen": -0.9621122479438782,
1309
+ "rewards/margins": 0.24497418105602264,
1310
+ "rewards/rejected": -1.207086443901062,
1311
+ "step": 920
1312
+ },
1313
+ {
1314
+ "epoch": 0.49,
1315
+ "learning_rate": 3.046116088499449e-06,
1316
+ "logits/chosen": 0.20961081981658936,
1317
+ "logits/rejected": 0.12288858741521835,
1318
+ "logps/chosen": -1385.43359375,
1319
+ "logps/rejected": -2388.202392578125,
1320
+ "loss": 0.4591,
1321
+ "rewards/accuracies": 0.581250011920929,
1322
+ "rewards/chosen": -1.1082097291946411,
1323
+ "rewards/margins": 1.0280786752700806,
1324
+ "rewards/rejected": -2.1362884044647217,
1325
+ "step": 930
1326
+ },
1327
+ {
1328
+ "epoch": 0.49,
1329
+ "learning_rate": 3.0014415675356813e-06,
1330
+ "logits/chosen": 0.2143702507019043,
1331
+ "logits/rejected": 0.12640917301177979,
1332
+ "logps/chosen": -1842.924072265625,
1333
+ "logps/rejected": -2572.03759765625,
1334
+ "loss": 0.4703,
1335
+ "rewards/accuracies": 0.518750011920929,
1336
+ "rewards/chosen": -1.5414974689483643,
1337
+ "rewards/margins": 0.7958974838256836,
1338
+ "rewards/rejected": -2.337394952774048,
1339
+ "step": 940
1340
+ },
1341
+ {
1342
+ "epoch": 0.5,
1343
+ "learning_rate": 2.9565995691617242e-06,
1344
+ "logits/chosen": 0.2267983853816986,
1345
+ "logits/rejected": 0.19906947016716003,
1346
+ "logps/chosen": -1659.0390625,
1347
+ "logps/rejected": -1897.612548828125,
1348
+ "loss": 0.4796,
1349
+ "rewards/accuracies": 0.45625001192092896,
1350
+ "rewards/chosen": -1.4338902235031128,
1351
+ "rewards/margins": 0.23841390013694763,
1352
+ "rewards/rejected": -1.672304391860962,
1353
+ "step": 950
1354
+ },
1355
+ {
1356
+ "epoch": 0.5,
1357
+ "learning_rate": 2.9116050702407706e-06,
1358
+ "logits/chosen": 0.2076607495546341,
1359
+ "logits/rejected": 0.15953665971755981,
1360
+ "logps/chosen": -1761.4957275390625,
1361
+ "logps/rejected": -2119.157470703125,
1362
+ "loss": 0.4733,
1363
+ "rewards/accuracies": 0.518750011920929,
1364
+ "rewards/chosen": -1.521126627922058,
1365
+ "rewards/margins": 0.37375563383102417,
1366
+ "rewards/rejected": -1.8948824405670166,
1367
+ "step": 960
1368
+ },
1369
+ {
1370
+ "epoch": 0.51,
1371
+ "learning_rate": 2.8664730985699537e-06,
1372
+ "logits/chosen": 0.2155609130859375,
1373
+ "logits/rejected": 0.15363694727420807,
1374
+ "logps/chosen": -1374.277587890625,
1375
+ "logps/rejected": -2335.11962890625,
1376
+ "loss": 0.4691,
1377
+ "rewards/accuracies": 0.512499988079071,
1378
+ "rewards/chosen": -1.131838083267212,
1379
+ "rewards/margins": 0.9804404973983765,
1380
+ "rewards/rejected": -2.112278461456299,
1381
+ "step": 970
1382
+ },
1383
+ {
1384
+ "epoch": 0.51,
1385
+ "learning_rate": 2.8212187278611907e-06,
1386
+ "logits/chosen": 0.3766547739505768,
1387
+ "logits/rejected": 0.23996052145957947,
1388
+ "logps/chosen": -978.6238403320312,
1389
+ "logps/rejected": -1637.2352294921875,
1390
+ "loss": 0.4666,
1391
+ "rewards/accuracies": 0.5874999761581421,
1392
+ "rewards/chosen": -0.6910273432731628,
1393
+ "rewards/margins": 0.7083319425582886,
1394
+ "rewards/rejected": -1.3993593454360962,
1395
+ "step": 980
1396
+ },
1397
+ {
1398
+ "epoch": 0.52,
1399
+ "learning_rate": 2.7758570727066843e-06,
1400
+ "logits/chosen": 0.3515971899032593,
1401
+ "logits/rejected": 0.2718420922756195,
1402
+ "logps/chosen": -945.19921875,
1403
+ "logps/rejected": -1549.3182373046875,
1404
+ "loss": 0.4667,
1405
+ "rewards/accuracies": 0.543749988079071,
1406
+ "rewards/chosen": -0.6832455396652222,
1407
+ "rewards/margins": 0.6422259211540222,
1408
+ "rewards/rejected": -1.3254715204238892,
1409
+ "step": 990
1410
+ },
1411
+ {
1412
+ "epoch": 0.52,
1413
+ "learning_rate": 2.730403283530767e-06,
1414
+ "logits/chosen": 0.3331068158149719,
1415
+ "logits/rejected": 0.21990351378917694,
1416
+ "logps/chosen": -957.8298950195312,
1417
+ "logps/rejected": -1847.413330078125,
1418
+ "loss": 0.4687,
1419
+ "rewards/accuracies": 0.6000000238418579,
1420
+ "rewards/chosen": -0.6945260167121887,
1421
+ "rewards/margins": 0.9040031433105469,
1422
+ "rewards/rejected": -1.5985292196273804,
1423
+ "step": 1000
1424
+ },
1425
+ {
1426
+ "epoch": 0.53,
1427
+ "learning_rate": 2.6848725415297888e-06,
1428
+ "logits/chosen": 0.24949748814105988,
1429
+ "logits/rejected": 0.1596693992614746,
1430
+ "logps/chosen": -1084.4876708984375,
1431
+ "logps/rejected": -1898.144287109375,
1432
+ "loss": 0.4618,
1433
+ "rewards/accuracies": 0.59375,
1434
+ "rewards/chosen": -0.8169393539428711,
1435
+ "rewards/margins": 0.8545991778373718,
1436
+ "rewards/rejected": -1.6715381145477295,
1437
+ "step": 1010
1438
+ },
1439
+ {
1440
+ "epoch": 0.53,
1441
+ "learning_rate": 2.639280053601719e-06,
1442
+ "logits/chosen": 0.22901049256324768,
1443
+ "logits/rejected": 0.1595744788646698,
1444
+ "logps/chosen": -1491.752197265625,
1445
+ "logps/rejected": -2144.299560546875,
1446
+ "loss": 0.4707,
1447
+ "rewards/accuracies": 0.59375,
1448
+ "rewards/chosen": -1.2246992588043213,
1449
+ "rewards/margins": 0.6539346575737,
1450
+ "rewards/rejected": -1.8786340951919556,
1451
+ "step": 1020
1452
+ },
1453
+ {
1454
+ "epoch": 0.54,
1455
+ "learning_rate": 2.59364104726716e-06,
1456
+ "logits/chosen": 0.31597059965133667,
1457
+ "logits/rejected": 0.21497178077697754,
1458
+ "logps/chosen": -1171.93212890625,
1459
+ "logps/rejected": -1925.6861572265625,
1460
+ "loss": 0.465,
1461
+ "rewards/accuracies": 0.5562499761581421,
1462
+ "rewards/chosen": -0.8903388977050781,
1463
+ "rewards/margins": 0.8044350743293762,
1464
+ "rewards/rejected": -1.6947739124298096,
1465
+ "step": 1030
1466
+ },
1467
+ {
1468
+ "epoch": 0.54,
1469
+ "learning_rate": 2.547970765583491e-06,
1470
+ "logits/chosen": 0.35459914803504944,
1471
+ "logits/rejected": 0.21209494769573212,
1472
+ "logps/chosen": -1010.3555908203125,
1473
+ "logps/rejected": -1694.515869140625,
1474
+ "loss": 0.4642,
1475
+ "rewards/accuracies": 0.6187499761581421,
1476
+ "rewards/chosen": -0.7338335514068604,
1477
+ "rewards/margins": 0.7184348106384277,
1478
+ "rewards/rejected": -1.452268362045288,
1479
+ "step": 1040
1480
+ },
1481
+ {
1482
+ "epoch": 0.55,
1483
+ "learning_rate": 2.502284462053799e-06,
1484
+ "logits/chosen": 0.2834840416908264,
1485
+ "logits/rejected": 0.19832350313663483,
1486
+ "logps/chosen": -1069.5567626953125,
1487
+ "logps/rejected": -1713.046142578125,
1488
+ "loss": 0.4653,
1489
+ "rewards/accuracies": 0.574999988079071,
1490
+ "rewards/chosen": -0.781063437461853,
1491
+ "rewards/margins": 0.6842104196548462,
1492
+ "rewards/rejected": -1.4652738571166992,
1493
+ "step": 1050
1494
+ },
1495
+ {
1496
+ "epoch": 0.55,
1497
+ "learning_rate": 2.456597395532338e-06,
1498
+ "logits/chosen": 0.23369982838630676,
1499
+ "logits/rejected": 0.15703235566616058,
1500
+ "logps/chosen": -1476.560546875,
1501
+ "logps/rejected": -2163.74267578125,
1502
+ "loss": 0.4708,
1503
+ "rewards/accuracies": 0.581250011920929,
1504
+ "rewards/chosen": -1.1821922063827515,
1505
+ "rewards/margins": 0.7186304330825806,
1506
+ "rewards/rejected": -1.900822639465332,
1507
+ "step": 1060
1508
+ },
1509
+ {
1510
+ "epoch": 0.56,
1511
+ "learning_rate": 2.4109248251281953e-06,
1512
+ "logits/chosen": 0.2690127491950989,
1513
+ "logits/rejected": 0.1083533763885498,
1514
+ "logps/chosen": -1436.783447265625,
1515
+ "logps/rejected": -2573.591064453125,
1516
+ "loss": 0.4639,
1517
+ "rewards/accuracies": 0.6000000238418579,
1518
+ "rewards/chosen": -1.1651248931884766,
1519
+ "rewards/margins": 1.1516902446746826,
1520
+ "rewards/rejected": -2.316815137863159,
1521
+ "step": 1070
1522
+ },
1523
+ {
1524
+ "epoch": 0.57,
1525
+ "learning_rate": 2.365282005108875e-06,
1526
+ "logits/chosen": 0.2598133087158203,
1527
+ "logits/rejected": 0.17415449023246765,
1528
+ "logps/chosen": -1348.472412109375,
1529
+ "logps/rejected": -1934.5325927734375,
1530
+ "loss": 0.4721,
1531
+ "rewards/accuracies": 0.5562499761581421,
1532
+ "rewards/chosen": -1.0647830963134766,
1533
+ "rewards/margins": 0.6361646056175232,
1534
+ "rewards/rejected": -1.7009475231170654,
1535
+ "step": 1080
1536
+ },
1537
+ {
1538
+ "epoch": 0.57,
1539
+ "learning_rate": 2.319684179805491e-06,
1540
+ "logits/chosen": 0.28293731808662415,
1541
+ "logits/rejected": 0.16613037884235382,
1542
+ "logps/chosen": -1299.3868408203125,
1543
+ "logps/rejected": -2169.75830078125,
1544
+ "loss": 0.4726,
1545
+ "rewards/accuracies": 0.53125,
1546
+ "rewards/chosen": -1.0253424644470215,
1547
+ "rewards/margins": 0.8982712626457214,
1548
+ "rewards/rejected": -1.9236137866973877,
1549
+ "step": 1090
1550
+ },
1551
+ {
1552
+ "epoch": 0.58,
1553
+ "learning_rate": 2.2741465785212905e-06,
1554
+ "logits/chosen": 0.3770299553871155,
1555
+ "logits/rejected": 0.3206137418746948,
1556
+ "logps/chosen": -845.8580322265625,
1557
+ "logps/rejected": -1371.0318603515625,
1558
+ "loss": 0.4754,
1559
+ "rewards/accuracies": 0.6000000238418579,
1560
+ "rewards/chosen": -0.573067843914032,
1561
+ "rewards/margins": 0.5587201714515686,
1562
+ "rewards/rejected": -1.1317881345748901,
1563
+ "step": 1100
1564
+ },
1565
+ {
1566
+ "epoch": 0.58,
1567
+ "learning_rate": 2.2286844104451848e-06,
1568
+ "logits/chosen": 0.29950836300849915,
1569
+ "logits/rejected": 0.2572200298309326,
1570
+ "logps/chosen": -1225.456298828125,
1571
+ "logps/rejected": -1701.4114990234375,
1572
+ "loss": 0.4717,
1573
+ "rewards/accuracies": 0.574999988079071,
1574
+ "rewards/chosen": -0.9399654269218445,
1575
+ "rewards/margins": 0.5411953926086426,
1576
+ "rewards/rejected": -1.4811608791351318,
1577
+ "step": 1110
1578
+ },
1579
+ {
1580
+ "epoch": 0.59,
1581
+ "learning_rate": 2.183312859572008e-06,
1582
+ "logits/chosen": 0.2056627720594406,
1583
+ "logits/rejected": 0.13243384659290314,
1584
+ "logps/chosen": -1311.5948486328125,
1585
+ "logps/rejected": -2090.031494140625,
1586
+ "loss": 0.473,
1587
+ "rewards/accuracies": 0.574999988079071,
1588
+ "rewards/chosen": -1.03976309299469,
1589
+ "rewards/margins": 0.8429223895072937,
1590
+ "rewards/rejected": -1.8826854228973389,
1591
+ "step": 1120
1592
+ },
1593
+ {
1594
+ "epoch": 0.59,
1595
+ "learning_rate": 2.1380470796311843e-06,
1596
+ "logits/chosen": 0.26897698640823364,
1597
+ "logits/rejected": 0.19322913885116577,
1598
+ "logps/chosen": -1409.248779296875,
1599
+ "logps/rejected": -1968.114013671875,
1600
+ "loss": 0.4624,
1601
+ "rewards/accuracies": 0.637499988079071,
1602
+ "rewards/chosen": -1.1417442560195923,
1603
+ "rewards/margins": 0.591802716255188,
1604
+ "rewards/rejected": -1.7335469722747803,
1605
+ "step": 1130
1606
+ },
1607
+ {
1608
+ "epoch": 0.6,
1609
+ "learning_rate": 2.092902189025507e-06,
1610
+ "logits/chosen": 0.298466295003891,
1611
+ "logits/rejected": 0.1567627638578415,
1612
+ "logps/chosen": -1206.5018310546875,
1613
+ "logps/rejected": -2206.86767578125,
1614
+ "loss": 0.4604,
1615
+ "rewards/accuracies": 0.59375,
1616
+ "rewards/chosen": -0.9475823640823364,
1617
+ "rewards/margins": 1.0231791734695435,
1618
+ "rewards/rejected": -1.9707612991333008,
1619
+ "step": 1140
1620
+ },
1621
+ {
1622
+ "epoch": 0.6,
1623
+ "learning_rate": 2.0478932657817105e-06,
1624
+ "logits/chosen": 0.31211769580841064,
1625
+ "logits/rejected": 0.1320025771856308,
1626
+ "logps/chosen": -1475.707275390625,
1627
+ "logps/rejected": -2485.997802734375,
1628
+ "loss": 0.4686,
1629
+ "rewards/accuracies": 0.581250011920929,
1630
+ "rewards/chosen": -1.1953158378601074,
1631
+ "rewards/margins": 1.0596123933792114,
1632
+ "rewards/rejected": -2.2549283504486084,
1633
+ "step": 1150
1634
+ },
1635
+ {
1636
+ "epoch": 0.61,
1637
+ "learning_rate": 2.0030353425145376e-06,
1638
+ "logits/chosen": 0.29154402017593384,
1639
+ "logits/rejected": 0.20484980940818787,
1640
+ "logps/chosen": -1307.7490234375,
1641
+ "logps/rejected": -1891.5804443359375,
1642
+ "loss": 0.475,
1643
+ "rewards/accuracies": 0.6000000238418579,
1644
+ "rewards/chosen": -1.068193793296814,
1645
+ "rewards/margins": 0.5871996879577637,
1646
+ "rewards/rejected": -1.6553936004638672,
1647
+ "step": 1160
1648
+ },
1649
+ {
1650
+ "epoch": 0.61,
1651
+ "learning_rate": 1.958343401405964e-06,
1652
+ "logits/chosen": 0.2675972282886505,
1653
+ "logits/rejected": 0.20726804435253143,
1654
+ "logps/chosen": -1136.7181396484375,
1655
+ "logps/rejected": -1507.15234375,
1656
+ "loss": 0.4705,
1657
+ "rewards/accuracies": 0.53125,
1658
+ "rewards/chosen": -0.8827505111694336,
1659
+ "rewards/margins": 0.3945409953594208,
1660
+ "rewards/rejected": -1.2772915363311768,
1661
+ "step": 1170
1662
+ },
1663
+ {
1664
+ "epoch": 0.62,
1665
+ "learning_rate": 1.9138323692012734e-06,
1666
+ "logits/chosen": 0.273415207862854,
1667
+ "logits/rejected": 0.16786028444766998,
1668
+ "logps/chosen": -1736.076416015625,
1669
+ "logps/rejected": -2560.149169921875,
1670
+ "loss": 0.4705,
1671
+ "rewards/accuracies": 0.5687500238418579,
1672
+ "rewards/chosen": -1.4262146949768066,
1673
+ "rewards/margins": 0.8943548202514648,
1674
+ "rewards/rejected": -2.3205695152282715,
1675
+ "step": 1180
1676
+ },
1677
+ {
1678
+ "epoch": 0.62,
1679
+ "learning_rate": 1.8695171122236443e-06,
1680
+ "logits/chosen": 0.20894399285316467,
1681
+ "logits/rejected": 0.10228965431451797,
1682
+ "logps/chosen": -1324.954345703125,
1683
+ "logps/rejected": -2638.982666015625,
1684
+ "loss": 0.4668,
1685
+ "rewards/accuracies": 0.606249988079071,
1686
+ "rewards/chosen": -1.0546363592147827,
1687
+ "rewards/margins": 1.3503773212432861,
1688
+ "rewards/rejected": -2.4050137996673584,
1689
+ "step": 1190
1690
+ },
1691
+ {
1692
+ "epoch": 0.63,
1693
+ "learning_rate": 1.8254124314089225e-06,
1694
+ "logits/chosen": 0.3430663049221039,
1695
+ "logits/rejected": 0.2673262655735016,
1696
+ "logps/chosen": -861.5838623046875,
1697
+ "logps/rejected": -1974.1458740234375,
1698
+ "loss": 0.4518,
1699
+ "rewards/accuracies": 0.625,
1700
+ "rewards/chosen": -0.5944491624832153,
1701
+ "rewards/margins": 1.1017727851867676,
1702
+ "rewards/rejected": -1.696221947669983,
1703
+ "step": 1200
1704
+ },
1705
+ {
1706
+ "epoch": 0.63,
1707
+ "learning_rate": 1.781533057362221e-06,
1708
+ "logits/chosen": 0.3156498670578003,
1709
+ "logits/rejected": 0.185347780585289,
1710
+ "logps/chosen": -1168.6217041015625,
1711
+ "logps/rejected": -1924.7115478515625,
1712
+ "loss": 0.4583,
1713
+ "rewards/accuracies": 0.59375,
1714
+ "rewards/chosen": -0.8821272850036621,
1715
+ "rewards/margins": 0.8161047101020813,
1716
+ "rewards/rejected": -1.6982319355010986,
1717
+ "step": 1210
1718
+ },
1719
+ {
1720
+ "epoch": 0.64,
1721
+ "learning_rate": 1.7378936454380277e-06,
1722
+ "logits/chosen": 0.36333876848220825,
1723
+ "logits/rejected": 0.26434630155563354,
1724
+ "logps/chosen": -1027.678466796875,
1725
+ "logps/rejected": -1634.684814453125,
1726
+ "loss": 0.4654,
1727
+ "rewards/accuracies": 0.581250011920929,
1728
+ "rewards/chosen": -0.7384849786758423,
1729
+ "rewards/margins": 0.6454702615737915,
1730
+ "rewards/rejected": -1.383955478668213,
1731
+ "step": 1220
1732
+ },
1733
+ {
1734
+ "epoch": 0.64,
1735
+ "learning_rate": 1.6945087708454273e-06,
1736
+ "logits/chosen": 0.27189189195632935,
1737
+ "logits/rejected": 0.18399885296821594,
1738
+ "logps/chosen": -1334.14990234375,
1739
+ "logps/rejected": -1880.106201171875,
1740
+ "loss": 0.4767,
1741
+ "rewards/accuracies": 0.5062500238418579,
1742
+ "rewards/chosen": -1.090384840965271,
1743
+ "rewards/margins": 0.5727940797805786,
1744
+ "rewards/rejected": -1.66317880153656,
1745
+ "step": 1230
1746
+ },
1747
+ {
1748
+ "epoch": 0.65,
1749
+ "learning_rate": 1.651392923780105e-06,
1750
+ "logits/chosen": 0.4100673794746399,
1751
+ "logits/rejected": 0.2657643258571625,
1752
+ "logps/chosen": -1111.376220703125,
1753
+ "logps/rejected": -1941.037353515625,
1754
+ "loss": 0.46,
1755
+ "rewards/accuracies": 0.574999988079071,
1756
+ "rewards/chosen": -0.8117152452468872,
1757
+ "rewards/margins": 0.8833802938461304,
1758
+ "rewards/rejected": -1.695095419883728,
1759
+ "step": 1240
1760
+ },
1761
+ {
1762
+ "epoch": 0.65,
1763
+ "learning_rate": 1.608560504584737e-06,
1764
+ "logits/chosen": 0.301455020904541,
1765
+ "logits/rejected": 0.22863301634788513,
1766
+ "logps/chosen": -1159.3509521484375,
1767
+ "logps/rejected": -2089.1953125,
1768
+ "loss": 0.4631,
1769
+ "rewards/accuracies": 0.5562499761581421,
1770
+ "rewards/chosen": -0.8735660314559937,
1771
+ "rewards/margins": 0.9659594297409058,
1772
+ "rewards/rejected": -1.839525580406189,
1773
+ "step": 1250
1774
+ },
1775
+ {
1776
+ "epoch": 0.66,
1777
+ "learning_rate": 1.5660258189393945e-06,
1778
+ "logits/chosen": 0.19146260619163513,
1779
+ "logits/rejected": 0.14353962242603302,
1780
+ "logps/chosen": -1484.316650390625,
1781
+ "logps/rejected": -2343.659423828125,
1782
+ "loss": 0.4687,
1783
+ "rewards/accuracies": 0.5625,
1784
+ "rewards/chosen": -1.2383463382720947,
1785
+ "rewards/margins": 0.8734383583068848,
1786
+ "rewards/rejected": -2.1117844581604004,
1787
+ "step": 1260
1788
+ },
1789
+ {
1790
+ "epoch": 0.66,
1791
+ "learning_rate": 1.5238030730835578e-06,
1792
+ "logits/chosen": 0.31026071310043335,
1793
+ "logits/rejected": 0.19475135207176208,
1794
+ "logps/chosen": -1738.6246337890625,
1795
+ "logps/rejected": -2328.933349609375,
1796
+ "loss": 0.4693,
1797
+ "rewards/accuracies": 0.5375000238418579,
1798
+ "rewards/chosen": -1.462491750717163,
1799
+ "rewards/margins": 0.6261566281318665,
1800
+ "rewards/rejected": -2.0886483192443848,
1801
+ "step": 1270
1802
+ },
1803
+ {
1804
+ "epoch": 0.67,
1805
+ "learning_rate": 1.4819063690713565e-06,
1806
+ "logits/chosen": 0.26937440037727356,
1807
+ "logits/rejected": 0.15669001638889313,
1808
+ "logps/chosen": -1396.046630859375,
1809
+ "logps/rejected": -2102.262939453125,
1810
+ "loss": 0.4598,
1811
+ "rewards/accuracies": 0.5625,
1812
+ "rewards/chosen": -1.1547179222106934,
1813
+ "rewards/margins": 0.7373046278953552,
1814
+ "rewards/rejected": -1.892022728919983,
1815
+ "step": 1280
1816
+ },
1817
+ {
1818
+ "epoch": 0.68,
1819
+ "learning_rate": 1.4403497000615885e-06,
1820
+ "logits/chosen": 0.3091123700141907,
1821
+ "logits/rejected": 0.204463392496109,
1822
+ "logps/chosen": -1624.702392578125,
1823
+ "logps/rejected": -2571.47412109375,
1824
+ "loss": 0.4654,
1825
+ "rewards/accuracies": 0.550000011920929,
1826
+ "rewards/chosen": -1.3447537422180176,
1827
+ "rewards/margins": 0.9728642702102661,
1828
+ "rewards/rejected": -2.3176181316375732,
1829
+ "step": 1290
1830
+ },
1831
+ {
1832
+ "epoch": 0.68,
1833
+ "learning_rate": 1.3991469456441273e-06,
1834
+ "logits/chosen": 0.31638103723526,
1835
+ "logits/rejected": 0.23879094421863556,
1836
+ "logps/chosen": -1413.901123046875,
1837
+ "logps/rejected": -2330.88330078125,
1838
+ "loss": 0.4546,
1839
+ "rewards/accuracies": 0.637499988079071,
1840
+ "rewards/chosen": -1.1106030941009521,
1841
+ "rewards/margins": 0.9651702642440796,
1842
+ "rewards/rejected": -2.075773239135742,
1843
+ "step": 1300
1844
+ },
1845
+ {
1846
+ "epoch": 0.69,
1847
+ "learning_rate": 1.3583118672042441e-06,
1848
+ "logits/chosen": 0.274738609790802,
1849
+ "logits/rejected": 0.18945345282554626,
1850
+ "logps/chosen": -1652.8541259765625,
1851
+ "logps/rejected": -2093.89990234375,
1852
+ "loss": 0.4704,
1853
+ "rewards/accuracies": 0.512499988079071,
1854
+ "rewards/chosen": -1.371697187423706,
1855
+ "rewards/margins": 0.5045391917228699,
1856
+ "rewards/rejected": -1.8762363195419312,
1857
+ "step": 1310
1858
+ },
1859
+ {
1860
+ "epoch": 0.69,
1861
+ "learning_rate": 1.3178581033264218e-06,
1862
+ "logits/chosen": 0.27669447660446167,
1863
+ "logits/rejected": 0.16615034639835358,
1864
+ "logps/chosen": -1164.213134765625,
1865
+ "logps/rejected": -2034.477783203125,
1866
+ "loss": 0.4566,
1867
+ "rewards/accuracies": 0.5625,
1868
+ "rewards/chosen": -0.9321501851081848,
1869
+ "rewards/margins": 0.888770580291748,
1870
+ "rewards/rejected": -1.820920705795288,
1871
+ "step": 1320
1872
+ },
1873
+ {
1874
+ "epoch": 0.7,
1875
+ "learning_rate": 1.2777991652391757e-06,
1876
+ "logits/chosen": 0.31228479743003845,
1877
+ "logits/rejected": 0.21845977008342743,
1878
+ "logps/chosen": -1202.439697265625,
1879
+ "logps/rejected": -1930.3785400390625,
1880
+ "loss": 0.4661,
1881
+ "rewards/accuracies": 0.550000011920929,
1882
+ "rewards/chosen": -0.9665547609329224,
1883
+ "rewards/margins": 0.7526635527610779,
1884
+ "rewards/rejected": -1.7192184925079346,
1885
+ "step": 1330
1886
+ },
1887
+ {
1888
+ "epoch": 0.7,
1889
+ "learning_rate": 1.2381484323024178e-06,
1890
+ "logits/chosen": 0.35927221179008484,
1891
+ "logits/rejected": 0.2287793606519699,
1892
+ "logps/chosen": -1124.3046875,
1893
+ "logps/rejected": -2056.501708984375,
1894
+ "loss": 0.4623,
1895
+ "rewards/accuracies": 0.6312500238418579,
1896
+ "rewards/chosen": -0.8218294382095337,
1897
+ "rewards/margins": 0.98065185546875,
1898
+ "rewards/rejected": -1.8024810552597046,
1899
+ "step": 1340
1900
+ },
1901
+ {
1902
+ "epoch": 0.71,
1903
+ "learning_rate": 1.1989191475388518e-06,
1904
+ "logits/chosen": 0.3698425889015198,
1905
+ "logits/rejected": 0.2954414486885071,
1906
+ "logps/chosen": -1166.0611572265625,
1907
+ "logps/rejected": -1549.630126953125,
1908
+ "loss": 0.4695,
1909
+ "rewards/accuracies": 0.5625,
1910
+ "rewards/chosen": -0.8860333561897278,
1911
+ "rewards/margins": 0.42343559861183167,
1912
+ "rewards/rejected": -1.3094689846038818,
1913
+ "step": 1350
1914
+ },
1915
+ {
1916
+ "epoch": 0.71,
1917
+ "learning_rate": 1.160124413210918e-06,
1918
+ "logits/chosen": 0.35506299138069153,
1919
+ "logits/rejected": 0.2409767210483551,
1920
+ "logps/chosen": -1092.040283203125,
1921
+ "logps/rejected": -1912.9970703125,
1922
+ "loss": 0.4582,
1923
+ "rewards/accuracies": 0.550000011920929,
1924
+ "rewards/chosen": -0.802148163318634,
1925
+ "rewards/margins": 0.8678997755050659,
1926
+ "rewards/rejected": -1.6700481176376343,
1927
+ "step": 1360
1928
+ },
1929
+ {
1930
+ "epoch": 0.72,
1931
+ "learning_rate": 1.1217771864447396e-06,
1932
+ "logits/chosen": 0.3243677616119385,
1933
+ "logits/rejected": 0.17696735262870789,
1934
+ "logps/chosen": -991.9781494140625,
1935
+ "logps/rejected": -2300.01806640625,
1936
+ "loss": 0.4563,
1937
+ "rewards/accuracies": 0.637499988079071,
1938
+ "rewards/chosen": -0.6990408897399902,
1939
+ "rewards/margins": 1.3517526388168335,
1940
+ "rewards/rejected": -2.050793409347534,
1941
+ "step": 1370
1942
+ },
1943
+ {
1944
+ "epoch": 0.72,
1945
+ "learning_rate": 1.08389027490255e-06,
1946
+ "logits/chosen": 0.27917546033859253,
1947
+ "logits/rejected": 0.13479743897914886,
1948
+ "logps/chosen": -1405.369384765625,
1949
+ "logps/rejected": -2042.785888671875,
1950
+ "loss": 0.4724,
1951
+ "rewards/accuracies": 0.643750011920929,
1952
+ "rewards/chosen": -1.150667428970337,
1953
+ "rewards/margins": 0.6895908713340759,
1954
+ "rewards/rejected": -1.8402583599090576,
1955
+ "step": 1380
1956
+ },
1957
+ {
1958
+ "epoch": 0.73,
1959
+ "learning_rate": 1.046476332505036e-06,
1960
+ "logits/chosen": 0.3343364894390106,
1961
+ "logits/rejected": 0.23037847876548767,
1962
+ "logps/chosen": -1098.7138671875,
1963
+ "logps/rejected": -2268.34130859375,
1964
+ "loss": 0.463,
1965
+ "rewards/accuracies": 0.5874999761581421,
1966
+ "rewards/chosen": -0.8331190347671509,
1967
+ "rewards/margins": 1.2149721384048462,
1968
+ "rewards/rejected": -2.048090934753418,
1969
+ "step": 1390
1970
+ },
1971
+ {
1972
+ "epoch": 0.73,
1973
+ "learning_rate": 1.0095478552050348e-06,
1974
+ "logits/chosen": 0.26566246151924133,
1975
+ "logits/rejected": 0.2032911777496338,
1976
+ "logps/chosen": -956.0791015625,
1977
+ "logps/rejected": -1707.116943359375,
1978
+ "loss": 0.4575,
1979
+ "rewards/accuracies": 0.59375,
1980
+ "rewards/chosen": -0.7228320837020874,
1981
+ "rewards/margins": 0.797810435295105,
1982
+ "rewards/rejected": -1.5206425189971924,
1983
+ "step": 1400
1984
+ },
1985
+ {
1986
+ "epoch": 0.74,
1987
+ "learning_rate": 9.731171768139808e-07,
1988
+ "logits/chosen": 0.3556608557701111,
1989
+ "logits/rejected": 0.28849393129348755,
1990
+ "logps/chosen": -1063.8748779296875,
1991
+ "logps/rejected": -1503.830322265625,
1992
+ "loss": 0.4712,
1993
+ "rewards/accuracies": 0.59375,
1994
+ "rewards/chosen": -0.8245790600776672,
1995
+ "rewards/margins": 0.47296270728111267,
1996
+ "rewards/rejected": -1.2975417375564575,
1997
+ "step": 1410
1998
+ },
1999
+ {
2000
+ "epoch": 0.74,
2001
+ "learning_rate": 9.371964648825221e-07,
2002
+ "logits/chosen": 0.3505791425704956,
2003
+ "logits/rejected": 0.22841492295265198,
2004
+ "logps/chosen": -1045.915283203125,
2005
+ "logps/rejected": -2175.98876953125,
2006
+ "loss": 0.4619,
2007
+ "rewards/accuracies": 0.637499988079071,
2008
+ "rewards/chosen": -0.7385074496269226,
2009
+ "rewards/margins": 1.1998847723007202,
2010
+ "rewards/rejected": -1.9383922815322876,
2011
+ "step": 1420
2012
+ },
2013
+ {
2014
+ "epoch": 0.75,
2015
+ "learning_rate": 9.017977166366445e-07,
2016
+ "logits/chosen": 0.2420744001865387,
2017
+ "logits/rejected": 0.17516903579235077,
2018
+ "logps/chosen": -1474.8148193359375,
2019
+ "logps/rejected": -1958.3011474609375,
2020
+ "loss": 0.4696,
2021
+ "rewards/accuracies": 0.581250011920929,
2022
+ "rewards/chosen": -1.2239506244659424,
2023
+ "rewards/margins": 0.5244899988174438,
2024
+ "rewards/rejected": -1.7484405040740967,
2025
+ "step": 1430
2026
+ },
2027
+ {
2028
+ "epoch": 0.75,
2029
+ "learning_rate": 8.669327549707096e-07,
2030
+ "logits/chosen": 0.2893267571926117,
2031
+ "logits/rejected": 0.18889647722244263,
2032
+ "logps/chosen": -1455.797607421875,
2033
+ "logps/rejected": -2097.127685546875,
2034
+ "loss": 0.467,
2035
+ "rewards/accuracies": 0.6000000238418579,
2036
+ "rewards/chosen": -1.1924570798873901,
2037
+ "rewards/margins": 0.6752533912658691,
2038
+ "rewards/rejected": -1.8677103519439697,
2039
+ "step": 1440
2040
+ },
2041
+ {
2042
+ "epoch": 0.76,
2043
+ "learning_rate": 8.326132244986932e-07,
2044
+ "logits/chosen": 0.24378347396850586,
2045
+ "logits/rejected": 0.050407588481903076,
2046
+ "logps/chosen": -1516.9559326171875,
2047
+ "logps/rejected": -2759.12353515625,
2048
+ "loss": 0.4595,
2049
+ "rewards/accuracies": 0.581250011920929,
2050
+ "rewards/chosen": -1.2377126216888428,
2051
+ "rewards/margins": 1.2976287603378296,
2052
+ "rewards/rejected": -2.535341262817383,
2053
+ "step": 1450
2054
+ },
2055
+ {
2056
+ "epoch": 0.76,
2057
+ "learning_rate": 7.988505876649863e-07,
2058
+ "logits/chosen": 0.2632651925086975,
2059
+ "logits/rejected": 0.18519091606140137,
2060
+ "logps/chosen": -1460.3145751953125,
2061
+ "logps/rejected": -1786.594482421875,
2062
+ "loss": 0.4719,
2063
+ "rewards/accuracies": 0.53125,
2064
+ "rewards/chosen": -1.1815235614776611,
2065
+ "rewards/margins": 0.385366290807724,
2066
+ "rewards/rejected": -1.566890001296997,
2067
+ "step": 1460
2068
+ },
2069
+ {
2070
+ "epoch": 0.77,
2071
+ "learning_rate": 7.656561209160248e-07,
2072
+ "logits/chosen": 0.2761257290840149,
2073
+ "logits/rejected": 0.16277745366096497,
2074
+ "logps/chosen": -1284.623291015625,
2075
+ "logps/rejected": -2513.677734375,
2076
+ "loss": 0.4624,
2077
+ "rewards/accuracies": 0.612500011920929,
2078
+ "rewards/chosen": -0.9854960441589355,
2079
+ "rewards/margins": 1.282775640487671,
2080
+ "rewards/rejected": -2.2682716846466064,
2081
+ "step": 1470
2082
+ },
2083
+ {
2084
+ "epoch": 0.77,
2085
+ "learning_rate": 7.330409109340563e-07,
2086
+ "logits/chosen": 0.2461864948272705,
2087
+ "logits/rejected": 0.16639626026153564,
2088
+ "logps/chosen": -1436.0191650390625,
2089
+ "logps/rejected": -2242.687744140625,
2090
+ "loss": 0.4672,
2091
+ "rewards/accuracies": 0.5687500238418579,
2092
+ "rewards/chosen": -1.1464489698410034,
2093
+ "rewards/margins": 0.8480439186096191,
2094
+ "rewards/rejected": -1.9944928884506226,
2095
+ "step": 1480
2096
+ },
2097
+ {
2098
+ "epoch": 0.78,
2099
+ "learning_rate": 7.010158509342682e-07,
2100
+ "logits/chosen": 0.2478228360414505,
2101
+ "logits/rejected": 0.1436949521303177,
2102
+ "logps/chosen": -1367.794921875,
2103
+ "logps/rejected": -2203.85107421875,
2104
+ "loss": 0.4721,
2105
+ "rewards/accuracies": 0.5249999761581421,
2106
+ "rewards/chosen": -1.1351174116134644,
2107
+ "rewards/margins": 0.8538748621940613,
2108
+ "rewards/rejected": -1.9889923334121704,
2109
+ "step": 1490
2110
+ },
2111
+ {
2112
+ "epoch": 0.79,
2113
+ "learning_rate": 6.695916370265529e-07,
2114
+ "logits/chosen": 0.27428361773490906,
2115
+ "logits/rejected": 0.18057170510292053,
2116
+ "logps/chosen": -1440.5146484375,
2117
+ "logps/rejected": -2161.803466796875,
2118
+ "loss": 0.4628,
2119
+ "rewards/accuracies": 0.5874999761581421,
2120
+ "rewards/chosen": -1.1880239248275757,
2121
+ "rewards/margins": 0.7238161563873291,
2122
+ "rewards/rejected": -1.9118402004241943,
2123
+ "step": 1500
2124
+ },
2125
+ {
2126
+ "epoch": 0.79,
2127
+ "learning_rate": 6.387787646430854e-07,
2128
+ "logits/chosen": 0.25450989603996277,
2129
+ "logits/rejected": 0.1020331159234047,
2130
+ "logps/chosen": -1392.365966796875,
2131
+ "logps/rejected": -2656.48974609375,
2132
+ "loss": 0.4575,
2133
+ "rewards/accuracies": 0.637499988079071,
2134
+ "rewards/chosen": -1.118877649307251,
2135
+ "rewards/margins": 1.2837135791778564,
2136
+ "rewards/rejected": -2.4025912284851074,
2137
+ "step": 1510
2138
+ },
2139
+ {
2140
+ "epoch": 0.8,
2141
+ "learning_rate": 6.085875250329401e-07,
2142
+ "logits/chosen": 0.3250389099121094,
2143
+ "logits/rejected": 0.23088189959526062,
2144
+ "logps/chosen": -1277.065673828125,
2145
+ "logps/rejected": -2237.5419921875,
2146
+ "loss": 0.4588,
2147
+ "rewards/accuracies": 0.637499988079071,
2148
+ "rewards/chosen": -0.98078453540802,
2149
+ "rewards/margins": 1.0101826190948486,
2150
+ "rewards/rejected": -1.9909673929214478,
2151
+ "step": 1520
2152
+ },
2153
+ {
2154
+ "epoch": 0.8,
2155
+ "learning_rate": 5.79028001824894e-07,
2156
+ "logits/chosen": 0.34990447759628296,
2157
+ "logits/rejected": 0.1642770618200302,
2158
+ "logps/chosen": -1346.687744140625,
2159
+ "logps/rejected": -3187.396484375,
2160
+ "loss": 0.4642,
2161
+ "rewards/accuracies": 0.7250000238418579,
2162
+ "rewards/chosen": -1.0415836572647095,
2163
+ "rewards/margins": 1.8741792440414429,
2164
+ "rewards/rejected": -2.9157626628875732,
2165
+ "step": 1530
2166
+ },
2167
+ {
2168
+ "epoch": 0.81,
2169
+ "learning_rate": 5.501100676595761e-07,
2170
+ "logits/chosen": 0.2536852955818176,
2171
+ "logits/rejected": 0.1401246041059494,
2172
+ "logps/chosen": -1562.163818359375,
2173
+ "logps/rejected": -2294.75732421875,
2174
+ "loss": 0.4614,
2175
+ "rewards/accuracies": 0.625,
2176
+ "rewards/chosen": -1.2489855289459229,
2177
+ "rewards/margins": 0.7926613092422485,
2178
+ "rewards/rejected": -2.041646957397461,
2179
+ "step": 1540
2180
+ },
2181
+ {
2182
+ "epoch": 0.81,
2183
+ "learning_rate": 5.218433808920884e-07,
2184
+ "logits/chosen": 0.2926151752471924,
2185
+ "logits/rejected": 0.09962544590234756,
2186
+ "logps/chosen": -1433.572509765625,
2187
+ "logps/rejected": -2299.615478515625,
2188
+ "loss": 0.4524,
2189
+ "rewards/accuracies": 0.5687500238418579,
2190
+ "rewards/chosen": -1.1709363460540771,
2191
+ "rewards/margins": 0.9143635630607605,
2192
+ "rewards/rejected": -2.0853002071380615,
2193
+ "step": 1550
2194
+ },
2195
+ {
2196
+ "epoch": 0.82,
2197
+ "learning_rate": 4.942373823661928e-07,
2198
+ "logits/chosen": 0.23216836154460907,
2199
+ "logits/rejected": 0.19754758477210999,
2200
+ "logps/chosen": -1521.939208984375,
2201
+ "logps/rejected": -2178.3291015625,
2202
+ "loss": 0.4698,
2203
+ "rewards/accuracies": 0.5874999761581421,
2204
+ "rewards/chosen": -1.2342188358306885,
2205
+ "rewards/margins": 0.6809908151626587,
2206
+ "rewards/rejected": -1.9152095317840576,
2207
+ "step": 1560
2208
+ },
2209
+ {
2210
+ "epoch": 0.82,
2211
+ "learning_rate": 4.6730129226114363e-07,
2212
+ "logits/chosen": 0.19226306676864624,
2213
+ "logits/rejected": 0.13501006364822388,
2214
+ "logps/chosen": -1532.320068359375,
2215
+ "logps/rejected": -2355.093505859375,
2216
+ "loss": 0.4712,
2217
+ "rewards/accuracies": 0.574999988079071,
2218
+ "rewards/chosen": -1.2820327281951904,
2219
+ "rewards/margins": 0.8527010679244995,
2220
+ "rewards/rejected": -2.1347339153289795,
2221
+ "step": 1570
2222
+ },
2223
+ {
2224
+ "epoch": 0.83,
2225
+ "learning_rate": 4.4104410701222703e-07,
2226
+ "logits/chosen": 0.15366807579994202,
2227
+ "logits/rejected": 0.11835174262523651,
2228
+ "logps/chosen": -1608.6761474609375,
2229
+ "logps/rejected": -2489.91455078125,
2230
+ "loss": 0.469,
2231
+ "rewards/accuracies": 0.5687500238418579,
2232
+ "rewards/chosen": -1.338335394859314,
2233
+ "rewards/margins": 0.8876321911811829,
2234
+ "rewards/rejected": -2.2259676456451416,
2235
+ "step": 1580
2236
+ },
2237
+ {
2238
+ "epoch": 0.83,
2239
+ "learning_rate": 4.154745963060197e-07,
2240
+ "logits/chosen": 0.21381524205207825,
2241
+ "logits/rejected": 0.0645713359117508,
2242
+ "logps/chosen": -1354.0247802734375,
2243
+ "logps/rejected": -2909.98828125,
2244
+ "loss": 0.4559,
2245
+ "rewards/accuracies": 0.637499988079071,
2246
+ "rewards/chosen": -1.0845643281936646,
2247
+ "rewards/margins": 1.571176290512085,
2248
+ "rewards/rejected": -2.655740737915039,
2249
+ "step": 1590
2250
+ },
2251
+ {
2252
+ "epoch": 0.84,
2253
+ "learning_rate": 3.9060130015138863e-07,
2254
+ "logits/chosen": 0.25924235582351685,
2255
+ "logits/rejected": 0.1109732836484909,
2256
+ "logps/chosen": -1437.39501953125,
2257
+ "logps/rejected": -2759.584228515625,
2258
+ "loss": 0.4559,
2259
+ "rewards/accuracies": 0.65625,
2260
+ "rewards/chosen": -1.1647388935089111,
2261
+ "rewards/margins": 1.3532658815383911,
2262
+ "rewards/rejected": -2.5180046558380127,
2263
+ "step": 1600
2264
+ },
2265
+ {
2266
+ "epoch": 0.84,
2267
+ "learning_rate": 3.664325260271953e-07,
2268
+ "logits/chosen": 0.22887060046195984,
2269
+ "logits/rejected": 0.09053263813257217,
2270
+ "logps/chosen": -1473.260009765625,
2271
+ "logps/rejected": -1995.0269775390625,
2272
+ "loss": 0.4712,
2273
+ "rewards/accuracies": 0.48750001192092896,
2274
+ "rewards/chosen": -1.2130258083343506,
2275
+ "rewards/margins": 0.5937215089797974,
2276
+ "rewards/rejected": -1.8067471981048584,
2277
+ "step": 1610
2278
+ },
2279
+ {
2280
+ "epoch": 0.85,
2281
+ "learning_rate": 3.429763461076677e-07,
2282
+ "logits/chosen": 0.1899276226758957,
2283
+ "logits/rejected": 0.12356813251972198,
2284
+ "logps/chosen": -1743.064453125,
2285
+ "logps/rejected": -2304.783203125,
2286
+ "loss": 0.4677,
2287
+ "rewards/accuracies": 0.574999988079071,
2288
+ "rewards/chosen": -1.4444160461425781,
2289
+ "rewards/margins": 0.6152055859565735,
2290
+ "rewards/rejected": -2.059621572494507,
2291
+ "step": 1620
2292
+ },
2293
+ {
2294
+ "epoch": 0.85,
2295
+ "learning_rate": 3.202405945663556e-07,
2296
+ "logits/chosen": 0.22914421558380127,
2297
+ "logits/rejected": 0.09422020614147186,
2298
+ "logps/chosen": -1509.6998291015625,
2299
+ "logps/rejected": -2195.837646484375,
2300
+ "loss": 0.4638,
2301
+ "rewards/accuracies": 0.4937500059604645,
2302
+ "rewards/chosen": -1.2225978374481201,
2303
+ "rewards/margins": 0.7319514155387878,
2304
+ "rewards/rejected": -1.9545494318008423,
2305
+ "step": 1630
2306
+ },
2307
+ {
2308
+ "epoch": 0.86,
2309
+ "learning_rate": 2.982328649595856e-07,
2310
+ "logits/chosen": 0.24722608923912048,
2311
+ "logits/rejected": 0.10591373592615128,
2312
+ "logps/chosen": -1233.9052734375,
2313
+ "logps/rejected": -2268.322509765625,
2314
+ "loss": 0.4653,
2315
+ "rewards/accuracies": 0.59375,
2316
+ "rewards/chosen": -0.9642356634140015,
2317
+ "rewards/margins": 1.0845736265182495,
2318
+ "rewards/rejected": -2.04880952835083,
2319
+ "step": 1640
2320
+ },
2321
+ {
2322
+ "epoch": 0.86,
2323
+ "learning_rate": 2.7696050769026954e-07,
2324
+ "logits/chosen": 0.21008674800395966,
2325
+ "logits/rejected": 0.05934596806764603,
2326
+ "logps/chosen": -1442.0106201171875,
2327
+ "logps/rejected": -2874.48388671875,
2328
+ "loss": 0.4615,
2329
+ "rewards/accuracies": 0.6187499761581421,
2330
+ "rewards/chosen": -1.2054154872894287,
2331
+ "rewards/margins": 1.4088830947875977,
2332
+ "rewards/rejected": -2.6142985820770264,
2333
+ "step": 1650
2334
+ },
2335
+ {
2336
+ "epoch": 0.87,
2337
+ "learning_rate": 2.564306275529341e-07,
2338
+ "logits/chosen": 0.18529877066612244,
2339
+ "logits/rejected": 0.12559422850608826,
2340
+ "logps/chosen": -1704.0299072265625,
2341
+ "logps/rejected": -2808.08349609375,
2342
+ "loss": 0.4591,
2343
+ "rewards/accuracies": 0.5874999761581421,
2344
+ "rewards/chosen": -1.4349021911621094,
2345
+ "rewards/margins": 1.1065049171447754,
2346
+ "rewards/rejected": -2.5414071083068848,
2347
+ "step": 1660
2348
+ },
2349
+ {
2350
+ "epoch": 0.87,
2351
+ "learning_rate": 2.3665008136077332e-07,
2352
+ "logits/chosen": 0.19881121814250946,
2353
+ "logits/rejected": 0.17202343046665192,
2354
+ "logps/chosen": -1710.2633056640625,
2355
+ "logps/rejected": -1984.5556640625,
2356
+ "loss": 0.473,
2357
+ "rewards/accuracies": 0.574999988079071,
2358
+ "rewards/chosen": -1.3971118927001953,
2359
+ "rewards/margins": 0.33112001419067383,
2360
+ "rewards/rejected": -1.7282320261001587,
2361
+ "step": 1670
2362
+ },
2363
+ {
2364
+ "epoch": 0.88,
2365
+ "learning_rate": 2.1762547565553293e-07,
2366
+ "logits/chosen": 0.17657816410064697,
2367
+ "logits/rejected": 0.11265295743942261,
2368
+ "logps/chosen": -1725.0482177734375,
2369
+ "logps/rejected": -1982.76953125,
2370
+ "loss": 0.466,
2371
+ "rewards/accuracies": 0.4937500059604645,
2372
+ "rewards/chosen": -1.5021684169769287,
2373
+ "rewards/margins": 0.2558698058128357,
2374
+ "rewards/rejected": -1.7580381631851196,
2375
+ "step": 1680
2376
+ },
2377
+ {
2378
+ "epoch": 0.88,
2379
+ "learning_rate": 1.993631645009747e-07,
2380
+ "logits/chosen": 0.19522444903850555,
2381
+ "logits/rejected": 0.058800529688596725,
2382
+ "logps/chosen": -1578.4208984375,
2383
+ "logps/rejected": -2554.65185546875,
2384
+ "loss": 0.4675,
2385
+ "rewards/accuracies": 0.606249988079071,
2386
+ "rewards/chosen": -1.2845125198364258,
2387
+ "rewards/margins": 1.006216049194336,
2388
+ "rewards/rejected": -2.290728807449341,
2389
+ "step": 1690
2390
+ },
2391
+ {
2392
+ "epoch": 0.89,
2393
+ "learning_rate": 1.818692473606748e-07,
2394
+ "logits/chosen": 0.2271948605775833,
2395
+ "logits/rejected": 0.18108686804771423,
2396
+ "logps/chosen": -1478.1927490234375,
2397
+ "logps/rejected": -2156.734375,
2398
+ "loss": 0.4747,
2399
+ "rewards/accuracies": 0.53125,
2400
+ "rewards/chosen": -1.2414519786834717,
2401
+ "rewards/margins": 0.7026728391647339,
2402
+ "rewards/rejected": -1.9441248178482056,
2403
+ "step": 1700
2404
+ },
2405
+ {
2406
+ "epoch": 0.9,
2407
+ "learning_rate": 1.6514956706084885e-07,
2408
+ "logits/chosen": 0.23735575377941132,
2409
+ "logits/rejected": 0.11482490599155426,
2410
+ "logps/chosen": -1801.324462890625,
2411
+ "logps/rejected": -2704.887939453125,
2412
+ "loss": 0.4736,
2413
+ "rewards/accuracies": 0.59375,
2414
+ "rewards/chosen": -1.552073359489441,
2415
+ "rewards/margins": 0.8944045901298523,
2416
+ "rewards/rejected": -2.4464781284332275,
2417
+ "step": 1710
2418
+ },
2419
+ {
2420
+ "epoch": 0.9,
2421
+ "learning_rate": 1.4920970783889737e-07,
2422
+ "logits/chosen": 0.22280173003673553,
2423
+ "logits/rejected": 0.11919368803501129,
2424
+ "logps/chosen": -1566.2589111328125,
2425
+ "logps/rejected": -2471.8125,
2426
+ "loss": 0.4684,
2427
+ "rewards/accuracies": 0.5625,
2428
+ "rewards/chosen": -1.2630040645599365,
2429
+ "rewards/margins": 0.966931164264679,
2430
+ "rewards/rejected": -2.229935646057129,
2431
+ "step": 1720
2432
+ },
2433
+ {
2434
+ "epoch": 0.91,
2435
+ "learning_rate": 1.340549934783164e-07,
2436
+ "logits/chosen": 0.2689998745918274,
2437
+ "logits/rejected": 0.12245997041463852,
2438
+ "logps/chosen": -1098.0716552734375,
2439
+ "logps/rejected": -2332.2578125,
2440
+ "loss": 0.4659,
2441
+ "rewards/accuracies": 0.6187499761581421,
2442
+ "rewards/chosen": -0.8609301447868347,
2443
+ "rewards/margins": 1.2441167831420898,
2444
+ "rewards/rejected": -2.1050469875335693,
2445
+ "step": 1730
2446
+ },
2447
+ {
2448
+ "epoch": 0.91,
2449
+ "learning_rate": 1.196904855305961e-07,
2450
+ "logits/chosen": 0.2383730709552765,
2451
+ "logits/rejected": 0.15037932991981506,
2452
+ "logps/chosen": -1544.904052734375,
2453
+ "logps/rejected": -2499.219482421875,
2454
+ "loss": 0.4561,
2455
+ "rewards/accuracies": 0.5874999761581421,
2456
+ "rewards/chosen": -1.2684749364852905,
2457
+ "rewards/margins": 0.9886786341667175,
2458
+ "rewards/rejected": -2.2571537494659424,
2459
+ "step": 1740
2460
+ },
2461
+ {
2462
+ "epoch": 0.92,
2463
+ "learning_rate": 1.0612098162470302e-07,
2464
+ "logits/chosen": 0.20837631821632385,
2465
+ "logits/rejected": 0.1260487288236618,
2466
+ "logps/chosen": -1376.4371337890625,
2467
+ "logps/rejected": -2311.586669921875,
2468
+ "loss": 0.4467,
2469
+ "rewards/accuracies": 0.612500011920929,
2470
+ "rewards/chosen": -1.126542329788208,
2471
+ "rewards/margins": 0.9472710490226746,
2472
+ "rewards/rejected": -2.0738134384155273,
2473
+ "step": 1750
2474
+ },
2475
+ {
2476
+ "epoch": 0.92,
2477
+ "learning_rate": 9.335101386471285e-08,
2478
+ "logits/chosen": 0.2322504222393036,
2479
+ "logits/rejected": 0.06627029925584793,
2480
+ "logps/chosen": -1435.283447265625,
2481
+ "logps/rejected": -2674.15576171875,
2482
+ "loss": 0.4715,
2483
+ "rewards/accuracies": 0.581250011920929,
2484
+ "rewards/chosen": -1.151064157485962,
2485
+ "rewards/margins": 1.2704349756240845,
2486
+ "rewards/rejected": -2.421499252319336,
2487
+ "step": 1760
2488
+ },
2489
+ {
2490
+ "epoch": 0.93,
2491
+ "learning_rate": 8.138484731612273e-08,
2492
+ "logits/chosen": 0.2155352383852005,
2493
+ "logits/rejected": 0.12622274458408356,
2494
+ "logps/chosen": -1182.394287109375,
2495
+ "logps/rejected": -2245.24462890625,
2496
+ "loss": 0.4629,
2497
+ "rewards/accuracies": 0.550000011920929,
2498
+ "rewards/chosen": -0.9620729684829712,
2499
+ "rewards/margins": 1.0421679019927979,
2500
+ "rewards/rejected": -2.0042405128479004,
2501
+ "step": 1770
2502
+ },
2503
+ {
2504
+ "epoch": 0.93,
2505
+ "learning_rate": 7.022647858135501e-08,
2506
+ "logits/chosen": 0.30309510231018066,
2507
+ "logits/rejected": 0.18017789721488953,
2508
+ "logps/chosen": -1599.3291015625,
2509
+ "logps/rejected": -2475.38720703125,
2510
+ "loss": 0.465,
2511
+ "rewards/accuracies": 0.5687500238418579,
2512
+ "rewards/chosen": -1.3132779598236084,
2513
+ "rewards/margins": 0.8995476961135864,
2514
+ "rewards/rejected": -2.2128255367279053,
2515
+ "step": 1780
2516
+ },
2517
+ {
2518
+ "epoch": 0.94,
2519
+ "learning_rate": 5.987963446492384e-08,
2520
+ "logits/chosen": 0.23334476351737976,
2521
+ "logits/rejected": 0.17126549780368805,
2522
+ "logps/chosen": -1491.8856201171875,
2523
+ "logps/rejected": -2064.55078125,
2524
+ "loss": 0.4679,
2525
+ "rewards/accuracies": 0.574999988079071,
2526
+ "rewards/chosen": -1.2208709716796875,
2527
+ "rewards/margins": 0.6093058586120605,
2528
+ "rewards/rejected": -1.8301767110824585,
2529
+ "step": 1790
2530
+ },
2531
+ {
2532
+ "epoch": 0.94,
2533
+ "learning_rate": 5.034777072871394e-08,
2534
+ "logits/chosen": 0.23894283175468445,
2535
+ "logits/rejected": 0.16225464642047882,
2536
+ "logps/chosen": -1209.31494140625,
2537
+ "logps/rejected": -1923.9974365234375,
2538
+ "loss": 0.4748,
2539
+ "rewards/accuracies": 0.5874999761581421,
2540
+ "rewards/chosen": -0.9387443661689758,
2541
+ "rewards/margins": 0.7473281621932983,
2542
+ "rewards/rejected": -1.6860727071762085,
2543
+ "step": 1800
2544
+ },
2545
+ {
2546
+ "epoch": 0.95,
2547
+ "learning_rate": 4.163407093778243e-08,
2548
+ "logits/chosen": 0.30054157972335815,
2549
+ "logits/rejected": 0.17386284470558167,
2550
+ "logps/chosen": -1040.991455078125,
2551
+ "logps/rejected": -2445.29541015625,
2552
+ "loss": 0.4516,
2553
+ "rewards/accuracies": 0.65625,
2554
+ "rewards/chosen": -0.7577398419380188,
2555
+ "rewards/margins": 1.4329960346221924,
2556
+ "rewards/rejected": -2.1907360553741455,
2557
+ "step": 1810
2558
+ },
2559
+ {
2560
+ "epoch": 0.95,
2561
+ "learning_rate": 3.37414453970758e-08,
2562
+ "logits/chosen": 0.303236186504364,
2563
+ "logits/rejected": 0.1971709430217743,
2564
+ "logps/chosen": -1248.239501953125,
2565
+ "logps/rejected": -2541.384033203125,
2566
+ "loss": 0.4512,
2567
+ "rewards/accuracies": 0.6312500238418579,
2568
+ "rewards/chosen": -0.9271729588508606,
2569
+ "rewards/margins": 1.3627898693084717,
2570
+ "rewards/rejected": -2.2899627685546875,
2571
+ "step": 1820
2572
+ },
2573
+ {
2574
+ "epoch": 0.96,
2575
+ "learning_rate": 2.6672530179410183e-08,
2576
+ "logits/chosen": 0.25464674830436707,
2577
+ "logits/rejected": 0.13462017476558685,
2578
+ "logps/chosen": -1484.759521484375,
2579
+ "logps/rejected": -2381.3857421875,
2580
+ "loss": 0.4582,
2581
+ "rewards/accuracies": 0.6499999761581421,
2582
+ "rewards/chosen": -1.2104871273040771,
2583
+ "rewards/margins": 0.955074667930603,
2584
+ "rewards/rejected": -2.165562152862549,
2585
+ "step": 1830
2586
+ },
2587
+ {
2588
+ "epoch": 0.96,
2589
+ "learning_rate": 2.04296862450451e-08,
2590
+ "logits/chosen": 0.34345191717147827,
2591
+ "logits/rejected": 0.1766502857208252,
2592
+ "logps/chosen": -1336.6195068359375,
2593
+ "logps/rejected": -2531.597412109375,
2594
+ "loss": 0.4675,
2595
+ "rewards/accuracies": 0.574999988079071,
2596
+ "rewards/chosen": -1.0585607290267944,
2597
+ "rewards/margins": 1.2248413562774658,
2598
+ "rewards/rejected": -2.28340220451355,
2599
+ "step": 1840
2600
+ },
2601
+ {
2602
+ "epoch": 0.97,
2603
+ "learning_rate": 1.501499865314171e-08,
2604
+ "logits/chosen": 0.31596893072128296,
2605
+ "logits/rejected": 0.17752663791179657,
2606
+ "logps/chosen": -1208.4625244140625,
2607
+ "logps/rejected": -2460.017578125,
2608
+ "loss": 0.4534,
2609
+ "rewards/accuracies": 0.606249988079071,
2610
+ "rewards/chosen": -0.9176605939865112,
2611
+ "rewards/margins": 1.2910888195037842,
2612
+ "rewards/rejected": -2.208749294281006,
2613
+ "step": 1850
2614
+ },
2615
+ {
2616
+ "epoch": 0.97,
2617
+ "learning_rate": 1.0430275865371265e-08,
2618
+ "logits/chosen": 0.30796024203300476,
2619
+ "logits/rejected": 0.15131710469722748,
2620
+ "logps/chosen": -1164.2542724609375,
2621
+ "logps/rejected": -2230.33056640625,
2622
+ "loss": 0.4555,
2623
+ "rewards/accuracies": 0.643750011920929,
2624
+ "rewards/chosen": -0.8924150466918945,
2625
+ "rewards/margins": 1.0938717126846313,
2626
+ "rewards/rejected": -1.9862868785858154,
2627
+ "step": 1860
2628
+ },
2629
+ {
2630
+ "epoch": 0.98,
2631
+ "learning_rate": 6.677049141901315e-09,
2632
+ "logits/chosen": 0.26449787616729736,
2633
+ "logits/rejected": 0.12270595878362656,
2634
+ "logps/chosen": -1493.645263671875,
2635
+ "logps/rejected": -2633.17626953125,
2636
+ "loss": 0.4614,
2637
+ "rewards/accuracies": 0.581250011920929,
2638
+ "rewards/chosen": -1.2356170415878296,
2639
+ "rewards/margins": 1.1605656147003174,
2640
+ "rewards/rejected": -2.3961825370788574,
2641
+ "step": 1870
2642
+ },
2643
+ {
2644
+ "epoch": 0.98,
2645
+ "learning_rate": 3.756572029968708e-09,
2646
+ "logits/chosen": 0.23211045563220978,
2647
+ "logits/rejected": 0.13400281965732574,
2648
+ "logps/chosen": -1511.829345703125,
2649
+ "logps/rejected": -2489.31494140625,
2650
+ "loss": 0.4594,
2651
+ "rewards/accuracies": 0.6187499761581421,
2652
+ "rewards/chosen": -1.2328951358795166,
2653
+ "rewards/margins": 1.0022671222686768,
2654
+ "rewards/rejected": -2.2351622581481934,
2655
+ "step": 1880
2656
+ },
2657
+ {
2658
+ "epoch": 0.99,
2659
+ "learning_rate": 1.6698199452053199e-09,
2660
+ "logits/chosen": 0.19983918964862823,
2661
+ "logits/rejected": 0.11516892910003662,
2662
+ "logps/chosen": -1396.664306640625,
2663
+ "logps/rejected": -2378.28857421875,
2664
+ "loss": 0.4543,
2665
+ "rewards/accuracies": 0.606249988079071,
2666
+ "rewards/chosen": -1.1454143524169922,
2667
+ "rewards/margins": 1.0044304132461548,
2668
+ "rewards/rejected": -2.1498446464538574,
2669
+ "step": 1890
2670
+ },
2671
+ {
2672
+ "epoch": 0.99,
2673
+ "learning_rate": 4.1748984585560094e-10,
2674
+ "logits/chosen": 0.2773471474647522,
2675
+ "logits/rejected": 0.1175018697977066,
2676
+ "logps/chosen": -1402.577392578125,
2677
+ "logps/rejected": -2661.568603515625,
2678
+ "loss": 0.4649,
2679
+ "rewards/accuracies": 0.581250011920929,
2680
+ "rewards/chosen": -1.132912039756775,
2681
+ "rewards/margins": 1.262406587600708,
2682
+ "rewards/rejected": -2.3953185081481934,
2683
+ "step": 1900
2684
+ },
2685
+ {
2686
+ "epoch": 1.0,
2687
+ "learning_rate": 0.0,
2688
+ "logits/chosen": 0.21633613109588623,
2689
+ "logits/rejected": 0.111175537109375,
2690
+ "logps/chosen": -1688.2955322265625,
2691
+ "logps/rejected": -2615.567626953125,
2692
+ "loss": 0.4707,
2693
+ "rewards/accuracies": 0.5062500238418579,
2694
+ "rewards/chosen": -1.4329384565353394,
2695
+ "rewards/margins": 0.9476302862167358,
2696
+ "rewards/rejected": -2.380568504333496,
2697
+ "step": 1910
2698
+ },
2699
+ {
2700
+ "epoch": 1.0,
2701
+ "step": 1910,
2702
+ "total_flos": 0.0,
2703
+ "train_loss": 0.09940854217369519,
2704
+ "train_runtime": 5146.4957,
2705
+ "train_samples_per_second": 11.879,
2706
+ "train_steps_per_second": 0.371
2707
+ }
2708
+ ],
2709
+ "logging_steps": 10,
2710
+ "max_steps": 1910,
2711
+ "num_input_tokens_seen": 0,
2712
+ "num_train_epochs": 1,
2713
+ "save_steps": 20,
2714
+ "total_flos": 0.0,
2715
+ "train_batch_size": 4,
2716
+ "trial_name": null,
2717
+ "trial_params": null
2718
+ }