KKHYA commited on
Commit
e5b1663
·
verified ·
1 Parent(s): e2848a8

Model save

Browse files
Qwen2.5-1.5B-Open-R1-GRPO.log CHANGED
@@ -1,7 +1,7 @@
1
- 2025-02-17 14:24:25 - WARNING - __main__ - Process rank: 0, device: cuda:0, n_gpu: 1 distributed training: True, 16-bits training: False
2
- 2025-02-17 14:24:25 - INFO - __main__ - Model parameters ModelConfig(model_name_or_path='Qwen/Qwen2.5-1.5B-Instruct', model_revision='main', torch_dtype='bfloat16', trust_remote_code=False, attn_implementation='flash_attention_2', use_peft=False, lora_r=16, lora_alpha=32, lora_dropout=0.05, lora_target_modules=None, lora_modules_to_save=None, lora_task_type='CAUSAL_LM', use_rslora=False, load_in_8bit=False, load_in_4bit=False, bnb_4bit_quant_type='nf4', use_bnb_nested_quant=False)
3
- 2025-02-17 14:24:25 - INFO - __main__ - Script parameters GRPOScriptArguments(dataset_name='AI-MO/NuminaMath-TIR', dataset_config=None, dataset_train_split='train', dataset_test_split='test', gradient_checkpointing_use_reentrant=False, ignore_bias_buffers=False, reward_funcs=['accuracy', 'format'], cosine_min_value_wrong=0.0, cosine_max_value_wrong=-0.5, cosine_min_value_correct=0.5, cosine_max_value_correct=1.0, cosine_max_len=1000, repetition_n_grams=3, repetition_max_penalty=-1.0)
4
- 2025-02-17 14:24:25 - INFO - __main__ - Training parameters GRPOConfig(
5
  _n_gpu=1,
6
  accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False},
7
  adafactor=False,
@@ -80,7 +80,7 @@ log_completions=False,
80
  log_level=info,
81
  log_level_replica=warning,
82
  log_on_each_node=True,
83
- logging_dir=output/Qwen2.5-1.5B-Open-R1-GRPO/runs/Feb17_14-24-25_smilelab-a6000,
84
  logging_first_step=False,
85
  logging_nan_inf_filter=True,
86
  logging_steps=5,
@@ -158,14 +158,183 @@ warmup_ratio=0.1,
158
  warmup_steps=0,
159
  weight_decay=0.0,
160
  )
161
- 2025-02-17 14:24:25 - WARNING - __main__ - Process rank: 2, device: cuda:2, n_gpu: 1 distributed training: True, 16-bits training: False
162
- 2025-02-17 14:24:25 - WARNING - __main__ - Process rank: 1, device: cuda:1, n_gpu: 1 distributed training: True, 16-bits training: False
163
- 2025-02-17 14:24:26 - INFO - datasets.builder - Overwrite dataset info from restored data version if exists.
164
- 2025-02-17 14:24:26 - INFO - datasets.info - Loading Dataset info from /home/mingyuan/.cache/huggingface/datasets/AI-MO___numina_math-tir/default/0.0.0/77a91d7b7a1a98ac4b1beb7d86c09d156b935dcd
165
- 2025-02-17 14:24:26 - INFO - datasets.builder - Found cached dataset numina_math-tir (/home/mingyuan/.cache/huggingface/datasets/AI-MO___numina_math-tir/default/0.0.0/77a91d7b7a1a98ac4b1beb7d86c09d156b935dcd)
166
- 2025-02-17 14:24:26 - INFO - datasets.info - Loading Dataset info from /home/mingyuan/.cache/huggingface/datasets/AI-MO___numina_math-tir/default/0.0.0/77a91d7b7a1a98ac4b1beb7d86c09d156b935dcd
167
- 2025-02-17 14:24:26 - INFO - datasets.arrow_dataset - Loading cached processed dataset at /home/mingyuan/.cache/huggingface/datasets/AI-MO___numina_math-tir/default/0.0.0/77a91d7b7a1a98ac4b1beb7d86c09d156b935dcd/cache-42e22e57818641cd.arrow
168
- 2025-02-17 14:24:26 - INFO - datasets.arrow_dataset - Loading cached processed dataset at /home/mingyuan/.cache/huggingface/datasets/AI-MO___numina_math-tir/default/0.0.0/77a91d7b7a1a98ac4b1beb7d86c09d156b935dcd/cache-be70039808739fd6.arrow
169
- 2025-02-17 14:24:26 - INFO - __main__ - *** Initializing model kwargs ***
170
- 2025-02-17 14:24:50 - INFO - __main__ - *** Train ***
171
- 2025-02-18 05:27:58 - INFO - __main__ - *** Save model ***
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2025-02-18 14:33:03 - WARNING - __main__ - Process rank: 0, device: cuda:0, n_gpu: 1 distributed training: True, 16-bits training: False
2
+ 2025-02-18 14:33:03 - INFO - __main__ - Model parameters ModelConfig(model_name_or_path='Qwen/Qwen2.5-1.5B-Instruct', model_revision='main', torch_dtype='bfloat16', trust_remote_code=False, attn_implementation='flash_attention_2', use_peft=False, lora_r=16, lora_alpha=32, lora_dropout=0.05, lora_target_modules=None, lora_modules_to_save=None, lora_task_type='CAUSAL_LM', use_rslora=False, load_in_8bit=False, load_in_4bit=False, bnb_4bit_quant_type='nf4', use_bnb_nested_quant=False)
3
+ 2025-02-18 14:33:03 - INFO - __main__ - Script parameters GRPOScriptArguments(dataset_name='AI-MO/NuminaMath-TIR', dataset_config=None, dataset_train_split='train', dataset_test_split='test', gradient_checkpointing_use_reentrant=False, ignore_bias_buffers=False, reward_funcs=['accuracy', 'format'], cosine_min_value_wrong=0.0, cosine_max_value_wrong=-0.5, cosine_min_value_correct=0.5, cosine_max_value_correct=1.0, cosine_max_len=1000, repetition_n_grams=3, repetition_max_penalty=-1.0)
4
+ 2025-02-18 14:33:03 - INFO - __main__ - Training parameters GRPOConfig(
5
  _n_gpu=1,
6
  accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False},
7
  adafactor=False,
 
80
  log_level=info,
81
  log_level_replica=warning,
82
  log_on_each_node=True,
83
+ logging_dir=output/Qwen2.5-1.5B-Open-R1-GRPO/runs/Feb18_14-33-03_smilelab-a6000,
84
  logging_first_step=False,
85
  logging_nan_inf_filter=True,
86
  logging_steps=5,
 
158
  warmup_steps=0,
159
  weight_decay=0.0,
160
  )
161
+ 2025-02-18 14:33:03 - WARNING - __main__ - Process rank: 1, device: cuda:1, n_gpu: 1 distributed training: True, 16-bits training: False
162
+ 2025-02-18 14:33:03 - WARNING - __main__ - Process rank: 2, device: cuda:2, n_gpu: 1 distributed training: True, 16-bits training: False
163
+ 2025-02-18 14:33:04 - INFO - datasets.builder - Overwrite dataset info from restored data version if exists.
164
+ 2025-02-18 14:33:04 - INFO - datasets.info - Loading Dataset info from /home/mingyuan/.cache/huggingface/datasets/AI-MO___numina_math-tir/default/0.0.0/77a91d7b7a1a98ac4b1beb7d86c09d156b935dcd
165
+ 2025-02-18 14:33:04 - INFO - datasets.builder - Found cached dataset numina_math-tir (/home/mingyuan/.cache/huggingface/datasets/AI-MO___numina_math-tir/default/0.0.0/77a91d7b7a1a98ac4b1beb7d86c09d156b935dcd)
166
+ 2025-02-18 14:33:04 - INFO - datasets.info - Loading Dataset info from /home/mingyuan/.cache/huggingface/datasets/AI-MO___numina_math-tir/default/0.0.0/77a91d7b7a1a98ac4b1beb7d86c09d156b935dcd
167
+ 2025-02-18 14:33:04 - INFO - datasets.arrow_dataset - Loading cached processed dataset at /home/mingyuan/.cache/huggingface/datasets/AI-MO___numina_math-tir/default/0.0.0/77a91d7b7a1a98ac4b1beb7d86c09d156b935dcd/cache-42e22e57818641cd.arrow
168
+ 2025-02-18 14:33:04 - INFO - datasets.arrow_dataset - Loading cached processed dataset at /home/mingyuan/.cache/huggingface/datasets/AI-MO___numina_math-tir/default/0.0.0/77a91d7b7a1a98ac4b1beb7d86c09d156b935dcd/cache-be70039808739fd6.arrow
169
+ 2025-02-18 14:33:04 - INFO - __main__ - *** Initializing model kwargs ***
170
+ 2025-02-18 14:33:31 - INFO - __main__ - *** Train ***
171
+ 2025-02-18 14:39:05 - WARNING - __main__ - Process rank: 0, device: cuda:0, n_gpu: 1 distributed training: True, 16-bits training: False
172
+ 2025-02-18 14:39:05 - INFO - __main__ - Model parameters ModelConfig(model_name_or_path='Qwen/Qwen2.5-1.5B-Instruct', model_revision='main', torch_dtype='bfloat16', trust_remote_code=False, attn_implementation='flash_attention_2', use_peft=False, lora_r=16, lora_alpha=32, lora_dropout=0.05, lora_target_modules=None, lora_modules_to_save=None, lora_task_type='CAUSAL_LM', use_rslora=False, load_in_8bit=False, load_in_4bit=False, bnb_4bit_quant_type='nf4', use_bnb_nested_quant=False)
173
+ 2025-02-18 14:39:05 - INFO - __main__ - Script parameters GRPOScriptArguments(dataset_name='AI-MO/NuminaMath-TIR', dataset_config=None, dataset_train_split='train', dataset_test_split='test', gradient_checkpointing_use_reentrant=False, ignore_bias_buffers=False, reward_funcs=['accuracy', 'format'], cosine_min_value_wrong=0.0, cosine_max_value_wrong=-0.5, cosine_min_value_correct=0.5, cosine_max_value_correct=1.0, cosine_max_len=1000, repetition_n_grams=3, repetition_max_penalty=-1.0)
174
+ 2025-02-18 14:39:05 - INFO - __main__ - Training parameters GRPOConfig(
175
+ _n_gpu=1,
176
+ accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False},
177
+ adafactor=False,
178
+ adam_beta1=0.9,
179
+ adam_beta2=0.999,
180
+ adam_epsilon=1e-08,
181
+ auto_find_batch_size=False,
182
+ average_tokens_across_devices=False,
183
+ batch_eval_metrics=False,
184
+ benchmarks=[],
185
+ beta=0.04,
186
+ bf16=True,
187
+ bf16_full_eval=False,
188
+ callbacks=[],
189
+ data_seed=None,
190
+ dataloader_drop_last=False,
191
+ dataloader_num_workers=0,
192
+ dataloader_persistent_workers=False,
193
+ dataloader_pin_memory=True,
194
+ dataloader_prefetch_factor=None,
195
+ ddp_backend=None,
196
+ ddp_broadcast_buffers=None,
197
+ ddp_bucket_cap_mb=None,
198
+ ddp_find_unused_parameters=None,
199
+ ddp_timeout=1800,
200
+ debug=[],
201
+ deepspeed=None,
202
+ disable_tqdm=False,
203
+ dispatch_batches=None,
204
+ do_eval=True,
205
+ do_predict=False,
206
+ do_train=False,
207
+ ds3_gather_for_generation=True,
208
+ eval_accumulation_steps=None,
209
+ eval_delay=0,
210
+ eval_do_concat_batches=True,
211
+ eval_on_start=False,
212
+ eval_steps=100,
213
+ eval_strategy=IntervalStrategy.STEPS,
214
+ eval_use_gather_object=False,
215
+ evaluation_strategy=None,
216
+ fp16=False,
217
+ fp16_backend=auto,
218
+ fp16_full_eval=False,
219
+ fp16_opt_level=O1,
220
+ fsdp=[],
221
+ fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False},
222
+ fsdp_min_num_params=0,
223
+ fsdp_transformer_layer_cls_to_wrap=None,
224
+ full_determinism=False,
225
+ gradient_accumulation_steps=16,
226
+ gradient_checkpointing=True,
227
+ gradient_checkpointing_kwargs={'use_reentrant': False},
228
+ greater_is_better=None,
229
+ group_by_length=False,
230
+ half_precision_backend=auto,
231
+ hub_always_push=False,
232
+ hub_model_id=Qwen2.5-1.5B-Open-R1-GRPO,
233
+ hub_model_revision=main,
234
+ hub_private_repo=None,
235
+ hub_strategy=HubStrategy.EVERY_SAVE,
236
+ hub_token=<HUB_TOKEN>,
237
+ ignore_data_skip=False,
238
+ include_for_metrics=[],
239
+ include_inputs_for_metrics=False,
240
+ include_num_input_tokens_seen=False,
241
+ include_tokens_per_second=False,
242
+ jit_mode_eval=False,
243
+ label_names=None,
244
+ label_smoothing_factor=0.0,
245
+ learning_rate=2e-05,
246
+ length_column_name=length,
247
+ load_best_model_at_end=False,
248
+ local_rank=0,
249
+ log_completions=False,
250
+ log_level=info,
251
+ log_level_replica=warning,
252
+ log_on_each_node=True,
253
+ logging_dir=output/Qwen2.5-1.5B-Open-R1-GRPO/runs/Feb18_14-39-05_smilelab-a6000,
254
+ logging_first_step=False,
255
+ logging_nan_inf_filter=True,
256
+ logging_steps=5,
257
+ logging_strategy=IntervalStrategy.STEPS,
258
+ lr_scheduler_kwargs={},
259
+ lr_scheduler_type=SchedulerType.COSINE,
260
+ max_completion_length=1024,
261
+ max_grad_norm=1.0,
262
+ max_prompt_length=512,
263
+ max_steps=-1,
264
+ metric_for_best_model=None,
265
+ model_init_kwargs=None,
266
+ mp_parameters=,
267
+ neftune_noise_alpha=None,
268
+ no_cuda=False,
269
+ num_generations=2,
270
+ num_train_epochs=1,
271
+ optim=OptimizerNames.ADAMW_TORCH,
272
+ optim_args=None,
273
+ optim_target_modules=None,
274
+ output_dir=output/Qwen2.5-1.5B-Open-R1-GRPO,
275
+ overwrite_hub_revision=False,
276
+ overwrite_output_dir=True,
277
+ past_index=-1,
278
+ per_device_eval_batch_size=32,
279
+ per_device_train_batch_size=16,
280
+ prediction_loss_only=False,
281
+ push_to_hub=True,
282
+ push_to_hub_model_id=None,
283
+ push_to_hub_organization=None,
284
+ push_to_hub_revision=False,
285
+ push_to_hub_token=<PUSH_TO_HUB_TOKEN>,
286
+ ray_scope=last,
287
+ ref_model_mixup_alpha=0.9,
288
+ ref_model_sync_steps=64,
289
+ remove_unused_columns=False,
290
+ report_to=['wandb'],
291
+ restore_callback_states_from_checkpoint=False,
292
+ resume_from_checkpoint=None,
293
+ reward_weights=None,
294
+ run_name=output/Qwen2.5-1.5B-Open-R1-GRPO,
295
+ save_on_each_node=False,
296
+ save_only_model=False,
297
+ save_safetensors=True,
298
+ save_steps=500,
299
+ save_strategy=SaveStrategy.NO,
300
+ save_total_limit=None,
301
+ seed=42,
302
+ skip_memory_metrics=True,
303
+ split_batches=None,
304
+ sync_ref_model=False,
305
+ system_prompt=None,
306
+ temperature=0.9,
307
+ tf32=None,
308
+ torch_compile=False,
309
+ torch_compile_backend=None,
310
+ torch_compile_mode=None,
311
+ torch_empty_cache_steps=None,
312
+ torchdynamo=None,
313
+ tpu_metrics_debug=False,
314
+ tpu_num_cores=None,
315
+ use_cpu=False,
316
+ use_ipex=False,
317
+ use_legacy_prediction_loop=False,
318
+ use_liger_kernel=False,
319
+ use_mps_device=False,
320
+ use_vllm=True,
321
+ vllm_device=auto,
322
+ vllm_dtype=auto,
323
+ vllm_gpu_memory_utilization=0.7,
324
+ vllm_max_model_len=None,
325
+ wandb_entity=None,
326
+ wandb_project=None,
327
+ warmup_ratio=0.1,
328
+ warmup_steps=0,
329
+ weight_decay=0.0,
330
+ )
331
+ 2025-02-18 14:39:05 - WARNING - __main__ - Process rank: 1, device: cuda:1, n_gpu: 1 distributed training: True, 16-bits training: False
332
+ 2025-02-18 14:39:07 - INFO - datasets.builder - Overwrite dataset info from restored data version if exists.
333
+ 2025-02-18 14:39:07 - INFO - datasets.info - Loading Dataset info from /home/mingyuan/.cache/huggingface/datasets/AI-MO___numina_math-tir/default/0.0.0/77a91d7b7a1a98ac4b1beb7d86c09d156b935dcd
334
+ 2025-02-18 14:39:07 - INFO - datasets.builder - Found cached dataset numina_math-tir (/home/mingyuan/.cache/huggingface/datasets/AI-MO___numina_math-tir/default/0.0.0/77a91d7b7a1a98ac4b1beb7d86c09d156b935dcd)
335
+ 2025-02-18 14:39:07 - INFO - datasets.info - Loading Dataset info from /home/mingyuan/.cache/huggingface/datasets/AI-MO___numina_math-tir/default/0.0.0/77a91d7b7a1a98ac4b1beb7d86c09d156b935dcd
336
+ 2025-02-18 14:39:07 - INFO - datasets.arrow_dataset - Loading cached processed dataset at /home/mingyuan/.cache/huggingface/datasets/AI-MO___numina_math-tir/default/0.0.0/77a91d7b7a1a98ac4b1beb7d86c09d156b935dcd/cache-42e22e57818641cd.arrow
337
+ 2025-02-18 14:39:07 - INFO - datasets.arrow_dataset - Loading cached processed dataset at /home/mingyuan/.cache/huggingface/datasets/AI-MO___numina_math-tir/default/0.0.0/77a91d7b7a1a98ac4b1beb7d86c09d156b935dcd/cache-be70039808739fd6.arrow
338
+ 2025-02-18 14:39:07 - INFO - __main__ - *** Initializing model kwargs ***
339
+ 2025-02-18 14:39:33 - INFO - __main__ - *** Train ***
340
+ 2025-02-19 03:54:32 - INFO - __main__ - *** Save model ***
README.md CHANGED
@@ -27,7 +27,7 @@ print(output["generated_text"])
27
 
28
  ## Training procedure
29
 
30
- [<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/no-exsit/huggingface/runs/2dcu6uam)
31
 
32
 
33
  This model was trained with GRPO, a method introduced in [DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models](https://huggingface.co/papers/2402.03300).
 
27
 
28
  ## Training procedure
29
 
30
+ [<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/no-exsit/huggingface/runs/ya30e8vy)
31
 
32
 
33
  This model was trained with GRPO, a method introduced in [DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models](https://huggingface.co/papers/2402.03300).
all_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "total_flos": 0.0,
3
- "train_loss": 0.1736428747392735,
4
- "train_runtime": 54182.9853,
5
  "train_samples": 72441,
6
- "train_samples_per_second": 1.337,
7
- "train_steps_per_second": 0.005
8
  }
 
1
  {
2
  "total_flos": 0.0,
3
+ "train_loss": 27.009561450669516,
4
+ "train_runtime": 47693.8258,
5
  "train_samples": 72441,
6
+ "train_samples_per_second": 1.519,
7
+ "train_steps_per_second": 0.006
8
  }
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f560b0bd324be829118baf66fc7c7aeaf451d3a837a09bb002084221d0d76b3e
3
- size 3554214752
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:48021ab932819ba74e1bce8a7cb87f288cbc1346247da4002d7bb453f1e52516
3
+ size 3087467144
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "total_flos": 0.0,
3
- "train_loss": 0.1736428747392735,
4
- "train_runtime": 54182.9853,
5
  "train_samples": 72441,
6
- "train_samples_per_second": 1.337,
7
- "train_steps_per_second": 0.005
8
  }
 
1
  {
2
  "total_flos": 0.0,
3
+ "train_loss": 27.009561450669516,
4
+ "train_runtime": 47693.8258,
5
  "train_samples": 72441,
6
+ "train_samples_per_second": 1.519,
7
+ "train_steps_per_second": 0.006
8
  }
trainer_state.json CHANGED
@@ -9,775 +9,775 @@
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "completion_length": 490.6283988952637,
13
  "epoch": 0.0176678445229682,
14
- "grad_norm": 1.3560127019882202,
15
- "kl": 0.0019670963287353516,
16
  "learning_rate": 3.448275862068966e-06,
17
- "loss": 0.0001,
18
- "reward": 0.21250000642612576,
19
- "reward_std": 0.21599052464589477,
20
- "rewards/accuracy_reward": 0.05130208469927311,
21
- "rewards/format_reward": 0.16119792133104055,
22
  "step": 5
23
  },
24
  {
25
- "completion_length": 534.7315277099609,
26
  "epoch": 0.0353356890459364,
27
- "grad_norm": 82.46902465820312,
28
- "kl": 0.11877202987670898,
29
  "learning_rate": 6.896551724137932e-06,
30
- "loss": 0.0048,
31
- "reward": 0.10625000358559192,
32
- "reward_std": 0.16045468132942914,
33
- "rewards/accuracy_reward": 0.022395833861082792,
34
- "rewards/format_reward": 0.0838541688863188,
35
  "step": 10
36
  },
37
  {
38
- "completion_length": 527.6500186920166,
39
  "epoch": 0.053003533568904596,
40
- "grad_norm": 0.742699146270752,
41
- "kl": 0.10464630126953126,
42
  "learning_rate": 1.0344827586206898e-05,
43
- "loss": 0.0042,
44
- "reward": 0.1148437530733645,
45
- "reward_std": 0.16899234298616647,
46
- "rewards/accuracy_reward": 0.025000000628642736,
47
- "rewards/format_reward": 0.08984375256113708,
48
  "step": 15
49
  },
50
  {
51
- "completion_length": 534.8018405914306,
52
  "epoch": 0.0706713780918728,
53
- "grad_norm": 11.84140682220459,
54
- "kl": 0.17176666259765624,
55
  "learning_rate": 1.3793103448275863e-05,
56
- "loss": 0.0069,
57
- "reward": 0.10625000288709999,
58
- "reward_std": 0.14975022254511713,
59
- "rewards/accuracy_reward": 0.026822917419485746,
60
- "rewards/format_reward": 0.07942708525806666,
61
  "step": 20
62
  },
63
  {
64
- "completion_length": 536.3153781890869,
65
  "epoch": 0.08833922261484099,
66
- "grad_norm": 9.572907447814941,
67
- "kl": 0.37183837890625,
68
  "learning_rate": 1.7241379310344828e-05,
69
- "loss": 0.0149,
70
- "reward": 0.10234375295694917,
71
- "reward_std": 0.14782514404505492,
72
- "rewards/accuracy_reward": 0.02083333386108279,
73
- "rewards/format_reward": 0.0815104188863188,
74
  "step": 25
75
  },
76
  {
77
- "completion_length": 539.7317886352539,
78
  "epoch": 0.10600706713780919,
79
- "grad_norm": 20.72088050842285,
80
- "kl": 213.514892578125,
81
  "learning_rate": 1.999923511388017e-05,
82
- "loss": 8.5621,
83
- "reward": 0.11822917042300105,
84
- "reward_std": 0.16361206490546465,
85
- "rewards/accuracy_reward": 0.025520833930931984,
86
- "rewards/format_reward": 0.09270833609625698,
87
  "step": 30
88
  },
89
  {
90
- "completion_length": 532.9531394958497,
91
  "epoch": 0.12367491166077739,
92
- "grad_norm": 4.47783088684082,
93
- "kl": 1.0462158203125,
94
  "learning_rate": 1.9972476383747748e-05,
95
- "loss": 0.0419,
96
- "reward": 0.1104166700039059,
97
- "reward_std": 0.16090573472902178,
98
- "rewards/accuracy_reward": 0.024218750605359673,
99
- "rewards/format_reward": 0.08619791911914945,
100
  "step": 35
101
  },
102
  {
103
- "completion_length": 531.2823101043701,
104
  "epoch": 0.1413427561837456,
105
- "grad_norm": 0.9901871085166931,
106
- "kl": 2.563671875,
107
  "learning_rate": 1.9907590277344582e-05,
108
- "loss": 0.1025,
109
- "reward": 0.09843750263098627,
110
- "reward_std": 0.14138951590284704,
111
- "rewards/accuracy_reward": 0.01979166711680591,
112
- "rewards/format_reward": 0.07864583539776504,
113
  "step": 40
114
  },
115
  {
116
- "completion_length": 524.4414211273194,
117
  "epoch": 0.15901060070671377,
118
- "grad_norm": 0.5192703604698181,
119
- "kl": 0.683203125,
120
  "learning_rate": 1.9804824871166254e-05,
121
- "loss": 0.0274,
122
- "reward": 0.10703125330619515,
123
- "reward_std": 0.152545028924942,
124
- "rewards/accuracy_reward": 0.02161458383779973,
125
- "rewards/format_reward": 0.0854166692122817,
126
  "step": 45
127
  },
128
  {
129
- "completion_length": 538.3731964111328,
130
  "epoch": 0.17667844522968199,
131
- "grad_norm": 6.639949798583984,
132
- "kl": 0.9129150390625,
133
  "learning_rate": 1.9664573064143604e-05,
134
- "loss": 0.0365,
135
- "reward": 0.10781250279396773,
136
- "reward_std": 0.15840874193236232,
137
- "rewards/accuracy_reward": 0.024218750512227417,
138
- "rewards/format_reward": 0.08359375253785402,
139
  "step": 50
140
  },
141
  {
142
- "completion_length": 521.2586082458496,
143
  "epoch": 0.19434628975265017,
144
- "grad_norm": 4.453002452850342,
145
- "kl": 0.474462890625,
146
  "learning_rate": 1.948737107548771e-05,
147
- "loss": 0.019,
148
- "reward": 0.10729166965465993,
149
- "reward_std": 0.15750663112848998,
150
- "rewards/accuracy_reward": 0.023177084024064244,
151
- "rewards/format_reward": 0.08411458586342632,
152
  "step": 55
153
  },
154
  {
155
- "completion_length": 549.339599609375,
156
  "epoch": 0.21201413427561838,
157
- "grad_norm": 0.5541434288024902,
158
- "kl": 0.66484375,
159
  "learning_rate": 1.9273896394584103e-05,
160
- "loss": 0.0266,
161
- "reward": 0.09843750281725079,
162
- "reward_std": 0.14917831011116506,
163
- "rewards/accuracy_reward": 0.021875000582076608,
164
- "rewards/format_reward": 0.07656250218860805,
165
  "step": 60
166
  },
167
  {
168
- "completion_length": 529.354443359375,
169
  "epoch": 0.22968197879858657,
170
- "grad_norm": 0.7464480400085449,
171
- "kl": 1.283349609375,
172
  "learning_rate": 1.9024965190774262e-05,
173
- "loss": 0.0513,
174
- "reward": 0.11223958733025938,
175
- "reward_std": 0.16225890163332224,
176
- "rewards/accuracy_reward": 0.02656250069849193,
177
- "rewards/format_reward": 0.08567708532791585,
178
  "step": 65
179
  },
180
  {
181
- "completion_length": 525.3435066223144,
182
  "epoch": 0.24734982332155478,
183
- "grad_norm": 0.5229907631874084,
184
- "kl": 0.46806640625,
185
  "learning_rate": 1.8741529192927528e-05,
186
- "loss": 0.0187,
187
- "reward": 0.10755208674818277,
188
- "reward_std": 0.1623797613196075,
189
- "rewards/accuracy_reward": 0.02031250053551048,
190
- "rewards/format_reward": 0.08723958565387875,
191
  "step": 70
192
  },
193
  {
194
- "completion_length": 539.2466316223145,
195
  "epoch": 0.26501766784452296,
196
- "grad_norm": 0.9813037514686584,
197
- "kl": 0.8671142578125,
198
  "learning_rate": 1.8424672050733577e-05,
199
- "loss": 0.0347,
200
- "reward": 0.10989583660848438,
201
- "reward_std": 0.15377733251079917,
202
- "rewards/accuracy_reward": 0.02734375058207661,
203
- "rewards/format_reward": 0.08255208577029408,
204
  "step": 75
205
  },
206
  {
207
- "completion_length": 544.274755859375,
208
  "epoch": 0.2826855123674912,
209
- "grad_norm": 1.2748777866363525,
210
- "kl": 0.545849609375,
211
  "learning_rate": 1.8075605191627242e-05,
212
- "loss": 0.0218,
213
- "reward": 0.10208333632908762,
214
- "reward_std": 0.15401905188336967,
215
- "rewards/accuracy_reward": 0.019270833837799727,
216
- "rewards/format_reward": 0.08281250209547579,
217
  "step": 80
218
  },
219
  {
220
- "completion_length": 536.8455909729004,
221
  "epoch": 0.3003533568904594,
222
- "grad_norm": 0.4829046130180359,
223
- "kl": 0.37125244140625,
224
  "learning_rate": 1.7695663189185703e-05,
225
- "loss": 0.0149,
226
- "reward": 0.11171875377185643,
227
- "reward_std": 0.1631610095500946,
228
- "rewards/accuracy_reward": 0.026822917349636554,
229
- "rewards/format_reward": 0.08489583544433117,
230
  "step": 85
231
  },
232
  {
233
- "completion_length": 526.0974117279053,
234
  "epoch": 0.31802120141342755,
235
- "grad_norm": 4.6327128410339355,
236
- "kl": 1.1112548828125,
237
  "learning_rate": 1.7286298660705877e-05,
238
- "loss": 0.0445,
239
- "reward": 0.09973958632908761,
240
- "reward_std": 0.14544901056215168,
241
- "rewards/accuracy_reward": 0.023958333861082794,
242
- "rewards/format_reward": 0.07578125221189111,
243
  "step": 90
244
  },
245
  {
246
- "completion_length": 524.5125148773193,
247
  "epoch": 0.33568904593639576,
248
- "grad_norm": 0.6081247925758362,
249
- "kl": 0.51201171875,
250
  "learning_rate": 1.6849076713469914e-05,
251
- "loss": 0.0205,
252
- "reward": 0.11250000328291207,
253
- "reward_std": 0.15795768704265356,
254
- "rewards/accuracy_reward": 0.02968750074505806,
255
- "rewards/format_reward": 0.08281250183936209,
256
  "step": 95
257
  },
258
  {
259
- "completion_length": 531.0093910217286,
260
  "epoch": 0.35335689045936397,
261
- "grad_norm": 1.5366660356521606,
262
- "kl": 0.503955078125,
263
  "learning_rate": 1.6385668960932143e-05,
264
- "loss": 0.0202,
265
- "reward": 0.09583333565387875,
266
- "reward_std": 0.13687896616756917,
267
- "rewards/accuracy_reward": 0.01875000037252903,
268
- "rewards/format_reward": 0.07708333500195294,
269
  "step": 100
270
  },
271
  {
272
  "epoch": 0.35335689045936397,
273
- "eval_completion_length": 563.7786560058594,
274
- "eval_kl": 0.3427734375,
275
- "eval_loss": 0.013146836310625076,
276
- "eval_reward": 0.13541666977107525,
277
- "eval_reward_std": 0.20748525112867355,
278
- "eval_rewards/accuracy_reward": 0.023437500465661287,
279
- "eval_rewards/format_reward": 0.1119791679084301,
280
- "eval_runtime": 71.5766,
281
- "eval_samples_per_second": 1.383,
282
- "eval_steps_per_second": 0.028,
283
  "step": 100
284
  },
285
  {
286
- "completion_length": 539.4177280426026,
287
  "epoch": 0.3710247349823322,
288
- "grad_norm": 0.2839926481246948,
289
- "kl": 0.41923828125,
290
  "learning_rate": 1.5897847131705194e-05,
291
- "loss": 0.0168,
292
- "reward": 0.10520833623595535,
293
- "reward_std": 0.15119186379015445,
294
- "rewards/accuracy_reward": 0.02630208395421505,
295
- "rewards/format_reward": 0.0789062523515895,
296
  "step": 105
297
  },
298
  {
299
- "completion_length": 527.8218933105469,
300
  "epoch": 0.38869257950530034,
301
- "grad_norm": 0.34282830357551575,
302
- "kl": 0.4187744140625,
303
  "learning_rate": 1.5387476295779737e-05,
304
- "loss": 0.0168,
305
- "reward": 0.11015625298023224,
306
- "reward_std": 0.1587713214568794,
307
- "rewards/accuracy_reward": 0.02656250069849193,
308
- "rewards/format_reward": 0.08359375258442014,
309
  "step": 110
310
  },
311
  {
312
- "completion_length": 535.7112144470215,
313
  "epoch": 0.40636042402826855,
314
- "grad_norm": 0.5226835012435913,
315
- "kl": 0.46435546875,
316
  "learning_rate": 1.4856507733875837e-05,
317
- "loss": 0.0186,
318
- "reward": 0.10234375311993063,
319
- "reward_std": 0.1523356933146715,
320
- "rewards/accuracy_reward": 0.024479167209938167,
321
- "rewards/format_reward": 0.07786458530463278,
322
  "step": 115
323
  },
324
  {
325
- "completion_length": 531.3724109649659,
326
  "epoch": 0.42402826855123676,
327
- "grad_norm": 0.3377923369407654,
328
- "kl": 0.4623779296875,
329
  "learning_rate": 1.4306971477188223e-05,
330
- "loss": 0.0185,
331
- "reward": 0.11875000363215804,
332
- "reward_std": 0.1620171807706356,
333
- "rewards/accuracy_reward": 0.03020833395421505,
334
- "rewards/format_reward": 0.08854166890960187,
335
  "step": 120
336
  },
337
  {
338
- "completion_length": 543.5169471740722,
339
  "epoch": 0.4416961130742049,
340
- "grad_norm": 0.4957655370235443,
341
- "kl": 0.448388671875,
342
  "learning_rate": 1.3740968546047935e-05,
343
- "loss": 0.0179,
344
- "reward": 0.09843750277068467,
345
- "reward_std": 0.1416312349960208,
346
- "rewards/accuracy_reward": 0.022656250558793545,
347
- "rewards/format_reward": 0.07578125186264514,
348
  "step": 125
349
  },
350
  {
351
- "completion_length": 522.5328289031983,
352
  "epoch": 0.45936395759717313,
353
- "grad_norm": 0.49198487401008606,
354
- "kl": 0.453369140625,
355
  "learning_rate": 1.3160662917174045e-05,
356
- "loss": 0.0181,
357
- "reward": 0.10729166988749057,
358
- "reward_std": 0.15672538382932544,
359
- "rewards/accuracy_reward": 0.024479167279787363,
360
- "rewards/format_reward": 0.08281250211875886,
361
  "step": 130
362
  },
363
  {
364
- "completion_length": 534.0132961273193,
365
  "epoch": 0.47703180212014135,
366
- "grad_norm": 0.4501975178718567,
367
- "kl": 0.90474853515625,
368
  "learning_rate": 1.2568273250226681e-05,
369
- "loss": 0.0361,
370
- "reward": 0.11223958637565375,
371
- "reward_std": 0.1586504613980651,
372
- "rewards/accuracy_reward": 0.032031250884756446,
373
- "rewards/format_reward": 0.08020833532791585,
374
  "step": 135
375
  },
376
  {
377
- "completion_length": 538.9510597229004,
378
  "epoch": 0.49469964664310956,
379
- "grad_norm": 0.8263593316078186,
380
- "kl": 0.4780517578125,
381
  "learning_rate": 1.1966064405292887e-05,
382
- "loss": 0.0191,
383
- "reward": 0.10807291991077364,
384
- "reward_std": 0.15537221673876048,
385
- "rewards/accuracy_reward": 0.02109375048894435,
386
- "rewards/format_reward": 0.08697916909586638,
387
  "step": 140
388
  },
389
  {
390
- "completion_length": 535.6718933105469,
391
  "epoch": 0.5123674911660777,
392
- "grad_norm": 0.37266990542411804,
393
- "kl": 0.430517578125,
394
  "learning_rate": 1.1356338783736256e-05,
395
- "loss": 0.0172,
396
- "reward": 0.10286458616610616,
397
- "reward_std": 0.15098252883180976,
398
- "rewards/accuracy_reward": 0.024218750512227417,
399
- "rewards/format_reward": 0.07864583553746343,
400
  "step": 145
401
  },
402
  {
403
- "completion_length": 535.9765785217285,
404
  "epoch": 0.5300353356890459,
405
- "grad_norm": 0.32091596722602844,
406
- "kl": 0.41318359375,
407
  "learning_rate": 1.0741427525516463e-05,
408
- "loss": 0.0165,
409
- "reward": 0.10651041977107525,
410
- "reward_std": 0.15906913336366416,
411
- "rewards/accuracy_reward": 0.027083333930931985,
412
- "rewards/format_reward": 0.07942708558402956,
413
  "step": 150
414
  },
415
  {
416
- "completion_length": 536.0502750396729,
417
  "epoch": 0.5477031802120141,
418
- "grad_norm": 10.207870483398438,
419
- "kl": 0.556298828125,
420
  "learning_rate": 1.012368159663363e-05,
421
- "loss": 0.0222,
422
- "reward": 0.10286458663176745,
423
- "reward_std": 0.14541662586852908,
424
- "rewards/accuracy_reward": 0.02213541720993817,
425
- "rewards/format_reward": 0.08072916818782687,
426
  "step": 155
427
  },
428
  {
429
- "completion_length": 534.5656383514404,
430
  "epoch": 0.5653710247349824,
431
- "grad_norm": 0.3280162513256073,
432
- "kl": 0.4006103515625,
433
  "learning_rate": 9.505462800772612e-06,
434
- "loss": 0.016,
435
- "reward": 0.11484375381842256,
436
- "reward_std": 0.1651745643466711,
437
- "rewards/accuracy_reward": 0.026041667303070427,
438
- "rewards/format_reward": 0.08880208584014326,
439
  "step": 160
440
  },
441
  {
442
- "completion_length": 540.3789237976074,
443
  "epoch": 0.5830388692579506,
444
- "grad_norm": 3.008737325668335,
445
- "kl": 0.543115234375,
446
  "learning_rate": 8.889134749511956e-06,
447
- "loss": 0.0217,
448
- "reward": 0.1114583365386352,
449
- "reward_std": 0.16135679064318537,
450
- "rewards/accuracy_reward": 0.025260417233221234,
451
- "rewards/format_reward": 0.08619791895616799,
452
  "step": 165
453
  },
454
  {
455
- "completion_length": 537.1291847229004,
456
  "epoch": 0.6007067137809188,
457
- "grad_norm": 0.39234212040901184,
458
- "kl": 0.503076171875,
459
  "learning_rate": 8.277053825620836e-06,
460
- "loss": 0.0201,
461
- "reward": 0.11015625316649676,
462
- "reward_std": 0.14803447835147382,
463
- "rewards/accuracy_reward": 0.02421875048894435,
464
- "rewards/format_reward": 0.08593750237487256,
465
  "step": 170
466
  },
467
  {
468
- "completion_length": 539.1065277099609,
469
  "epoch": 0.6183745583038869,
470
- "grad_norm": 0.4099801182746887,
471
- "kl": 0.456982421875,
472
  "learning_rate": 7.671560173993588e-06,
473
- "loss": 0.0183,
474
- "reward": 0.1072916699340567,
475
- "reward_std": 0.15606499183923006,
476
- "rewards/accuracy_reward": 0.026822917303070425,
477
- "rewards/format_reward": 0.08046875232830644,
478
  "step": 175
479
  },
480
  {
481
- "completion_length": 531.5690284729004,
482
  "epoch": 0.6360424028268551,
483
- "grad_norm": 0.596886396408081,
484
- "kl": 0.5114990234375,
485
  "learning_rate": 7.07496875466589e-06,
486
- "loss": 0.0205,
487
- "reward": 0.1161458363989368,
488
- "reward_std": 0.16755069950595497,
489
- "rewards/accuracy_reward": 0.028125000651925802,
490
- "rewards/format_reward": 0.08802083584014327,
491
  "step": 180
492
  },
493
  {
494
- "completion_length": 535.6968925476074,
495
  "epoch": 0.6537102473498233,
496
- "grad_norm": 0.31039756536483765,
497
- "kl": 0.4974609375,
498
  "learning_rate": 6.489560492119225e-06,
499
- "loss": 0.0199,
500
- "reward": 0.10598958644550294,
501
- "reward_std": 0.15061994921416044,
502
- "rewards/accuracy_reward": 0.021875000512227415,
503
- "rewards/format_reward": 0.08411458577029407,
504
  "step": 185
505
  },
506
  {
507
- "completion_length": 540.5557464599609,
508
  "epoch": 0.6713780918727915,
509
- "grad_norm": 1.3244882822036743,
510
- "kl": 0.4893798828125,
511
  "learning_rate": 5.9175735547120975e-06,
512
- "loss": 0.0196,
513
- "reward": 0.1151041706558317,
514
- "reward_std": 0.15738577228039502,
515
- "rewards/accuracy_reward": 0.029947917396202683,
516
- "rewards/format_reward": 0.08515625260770321,
517
  "step": 190
518
  },
519
  {
520
- "completion_length": 524.1333465576172,
521
  "epoch": 0.6890459363957597,
522
- "grad_norm": 1.8887150287628174,
523
- "kl": 0.474072265625,
524
  "learning_rate": 5.361194797579108e-06,
525
- "loss": 0.019,
526
- "reward": 0.1049479192122817,
527
- "reward_std": 0.14905744856223463,
528
- "rewards/accuracy_reward": 0.02500000058207661,
529
- "rewards/format_reward": 0.07994791893288493,
530
  "step": 195
531
  },
532
  {
533
- "completion_length": 542.0661624908447,
534
  "epoch": 0.7067137809187279,
535
- "grad_norm": 0.528878390789032,
536
- "kl": 0.551904296875,
537
  "learning_rate": 4.8225514017138205e-06,
538
- "loss": 0.0221,
539
- "reward": 0.1114583361428231,
540
- "reward_std": 0.15434924876317382,
541
- "rewards/accuracy_reward": 0.024739583930931987,
542
- "rewards/format_reward": 0.08671875244472176,
543
  "step": 200
544
  },
545
  {
546
  "epoch": 0.7067137809187279,
547
- "eval_completion_length": 569.0494995117188,
548
- "eval_kl": 0.56201171875,
549
- "eval_loss": 0.022066345438361168,
550
- "eval_reward": 0.1171875037252903,
551
- "eval_reward_std": 0.18042195588350296,
552
- "eval_rewards/accuracy_reward": 0.026041666977107525,
553
- "eval_rewards/format_reward": 0.0911458358168602,
554
- "eval_runtime": 71.3459,
555
- "eval_samples_per_second": 1.388,
556
- "eval_steps_per_second": 0.028,
557
  "step": 200
558
  },
559
  {
560
- "completion_length": 534.8948085784912,
561
  "epoch": 0.7243816254416962,
562
- "grad_norm": 0.43772202730178833,
563
- "kl": 0.576123046875,
564
  "learning_rate": 4.303702741201431e-06,
565
- "loss": 0.023,
566
- "reward": 0.10651041900273413,
567
- "reward_std": 0.1511918638832867,
568
- "rewards/accuracy_reward": 0.025260417209938168,
569
- "rewards/format_reward": 0.08125000228174031,
570
  "step": 205
571
  },
572
  {
573
- "completion_length": 533.3744941711426,
574
  "epoch": 0.7420494699646644,
575
- "grad_norm": 0.927462100982666,
576
- "kl": 0.59678955078125,
577
  "learning_rate": 3.8066325096949153e-06,
578
- "loss": 0.0239,
579
- "reward": 0.11250000314321369,
580
- "reward_std": 0.161026596929878,
581
- "rewards/accuracy_reward": 0.02656250053551048,
582
- "rewards/format_reward": 0.08593750209547579,
583
  "step": 210
584
  },
585
  {
586
- "completion_length": 536.620327758789,
587
  "epoch": 0.7597173144876325,
588
- "grad_norm": 0.38544711470603943,
589
- "kl": 0.5203857421875,
590
  "learning_rate": 3.3332411362372063e-06,
591
- "loss": 0.0208,
592
- "reward": 0.10703125295694918,
593
- "reward_std": 0.1591015163809061,
594
- "rewards/accuracy_reward": 0.023437500605359672,
595
- "rewards/format_reward": 0.0835937523515895,
596
  "step": 215
597
  },
598
  {
599
- "completion_length": 534.7442867279053,
600
  "epoch": 0.7773851590106007,
601
- "grad_norm": 0.37316420674324036,
602
- "kl": 0.4793701171875,
603
  "learning_rate": 2.8853385194256677e-06,
604
- "loss": 0.0192,
605
- "reward": 0.10937500353902578,
606
- "reward_std": 0.16291929073631764,
607
- "rewards/accuracy_reward": 0.025000000605359674,
608
- "rewards/format_reward": 0.08437500270083546,
609
  "step": 220
610
  },
611
  {
612
- "completion_length": 527.4945472717285,
613
  "epoch": 0.7950530035335689,
614
- "grad_norm": 0.4162587821483612,
615
- "kl": 0.5058837890625,
616
  "learning_rate": 2.464637107698046e-06,
617
- "loss": 0.0202,
618
- "reward": 0.10416667007375509,
619
- "reward_std": 0.1492991691455245,
620
- "rewards/accuracy_reward": 0.024739583814516665,
621
- "rewards/format_reward": 0.07942708535119891,
622
  "step": 225
623
  },
624
  {
625
- "completion_length": 532.8752773284912,
626
  "epoch": 0.8127208480565371,
627
- "grad_norm": 0.40629276633262634,
628
- "kl": 0.521826171875,
629
  "learning_rate": 2.072745352195794e-06,
630
- "loss": 0.0209,
631
- "reward": 0.10807291981764137,
632
- "reward_std": 0.15311694331467152,
633
- "rewards/accuracy_reward": 0.02213541716337204,
634
- "rewards/format_reward": 0.08593750256113708,
635
  "step": 230
636
  },
637
  {
638
- "completion_length": 528.4742362976074,
639
  "epoch": 0.8303886925795053,
640
- "grad_norm": 0.3422054350376129,
641
- "kl": 0.562890625,
642
  "learning_rate": 1.7111615572361628e-06,
643
- "loss": 0.0225,
644
- "reward": 0.10833333611954003,
645
- "reward_std": 0.15480030244216322,
646
- "rewards/accuracy_reward": 0.027343750465661287,
647
- "rewards/format_reward": 0.08098958558402955,
648
  "step": 235
649
  },
650
  {
651
- "completion_length": 534.3093910217285,
652
  "epoch": 0.8480565371024735,
653
- "grad_norm": 0.28931647539138794,
654
- "kl": 0.5560546875,
655
  "learning_rate": 1.381268151904298e-06,
656
- "loss": 0.0222,
657
- "reward": 0.10286458667833358,
658
- "reward_std": 0.15053147487342358,
659
- "rewards/accuracy_reward": 0.023437500605359672,
660
- "rewards/format_reward": 0.07942708572372795,
661
  "step": 240
662
  },
663
  {
664
- "completion_length": 544.1958484649658,
665
  "epoch": 0.8657243816254417,
666
- "grad_norm": 0.5518237352371216,
667
- "kl": 0.5116455078125,
668
  "learning_rate": 1.0843264046665558e-06,
669
- "loss": 0.0205,
670
- "reward": 0.10651042016688735,
671
- "reward_std": 0.15471182586625218,
672
- "rewards/accuracy_reward": 0.02213541720993817,
673
- "rewards/format_reward": 0.08437500256113709,
674
  "step": 245
675
  },
676
  {
677
- "completion_length": 538.685956954956,
678
  "epoch": 0.8833922261484098,
679
- "grad_norm": 0.4649152159690857,
680
- "kl": 0.5036376953125,
681
  "learning_rate": 8.214716012124491e-07,
682
- "loss": 0.0201,
683
- "reward": 0.1078125033993274,
684
- "reward_std": 0.15783682689070702,
685
- "rewards/accuracy_reward": 0.027083333837799727,
686
- "rewards/format_reward": 0.0807291688863188,
687
  "step": 250
688
  },
689
  {
690
- "completion_length": 534.5523597717286,
691
  "epoch": 0.901060070671378,
692
- "grad_norm": 0.37897783517837524,
693
- "kl": 0.5443115234375,
694
  "learning_rate": 5.937087039615619e-07,
695
- "loss": 0.0218,
696
- "reward": 0.10833333663176745,
697
- "reward_std": 0.15188463851809503,
698
- "rewards/accuracy_reward": 0.031770834187045696,
699
- "rewards/format_reward": 0.07656250230502337,
700
  "step": 255
701
  },
702
  {
703
- "completion_length": 534.8096504211426,
704
  "epoch": 0.9187279151943463,
705
- "grad_norm": 0.3674253523349762,
706
- "kl": 0.5176025390625,
707
  "learning_rate": 4.019085098303077e-07,
708
- "loss": 0.0207,
709
- "reward": 0.11614583698101341,
710
- "reward_std": 0.16189632220193745,
711
- "rewards/accuracy_reward": 0.033072917419485745,
712
- "rewards/format_reward": 0.08307291870005429,
713
  "step": 260
714
  },
715
  {
716
- "completion_length": 536.2896011352539,
717
  "epoch": 0.9363957597173145,
718
- "grad_norm": 0.42442551255226135,
719
- "kl": 0.5372802734375,
720
  "learning_rate": 2.4680432094837394e-07,
721
- "loss": 0.0215,
722
- "reward": 0.11093750363215804,
723
- "reward_std": 0.1559441326186061,
724
- "rewards/accuracy_reward": 0.02708333395421505,
725
- "rewards/format_reward": 0.0838541690260172,
726
  "step": 265
727
  },
728
  {
729
- "completion_length": 542.7255397796631,
730
  "epoch": 0.9540636042402827,
731
- "grad_norm": 0.4530044496059418,
732
- "kl": 0.5424072265625,
733
  "learning_rate": 1.289891410535593e-07,
734
- "loss": 0.0217,
735
- "reward": 0.09973958623595536,
736
- "reward_std": 0.14105932042002678,
737
- "rewards/accuracy_reward": 0.021614583884365858,
738
- "rewards/format_reward": 0.07812500209547579,
739
  "step": 270
740
  },
741
  {
742
- "completion_length": 539.1974124908447,
743
  "epoch": 0.9717314487632509,
744
- "grad_norm": 0.3238506019115448,
745
- "kl": 0.5445068359375,
746
  "learning_rate": 4.8913408283934874e-08,
747
- "loss": 0.0218,
748
- "reward": 0.09322916918899864,
749
- "reward_std": 0.13790193693712355,
750
- "rewards/accuracy_reward": 0.024479167140088975,
751
- "rewards/format_reward": 0.0687500016996637,
752
  "step": 275
753
  },
754
  {
755
- "completion_length": 545.6544456481934,
756
  "epoch": 0.9893992932862191,
757
- "grad_norm": 7.467169284820557,
758
- "kl": 0.6452880859375,
759
  "learning_rate": 6.883273035447335e-09,
760
- "loss": 0.0258,
761
- "reward": 0.10755208674818277,
762
- "reward_std": 0.15865046214312314,
763
- "rewards/accuracy_reward": 0.02864583404734731,
764
- "rewards/format_reward": 0.07890625216532499,
765
  "step": 280
766
  },
767
  {
768
- "completion_length": 522.6118488311768,
769
  "epoch": 1.0,
770
- "kl": 0.5454915364583334,
771
- "reward": 0.10503472515847534,
772
- "reward_std": 0.1531572287591795,
773
- "rewards/accuracy_reward": 0.026475695117066305,
774
- "rewards/format_reward": 0.07855903015782435,
775
  "step": 283,
776
  "total_flos": 0.0,
777
- "train_loss": 0.1736428747392735,
778
- "train_runtime": 54182.9853,
779
- "train_samples_per_second": 1.337,
780
- "train_steps_per_second": 0.005
781
  }
782
  ],
783
  "logging_steps": 5,
 
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "completion_length": 404.396875,
13
  "epoch": 0.0176678445229682,
14
+ "grad_norm": 0.785922280854803,
15
+ "kl": 0.00035384297370910645,
16
  "learning_rate": 3.448275862068966e-06,
17
+ "loss": 0.0,
18
+ "reward": 0.5984375,
19
+ "reward_std": 0.31488348115235565,
20
+ "rewards/accuracy_reward": 0.166015625,
21
+ "rewards/format_reward": 0.432421875,
22
  "step": 5
23
  },
24
  {
25
+ "completion_length": 410.702734375,
26
  "epoch": 0.0353356890459364,
27
+ "grad_norm": 0.3143792157068426,
28
+ "kl": 0.13354759216308593,
29
  "learning_rate": 6.896551724137932e-06,
30
+ "loss": 0.0054,
31
+ "reward": 0.57265625,
32
+ "reward_std": 0.30052037397399545,
33
+ "rewards/accuracy_reward": 0.160546875,
34
+ "rewards/format_reward": 0.412109375,
35
  "step": 10
36
  },
37
  {
38
+ "completion_length": 400.45234375,
39
  "epoch": 0.053003533568904596,
40
+ "grad_norm": 0.6724373940493742,
41
+ "kl": 0.024787521362304686,
42
  "learning_rate": 1.0344827586206898e-05,
43
+ "loss": 0.001,
44
+ "reward": 0.594921875,
45
+ "reward_std": 0.3065970725379884,
46
+ "rewards/accuracy_reward": 0.153515625,
47
+ "rewards/format_reward": 0.44140625,
48
  "step": 15
49
  },
50
  {
51
+ "completion_length": 395.24921875,
52
  "epoch": 0.0706713780918728,
53
+ "grad_norm": 0.6026471879082274,
54
+ "kl": 0.3488151550292969,
55
  "learning_rate": 1.3793103448275863e-05,
56
+ "loss": 0.0139,
57
+ "reward": 0.613671875,
58
+ "reward_std": 0.3264844523742795,
59
+ "rewards/accuracy_reward": 0.168359375,
60
+ "rewards/format_reward": 0.4453125,
61
  "step": 20
62
  },
63
  {
64
+ "completion_length": 398.496484375,
65
  "epoch": 0.08833922261484099,
66
+ "grad_norm": 0.46593386880605636,
67
+ "kl": 0.1805694580078125,
68
  "learning_rate": 1.7241379310344828e-05,
69
+ "loss": 0.0072,
70
+ "reward": 0.61484375,
71
+ "reward_std": 0.2883669765666127,
72
+ "rewards/accuracy_reward": 0.158203125,
73
+ "rewards/format_reward": 0.456640625,
74
  "step": 25
75
  },
76
  {
77
+ "completion_length": 401.319921875,
78
  "epoch": 0.10600706713780919,
79
+ "grad_norm": 0.5247335452215227,
80
+ "kl": 0.145611572265625,
81
  "learning_rate": 1.999923511388017e-05,
82
+ "loss": 0.0058,
83
+ "reward": 0.61328125,
84
+ "reward_std": 0.2905766863375902,
85
+ "rewards/accuracy_reward": 0.16484375,
86
+ "rewards/format_reward": 0.4484375,
87
  "step": 30
88
  },
89
  {
90
+ "completion_length": 402.28671875,
91
  "epoch": 0.12367491166077739,
92
+ "grad_norm": 1.3201550928400456,
93
+ "kl": 0.15986328125,
94
  "learning_rate": 1.9972476383747748e-05,
95
+ "loss": 0.0064,
96
+ "reward": 0.591796875,
97
+ "reward_std": 0.29886309332214295,
98
+ "rewards/accuracy_reward": 0.1515625,
99
+ "rewards/format_reward": 0.440234375,
100
  "step": 35
101
  },
102
  {
103
+ "completion_length": 400.734765625,
104
  "epoch": 0.1413427561837456,
105
+ "grad_norm": 0.4543361752537872,
106
+ "kl": 0.293060302734375,
107
  "learning_rate": 1.9907590277344582e-05,
108
+ "loss": 0.0117,
109
+ "reward": 0.60703125,
110
+ "reward_std": 0.31156891826540234,
111
+ "rewards/accuracy_reward": 0.166015625,
112
+ "rewards/format_reward": 0.441015625,
113
  "step": 40
114
  },
115
  {
116
+ "completion_length": 402.326953125,
117
  "epoch": 0.15901060070671377,
118
+ "grad_norm": 1.8083450201171443,
119
+ "kl": 0.539990234375,
120
  "learning_rate": 1.9804824871166254e-05,
121
+ "loss": 0.0216,
122
+ "reward": 0.59609375,
123
+ "reward_std": 0.3016252293717116,
124
+ "rewards/accuracy_reward": 0.159765625,
125
+ "rewards/format_reward": 0.436328125,
126
  "step": 45
127
  },
128
  {
129
+ "completion_length": 399.735546875,
130
  "epoch": 0.17667844522968199,
131
+ "grad_norm": 0.6440098529655808,
132
+ "kl": 36.3084228515625,
133
  "learning_rate": 1.9664573064143604e-05,
134
+ "loss": 1.4539,
135
+ "reward": 0.6109375,
136
+ "reward_std": 0.3060446453746408,
137
+ "rewards/accuracy_reward": 0.162109375,
138
+ "rewards/format_reward": 0.448828125,
139
  "step": 50
140
  },
141
  {
142
+ "completion_length": 396.709375,
143
  "epoch": 0.19434628975265017,
144
+ "grad_norm": 0.949235195273066,
145
+ "kl": 18740.40302734375,
146
  "learning_rate": 1.948737107548771e-05,
147
+ "loss": 748.2533,
148
+ "reward": 0.61640625,
149
+ "reward_std": 0.3193028993904591,
150
+ "rewards/accuracy_reward": 0.165625,
151
+ "rewards/format_reward": 0.45078125,
152
  "step": 55
153
  },
154
  {
155
+ "completion_length": 410.02734375,
156
  "epoch": 0.21201413427561838,
157
+ "grad_norm": 2.007276336933817,
158
+ "kl": 1.0031005859375,
159
  "learning_rate": 1.9273896394584103e-05,
160
+ "loss": 0.0401,
161
+ "reward": 0.59609375,
162
+ "reward_std": 0.2949961026199162,
163
+ "rewards/accuracy_reward": 0.148828125,
164
+ "rewards/format_reward": 0.447265625,
165
  "step": 60
166
  },
167
  {
168
+ "completion_length": 405.175390625,
169
  "epoch": 0.22968197879858657,
170
+ "grad_norm": 0.7554540760672372,
171
+ "kl": 1.448046875,
172
  "learning_rate": 1.9024965190774262e-05,
173
+ "loss": 0.0579,
174
+ "reward": 0.60546875,
175
+ "reward_std": 0.297205812856555,
176
+ "rewards/accuracy_reward": 0.15546875,
177
+ "rewards/format_reward": 0.45,
178
  "step": 65
179
  },
180
  {
181
+ "completion_length": 403.630859375,
182
  "epoch": 0.24734982332155478,
183
+ "grad_norm": 0.7209308900049199,
184
+ "kl": 19355.033984375,
185
  "learning_rate": 1.8741529192927528e-05,
186
+ "loss": 776.0688,
187
+ "reward": 0.6125,
188
+ "reward_std": 0.30714950021356346,
189
+ "rewards/accuracy_reward": 0.165234375,
190
+ "rewards/format_reward": 0.447265625,
191
  "step": 70
192
  },
193
  {
194
+ "completion_length": 398.434375,
195
  "epoch": 0.26501766784452296,
196
+ "grad_norm": 0.5881139318633425,
197
+ "kl": 1.331982421875,
198
  "learning_rate": 1.8424672050733577e-05,
199
+ "loss": 0.0533,
200
+ "reward": 0.61484375,
201
+ "reward_std": 0.2994155207648873,
202
+ "rewards/accuracy_reward": 0.158203125,
203
+ "rewards/format_reward": 0.456640625,
204
  "step": 75
205
  },
206
  {
207
+ "completion_length": 408.6515625,
208
  "epoch": 0.2826855123674912,
209
+ "grad_norm": 2.6274469054608143,
210
+ "kl": 1.180029296875,
211
  "learning_rate": 1.8075605191627242e-05,
212
+ "loss": 0.0472,
213
+ "reward": 0.607421875,
214
+ "reward_std": 0.2811854241415858,
215
+ "rewards/accuracy_reward": 0.164453125,
216
+ "rewards/format_reward": 0.44296875,
217
  "step": 80
218
  },
219
  {
220
+ "completion_length": 413.64296875,
221
  "epoch": 0.3003533568904594,
222
+ "grad_norm": 0.6738788725405221,
223
+ "kl": 11.340185546875,
224
  "learning_rate": 1.7695663189185703e-05,
225
+ "loss": 0.4546,
226
+ "reward": 0.601953125,
227
+ "reward_std": 0.30549221779219804,
228
+ "rewards/accuracy_reward": 0.179296875,
229
+ "rewards/format_reward": 0.42265625,
230
  "step": 85
231
  },
232
  {
233
+ "completion_length": 413.73359375,
234
  "epoch": 0.31802120141342755,
235
+ "grad_norm": 1.3105277713530405,
236
+ "kl": 1.447265625,
237
  "learning_rate": 1.7286298660705877e-05,
238
+ "loss": 0.0579,
239
+ "reward": 0.60390625,
240
+ "reward_std": 0.30604464691132305,
241
+ "rewards/accuracy_reward": 0.1609375,
242
+ "rewards/format_reward": 0.44296875,
243
  "step": 90
244
  },
245
  {
246
+ "completion_length": 404.303515625,
247
  "epoch": 0.33568904593639576,
248
+ "grad_norm": 0.7623390157660208,
249
+ "kl": 1.0364013671875,
250
  "learning_rate": 1.6849076713469914e-05,
251
+ "loss": 0.0415,
252
+ "reward": 0.601171875,
253
+ "reward_std": 0.2955485317390412,
254
+ "rewards/accuracy_reward": 0.16171875,
255
+ "rewards/format_reward": 0.439453125,
256
  "step": 95
257
  },
258
  {
259
+ "completion_length": 413.7109375,
260
  "epoch": 0.35335689045936397,
261
+ "grad_norm": 1.0600792188331805,
262
+ "kl": 2.1205810546875,
263
  "learning_rate": 1.6385668960932143e-05,
264
+ "loss": 0.0849,
265
+ "reward": 0.572265625,
266
+ "reward_std": 0.3088067832402885,
267
+ "rewards/accuracy_reward": 0.1546875,
268
+ "rewards/format_reward": 0.417578125,
269
  "step": 100
270
  },
271
  {
272
  "epoch": 0.35335689045936397,
273
+ "eval_completion_length": 405.1893997192383,
274
+ "eval_kl": 1.361328125,
275
+ "eval_loss": 0.05589722469449043,
276
+ "eval_reward": 0.61328125,
277
+ "eval_reward_std": 0.2927863895893097,
278
+ "eval_rewards/accuracy_reward": 0.140625,
279
+ "eval_rewards/format_reward": 0.47265625,
280
+ "eval_runtime": 62.5177,
281
+ "eval_samples_per_second": 1.584,
282
+ "eval_steps_per_second": 0.032,
283
  "step": 100
284
  },
285
  {
286
+ "completion_length": 397.241015625,
287
  "epoch": 0.3710247349823322,
288
+ "grad_norm": 1.7180085275135002,
289
+ "kl": 2.092529296875,
290
  "learning_rate": 1.5897847131705194e-05,
291
+ "loss": 0.0837,
292
+ "reward": 0.6046875,
293
+ "reward_std": 0.3170931892469525,
294
+ "rewards/accuracy_reward": 0.167578125,
295
+ "rewards/format_reward": 0.437109375,
296
  "step": 105
297
  },
298
  {
299
+ "completion_length": 403.58828125,
300
  "epoch": 0.38869257950530034,
301
+ "grad_norm": 0.534578040818149,
302
+ "kl": 1.238232421875,
303
  "learning_rate": 1.5387476295779737e-05,
304
+ "loss": 0.0495,
305
+ "reward": 0.597265625,
306
+ "reward_std": 0.31322619933635,
307
+ "rewards/accuracy_reward": 0.1671875,
308
+ "rewards/format_reward": 0.430078125,
309
  "step": 110
310
  },
311
  {
312
+ "completion_length": 408.934375,
313
  "epoch": 0.40636042402826855,
314
+ "grad_norm": 0.41472109596238,
315
+ "kl": 1.536083984375,
316
  "learning_rate": 1.4856507733875837e-05,
317
+ "loss": 0.0614,
318
+ "reward": 0.6015625,
319
+ "reward_std": 0.3082543543539941,
320
+ "rewards/accuracy_reward": 0.173828125,
321
+ "rewards/format_reward": 0.427734375,
322
  "step": 115
323
  },
324
  {
325
+ "completion_length": 393.684375,
326
  "epoch": 0.42402826855123676,
327
+ "grad_norm": 1.0671198220931404,
328
+ "kl": 1.219482421875,
329
  "learning_rate": 1.4306971477188223e-05,
330
+ "loss": 0.0488,
331
+ "reward": 0.602734375,
332
+ "reward_std": 0.31101649152114985,
333
+ "rewards/accuracy_reward": 0.15546875,
334
+ "rewards/format_reward": 0.447265625,
335
  "step": 120
336
  },
337
  {
338
+ "completion_length": 411.417578125,
339
  "epoch": 0.4416961130742049,
340
+ "grad_norm": 0.46043384834399087,
341
+ "kl": 1.910693359375,
342
  "learning_rate": 1.3740968546047935e-05,
343
+ "loss": 0.0764,
344
+ "reward": 0.60390625,
345
+ "reward_std": 0.29941552053205667,
346
+ "rewards/accuracy_reward": 0.16328125,
347
+ "rewards/format_reward": 0.440625,
348
  "step": 125
349
  },
350
  {
351
+ "completion_length": 397.962890625,
352
  "epoch": 0.45936395759717313,
353
+ "grad_norm": 0.6111314907455481,
354
+ "kl": 0.9618408203125,
355
  "learning_rate": 1.3160662917174045e-05,
356
+ "loss": 0.0385,
357
+ "reward": 0.585546875,
358
+ "reward_std": 0.31212134528905155,
359
+ "rewards/accuracy_reward": 0.14765625,
360
+ "rewards/format_reward": 0.437890625,
361
  "step": 130
362
  },
363
  {
364
+ "completion_length": 401.703515625,
365
  "epoch": 0.47703180212014135,
366
+ "grad_norm": 1.4967183682261056,
367
+ "kl": 1.99873046875,
368
  "learning_rate": 1.2568273250226681e-05,
369
+ "loss": 0.08,
370
+ "reward": 0.61171875,
371
+ "reward_std": 0.2927863945718855,
372
+ "rewards/accuracy_reward": 0.1671875,
373
+ "rewards/format_reward": 0.44453125,
374
  "step": 135
375
  },
376
  {
377
+ "completion_length": 406.8,
378
  "epoch": 0.49469964664310956,
379
+ "grad_norm": 0.917377230805497,
380
+ "kl": 1.26953125,
381
  "learning_rate": 1.1966064405292887e-05,
382
+ "loss": 0.0508,
383
+ "reward": 0.59765625,
384
+ "reward_std": 0.31377862663939593,
385
+ "rewards/accuracy_reward": 0.168359375,
386
+ "rewards/format_reward": 0.429296875,
387
  "step": 140
388
  },
389
  {
390
+ "completion_length": 391.3640625,
391
  "epoch": 0.5123674911660777,
392
+ "grad_norm": 0.5259506143043062,
393
+ "kl": 1.326171875,
394
  "learning_rate": 1.1356338783736256e-05,
395
+ "loss": 0.053,
396
+ "reward": 0.6125,
397
+ "reward_std": 0.28615726907737554,
398
+ "rewards/accuracy_reward": 0.166015625,
399
+ "rewards/format_reward": 0.446484375,
400
  "step": 145
401
  },
402
  {
403
+ "completion_length": 397.0296875,
404
  "epoch": 0.5300353356890459,
405
+ "grad_norm": 0.6823442998587644,
406
+ "kl": 1.138134765625,
407
  "learning_rate": 1.0741427525516463e-05,
408
+ "loss": 0.0455,
409
+ "reward": 0.6140625,
410
+ "reward_std": 0.2828427059110254,
411
+ "rewards/accuracy_reward": 0.168359375,
412
+ "rewards/format_reward": 0.445703125,
413
  "step": 150
414
  },
415
  {
416
+ "completion_length": 407.55390625,
417
  "epoch": 0.5477031802120141,
418
+ "grad_norm": 5.1945886494720215,
419
+ "kl": 1.53837890625,
420
  "learning_rate": 1.012368159663363e-05,
421
+ "loss": 0.0615,
422
+ "reward": 0.57109375,
423
+ "reward_std": 0.3038349375128746,
424
+ "rewards/accuracy_reward": 0.14375,
425
+ "rewards/format_reward": 0.42734375,
426
  "step": 155
427
  },
428
  {
429
+ "completion_length": 401.918359375,
430
  "epoch": 0.5653710247349824,
431
+ "grad_norm": 0.4307285006305928,
432
+ "kl": 1.49169921875,
433
  "learning_rate": 9.505462800772612e-06,
434
+ "loss": 0.0597,
435
+ "reward": 0.60546875,
436
+ "reward_std": 0.3049397937953472,
437
+ "rewards/accuracy_reward": 0.1625,
438
+ "rewards/format_reward": 0.44296875,
439
  "step": 160
440
  },
441
  {
442
+ "completion_length": 402.686328125,
443
  "epoch": 0.5830388692579506,
444
+ "grad_norm": 0.49030947447366735,
445
+ "kl": 1.36279296875,
446
  "learning_rate": 8.889134749511956e-06,
447
+ "loss": 0.0545,
448
+ "reward": 0.590625,
449
+ "reward_std": 0.3038349383510649,
450
+ "rewards/accuracy_reward": 0.153125,
451
+ "rewards/format_reward": 0.4375,
452
  "step": 165
453
  },
454
  {
455
+ "completion_length": 405.9875,
456
  "epoch": 0.6007067137809188,
457
+ "grad_norm": 0.4352892915737104,
458
+ "kl": 1.3828125,
459
  "learning_rate": 8.277053825620836e-06,
460
+ "loss": 0.0553,
461
+ "reward": 0.61328125,
462
+ "reward_std": 0.3027300829067826,
463
+ "rewards/accuracy_reward": 0.1546875,
464
+ "rewards/format_reward": 0.45859375,
465
  "step": 170
466
  },
467
  {
468
+ "completion_length": 401.63359375,
469
  "epoch": 0.6183745583038869,
470
+ "grad_norm": 0.8900367662135797,
471
+ "kl": 1.16708984375,
472
  "learning_rate": 7.671560173993588e-06,
473
+ "loss": 0.0467,
474
+ "reward": 0.615625,
475
+ "reward_std": 0.3038349374197423,
476
+ "rewards/accuracy_reward": 0.18046875,
477
+ "rewards/format_reward": 0.43515625,
478
  "step": 175
479
  },
480
  {
481
+ "completion_length": 399.348828125,
482
  "epoch": 0.6360424028268551,
483
+ "grad_norm": 0.8123164664250848,
484
+ "kl": 1.635009765625,
485
  "learning_rate": 7.07496875466589e-06,
486
+ "loss": 0.0654,
487
+ "reward": 0.61640625,
488
+ "reward_std": 0.29720581048168243,
489
+ "rewards/accuracy_reward": 0.1640625,
490
+ "rewards/format_reward": 0.45234375,
491
  "step": 180
492
  },
493
  {
494
+ "completion_length": 401.88515625,
495
  "epoch": 0.6537102473498233,
496
+ "grad_norm": 0.5468048622490435,
497
+ "kl": 1.2939208984375,
498
  "learning_rate": 6.489560492119225e-06,
499
+ "loss": 0.0518,
500
+ "reward": 0.607421875,
501
+ "reward_std": 0.3054922170005739,
502
+ "rewards/accuracy_reward": 0.15546875,
503
+ "rewards/format_reward": 0.451953125,
504
  "step": 185
505
  },
506
  {
507
+ "completion_length": 397.83359375,
508
  "epoch": 0.6713780918727915,
509
+ "grad_norm": 0.7244711255317006,
510
+ "kl": 1.36591796875,
511
  "learning_rate": 5.9175735547120975e-06,
512
+ "loss": 0.0546,
513
+ "reward": 0.6109375,
514
+ "reward_std": 0.3181980476714671,
515
+ "rewards/accuracy_reward": 0.155859375,
516
+ "rewards/format_reward": 0.455078125,
517
  "step": 190
518
  },
519
  {
520
+ "completion_length": 413.570703125,
521
  "epoch": 0.6890459363957597,
522
+ "grad_norm": 0.42585359101119413,
523
+ "kl": 1.764208984375,
524
  "learning_rate": 5.361194797579108e-06,
525
+ "loss": 0.0706,
526
+ "reward": 0.6046875,
527
+ "reward_std": 0.3093592093326151,
528
+ "rewards/accuracy_reward": 0.1671875,
529
+ "rewards/format_reward": 0.4375,
530
  "step": 195
531
  },
532
  {
533
+ "completion_length": 399.438671875,
534
  "epoch": 0.7067137809187279,
535
+ "grad_norm": 0.47550986420528907,
536
+ "kl": 1.2271484375,
537
  "learning_rate": 4.8225514017138205e-06,
538
+ "loss": 0.0491,
539
+ "reward": 0.60546875,
540
+ "reward_std": 0.3038349355570972,
541
+ "rewards/accuracy_reward": 0.1578125,
542
+ "rewards/format_reward": 0.44765625,
543
  "step": 200
544
  },
545
  {
546
  "epoch": 0.7067137809187279,
547
+ "eval_completion_length": 383.06652069091797,
548
+ "eval_kl": 1.0625,
549
+ "eval_loss": 0.043225545436143875,
550
+ "eval_reward": 0.6328125,
551
+ "eval_reward_std": 0.2651650458574295,
552
+ "eval_rewards/accuracy_reward": 0.16015625,
553
+ "eval_rewards/format_reward": 0.47265625,
554
+ "eval_runtime": 64.1406,
555
+ "eval_samples_per_second": 1.543,
556
+ "eval_steps_per_second": 0.031,
557
  "step": 200
558
  },
559
  {
560
+ "completion_length": 410.916796875,
561
  "epoch": 0.7243816254416962,
562
+ "grad_norm": 0.4152907308781029,
563
+ "kl": 1.306787109375,
564
  "learning_rate": 4.303702741201431e-06,
565
+ "loss": 0.0523,
566
+ "reward": 0.605859375,
567
+ "reward_std": 0.29554852955043315,
568
+ "rewards/accuracy_reward": 0.162890625,
569
+ "rewards/format_reward": 0.44296875,
570
  "step": 205
571
  },
572
  {
573
+ "completion_length": 420.941796875,
574
  "epoch": 0.7420494699646644,
575
+ "grad_norm": 0.3864466357905039,
576
+ "kl": 1.347509765625,
577
  "learning_rate": 3.8066325096949153e-06,
578
+ "loss": 0.0539,
579
+ "reward": 0.594140625,
580
+ "reward_std": 0.3010728007182479,
581
+ "rewards/accuracy_reward": 0.16484375,
582
+ "rewards/format_reward": 0.429296875,
583
  "step": 210
584
  },
585
  {
586
+ "completion_length": 405.008984375,
587
  "epoch": 0.7597173144876325,
588
+ "grad_norm": 0.4071374966900544,
589
+ "kl": 1.3837890625,
590
  "learning_rate": 3.3332411362372063e-06,
591
+ "loss": 0.0554,
592
+ "reward": 0.5796875,
593
+ "reward_std": 0.31156891863793135,
594
+ "rewards/accuracy_reward": 0.155859375,
595
+ "rewards/format_reward": 0.423828125,
596
  "step": 215
597
  },
598
  {
599
+ "completion_length": 400.690234375,
600
  "epoch": 0.7773851590106007,
601
+ "grad_norm": 0.48756085064617083,
602
+ "kl": 1.277734375,
603
  "learning_rate": 2.8853385194256677e-06,
604
+ "loss": 0.0511,
605
+ "reward": 0.6046875,
606
+ "reward_std": 0.31488348012790085,
607
+ "rewards/accuracy_reward": 0.173046875,
608
+ "rewards/format_reward": 0.431640625,
609
  "step": 220
610
  },
611
  {
612
+ "completion_length": 418.2671875,
613
  "epoch": 0.7950530035335689,
614
+ "grad_norm": 0.37777218952387104,
615
+ "kl": 1.430517578125,
616
  "learning_rate": 2.464637107698046e-06,
617
+ "loss": 0.0573,
618
+ "reward": 0.59375,
619
+ "reward_std": 0.312673770962283,
620
+ "rewards/accuracy_reward": 0.169140625,
621
+ "rewards/format_reward": 0.424609375,
622
  "step": 225
623
  },
624
  {
625
+ "completion_length": 409.259375,
626
  "epoch": 0.8127208480565371,
627
+ "grad_norm": 0.5380959008484876,
628
+ "kl": 1.55126953125,
629
  "learning_rate": 2.072745352195794e-06,
630
+ "loss": 0.0621,
631
+ "reward": 0.5984375,
632
+ "reward_std": 0.2938912484794855,
633
+ "rewards/accuracy_reward": 0.17734375,
634
+ "rewards/format_reward": 0.42109375,
635
  "step": 230
636
  },
637
  {
638
+ "completion_length": 408.725,
639
  "epoch": 0.8303886925795053,
640
+ "grad_norm": 0.6087277900031041,
641
+ "kl": 1.276611328125,
642
  "learning_rate": 1.7111615572361628e-06,
643
+ "loss": 0.0511,
644
+ "reward": 0.608203125,
645
+ "reward_std": 0.32096018167212603,
646
+ "rewards/accuracy_reward": 0.166015625,
647
+ "rewards/format_reward": 0.4421875,
648
  "step": 235
649
  },
650
  {
651
+ "completion_length": 408.4484375,
652
  "epoch": 0.8480565371024735,
653
+ "grad_norm": 0.40330618192163126,
654
+ "kl": 1.2666015625,
655
  "learning_rate": 1.381268151904298e-06,
656
+ "loss": 0.0507,
657
+ "reward": 0.591015625,
658
+ "reward_std": 0.2999679483473301,
659
+ "rewards/accuracy_reward": 0.1625,
660
+ "rewards/format_reward": 0.428515625,
661
  "step": 240
662
  },
663
  {
664
+ "completion_length": 411.65390625,
665
  "epoch": 0.8657243816254417,
666
+ "grad_norm": 0.5432070185726592,
667
+ "kl": 1.262109375,
668
  "learning_rate": 1.0843264046665558e-06,
669
+ "loss": 0.0505,
670
+ "reward": 0.5953125,
671
+ "reward_std": 0.3016252295579761,
672
+ "rewards/accuracy_reward": 0.15703125,
673
+ "rewards/format_reward": 0.43828125,
674
  "step": 245
675
  },
676
  {
677
+ "completion_length": 409.023046875,
678
  "epoch": 0.8833922261484098,
679
+ "grad_norm": 0.3703261485488551,
680
+ "kl": 1.469970703125,
681
  "learning_rate": 8.214716012124491e-07,
682
+ "loss": 0.0588,
683
+ "reward": 0.605078125,
684
+ "reward_std": 0.3021776580251753,
685
+ "rewards/accuracy_reward": 0.16484375,
686
+ "rewards/format_reward": 0.440234375,
687
  "step": 250
688
  },
689
  {
690
+ "completion_length": 407.982421875,
691
  "epoch": 0.901060070671378,
692
+ "grad_norm": 0.48718776424035376,
693
+ "kl": 1.395263671875,
694
  "learning_rate": 5.937087039615619e-07,
695
+ "loss": 0.0558,
696
+ "reward": 0.61328125,
697
+ "reward_std": 0.2861572677269578,
698
+ "rewards/accuracy_reward": 0.171875,
699
+ "rewards/format_reward": 0.44140625,
700
  "step": 255
701
  },
702
  {
703
+ "completion_length": 399.203125,
704
  "epoch": 0.9187279151943463,
705
+ "grad_norm": 0.4674025132985681,
706
+ "kl": 1.384521484375,
707
  "learning_rate": 4.019085098303077e-07,
708
+ "loss": 0.0554,
709
+ "reward": 0.599609375,
710
+ "reward_std": 0.30438736486248674,
711
+ "rewards/accuracy_reward": 0.154296875,
712
+ "rewards/format_reward": 0.4453125,
713
  "step": 260
714
  },
715
  {
716
+ "completion_length": 407.852734375,
717
  "epoch": 0.9363957597173145,
718
+ "grad_norm": 0.518993833517968,
719
+ "kl": 1.352685546875,
720
  "learning_rate": 2.4680432094837394e-07,
721
+ "loss": 0.0541,
722
+ "reward": 0.5921875,
723
+ "reward_std": 0.2861572689376771,
724
+ "rewards/accuracy_reward": 0.1609375,
725
+ "rewards/format_reward": 0.43125,
726
  "step": 265
727
  },
728
  {
729
+ "completion_length": 418.2390625,
730
  "epoch": 0.9540636042402827,
731
+ "grad_norm": 0.41910418773784364,
732
+ "kl": 1.345751953125,
733
  "learning_rate": 1.289891410535593e-07,
734
+ "loss": 0.0538,
735
+ "reward": 0.598828125,
736
+ "reward_std": 0.3209601787850261,
737
+ "rewards/accuracy_reward": 0.168359375,
738
+ "rewards/format_reward": 0.43046875,
739
  "step": 270
740
  },
741
  {
742
+ "completion_length": 403.1046875,
743
  "epoch": 0.9717314487632509,
744
+ "grad_norm": 0.5629306163631225,
745
+ "kl": 1.38427734375,
746
  "learning_rate": 4.8913408283934874e-08,
747
+ "loss": 0.0554,
748
+ "reward": 0.587109375,
749
+ "reward_std": 0.31101649068295956,
750
+ "rewards/accuracy_reward": 0.155078125,
751
+ "rewards/format_reward": 0.43203125,
752
  "step": 275
753
  },
754
  {
755
+ "completion_length": 396.14375,
756
  "epoch": 0.9893992932862191,
757
+ "grad_norm": 0.40077217392258757,
758
+ "kl": 1.292138671875,
759
  "learning_rate": 6.883273035447335e-09,
760
+ "loss": 0.0517,
761
+ "reward": 0.61484375,
762
+ "reward_std": 0.29610095573589207,
763
+ "rewards/accuracy_reward": 0.16953125,
764
+ "rewards/format_reward": 0.4453125,
765
  "step": 280
766
  },
767
  {
768
+ "completion_length": 388.8455181121826,
769
  "epoch": 1.0,
770
+ "kl": 1.29052734375,
771
+ "reward": 0.6243489583333334,
772
+ "reward_std": 0.3176456190024813,
773
+ "rewards/accuracy_reward": 0.17838541666666666,
774
+ "rewards/format_reward": 0.4459635416666667,
775
  "step": 283,
776
  "total_flos": 0.0,
777
+ "train_loss": 27.009561450669516,
778
+ "train_runtime": 47693.8258,
779
+ "train_samples_per_second": 1.519,
780
+ "train_steps_per_second": 0.006
781
  }
782
  ],
783
  "logging_steps": 5,
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:3223c81b4d5af21a4323b99f7322808c9598dd9bb65b24cbea2cf188582b0bc5
3
  size 7544
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a571ea342950b818515612b20599050de613be01d10ca84f6715a795c3f31929
3
  size 7544