data:
  tokenizer: null
  train_files: numina_ds_train_sample.parquet
  val_files: matheval.parquet
  prompt_key: prompt
  max_prompt_length: 1024
  max_response_length: 2048
  train_batch_size: 32
  val_batch_size: 640
  return_raw_input_ids: false
  return_raw_chat: false
  shuffle: true
  apply_chat_template: true
actor_rollout_ref:
  hybrid_engine: true
  model:
    path: Qwen/Qwen2.5-Math-7B
    revision: main
    external_lib: null
    override_config: {}
    enable_gradient_checkpointing: true
    use_remove_padding: false
    save_hf_repo_id: RyanYr/brm-numina-qwen2.5math-7B-base-lr5e-7constant-n4
  actor:
    loss: brm
    brm:
      norm_factor: value
    qlearn:
      use_replaybuffer: true
      replaybuffer_size: 32
      replaybuffer_sample_size: 32
      replaybuffer_update_size: 32
      replaybuffer_update_reward_threshold: 1.0
    strategy: fsdp
    ppo_mini_batch_size: 64
    ppo_micro_batch_size: null
    ppo_micro_batch_size_per_gpu: 4
    use_dynamic_bsz: false
    ppo_max_token_len_per_gpu: 16384
    grad_clip: 1.0
    clip_ratio: 0.2
    entropy_coeff: 0.001
    use_kl_loss: false
    kl_loss_coef: 0.001
    kl_loss_type: low_var_kl
    ppo_epochs: 1
    shuffle: false
    ulysses_sequence_parallel_size: 1
    optim:
      lr: 5.0e-07
      lr_warmup_steps_ratio: 0
      min_lr_ratio: null
      warmup_style: constant
      total_training_steps: 12496
    fsdp_config:
      wrap_policy:
        min_num_params: 0
      param_offload: false
      optimizer_offload: false
      fsdp_size: -1
  ref:
    fsdp_config:
      param_offload: false
      wrap_policy:
        min_num_params: 0
    log_prob_micro_batch_size: null
    log_prob_micro_batch_size_per_gpu: 4
    log_prob_use_dynamic_bsz: false
    log_prob_max_token_len_per_gpu: 16384
    ulysses_sequence_parallel_size: 1
  rollout:
    name: vllm
    temperature: 1.0
    top_k: -1
    top_p: 1
    prompt_length: 1024
    response_length: 2048
    dtype: bfloat16
    gpu_memory_utilization: 0.6
    ignore_eos: false
    enforce_eager: true
    free_cache_engine: true
    load_format: dummy_dtensor
    tensor_model_parallel_size: 4
    max_num_batched_tokens: 8192
    max_num_seqs: 1024
    log_prob_micro_batch_size: null
    log_prob_micro_batch_size_per_gpu: 4
    log_prob_use_dynamic_bsz: false
    log_prob_max_token_len_per_gpu: 16384
    disable_log_stats: true
    enable_chunked_prefill: true
    do_sample: true
    'n': 4
    seed: 42
reward_model:
  reward_manager: prime
  extra_rwfn_argdict:
    format_score: 0
    score: 1.0
algorithm:
  kl_penalty: kl
  kl_ctrl:
    type: fixed
    kl_coef: 0.1
trainer:
  total_epochs: 1
  total_training_steps: null
  project_name: value-LLM
  experiment_name: brm-numina-qwen2.5math-7B-base_lr5e-7constant-n4
  logger:
    - wandb
  val_generations_to_log_to_wandb: 0
  nnodes: 1
  n_gpus_per_node: 4
  save_freq: 80
  resume_mode: auto
  resume_from_path: false
  test_freq: 20
  critic_warmup: 0
  default_hdfs_dir: null
  remove_previous_ckpt_in_save: true
  del_local_ckpt_after_load: false
  default_local_dir: ./BRM
  hf_token: null
  resume_from_hf:
    enable: true
    hf_repo_id: RyanYr/brm-numina-qwen2.5math-7B-base-lr5e-7constant-n4
    hf_token: null
    revision: 1932c649200c975a4b3ae08f1ed04561da3736ff
  val_before_train: true
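For reference, below is a minimal sketch of loading and inspecting this config with OmegaConf (verl configs are Hydra/OmegaConf-based). The filename `brm_config.yaml` is a hypothetical placeholder for wherever the YAML above is saved, not part of the original run.

```python
# Minimal sketch: load and inspect the config above with OmegaConf.
# "brm_config.yaml" is a hypothetical filename (an assumption).
from omegaconf import OmegaConf

cfg = OmegaConf.load("brm_config.yaml")

# Nested keys resolve with attribute access.
assert cfg.actor_rollout_ref.model.path == "Qwen/Qwen2.5-Math-7B"
assert cfg.actor_rollout_ref.rollout.n == 4           # 4 rollouts per prompt
assert cfg.actor_rollout_ref.actor.optim.lr == 5e-07  # constant LR schedule

# Overrides merge the same way Hydra command-line overrides do.
cfg = OmegaConf.merge(cfg, OmegaConf.from_dotlist(["trainer.test_freq=10"]))
print(OmegaConf.to_yaml(cfg.trainer))
```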