# Base directories
prefix: /home/arqa39/.torchtune
output_dir: /home/arqa39/.torchtune/fed_ppo/mistral_7b

# Tokenizer
tokenizer:
  _component_: torchtune.models.mistral.mistral_tokenizer
  path: ${prefix}/models/Mistral-7B-Instruct-v0.2/tokenizer.model
  max_seq_len: null

# Dataset
dataset:
  _component_: torchtune.datasets.text_completion_dataset
  source: trl-internal-testing/sentiment-trl-style
  split: train
  column: prompt
  add_eos: false

# Policy model (LoRA on Mistral-7B-Instruct-v0.2)
policy:
  _component_: torchtune.models.mistral.lora_mistral_7b
  lora_attn_modules:
    - q_proj
    - k_proj
    - v_proj
    - output_proj
  apply_lora_to_mlp: true
  apply_lora_to_output: false
  lora_rank: 64
  lora_alpha: 16
  lora_dropout: 0.0
  quantize_base: false

# Value model (LoRA classifier head, num_classes = 1)
valmod:
  _component_: torchtune.models.mistral._component_builders.lora_mistral_classifier
  attn_dropout: 0.0
  embed_dim: 4096
  intermediate_dim: 14336
  max_seq_len: 32768
  norm_eps: 1.0e-05
  num_classes: 1
  num_heads: 32
  num_kv_heads: 8
  num_layers: 32
  vocab_size: 32001
  lora_attn_modules:
    - q_proj
    - k_proj
    - v_proj
    - output_proj
  apply_lora_to_mlp: true
  apply_lora_to_output: true
  lora_rank: 16
  lora_alpha: 32
  lora_dropout: 0.0
  quantize_base: false

# Policy checkpointing
checkpointer:
  _component_: torchtune.training.FullModelHFCheckpointer
  checkpoint_dir: ${prefix}/models/Mistral-7B-Instruct-v0.2/
  checkpoint_files:
    - pytorch_model-00001-of-00003.bin
    - pytorch_model-00002-of-00003.bin
    - pytorch_model-00003-of-00003.bin
  recipe_checkpoint: null
  output_dir: ${output_dir}/policy
  model_type: MISTRAL

# Value / reward model checkpointing
value_checkpointer:
  _component_: torchtune.training.FullModelHFCheckpointer
  checkpoint_dir: ${prefix}/models/RM-Mistral-7B/
  checkpoint_files:
    - model-00001-of-00003.safetensors
    - model-00002-of-00003.safetensors
    - model-00003-of-00003.safetensors
  output_dir: ${output_dir}/value
  model_type: REWARD

# Training
seed: 53710
shuffle: true
device: cuda
batch_size: 64
num_steps: 10000
ppo_epochs: 2
ppo_batch_size: 32
gradient_accumulation_steps: 1
compile: false

optimizer:
  _component_: bitsandbytes.optim.PagedAdamW
  lr: 0.0001
optimizer_in_bwd: true
log_peak_memory_stats: true
enable_activation_checkpointing: true
dtype: bf16

# Generation / PPO rollout
forward_batch_size: 16
max_generated_tokens: 58
temperature: 0.7
top_k: null
min_response_length: 18
penalise_no_eos: true
reward_penalty: -3
stop_token_ids:
  - 2
  - 28723
whiten_rewards: false
gamma: 1
lmbda: 0.95

# PPO loss
loss:
  _component_: torchtune.rlhf.loss.PPOLoss
  epsilon: 0.2
  value_coeff: 0.1
  value_clip_range: 0.2
  kl_coeff: 0.01

# Logging
wandb_logger:
  dir: ${prefix}
  entity: RADFAN
  project: FedPPO
  group: SelfReference
  name: Mistral-7B-LoRA-SelfRef-U13

log_every_n_steps: 1
update_ref_policy_every_n_steps: 13
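
# Usage sketch (assumption): torchtune configs like this one are normally launched
# with the `tune run` CLI, pointing --config at this file. The recipe name below is
# a placeholder for whichever PPO recipe this config was written against (it is not
# stated in the config itself), and individual fields can be overridden on the
# command line, e.g.:
#
#   tune run <fed_ppo_recipe> --config fed_ppo/mistral_7b.yaml batch_size=32 device=cuda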