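# torchtune config: PPO fine-tuning of Mistral-7B-Instruct-v0.2 with LoRA
# adapters, using a Mistral-7B reward model as the value/reward network.
# Paths under `prefix` point to local model checkpoints and are assumed to
# exist; adjust them for your environment.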
prefix: /home/arqa39/.torchtune
output_dir: /home/arqa39/.torchtune/fed_ppo/mistral_7b
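# Tokenizer and prompt dataset. `max_seq_len: null` leaves prompts untruncated;
# prompts are read from the `prompt` column of the TRL-style sentiment dataset.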
tokenizer:
  _component_: torchtune.models.mistral.mistral_tokenizer
  path: ${prefix}/models/Mistral-7B-Instruct-v0.2/tokenizer.model
  max_seq_len: null
dataset:
  _component_: torchtune.datasets.text_completion_dataset
  source: trl-internal-testing/sentiment-trl-style
  split: train
  column: prompt
  add_eos: false
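# Policy: Mistral-7B with rank-64 LoRA adapters on the attention projections
# and MLP; the base weights stay frozen and unquantized (quantize_base: false).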
policy:
  _component_: torchtune.models.mistral.lora_mistral_7b
  lora_attn_modules:
  - q_proj
  - k_proj
  - v_proj
  - output_proj
  apply_lora_to_mlp: true
  apply_lora_to_output: false
  lora_rank: 64
  lora_alpha: 16
  lora_dropout: 0.0
  quantize_base: false
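# Value model: a LoRA Mistral classifier head (num_classes: 1) that scores each
# sequence; architecture hyperparameters mirror Mistral-7B, and vocab_size
# 32001 presumably matches the reward-model checkpoint's extended vocabulary.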
valmod:
  _component_: torchtune.models.mistral._component_builders.lora_mistral_classifier
  attn_dropout: 0.0
  embed_dim: 4096
  intermediate_dim: 14336
  max_seq_len: 32768
  norm_eps: 1.0e-05
  num_classes: 1
  num_heads: 32
  num_kv_heads: 8
  num_layers: 32
  vocab_size: 32001
  lora_attn_modules:
  - q_proj
  - k_proj
  - v_proj
  - output_proj
  apply_lora_to_mlp: true
  apply_lora_to_output: true
  lora_rank: 16
  lora_alpha: 32
  lora_dropout: 0.0
  quantize_base: false
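# Checkpointers load HF-format weights: the instruct model for the policy and
# RM-Mistral-7B for the value/reward model; outputs are written to separate
# subdirectories under ${output_dir}.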
checkpointer:
  _component_: torchtune.training.FullModelHFCheckpointer
  checkpoint_dir: ${prefix}/models/Mistral-7B-Instruct-v0.2/
  checkpoint_files:
  - pytorch_model-00001-of-00003.bin
  - pytorch_model-00002-of-00003.bin
  - pytorch_model-00003-of-00003.bin
  recipe_checkpoint: null
  output_dir: ${output_dir}/policy
  model_type: MISTRAL
value_checkpointer:
  _component_: torchtune.training.FullModelHFCheckpointer
  checkpoint_dir: ${prefix}/models/RM-Mistral-7B/
  checkpoint_files:
  - model-00001-of-00003.safetensors
  - model-00002-of-00003.safetensors
  - model-00003-of-00003.safetensors
  output_dir: ${output_dir}/value
  model_type: REWARD
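# PPO loop: 10,000 steps, 64 prompts per rollout batch, each update running
# 2 PPO epochs over minibatches of 32 with no gradient accumulation.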
seed: 53710
shuffle: true
device: cuda
batch_size: 64
num_steps: 10000
ppo_epochs: 2
ppo_batch_size: 32
gradient_accumulation_steps: 1
compile: false
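# Memory-saving choices: paged AdamW from bitsandbytes, the optimizer step
# fused into the backward pass (optimizer_in_bwd), activation checkpointing,
# and bf16 compute.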
optimizer:
  _component_: bitsandbytes.optim.PagedAdamW
  lr: 0.0001
optimizer_in_bwd: true
log_peak_memory_stats: true
enable_activation_checkpointing: true
dtype: bf16
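# Rollout generation: batches of 16 prompts, up to 58 new tokens sampled at
# temperature 0.7. Responses that never emit a stop token receive
# reward_penalty; stop tokens are 2 (EOS) and 28723 (likely the "." token in
# the Mistral tokenizer).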
forward_batch_size: 16
max_generated_tokens: 58
temperature: 0.7
top_k: null
min_response_length: 18
penalise_no_eos: true
reward_penalty: -3
stop_token_ids:
- 2
- 28723
whiten_rewards: false
gamma: 1
lmbda: 0.95
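# Advantage estimation and PPO objective: GAE with gamma=1 and lambda=0.95,
# clipped policy/value losses (epsilon and value_clip_range of 0.2), and a
# KL penalty of 0.01 against the reference policy.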
loss:
  _component_: torchtune.rlhf.loss.PPOLoss
  epsilon: 0.2
  value_coeff: 0.1
  value_clip_range: 0.2
kl_coeff: 0.01
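# Weights & Biases run metadata; metrics are logged every step and the
# reference policy is updated every 13 steps (update_ref_policy_every_n_steps).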
wandb_logger:
  dir: ${prefix}
  entity: RADFAN
  project: FedPPO
  group: SelfReference
  name: Mistral-7B-LoRA-SelfRef-U13
log_every_n_steps: 1
update_ref_policy_every_n_steps: 13