# Text-generation training configuration (from the `scaling` repository).
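# Optimizer: Adam-style settings in the GPT-3 tradition (beta2 = 0.95 rather than
# 0.999) with global gradient-norm clipping at 1.0. `zero: true` presumably enables
# ZeRO-style sharding of optimizer state across data-parallel ranks.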
optimizer:
  allreduce_bucket_size: 500000000
  beta1: 0.9
  beta2: 0.95
  debug_log: false
  eps: 1e-08
  gradient_clipping: 1.0
  zero: true
  zero_save_static: false
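# Parallelism and batching. With model and pipeline parallelism both at 1, the usual
# decomposition global = micro * grad_accum * data_parallel implies a data-parallel
# size of 1024 / (2 * 4) = 128 ranks (an inference from these values, not stated here).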
topology:
  activation_checkpointing_type: disabled
  global_batch_size: 1024
  gradient_accumulation_steps: 4
  micro_batch_size: 2
  model_parallel_size: 1
  pipe_parallel_size: 1
  pipe_partition_method: balanced
  pipe_partition_overwrite: null
  sequence_parallel: false
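# Token budget: 72,000 iterations * 1,024 sequences * 4,096 tokens per sequence
# comes to roughly 3.0e11 (~302B) tokens, assuming every step uses full-length sequences.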
trainer:
  seed: 42
  train_iterations: 72000
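# Presumably tolerates parameters that no training group covers, rather than erroring.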
training:
  allow_missing_params_in_optimizer: true
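# Single parameter group covering all weights: peak LR 6.0e-4 after 500 warmup steps,
# cosine-decayed over the full 72,000 iterations to a floor of 6.0e-5 (10% of peak),
# with weight decay 0.1.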
training_groups:
- group_name: param_group
  independent_weight_decay: false
  learning_rate_scheduler:
    learning_rate: 0.0006
    learning_rate_decay_iters: 72000
    learning_rate_decay_style: cosine
    learning_rate_minimum: 6e-05
    learning_rate_warmup_steps: 500
  parameters_exclude: null
  weight_decay: 0.1
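# Decoder-only transformer: 16 layers, hidden size 2048, 16 attention heads
# (head dim 2048 / 16 = 128), rotary embeddings over the full head dim, RMSNorm,
# FlashAttention kernel, bf16 precision. The SwiGLU factor 2.6640625 (~8/3) would
# give an MLP inner size of 2048 * 2.6640625 = 5456, assuming the factor multiplies
# hidden_size.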
transformer_architecture:
  attention_bias: false
  attention_num_kv_heads: null
  attention_qkv_in_one: true
  dropout_after_attention: 0.0
  dropout_after_mlp: 0.0
  dropout_attention_probs: 0.0
  dropout_embedding: 0.0
  dropout_image_encoder: 0.0
  hidden_size: 2048
  image_encoder: false
  key_query_norm: false
  layernorm:
    layernorm_epsilon: 1e-05
    optimization_type: torch
  local_attention_window_size: null
  masked_softmax:
    kernel: flash_attention
    scale: 1.0
    softmax_in_fp32: false
  mlp_bias: false
  mlp_factor: 2.6640625
  mlp_type: swiglu
  norm_type: rms
  num_attention_heads: 16
  num_layers: 16
  num_local_attention_heads: 0
  precision: bfloat16
  relative_position_embedding_type: rotary_complex
  reset_attention_mask: false
  reset_position_ids: false
  rotary_embedding_base: 10000
  rotary_percentage: 1.0
  sequence_length: 4096
  umup:
    enable: false
  vocab_file: null
  vocab_size: 65536
  weight_tying: false
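# Rough size estimate under the standard decoder-only layout (an approximation, not
# stated in this config): per layer, attention 4 * 2048^2 ~= 16.8M plus SwiGLU
# 3 * 2048 * 5456 ~= 33.5M, i.e. ~50.3M parameters; times 16 layers ~= 0.80B
# non-embedding parameters, plus untied input and output embeddings
# 2 * 65536 * 2048 ~= 0.27B, for ~1.07B total.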