|
import mesh_tensorflow.optimize
import mesh_tensorflow.transformer.dataset
import mesh_tensorflow.transformer.learning_rate_schedules
import mesh_tensorflow.transformer.t2t_vocabulary
import mesh_tensorflow.transformer.transformer_layers
import mesh_tensorflow.transformer.utils
import t5.data.sentencepiece_vocabulary
import t5.models.mesh_transformer

# Macros:
# ==============================================================================
d_ff = 2048
d_kv = 64
d_model = 512
dropout_rate = 0.1
num_heads = 8
num_layers = 6
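
# Note: these macros give the T5-small shape: num_heads * d_kv (8 * 64 = 512)
# equals d_model, so the concatenated attention heads project back to the
# model width, and d_ff = 2048 is the usual 4x feed-forward expansion.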

# Parameters for AdafactorOptimizer:
# ==============================================================================
AdafactorOptimizer.beta1 = 0.0
AdafactorOptimizer.clipping_threshold = 1.0
AdafactorOptimizer.decay_rate = None
AdafactorOptimizer.epsilon1 = 1e-30
AdafactorOptimizer.epsilon2 = 0.001
AdafactorOptimizer.factored = True
AdafactorOptimizer.min_dim_size_to_factor = 128
AdafactorOptimizer.multiply_by_parameter_scale = True
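
# With beta1 = 0.0, Adafactor keeps no first-moment (momentum) accumulator.
# factored = True stores row and column second-moment statistics instead of a
# full matrix for any weight dimension of at least min_dim_size_to_factor, and
# multiply_by_parameter_scale rescales each update by the RMS of the parameter
# it applies to. decay_rate = None falls back to the library's step-dependent
# default, of the form 1 - (step + 1)^-0.8 in the mesh_tensorflow versions we
# have seen.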

# Parameters for Bitransformer:
# ==============================================================================
Bitransformer.shared_embedding = True
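
# shared_embedding = True ties the encoder and decoder token-embedding tables,
# so both halves of the bitransformer share one vocabulary matrix.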

# Parameters for decoder/DenseReluDense:
# ==============================================================================
decoder/DenseReluDense.activation = 'relu'
decoder/DenseReluDense.dropout_rate = %dropout_rate
decoder/DenseReluDense.hidden_size = %d_ff

# Parameters for encoder/DenseReluDense:
# ==============================================================================
encoder/DenseReluDense.activation = 'relu'
encoder/DenseReluDense.dropout_rate = %dropout_rate
encoder/DenseReluDense.hidden_size = %d_ff
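
# Both feed-forward blocks compute, roughly,
#   y = dropout(relu(x @ W_i)) @ W_o
# with W_i projecting d_model -> d_ff and W_o projecting d_ff -> d_model; see
# mesh_tensorflow.transformer.transformer_layers.DenseReluDense for the exact
# implementation.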

# Parameters for get_variable_dtype:
# ==============================================================================
get_variable_dtype.activation_dtype = 'bfloat16'
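
# Computing activations in bfloat16 saves TPU memory and bandwidth; in the
# usual Mesh TensorFlow setup the master copies of the variables stay in
# float32, so optimizer updates keep full precision.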

# Parameters for decoder/LayerStack:
# ==============================================================================
decoder/LayerStack.dropout_rate = %dropout_rate
decoder/LayerStack.norm_epsilon = 1e-06
decoder/LayerStack.recompute_grads = False

# Parameters for encoder/LayerStack:
# ==============================================================================
encoder/LayerStack.dropout_rate = %dropout_rate
encoder/LayerStack.norm_epsilon = 1e-06
encoder/LayerStack.recompute_grads = False

# Parameters for make_bitransformer:
# ==============================================================================
make_bitransformer.decoder_name = 'decoder'
make_bitransformer.encoder_name = 'encoder'

# Parameters for decoder/make_layer_stack:
# ==============================================================================
decoder/make_layer_stack.block_scope = True
decoder/make_layer_stack.layers = \
    [@mesh_tensorflow.transformer.transformer_layers.SelfAttention,
     @mesh_tensorflow.transformer.transformer_layers.EncDecAttention,
     @mesh_tensorflow.transformer.transformer_layers.DenseReluDense]
decoder/make_layer_stack.num_layers = %num_layers

# Parameters for encoder/make_layer_stack:
# ==============================================================================
encoder/make_layer_stack.block_scope = True
encoder/make_layer_stack.layers = \
    [@mesh_tensorflow.transformer.transformer_layers.SelfAttention,
     @mesh_tensorflow.transformer.transformer_layers.DenseReluDense]
encoder/make_layer_stack.num_layers = %num_layers
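
# Together these bindings define the standard T5 block structure: the encoder
# repeats [SelfAttention, DenseReluDense] six times and the decoder repeats
# [SelfAttention, EncDecAttention, DenseReluDense] six times (num_layers = 6).
# block_scope = True gives each repeated block its own variable scope
# (block_000, block_001, ... in the checkpoints we have inspected).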

# Parameters for maybe_print_dataset:
# ==============================================================================
maybe_print_dataset.should_print = False

# Parameters for mesh_train_dataset_fn:
# ==============================================================================
mesh_train_dataset_fn.use_cached = False

# Parameters for MtfModel:
# ==============================================================================
MtfModel.autostack = True
MtfModel.ensemble_inputs = None
MtfModel.gcp_project = None
MtfModel.layout_rules = \
    'ensemble:ensemble,batch:batch,d_ff:model,heads:model,vocab:model,experts:batch'
MtfModel.mesh_devices = None
MtfModel.mesh_shape = None
MtfModel.model_type = 'bitransformer'
MtfModel.optimizer = None
MtfModel.predict_fn = None
MtfModel.tpu_job_name = None
MtfModel.tpu_zone = None
MtfModel.variable_filter = None
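
# layout_rules maps named tensor dimensions to mesh dimensions: 'batch:batch'
# shards examples across data-parallel cores, while 'd_ff:model',
# 'heads:model' and 'vocab:model' split the feed-forward hidden units,
# attention heads and output vocabulary across model-parallel cores.
# mesh_shape is left None here; it is normally derived at run time from the
# TPU topology (e.g. via tpu_mesh_shape below).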

# Parameters for num_parallel_calls:
# ==============================================================================
num_parallel_calls.deterministic = False

# Parameters for pack_dataset:
# ==============================================================================
pack_dataset.use_custom_ops = False

# Parameters for decoder/SelfAttention:
# ==============================================================================
decoder/SelfAttention.attention_func = None
decoder/SelfAttention.attention_kwargs = None
decoder/SelfAttention.combine_dims = True
decoder/SelfAttention.dropout_rate = %dropout_rate
decoder/SelfAttention.keep_query_heads_dims = False
decoder/SelfAttention.key_value_size = %d_kv
decoder/SelfAttention.num_heads = %num_heads
decoder/SelfAttention.num_memory_heads = 0
decoder/SelfAttention.relative_attention_num_buckets = 32
decoder/SelfAttention.relative_attention_type = 'bias_shared'
decoder/SelfAttention.shared_kv = False

# Parameters for encoder/SelfAttention:
# ==============================================================================
encoder/SelfAttention.attention_func = None
encoder/SelfAttention.attention_kwargs = None
encoder/SelfAttention.combine_dims = True
encoder/SelfAttention.dropout_rate = %dropout_rate
encoder/SelfAttention.keep_query_heads_dims = False
encoder/SelfAttention.key_value_size = %d_kv
encoder/SelfAttention.num_heads = %num_heads
encoder/SelfAttention.num_memory_heads = 0
encoder/SelfAttention.relative_attention_num_buckets = 32
encoder/SelfAttention.relative_attention_type = 'bias_shared'
encoder/SelfAttention.shared_kv = False
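
# Rather than absolute position embeddings (disabled below via
# Unitransformer.positional_embedding = False), these models add a learned
# relative-position bias to the attention logits, bucketed into 32 distance
# buckets; 'bias_shared' shares that bias across the layers of a stack.
# num_memory_heads = 0 keeps standard multi-head attention rather than a
# multi-query variant.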

# Parameters for sentinel_id:
# ==============================================================================
sentinel_id.return_value = None

# Parameters for serialize_num_microbatches:
# ==============================================================================
serialize_num_microbatches.tokens_per_microbatch_per_replica = 8192
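
# When the batch is too large to process at once, it is split into
# gradient-accumulation microbatches sized so that each replica handles at
# most 8192 tokens per microbatch.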

# Parameters for shift_targets:
# ==============================================================================
shift_targets.bos_id = 0
shift_targets.eos_id = 1

# Parameters for tpu_estimator_model_fn:
# ==============================================================================
tpu_estimator_model_fn.model_info_file = None
tpu_estimator_model_fn.outer_batch_size = 1
tpu_estimator_model_fn.tpu_summaries = False

# Parameters for tpu_mesh_shape:
# ==============================================================================
tpu_mesh_shape.ensemble_parallelism = None

# Parameters for decoder/Unitransformer:
# ==============================================================================
decoder/Unitransformer.d_model = %d_model
decoder/Unitransformer.ensemble = None
decoder/Unitransformer.input_full_attention = False
decoder/Unitransformer.label_smoothing = 0.0
decoder/Unitransformer.loss_denominator = 233472
decoder/Unitransformer.loss_fn = None
decoder/Unitransformer.loss_on_targets_only = False
decoder/Unitransformer.max_length = 512
decoder/Unitransformer.positional_embedding = False
decoder/Unitransformer.shared_embedding_and_softmax_weights = True
decoder/Unitransformer.sinusoid_positional_embedding = False
decoder/Unitransformer.token_dropout_rate = 0.0
decoder/Unitransformer.vocab_divisor = 128
decoder/Unitransformer.z_loss = 0.0001

# Parameters for encoder/Unitransformer:
# ==============================================================================
encoder/Unitransformer.d_model = %d_model
encoder/Unitransformer.ensemble = None
encoder/Unitransformer.input_full_attention = False
encoder/Unitransformer.label_smoothing = 0.0
encoder/Unitransformer.loss_denominator = None
encoder/Unitransformer.loss_fn = None
encoder/Unitransformer.loss_on_targets_only = False
encoder/Unitransformer.max_length = 512
encoder/Unitransformer.positional_embedding = False
encoder/Unitransformer.shared_embedding_and_softmax_weights = True
encoder/Unitransformer.sinusoid_positional_embedding = False
encoder/Unitransformer.token_dropout_rate = 0.0
encoder/Unitransformer.vocab_divisor = 128
encoder/Unitransformer.z_loss = 0.0001
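
# Two non-obvious choices above: loss_denominator fixes the loss normalizer to
# a constant rather than the per-batch token count, so the loss scale (and
# hence the effective learning rate) does not change with batch size; the
# constant 233472 presumably matches the token count of the original training
# batch. z_loss = 0.0001 adds a small penalty pulling the softmax
# log-normalizer toward zero for numerical stability, and vocab_divisor = 128
# pads the vocabulary to a multiple of 128 for efficient TPU layout.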

# Parameters for VarianceScalingInitializer:
# ==============================================================================
VarianceScalingInitializer.distribution = 'normal'
VarianceScalingInitializer.mode = 'fan_in'
VarianceScalingInitializer.scale = 1.0
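
# A minimal sketch of how a config like this is typically consumed; the file
# name and paths below are assumptions, not part of this config:
#
#   import gin
#   import t5
#
#   gin.parse_config_file("operative_config.gin")  # this file
#   model = t5.models.MtfModel(
#       model_dir="gs://your-bucket/small",  # hypothetical checkpoint dir
#       tpu=None,  # or the address of a TPU worker
#       batch_size=32,
#       sequence_length={"inputs": 512, "targets": 512},
#   )
#   model.train(mixture_or_task_name="glue_mrpc_v002", steps=1000)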