import mesh_tensorflow.optimize
import mesh_tensorflow.transformer.dataset
import mesh_tensorflow.transformer.learning_rate_schedules
import mesh_tensorflow.transformer.t2t_vocabulary
import mesh_tensorflow.transformer.transformer_layers
import mesh_tensorflow.transformer.utils
import t5.data.sentencepiece_vocabulary
import t5.models.mesh_transformer

# Macros:
# ==============================================================================
d_ff = 2048
d_kv = 64
d_model = 512
dropout_rate = 0.1
init_checkpoint = 'gs://t5-data/pretrained_models/small/model.ckpt-1000000'
MIXTURE_NAME = 'all_mix'
noise_density = 0.15
num_heads = 8
num_layers = 6

# Parameters for AdafactorOptimizer:
# ==============================================================================
AdafactorOptimizer.beta1 = 0.0
AdafactorOptimizer.clipping_threshold = 1.0
AdafactorOptimizer.decay_rate = None
AdafactorOptimizer.epsilon1 = 1e-30
AdafactorOptimizer.epsilon2 = 0.001
AdafactorOptimizer.factored = True
AdafactorOptimizer.min_dim_size_to_factor = 128
AdafactorOptimizer.multiply_by_parameter_scale = True

# Parameters for Bitransformer:
# ==============================================================================
Bitransformer.shared_embedding = True

# Parameters for denoise:
# ==============================================================================
# None.

# Parameters for decoder/DenseReluDense:
# ==============================================================================
decoder/DenseReluDense.activation = 'relu'
decoder/DenseReluDense.dropout_rate = %dropout_rate
decoder/DenseReluDense.hidden_size = %d_ff

# Parameters for encoder/DenseReluDense:
# ==============================================================================
encoder/DenseReluDense.activation = 'relu'
encoder/DenseReluDense.dropout_rate = %dropout_rate
encoder/DenseReluDense.hidden_size = %d_ff

# Parameters for decoder/EncDecAttention:
# ==============================================================================
# None.

# Parameters for get_variable_dtype:
# ==============================================================================
get_variable_dtype.activation_dtype = 'bfloat16'

# Parameters for get_vocab_embedding_cls:
# ==============================================================================
# None.

# Parameters for get_vocabulary:
# ==============================================================================
# None.

# Parameters for iid_noise_mask:
# ==============================================================================
# None.
# Parameters for decoder/LayerStack:
# ==============================================================================
decoder/LayerStack.dropout_rate = %dropout_rate
decoder/LayerStack.norm_epsilon = 1e-06
decoder/LayerStack.recompute_grads = False

# Parameters for encoder/LayerStack:
# ==============================================================================
encoder/LayerStack.dropout_rate = %dropout_rate
encoder/LayerStack.norm_epsilon = 1e-06
encoder/LayerStack.recompute_grads = False

# Parameters for make_bitransformer:
# ==============================================================================
make_bitransformer.decoder_name = 'decoder'
make_bitransformer.encoder_name = 'encoder'

# Parameters for decoder/make_layer_stack:
# ==============================================================================
decoder/make_layer_stack.block_scope = True
decoder/make_layer_stack.layers = \
    [@mesh_tensorflow.transformer.transformer_layers.SelfAttention,
     @mesh_tensorflow.transformer.transformer_layers.EncDecAttention,
     @mesh_tensorflow.transformer.transformer_layers.DenseReluDense]
decoder/make_layer_stack.num_layers = %num_layers

# Parameters for encoder/make_layer_stack:
# ==============================================================================
encoder/make_layer_stack.block_scope = True
encoder/make_layer_stack.layers = \
    [@mesh_tensorflow.transformer.transformer_layers.SelfAttention,
     @mesh_tensorflow.transformer.transformer_layers.DenseReluDense]
encoder/make_layer_stack.num_layers = %num_layers

# Parameters for maybe_print_dataset:
# ==============================================================================
maybe_print_dataset.should_print = False

# Parameters for mesh_train_dataset_fn:
# ==============================================================================
mesh_train_dataset_fn.use_cached = False

# Parameters for MtfModel:
# ==============================================================================
MtfModel.autostack = True
MtfModel.ensemble_inputs = None
MtfModel.gcp_project = None
MtfModel.layout_rules = \
    'ensemble:ensemble,batch:batch,d_ff:model,heads:model,vocab:model,experts:batch'
MtfModel.mesh_devices = None
MtfModel.mesh_shape = None
MtfModel.model_type = 'bitransformer'
MtfModel.optimizer = None
MtfModel.predict_fn = None
MtfModel.tpu_job_name = None
MtfModel.tpu_zone = None
MtfModel.variable_filter = None

# Parameters for noise_token_to_sentinel:
# ==============================================================================
# None.

# Parameters for num_parallel_calls:
# ==============================================================================
num_parallel_calls.deterministic = False

# Parameters for pack_dataset:
# ==============================================================================
pack_dataset.use_custom_ops = False

# Parameters for pack_or_pad:
# ==============================================================================
# None.
# Parameters for decoder/SelfAttention:
# ==============================================================================
decoder/SelfAttention.attention_func = None
decoder/SelfAttention.attention_kwargs = None
decoder/SelfAttention.combine_dims = True
decoder/SelfAttention.dropout_rate = %dropout_rate
decoder/SelfAttention.keep_query_heads_dims = False
decoder/SelfAttention.key_value_size = %d_kv
decoder/SelfAttention.num_heads = %num_heads
decoder/SelfAttention.num_memory_heads = 0
decoder/SelfAttention.relative_attention_num_buckets = 32
decoder/SelfAttention.relative_attention_type = 'bias_shared'
decoder/SelfAttention.shared_kv = False

# Parameters for encoder/SelfAttention:
# ==============================================================================
encoder/SelfAttention.attention_func = None
encoder/SelfAttention.attention_kwargs = None
encoder/SelfAttention.combine_dims = True
encoder/SelfAttention.dropout_rate = %dropout_rate
encoder/SelfAttention.keep_query_heads_dims = False
encoder/SelfAttention.key_value_size = %d_kv
encoder/SelfAttention.num_heads = %num_heads
encoder/SelfAttention.num_memory_heads = 0
encoder/SelfAttention.relative_attention_num_buckets = 32
encoder/SelfAttention.relative_attention_type = 'bias_shared'
encoder/SelfAttention.shared_kv = False

# Parameters for SentencePieceVocabulary:
# ==============================================================================
# None.

# Parameters for sentinel_id:
# ==============================================================================
sentinel_id.return_value = None

# Parameters for serialize_num_microbatches:
# ==============================================================================
serialize_num_microbatches.tokens_per_microbatch_per_replica = 8192

# Parameters for shift_targets:
# ==============================================================================
shift_targets.bos_id = 0
shift_targets.eos_id = 1

# Parameters for tpu_estimator_model_fn:
# ==============================================================================
tpu_estimator_model_fn.model_info_file = None
tpu_estimator_model_fn.outer_batch_size = 1
tpu_estimator_model_fn.tpu_summaries = False

# Parameters for tpu_mesh_shape:
# ==============================================================================
tpu_mesh_shape.ensemble_parallelism = None

# Parameters for decoder/Unitransformer:
# ==============================================================================
decoder/Unitransformer.d_model = %d_model
decoder/Unitransformer.ensemble = None
decoder/Unitransformer.input_full_attention = False
decoder/Unitransformer.label_smoothing = 0.0
decoder/Unitransformer.loss_denominator = 233472
decoder/Unitransformer.loss_fn = None
decoder/Unitransformer.loss_on_targets_only = False
decoder/Unitransformer.max_length = 512
decoder/Unitransformer.positional_embedding = False
decoder/Unitransformer.shared_embedding_and_softmax_weights = True
decoder/Unitransformer.sinusoid_positional_embedding = False
decoder/Unitransformer.token_dropout_rate = 0.0
decoder/Unitransformer.vocab_divisor = 128
decoder/Unitransformer.z_loss = 0.0001

# Parameters for encoder/Unitransformer:
# ==============================================================================
encoder/Unitransformer.d_model = %d_model
encoder/Unitransformer.ensemble = None
encoder/Unitransformer.input_full_attention = False
encoder/Unitransformer.label_smoothing = 0.0
encoder/Unitransformer.loss_denominator = None
encoder/Unitransformer.loss_fn = None
encoder/Unitransformer.loss_on_targets_only = False
encoder/Unitransformer.max_length = 512
encoder/Unitransformer.positional_embedding = False
encoder/Unitransformer.shared_embedding_and_softmax_weights = True
encoder/Unitransformer.sinusoid_positional_embedding = False
encoder/Unitransformer.token_dropout_rate = 0.0
encoder/Unitransformer.vocab_divisor = 128
encoder/Unitransformer.z_loss = 0.0001

# Parameters for VarianceScalingInitializer:
# ==============================================================================
VarianceScalingInitializer.distribution = 'normal'
VarianceScalingInitializer.mode = 'fan_in'
VarianceScalingInitializer.scale = 1.0

# Parameters for VocabEmbedding:
# ==============================================================================
# None.

# Parameters for Vocabulary:
# ==============================================================================
# None.
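The listing above is a complete gin operative config. As a minimal sketch (not part of the original file), assuming it is saved locally as operative_config.gin (a hypothetical name) and that mesh_tensorflow and t5 are installed so the import statements at the top of the file resolve, it could be parsed and inspected with the gin-config library like this:

import gin

# Parsing binds every parameter listed above; the import statements inside the
# .gin file pull in the gin-configurable mesh_tensorflow / t5 modules, so those
# packages must be importable in the current environment.
gin.parse_config_file("operative_config.gin")

# Individual bindings can then be queried, for example:
print(gin.query_parameter("AdafactorOptimizer.beta1"))        # 0.0
print(gin.query_parameter("Bitransformer.shared_embedding"))  # True

# The full set of parsed bindings can be dumped back out as a string:
print(gin.config_str())

In an actual training or fine-tuning run, this file is not parsed by hand; the T5/mesh_tensorflow entry points parse it (plus any override bindings) before the bitransformer model is constructed, so the hyperparameters above (d_model=512, d_ff=2048, 6 layers, 8 heads) fully determine the T5-small architecture being loaded from the listed checkpoint.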